diff --git a/src/find_interaction_cluster/nt_and_community.py b/src/find_interaction_cluster/nt_and_community.py index 45babcb9e6f4a6a01711a99d97e901d185dc23ac..f9ffb266414456024560e4a55d5a78abb4d25016 100644 --- a/src/find_interaction_cluster/nt_and_community.py +++ b/src/find_interaction_cluster/nt_and_community.py @@ -117,15 +117,15 @@ def get_community_table(communities: List[List[str]], return pd.DataFrame(dic) -def lmm_maker(df: pd.DataFrame, outfile: Path, nt: str) -> float: +def lm_maker(df: pd.DataFrame, outfile: Path, nt: str) -> float: """ - Make the lmm analysis to see if the exon regulated by a splicing factor \ + Make the lm analysis to see if the exon regulated by a splicing factor \ are equally distributed among the communities. :param df: The dataframe :param outfile: A name of a file :param nt: the nucleotide of interest - :return: the pvalue of lmm + :return: the pvalue of lm """ pandas2ri.activate() @@ -143,7 +143,6 @@ def lmm_maker(df: pd.DataFrame, outfile: Path, nt: str) -> float: dev.off() return(anova(mod, null_mod, test="Chisq")) } - """ % (nt, nt)) folder = outfile.parent / "diagnostics" folder.mkdir(parents=True, exist_ok=True) @@ -152,19 +151,19 @@ def lmm_maker(df: pd.DataFrame, outfile: Path, nt: str) -> float: return res["Pr(>Chisq)"][1] -def lmm_maker_summary(df: pd.DataFrame, outfile: Path, cpnt: str - ) -> pd.DataFrame: +def lm_maker_summary(df: pd.DataFrame, outfile: Path, cpnt: str + ) -> pd.DataFrame: """ - Make the lmm analysis to see if the exon regulated by a splicing factor \ + Make the lm analysis to see if the exon regulated by a splicing factor \ are equally distributed among the communities. :param df: The dataframe :param outfile: A name of a file :param cpnt: The component (nt, aa, dnt) of interest - :return: the pvalue of lmm + :return: the pvalue of lm """ pandas2ri.activate() - lmm = r( + lmf = r( """ require("DHARMa") @@ -181,7 +180,8 @@ def lmm_maker_summary(df: pd.DataFrame, outfile: Path, cpnt: str folder = outfile.parent / "diagnostics" folder.mkdir(parents=True, exist_ok=True) partial_name = outfile.name.replace('.txt', '') - res_df = lmm(df, str(folder), partial_name).reset_index() + df.to_csv(f'frequency_{cpnt}.txt', sep="\t", index=False) + res_df = lmf(df, str(folder), partial_name).reset_index() res_df.rename({'index': 'community'}, inplace=True, axis=1) res_df['community'] = res_df['community'].str.replace('community', '') res_df.loc[res_df['community'] == "(Intercept)", "community"] = "C-CTRL" @@ -232,9 +232,9 @@ def create_ctrl_community(df: pd.DataFrame, return df -def lmm_with_ctrl(df: pd.DataFrame, feature: str, region: str, - cpnt: str, outfile_diag: Path, cpnt_type: str - ) -> Tuple[pd.DataFrame, pd.DataFrame]: +def lm_with_ctrl(df: pd.DataFrame, feature: str, region: str, + cpnt: str, outfile_diag: Path, cpnt_type: str + ) -> Tuple[pd.DataFrame, pd.DataFrame]: """ :param df: df: A dataframe containing the frequency of each nucleotide \ @@ -251,7 +251,7 @@ def lmm_with_ctrl(df: pd.DataFrame, feature: str, region: str, list of feature. """ ndf = create_ctrl_community(df, feature, region, cpnt_type) - return ndf, lmm_maker_summary(ndf, outfile_diag, cpnt) + return ndf, lm_maker_summary(ndf, outfile_diag, cpnt) def get_feature_by_community(df: pd.DataFrame, feature: str) -> Dict: @@ -397,7 +397,7 @@ def prepare_dataframe(df: pd.DataFrame, test_type: str, nt: str, produce the database :return: The dataframe ready for barplot visualisation """ - if test_type == "lmm": + if test_type == "lm": # removing the size parameter df = df[df["community"] != "log(_size)"].copy() df.rename({"Pr(>|t|)": "p-adj"}, axis=1, inplace=True) @@ -433,7 +433,7 @@ def make_barplot(df_bar: pd.DataFrame, outfile: Path, cpnt_type: str, :param feature: The king of feature of interest """ sns.set() - test_name = "permutation" if test_type == "perm" else "lmm" + test_name = "permutation" if test_type == "perm" else "lm" g = sns.catplot(x="community", y=cpnt, data=df_bar, kind="bar", ci="sd", aspect=2.5, height=12, errwidth=0.5, capsize=.4, palette=["red"] + ["lightgray"] * (df_bar.shape[0] - 1)) @@ -454,8 +454,8 @@ def make_barplot(df_bar: pd.DataFrame, outfile: Path, cpnt_type: str, g.savefig(outfile) -def expand_results_lmm(df: pd.DataFrame, rdf: pd.DataFrame, - cpnt: str, feature: str) -> pd.DataFrame: +def expand_results_lm(df: pd.DataFrame, rdf: pd.DataFrame, + cpnt: str, feature: str) -> pd.DataFrame: """ Merge df and rdf together. @@ -528,18 +528,18 @@ def create_and_save_ctrl_dataframe(df: pd.DataFrame, feature: str, :param cpnt: The component (nt, aa, dnt) of interest :param outfile_ctrl: file used to stored the table and the figure \ containing the test communities and the control community - :param test_type: The type of test to make (permutation or lmm) + :param test_type: The type of test to make (permutation or lm) :param df_ctrl: A dataframe containing the frequency of each nucleotide \ in each exons/gene in fasterdb. :param dic_com: A dictionary linking each community to the exons \ it contains. :param iteration: The number of sub samples to create """ - if test_type == "lmm": - ndf, rdf = lmm_with_ctrl(df, feature, region, cpnt, - outfile_ctrl.parents[1] / outfile_ctrl.name, - cpnt_type) - df_bar = expand_results_lmm(ndf, rdf, cpnt, feature) + if test_type == "lm": + ndf, rdf = lm_with_ctrl(df, feature, region, cpnt, + outfile_ctrl.parents[1] / outfile_ctrl.name, + cpnt_type) + df_bar = expand_results_lm(ndf, rdf, cpnt, feature) else: rdf = perm_with_ctrl(df, feature, cpnt, df_ctrl, dic_com, iteration) df_bar = expand_results_perm(df, rdf, cpnt, feature, iteration) @@ -589,7 +589,7 @@ def create_outfiles(project: str, weight: int, global_weight: int, can be 'nt', 'dnt' or 'aa'. :param cpnt: The component (nt, aa, dnt) of interest :param feature: The kind of feature analysed - :param test_type: The type of test to make (permutation or lmm) + :param test_type: The type of test to make (permutation or lm) :return: file used to store diagnostics and a file used to store the \ table containing the test communities and the control community """ @@ -635,7 +635,7 @@ def get_stat_cpnt_communities(df: pd.DataFrame, project: str, weight: int, it contains. :param feature: The kind of feature analysed :param region: the region of interest to extract from gene - :param test_type: The type of test to make (permutation or lmm) + :param test_type: The type of test to make (permutation or lm) :param iteration: The number of sub samples to create """ logging.debug(f"{test_type} for {project}, w:{weight}, " @@ -644,7 +644,7 @@ def get_stat_cpnt_communities(df: pd.DataFrame, project: str, weight: int, same_gene, feature, cpnt_type, cpnt, test_type) res = {"project": project, "cpnt": cpnt, - 'pval': lmm_maker(df, outfile, cpnt)} + 'pval': lm_maker(df, outfile, cpnt)} create_and_save_ctrl_dataframe(df, feature, region, cpnt_type, cpnt, outfile_ctrl, test_type, df_ctrl, dic_com, iteration) @@ -708,13 +708,13 @@ def create_dataframes(project, weight, global_weight, same_gene, feature, same gene :param feature: The kind of analysed feature :param region: the region of interest to extract from gene - :param test_type: The type of test to make (permutation or lmm) + :param test_type: The type of test to make (permutation or lm) :param component_type: The type of component to analyse; It \ can be 'nt', 'dnt' or 'aa'. """ df = create_dataframe(project, weight, global_weight, same_gene, feature, region, component_type) - if test_type == 'lmm': + if test_type == 'lm': df_ctrl = pd.DataFrame() dic_com = {} else: @@ -725,17 +725,17 @@ def create_dataframes(project, weight, global_weight, same_gene, feature, return df, df_ctrl, dic_com -def multiple_nt_lmm_launcher(ps: int, - weight: int, - global_weight: int, - project: str, - same_gene: bool, - feature: str = 'exon', - region: str = '', - component_type: str = "nt", - test_type: str = "lmm", - iteration: int = 1000, - logging_level: str = "DISABLE"): +def multiple_nt_lm_launcher(ps: int, + weight: int, + global_weight: int, + project: str, + same_gene: bool, + feature: str = 'exon', + region: str = '', + component_type: str = "nt", + test_type: str = "lm", + iteration: int = 1000, + logging_level: str = "DISABLE"): """ Launch the statistical analysis for every