diff --git a/src/find_interaction_cluster/nt_and_community.py b/src/find_interaction_cluster/nt_and_community.py
index ec799b42d545507a413d64d4d92292eed7a1c78f..48896d40879815fffa759794518de252c62a100b 100644
--- a/src/find_interaction_cluster/nt_and_community.py
+++ b/src/find_interaction_cluster/nt_and_community.py
@@ -135,6 +135,86 @@ def lmm_maker(df: pd.DataFrame, outfile: Path, nt: str) -> float:
     return res["Pr(>Chisq)"][1]
 
 
+def lmm_maker_summary(df: pd.DataFrame, outfile: Path, nt: str
+                      ) -> pd.DataFrame:
+    """
+    Make the lmm analysis to see if the exons regulated by a splicing factor \
+    are equally distributed among the communities.
+
+    :param df: The dataframe
+    :param outfile: The output file name, used to name the diagnostics plot
+    :param nt: the nucleotide of interest
+    :return: the summary table of the model coefficients per community
+
+    """
+    pandas2ri.activate()
+    lmm = r(
+        """
+        require("DHARMa")
+
+        function(data, folder, partial_name) {
+            mod <- lm(%s ~ log(community_size) + community, data=data)
+            simulationOutput <- simulateResiduals(fittedModel = mod, n = 250)
+            png(paste0(folder, "/diagnostics_summary", partial_name, ".png"))
+            plot(simulationOutput)
+            dev.off()
+            return(as.data.frame(summary(mod)$coefficients))
+        }
+
+        """ % nt)
+    folder = outfile.parent / "diagnostics"
+    folder.mkdir(parents=True, exist_ok=True)
+    partial_name = outfile.name.replace('.txt', '')
+    res_df = lmm(df, str(folder), partial_name).reset_index()
+    res_df.rename({'index': 'community'}, inplace=True, axis=1)
+    res_df['community'] = res_df['community'].str.replace('community', '')
+    res_df.loc[res_df['community'] == "(Intercept)", "community"] = "C-CTRL"
+    mean_df = df[[nt, "community", "community_size"]].\
+        groupby(["community", "community_size"]).mean().reset_index()
+    return res_df.merge(mean_df, how="left", on="community")
+
+
+def get_ft_id(cnx: sqlite3.Connection, feature: str = "exon") -> List[str]:
+    """
+    Return the id of every gene/exon in the chia-pet database.
+
+    :param cnx: A connection to the chia-pet database
+    :param feature: The feature of interest
+    :return: The list of feature ids
+    """
+    query = f"SELECT DISTINCT id FROM cin_{feature}"
+    c = cnx.cursor()
+    c.execute(query)
+    res = c.fetchall()
+    return [str(cid[0]) for cid in res]
+
+
+def create_ctrl_community(df: pd.DataFrame, outfile: Path,
+                          feature: str = 'exon') -> pd.DataFrame:
+    """
+    :param df: A dataframe containing the frequency of each feature in a \
+    community.
+    :param outfile: The output file where the frequency table is saved
+    :param feature: The kind of feature to analyse
+    :return: The input dataframe extended with a control community \
+    grouping every feature that belongs to no other community
+    """
+    if outfile.is_file():
+        return pd.read_csv(outfile, sep="\t")
+    size_threshold = 10 if feature == "gene" else 50
+    cnx = sqlite3.connect(ConfigGraph.db_file)
+    ft_id = get_ft_id(cnx, feature)
+    list_ft = [cid for cid in ft_id
+               if cid not in df[f"id_{feature}"].to_list()]
+    df_nt = get_nt_frequency(cnx, list_ft, feature)
+    df_com = get_community_table([list_ft], size_threshold, feature)
+    df_nt = df_nt.merge(df_com, how="left", on=f"id_{feature}")
+    df_nt['community'] = ['C-CTRL'] * df_nt.shape[0]
+    df = pd.concat([df, df_nt], axis=0, ignore_index=True)
+    df.to_csv(outfile, sep="\t", index=False)
+    return df
+
+
 def get_stat_nt_communities(df: pd.DataFrame, project: str, weight: int,
                             global_weight: int, same_gene: bool, nt: str,
                             feature: str = "exon",
@@ -172,6 +252,17 @@ def get_stat_nt_communities(df: pd.DataFrame, project: str, weight: int,
     res['project'] = project
     res['nt'] = nt
     res['pval'] = pval
+    nt_ctrl_table = noutfile.parent / noutfile.name.replace("_stat.txt",
+                                                            "_ctrl.txt")
+    ndf = create_ctrl_community(df, nt_ctrl_table, feature)
+    sum_df = lmm_maker_summary(ndf, outfile, nt)
+    outfile_ctrl = ConfigGraph.get_community_file(project, weight,
+                                                  global_weight,
+                                                  same_gene, feature,
+                                                  f"{nt}_VS_CTRL_stat.txt",
+                                                  True)
+    noutfile_ctrl = nfolder / outfile_ctrl.name
+    sum_df.to_csv(noutfile_ctrl, sep="\t", index=False)
     return pd.Series(res)
 
 
@@ -247,7 +338,7 @@ def multiple_nt_lmm_launcher(ps: int, weights: List[int],
         nfile_table = ConfigGraph.get_community_file(project, weight,
                                                      global_weight,
                                                      same_gene, feature,
-                                                     f"nt_table.txt",
+                                                     f"_nt_table.txt",
                                                      True)
         df.to_csv(nfile_table, sep="\t", index=False)
         dic_df[ckey] = df
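
To exercise the new control-community path outside the launcher, the sketch below chains create_ctrl_community and lmm_maker_summary directly. It is a minimal sketch, not part of the patch: the import path, the results/ output location and the toy dataframe are assumptions, and it only runs with the project's chia-pet sqlite database configured in ConfigGraph plus an R installation with DHARMa reachable through rpy2.

    # Minimal usage sketch of the functions added in this patch (assumptions:
    # module import path, output paths and the toy dataframe are illustrative).
    from pathlib import Path

    import pandas as pd

    from src.find_interaction_cluster.nt_and_community import (
        create_ctrl_community, lmm_maker_summary)

    # Toy community table: one row per exon with the frequency of the
    # nucleotide of interest, its community name and its community size.
    df = pd.DataFrame({
        "id_exon": ["1_1", "1_2", "2_4"],
        "A": [25.0, 30.0, 27.5],
        "community": ["C1", "C1", "C2"],
        "community_size": [2, 2, 1],
    })

    outdir = Path("results")  # hypothetical output location
    outdir.mkdir(exist_ok=True)

    # Append the C-CTRL community built from every exon absent from `df`
    # (this queries the chia-pet database configured in ConfigGraph).
    ndf = create_ctrl_community(df, outdir / "A_ctrl.txt", feature="exon")

    # Fit the model, write the DHARMa diagnostics plot and recover the
    # per-community coefficient table merged with the mean frequencies.
    sum_df = lmm_maker_summary(ndf, outdir / "A_stat.txt", nt="A")
    sum_df.to_csv(outdir / "A_VS_CTRL_stat.txt", sep="\t", index=False)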