From 263f00e635909740b232d8bae85de4999cdaacf4 Mon Sep 17 00:00:00 2001
From: Fontrodona Nicolas <nicolas.fontrodona@ens-lyon.fr>
Date: Tue, 27 Oct 2020 14:04:46 +0100
Subject: [PATCH] src/find_interaction_cluster/nt_and_community.py: add a
 script allowing to create a control community to see if the communities are
 different from the rest of the genes/exons not in a community

---
 .../nt_and_community.py                       | 93 ++++++++++++++++++-
 1 file changed, 92 insertions(+), 1 deletion(-)

diff --git a/src/find_interaction_cluster/nt_and_community.py b/src/find_interaction_cluster/nt_and_community.py
index ec799b42..48896d40 100644
--- a/src/find_interaction_cluster/nt_and_community.py
+++ b/src/find_interaction_cluster/nt_and_community.py
@@ -135,6 +135,86 @@ def lmm_maker(df: pd.DataFrame, outfile: Path, nt: str) -> float:
     return res["Pr(>Chisq)"][1]
 
 
+def lmm_maker_summary(df: pd.DataFrame, outfile: Path, nt: str
+                      ) -> pd.DataFrame:
+    """
+    Make the lmm analysis to see if the exon regulated by a splicing factor \
+    are equally distributed among the communities.
+
+    :param df: The dataframe
+    :param outfile: A name of a file
+    :param nt: the nucleotide of interest
+    :return: the pvalue of lmm
+
+    """
+    pandas2ri.activate()
+    lmm = r(
+        """
+        require("DHARMa")
+
+        function(data, folder, partial_name) {
+            mod <- lm(%s ~ log(community_size) +  community, data=data)
+            simulationOutput <- simulateResiduals(fittedModel = mod, n = 250)
+            png(paste0(folder, "/dignostics_summary", partial_name, ".png"))
+            plot(simulationOutput)
+            dev.off()
+            return(as.data.frame(summary(mod)$coefficients))
+        }
+
+        """ % nt)
+    folder = outfile.parent / "diagnostics"
+    folder.mkdir(parents=True, exist_ok=True)
+    partial_name = outfile.name.replace('.txt', '')
+    res_df = lmm(df, str(folder), partial_name).reset_index()
+    res_df.rename({'index': 'community'}, inplace=True, axis=1)
+    res_df['community'] = res_df['community'].str.replace('community', '')
+    res_df.loc[res_df['community'] == "(Intercept)", "community"] = "C-CTRL"
+    mean_df = df[[nt, "community", "community_size"]].\
+        groupby(["community", "community_size"]).mean().reset_index()
+    return res_df.merge(mean_df, how="left", on="community")
+
+
+def get_ft_id(cnx: sqlite3.Connection, feature: str = "exon") -> List[str]:
+    """
+    Return the id of every gene/exons in chia-pet database.
+
+    :param cnx: A connection to chiapet database
+    :param feature: The feature of interest
+    :return: The list of feature id
+    """
+    query = f"SELECT DISTINCT id FROM cin_{feature}"
+    c = cnx.cursor()
+    c.execute(query)
+    res = c.fetchall()
+    return [str(cid[0]) for cid in res]
+
+
+def create_ctrl_community(df: pd.DataFrame, outfile: Path,
+                          feature: str = 'exon') -> pd.DataFrame:
+    """
+    :param df: A dataframe containing the frequency of each feature in a \
+    community.
+    :param outfile: The output table containing frequencies
+    :param feature: The kind of feature to analyse
+    :return: A dataframe containing the frequency of every nucleotides \
+    of every exon in a large community
+    """
+    if outfile.is_file():
+        return pd.read_csv(outfile, sep="\t")
+    size_threshold = 10 if feature == "gene" else 50
+    cnx = sqlite3.connect(ConfigGraph.db_file)
+    ft_id = get_ft_id(cnx, feature)
+    list_ft = [cid for cid in ft_id
+               if cid not in df[f"id_{feature}"].to_list()]
+    df_nt = get_nt_frequency(cnx, list_ft, feature)
+    df_com = get_community_table([list_ft], size_threshold, feature)
+    df_nt = df_nt.merge(df_com, how="left", on=f"id_{feature}")
+    df_nt['community'] = ['C-CTRL'] * df_nt.shape[0]
+    df = pd.concat([df, df_nt], axis=0, ignore_index=True)
+    df.to_csv(outfile, sep="\t", index=False)
+    return df
+
+
 def get_stat_nt_communities(df: pd.DataFrame, project: str, weight: int,
                             global_weight: int, same_gene: bool,
                             nt: str, feature: str = "exon",
@@ -172,6 +252,17 @@ def get_stat_nt_communities(df: pd.DataFrame, project: str, weight: int,
     res['project'] = project
     res['nt'] = nt
     res['pval'] = pval
+    nt_ctrl_table = noutfile.parent / noutfile.name.replace("_stat.txt",
+                                                            "_ctrl.txt")
+    ndf = create_ctrl_community(df, nt_ctrl_table, feature)
+    sum_df = lmm_maker_summary(ndf, outfile, nt)
+    outfile_ctrl = ConfigGraph.get_community_file(project, weight,
+                                                  global_weight,
+                                                  same_gene, feature,
+                                                  f"{nt}_VS_CTRL_stat.txt",
+                                                  True)
+    noutfile_ctrl = nfolder / outfile_ctrl.name
+    sum_df.to_csv(noutfile_ctrl, sep="\t", index=False)
     return pd.Series(res)
 
 
@@ -247,7 +338,7 @@ def multiple_nt_lmm_launcher(ps: int, weights: List[int],
             nfile_table = ConfigGraph.get_community_file(project, weight,
                                                          global_weight,
                                                          same_gene, feature,
-                                                         f"nt_table.txt",
+                                                         f"_nt_table.txt",
                                                          True)
             df.to_csv(nfile_table, sep="\t", index=False)
             dic_df[ckey] = df
-- 
GitLab