From efbd8f90ddbbed6982c7f3e2d6b41275c3ceed8f Mon Sep 17 00:00:00 2001
From: Fontrodona Nicolas <nicolas.fontrodona@ens-lyon.fr>
Date: Fri, 7 Aug 2020 11:45:49 +0200
Subject: [PATCH] src/find_interaction_cluster/sf_and_communities.py: creation
 of expand_dataframe, glmm_maker and glmm_statistics to test whether the
 regulation of exons by a splicing factor across communities is non-random

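The test works as follows: per-community counts are expanded into one 0/1 row
per exon (expand_dataframe), a logistic GLMM with a random intercept per
community (lme4::glmer called through rpy2) is compared to a null GLM without
the community term via a likelihood-ratio (Chisq) anova, and DHARMa residual
diagnostics are saved as PNG files in a diagnostics/ sub-folder.

A minimal usage sketch (the splicing factor, regulation and project names
below are hypothetical placeholders, not values taken from this patch):

    # hypothetical call; "SomeSF", "up" and "SomeProject" are placeholders
    stats_df, glmm_series = get_stat4communities("SomeSF", "up", "SomeProject",
                                                 weight=1, global_weight=0,
                                                 same_gene=True)
    # glmm_series contains: project, sf, reg and the GLMM p-value ("pval");
    # multiple_stat_launcher collects those series, adds a Benjamini-Hochberg
    # adjusted "padj" column and writes them to a *_glmm_stat.txt file.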
---
 .../sf_and_communities.py                     | 141 +++++++++++++++++-
 1 file changed, 135 insertions(+), 6 deletions(-)

diff --git a/src/find_interaction_cluster/sf_and_communities.py b/src/find_interaction_cluster/sf_and_communities.py
index 92209a18..724406d3 100644
--- a/src/find_interaction_cluster/sf_and_communities.py
+++ b/src/find_interaction_cluster/sf_and_communities.py
@@ -24,6 +24,8 @@ from functools import reduce
 from operator import add
 import statsmodels.formula.api as api
 from statsmodels.stats.multitest import multipletests
+from rpy2.robjects import r, pandas2ri
+from pathlib import Path
 
 
 def get_exon_regulated_in_communities(community: List[str],
@@ -127,9 +129,125 @@ def logistic_regression(n1: int, t1: int, n2: int, t2: int) -> float:
     return api.logit('y ~ x', data=df).fit(disp=False).pvalues['x[T.test]']
 
 
+def expand_dataframe(df: pd.DataFrame) -> pd.DataFrame:
+    """
+    Expand the dataframe df so that a glmm can be fitted on it.
+
+    :param df: The dataframe to expand
+    :return: The expanded dataframe
+
+    >>> d = pd.DataFrame({"community": ["C0", "C1"],
+    ... "reg_in_community": [2, 3],
+    ... "community_size": [5, 7],
+    ... "%reg in community": [40, 42.85], 'pval': [1, 0.5], 'padj': [1, 1]})
+    >>> expand_dataframe(d)
+        is_reg community  reg_in_community  community_size
+    0        1        C0                 2               5
+    1        1        C0                 2               5
+    2        0        C0                 2               5
+    3        0        C0                 2               5
+    4        0        C0                 2               5
+    5        1        C1                 3               7
+    6        1        C1                 3               7
+    7        1        C1                 3               7
+    8        0        C1                 3               7
+    9        0        C1                 3               7
+    10       0        C1                 3               7
+    11       0        C1                 3               7
+
+    """
+    content = []
+    for row in df.values:
+        row = list(row)
+        values = [1] * row[1] + [0] * (row[2] - row[1])
+        for v in values:
+            content.append([v] + row)
+    df = pd.DataFrame(content, columns=["is_reg"] + list(df.columns))
+    df.drop(["%reg in community", "pval", "padj"], axis=1, inplace=True)
+    return df
+
+
+def glmm_maker(expanded_df: pd.DataFrame, outfile: Path) -> float:
+    """
+    Run the glmm analysis to check whether the exons regulated by a splicing \
+    factor are equally distributed among the communities.
+
+    :param expanded_df: The expanded dataframe
+    :param outfile: The output file, used to name the diagnostics figure
+    :return: The p-value of the glmm
+
+    >>> d = pd.DataFrame({"community": ["C0", "C1"],
+    ... "reg_in_community": [2, 3],
+    ... "community_size": [5, 7],
+    ... "%reg in community": [40, 42.85], 'pval': [1, 0.5], 'padj': [1, 1]})
+    >>> e_df = expand_dataframe(d)
+    >>> outfile = ConfigGraph.get_community_file("Test", 1, 1, True,
+    ... "_stat.txt", True)
+    >>> glmm_maker(e_df, outfile)
+    1.0
+    """
+    pandas2ri.activate()
+    glmm = r(
+        """
+        require("lme4")
+        require("DHARMa")
+
+        function(data, folder, partial_name) {
+            null_mod <- glm(is_reg ~ log(community_size), family=binomial, data=data)  # null model: no community effect
+            mod <- glmer(is_reg ~ log(community_size) + (1 | community), data=data, family=binomial, nAGQ=0)  # random intercept per community
+            simulationOutput <- simulateResiduals(fittedModel = mod, n = 250)  # DHARMa residual diagnostics
+            png(paste0(folder, "/diagnostics_", partial_name, ".png"))
+            plot(simulationOutput)
+            dev.off()
+            return(anova(mod, null_mod, test="Chisq"))  # likelihood-ratio (Chisq) test between the two models
+        }
+
+        """)
+    folder = outfile.parent / "diagnostics"
+    folder.mkdir(parents=True, exist_ok=True)
+    partial_name = outfile.name.replace('.txt', '')
+    return glmm(expanded_df, str(folder), partial_name)["Pr(>Chisq)"][1]
+
+
+def glmm_statistics(df: pd.DataFrame, sf_name: str, reg: str,
+                    project: str, weight: int, global_weight: int,
+                    same_gene: bool) -> pd.Series:
+    """
+    Create the glmm statistics for a given splicing factor with \
+    given communities.
+
+    :param df: dataframe containing stats data.
+    :param sf_name: The splicing factor of interest
+    :param reg: The regulation of interest
+    :param project: The name of the project of interest
+    :param weight: The minimum weight of interaction to consider
+    :param global_weight: The global weight to consider. If \
+    the global weight is equal to 0, then the statistics are computed \
+    by project; else all projects are merged together and the interactions \
+    seen in `global_weight` projects are taken into account
+    :param same_gene: Whether exons within the same gene are considered \
+    co-localised (True) or not (False) (default False)
+    :return: The glmm p-value along with other information
+    """
+    ndf = df.loc[~df['community'].isin(["All-community", "FASTERDB"]),
+          :].copy()
+    expanded_df = expand_dataframe(ndf)
+    outfile = ConfigGraph.get_community_file(project, weight, global_weight,
+                                             same_gene,
+                                             f"{sf_name}_{reg}_stat.txt", True)
+    noutfold = outfile.parent / "expanded_df"
+    noutfold.mkdir(exist_ok=True, parents=True)
+    noutfile = noutfold / outfile.name
+    expanded_df.to_csv(noutfile, sep="\t", index=False)
+    pval = glmm_maker(expanded_df, outfile)
+    return pd.Series({"project": project, "sf": sf_name, "reg": reg,
+                      "pval": pval})
+
+
 def get_stat4communities(sf_name: str, reg: str,
                          project: str, weight: int,
-                         global_weight: int, same_gene: bool) -> pd.DataFrame:
+                         global_weight: int, same_gene: bool
+                         ) -> Tuple[pd.DataFrame, pd.Series]:
     """
     Get data (proportion of `reg` regulated exons by a splicing factor
     `sf_name` within a community) for every community.
@@ -172,7 +290,10 @@ def get_stat4communities(sf_name: str, reg: str,
     d['sf'] = [sf_name] * len(d["community"])
     d['reg'] = [reg] * len(d["community"])
     d['project'] = [project] * len(d["community"])
-    return pd.DataFrame(d)
+    df = pd.DataFrame(d)
+    s = glmm_statistics(df, sf_name, reg, project, weight, global_weight,
+                        same_gene)
+    return df, s
 
 
 def get_sfname() -> List[str]:
@@ -240,19 +361,27 @@ def multiple_stat_launcher(ps: int, weights: List[int],
             processes[ckey] = [pool.apply_async(get_stat4communities, args)]
         else:
             processes[ckey].append(pool.apply_async(get_stat4communities, args))
-    for p in processes:
+    for p, value in processes.items():
         project, weight = p.split("_")
-        list_df = []
-        for proc in processes[p]:
-            list_df.append(proc.get(timeout=None))
+        list_tuples = [proc.get(timeout=None) for proc in value]
         pool.close()
         pool.join()
+        list_df = [t[0] for t in list_tuples]
+        list_series = [t[1] for t in list_tuples]
         df = pd.concat(list_df, axis=0, ignore_index=True)
         outfile = ConfigGraph.get_community_file(project, weight,
                                                  dic_project[project],
                                                  same_gene,
                                                  "_stat.txt", True)
         df.to_csv(outfile, sep="\t", index=False)
+        glm_df = pd.DataFrame(list_series)
+        glm_df["padj"] = multipletests(glm_df['pval'].values,
+                                       method='fdr_bh')[1]
+        outfile = ConfigGraph.get_community_file(project, weight,
+                                                 dic_project[project],
+                                                 same_gene,
+                                                 "_glmm_stat.txt", True)
+        glm_df.to_csv(outfile, sep="\t", index=False)
 
 
 if __name__ == "__main__":
-- 
GitLab