From aacaf00f74a197f22c837ec49fc9aca81db4836b Mon Sep 17 00:00:00 2001
From: Fontrodona Nicolas <nicolas.fontrodona@ens-lyon.fr>
Date: Mon, 23 Nov 2020 15:47:31 +0100
Subject: [PATCH] src/find_interaction_cluster/nt_and_community.py: add
 permutations tests

---
 .../nt_and_community.py                       | 291 +++++++++++++++---
 1 file changed, 242 insertions(+), 49 deletions(-)

diff --git a/src/find_interaction_cluster/nt_and_community.py b/src/find_interaction_cluster/nt_and_community.py
index d95347d5..a899ebac 100644
--- a/src/find_interaction_cluster/nt_and_community.py
+++ b/src/find_interaction_cluster/nt_and_community.py
@@ -7,12 +7,11 @@ Description: The goal of this script is to see if the nucleotide \
 frequency is not random between communities
 """
 
-
 import pandas as pd
 import logging
 import sqlite3
 from .config import ConfigGraph, get_communities
-from typing import List
+from typing import List, Tuple, Dict
 from doctest import testmod
 from functools import reduce
 from pathlib import Path
@@ -22,6 +21,9 @@ from .community_finder import get_projects
 from ..logging_conf import logging_def
 from itertools import product
 import multiprocessing as mp
+import numpy as np
+from .radomization_test_ppi import get_pvalue
+from tqdm import tqdm
 
 
 def get_nt_frequency(cnx: sqlite3.Connection, list_ft: List[str],
@@ -82,20 +84,20 @@ def get_community_table(communities: List[List[str]],
     >>> c = [['1_1', '2_5'], ['7_9', '4_19', '3_3']]
     >>> get_community_table(c, 3, 'exon')
       community id_exon  community_size
-    0        C1     7_9               3
-    1        C1    4_19               3
-    2        C1     3_3               3
+    0        C2     7_9               3
+    1        C2    4_19               3
+    2        C2     3_3               3
     >>> c = [['1', '2'], ['7', '49', '3']]
     >>> get_community_table(c, 3, 'gene')
       community id_gene  community_size
-    0        C1       7               3
-    1        C1      49               3
-    2        C1       3               3
+    0        C2       7               3
+    1        C2      49               3
+    2        C2       3               3
     """
     dic = {"community": [], f"id_{feature}": [], "community_size": []}
     for k, comm in enumerate(communities):
         if len(comm) >= size_threshold:
-            name = f'C{k}'
+            name = f'C{k + 1}'
             clen = len(comm)
             for exon in comm:
                 dic["community"].append(name)
@@ -173,7 +175,7 @@ def lmm_maker_summary(df: pd.DataFrame, outfile: Path, nt: str
     res_df.rename({'index': 'community'}, inplace=True, axis=1)
     res_df['community'] = res_df['community'].str.replace('community', '')
     res_df.loc[res_df['community'] == "(Intercept)", "community"] = "C-CTRL"
-    mean_df = df[[nt, "community", "community_size"]].\
+    mean_df = df[[nt, "community", "community_size"]]. \
         groupby(["community", "community_size"]).mean().reset_index()
     return res_df.merge(mean_df, how="left", on="community")
 
@@ -193,21 +195,18 @@ def get_ft_id(cnx: sqlite3.Connection, feature: str = "exon") -> List[str]:
     return [str(cid[0]) for cid in res]
 
 
-def create_ctrl_community(df: pd.DataFrame, outfile: Path,
+def create_ctrl_community(df: pd.DataFrame,
                           feature: str = 'exon', region: str = ""
                           ) -> pd.DataFrame:
     """
     :param df: A dataframe containing the frequency of each feature in a \
     community.
-    :param outfile: The output table containing frequencies
     :param feature: The kind of feature to analyse
     :param region: only use if feature is 'gene'. Used to focus on \
     a given region in genes (can be gene, exon, intron).
     :return: A dataframe containing the frequency of every nucleotides \
     of every exon in a large community
     """
-    if outfile.is_file():
-        return pd.read_csv(outfile, sep="\t")
     size_threshold = 10 if feature == "gene" else 50
     cnx = sqlite3.connect(ConfigGraph.db_file)
     ft_id = get_ft_id(cnx, feature)
@@ -218,15 +217,165 @@ def create_ctrl_community(df: pd.DataFrame, outfile: Path,
     df_nt = df_nt.merge(df_com, how="left", on=f"id_{feature}")
     df_nt['community'] = ['C-CTRL'] * df_nt.shape[0]
     df = pd.concat([df, df_nt], axis=0, ignore_index=True)
-    df.to_csv(outfile, sep="\t", index=False)
     return df
 
 
+def lmm_with_ctrl(df: pd.DataFrame, feature: str, region: str,
+                  nt: str, outfile_diag: Path) -> pd.DataFrame:
+    """
+    Compare `nt` frequencies of each community against a control community \
+    :param df: df: A dataframe containing the frequency of each nucleotide \
+    in each exons belonging to a community.
+    :param feature: The kind of feature analysed
+    :param region: the region of interest to extract from gene
+    :param nt: The nucleotide of interest
+    :param outfile_diag: File from which the diagnostics folder will be \
+    inferred
+    :return: The dataframe with the p-value compared to the control \
+    list of exons.
+    """
+    ndf = create_ctrl_community(df, feature, region)
+    return lmm_maker_summary(ndf, outfile_diag, nt)
+
+
+def get_feature_by_community(df: pd.DataFrame, feature: str) -> Dict:
+    """
+    Create a dictionary containing the exons contained in each community.
+
+    :param df: A dataframe containing the frequency of each nucleotide \
+    in each exons belonging to a community.
+    :param feature: the feature of interest (exon, gene)
+    :return: A dictionary linking each community to the exons it contains
+
+    >>> dataf = pd.DataFrame({"id_gene": ['1', '2', '3', '4'],
+    ... 'community': ['C1', 'C1', 'C2', 'C2']})
+    >>> get_feature_by_community(dataf, 'gene')
+    {'C1': ['1', '2'], 'C2': ['3', '4']}
+    >>> dataf.rename({"id_gene": "id_exon"}, axis=1, inplace=True)
+    >>> get_feature_by_community(dataf, 'exon')
+    {'C1': ['1', '2'], 'C2': ['3', '4']}
+    """
+    dic = {}
+    for i in range(df.shape[0]):
+        com, id_ft = df.iloc[i, :][['community', f'id_{feature}']]
+
+        if com in dic:
+            dic[com].append(id_ft)
+        else:
+            dic[com] = [id_ft]
+    return dic
+
+
+def get_permutation_mean(df_ctrl: pd.DataFrame,
+                         nt: str, size: int, iteration: int) -> List[float]:
+    """
+    Randomly draw `iteration` sub-samples of `size` features from \
+    `df_ctrl` and return the mean `nt` frequency of each sub-sample.
+
+    :param df_ctrl: A dataframe containing the frequency of each nucleotide \
+    in each exons/gene in fasterdb.
+    :param nt: A nucleotide of interest
+    :param size: The size of each sub samples to create
+    :param iteration: The number of sub samples to create
+    :return: The list of mean frequencies of `nt` in each subsample
+    """
+    return [
+        float(np.mean(df_ctrl[nt].sample(size, replace=True).values))
+        for _ in range(iteration)
+    ]
+
+
+def perm_community_pval(row: pd.Series, df_ctrl: pd.DataFrame,
+                        nt: str, iteration: int) -> Tuple[float, float, str]:
+    """
+    Compute the permutation p-value and regulation of one community \
+    against `iteration` random control sub-samples of the same size.
+
+    :param row: A line of a dataframe containing the frequency of \
+    each feature inside a community.
+    :param df_ctrl: A dataframe containing the frequency of each nucleotide \
+    in each exons/gene in fasterdb.
+    :param nt: A nucleotide of interest
+    :param iteration: The number of sub samples to create
+    :return: The ctrl mean frequency value of `nt` \
+    the pvalue and the regulation of the enrichment/impoverishment \
+    of the community in `row` compared to control exons.
+    """
+    list_values = get_permutation_mean(df_ctrl, nt, row["community_size"],
+                                       iteration)
+    pval, reg = get_pvalue(np.array(list_values), row[nt], iteration)
+    return float(np.mean(list_values)), pval, reg
+
+
+def perm_pvalues(df: pd.DataFrame, df_ctrl: pd.DataFrame, feature: str,
+                 nt: str, iteration: int, dic_com: Dict) -> pd.DataFrame:
+    """
+    Compute, for every community in `df`, the permutation p-value of its \
+    mean `nt` frequency against control features, then FDR-adjust them.
+
+    :param df: A dataframe containing the frequency of each nucleotide \
+    in each exons belonging to a community.
+    :param df_ctrl: A dataframe containing the frequency of each nucleotide \
+    in each exons/gene in fasterdb.
+    :param feature: The feature of interest (gene, exon)
+    :param nt: A nucleotide of interest
+    :param iteration: The number of sub samples to create
+    :param dic_com: A dictionary linking each community to the exons \
+    it contains.
+    :return: The dataframe containing adjusted p-values and regulations \
+    indicating the enrichment of each community compared to controls
+    """
+    list_pval, list_reg, mean_ctrl = ([] for _ in range(3))
+    for i in tqdm(range(df.shape[0]), desc="performing permutations"):
+        row = df.iloc[i, :]
+        res = perm_community_pval(row,
+                                  df_ctrl.loc[
+                                    -df_ctrl[f'id_{feature}'
+                                             ].isin(dic_com[row['community']]),
+                                    :],
+                                  nt, iteration)
+        [x.append(y) for x, y in zip([mean_ctrl, list_pval, list_reg], res)]
+    adj_pvals = multipletests(list_pval, alpha=0.05,
+                              method='fdr_bh',
+                              is_sorted=False,
+                              returnsorted=False)[1]
+    adj_regs = [list_reg[i] if adj_pvals[i] <= 0.05 else " . "
+                for i in range(len(list_reg))]
+    df[f'mean_{iteration}_ctrl'] = mean_ctrl
+    df[f'p-adj'] = adj_pvals
+    df[f'reg-adj'] = adj_regs
+    return df
+
+
+def perm_with_ctrl(df: pd.DataFrame, feature: str,
+                   nt: str, df_ctrl: pd.DataFrame, dic_com: Dict,
+                   iteration: int) -> pd.DataFrame:
+    """
+    Compute permutation p-values of `nt` frequencies for each community. \
+    :param df: A dataframe containing the frequency of each nucleotide \
+    in each exons belonging to a community.
+    :param feature: The kind of feature analysed
+    :param nt: The nucleotide of interest
+    :param df_ctrl: A dataframe containing the frequency of each nucleotide \
+    in each exons/gene in fasterdb.
+    :param dic_com: A dictionary linking each community to the exons \
+    it contains.
+    :param iteration: The number of sub samples to create
+    :return: The dataframe with the p-value compared to the control \
+    list of exons.
+    """
+    mean_df = df[[nt, "community", "community_size"]]. \
+        groupby(["community", "community_size"]).mean().reset_index()
+    return perm_pvalues(mean_df, df_ctrl, feature, nt, iteration, dic_com)
+
+
 def get_stat_nt_communities(df: pd.DataFrame, project: str, weight: int,
                             global_weight: int, same_gene: bool,
-                            nt: str, feature: str = "exon",
-                            region: str = "",
-                            ) -> pd.Series:
+                            nt: str, df_ctrl: pd.DataFrame,
+                            dic_com: Dict,
+                            feature: str = "exon",
+                            region: str = "", test_type: str = "",
+                            iteration: int = 1000) -> pd.Series:
     """
     Get data (proportion of `reg` regulated exons by a splicing factor
     `sf_name` within a community) for every community.
@@ -242,17 +391,22 @@ def get_stat_nt_communities(df: pd.DataFrame, project: str, weight: int,
     :param same_gene: Say if we consider as co-localised, exons within the \
     same gene (True) or not (False) (default False)
     :param nt: The nucleotide of interest
-    :param feature: The king of feature analysed
+    :param df_ctrl: A dataframe containing the frequency of each nucleotide \
+    in each exons/gene in fasterdb.
+    :param dic_com: A dictionary linking each community to the exons \
+    it contains.
+    :param feature: The kind of feature analysed
     :param region: the region of interest to extract from gene
+    :param test_type: The type of test to make (permutation or lmm)
+    :param iteration: The number of sub samples to create
     """
-    logging.debug(f"lmm for {project}, w:{weight}, "
+    logging.debug(f"{test_type} for {project}, w:{weight}, "
                   f"g:{global_weight} nt: {nt}")
     res = {"project": [], "nt": [], 'pval': []}
-
     outfile = ConfigGraph.get_community_file(project, weight,
                                              global_weight,
                                              same_gene, feature,
-                                             f"{nt}_stat.txt",
+                                             f"{nt}_stat_{test_type}.txt",
                                              "sf_community_enrichment")
     nfolder = outfile.parent / "nt_analysis"
     nfolder.mkdir(exist_ok=True, parents=True)
@@ -261,23 +415,25 @@ def get_stat_nt_communities(df: pd.DataFrame, project: str, weight: int,
     res['project'] = project
     res['nt'] = nt
     res['pval'] = pval
-    nt_ctrl_table = noutfile.parent / noutfile.name.replace("_stat.txt",
-                                                            "_ctrl.txt")
-    print(df.head())
-    ndf = create_ctrl_community(df, nt_ctrl_table, feature, region)
-    sum_df = lmm_maker_summary(ndf, outfile, nt)
     outfile_ctrl = ConfigGraph.get_community_file(project, weight,
                                                   global_weight,
                                                   same_gene, feature,
-                                                  f"{nt}_VS_CTRL_stat.txt",
+                                                  f"{nt}_VS_CTRL_stat_"
+                                                  f"{test_type}.txt",
                                                   "sf_community_enrichment")
+    if test_type == "lmm":
+        rdf = lmm_with_ctrl(df, feature, region, nt, outfile)
+    else:
+        rdf = perm_with_ctrl(df, feature, nt, df_ctrl, dic_com, iteration)
     noutfile_ctrl = nfolder / outfile_ctrl.name
-    sum_df.to_csv(noutfile_ctrl, sep="\t", index=False)
+    rdf.to_csv(noutfile_ctrl, sep="\t", index=False)
     return pd.Series(res)
 
 
 def create_dataframe(project: str, weight: int, global_weight: int,
-                     same_gene: bool, feature: str = 'exon') -> pd.DataFrame:
+                     same_gene: bool, feature: str = 'exon',
+                     region: str = "", from_communities: bool = True
+                     ) -> pd.DataFrame:
     """
     :param project: The name of the project of interest
     :param weight: The minimum weight of interaction to consider
@@ -288,29 +444,65 @@ def create_dataframe(project: str, weight: int, global_weight: int,
     :param same_gene: Say if we consider as co-localised, exons within the \
     same gene (True) or not (False)
     :param feature: The kind of feature to analyse
+    :param from_communities: True to only keep genes/exons belonging to \
+    :param region: the region of interest to extract from gene
     :return: A dataframe containing the frequency of every nucleotides \
     of every exon in a large community
     """
     size_threshold = 10 if feature == "gene" else 50
-    result = ConfigGraph.get_community_file(project, weight, global_weight,
-                                            same_gene, feature,
-                                            "_communities.txt")
-    communities = get_communities(result, 0)
-    ncommunities = [c for c in communities if len(c) >= size_threshold]
     cnx = sqlite3.connect(ConfigGraph.db_file)
-    list_exon = reduce(lambda x, y: x + y, ncommunities)
-    df = get_nt_frequency(cnx, list_exon, feature)
-    df_com = get_community_table(communities, size_threshold, feature)
-    df = df.merge(df_com, how="left", on=f"id_{feature}")
+    if from_communities:
+        result = ConfigGraph.get_community_file(project, weight, global_weight,
+                                                same_gene, feature,
+                                                "_communities.txt")
+        communities = get_communities(result, 0)
+        ncommunities = [c for c in communities if len(c) >= size_threshold]
+        list_exon = reduce(lambda x, y: x + y, ncommunities)
+
+        df_com = get_community_table(communities, size_threshold, feature)
+        df = get_nt_frequency(cnx, list_exon, feature, region)
+        df = df.merge(df_com, how="left", on=f"id_{feature}")
+    else:
+        list_exon = get_ft_id(cnx, feature)
+        df = get_nt_frequency(cnx, list_exon, feature, region)
     return df
 
 
+def create_dataframes(project, weight, global_weight, same_gene, feature,
+                      region, test_type
+                      ) -> Tuple[pd.DataFrame, pd.DataFrame, Dict]:
+    """
+    :param weight: The weight of interaction to consider
+    :param global_weight: The global weighs to consider. if \
+    the global weight is equal to 0  then the project `project` is \
+    used.
+    :param project: The project name, used only if global_weight = 0
+    :param same_gene: Say if we consider as co-localised exon within the \
+    same gene
+    :param feature: The kind of analysed feature
+    :param region: the region of interest to extract from gene
+    :param test_type: The type of test to make (permutation or lmm)
+    """
+    df = create_dataframe(project, weight, global_weight, same_gene,
+                          feature, region)
+    if test_type == 'lmm':
+        df_ctrl = pd.DataFrame()
+        dic_com = {}
+    else:
+        df_ctrl = create_dataframe(project, weight, global_weight, same_gene,
+                                   feature, region, from_communities=False)
+        dic_com = get_feature_by_community(df, feature)
+    return df, df_ctrl, dic_com
+
+
 def multiple_nt_lmm_launcher(ps: int,
                              weight: int,
                              global_weight: int,
                              project: str,
                              same_gene: bool,
                              feature: str = 'exon', region: str = '',
+                             test_type: str = "lmm",
+                             iteration: int = 1000,
                              logging_level: str = "DISABLE"):
     """
     Launch the statistical analysis for every
@@ -325,6 +517,8 @@ def multiple_nt_lmm_launcher(ps: int,
     same gene
     :param feature: The kind of analysed feature
     :param region: the region of interest to extract from gene
+    :param test_type: The type of test to make (permutation or lmm)
+    :param iteration: The number of iteration to perform
     :param logging_level: Level of information to display
     """
     ConfigGraph.community_folder.mkdir(exist_ok=True, parents=True)
@@ -332,22 +526,21 @@ def multiple_nt_lmm_launcher(ps: int,
     logging.info("Checking if communities as an effect on nucleotide "
                  "frequency")
     project = get_projects(global_weight, project)
-    nt_list = ["A", "C", "G", "T", "S", "W"]
+    nt_list = ["A"]  # , "C", "G", "T", "S", "W"]
     condition = list(product([project], [weight], nt_list))
     processes = []
     pool = mp.Pool(processes=min(ps, len(condition)))
-    logging.debug("Calculating stats...")
+    logging.debug("Creating tables")
+    df, df_ctrl, dic_com = create_dataframes(project, weight, global_weight,
+                                             same_gene, feature, region,
+                                             test_type)
     for project, weight, nt in condition:
-        df = create_dataframe(project, weight, global_weight, same_gene,
-                              feature)
         nfile_table = ConfigGraph.get_community_file(
-                project, weight, global_weight, same_gene, feature,
-                f"_nt_table.txt", "sf_community_enrichment")
+            project, weight, global_weight, same_gene, feature,
+            f"_nt_table.txt", "sf_community_enrichment")
         df.to_csv(nfile_table, sep="\t", index=False)
-
-
-        args = [df, project, weight, global_weight, same_gene, nt, feature,
-                region]
+        args = [df, project, weight, global_weight, same_gene, nt, df_ctrl,
+                dic_com, feature, region, test_type, iteration]
         processes.append(pool.apply_async(get_stat_nt_communities, args))
     results = [p.get(timeout=None) for p in processes]
     pool.close()
-- 
GitLab