Commit aacaf00f authored by nfontrod

src/find_interaction_cluster/nt_and_community.py: add permutations tests

parent 10a59086
@@ -7,12 +7,11 @@ Description: The goal of this script is to see if the nucleotide \
 frequency is not random between communities
 """
 import pandas as pd
 import logging
 import sqlite3
 from .config import ConfigGraph, get_communities
-from typing import List
+from typing import List, Tuple, Dict
 from doctest import testmod
 from functools import reduce
 from pathlib import Path
@@ -22,6 +21,9 @@ from .community_finder import get_projects
 from ..logging_conf import logging_def
 from itertools import product
 import multiprocessing as mp
+import numpy as np
+from .radomization_test_ppi import get_pvalue
+from tqdm import tqdm


 def get_nt_frequency(cnx: sqlite3.Connection, list_ft: List[str],
@@ -82,20 +84,20 @@ def get_community_table(communities: List[List[str]],
     >>> c = [['1_1', '2_5'], ['7_9', '4_19', '3_3']]
     >>> get_community_table(c, 3, 'exon')
       community id_exon  community_size
-    0        C1     7_9               3
-    1        C1    4_19               3
-    2        C1     3_3               3
+    0        C2     7_9               3
+    1        C2    4_19               3
+    2        C2     3_3               3
     >>> c = [['1', '2'], ['7', '49', '3']]
     >>> get_community_table(c, 3, 'gene')
       community id_gene  community_size
-    0        C1       7               3
-    1        C1      49               3
-    2        C1       3               3
+    0        C2       7               3
+    1        C2      49               3
+    2        C2       3               3
     """
     dic = {"community": [], f"id_{feature}": [], "community_size": []}
     for k, comm in enumerate(communities):
         if len(comm) >= size_threshold:
-            name = f'C{k}'
+            name = f'C{k + 1}'
             clen = len(comm)
             for exon in comm:
                 dic["community"].append(name)
@@ -173,7 +175,7 @@ def lmm_maker_summary(df: pd.DataFrame, outfile: Path, nt: str
     res_df.rename({'index': 'community'}, inplace=True, axis=1)
     res_df['community'] = res_df['community'].str.replace('community', '')
     res_df.loc[res_df['community'] == "(Intercept)", "community"] = "C-CTRL"
-    mean_df = df[[nt, "community", "community_size"]].\
+    mean_df = df[[nt, "community", "community_size"]]. \
         groupby(["community", "community_size"]).mean().reset_index()
     return res_df.merge(mean_df, how="left", on="community")
@@ -193,21 +195,18 @@ def get_ft_id(cnx: sqlite3.Connection, feature: str = "exon") -> List[str]:
     return [str(cid[0]) for cid in res]


-def create_ctrl_community(df: pd.DataFrame, outfile: Path,
+def create_ctrl_community(df: pd.DataFrame,
                           feature: str = 'exon', region: str = ""
                           ) -> pd.DataFrame:
     """
     :param df: A dataframe containing the frequency of each feature in a \
     community.
-    :param outfile: The output table containing frequencies
     :param feature: The kind of feature to analyse
     :param region: only use if feature is 'gene'. Used to focus on \
     a given region in genes (can be gene, exon, intron).
     :return: A dataframe containing the frequency of every nucleotides \
     of every exon in a large community
     """
-    if outfile.is_file():
-        return pd.read_csv(outfile, sep="\t")
     size_threshold = 10 if feature == "gene" else 50
     cnx = sqlite3.connect(ConfigGraph.db_file)
     ft_id = get_ft_id(cnx, feature)
@@ -218,15 +217,165 @@ def create_ctrl_community(df: pd.DataFrame, outfile: Path,
     df_nt = df_nt.merge(df_com, how="left", on=f"id_{feature}")
     df_nt['community'] = ['C-CTRL'] * df_nt.shape[0]
     df = pd.concat([df, df_nt], axis=0, ignore_index=True)
-    df.to_csv(outfile, sep="\t", index=False)
     return df


+def lmm_with_ctrl(df: pd.DataFrame, feature: str, region: str,
+                  nt: str, outfile_diag: Path) -> pd.DataFrame:
+    """
+    :param df: A dataframe containing the frequency of each nucleotide \
+    in each exon belonging to a community.
+    :param feature: The kind of feature analysed
+    :param region: the region of interest to extract from gene
+    :param nt: The nucleotide of interest
+    :param outfile_diag: File from which the diagnostics folder will be \
+    inferred
+    :return: The dataframe with the p-value compared to the control \
+    list of exons.
+    """
+    ndf = create_ctrl_community(df, feature, region)
+    return lmm_maker_summary(ndf, outfile_diag, nt)
+
+
+def get_feature_by_community(df: pd.DataFrame, feature: str) -> Dict:
+    """
+    Create a dictionary containing the exons contained in each community.
+
+    :param df: A dataframe containing the frequency of each nucleotide \
+    in each exon belonging to a community.
+    :param feature: the feature of interest (exon, gene)
+    :return: A dictionary linking each community to the exons it contains
+
+    >>> dataf = pd.DataFrame({"id_gene": ['1', '2', '3', '4'],
+    ...                       'community': ['C1', 'C1', 'C2', 'C2']})
+    >>> get_feature_by_community(dataf, 'gene')
+    {'C1': ['1', '2'], 'C2': ['3', '4']}
+    >>> dataf.rename({"id_gene": "id_exon"}, axis=1, inplace=True)
+    >>> get_feature_by_community(dataf, 'exon')
+    {'C1': ['1', '2'], 'C2': ['3', '4']}
+    """
+    dic = {}
+    for i in range(df.shape[0]):
+        com, id_ft = df.iloc[i, :][['community', f'id_{feature}']]
+        if com in dic:
+            dic[com].append(id_ft)
+        else:
+            dic[com] = [id_ft]
+    return dic
+
+
+def get_permutation_mean(df_ctrl: pd.DataFrame,
+                         nt: str, size: int, iteration: int) -> List[float]:
+    """
+    Randomly sample `size` features from `df_ctrl` `iteration` times and \
+    return the mean `nt` frequency of each sample.
+
+    :param df_ctrl: A dataframe containing the frequency of each nucleotide \
+    in each exon/gene in fasterdb.
+    :param nt: A nucleotide of interest
+    :param size: The size of each sub-sample to create
+    :param iteration: The number of sub-samples to create
+    :return: The list of mean frequencies of `nt` in each sub-sample
+    """
+    return [
+        float(np.mean(df_ctrl[nt].sample(size, replace=True).values))
+        for _ in range(iteration)
+    ]
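Not part of this commit: a minimal usage sketch of get_permutation_mean on a made-up control table (the column names, sizes and seed below are assumptions chosen for illustration only):

    import numpy as np
    import pandas as pd

    np.random.seed(0)
    # Hypothetical control table: one row per exon with its A frequency (%).
    df_ctrl = pd.DataFrame({"id_exon": [f"{i}_1" for i in range(200)],
                            "A": np.random.uniform(15, 35, 200)})
    # 1000 means of random samples of 50 exons: the null distribution used
    # to judge a community of size 50.
    null_means = get_permutation_mean(df_ctrl, "A", size=50, iteration=1000)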
+
+
+def perm_community_pval(row: pd.Series, df_ctrl: pd.DataFrame,
+                        nt: str, iteration: int) -> Tuple[float, float, str]:
+    """
+    Compare the mean `nt` frequency of one community to `iteration` \
+    random sub-samples of the same size drawn from `df_ctrl`.
+
+    :param row: A line of a dataframe containing the frequency of \
+    each feature inside a community.
+    :param df_ctrl: A dataframe containing the frequency of each nucleotide \
+    in each exon/gene in fasterdb.
+    :param nt: A nucleotide of interest
+    :param iteration: The number of sub-samples to create
+    :return: The ctrl mean frequency value of `nt`, \
+    the pvalue and the regulation of the enrichment/impoverishment \
+    of the community in `row` compared to control exons.
+    """
+    list_values = get_permutation_mean(df_ctrl, nt, row["community_size"],
+                                       iteration)
+    pval, reg = get_pvalue(np.array(list_values), row[nt], iteration)
+    return float(np.mean(list_values)), pval, reg
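Not part of this commit: get_pvalue comes from the radomization_test_ppi module and its body is not shown in this diff; the sketch below is only a guess at the kind of empirical, direction-aware p-value such a helper returns (the function name and the " + "/" - " labels are assumptions):

    from typing import Tuple
    import numpy as np

    def empirical_pvalue(null_values: np.ndarray, observed: float,
                         iteration: int) -> Tuple[float, str]:
        # Count how many permutation means are at least as extreme as the
        # observed community mean, on each side of the null distribution.
        greater = int(np.sum(null_values >= observed))
        lower = int(np.sum(null_values <= observed))
        if greater < lower:
            # The observed mean sits in the upper tail: enrichment.
            return greater / iteration, " + "
        # Otherwise the observed mean sits in the lower tail: impoverishment.
        return lower / iteration, " - "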
+
+
+def perm_pvalues(df: pd.DataFrame, df_ctrl: pd.DataFrame, feature: str,
+                 nt: str, iteration: int, dic_com: Dict) -> pd.DataFrame:
+    """
+    Compute a permutation p-value for every community in `df` and adjust \
+    the p-values for multiple testing.
+
+    :param df: A dataframe containing the frequency of each nucleotide \
+    in each exon belonging to a community.
+    :param df_ctrl: A dataframe containing the frequency of each nucleotide \
+    in each exon/gene in fasterdb.
+    :param feature: The feature of interest (gene, exon)
+    :param nt: A nucleotide of interest
+    :param iteration: The number of sub-samples to create
+    :param dic_com: A dictionary linking each community to the exons \
+    it contains.
+    :return: The dataframe with the adjusted p-values and the regulation \
+    indicating the enrichment or impoverishment of each community \
+    compared to control features.
+    """
+    list_pval, list_reg, mean_ctrl = ([] for _ in range(3))
+    for i in tqdm(range(df.shape[0]), desc="performing permutations"):
+        row = df.iloc[i, :]
+        res = perm_community_pval(row,
+                                  df_ctrl.loc[
+                                      ~df_ctrl[f'id_{feature}'].isin(
+                                          dic_com[row['community']]), :],
+                                  nt, iteration)
+        [x.append(y) for x, y in zip([mean_ctrl, list_pval, list_reg], res)]
+    adj_pvals = multipletests(list_pval, alpha=0.05,
+                              method='fdr_bh',
+                              is_sorted=False,
+                              returnsorted=False)[1]
+    adj_regs = [list_reg[i] if adj_pvals[i] <= 0.05 else " . "
+                for i in range(len(list_reg))]
+    df[f'mean_{iteration}_ctrl'] = mean_ctrl
+    df['p-adj'] = adj_pvals
+    df['reg-adj'] = adj_regs
+    return df
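multipletests is the Benjamini-Hochberg correction from statsmodels.stats.multitest (its import sits outside the hunks shown here); element [1] of the tuple it returns holds the adjusted p-values, as in this small example:

    from statsmodels.stats.multitest import multipletests

    raw_pvals = [0.001, 0.02, 0.04, 0.30]
    adj = multipletests(raw_pvals, alpha=0.05, method='fdr_bh')[1]
    # Each adjusted value is >= its raw p-value; a community is then flagged
    # as regulated only when the adjusted value stays <= 0.05.
    print(adj)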
+
+
+def perm_with_ctrl(df: pd.DataFrame, feature: str,
+                   nt: str, df_ctrl: pd.DataFrame, dic_com: Dict,
+                   iteration: int) -> pd.DataFrame:
+    """
+    :param df: A dataframe containing the frequency of each nucleotide \
+    in each exon belonging to a community.
+    :param feature: The kind of feature analysed
+    :param nt: The nucleotide of interest
+    :param df_ctrl: A dataframe containing the frequency of each nucleotide \
+    in each exon/gene in fasterdb.
+    :param dic_com: A dictionary linking each community to the exons \
+    it contains.
+    :param iteration: The number of sub-samples to create
+    :return: The dataframe with the p-value compared to the control \
+    list of exons.
+    """
+    mean_df = df[[nt, "community", "community_size"]]. \
+        groupby(["community", "community_size"]).mean().reset_index()
+    return perm_pvalues(mean_df, df_ctrl, feature, nt, iteration, dic_com)
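Not part of this commit: a toy run of the whole permutation path (community labels, sizes and frequencies are invented; running it requires the helpers above plus the project's get_pvalue and the multipletests import):

    import numpy as np
    import pandas as pd

    np.random.seed(1)
    # Per-exon A frequencies annotated with an invented community label.
    df = pd.DataFrame({"id_exon": [f"{i}_1" for i in range(60)],
                       "A": np.r_[np.random.uniform(30, 40, 30),
                                  np.random.uniform(15, 25, 30)],
                       "community": ["C1"] * 30 + ["C2"] * 30,
                       "community_size": [30] * 60})
    # Control table standing in for every exon of the database.
    df_ctrl = pd.DataFrame({"id_exon": [f"{i}_1" for i in range(2000)],
                            "A": np.random.uniform(15, 35, 2000)})
    dic_com = get_feature_by_community(df, "exon")
    res = perm_with_ctrl(df, "exon", "A", df_ctrl, dic_com, iteration=1000)
    # res has one row per community with mean_1000_ctrl, p-adj and reg-adj.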
+
+
 def get_stat_nt_communities(df: pd.DataFrame, project: str, weight: int,
                             global_weight: int, same_gene: bool,
-                            nt: str, feature: str = "exon",
-                            region: str = "",
-                            ) -> pd.Series:
+                            nt: str, df_ctrl: pd.DataFrame,
+                            dic_com: Dict,
+                            feature: str = "exon",
+                            region: str = "", test_type: str = "",
+                            iteration: int = 1000) -> pd.Series:
     """
     Get data (proportion of `reg` regulated exons by a splicing factor
     `sf_name` within a community) for every community.
@@ -242,17 +391,22 @@ def get_stat_nt_communities(df: pd.DataFrame, project: str, weight: int,
     :param same_gene: Say if we consider as co-localised, exons within the \
     same gene (True) or not (False) (default False)
     :param nt: The nucleotide of interest
-    :param feature: The king of feature analysed
+    :param df_ctrl: A dataframe containing the frequency of each nucleotide \
+    in each exon/gene in fasterdb.
+    :param dic_com: A dictionary linking each community to the exons \
+    it contains.
+    :param feature: The kind of feature analysed
     :param region: the region of interest to extract from gene
+    :param test_type: The type of test to make (permutation or lmm)
+    :param iteration: The number of sub-samples to create
     """
-    logging.debug(f"lmm for {project}, w:{weight}, "
+    logging.debug(f"{test_type} for {project}, w:{weight}, "
                   f"g:{global_weight} nt: {nt}")
     res = {"project": [], "nt": [], 'pval': []}
     outfile = ConfigGraph.get_community_file(project, weight,
                                              global_weight,
                                              same_gene, feature,
-                                             f"{nt}_stat.txt",
+                                             f"{nt}_stat_{test_type}.txt",
                                              "sf_community_enrichment")
     nfolder = outfile.parent / "nt_analysis"
     nfolder.mkdir(exist_ok=True, parents=True)
@@ -261,23 +415,25 @@ def get_stat_nt_communities(df: pd.DataFrame, project: str, weight: int,
     res['project'] = project
     res['nt'] = nt
     res['pval'] = pval
-    nt_ctrl_table = noutfile.parent / noutfile.name.replace("_stat.txt",
-                                                            "_ctrl.txt")
-    print(df.head())
-    ndf = create_ctrl_community(df, nt_ctrl_table, feature, region)
-    sum_df = lmm_maker_summary(ndf, outfile, nt)
     outfile_ctrl = ConfigGraph.get_community_file(project, weight,
                                                   global_weight,
                                                   same_gene, feature,
-                                                  f"{nt}_VS_CTRL_stat.txt",
+                                                  f"{nt}_VS_CTRL_stat_"
+                                                  f"{test_type}.txt",
                                                   "sf_community_enrichment")
+    if test_type == "lmm":
+        rdf = lmm_with_ctrl(df, feature, region, nt, outfile)
+    else:
+        rdf = perm_with_ctrl(df, feature, nt, df_ctrl, dic_com, iteration)
     noutfile_ctrl = nfolder / outfile_ctrl.name
-    sum_df.to_csv(noutfile_ctrl, sep="\t", index=False)
+    rdf.to_csv(noutfile_ctrl, sep="\t", index=False)
     return pd.Series(res)


 def create_dataframe(project: str, weight: int, global_weight: int,
-                     same_gene: bool, feature: str = 'exon') -> pd.DataFrame:
+                     same_gene: bool, feature: str = 'exon',
+                     region: str = "", from_communities: bool = True
+                     ) -> pd.DataFrame:
     """
     :param project: The name of the project of interest
     :param weight: The minimum weight of interaction to consider
@@ -288,29 +444,65 @@ def create_dataframe(project: str, weight: int, global_weight: int,
     :param same_gene: Say if we consider as co-localised, exons within the \
     same gene (True) or not (False)
     :param feature: The kind of feature to analyse
+    :param from_communities: True to only keep genes/exons that belong to \
+    a community, False to recover every gene/exon in the database
+    :param region: the region of interest to extract from gene
     :return: A dataframe containing the frequency of every nucleotides \
     of every exon in a large community
     """
     size_threshold = 10 if feature == "gene" else 50
-    result = ConfigGraph.get_community_file(project, weight, global_weight,
-                                            same_gene, feature,
-                                            "_communities.txt")
-    communities = get_communities(result, 0)
-    ncommunities = [c for c in communities if len(c) >= size_threshold]
     cnx = sqlite3.connect(ConfigGraph.db_file)
-    list_exon = reduce(lambda x, y: x + y, ncommunities)
-    df = get_nt_frequency(cnx, list_exon, feature)
-    df_com = get_community_table(communities, size_threshold, feature)
-    df = df.merge(df_com, how="left", on=f"id_{feature}")
+    if from_communities:
+        result = ConfigGraph.get_community_file(project, weight, global_weight,
+                                                same_gene, feature,
+                                                "_communities.txt")
+        communities = get_communities(result, 0)
+        ncommunities = [c for c in communities if len(c) >= size_threshold]
+        list_exon = reduce(lambda x, y: x + y, ncommunities)
+        df_com = get_community_table(communities, size_threshold, feature)
+        df = get_nt_frequency(cnx, list_exon, feature, region)
+        df = df.merge(df_com, how="left", on=f"id_{feature}")
+    else:
+        list_exon = get_ft_id(cnx, feature)
+        df = get_nt_frequency(cnx, list_exon, feature, region)
     return df
+
+
+def create_dataframes(project, weight, global_weight, same_gene, feature,
+                      region, test_type
+                      ) -> Tuple[pd.DataFrame, pd.DataFrame, Dict]:
+    """
+    :param weight: The weight of interaction to consider
+    :param global_weight: The global weight to consider. If \
+    the global weight is equal to 0 then the project `project` is \
+    used.
+    :param project: The project name, used only if global_weight = 0
+    :param same_gene: Say if we consider as co-localised exons within the \
+    same gene
+    :param feature: The kind of analysed feature
+    :param region: the region of interest to extract from gene
+    :param test_type: The type of test to make (permutation or lmm)
+    """
+    df = create_dataframe(project, weight, global_weight, same_gene,
+                          feature, region)
+    if test_type == 'lmm':
+        df_ctrl = pd.DataFrame()
+        dic_com = {}
+    else:
+        df_ctrl = create_dataframe(project, weight, global_weight, same_gene,
+                                   feature, region, from_communities=False)
+        dic_com = get_feature_by_community(df, feature)
+    return df, df_ctrl, dic_com
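Not part of this commit: a sketch of how the launcher below consumes this helper; the parameter values are placeholders, the real ones come from the pipeline configuration:

    df, df_ctrl, dic_com = create_dataframes(project="example_project",
                                             weight=3, global_weight=0,
                                             same_gene=True, feature="exon",
                                             region="",
                                             test_type="permutation")
    # With test_type="lmm" the control table and the community dictionary
    # stay empty, since the mixed model does not need them.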
+
+
 def multiple_nt_lmm_launcher(ps: int,
                              weight: int,
                              global_weight: int,
                              project: str,
                              same_gene: bool,
                              feature: str = 'exon', region: str = '',
+                             test_type: str = "lmm",
+                             iteration: int = 1000,
                              logging_level: str = "DISABLE"):
     """
     Launch the statistical analysis for every
@@ -325,6 +517,8 @@ def multiple_nt_lmm_launcher(ps: int,
     same gene
     :param feature: The kind of analysed feature
     :param region: the region of interest to extract from gene
+    :param test_type: The type of test to make (permutation or lmm)
+    :param iteration: The number of iterations to perform
     :param logging_level: Level of information to display
     """
     ConfigGraph.community_folder.mkdir(exist_ok=True, parents=True)
@@ -332,22 +526,21 @@
     logging.info("Checking if communities as an effect on nucleotide "
                  "frequency")
     project = get_projects(global_weight, project)
-    nt_list = ["A", "C", "G", "T", "S", "W"]
+    nt_list = ["A"]  # , "C", "G", "T", "S", "W"]
     condition = list(product([project], [weight], nt_list))
     processes = []
     pool = mp.Pool(processes=min(ps, len(condition)))
-    logging.debug("Calculating stats...")
+    logging.debug("Creating tables")
+    df, df_ctrl, dic_com = create_dataframes(project, weight, global_weight,
+                                             same_gene, feature, region,
+                                             test_type)
     for project, weight, nt in condition:
-        df = create_dataframe(project, weight, global_weight, same_gene,
-                              feature)
         nfile_table = ConfigGraph.get_community_file(
             project, weight, global_weight, same_gene, feature,
             f"_nt_table.txt", "sf_community_enrichment")
         df.to_csv(nfile_table, sep="\t", index=False)
-        args = [df, project, weight, global_weight, same_gene, nt, feature,
-                region]
+        args = [df, project, weight, global_weight, same_gene, nt, df_ctrl,
+                dic_com, feature, region, test_type, iteration]
         processes.append(pool.apply_async(get_stat_nt_communities, args))
     results = [p.get(timeout=None) for p in processes]
     pool.close()