From 93caebe123b084aa8b9056e9fe0f6ab356e7fe5e Mon Sep 17 00:00:00 2001
From: Fontrodona Nicolas <nicolas.fontrodona@ens-lyon.fr>
Date: Tue, 17 Nov 2020 15:11:55 +0100
Subject: [PATCH] src/find_interaction_cluster/radomization_test_ppi.py: add 3
 functions to check if the number communities that are enriched in common
 genes between dna and protein level is also enriched

---
 .../radomization_test_ppi.py                  | 233 +++++++++++++++++-
 1 file changed, 220 insertions(+), 13 deletions(-)

diff --git a/src/find_interaction_cluster/radomization_test_ppi.py b/src/find_interaction_cluster/radomization_test_ppi.py
index fa046d4a..4a7e2e95 100644
--- a/src/find_interaction_cluster/radomization_test_ppi.py
+++ b/src/find_interaction_cluster/radomization_test_ppi.py
@@ -90,8 +90,8 @@ def get_common_genes(dna_gene: np.array, ppi_gene: np.array) -> int:
 
 
 def get_list_common_gene(community: str, dic_dna_gene: Dict[str, np.array],
-                         ppi_size: int, ppi_gene: np.array, iteration: int
-                         ) -> np.array:
+                         ppi_size: int, ppi_gene: np.array, iteration: int,
+                         use_seed: bool = True) -> np.array:
     """
     Return the number of common gene between genes in the community \
     `community`and a sublist of randomly chosen genes in `ppi_gene` of \
@@ -103,6 +103,7 @@ def get_list_common_gene(community: str, dic_dna_gene: Dict[str, np.array],
     :param ppi_size: The size of the ppi_gene subsample
     :param ppi_gene: All the gene interacting at protein level
     :param iteration: The number of time we want to repeat the process
+    :param use_seed: True to use a fixed seed, false else.
     :return: The number of gene in common between the genes in community \
     `community` and the subsample of ppi_size
 
@@ -119,7 +120,8 @@ def get_list_common_gene(community: str, dic_dna_gene: Dict[str, np.array],
     >>> get_list_common_gene("C1", dna_gene, 10, ppi_g, 5)
     array([5, 5, 5, 5, 5])
     """
-    np.random.seed(1)
+    if use_seed:
+        np.random.seed(1)
     common = [
         get_common_genes(
             dic_dna_gene[community],
@@ -180,11 +182,24 @@ def get_pvalue(values: np.array, target: float, iteration: int
     return pval, regulation
 
 
+def adjust_regulation(x: pd.Series) -> str:
+    """
+    Return the adjusted regulation.
+
+    :param x: A row a the dataframe
+    :return: The adjusted row
+    """
+    return x["regulation"] if x["p-adjust"] <= 0.05 else " . "
+
+
 def update_overlap_df(df_overlap: pd.DataFrame,
                       dic_dna_gene: Dict[str, np.array],
-                      ppi_gene: np.array, iteration: int
-                      ) -> pd.DataFrame:
+                      ppi_gene: np.array, iteration: int,
+                      use_seed: bool = True,
+                      ) -> Tuple[pd.DataFrame, np.array]:
     """
+    Perform a randomization test to check if communities at \
+    DNA level are more similar at protein level than randomly expected.
 
     :param df_overlap: A dataframe containing every gene community at dna \
     level and the number of common gene with a gne community at protein \
@@ -193,36 +208,48 @@ def update_overlap_df(df_overlap: pd.DataFrame,
     level
     :param ppi_gene: All the gene interacting at protein level
     :param iteration: The number of time we want to repeat the process
-    :return: The updated Df (column p-value, p-adjust and regulation
+    :param use_seed: True to use a fixed seed, false else.
+    :return: The updated Df (column p-value, p-adjust and regulation) + \
+    the list of common gene shared between gene community at DNA and \
+    protein level obtained by randmization analysis
 
     >>> do = pd.DataFrame({"DNA_community": ["C1", "C2", "C3"],
     ... "community_size": [2, 3, 2], "nb_com-ppi": [2, 3, 1],
     ... "size_com-ppi": [10, 10, 10]})
     >>> dna_gene = {"C1": [1, 2], "C2": [3, 4, 5], "C3": [6, 7]}
     >>> ppi_g = list(range(11))
-    >>> update_overlap_df(do, dna_gene, ppi_g, 100).iloc[:,
-    ... [0, 1, 2, 4, 5]]
+    >>> d, di = update_overlap_df(do, dna_gene, ppi_g, 100)
+    >>> d.iloc[:, [0, 1, 2, 4, 5]]
       DNA_community  community_size  nb_com-ppi  p-value  p-adjust
     0            C1               2           2     0.79      0.79
     1            C2               3           3     0.72      0.79
     2            C3               2           1     0.18      0.54
-    >>> d = update_overlap_df(do, dna_gene, list(range(1000)), 100)
+    >>> d.iloc[:, 6].to_list() == d.iloc[:, 7].to_list() == [" . "] * 3
+    True
+    >>> d, di = update_overlap_df(do, dna_gene, list(range(1000)), 100)
     >>> d.iloc[:, [0, 1, 2, 4, 5]]
       DNA_community  community_size  nb_com-ppi  p-value  p-adjust
     0            C1               2           2     0.01      0.01
     1            C2               3           3     0.00      0.00
     2            C3               2           1     0.00      0.00
-    >>> d["regulation"].to_list() == [" + "] * 3
+    >>> d.iloc[:, 6].to_list() == d.iloc[:, 7].to_list() == [" + "] * 3
+    True
+    >>> len(di.keys()) == 3
+    True
+    >>> len(di[list(di.keys())[0]]) == 100
     True
     """
     pval_list = []
     reg_list = []
+    dic = {}
     for i in tqdm(range(df_overlap.shape[0])):
-        values = get_list_common_gene(df_overlap.iloc[i, :]["DNA_community"],
+        community = df_overlap.iloc[i, :]["DNA_community"]
+        values = get_list_common_gene(community,
                                       dic_dna_gene,
                                       df_overlap.iloc[i, :]["size_com-ppi"],
                                       ppi_gene,
-                                      iteration)
+                                      iteration, use_seed)
+        dic[community] = values
         pval, reg = get_pvalue(values, df_overlap.iloc[i, :]["nb_com-ppi"],
                                iteration)
         pval_list.append(pval)
@@ -233,7 +260,187 @@ def update_overlap_df(df_overlap: pd.DataFrame,
                                            is_sorted=False,
                                            returnsorted=False)[1]
     df_overlap["regulation"] = reg_list
-    return df_overlap
+    df_overlap["regulation-adjusted"] = \
+        df_overlap.apply(adjust_regulation, axis=1)
+    return df_overlap, dic
+
+
+def summarize_df_overlap(df_overlap: pd.DataFrame) -> pd.DataFrame:
+    """
+    Create a summary of the dataframe `df_overlap` that indicates the number \
+    of common gene shared between communities at protein and DNA level.
+
+    :param df_overlap: The dataframe containing community at DNA and \
+    protein level and their enrichment
+    :return: The dataframe showing the number of enrichment, impoverishment \
+    or no regulation
+
+    >>> do = pd.DataFrame({"DNA_community": ["C1", "C2", "C3"],
+    ... "community_size": [2, 3, 2], "nb_com-ppi": [2, 3, 1],
+    ... "size_com-ppi": [10, 10, 10], "p-value": [0.01, 0, 0],
+    ... "p-adjust": [0.01, 0.01, 0.01], "regulation": [" + "] * 3,
+    ... "regulation-adjusted": [" + "] * 3})
+    >>> summarize_df_overlap(do)
+      regulation-adjusted  community_size  nb_com-ppi  size_com-ppi  number
+    0                  +         2.333333         2.0          10.0       3
+    1                  -              NaN         NaN           NaN       0
+    2                  .              NaN         NaN           NaN       0
+    """
+    df_overlap = df_overlap[["regulation-adjusted", "community_size",
+                            "nb_com-ppi", "size_com-ppi", "p-adjust"]]
+    df_tmp = pd.DataFrame({"regulation-adjusted": [" + ", " - ", " . "],
+                           "community_size": [np.nan] * 3,
+                           "nb_com-ppi": [np.nan] * 3,
+                           "size_com-ppi": [np.nan] * 3,
+                           "p-adjust": [np.nan] * 3})
+    df = pd.concat([df_overlap, df_tmp], axis=0, ignore_index=True)
+    df = df.groupby("regulation-adjusted").agg({"community_size": "mean",
+                                                "nb_com-ppi": "mean",
+                                                "size_com-ppi": "mean",
+                                                "p-adjust": "count"})
+    return df.rename({"p-adjust": "number"}, axis=1).reset_index()
+
+
+def get_enriched_impoverished_communities(df_overlap: pd.DataFrame,
+                                          dic_dna_gene: Dict[str, np.array],
+                                          ppi_gene: np.array, iteration: int,
+                                          dic_values: Dict[str, np.array],
+                                          use_seed: bool = True
+                                         ) -> Dict[str, int]:
+    """
+    For each communities, randomly samples only once in the list \
+    of `ppi_gene` to get the number of common gene betwwen this sample \
+    and the community of gene at DNA level. Then this function determines \
+    how many communites are enriched, impovershied thanks to `di_values`.
+
+    :param df_overlap: A dataframe containing every gene community at dna \
+    level and the number of common gene with a gne community at protein \
+    level
+    :param dic_dna_gene: The dictionary containing gene interacting at dna \
+    level
+    :param ppi_gene: All the gene interacting at protein level
+    :param iteration: The number of time we want to repeat the process
+    :param dic_values: A dictionary containing the number of common \
+    gene between communities at DNA and protein level for each \
+    subsampling made from the ppi_gene list.
+    :param use_seed: True to use a fixed seed, false else.
+    :return: The number of enriched, impoverished number of \
+    genes shared betwwen gene at DNA and protein level.
+
+    >>> do = pd.DataFrame({"DNA_community": ["C1", "C2", "C3"],
+    ... "community_size": [2, 3, 2], "nb_com-ppi": [2, 3, 1],
+    ... "size_com-ppi": [10, 10, 10], "p-value": [0.01, 0, 0],
+    ... "p-adjust": [0.01, 0.01, 0.01], "regulation": [" + "] * 3,
+    ... "regulation-adjusted": [" + "] * 3})
+    >>> dna_gene = {"C1": [1, 2], "C2": [3, 4, 5], "C3": [6, 7]}
+    >>> ppi_g = list(range(11))
+    >>> dv = {"C1": [0] * 95 + [1, 1, 1, 2, 2],
+    ... "C2": [0] * 95 + [1, 1, 1, 2, 2],
+    ... "C3": [0] * 75 + [1] * 10 + [2] * 10 + [3] * 5}
+    >>> get_enriched_impoverished_communities(do, dna_gene, ppi_g, 100, dv)
+    {' + ': 2, ' . ': 1, ' - ': 0}
+    >>> dv = {"C1": [0] * 95 + [1, 1, 1, 2, 2],
+    ... "C2": [0] * 95 + [1, 1, 1, 2, 2],
+    ... "C3": [3] * 75 + [4] * 23 + [1] * 2}
+    >>> get_enriched_impoverished_communities(do, dna_gene, ppi_g, 100, dv)
+    {' + ': 2, ' . ': 0, ' - ': 1}
+    """
+    pval_list = []
+    reg_list = []
+    for i in range(df_overlap.shape[0]):
+        community = df_overlap.iloc[i, :]["DNA_community"]
+        my_value = get_list_common_gene(community,
+                                        dic_dna_gene,
+                                        df_overlap.iloc[i, :]["size_com-ppi"],
+                                        ppi_gene,
+                                        1, use_seed)[0]
+        pval, reg = get_pvalue(dic_values[community], my_value,
+                               iteration)
+        pval_list.append(pval)
+        reg_list.append(reg)
+    p_adjust = multipletests(pval_list, alpha=0.05, method='fdr_bh',
+                             is_sorted=False, returnsorted=False)[1]
+    dic = {" + ": 0, " . ": 0, " - ": 0}
+    for i, reg in enumerate(reg_list):
+        dic[reg if p_adjust[i] <= 0.05 else " . "] += 1
+    return dic
+
+
+def summary_randomisation_test(df_overlap: pd.DataFrame,
+                               dic_dna_gene: Dict[str, np.array],
+                               ppi_gene: np.array, iteration: int,
+                               dic_values: Dict[str, np.array],
+                               use_seed: bool = True
+                               ) -> pd.DataFrame:
+    """
+    Says if the number of community that shared an enriched number \
+    of common gene between the protein and DNA level is enriched as well.
+
+    :param df_overlap: A dataframe containing every gene community at dna \
+    level and the number of common gene with a gne community at protein \
+    level
+    :param dic_dna_gene: The dictionary containing gene interacting at dna \
+    level
+    :param ppi_gene: All the gene interacting at protein level
+    :param iteration: The number of time we want to repeat the process
+    :param dic_values: A dictionary containing the number of common \
+    gene between communities at DNA and protein level for each \
+    subsampling made from the ppi_gene list.
+    :param use_seed: True to use a fixed seed, false else.
+    :return: The updated summary table
+
+    >>> do = pd.DataFrame({"DNA_community": ["C1", "C2", "C3"],
+    ... "community_size": [2, 3, 2], "nb_com-ppi": [2, 3, 1],
+    ... "size_com-ppi": [10, 10, 10], "p-value": [0.01, 0, 0],
+    ... "p-adjust": [0.01, 0.01, 0.01], "regulation": [" + "] * 3,
+    ... "regulation-adjusted": [" + "] * 3})
+    >>> dna_gene = {"C1": [1, 2], "C2": [3, 4, 5], "C3": [6, 7]}
+    >>> ppi_g = list(range(11))
+    >>> dv = {"C1": [0] * 95 + [1, 1, 1, 2, 2],
+    ... "C2": [0] * 95 + [1, 1, 1, 2, 2],
+    ... "C3": [0] * 75 + [1] * 10 + [2] * 10 + [3] * 5}
+    >>> ds = summary_randomisation_test(do, dna_gene, ppi_g, 100, dv)
+    >>> ds.iloc[:, range(4)]
+      regulation-adjusted  community_size  nb_com-ppi  size_com-ppi
+    0                  +         2.333333         2.0          10.0
+    1                  -              NaN         NaN           NaN
+    2                  .              NaN         NaN           NaN
+    >>> ds.iloc[:, range(4, 7)]
+       number  mean_100_number  p-adjust
+    0       3              2.0       0.0
+    1       0              0.0       1.0
+    2       0              1.0       0.0
+    >>> ds.iloc[:, 7].to_list() == [" + ", " . ", " - "]
+    True
+    """
+    df_summarized = summarize_df_overlap(df_overlap)
+    dic_rand_sum = {k: [] for k in df_summarized["regulation-adjusted"].values}
+    for _ in tqdm(range(iteration), desc="Making stats to summarise global "
+                                         "enrichment"):
+        d = get_enriched_impoverished_communities(df_overlap, dic_dna_gene,
+                                                  ppi_gene, iteration,
+                                                  dic_values, use_seed)
+        for k, v in dic_rand_sum.items():
+            v.append(d[k])
+    pvalues = []
+    regulations = []
+    mean_number = []
+    for i in range(df_summarized.shape[0]):
+        reg, val = df_summarized.iloc[i, :][["regulation-adjusted", "number"]]
+        pval, creg = get_pvalue(dic_rand_sum[reg], val,
+                               iteration)
+        # print(reg, val, dic_rand_sum[reg])
+        pvalues.append(pval)
+        regulations.append(creg)
+        mean_number.append(np.mean(dic_rand_sum[reg]))
+    p_adjust = multipletests(pvalues, alpha=0.05, method='fdr_bh',
+                             is_sorted=False, returnsorted=False)[1]
+    reg_adj = [regulations[i] if p_adjust[i] <= 0.05 else " . "
+               for i in range(len(regulations))]
+    df_summarized[f"mean_{iteration}_number"] = mean_number
+    df_summarized[f"p-adjust"] = p_adjust
+    df_summarized[f"reg_adj"] = reg_adj
+    return df_summarized
 
 
 if __name__ == "__main__":
-- 
GitLab