src/find_interaction_cluster/graph_figures/graph_functions.py: add the...

src/find_interaction_cluster/graph_figures/graph_functions.py: add the possibility of selecting communities by doing a permutation test

src/find_interaction_cluster/graph_figures/graph_functions.py: add the...
src/find_interaction_cluster/graph_figures/graph_functions.py: add the possibility of selecting communities by doing a permutation test
7fea9838 · nfontrod · a99189bf · 7fea9838
Commit 7fea9838 authored 3 years ago by nfontrod
--- a/src/find_interaction_cluster/graph_figures/graph_functions.py
+++ b/src/find_interaction_cluster/graph_figures/graph_functions.py
@@ -6,7 +6,6 @@
 Description:
 """
 import sqlite3
-
 import networkx as nx
 import pandas as pd
 from .config_graph_figure import ConfigGraph, Config
@@ -20,6 +19,9 @@ import numpy as np
 from ..community_finder import lighten_color, create_figure
 from ..sf_and_communities import get_sfname
 from itertools import product
+from ..community_figures.fig_functions import get_feature_by_community, \
+    perm_with_ctrl
+import multiprocessing as mp


 class Parameters:
@@ -98,11 +100,32 @@ def check_if_exist(graph_file: Path) -> None:
            f"parameter to build it")


-def get_regulated_community(df: pd.DataFrame, threshold: float,
-                            min_reg: int = 2) -> List[str]:
+def get_regulated_community_perm(df: pd.DataFrame, iteration: int,
+                                 feature: str) -> List[str]:
+    """
+    Select, with a permutation test, the communities enriched in \
+    regulated genes/exons.
+
+    :param df: A dataframe giving, for each gene/exon, its community \
+    and whether it is regulated by a splicing factor.
+    :param iteration: The number of iterations to perform
+    :param feature: The kind of feature to use
+    :return: The communities whose "p-adj" is at most 0.05, whose \
+    "reg-adj" is " + " and whose "community_size" is at least 10
+    """
+    communities = get_feature_by_community(df, feature)
+    stats = perm_with_ctrl(df, feature, "regulation", communities, iteration)
+    significant = stats["p-adj"] <= 0.05
+    enriched = stats["reg-adj"] == " + "
+    large = stats["community_size"] >= 10
+    return stats.loc[significant & enriched & large, "community"].to_list()
+
+
+def get_regulated_community_thresholds(df: pd.DataFrame, threshold: float,
+                                       min_reg: int = 2) -> List[str]:
    """
    Get the list of community containing a significant amount of \
-    regulated genes/exons.
+    regulated genes/exons by using thresholds.

    :param df: A dataframe containing for each gene, the community \
    to which it belong and if it is regulated by a splicing factor.
@@ -110,22 +133,22 @@ def get_regulated_community(df: pd.DataFrame, threshold: float,
    select it (but it must also contains at least min_reg gene regulated)
    :param min_reg: The minimum of regulated exon in a community to \
    take it into account
-    :return: The list of colony containing a significant amount of \
+    :return: The list of community containing a significant amount of \
    genes/exons regulated by a splicing factor

    >>> d = pd.DataFrame({"id_gene": [1, 2, 3, 4, 5, 6, 7, 8, 9],
    ... "community": ["C1"] * 3 + ["C2"] * 3 + ["C3"] * 3,
    ... "community_size": [3] * 9,
    ... "regulation": [1, 0, 0, 1, 1, 0, 0, 0, 0]})
-    >>> get_regulated_community(d, 0.1, 2)
+    >>> get_regulated_community_thresholds(d, 0.1, 2)
    ['C2']
-    >>> get_regulated_community(d, 0.1, 3)
+    >>> get_regulated_community_thresholds(d, 0.1, 3)
    []
-    >>> get_regulated_community(d, 0.1, 1)
+    >>> get_regulated_community_thresholds(d, 0.1, 1)
    ['C1', 'C2']
-    >>> get_regulated_community(d, 0.5, 1)
+    >>> get_regulated_community_thresholds(d, 0.5, 1)
    ['C2']
-    >>> get_regulated_community(d, 0.0, 0)
+    >>> get_regulated_community_thresholds(d, 0.0, 0)
    ['C1', 'C2', 'C3']
    """
    ndf = df[["community", "community_size", "regulation"]]\
@@ -136,6 +159,29 @@ def get_regulated_community(df: pd.DataFrame, threshold: float,
                   "community"].to_list()


+def get_regulated_community(df: pd.DataFrame, feature: str,
+                            threshold: float, min_reg: int = 2,
+                            iteration: int = 0):
+    """
+    Select the regulated communities with the permutation test when \
+    `iteration` is at least 20, and with the thresholds otherwise.
+
+    :param df: A dataframe giving, for each gene/exon, its community \
+    and whether it is regulated by a splicing factor.
+    :param feature: The kind of feature to use (permutation test only)
+    :param threshold: Minimum frequency of regulated genes (thresholds only)
+    :param min_reg: Minimum number of regulated genes/exons in a \
+    community to take it into account (thresholds method only)
+    :param iteration: The number of iterations to perform
+    :return: The list of communities kept by the selected method
+    """
+    if iteration >= 20:
+        selected = get_regulated_community_perm(df, iteration, feature)
+    else:
+        selected = get_regulated_community_thresholds(df, threshold, min_reg)
+    return selected
+
+
 def select_gene_in_selected_communities(df: pd.DataFrame,
                                        list_communities: List[str],
                                        feature: str
@@ -232,8 +278,8 @@ def build_community_dic(list_ft: List[List], list_community: List[str],
    return dic_community


-def get_title(sf_name: str, reg: str, threshold: float, min_reg, feature: str
-              ) -> str:
+def get_title(sf_name: str, reg: str, threshold: float, min_reg, feature: str,
+              iteration: int) -> str:
    """
    Return a title

@@ -244,20 +290,25 @@ def get_title(sf_name: str, reg: str, threshold: float, min_reg, feature: str
    :param min_reg: The minimum of regulated exon in a community to \
    take it into account
    :param feature: The kind of feature we want to analyse
+    :param iteration: The number of iteration to perform
    :return: The title of the figure

-    >>> get_title("SF1", "down", 0.1, 2, "gene")
+    >>> get_title("SF1", "down", 0.1, 2, "gene", 0)
    'Figure of the communities containing at least 10.0 % of their genes \
 (or more than 2 genes) down-regulated by SF1'
    """
    reg = "regulated" if reg == "reg" else f"{reg}-regulated"
-    return f"Figure of the communities containing at least " \
-           f"{threshold * 100} % of their {feature}s (or more than " \
-           f"{min_reg} {feature}s) {reg} by {sf_name}"
+    if iteration < 20:
+        return f"Figure of the communities containing at least " \
+               f"{threshold * 100} % of their {feature}s (or more than " \
+               f"{min_reg} {feature}s) {reg} by {sf_name}"
+    return f"Figure of the communities enriched in {feature}s" \
+           f" {reg} by {sf_name} (permutation test {iteration} iteration)"


 def get_outfile(graph_file: Path, sf_name: str, reg: str, threshold: float,
-                min_reg, feature: str) -> Path:
+                min_reg, feature: str, iteration: int, num_community: int
+                ) -> Path:
    """
    Return The outfile of the figure

@@ -269,16 +320,29 @@ def get_outfile(graph_file: Path, sf_name: str, reg: str, threshold: float,
    :param min_reg: The minimum of regulated exon in a community to \
    take it into account
    :param feature: The kind of feature we want to analyse
+    :param iteration: The number of iteration to perform
+    :param num_community: The number of community in the figure
    """
-    output_dir = graph_file.parent / "graph_figures"
-    output_dir.mkdir(exist_ok=True)
-    return output_dir / f"{sf_name}_{reg}_{threshold}_{min_reg}_{feature}.html"
+    output_tmp = graph_file.parent / "graph_figures"
+    if iteration < 20:
+        output_dir = output_tmp / "graph_threshold"
+        output_dir.mkdir(exist_ok=True, parents=True)
+        return output_dir / \
+               f"{sf_name}_{reg}_{threshold}_{min_reg}_{feature}" \
+               f"_cluster-{num_community}.html"
+    else:
+        output_dir = output_tmp / "graph_permutation"
+        output_dir.mkdir(exist_ok=True, parents=True)
+        return output_dir / \
+               f"{sf_name}_{reg}_perm-{iteration}_{feature}_" \
+               f"cluster-{num_community}.html"


 def create_graph_figure(project: str, weight: int, global_weight: int,
                        same_gene: bool, inflation: float, cell_line: str,
                        feature: str, sf_name: str, reg: str, threshold: float,
-                        min_reg: int = 2):
+                        min_reg: int = 2, iteration: int = 0,
+                        min_community: int = 3):
    """
    :param project: A project name of interest. Used only if \
    global_weight is 0
@@ -299,7 +363,13 @@ def create_graph_figure(project: str, weight: int, global_weight: int,
    select it (but it must also contains at least min_reg gene regulated)
    :param min_reg: The minimum of regulated exon in a community to \
    take it into account
+    :param iteration: If this parameter is greater or equal to 20 then a \
+    permutation test is made to find the significantly enriched communities. \
+    Below 20, significant communities are found
+    :param min_community: The minimum number of enriched community \
+    required to produce a figure
    """
+    print(f"Working on {sf_name}, {reg}")
    p = Parameters(project, weight, global_weight, same_gene, inflation,
                   cell_line, feature)
    graph_file, comm_file = recover_json_graph_of_interest(p)
@@ -309,32 +379,39 @@ def create_graph_figure(project: str, weight: int, global_weight: int,
    df_com_file = pd.read_csv(comm_file, sep="\t")
    df_com = get_community_tables(df_com_file, feature)
    full_com = merge_dataframes(reg_table, df_com, feature)
-    full_com = full_com[full_com["community_size"] >= 10.0]
-    reg_ft = full_com.loc[full_com["regulation"] == 1, f"id_{feature}"].to_list()
-    list_communities = get_regulated_community(full_com, threshold, min_reg)
-    if len(list_communities) >= 3:
+    reg_ft = full_com.loc[full_com["regulation"] == 1,
+                          f"id_{feature}"].to_list()
+    full_com.loc[full_com["community_size"] < 10.0,
+                 ["community", "community_size"]] = [np.nan, np.nan]
+    list_communities = get_regulated_community(full_com, feature, threshold,
+                                               min_reg, iteration)
+    num_communities = len(list_communities)
+    if num_communities >= min_community:
        list_ft = select_gene_in_selected_communities(full_com,
                                                      list_communities,
                                                      feature)
        graph = load_graphic(graph_file)
        sub_graph = subgraph_creation(graph, list_ft)
-
        dic_community = build_community_dic(list_ft, list_communities, reg_ft)
-        my_title = get_title(sf_name, reg, threshold, min_reg, feature)
+        my_title = get_title(sf_name, reg, threshold, min_reg, feature,
+                             iteration)
        outfile = get_outfile(graph_file, sf_name, reg, threshold, min_reg,
-                              feature)
+                              feature, iteration, num_communities)
        create_figure(sub_graph, outfile, dic_community, my_title)
    else:
-        print(f"only {len(list_communities)} cluster(s) found ! "
+        print(f"only {num_communities} cluster(s) found ! "
              f"no figure created")



-def create_many_graph_figures(project: str, weight: int, global_weight: int,
-                              same_gene: bool, inflation: float, cell_line: str,
+def create_many_graph_figures(ps: int, project: str, weight: int,
+                              global_weight: int, same_gene: bool,
+                              inflation: float, cell_line: str,
                              feature: str, threshold: float,
-                              min_reg: int = 2):
+                              min_reg: int = 2, iteration: int = 0,
+                              min_community: int = 3):
    """
+    :param ps: The number of processes to create
    :param project: A project name of interest. Used only if \
    global_weight is 0
    :param weight: The weight of interaction to consider
@@ -350,14 +427,24 @@ def create_many_graph_figures(project: str, weight: int, global_weight: int,
    select it (but it must also contains at least min_reg gene regulated)
    :param min_reg: The minimum of regulated exon in a community to \
    take it into account
+    :param iteration: If this parameter is greater or equal to 20 then a \
+    permutation test is made to find the significantly enriched communities. \
+    Below 20, significant communities are found
+    :param min_community: The minimum number of enriched community \
+    required to produce a figure
    """
    sf_list = get_sfname()
-    my_prod = product(sf_list, ["down", "up"])
+    my_prod = list(product(sf_list, ["down", "up"]))
+    processes = []
+    pool = mp.Pool(processes=min(len(my_prod), ps))
    for sf_name, reg in my_prod:
-        print(f"Working on {sf_name}, {reg}")
-        create_graph_figure(project, weight, global_weight, same_gene,
-                            inflation, cell_line, feature, sf_name, reg,
-                            threshold, min_reg)
+        args = [project, weight, global_weight, same_gene, inflation,
+                cell_line, feature, sf_name, reg, threshold, min_reg,
+                iteration, min_community]
+        processes.append(pool.apply_async(create_graph_figure, args))
+    pool.close()
+    [p.get(timeout=None) for p in processes]
+    pool.join()