From f05e0585a97d431272eaf2282423304268249068 Mon Sep 17 00:00:00 2001
From: Fontrodona Nicolas <nicolas.fontrodona@ens-lyon.fr>
Date: Fri, 20 Nov 2020 16:12:10 +0100
Subject: [PATCH] src/find_interaction_cluster/*.py: replace list parameters
 weights and global_weights by int parameters weight and global_weight

---
 src/find_interaction_cluster/__main__.py      | 24 +++---
 .../community_finder.py                       | 75 ++++++-------------
 .../create_ppi_files.py                       | 50 ++++++-------
 .../nt_and_community.py                       | 42 +++++------
 .../sf_and_communities.py                     | 37 +++++----
 5 files changed, 95 insertions(+), 133 deletions(-)

diff --git a/src/find_interaction_cluster/__main__.py b/src/find_interaction_cluster/__main__.py
index 7fecb019..a196eeb3 100644
--- a/src/find_interaction_cluster/__main__.py
+++ b/src/find_interaction_cluster/__main__.py
@@ -20,12 +20,13 @@ from ..logging_conf import logging_def
 
 
 @lp.parse
-def launcher(weight: List[int] = (1),
-             global_weight: List[int] = (0),
+def launcher(weight: int = 1,
+             global_weight: int = 0,
              same_gene: bool = True,
              ps: int = ConfigGraph.cpu,
              html_fig: bool = False, feature: str = 'exon', region: str = '',
              iteration_ppi: int = 1000,
+             project: str = "GSM1018963_GSM1018964",
              logging_level: str = "DISABLE"):
     """
     Script used to find communities inside  exon co-localized within a project
@@ -41,9 +42,11 @@ def launcher(weight: List[int] = (1),
     :param  html_fig: True to display the html figure (default False).
     :param feature: The feature we want to analyse (default 'exon')
     :param region: The region of a gene to analyse (used only if feature \
-    is 'gene') (default '').
+    is 'gene') (default '') (can be 'gene', 'exon', 'intron').
     :param logging_level: The level of data to display (default 'DISABLE')
     :param iteration_ppi: the number of iteration for ppi analysis
+    :param project: The project name of interest \
+    (only used if global_weight = 0).
     :param ps: The number of processes to use
     """
     logging_def(ConfigGraph.community_folder, __file__, logging_level)
@@ -53,15 +56,14 @@ def launcher(weight: List[int] = (1),
         same_gene = True
     if not ConfigGraph.get_hipmcl_prog().is_file():
         install_hipmcl("INFO")
-    multiple_community_launcher(1, weight, global_weight, same_gene, html_fig,
-                                feature, logging_level)
-    multiple_stat_launcher(ps, weight, global_weight, same_gene, feature,
-                           logging_level)
-    multiple_nt_lmm_launcher(ps, weight, global_weight, same_gene, feature,
-                             region,
-                             logging_level)
+    multiple_community_launcher(weight, global_weight, project, same_gene,
+                                html_fig, feature, logging_level)
+    multiple_stat_launcher(ps, weight, global_weight, project, same_gene,
+                           feature, logging_level)
+    multiple_nt_lmm_launcher(ps, weight, global_weight, project,
+                             same_gene, feature, region, logging_level)
     if feature == "gene":
-        ppi_stat_launcher(ps, weight, global_weight, same_gene,
+        ppi_stat_launcher(weight, global_weight, project, same_gene,
                           ConfigGraph.ppi_threshold, iteration_ppi,
                           logging_level)
 
diff --git a/src/find_interaction_cluster/community_finder.py b/src/find_interaction_cluster/community_finder.py
index b4180848..d65d5a81 100644
--- a/src/find_interaction_cluster/community_finder.py
+++ b/src/find_interaction_cluster/community_finder.py
@@ -253,8 +253,8 @@ def get_figure_title(project, weight, global_weight, same_gene, feature):
     :param feature: The kind of analysed features
     :return: A figure title
     """
-    title = f"Co-localisation between {feature}s having a weight greater than " \
-            f"{weight} in "
+    title = f"Co-localisation between {feature}s having a weight greater " \
+            f"than {weight} in "
     if global_weight == 0:
         title += f"the project {project}"
     else:
@@ -347,7 +347,7 @@ def community_finder(weight: int, global_weight: int, project: str = "",
     logging.debug('Done !')
 
 
-def get_projects(global_weight: int) -> List[str]:
+def get_projects(global_weight: int, project: str) -> str:
     """
     Get projects name.
 
@@ -355,69 +355,38 @@ def get_projects(global_weight: int) -> List[str]:
     the global weight is equal to 0 then then density figure are calculated \
     by project, else all projet are merge together and the interaction \
     seen in `global_weight` project are taken into account
-    :return: The list of the project to consider
+    :param project: The name of a project
+    :return: The project to consider
     """
     if global_weight != 0:
-        return [f'Global-weight-{global_weight}']
+        return f'Global-weight-{global_weight}'
     else:
-        return ConfigGraph.good_projects
+        return project
 
 
-def get_projects_name(global_weights: List[int]) -> Tuple[List[str], Dict]:
-    """
-    Get projects name given a list of global_weight and a dictionary linking,
-    each project name to it's corresponding global weight.
-
-    :param global_weights: The list of global weights to consider. if \
-    the global weight is equal to 0 then then density figure are calculated \
-    by project, else all projet are merge together and the interaction \
-    seen in `global_weight` project are taken into account
-    :return: project names and a dictionary linking,
-    each name to it's corresponding global weight.
-    """
-    dic = {}
-    projects = []
-    for global_weight in global_weights:
-        tmp = get_projects(global_weight)
-        projects += tmp
-        for p in tmp:
-            dic[p] = global_weight
-    return projects, dic
-
-
-def multiple_community_launcher(ps: int, weights: List[int],
-                                global_weights: List[int],
-                                same_gene: bool, html_fig: bool = False,
+def multiple_community_launcher(weight: int,
+                                global_weight: int,
+                                project: str,
+                                same_gene: bool,
+                                html_fig: bool = False,
                                 feature: str = 'exon',
                                 logging_level: str = "DISABLE"):
     """
-    :param ps: The number of processes we want to use.
-    :param weights: The list of weights of interaction to consider
-    :param global_weights: The list global weights to consider. if \
-    the global weight is equal to 0 then then density figure are calculated \
-    by project, else all projcet are merge together and the interaction \
-    seen in `global_weight` project are taken into account
+    :param weight: The weight of interaction to consider
+    :param global_weight: The global weight to consider. If \
+    the global weight is equal to 0 then the project `project` is \
+    used.
     :param same_gene: Say if we consider as co-localised exon within the \
     same gene
     :param html_fig: True to create an html figure, false else
     :param feature: The feature we want to analyse (default 'exon')
+    :param project: The project name, used only if global_weight = 0
     :param logging_level: Level of information to display
     """
     ConfigGraph.community_folder.mkdir(exist_ok=True, parents=True)
     logging_def(ConfigGraph.community_folder, __file__, logging_level)
-    global_weights = list(np.unique(global_weights))
-    weights = list(np.unique(weights))
-    projects, dic_project = get_projects_name(global_weights)
-    condition = list(product(projects, weights))
-    processes = []
-    pool = mp.Pool(processes=min(ps, len(condition)))
-    for project, weight in condition:
-        global_weight = dic_project[project]
-        logging.info(f'Finding community for project : {project}, '
-                     f'global_weight : {global_weight}, weight: {weight}')
-        args = [weight, global_weight, project, same_gene, html_fig, feature]
-        processes.append(pool.apply_async(community_finder, args))
-    for proc in processes:
-        proc.get(timeout=None)
-    pool.close()
-    pool.join()
+    project = get_projects(global_weight, project)
+    logging.info(f'Finding community for project : {project}, '
+                 f'global_weight : {global_weight}, weight: {weight}')
+    community_finder(weight, global_weight, project, same_gene, html_fig,
+                     feature)
diff --git a/src/find_interaction_cluster/create_ppi_files.py b/src/find_interaction_cluster/create_ppi_files.py
index 0c7a211d..1d401736 100644
--- a/src/find_interaction_cluster/create_ppi_files.py
+++ b/src/find_interaction_cluster/create_ppi_files.py
@@ -12,7 +12,7 @@ from typing import List
 from ..logging_conf import logging_def
 import numpy as np
 import logging
-from .community_finder import get_projects_name
+from .community_finder import get_projects
 from itertools import product
 import multiprocessing as mp
 from pathlib import Path
@@ -195,7 +195,8 @@ def filter_most_overllaping_ppi(df: pd.DataFrame, size_threshold: int
 
     :param df: The dataframe containing the number of \
     genes that clusters in community at DNA and protein level
-    :param the minimum size required to keep gene community at dna level.
+    :param size_threshold: the minimum size required to keep gene
+    community at dna level.
     :return: The dataframe with only one line per DNA community
 
     >>> test_df = pd.DataFrame({"id_gene": range(1, 12),
@@ -203,7 +204,7 @@ def filter_most_overllaping_ppi(df: pd.DataFrame, size_threshold: int
     ... "community_size": [9] * 5 + [14] * 5 + [19],
     ... "nb_com-ppi": [0, 1, 2, 2, 0, 0, 1, 0, 0, 0, 0],
     ... "size_com-ppi": [30, 50, 105, 102, 25, 30, 42, 47, 89, 12, 0]})
-    >>> filter_most_overllaping_ppi(test_df)
+    >>> filter_most_overllaping_ppi(test_df, 10)
       DNA_community  community_size  nb_com-ppi  size_com-ppi
     0            C1              10           3           105
     1            C2              15           2            42
@@ -324,7 +325,7 @@ def ppi_stats_analysis(community_file: Path, fasterdb_ppi: Path,
     df.to_csv(outstat, sep="\t", index=False)
     final_df = summary_randomisation_test(df, dic_dna_gene, ppi_gene,
                                           iteration, dic_values,
-                                          use_seed = False)
+                                          use_seed=False)
     outstat = ConfigGraph.get_community_file(project, weight, global_weight,
                                              same_gene, "gene",
                                              f"ppi_gene_table_{iteration}_"
@@ -333,8 +334,9 @@ def ppi_stats_analysis(community_file: Path, fasterdb_ppi: Path,
     final_df.to_csv(outstat, sep="\t", index=False)
 
 
-def ppi_stat_launcher(ps: int, weights: List[int],
-                      global_weights: List[int],
+def ppi_stat_launcher(weight: int,
+                      global_weight: int,
+                      project: str,
                       same_gene: bool, threshold: int = 700,
                       iteration: int = 1000,
                       logging_level: str = "DISABLE"):
@@ -342,12 +344,11 @@ def ppi_stat_launcher(ps: int, weights: List[int],
     Launch the statistical allowing to determine if interaction between \
     genes at dna level as an influence on the interactions at protein level.
 
-    :param ps: The number of processes we want to use.
-    :param weights: The list of weights of interaction to consider
-    :param global_weights: The list global weights to consider. if \
-    the global weight is equal to 0 then then density figure are calculated \
-    by project, else all projcet are merge together and the interaction \
-    seen in `global_weight` project are taken into account
+    :param weight: The weight of interaction to consider
+    :param global_weight: The global weight to consider. If \
+    the global weight is equal to 0 then the project `project` is \
+    used.
+    :param project: The project name, used only if global_weight = 0
     :param same_gene: Say if we consider as co-localised exon within the \
     same gene
     :param threshold: The minimum threshold needed to consider the interaction
@@ -358,24 +359,15 @@ def ppi_stat_launcher(ps: int, weights: List[int],
     logging_def(ConfigGraph.community_folder, __file__, logging_level)
     logging.info("Checking if gene communities at DNA level have an "
                  "influence on communities at protein level")
-    global_weights = list(np.unique(global_weights))
-    weights = list(np.unique(weights))
-    projects, dic_project = get_projects_name(global_weights)
-    condition = list(product(projects, weights))
-    pool = mp.Pool(processes=min(ps, len(condition)))
+    project = get_projects(global_weight, project)
     logging.debug("Calculating stats...")
-    processes = []
-    for project, weight in condition:
-        global_weight = dic_project[project]
-        community_file = ConfigGraph.get_community_file(project, weight,
-                                                        global_weight,
-                                                        same_gene, "gene",
-                                                        f".txt")
-        args = [community_file, ConfigPPI.fasterdb_ppi, project,
-                       weight, global_weight, same_gene, threshold, iteration]
-        processes.append(pool.apply_async(ppi_stats_analysis, args))
-        results = [p.get(timeout=None) for p in processes]
+    community_file = ConfigGraph.get_community_file(project, weight,
+                                                    global_weight,
+                                                    same_gene, "gene",
+                                                    f".txt")
+    ppi_stats_analysis(community_file, ConfigPPI.fasterdb_ppi, project,
+                       weight, global_weight, same_gene, threshold, iteration)
 
 
 if __name__ == "__main__":
-    testmod()
\ No newline at end of file
+    testmod()
diff --git a/src/find_interaction_cluster/nt_and_community.py b/src/find_interaction_cluster/nt_and_community.py
index 0b7a0140..8e7b6394 100644
--- a/src/find_interaction_cluster/nt_and_community.py
+++ b/src/find_interaction_cluster/nt_and_community.py
@@ -18,8 +18,7 @@ from functools import reduce
 from pathlib import Path
 from rpy2.robjects import r, pandas2ri
 from statsmodels.stats.multitest import multipletests
-import numpy as np
-from .community_finder import get_projects_name
+from .community_finder import get_projects
 from ..logging_conf import logging_def
 from itertools import product
 import multiprocessing as mp
@@ -64,8 +63,8 @@ def get_nt_frequency(cnx: sqlite3.Connection, list_ft: List[str],
              {query_region}
              """
     df = pd.read_sql_query(query, cnx)
-    df = df.pivot_table(index=f"id_{feature}", columns="ft", values="frequency")\
-        .reset_index()
+    df = df.pivot_table(index=f"id_{feature}", columns="ft",
+                        values="frequency").reset_index()
     df[f"id_{feature}"] = df[f"id_{feature}"].astype(str)
     return df
 
@@ -203,6 +202,8 @@ def create_ctrl_community(df: pd.DataFrame, outfile: Path,
     community.
     :param outfile: The output table containing frequencies
     :param feature: The kind of feature to analyse
+    :param region: only used if feature is 'gene'. Used to focus on \
+    a given region in genes (can be gene, exon, intron).
     :return: A dataframe containing the frequency of every nucleotides \
     of every exon in a large community
     """
@@ -304,20 +305,22 @@ def create_dataframe(project: str, weight: int, global_weight: int,
     return df
 
 
-def multiple_nt_lmm_launcher(ps: int, weights: List[int],
-                             global_weights: List[int],
+def multiple_nt_lmm_launcher(ps: int,
+                             weight: int,
+                             global_weight: int,
+                             project: str,
                              same_gene: bool,
                              feature: str = 'exon', region: str = '',
                              logging_level: str = "DISABLE"):
     """
     Launch the statistical analysis for every
 
-    :param ps: The number of processes we want to use.
-    :param weights: The list of weights of interaction to consider
-    :param global_weights: The list global weights to consider. if \
-    the global weight is equal to 0 then then density figure are calculated \
-    by project, else all projcet are merge together and the interaction \
-    seen in `global_weight` project are taken into account
+    :param ps: The number of processes to use
+    :param weight: The weight of interaction to consider
+    :param global_weight: The global weight to consider. If \
+    the global weight is equal to 0 then the project `project` is \
+    used.
+    :param project: The project name, used only if global_weight = 0
     :param same_gene: Say if we consider as co-localised exon within the \
     same gene
     :param feature: The kind of analysed feature
@@ -328,17 +331,14 @@ def multiple_nt_lmm_launcher(ps: int, weights: List[int],
     logging_def(ConfigGraph.community_folder, __file__, logging_level)
     logging.info("Checking if communities as an effect on nucleotide "
                  "frequency")
-    global_weights = list(np.unique(global_weights))
-    weights = list(np.unique(weights))
-    projects, dic_project = get_projects_name(global_weights)
+    project = get_projects(global_weight, project)
     nt_list = ["A", "C", "G", "T", "S", "W"]
-    condition = list(product(projects, weights, nt_list))
+    condition = list(product([project], [weight], nt_list))
     processes = {}
     pool = mp.Pool(processes=min(ps, len(condition)))
     logging.debug("Calculating stats...")
     dic_df = {}
     for project, weight, nt in condition:
-        global_weight = dic_project[project]
         ckey = get_key(project, weight)
         if ckey in dic_df:
             df = dic_df[ckey]
@@ -352,11 +352,11 @@ def multiple_nt_lmm_launcher(ps: int, weights: List[int],
             dic_df[ckey] = df
         args = [df, project, weight, global_weight, same_gene, nt, feature,
                 region]
-        if ckey not in processes.keys():
-            processes[ckey] = [pool.apply_async(get_stat_nt_communities, args)]
-        else:
+        if ckey in processes:
             processes[ckey].append(
                 pool.apply_async(get_stat_nt_communities, args))
+        else:
+            processes[ckey] = [pool.apply_async(get_stat_nt_communities, args)]
     for p, value in processes.items():
         project, weight = p.split("_")
         results = [p.get(timeout=None) for p in value]
@@ -365,7 +365,7 @@ def multiple_nt_lmm_launcher(ps: int, weights: List[int],
         fdf = pd.DataFrame(results)
         fdf["padj"] = multipletests(fdf['pval'].values, method='fdr_bh')[1]
         outfile = ConfigGraph.get_community_file(project, weight,
-                                                 dic_project[project],
+                                                 global_weight,
                                                  same_gene, feature,
                                                  f"lmm-nt_stat.txt",
                                                  "sf_community_enrichment")
diff --git a/src/find_interaction_cluster/sf_and_communities.py b/src/find_interaction_cluster/sf_and_communities.py
index ac965851..d81dc593 100644
--- a/src/find_interaction_cluster/sf_and_communities.py
+++ b/src/find_interaction_cluster/sf_and_communities.py
@@ -15,7 +15,7 @@ from .community_finder import get_communities
 import pandas as pd
 import numpy as np
 from itertools import product
-from .community_finder import get_projects_name
+from .community_finder import get_projects
 import multiprocessing as mp
 import logging
 from ..logging_conf import logging_def
@@ -362,19 +362,21 @@ def get_key(project: str, weight: int) -> str:
     return f"{project}_{weight}"
 
 
-def multiple_stat_launcher(ps: int, weights: List[int],
-                           global_weights: List[int],
+def multiple_stat_launcher(ps: int,
+                           weight: int,
+                           global_weight: int,
+                           project: str,
                            same_gene: bool, feature: str = 'exon',
                            logging_level: str = "DISABLE"):
     """
     Launch the statistical analysis for every
 
-    :param ps: The number of processes we want to use.
-    :param weights: The list of weights of interaction to consider
-    :param global_weights: The list global weights to consider. if \
-    the global weight is equal to 0 then then density figure are calculated \
-    by project, else all projcet are merge together and the interaction \
-    seen in `global_weight` project are taken into account
+    :param ps: The number of processes to use
+    :param weight: The weight of interaction to consider
+    :param global_weight: The global weight to consider. If \
+    the global weight is equal to 0 then the project `project` is \
+    used.
+    :param project: The project name, used only if global_weight = 0
     :param same_gene: Say if we consider as co-localised exon within the \
     same gene
     :param feature: The feature we want to analyse
@@ -385,22 +387,19 @@ def multiple_stat_launcher(ps: int, weights: List[int],
     logging.info(f"Checking if communities contains often {feature}s "
                  f"regulated by a splicing factor")
     sf_list = get_sfname()
-    global_weights = list(np.unique(global_weights))
-    weights = list(np.unique(weights))
-    projects, dic_project = get_projects_name(global_weights)
-    condition = list(product(projects, weights, sf_list, ['down', 'up']))
+    project = get_projects(global_weight, project)
+    condition = list(product([project], [weight], sf_list, ['down', 'up']))
     processes = {}
     pool = mp.Pool(processes=min(ps, len(condition)))
     logging.debug("Calculating stats...")
     for project, weight, sf_name, reg in condition:
         ckey = get_key(project, weight)
-        global_weight = dic_project[project]
         args = [sf_name, reg, project, weight, global_weight, same_gene,
                 feature]
-        if ckey not in processes.keys():
-            processes[ckey] = [pool.apply_async(get_stat4communities, args)]
-        else:
+        if ckey in processes:
             processes[ckey].append(pool.apply_async(get_stat4communities, args))
+        else:
+            processes[ckey] = [pool.apply_async(get_stat4communities, args)]
     for p, value in processes.items():
         project, weight = p.split("_")
         list_tuples = [proc.get(timeout=None) for proc in value]
@@ -410,7 +409,7 @@ def multiple_stat_launcher(ps: int, weights: List[int],
         list_series = [t[1] for t in list_tuples]
         df = pd.concat(list_df, axis=0, ignore_index=True)
         outfile = ConfigGraph.get_community_file(project, weight,
-                                                 dic_project[project],
+                                                 global_weight,
                                                  same_gene, feature,
                                                  "_stat.txt",
                                                  "sf_community_enrichment")
@@ -419,7 +418,7 @@ def multiple_stat_launcher(ps: int, weights: List[int],
         glm_df["padj"] = multipletests(glm_df['pval'].values,
                                        method='fdr_bh')[1]
         outfile = ConfigGraph.get_community_file(project, weight,
-                                                 dic_project[project],
+                                                 global_weight,
                                                  same_gene, feature,
                                                  "_glmm_stat.txt",
                                                  "sf_community_enrichment")
-- 
GitLab