From c0104bd4566b8256859a4539e09010bbb2a4415c Mon Sep 17 00:00:00 2001
From: Fontrodona Nicolas <nicolas.fontrodona@ens-lyon.fr>
Date: Mon, 26 Oct 2020 15:12:42 +0100
Subject: [PATCH] src/find_interaction_cluster/*.py: add a paramter feature

---
 src/find_interaction_cluster/__main__.py      | 18 +++--
 .../community_finder.py                       | 62 ++++++++------
 src/find_interaction_cluster/config.py        |  7 +-
 .../nt_and_community.py                       | 80 ++++++++++++-------
 .../sf_and_communities.py                     | 77 +++++++++++++-----
 5 files changed, 166 insertions(+), 78 deletions(-)

diff --git a/src/find_interaction_cluster/__main__.py b/src/find_interaction_cluster/__main__.py
index 2d7271ae..71a3bcc3 100644
--- a/src/find_interaction_cluster/__main__.py
+++ b/src/find_interaction_cluster/__main__.py
@@ -14,6 +14,8 @@ from typing import List
 from .install_hipMCL import install_hipmcl
 from .sf_and_communities import multiple_stat_launcher
 from .nt_and_community import multiple_nt_lmm_launcher
+import logging
+from ..logging_conf import logging_def
 
 
 @lp.parse
@@ -21,7 +23,7 @@ def launcher(weight: List[int] = (1),
              global_weight: List[int] = (0),
              same_gene: bool = True,
              ps: int = ConfigGraph.cpu,
-             html_fig: bool = False,
+             html_fig: bool = False, feature: str = 'exon',
              logging_level: str = "DISABLE"):
     """
     Script used to find communities inside  exon co-localized within a project
@@ -35,15 +37,21 @@ def launcher(weight: List[int] = (1),
     :param same_gene: Say if we consider as co-localised, exons within the \
     same gene (True) or not (False) (default True)
     :param  html_fig: True to display the html figure (default False).
+    :param feature: The feature we want to analyse (default 'exon')
     :param logging_level: The level of data to display (default 'DISABLE')
     """
+    logging_def(ConfigGraph.community_folder, __file__, logging_level)
+    if feature == "gene" and not same_gene:
+        logging.warning(f"When level is equal to gene then same_gene must "
+                        f"be true. Setting it to True.")
+        same_gene = True
     if not ConfigGraph.get_hipmcl_prog().is_file():
         install_hipmcl("INFO")
     multiple_community_launcher(1, weight, global_weight, same_gene, html_fig,
-                                logging_level)
-    multiple_stat_launcher(ps, weight, global_weight, same_gene,
-                           logging_level)
-    multiple_nt_lmm_launcher(ps, weight, global_weight, same_gene,
+                                feature, logging_level)
+    # multiple_stat_launcher(ps, weight, global_weight, same_gene, feature,
+    #                        logging_level)
+    multiple_nt_lmm_launcher(ps, weight, global_weight, same_gene, feature,
                              logging_level)
 
 
diff --git a/src/find_interaction_cluster/community_finder.py b/src/find_interaction_cluster/community_finder.py
index 0ae68de0..b4180848 100644
--- a/src/find_interaction_cluster/community_finder.py
+++ b/src/find_interaction_cluster/community_finder.py
@@ -88,14 +88,15 @@ def write_cytoscape_graph(graph: nx.Graph, dic_community: Dict[str, str],
 
 
 def find_communities(graph: nx.Graph, project: str,
-                     outfile: Path, result_file: Path
-                     ) -> Tuple[pd.DataFrame, Dict]:
+                     outfile: Path, result_file: Path,
+                     feature: str = 'exon') -> Tuple[pd.DataFrame, Dict]:
     """
     Find the communities within the graph.
 
     :param graph: Graph of co-localized exons
     :param project: The name of the project
     :param outfile: A file containing interaction
+    :param feature: the kind of analysed feature (exons or gene)
     :return: The communities within the graph and the community \
     to wich each exon belong
     """
@@ -111,7 +112,7 @@ def find_communities(graph: nx.Graph, project: str,
     modularity = community.modularity(graph, communities, weight='X')
     d = {'community': [], 'nodes': [], 'edges': [], 'EC': [], 'HCS': [],
          '%E vs E in complete G': [],
-         'cov': [], 'modularity': [], 'exons': []}
+         'cov': [], 'modularity': [], f'{feature}s': []}
     colors = cm.hsv(np.linspace(0, 1, len(communities)))
     logging.debug('Creating community result file')
     for k, c in enumerate(communities):
@@ -139,7 +140,7 @@ def find_communities(graph: nx.Graph, project: str,
         d['EC'].append(edge_connectivity)
         d['HCS'].append(is_hc)
         d['%E vs E in complete G'].append(complete)
-        d['exons'].append(', '.join(sorted(list(c))))
+        d[f'{feature}s'].append(', '.join(sorted(list(c))))
         d['cov'].append(cov)
         d['modularity'].append(round(modularity, 5))
         if nb_nodes <= 9:
@@ -147,8 +148,8 @@ def find_communities(graph: nx.Graph, project: str,
     d['project'] = [project] * len(d['community'])
     df = pd.DataFrame(d)
     if warn > 0:
-        logging.warning(f" {warn} communities were composed by one exon or "
-                        f"less !")
+        logging.warning(f" {warn} communities were composed by one {feature} "
+                        f"or less !")
     return df, dic_community
 
 
@@ -237,7 +238,7 @@ def create_figure(graph: nx.Graph, outfile: Path, dic_community: Dict,
                         auto_open=False, validate=False)
 
 
-def get_figure_title(project, weight, global_weight, same_gene):
+def get_figure_title(project, weight, global_weight, same_gene, feature):
     """
     The title of the graphics.
 
@@ -249,27 +250,30 @@ def get_figure_title(project, weight, global_weight, same_gene):
     seen in `global_weight` project are taken into account
     :param same_gene: Say if we consider as co-localised, exons within the \
     same gene (True) or not (False) (default False)
+    :param feature: The kind of analysed features
     :return: A figure title
     """
-    title = f"Co-localisation between exons having a weight greater than " \
+    title = f"Co-localisation between {feature}s having a weight greater than " \
             f"{weight} in "
     if global_weight == 0:
         title += f"the project {project}"
     else:
         title += f" at least {global_weight} chia-pet projects"
-    if not same_gene:
-        title += ' (co-localisation within gene are NOT taken into account)'
-    else:
-        title += ' (co-localisation within gene ARE taken into account)'
+    if feature == 'exon':
+        if not same_gene:
+            title += ' (co-localisation within gene are NOT taken into ' \
+                     'account)'
+        else:
+            title += ' (co-localisation within gene ARE taken into account)'
     return title
 
 
 def write_interaction_file(arr_interaction: np.array, project: str,
                            weight: int, global_weight: int, same_gene: bool,
-                           use_weight: bool = False):
+                           use_weight: bool = False, feature: str = 'exon'):
     """
 
-    :param arr_interaction:  Each couples of co-localized exons within a \
+    :param arr_interaction:  Each couples of co-localized feature within a \
     project.
     :param project: The name of the project of interest
     :param weight: The minimum weight of interaction to consider
@@ -280,14 +284,16 @@ def write_interaction_file(arr_interaction: np.array, project: str,
     :param same_gene: Say if we consider as co-localised, exons within the \
     same gene (True) or not (False) (default False)
     :param use_weight: Say if we want to write the weight into the result file.
+    :param feature: Says if we want to work at gene or at exons level
     :return:
     """
     logging.debug('Writing interaction files ...')
     outfile = ConfigGraph.get_community_file(project, weight, global_weight,
-                                             same_gene, "_interation.txt")
+                                             same_gene, feature,
+                                             f"_interation.txt")
     result = ConfigGraph.get_community_file(project, weight, global_weight,
-                                            same_gene,
-                                            "_communities.txt")
+                                            same_gene, feature,
+                                            f"_communities.txt")
     with outfile.open('w') as f:
         for exon1, exon2, cweight in arr_interaction:
             if not use_weight:
@@ -298,7 +304,7 @@ def write_interaction_file(arr_interaction: np.array, project: str,
 
 def community_finder(weight: int, global_weight: int, project: str = "",
                      same_gene=True, html_fig: bool = False,
-                     logging_level: str = "DISABLE"):
+                     feature: str = "exon", logging_level: str = "DISABLE"):
     """
     Find communities inside co-localisation between exons found in \
     a ChIA-PET project.
@@ -313,26 +319,30 @@ def community_finder(weight: int, global_weight: int, project: str = "",
     same gene (True) or not (False) (default False)
     :param logging_level: The level of data to display (default 'DISABLE')
     :param html_fig: True to create an html figure, false else
+    :param feature: The feature we want to analyse (default 'exon')
     """
     ConfigGraph.output_folder.mkdir(exist_ok=True, parents=True)
     logging_def(ConfigGraph.output_folder, __file__, logging_level)
     cnx = sqlite3.connect(ConfigGraph.db_file)
     interaction = get_project_colocalisation(cnx, project, weight,
-                                             global_weight, same_gene, True)
+                                             global_weight, same_gene, True,
+                                             level=feature)
     outfile, result_file = write_interaction_file(interaction, project,
                                                   weight, global_weight,
-                                                  same_gene)
+                                                  same_gene, feature=feature)
     graph = create_graph(interaction)
-    df, dic_community = find_communities(graph, project, outfile, result_file)
+    df, dic_community = find_communities(graph, project, outfile, result_file,
+                                         feature)
     logging.debug('Writing results ...')
     outfiles = [ConfigGraph.get_community_file(
-        project, weight, global_weight, same_gene, ext)
-        for ext in ['.txt', '.cyjs', '.html']]
+        project, weight, global_weight, same_gene, feature, ext)
+        for ext in [f'.txt', f'.cyjs', f'.html']]
     df.to_csv(outfiles[0], sep="\t", index=False)
     logging.debug("Saving the graph ...")
     write_cytoscape_graph(graph, dic_community, outfiles[1])
     if html_fig:
-        fig_title = get_figure_title(project, weight, global_weight, same_gene)
+        fig_title = get_figure_title(project, weight, global_weight, same_gene,
+                                     feature)
         create_figure(graph, outfiles[2], dic_community, fig_title)
     logging.debug('Done !')
 
@@ -378,6 +388,7 @@ def get_projects_name(global_weights: List[int]) -> Tuple[List[str], Dict]:
 def multiple_community_launcher(ps: int, weights: List[int],
                                 global_weights: List[int],
                                 same_gene: bool, html_fig: bool = False,
+                                feature: str = 'exon',
                                 logging_level: str = "DISABLE"):
     """
     :param ps: The number of processes we want to use.
@@ -389,6 +400,7 @@ def multiple_community_launcher(ps: int, weights: List[int],
     :param same_gene: Say if we consider as co-localised exon within the \
     same gene
     :param html_fig: True to create an html figure, false else
+    :param feature: The feature we want to analyse (default 'exon')
     :param logging_level: Level of information to display
     """
     ConfigGraph.community_folder.mkdir(exist_ok=True, parents=True)
@@ -403,7 +415,7 @@ def multiple_community_launcher(ps: int, weights: List[int],
         global_weight = dic_project[project]
         logging.info(f'Finding community for project : {project}, '
                      f'global_weight : {global_weight}, weight: {weight}')
-        args = [weight, global_weight, project, same_gene, html_fig]
+        args = [weight, global_weight, project, same_gene, html_fig, feature]
         processes.append(pool.apply_async(community_finder, args))
     for proc in processes:
         proc.get(timeout=None)
diff --git a/src/find_interaction_cluster/config.py b/src/find_interaction_cluster/config.py
index 0fbafd92..4c6cbcfb 100644
--- a/src/find_interaction_cluster/config.py
+++ b/src/find_interaction_cluster/config.py
@@ -36,7 +36,8 @@ def get_weight_folder(weight: int, global_weight: int):
 
 
 def get_community_file(project: str, weight: int, global_weight: int,
-                       same_gene: bool, ext: str = ".txt", stat: bool = False):
+                       same_gene: bool, feature: str = 'exon',
+                       ext: str = ".txt", stat: bool = False):
     """
     Get the output file of interest.
 
@@ -48,6 +49,7 @@ def get_community_file(project: str, weight: int, global_weight: int,
     seen in `global_weight` project are taken into account
     :param same_gene: Say if we consider as co-localised exon within the \
     same gene
+    :param the kind of feature analyzed
     :param ext: The file extension
     :param stat: True to place the result in 'sf_community_enrichment' \
      subfolder
@@ -59,7 +61,8 @@ def get_community_file(project: str, weight: int, global_weight: int,
     folder.mkdir(exist_ok=True, parents=True)
     if global_weight != 0:
         project = f"global-weight-{global_weight}"
-    return folder / f"{project}_weight-{weight}_same_gene-{same_gene}{ext}"
+    return folder / \
+           f"{project}_weight-{weight}_same_gene-{same_gene}_{feature}{ext}"
 
 
 def get_hipmcl_folder() -> Path:
diff --git a/src/find_interaction_cluster/nt_and_community.py b/src/find_interaction_cluster/nt_and_community.py
index c9dbfceb..ec799b42 100644
--- a/src/find_interaction_cluster/nt_and_community.py
+++ b/src/find_interaction_cluster/nt_and_community.py
@@ -26,36 +26,46 @@ import multiprocessing as mp
 from .sf_and_communities import get_key
 
 
-def get_nt_frequency(cnx: sqlite3.Connection, list_exon: List[str]
-                     ) -> pd.DataFrame:
+def get_nt_frequency(cnx: sqlite3.Connection, list_ft: List[str],
+                     feature: str) -> pd.DataFrame:
     """
-    Get the frequency of every nucleotides for exons in list_exon.
+    Get the frequency of every nucleotides for features in list_ft.
 
     :param cnx: Connection to chia-pet database
-    :param list_exon: The list of exons for which we want to get \
+    :param list_ft: The list of exons for which we want to get
+    :param feature: the kind of feature analysed
     :return: the frequency of nucleotides for the list of exons.
 
     >>> d = get_nt_frequency(sqlite3.connect(ConfigGraph.db_file),
-    ... ["1_1", "1_2"])
+    ... ["1_1", "1_2"], "exon")
     >>> d[["id_exon", 'A', 'C', 'G', 'T']]
     ft id_exon         A         C         G         T
     0      1_1  16.63480  34.60803  32.12237  16.63480
     1      1_2  16.06426  26.10442  39.75904  18.07229
+    >>> d = get_nt_frequency(sqlite3.connect(ConfigGraph.db_file),
+    ... ['1', '2'], "gene")
+    >>> d[["id_gene", 'A', 'C', 'G', 'T']]
+    ft id_gene         A         C         G         T
+    0        1  29.49376  18.34271  18.43874  33.72479
+    1        2  31.90401  16.40251  18.79033  32.90315
     """
+    if feature == "gene":
+        list_ft = [int(ft) for ft in list_ft]
     query = f"""
-             SELECT ft, id_exon, frequency
-             FROM cin_exon_frequency
-             WHERE id_exon IN {tuple(list_exon)}
+             SELECT ft, id_{feature}, frequency
+             FROM cin_{feature}_frequency
+             WHERE id_{feature} IN {tuple(list_ft)}
              AND ft_type="nt" 
              """
     df = pd.read_sql_query(query, cnx)
-    df = df.pivot_table(index="id_exon", columns="ft", values="frequency")\
+    df = df.pivot_table(index=f"id_{feature}", columns="ft", values="frequency")\
         .reset_index()
+    df[f"id_{feature}"] = df[f"id_{feature}"].astype(str)
     return df
 
 
 def get_community_table(communities: List[List[str]],
-                        size_threshold: int) -> pd.DataFrame:
+                        size_threshold: int, feature: str) -> pd.DataFrame:
     """
     return the table indicating the name of the exons and the \
     the name of the community.
@@ -63,22 +73,29 @@ def get_community_table(communities: List[List[str]],
     :param communities: List of community of exons
     :param size_threshold: The required size a community must \
     have to be considered
+    :param feature: The kind of feature analysed
     :return: table of community
     >>> c = [['1_1', '2_5'], ['7_9', '4_19', '3_3']]
-    >>> get_community_table(c, 3)
+    >>> get_community_table(c, 3, 'exon')
       community id_exon  community_size
     0        C1     7_9               3
     1        C1    4_19               3
     2        C1     3_3               3
+    >>> c = [['1', '2'], ['7', '49', '3']]
+    >>> get_community_table(c, 3, 'gene')
+      community id_gene  community_size
+    0        C1       7               3
+    1        C1      49               3
+    2        C1       3               3
     """
-    dic = {"community": [], "id_exon": [], "community_size": []}
+    dic = {"community": [], f"id_{feature}": [], "community_size": []}
     for k, comm in enumerate(communities):
         if len(comm) >= size_threshold:
             name = f'C{k}'
             clen = len(comm)
             for exon in comm:
                 dic["community"].append(name)
-                dic['id_exon'].append(exon)
+                dic[f'id_{feature}'].append(exon)
                 dic["community_size"].append(clen)
     return pd.DataFrame(dic)
 
@@ -114,12 +131,13 @@ def lmm_maker(df: pd.DataFrame, outfile: Path, nt: str) -> float:
     folder = outfile.parent / "diagnostics"
     folder.mkdir(parents=True, exist_ok=True)
     partial_name = outfile.name.replace('.txt', '')
-    return lmm(df, str(folder), partial_name)["Pr(>Chisq)"][1]
+    res = lmm(df, str(folder), partial_name)
+    return res["Pr(>Chisq)"][1]
 
 
 def get_stat_nt_communities(df: pd.DataFrame, project: str, weight: int,
                             global_weight: int, same_gene: bool,
-                            nt: str
+                            nt: str, feature: str = "exon",
                             ) -> pd.Series:
     """
     Get data (proportion of `reg` regulated exons by a splicing factor
@@ -136,6 +154,7 @@ def get_stat_nt_communities(df: pd.DataFrame, project: str, weight: int,
     :param same_gene: Say if we consider as co-localised, exons within the \
     same gene (True) or not (False) (default False)
     :param nt: The nucleotide of interest
+    :param feature: The king of feature analysed
     """
     logging.debug(f"lmm for {project}, w:{weight}, "
                   f"g:{global_weight} nt: {nt}")
@@ -143,7 +162,8 @@ def get_stat_nt_communities(df: pd.DataFrame, project: str, weight: int,
 
     outfile = ConfigGraph.get_community_file(project, weight,
                                              global_weight,
-                                             same_gene, f"{nt}_stat.txt",
+                                             same_gene, feature,
+                                             f"{nt}_stat.txt",
                                              True)
     nfolder = outfile.parent / "nt_analysis"
     nfolder.mkdir(exist_ok=True, parents=True)
@@ -156,7 +176,7 @@ def get_stat_nt_communities(df: pd.DataFrame, project: str, weight: int,
 
 
 def create_dataframe(project: str, weight: int, global_weight: int,
-                     same_gene: bool) -> pd.DataFrame:
+                     same_gene: bool, feature: str = 'exon') -> pd.DataFrame:
     """
     :param project: The name of the project of interest
     :param weight: The minimum weight of interaction to consider
@@ -166,26 +186,29 @@ def create_dataframe(project: str, weight: int, global_weight: int,
     seen in `global_weight` project are taken into account
     :param same_gene: Say if we consider as co-localised, exons within the \
     same gene (True) or not (False)
+    :param feature: The kind of feature to analyse
     :return: A dataframe containing the frequency of every nucleotides \
     of every exon in a large community
     """
-    size_threshold = 50
+    size_threshold = 10 if feature == "gene" else 50
     result = ConfigGraph.get_community_file(project, weight, global_weight,
-                                            same_gene,
+                                            same_gene, feature,
                                             "_communities.txt")
     communities = get_communities(result, 0)
     ncommunities = [c for c in communities if len(c) >= size_threshold]
     cnx = sqlite3.connect(ConfigGraph.db_file)
     list_exon = reduce(lambda x, y: x + y, ncommunities)
-    df = get_nt_frequency(cnx, list_exon)
-    df_com = get_community_table(communities, size_threshold)
-    df = df.merge(df_com, how="left", on="id_exon")
+    df = get_nt_frequency(cnx, list_exon, feature)
+    df_com = get_community_table(communities, size_threshold, feature)
+    df = df.merge(df_com, how="left", on=f"id_{feature}")
     return df
 
 
 def multiple_nt_lmm_launcher(ps: int, weights: List[int],
                              global_weights: List[int],
-                             same_gene: bool, logging_level: str = "DISABLE"):
+                             same_gene: bool,
+                             feature: str = 'exon',
+                             logging_level: str = "DISABLE"):
     """
     Launch the statistical analysis for every
 
@@ -197,6 +220,7 @@ def multiple_nt_lmm_launcher(ps: int, weights: List[int],
     seen in `global_weight` project are taken into account
     :param same_gene: Say if we consider as co-localised exon within the \
     same gene
+    :param feature: The kind of analysed feature
     :param logging_level: Level of information to display
     """
     ConfigGraph.community_folder.mkdir(exist_ok=True, parents=True)
@@ -218,15 +242,16 @@ def multiple_nt_lmm_launcher(ps: int, weights: List[int],
         if ckey in dic_df:
             df = dic_df[ckey]
         else:
-            df = create_dataframe(project, weight, global_weight, same_gene)
+            df = create_dataframe(project, weight, global_weight, same_gene,
+                                  feature)
             nfile_table = ConfigGraph.get_community_file(project, weight,
                                                          global_weight,
-                                                         same_gene,
+                                                         same_gene, feature,
                                                          f"nt_table.txt",
                                                          True)
             df.to_csv(nfile_table, sep="\t", index=False)
             dic_df[ckey] = df
-        args = [df, project, weight, global_weight, same_gene, nt]
+        args = [df, project, weight, global_weight, same_gene, nt, feature]
         if ckey not in processes.keys():
             processes[ckey] = [pool.apply_async(get_stat_nt_communities, args)]
         else:
@@ -241,7 +266,8 @@ def multiple_nt_lmm_launcher(ps: int, weights: List[int],
         fdf["padj"] = multipletests(fdf['pval'].values, method='fdr_bh')[1]
         outfile = ConfigGraph.get_community_file(project, weight,
                                                  dic_project[project],
-                                                 same_gene, f"lmm-nt_stat.txt",
+                                                 same_gene, feature,
+                                                 f"lmm-nt_stat.txt",
                                                  True)
         nfolder = outfile.parent / "nt_analysis"
         noutfile = nfolder / outfile.name
diff --git a/src/find_interaction_cluster/sf_and_communities.py b/src/find_interaction_cluster/sf_and_communities.py
index 724406d3..b950c8b2 100644
--- a/src/find_interaction_cluster/sf_and_communities.py
+++ b/src/find_interaction_cluster/sf_and_communities.py
@@ -28,9 +28,9 @@ from rpy2.robjects import r, pandas2ri
 from pathlib import Path
 
 
-def get_exon_regulated_in_communities(community: List[str],
-                                      regulated_exons: List[str]
-                                      ) -> Tuple[int, int, float]:
+def get_ft_regulated_in_communities(community: List[str],
+                                    regulated_exons: List[str]
+                                    ) -> Tuple[int, int, float]:
     """
 
     :param community: strongly co-localized group of exons belonging to \
@@ -83,7 +83,7 @@ def get_stat_community_exons(cnx: sqlite3.Connection, dic: Dict[str, List],
     n_reg, n_tot = np.nan, np.nan
     for cur_com, name in [[full_com, 'All-community'], [ctrl_exons, "FASTERDB"]]:
         reg_community, tot_commmunity, prop = \
-            get_exon_regulated_in_communities(cur_com, regulated_exons)
+            get_ft_regulated_in_communities(cur_com, regulated_exons)
         if name == 'All-community':
             n_reg = reg_community
             n_tot = tot_commmunity
@@ -211,7 +211,7 @@ def glmm_maker(expanded_df: pd.DataFrame, outfile: Path) -> float:
 
 def glmm_statistics(df: pd.DataFrame, sf_name: str, reg: str,
                     project: str, weight: int, global_weight: int,
-                    same_gene: bool) -> pd.Series:
+                    same_gene: bool, feature: str = "exon") -> pd.Series:
     """
     Create the glmm statistics for a given splicing factor with \
     given communities.
@@ -227,13 +227,14 @@ def glmm_statistics(df: pd.DataFrame, sf_name: str, reg: str,
     seen in `global_weight` project are taken into account
     :param same_gene: Say if we consider as co-localised, exons within the \
     same gene (True) or not (False) (default False)
+    :param feature: The kind of feature analysed
     :return: The glmm pvalue among with other informations
     """
     ndf = df.loc[-df['community'].isin(["All-community", "FASTERDB"]),
           :].copy()
     expanded_df = expand_dataframe(ndf)
     outfile = ConfigGraph.get_community_file(project, weight, global_weight,
-                                             same_gene,
+                                             same_gene, feature,
                                              f"{sf_name}_{reg}_stat.txt", True)
     noutfold = outfile.parent / "expanded_df"
     noutfold.mkdir(exist_ok=True, parents=True)
@@ -244,9 +245,36 @@ def glmm_statistics(df: pd.DataFrame, sf_name: str, reg: str,
                       "pval": pval})
 
 
+def adapt_regulated_list(cnx: sqlite3.Connection,
+                         exon_list: List, sf_name: str, reg: str,
+                         feature: str = 'exon') -> List:
+    """
+    Transform a list containing exons into a list containing gene only if \
+    feature is equal to 'gene'.
+
+    :param cnx: connection to chia-pet database
+    :param exon_list: A list of exons
+    :param sf_name: The name of the splicing factor regulating the exons \
+    in exon_list
+    :param reg: The regulation of the exons regulated by sf_name
+    :param feature: The kind of feature to analyse
+    :return: if feature is exon then return the original list else,
+    return the list containing the regulated gene
+    """
+    if feature == 'exon':
+        return exon_list
+    dic = {"down": "up", "up": "down"}
+    regulated_dic, number = get_every_events_4_a_sl(cnx, sf_name, dic[reg])
+    exon_list2 = regulated_dic[f"{sf_name}_{dic[reg]}"]
+    genes1 = list(np.unique([exon.split('_')[0] for exon in exon_list]))
+    genes2 = list(np.unique([exon.split('_')[0] for exon in exon_list2]))
+    return [gene for gene in genes1 if gene not in genes2]
+
+
 def get_stat4communities(sf_name: str, reg: str,
                          project: str, weight: int,
-                         global_weight: int, same_gene: bool
+                         global_weight: int, same_gene: bool,
+                         feature: str = 'exon',
                          ) -> Tuple[pd.DataFrame, pd.Series]:
     """
     Get data (proportion of `reg` regulated exons by a splicing factor
@@ -262,23 +290,31 @@ def get_stat4communities(sf_name: str, reg: str,
     seen in `global_weight` project are taken into account
     :param same_gene: Say if we consider as co-localised, exons within the \
     same gene (True) or not (False) (default False)
+    :param feature: The kind of analysed feature
     """
     logging.debug(f"Working on {sf_name}-{reg}, for {project}, w:{weight}, "
                   f"g:{global_weight}")
     cnx = sqlite3.connect(ConfigGraph.db_file)
     result = ConfigGraph.get_community_file(project, weight, global_weight,
-                                            same_gene,
+                                            same_gene, feature,
                                             "_communities.txt")
     communities = get_communities(result, 0)
     regulated_dic, number = get_every_events_4_a_sl(cnx, sf_name, reg)
-    reg_exons = regulated_dic[sf_name + "_" + reg]
+    reg_ft = regulated_dic[sf_name + "_" + reg]
+    reg_ft = adapt_regulated_list(cnx, reg_ft, sf_name, reg, feature)
+    if len(reg_ft) == 0:
+        raise IndexError(f'There no {feature} {reg} regulated specifically '
+                         f'by {sf_name}')
     d = {"community": [], "reg_in_community": [], "community_size": [],
          "%reg in community": [], 'pval': []}
-    d, n2, t2 = get_stat_community_exons(cnx, d, communities, reg_exons)
+    d, n2, t2 = get_stat_community_exons(cnx, d, communities, reg_ft)
+    threshold = 49
+    if feature == "gene":
+        threshold = 9
     for k, community in enumerate(communities):
-        if len(community) > 49:
+        if len(community) > threshold:
             reg_community, tot_commmunity, prop = \
-                get_exon_regulated_in_communities(community, reg_exons)
+                get_ft_regulated_in_communities(community, reg_ft)
             d["community"].append(f"C{k + 1}")
             d['reg_in_community'].append(reg_community)
             d['community_size'].append(tot_commmunity)
@@ -292,7 +328,7 @@ def get_stat4communities(sf_name: str, reg: str,
     d['project'] = [project] * len(d["community"])
     df = pd.DataFrame(d)
     s = glmm_statistics(df, sf_name, reg, project, weight, global_weight,
-                        same_gene)
+                        same_gene, feature)
     return df, s
 
 
@@ -327,7 +363,8 @@ def get_key(project: str, weight: int) -> str:
 
 def multiple_stat_launcher(ps: int, weights: List[int],
                            global_weights: List[int],
-                           same_gene: bool, logging_level: str = "DISABLE"):
+                           same_gene: bool, feature: str = 'exon',
+                           logging_level: str = "DISABLE"):
     """
     Launch the statistical analysis for every
 
@@ -339,12 +376,13 @@ def multiple_stat_launcher(ps: int, weights: List[int],
     seen in `global_weight` project are taken into account
     :param same_gene: Say if we consider as co-localised exon within the \
     same gene
+    :param feature: The feature we want to analyse
     :param logging_level: Level of information to display
     """
     ConfigGraph.community_folder.mkdir(exist_ok=True, parents=True)
     logging_def(ConfigGraph.community_folder, __file__, logging_level)
-    logging.info("Checking if communities contains often exons regulated by a"
-                 " splicing factor")
+    logging.info(f"Checking if communities contains often {feature}s "
+                 f"regulated by a splicing factor")
     sf_list = get_sfname()
     global_weights = list(np.unique(global_weights))
     weights = list(np.unique(weights))
@@ -356,7 +394,8 @@ def multiple_stat_launcher(ps: int, weights: List[int],
     for project, weight, sf_name, reg in condition:
         ckey = get_key(project, weight)
         global_weight = dic_project[project]
-        args = [sf_name, reg, project, weight, global_weight, same_gene]
+        args = [sf_name, reg, project, weight, global_weight, same_gene,
+                feature]
         if ckey not in processes.keys():
             processes[ckey] = [pool.apply_async(get_stat4communities, args)]
         else:
@@ -371,7 +410,7 @@ def multiple_stat_launcher(ps: int, weights: List[int],
         df = pd.concat(list_df, axis=0, ignore_index=True)
         outfile = ConfigGraph.get_community_file(project, weight,
                                                  dic_project[project],
-                                                 same_gene,
+                                                 same_gene, feature,
                                                  "_stat.txt", True)
         df.to_csv(outfile, sep="\t", index=False)
         glm_df = pd.DataFrame(list_series)
@@ -379,7 +418,7 @@ def multiple_stat_launcher(ps: int, weights: List[int],
                                        method='fdr_bh')[1]
         outfile = ConfigGraph.get_community_file(project, weight,
                                                  dic_project[project],
-                                                 same_gene,
+                                                 same_gene, feature,
                                                  "_glmm_stat.txt", True)
         glm_df.to_csv(outfile, sep="\t", index=False)
 
-- 
GitLab