From c2cfe0701c63899f8ed35ef3cab08cbfdc84f2de Mon Sep 17 00:00:00 2001
From: Fontrodona Nicolas <nicolas.fontrodona@ens-lyon.fr>
Date: Thu, 29 Oct 2020 16:52:23 +0100
Subject: [PATCH] src/find_interaction_cluster/nt_and_community.py: add a
 region parameter

---
 .../nt_and_community.py                       | 23 ++++++++++++++-----
 1 file changed, 17 insertions(+), 6 deletions(-)

diff --git a/src/find_interaction_cluster/nt_and_community.py b/src/find_interaction_cluster/nt_and_community.py
index 48896d40..3f769be1 100644
--- a/src/find_interaction_cluster/nt_and_community.py
+++ b/src/find_interaction_cluster/nt_and_community.py
@@ -27,13 +27,14 @@ from .sf_and_communities import get_key
 
 
 def get_nt_frequency(cnx: sqlite3.Connection, list_ft: List[str],
-                     feature: str) -> pd.DataFrame:
+                     feature: str, region: str = "") -> pd.DataFrame:
     """
     Get the frequency of every nucleotides for features in list_ft.
 
     :param cnx: Connection to chia-pet database
     :param list_ft: The list of exons for which we want to get
     :param feature: the kind of feature analysed
+    :param region: The region of gene analysed if feature is gene
     :return: the frequency of nucleotides for the list of exons.
 
     >>> d = get_nt_frequency(sqlite3.connect(ConfigGraph.db_file),
@@ -49,13 +50,18 @@ def get_nt_frequency(cnx: sqlite3.Connection, list_ft: List[str],
     0        1  29.49376  18.34271  18.43874  33.72479
     1        2  31.90401  16.40251  18.79033  32.90315
     """
+    query_region = ""
     if feature == "gene":
         list_ft = [int(ft) for ft in list_ft]
+        if region == "":
+            region = "gene"
+        query_region = f"AND region = '{region}'"
     query = f"""
              SELECT ft, id_{feature}, frequency
              FROM cin_{feature}_frequency
              WHERE id_{feature} IN {tuple(list_ft)}
              AND ft_type="nt" 
+             {query_region}
              """
     df = pd.read_sql_query(query, cnx)
     df = df.pivot_table(index=f"id_{feature}", columns="ft", values="frequency")\
@@ -190,7 +196,8 @@ def get_ft_id(cnx: sqlite3.Connection, feature: str = "exon") -> List[str]:
 
 
 def create_ctrl_community(df: pd.DataFrame, outfile: Path,
-                          feature: str = 'exon') -> pd.DataFrame:
+                          feature: str = 'exon', region: str = ""
+                          ) -> pd.DataFrame:
     """
     :param df: A dataframe containing the frequency of each feature in a \
     community.
@@ -206,7 +213,7 @@ def create_ctrl_community(df: pd.DataFrame, outfile: Path,
     ft_id = get_ft_id(cnx, feature)
     list_ft = [cid for cid in ft_id
                if cid not in df[f"id_{feature}"].to_list()]
-    df_nt = get_nt_frequency(cnx, list_ft, feature)
+    df_nt = get_nt_frequency(cnx, list_ft, feature, region)
     df_com = get_community_table([list_ft], size_threshold, feature)
     df_nt = df_nt.merge(df_com, how="left", on=f"id_{feature}")
     df_nt['community'] = ['C-CTRL'] * df_nt.shape[0]
@@ -218,6 +225,7 @@ def create_ctrl_community(df: pd.DataFrame, outfile: Path,
 def get_stat_nt_communities(df: pd.DataFrame, project: str, weight: int,
                             global_weight: int, same_gene: bool,
                             nt: str, feature: str = "exon",
+                            region: str = "",
                             ) -> pd.Series:
     """
     Get data (proportion of `reg` regulated exons by a splicing factor
@@ -235,6 +243,7 @@ def get_stat_nt_communities(df: pd.DataFrame, project: str, weight: int,
     same gene (True) or not (False) (default False)
     :param nt: The nucleotide of interest
     :param feature: The king of feature analysed
+    :param region: the region of interest to extract from gene
     """
     logging.debug(f"lmm for {project}, w:{weight}, "
                   f"g:{global_weight} nt: {nt}")
@@ -254,7 +263,7 @@ def get_stat_nt_communities(df: pd.DataFrame, project: str, weight: int,
     res['pval'] = pval
     nt_ctrl_table = noutfile.parent / noutfile.name.replace("_stat.txt",
                                                             "_ctrl.txt")
-    ndf = create_ctrl_community(df, nt_ctrl_table, feature)
+    ndf = create_ctrl_community(df, nt_ctrl_table, feature, region)
     sum_df = lmm_maker_summary(ndf, outfile, nt)
     outfile_ctrl = ConfigGraph.get_community_file(project, weight,
                                                   global_weight,
@@ -298,7 +307,7 @@ def create_dataframe(project: str, weight: int, global_weight: int,
 def multiple_nt_lmm_launcher(ps: int, weights: List[int],
                              global_weights: List[int],
                              same_gene: bool,
-                             feature: str = 'exon',
+                             feature: str = 'exon', region: str = '',
                              logging_level: str = "DISABLE"):
     """
     Launch the statistical analysis for every
@@ -312,6 +321,7 @@ def multiple_nt_lmm_launcher(ps: int, weights: List[int],
     :param same_gene: Say if we consider as co-localised exon within the \
     same gene
     :param feature: The kind of analysed feature
+    :param region: the region of interest to extract from gene
     :param logging_level: Level of information to display
     """
     ConfigGraph.community_folder.mkdir(exist_ok=True, parents=True)
@@ -342,7 +352,8 @@ def multiple_nt_lmm_launcher(ps: int, weights: List[int],
                                                          True)
             df.to_csv(nfile_table, sep="\t", index=False)
             dic_df[ckey] = df
-        args = [df, project, weight, global_weight, same_gene, nt, feature]
+        args = [df, project, weight, global_weight, same_gene, nt, feature,
+                region]
         if ckey not in processes.keys():
             processes[ckey] = [pool.apply_async(get_stat_nt_communities, args)]
         else:
-- 
GitLab