src/find_interaction_cluster/nt_and_community.py: add a region parameter

c2cfe070 · nfontrod · 011bb626 · c2cfe070
Commit c2cfe070 authored 4 years ago by nfontrod
--- a/src/find_interaction_cluster/nt_and_community.py
+++ b/src/find_interaction_cluster/nt_and_community.py
@@ -27,13 +27,14 @@ from .sf_and_communities import get_key
 def get_nt_frequency(cnx: sqlite3.Connection, list_ft: List[str],
-                     feature: str) -> pd.DataFrame:
+                     feature: str, region: str = "") -> pd.DataFrame:
    """
    Get the frequency of every nucleotides for features in list_ft.
    :param cnx: Connection to chia-pet database
    :param list_ft: The list of exons for which we want to get
    :param feature: the kind of feature analysed
+    :param region: The region of gene analysed if feature is gene
    :return: the frequency of nucleotides for the list of exons.
    >>> d = get_nt_frequency(sqlite3.connect(ConfigGraph.db_file),
@@ -49,13 +50,18 @@ def get_nt_frequency(cnx: sqlite3.Connection, list_ft: List[str],
    0        1  29.49376  18.34271  18.43874  33.72479
    1        2  31.90401  16.40251  18.79033  32.90315
    """
+    query_region = ""
    if feature == "gene":
        list_ft = [int(ft) for ft in list_ft]
+        if region == "":
+            region = "gene"
+        query_region = f"AND region = '{region}'"
    query = f"""
             SELECT ft, id_{feature}, frequency
             FROM cin_{feature}_frequency
             WHERE id_{feature} IN {tuple(list_ft)}
             AND ft_type="nt" 
+             {query_region}
             """
    df = pd.read_sql_query(query, cnx)
    df = df.pivot_table(index=f"id_{feature}", columns="ft", values="frequency")\
@@ -190,7 +196,8 @@ def get_ft_id(cnx: sqlite3.Connection, feature: str = "exon") -> List[str]:
 def create_ctrl_community(df: pd.DataFrame, outfile: Path,
-                          feature: str = 'exon') -> pd.DataFrame:
+                          feature: str = 'exon', region: str = ""
+                          ) -> pd.DataFrame:
    """
    :param df: A dataframe containing the frequency of each feature in a \
    community.
@@ -206,7 +213,7 @@ def create_ctrl_community(df: pd.DataFrame, outfile: Path,
    ft_id = get_ft_id(cnx, feature)
    list_ft = [cid for cid in ft_id
               if cid not in df[f"id_{feature}"].to_list()]
-    df_nt = get_nt_frequency(cnx, list_ft, feature)
+    df_nt = get_nt_frequency(cnx, list_ft, feature, region)
    df_com = get_community_table([list_ft], size_threshold, feature)
    df_nt = df_nt.merge(df_com, how="left", on=f"id_{feature}")
    df_nt['community'] = ['C-CTRL'] * df_nt.shape[0]
@@ -218,6 +225,7 @@ def create_ctrl_community(df: pd.DataFrame, outfile: Path,
 def get_stat_nt_communities(df: pd.DataFrame, project: str, weight: int,
                            global_weight: int, same_gene: bool,
                            nt: str, feature: str = "exon",
+                            region: str = "",
                            ) -> pd.Series:
    """
    Get data (proportion of `reg` regulated exons by a splicing factor
@@ -235,6 +243,7 @@ def get_stat_nt_communities(df: pd.DataFrame, project: str, weight: int,
    same gene (True) or not (False) (default False)
    :param nt: The nucleotide of interest
    :param feature: The king of feature analysed
+    :param region: the region of interest to extract from gene
    """
    logging.debug(f"lmm for {project}, w:{weight}, "
                  f"g:{global_weight} nt: {nt}")
@@ -254,7 +263,7 @@ def get_stat_nt_communities(df: pd.DataFrame, project: str, weight: int,
    res['pval'] = pval
    nt_ctrl_table = noutfile.parent / noutfile.name.replace("_stat.txt",
                                                            "_ctrl.txt")
-    ndf = create_ctrl_community(df, nt_ctrl_table, feature)
+    ndf = create_ctrl_community(df, nt_ctrl_table, feature, region)
    sum_df = lmm_maker_summary(ndf, outfile, nt)
    outfile_ctrl = ConfigGraph.get_community_file(project, weight,
                                                  global_weight,
@@ -298,7 +307,7 @@ def create_dataframe(project: str, weight: int, global_weight: int,
 def multiple_nt_lmm_launcher(ps: int, weights: List[int],
                             global_weights: List[int],
                             same_gene: bool,
-                             feature: str = 'exon',
+                             feature: str = 'exon', region: str = '',
                             logging_level: str = "DISABLE"):
    """
    Launch the statistical analysis for every
@@ -312,6 +321,7 @@ def multiple_nt_lmm_launcher(ps: int, weights: List[int],
    :param same_gene: Say if we consider as co-localised exon within the \
    same gene
    :param feature: The kind of analysed feature
+    :param region: the region of interest to extract from gene
    :param logging_level: Level of information to display
    """
    ConfigGraph.community_folder.mkdir(exist_ok=True, parents=True)
@@ -342,7 +352,8 @@ def multiple_nt_lmm_launcher(ps: int, weights: List[int],
                                                         True)
            df.to_csv(nfile_table, sep="\t", index=False)
            dic_df[ckey] = df
-        args = [df, project, weight, global_weight, same_gene, nt, feature]
+        args = [df, project, weight, global_weight, same_gene, nt, feature,
+                region]
        if ckey not in processes.keys():
            processes[ckey] = [pool.apply_async(get_stat_nt_communities, args)]
        else: