From c2cfe0701c63899f8ed35ef3cab08cbfdc84f2de Mon Sep 17 00:00:00 2001 From: Fontrodona Nicolas <nicolas.fontrodona@ens-lyon.fr> Date: Thu, 29 Oct 2020 16:52:23 +0100 Subject: [PATCH] src/find_interaction_cluster/nt_and_community.py: add a region parameter --- .../nt_and_community.py | 23 ++++++++++++++----- 1 file changed, 17 insertions(+), 6 deletions(-) diff --git a/src/find_interaction_cluster/nt_and_community.py b/src/find_interaction_cluster/nt_and_community.py index 48896d40..3f769be1 100644 --- a/src/find_interaction_cluster/nt_and_community.py +++ b/src/find_interaction_cluster/nt_and_community.py @@ -27,13 +27,14 @@ from .sf_and_communities import get_key def get_nt_frequency(cnx: sqlite3.Connection, list_ft: List[str], - feature: str) -> pd.DataFrame: + feature: str, region: str = "") -> pd.DataFrame: """ Get the frequency of every nucleotides for features in list_ft. :param cnx: Connection to chia-pet database :param list_ft: The list of exons for which we want to get :param feature: the kind of feature analysed + :param region: The region of gene analysed if feature is gene :return: the frequency of nucleotides for the list of exons. >>> d = get_nt_frequency(sqlite3.connect(ConfigGraph.db_file), @@ -49,13 +50,18 @@ def get_nt_frequency(cnx: sqlite3.Connection, list_ft: List[str], 0 1 29.49376 18.34271 18.43874 33.72479 1 2 31.90401 16.40251 18.79033 32.90315 """ + query_region = "" if feature == "gene": list_ft = [int(ft) for ft in list_ft] + if region == "": + region = "gene" + query_region = f"AND region = '{region}'" query = f""" SELECT ft, id_{feature}, frequency FROM cin_{feature}_frequency WHERE id_{feature} IN {tuple(list_ft)} AND ft_type="nt" + {query_region} """ df = pd.read_sql_query(query, cnx) df = df.pivot_table(index=f"id_{feature}", columns="ft", values="frequency")\ @@ -190,7 +196,8 @@ def get_ft_id(cnx: sqlite3.Connection, feature: str = "exon") -> List[str]: def create_ctrl_community(df: pd.DataFrame, outfile: Path, - feature: str = 'exon') -> pd.DataFrame: + feature: str = 'exon', region: str = "" + ) -> pd.DataFrame: """ :param df: A dataframe containing the frequency of each feature in a \ community. @@ -206,7 +213,7 @@ def create_ctrl_community(df: pd.DataFrame, outfile: Path, ft_id = get_ft_id(cnx, feature) list_ft = [cid for cid in ft_id if cid not in df[f"id_{feature}"].to_list()] - df_nt = get_nt_frequency(cnx, list_ft, feature) + df_nt = get_nt_frequency(cnx, list_ft, feature, region) df_com = get_community_table([list_ft], size_threshold, feature) df_nt = df_nt.merge(df_com, how="left", on=f"id_{feature}") df_nt['community'] = ['C-CTRL'] * df_nt.shape[0] @@ -218,6 +225,7 @@ def create_ctrl_community(df: pd.DataFrame, outfile: Path, def get_stat_nt_communities(df: pd.DataFrame, project: str, weight: int, global_weight: int, same_gene: bool, nt: str, feature: str = "exon", + region: str = "", ) -> pd.Series: """ Get data (proportion of `reg` regulated exons by a splicing factor @@ -235,6 +243,7 @@ def get_stat_nt_communities(df: pd.DataFrame, project: str, weight: int, same gene (True) or not (False) (default False) :param nt: The nucleotide of interest :param feature: The king of feature analysed + :param region: the region of interest to extract from gene """ logging.debug(f"lmm for {project}, w:{weight}, " f"g:{global_weight} nt: {nt}") @@ -254,7 +263,7 @@ def get_stat_nt_communities(df: pd.DataFrame, project: str, weight: int, res['pval'] = pval nt_ctrl_table = noutfile.parent / noutfile.name.replace("_stat.txt", "_ctrl.txt") - ndf = create_ctrl_community(df, nt_ctrl_table, feature) + ndf = create_ctrl_community(df, nt_ctrl_table, feature, region) sum_df = lmm_maker_summary(ndf, outfile, nt) outfile_ctrl = ConfigGraph.get_community_file(project, weight, global_weight, @@ -298,7 +307,7 @@ def create_dataframe(project: str, weight: int, global_weight: int, def multiple_nt_lmm_launcher(ps: int, weights: List[int], global_weights: List[int], same_gene: bool, - feature: str = 'exon', + feature: str = 'exon', region: str = '', logging_level: str = "DISABLE"): """ Launch the statistical analysis for every @@ -312,6 +321,7 @@ def multiple_nt_lmm_launcher(ps: int, weights: List[int], :param same_gene: Say if we consider as co-localised exon within the \ same gene :param feature: The kind of analysed feature + :param region: the region of interest to extract from gene :param logging_level: Level of information to display """ ConfigGraph.community_folder.mkdir(exist_ok=True, parents=True) @@ -342,7 +352,8 @@ def multiple_nt_lmm_launcher(ps: int, weights: List[int], True) df.to_csv(nfile_table, sep="\t", index=False) dic_df[ckey] = df - args = [df, project, weight, global_weight, same_gene, nt, feature] + args = [df, project, weight, global_weight, same_gene, nt, feature, + region] if ckey not in processes.keys(): processes[ckey] = [pool.apply_async(get_stat_nt_communities, args)] else: -- GitLab