diff --git a/src/find_interaction_cluster/clip_figures/clip_compo_analyser.py b/src/find_interaction_cluster/clip_figures/clip_compo_analyser.py index 9f4fd7792ef3c7745dba45407e9e2d3ecb1d1946..26f71d3ab7640e47a108c3a2bbff23bb81d95ab0 100644 --- a/src/find_interaction_cluster/clip_figures/clip_compo_analyser.py +++ b/src/find_interaction_cluster/clip_figures/clip_compo_analyser.py @@ -72,6 +72,10 @@ def prepare_df(clip_df: pd.DataFrame, feature: str) -> pd.DataFrame: True """ df = clip_df.copy() + if "regulation" in df.columns: + df.rename({"regulation": "peak_density", + "mean_regulation": "mean_peak_density"}, axis=1, + inplace=True) df = df.loc[df[f"id_{feature}"] != "ctrl", ["peak_density", "community", "community_size", "p-adj", "reg-adj", f"id_{feature}"]] @@ -402,7 +406,8 @@ def create_figure_4_clip_table(clip_table_file: Path, feature: str, df_comp: pd.DataFrame, cpnt_type: str, cpnt: str, df_id_ctrl: pd.DataFrame, write_comp_table: bool, - display_size: bool) -> None: + display_size: bool, expression_file: str + ) -> None: """ Create a component figure for a clip file @@ -421,6 +426,7 @@ def create_figure_4_clip_table(clip_table_file: Path, feature: str, :param write_comp_table: True to save comp table, false else :param display_size: True to display the size of the community . False to display nothing. 
def filter_composition_dataframe(df_comp: pd.DataFrame, feature: str,
                                 expression_file: str,
                                 min_base_mean: float = 10.
                                 ) -> pd.DataFrame:
    """
    Remove genes/exons with weak expression if a file `expression_file` is
    given. This file contains the expression of genes.

    The dataframe given in input is never modified: a filtered copy is
    returned when the filter is applied.

    :param df_comp: A dataframe containing the composition of each
    feature of interest. It must contain the column ``id_{feature}``
    (``id_gene`` or ``id_exon``). Exon ids are expected to look like
    ``<gene_id>_<exon_position>`` with an integer gene id.
    :param feature: The feature of interest ('gene' or 'exon')
    :param expression_file: A tabulated file containing the expression of
    genes. It must contain the columns ``gene`` and ``baseMean``.
    :param min_base_mean: The minimum ``baseMean`` value required to keep
    a gene (default 10.)
    :return: The dataframe filtered if `expression_file` is a file, else
    return the same dataframe given in input

    >>> d = pd.DataFrame({"id_gene": [1, 2, 3], "A": [10, 15, 26],
    ... "group": ['ctrl', 'ctrl', 'ctrl']})
    >>> filter_composition_dataframe(d, "gene",
    ... Config.tests_files / "expression.txt")
       id_gene   A group
    0        1  10  ctrl
    2        3  26  ctrl
    >>> d = pd.DataFrame({"id_exon": ['1_1', '2_1', '3_1'], "A": [10, 15, 26],
    ... "group": ['ctrl', 'ctrl', 'ctrl']})
    >>> filter_composition_dataframe(d, "exon",
    ... Config.tests_files / "expression.txt")
      id_exon   A group
    0     1_1  10  ctrl
    2     3_1  26  ctrl
    >>> filter_composition_dataframe(d, "gene", "")
      id_exon   A group
    0     1_1  10  ctrl
    1     2_1  15  ctrl
    2     3_1  26  ctrl
    """
    if not expression_file:
        return df_comp
    if not Path(expression_file).is_file():
        # A path was requested but cannot be used: keep the best-effort
        # behaviour (no filtering) but do not stay silent about it.
        logging.warning(f"Expression file {expression_file} not found: "
                        f"weakly expressed genes are not filtered out")
        return df_comp
    expression = pd.read_csv(expression_file, sep="\t")
    expressed_genes = expression.loc[
        expression["baseMean"] >= min_base_mean, "gene"].to_list()
    if feature == "exon":
        # The gene id is computed in a separate Series so that the
        # caller's dataframe is left untouched. `regex=True` is explicit
        # because pandas >= 2.0 treats the pattern literally by default.
        gene_ids = df_comp["id_exon"].str.replace(
            r"_\d+", "", regex=True).astype(int)
    else:
        gene_ids = df_comp["id_gene"]
    kept = df_comp[gene_ids.isin(expressed_genes)].copy()
    if feature == "exon":
        # Historical output shape: in exon mode the result never carries
        # an `id_gene` column, even if the input already had one.
        kept = kept.drop(columns="id_gene", errors="ignore")
    return kept
(default False) + :param expression_file: A file containing expression values of genes :param logging_level: The level of data to display (default 'DISABLE') """ logging_def(ConfigGraph.community_folder, __file__, logging_level) @@ -496,9 +551,9 @@ def create_multiple_figures(clip_result_folder: Path, feature: str, for clip_file, cpnt in conditions: args = [clip_file, feature, threshold_feature, default_groups, df_comp, cpnt_type, cpnt, df_ctrl, write_bar_table, - display_size] - write_bar_table = False + display_size, expression_file] processes.append(pool.apply_async(create_figure_4_clip_table, args)) + write_bar_table = False [p.get(timeout=None) for p in processes] pool.close() pool.join() @@ -563,7 +618,9 @@ def create_figure_multi_clip_table(clip_table_file: Path, feature: str, df_comp: pd.DataFrame, cpnt_type: str, cpnt: str, df_id_ctrl: pd.DataFrame, write_comp_table: bool, - display_size: bool) -> Optional[Path]: + display_size: bool, + expression_file: str + ) -> Optional[Path]: """ Create a component figure for a clip file @@ -582,6 +639,8 @@ def create_figure_multi_clip_table(clip_table_file: Path, feature: str, :param write_comp_table: True to save comp table, false else :param display_size: True to display the size of the community . False to display nothing. 
(default False) + :param expression_file: A file containing the expression values of \ + genes :return: The folder where the figure is created """ logging.info(f"Working on {clip_table_file.parent.name} - " @@ -608,6 +667,7 @@ def create_figure_multi_clip_table(clip_table_file: Path, feature: str, f"{threshold_feature}_dft{default_groups}.{ext}" for ext in ['txt', 'pdf', 'bar.txt']] outfiles[0].parent.mkdir(exist_ok=True) + df_comp = filter_composition_dataframe(df_comp, feature, expression_file) df_stat = create_stat_df(df_comp, cpnt) df_stat.to_csv(outfiles[0], sep="\t", index=False) df_comp = update_composition_group(df_comp, display_size) @@ -621,6 +681,7 @@ def multi_tads_compo_figures(clip_result_folder: Path, feature: str, threshold_feature: int, default_groups: int, cpnt_type: str, region: str = "gene", ps: int = 1, display_size: bool = False, + expression_file: str = "", logging_level: str = "DEBUG") -> None: """ Create multiple composition figures from a result clip folder (produced \ @@ -640,6 +701,8 @@ def multi_tads_compo_figures(clip_result_folder: Path, feature: str, :param ps: The number of processes to create :param display_size: True to display the size of the community, False to display nothing. 
(default False) + :param expression_file: A file containing the expression values of \ + genes :param logging_level: The level of data to display (default 'DISABLE') """ logging_def(ConfigGraph.community_folder, __file__, logging_level) @@ -664,10 +727,10 @@ def multi_tads_compo_figures(clip_result_folder: Path, feature: str, df_ctrl = dic_df_ctrl[clip_file.name.replace(".tmp_bar.txt", "")] args = [clip_file, feature, threshold_feature, default_groups, df_comp, cpnt_type, cpnt, df_ctrl, write_bar_table, - display_size] - write_bar_table = False + display_size, expression_file] processes.append(pool.apply_async(create_figure_multi_clip_table, args)) + write_bar_table = False results = [p.get(timeout=None) for p in processes] pool.close() pool.join() @@ -682,6 +745,7 @@ def multi_tads_launcher(clip_result_folder: str, feature: str, threshold_feature: int = 100, default_groups: int = 20, cpnt_type: str = "nt", region: str = "gene", ps: int = 1, display_size: bool = False, + expression_file: str = "", logging_level: str = "DEBUG") -> None: """ Launch multi_tads_compo_figures script @@ -701,12 +765,15 @@ def multi_tads_launcher(clip_result_folder: str, feature: str, :param ps: The number of processes to create (default 1) :param display_size: True to display the size of the community, False to display nothing. (default False) + :param expression_file: A file containing the expression values of \ + genes (default ''). If this file is provided, then the weakly expressed \ + genes are filtered out from the analysis. :param logging_level: The level of data to display (default 'DISABLE') """ multi_tads_compo_figures(Path(clip_result_folder), feature, threshold_feature, default_groups, cpnt_type, region, ps, display_size, - logging_level) + expression_file, logging_level) if __name__ == "__main__":