Skip to content
Snippets Groups Projects
Commit a824f6dc authored by nfontrod's avatar nfontrod
Browse files

src/find_interaction_cluster/clip_figures/clip_compo_analyser.py: changes to...

src/find_interaction_cluster/clip_figures/clip_compo_analyser.py: changes to be able to use an expression file to filter out unexpressed genes
parent 893c14f8
Branches
No related tags found
No related merge requests found
...@@ -72,6 +72,10 @@ def prepare_df(clip_df: pd.DataFrame, feature: str) -> pd.DataFrame: ...@@ -72,6 +72,10 @@ def prepare_df(clip_df: pd.DataFrame, feature: str) -> pd.DataFrame:
True True
""" """
df = clip_df.copy() df = clip_df.copy()
if "regulation" in df.columns:
df.rename({"regulation": "peak_density",
"mean_regulation": "mean_peak_density"}, axis=1,
inplace=True)
df = df.loc[df[f"id_{feature}"] != "ctrl", df = df.loc[df[f"id_{feature}"] != "ctrl",
["peak_density", "community", "community_size", "p-adj", ["peak_density", "community", "community_size", "p-adj",
"reg-adj", f"id_{feature}"]] "reg-adj", f"id_{feature}"]]
...@@ -402,7 +406,8 @@ def create_figure_4_clip_table(clip_table_file: Path, feature: str, ...@@ -402,7 +406,8 @@ def create_figure_4_clip_table(clip_table_file: Path, feature: str,
df_comp: pd.DataFrame, cpnt_type: str, df_comp: pd.DataFrame, cpnt_type: str,
cpnt: str, df_id_ctrl: pd.DataFrame, cpnt: str, df_id_ctrl: pd.DataFrame,
write_comp_table: bool, write_comp_table: bool,
display_size: bool) -> None: display_size: bool, expression_file: str
) -> None:
""" """
Create a component figure for a clip file Create a component figure for a clip file
...@@ -421,6 +426,7 @@ def create_figure_4_clip_table(clip_table_file: Path, feature: str, ...@@ -421,6 +426,7 @@ def create_figure_4_clip_table(clip_table_file: Path, feature: str,
:param write_comp_table: True to save comp table, false else :param write_comp_table: True to save comp table, false else
:param display_size: True to display the size of the community . :param display_size: True to display the size of the community .
False to display nothing. (default False) False to display nothing. (default False)
:param expression_file: A file containing expression values of genes
""" """
logging.info(f"Working on {clip_table_file.name} - {cpnt} ({cpnt_type})") logging.info(f"Working on {clip_table_file.name} - {cpnt} ({cpnt_type})")
df_group = get_interest_groups(clip_table_file, feature, df_group = get_interest_groups(clip_table_file, feature,
...@@ -445,6 +451,7 @@ def create_figure_4_clip_table(clip_table_file: Path, feature: str, ...@@ -445,6 +451,7 @@ def create_figure_4_clip_table(clip_table_file: Path, feature: str,
f"{threshold_feature}_dft{default_groups}.{ext}" f"{threshold_feature}_dft{default_groups}.{ext}"
for ext in ['txt', 'pdf', 'bar.txt']] for ext in ['txt', 'pdf', 'bar.txt']]
outfiles[0].parent.mkdir(exist_ok=True) outfiles[0].parent.mkdir(exist_ok=True)
df_comp = filter_composition_dataframe(df_comp, feature, expression_file)
df_stat = create_stat_df(df_comp, cpnt) df_stat = create_stat_df(df_comp, cpnt)
df_stat.to_csv(outfiles[0], sep="\t", index=False) df_stat.to_csv(outfiles[0], sep="\t", index=False)
df_comp = update_composition_group(df_comp, display_size) df_comp = update_composition_group(df_comp, display_size)
...@@ -453,10 +460,57 @@ def create_figure_4_clip_table(clip_table_file: Path, feature: str, ...@@ -453,10 +460,57 @@ def create_figure_4_clip_table(clip_table_file: Path, feature: str,
df_comp.to_csv(outfiles[2], sep="\t", index=False) df_comp.to_csv(outfiles[2], sep="\t", index=False)
def filter_composition_dataframe(df_comp: pd.DataFrame, feature: str,
                                 expression_file: str,
                                 min_expression: float = 10.
                                 ) -> pd.DataFrame:
    """
    Remove genes/exons with weak expression if a file `expression_file` is \
    given. This file contains the expression of genes.

    The input dataframe `df_comp` is never modified: the gene id of each \
    exon is computed in a temporary series used only to build the filter.

    :param df_comp: A dataframe containing the composition of each \
    feature of interest. It must contain the column ``id_exon`` when \
    `feature` is 'exon' and the column ``id_gene`` otherwise.
    :param feature: The feature of interest
    :param expression_file: A tab-separated file containing the expression \
    of genes. The columns ``gene`` and ``baseMean`` are read from it.
    :param min_expression: The minimum ``baseMean`` value (inclusive) \
    a gene must have to be kept (default 10.)
    :return: The dataframe filtered if `expression_file` is a file, else \
    return the same dataframe given in input

    >>> d = pd.DataFrame({"id_gene": [1, 2, 3], "A": [10, 15, 26],
    ... "group": ['ctrl', 'ctrl', 'ctrl']})
    >>> filter_composition_dataframe(d, "gene",
    ... Config.tests_files / "expression.txt")
       id_gene   A  group
    0        1  10   ctrl
    2        3  26   ctrl
    >>> d = pd.DataFrame({"id_exon": ['1_1', '2_1', '3_1'], "A": [10, 15, 26],
    ... "group": ['ctrl', 'ctrl', 'ctrl']})
    >>> filter_composition_dataframe(d, "exon",
    ... Config.tests_files / "expression.txt")
       id_exon   A  group
    0      1_1  10   ctrl
    2      3_1  26   ctrl
    >>> filter_composition_dataframe(d, "gene", "")
       id_exon   A  group
    0      1_1  10   ctrl
    1      2_1  15   ctrl
    2      3_1  26   ctrl
    """
    # No usable expression file: filtering is disabled, hand back the input.
    if not expression_file or not Path(expression_file).is_file():
        return df_comp
    df_expr = pd.read_csv(expression_file, sep="\t")
    expressed = df_expr.loc[df_expr["baseMean"] >= min_expression,
                            "gene"].to_list()
    if feature == "exon":
        # An exon id is '<gene id>_<exon position>'. `regex=True` is given
        # explicitly because the default of `Series.str.replace` changed to
        # a literal match in pandas 2.0, which would leave the ids untouched
        # and make the integer conversion fail.
        gene_ids = df_comp["id_exon"].str.replace(r"_\d+", "", regex=True
                                                  ).astype(int)
    else:
        gene_ids = df_comp["id_gene"]
    # Filter with a stand-alone series so that no helper column is ever
    # added to the caller's dataframe.
    return df_comp[gene_ids.isin(expressed)].copy()
def create_multiple_figures(clip_result_folder: Path, feature: str, def create_multiple_figures(clip_result_folder: Path, feature: str,
threshold_feature: int, default_groups: int, threshold_feature: int, default_groups: int,
cpnt_type: str, region: str = "gene", cpnt_type: str, region: str = "gene",
ps: int = 1, display_size: bool = False, ps: int = 1, display_size: bool = False,
expression_file: str = "",
logging_level: str = "DEBUG") -> None: logging_level: str = "DEBUG") -> None:
""" """
Create multiple composition figures from a result clip folder. Create multiple composition figures from a result clip folder.
...@@ -475,6 +529,7 @@ def create_multiple_figures(clip_result_folder: Path, feature: str, ...@@ -475,6 +529,7 @@ def create_multiple_figures(clip_result_folder: Path, feature: str,
:param ps: The number of processes to create :param ps: The number of processes to create
:param display_size: True to display the size of the community, :param display_size: True to display the size of the community,
False to display nothing. (default False) False to display nothing. (default False)
:param expression_file: A file containing expression values of genes
:param logging_level: The level of data to display (default 'DISABLE') :param logging_level: The level of data to display (default 'DISABLE')
""" """
logging_def(ConfigGraph.community_folder, __file__, logging_level) logging_def(ConfigGraph.community_folder, __file__, logging_level)
...@@ -496,9 +551,9 @@ def create_multiple_figures(clip_result_folder: Path, feature: str, ...@@ -496,9 +551,9 @@ def create_multiple_figures(clip_result_folder: Path, feature: str,
for clip_file, cpnt in conditions: for clip_file, cpnt in conditions:
args = [clip_file, feature, threshold_feature, default_groups, args = [clip_file, feature, threshold_feature, default_groups,
df_comp, cpnt_type, cpnt, df_ctrl, write_bar_table, df_comp, cpnt_type, cpnt, df_ctrl, write_bar_table,
display_size] display_size, expression_file]
write_bar_table = False
processes.append(pool.apply_async(create_figure_4_clip_table, args)) processes.append(pool.apply_async(create_figure_4_clip_table, args))
write_bar_table = False
[p.get(timeout=None) for p in processes] [p.get(timeout=None) for p in processes]
pool.close() pool.close()
pool.join() pool.join()
...@@ -563,7 +618,9 @@ def create_figure_multi_clip_table(clip_table_file: Path, feature: str, ...@@ -563,7 +618,9 @@ def create_figure_multi_clip_table(clip_table_file: Path, feature: str,
df_comp: pd.DataFrame, cpnt_type: str, df_comp: pd.DataFrame, cpnt_type: str,
cpnt: str, df_id_ctrl: pd.DataFrame, cpnt: str, df_id_ctrl: pd.DataFrame,
write_comp_table: bool, write_comp_table: bool,
display_size: bool) -> Optional[Path]: display_size: bool,
expression_file: str
) -> Optional[Path]:
""" """
Create a component figure for a clip file Create a component figure for a clip file
...@@ -582,6 +639,8 @@ def create_figure_multi_clip_table(clip_table_file: Path, feature: str, ...@@ -582,6 +639,8 @@ def create_figure_multi_clip_table(clip_table_file: Path, feature: str,
:param write_comp_table: True to save comp table, false else :param write_comp_table: True to save comp table, false else
:param display_size: True to display the size of the community . :param display_size: True to display the size of the community .
False to display nothing. (default False) False to display nothing. (default False)
:param expression_file: A file containing the expression values of \
genes
:return: The folder where the figure is created :return: The folder where the figure is created
""" """
logging.info(f"Working on {clip_table_file.parent.name} - " logging.info(f"Working on {clip_table_file.parent.name} - "
...@@ -608,6 +667,7 @@ def create_figure_multi_clip_table(clip_table_file: Path, feature: str, ...@@ -608,6 +667,7 @@ def create_figure_multi_clip_table(clip_table_file: Path, feature: str,
f"{threshold_feature}_dft{default_groups}.{ext}" f"{threshold_feature}_dft{default_groups}.{ext}"
for ext in ['txt', 'pdf', 'bar.txt']] for ext in ['txt', 'pdf', 'bar.txt']]
outfiles[0].parent.mkdir(exist_ok=True) outfiles[0].parent.mkdir(exist_ok=True)
df_comp = filter_composition_dataframe(df_comp, feature, expression_file)
df_stat = create_stat_df(df_comp, cpnt) df_stat = create_stat_df(df_comp, cpnt)
df_stat.to_csv(outfiles[0], sep="\t", index=False) df_stat.to_csv(outfiles[0], sep="\t", index=False)
df_comp = update_composition_group(df_comp, display_size) df_comp = update_composition_group(df_comp, display_size)
...@@ -621,6 +681,7 @@ def multi_tads_compo_figures(clip_result_folder: Path, feature: str, ...@@ -621,6 +681,7 @@ def multi_tads_compo_figures(clip_result_folder: Path, feature: str,
threshold_feature: int, default_groups: int, threshold_feature: int, default_groups: int,
cpnt_type: str, region: str = "gene", cpnt_type: str, region: str = "gene",
ps: int = 1, display_size: bool = False, ps: int = 1, display_size: bool = False,
expression_file: str = "",
logging_level: str = "DEBUG") -> None: logging_level: str = "DEBUG") -> None:
""" """
Create multiple composition figures from a result clip folder (produced \ Create multiple composition figures from a result clip folder (produced \
...@@ -640,6 +701,8 @@ def multi_tads_compo_figures(clip_result_folder: Path, feature: str, ...@@ -640,6 +701,8 @@ def multi_tads_compo_figures(clip_result_folder: Path, feature: str,
:param ps: The number of processes to create :param ps: The number of processes to create
:param display_size: True to display the size of the community, :param display_size: True to display the size of the community,
False to display nothing. (default False) False to display nothing. (default False)
:param expression_file: A file containing the expression values of \
genes
:param logging_level: The level of data to display (default 'DISABLE') :param logging_level: The level of data to display (default 'DISABLE')
""" """
logging_def(ConfigGraph.community_folder, __file__, logging_level) logging_def(ConfigGraph.community_folder, __file__, logging_level)
...@@ -664,10 +727,10 @@ def multi_tads_compo_figures(clip_result_folder: Path, feature: str, ...@@ -664,10 +727,10 @@ def multi_tads_compo_figures(clip_result_folder: Path, feature: str,
df_ctrl = dic_df_ctrl[clip_file.name.replace(".tmp_bar.txt", "")] df_ctrl = dic_df_ctrl[clip_file.name.replace(".tmp_bar.txt", "")]
args = [clip_file, feature, threshold_feature, default_groups, args = [clip_file, feature, threshold_feature, default_groups,
df_comp, cpnt_type, cpnt, df_ctrl, write_bar_table, df_comp, cpnt_type, cpnt, df_ctrl, write_bar_table,
display_size] display_size, expression_file]
write_bar_table = False
processes.append(pool.apply_async(create_figure_multi_clip_table, processes.append(pool.apply_async(create_figure_multi_clip_table,
args)) args))
write_bar_table = False
results = [p.get(timeout=None) for p in processes] results = [p.get(timeout=None) for p in processes]
pool.close() pool.close()
pool.join() pool.join()
...@@ -682,6 +745,7 @@ def multi_tads_launcher(clip_result_folder: str, feature: str, ...@@ -682,6 +745,7 @@ def multi_tads_launcher(clip_result_folder: str, feature: str,
threshold_feature: int = 100, default_groups: int = 20, threshold_feature: int = 100, default_groups: int = 20,
cpnt_type: str = "nt", region: str = "gene", cpnt_type: str = "nt", region: str = "gene",
ps: int = 1, display_size: bool = False, ps: int = 1, display_size: bool = False,
expression_file: str = "",
logging_level: str = "DEBUG") -> None: logging_level: str = "DEBUG") -> None:
""" """
Launch multi_tads_compo_figures script Launch multi_tads_compo_figures script
...@@ -701,12 +765,15 @@ def multi_tads_launcher(clip_result_folder: str, feature: str, ...@@ -701,12 +765,15 @@ def multi_tads_launcher(clip_result_folder: str, feature: str,
:param ps: The number of processes to create (default 1) :param ps: The number of processes to create (default 1)
:param display_size: True to display the size of the community, :param display_size: True to display the size of the community,
False to display nothing. (default False) False to display nothing. (default False)
:param expression_file: A file containing the expression values of \
genes (default ''). If this file is provided, then the weakly expressed \
genes are filtered out from the analysis.
:param logging_level: The level of data to display (default 'DISABLE') :param logging_level: The level of data to display (default 'DISABLE')
""" """
multi_tads_compo_figures(Path(clip_result_folder), feature, multi_tads_compo_figures(Path(clip_result_folder), feature,
threshold_feature, default_groups, threshold_feature, default_groups,
cpnt_type, region, ps, display_size, cpnt_type, region, ps, display_size,
logging_level) expression_file, logging_level)
if __name__ == "__main__": if __name__ == "__main__":
......
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Please to comment