Skip to content
Snippets Groups Projects
Commit a824f6dc authored by nfontrod's avatar nfontrod
Browse files

src/find_interaction_cluster/clip_figures/clip_compo_analyser.py: changes to...

src/find_interaction_cluster/clip_figures/clip_compo_analyser.py: changes to be able to use an expression file to filter out unexpressed genes
parent 893c14f8
No related branches found
No related tags found
No related merge requests found
......@@ -72,6 +72,10 @@ def prepare_df(clip_df: pd.DataFrame, feature: str) -> pd.DataFrame:
True
"""
df = clip_df.copy()
if "regulation" in df.columns:
df.rename({"regulation": "peak_density",
"mean_regulation": "mean_peak_density"}, axis=1,
inplace=True)
df = df.loc[df[f"id_{feature}"] != "ctrl",
["peak_density", "community", "community_size", "p-adj",
"reg-adj", f"id_{feature}"]]
......@@ -402,7 +406,8 @@ def create_figure_4_clip_table(clip_table_file: Path, feature: str,
df_comp: pd.DataFrame, cpnt_type: str,
cpnt: str, df_id_ctrl: pd.DataFrame,
write_comp_table: bool,
display_size: bool) -> None:
display_size: bool, expression_file: str
) -> None:
"""
Create a component figure for a clip file
......@@ -421,6 +426,7 @@ def create_figure_4_clip_table(clip_table_file: Path, feature: str,
:param write_comp_table: True to save comp table, false else
:param display_size: True to display the size of the community.
False to display nothing. (default False)
:param expression_file: A file containing expression values of genes
"""
logging.info(f"Working on {clip_table_file.name} - {cpnt} ({cpnt_type})")
df_group = get_interest_groups(clip_table_file, feature,
......@@ -445,6 +451,7 @@ def create_figure_4_clip_table(clip_table_file: Path, feature: str,
f"{threshold_feature}_dft{default_groups}.{ext}"
for ext in ['txt', 'pdf', 'bar.txt']]
outfiles[0].parent.mkdir(exist_ok=True)
df_comp = filter_composition_dataframe(df_comp, feature, expression_file)
df_stat = create_stat_df(df_comp, cpnt)
df_stat.to_csv(outfiles[0], sep="\t", index=False)
df_comp = update_composition_group(df_comp, display_size)
......@@ -453,10 +460,57 @@ def create_figure_4_clip_table(clip_table_file: Path, feature: str,
df_comp.to_csv(outfiles[2], sep="\t", index=False)
def filter_composition_dataframe(df_comp: pd.DataFrame, feature: str,
                                 expression_file: str
                                 ) -> pd.DataFrame:
    """
    Remove genes/exons with weak expression if a file `expression_file` is
    given. This file contains the expression of genes.

    The expression file is read as a tab-separated table that must contain
    the columns ``gene`` and ``baseMean``. Only the rows of `df_comp` whose
    gene has a ``baseMean`` greater than or equal to 10 are kept. When
    `feature` is ``"exon"``, the gene id is obtained by removing the
    ``_<number>`` suffix of ``id_exon`` and converting the rest to an
    integer; otherwise the column ``id_gene`` is used directly.

    The input dataframe is never modified.

    :param df_comp: A dataframe containing the composition of each
        feature of interest
    :param feature: The feature of interest
    :param expression_file: A file containing the expression of gene
    :return: The dataframe filtered (as a new dataframe) if
        `expression_file` is a file, else return the same dataframe given
        in input

    >>> d = pd.DataFrame({"id_gene": [1, 2, 3], "A": [10, 15, 26],
    ...                   "group": ['ctrl', 'ctrl', 'ctrl']})
    >>> filter_composition_dataframe(d, "gene",
    ...                              Config.tests_files / "expression.txt")
       id_gene   A group
    0        1  10  ctrl
    2        3  26  ctrl
    >>> d = pd.DataFrame({"id_exon": ['1_1', '2_1', '3_1'], "A": [10, 15, 26],
    ...                   "group": ['ctrl', 'ctrl', 'ctrl']})
    >>> filter_composition_dataframe(d, "exon",
    ...                              Config.tests_files / "expression.txt")
      id_exon   A group
    0     1_1  10  ctrl
    2     3_1  26  ctrl
    >>> filter_composition_dataframe(d, "gene", "")
      id_exon   A group
    0     1_1  10  ctrl
    1     2_1  15  ctrl
    2     3_1  26  ctrl
    """
    # No usable expression file: hand back the input untouched.
    if not expression_file or not Path(expression_file).is_file():
        return df_comp
    df = pd.read_csv(expression_file, sep="\t")
    genes = df.loc[df["baseMean"] >= 10., "gene"].to_list()
    if feature == "exon":
        # regex=True is explicit because the default of Series.str.replace
        # became regex=False in pandas 2.0, which would leave the ids
        # unchanged and make the integer conversion fail.
        # The gene ids live in a standalone Series so that the caller's
        # dataframe does not receive a temporary 'id_gene' column.
        gene_ids = df_comp["id_exon"].str.replace(
            r"_\d+", "", regex=True).astype(int)
    else:
        gene_ids = df_comp["id_gene"]
    return df_comp[gene_ids.isin(genes)].copy()
def create_multiple_figures(clip_result_folder: Path, feature: str,
threshold_feature: int, default_groups: int,
cpnt_type: str, region: str = "gene",
ps: int = 1, display_size: bool = False,
expression_file: str = "",
logging_level: str = "DEBUG") -> None:
"""
Create multiple composition figures from a result clip folder.
......@@ -475,6 +529,7 @@ def create_multiple_figures(clip_result_folder: Path, feature: str,
:param ps: The number of processes to create
:param display_size: True to display the size of the community,
False to display nothing. (default False)
:param expression_file: A file containing expression values of genes
:param logging_level: The level of data to display (default 'DEBUG')
"""
logging_def(ConfigGraph.community_folder, __file__, logging_level)
......@@ -496,9 +551,9 @@ def create_multiple_figures(clip_result_folder: Path, feature: str,
for clip_file, cpnt in conditions:
args = [clip_file, feature, threshold_feature, default_groups,
df_comp, cpnt_type, cpnt, df_ctrl, write_bar_table,
display_size]
write_bar_table = False
display_size, expression_file]
processes.append(pool.apply_async(create_figure_4_clip_table, args))
write_bar_table = False
[p.get(timeout=None) for p in processes]
pool.close()
pool.join()
......@@ -563,7 +618,9 @@ def create_figure_multi_clip_table(clip_table_file: Path, feature: str,
df_comp: pd.DataFrame, cpnt_type: str,
cpnt: str, df_id_ctrl: pd.DataFrame,
write_comp_table: bool,
display_size: bool) -> Optional[Path]:
display_size: bool,
expression_file: str
) -> Optional[Path]:
"""
Create a component figure for a clip file
......@@ -582,6 +639,8 @@ def create_figure_multi_clip_table(clip_table_file: Path, feature: str,
:param write_comp_table: True to save comp table, false else
:param display_size: True to display the size of the community.
False to display nothing. (default False)
:param expression_file: A file containing the expression values of \
genes
:return: The folder where the figure is created
"""
logging.info(f"Working on {clip_table_file.parent.name} - "
......@@ -608,6 +667,7 @@ def create_figure_multi_clip_table(clip_table_file: Path, feature: str,
f"{threshold_feature}_dft{default_groups}.{ext}"
for ext in ['txt', 'pdf', 'bar.txt']]
outfiles[0].parent.mkdir(exist_ok=True)
df_comp = filter_composition_dataframe(df_comp, feature, expression_file)
df_stat = create_stat_df(df_comp, cpnt)
df_stat.to_csv(outfiles[0], sep="\t", index=False)
df_comp = update_composition_group(df_comp, display_size)
......@@ -621,6 +681,7 @@ def multi_tads_compo_figures(clip_result_folder: Path, feature: str,
threshold_feature: int, default_groups: int,
cpnt_type: str, region: str = "gene",
ps: int = 1, display_size: bool = False,
expression_file: str = "",
logging_level: str = "DEBUG") -> None:
"""
Create multiple composition figures from a result clip folder (produced \
......@@ -640,6 +701,8 @@ def multi_tads_compo_figures(clip_result_folder: Path, feature: str,
:param ps: The number of processes to create
:param display_size: True to display the size of the community,
False to display nothing. (default False)
:param expression_file: A file containing the expression values of \
genes
:param logging_level: The level of data to display (default 'DEBUG')
"""
logging_def(ConfigGraph.community_folder, __file__, logging_level)
......@@ -664,10 +727,10 @@ def multi_tads_compo_figures(clip_result_folder: Path, feature: str,
df_ctrl = dic_df_ctrl[clip_file.name.replace(".tmp_bar.txt", "")]
args = [clip_file, feature, threshold_feature, default_groups,
df_comp, cpnt_type, cpnt, df_ctrl, write_bar_table,
display_size]
write_bar_table = False
display_size, expression_file]
processes.append(pool.apply_async(create_figure_multi_clip_table,
args))
write_bar_table = False
results = [p.get(timeout=None) for p in processes]
pool.close()
pool.join()
......@@ -682,6 +745,7 @@ def multi_tads_launcher(clip_result_folder: str, feature: str,
threshold_feature: int = 100, default_groups: int = 20,
cpnt_type: str = "nt", region: str = "gene",
ps: int = 1, display_size: bool = False,
expression_file: str = "",
logging_level: str = "DEBUG") -> None:
"""
Launch multi_tads_compo_figures script
......@@ -701,12 +765,15 @@ def multi_tads_launcher(clip_result_folder: str, feature: str,
:param ps: The number of processes to create (default 1)
:param display_size: True to display the size of the community,
False to display nothing. (default False)
:param expression_file: A file containing the expression values of \
genes (default ''). If this file is provided, then the weakly expressed \
genes are filtered out from the analysis.
:param logging_level: The level of data to display (default 'DEBUG')
"""
multi_tads_compo_figures(Path(clip_result_folder), feature,
threshold_feature, default_groups,
cpnt_type, region, ps, display_size,
logging_level)
expression_file, logging_level)
if __name__ == "__main__":
......
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment