Skip to content
Snippets Groups Projects
Commit 17441b57 authored by nfontrod's avatar nfontrod
Browse files

src/find_interaction_cluster/clip_figures/clip_compo_analyser.py: chnages and...

src/find_interaction_cluster/clip_figures/clip_compo_analyser.py: chnages and creation of new functions

creation of function to handle clip results folder produced by clip_launcher_4_many_communities and modification of get_middle_group, get_extreme_groups and get_interest_groups to simplify the names of each groups + pep8 chnages
parent 1712468f
No related branches found
No related tags found
No related merge requests found
......@@ -9,10 +9,9 @@ has a composition biais.
"""
from pathlib import Path
from typing import List, Tuple, Optional
from typing import List, Tuple, Optional, Dict
import pandas as pd
from .config import ConfigClip, Config, ConfigGraph
import doctest
from math import floor, ceil
from ..nt_and_community import get_ft_id, get_cpnt_frequency, get_features
import sqlite3
......@@ -25,6 +24,8 @@ import matplotlib.pyplot as plt
from ...logging_conf import logging_def
import logging
import multiprocessing as mp
import subprocess as sp
import lazyparser as lp
def find_final_clip_tables(folder: Path) -> List[Path]:
......@@ -108,7 +109,7 @@ def get_feature_id(clip_df: pd.DataFrame, feature: str,
return df[df["community"].isin(list_community)][f"id_{feature}"].to_list()
def get_extreme_groups(df: pd.DataFrame, table_file: Path, group: str,
def get_extreme_groups(df: pd.DataFrame, group: str,
threshold_feature: int,
default_groups: int) -> Tuple[List, str]:
"""
......@@ -117,7 +118,6 @@ def get_extreme_groups(df: pd.DataFrame, table_file: Path, group: str,
:param df: A dataframe used to build the community figures \
of clip density.
:param table_file: The file from which the dataframe df was created
:param group: The kind of community to recover (enriched or impoverished)
:param threshold_feature: The minimum number of feature to \
recover from enriched/impoverished communities
......@@ -129,16 +129,16 @@ def get_extreme_groups(df: pd.DataFrame, table_file: Path, group: str,
>>> d = prepare_df(pd.read_table(ConfigClip.test_clip_file, sep="\\t"),
... "gene")
>>> get_extreme_groups(d, ConfigClip.test_clip_file, "enriched", 5, 3)
(['C916', 'C690'], 'SF_test_enriched')
>>> get_extreme_groups(d, ConfigClip.test_clip_file, "enriched", 6, 3)
(['C680', 'C916', 'C690'], 'SF_test_enriched_nosig')
>>> get_extreme_groups(d, ConfigClip.test_clip_file, "impoverished", 3, 3)
(['C852', 'C91'], 'SF_test_impoverished')
>>> get_extreme_groups(d, ConfigClip.test_clip_file, "impoverished", 4, 3)
(['C852', 'C91', 'C10'], 'SF_test_impoverished_nosig')
"""
outname = table_file.name.rstrip("bar.txt") + group
>>> get_extreme_groups(d, "enriched", 5, 3)
(['C916', 'C690'], 'enriched')
>>> get_extreme_groups(d, "enriched", 6, 3)
(['C680', 'C916', 'C690'], 'enriched_nosig')
>>> get_extreme_groups(d, "impoverished", 3, 3)
(['C852', 'C91'], 'impoverished')
>>> get_extreme_groups(d, "impoverished", 4, 3)
(['C852', 'C91', 'C10'], 'impoverished_nosig')
"""
outname = group
my_reg = " + " if group == "enriched" else " - "
tmp = df[df["reg-adj"] == my_reg]
if not tmp.empty and sum(tmp["community_size"]) >= threshold_feature:
......@@ -151,15 +151,14 @@ def get_extreme_groups(df: pd.DataFrame, table_file: Path, group: str,
return list_com[:default_groups], outname
def get_middle_group(df: pd.DataFrame, table_file: Path,
default_groups: int) -> Tuple[List, str]:
def get_middle_group(df: pd.DataFrame, default_groups: int
) -> Tuple[List, str]:
"""
Recover the impoverished or enriched communities for peaks \
in a given splicing factor if they exists.
:param df: A dataframe used to build the community figures \
of clip density.
:param table_file: The file from which the dataframe df was created
:param default_groups: The number of community to get if none \
(or not enough) community are enriched or impoverished \
(depending on group parameter)
......@@ -168,16 +167,19 @@ def get_middle_group(df: pd.DataFrame, table_file: Path,
>>> d = prepare_df(pd.read_table(ConfigClip.test_clip_file, sep="\\t"),
... "gene")
>>> get_middle_group(d, ConfigClip.test_clip_file, 3)
(['C232', 'C397', 'C952'], 'SF_test_middle')
>>> get_middle_group(d, ConfigClip.test_clip_file, 4)
(['C325', 'C232', 'C397', 'C952'], 'SF_test_middle')
>>> get_middle_group(d, 3)
(['C232', 'C397', 'C952'], 'middle')
>>> get_middle_group(d, 4)
(['C325', 'C232', 'C397', 'C952'], 'middle')
"""
outname = table_file.name.rstrip("bar.txt") + "middle"
outname = "middle"
list_com = df["community"].to_list()
index = floor(len(list_com) / 2)
return list_com[index - floor(default_groups / 2):
index + ceil(default_groups / 2)], outname
middle = list_com[index - floor(default_groups / 2):
index + ceil(default_groups / 2)]
middle_com = df[(df["community"].isin(middle)) &
(df['reg-adj'] == " . ")]["community"].to_list()
return middle_com, outname
def add_ctrl_df(df: pd.DataFrame, feature: str) -> pd.DataFrame:
......@@ -229,57 +231,57 @@ def get_interest_groups(clip_table_file: Path, feature: str,
if belong.
>>> get_interest_groups(ConfigClip.test_clip_file, "gene", 3, 3)
id_gene group
0 13856 SF_test_impoverished
1 13360 SF_test_impoverished
2 7831 SF_test_impoverished
3 17226 SF_test_middle
4 17418 SF_test_middle
5 16985 SF_test_middle
6 16897 SF_test_middle
7 16536 SF_test_middle
8 17214 SF_test_middle
9 17271 SF_test_middle
10 17360 SF_test_middle
11 16022 SF_test_middle
12 16660 SF_test_middle
13 17080 SF_test_middle
14 17038 SF_test_middle
15 10558 SF_test_middle
16 11520 SF_test_middle
17 10569 SF_test_middle
18 10641 SF_test_middle
19 10333 SF_test_middle
20 12509 SF_test_middle
21 11687 SF_test_middle
22 10886 SF_test_middle
23 10784 SF_test_middle
24 10997 SF_test_middle
25 10907 SF_test_middle
26 12115 SF_test_middle
27 10922 SF_test_middle
28 10586 SF_test_middle
29 12030 SF_test_middle
30 10627 SF_test_middle
31 12525 SF_test_middle
32 12172 SF_test_middle
33 11346 SF_test_middle
34 11949 SF_test_middle
35 15390 SF_test_middle
36 15348 SF_test_middle
37 14966 SF_test_middle
38 15272 SF_test_middle
39 15287 SF_test_middle
40 15135 SF_test_middle
41 14992 SF_test_middle
42 15006 SF_test_middle
43 15199 SF_test_middle
44 15285 SF_test_middle
45 11489 SF_test_enriched
46 11723 SF_test_enriched
47 9289 SF_test_enriched
48 9734 SF_test_enriched
49 9708 SF_test_enriched
id_gene group
0 13856 impoverished
1 13360 impoverished
2 7831 impoverished
3 17226 middle
4 17418 middle
5 16985 middle
6 16897 middle
7 16536 middle
8 17214 middle
9 17271 middle
10 17360 middle
11 16022 middle
12 16660 middle
13 17080 middle
14 17038 middle
15 10558 middle
16 11520 middle
17 10569 middle
18 10641 middle
19 10333 middle
20 12509 middle
21 11687 middle
22 10886 middle
23 10784 middle
24 10997 middle
25 10907 middle
26 12115 middle
27 10922 middle
28 10586 middle
29 12030 middle
30 10627 middle
31 12525 middle
32 12172 middle
33 11346 middle
34 11949 middle
35 15390 middle
36 15348 middle
37 14966 middle
38 15272 middle
39 15287 middle
40 15135 middle
41 14992 middle
42 15006 middle
43 15199 middle
44 15285 middle
45 11489 enriched
46 11723 enriched
47 9289 enriched
48 9734 enriched
49 9708 enriched
>>> r = get_interest_groups(ConfigClip.test_clip_file, "gene", 6, 3)
>>> r is None
True
......@@ -287,13 +289,12 @@ def get_interest_groups(clip_table_file: Path, feature: str,
ini_df = pd.read_csv(clip_table_file, sep="\t")
df = prepare_df(ini_df, feature)
enrich_com, name_enrich = get_extreme_groups(
df, clip_table_file, "enriched", threshold_feature, default_groups)
df, "enriched", threshold_feature, default_groups)
impov_com, name_impov = get_extreme_groups(
df, clip_table_file, "impoverished", threshold_feature, default_groups)
df, "impoverished", threshold_feature, default_groups)
if "nosig" in name_enrich and "nosig" in name_impov:
return None
middle_com, name_middle = get_middle_group(df, clip_table_file,
default_groups)
middle_com, name_middle = get_middle_group(df, default_groups)
enrich_grp = get_feature_id(ini_df, feature, enrich_com)
impov_grp = get_feature_id(ini_df, feature, impov_com)
middle_grp = get_feature_id(ini_df, feature, middle_com)
......@@ -425,8 +426,10 @@ def create_figure_4_clip_table(clip_table_file: Path, feature: str,
df_comp = df_comp.merge(df_group, how="left", on=f"id_{feature}")
df_comp = df_comp[-df_comp["group"].isna()]
sf_name = clip_table_file.name.rstrip("_bar.txt")
outfiles = [clip_table_file.parent / sf_name /
f"{cpnt_type}_{cpnt}_{sf_name}_thd"
res_folder = clip_table_file.parent / f"{cpnt_type}_analysis"
res_folder.mkdir(exist_ok=True)
outfiles = [res_folder / sf_name /
f"{cpnt_type}_{cpnt}_{sf_name}_thd"
f"{threshold_feature}_dft{default_groups}.{ext}"
for ext in ['txt', 'pdf', 'bar.txt']]
outfiles[0].parent.mkdir(exist_ok=True)
......@@ -489,5 +492,208 @@ def create_multiple_figures(clip_result_folder: Path, feature: str,
pool.join()
############################################################################
# Functions for composition analysis on
# A clip folder produced by clip_launcher_4_many_communities
############################################################################
def find_clip_directories(folder: Path) -> List[Path]:
"""
From a folder return every folder inside it.
:param folder: A folder containing other subfolder. This folder \
should have been produced by `clip_launcher_4_many_communities`.
:return: The list of files ending with *_bar.txt
>>> find_clip_directories(Path("fziufriubdsibf"))
Traceback (most recent call last):
...
NotADirectoryError: Folder fziufriubdsibf doesn't exists !
"""
if not folder.is_dir():
raise NotADirectoryError(f"Folder {folder} doesn't exists !")
return [d for d in list(folder.glob("*")) if d.is_dir()]
def get_ctrl_dic(list_files: List[Path], feature: str) -> Dict:
"""
Create a dictionary of control exons/genes
:param list_files: A list of clip density table
:param feature: gene or exon
:return: A dictionary containing the controls for each clip files
"""
return {
f.name.replace(".tmp_bar.txt", ""): add_ctrl_df(
pd.read_csv(f, sep="\t"), feature
)
for f in list_files
}
def merge_figures(folder: Path) -> None:
"""
Merge the figures together using imageMagick
:param folder: A folder containing pdf files
"""
fig_name = folder.parent.name
cmd = f"montage -geometry +1+1 -tile 1X3 " \
f"-compress jpeg -density 100 -label %t -pointsize 50 " \
f"{folder}/*dft*.pdf {folder}/{fig_name}.pdf"
sp.check_call(cmd, shell=True)
def create_figure_multi_clip_table(clip_table_file: Path, feature: str,
threshold_feature: int, default_groups: int,
df_comp: pd.DataFrame, cpnt_type: str,
cpnt: str, df_id_ctrl: pd.DataFrame,
write_comp_table: bool,
display_size: bool) -> Optional[Path]:
"""
Create a component figure for a clip file
:param clip_table_file: table containing data to produce the clip \
community figure.
:param feature: The feature of interest (gene or exon)
:param threshold_feature: The minimum number of feature to \
recover from enriched/impoverished communities
:param default_groups: The number of community to get if none \
(or not enough) community are enriched or impoverished \
(depending on group parameter)
:param df_comp: The dataframe containing the frequency of every `cpnt_type`
:param cpnt_type: The kind of component of interest
:param cpnt: The kind of component of interest
:param df_id_ctrl: A dataframe indicating control exons
:param write_comp_table: True to save comp table, false else
:param display_size: True to display the size of the community .
False to display nothing. (default False)
:return: The folder where the figure is created
"""
logging.info(f"Working on {clip_table_file.parent.name} - "
f"{clip_table_file.name} - {cpnt} ({cpnt_type})")
df_group = get_interest_groups(clip_table_file, feature,
threshold_feature, default_groups)
if df_group is None:
return None
df_group = df_group.drop_duplicates()
if feature == "gene":
df_group["id_gene"] = df_group["id_gene"].astype(int)
df_comp["id_gene"] = df_comp["id_gene"].astype(int)
df_group = pd.concat([df_group, df_id_ctrl], ignore_index=True)
if len(np.unique(df_group[f"id_{feature}"].values)) != df_group.shape[0]:
print(df_group[df_group.duplicated(subset=f"id_{feature}")])
raise ValueError("Found duplicates value in df_group, exiting...")
df_comp = df_comp.merge(df_group, how="left", on=f"id_{feature}")
df_comp = df_comp[-df_comp["group"].isna()]
sf_name = clip_table_file.parent.name
res_folder = clip_table_file.parent / f"{cpnt_type}_analysis"
res_folder.mkdir(exist_ok=True)
outfiles = [res_folder /
f"{clip_table_file.name.split('.')[0]}_{cpnt_type}_{cpnt}_thd"
f"{threshold_feature}_dft{default_groups}.{ext}"
for ext in ['txt', 'pdf', 'bar.txt']]
outfiles[0].parent.mkdir(exist_ok=True)
df_stat = create_stat_df(df_comp, cpnt)
df_stat.to_csv(outfiles[0], sep="\t", index=False)
df_comp = update_composition_group(df_comp, display_size)
make_barplot(df_comp, outfiles[1], cpnt, cpnt_type, sf_name)
if write_comp_table:
df_comp.to_csv(outfiles[2], sep="\t", index=False)
return res_folder
def multi_tads_compo_figures(clip_result_folder: Path, feature: str,
threshold_feature: int, default_groups: int,
cpnt_type: str, region: str = "gene",
ps: int = 1, display_size: bool = False,
logging_level: str = "DEBUG") -> None:
"""
Create multiple composition figures from a result clip folder (produced \
by clip_launcher_4_many_communities).
:param clip_result_folder: Folder containing tables used to \
create clip community figures
:param feature: The feature of interest (exon, or gene)
:param threshold_feature: The minimum number of feature needed \
inside communities enriched or impoverished in peak_density to consider \
those community (otherwise `default_groups` default number of community \
are taken
:param default_groups: The number of communities to take as default.
:param cpnt_type: The kind of component analysed
:param region: Only used when feature is gene. Corresponds to \
the region studied within genes.
:param ps: The number of processes to create
:param display_size: True to display the size of the community,
False to display nothing. (default False)
:param logging_level: The level of data to display (default 'DISABLE')
"""
logging_def(ConfigGraph.community_folder, __file__, logging_level)
list_folder = find_clip_directories(clip_result_folder)
list_tables = [find_final_clip_tables(folder) for folder in list_folder]
dic_df_ctrl = get_ctrl_dic(list_tables[0], feature)
list_tables = np.array(list_tables).flatten()
list_ft = get_features(cpnt_type)
if len(list_ft) == 0:
logging.warning(f"No clip table found in {clip_result_folder} !\n"
f"Exiting...")
return None
cnx = sqlite3.connect(Config.db_file)
df_comp = get_cpnt_frequency(cnx, get_ft_id(cnx, feature), feature, region,
cpnt_type)
conditions = list(product(list_tables, list_ft))
ps = min(len(conditions), ps)
pool = mp.Pool(processes=ps if ps > 0 else 1)
processes = []
write_bar_table = True
for clip_file, cpnt in conditions:
df_ctrl = dic_df_ctrl[clip_file.name.replace(".tmp_bar.txt", "")]
args = [clip_file, feature, threshold_feature, default_groups,
df_comp, cpnt_type, cpnt, df_ctrl, write_bar_table,
display_size]
write_bar_table = False
processes.append(pool.apply_async(create_figure_multi_clip_table,
args))
results = [p.get(timeout=None) for p in processes]
pool.close()
pool.join()
list_folder = np.unique([r for r in results if r is not None])
for folder in list_folder:
merge_figures(folder)
@lp.parse(feature=["gene", "exon"], threshold_feature=range(1, 1001),
default_groups=range(1, 100), cpnt_type=["nt", "dnt", "aa"])
def multi_tads_launcher(clip_result_folder: str, feature: str,
threshold_feature: int = 100, default_groups: int = 20,
cpnt_type: str = "nt", region: str = "gene",
ps: int = 1, display_size: bool = False,
logging_level: str = "DEBUG") -> None:
"""
Launch multi_tads_compo_figures script
:param clip_result_folder: Folder containing tables used to \
create clip community figures
:param feature: The feature of interest (exon, or gene)
:param threshold_feature: The minimum number of feature needed \
inside communities enriched or impoverished in peak_density to consider \
those community (otherwise `default_groups` default number of community \
are taken. Default 100)
:param default_groups: The number of communities to take as default. \
(default 20)
:param cpnt_type: The kind of component analysed (default nt)
:param region: Only used when feature is gene. Corresponds to \
the region studied within genes. (default gene)
:param ps: The number of processes to create (default 1)
:param display_size: True to display the size of the community,
False to display nothing. (default False)
:param logging_level: The level of data to display (default 'DISABLE')
"""
multi_tads_compo_figures(Path(clip_result_folder), feature,
threshold_feature, default_groups,
cpnt_type, region, ps, display_size,
logging_level)
if __name__ == "__main__":
doctest.testmod()
multi_tads_launcher()
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment