Skip to content
Snippets Groups Projects
Commit 496626e8 authored by nfontrod's avatar nfontrod
Browse files

src/find_interaction_cluster/clip_figures/clip_compo_*: creation of a script...

src/find_interaction_cluster/clip_figures/clip_compo_*: creation of a script to analyse the nucleotide composition of communities enrichied or impoverished in peak density from clip data
parent b01ba86e
No related branches found
No related tags found
No related merge requests found
#!/usr/bin/env python3
# -*- coding: UTF-8 -*-
"""
Description: The goal of this script is to check whether a communities \
of genes that are more frequently bind by a splicing factor \
has a composition biais.
"""
from pathlib import Path
from typing import List, Tuple, Optional
import pandas as pd
from .config import ConfigClip, Config, ConfigGraph
import doctest
from math import floor, ceil
from ..nt_and_community import get_ft_id, get_cpnt_frequency, get_features
import sqlite3
import numpy as np
from scipy.stats import mannwhitneyu
from itertools import combinations, product
from statsmodels.stats.multitest import multipletests
import seaborn as sns
import matplotlib.pyplot as plt
from ...logging_conf import logging_def
import logging
import multiprocessing as mp
def find_final_clip_tables(folder: Path) -> List[Path]:
"""
From a folder return every files ending with *_bar.txt.
:param folder: A folder containing final tables used to produce \
community figures for clip data (figure produced by the function \
clip_folder_analysis of the script clip_analyse.py).
:return: The list of files ending with *_bar.txt
>>> find_final_clip_tables(Path("fziufriubdsibf"))
Traceback (most recent call last):
...
NotADirectoryError: Folder fziufriubdsibf doesn't exists !
>>> find_final_clip_tables(Path("src/"))
[]
"""
if not folder.is_dir():
raise NotADirectoryError(f"Folder {folder} doesn't exists !")
return list(folder.glob("*_bar.txt"))
def prepare_df(clip_df: pd.DataFrame, feature: str) -> pd.DataFrame:
"""
Prepare the table containing data to produce the clip community figure.
:param clip_df: table containing data to produce the clip \
community figure.
:param feature: The feature of interest (gene or exon)
:return: The filtered table
>>> d = prepare_df(pd.read_table(ConfigClip.test_clip_file, sep="\\t"),
... "gene")
>>> d.head()
community community_size p-adj reg-adj peak_density
8 C852 1.0 0.015356 - 0.000000
9 C91 2.0 0.027338 - 0.000000
0 C10 3.0 0.228296 . 0.046749
1 C165 12.0 0.252854 . 0.271487
3 C325 18.0 0.228296 . 0.272363
>>> d["community"].to_list() == ['C852', 'C91', 'C10', 'C165', 'C325',
... 'C232', 'C397', 'C952', 'C365', 'C680', 'C916', 'C690']
True
"""
df = clip_df.copy()
df = df.loc[df[f"id_{feature}"] != "ctrl",
["peak_density", "community", "community_size", "p-adj",
"reg-adj", f"id_{feature}"]]
df = df.groupby(["community", "community_size", "p-adj", "reg-adj"]
).mean().reset_index()
df = df.sort_values("peak_density", ascending=True)
return df
def get_feature_id(clip_df: pd.DataFrame, feature: str,
list_community: List[str]) -> pd.DataFrame:
"""
Get the list of gene contained in the list of communities given.
:param clip_df: table containing data to produce the clip \
community figure.
:param feature: The feature of interest (gene or exon)
:param list_community: A list of community
:return: The filtered table)
>>> d = pd.read_table(ConfigClip.test_clip_file, sep="\\t")
>>> get_feature_id(d, "gene", ['C916', 'C690'])
['11489', '11723', '9289', '9734', '9708']
>>> get_feature_id(d, "gene", ['C680', 'C916',
... 'C690'])
['16283', '16852', '11489', '11723', '9289', '9734', '9708']
>>> get_feature_id(d, "gene", ['C91', 'C852'])
['13856', '13360', '7831']
>>> get_feature_id(d, "gene", ['C91', 'C852',
... 'C10'])
['13856', '13360', '7831', '16763', '16676', '16817']
"""
df = clip_df.copy()
df = df[df[f"id_{feature}"] != "ctrl"]
return df[df["community"].isin(list_community)][f"id_{feature}"].to_list()
def get_extreme_groups(df: pd.DataFrame, table_file: Path, group: str,
threshold_feature: int,
default_groups: int) -> Tuple[List, str]:
"""
Recover the impoverished or enriched communities for peaks \
in a given splicing factor if they exists.
:param df: A dataframe used to build the community figures \
of clip density.
:param table_file: The file from which the dataframe df was created
:param group: The kind of community to recover (enriched or impoverished)
:param threshold_feature: The minimum number of feature to \
recover from enriched/impoverished communities
:param default_groups: The number of community to get if none \
(or not enough) community are enriched or impoverished \
(depending on group parameter)
:return: The list of enriched or impoverished community of feature \
and the name of the list
>>> d = prepare_df(pd.read_table(ConfigClip.test_clip_file, sep="\\t"),
... "gene")
>>> get_extreme_groups(d, ConfigClip.test_clip_file, "enriched", 5, 3)
(['C916', 'C690'], 'SF_test_enriched')
>>> get_extreme_groups(d, ConfigClip.test_clip_file, "enriched", 6, 3)
(['C680', 'C916', 'C690'], 'SF_test_enriched_nosig')
>>> get_extreme_groups(d, ConfigClip.test_clip_file, "impoverished", 3, 3)
(['C852', 'C91'], 'SF_test_impoverished')
>>> get_extreme_groups(d, ConfigClip.test_clip_file, "impoverished", 4, 3)
(['C852', 'C91', 'C10'], 'SF_test_impoverished_nosig')
"""
outname = table_file.name.rstrip("bar.txt") + group
my_reg = " + " if group == "enriched" else " - "
tmp = df[df["reg-adj"] == my_reg]
if not tmp.empty and sum(tmp["community_size"]) >= threshold_feature:
return tmp["community"].to_list(), outname
outname += "_nosig"
list_com = df["community"].to_list()
if group == "enriched":
return list_com[-default_groups:], outname
else:
return list_com[:default_groups], outname
def get_middle_group(df: pd.DataFrame, table_file: Path,
default_groups: int) -> Tuple[List, str]:
"""
Recover the impoverished or enriched communities for peaks \
in a given splicing factor if they exists.
:param df: A dataframe used to build the community figures \
of clip density.
:param table_file: The file from which the dataframe df was created
:param default_groups: The number of community to get if none \
(or not enough) community are enriched or impoverished \
(depending on group parameter)
:return: The list of enriched or impoverished community of feature \
and the name of the list
>>> d = prepare_df(pd.read_table(ConfigClip.test_clip_file, sep="\\t"),
... "gene")
>>> get_middle_group(d, ConfigClip.test_clip_file, 3)
(['C232', 'C397', 'C952'], 'SF_test_middle')
>>> get_middle_group(d, ConfigClip.test_clip_file, 4)
(['C325', 'C232', 'C397', 'C952'], 'SF_test_middle')
"""
outname = table_file.name.rstrip("bar.txt") + "middle"
list_com = df["community"].to_list()
index = floor(len(list_com) / 2)
return list_com[index - floor(default_groups / 2):
index + ceil(default_groups / 2)], outname
def add_ctrl_df(df: pd.DataFrame, feature: str) -> pd.DataFrame:
"""
Create a dataframe containing every genes/exons not in a community.
:param df: The dataframe used to create a community figure
:param feature: The feature of interest (exon or gene)
:return: A table containing 2 columns: id_{feature} and group \
(containing 'ctrl')
>>> import time
>>> s = time.time()
>>> add_ctrl_df(pd.DataFrame({"id_gene": list(range(1, 19420)) + ["ctrl"]
... }), "gene")
id_gene group
0 19420 ctrl
1 19421 ctrl
2 19422 ctrl
3 19423 ctrl
4 19424 ctrl
5 19425 ctrl
6 19426 ctrl
"""
list_id = get_ft_id(sqlite3.connect(Config.db_file), feature)
com_id = df[df[f"id_{feature}"] != "ctrl"][f"id_{feature}"].to_list()
if feature == "gene":
list_id = list(map(int, list_id))
com_id = list(map(int, com_id))
ctrl_id = [mid for mid in list_id if mid not in com_id]
return pd.DataFrame({f"id_{feature}": ctrl_id,
"group": ["ctrl"] * len(ctrl_id)})
def get_interest_groups(clip_table_file: Path, feature: str,
threshold_feature: int, default_groups: int
) -> Optional[pd.DataFrame]:
"""
:param clip_table_file: table containing data to produce the clip \
community figure.
:param feature: The feature of interest (gene or exon)
:param threshold_feature: The minimum number of feature to \
recover from enriched/impoverished communities
:param default_groups: The number of community to get if none \
(or not enough) community are enriched or impoverished \
(depending on group parameter)
:return: The dataframe indicating for a id of a feature, the groups where \
if belong.
>>> get_interest_groups(ConfigClip.test_clip_file, "gene", 3, 3)
id_gene group
0 13856 SF_test_impoverished
1 13360 SF_test_impoverished
2 7831 SF_test_impoverished
3 17226 SF_test_middle
4 17418 SF_test_middle
5 16985 SF_test_middle
6 16897 SF_test_middle
7 16536 SF_test_middle
8 17214 SF_test_middle
9 17271 SF_test_middle
10 17360 SF_test_middle
11 16022 SF_test_middle
12 16660 SF_test_middle
13 17080 SF_test_middle
14 17038 SF_test_middle
15 10558 SF_test_middle
16 11520 SF_test_middle
17 10569 SF_test_middle
18 10641 SF_test_middle
19 10333 SF_test_middle
20 12509 SF_test_middle
21 11687 SF_test_middle
22 10886 SF_test_middle
23 10784 SF_test_middle
24 10997 SF_test_middle
25 10907 SF_test_middle
26 12115 SF_test_middle
27 10922 SF_test_middle
28 10586 SF_test_middle
29 12030 SF_test_middle
30 10627 SF_test_middle
31 12525 SF_test_middle
32 12172 SF_test_middle
33 11346 SF_test_middle
34 11949 SF_test_middle
35 15390 SF_test_middle
36 15348 SF_test_middle
37 14966 SF_test_middle
38 15272 SF_test_middle
39 15287 SF_test_middle
40 15135 SF_test_middle
41 14992 SF_test_middle
42 15006 SF_test_middle
43 15199 SF_test_middle
44 15285 SF_test_middle
45 11489 SF_test_enriched
46 11723 SF_test_enriched
47 9289 SF_test_enriched
48 9734 SF_test_enriched
49 9708 SF_test_enriched
>>> r = get_interest_groups(ConfigClip.test_clip_file, "gene", 6, 3)
>>> r is None
True
"""
ini_df = pd.read_csv(clip_table_file, sep="\t")
df = prepare_df(ini_df, feature)
enrich_com, name_enrich = get_extreme_groups(
df, clip_table_file, "enriched", threshold_feature, default_groups)
impov_com, name_impov = get_extreme_groups(
df, clip_table_file, "impoverished", threshold_feature, default_groups)
if "nosig" in name_enrich and "nosig" in name_impov:
return None
middle_com, name_middle = get_middle_group(df, clip_table_file,
default_groups)
enrich_grp = get_feature_id(ini_df, feature, enrich_com)
impov_grp = get_feature_id(ini_df, feature, impov_com)
middle_grp = get_feature_id(ini_df, feature, middle_com)
return pd.DataFrame({f"id_{feature}": impov_grp + middle_grp + enrich_grp,
"group": [name_impov] * len(impov_grp) +
[name_middle] * len(middle_grp) +
[name_enrich] * len(enrich_grp)})
def create_stat_df(df_comp: pd.DataFrame, cpnt: str) -> pd.DataFrame:
"""
Compute statistical analysis for every group in df_comp.
:param df_comp: The dataframe containing the frequency of every `cpnt_type`
:param cpnt: The component studied
:return: Dataframe of p-values
>>> d = pd.DataFrame({"id_gene": [1, 2, 3, 4, 5, 6, 7, 8],
... "A": [1, 2, 3, 4, 5, 6, 7, 8], "group": ["test"] * 4 + ["ctrl"] * 4})
>>> create_stat_df(d, "A")
grp1 grp2 p-val
0 ctrl test 0.030383
>>> d = pd.DataFrame({"id_gene": list(range(1, 13)),
... "A": list(range(1, 12)) + [5], "group": ["test"] * 4 + ["ctrl"] * 4 +
... ["test2"] * 4})
>>> create_stat_df(d, "A")
grp1 grp2 p-val p-adj
0 ctrl test 0.030383 0.045574
1 ctrl test2 0.245383 0.245383
2 test test2 0.030383 0.045574
"""
comparison = combinations(np.unique(df_comp["group"]), 2)
content = []
for grp1, grp2 in comparison:
p_val = mannwhitneyu(
df_comp.loc[df_comp["group"] == grp1, cpnt].values,
df_comp.loc[df_comp["group"] == grp2, cpnt].values,
alternative="two-sided")[1]
content.append([grp1, grp2, p_val])
df = pd.DataFrame(content, columns=["grp1", "grp2", "p-val"])
if df.shape[0] > 1:
df["p-adj"] = multipletests(df['p-val'].values, method="fdr_bh")[1]
return df
def make_barplot(df_comp: pd.DataFrame, outfile: Path, cpnt: str,
cpnt_type: str, clip_name: str) -> None:
"""
Create a barplot showing the composition for different groups of feature.
:param df_comp: The dataframe containing the frequency of every `cpnt_type`
:param outfile: File were the figure will be created
:param cpnt: The component of interest
:param cpnt_type: The type of component of interest
:param clip_name: The name of the clip studied
"""
sns.set(context="talk")
g = sns.catplot(x="group", y=cpnt, data=df_comp, height=12, aspect=1.7,
kind="violin")
g.fig.suptitle(f"Mean frequency of {cpnt}({cpnt_type})"
f"among different community groups\n"
f"created from the peak_density from clip {clip_name}")
g.ax.set_ylabel(f'Frequency of {cpnt} ({cpnt_type})')
g.savefig(outfile)
plt.clf()
plt.close()
def create_figure_4_clip_table(clip_table_file: Path, feature: str,
threshold_feature: int, default_groups: int,
df_comp: pd.DataFrame, cpnt_type: str,
cpnt: str, df_id_ctrl: pd.DataFrame,
write_comp_table: bool) -> None:
"""
Create a component figure for a clip file
:param clip_table_file: table containing data to produce the clip \
community figure.
:param feature: The feature of interest (gene or exon)
:param threshold_feature: The minimum number of feature to \
recover from enriched/impoverished communities
:param default_groups: The number of community to get if none \
(or not enough) community are enriched or impoverished \
(depending on group parameter)
:param df_comp: The dataframe containing the frequency of every `cpnt_type`
:param cpnt_type: The kind of component of interest
:param cpnt: The kind of component of interest
:param df_id_ctrl: A dataframe indicating control exons
:param write_comp_table: True to save comp table, false else
"""
df_group = get_interest_groups(clip_table_file, feature,
threshold_feature, default_groups)
if feature == "gene":
df_group["id_gene"] = df_group["id_gene"].astype(int)
df_comp["id_gene"] = df_comp["id_gene"].astype(int)
if df_group is None:
return None
df_group = pd.concat([df_group, df_id_ctrl], ignore_index=True)
if len(np.unique(df_group[f"id_{feature}"].values)) != df_group.shape[0]:
raise ValueError("Found duplicates value in df_group, exiting...")
df_comp = df_comp.merge(df_group, how="left", on=f"id_{feature}")
df_comp = df_comp[-df_comp["group"].isna()]
sf_name = clip_table_file.name.rstrip("_bar.txt")
outfiles = [clip_table_file.parent / sf_name /
f"{cpnt_type}_{cpnt}_{sf_name}_thd"
f"{threshold_feature}_dft{default_groups}.{ext}"
for ext in ['txt', 'pdf', 'bar.txt']]
outfiles[0].parent.mkdir(exist_ok=True)
df_stat = create_stat_df(df_comp, cpnt)
df_stat.to_csv(outfiles[0], sep="\t", index=False)
make_barplot(df_comp, outfiles[1], cpnt, cpnt_type, sf_name)
if write_comp_table:
df_comp.to_csv(outfiles[2], sep="\t", index=False)
def create_multiple_figures(clip_result_folder: Path, feature: str,
threshold_feature: int, default_groups: int,
cpnt_type: str, region: str = "gene",
ps: int = 1,
logging_level: str = "DEBUG") -> None:
"""
Create multiple composition figures from a result clip folder.
:param clip_result_folder: Folder containing tables used to \
create clip community figures
:param feature: The feature of interest (exon, or gene)
:param threshold_feature: The minimum number of feature needed \
inside communities enriched or impoverished in peak_density to consider \
those community (otherwise `default_groups` default number of community \
are taken
:param default_groups: The number of communities to take as default.
:param cpnt_type: The kind of component analysed
:param region: Only used when feature is gene. Corresponds to \
the region studied within genes.
:param ps: The number of processes to create
:param logging_level: The level of data to display (default 'DISABLE')
"""
logging_def(ConfigGraph.community_folder, __file__, logging_level)
list_tables = find_final_clip_tables(clip_result_folder)
list_ft = get_features(cpnt_type)
if len(list_ft) == 0:
logging.warning(f"No clip table found in {clip_result_folder} !\n"
f"Exiting...")
return None
cnx = sqlite3.connect(Config.db_file)
df_comp = get_cpnt_frequency(cnx, get_ft_id(cnx, feature), feature, region,
cpnt_type)
df_ctrl = add_ctrl_df(pd.read_csv(list_tables[0], sep="\t"), feature)
conditions = list(product(list_tables, list_ft))
ps = min(len(conditions), ps)
pool = mp.Pool(processes=ps if ps > 0 else 1)
processes = []
conditions = [(Path("results/community_of_co-localized-exons/communities/weight-3_global_weight-3/CLIP_community_figures/PCBP1_merged_bar.txt"), "A")]
write_bar_table = True
for clip_file, cpnt in conditions:
args = [clip_file, feature, threshold_feature, default_groups,
df_comp, cpnt_type, cpnt, df_ctrl, write_bar_table]
write_bar_table = False
processes.append(pool.apply_async(create_figure_4_clip_table, args))
[p.get(timeout=None) for p in processes]
pool.close()
pool.join()
if __name__ == "__main__":
doctest.testmod()
#!/usr/bin/env python3
# -*- coding: UTF-8 -*-
"""
Description: Launch clip_compo_analyser
"""
import lazyparser as lp
from .clip_compo_analyser import create_multiple_figures
from pathlib import Path
import multiprocessing as mp
@lp.parse(ps = range(1, mp.cpu_count() + 1), feature=["gene", "exon"],
threshold_feature="9 < threshold_feature < 10000",
default_groups=range(10, 101), cpnt_type=["nt", "dnt", "tnt", "aa"],
region=["gene", "exon", "intron"])
def launcher(clip_result_folder: str, feature: str,
threshold_feature: int, default_groups: int,
cpnt_type: str, region: str = "gene",
ps: int = 1, logging_level: str = "DISABLE") -> None:
"""
Create multiple composition figures from a result clip folder.
:param clip_result_folder: Folder containing tables used to \
create clip community figures
:param feature: The feature of interest (exon, or gene)
:param threshold_feature: The minimum number of feature needed \
inside communities enriched or impoverished in peak_density to consider \
those community (otherwise `default_groups` default number of community \
are taken
:param default_groups: The number of communities to take as default.
:param cpnt_type: The kind of component analysed
:param region: Only used when feature is gene. Corresponds to \
the region studied within genes. (default: 'gene')
:param ps: The number of processes to create (default 1)
:param logging_level: The level of data to display (default 'DISABLE')
"""
clip_result_folder = Path(clip_result_folder)
if not clip_result_folder.is_dir():
raise NotADirectoryError(f"Directory {clip_result_folder} don't exist")
create_multiple_figures(clip_result_folder, feature, threshold_feature,
default_groups, cpnt_type, region, ps,
logging_level)
# launcher()
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Please register or to comment