diff --git a/src/find_interaction_cluster/clip_figures/clip_compo_analyser.py b/src/find_interaction_cluster/clip_figures/clip_compo_analyser.py new file mode 100644 index 0000000000000000000000000000000000000000..f4e064ef7298413d2bff6096e0180f288fadbb93 --- /dev/null +++ b/src/find_interaction_cluster/clip_figures/clip_compo_analyser.py @@ -0,0 +1,465 @@ +#!/usr/bin/env python3 + +# -*- coding: UTF-8 -*- + +""" +Description: The goal of this script is to check whether a communities \ +of genes that are more frequently bind by a splicing factor \ +has a composition biais. +""" + +from pathlib import Path +from typing import List, Tuple, Optional +import pandas as pd +from .config import ConfigClip, Config, ConfigGraph +import doctest +from math import floor, ceil +from ..nt_and_community import get_ft_id, get_cpnt_frequency, get_features +import sqlite3 +import numpy as np +from scipy.stats import mannwhitneyu +from itertools import combinations, product +from statsmodels.stats.multitest import multipletests +import seaborn as sns +import matplotlib.pyplot as plt +from ...logging_conf import logging_def +import logging +import multiprocessing as mp + + +def find_final_clip_tables(folder: Path) -> List[Path]: + """ + From a folder return every files ending with *_bar.txt. + + :param folder: A folder containing final tables used to produce \ + community figures for clip data (figure produced by the function \ + clip_folder_analysis of the script clip_analyse.py). + :return: The list of files ending with *_bar.txt + + >>> find_final_clip_tables(Path("fziufriubdsibf")) + Traceback (most recent call last): + ... + NotADirectoryError: Folder fziufriubdsibf doesn't exists ! + >>> find_final_clip_tables(Path("src/")) + [] + """ + if not folder.is_dir(): + raise NotADirectoryError(f"Folder {folder} doesn't exists !") + return list(folder.glob("*_bar.txt")) + + +def prepare_df(clip_df: pd.DataFrame, feature: str) -> pd.DataFrame: + """ + Prepare the table containing data to produce the clip community figure. + + :param clip_df: table containing data to produce the clip \ + community figure. + :param feature: The feature of interest (gene or exon) + :return: The filtered table + + >>> d = prepare_df(pd.read_table(ConfigClip.test_clip_file, sep="\\t"), + ... "gene") + >>> d.head() + community community_size p-adj reg-adj peak_density + 8 C852 1.0 0.015356 - 0.000000 + 9 C91 2.0 0.027338 - 0.000000 + 0 C10 3.0 0.228296 . 0.046749 + 1 C165 12.0 0.252854 . 0.271487 + 3 C325 18.0 0.228296 . 0.272363 + >>> d["community"].to_list() == ['C852', 'C91', 'C10', 'C165', 'C325', + ... 'C232', 'C397', 'C952', 'C365', 'C680', 'C916', 'C690'] + True + """ + df = clip_df.copy() + df = df.loc[df[f"id_{feature}"] != "ctrl", + ["peak_density", "community", "community_size", "p-adj", + "reg-adj", f"id_{feature}"]] + df = df.groupby(["community", "community_size", "p-adj", "reg-adj"] + ).mean().reset_index() + df = df.sort_values("peak_density", ascending=True) + return df + + +def get_feature_id(clip_df: pd.DataFrame, feature: str, + list_community: List[str]) -> pd.DataFrame: + """ + Get the list of gene contained in the list of communities given. + + :param clip_df: table containing data to produce the clip \ + community figure. + :param feature: The feature of interest (gene or exon) + :param list_community: A list of community + :return: The filtered table) + + >>> d = pd.read_table(ConfigClip.test_clip_file, sep="\\t") + >>> get_feature_id(d, "gene", ['C916', 'C690']) + ['11489', '11723', '9289', '9734', '9708'] + >>> get_feature_id(d, "gene", ['C680', 'C916', + ... 'C690']) + ['16283', '16852', '11489', '11723', '9289', '9734', '9708'] + >>> get_feature_id(d, "gene", ['C91', 'C852']) + ['13856', '13360', '7831'] + >>> get_feature_id(d, "gene", ['C91', 'C852', + ... 'C10']) + ['13856', '13360', '7831', '16763', '16676', '16817'] + """ + df = clip_df.copy() + df = df[df[f"id_{feature}"] != "ctrl"] + return df[df["community"].isin(list_community)][f"id_{feature}"].to_list() + + +def get_extreme_groups(df: pd.DataFrame, table_file: Path, group: str, + threshold_feature: int, + default_groups: int) -> Tuple[List, str]: + """ + Recover the impoverished or enriched communities for peaks \ + in a given splicing factor if they exists. + + :param df: A dataframe used to build the community figures \ + of clip density. + :param table_file: The file from which the dataframe df was created + :param group: The kind of community to recover (enriched or impoverished) + :param threshold_feature: The minimum number of feature to \ + recover from enriched/impoverished communities + :param default_groups: The number of community to get if none \ + (or not enough) community are enriched or impoverished \ + (depending on group parameter) + :return: The list of enriched or impoverished community of feature \ + and the name of the list + + >>> d = prepare_df(pd.read_table(ConfigClip.test_clip_file, sep="\\t"), + ... "gene") + >>> get_extreme_groups(d, ConfigClip.test_clip_file, "enriched", 5, 3) + (['C916', 'C690'], 'SF_test_enriched') + >>> get_extreme_groups(d, ConfigClip.test_clip_file, "enriched", 6, 3) + (['C680', 'C916', 'C690'], 'SF_test_enriched_nosig') + >>> get_extreme_groups(d, ConfigClip.test_clip_file, "impoverished", 3, 3) + (['C852', 'C91'], 'SF_test_impoverished') + >>> get_extreme_groups(d, ConfigClip.test_clip_file, "impoverished", 4, 3) + (['C852', 'C91', 'C10'], 'SF_test_impoverished_nosig') + """ + outname = table_file.name.rstrip("bar.txt") + group + my_reg = " + " if group == "enriched" else " - " + tmp = df[df["reg-adj"] == my_reg] + if not tmp.empty and sum(tmp["community_size"]) >= threshold_feature: + return tmp["community"].to_list(), outname + outname += "_nosig" + list_com = df["community"].to_list() + if group == "enriched": + return list_com[-default_groups:], outname + else: + return list_com[:default_groups], outname + + +def get_middle_group(df: pd.DataFrame, table_file: Path, + default_groups: int) -> Tuple[List, str]: + """ + Recover the impoverished or enriched communities for peaks \ + in a given splicing factor if they exists. + + :param df: A dataframe used to build the community figures \ + of clip density. + :param table_file: The file from which the dataframe df was created + :param default_groups: The number of community to get if none \ + (or not enough) community are enriched or impoverished \ + (depending on group parameter) + :return: The list of enriched or impoverished community of feature \ + and the name of the list + + >>> d = prepare_df(pd.read_table(ConfigClip.test_clip_file, sep="\\t"), + ... "gene") + >>> get_middle_group(d, ConfigClip.test_clip_file, 3) + (['C232', 'C397', 'C952'], 'SF_test_middle') + >>> get_middle_group(d, ConfigClip.test_clip_file, 4) + (['C325', 'C232', 'C397', 'C952'], 'SF_test_middle') + """ + outname = table_file.name.rstrip("bar.txt") + "middle" + list_com = df["community"].to_list() + index = floor(len(list_com) / 2) + return list_com[index - floor(default_groups / 2): + index + ceil(default_groups / 2)], outname + + +def add_ctrl_df(df: pd.DataFrame, feature: str) -> pd.DataFrame: + """ + Create a dataframe containing every genes/exons not in a community. + + :param df: The dataframe used to create a community figure + :param feature: The feature of interest (exon or gene) + :return: A table containing 2 columns: id_{feature} and group \ + (containing 'ctrl') + + >>> import time + >>> s = time.time() + >>> add_ctrl_df(pd.DataFrame({"id_gene": list(range(1, 19420)) + ["ctrl"] + ... }), "gene") + id_gene group + 0 19420 ctrl + 1 19421 ctrl + 2 19422 ctrl + 3 19423 ctrl + 4 19424 ctrl + 5 19425 ctrl + 6 19426 ctrl + """ + list_id = get_ft_id(sqlite3.connect(Config.db_file), feature) + com_id = df[df[f"id_{feature}"] != "ctrl"][f"id_{feature}"].to_list() + if feature == "gene": + list_id = list(map(int, list_id)) + com_id = list(map(int, com_id)) + ctrl_id = [mid for mid in list_id if mid not in com_id] + return pd.DataFrame({f"id_{feature}": ctrl_id, + "group": ["ctrl"] * len(ctrl_id)}) + + +def get_interest_groups(clip_table_file: Path, feature: str, + threshold_feature: int, default_groups: int + ) -> Optional[pd.DataFrame]: + """ + + :param clip_table_file: table containing data to produce the clip \ + community figure. + :param feature: The feature of interest (gene or exon) + :param threshold_feature: The minimum number of feature to \ + recover from enriched/impoverished communities + :param default_groups: The number of community to get if none \ + (or not enough) community are enriched or impoverished \ + (depending on group parameter) + :return: The dataframe indicating for a id of a feature, the groups where \ + if belong. + + >>> get_interest_groups(ConfigClip.test_clip_file, "gene", 3, 3) + id_gene group + 0 13856 SF_test_impoverished + 1 13360 SF_test_impoverished + 2 7831 SF_test_impoverished + 3 17226 SF_test_middle + 4 17418 SF_test_middle + 5 16985 SF_test_middle + 6 16897 SF_test_middle + 7 16536 SF_test_middle + 8 17214 SF_test_middle + 9 17271 SF_test_middle + 10 17360 SF_test_middle + 11 16022 SF_test_middle + 12 16660 SF_test_middle + 13 17080 SF_test_middle + 14 17038 SF_test_middle + 15 10558 SF_test_middle + 16 11520 SF_test_middle + 17 10569 SF_test_middle + 18 10641 SF_test_middle + 19 10333 SF_test_middle + 20 12509 SF_test_middle + 21 11687 SF_test_middle + 22 10886 SF_test_middle + 23 10784 SF_test_middle + 24 10997 SF_test_middle + 25 10907 SF_test_middle + 26 12115 SF_test_middle + 27 10922 SF_test_middle + 28 10586 SF_test_middle + 29 12030 SF_test_middle + 30 10627 SF_test_middle + 31 12525 SF_test_middle + 32 12172 SF_test_middle + 33 11346 SF_test_middle + 34 11949 SF_test_middle + 35 15390 SF_test_middle + 36 15348 SF_test_middle + 37 14966 SF_test_middle + 38 15272 SF_test_middle + 39 15287 SF_test_middle + 40 15135 SF_test_middle + 41 14992 SF_test_middle + 42 15006 SF_test_middle + 43 15199 SF_test_middle + 44 15285 SF_test_middle + 45 11489 SF_test_enriched + 46 11723 SF_test_enriched + 47 9289 SF_test_enriched + 48 9734 SF_test_enriched + 49 9708 SF_test_enriched + >>> r = get_interest_groups(ConfigClip.test_clip_file, "gene", 6, 3) + >>> r is None + True + """ + ini_df = pd.read_csv(clip_table_file, sep="\t") + df = prepare_df(ini_df, feature) + enrich_com, name_enrich = get_extreme_groups( + df, clip_table_file, "enriched", threshold_feature, default_groups) + impov_com, name_impov = get_extreme_groups( + df, clip_table_file, "impoverished", threshold_feature, default_groups) + if "nosig" in name_enrich and "nosig" in name_impov: + return None + middle_com, name_middle = get_middle_group(df, clip_table_file, + default_groups) + enrich_grp = get_feature_id(ini_df, feature, enrich_com) + impov_grp = get_feature_id(ini_df, feature, impov_com) + middle_grp = get_feature_id(ini_df, feature, middle_com) + return pd.DataFrame({f"id_{feature}": impov_grp + middle_grp + enrich_grp, + "group": [name_impov] * len(impov_grp) + + [name_middle] * len(middle_grp) + + [name_enrich] * len(enrich_grp)}) + + +def create_stat_df(df_comp: pd.DataFrame, cpnt: str) -> pd.DataFrame: + """ + Compute statistical analysis for every group in df_comp. + + :param df_comp: The dataframe containing the frequency of every `cpnt_type` + :param cpnt: The component studied + :return: Dataframe of p-values + + >>> d = pd.DataFrame({"id_gene": [1, 2, 3, 4, 5, 6, 7, 8], + ... "A": [1, 2, 3, 4, 5, 6, 7, 8], "group": ["test"] * 4 + ["ctrl"] * 4}) + >>> create_stat_df(d, "A") + grp1 grp2 p-val + 0 ctrl test 0.030383 + >>> d = pd.DataFrame({"id_gene": list(range(1, 13)), + ... "A": list(range(1, 12)) + [5], "group": ["test"] * 4 + ["ctrl"] * 4 + + ... ["test2"] * 4}) + >>> create_stat_df(d, "A") + grp1 grp2 p-val p-adj + 0 ctrl test 0.030383 0.045574 + 1 ctrl test2 0.245383 0.245383 + 2 test test2 0.030383 0.045574 + + """ + comparison = combinations(np.unique(df_comp["group"]), 2) + content = [] + for grp1, grp2 in comparison: + p_val = mannwhitneyu( + df_comp.loc[df_comp["group"] == grp1, cpnt].values, + df_comp.loc[df_comp["group"] == grp2, cpnt].values, + alternative="two-sided")[1] + content.append([grp1, grp2, p_val]) + df = pd.DataFrame(content, columns=["grp1", "grp2", "p-val"]) + if df.shape[0] > 1: + df["p-adj"] = multipletests(df['p-val'].values, method="fdr_bh")[1] + return df + + +def make_barplot(df_comp: pd.DataFrame, outfile: Path, cpnt: str, + cpnt_type: str, clip_name: str) -> None: + """ + Create a barplot showing the composition for different groups of feature. + + :param df_comp: The dataframe containing the frequency of every `cpnt_type` + :param outfile: File were the figure will be created + :param cpnt: The component of interest + :param cpnt_type: The type of component of interest + :param clip_name: The name of the clip studied + """ + sns.set(context="talk") + g = sns.catplot(x="group", y=cpnt, data=df_comp, height=12, aspect=1.7, + kind="violin") + g.fig.suptitle(f"Mean frequency of {cpnt}({cpnt_type})" + f"among different community groups\n" + f"created from the peak_density from clip {clip_name}") + g.ax.set_ylabel(f'Frequency of {cpnt} ({cpnt_type})') + g.savefig(outfile) + plt.clf() + plt.close() + + +def create_figure_4_clip_table(clip_table_file: Path, feature: str, + threshold_feature: int, default_groups: int, + df_comp: pd.DataFrame, cpnt_type: str, + cpnt: str, df_id_ctrl: pd.DataFrame, + write_comp_table: bool) -> None: + """ + Create a component figure for a clip file + + :param clip_table_file: table containing data to produce the clip \ + community figure. + :param feature: The feature of interest (gene or exon) + :param threshold_feature: The minimum number of feature to \ + recover from enriched/impoverished communities + :param default_groups: The number of community to get if none \ + (or not enough) community are enriched or impoverished \ + (depending on group parameter) + :param df_comp: The dataframe containing the frequency of every `cpnt_type` + :param cpnt_type: The kind of component of interest + :param cpnt: The kind of component of interest + :param df_id_ctrl: A dataframe indicating control exons + :param write_comp_table: True to save comp table, false else + """ + df_group = get_interest_groups(clip_table_file, feature, + threshold_feature, default_groups) + if feature == "gene": + df_group["id_gene"] = df_group["id_gene"].astype(int) + df_comp["id_gene"] = df_comp["id_gene"].astype(int) + if df_group is None: + return None + df_group = pd.concat([df_group, df_id_ctrl], ignore_index=True) + + if len(np.unique(df_group[f"id_{feature}"].values)) != df_group.shape[0]: + + raise ValueError("Found duplicates value in df_group, exiting...") + df_comp = df_comp.merge(df_group, how="left", on=f"id_{feature}") + df_comp = df_comp[-df_comp["group"].isna()] + sf_name = clip_table_file.name.rstrip("_bar.txt") + outfiles = [clip_table_file.parent / sf_name / + f"{cpnt_type}_{cpnt}_{sf_name}_thd" + f"{threshold_feature}_dft{default_groups}.{ext}" + for ext in ['txt', 'pdf', 'bar.txt']] + outfiles[0].parent.mkdir(exist_ok=True) + df_stat = create_stat_df(df_comp, cpnt) + df_stat.to_csv(outfiles[0], sep="\t", index=False) + make_barplot(df_comp, outfiles[1], cpnt, cpnt_type, sf_name) + if write_comp_table: + df_comp.to_csv(outfiles[2], sep="\t", index=False) + + +def create_multiple_figures(clip_result_folder: Path, feature: str, + threshold_feature: int, default_groups: int, + cpnt_type: str, region: str = "gene", + ps: int = 1, + logging_level: str = "DEBUG") -> None: + """ + Create multiple composition figures from a result clip folder. + + :param clip_result_folder: Folder containing tables used to \ + create clip community figures + :param feature: The feature of interest (exon, or gene) + :param threshold_feature: The minimum number of feature needed \ + inside communities enriched or impoverished in peak_density to consider \ + those community (otherwise `default_groups` default number of community \ + are taken + :param default_groups: The number of communities to take as default. + :param cpnt_type: The kind of component analysed + :param region: Only used when feature is gene. Corresponds to \ + the region studied within genes. + :param ps: The number of processes to create + :param logging_level: The level of data to display (default 'DISABLE') + """ + logging_def(ConfigGraph.community_folder, __file__, logging_level) + list_tables = find_final_clip_tables(clip_result_folder) + list_ft = get_features(cpnt_type) + if len(list_ft) == 0: + logging.warning(f"No clip table found in {clip_result_folder} !\n" + f"Exiting...") + return None + cnx = sqlite3.connect(Config.db_file) + df_comp = get_cpnt_frequency(cnx, get_ft_id(cnx, feature), feature, region, + cpnt_type) + df_ctrl = add_ctrl_df(pd.read_csv(list_tables[0], sep="\t"), feature) + conditions = list(product(list_tables, list_ft)) + ps = min(len(conditions), ps) + pool = mp.Pool(processes=ps if ps > 0 else 1) + processes = [] + conditions = [(Path("results/community_of_co-localized-exons/communities/weight-3_global_weight-3/CLIP_community_figures/PCBP1_merged_bar.txt"), "A")] + write_bar_table = True + for clip_file, cpnt in conditions: + args = [clip_file, feature, threshold_feature, default_groups, + df_comp, cpnt_type, cpnt, df_ctrl, write_bar_table] + write_bar_table = False + processes.append(pool.apply_async(create_figure_4_clip_table, args)) + [p.get(timeout=None) for p in processes] + pool.close() + pool.join() + + +if __name__ == "__main__": + doctest.testmod() diff --git a/src/find_interaction_cluster/clip_figures/clip_compo_launcher.py b/src/find_interaction_cluster/clip_figures/clip_compo_launcher.py new file mode 100644 index 0000000000000000000000000000000000000000..e4f1be61df44ec8a3c20e341eaba88f7b0788726 --- /dev/null +++ b/src/find_interaction_cluster/clip_figures/clip_compo_launcher.py @@ -0,0 +1,48 @@ +#!/usr/bin/env python3 + +# -*- coding: UTF-8 -*- + +""" +Description: Launch clip_compo_analyser +""" + +import lazyparser as lp +from .clip_compo_analyser import create_multiple_figures +from pathlib import Path +import multiprocessing as mp + + +@lp.parse(ps = range(1, mp.cpu_count() + 1), feature=["gene", "exon"], + threshold_feature="9 < threshold_feature < 10000", + default_groups=range(10, 101), cpnt_type=["nt", "dnt", "tnt", "aa"], + region=["gene", "exon", "intron"]) +def launcher(clip_result_folder: str, feature: str, + threshold_feature: int, default_groups: int, + cpnt_type: str, region: str = "gene", + ps: int = 1, logging_level: str = "DISABLE") -> None: + """ + Create multiple composition figures from a result clip folder. + + :param clip_result_folder: Folder containing tables used to \ + create clip community figures + :param feature: The feature of interest (exon, or gene) + :param threshold_feature: The minimum number of feature needed \ + inside communities enriched or impoverished in peak_density to consider \ + those community (otherwise `default_groups` default number of community \ + are taken + :param default_groups: The number of communities to take as default. + :param cpnt_type: The kind of component analysed + :param region: Only used when feature is gene. Corresponds to \ + the region studied within genes. (default: 'gene') + :param ps: The number of processes to create (default 1) + :param logging_level: The level of data to display (default 'DISABLE') + """ + clip_result_folder = Path(clip_result_folder) + if not clip_result_folder.is_dir(): + raise NotADirectoryError(f"Directory {clip_result_folder} don't exist") + create_multiple_figures(clip_result_folder, feature, threshold_feature, + default_groups, cpnt_type, region, ps, + logging_level) + + +# launcher()