diff --git a/src/find_interaction_cluster/clip_figures/clip_compo_analyser.py b/src/find_interaction_cluster/clip_figures/clip_compo_analyser.py index 827fdbbc06efe63815a65c7649a61079712a6608..b4186e34d381626a03293ef311dfaf5c4b854d4a 100644 --- a/src/find_interaction_cluster/clip_figures/clip_compo_analyser.py +++ b/src/find_interaction_cluster/clip_figures/clip_compo_analyser.py @@ -9,10 +9,9 @@ has a composition biais. """ from pathlib import Path -from typing import List, Tuple, Optional +from typing import List, Tuple, Optional, Dict import pandas as pd from .config import ConfigClip, Config, ConfigGraph -import doctest from math import floor, ceil from ..nt_and_community import get_ft_id, get_cpnt_frequency, get_features import sqlite3 @@ -25,6 +24,8 @@ import matplotlib.pyplot as plt from ...logging_conf import logging_def import logging import multiprocessing as mp +import subprocess as sp +import lazyparser as lp def find_final_clip_tables(folder: Path) -> List[Path]: @@ -108,7 +109,7 @@ def get_feature_id(clip_df: pd.DataFrame, feature: str, return df[df["community"].isin(list_community)][f"id_{feature}"].to_list() -def get_extreme_groups(df: pd.DataFrame, table_file: Path, group: str, +def get_extreme_groups(df: pd.DataFrame, group: str, threshold_feature: int, default_groups: int) -> Tuple[List, str]: """ @@ -117,7 +118,6 @@ def get_extreme_groups(df: pd.DataFrame, table_file: Path, group: str, :param df: A dataframe used to build the community figures \ of clip density. - :param table_file: The file from which the dataframe df was created :param group: The kind of community to recover (enriched or impoverished) :param threshold_feature: The minimum number of feature to \ recover from enriched/impoverished communities @@ -129,16 +129,16 @@ def get_extreme_groups(df: pd.DataFrame, table_file: Path, group: str, >>> d = prepare_df(pd.read_table(ConfigClip.test_clip_file, sep="\\t"), ... "gene") - >>> get_extreme_groups(d, ConfigClip.test_clip_file, "enriched", 5, 3) - (['C916', 'C690'], 'SF_test_enriched') - >>> get_extreme_groups(d, ConfigClip.test_clip_file, "enriched", 6, 3) - (['C680', 'C916', 'C690'], 'SF_test_enriched_nosig') - >>> get_extreme_groups(d, ConfigClip.test_clip_file, "impoverished", 3, 3) - (['C852', 'C91'], 'SF_test_impoverished') - >>> get_extreme_groups(d, ConfigClip.test_clip_file, "impoverished", 4, 3) - (['C852', 'C91', 'C10'], 'SF_test_impoverished_nosig') - """ - outname = table_file.name.rstrip("bar.txt") + group + >>> get_extreme_groups(d, "enriched", 5, 3) + (['C916', 'C690'], 'enriched') + >>> get_extreme_groups(d, "enriched", 6, 3) + (['C680', 'C916', 'C690'], 'enriched_nosig') + >>> get_extreme_groups(d, "impoverished", 3, 3) + (['C852', 'C91'], 'impoverished') + >>> get_extreme_groups(d, "impoverished", 4, 3) + (['C852', 'C91', 'C10'], 'impoverished_nosig') + """ + outname = group my_reg = " + " if group == "enriched" else " - " tmp = df[df["reg-adj"] == my_reg] if not tmp.empty and sum(tmp["community_size"]) >= threshold_feature: @@ -151,15 +151,14 @@ def get_extreme_groups(df: pd.DataFrame, table_file: Path, group: str, return list_com[:default_groups], outname -def get_middle_group(df: pd.DataFrame, table_file: Path, - default_groups: int) -> Tuple[List, str]: +def get_middle_group(df: pd.DataFrame, default_groups: int + ) -> Tuple[List, str]: """ Recover the impoverished or enriched communities for peaks \ in a given splicing factor if they exists. :param df: A dataframe used to build the community figures \ of clip density. - :param table_file: The file from which the dataframe df was created :param default_groups: The number of community to get if none \ (or not enough) community are enriched or impoverished \ (depending on group parameter) @@ -168,16 +167,19 @@ def get_middle_group(df: pd.DataFrame, table_file: Path, >>> d = prepare_df(pd.read_table(ConfigClip.test_clip_file, sep="\\t"), ... "gene") - >>> get_middle_group(d, ConfigClip.test_clip_file, 3) - (['C232', 'C397', 'C952'], 'SF_test_middle') - >>> get_middle_group(d, ConfigClip.test_clip_file, 4) - (['C325', 'C232', 'C397', 'C952'], 'SF_test_middle') + >>> get_middle_group(d, 3) + (['C232', 'C397', 'C952'], 'middle') + >>> get_middle_group(d, 4) + (['C325', 'C232', 'C397', 'C952'], 'middle') """ - outname = table_file.name.rstrip("bar.txt") + "middle" + outname = "middle" list_com = df["community"].to_list() index = floor(len(list_com) / 2) - return list_com[index - floor(default_groups / 2): - index + ceil(default_groups / 2)], outname + middle = list_com[index - floor(default_groups / 2): + index + ceil(default_groups / 2)] + middle_com = df[(df["community"].isin(middle)) & + (df['reg-adj'] == " . ")]["community"].to_list() + return middle_com, outname def add_ctrl_df(df: pd.DataFrame, feature: str) -> pd.DataFrame: @@ -229,57 +231,57 @@ def get_interest_groups(clip_table_file: Path, feature: str, if belong. >>> get_interest_groups(ConfigClip.test_clip_file, "gene", 3, 3) - id_gene group - 0 13856 SF_test_impoverished - 1 13360 SF_test_impoverished - 2 7831 SF_test_impoverished - 3 17226 SF_test_middle - 4 17418 SF_test_middle - 5 16985 SF_test_middle - 6 16897 SF_test_middle - 7 16536 SF_test_middle - 8 17214 SF_test_middle - 9 17271 SF_test_middle - 10 17360 SF_test_middle - 11 16022 SF_test_middle - 12 16660 SF_test_middle - 13 17080 SF_test_middle - 14 17038 SF_test_middle - 15 10558 SF_test_middle - 16 11520 SF_test_middle - 17 10569 SF_test_middle - 18 10641 SF_test_middle - 19 10333 SF_test_middle - 20 12509 SF_test_middle - 21 11687 SF_test_middle - 22 10886 SF_test_middle - 23 10784 SF_test_middle - 24 10997 SF_test_middle - 25 10907 SF_test_middle - 26 12115 SF_test_middle - 27 10922 SF_test_middle - 28 10586 SF_test_middle - 29 12030 SF_test_middle - 30 10627 SF_test_middle - 31 12525 SF_test_middle - 32 12172 SF_test_middle - 33 11346 SF_test_middle - 34 11949 SF_test_middle - 35 15390 SF_test_middle - 36 15348 SF_test_middle - 37 14966 SF_test_middle - 38 15272 SF_test_middle - 39 15287 SF_test_middle - 40 15135 SF_test_middle - 41 14992 SF_test_middle - 42 15006 SF_test_middle - 43 15199 SF_test_middle - 44 15285 SF_test_middle - 45 11489 SF_test_enriched - 46 11723 SF_test_enriched - 47 9289 SF_test_enriched - 48 9734 SF_test_enriched - 49 9708 SF_test_enriched + id_gene group + 0 13856 impoverished + 1 13360 impoverished + 2 7831 impoverished + 3 17226 middle + 4 17418 middle + 5 16985 middle + 6 16897 middle + 7 16536 middle + 8 17214 middle + 9 17271 middle + 10 17360 middle + 11 16022 middle + 12 16660 middle + 13 17080 middle + 14 17038 middle + 15 10558 middle + 16 11520 middle + 17 10569 middle + 18 10641 middle + 19 10333 middle + 20 12509 middle + 21 11687 middle + 22 10886 middle + 23 10784 middle + 24 10997 middle + 25 10907 middle + 26 12115 middle + 27 10922 middle + 28 10586 middle + 29 12030 middle + 30 10627 middle + 31 12525 middle + 32 12172 middle + 33 11346 middle + 34 11949 middle + 35 15390 middle + 36 15348 middle + 37 14966 middle + 38 15272 middle + 39 15287 middle + 40 15135 middle + 41 14992 middle + 42 15006 middle + 43 15199 middle + 44 15285 middle + 45 11489 enriched + 46 11723 enriched + 47 9289 enriched + 48 9734 enriched + 49 9708 enriched >>> r = get_interest_groups(ConfigClip.test_clip_file, "gene", 6, 3) >>> r is None True @@ -287,13 +289,12 @@ def get_interest_groups(clip_table_file: Path, feature: str, ini_df = pd.read_csv(clip_table_file, sep="\t") df = prepare_df(ini_df, feature) enrich_com, name_enrich = get_extreme_groups( - df, clip_table_file, "enriched", threshold_feature, default_groups) + df, "enriched", threshold_feature, default_groups) impov_com, name_impov = get_extreme_groups( - df, clip_table_file, "impoverished", threshold_feature, default_groups) + df, "impoverished", threshold_feature, default_groups) if "nosig" in name_enrich and "nosig" in name_impov: return None - middle_com, name_middle = get_middle_group(df, clip_table_file, - default_groups) + middle_com, name_middle = get_middle_group(df, default_groups) enrich_grp = get_feature_id(ini_df, feature, enrich_com) impov_grp = get_feature_id(ini_df, feature, impov_com) middle_grp = get_feature_id(ini_df, feature, middle_com) @@ -425,8 +426,10 @@ def create_figure_4_clip_table(clip_table_file: Path, feature: str, df_comp = df_comp.merge(df_group, how="left", on=f"id_{feature}") df_comp = df_comp[-df_comp["group"].isna()] sf_name = clip_table_file.name.rstrip("_bar.txt") - outfiles = [clip_table_file.parent / sf_name / - f"{cpnt_type}_{cpnt}_{sf_name}_thd" + res_folder = clip_table_file.parent / f"{cpnt_type}_analysis" + res_folder.mkdir(exist_ok=True) + outfiles = [res_folder / sf_name / + f"{cpnt_type}_{cpnt}_{sf_name}_thd" f"{threshold_feature}_dft{default_groups}.{ext}" for ext in ['txt', 'pdf', 'bar.txt']] outfiles[0].parent.mkdir(exist_ok=True) @@ -489,5 +492,208 @@ def create_multiple_figures(clip_result_folder: Path, feature: str, pool.join() +############################################################################ +# Functions for composition analysis on +# A clip folder produced by clip_launcher_4_many_communities +############################################################################ + + +def find_clip_directories(folder: Path) -> List[Path]: + """ + From a folder return every folder inside it. + + :param folder: A folder containing other subfolder. This folder \ + should have been produced by `clip_launcher_4_many_communities`. + :return: The list of files ending with *_bar.txt + + >>> find_clip_directories(Path("fziufriubdsibf")) + Traceback (most recent call last): + ... + NotADirectoryError: Folder fziufriubdsibf doesn't exists ! + """ + if not folder.is_dir(): + raise NotADirectoryError(f"Folder {folder} doesn't exists !") + return [d for d in list(folder.glob("*")) if d.is_dir()] + + +def get_ctrl_dic(list_files: List[Path], feature: str) -> Dict: + """ + Create a dictionary of control exons/genes + :param list_files: A list of clip density table + :param feature: gene or exon + :return: A dictionary containing the controls for each clip files + """ + return { + f.name.replace(".tmp_bar.txt", ""): add_ctrl_df( + pd.read_csv(f, sep="\t"), feature + ) + for f in list_files + } + + +def merge_figures(folder: Path) -> None: + """ + Merge the figures together using imageMagick + + :param folder: A folder containing pdf files + """ + fig_name = folder.parent.name + cmd = f"montage -geometry +1+1 -tile 1X3 " \ + f"-compress jpeg -density 100 -label %t -pointsize 50 " \ + f"{folder}/*dft*.pdf {folder}/{fig_name}.pdf" + sp.check_call(cmd, shell=True) + + +def create_figure_multi_clip_table(clip_table_file: Path, feature: str, + threshold_feature: int, default_groups: int, + df_comp: pd.DataFrame, cpnt_type: str, + cpnt: str, df_id_ctrl: pd.DataFrame, + write_comp_table: bool, + display_size: bool) -> Optional[Path]: + """ + Create a component figure for a clip file + + :param clip_table_file: table containing data to produce the clip \ + community figure. + :param feature: The feature of interest (gene or exon) + :param threshold_feature: The minimum number of feature to \ + recover from enriched/impoverished communities + :param default_groups: The number of community to get if none \ + (or not enough) community are enriched or impoverished \ + (depending on group parameter) + :param df_comp: The dataframe containing the frequency of every `cpnt_type` + :param cpnt_type: The kind of component of interest + :param cpnt: The kind of component of interest + :param df_id_ctrl: A dataframe indicating control exons + :param write_comp_table: True to save comp table, false else + :param display_size: True to display the size of the community . + False to display nothing. (default False) + :return: The folder where the figure is created + """ + logging.info(f"Working on {clip_table_file.parent.name} - " + f"{clip_table_file.name} - {cpnt} ({cpnt_type})") + df_group = get_interest_groups(clip_table_file, feature, + threshold_feature, default_groups) + if df_group is None: + return None + df_group = df_group.drop_duplicates() + if feature == "gene": + df_group["id_gene"] = df_group["id_gene"].astype(int) + df_comp["id_gene"] = df_comp["id_gene"].astype(int) + df_group = pd.concat([df_group, df_id_ctrl], ignore_index=True) + if len(np.unique(df_group[f"id_{feature}"].values)) != df_group.shape[0]: + print(df_group[df_group.duplicated(subset=f"id_{feature}")]) + raise ValueError("Found duplicates value in df_group, exiting...") + df_comp = df_comp.merge(df_group, how="left", on=f"id_{feature}") + df_comp = df_comp[-df_comp["group"].isna()] + sf_name = clip_table_file.parent.name + res_folder = clip_table_file.parent / f"{cpnt_type}_analysis" + res_folder.mkdir(exist_ok=True) + outfiles = [res_folder / + f"{clip_table_file.name.split('.')[0]}_{cpnt_type}_{cpnt}_thd" + f"{threshold_feature}_dft{default_groups}.{ext}" + for ext in ['txt', 'pdf', 'bar.txt']] + outfiles[0].parent.mkdir(exist_ok=True) + df_stat = create_stat_df(df_comp, cpnt) + df_stat.to_csv(outfiles[0], sep="\t", index=False) + df_comp = update_composition_group(df_comp, display_size) + make_barplot(df_comp, outfiles[1], cpnt, cpnt_type, sf_name) + if write_comp_table: + df_comp.to_csv(outfiles[2], sep="\t", index=False) + return res_folder + + +def multi_tads_compo_figures(clip_result_folder: Path, feature: str, + threshold_feature: int, default_groups: int, + cpnt_type: str, region: str = "gene", + ps: int = 1, display_size: bool = False, + logging_level: str = "DEBUG") -> None: + """ + Create multiple composition figures from a result clip folder (produced \ + by clip_launcher_4_many_communities). + + :param clip_result_folder: Folder containing tables used to \ + create clip community figures + :param feature: The feature of interest (exon, or gene) + :param threshold_feature: The minimum number of feature needed \ + inside communities enriched or impoverished in peak_density to consider \ + those community (otherwise `default_groups` default number of community \ + are taken + :param default_groups: The number of communities to take as default. + :param cpnt_type: The kind of component analysed + :param region: Only used when feature is gene. Corresponds to \ + the region studied within genes. + :param ps: The number of processes to create + :param display_size: True to display the size of the community, + False to display nothing. (default False) + :param logging_level: The level of data to display (default 'DISABLE') + """ + logging_def(ConfigGraph.community_folder, __file__, logging_level) + list_folder = find_clip_directories(clip_result_folder) + list_tables = [find_final_clip_tables(folder) for folder in list_folder] + dic_df_ctrl = get_ctrl_dic(list_tables[0], feature) + list_tables = np.array(list_tables).flatten() + list_ft = get_features(cpnt_type) + if len(list_ft) == 0: + logging.warning(f"No clip table found in {clip_result_folder} !\n" + f"Exiting...") + return None + cnx = sqlite3.connect(Config.db_file) + df_comp = get_cpnt_frequency(cnx, get_ft_id(cnx, feature), feature, region, + cpnt_type) + conditions = list(product(list_tables, list_ft)) + ps = min(len(conditions), ps) + pool = mp.Pool(processes=ps if ps > 0 else 1) + processes = [] + write_bar_table = True + for clip_file, cpnt in conditions: + df_ctrl = dic_df_ctrl[clip_file.name.replace(".tmp_bar.txt", "")] + args = [clip_file, feature, threshold_feature, default_groups, + df_comp, cpnt_type, cpnt, df_ctrl, write_bar_table, + display_size] + write_bar_table = False + processes.append(pool.apply_async(create_figure_multi_clip_table, + args)) + results = [p.get(timeout=None) for p in processes] + pool.close() + pool.join() + list_folder = np.unique([r for r in results if r is not None]) + for folder in list_folder: + merge_figures(folder) + + +@lp.parse(feature=["gene", "exon"], threshold_feature=range(1, 1001), + default_groups=range(1, 100), cpnt_type=["nt", "dnt", "aa"]) +def multi_tads_launcher(clip_result_folder: str, feature: str, + threshold_feature: int = 100, default_groups: int = 20, + cpnt_type: str = "nt", region: str = "gene", + ps: int = 1, display_size: bool = False, + logging_level: str = "DEBUG") -> None: + """ + Launch multi_tads_compo_figures script + + :param clip_result_folder: Folder containing tables used to \ + create clip community figures + :param feature: The feature of interest (exon, or gene) + :param threshold_feature: The minimum number of feature needed \ + inside communities enriched or impoverished in peak_density to consider \ + those community (otherwise `default_groups` default number of community \ + are taken. Default 100) + :param default_groups: The number of communities to take as default. \ + (default 20) + :param cpnt_type: The kind of component analysed (default nt) + :param region: Only used when feature is gene. Corresponds to \ + the region studied within genes. (default gene) + :param ps: The number of processes to create (default 1) + :param display_size: True to display the size of the community, + False to display nothing. (default False) + :param logging_level: The level of data to display (default 'DISABLE') + """ + multi_tads_compo_figures(Path(clip_result_folder), feature, + threshold_feature, default_groups, + cpnt_type, region, ps, display_size, + logging_level) + + if __name__ == "__main__": - doctest.testmod() + multi_tads_launcher()