diff --git a/src/find_interaction_cluster/clip_figures/clip_analyser.py b/src/find_interaction_cluster/clip_figures/clip_analyser.py index 4346a2f2de4ce5267c92f51329db7a729bcf2e4f..43aa0a00401d927a6d2350cc303c703d150fe5e9 100644 --- a/src/find_interaction_cluster/clip_figures/clip_analyser.py +++ b/src/find_interaction_cluster/clip_figures/clip_analyser.py @@ -19,6 +19,9 @@ from ...logging_conf import logging_def import logging from typing import Tuple import multiprocessing as mp +from ...figures_utils.exons_interactions import get_every_events_4_a_sl +import sqlite3 +from ..sf_and_communities import get_sfname def bedtools_intersect(gene_bed: Path, clip_bed: Path, @@ -230,15 +233,74 @@ def select_community_file(project: str, weight: int, global_weight: int, raise FileNotFoundError(f"File {com_file} was not found !") tmp_name = com_file.name.replace(".txt", "") output = ConfigClip.output_folder / \ - f"CLIP_community_figures-{feature}-{tmp_name}" + f"CLIP_community_figures-{feature}-{tmp_name}" return com_file, output +def add_regulation_column(df_table: pd.DataFrame, sf_name: str, feature: str, + ) -> pd.DataFrame: + """ + Add a column community_data on df_table corresponding to the mean \ + number of exons regulated in each gene if feature is gene or 1 if \ + the exon is regulated. + + :param df_table: A dataframe containing the peak density for each gene \ + or exons. + :param sf_name: The splicing factor of interest + :param feature: The kind of feature analysed + :return: The dataframe updated + + >>> dexon = {"id_exon": ['11553_16', '3222_30', '1001_3'], + ... 'clip_peak': [0, 1, 0], "peak_density": [0, 0.1, 0], + ... 'community': ['C1', 'C1', 'C2']} + >>> add_regulation_column(pd.DataFrame(dexon), 'TRA2A_B', 'exon') + id_exon clip_peak peak_density community community_data + 0 11553_16 0 0.0 C1 1 + 1 3222_30 1 0.1 C1 1 + 2 1001_3 0 0.0 C2 0 + >>> dgene = {"id_gene": [11553, 3222, 1001], + ... 'clip_peak': [0, 1, 0], "peak_density": [0, 0.1, 0], + ... 'community': ['C1', 'C1', 'C2']} + >>> add_regulation_column(pd.DataFrame(dgene), 'TRA2A_B', 'gene') + id_gene clip_peak peak_density community community_data + 0 11553 0 0.0 C1 0.117647 + 1 3222 1 0.1 C1 0.020408 + 2 1001 0 0.0 C2 0.000000 + >>> add_regulation_column(pd.DataFrame(dgene), 'TRAgdghfh', 'gene') + id_gene clip_peak peak_density community + 0 11553 0 0.0 C1 + 1 3222 1 0.1 C1 + 2 1001 0 0.0 C2 + """ + if sf_name not in get_sfname(): + return df_table + up_exon, x = get_every_events_4_a_sl(sqlite3.connect(ConfigGraph.db_file), + sf_name, "up") + down_exon, x = get_every_events_4_a_sl( + sqlite3.connect(ConfigGraph.db_file), sf_name, "down") + exons = up_exon[f"{sf_name}_up"] + down_exon[f"{sf_name}_down"] + if feature == "exon": + df_table["community_data"] = [0] * len(df_table) + df_table.loc[df_table[f"id_{feature}"].isin(exons), + "community_data"] = 1 + else: + df = pd.read_csv(ConfigClip.bed_exon, sep="\t", + names=["chr", "start", "end", "id_exon", "sc", "s"]) + df = df[["id_exon"]] + df["id_gene"] = df["id_exon"].str.replace(r"_\d+", "").astype(int) + df["community_data"] = [0] * df.shape[0] + df.loc[df["id_exon"].isin(exons), "community_data"] = 1 + df.drop("id_exon", axis=1, inplace=True) + df = df.groupby("id_gene").mean().reset_index() + df_table = df_table.merge(df, how="left", on="id_gene") + return df_table + + def create_figure(project: str, weight: int, global_weight: int, same_gene: bool, feature: str, clip_file: Path, feature_bed: Path, test_type: str = "permutation", - iteration: int = 10000, display_size: bool=False, - community_file: str = "") -> None: + iteration: int = 10000, display_size: bool = False, + community_file: str = "", sl_reg: bool = False) -> None: """ Create the final figure :param project: The name of the project of interest @@ -261,7 +323,8 @@ def create_figure(project: str, weight: int, global_weight: int, used to find the community files computed with ChIA-PET data. :param display_size: True to display the size of the community. \ False to display nothing. (default False) - :param ps: The number of processes to create (default 1) + :param sl_reg: True to display the FaRLine regulation of the \ + same factor, False to not display it. """ logging.info(f"Working on {clip_file}") com_file, output = select_community_file(project, weight, global_weight, @@ -270,6 +333,10 @@ def create_figure(project: str, weight: int, global_weight: int, output.mkdir(exist_ok=True, parents=True) outfile = output / f"{clip_file.name.split('.')[0]}.pdf" final_table = create_table(feature, clip_file, feature_bed, com_file) + if sl_reg: + final_table = add_regulation_column(final_table, + clip_file.name.split("_")[0], + feature) create_community_fig(final_table, feature, "peak_density", outfile, test_type, iteration=iteration, display_size=display_size) @@ -279,8 +346,8 @@ def clip_folder_analysis(clip_folder: Path, project: str, weight: int, global_weight: int, same_gene: bool, feature: str, test_type: str = "permutation", iteration: int = 10000, display_size: bool=False, - community_file: str = "", ps: int = 1, - logging_level: str = "DEBUG") -> None: + community_file: str = "", sl_reg: bool = False, + ps: int = 1, logging_level: str = "DEBUG") -> None: """ Create the final figure :param project: The name of the project of interest @@ -301,6 +368,8 @@ def clip_folder_analysis(clip_folder: Path, project: str, weight: int, :param community_file: A file containing custom communities. If \ it equals to '' then weight, global weight and same genes parameter are \ used to find the community files computed with ChIA-PET data. + :param sl_reg: True to display the FaRLine regulation of the \ + same factor, False to not display it. :param ps: The number of processes to create (default 1) :param logging_level: The level of data to display (default 'DISABLE') """ @@ -309,11 +378,12 @@ def clip_folder_analysis(clip_folder: Path, project: str, weight: int, else ConfigClip.bed_exon files = list(clip_folder.glob("*.bed")) + \ list(clip_folder.glob("*.bed.gz")) + files = [files[0]] pool = mp.Pool(processes=min(len(files), ps)) processes = [] for mfile in files: args = [project, weight, global_weight, same_gene, feature, mfile, feature_bed, test_type, iteration, display_size, - community_file] + community_file, sl_reg] processes.append(pool.apply_async(create_figure, args)) [p.get(timeout=None) for p in processes]