diff --git a/.gitignore b/.gitignore
index 07e67ccf8dbc0bb1cd23ebf8c233986479a3b908..f0c951ec266bd34508294c9baebe331bffb37ebf 100644
--- a/.gitignore
+++ b/.gitignore
@@ -6,3 +6,4 @@ src/*/*/__pycache__/*
 src/*.pyc
 src/*/*/*.pyc
 presentations/*
+tests/__pycache__/*.pyc
diff --git a/src/db_utils/config.py b/src/db_utils/config.py
index 4c173c31cb2be7ad3e8b2fdd432a26e65ca5e4ff..7284333388e41b5450cd516df29bb6430f190b83 100755
--- a/src/db_utils/config.py
+++ b/src/db_utils/config.py
@@ -24,3 +24,4 @@ class Config:
     splicing_projects = data / 'splicing_lore_data' / \
         'splicing_lore_projects.txt'
     chia_pet_files = data / 'interactions_files' / 'chia_pet'
+    tests_files = Path(__file__).parents[2] / 'tests' / "files"
\ No newline at end of file
diff --git a/src/find_interaction_cluster/__main__.py b/src/find_interaction_cluster/__main__.py
index 37102fa6918cacba2c6e068d85b068acc3e4c4c9..19361fad8859cd421a81b1ed8d46562dabaed614 100644
--- a/src/find_interaction_cluster/__main__.py
+++ b/src/find_interaction_cluster/__main__.py
@@ -22,7 +22,7 @@ from .colocalisation_n_ppi_analysis import coloc_ppi_stat_main
 
 @lp.parse(weight=range(1, 11), global_weight=range(11),
           feature=('gene', 'exon'), region=('', 'exon', 'intron', 'gene'),
-          iteration="iteration > 20", test_type=('lm', 'perm'),
+          iteration="iteration > 20", test_type=('lm', 'permutation'),
           component_type=("nt", "aa", "dnt"))
 def launcher(weight: int = 1,
              global_weight: int = 0,
@@ -31,6 +31,7 @@ def launcher(weight: int = 1,
              html_fig: bool = False, feature: str = 'exon', region: str = '',
              component_type: str = 'nt',
              iteration: int = 1000, test_type: str = "lm",
+             display_size: bool = False,
              project: str = "GSM1018963_GSM1018964",
              logging_level: str = "DISABLE"):
     """
@@ -55,7 +56,9 @@ def launcher(weight: int = 1,
     :param project: The project name of interest \
     (only used is global_weight = 0).
     :param test_type: The king of test to perform for frequency analysis. \
-    (default 'lmm') (choose from 'lmm', 'perm')
+    (default 'lm') (choose from 'lm', 'permutation')
+    :param display_size: True to display the size of the community. \
+    False to display nothing. (default False)
     :param ps: The number of processes to use
     """
     logging_def(ConfigGraph.community_folder, __file__, logging_level)
@@ -67,11 +70,12 @@ def launcher(weight: int = 1,
         install_hipmcl("INFO")
     multiple_community_launcher(weight, global_weight, project, same_gene,
                                 html_fig, feature, logging_level)
-    multiple_stat_launcher(ps, weight, global_weight, project, same_gene,
-                           feature, logging_level)
+    # multiple_stat_launcher(ps, weight, global_weight, project, same_gene,
+    #                        feature, logging_level)
     multiple_nt_lm_launcher(ps, weight, global_weight, project, same_gene,
                             feature, region, component_type,
-                            test_type, iteration, logging_level)
+                            test_type, iteration, display_size,
+                            logging_level=logging_level)
     if feature == "gene":
         # ppi_stat_launcher(weight, global_weight, project, same_gene,
         #                   ConfigGraph.ppi_threshold, iteration,
diff --git a/src/find_interaction_cluster/clip_figures/__init__.py b/src/find_interaction_cluster/clip_figures/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..60f3a118d41cf7494b1cd067bf5d649b42ba1e05
--- /dev/null
+++ b/src/find_interaction_cluster/clip_figures/__init__.py
@@ -0,0 +1,7 @@
+#!/usr/bin/env python3
+
+# -*- coding: UTF-8 -*-
+
+"""
+Description:
+"""
diff --git a/src/find_interaction_cluster/clip_figures/__main__.py b/src/find_interaction_cluster/clip_figures/__main__.py
new file mode 100644
index 0000000000000000000000000000000000000000..122674490a4d8228fb6ebb7341088b946c6710f7
--- /dev/null
+++ b/src/find_interaction_cluster/clip_figures/__main__.py
@@ -0,0 +1,63 @@
+#!/usr/bin/env python3
+
+# -*- coding: UTF-8 -*-
+
+"""
+Description: Launcher used to create the community figures from a \
+community folder
+"""
+
+import lazyparser as lp
+from pathlib import Path
+from .clip_analyser import clip_folder_analysis
+
+
+@lp.parse(test_type=["permutation", "lm"], feature=["gene", "exon"])
+def clip_analysis(clip_folder: str, weight: int = -1,
+                  global_weight: int = -1, same_gene: bool = True,
+                  feature: str = "exon",
+                  project: str = "GSM1018963_GSM1018964",
+                  test_type: str = "permutation", iteration: int = 10000,
+                  display_size: bool = False,
+                  community_file: str = "",
+                  ps: int = 1,
+                  logging_level: str = "DEBUG") -> None:
+    """
+    Create the final figure.
+
+    :param project: The name of the project of interest
+    :param weight: The minimum weight of interaction to consider
+    :param global_weight: The global weight to consider. If \
+    the global weight is equal to 0, then density figures are computed \
+    by project; otherwise all projects are merged together and only the \
+    interactions seen in `global_weight` projects are taken into account
+    :param same_gene: Whether exons within the same gene are considered \
+    co-localised (True) or not (False) (default True)
+    :param feature: The feature we want to analyse (default 'exon')
+    :param clip_folder: A folder containing CLIP bed files
+    :param test_type: The kind of test to perform for frequency analysis. \
+    (default 'lm') (choose from 'lm', 'permutation')
+    :param iteration: The number of iterations to perform
+    :param display_size: True to display the size of the community. \
+    False to display nothing. (default False)
+    :param community_file: A file containing custom communities. If \
+    it is empty, then the weight, global_weight and same_gene parameters \
+    are used to find the community file computed from ChIA-PET data.
+ :param ps: The number of processes to create (default 1) + :param logging_level: The level of data to display (default 'DISABLE') + :return: The final dataframe that can be used to create clip figures + """ + if (weight == -1 or global_weight == -1) and community_file == "": + raise ValueError(f"Global weight or weight can't be equal to one if " + f"community_file is not defined !") + clip_folder = Path(clip_folder) + if test_type == "lm": + raise NotImplementedError("test_type for lm is not implemented !") + if not clip_folder.is_dir(): + raise NotADirectoryError(f"{clip_folder} is not an existing directory") + clip_folder_analysis(clip_folder, project, weight, global_weight, + same_gene, feature, test_type, iteration, + display_size, community_file, ps, logging_level) + + +clip_analysis() \ No newline at end of file diff --git a/src/find_interaction_cluster/clip_figures/clip_analyser.py b/src/find_interaction_cluster/clip_figures/clip_analyser.py new file mode 100644 index 0000000000000000000000000000000000000000..4346a2f2de4ce5267c92f51329db7a729bcf2e4f --- /dev/null +++ b/src/find_interaction_cluster/clip_figures/clip_analyser.py @@ -0,0 +1,319 @@ +#!/usr/bin/env python3 + +# -*- coding: UTF-8 -*- + +""" +Description: The goal of this script is to create a community figure \ +from a bed file +""" + + +from .config import ConfigClip, ConfigGraph +import subprocess as sp +from pathlib import Path +from io import StringIO +import pandas as pd +from ..community_finder import multiple_community_launcher, get_projects +from ..community_figures.fig_functions import create_community_fig +from ...logging_conf import logging_def +import logging +from typing import Tuple +import multiprocessing as mp + + +def bedtools_intersect(gene_bed: Path, clip_bed: Path, + feature: str = "exon") -> pd.DataFrame: + """ + + :param gene_bed: A bed file containing FasterDB genes + :param clip_bed: A bed file containing CLIP peaks + :param feature: The feature we want to analyse (default 'exon') + :return: The number of peaks inside each genes + + >>> bedtools_intersect(ConfigClip.test_gene_bed, ConfigClip.test_clip_bed, + ... "gene") + id_gene clip_peak peak_density + 0 1 2 0.054877 + 1 2 1 0.029736 + 2 3 3 0.076251 + 3 4 0 0.000000 + 4 5 0 0.000000 + 5 6 0 0.000000 + 6 7 0 0.000000 + 7 8 0 0.000000 + 8 9 0 0.000000 + 9 415 0 0.000000 + 10 10123 0 0.000000 + + """ + res = sp.check_output(f"bedtools intersect -a {gene_bed} " + f"-b {clip_bed} -c", shell=True) + res = StringIO(str(res, "utf-8")) + id_col = f"id_{feature}" + columns = ["chr", "start", "end", id_col, "name", "strand", "clip_peak"] + df = pd.read_csv(res, sep="\t", names=columns) + df['peak_density'] = df['clip_peak'] / ((df['end'] - df['start']) / 1000) + return df[[f'id_{feature}', 'clip_peak', 'peak_density']] + + +def find_or_create_community(project: str, weight: int, global_weight: int, + same_gene: bool, feature: str) -> Path: + """ + Find the community file of interest, or if it doesn't exist, create it. + + :param project: The name of the project of interest + :param weight: The minimum weight of interaction to consider + :param global_weight: The global weight to consider. 
if \ + the global weight is equal to 0 then then density figure are calculated \ + by project, else all projet are merge together and the interaction \ + seen in `global_weight` project are taken into account + :param same_gene: Say if we consider as co-localised, exons within the \ + same gene (True) or not (False) (default False) + :param feature: The feature we want to analyse (default 'exon') + :return: The file containing community + + >>> res = find_or_create_community("", 3, 3, True, "gene") + >>> res_path = str(res.relative_to(Path(__file__).parents[3])) + >>> res_path.endswith("global-weight-3_weight-3_same_gene-True_gene.txt") + True + >>> f = "results/community_of_co-localized-exons/" + \ + "communities/weight-3_global_weight-3" + >>> res_path.startswith(f) + True + """ + cfile = ConfigGraph.get_community_file(project, weight, global_weight, + same_gene, feature, ".txt") + if not cfile.is_file(): + project = get_projects(global_weight, project) + multiple_community_launcher(weight, global_weight, project, same_gene, + False, feature) + if not cfile.is_file(): + raise FileNotFoundError(f"The file {cfile} could not be created !") + return cfile + + +def get_community_tables(df: pd.DataFrame, feature: str) -> pd.DataFrame: + """ + Create a dataframe indicating for each feature, to which community it \ + belong. + + :param df: A dataframe of community + :param feature: The feature we want to analyse (default 'exon') + :return: dataframe indicating for each feature, to which community it \ + belong. + + >>> d = pd.DataFrame({"community": ["C1", "C2"], "nodes": [2, 3], + ... "edges": [1, 3], "EC": [1, 1], "HCS": ["no", "no"], + ... "%E vs E in complete G": [36, 40], "cov": [0.96, 0.96], + ... "modularity": [0.97, 0.97], "genes": ["1, 2", "3, 4, 5"], + ... "project": ["Global-weight-3", "Global-weight-3"]}) + >>> get_community_tables(d, "gene") + id_gene community community_size + 0 1 C1 2 + 1 2 C1 2 + 2 3 C2 3 + 3 4 C2 3 + 4 5 C2 3 + """ + dic = {f"id_{feature}": [], "community": [], "community_size": []} + for i in range(df.shape[0]): + row = df.iloc[i, :] + ft = row[f"{feature}s"].split(", ") + dic[f"id_{feature}"] += row[f"{feature}s"].split(", ") + dic["community"] += [row["community"]] * len(ft) + dic["community_size"] += [row["nodes"]] * len(ft) + df = pd.DataFrame(dic) + if feature == "gene": + df["id_gene"] = df["id_gene"].astype(int) + return df + + +def merge_dataframes(peak_df: pd.DataFrame, com_df: pd.DataFrame, + feature: str) -> pd.DataFrame: + """ + Merge a dataframe containing the numbers of clip peak per feature \ + with the dataframe containing the community of each feature. + + :param peak_df: Dataframe containing the number of clip peak per \ + feature + :param com_df: Dataframe containing the community of each figure + :param feature: The feature we want to analyse (default 'exon') + :return: The dataframe merged + + >>> d1 = pd.DataFrame({"id_gene": [0, 1, 2, 3, 4, 5, 6, 7], + ... "peak_density": [0.054877, 0.029736, 0.076251, 0, 0, 0, 0, 0]}) + >>> d2 = pd.DataFrame({"id_gene": [0, 1, 2, 3, 4], + ... "community": ["C1", "C1", "C2", "C2", "C2"], + ... 
"community_size": [2, 2, 3, 3, 3]}) + >>> merge_dataframes(d1, d2, "gene") + id_gene peak_density community community_size + 0 0 0.054877 C1 2.0 + 1 1 0.029736 C1 2.0 + 2 2 0.076251 C2 3.0 + 3 3 0.000000 C2 3.0 + 4 4 0.000000 C2 3.0 + 5 5 0.000000 NaN NaN + 6 6 0.000000 NaN NaN + 7 7 0.000000 NaN NaN + + + """ + return peak_df.merge(com_df, how="left", on=f"id_{feature}") + + +def create_table(feature: str, clip_file: Path, + feature_bed: Path, com_file: Path) -> pd.DataFrame: + """ + Create the final table used to create community figure. + + :param feature: The feature we want to analyse (default 'exon') + :param clip_file: A bed file containing clip + :param feature_bed: A bed files containing exons or genes depending on \ + feature parameter. + :param com_file: A file containing communities + :return: The final dataframe that can be used to create clip figures + + >>> cf = find_or_create_community("", 3, 3, True, "gene") + >>> create_table("gene", ConfigClip.test_clip_bed, + ... ConfigClip.test_gene_bed, cf) + id_gene clip_peak peak_density community community_size + 0 1 2 0.054877 NaN NaN + 1 2 1 0.029736 NaN NaN + 2 3 3 0.076251 NaN NaN + 3 4 0 0.000000 NaN NaN + 4 5 0 0.000000 NaN NaN + 5 6 0 0.000000 NaN NaN + 6 7 0 0.000000 NaN NaN + 7 8 0 0.000000 NaN NaN + 8 9 0 0.000000 NaN NaN + 9 415 0 0.000000 C1 11.0 + 10 10123 0 0.000000 NaN NaN + + + """ + threshold = 10 if feature == "gene" else 50 + df_clip = bedtools_intersect(feature_bed, clip_file, feature) + df_tmp = pd.read_csv(com_file, sep="\t") + df_com = get_community_tables(df_tmp, feature) + df_com = df_com.loc[df_com["community_size"] >= threshold, :] + return merge_dataframes(df_clip, df_com, feature) + + +def select_community_file(project: str, weight: int, global_weight: int, + same_gene: bool, feature: str, + community_file: str = "") -> Tuple[Path, Path]: + """ + Return the community file and output folder that will be used. + + :param project: The name of the project of interest + :param weight: The minimum weight of interaction to consider + :param global_weight: The global weight to consider. if \ + the global weight is equal to 0 then then density figure are calculated \ + by project, else all projet are merge together and the interaction \ + seen in `global_weight` project are taken into account + :param same_gene: Say if we consider as co-localised, exons within the \ + same gene (True) or not (False) (default False) + :param feature: The feature we want to analyse (default 'exon') + :param community_file: A file containing custom communities. If \ + it equals to '' then weight, global weight and same genes parameter are \ + used to find the community files computed with ChIA-PET data. + :return: The community file used and the output folder used. 
+ + """ + if community_file == "": + com_file = find_or_create_community(project, weight, global_weight, + same_gene, feature) + output = com_file.parent / f"CLIP_community_figures_{feature}" + else: + com_file = Path(community_file) + if not com_file.is_file(): + raise FileNotFoundError(f"File {com_file} was not found !") + tmp_name = com_file.name.replace(".txt", "") + output = ConfigClip.output_folder / \ + f"CLIP_community_figures-{feature}-{tmp_name}" + return com_file, output + + +def create_figure(project: str, weight: int, global_weight: int, + same_gene: bool, feature: str, clip_file: Path, + feature_bed: Path, test_type: str = "permutation", + iteration: int = 10000, display_size: bool=False, + community_file: str = "") -> None: + """ + Create the final figure + :param project: The name of the project of interest + :param weight: The minimum weight of interaction to consider + :param global_weight: The global weight to consider. if \ + the global weight is equal to 0 then then density figure are calculated \ + by project, else all projet are merge together and the interaction \ + seen in `global_weight` project are taken into account + :param same_gene: Say if we consider as co-localised, exons within the \ + same gene (True) or not (False) (default False) + :param feature: The feature we want to analyse (default 'exon') + :param clip_file: A bed file containing clip + :param feature_bed: A bed files containing exons or genes depending on \ + feature parameter. + :param test_type: The king of test to perform for frequency analysis. \ + (default 'lm') (choose from 'lm', 'permutation') + :param iteration: The number of iteration to make + :param community_file: A file containing custom communities. If \ + it equals to '' then weight, global weight and same genes parameter are \ + used to find the community files computed with ChIA-PET data. + :param display_size: True to display the size of the community. \ + False to display nothing. (default False) + :param ps: The number of processes to create (default 1) + """ + logging.info(f"Working on {clip_file}") + com_file, output = select_community_file(project, weight, global_weight, + same_gene, feature, + community_file) + output.mkdir(exist_ok=True, parents=True) + outfile = output / f"{clip_file.name.split('.')[0]}.pdf" + final_table = create_table(feature, clip_file, feature_bed, com_file) + create_community_fig(final_table, feature, "peak_density", outfile, + test_type, iteration=iteration, + display_size=display_size) + + +def clip_folder_analysis(clip_folder: Path, project: str, weight: int, + global_weight: int, same_gene: bool, feature: str, + test_type: str = "permutation", + iteration: int = 10000, display_size: bool=False, + community_file: str = "", ps: int = 1, + logging_level: str = "DEBUG") -> None: + """ + Create the final figure + :param project: The name of the project of interest + :param weight: The minimum weight of interaction to consider + :param global_weight: The global weight to consider. if \ + the global weight is equal to 0 then then density figure are calculated \ + by project, else all projet are merge together and the interaction \ + seen in `global_weight` project are taken into account + :param same_gene: Say if we consider as co-localised, exons within the \ + same gene (True) or not (False) (default False) + :param feature: The feature we want to analyse (default 'exon') + :param clip_folder: A folder containing clip file + :param test_type: The king of test to perform for frequency analysis. 
\ + (default 'lm') (choose from 'lm', 'permutation') + :param iteration: The number of iteration to make + :param display_size: True to display the size of the community. \ + False to display nothing. (default False) + :param community_file: A file containing custom communities. If \ + it equals to '' then weight, global weight and same genes parameter are \ + used to find the community files computed with ChIA-PET data. + :param ps: The number of processes to create (default 1) + :param logging_level: The level of data to display (default 'DISABLE') + """ + logging_def(ConfigGraph.community_folder, __file__, logging_level) + feature_bed = ConfigClip.bed_gene if feature == "gene" \ + else ConfigClip.bed_exon + files = list(clip_folder.glob("*.bed")) + \ + list(clip_folder.glob("*.bed.gz")) + pool = mp.Pool(processes=min(len(files), ps)) + processes = [] + for mfile in files: + args = [project, weight, global_weight, same_gene, feature, + mfile, feature_bed, test_type, iteration, display_size, + community_file] + processes.append(pool.apply_async(create_figure, args)) + [p.get(timeout=None) for p in processes] diff --git a/src/find_interaction_cluster/clip_figures/clip_compo_analyser.py b/src/find_interaction_cluster/clip_figures/clip_compo_analyser.py new file mode 100644 index 0000000000000000000000000000000000000000..827fdbbc06efe63815a65c7649a61079712a6608 --- /dev/null +++ b/src/find_interaction_cluster/clip_figures/clip_compo_analyser.py @@ -0,0 +1,493 @@ +#!/usr/bin/env python3 + +# -*- coding: UTF-8 -*- + +""" +Description: The goal of this script is to check whether a communities \ +of genes that are more frequently bind by a splicing factor \ +has a composition biais. +""" + +from pathlib import Path +from typing import List, Tuple, Optional +import pandas as pd +from .config import ConfigClip, Config, ConfigGraph +import doctest +from math import floor, ceil +from ..nt_and_community import get_ft_id, get_cpnt_frequency, get_features +import sqlite3 +import numpy as np +from scipy.stats import mannwhitneyu +from itertools import combinations, product +from statsmodels.stats.multitest import multipletests +import seaborn as sns +import matplotlib.pyplot as plt +from ...logging_conf import logging_def +import logging +import multiprocessing as mp + + +def find_final_clip_tables(folder: Path) -> List[Path]: + """ + From a folder return every files ending with *_bar.txt. + + :param folder: A folder containing final tables used to produce \ + community figures for clip data (figure produced by the function \ + clip_folder_analysis of the script clip_analyse.py). + :return: The list of files ending with *_bar.txt + + >>> find_final_clip_tables(Path("fziufriubdsibf")) + Traceback (most recent call last): + ... + NotADirectoryError: Folder fziufriubdsibf doesn't exists ! + >>> find_final_clip_tables(Path("src/")) + [] + """ + if not folder.is_dir(): + raise NotADirectoryError(f"Folder {folder} doesn't exists !") + return list(folder.glob("*_bar.txt")) + + +def prepare_df(clip_df: pd.DataFrame, feature: str) -> pd.DataFrame: + """ + Prepare the table containing data to produce the clip community figure. + + :param clip_df: table containing data to produce the clip \ + community figure. + :param feature: The feature of interest (gene or exon) + :return: The filtered table + + >>> d = prepare_df(pd.read_table(ConfigClip.test_clip_file, sep="\\t"), + ... 
"gene") + >>> d.head() + community community_size p-adj reg-adj peak_density + 8 C852 1.0 0.015356 - 0.000000 + 9 C91 2.0 0.027338 - 0.000000 + 0 C10 3.0 0.228296 . 0.046749 + 1 C165 12.0 0.252854 . 0.271487 + 3 C325 18.0 0.228296 . 0.272363 + >>> d["community"].to_list() == ['C852', 'C91', 'C10', 'C165', 'C325', + ... 'C232', 'C397', 'C952', 'C365', 'C680', 'C916', 'C690'] + True + """ + df = clip_df.copy() + df = df.loc[df[f"id_{feature}"] != "ctrl", + ["peak_density", "community", "community_size", "p-adj", + "reg-adj", f"id_{feature}"]] + df = df.groupby(["community", "community_size", "p-adj", "reg-adj"] + ).mean().reset_index() + df = df.sort_values("peak_density", ascending=True) + return df + + +def get_feature_id(clip_df: pd.DataFrame, feature: str, + list_community: List[str]) -> pd.DataFrame: + """ + Get the list of gene contained in the list of communities given. + + :param clip_df: table containing data to produce the clip \ + community figure. + :param feature: The feature of interest (gene or exon) + :param list_community: A list of community + :return: The filtered table) + + >>> d = pd.read_table(ConfigClip.test_clip_file, sep="\\t") + >>> get_feature_id(d, "gene", ['C916', 'C690']) + ['11489', '11723', '9289', '9734', '9708'] + >>> get_feature_id(d, "gene", ['C680', 'C916', + ... 'C690']) + ['16283', '16852', '11489', '11723', '9289', '9734', '9708'] + >>> get_feature_id(d, "gene", ['C91', 'C852']) + ['13856', '13360', '7831'] + >>> get_feature_id(d, "gene", ['C91', 'C852', + ... 'C10']) + ['13856', '13360', '7831', '16763', '16676', '16817'] + """ + df = clip_df.copy() + df = df[df[f"id_{feature}"] != "ctrl"] + return df[df["community"].isin(list_community)][f"id_{feature}"].to_list() + + +def get_extreme_groups(df: pd.DataFrame, table_file: Path, group: str, + threshold_feature: int, + default_groups: int) -> Tuple[List, str]: + """ + Recover the impoverished or enriched communities for peaks \ + in a given splicing factor if they exists. + + :param df: A dataframe used to build the community figures \ + of clip density. + :param table_file: The file from which the dataframe df was created + :param group: The kind of community to recover (enriched or impoverished) + :param threshold_feature: The minimum number of feature to \ + recover from enriched/impoverished communities + :param default_groups: The number of community to get if none \ + (or not enough) community are enriched or impoverished \ + (depending on group parameter) + :return: The list of enriched or impoverished community of feature \ + and the name of the list + + >>> d = prepare_df(pd.read_table(ConfigClip.test_clip_file, sep="\\t"), + ... 
"gene") + >>> get_extreme_groups(d, ConfigClip.test_clip_file, "enriched", 5, 3) + (['C916', 'C690'], 'SF_test_enriched') + >>> get_extreme_groups(d, ConfigClip.test_clip_file, "enriched", 6, 3) + (['C680', 'C916', 'C690'], 'SF_test_enriched_nosig') + >>> get_extreme_groups(d, ConfigClip.test_clip_file, "impoverished", 3, 3) + (['C852', 'C91'], 'SF_test_impoverished') + >>> get_extreme_groups(d, ConfigClip.test_clip_file, "impoverished", 4, 3) + (['C852', 'C91', 'C10'], 'SF_test_impoverished_nosig') + """ + outname = table_file.name.rstrip("bar.txt") + group + my_reg = " + " if group == "enriched" else " - " + tmp = df[df["reg-adj"] == my_reg] + if not tmp.empty and sum(tmp["community_size"]) >= threshold_feature: + return tmp["community"].to_list(), outname + outname += "_nosig" + list_com = df["community"].to_list() + if group == "enriched": + return list_com[-default_groups:], outname + else: + return list_com[:default_groups], outname + + +def get_middle_group(df: pd.DataFrame, table_file: Path, + default_groups: int) -> Tuple[List, str]: + """ + Recover the impoverished or enriched communities for peaks \ + in a given splicing factor if they exists. + + :param df: A dataframe used to build the community figures \ + of clip density. + :param table_file: The file from which the dataframe df was created + :param default_groups: The number of community to get if none \ + (or not enough) community are enriched or impoverished \ + (depending on group parameter) + :return: The list of enriched or impoverished community of feature \ + and the name of the list + + >>> d = prepare_df(pd.read_table(ConfigClip.test_clip_file, sep="\\t"), + ... "gene") + >>> get_middle_group(d, ConfigClip.test_clip_file, 3) + (['C232', 'C397', 'C952'], 'SF_test_middle') + >>> get_middle_group(d, ConfigClip.test_clip_file, 4) + (['C325', 'C232', 'C397', 'C952'], 'SF_test_middle') + """ + outname = table_file.name.rstrip("bar.txt") + "middle" + list_com = df["community"].to_list() + index = floor(len(list_com) / 2) + return list_com[index - floor(default_groups / 2): + index + ceil(default_groups / 2)], outname + + +def add_ctrl_df(df: pd.DataFrame, feature: str) -> pd.DataFrame: + """ + Create a dataframe containing every genes/exons not in a community. + + :param df: The dataframe used to create a community figure + :param feature: The feature of interest (exon or gene) + :return: A table containing 2 columns: id_{feature} and group \ + (containing 'ctrl') + + >>> import time + >>> s = time.time() + >>> add_ctrl_df(pd.DataFrame({"id_gene": list(range(1, 19420)) + ["ctrl"] + ... }), "gene") + id_gene group + 0 19420 ctrl + 1 19421 ctrl + 2 19422 ctrl + 3 19423 ctrl + 4 19424 ctrl + 5 19425 ctrl + 6 19426 ctrl + """ + list_id = get_ft_id(sqlite3.connect(Config.db_file), feature) + com_id = df[df[f"id_{feature}"] != "ctrl"][f"id_{feature}"].to_list() + if feature == "gene": + list_id = list(map(int, list_id)) + com_id = list(map(int, com_id)) + ctrl_id = [mid for mid in list_id if mid not in com_id] + return pd.DataFrame({f"id_{feature}": ctrl_id, + "group": ["ctrl"] * len(ctrl_id)}) + + +def get_interest_groups(clip_table_file: Path, feature: str, + threshold_feature: int, default_groups: int + ) -> Optional[pd.DataFrame]: + """ + + :param clip_table_file: table containing data to produce the clip \ + community figure. 
+ :param feature: The feature of interest (gene or exon) + :param threshold_feature: The minimum number of feature to \ + recover from enriched/impoverished communities + :param default_groups: The number of community to get if none \ + (or not enough) community are enriched or impoverished \ + (depending on group parameter) + :return: The dataframe indicating for a id of a feature, the groups where \ + if belong. + + >>> get_interest_groups(ConfigClip.test_clip_file, "gene", 3, 3) + id_gene group + 0 13856 SF_test_impoverished + 1 13360 SF_test_impoverished + 2 7831 SF_test_impoverished + 3 17226 SF_test_middle + 4 17418 SF_test_middle + 5 16985 SF_test_middle + 6 16897 SF_test_middle + 7 16536 SF_test_middle + 8 17214 SF_test_middle + 9 17271 SF_test_middle + 10 17360 SF_test_middle + 11 16022 SF_test_middle + 12 16660 SF_test_middle + 13 17080 SF_test_middle + 14 17038 SF_test_middle + 15 10558 SF_test_middle + 16 11520 SF_test_middle + 17 10569 SF_test_middle + 18 10641 SF_test_middle + 19 10333 SF_test_middle + 20 12509 SF_test_middle + 21 11687 SF_test_middle + 22 10886 SF_test_middle + 23 10784 SF_test_middle + 24 10997 SF_test_middle + 25 10907 SF_test_middle + 26 12115 SF_test_middle + 27 10922 SF_test_middle + 28 10586 SF_test_middle + 29 12030 SF_test_middle + 30 10627 SF_test_middle + 31 12525 SF_test_middle + 32 12172 SF_test_middle + 33 11346 SF_test_middle + 34 11949 SF_test_middle + 35 15390 SF_test_middle + 36 15348 SF_test_middle + 37 14966 SF_test_middle + 38 15272 SF_test_middle + 39 15287 SF_test_middle + 40 15135 SF_test_middle + 41 14992 SF_test_middle + 42 15006 SF_test_middle + 43 15199 SF_test_middle + 44 15285 SF_test_middle + 45 11489 SF_test_enriched + 46 11723 SF_test_enriched + 47 9289 SF_test_enriched + 48 9734 SF_test_enriched + 49 9708 SF_test_enriched + >>> r = get_interest_groups(ConfigClip.test_clip_file, "gene", 6, 3) + >>> r is None + True + """ + ini_df = pd.read_csv(clip_table_file, sep="\t") + df = prepare_df(ini_df, feature) + enrich_com, name_enrich = get_extreme_groups( + df, clip_table_file, "enriched", threshold_feature, default_groups) + impov_com, name_impov = get_extreme_groups( + df, clip_table_file, "impoverished", threshold_feature, default_groups) + if "nosig" in name_enrich and "nosig" in name_impov: + return None + middle_com, name_middle = get_middle_group(df, clip_table_file, + default_groups) + enrich_grp = get_feature_id(ini_df, feature, enrich_com) + impov_grp = get_feature_id(ini_df, feature, impov_com) + middle_grp = get_feature_id(ini_df, feature, middle_com) + return pd.DataFrame({f"id_{feature}": impov_grp + middle_grp + enrich_grp, + "group": [name_impov] * len(impov_grp) + + [name_middle] * len(middle_grp) + + [name_enrich] * len(enrich_grp)}) + + +def create_stat_df(df_comp: pd.DataFrame, cpnt: str) -> pd.DataFrame: + """ + Compute statistical analysis for every group in df_comp. + + :param df_comp: The dataframe containing the frequency of every `cpnt_type` + :param cpnt: The component studied + :return: Dataframe of p-values + + >>> d = pd.DataFrame({"id_gene": [1, 2, 3, 4, 5, 6, 7, 8], + ... "A": [1, 2, 3, 4, 5, 6, 7, 8], "group": ["test"] * 4 + ["ctrl"] * 4}) + >>> create_stat_df(d, "A") + grp1 grp2 p-val + 0 ctrl test 0.030383 + >>> d = pd.DataFrame({"id_gene": list(range(1, 13)), + ... "A": list(range(1, 12)) + [5], "group": ["test"] * 4 + ["ctrl"] * 4 + + ... 
["test2"] * 4}) + >>> create_stat_df(d, "A") + grp1 grp2 p-val p-adj + 0 ctrl test 0.030383 0.045574 + 1 ctrl test2 0.245383 0.245383 + 2 test test2 0.030383 0.045574 + + """ + comparison = combinations(np.unique(df_comp["group"]), 2) + content = [] + for grp1, grp2 in comparison: + p_val = mannwhitneyu( + df_comp.loc[df_comp["group"] == grp1, cpnt].values, + df_comp.loc[df_comp["group"] == grp2, cpnt].values, + alternative="two-sided")[1] + content.append([grp1, grp2, p_val]) + df = pd.DataFrame(content, columns=["grp1", "grp2", "p-val"]) + if df.shape[0] > 1: + df["p-adj"] = multipletests(df['p-val'].values, method="fdr_bh")[1] + return df + + +def make_barplot(df_comp: pd.DataFrame, outfile: Path, cpnt: str, + cpnt_type: str, clip_name: str) -> None: + """ + Create a barplot showing the composition for different groups of feature. + + :param df_comp: The dataframe containing the frequency of every `cpnt_type` + :param outfile: File were the figure will be created + :param cpnt: The component of interest + :param cpnt_type: The type of component of interest + :param clip_name: The name of the clip studied + """ + sns.set(context="talk") + g = sns.catplot(x="group", y=cpnt, data=df_comp, height=12, aspect=1.7, + kind="violin") + g.fig.suptitle(f"Mean frequency of {cpnt}({cpnt_type})" + f"among different community groups\n" + f"created from the peak_density from clip {clip_name}") + g.ax.set_ylabel(f'Frequency of {cpnt} ({cpnt_type})') + g.savefig(outfile) + plt.clf() + plt.close() + + +def update_composition_group(df_comp: pd.DataFrame, display_size: bool + ) -> pd.DataFrame: + """ + Update the group name of the dataframe df_com. + + :param df_comp: The dataframe containing the frequency of every `cpnt_type` + :param display_size: True to display the size of \ + the community. False to display nothing. + :return: if display_size is False return df_comp else return df_comp \ + with the group column containing the size of the groups. + """ + if not display_size: + return df_comp + d = { + gr: f"{gr}({df_comp[df_comp['group'] == gr].shape[0]})" + for gr in df_comp["group"].unique() + } + df_comp["group"] = df_comp["group"].map(d) + return df_comp + + +def create_figure_4_clip_table(clip_table_file: Path, feature: str, + threshold_feature: int, default_groups: int, + df_comp: pd.DataFrame, cpnt_type: str, + cpnt: str, df_id_ctrl: pd.DataFrame, + write_comp_table: bool, + display_size: bool) -> None: + """ + Create a component figure for a clip file + + :param clip_table_file: table containing data to produce the clip \ + community figure. + :param feature: The feature of interest (gene or exon) + :param threshold_feature: The minimum number of feature to \ + recover from enriched/impoverished communities + :param default_groups: The number of community to get if none \ + (or not enough) community are enriched or impoverished \ + (depending on group parameter) + :param df_comp: The dataframe containing the frequency of every `cpnt_type` + :param cpnt_type: The kind of component of interest + :param cpnt: The kind of component of interest + :param df_id_ctrl: A dataframe indicating control exons + :param write_comp_table: True to save comp table, false else + :param display_size: True to display the size of the community . + False to display nothing. 
(default False) + """ + logging.info(f"Working on {clip_table_file.name} - {cpnt} ({cpnt_type})") + df_group = get_interest_groups(clip_table_file, feature, + threshold_feature, default_groups) + if df_group is None: + return None + df_group = df_group.drop_duplicates() + if feature == "gene": + df_group["id_gene"] = df_group["id_gene"].astype(int) + df_comp["id_gene"] = df_comp["id_gene"].astype(int) + df_group = pd.concat([df_group, df_id_ctrl], ignore_index=True) + if len(np.unique(df_group[f"id_{feature}"].values)) != df_group.shape[0]: + print(df_group[df_group.duplicated(subset=f"id_{feature}")]) + raise ValueError("Found duplicates value in df_group, exiting...") + df_comp = df_comp.merge(df_group, how="left", on=f"id_{feature}") + df_comp = df_comp[-df_comp["group"].isna()] + sf_name = clip_table_file.name.rstrip("_bar.txt") + outfiles = [clip_table_file.parent / sf_name / + f"{cpnt_type}_{cpnt}_{sf_name}_thd" + f"{threshold_feature}_dft{default_groups}.{ext}" + for ext in ['txt', 'pdf', 'bar.txt']] + outfiles[0].parent.mkdir(exist_ok=True) + df_stat = create_stat_df(df_comp, cpnt) + df_stat.to_csv(outfiles[0], sep="\t", index=False) + df_comp = update_composition_group(df_comp, display_size) + make_barplot(df_comp, outfiles[1], cpnt, cpnt_type, sf_name) + if write_comp_table: + df_comp.to_csv(outfiles[2], sep="\t", index=False) + + +def create_multiple_figures(clip_result_folder: Path, feature: str, + threshold_feature: int, default_groups: int, + cpnt_type: str, region: str = "gene", + ps: int = 1, display_size: bool = False, + logging_level: str = "DEBUG") -> None: + """ + Create multiple composition figures from a result clip folder. + + :param clip_result_folder: Folder containing tables used to \ + create clip community figures + :param feature: The feature of interest (exon, or gene) + :param threshold_feature: The minimum number of feature needed \ + inside communities enriched or impoverished in peak_density to consider \ + those community (otherwise `default_groups` default number of community \ + are taken + :param default_groups: The number of communities to take as default. + :param cpnt_type: The kind of component analysed + :param region: Only used when feature is gene. Corresponds to \ + the region studied within genes. + :param ps: The number of processes to create + :param display_size: True to display the size of the community, + False to display nothing. 
(default False) + :param logging_level: The level of data to display (default 'DISABLE') + """ + logging_def(ConfigGraph.community_folder, __file__, logging_level) + list_tables = find_final_clip_tables(clip_result_folder) + list_ft = get_features(cpnt_type) + if len(list_ft) == 0: + logging.warning(f"No clip table found in {clip_result_folder} !\n" + f"Exiting...") + return None + cnx = sqlite3.connect(Config.db_file) + df_comp = get_cpnt_frequency(cnx, get_ft_id(cnx, feature), feature, region, + cpnt_type) + df_ctrl = add_ctrl_df(pd.read_csv(list_tables[0], sep="\t"), feature) + conditions = list(product(list_tables, list_ft)) + ps = min(len(conditions), ps) + pool = mp.Pool(processes=ps if ps > 0 else 1) + processes = [] + write_bar_table = True + for clip_file, cpnt in conditions: + args = [clip_file, feature, threshold_feature, default_groups, + df_comp, cpnt_type, cpnt, df_ctrl, write_bar_table, + display_size] + write_bar_table = False + processes.append(pool.apply_async(create_figure_4_clip_table, args)) + [p.get(timeout=None) for p in processes] + pool.close() + pool.join() + + +if __name__ == "__main__": + doctest.testmod() diff --git a/src/find_interaction_cluster/clip_figures/clip_compo_launcher.py b/src/find_interaction_cluster/clip_figures/clip_compo_launcher.py new file mode 100644 index 0000000000000000000000000000000000000000..9edd36867c6e9cf1e17d8538a3df47bf6e829822 --- /dev/null +++ b/src/find_interaction_cluster/clip_figures/clip_compo_launcher.py @@ -0,0 +1,51 @@ +#!/usr/bin/env python3 + +# -*- coding: UTF-8 -*- + +""" +Description: Launch clip_compo_analyser +""" + +import lazyparser as lp +from .clip_compo_analyser import create_multiple_figures +from pathlib import Path +import multiprocessing as mp + + +@lp.parse(ps = range(1, mp.cpu_count() + 1), feature=["gene", "exon"], + threshold_feature="9 < threshold_feature < 10000", + default_groups=range(10, 101), cpnt_type=["nt", "dnt", "tnt", "aa"], + region=["gene", "exon", "intron"]) +def launcher(clip_result_folder: str, feature: str, + threshold_feature: int, default_groups: int, + cpnt_type: str, region: str = "gene", + ps: int = 1, display_size: bool=False, + logging_level: str = "DISABLE") -> None: + """ + Create multiple composition figures from a result clip folder. + + :param clip_result_folder: Folder containing tables used to \ + create clip community figures + :param feature: The feature of interest (exon, or gene) + :param threshold_feature: The minimum number of feature needed \ + inside communities enriched or impoverished in peak_density to consider \ + those community (otherwise `default_groups` default number of community \ + are taken + :param default_groups: The number of communities to take as default. + :param cpnt_type: The kind of component analysed + :param region: Only used when feature is gene. Corresponds to \ + the region studied within genes. (default: 'gene') + :param ps: The number of processes to create (default 1) + :param display_size: True to display the size of the community. \ + False to display nothing. 
(default False) + :param logging_level: The level of data to display (default 'DISABLE') + """ + clip_result_folder = Path(clip_result_folder) + if not clip_result_folder.is_dir(): + raise NotADirectoryError(f"Directory {clip_result_folder} don't exist") + create_multiple_figures(clip_result_folder, feature, threshold_feature, + default_groups, cpnt_type, region, ps, + display_size, logging_level) + +if __name__ == "__main__": + launcher() diff --git a/src/find_interaction_cluster/clip_figures/config.py b/src/find_interaction_cluster/clip_figures/config.py new file mode 100644 index 0000000000000000000000000000000000000000..3e221ee56d7ab0d99318dc068800503a4ae6802f --- /dev/null +++ b/src/find_interaction_cluster/clip_figures/config.py @@ -0,0 +1,25 @@ +#!/usr/bin/env python3 + +# -*- coding: UTF-8 -*- + +""" +Description: This file contains the variables needed to make \ +this submodule work +""" + + +from pathlib import Path +from ..config import ConfigGraph, Config + + +class ConfigClip: + """ + Contains required variables for this submodules + """ + output_folder = ConfigGraph.output_folder / "clip_community_figures" + clip_folder = Config.data / "CLIP_bed" + bed_gene = Config.bed_gene + bed_exon = Config.bed_exon + test_gene_bed = Config.tests_files / "genes.bed" + test_clip_bed = Config.tests_files / "clip.bed" + test_clip_file = Config.tests_files / "SF_test_bar.txt" \ No newline at end of file diff --git a/src/find_interaction_cluster/community_figures/__main__.py b/src/find_interaction_cluster/community_figures/__main__.py index e01081b74bdc737a8ddba3584afa4dc77cfbe615..884f616020ab508cd8d371331b4759b5773972c2 100644 --- a/src/find_interaction_cluster/community_figures/__main__.py +++ b/src/find_interaction_cluster/community_figures/__main__.py @@ -49,12 +49,13 @@ def load_and_check_table(table: str, feature: str, target_col: str): return df -@lp.parse(table="file", test_type=["lm", "permutation"], +@lp.parse(table="file", test_type=["lm", "permutation", "logit"], iteration="20 < iteration", sd_community = ["y", "n", "Y", "N"]) def create_community_figures(table: str, feature: str, target_col: str, output: str, outfile: str, test_type: str, target_kind: str = "", sd_community: str = "y", - iteration: int = 10000) -> None: + iteration: int = 10000, + display_size: bool = False) -> None: """ Create a dataframe with a control community, save it as a table and \ as a barplot figure. @@ -75,6 +76,8 @@ def create_community_figures(table: str, feature: str, target_col: str, is only used if test_type = 'permutation' (default 10000). :param sd_community: y to display the standard deviation for communities \ false else. + :param display_size: True to display the size of the community above \ + each one of them False to display nothing. 
(default False) """ df = load_and_check_table(table, feature, target_col) if not outfile.endswith(".pdf"): @@ -83,7 +86,7 @@ def create_community_figures(table: str, feature: str, target_col: str, sd_community = sd_community.lower() == 'y' create_community_fig(df, feature, target_col, moutfile, test_type, target_kind=target_kind, iteration=iteration, - sd_community=sd_community) + sd_community=sd_community, display_size=display_size) if __name__ == "__main__": diff --git a/src/find_interaction_cluster/community_figures/fig_functions.py b/src/find_interaction_cluster/community_figures/fig_functions.py index 20747cae31f8df8677c4f55bb701387f6bbc9dea..d7367e87eef4b4044a729f3b87a8656e8b8fd2ae 100644 --- a/src/find_interaction_cluster/community_figures/fig_functions.py +++ b/src/find_interaction_cluster/community_figures/fig_functions.py @@ -15,6 +15,7 @@ from statsmodels.stats.multitest import multipletests import numpy as np from ..radomization_test_ppi import get_pvalue import seaborn as sns +import matplotlib.pyplot as plt def get_community_table(communities: List[List[str]], @@ -53,8 +54,8 @@ def get_community_table(communities: List[List[str]], return pd.DataFrame(dic) -def lm_maker_summary(df: pd.DataFrame, outfile: Path, target_col: str - ) -> pd.DataFrame: +def lm_maker_summary(df: pd.DataFrame, outfile: Path, target_col: str, + test_type: str) -> Tuple[pd.DataFrame, pd.DataFrame]: """ Make the lm analysis to see if the exon regulated by a splicing factor \ are equally distributed among the communities. @@ -65,15 +66,29 @@ def lm_maker_summary(df: pd.DataFrame, outfile: Path, target_col: str and the size of the community of the feature if it has one (None, else). :param outfile: A name of a file :param target_col: The name of the column containing the data of interest - :return: the pvalue of lm + :param test_type: The type of test to make (permutation or lm) + :return: the entry dataframe and the result dataframe post analysis """ pandas2ri.activate() + if test_type == "lm": + mod = f"mod <- lm({target_col} ~ log(community_size) + " \ + f"community, data=data)" + else: + mod = f"mod <- glm({target_col} ~ log(community_size) + community," \ + f"data=data, family=binomial(link='logit'))" + df[target_col] = df[target_col].astype(int) + tmp = df[[target_col, 'community']].groupby('community').mean().reset_index() + bad_groups = tmp.loc[tmp[target_col] == 0, "community"].to_list() + if "C-CTRL" in bad_groups: + print("Control group as a mean value equals to 0, exiting...") + exit(1) + df = df[-df["community"].isin(bad_groups)] lmf = r( """ require("DHARMa") function(data, folder, partial_name) { - mod <- lm(%s ~ log(community_size) + community, data=data) + %s simulationOutput <- simulateResiduals(fittedModel = mod, n = 250) png(paste0(folder, "/dignostics_summary", partial_name, ".png")) plot(simulationOutput) @@ -81,7 +96,7 @@ def lm_maker_summary(df: pd.DataFrame, outfile: Path, target_col: str return(as.data.frame(summary(mod)$coefficients)) } - """ % target_col) + """ % mod) folder = outfile.parent / "diagnostics" folder.mkdir(parents=True, exist_ok=True) partial_name = outfile.name.replace('.pdf', '') @@ -92,11 +107,11 @@ def lm_maker_summary(df: pd.DataFrame, outfile: Path, target_col: str res_df.loc[res_df['community'] == "(Intercept)", "community"] = "C-CTRL" mean_df = df[[target_col, "community", "community_size"]]. 
\ groupby(["community", "community_size"]).mean().reset_index() - return res_df.merge(mean_df, how="left", on="community") + return df, res_df.merge(mean_df, how="left", on="community") def lm_with_ctrl(df: pd.DataFrame, - target_col: str, outfile: Path, + target_col: str, outfile: Path, test_type: str ) -> Tuple[pd.DataFrame, pd.DataFrame]: """ :param df: A dataframe containing the id of the chosen `feature` \ @@ -105,6 +120,7 @@ def lm_with_ctrl(df: pd.DataFrame, and the size of the community of the feature if it has one (None, else). :param target_col: The name of the column containing the data of interest :param outfile: File that will contains the final figure + :param test_type: The type of test to make (permutation or lm) :return: The dataframe with ctrl exon and \ The dataframe with the p-value compared to the control \ list of feature. @@ -113,11 +129,12 @@ def lm_with_ctrl(df: pd.DataFrame, size = df.loc[df["community"] == "C-CTRL", :].shape[0] df['community_size'] = df['community_size'].fillna(size) df['community_size'] = df['community_size'].astype(int) - return df, lm_maker_summary(df, outfile, target_col) + return lm_maker_summary(df, outfile, target_col, test_type) def expand_results_lm(df: pd.DataFrame, rdf: pd.DataFrame, - target_col: str, feature: str) -> pd.DataFrame: + target_col: str, feature: str, + test_type: str) -> pd.DataFrame: """ Merge df and rdf together. @@ -130,9 +147,10 @@ def expand_results_lm(df: pd.DataFrame, rdf: pd.DataFrame, exons. :param target_col: The name of the column containing the data of interest :param feature: The feature of interest + :param test_type: The kind of test to make :return: The merged dataframe: i.e df with the stats columns """ - p_col = "Pr(>|t|)" + p_col = "Pr(>|t|)" if test_type == "lm" else "Pr(>|z|)" df = df[[f"id_{feature}", target_col, "community", "community_size"]].copy() rdf = rdf[["community", "community_size", p_col, target_col]].copy() @@ -141,7 +159,8 @@ def expand_results_lm(df: pd.DataFrame, rdf: pd.DataFrame, df = df.merge(rdf, how="left", on=["community", "community_size"]) df_ctrl = df[df["community"] == "C-CTRL"] df = df[df["community"] != "C-CTRL"].copy() - df.sort_values(f"mean_{target_col}", ascending=True, inplace=True) + df.sort_values([f"mean_{target_col}", "community"], ascending=True, + inplace=True) return pd.concat([df_ctrl, df], axis=0, ignore_index=True) @@ -303,7 +322,8 @@ def expand_results_perm(df: pd.DataFrame, rdf: pd.DataFrame, target_col: str, "community", "community_size"]].copy() ctrl_df = rdf[[f"{target_col}_mean_{iteration}_ctrl", f"{target_col}_std_{iteration}_ctrl", "community"]].copy() - rdf = rdf[["community", "community_size", target_col, "p-adj"]].copy() + rdf = rdf[["community", "community_size", target_col, "p-adj", + "reg-adj"]].copy() rdf.rename({target_col: f"mean_{target_col}"}, axis=1, inplace=True) df = df.merge(rdf, how="left", on=["community", "community_size"]) df.sort_values(f"mean_{target_col}", ascending=True, inplace=True) @@ -314,9 +334,41 @@ def expand_results_perm(df: pd.DataFrame, rdf: pd.DataFrame, target_col: str, return pd.concat([df_ctrl, df], axis=0, ignore_index=True) -def make_barplot(df_bar: pd.DataFrame, outfile: Path, +def display_size_fig(g: sns.FacetGrid, display_size: bool, target_col: str, + df_bar: pd.DataFrame): + """ + + :param g: A seaborn FacetGrid + :param display_size: True to display the size of the community above \ + each one of them False to display nothing. 
(default False) + :param target_col: The name of the column containing the data of interest + :param df_bar: A dataframe with the enrichment of a \ + nucleotide frequency for every community (without control) + :return: The seaborn fascetgrid + """ + xrange = g.ax.get_xlim() + if display_size: + df_size = df_bar[[f"mean_{target_col}", "community", + "community_size"]].drop_duplicates() + ax2 = g.ax.twinx() + ax2.set_ylabel('community_size', color="green") + df_size.plot(x="community", y="community_size", kind="scatter", ax=ax2, + legend=False, zorder=55, + color=(0.2, 0.8, 0.2, 0.4)) + ax2.tick_params(axis='y', labelcolor="green") + ax2.grid(False) + sizes = df_size["community_size"].to_list() + if max(sizes) - min(sizes) > 500: + ax2.set_yscale("log") + g.ax.set_xlim(xrange) + g.set(xticklabels=[]) + return g + + +def make_barplot(df_bar: pd.DataFrame, outfile: Path, test_type: str, target_col: str, feature: str, target_kind: str = "", - sd_community: Optional[str] = "sd") -> None: + sd_community: Optional[str] = "sd", + display_size: bool = False) -> None: """ Create a barplot showing the frequency of `nt` for every community \ of exons/gene in `df_bar`. @@ -326,10 +378,13 @@ def make_barplot(df_bar: pd.DataFrame, outfile: Path, :param outfile: File were the figure will be stored :param target_kind: An optional name that describe a bit further \ target_col. + :param test_type: The kind of test to perform :param target_col: The name of the column containing the data of interest :param feature: The king of feature of interest :param sd_community: sd to display community error bar, None to display \ nothing + :param display_size: True to display the size of the community above \ + each one of them False to display nothing. (default False) """ sns.set(context="poster") g = sns.catplot(x="community", y=target_col, data=df_bar, kind="point", @@ -342,9 +397,9 @@ def make_barplot(df_bar: pd.DataFrame, outfile: Path, g.fig.subplots_adjust(top=0.9) target_kind = f" ({target_kind})" if target_kind else "" g.fig.suptitle(f"Mean frequency of {target_col}{target_kind}" - f"among community of {feature}s\n" + f" among community of {feature}s\n" f"(stats obtained with a lm test)") - g.set(xticklabels=[]) + g = display_size_fig(g, display_size, target_col, df_bar) g.ax.set_ylabel(f'Frequency of {target_col}') df_bara = df_bar.drop_duplicates(subset="community", keep="first") for i, p in enumerate(g2.ax.patches): @@ -358,12 +413,14 @@ def make_barplot(df_bar: pd.DataFrame, outfile: Path, ha='center', va='center', xytext=(0, 10), fontsize=12, textcoords='offset points') g.savefig(outfile) + plt.close() def make_barplot_perm(df_bar: pd.DataFrame, outfile: Path, target_col: str, feature: str, target_kind: str = "", - sd_community: Optional[str] = "sd") -> None: + sd_community: Optional[str] = "sd", + display_size: bool = False) -> None: """ Create a barplot showing the frequency of `nt` for every community \ of exons/gene in `df_bar`. @@ -377,6 +434,8 @@ def make_barplot_perm(df_bar: pd.DataFrame, outfile: Path, :param feature: The king of feature of interest :param sd_community: sd to display community error bar, None to display \ nothing + :param display_size: True to display the size of the community above \ + each one of them False to display nothing. 
(default False) """ sns.set(context="poster") df_ctrl = df_bar.loc[df_bar[f"id_{feature}"] == 'ctrl', :] @@ -399,9 +458,9 @@ def make_barplot_perm(df_bar: pd.DataFrame, outfile: Path, g.fig.subplots_adjust(top=0.9) target_kind = f" ({target_kind})" if target_kind else "" g.fig.suptitle(f"Mean frequency of {target_col}{target_kind}" - f"among community of {feature}s\n" + f" among community of {feature}s\n" f"(stats obtained with a permutation test)") - g.set(xticklabels=[]) + g = display_size_fig(g, display_size, target_col, df_bar) g.ax.set_ylabel(f'Frequency of {target_col}') df_bara = df_bar.drop_duplicates(subset="community", keep="first") for i, p in enumerate(g2.ax.patches): @@ -415,11 +474,13 @@ def make_barplot_perm(df_bar: pd.DataFrame, outfile: Path, ha='center', va='center', xytext=(0, 10), fontsize=12, textcoords='offset points') g.savefig(outfile) + plt.close() def barplot_creation(df_bar: pd.DataFrame, outfig: Path, cpnt: str, test_type: str, feature: str, - target_kind: str, sd_community: bool) -> None: + target_kind: str, sd_community: bool, + display_size: bool) -> None: """ Reformat a dataframe with the enrichment of a nucleotide frequency \ for every feature for every community and then create a \ @@ -437,13 +498,16 @@ def barplot_creation(df_bar: pd.DataFrame, outfig: Path, target_col. :param sd_community: True to display the errors bars for communities, False else. + :param display_size: True to display the size of the community above \ + each one of them False to display nothing. (default False) """ sd_community = "sd" if sd_community else None - if test_type == "lm": - make_barplot(df_bar, outfig, cpnt, feature, target_kind) + if test_type != "permutation": + make_barplot(df_bar, outfig, test_type, cpnt, feature, target_kind, + display_size=display_size) else: make_barplot_perm(df_bar, outfig, cpnt, feature, target_kind, - sd_community) + sd_community, display_size) def get_feature_by_community(df: pd.DataFrame, feature: str) -> Dict: @@ -466,6 +530,8 @@ def get_feature_by_community(df: pd.DataFrame, feature: str) -> Dict: dic = {} for i in range(df.shape[0]): com, id_ft = df.iloc[i, :][['community', f'id_{feature}']] + if isinstance(com, float): + com = None if np.isnan(com) else com if com is not None: if com in dic: dic[com].append(id_ft) @@ -480,7 +546,8 @@ def create_community_fig(df: pd.DataFrame, feature: str, dic_com: Optional[Dict] = None, target_kind: str = "", iteration: int = 10000, - sd_community: bool = True) -> None: + sd_community: bool = True, + display_size: bool = False) -> None: """ Create a dataframe with a control community, save it as a table and \ as a barplot figure. @@ -501,13 +568,15 @@ def create_community_fig(df: pd.DataFrame, feature: str, :param iteration: The number of sub samples to create :param sd_community: True to display the errors bars for communities, False else. + :param display_size: True to display the size of the community above \ + each one of them False to display nothing. 
(default False) """ if dic_com is None: - dic_com = {} if test_type == 'lm' \ + dic_com = {} if test_type != 'permutation' \ else get_feature_by_community(df, feature) - if test_type == "lm": - ndf, rdf = lm_with_ctrl(df, target_col, outfile_ctrl) - df_bar = expand_results_lm(ndf, rdf, target_col, feature) + if test_type != "permutation": + ndf, rdf = lm_with_ctrl(df, target_col, outfile_ctrl, test_type) + df_bar = expand_results_lm(ndf, rdf, target_col, feature, test_type) else: rdf = perm_with_ctrl(df, feature, target_col, dic_com, iteration) df_bar = expand_results_perm(df, rdf, target_col, feature, iteration) @@ -516,4 +585,4 @@ def create_community_fig(df: pd.DataFrame, feature: str, bar_outfile = str(outfile_ctrl).replace(".pdf", "_bar.txt") df_bar.to_csv(bar_outfile, sep="\t", index=False) barplot_creation(df_bar, outfile_ctrl, target_col, test_type, feature, - target_kind, sd_community) + target_kind, sd_community, display_size) diff --git a/src/find_interaction_cluster/community_finder.py b/src/find_interaction_cluster/community_finder.py index d65d5a81cacaedc6dbe18b7885e4a89fc4568fa5..b7467fc4ad2995017f4ab99554df16662c016cb4 100644 --- a/src/find_interaction_cluster/community_finder.py +++ b/src/find_interaction_cluster/community_finder.py @@ -11,7 +11,7 @@ import networkx as nx import numpy as np from networkx.algorithms import community import sqlite3 -from .config import ConfigGraph, get_communities +from .config import ConfigGraph, get_communities_basefile from ..nt_composition.make_nt_correlation import get_project_colocalisation from ..logging_conf import logging_def import pandas as pd @@ -19,11 +19,9 @@ import logging import plotly.graph_objects as go import plotly from pathlib import Path -from typing import Tuple, Dict, List +from typing import Tuple, Dict import matplotlib.cm as cm from matplotlib.colors import to_rgba -from itertools import product -import multiprocessing as mp import json import subprocess as sp @@ -106,7 +104,7 @@ def find_communities(graph: nx.Graph, project: str, cmd = f"mpirun -np 1 {ConfigGraph.get_hipmcl_prog()} -M {outfile} " \ f"-I 1.5 -per-process-mem 50 -o {result_file}" sp.check_call(cmd, shell=True, stderr=sp.STDOUT) - communities = get_communities(result_file) + communities = get_communities_basefile(result_file) dic_community = {} cov = round(community.coverage(graph, communities), 2) modularity = community.modularity(graph, communities, weight='X') diff --git a/src/find_interaction_cluster/nt_and_community.py b/src/find_interaction_cluster/nt_and_community.py index 2f9d7d0e0e6608e58603d11585d8792e3c73749f..c38f4f878d8fb7db8d6502908d0935771e3d196e 100644 --- a/src/find_interaction_cluster/nt_and_community.py +++ b/src/find_interaction_cluster/nt_and_community.py @@ -12,7 +12,7 @@ import logging import sqlite3 from .config import ConfigGraph, get_communities from typing import List, Tuple, Dict -from doctest import testmod +import lazyparser as lp from functools import reduce from pathlib import Path from rpy2.robjects import r, pandas2ri @@ -54,7 +54,7 @@ def get_cpnt_frequency(cnx: sqlite3.Connection, list_ft: List[str], >>> d = get_cpnt_frequency(sqlite3.connect(ConfigGraph.db_file), ... 
['1', '2'], "gene", 'exon', 'aa') >>> d[["id_gene", "R", "K", "D", "Q", "E"]] - ft id_gene R K D Q E + ft id_gene R K D Q E 0 1 4.75247 5.19300 5.95391 4.07997 6.96189 1 2 4.34203 6.23736 6.77708 5.21984 7.01769 """ @@ -78,37 +78,36 @@ def get_cpnt_frequency(cnx: sqlite3.Connection, list_ft: List[str], return df -def get_community_table(communities: List[List[str]], +def get_community_table(communities: Dict[str, List[str]], size_threshold: int, feature: str) -> pd.DataFrame: """ return the table indicating the name of the exons and the \ the name of the community. - :param communities: List of community of exons + :param communities: Dictionary of community of exons :param size_threshold: The required size a community must \ have to be considered :param feature: The kind of feature analysed :return: table of community - >>> c = [['1_1', '2_5'], ['7_9', '4_19', '3_3']] + >>> c = {"C1": ['1_1', '2_5'], "C2": ['7_9', '4_19', '3_3']} >>> get_community_table(c, 3, 'exon') community id_exon community_size 0 C2 7_9 3 1 C2 4_19 3 2 C2 3_3 3 - >>> c = [['1', '2'], ['7', '49', '3']] + >>> c = {"G1": ['1', '2'], "G2": ['7', '49', '3']} >>> get_community_table(c, 3, 'gene') community id_gene community_size - 0 C2 7 3 - 1 C2 49 3 - 2 C2 3 3 + 0 G2 7 3 + 1 G2 49 3 + 2 G2 3 3 """ dic = {"community": [], f"id_{feature}": [], "community_size": []} - for k, comm in enumerate(communities): + for k, comm in communities.items(): if len(comm) >= size_threshold: - name = f'C{k + 1}' clen = len(comm) for exon in comm: - dic["community"].append(name) + dic["community"].append(k) dic[f'id_{feature}'].append(exon) dic["community_size"].append(clen) return pd.DataFrame(dic) @@ -183,9 +182,8 @@ def create_ctrl_community(df: pd.DataFrame, list_ft = [cid for cid in ft_id if cid not in df[f"id_{feature}"].to_list()] df_nt = get_cpnt_frequency(cnx, list_ft, feature, region, cpnt_type) - df_com = get_community_table([list_ft], size_threshold, feature) + df_com = get_community_table({"C-CTRL": list_ft}, size_threshold, feature) df_nt = df_nt.merge(df_com, how="left", on=f"id_{feature}") - df_nt['community'] = ['C-CTRL'] * df_nt.shape[0] df = pd.concat([df, df_nt], axis=0, ignore_index=True) return df @@ -252,7 +250,7 @@ def prepare_dataframe(df: pd.DataFrame, test_type: str, nt: str, def create_outfiles(project: str, weight: int, global_weight: int, same_gene: bool, feature: str, cpnt_type: str, - cpnt: str, test_type: str + cpnt: str, test_type: str, community_file: str ) -> Tuple[Path, Path]: """ Create a file used to store diagnostics and a file used to store the \ @@ -271,11 +269,22 @@ def create_outfiles(project: str, weight: int, global_weight: int, :param cpnt: The component (nt, aa, dnt) of interest :param feature: The kind of feature analysed :param test_type: The type of test to make (permutation or lm) + :param community_file: A file containing custom communities. If \ + it equals to '' then weight, global weight and same genes parameter are \ + used to find the community files computed with ChIA-PET data. 
:return: file used to store diagnostics and a file used to store the \ table containing the test communities and the control community """ outfolder = f"community_enrichment/{cpnt_type}_analysis" + if community_file != "": + cn = Path(community_file).name.replace(".txt", "") + base = ConfigGraph.output_folder / outfolder / f"custom_{cn}" + base.mkdir(exist_ok=True, parents=True) + outfile = base / f"{cpnt}-{cpnt_type}_stat_{test_type}.txt" + outfile_ctrl = base / \ + f"{cpnt}-{cpnt_type}_VS_CTRL_stat_{test_type}.pdf" + return outfile, outfile_ctrl outfile = ConfigGraph.\ get_community_file(project, weight, global_weight, same_gene, feature, f"{cpnt}-{cpnt_type}_stat_{test_type}.txt", @@ -292,7 +301,9 @@ def get_stat_cpnt_communities(df: pd.DataFrame, project: str, weight: int, cpnt_type: str, cpnt: str, dic_com: Dict, feature: str = "exon", test_type: str = "", - iteration: int = 1000) -> pd.Series: + iteration: int = 1000, + display_size: bool = False, + community_file: str = "") -> pd.Series: """ Get data (proportion of `reg` regulated exons by a splicing factor `sf_name` within a community) for every community. @@ -315,23 +326,30 @@ def get_stat_cpnt_communities(df: pd.DataFrame, project: str, weight: int, :param feature: The kind of feature analysed :param test_type: The type of test to make (permutation or lm) :param iteration: The number of sub samples to create + :param display_size: True to display the size of the community. \ + False to display nothing. (default False) + :param community_file: A file containing custom communities. If \ + it equals '' then the weight, global_weight and same_gene parameters are \ + used to find the community files computed with ChIA-PET data. """ logging.debug(f"{test_type} for {project}, w:{weight}, " f"g:{global_weight} cpnt: {cpnt}({cpnt_type})") outfile, outfile_ctrl = create_outfiles(project, weight, global_weight, same_gene, feature, cpnt_type, - cpnt, test_type) + cpnt, test_type, community_file) res = {"project": project, "cpnt": cpnt, 'pval': lm_maker(df, outfile, cpnt)} create_community_fig(df, feature, cpnt, outfile_ctrl, test_type, - dic_com, cpnt_type, iteration) + dic_com, cpnt_type, iteration, + display_size=display_size) return pd.Series(res) def create_dataframe(project: str, weight: int, global_weight: int, same_gene: bool, feature: str = 'exon', region: str = "", component_type: str = 'nt', - from_communities: bool = True + community_file: str = "", + from_communities: bool = True, ) -> pd.DataFrame: """ :param project: The name of the project of interest @@ -347,17 +365,27 @@ def create_dataframe(project: str, weight: int, global_weight: int, :param region: the region of interest to extract from gene :param component_type: The type of component to analyse; It \ can be 'nt', 'dnt' or 'aa'. + :param community_file: A file containing custom communities. If \ + it equals '' then the weight, global_weight and same_gene parameters are \ + used to find the community files computed with ChIA-PET data.
:return: A dataframe containing the frequency of every nucleotide \ of every exon in a large community """ size_threshold = 10 if feature == "gene" else 50 cnx = sqlite3.connect(ConfigGraph.db_file) if from_communities: - result = ConfigGraph.get_community_file(project, weight, global_weight, - same_gene, feature, - "_communities.txt") - communities = get_communities(result, 0) - ncommunities = [c for c in communities if len(c) >= size_threshold] + if community_file == "": + result = ConfigGraph.get_community_file(project, weight, + global_weight, + same_gene, feature, + ".txt") + else: + result = Path(community_file) + if not result.is_file(): + raise FileNotFoundError(f"File {community_file} not found!") + communities = get_communities(result, feature, 0) + ncommunities = [communities[c] for c in communities + if len(communities[c]) >= size_threshold] list_exon = reduce(lambda x, y: x + y, ncommunities) df_com = get_community_table(communities, size_threshold, feature) @@ -372,7 +400,8 @@ def create_dataframe(project: str, weight: int, global_weight: int, def create_dataframes(project, weight, global_weight, same_gene, feature, - region, test_type, component_type: str + region, test_type, component_type: str, + community_file: str ) -> Tuple[pd.DataFrame, Dict]: """ :param weight: The weight of interaction to consider @@ -387,9 +416,12 @@ def create_dataframes(project, weight, global_weight, same_gene, feature, :param test_type: The type of test to make (permutation or lm) :param component_type: The type of component to analyse; It \ can be 'nt', 'dnt' or 'aa'. + :param community_file: A file containing custom communities. If \ + it equals '' then the weight, global_weight and same_gene parameters are \ + used to find the community files computed with ChIA-PET data. """ df = create_dataframe(project, weight, global_weight, same_gene, - feature, region, component_type) + feature, region, component_type, community_file) df_ctrl = create_dataframe(project, weight, global_weight, same_gene, feature, region, component_type, from_communities=False) @@ -411,9 +443,11 @@ def multiple_nt_lm_launcher(ps: int, component_type: str = "nt", test_type: str = "lm", iteration: int = 1000, + display_size: bool = False, + community_file: str = "", logging_level: str = "DISABLE"): """ - Launch the statistical analysis for every + Launch the creation of community files for every component type selected. :param ps: The number of processes to use :param weight: The weight of interaction to consider @@ -429,6 +463,11 @@ def multiple_nt_lm_launcher(ps: int, :param region: the region of interest to extract from gene :param test_type: The type of test to make (permutation or lm) :param iteration: The number of iterations to perform + :param display_size: True to display the size of the community. \ + False to display nothing. (default False) + :param community_file: A file containing custom communities. If \ + it equals '' then the weight, global_weight and same_gene parameters are \ + used to find the community files computed with ChIA-PET data.
:param logging_level: Level of information to display """ ConfigGraph.community_folder.mkdir(exist_ok=True, parents=True) @@ -449,14 +488,15 @@ def multiple_nt_lm_launcher(ps: int, logging.debug("Creating tables") df, dic_com = create_dataframes(project, weight, global_weight, same_gene, feature, region, - test_type, component_type) + test_type, component_type, community_file) for project, weight, cpnt in condition: nfile_table = ConfigGraph.get_community_file( project, weight, global_weight, same_gene, feature, f"_{component_type}_table.txt", "community_enrichment") df.to_csv(nfile_table, sep="\t", index=False) args = [df, project, weight, global_weight, same_gene, component_type, - cpnt, dic_com, feature, test_type, iteration] + cpnt, dic_com, feature, test_type, iteration, display_size, + community_file] processes.append(pool.apply_async(get_stat_cpnt_communities, args)) results = [p.get(timeout=None) for p in processes] pool.close() @@ -473,5 +513,53 @@ def multiple_nt_lm_launcher(ps: int, fdf.to_csv(noutfile, sep="\t", index=False) +@lp.parse(ps=range(mp.cpu_count()), weight=range(1, 11), + global_weight=range(11), + feature=('gene', 'exon'), region=('', 'exon', 'intron', 'gene'), + iteration="iteration > 20", test_type=('lm', 'permutation'), + component_type=("nt", "aa", "dnt")) +def launcher_community_file(ps: int = 1, + weight: int = -1, + global_weight: int = -1, + project: str = "GSM1018963_GSM1018964", + same_gene: bool = True, + feature: str = 'exon', + region: str = '', + component_type: str = "nt", + test_type: str = "lm", + iteration: int = 1000, + display_size: bool = False, + community_file: str = "", + logging_level: str = "DISABLE"): + """ + Launch the creation of community files for every component type selected. + + :param ps: The number of processes to use (default 1) + :param weight: The weight of interaction to consider (default -1) + :param global_weight: The global weights to consider. If \ + the global weight is equal to 0 then the project `project` is \ + used. (default -1) + :param project: The project name, used only if global_weight = 0 + :param same_gene: Say if we consider as co-localised exons within the \ + same gene (default True) + :param feature: The kind of analysed feature (default exon) + :param component_type: The type of component to analyse; It \ + can be 'nt', 'dnt' or 'aa'. + :param region: the region of interest to extract from gene (default '') + :param test_type: The type of test to make (permutation or lm) (default lm) + :param iteration: The number of iterations to perform (default 1000) + :param display_size: True to display the size of the community. \ + False to display nothing. (default False) + :param community_file: A file containing custom communities. If \ + it equals '' then the weight, global_weight and same_gene parameters are \ + used to find the community files computed with ChIA-PET data.
(default '') + :param logging_level: Level of information to display (default DISABLE) + """ + multiple_nt_lm_launcher(ps, weight, global_weight, project, + same_gene, feature, region, component_type, + test_type, iteration, display_size, community_file, + logging_level) + + if __name__ == "__main__": - testmod() + launcher_community_file() diff --git a/src/find_interaction_cluster/sf_and_communities.py b/src/find_interaction_cluster/sf_and_communities.py index d81dc5931eaa789c0ef2a119a64cd3197077571b..41728da8cfec21ee69cbe30eb1261f14dfc1a87a 100644 --- a/src/find_interaction_cluster/sf_and_communities.py +++ b/src/find_interaction_cluster/sf_and_communities.py @@ -9,9 +9,8 @@ are often regulated by a splicing factor from typing import List, Tuple, Dict import sqlite3 -from .config import ConfigGraph +from .config import ConfigGraph, get_communities_basefile from ..figures_utils.exons_interactions import get_every_events_4_a_sl -from .community_finder import get_communities import pandas as pd import numpy as np from itertools import product @@ -298,8 +297,8 @@ def get_stat4communities(sf_name: str, reg: str, cnx = sqlite3.connect(ConfigGraph.db_file) result = ConfigGraph.get_community_file(project, weight, global_weight, same_gene, feature, - "_communities.txt") - communities = get_communities(result, 0) + ".txt") + communities = get_communities_basefile(result, 0) regulated_dic, number = get_every_events_4_a_sl(cnx, sf_name, reg) reg_ft = regulated_dic[sf_name + "_" + reg] reg_ft = adapt_regulated_list(cnx, reg_ft, sf_name, reg, feature) diff --git a/src/nt_composition/distance.py b/src/nt_composition/distance.py index 41c64c12bc0406061ecaa39c18b05b25f3f6b800..c2ac000c5d7865ccbd3346d1f8ab048b19435012 100644 --- a/src/nt_composition/distance.py +++ b/src/nt_composition/distance.py @@ -155,8 +155,12 @@ def compute_controls_distances(arr_interaction: np.array, list_int = [] exon = np.unique(arr_interaction.flatten()) nb_interaction = len(arr_interaction) - pbar = tqdm(range(iteration), position=int( - mp.current_process().name.replace("ForkPoolWorker-", ""))) + try: + worker_name = int(mp.current_process().name. 
+ replace("ForkPoolWorker-", "")) + except ValueError: + worker_name = 0 + pbar = tqdm(range(iteration), position=worker_name) for _ in pbar: ctrl_coloc = randomize_colocalisation(exon, nb_interaction) list_val.append(compute_mean_distance(ctrl_coloc, dic_freq)) diff --git a/tests/__init__.py b/tests/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/tests/files/SF_test_bar.txt b/tests/files/SF_test_bar.txt new file mode 100644 index 0000000000000000000000000000000000000000..0f8b6b0691bd5d224993966d665f759eef025d95 --- /dev/null +++ b/tests/files/SF_test_bar.txt @@ -0,0 +1,103 @@ +peak_density ctrl_std community mean_peak_density id_gene community_size order p-adj reg-adj +0.15811188472465293 0.12663292250602926 C91 0.1586054794606214 ctrl 349.0 0.0 +0.16041415362534722 0.10696043743139222 C852 0.1586054794606214 ctrl 349.0 1.0 +0.15890638952977632 0.1266005146872099 C305 0.1586054794606214 ctrl 349.0 2.0 +0.15698525929832327 0.11753428212815271 C279 0.1586054794606214 ctrl 349.0 3.0 +0.1578382446087718 0.10899977512514246 C1093 0.1586054794606214 ctrl 349.0 4.0 +0.0 C91 0.0027225799926581832 13856 2.0 0.027338333333333336 - +0.0 C91 0.0027225799926581832 13360 2.0 0.027338333333333336 - +0.0 C852 0.006666068752563779 7831 1.0 0.015356000000000002 - +0.06863417982155113 C10 0.27053090890214626 16763 3.0 0.2282961325966851 . +0.01505824823817057 C10 0.27053090890214626 16676 3.0 0.2282961325966851 . +0.05655468838366701 C10 0.27053090890214626 16817 3.0 0.2282961325966851 . +0.24937655860349128 C165 0.2714873366751865 7314 12.0 0.2528539215686274 . +0.17001020061203673 C165 0.2714873366751865 8811 12.0 0.2528539215686274 . +0.24993751562109473 C165 0.2714873366751865 7335 12.0 0.2528539215686274 . +0.06930228590924599 C165 0.2714873366751865 8538 12.0 0.2528539215686274 . +0.28785261945883706 C165 0.2714873366751865 8278 12.0 0.2528539215686274 . +1.278772378516624 C165 0.2714873366751865 8177 12.0 0.2528539215686274 . +0.0 C165 0.2714873366751865 8827 12.0 0.2528539215686274 . +0.07797270955165693 C165 0.2714873366751865 7688 12.0 0.2528539215686274 . +0.2229902999219534 C165 0.2714873366751865 7199 12.0 0.2528539215686274 . +0.14855897791423195 C165 0.2714873366751865 8446 12.0 0.2528539215686274 . +0.41693690354859636 C165 0.2714873366751865 7168 12.0 0.2528539215686274 . +0.08613759044446996 C165 0.2714873366751865 7889 12.0 0.2528539215686274 . +0.0 C325 0.2723632059180303 18286 18.0 0.2282961325966851 . +0.3679914358356751 C325 0.2723632059180303 18098 18.0 0.2282961325966851 . +0.15547263681592038 C325 0.2723632059180303 18108 18.0 0.2282961325966851 . +0.10806668697010482 C325 0.2723632059180303 18182 18.0 0.2282961325966851 . +0.16793551276309898 C325 0.2723632059180303 17796 18.0 0.2282961325966851 . +0.13238019592268996 C325 0.2723632059180303 17922 18.0 0.2282961325966851 . +0.26574541589157585 C325 0.2723632059180303 17706 18.0 0.2282961325966851 . +0.19554165037152915 C325 0.2723632059180303 17976 18.0 0.2282961325966851 . +0.1679712433231431 C325 0.2723632059180303 18101 18.0 0.2282961325966851 . +0.3679914358356751 C325 0.2723632059180303 18033 18.0 0.2282961325966851 . +0.1834946557181522 C325 0.2723632059180303 18127 18.0 0.2282961325966851 . +0.0 C325 0.2723632059180303 17997 18.0 0.2282961325966851 . +0.3897116134060795 C325 0.2723632059180303 17999 18.0 0.2282961325966851 . +0.1543434049970947 C325 0.2723632059180303 18226 18.0 0.2282961325966851 . 
+0.0 C325 0.2723632059180303 17689 18.0 0.2282961325966851 . +2.155172413793103 C325 0.2723632059180303 18202 18.0 0.2282961325966851 . +0.0 C325 0.2723632059180303 18274 18.0 0.2282961325966851 . +0.09071940488070399 C325 0.2723632059180303 17753 18.0 0.2282961325966851 . +0.0 C232 0.2762251885031267 17226 12.0 0.2439456852791878 . +0.11020498126515318 C232 0.2762251885031267 17418 12.0 0.2439456852791878 . +0.2179233580313139 C232 0.2762251885031267 16985 12.0 0.2439456852791878 . +0.18858673103760418 C232 0.2762251885031267 16897 12.0 0.2439456852791878 . +0.5926245184925788 C232 0.2762251885031267 16536 12.0 0.2439456852791878 . +0.5753526354862657 C232 0.2762251885031267 17214 12.0 0.2439456852791878 . +0.8451384417256922 C232 0.2762251885031267 17271 12.0 0.2439456852791878 . +0.13189364096792347 C232 0.2762251885031267 17360 12.0 0.2439456852791878 . +0.06295114990767164 C232 0.2762251885031267 16022 12.0 0.2439456852791878 . +0.022621363615798758 C232 0.2762251885031267 16660 12.0 0.2439456852791878 . +0.25590322205420496 C232 0.2762251885031267 17080 12.0 0.2439456852791878 . +0.31150221945331363 C232 0.2762251885031267 17038 12.0 0.2439456852791878 . +0.0 C397 0.27717091037401387 10558 20.0 0.20961024096385542 . +0.0 C397 0.27717091037401387 11520 20.0 0.20961024096385542 . +0.4414036636504083 C397 0.27717091037401387 10569 20.0 0.20961024096385542 . +0.4857175899155777 C397 0.27717091037401387 10641 20.0 0.20961024096385542 . +0.05470260019692936 C397 0.27717091037401387 10333 20.0 0.20961024096385542 . +0.0 C397 0.27717091037401387 12509 20.0 0.20961024096385542 . +0.16477179106936893 C397 0.27717091037401387 11687 20.0 0.20961024096385542 . +0.5427702996092054 C397 0.27717091037401387 10886 20.0 0.20961024096385542 . +0.41946308724832215 C397 0.27717091037401387 10784 20.0 0.20961024096385542 . +0.19127108328986264 C397 0.27717091037401387 10997 20.0 0.20961024096385542 . +0.41126876413736374 C397 0.27717091037401387 10907 20.0 0.20961024096385542 . +0.5374417771408098 C397 0.27717091037401387 12115 20.0 0.20961024096385542 . +0.0 C397 0.27717091037401387 10922 20.0 0.20961024096385542 . +0.8423417099536712 C397 0.27717091037401387 10586 20.0 0.20961024096385542 . +0.2102828304068973 C397 0.27717091037401387 12030 20.0 0.20961024096385542 . +0.0 C397 0.27717091037401387 10627 20.0 0.20961024096385542 . +0.5621486570893192 C397 0.27717091037401387 12525 20.0 0.20961024096385542 . +0.2037005601765405 C397 0.27717091037401387 12172 20.0 0.20961024096385542 . +0.0 C397 0.27717091037401387 11346 20.0 0.20961024096385542 . +0.4761337935960005 C397 0.27717091037401387 11949 20.0 0.20961024096385542 . +0.0018155904754123661 C952 0.27806788897859824 15390 10.0 0.2528539215686274 . +0.0 C952 0.27806788897859824 15348 10.0 0.2528539215686274 . +0.27839643652561247 C952 0.27806788897859824 14966 10.0 0.2528539215686274 . +0.0 C952 0.27806788897859824 15272 10.0 0.2528539215686274 . +0.0 C952 0.27806788897859824 15287 10.0 0.2528539215686274 . +1.7311021350259663 C952 0.27806788897859824 15135 10.0 0.2528539215686274 . +0.0 C952 0.27806788897859824 14992 10.0 0.2528539215686274 . +0.7302468234263181 C952 0.27806788897859824 15006 10.0 0.2528539215686274 . +0.026880997822639176 C952 0.27806788897859824 15199 10.0 0.2528539215686274 . +0.012236906510034264 C952 0.27806788897859824 15285 10.0 0.2528539215686274 . +0.24703557312252963 C365 0.2789826258284869 13959 12.0 0.2439456852791878 . +0.7249003262051469 C365 0.2789826258284869 13415 12.0 0.2439456852791878 . 
+0.0 C365 0.2789826258284869 5587 12.0 0.2439456852791878 . +0.10414792792963387 C365 0.2789826258284869 12728 12.0 0.2439456852791878 . +0.31294897509210656 C365 0.2789826258284869 5593 12.0 0.2439456852791878 . +0.0 C365 0.2789826258284869 13737 12.0 0.2439456852791878 . +0.0 C365 0.2789826258284869 13542 12.0 0.2439456852791878 . +0.0 C365 0.2789826258284869 13008 12.0 0.2439456852791878 . +0.6556148730916924 C365 0.2789826258284869 13265 12.0 0.2439456852791878 . +0.0 C365 0.2789826258284869 13783 12.0 0.2439456852791878 . +1.303143834500733 C365 0.2789826258284869 13553 12.0 0.2439456852791878 . +0.0 C365 0.2789826258284869 13032 12.0 0.2439456852791878 . +0.30063132578414675 C680 0.28202754935175395 16283 2.0 0.25166582914572866 . +1.1500862564692351 C680 0.28202754935175395 16852 2.0 0.25166582914572866 . +1.4199743052268579 C916 0.9734678858566262 11489 2.0 0.0 + +0.5591649802960912 C916 0.9734678858566262 11723 2.0 0.0 + +1.5463917525773194 C690 1.0070237479949165 9289 3.0 0.0 + +0.0 C690 1.0070237479949165 9734 3.0 0.0 + +1.9790783149561777 C690 1.0070237479949165 9708 3.0 0.0 + diff --git a/tests/files/clip.bed b/tests/files/clip.bed new file mode 100644 index 0000000000000000000000000000000000000000..69c737df3f8b4af5c58b8f4e321b219c9703e1e6 --- /dev/null +++ b/tests/files/clip.bed @@ -0,0 +1,8 @@ +18 28645943 28646043 1 clip1 - +18 28646043 28646070 1 clip2 - +18 28709190 28709200 2 clip3 - +18 28898040 28898060 3 clip4 + +18 28898060 28898080 3 clip5 + +18 28898080 28898100 3 clip6 + +18 51062273 51062283 8 clip7 + +18 49866521 49866541 8 clip8 + \ No newline at end of file diff --git a/tests/files/exons.bed b/tests/files/exons.bed new file mode 100644 index 0000000000000000000000000000000000000000..a61bd39eb9cce7276d2ebf93eb85a9461ff21ad0 --- /dev/null +++ b/tests/files/exons.bed @@ -0,0 +1,10 @@ +#ref start end id score strand +18 28681865 28682388 1_1 0 - +18 28681183 28681432 1_2 0 - +18 28673521 28673606 1_3 0 - +18 28672063 28672263 1_4 0 - +18 28671489 28671530 1_5 0 - +18 28670990 28671110 1_6 0 - +18 28669401 28669557 1_7 0 - +18 28667631 28667776 1_8 0 - +18 28666538 28666705 1_9 0 - diff --git a/tests/files/genes.bed b/tests/files/genes.bed new file mode 100644 index 0000000000000000000000000000000000000000..5840938c2b694d71ad66a49b77bce5f8ef3ab288 --- /dev/null +++ b/tests/files/genes.bed @@ -0,0 +1,12 @@ +#ref start end id score strand +18 28645943 28682388 1 DSC2 - +18 28709190 28742819 2 DSC1 - +18 28898050 28937394 3 DSG1 + +18 28956739 28994869 4 DSG4 + +13 45766989 45775176 5 KCTD4 - +13 45911001 45915347 6 TPT1 - +18 48918411 49088839 7 AC011260.1 + +18 49866541 51062273 8 DCC + +13 45967450 45992516 9 SLC25A30 - +20 238376 241735 415 DEFB132 + +X 102962271 102983552 10123 GLRA4 - diff --git a/tests/files/test_community_file.txt b/tests/files/test_community_file.txt new file mode 100644 index 0000000000000000000000000000000000000000..d552e4459b630602b170109bd9102d5e3747b6c8 --- /dev/null +++ b/tests/files/test_community_file.txt @@ -0,0 +1,3 @@ +community nodes edges EC HCS %E vs E in complete G cov modularity genes project +C1 11 20 1 no 36.36 0.95 0.95045 415, 416, 421, 422, 423, 433, 441, 475, 481, 502, 511 Global-weight-3 +C2 5 4 1 no 40.0 0.95 0.95045 10123, 10209, 8812, 9140, 9166 Global-weight-3 diff --git a/tests/test.py b/tests/test.py new file mode 100644 index 0000000000000000000000000000000000000000..d0892e27762056d62cd4956d4ca22cb378b5efe0 --- /dev/null +++ b/tests/test.py @@ -0,0 +1,51 @@ +#!/usr/bin/env python3 + +# -*- coding: UTF-8 -*- + 
+""" +Description: +""" + +import doctest +from pathlib import Path +from typing import List +import unittest +import sys + +sys.path.insert(0, str(Path(__file__).parents[1].resolve())) + + +# recover ignored files +def get_ignored_files() -> List[str]: + """ + Recover ignored python files in gitignore + """ + gitignore = Path(__file__).parents[1] / '.gitignore' + if not gitignore.is_file(): + return [] + with gitignore.open('r') as f: + files = f.read().splitlines() + return [cfile.replace('.py', '').replace('/', '.') + for cfile in files if cfile.endswith('.py')] + \ + ["src.db_utils.interactions.features_interactions"] + + +# Loading every python file in this folder +list_mod = [str(mfile.relative_to(Path(__file__).resolve().parents[1])) + for mfile in + list((Path(__file__).resolve().parents[1] / "src").rglob('*.py'))] +list_mod2 = [m.replace('.py', '').replace('/', '.') for m in list_mod + if '__init__' not in m + and '__main__' not in m + and 'test' not in m] +final_mod = [mod for mod in list_mod2 if mod not in get_ignored_files()] + + +def load_tests(loader, tests, ignore): + for cmod in final_mod: + tests.addTest(doctest.DocTestSuite(cmod)) + return tests + + +if __name__ == "__main__": + unittest.main()