diff --git a/src/find_interaction_cluster/clip_figures/clip_analyser.py b/src/find_interaction_cluster/clip_figures/clip_analyser.py
index b30447e4a4151f26a398e7386e2db55aa578e06c..4346a2f2de4ce5267c92f51329db7a729bcf2e4f 100644
--- a/src/find_interaction_cluster/clip_figures/clip_analyser.py
+++ b/src/find_interaction_cluster/clip_figures/clip_analyser.py
@@ -17,6 +17,8 @@ from ..community_finder import multiple_community_launcher, get_projects
 from ..community_figures.fig_functions import create_community_fig
 from ...logging_conf import logging_def
 import logging
+from typing import Tuple
+import multiprocessing as mp
 
 
 def bedtools_intersect(gene_bed: Path, clip_bed: Path,
@@ -197,10 +199,50 @@ def create_table(feature: str, clip_file: Path,
     return merge_dataframes(df_clip, df_com, feature)
 
 
+def select_community_file(project: str, weight: int, global_weight: int,
+                          same_gene: bool, feature: str,
+                          community_file: str = "") -> Tuple[Path, Path]:
+    """
+    Return the community file and output folder that will be used.
+
+    :param project: The name of the project of interest
+    :param weight: The minimum weight of interaction to consider
+    :param global_weight: The global weight to consider. If \
+    the global weight is equal to 0 then the density figures are \
+    calculated by project, else all projects are merged together and \
+    the interactions seen in `global_weight` projects are taken into account
+    :param same_gene: Say if we consider as co-localised, exons within the \
+    same gene (True) or not (False)
+    :param feature: The feature we want to analyse
+    :param community_file: A file containing custom communities. If \
+    it equals '' then the weight, global weight and same gene parameters \
+    are used to find the community file computed with ChIA-PET data.
+    :return: The community file used and the output folder used.
+    :raises FileNotFoundError: If `community_file` is given but is not \
+    an existing file.
+
+    """
+    if community_file == "":
+        com_file = find_or_create_community(project, weight, global_weight,
+                                            same_gene, feature)
+        output = com_file.parent / f"CLIP_community_figures_{feature}"
+    else:
+        com_file = Path(community_file)
+        if not com_file.is_file():
+            raise FileNotFoundError(f"File {com_file} was not found !")
+        # Strip only a trailing '.txt' extension, not every '.txt' substring
+        tmp_name = com_file.stem if com_file.suffix == ".txt" \
+            else com_file.name
+        output = ConfigClip.output_folder / \
+            f"CLIP_community_figures-{feature}-{tmp_name}"
+    return com_file, output
+
+
 def create_figure(project: str, weight: int, global_weight: int,
                   same_gene: bool, feature: str, clip_file: Path,
                   feature_bed: Path, test_type: str = "permutation",
-                  iteration: int = 10000) -> None:
+                  iteration: int = 10000, display_size: bool = False,
+                  community_file: str = "") -> None:
     """
     Create the final figure
     :param project: The name of the project of interest
@@ -218,21 +260,29 @@ def create_figure(project: str, weight: int, global_weight: int,
     :param test_type: The king of test to perform for frequency analysis. \
     (default 'lm') (choose from 'lm', 'permutation')
     :param iteration: The number of iteration to make
+    :param display_size: True to display the size of the community. \
+    False to display nothing. (default False)
+    :param community_file: A file containing custom communities. If \
+    it equals '' then the weight, global weight and same gene parameters \
+    are used to find the community file computed with ChIA-PET data.
     """
-    com_file = find_or_create_community(project, weight, global_weight,
-                                        same_gene, feature)
-    output = com_file.parent / f"CLIP_community_figures_{feature}"
-    output.mkdir(exist_ok=True)
+    logging.info(f"Working on {clip_file}")
+    com_file, output = select_community_file(project, weight, global_weight,
+                                             same_gene, feature,
+                                             community_file)
+    output.mkdir(exist_ok=True, parents=True)
     outfile = output / f"{clip_file.name.split('.')[0]}.pdf"
     final_table = create_table(feature, clip_file, feature_bed, com_file)
     create_community_fig(final_table, feature, "peak_density", outfile,
-                         test_type, iteration=iteration)
+                         test_type, iteration=iteration,
+                         display_size=display_size)
 
 
 def clip_folder_analysis(clip_folder: Path, project: str, weight: int,
                          global_weight: int, same_gene: bool, feature: str,
                          test_type: str = "permutation",
-                         iteration: int = 10000,
+                         iteration: int = 10000, display_size: bool = False,
+                         community_file: str = "", ps: int = 1,
                          logging_level: str = "DEBUG") -> None:
     """
     Create the final figure
@@ -249,6 +299,13 @@ def clip_folder_analysis(clip_folder: Path, project: str, weight: int,
     :param test_type: The king of test to perform for frequency analysis. \
     (default 'lm') (choose from 'lm', 'permutation')
     :param iteration: The number of iteration to make
+    :param display_size: True to display the size of the community. \
+    False to display nothing. (default False)
+    :param community_file: A file containing custom communities. If \
+    it equals '' then the weight, global weight and same gene parameters \
+    are used to find the community file computed with ChIA-PET data.
+    :param ps: The maximum number of worker processes to create; it is \
+    clamped between 1 and the number of CLIP files found (default 1)
     :param logging_level: The level of data to display (default 'DISABLE')
     """
     logging_def(ConfigGraph.community_folder, __file__, logging_level)
@@ -256,7 +313,18 @@ def clip_folder_analysis(clip_folder: Path, project: str, weight: int,
         else ConfigClip.bed_exon
     files = list(clip_folder.glob("*.bed")) + \
         list(clip_folder.glob("*.bed.gz"))
-    for mfile in files:
-        logging.info(f"Working on {mfile}")
-        create_figure(project, weight, global_weight, same_gene, feature,
-                      mfile, feature_bed, test_type, iteration)
+    if not files:
+        # mp.Pool(processes=0) raises ValueError: nothing to do without files
+        logging.warning(f"No bed file found in {clip_folder}")
+        return
+    # Leaving the `with` block terminates the pool, so every result is
+    # collected inside it; get() also re-raises exceptions from the workers.
+    with mp.Pool(processes=max(1, min(len(files), ps))) as pool:
+        processes = []
+        for mfile in files:
+            args = [project, weight, global_weight, same_gene, feature,
+                    mfile, feature_bed, test_type, iteration, display_size,
+                    community_file]
+            processes.append(pool.apply_async(create_figure, args))
+        for process in processes:
+            process.get(timeout=None)