#!/usr/bin/env python3

# -*- coding: UTF-8 -*-

"""
Description: Launch Clip analysis for many communities
"""


import logging
import multiprocessing as mp
import subprocess as sp
from itertools import product
from pathlib import Path
from typing import Tuple, Dict

import lazyparser as lp
import numpy as np

from .config import ConfigClip
from .clip_analyser import create_table, \
    add_regulation_column, create_community_fig, find_or_create_community, \
    ConfigGraph, logging_def


def select_community_file(project: str, weight: int, global_weight: int,
                          same_gene: bool, inflation: float, cell_line: str,
                          feature: str, clip_file: Path,
                          community_file: str = "") -> Tuple[Path, Path]:
    """
    Return the community file and output folder that will be used.

    :param project: The name of the project of interest
    :param weight: The minimum weight of interaction to consider
    :param global_weight: The global weight to consider. If \
    the global weight is equal to 0 then the density figures are calculated \
    by project, else all projects are merged together and the interactions \
    seen in `global_weight` projects are taken into account
    :param same_gene: Say if we consider as co-localised, exons within the \
    same gene (True) or not (False)
    :param inflation: The inflation parameter
    :param cell_line: Interactions are only selected from projects made on \
    a specific cell line (ALL to disable this filter).
    :param feature: The feature we want to analyse ('gene' or 'exon')
    :param clip_file: A bed file containing clip
    :param community_file: A file containing custom communities. If \
    it equals to '' then weight, global weight and same genes parameter are \
    used to find the community files computed with ChIA-PET data. \
    (default '')
    :return: The community file used and the output folder used.
    :raises FileNotFoundError: If the community file does not exist.
    """
    if community_file == "":
        com_file = find_or_create_community(project, weight, global_weight,
                                            same_gene, inflation, cell_line,
                                            feature)
    else:
        com_file = Path(community_file)
    if not com_file.is_file():
        raise FileNotFoundError(f"File {com_file} was not found !")
    # The output sub-folder is named after the clip file name, truncated at
    # its first dot (so 'x.bed' and 'x.bed.gz' share the same folder).
    outname = clip_file.name.split('.')[0]
    output = ConfigClip.output_folder / \
        f"multiTAD_CLIP_community_figures-{feature}-{cell_line}" / outname
    return com_file, output


def create_figure(p: Dict, clip_file: Path,
                  feature_bed: Path,
                  community_file: Tuple[str, str],
                  test_type: str = "permutation",
                  iteration: int = 10000, display_size: bool = False,
                  sl_reg: bool = False) -> Path:
    """
    Create the temporary figure of one clip file for one community file.

    :param p: A dictionary containing parameter used to compute \
    HipMCL communities. The keys read here are 'project', 'weight', \
    'global_weight', 'same_gene', 'inflation', 'cell_line' and 'feature'.
    :param clip_file: A bed file containing clip
    :param feature_bed: A bed files containing exons or genes depending on \
    feature parameter.
    :param community_file: A Tuple containing a file containing custom \
    communities. If it equals to '' then weight, global weight and \
    same genes parameter are used to find the community files computed \
    with ChIA-PET data. The second item of the tuple is its name, used \
    to name the figure ('<name>.tmp.pdf').
    :param test_type: The kind of test to perform for frequency analysis. \
    (default 'permutation') (choose from 'lm', 'permutation')
    :param iteration: The number of iteration to make (default 10000)
    :param display_size: True to display the size of the community. \
    False to display nothing. (default False)
    :param sl_reg: True to display the FaRLine regulation of the \
    same factor, False to not display it. (default False)
    :return: Folder containing the figures
    """
    logging.info("Working on %s - %s - %s", clip_file, community_file[0],
                 community_file[1])
    com_file, output = select_community_file(p["project"], p["weight"],
                                             p["global_weight"],
                                             p["same_gene"], p["inflation"],
                                             p["cell_line"],
                                             p["feature"], clip_file,
                                             community_file[0])
    output.mkdir(exist_ok=True, parents=True)
    outfile = output / f"{community_file[1]}.tmp.pdf"
    final_table = create_table(p["feature"], clip_file, feature_bed, com_file)
    if sl_reg:
        # The factor name passed on is the part of the clip file name
        # located before its first underscore.
        final_table = add_regulation_column(final_table,
                                            clip_file.name.split("_")[0],
                                            p["feature"])
    create_community_fig(final_table, p["feature"], "peak_density", outfile,
                         test_type, iteration=iteration,
                         display_size=display_size)
    return outfile.parent


def merge_figures(folder: Path) -> None:
    """
    Merge the figures together using imageMagick

    The merged figure is written in `folder` and is named after it \
    ('<folder name>.pdf').

    :param folder: A folder containing '*.tmp.pdf' files
    :raises FileNotFoundError: If `folder` contains no '*.tmp.pdf' file.
    :raises subprocess.CalledProcessError: If montage exits with an error.
    """
    fig_name = folder.name
    # The pattern is expanded here and the command is given as an argument
    # list (no shell), so that folders whose path contains spaces or shell
    # metacharacters are handled correctly.
    tmp_figures = sorted(str(fig) for fig in folder.glob("*.tmp.pdf"))
    if not tmp_figures:
        raise FileNotFoundError(
            f"No '*.tmp.pdf' figure to merge was found in {folder} !")
    cmd = ["montage", "-geometry", "+1+1", "-tile", "1X6",
           "-compress", "jpeg", "-density", "100",
           *tmp_figures, str(folder / f"{fig_name}.pdf")]
    sp.check_call(cmd)


@lp.parse(test_type=["permutation", "lm"], feature=["gene", "exon"])
def clip_folder_analysis(clip_folder: str, weight: int,
                         global_weight: int, same_gene: bool = True,
                         project: str = "GSM1018963_GSM1018964",
                         inflation: float = 1.5,
                         cell_line: str = "ALL",
                         feature: str = "exon",
                         test_type: str = "permutation",
                         iteration: int = 10000, display_size: bool = False,
                         sl_reg: bool = False) -> None:
    """
    Create the final figures for every clip file of a folder.

    :param clip_folder: A folder containing clip file
    :param weight: The minimum weight of interaction to consider
    :param global_weight: The global weight to consider. If \
    the global weight is equal to 0 then the density figures are calculated \
    by project, else all projects are merged together and the interactions \
    seen in `global_weight` projects are taken into account
    :param same_gene: Say if we consider as co-localised, exons within the \
    same gene (True) or not (False) (default True)
    :param project: The name of the project of interest \
    (default GSM1018963_GSM1018964)
    :param inflation: The inflation parameter (default 1.5)
    :param cell_line: Interactions are only selected from projects made on \
    a specific cell line (ALL to disable this filter). (default ALL)
    :param feature: The feature we want to analyse (default 'exon')
    :param test_type: The kind of test to perform for frequency analysis. \
    (default 'permutation') (choose from 'lm', 'permutation')
    :param iteration: The number of iteration to make (default 10000)
    :param display_size: True to display the size of the community. \
    False to display nothing. (default False)
    :param sl_reg: True to display the FaRLine regulation of the \
    same factor, False to not display it. (default False)
    """
    logging_def(ConfigGraph.community_folder, __file__, "INFO")
    clip_folder = Path(clip_folder)
    feature_bed = ConfigClip.bed_gene if feature == "gene" \
        else ConfigClip.bed_exon
    # Every clip file of the folder is analysed; sorting only makes the
    # processing order reproducible.
    files = sorted(clip_folder.glob("*.bed")) + \
        sorted(clip_folder.glob("*.bed.gz"))
    if not files:
        raise FileNotFoundError(
            f"No '*.bed' or '*.bed.gz' file was found in {clip_folder} !")

    params = {"project": project, "weight": weight,
              "global_weight": global_weight, "same_gene": same_gene,
              "inflation": inflation, "cell_line": cell_line,
              "feature": feature}

    # One job per (clip file, community file) couple.
    jobs = []
    for clip_file, (com_file, com_name) in product(
            files, zip(ConfigClip.communities, ConfigClip.communities_name)):
        if com_file == "":
            # An empty community file stands for the HipMCL communities:
            # its figure is named after the parameters used to get them.
            com_name = f"HIPMCL_g{global_weight}_w{weight}_{inflation}"
        jobs.append((params, clip_file, feature_bed, (com_file, com_name),
                     test_type, iteration, display_size, sl_reg))
    if not jobs:
        raise ValueError("No community file is defined in ConfigClip, "
                         "nothing to analyse !")

    # Results are collected inside the `with` block because leaving it
    # terminates the pool.
    with mp.Pool(processes=min(len(jobs), ConfigGraph.cpu)) as pool:
        async_results = [pool.apply_async(create_figure, job)
                         for job in jobs]
        list_path = [str(res.get(timeout=None)) for res in async_results]
    # Several jobs write in the same folder: merge each folder only once.
    for my_folder in np.unique(list_path):
        merge_figures(Path(my_folder))


if __name__ == "__main__":
    clip_folder_analysis()