diff --git a/src/find_interaction_cluster/nt_and_community.py b/src/find_interaction_cluster/nt_and_community.py index fba2798d8ead2395475908ec8c9d4498f1b32c82..cb855ff0a50f41845a2abbb4a5cdc65128644965 100644 --- a/src/find_interaction_cluster/nt_and_community.py +++ b/src/find_interaction_cluster/nt_and_community.py @@ -12,7 +12,7 @@ import logging import sqlite3 from .config import ConfigGraph, get_communities from typing import List, Tuple, Dict -from doctest import testmod +import lazyparser as lp from functools import reduce from pathlib import Path from rpy2.robjects import r, pandas2ri @@ -78,37 +78,36 @@ def get_cpnt_frequency(cnx: sqlite3.Connection, list_ft: List[str], return df -def get_community_table(communities: List[List[str]], +def get_community_table(communities: Dict[str, List[str]], size_threshold: int, feature: str) -> pd.DataFrame: """ return the table indicating the name of the exons and the \ the name of the community. - :param communities: List of community of exons + :param communities: Dictionary of community of exons :param size_threshold: The required size a community must \ have to be considered :param feature: The kind of feature analysed :return: table of community - >>> c = [['1_1', '2_5'], ['7_9', '4_19', '3_3']] + >>> c = {"C1": ['1_1', '2_5'], "C2": ['7_9', '4_19', '3_3']} >>> get_community_table(c, 3, 'exon') community id_exon community_size 0 C2 7_9 3 1 C2 4_19 3 2 C2 3_3 3 - >>> c = [['1', '2'], ['7', '49', '3']] + >>> c = {"G1": ['1', '2'], "G2": ['7', '49', '3']} >>> get_community_table(c, 3, 'gene') community id_gene community_size - 0 C2 7 3 - 1 C2 49 3 - 2 C2 3 3 + 0 G2 7 3 + 1 G2 49 3 + 2 G2 3 3 """ dic = {"community": [], f"id_{feature}": [], "community_size": []} - for k, comm in enumerate(communities): + for k, comm in communities.items(): if len(comm) >= size_threshold: - name = f'C{k + 1}' clen = len(comm) for exon in comm: - dic["community"].append(name) + dic["community"].append(k) dic[f'id_{feature}'].append(exon) dic["community_size"].append(clen) return pd.DataFrame(dic) @@ -183,9 +182,8 @@ def create_ctrl_community(df: pd.DataFrame, list_ft = [cid for cid in ft_id if cid not in df[f"id_{feature}"].to_list()] df_nt = get_cpnt_frequency(cnx, list_ft, feature, region, cpnt_type) - df_com = get_community_table([list_ft], size_threshold, feature) + df_com = get_community_table({"C-CTRL": list_ft}, size_threshold, feature) df_nt = df_nt.merge(df_com, how="left", on=f"id_{feature}") - df_nt['community'] = ['C-CTRL'] * df_nt.shape[0] df = pd.concat([df, df_nt], axis=0, ignore_index=True) return df @@ -252,7 +250,7 @@ def prepare_dataframe(df: pd.DataFrame, test_type: str, nt: str, def create_outfiles(project: str, weight: int, global_weight: int, same_gene: bool, feature: str, cpnt_type: str, - cpnt: str, test_type: str + cpnt: str, test_type: str, community_file: str ) -> Tuple[Path, Path]: """ Create a file used to store diagnostics and a file used to store the \ @@ -271,11 +269,22 @@ def create_outfiles(project: str, weight: int, global_weight: int, :param cpnt: The component (nt, aa, dnt) of interest :param feature: The kind of feature analysed :param test_type: The type of test to make (permutation or lm) + :param community_file: A file containing custom communities. If \ + it equals to '' then weight, global weight and same genes parameter are \ + used to find the community files computed with ChIA-PET data. :return: file used to store diagnostics and a file used to store the \ table containing the test communities and the control community """ outfolder = f"community_enrichment/{cpnt_type}_analysis" + if community_file != "": + cn = Path(outfolder).name.replace(".txt", "") + base = ConfigGraph.output_folder / outfolder / f"custom_{cn}" + base.mkdir(exist_ok=True, parents=True) + outfile = base / f"{cpnt}-{cpnt_type}_stat_{test_type}.txt" + outfile_ctrl = base / \ + f"{cpnt}-{cpnt_type}_VS_CTRL_stat_{test_type}.pdf" + return outfile, outfile_ctrl outfile = ConfigGraph.\ get_community_file(project, weight, global_weight, same_gene, feature, f"{cpnt}-{cpnt_type}_stat_{test_type}.txt", @@ -292,7 +301,9 @@ def get_stat_cpnt_communities(df: pd.DataFrame, project: str, weight: int, cpnt_type: str, cpnt: str, dic_com: Dict, feature: str = "exon", test_type: str = "", - iteration: int = 1000) -> pd.Series: + iteration: int = 1000, + display_size: bool = False, + community_file: str = "") -> pd.Series: """ Get data (proportion of `reg` regulated exons by a splicing factor `sf_name` within a community) for every community. @@ -315,23 +326,30 @@ def get_stat_cpnt_communities(df: pd.DataFrame, project: str, weight: int, :param feature: The kind of feature analysed :param test_type: The type of test to make (permutation or lm) :param iteration: The number of sub samples to create + :param display_size: True to display the size of the community. \ + False to display nothing. (default False) + :param community_file: A file containing custom communities. If \ + it equals to '' then weight, global weight and same genes parameter are \ + used to find the community files computed with ChIA-PET data. """ logging.debug(f"{test_type} for {project}, w:{weight}, " f"g:{global_weight} cpnt: {cpnt}({cpnt_type})") outfile, outfile_ctrl = create_outfiles(project, weight, global_weight, same_gene, feature, cpnt_type, - cpnt, test_type) + cpnt, test_type, community_file) res = {"project": project, "cpnt": cpnt, 'pval': lm_maker(df, outfile, cpnt)} create_community_fig(df, feature, cpnt, outfile_ctrl, test_type, - dic_com, cpnt_type, iteration) + dic_com, cpnt_type, iteration, + display_size=display_size) return pd.Series(res) def create_dataframe(project: str, weight: int, global_weight: int, same_gene: bool, feature: str = 'exon', region: str = "", component_type: str = 'nt', - from_communities: bool = True + community_file: str = "", + from_communities: bool = True, ) -> pd.DataFrame: """ :param project: The name of the project of interest @@ -347,17 +365,27 @@ def create_dataframe(project: str, weight: int, global_weight: int, :param region: the region of interest to extract from gene :param component_type: The type of component to analyse; It \ can be 'nt', 'dnt' or 'aa'. + :param community_file: A file containing custom communities. If \ + it equals to '' then weight, global weight and same genes parameter are \ + used to find the community files computed with ChIA-PET data. :return: A dataframe containing the frequency of every nucleotides \ of every exon in a large community """ size_threshold = 10 if feature == "gene" else 50 cnx = sqlite3.connect(ConfigGraph.db_file) if from_communities: - result = ConfigGraph.get_community_file(project, weight, global_weight, - same_gene, feature, - "_communities.txt") - communities = get_communities(result, 0) - ncommunities = [c for c in communities if len(c) >= size_threshold] + if community_file == "": + result = ConfigGraph.get_community_file(project, weight, + global_weight, + same_gene, feature, + ".txt") + else: + result = Path(community_file) + if not result.is_file(): + raise FileNotFoundError(f"File {community_file} not found !") + communities = get_communities(result, feature, 0) + ncommunities = [communities[c] for c in communities + if len(communities[c]) >= size_threshold] list_exon = reduce(lambda x, y: x + y, ncommunities) df_com = get_community_table(communities, size_threshold, feature) @@ -372,7 +400,8 @@ def create_dataframe(project: str, weight: int, global_weight: int, def create_dataframes(project, weight, global_weight, same_gene, feature, - region, test_type, component_type: str + region, test_type, component_type: str, + community_file: str ) -> Tuple[pd.DataFrame, Dict]: """ :param weight: The weight of interaction to consider @@ -387,9 +416,12 @@ def create_dataframes(project, weight, global_weight, same_gene, feature, :param test_type: The type of test to make (permutation or lm) :param component_type: The type of component to analyse; It \ can be 'nt', 'dnt' or 'aa'. + :param community_file: A file containing custom communities. If \ + it equals to '' then weight, global weight and same genes parameter are \ + used to find the community files computed with ChIA-PET data. """ df = create_dataframe(project, weight, global_weight, same_gene, - feature, region, component_type) + feature, region, component_type, community_file) df_ctrl = create_dataframe(project, weight, global_weight, same_gene, feature, region, component_type, from_communities=False) @@ -411,9 +443,11 @@ def multiple_nt_lm_launcher(ps: int, component_type: str = "nt", test_type: str = "lm", iteration: int = 1000, + display_size: bool = False, + community_file: str = "", logging_level: str = "DISABLE"): """ - Launch the statistical analysis for every + Launch the creation of community files for every component type selected. :param ps: The number of processes to use :param weight: The weight of interaction to consider @@ -429,6 +463,11 @@ def multiple_nt_lm_launcher(ps: int, :param region: the region of interest to extract from gene :param test_type: The type of test to make (permutation or lmm) :param iteration: The number of iteration to perform + :param display_size: True to display the size of the community. \ + False to display nothing. (default False) + :param community_file: A file containing custom communities. If \ + it equals to '' then weight, global weight and same genes parameter are \ + used to find the community files computed with ChIA-PET data. :param logging_level: Level of information to display """ ConfigGraph.community_folder.mkdir(exist_ok=True, parents=True) @@ -449,14 +488,15 @@ def multiple_nt_lm_launcher(ps: int, logging.debug("Creating tables") df, dic_com = create_dataframes(project, weight, global_weight, same_gene, feature, region, - test_type, component_type) + test_type, component_type, community_file) for project, weight, cpnt in condition: nfile_table = ConfigGraph.get_community_file( project, weight, global_weight, same_gene, feature, f"_{component_type}_table.txt", "community_enrichment") df.to_csv(nfile_table, sep="\t", index=False) args = [df, project, weight, global_weight, same_gene, component_type, - cpnt, dic_com, feature, test_type, iteration] + cpnt, dic_com, feature, test_type, iteration, display_size, + community_file] processes.append(pool.apply_async(get_stat_cpnt_communities, args)) results = [p.get(timeout=None) for p in processes] pool.close() @@ -473,5 +513,53 @@ def multiple_nt_lm_launcher(ps: int, fdf.to_csv(noutfile, sep="\t", index=False) +@lp.parse(ps=range(mp.cpu_count()), weight=range(1, 11), + global_weight=range(11), + feature=('gene', 'exon'), region=('', 'exon', 'intron', 'gene'), + iteration="iteration > 20", test_type=('lm', 'permutation'), + component_type=("nt", "aa", "dnt")) +def launcher_community_file(ps: int = 1, + weight: int = -1, + global_weight: int = -1, + project: str = "GSM1018963_GSM1018964", + same_gene: bool = True, + feature: str = 'exon', + region: str = '', + component_type: str = "nt", + test_type: str = "lm", + iteration: int = 1000, + display_size: bool = False, + community_file: str = "", + logging_level: str = "DISABLE"): + """ + Launch the creation of community files for every component type selected. + + :param ps: The number of processes to use (default 1) + :param weight: The weight of interaction to consider (default -1) + :param global_weight: The global weighs to consider. if \ + the global weight is equal to 0 then the project `project` is \ + used. (default -1) + :param project: The project name, used only if global_weight = 0 + :param same_gene: Say if we consider as co-localised exon within the \ + same gene (default True) + :param feature: The kind of analysed feature (default exon) + :param component_type: The type of component to analyse; It \ + can be 'nt', 'dnt' or 'aa'. + :param region: the region of interest to extract from gene (default '') + :param test_type: The type of test to make (permutation or lm) (default lm) + :param iteration: The number of iteration to perform (default 1000) + :param display_size: True to display the size of the community. \ + False to display nothing. (default False) + :param community_file: A file containing custom communities. If \ + it equals to '' then weight, global weight and same genes parameter are \ + used to find the community files computed with ChIA-PET data. (default '') + :param logging_level: Level of information to display (default DISABLE) + """ + multiple_nt_lm_launcher(ps, weight, global_weight, project, + same_gene, feature, region, component_type, + test_type, iteration, display_size, community_file, + logging_level) + + if __name__ == "__main__": - testmod() + launcher_community_file()