diff --git a/src/find_interaction_cluster/__main__.py b/src/find_interaction_cluster/__main__.py index 19361fad8859cd421a81b1ed8d46562dabaed614..a834dc78013dd26c0a02e27fc22ba5b556800570 100644 --- a/src/find_interaction_cluster/__main__.py +++ b/src/find_interaction_cluster/__main__.py @@ -27,6 +27,7 @@ from .colocalisation_n_ppi_analysis import coloc_ppi_stat_main def launcher(weight: int = 1, global_weight: int = 0, same_gene: bool = True, + inflation: float = 1.5, ps: int = ConfigGraph.cpu, html_fig: bool = False, feature: str = 'exon', region: str = '', component_type: str = 'nt', @@ -45,6 +46,7 @@ def launcher(weight: int = 1, :param project: The name of the project of interest (default GSM1517081) :param same_gene: Say if we consider as co-localised, exons within the \ same gene (True) or not (False) (default True) + :param inflation: The inflation parameter :param html_fig: True to display the html figure (default False). :param feature: The feature we want to analyse (default 'exon') :param region: The region of a gene to analyse (used only if feature \ @@ -71,17 +73,18 @@ def launcher(weight: int = 1, multiple_community_launcher(weight, global_weight, project, same_gene, html_fig, feature, logging_level) # multiple_stat_launcher(ps, weight, global_weight, project, same_gene, - # feature, logging_level) + # inflation, feature, logging_level) multiple_nt_lm_launcher(ps, weight, global_weight, project, - same_gene, feature, region, component_type, - test_type, iteration, display_size, + same_gene, inflation, feature, region, + component_type, test_type, iteration, display_size, logging_level=logging_level) if feature == "gene": # ppi_stat_launcher(weight, global_weight, project, same_gene, + # inflation, # ConfigGraph.ppi_threshold, iteration, # logging_level) coloc_ppi_stat_main(weight, global_weight, project, same_gene, - iteration, logging_level) + inflation, iteration, logging_level) diff --git a/src/find_interaction_cluster/colocalisation_n_ppi_analysis.py b/src/find_interaction_cluster/colocalisation_n_ppi_analysis.py index 6e7efb6518672e2677f13baf055091c288792567..cd33bb836743019a844cc1b0f5c93d709fe6af72 100644 --- a/src/find_interaction_cluster/colocalisation_n_ppi_analysis.py +++ b/src/find_interaction_cluster/colocalisation_n_ppi_analysis.py @@ -302,7 +302,8 @@ def create_figure(df_full: pd.DataFrame, outfile: Path) -> None: def coloc_ppi_stat_main(weight: int, global_weight: int, - project: str, same_gene: bool, iteration: int = 1000, + project: str, same_gene: bool, inflation: float, + iteration: int = 1000, logging_level: str = "DISABLE"): """ Launch the statistical tests allowing to determine if interaction between \ @@ -327,14 +328,14 @@ def coloc_ppi_stat_main(weight: int, global_weight: int, logging.debug("Calculating stats...") community_file = ConfigGraph.get_community_file(project, weight, global_weight, - same_gene, "gene", - f".txt") + same_gene, inflation, + "gene", f".txt") df_com = pd.read_csv(community_file, sep="\t") df_com = df_com[df_com['nodes'] >= 10].copy() full_df = create_scored_dataframe(df_com, ConfigPPI.fasterdb_ppi) outfile = ConfigGraph.get_community_file(project, weight, global_weight, - same_gene, "gene", + same_gene, inflation, "gene", f"_interation_gene-protein.pdf", "community_gene_vs_protein") create_figure(full_df, outfile) diff --git a/src/find_interaction_cluster/community_calibration.py b/src/find_interaction_cluster/community_calibration.py index 79697e09bb612271d88b0921bf10b5cb48a89cbe..3363835827d77b5b111916a2193a33f44564e113 100644 --- a/src/find_interaction_cluster/community_calibration.py +++ b/src/find_interaction_cluster/community_calibration.py @@ -108,10 +108,10 @@ def community_finder(weight: int, global_weight: int, inflation: float, level=feature) outfileg, result_file = write_interaction_file(interaction, project, weight, global_weight, - same_gene, feature=feature, + same_gene, + inflation, + feature=feature, use_weight=use_weight) - if result_file.is_file(): - result_file.unlink() graph = create_graph(interaction) df, dic_community = find_communities(graph, project, outfileg, result_file, feature, inflation=inflation, diff --git a/src/find_interaction_cluster/community_finder.py b/src/find_interaction_cluster/community_finder.py index 9e5841b908829eeb30f93c75d8dd80202cb23eeb..46e076549c83b91ea602c3bc46ece7ddbb2c9259 100644 --- a/src/find_interaction_cluster/community_finder.py +++ b/src/find_interaction_cluster/community_finder.py @@ -276,7 +276,8 @@ def get_figure_title(project, weight, global_weight, same_gene, feature): def write_interaction_file(arr_interaction: np.array, project: str, weight: int, global_weight: int, same_gene: bool, - use_weight: bool = False, feature: str = 'exon'): + inflation: float, use_weight: bool = False, + feature: str = 'exon'): """ :param arr_interaction: Each couples of co-localized feature within a \ @@ -291,14 +292,15 @@ def write_interaction_file(arr_interaction: np.array, project: str, same gene (True) or not (False) (default False) :param use_weight: Say if we want to write the weight into the result file. :param feature: Says if we want to work at gene or at exons level + :param inflation: The inflation parameter :return: """ logging.debug('Writing interaction files ...') outfile = ConfigGraph.get_community_file(project, weight, global_weight, - same_gene, feature, + same_gene, inflation, feature, f"_interation.txt") result = ConfigGraph.get_community_file(project, weight, global_weight, - same_gene, feature, + same_gene, inflation, feature, f"_communities.txt") with outfile.open('w') as f: for exon1, exon2, cweight in arr_interaction: @@ -310,7 +312,8 @@ def write_interaction_file(arr_interaction: np.array, project: str, def community_finder(weight: int, global_weight: int, project: str = "", same_gene=True, html_fig: bool = False, - feature: str = "exon", logging_level: str = "DISABLE"): + feature: str = "exon", inflation: float = 1.5, + logging_level: str = "DISABLE"): """ Find communities inside co-localisation between exons found in \ a ChIA-PET project. @@ -325,6 +328,7 @@ def community_finder(weight: int, global_weight: int, project: str = "", same gene (True) or not (False) (default False) :param logging_level: The level of data to display (default 'DISABLE') :param html_fig: True to create an html figure, false else + :param inflation: The inflation parameter :param feature: The feature we want to analyse (default 'exon') """ ConfigGraph.output_folder.mkdir(exist_ok=True, parents=True) @@ -335,13 +339,14 @@ def community_finder(weight: int, global_weight: int, project: str = "", level=feature) outfile, result_file = write_interaction_file(interaction, project, weight, global_weight, - same_gene, feature=feature) + same_gene, feature=feature, + inflation=inflation) graph = create_graph(interaction) df, dic_community = find_communities(graph, project, outfile, result_file, feature) logging.debug('Writing results ...') outfiles = [ConfigGraph.get_community_file( - project, weight, global_weight, same_gene, feature, ext) + project, weight, global_weight, same_gene, inflation, feature, ext) for ext in [f'.txt', f'.cyjs', f'.html']] df.to_csv(outfiles[0], sep="\t", index=False) logging.debug("Saving the graph ...") diff --git a/src/find_interaction_cluster/config.py b/src/find_interaction_cluster/config.py index 4263fc5f85bd279dbd06dc52d1925f41d63b706d..3e6c717cc65766ba2d4b90461d4ca98a104ed40b 100644 --- a/src/find_interaction_cluster/config.py +++ b/src/find_interaction_cluster/config.py @@ -13,31 +13,32 @@ from typing import List, Dict import pandas as pd -def get_weight_folder(weight: int, global_weight: int): +def get_weight_folder(weight: int, global_weight: int, inflation: float): """ Get the weight folder. :param weight: The weight of interaction to consider :param global_weight: The global weight to consider. if \ the global weight is equal to 0 then then density figure are calculated \ - by project, else all projet are merge together and the interaction \ + by project, else all project are merge together and the interaction \ seen in `global_weight` project are taken into account + :param inflation: The inflation parameter :return: The folder that will contains the interaction with a weight \ greater or equal to `weigh` in ChIA-PET projects """ if global_weight == 0: weight_folder = ConfigGraph.community_folder / \ - f"project_weight-{weight}" + f"project_weight-{weight}-{inflation}" else: weight_folder = ConfigGraph.community_folder / \ f"weight-{weight}_" \ - f"global_weight-{global_weight}" + f"global_weight-{global_weight}-{inflation}" weight_folder.mkdir(parents=True, exist_ok=True) return weight_folder def get_community_file(project: str, weight: int, global_weight: int, - same_gene: bool, feature: str = 'exon', + same_gene: bool, inflation: float, feature: str = 'exon', ext: str = ".txt", sub_fold: str = ''): """ Get the output file of interest. @@ -50,12 +51,13 @@ def get_community_file(project: str, weight: int, global_weight: int, seen in `global_weight` project are taken into account :param same_gene: Say if we consider as co-localised exon within the \ same gene - :param the kind of feature analyzed + :param inflation: The inflation parameter + :param feature: the kind of feature analyzed :param ext: The file extension - :param subfolder: if filled, then the data are recovered from a subfolder + :param sub_fold: if filled, then the data are recovered from a subfolder :return: The filename of interest """ - folder = get_weight_folder(weight, global_weight) + folder = get_weight_folder(weight, global_weight, inflation) if sub_fold != '': folder = folder / sub_fold folder.mkdir(exist_ok=True, parents=True) diff --git a/src/find_interaction_cluster/create_ppi_files.py b/src/find_interaction_cluster/create_ppi_files.py index 1d401736c97007912f8ec3b84253e5c7613b6fb7..fa159e8984a8688bfdbc8aca4ae79d1621d49127 100644 --- a/src/find_interaction_cluster/create_ppi_files.py +++ b/src/find_interaction_cluster/create_ppi_files.py @@ -8,13 +8,10 @@ Description: create a file of gene interaction at gene and protein levels. from .config import ConfigGraph from .ppi_scripts.config_ppi import ConfigPPI -from typing import List from ..logging_conf import logging_def import numpy as np import logging from .community_finder import get_projects -from itertools import product -import multiprocessing as mp from pathlib import Path from typing import Dict, Union import pandas as pd @@ -68,7 +65,7 @@ def ppi_array(fasterdb_ppi: Path, threshold: int) -> np.array: def write_interaction_ppi(arr_interaction: np.array, project: str, weight: int, global_weight: int, same_gene: bool, - use_weight: bool = False): + inflation: float, use_weight: bool = False): """ :param arr_interaction: Each couples of co-localized feature within a \ @@ -81,15 +78,16 @@ def write_interaction_ppi(arr_interaction: np.array, project: str, seen in `global_weight` project are taken into account :param same_gene: Say if we consider as co-localised, exons within the \ same gene (True) or not (False) (default False) + :param inflation: The inflation parameter :param use_weight: Say if we want to write the weight into the result file. """ logging.debug('Writing interaction files ...') outfile = ConfigGraph.get_community_file(project, weight, global_weight, - same_gene, "gene", + same_gene, inflation, "gene", f"_interation_PPI_tmp.txt", "community_gene_vs_protein") result = ConfigGraph.get_community_file(project, weight, global_weight, - same_gene, "gene", + same_gene, inflation, "gene", f"_communities_PPI_tmp.txt", "community_gene_vs_protein") with outfile.open('w') as f: @@ -102,7 +100,7 @@ def write_interaction_ppi(arr_interaction: np.array, project: str, def ppi_community_finder(fasterdb_ppi: Path, project: str, weight: int, global_weight: int, - same_gene: bool = True, + same_gene: bool = True, inflation: float=1.5, threshold: int = 700): """ Find communities inside protein-protein interaction file @@ -116,18 +114,19 @@ def ppi_community_finder(fasterdb_ppi: Path, project: str, seen in `global_weight` project are taken into account :param same_gene: Say if we consider as co-localised, exons within the \ same gene (True) or not (False) (default False) + :param inflation: The inflation parameter :param threshold: The minimum threshold needed to consider the interaction """ interaction = ppi_array(fasterdb_ppi, threshold) outfile, result_file = write_interaction_ppi(interaction, project, weight, global_weight, - same_gene) + same_gene, inflation) graph = create_graph(interaction) df, dic_community = find_communities(graph, project, outfile, result_file, "gene") logging.debug('Writing results ...') outfiles = [ConfigGraph.get_community_file( - project, weight, global_weight, same_gene, "gene", ext, + project, weight, global_weight, same_gene, inflation, "gene", ext, "community_gene_vs_protein") for ext in [f'_graph_community_PPI.txt', f'_graph_community_PPI.cyjs']] df.to_csv(outfiles[0], sep="\t", index=False) @@ -229,7 +228,7 @@ def filter_most_overllaping_ppi(df: pd.DataFrame, size_threshold: int def create_community_ppi_table(community_file: Path, fasterdb_ppi: Path, project: str, weight: int, global_weight: int, - same_gene: bool = True, + same_gene: bool = True, inflation: float = 1.5, threshold: int = 700): """ @@ -245,17 +244,19 @@ def create_community_ppi_table(community_file: Path, fasterdb_ppi: Path, seen in `global_weight` project are taken into account :param same_gene: Say if we consider as co-localised exon within the \ same gene + :param inflation: The inflation parameter :param threshold: The minimum threshold needed to consider the interaction :return: """ outfile = ConfigGraph.get_community_file(project, weight, global_weight, - same_gene, "gene", + same_gene, inflation, "gene", '_graph_community_PPI.txt', "community_gene_vs_protein") if not outfile.is_file(): logging.debug("Creating the community file for PPI") df_comm_ppi = ppi_community_finder(fasterdb_ppi, project, weight, - global_weight, same_gene, threshold) + global_weight, same_gene, inflation, + threshold) else: df_comm_ppi = pd.read_csv(outfile, sep="\t") logging.debug("Turning ppi community file into a dic") @@ -275,7 +276,7 @@ def create_community_ppi_table(community_file: Path, fasterdb_ppi: Path, def ppi_stats_analysis(community_file: Path, fasterdb_ppi: Path, project: str, weight: int, global_weight: int, - same_gene: bool = True, + same_gene: bool = True, inflation: float = 1.5, threshold: int = 700, iteration: int = 1000): """ :param community_file: A file containing community of gene interacting \ @@ -290,23 +291,25 @@ def ppi_stats_analysis(community_file: Path, fasterdb_ppi: Path, seen in `global_weight` project are taken into account :param same_gene: Say if we consider as co-localised exon within the \ same gene + :param inflation: The inflation parameter :param threshold: The minimum threshold needed to consider the interaction :param iteration: The number of iteration to make :return: """ outfile = ConfigGraph.get_community_file(project, weight, - global_weight, same_gene, "gene", + global_weight, same_gene, + inflation, "gene", f"ppi_gene_complete_table.txt", "community_gene_vs_protein") ppi_outfile = ConfigGraph.get_community_file(project, weight, global_weight, - same_gene, "gene", + same_gene, inflation, "gene", '_graph_community_PPI.txt', "community_gene_vs_protein") if not outfile.is_file(): df = create_community_ppi_table(community_file, fasterdb_ppi, project, weight, global_weight, - same_gene, threshold) + same_gene, inflation, threshold) df.to_csv(outfile, sep="\t", index=False) else: df = pd.read_csv(outfile, sep="\t") @@ -318,7 +321,7 @@ def ppi_stats_analysis(community_file: Path, fasterdb_ppi: Path, df, dic_values = update_overlap_df(df_overlap, dic_dna_gene, ppi_gene, iteration) outstat = ConfigGraph.get_community_file(project, weight, global_weight, - same_gene, "gene", + same_gene, inflation, "gene", f"ppi_gene_table_{iteration}_" f"stat.txt", "community_gene_vs_protein") @@ -327,7 +330,7 @@ def ppi_stats_analysis(community_file: Path, fasterdb_ppi: Path, iteration, dic_values, use_seed=False) outstat = ConfigGraph.get_community_file(project, weight, global_weight, - same_gene, "gene", + same_gene, inflation, "gene", f"ppi_gene_table_{iteration}_" f"stat_recap.txt", "community_gene_vs_protein") @@ -337,7 +340,8 @@ def ppi_stats_analysis(community_file: Path, fasterdb_ppi: Path, def ppi_stat_launcher(weight: int, global_weight: int, project: str, - same_gene: bool, threshold: int = 700, + same_gene: bool, inflation: float, + threshold: int = 700, iteration: int = 1000, logging_level: str = "DISABLE"): """ @@ -351,6 +355,7 @@ def ppi_stat_launcher(weight: int, :param project: The project name, used only if global_weight = 0 :param same_gene: Say if we consider as co-localised exon within the \ same gene + :param inflation: The inflation parameter :param threshold: The minimum threshold needed to consider the interaction :param iteration: The number of iteration to make :param logging_level: Level of information to display @@ -363,10 +368,11 @@ def ppi_stat_launcher(weight: int, logging.debug("Calculating stats...") community_file = ConfigGraph.get_community_file(project, weight, global_weight, - same_gene, "gene", - f".txt") + same_gene, inflation, + "gene", f".txt") ppi_stats_analysis(community_file, ConfigPPI.fasterdb_ppi, project, - weight, global_weight, same_gene, threshold, iteration) + weight, global_weight, same_gene, inflation, threshold, + iteration) if __name__ == "__main__": diff --git a/src/find_interaction_cluster/nt_and_community.py b/src/find_interaction_cluster/nt_and_community.py index c38f4f878d8fb7db8d6502908d0935771e3d196e..d1d591ed527c40e3dac73f386d52e1c17a2a3141 100644 --- a/src/find_interaction_cluster/nt_and_community.py +++ b/src/find_interaction_cluster/nt_and_community.py @@ -249,9 +249,9 @@ def prepare_dataframe(df: pd.DataFrame, test_type: str, nt: str, def create_outfiles(project: str, weight: int, global_weight: int, - same_gene: bool, feature: str, cpnt_type: str, - cpnt: str, test_type: str, community_file: str - ) -> Tuple[Path, Path]: + same_gene: bool, inflation: float, feature: str, + cpnt_type: str, cpnt: str, test_type: str, + community_file: str) -> Tuple[Path, Path]: """ Create a file used to store diagnostics and a file used to store the \ table containing the test communities and the control community @@ -268,6 +268,7 @@ def create_outfiles(project: str, weight: int, global_weight: int, can be 'nt', 'dnt' or 'aa'. :param cpnt: The component (nt, aa, dnt) of interest :param feature: The kind of feature analysed + :param inflation: The inflation parameter :param test_type: The type of test to make (permutation or lm) :param community_file: A file containing custom communities. If \ it equals to '' then weight, global weight and same genes parameter are \ @@ -286,11 +287,13 @@ def create_outfiles(project: str, weight: int, global_weight: int, f"{cpnt}-{cpnt_type}_VS_CTRL_stat_{test_type}.pdf" return outfile, outfile_ctrl outfile = ConfigGraph.\ - get_community_file(project, weight, global_weight, same_gene, feature, + get_community_file(project, weight, global_weight, same_gene, + inflation, feature, f"{cpnt}-{cpnt_type}_stat_{test_type}.txt", outfolder) outfile_ctrl = ConfigGraph.\ - get_community_file(project, weight, global_weight, same_gene, feature, + get_community_file(project, weight, global_weight, same_gene, + inflation, feature, f"{cpnt}-{cpnt_type}_VS_CTRL_stat_{test_type}.pdf", outfolder) return outfile, outfile_ctrl @@ -298,6 +301,7 @@ def create_outfiles(project: str, weight: int, global_weight: int, def get_stat_cpnt_communities(df: pd.DataFrame, project: str, weight: int, global_weight: int, same_gene: bool, + inflation: float, cpnt_type: str, cpnt: str, dic_com: Dict, feature: str = "exon", test_type: str = "", @@ -318,6 +322,7 @@ def get_stat_cpnt_communities(df: pd.DataFrame, project: str, weight: int, seen in `global_weight` project are taken into account :param same_gene: Say if we consider as co-localised, exons within the \ same gene (True) or not (False) (default False) + :param inflation: The inflation parameter :param cpnt_type: The type of component to analyse; It \ can be 'nt', 'dnt' or 'aa'. :param cpnt: The component (nt, aa, dnt) of interest @@ -335,7 +340,8 @@ def get_stat_cpnt_communities(df: pd.DataFrame, project: str, weight: int, logging.debug(f"{test_type} for {project}, w:{weight}, " f"g:{global_weight} cpnt: {cpnt}({cpnt_type})") outfile, outfile_ctrl = create_outfiles(project, weight, global_weight, - same_gene, feature, cpnt_type, + same_gene, inflation, feature, + cpnt_type, cpnt, test_type, community_file) res = {"project": project, "cpnt": cpnt, 'pval': lm_maker(df, outfile, cpnt)} @@ -346,7 +352,7 @@ def get_stat_cpnt_communities(df: pd.DataFrame, project: str, weight: int, def create_dataframe(project: str, weight: int, global_weight: int, - same_gene: bool, feature: str = 'exon', + same_gene: bool, inflation: float, feature: str = 'exon', region: str = "", component_type: str = 'nt', community_file: str = "", from_communities: bool = True, @@ -360,6 +366,7 @@ def create_dataframe(project: str, weight: int, global_weight: int, seen in `global_weight` project are taken into account :param same_gene: Say if we consider as co-localised, exons within the \ same gene (True) or not (False) + :param inflation: The inflation parameter :param feature: The kind of feature to analyse :param from_communities: True if we only select gene/exons :param region: the region of interest to extract from gene @@ -377,8 +384,8 @@ def create_dataframe(project: str, weight: int, global_weight: int, if community_file == "": result = ConfigGraph.get_community_file(project, weight, global_weight, - same_gene, feature, - ".txt") + same_gene, inflation, + feature, ".txt") else: result = Path(community_file) if not result.is_file(): @@ -399,7 +406,8 @@ def create_dataframe(project: str, weight: int, global_weight: int, return df -def create_dataframes(project, weight, global_weight, same_gene, feature, +def create_dataframes(project, weight, global_weight, same_gene, inflation, + feature, region, test_type, component_type: str, community_file: str ) -> Tuple[pd.DataFrame, Dict]: @@ -411,6 +419,7 @@ def create_dataframes(project, weight, global_weight, same_gene, feature, :param project: The project name, used only if global_weight = 0 :param same_gene: Say if we consider as co-localised exon within the \ same gene + :param inflation: The inflation parameter :param feature: The kind of analysed feature :param region: the region of interest to extract from gene :param test_type: The type of test to make (permutation or lm) @@ -420,10 +429,10 @@ def create_dataframes(project, weight, global_weight, same_gene, feature, it equals to '' then weight, global weight and same genes parameter are \ used to find the community files computed with ChIA-PET data. """ - df = create_dataframe(project, weight, global_weight, same_gene, + df = create_dataframe(project, weight, global_weight, same_gene, inflation, feature, region, component_type, community_file) df_ctrl = create_dataframe(project, weight, global_weight, same_gene, - feature, region, component_type, + inflation, feature, region, component_type, from_communities=False) df_ctrl = df_ctrl.loc[-df_ctrl[f"id_{feature}"].isin(df[f"id_{feature}"]), :].copy() @@ -438,6 +447,7 @@ def multiple_nt_lm_launcher(ps: int, global_weight: int, project: str, same_gene: bool, + inflation: float, feature: str = 'exon', region: str = '', component_type: str = "nt", @@ -457,6 +467,7 @@ def multiple_nt_lm_launcher(ps: int, :param project: The project name, used only if global_weight = 0 :param same_gene: Say if we consider as co-localised exon within the \ same gene + :param inflation: The inflation parameter :param feature: The kind of analysed feature :param component_type: The type of component to analyse; It \ can be 'nt', 'dnt' or 'aa'. @@ -487,16 +498,16 @@ def multiple_nt_lm_launcher(ps: int, pool = mp.Pool(processes=min(ps, len(condition))) logging.debug("Creating tables") df, dic_com = create_dataframes(project, weight, global_weight, - same_gene, feature, region, + same_gene, inflation, feature, region, test_type, component_type, community_file) for project, weight, cpnt in condition: nfile_table = ConfigGraph.get_community_file( - project, weight, global_weight, same_gene, feature, + project, weight, global_weight, same_gene, inflation, feature, f"_{component_type}_table.txt", "community_enrichment") df.to_csv(nfile_table, sep="\t", index=False) - args = [df, project, weight, global_weight, same_gene, component_type, - cpnt, dic_com, feature, test_type, iteration, display_size, - community_file] + args = [df, project, weight, global_weight, same_gene, inflation, + component_type, cpnt, dic_com, feature, test_type, iteration, + display_size, community_file] processes.append(pool.apply_async(get_stat_cpnt_communities, args)) results = [p.get(timeout=None) for p in processes] pool.close() @@ -505,7 +516,7 @@ def multiple_nt_lm_launcher(ps: int, fdf["padj"] = multipletests(fdf['pval'].values, method='fdr_bh')[1] outfile = ConfigGraph.get_community_file(project, weight, global_weight, - same_gene, feature, + same_gene, inflation, feature, f"lmm-{component_type}_stat.txt", "community_enrichment") nfolder = outfile.parent / f"{component_type}_analysis" @@ -523,6 +534,7 @@ def launcher_community_file(ps: int = 1, global_weight: int = -1, project: str = "GSM1018963_GSM1018964", same_gene: bool = True, + inflation: float = 1.5, feature: str = 'exon', region: str = '', component_type: str = "nt", @@ -556,9 +568,9 @@ def launcher_community_file(ps: int = 1, :param logging_level: Level of information to display (default DISABLE) """ multiple_nt_lm_launcher(ps, weight, global_weight, project, - same_gene, feature, region, component_type, - test_type, iteration, display_size, community_file, - logging_level) + same_gene, inflation, feature, region, + component_type, test_type, iteration, display_size, + community_file, logging_level) if __name__ == "__main__": diff --git a/src/find_interaction_cluster/sf_and_communities.py b/src/find_interaction_cluster/sf_and_communities.py index 41728da8cfec21ee69cbe30eb1261f14dfc1a87a..0449bbf88e70be9456a96cb6870f597460e562d4 100644 --- a/src/find_interaction_cluster/sf_and_communities.py +++ b/src/find_interaction_cluster/sf_and_communities.py @@ -180,7 +180,7 @@ def glmm_maker(expanded_df: pd.DataFrame, outfile: Path) -> float: ... "community_size": [5, 7], ... "%reg in community": [40, 42.85], 'pval': [1, 0.5], 'padj': [1, 1]}) >>> e_df = expand_dataframe(d) - >>> outfile = ConfigGraph.get_community_file("Test", 1, 1, True, + >>> outfile = ConfigGraph.get_community_file("Test", 1, 1, True, 1.5, ... "_stat.txt", "sf_community_enrichment") >>> glmm_maker(e_df, outfile) 1.0 @@ -210,7 +210,8 @@ def glmm_maker(expanded_df: pd.DataFrame, outfile: Path) -> float: def glmm_statistics(df: pd.DataFrame, sf_name: str, reg: str, project: str, weight: int, global_weight: int, - same_gene: bool, feature: str = "exon") -> pd.Series: + same_gene: bool, inflation: float, + feature: str = "exon") -> pd.Series: """ Create the glmm statistics for a given splicing factor with \ given communities. @@ -227,13 +228,14 @@ def glmm_statistics(df: pd.DataFrame, sf_name: str, reg: str, :param same_gene: Say if we consider as co-localised, exons within the \ same gene (True) or not (False) (default False) :param feature: The kind of feature analysed + :param inflation: the inflation parameter :return: The glmm pvalue among with other informations """ ndf = df.loc[-df['community'].isin(["All-community", "FASTERDB"]), :].copy() expanded_df = expand_dataframe(ndf) outfile = ConfigGraph.get_community_file(project, weight, global_weight, - same_gene, feature, + same_gene, inflation, feature, f"{sf_name}_{reg}_stat.txt", "sf_community_enrichment") noutfold = outfile.parent / "expanded_df" @@ -274,6 +276,7 @@ def adapt_regulated_list(cnx: sqlite3.Connection, def get_stat4communities(sf_name: str, reg: str, project: str, weight: int, global_weight: int, same_gene: bool, + inflation: float, feature: str = 'exon', ) -> Tuple[pd.DataFrame, pd.Series]: """ @@ -290,13 +293,14 @@ def get_stat4communities(sf_name: str, reg: str, seen in `global_weight` project are taken into account :param same_gene: Say if we consider as co-localised, exons within the \ same gene (True) or not (False) (default False) + :param inflation: The inflation parameter :param feature: The kind of analysed feature """ logging.debug(f"Working on {sf_name}-{reg}, for {project}, w:{weight}, " f"g:{global_weight}") cnx = sqlite3.connect(ConfigGraph.db_file) result = ConfigGraph.get_community_file(project, weight, global_weight, - same_gene, feature, + same_gene, inflation, feature, ".txt") communities = get_communities_basefile(result, 0) regulated_dic, number = get_every_events_4_a_sl(cnx, sf_name, reg) @@ -328,7 +332,7 @@ def get_stat4communities(sf_name: str, reg: str, d['project'] = [project] * len(d["community"]) df = pd.DataFrame(d) s = glmm_statistics(df, sf_name, reg, project, weight, global_weight, - same_gene, feature) + same_gene, inflation, feature) return df, s @@ -365,7 +369,8 @@ def multiple_stat_launcher(ps: int, weight: int, global_weight: int, project: str, - same_gene: bool, feature: str = 'exon', + same_gene: bool, inflation: float, + feature: str = 'exon', logging_level: str = "DISABLE"): """ Launch the statistical analysis for every @@ -379,6 +384,7 @@ def multiple_stat_launcher(ps: int, :param same_gene: Say if we consider as co-localised exon within the \ same gene :param feature: The feature we want to analyse + :param inflation: the inflation parameter :param logging_level: Level of information to display """ ConfigGraph.community_folder.mkdir(exist_ok=True, parents=True) @@ -394,7 +400,7 @@ def multiple_stat_launcher(ps: int, for project, weight, sf_name, reg in condition: ckey = get_key(project, weight) args = [sf_name, reg, project, weight, global_weight, same_gene, - feature] + inflation, feature] if ckey in processes: processes[ckey].append(pool.apply_async(get_stat4communities, args)) else: @@ -409,7 +415,7 @@ def multiple_stat_launcher(ps: int, df = pd.concat(list_df, axis=0, ignore_index=True) outfile = ConfigGraph.get_community_file(project, weight, global_weight, - same_gene, feature, + same_gene, inflation, feature, "_stat.txt", "sf_community_enrichment") df.to_csv(outfile, sep="\t", index=False) @@ -418,7 +424,7 @@ def multiple_stat_launcher(ps: int, method='fdr_bh')[1] outfile = ConfigGraph.get_community_file(project, weight, global_weight, - same_gene, feature, + same_gene, inflation, feature, "_glmm_stat.txt", "sf_community_enrichment") glm_df.to_csv(outfile, sep="\t", index=False)