diff --git a/src/find_interaction_cluster/__main__.py b/src/find_interaction_cluster/__main__.py index 89ebea84808a1910969d84ec5a0c1057c538a258..f43a18e9d31c8b13631a7e0b78fdbbde93308d67 100644 --- a/src/find_interaction_cluster/__main__.py +++ b/src/find_interaction_cluster/__main__.py @@ -28,6 +28,7 @@ def launcher(weight: int = 1, global_weight: int = 0, same_gene: bool = True, inflation: float = 1.5, + cell_line: str = 'ALL', use_weight: bool = False, ps: int = ConfigGraph.cpu, html_fig: bool = False, feature: str = 'exon', region: str = '', @@ -48,6 +49,8 @@ def launcher(weight: int = 1, :param same_gene: Say if we consider as co-localised, exons within the \ same gene (True) or not (False) (default True) :param inflation: The inflation parameter + :param cell_line: Interactions are only selected from projects made on \ + a specific cell line (ALL to disable this filter). (default ALL) :param use_weight: Say if we want to write the weight into the result file. :param html_fig: True to display the html figure (default False). :param feature: The feature we want to analyse (default 'exon') @@ -74,21 +77,21 @@ def launcher(weight: int = 1, install_hipmcl("INFO") multiple_community_launcher(weight, global_weight, project, same_gene, - inflation, use_weight, + inflation, cell_line, use_weight, html_fig, feature, logging_level) # multiple_stat_launcher(ps, weight, global_weight, project, same_gene, - # inflation, feature, logging_level) + # inflation, cell_line, feature, logging_level) multiple_nt_lm_launcher(ps, weight, global_weight, project, - same_gene, inflation, feature, region, + same_gene, inflation, cell_line, feature, region, component_type, test_type, iteration, display_size, logging_level=logging_level) # if feature == "gene": # # ppi_stat_launcher(weight, global_weight, project, same_gene, - # # inflation, + # # inflation, cell_line, # # ConfigGraph.ppi_threshold, iteration, # # logging_level) # coloc_ppi_stat_main(weight, global_weight, project, same_gene, - # inflation, iteration, logging_level) + # inflation, cell_line, iteration, logging_level) diff --git a/src/find_interaction_cluster/colocalisation_n_ppi_analysis.py b/src/find_interaction_cluster/colocalisation_n_ppi_analysis.py index cd33bb836743019a844cc1b0f5c93d709fe6af72..18fe454254a7c9771f96ebf97007c0f316690c09 100644 --- a/src/find_interaction_cluster/colocalisation_n_ppi_analysis.py +++ b/src/find_interaction_cluster/colocalisation_n_ppi_analysis.py @@ -303,7 +303,7 @@ def create_figure(df_full: pd.DataFrame, outfile: Path) -> None: def coloc_ppi_stat_main(weight: int, global_weight: int, project: str, same_gene: bool, inflation: float, - iteration: int = 1000, + cell_line: str = "ALL", iteration: int = 1000, logging_level: str = "DISABLE"): """ Launch the statistical tests allowing to determine if interaction between \ @@ -318,6 +318,8 @@ def coloc_ppi_stat_main(weight: int, global_weight: int, same gene :param threshold: The minimum threshold needed to consider the interaction :param iteration: The number of iteration to make + :param cell_line: Interactions are only selected from projects made on \ + a specific cell line (ALL to disable this filter). :param logging_level: Level of information to display """ ConfigGraph.community_folder.mkdir(exist_ok=True, parents=True) @@ -329,13 +331,15 @@ def coloc_ppi_stat_main(weight: int, global_weight: int, community_file = ConfigGraph.get_community_file(project, weight, global_weight, same_gene, inflation, + cell_line, "gene", f".txt") df_com = pd.read_csv(community_file, sep="\t") df_com = df_com[df_com['nodes'] >= 10].copy() full_df = create_scored_dataframe(df_com, ConfigPPI.fasterdb_ppi) outfile = ConfigGraph.get_community_file(project, weight, global_weight, - same_gene, inflation, "gene", + same_gene, inflation, cell_line, + "gene", f"_interation_gene-protein.pdf", "community_gene_vs_protein") create_figure(full_df, outfile) diff --git a/src/find_interaction_cluster/community_finder.py b/src/find_interaction_cluster/community_finder.py index 9ff5afe0678c196e8658e72255ab03234bcd8296..c5c3a770bc93429fb5a762eff480a9971115b016 100644 --- a/src/find_interaction_cluster/community_finder.py +++ b/src/find_interaction_cluster/community_finder.py @@ -276,7 +276,8 @@ def get_figure_title(project, weight, global_weight, same_gene, feature): def write_interaction_file(arr_interaction: np.array, project: str, weight: int, global_weight: int, same_gene: bool, - inflation: float, use_weight: bool = False, + inflation: float, cell_line: str = "ALL", + use_weight: bool = False, feature: str = 'exon'): """ @@ -293,14 +294,18 @@ def write_interaction_file(arr_interaction: np.array, project: str, :param use_weight: Say if we want to write the weight into the result file. :param feature: Says if we want to work at gene or at exons level :param inflation: The inflation parameter + :param cell_line: Interactions are only selected from projects made on \ + a specific cell line (ALL to disable this filter). :return: """ logging.debug('Writing interaction files ...') outfile = ConfigGraph.get_community_file(project, weight, global_weight, - same_gene, inflation, feature, + same_gene, inflation, cell_line, + feature, f"_interation.txt") result = ConfigGraph.get_community_file(project, weight, global_weight, - same_gene, inflation, feature, + same_gene, inflation, cell_line, + feature, f"_communities.txt") with outfile.open('w') as f: for exon1, exon2, cweight in arr_interaction: @@ -313,7 +318,7 @@ def write_interaction_file(arr_interaction: np.array, project: str, def community_finder(weight: int, global_weight: int, project: str = "", same_gene=True, html_fig: bool = False, feature: str = "exon", inflation: float = 1.5, - use_weight: bool = True, + cell_line: str = "ALL", use_weight: bool = True, logging_level: str = "DISABLE"): """ Find communities inside co-localisation between exons found in \ @@ -332,24 +337,28 @@ def community_finder(weight: int, global_weight: int, project: str = "", :param inflation: The inflation parameter :param use_weight: Say if we want to write the weight into the result file :param feature: The feature we want to analyse (default 'exon') + :param cell_line: Interactions are only selected from projects made on \ + a specific cell line (ALL to disable this filter). """ ConfigGraph.output_folder.mkdir(exist_ok=True, parents=True) logging_def(ConfigGraph.output_folder, __file__, logging_level) cnx = sqlite3.connect(ConfigGraph.db_file) interaction = get_project_colocalisation(cnx, project, weight, global_weight, same_gene, True, - level=feature) + level=feature, cell=cell_line) outfile, result_file = write_interaction_file(interaction, project, weight, global_weight, same_gene, feature=feature, inflation=inflation, - use_weight=use_weight) + use_weight=use_weight, + cell_line=cell_line) graph = create_graph(interaction) df, dic_community = find_communities(graph, project, outfile, result_file, feature, inflation) logging.debug('Writing results ...') outfiles = [ConfigGraph.get_community_file( - project, weight, global_weight, same_gene, inflation, feature, ext) + project, weight, global_weight, same_gene, inflation, cell_line, + feature, ext) for ext in [f'.txt', f'.cyjs', f'.html']] df.to_csv(outfiles[0], sep="\t", index=False) logging.debug("Saving the graph ...") @@ -383,6 +392,7 @@ def multiple_community_launcher(weight: int, project: str, same_gene: bool, inflation: float = 1.5, + cell_line: str = "ALL", use_weight: bool = False, html_fig: bool = False, feature: str = 'exon', @@ -395,6 +405,8 @@ def multiple_community_launcher(weight: int, :param same_gene: Say if we consider as co-localised exon within the \ same gene :param inflation: The inflation parameter + :param cell_line: Interactions are only selected from projects made on \ + a specific cell line (ALL to disable this filter). :param use_weight: Say if we want to write the weight into the result file :param html_fig: True to create an html figure, false else :param feature: The feature we want to analyse (default 'exon') @@ -407,4 +419,4 @@ def multiple_community_launcher(weight: int, logging.info(f'Finding community for project : {project}, ' f'global_weight : {global_weight}, weight: {weight}') community_finder(weight, global_weight, project, same_gene, html_fig, - feature, inflation, use_weight) + feature, inflation, cell_line, use_weight) diff --git a/src/find_interaction_cluster/config.py b/src/find_interaction_cluster/config.py index 3e6c717cc65766ba2d4b90461d4ca98a104ed40b..7b72748eb19c408737845e9a9ee51544794cfb17 100644 --- a/src/find_interaction_cluster/config.py +++ b/src/find_interaction_cluster/config.py @@ -13,7 +13,8 @@ from typing import List, Dict import pandas as pd -def get_weight_folder(weight: int, global_weight: int, inflation: float): +def get_weight_folder(weight: int, global_weight: int, inflation: float, + cell_line: str): """ Get the weight folder. @@ -23,22 +24,26 @@ def get_weight_folder(weight: int, global_weight: int, inflation: float): by project, else all project are merge together and the interaction \ seen in `global_weight` project are taken into account :param inflation: The inflation parameter + :param cell_line: Interactions are only selected from projects made on \ + a specific cell line (ALL to disable this filter). :return: The folder that will contains the interaction with a weight \ greater or equal to `weigh` in ChIA-PET projects """ + cell_line = "" if cell_line == "ALL" else f"-{cell_line}" if global_weight == 0: weight_folder = ConfigGraph.community_folder / \ - f"project_weight-{weight}-{inflation}" + f"project_weight-{weight}-{inflation}{cell_line}" else: weight_folder = ConfigGraph.community_folder / \ f"weight-{weight}_" \ - f"global_weight-{global_weight}-{inflation}" + f"global_weight-{global_weight}-{inflation}{cell_line}" weight_folder.mkdir(parents=True, exist_ok=True) return weight_folder def get_community_file(project: str, weight: int, global_weight: int, - same_gene: bool, inflation: float, feature: str = 'exon', + same_gene: bool, inflation: float, + cell_line: str = "ALL", feature: str = 'exon', ext: str = ".txt", sub_fold: str = ''): """ Get the output file of interest. @@ -52,12 +57,14 @@ def get_community_file(project: str, weight: int, global_weight: int, :param same_gene: Say if we consider as co-localised exon within the \ same gene :param inflation: The inflation parameter + :param cell_line: Interactions are only selected from projects made on \ + a specific cell line (ALL to disable this filter). :param feature: the kind of feature analyzed :param ext: The file extension :param sub_fold: if filled, then the data are recovered from a subfolder :return: The filename of interest """ - folder = get_weight_folder(weight, global_weight, inflation) + folder = get_weight_folder(weight, global_weight, inflation, cell_line) if sub_fold != '': folder = folder / sub_fold folder.mkdir(exist_ok=True, parents=True) diff --git a/src/find_interaction_cluster/create_ppi_files.py b/src/find_interaction_cluster/create_ppi_files.py index fa159e8984a8688bfdbc8aca4ae79d1621d49127..a990afa4978203cfca869e6484d86b80c958f09a 100644 --- a/src/find_interaction_cluster/create_ppi_files.py +++ b/src/find_interaction_cluster/create_ppi_files.py @@ -19,7 +19,7 @@ from .community_finder import create_graph, find_communities, \ write_cytoscape_graph from doctest import testmod from .radomization_test_ppi import get_ppi_community_gene, \ - get_dna_community_gene, update_overlap_df,summary_randomisation_test + get_dna_community_gene, update_overlap_df, summary_randomisation_test def get_community_dic_fom_community_file(mfile: Union[Path, pd.DataFrame], @@ -65,7 +65,8 @@ def ppi_array(fasterdb_ppi: Path, threshold: int) -> np.array: def write_interaction_ppi(arr_interaction: np.array, project: str, weight: int, global_weight: int, same_gene: bool, - inflation: float, use_weight: bool = False): + inflation: float, cell_line: str = "ALL", + use_weight: bool = False): """ :param arr_interaction: Each couples of co-localized feature within a \ @@ -79,15 +80,19 @@ def write_interaction_ppi(arr_interaction: np.array, project: str, :param same_gene: Say if we consider as co-localised, exons within the \ same gene (True) or not (False) (default False) :param inflation: The inflation parameter + :param cell_line: Interactions are only selected from projects made on \ + a specific cell line (ALL to disable this filter). (default ALL) :param use_weight: Say if we want to write the weight into the result file. """ logging.debug('Writing interaction files ...') outfile = ConfigGraph.get_community_file(project, weight, global_weight, - same_gene, inflation, "gene", + same_gene, inflation, cell_line, + "gene", f"_interation_PPI_tmp.txt", "community_gene_vs_protein") result = ConfigGraph.get_community_file(project, weight, global_weight, - same_gene, inflation, "gene", + same_gene, inflation, cell_line, + "gene", f"_communities_PPI_tmp.txt", "community_gene_vs_protein") with outfile.open('w') as f: @@ -100,7 +105,8 @@ def write_interaction_ppi(arr_interaction: np.array, project: str, def ppi_community_finder(fasterdb_ppi: Path, project: str, weight: int, global_weight: int, - same_gene: bool = True, inflation: float=1.5, + same_gene: bool = True, inflation: float = 1.5, + cell_line: str = "ALL", threshold: int = 700): """ Find communities inside protein-protein interaction file @@ -115,19 +121,22 @@ def ppi_community_finder(fasterdb_ppi: Path, project: str, :param same_gene: Say if we consider as co-localised, exons within the \ same gene (True) or not (False) (default False) :param inflation: The inflation parameter + :param cell_line: Interactions are only selected from projects made on \ + a specific cell line (ALL to disable this filter). (default ALL) :param threshold: The minimum threshold needed to consider the interaction """ interaction = ppi_array(fasterdb_ppi, threshold) outfile, result_file = write_interaction_ppi(interaction, project, weight, global_weight, - same_gene, inflation) + same_gene, inflation, + cell_line) graph = create_graph(interaction) df, dic_community = find_communities(graph, project, outfile, result_file, "gene") logging.debug('Writing results ...') outfiles = [ConfigGraph.get_community_file( - project, weight, global_weight, same_gene, inflation, "gene", ext, - "community_gene_vs_protein") + project, weight, global_weight, same_gene, inflation, cell_line, + "gene", ext, "community_gene_vs_protein") for ext in [f'_graph_community_PPI.txt', f'_graph_community_PPI.cyjs']] df.to_csv(outfiles[0], sep="\t", index=False) logging.debug("Saving the graph ...") @@ -229,7 +238,7 @@ def create_community_ppi_table(community_file: Path, fasterdb_ppi: Path, project: str, weight: int, global_weight: int, same_gene: bool = True, inflation: float = 1.5, - threshold: int = 700): + cell_line: str = "ALL", threshold: int = 700): """ :param community_file: A file containing community of gene interacting \ @@ -245,18 +254,21 @@ def create_community_ppi_table(community_file: Path, fasterdb_ppi: Path, :param same_gene: Say if we consider as co-localised exon within the \ same gene :param inflation: The inflation parameter + :param cell_line: Interactions are only selected from projects made on \ + a specific cell line (ALL to disable this filter). (default ALL) :param threshold: The minimum threshold needed to consider the interaction :return: """ outfile = ConfigGraph.get_community_file(project, weight, global_weight, - same_gene, inflation, "gene", + same_gene, inflation, cell_line, + "gene", '_graph_community_PPI.txt', "community_gene_vs_protein") if not outfile.is_file(): logging.debug("Creating the community file for PPI") df_comm_ppi = ppi_community_finder(fasterdb_ppi, project, weight, global_weight, same_gene, inflation, - threshold) + cell_line, threshold) else: df_comm_ppi = pd.read_csv(outfile, sep="\t") logging.debug("Turning ppi community file into a dic") @@ -277,6 +289,7 @@ def ppi_stats_analysis(community_file: Path, fasterdb_ppi: Path, project: str, weight: int, global_weight: int, same_gene: bool = True, inflation: float = 1.5, + cell_line: str = "ALL", threshold: int = 700, iteration: int = 1000): """ :param community_file: A file containing community of gene interacting \ @@ -292,24 +305,28 @@ def ppi_stats_analysis(community_file: Path, fasterdb_ppi: Path, :param same_gene: Say if we consider as co-localised exon within the \ same gene :param inflation: The inflation parameter + :param cell_line: Interactions are only selected from projects made on \ + a specific cell line (ALL to disable this filter). (default ALL) :param threshold: The minimum threshold needed to consider the interaction :param iteration: The number of iteration to make :return: """ outfile = ConfigGraph.get_community_file(project, weight, global_weight, same_gene, - inflation, "gene", + inflation, cell_line, "gene", f"ppi_gene_complete_table.txt", "community_gene_vs_protein") ppi_outfile = ConfigGraph.get_community_file(project, weight, global_weight, - same_gene, inflation, "gene", + same_gene, inflation, + cell_line, "gene", '_graph_community_PPI.txt', "community_gene_vs_protein") if not outfile.is_file(): df = create_community_ppi_table(community_file, fasterdb_ppi, project, weight, global_weight, - same_gene, inflation, threshold) + same_gene, inflation, cell_line, + threshold) df.to_csv(outfile, sep="\t", index=False) else: df = pd.read_csv(outfile, sep="\t") @@ -321,7 +338,8 @@ def ppi_stats_analysis(community_file: Path, fasterdb_ppi: Path, df, dic_values = update_overlap_df(df_overlap, dic_dna_gene, ppi_gene, iteration) outstat = ConfigGraph.get_community_file(project, weight, global_weight, - same_gene, inflation, "gene", + same_gene, inflation, cell_line, + "gene", f"ppi_gene_table_{iteration}_" f"stat.txt", "community_gene_vs_protein") @@ -330,7 +348,8 @@ def ppi_stats_analysis(community_file: Path, fasterdb_ppi: Path, iteration, dic_values, use_seed=False) outstat = ConfigGraph.get_community_file(project, weight, global_weight, - same_gene, inflation, "gene", + same_gene, inflation, cell_line, + "gene", f"ppi_gene_table_{iteration}_" f"stat_recap.txt", "community_gene_vs_protein") @@ -341,6 +360,7 @@ def ppi_stat_launcher(weight: int, global_weight: int, project: str, same_gene: bool, inflation: float, + cell_line: str= "ALL", threshold: int = 700, iteration: int = 1000, logging_level: str = "DISABLE"): @@ -356,6 +376,8 @@ def ppi_stat_launcher(weight: int, :param same_gene: Say if we consider as co-localised exon within the \ same gene :param inflation: The inflation parameter + :param cell_line: Interactions are only selected from projects made on \ + a specific cell line (ALL to disable this filter). (default ALL) :param threshold: The minimum threshold needed to consider the interaction :param iteration: The number of iteration to make :param logging_level: Level of information to display @@ -369,10 +391,10 @@ def ppi_stat_launcher(weight: int, community_file = ConfigGraph.get_community_file(project, weight, global_weight, same_gene, inflation, - "gene", f".txt") + cell_line, "gene", f".txt") ppi_stats_analysis(community_file, ConfigPPI.fasterdb_ppi, project, - weight, global_weight, same_gene, inflation, threshold, - iteration) + weight, global_weight, same_gene, inflation, cell_line, + threshold, iteration) if __name__ == "__main__": diff --git a/src/find_interaction_cluster/nt_and_community.py b/src/find_interaction_cluster/nt_and_community.py index d1d591ed527c40e3dac73f386d52e1c17a2a3141..9aadb7e95052e95453f03e05b3cc146778a22323 100644 --- a/src/find_interaction_cluster/nt_and_community.py +++ b/src/find_interaction_cluster/nt_and_community.py @@ -249,8 +249,8 @@ def prepare_dataframe(df: pd.DataFrame, test_type: str, nt: str, def create_outfiles(project: str, weight: int, global_weight: int, - same_gene: bool, inflation: float, feature: str, - cpnt_type: str, cpnt: str, test_type: str, + same_gene: bool, inflation: float, cell_line: str, + feature: str, cpnt_type: str, cpnt: str, test_type: str, community_file: str) -> Tuple[Path, Path]: """ Create a file used to store diagnostics and a file used to store the \ @@ -269,6 +269,8 @@ def create_outfiles(project: str, weight: int, global_weight: int, :param cpnt: The component (nt, aa, dnt) of interest :param feature: The kind of feature analysed :param inflation: The inflation parameter + :param cell_line: Interactions are only selected from projects made on \ + a specific cell line (ALL to disable this filter). :param test_type: The type of test to make (permutation or lm) :param community_file: A file containing custom communities. If \ it equals to '' then weight, global weight and same genes parameter are \ @@ -288,12 +290,12 @@ def create_outfiles(project: str, weight: int, global_weight: int, return outfile, outfile_ctrl outfile = ConfigGraph.\ get_community_file(project, weight, global_weight, same_gene, - inflation, feature, + inflation, cell_line, feature, f"{cpnt}-{cpnt_type}_stat_{test_type}.txt", outfolder) outfile_ctrl = ConfigGraph.\ get_community_file(project, weight, global_weight, same_gene, - inflation, feature, + inflation, cell_line, feature, f"{cpnt}-{cpnt_type}_VS_CTRL_stat_{test_type}.pdf", outfolder) return outfile, outfile_ctrl @@ -301,7 +303,7 @@ def create_outfiles(project: str, weight: int, global_weight: int, def get_stat_cpnt_communities(df: pd.DataFrame, project: str, weight: int, global_weight: int, same_gene: bool, - inflation: float, + inflation: float, cell_line: str, cpnt_type: str, cpnt: str, dic_com: Dict, feature: str = "exon", test_type: str = "", @@ -323,6 +325,8 @@ def get_stat_cpnt_communities(df: pd.DataFrame, project: str, weight: int, :param same_gene: Say if we consider as co-localised, exons within the \ same gene (True) or not (False) (default False) :param inflation: The inflation parameter + :param cell_line: Interactions are only selected from projects made on \ + a specific cell line (ALL to disable this filter). :param cpnt_type: The type of component to analyse; It \ can be 'nt', 'dnt' or 'aa'. :param cpnt: The component (nt, aa, dnt) of interest @@ -340,8 +344,8 @@ def get_stat_cpnt_communities(df: pd.DataFrame, project: str, weight: int, logging.debug(f"{test_type} for {project}, w:{weight}, " f"g:{global_weight} cpnt: {cpnt}({cpnt_type})") outfile, outfile_ctrl = create_outfiles(project, weight, global_weight, - same_gene, inflation, feature, - cpnt_type, + same_gene, inflation, cell_line, + feature, cpnt_type, cpnt, test_type, community_file) res = {"project": project, "cpnt": cpnt, 'pval': lm_maker(df, outfile, cpnt)} @@ -352,7 +356,8 @@ def get_stat_cpnt_communities(df: pd.DataFrame, project: str, weight: int, def create_dataframe(project: str, weight: int, global_weight: int, - same_gene: bool, inflation: float, feature: str = 'exon', + same_gene: bool, inflation: float, cell_line: str = "ALL", + feature: str = 'exon', region: str = "", component_type: str = 'nt', community_file: str = "", from_communities: bool = True, @@ -367,6 +372,8 @@ def create_dataframe(project: str, weight: int, global_weight: int, :param same_gene: Say if we consider as co-localised, exons within the \ same gene (True) or not (False) :param inflation: The inflation parameter + :param cell_line: Interactions are only selected from projects made on \ + a specific cell line (ALL to disable this filter). :param feature: The kind of feature to analyse :param from_communities: True if we only select gene/exons :param region: the region of interest to extract from gene @@ -385,6 +392,7 @@ def create_dataframe(project: str, weight: int, global_weight: int, result = ConfigGraph.get_community_file(project, weight, global_weight, same_gene, inflation, + cell_line, feature, ".txt") else: result = Path(community_file) @@ -407,7 +415,7 @@ def create_dataframe(project: str, weight: int, global_weight: int, def create_dataframes(project, weight, global_weight, same_gene, inflation, - feature, + cell_line, feature, region, test_type, component_type: str, community_file: str ) -> Tuple[pd.DataFrame, Dict]: @@ -420,6 +428,8 @@ def create_dataframes(project, weight, global_weight, same_gene, inflation, :param same_gene: Say if we consider as co-localised exon within the \ same gene :param inflation: The inflation parameter + :param cell_line: Interactions are only selected from projects made on \ + a specific cell line (ALL to disable this filter). :param feature: The kind of analysed feature :param region: the region of interest to extract from gene :param test_type: The type of test to make (permutation or lm) @@ -430,9 +440,11 @@ def create_dataframes(project, weight, global_weight, same_gene, inflation, used to find the community files computed with ChIA-PET data. """ df = create_dataframe(project, weight, global_weight, same_gene, inflation, - feature, region, component_type, community_file) + cell_line, feature, region, component_type, + community_file) df_ctrl = create_dataframe(project, weight, global_weight, same_gene, - inflation, feature, region, component_type, + inflation, cell_line, feature, region, + component_type, from_communities=False) df_ctrl = df_ctrl.loc[-df_ctrl[f"id_{feature}"].isin(df[f"id_{feature}"]), :].copy() @@ -448,6 +460,7 @@ def multiple_nt_lm_launcher(ps: int, project: str, same_gene: bool, inflation: float, + cell_line: str = "ALL", feature: str = 'exon', region: str = '', component_type: str = "nt", @@ -468,6 +481,8 @@ def multiple_nt_lm_launcher(ps: int, :param same_gene: Say if we consider as co-localised exon within the \ same gene :param inflation: The inflation parameter + :param cell_line: Interactions are only selected from projects made on \ + a specific cell line (ALL to disable this filter). :param feature: The kind of analysed feature :param component_type: The type of component to analyse; It \ can be 'nt', 'dnt' or 'aa'. @@ -498,16 +513,17 @@ def multiple_nt_lm_launcher(ps: int, pool = mp.Pool(processes=min(ps, len(condition))) logging.debug("Creating tables") df, dic_com = create_dataframes(project, weight, global_weight, - same_gene, inflation, feature, region, + same_gene, inflation, cell_line, + feature, region, test_type, component_type, community_file) for project, weight, cpnt in condition: nfile_table = ConfigGraph.get_community_file( - project, weight, global_weight, same_gene, inflation, feature, - f"_{component_type}_table.txt", "community_enrichment") + project, weight, global_weight, same_gene, inflation, cell_line, + feature, f"_{component_type}_table.txt", "community_enrichment") df.to_csv(nfile_table, sep="\t", index=False) args = [df, project, weight, global_weight, same_gene, inflation, - component_type, cpnt, dic_com, feature, test_type, iteration, - display_size, community_file] + cell_line, component_type, cpnt, dic_com, feature, test_type, + iteration, display_size, community_file] processes.append(pool.apply_async(get_stat_cpnt_communities, args)) results = [p.get(timeout=None) for p in processes] pool.close() @@ -516,7 +532,8 @@ def multiple_nt_lm_launcher(ps: int, fdf["padj"] = multipletests(fdf['pval'].values, method='fdr_bh')[1] outfile = ConfigGraph.get_community_file(project, weight, global_weight, - same_gene, inflation, feature, + same_gene, inflation, cell_line, + feature, f"lmm-{component_type}_stat.txt", "community_enrichment") nfolder = outfile.parent / f"{component_type}_analysis" @@ -535,6 +552,7 @@ def launcher_community_file(ps: int = 1, project: str = "GSM1018963_GSM1018964", same_gene: bool = True, inflation: float = 1.5, + cell_line: str = "ALL", feature: str = 'exon', region: str = '', component_type: str = "nt", @@ -554,6 +572,8 @@ def launcher_community_file(ps: int = 1, :param project: The project name, used only if global_weight = 0 :param same_gene: Say if we consider as co-localised exon within the \ same gene (default True) + :param cell_line: Interactions are only selected from projects made on \ + a specific cell line (ALL to disable this filter). :param feature: The kind of analysed feature (default exon) :param component_type: The type of component to analyse; It \ can be 'nt', 'dnt' or 'aa'. @@ -568,7 +588,7 @@ def launcher_community_file(ps: int = 1, :param logging_level: Level of information to display (default DISABLE) """ multiple_nt_lm_launcher(ps, weight, global_weight, project, - same_gene, inflation, feature, region, + same_gene, inflation, cell_line, feature, region, component_type, test_type, iteration, display_size, community_file, logging_level) diff --git a/src/find_interaction_cluster/sf_and_communities.py b/src/find_interaction_cluster/sf_and_communities.py index 0449bbf88e70be9456a96cb6870f597460e562d4..a9209bcbf43ff7c8ae1fdae21ec92fbc8b36c5c4 100644 --- a/src/find_interaction_cluster/sf_and_communities.py +++ b/src/find_interaction_cluster/sf_and_communities.py @@ -181,7 +181,7 @@ def glmm_maker(expanded_df: pd.DataFrame, outfile: Path) -> float: ... "%reg in community": [40, 42.85], 'pval': [1, 0.5], 'padj': [1, 1]}) >>> e_df = expand_dataframe(d) >>> outfile = ConfigGraph.get_community_file("Test", 1, 1, True, 1.5, - ... "_stat.txt", "sf_community_enrichment") + ... "ALL", "_stat.txt", "sf_community_enrichment") >>> glmm_maker(e_df, outfile) 1.0 """ @@ -210,7 +210,7 @@ def glmm_maker(expanded_df: pd.DataFrame, outfile: Path) -> float: def glmm_statistics(df: pd.DataFrame, sf_name: str, reg: str, project: str, weight: int, global_weight: int, - same_gene: bool, inflation: float, + same_gene: bool, inflation: float, cell_line: str = "ALL", feature: str = "exon") -> pd.Series: """ Create the glmm statistics for a given splicing factor with \ @@ -229,13 +229,16 @@ def glmm_statistics(df: pd.DataFrame, sf_name: str, reg: str, same gene (True) or not (False) (default False) :param feature: The kind of feature analysed :param inflation: the inflation parameter + :param cell_line: Interactions are only selected from projects made on \ + a specific cell line (ALL to disable this filter). :return: The glmm pvalue among with other informations """ ndf = df.loc[-df['community'].isin(["All-community", "FASTERDB"]), :].copy() expanded_df = expand_dataframe(ndf) outfile = ConfigGraph.get_community_file(project, weight, global_weight, - same_gene, inflation, feature, + same_gene, inflation, cell_line, + feature, f"{sf_name}_{reg}_stat.txt", "sf_community_enrichment") noutfold = outfile.parent / "expanded_df" @@ -276,7 +279,7 @@ def adapt_regulated_list(cnx: sqlite3.Connection, def get_stat4communities(sf_name: str, reg: str, project: str, weight: int, global_weight: int, same_gene: bool, - inflation: float, + inflation: float, cell_line: str = "ALL", feature: str = 'exon', ) -> Tuple[pd.DataFrame, pd.Series]: """ @@ -294,14 +297,16 @@ def get_stat4communities(sf_name: str, reg: str, :param same_gene: Say if we consider as co-localised, exons within the \ same gene (True) or not (False) (default False) :param inflation: The inflation parameter + :param cell_line: Interactions are only selected from projects made on \ + a specific cell line (ALL to disable this filter). :param feature: The kind of analysed feature """ logging.debug(f"Working on {sf_name}-{reg}, for {project}, w:{weight}, " f"g:{global_weight}") cnx = sqlite3.connect(ConfigGraph.db_file) result = ConfigGraph.get_community_file(project, weight, global_weight, - same_gene, inflation, feature, - ".txt") + same_gene, inflation, cell_line, + feature, ".txt") communities = get_communities_basefile(result, 0) regulated_dic, number = get_every_events_4_a_sl(cnx, sf_name, reg) reg_ft = regulated_dic[sf_name + "_" + reg] @@ -332,7 +337,7 @@ def get_stat4communities(sf_name: str, reg: str, d['project'] = [project] * len(d["community"]) df = pd.DataFrame(d) s = glmm_statistics(df, sf_name, reg, project, weight, global_weight, - same_gene, inflation, feature) + same_gene, inflation, cell_line, feature) return df, s @@ -370,6 +375,7 @@ def multiple_stat_launcher(ps: int, global_weight: int, project: str, same_gene: bool, inflation: float, + cell_line: str = "ALL", feature: str = 'exon', logging_level: str = "DISABLE"): """ @@ -385,6 +391,8 @@ def multiple_stat_launcher(ps: int, same gene :param feature: The feature we want to analyse :param inflation: the inflation parameter + :param cell_line: Interactions are only selected from projects made on \ + a specific cell line (ALL to disable this filter). :param logging_level: Level of information to display """ ConfigGraph.community_folder.mkdir(exist_ok=True, parents=True) @@ -400,7 +408,7 @@ def multiple_stat_launcher(ps: int, for project, weight, sf_name, reg in condition: ckey = get_key(project, weight) args = [sf_name, reg, project, weight, global_weight, same_gene, - inflation, feature] + inflation, cell_line, feature] if ckey in processes: processes[ckey].append(pool.apply_async(get_stat4communities, args)) else: @@ -415,7 +423,9 @@ def multiple_stat_launcher(ps: int, df = pd.concat(list_df, axis=0, ignore_index=True) outfile = ConfigGraph.get_community_file(project, weight, global_weight, - same_gene, inflation, feature, + same_gene, inflation, + cell_line, + feature, "_stat.txt", "sf_community_enrichment") df.to_csv(outfile, sep="\t", index=False) @@ -424,7 +434,9 @@ def multiple_stat_launcher(ps: int, method='fdr_bh')[1] outfile = ConfigGraph.get_community_file(project, weight, global_weight, - same_gene, inflation, feature, + same_gene, inflation, + cell_line, + feature, "_glmm_stat.txt", "sf_community_enrichment") glm_df.to_csv(outfile, sep="\t", index=False)