From f05e0585a97d431272eaf2282423304268249068 Mon Sep 17 00:00:00 2001 From: Fontrodona Nicolas <nicolas.fontrodona@ens-lyon.fr> Date: Fri, 20 Nov 2020 16:12:10 +0100 Subject: [PATCH] src/find_interaction_cluster/*.py: replace list parameters weights and global_weights by int parameters weight and global_weight --- src/find_interaction_cluster/__main__.py | 24 +++--- .../community_finder.py | 75 ++++++------------- .../create_ppi_files.py | 50 ++++++------- .../nt_and_community.py | 42 +++++------ .../sf_and_communities.py | 37 +++++---- 5 files changed, 95 insertions(+), 133 deletions(-) diff --git a/src/find_interaction_cluster/__main__.py b/src/find_interaction_cluster/__main__.py index 7fecb019..a196eeb3 100644 --- a/src/find_interaction_cluster/__main__.py +++ b/src/find_interaction_cluster/__main__.py @@ -20,12 +20,13 @@ from ..logging_conf import logging_def @lp.parse -def launcher(weight: List[int] = (1), - global_weight: List[int] = (0), +def launcher(weight: int = 1, + global_weight: int = 0, same_gene: bool = True, ps: int = ConfigGraph.cpu, html_fig: bool = False, feature: str = 'exon', region: str = '', iteration_ppi: int = 1000, + project: str = "GSM1018963_GSM1018964", logging_level: str = "DISABLE"): """ Script used to find communities inside exon co-localized within a project @@ -41,9 +42,11 @@ def launcher(weight: List[int] = (1), :param html_fig: True to display the html figure (default False). :param feature: The feature we want to analyse (default 'exon') :param region: The region of a gene to analyse (used only if feature \ - is 'gene') (default ''). + is 'gene') (default '') (can be 'gene', 'exon', 'intron'). :param logging_level: The level of data to display (default 'DISABLE') :param iteration_ppi: the number of iteration for ppi analysis + :param project: The project name of interest \ + (only used if global_weight = 0).
:param ps: The number of processes to use """ logging_def(ConfigGraph.community_folder, __file__, logging_level) @@ -53,15 +56,14 @@ def launcher(weight: List[int] = (1), same_gene = True if not ConfigGraph.get_hipmcl_prog().is_file(): install_hipmcl("INFO") - multiple_community_launcher(1, weight, global_weight, same_gene, html_fig, - feature, logging_level) - multiple_stat_launcher(ps, weight, global_weight, same_gene, feature, - logging_level) - multiple_nt_lmm_launcher(ps, weight, global_weight, same_gene, feature, - region, - logging_level) + multiple_community_launcher(weight, global_weight, project, same_gene, + html_fig, feature, logging_level) + multiple_stat_launcher(ps, weight, global_weight, project, same_gene, + feature, logging_level) + multiple_nt_lmm_launcher(ps, weight, global_weight, project, + same_gene, feature, region, logging_level) if feature == "gene": - ppi_stat_launcher(ps, weight, global_weight, same_gene, + ppi_stat_launcher(weight, global_weight, project, same_gene, ConfigGraph.ppi_threshold, iteration_ppi, logging_level) diff --git a/src/find_interaction_cluster/community_finder.py b/src/find_interaction_cluster/community_finder.py index b4180848..d65d5a81 100644 --- a/src/find_interaction_cluster/community_finder.py +++ b/src/find_interaction_cluster/community_finder.py @@ -253,8 +253,8 @@ def get_figure_title(project, weight, global_weight, same_gene, feature): :param feature: The kind of analysed features :return: A figure title """ - title = f"Co-localisation between {feature}s having a weight greater than " \ - f"{weight} in " + title = f"Co-localisation between {feature}s having a weight greater " \ + f"than {weight} in " if global_weight == 0: title += f"the project {project}" else: @@ -347,7 +347,7 @@ def community_finder(weight: int, global_weight: int, project: str = "", logging.debug('Done !') -def get_projects(global_weight: int) -> List[str]: +def get_projects(global_weight: int, project: str) -> str: """ Get projects 
name. @@ -355,69 +355,38 @@ def get_projects(global_weight: int) -> List[str]: the global weight is equal to 0 then then density figure are calculated \ by project, else all projet are merge together and the interaction \ seen in `global_weight` project are taken into account - :return: The list of the project to consider + :param project: The name of a project + :return: The project to consider """ if global_weight != 0: - return [f'Global-weight-{global_weight}'] + return f'Global-weight-{global_weight}' else: - return ConfigGraph.good_projects + return project -def get_projects_name(global_weights: List[int]) -> Tuple[List[str], Dict]: - """ - Get projects name given a list of global_weight and a dictionary linking, - each project name to it's corresponding global weight. - - :param global_weights: The list of global weights to consider. if \ - the global weight is equal to 0 then then density figure are calculated \ - by project, else all projet are merge together and the interaction \ - seen in `global_weight` project are taken into account - :return: project names and a dictionary linking, - each name to it's corresponding global weight. - """ - dic = {} - projects = [] - for global_weight in global_weights: - tmp = get_projects(global_weight) - projects += tmp - for p in tmp: - dic[p] = global_weight - return projects, dic - - -def multiple_community_launcher(ps: int, weights: List[int], - global_weights: List[int], - same_gene: bool, html_fig: bool = False, +def multiple_community_launcher(weight: int, + global_weight: int, + project: str, + same_gene: bool, + html_fig: bool = False, feature: str = 'exon', logging_level: str = "DISABLE"): """ - :param ps: The number of processes we want to use. - :param weights: The list of weights of interaction to consider - :param global_weights: The list global weights to consider. 
if \ - the global weight is equal to 0 then then density figure are calculated \ - by project, else all projcet are merge together and the interaction \ - seen in `global_weight` project are taken into account + :param weight: The weight of interaction to consider + :param global_weight: The global weight to consider. if \ + the global weight is equal to 0 then the project `project` is \ + used. :param same_gene: Say if we consider as co-localised exon within the \ same gene :param html_fig: True to create an html figure, false else :param feature: The feature we want to analyse (default 'exon') + :param project: The project name, used only if global_weight = 0 :param logging_level: Level of information to display """ ConfigGraph.community_folder.mkdir(exist_ok=True, parents=True) logging_def(ConfigGraph.community_folder, __file__, logging_level) - global_weights = list(np.unique(global_weights)) - weights = list(np.unique(weights)) - projects, dic_project = get_projects_name(global_weights) - condition = list(product(projects, weights)) - processes = [] - pool = mp.Pool(processes=min(ps, len(condition))) - for project, weight in condition: - global_weight = dic_project[project] - logging.info(f'Finding community for project : {project}, ' - f'global_weight : {global_weight}, weight: {weight}') - args = [weight, global_weight, project, same_gene, html_fig, feature] - processes.append(pool.apply_async(community_finder, args)) - for proc in processes: - proc.get(timeout=None) - pool.close() - pool.join() + project = get_projects(global_weight, project) + logging.info(f'Finding community for project : {project}, ' + f'global_weight : {global_weight}, weight: {weight}') + community_finder(weight, global_weight, project, same_gene, html_fig, + feature) diff --git a/src/find_interaction_cluster/create_ppi_files.py b/src/find_interaction_cluster/create_ppi_files.py index 0c7a211d..1d401736 100644 --- a/src/find_interaction_cluster/create_ppi_files.py +++
b/src/find_interaction_cluster/create_ppi_files.py @@ -12,7 +12,7 @@ from typing import List from ..logging_conf import logging_def import numpy as np import logging -from .community_finder import get_projects_name +from .community_finder import get_projects from itertools import product import multiprocessing as mp from pathlib import Path @@ -195,7 +195,8 @@ def filter_most_overllaping_ppi(df: pd.DataFrame, size_threshold: int :param df: The dataframe containing the number of \ genes that clusters in community at DNA and protein level - :param the minimum size required to keep gene community at dna level. + :param size_threshold: the minimum size required to keep gene + community at dna level. :return: The dataframe with only one line per DNA community >>> test_df = pd.DataFrame({"id_gene": range(1, 12), @@ -203,7 +204,7 @@ def filter_most_overllaping_ppi(df: pd.DataFrame, size_threshold: int ... "community_size": [9] * 5 + [14] * 5 + [19], ... "nb_com-ppi": [0, 1, 2, 2, 0, 0, 1, 0, 0, 0, 0], ... 
"size_com-ppi": [30, 50, 105, 102, 25, 30, 42, 47, 89, 12, 0]}) - >>> filter_most_overllaping_ppi(test_df) + >>> filter_most_overllaping_ppi(test_df, 10) DNA_community community_size nb_com-ppi size_com-ppi 0 C1 10 3 105 1 C2 15 2 42 @@ -324,7 +325,7 @@ def ppi_stats_analysis(community_file: Path, fasterdb_ppi: Path, df.to_csv(outstat, sep="\t", index=False) final_df = summary_randomisation_test(df, dic_dna_gene, ppi_gene, iteration, dic_values, - use_seed = False) + use_seed=False) outstat = ConfigGraph.get_community_file(project, weight, global_weight, same_gene, "gene", f"ppi_gene_table_{iteration}_" @@ -333,8 +334,9 @@ def ppi_stats_analysis(community_file: Path, fasterdb_ppi: Path, final_df.to_csv(outstat, sep="\t", index=False) -def ppi_stat_launcher(ps: int, weights: List[int], - global_weights: List[int], +def ppi_stat_launcher(weight: int, + global_weight: int, + project: str, same_gene: bool, threshold: int = 700, iteration: int = 1000, logging_level: str = "DISABLE"): @@ -342,12 +344,11 @@ def ppi_stat_launcher(ps: int, weights: List[int], Launch the statistical allowing to determine if interaction between \ genes at dna level as an influence on the interactions at protein level. - :param ps: The number of processes we want to use. - :param weights: The list of weights of interaction to consider - :param global_weights: The list global weights to consider. if \ - the global weight is equal to 0 then then density figure are calculated \ - by project, else all projcet are merge together and the interaction \ - seen in `global_weight` project are taken into account + :param weight: The weight of interaction to consider + :param global_weight: The global weighs to consider. if \ + the global weight is equal to 0 then the project `project` is \ + used. 
+ :param project: The project name, used only if global_weight = 0 :param same_gene: Say if we consider as co-localised exon within the \ same gene :param threshold: The minimum threshold needed to consider the interaction @@ -358,24 +359,15 @@ def ppi_stat_launcher(ps: int, weights: List[int], logging_def(ConfigGraph.community_folder, __file__, logging_level) logging.info("Checking if gene communities at DNA level have an " "influence on communities at protein level") - global_weights = list(np.unique(global_weights)) - weights = list(np.unique(weights)) - projects, dic_project = get_projects_name(global_weights) - condition = list(product(projects, weights)) - pool = mp.Pool(processes=min(ps, len(condition))) + project = get_projects(global_weight, project) logging.debug("Calculating stats...") - processes = [] - for project, weight in condition: - global_weight = dic_project[project] - community_file = ConfigGraph.get_community_file(project, weight, - global_weight, - same_gene, "gene", - f".txt") - args = [community_file, ConfigPPI.fasterdb_ppi, project, - weight, global_weight, same_gene, threshold, iteration] - processes.append(pool.apply_async(ppi_stats_analysis, args)) - results = [p.get(timeout=None) for p in processes] + community_file = ConfigGraph.get_community_file(project, weight, + global_weight, + same_gene, "gene", + f".txt") + ppi_stats_analysis(community_file, ConfigPPI.fasterdb_ppi, project, + weight, global_weight, same_gene, threshold, iteration) if __name__ == "__main__": - testmod() \ No newline at end of file + testmod() diff --git a/src/find_interaction_cluster/nt_and_community.py b/src/find_interaction_cluster/nt_and_community.py index 0b7a0140..8e7b6394 100644 --- a/src/find_interaction_cluster/nt_and_community.py +++ b/src/find_interaction_cluster/nt_and_community.py @@ -18,8 +18,7 @@ from functools import reduce from pathlib import Path from rpy2.robjects import r, pandas2ri from statsmodels.stats.multitest import multipletests -import 
numpy as np -from .community_finder import get_projects_name +from .community_finder import get_projects from ..logging_conf import logging_def from itertools import product import multiprocessing as mp @@ -64,8 +63,8 @@ def get_nt_frequency(cnx: sqlite3.Connection, list_ft: List[str], {query_region} """ df = pd.read_sql_query(query, cnx) - df = df.pivot_table(index=f"id_{feature}", columns="ft", values="frequency")\ - .reset_index() + df = df.pivot_table(index=f"id_{feature}", columns="ft", + values="frequency").reset_index() df[f"id_{feature}"] = df[f"id_{feature}"].astype(str) return df @@ -203,6 +202,8 @@ def create_ctrl_community(df: pd.DataFrame, outfile: Path, community. :param outfile: The output table containing frequencies :param feature: The kind of feature to analyse + :param region: only used if feature is 'gene'. Used to focus on \ + a given region in genes (can be gene, exon, intron). :return: A dataframe containing the frequency of every nucleotides \ of every exon in a large community """ @@ -304,20 +305,22 @@ def create_dataframe(project: str, weight: int, global_weight: int, return df -def multiple_nt_lmm_launcher(ps: int, weights: List[int], - global_weights: List[int], +def multiple_nt_lmm_launcher(ps: int, + weight: int, + global_weight: int, + project: str, same_gene: bool, feature: str = 'exon', region: str = '', logging_level: str = "DISABLE"): """ Launch the statistical analysis for every - :param ps: The number of processes we want to use. - :param weights: The list of weights of interaction to consider - :param global_weights: The list global weights to consider. if \ - the global weight is equal to 0 then then density figure are calculated \ - by project, else all projcet are merge together and the interaction \ - seen in `global_weight` project are taken into account + :param ps: The number of processes to use + :param weight: The weight of interaction to consider + :param global_weight: The global weight to consider.
if \ + the global weight is equal to 0 then the project `project` is \ + used. + :param project: The project name, used only if global_weight = 0 :param same_gene: Say if we consider as co-localised exon within the \ same gene :param feature: The kind of analysed feature @@ -328,17 +331,14 @@ def multiple_nt_lmm_launcher(ps: int, weights: List[int], logging_def(ConfigGraph.community_folder, __file__, logging_level) logging.info("Checking if communities as an effect on nucleotide " "frequency") - global_weights = list(np.unique(global_weights)) - weights = list(np.unique(weights)) - projects, dic_project = get_projects_name(global_weights) + project = get_projects(global_weight, project) nt_list = ["A", "C", "G", "T", "S", "W"] - condition = list(product(projects, weights, nt_list)) + condition = list(product([project], [weight], nt_list)) processes = {} pool = mp.Pool(processes=min(ps, len(condition))) logging.debug("Calculating stats...") dic_df = {} for project, weight, nt in condition: - global_weight = dic_project[project] ckey = get_key(project, weight) if ckey in dic_df: df = dic_df[ckey] @@ -352,11 +352,11 @@ def multiple_nt_lmm_launcher(ps: int, weights: List[int], dic_df[ckey] = df args = [df, project, weight, global_weight, same_gene, nt, feature, region] - if ckey not in processes.keys(): - processes[ckey] = [pool.apply_async(get_stat_nt_communities, args)] - else: + if ckey in processes: processes[ckey].append( pool.apply_async(get_stat_nt_communities, args)) + else: + processes[ckey] = [pool.apply_async(get_stat_nt_communities, args)] for p, value in processes.items(): project, weight = p.split("_") results = [p.get(timeout=None) for p in value] @@ -365,7 +365,7 @@ def multiple_nt_lmm_launcher(ps: int, weights: List[int], fdf = pd.DataFrame(results) fdf["padj"] = multipletests(fdf['pval'].values, method='fdr_bh')[1] outfile = ConfigGraph.get_community_file(project, weight, - dic_project[project], + global_weight, same_gene, feature, f"lmm-nt_stat.txt", 
"sf_community_enrichment") diff --git a/src/find_interaction_cluster/sf_and_communities.py b/src/find_interaction_cluster/sf_and_communities.py index ac965851..d81dc593 100644 --- a/src/find_interaction_cluster/sf_and_communities.py +++ b/src/find_interaction_cluster/sf_and_communities.py @@ -15,7 +15,7 @@ from .community_finder import get_communities import pandas as pd import numpy as np from itertools import product -from .community_finder import get_projects_name +from .community_finder import get_projects import multiprocessing as mp import logging from ..logging_conf import logging_def @@ -362,19 +362,21 @@ def get_key(project: str, weight: int) -> str: return f"{project}_{weight}" -def multiple_stat_launcher(ps: int, weights: List[int], - global_weights: List[int], +def multiple_stat_launcher(ps: int, + weight: int, + global_weight: int, + project: str, same_gene: bool, feature: str = 'exon', logging_level: str = "DISABLE"): """ Launch the statistical analysis for every - :param ps: The number of processes we want to use. - :param weights: The list of weights of interaction to consider - :param global_weights: The list global weights to consider. if \ - the global weight is equal to 0 then then density figure are calculated \ - by project, else all projcet are merge together and the interaction \ - seen in `global_weight` project are taken into account + :param ps: The number of processes to use + :param weight: The weight of interaction to consider + :param global_weight: The global weighs to consider. if \ + the global weight is equal to 0 then the project `project` is \ + used. 
+ :param project: The project name, used only if global_weight = 0 :param same_gene: Say if we consider as co-localised exon within the \ same gene :param feature: The feature we want to analyse @@ -385,22 +387,19 @@ def multiple_stat_launcher(ps: int, weights: List[int], logging.info(f"Checking if communities contains often {feature}s " f"regulated by a splicing factor") sf_list = get_sfname() - global_weights = list(np.unique(global_weights)) - weights = list(np.unique(weights)) - projects, dic_project = get_projects_name(global_weights) - condition = list(product(projects, weights, sf_list, ['down', 'up'])) + project = get_projects(global_weight, project) + condition = list(product([project], [weight], sf_list, ['down', 'up'])) processes = {} pool = mp.Pool(processes=min(ps, len(condition))) logging.debug("Calculating stats...") for project, weight, sf_name, reg in condition: ckey = get_key(project, weight) - global_weight = dic_project[project] args = [sf_name, reg, project, weight, global_weight, same_gene, feature] - if ckey not in processes.keys(): - processes[ckey] = [pool.apply_async(get_stat4communities, args)] - else: + if ckey in processes: processes[ckey].append(pool.apply_async(get_stat4communities, args)) + else: + processes[ckey] = [pool.apply_async(get_stat4communities, args)] for p, value in processes.items(): project, weight = p.split("_") list_tuples = [proc.get(timeout=None) for proc in value] @@ -410,7 +409,7 @@ def multiple_stat_launcher(ps: int, weights: List[int], list_series = [t[1] for t in list_tuples] df = pd.concat(list_df, axis=0, ignore_index=True) outfile = ConfigGraph.get_community_file(project, weight, - dic_project[project], + global_weight, same_gene, feature, "_stat.txt", "sf_community_enrichment") @@ -419,7 +418,7 @@ def multiple_stat_launcher(ps: int, weights: List[int], glm_df["padj"] = multipletests(glm_df['pval'].values, method='fdr_bh')[1] outfile = ConfigGraph.get_community_file(project, weight, - dic_project[project], + 
global_weight, same_gene, feature, "_glmm_stat.txt", "sf_community_enrichment") -- GitLab