From c0104bd4566b8256859a4539e09010bbb2a4415c Mon Sep 17 00:00:00 2001 From: Fontrodona Nicolas <nicolas.fontrodona@ens-lyon.fr> Date: Mon, 26 Oct 2020 15:12:42 +0100 Subject: [PATCH] src/find_interaction_cluster/*.py: add a paramter feature --- src/find_interaction_cluster/__main__.py | 18 +++-- .../community_finder.py | 62 ++++++++------ src/find_interaction_cluster/config.py | 7 +- .../nt_and_community.py | 80 ++++++++++++------- .../sf_and_communities.py | 77 +++++++++++++----- 5 files changed, 166 insertions(+), 78 deletions(-) diff --git a/src/find_interaction_cluster/__main__.py b/src/find_interaction_cluster/__main__.py index 2d7271ae..71a3bcc3 100644 --- a/src/find_interaction_cluster/__main__.py +++ b/src/find_interaction_cluster/__main__.py @@ -14,6 +14,8 @@ from typing import List from .install_hipMCL import install_hipmcl from .sf_and_communities import multiple_stat_launcher from .nt_and_community import multiple_nt_lmm_launcher +import logging +from ..logging_conf import logging_def @lp.parse @@ -21,7 +23,7 @@ def launcher(weight: List[int] = (1), global_weight: List[int] = (0), same_gene: bool = True, ps: int = ConfigGraph.cpu, - html_fig: bool = False, + html_fig: bool = False, feature: str = 'exon', logging_level: str = "DISABLE"): """ Script used to find communities inside exon co-localized within a project @@ -35,15 +37,21 @@ def launcher(weight: List[int] = (1), :param same_gene: Say if we consider as co-localised, exons within the \ same gene (True) or not (False) (default True) :param html_fig: True to display the html figure (default False). + :param feature: The feature we want to analyse (default 'exon') :param logging_level: The level of data to display (default 'DISABLE') """ + logging_def(ConfigGraph.community_folder, __file__, logging_level) + if feature == "gene" and not same_gene: + logging.warning(f"When level is equal to gene then same_gene must " + f"be true. Setting it to True.") + same_gene = True if not ConfigGraph.get_hipmcl_prog().is_file(): install_hipmcl("INFO") multiple_community_launcher(1, weight, global_weight, same_gene, html_fig, - logging_level) - multiple_stat_launcher(ps, weight, global_weight, same_gene, - logging_level) - multiple_nt_lmm_launcher(ps, weight, global_weight, same_gene, + feature, logging_level) + # multiple_stat_launcher(ps, weight, global_weight, same_gene, feature, + # logging_level) + multiple_nt_lmm_launcher(ps, weight, global_weight, same_gene, feature, logging_level) diff --git a/src/find_interaction_cluster/community_finder.py b/src/find_interaction_cluster/community_finder.py index 0ae68de0..b4180848 100644 --- a/src/find_interaction_cluster/community_finder.py +++ b/src/find_interaction_cluster/community_finder.py @@ -88,14 +88,15 @@ def write_cytoscape_graph(graph: nx.Graph, dic_community: Dict[str, str], def find_communities(graph: nx.Graph, project: str, - outfile: Path, result_file: Path - ) -> Tuple[pd.DataFrame, Dict]: + outfile: Path, result_file: Path, + feature: str = 'exon') -> Tuple[pd.DataFrame, Dict]: """ Find the communities within the graph. :param graph: Graph of co-localized exons :param project: The name of the project :param outfile: A file containing interaction + :param feature: the kind of analysed feature (exons or gene) :return: The communities within the graph and the community \ to wich each exon belong """ @@ -111,7 +112,7 @@ def find_communities(graph: nx.Graph, project: str, modularity = community.modularity(graph, communities, weight='X') d = {'community': [], 'nodes': [], 'edges': [], 'EC': [], 'HCS': [], '%E vs E in complete G': [], - 'cov': [], 'modularity': [], 'exons': []} + 'cov': [], 'modularity': [], f'{feature}s': []} colors = cm.hsv(np.linspace(0, 1, len(communities))) logging.debug('Creating community result file') for k, c in enumerate(communities): @@ -139,7 +140,7 @@ def find_communities(graph: nx.Graph, project: str, d['EC'].append(edge_connectivity) d['HCS'].append(is_hc) d['%E vs E in complete G'].append(complete) - d['exons'].append(', '.join(sorted(list(c)))) + d[f'{feature}s'].append(', '.join(sorted(list(c)))) d['cov'].append(cov) d['modularity'].append(round(modularity, 5)) if nb_nodes <= 9: @@ -147,8 +148,8 @@ def find_communities(graph: nx.Graph, project: str, d['project'] = [project] * len(d['community']) df = pd.DataFrame(d) if warn > 0: - logging.warning(f" {warn} communities were composed by one exon or " - f"less !") + logging.warning(f" {warn} communities were composed by one {feature} " + f"or less !") return df, dic_community @@ -237,7 +238,7 @@ def create_figure(graph: nx.Graph, outfile: Path, dic_community: Dict, auto_open=False, validate=False) -def get_figure_title(project, weight, global_weight, same_gene): +def get_figure_title(project, weight, global_weight, same_gene, feature): """ The title of the graphics. @@ -249,27 +250,30 @@ def get_figure_title(project, weight, global_weight, same_gene): seen in `global_weight` project are taken into account :param same_gene: Say if we consider as co-localised, exons within the \ same gene (True) or not (False) (default False) + :param feature: The kind of analysed features :return: A figure title """ - title = f"Co-localisation between exons having a weight greater than " \ + title = f"Co-localisation between {feature}s having a weight greater than " \ f"{weight} in " if global_weight == 0: title += f"the project {project}" else: title += f" at least {global_weight} chia-pet projects" - if not same_gene: - title += ' (co-localisation within gene are NOT taken into account)' - else: - title += ' (co-localisation within gene ARE taken into account)' + if feature == 'exon': + if not same_gene: + title += ' (co-localisation within gene are NOT taken into ' \ + 'account)' + else: + title += ' (co-localisation within gene ARE taken into account)' return title def write_interaction_file(arr_interaction: np.array, project: str, weight: int, global_weight: int, same_gene: bool, - use_weight: bool = False): + use_weight: bool = False, feature: str = 'exon'): """ - :param arr_interaction: Each couples of co-localized exons within a \ + :param arr_interaction: Each couples of co-localized feature within a \ project. :param project: The name of the project of interest :param weight: The minimum weight of interaction to consider @@ -280,14 +284,16 @@ def write_interaction_file(arr_interaction: np.array, project: str, :param same_gene: Say if we consider as co-localised, exons within the \ same gene (True) or not (False) (default False) :param use_weight: Say if we want to write the weight into the result file. + :param feature: Says if we want to work at gene or at exons level :return: """ logging.debug('Writing interaction files ...') outfile = ConfigGraph.get_community_file(project, weight, global_weight, - same_gene, "_interation.txt") + same_gene, feature, + f"_interation.txt") result = ConfigGraph.get_community_file(project, weight, global_weight, - same_gene, - "_communities.txt") + same_gene, feature, + f"_communities.txt") with outfile.open('w') as f: for exon1, exon2, cweight in arr_interaction: if not use_weight: @@ -298,7 +304,7 @@ def write_interaction_file(arr_interaction: np.array, project: str, def community_finder(weight: int, global_weight: int, project: str = "", same_gene=True, html_fig: bool = False, - logging_level: str = "DISABLE"): + feature: str = "exon", logging_level: str = "DISABLE"): """ Find communities inside co-localisation between exons found in \ a ChIA-PET project. @@ -313,26 +319,30 @@ def community_finder(weight: int, global_weight: int, project: str = "", same gene (True) or not (False) (default False) :param logging_level: The level of data to display (default 'DISABLE') :param html_fig: True to create an html figure, false else + :param feature: The feature we want to analyse (default 'exon') """ ConfigGraph.output_folder.mkdir(exist_ok=True, parents=True) logging_def(ConfigGraph.output_folder, __file__, logging_level) cnx = sqlite3.connect(ConfigGraph.db_file) interaction = get_project_colocalisation(cnx, project, weight, - global_weight, same_gene, True) + global_weight, same_gene, True, + level=feature) outfile, result_file = write_interaction_file(interaction, project, weight, global_weight, - same_gene) + same_gene, feature=feature) graph = create_graph(interaction) - df, dic_community = find_communities(graph, project, outfile, result_file) + df, dic_community = find_communities(graph, project, outfile, result_file, + feature) logging.debug('Writing results ...') outfiles = [ConfigGraph.get_community_file( - project, weight, global_weight, same_gene, ext) - for ext in ['.txt', '.cyjs', '.html']] + project, weight, global_weight, same_gene, feature, ext) + for ext in [f'.txt', f'.cyjs', f'.html']] df.to_csv(outfiles[0], sep="\t", index=False) logging.debug("Saving the graph ...") write_cytoscape_graph(graph, dic_community, outfiles[1]) if html_fig: - fig_title = get_figure_title(project, weight, global_weight, same_gene) + fig_title = get_figure_title(project, weight, global_weight, same_gene, + feature) create_figure(graph, outfiles[2], dic_community, fig_title) logging.debug('Done !') @@ -378,6 +388,7 @@ def get_projects_name(global_weights: List[int]) -> Tuple[List[str], Dict]: def multiple_community_launcher(ps: int, weights: List[int], global_weights: List[int], same_gene: bool, html_fig: bool = False, + feature: str = 'exon', logging_level: str = "DISABLE"): """ :param ps: The number of processes we want to use. @@ -389,6 +400,7 @@ def multiple_community_launcher(ps: int, weights: List[int], :param same_gene: Say if we consider as co-localised exon within the \ same gene :param html_fig: True to create an html figure, false else + :param feature: The feature we want to analyse (default 'exon') :param logging_level: Level of information to display """ ConfigGraph.community_folder.mkdir(exist_ok=True, parents=True) @@ -403,7 +415,7 @@ def multiple_community_launcher(ps: int, weights: List[int], global_weight = dic_project[project] logging.info(f'Finding community for project : {project}, ' f'global_weight : {global_weight}, weight: {weight}') - args = [weight, global_weight, project, same_gene, html_fig] + args = [weight, global_weight, project, same_gene, html_fig, feature] processes.append(pool.apply_async(community_finder, args)) for proc in processes: proc.get(timeout=None) diff --git a/src/find_interaction_cluster/config.py b/src/find_interaction_cluster/config.py index 0fbafd92..4c6cbcfb 100644 --- a/src/find_interaction_cluster/config.py +++ b/src/find_interaction_cluster/config.py @@ -36,7 +36,8 @@ def get_weight_folder(weight: int, global_weight: int): def get_community_file(project: str, weight: int, global_weight: int, - same_gene: bool, ext: str = ".txt", stat: bool = False): + same_gene: bool, feature: str = 'exon', + ext: str = ".txt", stat: bool = False): """ Get the output file of interest. @@ -48,6 +49,7 @@ def get_community_file(project: str, weight: int, global_weight: int, seen in `global_weight` project are taken into account :param same_gene: Say if we consider as co-localised exon within the \ same gene + :param the kind of feature analyzed :param ext: The file extension :param stat: True to place the result in 'sf_community_enrichment' \ subfolder @@ -59,7 +61,8 @@ def get_community_file(project: str, weight: int, global_weight: int, folder.mkdir(exist_ok=True, parents=True) if global_weight != 0: project = f"global-weight-{global_weight}" - return folder / f"{project}_weight-{weight}_same_gene-{same_gene}{ext}" + return folder / \ + f"{project}_weight-{weight}_same_gene-{same_gene}_{feature}{ext}" def get_hipmcl_folder() -> Path: diff --git a/src/find_interaction_cluster/nt_and_community.py b/src/find_interaction_cluster/nt_and_community.py index c9dbfceb..ec799b42 100644 --- a/src/find_interaction_cluster/nt_and_community.py +++ b/src/find_interaction_cluster/nt_and_community.py @@ -26,36 +26,46 @@ import multiprocessing as mp from .sf_and_communities import get_key -def get_nt_frequency(cnx: sqlite3.Connection, list_exon: List[str] - ) -> pd.DataFrame: +def get_nt_frequency(cnx: sqlite3.Connection, list_ft: List[str], + feature: str) -> pd.DataFrame: """ - Get the frequency of every nucleotides for exons in list_exon. + Get the frequency of every nucleotides for features in list_ft. :param cnx: Connection to chia-pet database - :param list_exon: The list of exons for which we want to get \ + :param list_ft: The list of exons for which we want to get + :param feature: the kind of feature analysed :return: the frequency of nucleotides for the list of exons. >>> d = get_nt_frequency(sqlite3.connect(ConfigGraph.db_file), - ... ["1_1", "1_2"]) + ... ["1_1", "1_2"], "exon") >>> d[["id_exon", 'A', 'C', 'G', 'T']] ft id_exon A C G T 0 1_1 16.63480 34.60803 32.12237 16.63480 1 1_2 16.06426 26.10442 39.75904 18.07229 + >>> d = get_nt_frequency(sqlite3.connect(ConfigGraph.db_file), + ... ['1', '2'], "gene") + >>> d[["id_gene", 'A', 'C', 'G', 'T']] + ft id_gene A C G T + 0 1 29.49376 18.34271 18.43874 33.72479 + 1 2 31.90401 16.40251 18.79033 32.90315 """ + if feature == "gene": + list_ft = [int(ft) for ft in list_ft] query = f""" - SELECT ft, id_exon, frequency - FROM cin_exon_frequency - WHERE id_exon IN {tuple(list_exon)} + SELECT ft, id_{feature}, frequency + FROM cin_{feature}_frequency + WHERE id_{feature} IN {tuple(list_ft)} AND ft_type="nt" """ df = pd.read_sql_query(query, cnx) - df = df.pivot_table(index="id_exon", columns="ft", values="frequency")\ + df = df.pivot_table(index=f"id_{feature}", columns="ft", values="frequency")\ .reset_index() + df[f"id_{feature}"] = df[f"id_{feature}"].astype(str) return df def get_community_table(communities: List[List[str]], - size_threshold: int) -> pd.DataFrame: + size_threshold: int, feature: str) -> pd.DataFrame: """ return the table indicating the name of the exons and the \ the name of the community. @@ -63,22 +73,29 @@ def get_community_table(communities: List[List[str]], :param communities: List of community of exons :param size_threshold: The required size a community must \ have to be considered + :param feature: The kind of feature analysed :return: table of community >>> c = [['1_1', '2_5'], ['7_9', '4_19', '3_3']] - >>> get_community_table(c, 3) + >>> get_community_table(c, 3, 'exon') community id_exon community_size 0 C1 7_9 3 1 C1 4_19 3 2 C1 3_3 3 + >>> c = [['1', '2'], ['7', '49', '3']] + >>> get_community_table(c, 3, 'gene') + community id_gene community_size + 0 C1 7 3 + 1 C1 49 3 + 2 C1 3 3 """ - dic = {"community": [], "id_exon": [], "community_size": []} + dic = {"community": [], f"id_{feature}": [], "community_size": []} for k, comm in enumerate(communities): if len(comm) >= size_threshold: name = f'C{k}' clen = len(comm) for exon in comm: dic["community"].append(name) - dic['id_exon'].append(exon) + dic[f'id_{feature}'].append(exon) dic["community_size"].append(clen) return pd.DataFrame(dic) @@ -114,12 +131,13 @@ def lmm_maker(df: pd.DataFrame, outfile: Path, nt: str) -> float: folder = outfile.parent / "diagnostics" folder.mkdir(parents=True, exist_ok=True) partial_name = outfile.name.replace('.txt', '') - return lmm(df, str(folder), partial_name)["Pr(>Chisq)"][1] + res = lmm(df, str(folder), partial_name) + return res["Pr(>Chisq)"][1] def get_stat_nt_communities(df: pd.DataFrame, project: str, weight: int, global_weight: int, same_gene: bool, - nt: str + nt: str, feature: str = "exon", ) -> pd.Series: """ Get data (proportion of `reg` regulated exons by a splicing factor @@ -136,6 +154,7 @@ def get_stat_nt_communities(df: pd.DataFrame, project: str, weight: int, :param same_gene: Say if we consider as co-localised, exons within the \ same gene (True) or not (False) (default False) :param nt: The nucleotide of interest + :param feature: The king of feature analysed """ logging.debug(f"lmm for {project}, w:{weight}, " f"g:{global_weight} nt: {nt}") @@ -143,7 +162,8 @@ def get_stat_nt_communities(df: pd.DataFrame, project: str, weight: int, outfile = ConfigGraph.get_community_file(project, weight, global_weight, - same_gene, f"{nt}_stat.txt", + same_gene, feature, + f"{nt}_stat.txt", True) nfolder = outfile.parent / "nt_analysis" nfolder.mkdir(exist_ok=True, parents=True) @@ -156,7 +176,7 @@ def get_stat_nt_communities(df: pd.DataFrame, project: str, weight: int, def create_dataframe(project: str, weight: int, global_weight: int, - same_gene: bool) -> pd.DataFrame: + same_gene: bool, feature: str = 'exon') -> pd.DataFrame: """ :param project: The name of the project of interest :param weight: The minimum weight of interaction to consider @@ -166,26 +186,29 @@ def create_dataframe(project: str, weight: int, global_weight: int, seen in `global_weight` project are taken into account :param same_gene: Say if we consider as co-localised, exons within the \ same gene (True) or not (False) + :param feature: The kind of feature to analyse :return: A dataframe containing the frequency of every nucleotides \ of every exon in a large community """ - size_threshold = 50 + size_threshold = 10 if feature == "gene" else 50 result = ConfigGraph.get_community_file(project, weight, global_weight, - same_gene, + same_gene, feature, "_communities.txt") communities = get_communities(result, 0) ncommunities = [c for c in communities if len(c) >= size_threshold] cnx = sqlite3.connect(ConfigGraph.db_file) list_exon = reduce(lambda x, y: x + y, ncommunities) - df = get_nt_frequency(cnx, list_exon) - df_com = get_community_table(communities, size_threshold) - df = df.merge(df_com, how="left", on="id_exon") + df = get_nt_frequency(cnx, list_exon, feature) + df_com = get_community_table(communities, size_threshold, feature) + df = df.merge(df_com, how="left", on=f"id_{feature}") return df def multiple_nt_lmm_launcher(ps: int, weights: List[int], global_weights: List[int], - same_gene: bool, logging_level: str = "DISABLE"): + same_gene: bool, + feature: str = 'exon', + logging_level: str = "DISABLE"): """ Launch the statistical analysis for every @@ -197,6 +220,7 @@ def multiple_nt_lmm_launcher(ps: int, weights: List[int], seen in `global_weight` project are taken into account :param same_gene: Say if we consider as co-localised exon within the \ same gene + :param feature: The kind of analysed feature :param logging_level: Level of information to display """ ConfigGraph.community_folder.mkdir(exist_ok=True, parents=True) @@ -218,15 +242,16 @@ def multiple_nt_lmm_launcher(ps: int, weights: List[int], if ckey in dic_df: df = dic_df[ckey] else: - df = create_dataframe(project, weight, global_weight, same_gene) + df = create_dataframe(project, weight, global_weight, same_gene, + feature) nfile_table = ConfigGraph.get_community_file(project, weight, global_weight, - same_gene, + same_gene, feature, f"nt_table.txt", True) df.to_csv(nfile_table, sep="\t", index=False) dic_df[ckey] = df - args = [df, project, weight, global_weight, same_gene, nt] + args = [df, project, weight, global_weight, same_gene, nt, feature] if ckey not in processes.keys(): processes[ckey] = [pool.apply_async(get_stat_nt_communities, args)] else: @@ -241,7 +266,8 @@ def multiple_nt_lmm_launcher(ps: int, weights: List[int], fdf["padj"] = multipletests(fdf['pval'].values, method='fdr_bh')[1] outfile = ConfigGraph.get_community_file(project, weight, dic_project[project], - same_gene, f"lmm-nt_stat.txt", + same_gene, feature, + f"lmm-nt_stat.txt", True) nfolder = outfile.parent / "nt_analysis" noutfile = nfolder / outfile.name diff --git a/src/find_interaction_cluster/sf_and_communities.py b/src/find_interaction_cluster/sf_and_communities.py index 724406d3..b950c8b2 100644 --- a/src/find_interaction_cluster/sf_and_communities.py +++ b/src/find_interaction_cluster/sf_and_communities.py @@ -28,9 +28,9 @@ from rpy2.robjects import r, pandas2ri from pathlib import Path -def get_exon_regulated_in_communities(community: List[str], - regulated_exons: List[str] - ) -> Tuple[int, int, float]: +def get_ft_regulated_in_communities(community: List[str], + regulated_exons: List[str] + ) -> Tuple[int, int, float]: """ :param community: strongly co-localized group of exons belonging to \ @@ -83,7 +83,7 @@ def get_stat_community_exons(cnx: sqlite3.Connection, dic: Dict[str, List], n_reg, n_tot = np.nan, np.nan for cur_com, name in [[full_com, 'All-community'], [ctrl_exons, "FASTERDB"]]: reg_community, tot_commmunity, prop = \ - get_exon_regulated_in_communities(cur_com, regulated_exons) + get_ft_regulated_in_communities(cur_com, regulated_exons) if name == 'All-community': n_reg = reg_community n_tot = tot_commmunity @@ -211,7 +211,7 @@ def glmm_maker(expanded_df: pd.DataFrame, outfile: Path) -> float: def glmm_statistics(df: pd.DataFrame, sf_name: str, reg: str, project: str, weight: int, global_weight: int, - same_gene: bool) -> pd.Series: + same_gene: bool, feature: str = "exon") -> pd.Series: """ Create the glmm statistics for a given splicing factor with \ given communities. @@ -227,13 +227,14 @@ def glmm_statistics(df: pd.DataFrame, sf_name: str, reg: str, seen in `global_weight` project are taken into account :param same_gene: Say if we consider as co-localised, exons within the \ same gene (True) or not (False) (default False) + :param feature: The kind of feature analysed :return: The glmm pvalue among with other informations """ ndf = df.loc[-df['community'].isin(["All-community", "FASTERDB"]), :].copy() expanded_df = expand_dataframe(ndf) outfile = ConfigGraph.get_community_file(project, weight, global_weight, - same_gene, + same_gene, feature, f"{sf_name}_{reg}_stat.txt", True) noutfold = outfile.parent / "expanded_df" noutfold.mkdir(exist_ok=True, parents=True) @@ -244,9 +245,36 @@ def glmm_statistics(df: pd.DataFrame, sf_name: str, reg: str, "pval": pval}) +def adapt_regulated_list(cnx: sqlite3.Connection, + exon_list: List, sf_name: str, reg: str, + feature: str = 'exon') -> List: + """ + Transform a list containing exons into a list containing gene only if \ + feature is equal to 'gene'. + + :param cnx: connection to chia-pet database + :param exon_list: A list of exons + :param sf_name: The name of the splicing factor regulating the exons \ + in exon_list + :param reg: The regulation of the exons regulated by sf_name + :param feature: The kind of feature to analyse + :return: if feature is exon then return the original list else, + return the list containing the regulated gene + """ + if feature == 'exon': + return exon_list + dic = {"down": "up", "up": "down"} + regulated_dic, number = get_every_events_4_a_sl(cnx, sf_name, dic[reg]) + exon_list2 = regulated_dic[f"{sf_name}_{dic[reg]}"] + genes1 = list(np.unique([exon.split('_')[0] for exon in exon_list])) + genes2 = list(np.unique([exon.split('_')[0] for exon in exon_list2])) + return [gene for gene in genes1 if gene not in genes2] + + def get_stat4communities(sf_name: str, reg: str, project: str, weight: int, - global_weight: int, same_gene: bool + global_weight: int, same_gene: bool, + feature: str = 'exon', ) -> Tuple[pd.DataFrame, pd.Series]: """ Get data (proportion of `reg` regulated exons by a splicing factor @@ -262,23 +290,31 @@ def get_stat4communities(sf_name: str, reg: str, seen in `global_weight` project are taken into account :param same_gene: Say if we consider as co-localised, exons within the \ same gene (True) or not (False) (default False) + :param feature: The kind of analysed feature """ logging.debug(f"Working on {sf_name}-{reg}, for {project}, w:{weight}, " f"g:{global_weight}") cnx = sqlite3.connect(ConfigGraph.db_file) result = ConfigGraph.get_community_file(project, weight, global_weight, - same_gene, + same_gene, feature, "_communities.txt") communities = get_communities(result, 0) regulated_dic, number = get_every_events_4_a_sl(cnx, sf_name, reg) - reg_exons = regulated_dic[sf_name + "_" + reg] + reg_ft = regulated_dic[sf_name + "_" + reg] + reg_ft = adapt_regulated_list(cnx, reg_ft, sf_name, reg, feature) + if len(reg_ft) == 0: + raise IndexError(f'There no {feature} {reg} regulated specifically ' + f'by {sf_name}') d = {"community": [], "reg_in_community": [], "community_size": [], "%reg in community": [], 'pval': []} - d, n2, t2 = get_stat_community_exons(cnx, d, communities, reg_exons) + d, n2, t2 = get_stat_community_exons(cnx, d, communities, reg_ft) + threshold = 49 + if feature == "gene": + threshold = 9 for k, community in enumerate(communities): - if len(community) > 49: + if len(community) > threshold: reg_community, tot_commmunity, prop = \ - get_exon_regulated_in_communities(community, reg_exons) + get_ft_regulated_in_communities(community, reg_ft) d["community"].append(f"C{k + 1}") d['reg_in_community'].append(reg_community) d['community_size'].append(tot_commmunity) @@ -292,7 +328,7 @@ def get_stat4communities(sf_name: str, reg: str, d['project'] = [project] * len(d["community"]) df = pd.DataFrame(d) s = glmm_statistics(df, sf_name, reg, project, weight, global_weight, - same_gene) + same_gene, feature) return df, s @@ -327,7 +363,8 @@ def get_key(project: str, weight: int) -> str: def multiple_stat_launcher(ps: int, weights: List[int], global_weights: List[int], - same_gene: bool, logging_level: str = "DISABLE"): + same_gene: bool, feature: str = 'exon', + logging_level: str = "DISABLE"): """ Launch the statistical analysis for every @@ -339,12 +376,13 @@ def multiple_stat_launcher(ps: int, weights: List[int], seen in `global_weight` project are taken into account :param same_gene: Say if we consider as co-localised exon within the \ same gene + :param feature: The feature we want to analyse :param logging_level: Level of information to display """ ConfigGraph.community_folder.mkdir(exist_ok=True, parents=True) logging_def(ConfigGraph.community_folder, __file__, logging_level) - logging.info("Checking if communities contains often exons regulated by a" - " splicing factor") + logging.info(f"Checking if communities contains often {feature}s " + f"regulated by a splicing factor") sf_list = get_sfname() global_weights = list(np.unique(global_weights)) weights = list(np.unique(weights)) @@ -356,7 +394,8 @@ def multiple_stat_launcher(ps: int, weights: List[int], for project, weight, sf_name, reg in condition: ckey = get_key(project, weight) global_weight = dic_project[project] - args = [sf_name, reg, project, weight, global_weight, same_gene] + args = [sf_name, reg, project, weight, global_weight, same_gene, + feature] if ckey not in processes.keys(): processes[ckey] = [pool.apply_async(get_stat4communities, args)] else: @@ -371,7 +410,7 @@ def multiple_stat_launcher(ps: int, weights: List[int], df = pd.concat(list_df, axis=0, ignore_index=True) outfile = ConfigGraph.get_community_file(project, weight, dic_project[project], - same_gene, + same_gene, feature, "_stat.txt", True) df.to_csv(outfile, sep="\t", index=False) glm_df = pd.DataFrame(list_series) @@ -379,7 +418,7 @@ def multiple_stat_launcher(ps: int, weights: List[int], method='fdr_bh')[1] outfile = ConfigGraph.get_community_file(project, weight, dic_project[project], - same_gene, + same_gene, feature, "_glmm_stat.txt", True) glm_df.to_csv(outfile, sep="\t", index=False) -- GitLab