From 10a59086a0a49b66d02f8c27ea2ad2e754549301 Mon Sep 17 00:00:00 2001
From: Fontrodona Nicolas <nicolas.fontrodona@ens-lyon.fr>
Date: Mon, 23 Nov 2020 10:05:26 +0100
Subject: [PATCH] src/find_interaction_cluster/nt_and_community.py:
 modification of multiple_nt_lmm_launcher launcher

---
 .../nt_and_community.py | 54 ++++++++-----------
 1 file changed, 22 insertions(+), 32 deletions(-)

diff --git a/src/find_interaction_cluster/nt_and_community.py b/src/find_interaction_cluster/nt_and_community.py
index 8e7b6394..d95347d5 100644
--- a/src/find_interaction_cluster/nt_and_community.py
+++ b/src/find_interaction_cluster/nt_and_community.py
@@ -22,7 +22,6 @@ from .community_finder import get_projects
 from ..logging_conf import logging_def
 from itertools import product
 import multiprocessing as mp
-from .sf_and_communities import get_key
 
 
 def get_nt_frequency(cnx: sqlite3.Connection, list_ft: List[str],
@@ -264,6 +263,7 @@ def get_stat_nt_communities(df: pd.DataFrame, project: str, weight: int,
     res['pval'] = pval
     nt_ctrl_table = noutfile.parent / noutfile.name.replace("_stat.txt",
                                                             "_ctrl.txt")
+    print(df.head())
     ndf = create_ctrl_community(df, nt_ctrl_table, feature, region)
     sum_df = lmm_maker_summary(ndf, outfile, nt)
     outfile_ctrl = ConfigGraph.get_community_file(project, weight,
@@ -334,44 +334,34 @@ def multiple_nt_lmm_launcher(ps: int,
     project = get_projects(global_weight, project)
     nt_list = ["A", "C", "G", "T", "S", "W"]
     condition = list(product([project], [weight], nt_list))
-    processes = {}
+    processes = []
     pool = mp.Pool(processes=min(ps, len(condition)))
     logging.debug("Calculating stats...")
 
-    dic_df = {}
     for project, weight, nt in condition:
-        ckey = get_key(project, weight)
-        if ckey in dic_df:
-            df = dic_df[ckey]
-        else:
-            df = create_dataframe(project, weight, global_weight, same_gene,
-                                  feature)
-            nfile_table = ConfigGraph.get_community_file(
+        df = create_dataframe(project, weight, global_weight, same_gene,
+                              feature)
+        nfile_table = ConfigGraph.get_community_file(
            project, weight, global_weight, same_gene, feature,
            f"_nt_table.txt", "sf_community_enrichment")
-            df.to_csv(nfile_table, sep="\t", index=False)
-            dic_df[ckey] = df
+        df.to_csv(nfile_table, sep="\t", index=False)
+
+        args = [df, project, weight, global_weight, same_gene, nt, feature, region]
 
-        if ckey in processes:
-            processes[ckey].append(
-                pool.apply_async(get_stat_nt_communities, args))
-        else:
-            processes[ckey] = [pool.apply_async(get_stat_nt_communities, args)]
-    for p, value in processes.items():
-        project, weight = p.split("_")
-        results = [p.get(timeout=None) for p in value]
-        pool.close()
-        pool.join()
-        fdf = pd.DataFrame(results)
-        fdf["padj"] = multipletests(fdf['pval'].values, method='fdr_bh')[1]
-        outfile = ConfigGraph.get_community_file(project, weight,
-                                                 global_weight,
-                                                 same_gene, feature,
-                                                 f"lmm-nt_stat.txt",
-                                                 "sf_community_enrichment")
-        nfolder = outfile.parent / "nt_analysis"
-        noutfile = nfolder / outfile.name
-        fdf.to_csv(noutfile, sep="\t", index=False)
+        processes.append(pool.apply_async(get_stat_nt_communities, args))
+    results = [p.get(timeout=None) for p in processes]
+    pool.close()
+    pool.join()
+    fdf = pd.DataFrame(results)
+    fdf["padj"] = multipletests(fdf['pval'].values, method='fdr_bh')[1]
+    outfile = ConfigGraph.get_community_file(project, weight,
+                                             global_weight,
+                                             same_gene, feature,
+                                             f"lmm-nt_stat.txt",
+                                             "sf_community_enrichment")
+    nfolder = outfile.parent / "nt_analysis"
+    noutfile = nfolder / outfile.name
+    fdf.to_csv(noutfile, sep="\t", index=False)
 
 
 if __name__ == "__main__":
--
GitLab
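
Note for review: the sketch below reproduces, outside the repository, the pattern the new multiple_nt_lmm_launcher body follows: every (project, weight, nt) condition is submitted to a single multiprocessing pool with apply_async, the async handles are collected in one flat list, and a single Benjamini-Hochberg correction (multipletests(..., method='fdr_bh')) is applied across all p-values before writing one table. This is a minimal standalone illustration, not project code; compute_stat, the toy conditions, and the fake p-values are stand-ins for get_stat_nt_communities and the real inputs.

import multiprocessing as mp
from itertools import product

import pandas as pd
from statsmodels.stats.multitest import multipletests


def compute_stat(project: str, weight: int, nt: str) -> dict:
    """Stand-in for get_stat_nt_communities: return one result row."""
    # A real worker would fit the LMM here; this only fakes a p-value.
    fake_pval = (hash((project, weight, nt)) % 1000 + 1) / 1001
    return {"project": project, "weight": weight, "nt": nt, "pval": fake_pval}


def launcher(ps: int) -> pd.DataFrame:
    """Submit every condition to one pool, then correct all p-values at once."""
    conditions = list(product(["proj1"], [1], ["A", "C", "G", "T", "S", "W"]))
    pool = mp.Pool(processes=min(ps, len(conditions)))
    # One flat list of async results, mirroring `processes = []` in the patch.
    processes = [pool.apply_async(compute_stat, args) for args in conditions]
    results = [p.get(timeout=None) for p in processes]
    pool.close()
    pool.join()
    fdf = pd.DataFrame(results)
    # Single BH (fdr_bh) pass over every test, as in the new launcher body.
    fdf["padj"] = multipletests(fdf["pval"].values, method="fdr_bh")[1]
    return fdf


if __name__ == "__main__":
    print(launcher(ps=4))

Compared with the removed code, which grouped async results in a dict keyed by get_key(project, weight) and ran one correction and one output file per key, the flat list yields a single lmm-nt_stat.txt with one FDR pass over all nucleotides; since condition is built from a single [project] and [weight], the per-key grouping appears to have been redundant.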