From aacaf00f74a197f22c837ec49fc9aca81db4836b Mon Sep 17 00:00:00 2001 From: Fontrodona Nicolas <nicolas.fontrodona@ens-lyon.fr> Date: Mon, 23 Nov 2020 15:47:31 +0100 Subject: [PATCH] src/find_interaction_cluster/nt_and_community.py: add permutations tests --- .../nt_and_community.py | 291 +++++++++++++++--- 1 file changed, 242 insertions(+), 49 deletions(-) diff --git a/src/find_interaction_cluster/nt_and_community.py b/src/find_interaction_cluster/nt_and_community.py index d95347d5..a899ebac 100644 --- a/src/find_interaction_cluster/nt_and_community.py +++ b/src/find_interaction_cluster/nt_and_community.py @@ -7,12 +7,11 @@ Description: The goal of this script is to see if the nucleotide \ frequency is not random between communities """ - import pandas as pd import logging import sqlite3 from .config import ConfigGraph, get_communities -from typing import List +from typing import List, Tuple, Dict from doctest import testmod from functools import reduce from pathlib import Path @@ -22,6 +21,9 @@ from .community_finder import get_projects from ..logging_conf import logging_def from itertools import product import multiprocessing as mp +import numpy as np +from .radomization_test_ppi import get_pvalue +from tqdm import tqdm def get_nt_frequency(cnx: sqlite3.Connection, list_ft: List[str], @@ -82,20 +84,20 @@ def get_community_table(communities: List[List[str]], >>> c = [['1_1', '2_5'], ['7_9', '4_19', '3_3']] >>> get_community_table(c, 3, 'exon') community id_exon community_size - 0 C1 7_9 3 - 1 C1 4_19 3 - 2 C1 3_3 3 + 0 C2 7_9 3 + 1 C2 4_19 3 + 2 C2 3_3 3 >>> c = [['1', '2'], ['7', '49', '3']] >>> get_community_table(c, 3, 'gene') community id_gene community_size - 0 C1 7 3 - 1 C1 49 3 - 2 C1 3 3 + 0 C2 7 3 + 1 C2 49 3 + 2 C2 3 3 """ dic = {"community": [], f"id_{feature}": [], "community_size": []} for k, comm in enumerate(communities): if len(comm) >= size_threshold: - name = f'C{k}' + name = f'C{k + 1}' clen = len(comm) for exon in comm: dic["community"].append(name) @@ -173,7 +175,7 @@ def lmm_maker_summary(df: pd.DataFrame, outfile: Path, nt: str res_df.rename({'index': 'community'}, inplace=True, axis=1) res_df['community'] = res_df['community'].str.replace('community', '') res_df.loc[res_df['community'] == "(Intercept)", "community"] = "C-CTRL" - mean_df = df[[nt, "community", "community_size"]].\ + mean_df = df[[nt, "community", "community_size"]]. \ groupby(["community", "community_size"]).mean().reset_index() return res_df.merge(mean_df, how="left", on="community") @@ -193,21 +195,18 @@ def get_ft_id(cnx: sqlite3.Connection, feature: str = "exon") -> List[str]: return [str(cid[0]) for cid in res] -def create_ctrl_community(df: pd.DataFrame, outfile: Path, +def create_ctrl_community(df: pd.DataFrame, feature: str = 'exon', region: str = "" ) -> pd.DataFrame: """ :param df: A dataframe containing the frequency of each feature in a \ community. - :param outfile: The output table containing frequencies :param feature: The kind of feature to analyse :param region: only use if feature is 'gene'. Used to focus on \ a given region in genes (can be gene, exon, intron). 
:return: A dataframe containing the frequency of every nucleotides \ of every exon in a large community """ - if outfile.is_file(): - return pd.read_csv(outfile, sep="\t") size_threshold = 10 if feature == "gene" else 50 cnx = sqlite3.connect(ConfigGraph.db_file) ft_id = get_ft_id(cnx, feature) @@ -218,15 +217,165 @@ def create_ctrl_community(df: pd.DataFrame, outfile: Path, df_nt = df_nt.merge(df_com, how="left", on=f"id_{feature}") df_nt['community'] = ['C-CTRL'] * df_nt.shape[0] df = pd.concat([df, df_nt], axis=0, ignore_index=True) - df.to_csv(outfile, sep="\t", index=False) return df +def lmm_with_ctrl(df: pd.DataFrame, feature: str, region: str, + nt: str, outfile_diag: Path) -> pd.DataFrame: + """ + + :param df: df: A dataframe containing the frequency of each nucleotide \ + in each exons belonging to a community. + :param feature: The kind of feature analysed + :param region: the region of interest to extract from gene + :param nt: The nucleotide of interest + :param outfile_diag: File from which the diagnostics folder will be \ + inferred + :return: The dataframe with the p-value compared to the control \ + list of exons. + """ + ndf = create_ctrl_community(df, feature, region) + return lmm_maker_summary(ndf, outfile_diag, nt) + + +def get_feature_by_community(df: pd.DataFrame, feature: str) -> Dict: + """ + Create a dictionary containing the exons contained in each community. + + :param df: A dataframe containing the frequency of each nucleotide \ + in each exons belonging to a community. + :param feature: the feature of interest (exon, gene) + :return: A dictionary linking each community to the exons it contains + + >>> dataf = pd.DataFrame({"id_gene": ['1', '2', '3', '4'], + ... 'community': ['C1', 'C1', 'C2', 'C2']}) + >>> get_feature_by_community(dataf, 'gene') + {'C1': ['1', '2'], 'C2': ['3', '4']} + >>> dataf.rename({"id_gene": "id_exon"}, axis=1, inplace=True) + >>> get_feature_by_community(dataf, 'exon') + {'C1': ['1', '2'], 'C2': ['3', '4']} + """ + dic = {} + for i in range(df.shape[0]): + com, id_ft = df.iloc[i, :][['community', f'id_{feature}']] + + if com in dic: + dic[com].append(id_ft) + else: + dic[com] = [id_ft] + return dic + + +def get_permutation_mean(df_ctrl: pd.DataFrame, + nt: str, size: int, iteration: int) -> List[float]: + """ + Randomly sample `size` `feature` from `df_ctrl` to extract `iteration` \ + of `nt` frequencies from it. + + :param df_ctrl: A dataframe containing the frequency of each nucleotide \ + in each exons/gene in fasterdb. + :param nt: A nucleotide of interest + :param size: The size of each sub samples to create + :param iteration: The number of sub samples to create + :return: The list of mean frequencies of `nt` in each subsample + """ + return [ + float(np.mean(df_ctrl[nt].sample(size, replace=True).values)) + for _ in range(iteration) + ] + + +def perm_community_pval(row: pd.Series, df_ctrl: pd.DataFrame, + nt: str, iteration: int) -> Tuple[float, float, str]: + """ + Randomly sample `size` `feature` from `df_ctrl` to extract `iteration` \ + of `nt` frequencies from it. + + :param row: A line of a dataframe containing the frequency of \ + each feature inside a community. + :param df_ctrl: A dataframe containing the frequency of each nucleotide \ + in each exons/gene in fasterdb. 
+ :param nt: A nucleotide of interest + :param iteration: The number of sub samples to create + :return: The ctrl mean frequency value of `nt` \ + the pvalue and the regulation of the enrichment/impoverishment \ + of the community in `row` compared to control exons. + """ + list_values = get_permutation_mean(df_ctrl, nt, row["community_size"], + iteration) + pval, reg = get_pvalue(np.array(list_values), row[nt], iteration) + return float(np.mean(list_values)), pval, reg + + +def perm_pvalues(df: pd.DataFrame, df_ctrl: pd.DataFrame, feature: str, + nt: str, iteration: int, dic_com: Dict) -> pd.DataFrame: + """ + Randomly sample `size` `feature` from `df_ctrl` to extract `iteration` \ + of `nt` frequencies from it. + + :param df: A dataframe containing the frequency of each nucleotide \ + in each exons belonging to a community. + :param df_ctrl: A dataframe containing the frequency of each nucleotide \ + in each exons/gene in fasterdb. + :param feature: The feature of interest (gene, exon) + :param nt: A nucleotide of interest + :param iteration: The number of sub samples to create + :param dic_com: A dictionary linking each community to the exons \ + it contains. + :return: The dataframe containing p-values and regulation \ + indicating the enrichment of + """ + list_pval, list_reg, mean_ctrl = ([] for _ in range(3)) + for i in tqdm(range(df.shape[0]), desc="performing permutations"): + row = df.iloc[i, :] + res = perm_community_pval(row, + df_ctrl.loc[ + -df_ctrl[f'id_{feature}' + ].isin(dic_com[row['community']]), + :], + nt, iteration) + [x.append(y) for x, y in zip([mean_ctrl, list_pval, list_reg], res)] + adj_pvals = multipletests(list_pval, alpha=0.05, + method='fdr_bh', + is_sorted=False, + returnsorted=False)[1] + adj_regs = [list_reg[i] if adj_pvals[i] <= 0.05 else " . " + for i in range(len(list_reg))] + df[f'mean_{iteration}_ctrl'] = mean_ctrl + df[f'p-adj'] = adj_pvals + df[f'reg-adj'] = adj_regs + return df + + +def perm_with_ctrl(df: pd.DataFrame, feature: str, + nt: str, df_ctrl: pd.DataFrame, dic_com: Dict, + iteration: int) -> pd.DataFrame: + """ + + :param df: A dataframe containing the frequency of each nucleotide \ + in each exons belonging to a community. + :param feature: The kind of feature analysed + :param nt: The nucleotide of interest + :param df_ctrl: A dataframe containing the frequency of each nucleotide \ + in each exons/gene in fasterdb. + :param dic_com: A dictionary linking each community to the exons \ + it contains. + :param iteration: The number of sub samples to create + :return: The dataframe with the p-value compared to the control \ + list of exons. + """ + mean_df = df[[nt, "community", "community_size"]]. \ + groupby(["community", "community_size"]).mean().reset_index() + return perm_pvalues(mean_df, df_ctrl, feature, nt, iteration, dic_com) + + def get_stat_nt_communities(df: pd.DataFrame, project: str, weight: int, global_weight: int, same_gene: bool, - nt: str, feature: str = "exon", - region: str = "", - ) -> pd.Series: + nt: str, df_ctrl: pd.DataFrame, + dic_com: Dict, + feature: str = "exon", + region: str = "", test_type: str = "", + iteration: int = 1000) -> pd.Series: """ Get data (proportion of `reg` regulated exons by a splicing factor `sf_name` within a community) for every community. 
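
For context, the permutation machinery added above can be summarised by the following self-contained sketch. It mirrors get_permutation_mean, perm_community_pval and perm_pvalues, but empirical_pvalue is a hypothetical stand-in for get_pvalue (imported from radomization_test_ppi, whose definition is outside this patch), and its two-sided counting rule is an assumption. Unlike perm_pvalues, the sketch draws controls from the full control table instead of first excluding the community's own features.

# Minimal standalone sketch of the permutation test introduced in this patch.
# empirical_pvalue is a hypothetical stand-in for get_pvalue; its behaviour
# (two-sided empirical p-value plus a direction flag) is an assumption.
import numpy as np
import pandas as pd
from statsmodels.stats.multitest import multipletests


def empirical_pvalue(ctrl_means: np.ndarray, value: float,
                     iteration: int):
    """Assumed behaviour: empirical two-sided p-value and direction."""
    higher = np.sum(ctrl_means >= value)
    lower = np.sum(ctrl_means <= value)
    pval = min(2 * min(higher, lower) / iteration, 1.0)
    reg = " + " if value > np.mean(ctrl_means) else " - "
    return pval, reg


def permutation_test(com_means: pd.DataFrame, df_ctrl: pd.DataFrame,
                     nt: str, iteration: int = 1000) -> pd.DataFrame:
    """Compare each community's mean `nt` frequency to control subsamples."""
    pvals, regs, ctrl_means = [], [], []
    for _, row in com_means.iterrows():
        # Draw `iteration` size-matched control subsamples (with replacement)
        # and keep the mean frequency of each subsample.
        dist = np.array([df_ctrl[nt].sample(int(row["community_size"]),
                                            replace=True).mean()
                         for _ in range(iteration)])
        pval, reg = empirical_pvalue(dist, row[nt], iteration)
        ctrl_means.append(dist.mean())
        pvals.append(pval)
        regs.append(reg)
    # Benjamini-Hochberg correction, as in perm_pvalues above.
    adj = multipletests(pvals, alpha=0.05, method="fdr_bh")[1]
    com_means = com_means.copy()
    com_means[f"mean_{iteration}_ctrl"] = ctrl_means
    com_means["p-adj"] = adj
    com_means["reg-adj"] = [r if p <= 0.05 else " . "
                            for r, p in zip(regs, adj)]
    return com_means
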
@@ -242,17 +391,22 @@ def get_stat_nt_communities(df: pd.DataFrame, project: str, weight: int, :param same_gene: Say if we consider as co-localised, exons within the \ same gene (True) or not (False) (default False) :param nt: The nucleotide of interest - :param feature: The king of feature analysed + :param df_ctrl: A dataframe containing the frequency of each nucleotide \ + in each exons/gene in fasterdb. + :param dic_com: A dictionary linking each community to the exons \ + it contains. + :param feature: The kind of feature analysed :param region: the region of interest to extract from gene + :param test_type: The type of test to make (permutation or lmm) + :param iteration: The number of sub samples to create """ - logging.debug(f"lmm for {project}, w:{weight}, " + logging.debug(f"{test_type} for {project}, w:{weight}, " f"g:{global_weight} nt: {nt}") res = {"project": [], "nt": [], 'pval': []} - outfile = ConfigGraph.get_community_file(project, weight, global_weight, same_gene, feature, - f"{nt}_stat.txt", + f"{nt}_stat_{test_type}.txt", "sf_community_enrichment") nfolder = outfile.parent / "nt_analysis" nfolder.mkdir(exist_ok=True, parents=True) @@ -261,23 +415,25 @@ def get_stat_nt_communities(df: pd.DataFrame, project: str, weight: int, res['project'] = project res['nt'] = nt res['pval'] = pval - nt_ctrl_table = noutfile.parent / noutfile.name.replace("_stat.txt", - "_ctrl.txt") - print(df.head()) - ndf = create_ctrl_community(df, nt_ctrl_table, feature, region) - sum_df = lmm_maker_summary(ndf, outfile, nt) outfile_ctrl = ConfigGraph.get_community_file(project, weight, global_weight, same_gene, feature, - f"{nt}_VS_CTRL_stat.txt", + f"{nt}_VS_CTRL_stat_" + f"{test_type}.txt", "sf_community_enrichment") + if test_type == "lmm": + rdf = lmm_with_ctrl(df, feature, region, nt, outfile) + else: + rdf = perm_with_ctrl(df, feature, nt, df_ctrl, dic_com, iteration) noutfile_ctrl = nfolder / outfile_ctrl.name - sum_df.to_csv(noutfile_ctrl, sep="\t", index=False) + rdf.to_csv(noutfile_ctrl, sep="\t", index=False) return pd.Series(res) def create_dataframe(project: str, weight: int, global_weight: int, - same_gene: bool, feature: str = 'exon') -> pd.DataFrame: + same_gene: bool, feature: str = 'exon', + region: str = "", from_communities: bool = True + ) -> pd.DataFrame: """ :param project: The name of the project of interest :param weight: The minimum weight of interaction to consider @@ -288,29 +444,65 @@ def create_dataframe(project: str, weight: int, global_weight: int, :param same_gene: Say if we consider as co-localised, exons within the \ same gene (True) or not (False) :param feature: The kind of feature to analyse + :param from_communities: True if we only select gene/exons + :param region: the region of interest to extract from gene :return: A dataframe containing the frequency of every nucleotides \ of every exon in a large community """ size_threshold = 10 if feature == "gene" else 50 - result = ConfigGraph.get_community_file(project, weight, global_weight, - same_gene, feature, - "_communities.txt") - communities = get_communities(result, 0) - ncommunities = [c for c in communities if len(c) >= size_threshold] cnx = sqlite3.connect(ConfigGraph.db_file) - list_exon = reduce(lambda x, y: x + y, ncommunities) - df = get_nt_frequency(cnx, list_exon, feature) - df_com = get_community_table(communities, size_threshold, feature) - df = df.merge(df_com, how="left", on=f"id_{feature}") + if from_communities: + result = ConfigGraph.get_community_file(project, weight, global_weight, + 
same_gene, feature, + "_communities.txt") + communities = get_communities(result, 0) + ncommunities = [c for c in communities if len(c) >= size_threshold] + list_exon = reduce(lambda x, y: x + y, ncommunities) + + df_com = get_community_table(communities, size_threshold, feature) + df = get_nt_frequency(cnx, list_exon, feature, region) + df = df.merge(df_com, how="left", on=f"id_{feature}") + else: + list_exon = get_ft_id(cnx, feature) + df = get_nt_frequency(cnx, list_exon, feature, region) return df +def create_dataframes(project, weight, global_weight, same_gene, feature, + region, test_type + ) -> Tuple[pd.DataFrame, pd.DataFrame, Dict]: + """ + :param weight: The weight of interaction to consider + :param global_weight: The global weighs to consider. if \ + the global weight is equal to 0 then the project `project` is \ + used. + :param project: The project name, used only if global_weight = 0 + :param same_gene: Say if we consider as co-localised exon within the \ + same gene + :param feature: The kind of analysed feature + :param region: the region of interest to extract from gene + :param test_type: The type of test to make (permutation or lmm) + """ + df = create_dataframe(project, weight, global_weight, same_gene, + feature, region) + if test_type == 'lmm': + df_ctrl = pd.DataFrame() + dic_com = {} + else: + df_ctrl = create_dataframe(project, weight, global_weight, same_gene, + feature, region, from_communities=False) + dic_com = get_feature_by_community(df, feature) + return df, df_ctrl, dic_com + + def multiple_nt_lmm_launcher(ps: int, weight: int, global_weight: int, project: str, same_gene: bool, feature: str = 'exon', region: str = '', + test_type: str = "lmm", + iteration: int = 1000, logging_level: str = "DISABLE"): """ Launch the statistical analysis for every @@ -325,6 +517,8 @@ def multiple_nt_lmm_launcher(ps: int, same gene :param feature: The kind of analysed feature :param region: the region of interest to extract from gene + :param test_type: The type of test to make (permutation or lmm) + :param iteration: The number of iteration to perform :param logging_level: Level of information to display """ ConfigGraph.community_folder.mkdir(exist_ok=True, parents=True) @@ -332,22 +526,21 @@ def multiple_nt_lmm_launcher(ps: int, logging.info("Checking if communities as an effect on nucleotide " "frequency") project = get_projects(global_weight, project) - nt_list = ["A", "C", "G", "T", "S", "W"] + nt_list = ["A"] # , "C", "G", "T", "S", "W"] condition = list(product([project], [weight], nt_list)) processes = [] pool = mp.Pool(processes=min(ps, len(condition))) - logging.debug("Calculating stats...") + logging.debug("Creating tables") + df, df_ctrl, dic_com = create_dataframes(project, weight, global_weight, + same_gene, feature, region, + test_type) for project, weight, nt in condition: - df = create_dataframe(project, weight, global_weight, same_gene, - feature) nfile_table = ConfigGraph.get_community_file( - project, weight, global_weight, same_gene, feature, - f"_nt_table.txt", "sf_community_enrichment") + project, weight, global_weight, same_gene, feature, + f"_nt_table.txt", "sf_community_enrichment") df.to_csv(nfile_table, sep="\t", index=False) - - - args = [df, project, weight, global_weight, same_gene, nt, feature, - region] + args = [df, project, weight, global_weight, same_gene, nt, df_ctrl, + dic_com, feature, region, test_type, iteration] processes.append(pool.apply_async(get_stat_nt_communities, args)) results = [p.get(timeout=None) for p in processes] 
     pool.close()
-- 
GitLab
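
Finally, a hypothetical invocation of the updated entry point, assuming the repository root is on PYTHONPATH so that src is importable as a package; the project name, weights, process count and logging level below are illustrative values, not defaults taken from the codebase.

# Hypothetical call of the updated launcher (illustrative values only).
from src.find_interaction_cluster.nt_and_community import (
    multiple_nt_lmm_launcher)

# test_type="permutation" routes each community through perm_with_ctrl and
# writes {nt}_VS_CTRL_stat_permutation.txt files under the nt_analysis folder;
# test_type="lmm" keeps the mixed-model comparison against the C-CTRL community.
multiple_nt_lmm_launcher(ps=4, weight=3, global_weight=0,
                         project="example_project", same_gene=True,
                         feature="exon", region="",
                         test_type="permutation", iteration=1000,
                         logging_level="INFO")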