From 93caebe123b084aa8b9056e9fe0f6ab356e7fe5e Mon Sep 17 00:00:00 2001 From: Fontrodona Nicolas <nicolas.fontrodona@ens-lyon.fr> Date: Tue, 17 Nov 2020 15:11:55 +0100 Subject: [PATCH] src/find_interaction_cluster/radomization_test_ppi.py: add 3 functions to check if the number communities that are enriched in common genes between dna and protein level is also enriched --- .../radomization_test_ppi.py | 233 +++++++++++++++++- 1 file changed, 220 insertions(+), 13 deletions(-) diff --git a/src/find_interaction_cluster/radomization_test_ppi.py b/src/find_interaction_cluster/radomization_test_ppi.py index fa046d4a..4a7e2e95 100644 --- a/src/find_interaction_cluster/radomization_test_ppi.py +++ b/src/find_interaction_cluster/radomization_test_ppi.py @@ -90,8 +90,8 @@ def get_common_genes(dna_gene: np.array, ppi_gene: np.array) -> int: def get_list_common_gene(community: str, dic_dna_gene: Dict[str, np.array], - ppi_size: int, ppi_gene: np.array, iteration: int - ) -> np.array: + ppi_size: int, ppi_gene: np.array, iteration: int, + use_seed: bool = True) -> np.array: """ Return the number of common gene between genes in the community \ `community`and a sublist of randomly chosen genes in `ppi_gene` of \ @@ -103,6 +103,7 @@ def get_list_common_gene(community: str, dic_dna_gene: Dict[str, np.array], :param ppi_size: The size of the ppi_gene subsample :param ppi_gene: All the gene interacting at protein level :param iteration: The number of time we want to repeat the process + :param use_seed: True to use a fixed seed, false else. :return: The number of gene in common between the genes in community \ `community` and the subsample of ppi_size @@ -119,7 +120,8 @@ def get_list_common_gene(community: str, dic_dna_gene: Dict[str, np.array], >>> get_list_common_gene("C1", dna_gene, 10, ppi_g, 5) array([5, 5, 5, 5, 5]) """ - np.random.seed(1) + if use_seed: + np.random.seed(1) common = [ get_common_genes( dic_dna_gene[community], @@ -180,11 +182,24 @@ def get_pvalue(values: np.array, target: float, iteration: int return pval, regulation +def adjust_regulation(x: pd.Series) -> str: + """ + Return the adjusted regulation. + + :param x: A row a the dataframe + :return: The adjusted row + """ + return x["regulation"] if x["p-adjust"] <= 0.05 else " . " + + def update_overlap_df(df_overlap: pd.DataFrame, dic_dna_gene: Dict[str, np.array], - ppi_gene: np.array, iteration: int - ) -> pd.DataFrame: + ppi_gene: np.array, iteration: int, + use_seed: bool = True, + ) -> Tuple[pd.DataFrame, np.array]: """ + Perform a randomization test to check if communities at \ + DNA level are more similar at protein level than randomly expected. :param df_overlap: A dataframe containing every gene community at dna \ level and the number of common gene with a gne community at protein \ @@ -193,36 +208,48 @@ def update_overlap_df(df_overlap: pd.DataFrame, level :param ppi_gene: All the gene interacting at protein level :param iteration: The number of time we want to repeat the process - :return: The updated Df (column p-value, p-adjust and regulation + :param use_seed: True to use a fixed seed, false else. + :return: The updated Df (column p-value, p-adjust and regulation) + \ + the list of common gene shared between gene community at DNA and \ + protein level obtained by randmization analysis >>> do = pd.DataFrame({"DNA_community": ["C1", "C2", "C3"], ... "community_size": [2, 3, 2], "nb_com-ppi": [2, 3, 1], ... "size_com-ppi": [10, 10, 10]}) >>> dna_gene = {"C1": [1, 2], "C2": [3, 4, 5], "C3": [6, 7]} >>> ppi_g = list(range(11)) - >>> update_overlap_df(do, dna_gene, ppi_g, 100).iloc[:, - ... [0, 1, 2, 4, 5]] + >>> d, di = update_overlap_df(do, dna_gene, ppi_g, 100) + >>> d.iloc[:, [0, 1, 2, 4, 5]] DNA_community community_size nb_com-ppi p-value p-adjust 0 C1 2 2 0.79 0.79 1 C2 3 3 0.72 0.79 2 C3 2 1 0.18 0.54 - >>> d = update_overlap_df(do, dna_gene, list(range(1000)), 100) + >>> d.iloc[:, 6].to_list() == d.iloc[:, 7].to_list() == [" . "] * 3 + True + >>> d, di = update_overlap_df(do, dna_gene, list(range(1000)), 100) >>> d.iloc[:, [0, 1, 2, 4, 5]] DNA_community community_size nb_com-ppi p-value p-adjust 0 C1 2 2 0.01 0.01 1 C2 3 3 0.00 0.00 2 C3 2 1 0.00 0.00 - >>> d["regulation"].to_list() == [" + "] * 3 + >>> d.iloc[:, 6].to_list() == d.iloc[:, 7].to_list() == [" + "] * 3 + True + >>> len(di.keys()) == 3 + True + >>> len(di[list(di.keys())[0]]) == 100 True """ pval_list = [] reg_list = [] + dic = {} for i in tqdm(range(df_overlap.shape[0])): - values = get_list_common_gene(df_overlap.iloc[i, :]["DNA_community"], + community = df_overlap.iloc[i, :]["DNA_community"] + values = get_list_common_gene(community, dic_dna_gene, df_overlap.iloc[i, :]["size_com-ppi"], ppi_gene, - iteration) + iteration, use_seed) + dic[community] = values pval, reg = get_pvalue(values, df_overlap.iloc[i, :]["nb_com-ppi"], iteration) pval_list.append(pval) @@ -233,7 +260,187 @@ def update_overlap_df(df_overlap: pd.DataFrame, is_sorted=False, returnsorted=False)[1] df_overlap["regulation"] = reg_list - return df_overlap + df_overlap["regulation-adjusted"] = \ + df_overlap.apply(adjust_regulation, axis=1) + return df_overlap, dic + + +def summarize_df_overlap(df_overlap: pd.DataFrame) -> pd.DataFrame: + """ + Create a summary of the dataframe `df_overlap` that indicates the number \ + of common gene shared between communities at protein and DNA level. + + :param df_overlap: The dataframe containing community at DNA and \ + protein level and their enrichment + :return: The dataframe showing the number of enrichment, impoverishment \ + or no regulation + + >>> do = pd.DataFrame({"DNA_community": ["C1", "C2", "C3"], + ... "community_size": [2, 3, 2], "nb_com-ppi": [2, 3, 1], + ... "size_com-ppi": [10, 10, 10], "p-value": [0.01, 0, 0], + ... "p-adjust": [0.01, 0.01, 0.01], "regulation": [" + "] * 3, + ... "regulation-adjusted": [" + "] * 3}) + >>> summarize_df_overlap(do) + regulation-adjusted community_size nb_com-ppi size_com-ppi number + 0 + 2.333333 2.0 10.0 3 + 1 - NaN NaN NaN 0 + 2 . NaN NaN NaN 0 + """ + df_overlap = df_overlap[["regulation-adjusted", "community_size", + "nb_com-ppi", "size_com-ppi", "p-adjust"]] + df_tmp = pd.DataFrame({"regulation-adjusted": [" + ", " - ", " . "], + "community_size": [np.nan] * 3, + "nb_com-ppi": [np.nan] * 3, + "size_com-ppi": [np.nan] * 3, + "p-adjust": [np.nan] * 3}) + df = pd.concat([df_overlap, df_tmp], axis=0, ignore_index=True) + df = df.groupby("regulation-adjusted").agg({"community_size": "mean", + "nb_com-ppi": "mean", + "size_com-ppi": "mean", + "p-adjust": "count"}) + return df.rename({"p-adjust": "number"}, axis=1).reset_index() + + +def get_enriched_impoverished_communities(df_overlap: pd.DataFrame, + dic_dna_gene: Dict[str, np.array], + ppi_gene: np.array, iteration: int, + dic_values: Dict[str, np.array], + use_seed: bool = True + ) -> Dict[str, int]: + """ + For each communities, randomly samples only once in the list \ + of `ppi_gene` to get the number of common gene betwwen this sample \ + and the community of gene at DNA level. Then this function determines \ + how many communites are enriched, impovershied thanks to `di_values`. + + :param df_overlap: A dataframe containing every gene community at dna \ + level and the number of common gene with a gne community at protein \ + level + :param dic_dna_gene: The dictionary containing gene interacting at dna \ + level + :param ppi_gene: All the gene interacting at protein level + :param iteration: The number of time we want to repeat the process + :param dic_values: A dictionary containing the number of common \ + gene between communities at DNA and protein level for each \ + subsampling made from the ppi_gene list. + :param use_seed: True to use a fixed seed, false else. + :return: The number of enriched, impoverished number of \ + genes shared betwwen gene at DNA and protein level. + + >>> do = pd.DataFrame({"DNA_community": ["C1", "C2", "C3"], + ... "community_size": [2, 3, 2], "nb_com-ppi": [2, 3, 1], + ... "size_com-ppi": [10, 10, 10], "p-value": [0.01, 0, 0], + ... "p-adjust": [0.01, 0.01, 0.01], "regulation": [" + "] * 3, + ... "regulation-adjusted": [" + "] * 3}) + >>> dna_gene = {"C1": [1, 2], "C2": [3, 4, 5], "C3": [6, 7]} + >>> ppi_g = list(range(11)) + >>> dv = {"C1": [0] * 95 + [1, 1, 1, 2, 2], + ... "C2": [0] * 95 + [1, 1, 1, 2, 2], + ... "C3": [0] * 75 + [1] * 10 + [2] * 10 + [3] * 5} + >>> get_enriched_impoverished_communities(do, dna_gene, ppi_g, 100, dv) + {' + ': 2, ' . ': 1, ' - ': 0} + >>> dv = {"C1": [0] * 95 + [1, 1, 1, 2, 2], + ... "C2": [0] * 95 + [1, 1, 1, 2, 2], + ... "C3": [3] * 75 + [4] * 23 + [1] * 2} + >>> get_enriched_impoverished_communities(do, dna_gene, ppi_g, 100, dv) + {' + ': 2, ' . ': 0, ' - ': 1} + """ + pval_list = [] + reg_list = [] + for i in range(df_overlap.shape[0]): + community = df_overlap.iloc[i, :]["DNA_community"] + my_value = get_list_common_gene(community, + dic_dna_gene, + df_overlap.iloc[i, :]["size_com-ppi"], + ppi_gene, + 1, use_seed)[0] + pval, reg = get_pvalue(dic_values[community], my_value, + iteration) + pval_list.append(pval) + reg_list.append(reg) + p_adjust = multipletests(pval_list, alpha=0.05, method='fdr_bh', + is_sorted=False, returnsorted=False)[1] + dic = {" + ": 0, " . ": 0, " - ": 0} + for i, reg in enumerate(reg_list): + dic[reg if p_adjust[i] <= 0.05 else " . "] += 1 + return dic + + +def summary_randomisation_test(df_overlap: pd.DataFrame, + dic_dna_gene: Dict[str, np.array], + ppi_gene: np.array, iteration: int, + dic_values: Dict[str, np.array], + use_seed: bool = True + ) -> pd.DataFrame: + """ + Says if the number of community that shared an enriched number \ + of common gene between the protein and DNA level is enriched as well. + + :param df_overlap: A dataframe containing every gene community at dna \ + level and the number of common gene with a gne community at protein \ + level + :param dic_dna_gene: The dictionary containing gene interacting at dna \ + level + :param ppi_gene: All the gene interacting at protein level + :param iteration: The number of time we want to repeat the process + :param dic_values: A dictionary containing the number of common \ + gene between communities at DNA and protein level for each \ + subsampling made from the ppi_gene list. + :param use_seed: True to use a fixed seed, false else. + :return: The updated summary table + + >>> do = pd.DataFrame({"DNA_community": ["C1", "C2", "C3"], + ... "community_size": [2, 3, 2], "nb_com-ppi": [2, 3, 1], + ... "size_com-ppi": [10, 10, 10], "p-value": [0.01, 0, 0], + ... "p-adjust": [0.01, 0.01, 0.01], "regulation": [" + "] * 3, + ... "regulation-adjusted": [" + "] * 3}) + >>> dna_gene = {"C1": [1, 2], "C2": [3, 4, 5], "C3": [6, 7]} + >>> ppi_g = list(range(11)) + >>> dv = {"C1": [0] * 95 + [1, 1, 1, 2, 2], + ... "C2": [0] * 95 + [1, 1, 1, 2, 2], + ... "C3": [0] * 75 + [1] * 10 + [2] * 10 + [3] * 5} + >>> ds = summary_randomisation_test(do, dna_gene, ppi_g, 100, dv) + >>> ds.iloc[:, range(4)] + regulation-adjusted community_size nb_com-ppi size_com-ppi + 0 + 2.333333 2.0 10.0 + 1 - NaN NaN NaN + 2 . NaN NaN NaN + >>> ds.iloc[:, range(4, 7)] + number mean_100_number p-adjust + 0 3 2.0 0.0 + 1 0 0.0 1.0 + 2 0 1.0 0.0 + >>> ds.iloc[:, 7].to_list() == [" + ", " . ", " - "] + True + """ + df_summarized = summarize_df_overlap(df_overlap) + dic_rand_sum = {k: [] for k in df_summarized["regulation-adjusted"].values} + for _ in tqdm(range(iteration), desc="Making stats to summarise global " + "enrichment"): + d = get_enriched_impoverished_communities(df_overlap, dic_dna_gene, + ppi_gene, iteration, + dic_values, use_seed) + for k, v in dic_rand_sum.items(): + v.append(d[k]) + pvalues = [] + regulations = [] + mean_number = [] + for i in range(df_summarized.shape[0]): + reg, val = df_summarized.iloc[i, :][["regulation-adjusted", "number"]] + pval, creg = get_pvalue(dic_rand_sum[reg], val, + iteration) + # print(reg, val, dic_rand_sum[reg]) + pvalues.append(pval) + regulations.append(creg) + mean_number.append(np.mean(dic_rand_sum[reg])) + p_adjust = multipletests(pvalues, alpha=0.05, method='fdr_bh', + is_sorted=False, returnsorted=False)[1] + reg_adj = [regulations[i] if p_adjust[i] <= 0.05 else " . " + for i in range(len(regulations))] + df_summarized[f"mean_{iteration}_number"] = mean_number + df_summarized[f"p-adjust"] = p_adjust + df_summarized[f"reg_adj"] = reg_adj + return df_summarized if __name__ == "__main__": -- GitLab