diff --git a/src/find_interaction_cluster/colocalisation_n_ppi_analysis.py b/src/find_interaction_cluster/colocalisation_n_ppi_analysis.py new file mode 100644 index 0000000000000000000000000000000000000000..51034046dc2b59154613a5b75b24a6dc0891db86 --- /dev/null +++ b/src/find_interaction_cluster/colocalisation_n_ppi_analysis.py @@ -0,0 +1,353 @@ +#!/usr/bin/env python3 + +# -*- coding: UTF-8 -*- + +""" +Description: The goal of this script is to check whether close \ +gene produce protein that interact between each other +""" + +from .config import ConfigGraph +from ..logging_conf import logging_def +import logging +from .community_finder import get_projects +from pathlib import Path +import pandas as pd +from itertools import combinations, product +from typing import List, Tuple +from doctest import testmod +import numpy as np +from .ppi_scripts.config_ppi import ConfigPPI +import seaborn as sns +from scipy.stats import mannwhitneyu +from ..fig_utils.stat_annot import add_stat_annotation + + +def get_gene_list(gene_list: str) -> List[int]: + """ + From a string containing a list of id of genes separated by a comma, \ + return a list of those id. + + :param gene_list: A list of gene id separated by a comma + :return: The list of gene id + + >>> get_gene_list("1, 2, 3") + [1, 2, 3] + """ + id_gene_list = list(map(int, gene_list.split(', '))) + return sorted(id_gene_list) + + +def get_couples_in_community(row: pd.Series) -> pd.DataFrame: + """ + From a line of a dataframe containing a community, return each \ + couples of gene in that community. + + :param row: line of a dataframe containing a community + :return: A dataframe containing couples of gene in that community + + >>> data = pd.Series({"community": "C1", + ... "nodes": 3, + ... "genes": "1, 2, 3"}) + >>> get_couples_in_community(data) + gene1 gene2 community_size community + 0 1 2 3 C1 + 1 1 3 3 C1 + 2 2 3 3 C1 + """ + couples = combinations(get_gene_list(row['genes']), 2) + dic = {"gene1": [], "gene2": []} + for gene1, gene2 in couples: + dic["gene1"].append(gene1) + dic["gene2"].append(gene2) + dic['community_size'] = [row['nodes']] * len(dic["gene1"]) + dic['community'] = [row['community']] * len(dic["gene1"]) + return pd.DataFrame(dic) + + +def edge_community_table(df_com: pd.DataFrame) -> pd.DataFrame: + """ + From a dataframe containing the communities of co-localised genes in the \ + nucleus, create a table showing each couple of gene in each community. + + :param df_com: dataframe containing the communities of co-localised \ + genes in the nucleus + :return: table showing each couple of gene in each community. + + >>> data = pd.DataFrame({"community": ["C1", "C2"], + ... "nodes": [3, 2], + ... "genes": ["1, 2, 3", "4, 5"]}) + >>> edge_community_table(data) + gene1 gene2 community_size community + 0 1 2 3 C1 + 1 1 3 3 C1 + 2 2 3 3 C1 + 3 4 5 2 C2 + """ + list_df = [get_couples_in_community(df_com.iloc[i, :]) + for i in range(df_com.shape[0])] + return pd.concat(list_df, axis=0, ignore_index=True) + + +def ppi_good_order(line: str) -> List[int]: + """ + From a line of a file coming containing each couple of genes whose \ + proteins interact with each other return the line with the proper order. + + :param line: a line of a file coming containing each couple of genes \ + whose proteins interact with each other + :return: The lowest gene id, the highest gene id and their \ + interaction score + + >>> ppi_good_order("ENSP00000000233\\tENSP00000272298\\t490\\t" + + ... "5764\\t3926\\n") + [3926, 5764, 490] + """ + cline = line.strip().split("\t") + return sorted([int(cline[3]), int(cline[4])]) + [int(cline[2])] + + +def load_fasterdb_ppi(fasterdb_ppi: Path) -> pd.DataFrame: + """ + Return a file containing each couple of genes whose proteins \ + interact with each other. + + :param fasterdb_ppi: A file containing couples of genes whose proteins \ + interact with each other + :return: The dataframe of the interaction between genes \ + at protein level and their score. + + >>> from tempfile import NamedTemporaryFile + >>> f = NamedTemporaryFile(mode='w') + >>> _ = f.write("ENSP00000000233\\tENSP00000272298\\t490\\t" + + ... "5764\\t3926\\n" + "ENSP00000000233\\tENSP00000253401\\t198\\t" + + ... "5764\\t3273\\n") + >>> f.flush() + >>> mfile = Path(f.name) + >>> load_fasterdb_ppi(mfile) + gene1 gene2 string_score + 0 3926 5764 490 + 1 3273 5764 198 + """ + with fasterdb_ppi.open("r") as ppi_file: + content = [ + ppi_good_order(line) + for line in ppi_file + if not line.startswith("protein1") + ] + return pd.DataFrame(content, columns=['gene1', 'gene2', 'string_score']) + + +def merge_community_and_score(df_cpl: pd.DataFrame, ppi_score: pd.DataFrame + ) -> pd.DataFrame: + """ + :param df_cpl: table showing each couple of gene in each community \ + at gene level + :param ppi_score: The dataframe of the interaction between genes \ + at protein level and their score. + :return: df_com with string score + + >>> d1 = pd.DataFrame({"gene1": [1, 1, 2, 4], "gene2": [2, 3, 3, 5], + ... "community_size": [3, 3, 3, 2], "community": ["C1", "C1", "C1", "C2"]}) + >>> d2 = pd.DataFrame({"gene1": [1, 1, 2, 9], "gene2": [2, 3, 3, 8], + ... "string_score": [10, 20, 30, 40]}) + >>> merge_community_and_score(d1, d2) + gene1 gene2 community_size community string_score + 0 1 2 3 C1 10.0 + 1 1 3 3 C1 20.0 + 2 2 3 3 C1 30.0 + 3 4 5 2 C2 NaN + """ + return df_cpl.merge(ppi_score, how="left", on=["gene1", "gene2"]) + + +def get_gene_in_community(df_comp: pd.DataFrame) -> List[int]: + """ + Get all the couples of exons inter-community + + :param df_comp: table showing each couple of gene in each community \ + at gene level + :return: The list of gene inside community + + >>> data = pd.DataFrame({"community": ["C1", "C2"], + ... "nodes": [3, 2], + ... "genes": ["1, 2, 3", "4, 5"]}) + >>> get_gene_in_community(data) + [1, 2, 3, 4, 5] + """ + gene_list = [] + for i in range(df_comp.shape[0]): + gene_list += get_gene_list(df_comp.iloc[i, :]["genes"]) + return gene_list + + +def get_couple_of_ctrl_exon(community_gene: str, all_genes: List[int] + ) -> List[Tuple[int, int]]: + """ + Get every control couple of genes, (couples of genes inter-community). + + :param community_gene: A string containing the genes of interest. + :param all_genes: Every genes in every community at dna level + :return: Every inter-community couple + + >>> get_couple_of_ctrl_exon('1, 2', [1, 2, 3, 4, 5]) + [(1, 3), (1, 4), (1, 5), (2, 3), (2, 4), (2, 5)] + """ + cgenes = get_gene_list(community_gene) + all_genes = [g for g in all_genes if g not in cgenes] + values = list(product(cgenes, all_genes)) + return [v for v in values if v[0] < v[1]] + + +def get_very_ctrl_couple(df_comp: pd.DataFrame) -> pd.DataFrame: + """ + Get a dataframe containing all the control couple of genes + + :param df_comp: dataframe containing the communities of co-localised \ + genes in the nucleus + :return: A dataframe containing all the control couples of genes + + >>> data = pd.DataFrame({"community": ["C1", "C2"], + ... "nodes": [3, 2], + ... "genes": ["1, 2, 3", "4, 5"]}) + >>> get_very_ctrl_couple(data) + gene1 gene2 community_size community + 0 1 4 NaN C-CTRL + 1 1 5 NaN C-CTRL + 2 2 4 NaN C-CTRL + 3 2 5 NaN C-CTRL + 4 3 4 NaN C-CTRL + 5 3 5 NaN C-CTRL + """ + all_genes = get_gene_in_community(df_comp) + ctrl_couples = [] + for i in range(df_comp.shape[0]): + ctrl_couples += get_couple_of_ctrl_exon(df_comp.iloc[i, :]["genes"], + all_genes) + df = pd.DataFrame(ctrl_couples, columns=["gene1", "gene2"]) + df["community_size"] = [np.nan] * df.shape[0] + df["community"] = ["C-CTRL"] * df.shape[0] + return df + + +def create_scored_dataframe(df_comp: pd.DataFrame, fasterdb_ppi: Path + ) -> pd.DataFrame: + """ + Create a dataframe containing couple of exons \ + inside a community at dna level and the control couples between \ + communities and their string score. + + :param df_comp: dataframe containing the communities of co-localised \ + genes in the nucleus + :param fasterdb_ppi: A file containing couples of genes whose proteins \ + interact with each other + :return: a dataframe containing couple of exons \ + inside a community at dna level and the control couples between \ + communities and their string score. + + >>> from tempfile import NamedTemporaryFile + >>> f = NamedTemporaryFile(mode='w') + >>> _ = f.write("P1\\tP2\\t490\\t1\\t2\\n" + "P3\\tP4\\t198\\t1\\t3\\n" + ... "P3\\tP4\\t199\\t1\\t4\\n" + "P3\\tP4\\t200\\t1\\t5\\n" + ... "P3\\tP4\\t202\\t2\\t3\\n" + "P3\\tP4\\t208\\t2\\t4\\n" + ... "P3\\tP4\\t398\\t2\\t5\\n" + "P3\\tP4\\t298\\t3\\t4\\n" + ... "P3\\tP4\\t498\\t5\\t3\\n") + >>> f.flush() + >>> mfile = Path(f.name) + >>> data = pd.DataFrame({"community": ["C1", "C2"], + ... "nodes": [3, 2], + ... "genes": ["1, 2, 3", "4, 5"]}) + >>> create_scored_dataframe(data, mfile) + gene1 gene2 community_size community string_score group + 0 1 2 3.0 C1 490.0 community + 1 1 3 3.0 C1 198.0 community + 2 2 3 3.0 C1 202.0 community + 3 4 5 2.0 C2 NaN community + 4 1 4 NaN C-CTRL 199.0 ctrl + 5 1 5 NaN C-CTRL 200.0 ctrl + 6 2 4 NaN C-CTRL 208.0 ctrl + 7 2 5 NaN C-CTRL 398.0 ctrl + 8 3 4 NaN C-CTRL 298.0 ctrl + 9 3 5 NaN C-CTRL 498.0 ctrl + """ + df_ctrl = get_very_ctrl_couple(df_comp) + df_real = edge_community_table(df_comp) + full_df = pd.concat([df_real, df_ctrl], axis=0, ignore_index=True) + df_score = load_fasterdb_ppi(fasterdb_ppi) + full_df = merge_community_and_score(full_df, df_score) + full_df['group'] = ["community"] * full_df.shape[0] + full_df.loc[full_df['community'] == "C-CTRL", "group"] = "ctrl" + return full_df + + +def create_figure(df_full: pd.DataFrame, outfile: Path) -> None: + """ + Create a barplot of string confidence score. + + :param df_full: A dataframe showing the interactions between \ + genes in same community. + + :param outfile: The file ythat will contains the barplot + """ + sns.set(context="talk") + g = sns.catplot(x="group", y="string_score", data=df_full, kind="bar", + ci="sd", aspect=1.7, height=12) + _, p = mannwhitneyu(df_full.loc[df_full['group'] == 'community', + 'string_score'].values, + df_full.loc[df_full['group'] == 'community', + 'string_score'].values) + add_stat_annotation(g.ax, data=df_full, x="group", y="string_score", + box_pair_list=[("community", "ctrl", p)]) + g.savefig(outfile) + + +def coloc_ppi_stat_main(weight: int, global_weight: int, + project: str, same_gene: bool, iteration: int = 1000, + logging_level: str = "DISABLE"): + """ + Launch the statistical tests allowing to determine if interaction between \ + genes at dna level as an influence on the interactions at protein level. + + :param weight: The weight of interaction to consider + :param global_weight: The global weighs to consider. if \ + the global weight is equal to 0 then the project `project` is \ + used. + :param project: The project name, used only if global_weight = 0 + :param same_gene: Say if we consider as co-localised exon within the \ + same gene + :param threshold: The minimum threshold needed to consider the interaction + :param iteration: The number of iteration to make + :param logging_level: Level of information to display + """ + ConfigGraph.community_folder.mkdir(exist_ok=True, parents=True) + logging_def(ConfigGraph.community_folder, __file__, logging_level) + logging.info("Checking if gene communities at DNA level have an " + "influence on communities at protein level") + project = get_projects(global_weight, project) + logging.debug("Calculating stats...") + community_file = ConfigGraph.get_community_file(project, weight, + global_weight, + same_gene, "gene", + f".txt") + + df_com = pd.read_csv(community_file, sep="\t") + df_com = df_com[df_com['nodes'] >= 10].copy() + full_df = create_scored_dataframe(df_com, ConfigPPI.fasterdb_ppi) + outfile = ConfigGraph.get_community_file(project, weight, global_weight, + same_gene, "gene", + f"_interation_gene-protein.pdf", + "community_gene_vs_protein") + create_figure(full_df, outfile) + logging.info(f"{full_df.shape[0]} total couples (with ctrl)") + logging.info(f"{full_df[-np.isnan(full_df['string_score'])].shape[0]} " + f"couples with a defined string score") + tmp = full_df[full_df['group'] == 'community'].shape[0] + logging.info(f"{tmp} tests couples") + tmp = full_df.loc[(-np.isnan(full_df['string_score'])) & + (full_df['group'] == 'community'), :].shape[0] + logging.info(f"{tmp} tests couples with a defined string score") + + + +if __name__ == "__main__": + testmod() \ No newline at end of file