diff --git a/src/find_interaction_cluster/create_ppi_files.py b/src/find_interaction_cluster/create_ppi_files.py new file mode 100644 index 0000000000000000000000000000000000000000..6be04f38e3463ad839134681b83af7b17522bcfc --- /dev/null +++ b/src/find_interaction_cluster/create_ppi_files.py @@ -0,0 +1,367 @@ +#!/usr/bin/env python3 + +# -*- coding: UTF-8 -*- + +""" +Description: create a file of gene interaction at gene and protein levels. +""" + +from .config import ConfigGraph +from .ppi_scripts.config_ppi import ConfigPPI +from typing import List +from ..logging_conf import logging_def +import numpy as np +import logging +from .community_finder import get_projects_name +from itertools import product +import multiprocessing as mp +from pathlib import Path +from typing import Dict, Union +import pandas as pd +from .community_finder import create_graph, find_communities, \ + write_cytoscape_graph +from doctest import testmod +from .radomization_test_ppi import get_ppi_community_gene, \ + get_dna_community_gene, update_overlap_df + + +def get_community_dic_fom_community_file(mfile: Union[Path, pd.DataFrame], + community_data: bool = True) -> Dict: + """ + Read a file containing community of genes and return a dictionary \ + linking each gene to other gene in it's community. + + :param mfile: A file containing community of genes + :return: A dictionary linking a gene to all the other in it's community + """ + df = pd.read_csv(mfile, sep="\t") if isinstance(mfile, Path) else mfile + dic = {} + for i in range(df.shape[0]): + cseries = df.iloc[i, :] + list_gene = list(map(int, cseries['genes'].split(", "))) + if len(list_gene) > 1: + for gene in list_gene: + if community_data: + dic[gene] = {"links": [g for g in list_gene if g != gene], + "community": cseries['community']} + else: + dic[gene] = [g for g in list_gene if g != gene] + return dic + + +def ppi_array(fasterdb_ppi: Path, threshold: int) -> np.array: + """ + Return an array of gene having their proteins interaction with each \ + other + :param fasterdb_ppi: A file containing ppi interaction + :param threshold: The minimum theshold needed to consider the interaction + :return: An array containing the interaction + """ + df = pd.read_csv(fasterdb_ppi, sep="\t") + df['gene1'] = df['gene1'].astype(str) + df['gene2'] = df['gene2'].astype(str) + return df.loc[df['combined_score'] >= threshold, + ['gene1', 'gene2', 'combined_score']]\ + .drop_duplicates(keep="first")\ + .values + + +def write_interaction_ppi(arr_interaction: np.array, project: str, + weight: int, global_weight: int, same_gene: bool, + use_weight: bool = False): + """ + + :param arr_interaction: Each couples of co-localized feature within a \ + project. + :param project: The name of the project of interest + :param weight: The minimum weight of interaction to consider + :param global_weight: The global weight to consider. if \ + the global weight is equal to 0 then then density figure are calculated \ + by project, else all projet are merge together and the interaction \ + seen in `global_weight` project are taken into account + :param same_gene: Say if we consider as co-localised, exons within the \ + same gene (True) or not (False) (default False) + :param use_weight: Say if we want to write the weight into the result file. + """ + logging.debug('Writing interaction files ...') + outfile = ConfigGraph.get_community_file(project, weight, global_weight, + same_gene, "gene", + f"_interation_PPI_tmp.txt", + "community_gene_vs_protein") + result = ConfigGraph.get_community_file(project, weight, global_weight, + same_gene, "gene", + f"_communities_PPI_tmp.txt", + "community_gene_vs_protein") + with outfile.open('w') as f: + for exon1, exon2, cweight in arr_interaction: + if not use_weight: + cweight = 1 + f.write(f"{exon1}\t{exon2}\t{cweight}\n") + return outfile, result + + +def ppi_community_finder(fasterdb_ppi: Path, project: str, + weight: int, global_weight: int, + same_gene: bool = True, + threshold: int = 700): + """ + Find communities inside protein-protein interaction file + + :param fasterdb_ppi: File containing ppi interaction + :param project: The name of the project of interest + :param weight: The minimum weight of interaction to consider + :param global_weight: The global weight to consider. if \ + the global weight is equal to 0 then then density figure are calculated \ + by project, else all projet are merge together and the interaction \ + seen in `global_weight` project are taken into account + :param same_gene: Say if we consider as co-localised, exons within the \ + same gene (True) or not (False) (default False) + :param threshold: The minimum threshold needed to consider the interaction + """ + interaction = ppi_array(fasterdb_ppi, threshold) + outfile, result_file = write_interaction_ppi(interaction, project, + weight, global_weight, + same_gene) + graph = create_graph(interaction) + df, dic_community = find_communities(graph, project, outfile, result_file, + "gene") + logging.debug('Writing results ...') + outfiles = [ConfigGraph.get_community_file( + project, weight, global_weight, same_gene, "gene", ext, + "community_gene_vs_protein") + for ext in [f'_graph_community_PPI.txt', f'_graph_community_PPI.cyjs']] + df.to_csv(outfiles[0], sep="\t", index=False) + logging.debug("Saving the graph ...") + write_cytoscape_graph(graph, dic_community, outfiles[1]) + logging.debug('Done !') + return df + + +def create_dataframe(gene_com_dic: Dict) -> pd.DataFrame: + """ + Create a dataframe from a dictionary containing a community of gene. + + :param gene_com_dic: A dictionary linking each gene, to it's neighbors \ + at DNA level + :return: A table containing the gene, the community name of it's gene \ + and the community size of this gene + """ + content = [[gene, gene_com_dic[gene]['community'], + len(gene_com_dic[gene]['links'])] + for gene in gene_com_dic] + return pd.DataFrame(content, columns=["id_gene", "DNA_community", + "community_size"]) + + +def add_size_and_common_gene_number(df: pd.DataFrame, dic_gene: Dict, + dic_ppi: Dict, nb_col: str, + size_col: str) -> pd.DataFrame: + """ + Add a column containing the number of shared interaction between genes \ + and their product. + + :param df: A dataframe containing + :param dic_gene: A dictionary containing the gene interacting at \ + DNA level + :param dic_ppi: A dictionary contaning the interaction between protein \ + of genes. + :param nb_col: The name of the column that will stored the shared \ + interactions between genes at protein and DNA level + :param size_col: The size of the community + :return: The dataframe with two more columns. + """ + nb_content_col = [] + size_content_col = [] + for i in range(df.shape[0]): + cgene = df.iloc[i, :]["id_gene"] + if cgene in dic_ppi: + common = [g for g in dic_ppi[cgene] + if g in dic_gene[cgene]['links']] + size_content_col.append(len(dic_ppi[cgene])) + nb_content_col.append(len(common)) + else: + size_content_col.append(0) + nb_content_col.append(0) + df[nb_col] = nb_content_col + df[size_col] = size_content_col + return df + + +def filter_most_overllaping_ppi(df: pd.DataFrame) -> pd.DataFrame: + """ + Get the DNA community and the ppi community have the larger overlap \ + possible. + + :param df: The dataframe containing the number of \ + genes that clusters in community at DNA and protein level + :return: The dataframe with only one line per DNA community + + >>> test_df = pd.DataFrame({"id_gene": range(1, 12), + ... "DNA_community": ["C1"] * 5 + ["C2"] * 5 + ["C3"], + ... "community_size": [10] * 5 + [15] * 5 + [20], + ... "nb_com-ppi": [0, 1, 2, 2, 0, 0, 1, 0, 0, 0, 0], + ... "size_com-ppi": [30, 50, 105, 102, 25, 30, 42, 47, 89, 12, 0]}) + >>> filter_most_overllaping_ppi(test_df) + DNA_community community_size nb_com-ppi size_com-ppi + 0 C1 10 3 105 + 1 C2 15 2 42 + 2 C3 20 0 0 + """ + df.drop('id_gene', axis=1, inplace=True) + dna_communities = df["DNA_community"].unique() + list_df = [] + for my_community in dna_communities: + tmp_df = df.loc[(df['DNA_community'] == my_community), :] + val = np.max(tmp_df['nb_com-ppi'].values) + list_df.append(tmp_df.loc[tmp_df['nb_com-ppi'] == val, :] + .sample(1, random_state=1)) + df = pd.concat(list_df, axis=0, ignore_index=True) + df['nb_com-ppi'] = df.apply(lambda x: x['nb_com-ppi'] + 1 + if x["size_com-ppi"] > 0 + else x['nb_com-ppi'], axis=1) + return df + + +def create_community_ppi_table(community_file: Path, fasterdb_ppi: Path, + project: str, + weight: int, global_weight: int, + same_gene: bool = True, + threshold: int = 700): + """ + + :param community_file: A file containing community of gene interacting \ + at DNA level + :param fasterdb_ppi: A file containing pairs of genes having their \ + protein product interacting at protein level. + :param project: The name of the project of interest + :param weight: The weight of interaction to consider + :param global_weight: The global weight to consider. if \ + the global weight is equal to 0 then then density figure are calculated \ + by project, else all project are merge together and the interaction \ + seen in `global_weight` project are taken into account + :param same_gene: Say if we consider as co-localised exon within the \ + same gene + :param threshold: The minimum threshold needed to consider the interaction + :return: + """ + outfile = ConfigGraph.get_community_file(project, weight, global_weight, + same_gene, "gene", + '_graph_community_PPI.txt', + "community_gene_vs_protein") + if not outfile.is_file(): + logging.debug("Creating the community file for PPI") + df_comm_ppi = ppi_community_finder(fasterdb_ppi, project, weight, + global_weight, same_gene, threshold) + else: + df_comm_ppi = pd.read_csv(outfile, sep="\t") + logging.debug("Turning ppi community file into a dic") + dic_ppi_com = get_community_dic_fom_community_file(df_comm_ppi, + community_data=False) + logging.debug("Loading gene community file") + dic_gene = get_community_dic_fom_community_file(community_file, + community_data=True) + logging.debug("Creation of the gene community ad DNA level") + df = create_dataframe(dic_gene) + logging.debug("Add a column for proteins community") + df = add_size_and_common_gene_number(df, dic_gene, dic_ppi_com, + "nb_com-ppi", "size_com-ppi") + return df + + +def ppi_stats_analysis(community_file: Path, fasterdb_ppi: Path, + project: str, + weight: int, global_weight: int, + same_gene: bool = True, + threshold: int = 700, iteration: int = 1000): + """ + :param community_file: A file containing community of gene interacting \ + at DNA level + :param fasterdb_ppi: A file containing pairs of genes having their \ + protein product interacting at protein level. + :param project: The name of the project of interest + :param weight: The weight of interaction to consider + :param global_weight: The global weight to consider. if \ + the global weight is equal to 0 then then density figure are calculated \ + by project, else all projcet are merge together and the interaction \ + seen in `global_weight` project are taken into account + :param same_gene: Say if we consider as co-localised exon within the \ + same gene + :param threshold: The minimum threshold needed to consider the interaction + :param iteration: The number of iteration to make + :return: + """ + outfile = ConfigGraph.get_community_file(project, weight, + global_weight, same_gene, "gene", + f"ppi_gene_complete_table.txt", + "community_gene_vs_protein") + ppi_outfile = ConfigGraph.get_community_file(project, weight, + global_weight, + same_gene, "gene", + '_graph_community_PPI.txt', + "community_gene_vs_protein") + if not outfile.is_file(): + df = create_community_ppi_table(community_file, fasterdb_ppi, + project, weight, global_weight, + same_gene, threshold) + df.to_csv(outfile, sep="\t", index=False) + else: + df = pd.read_csv(outfile, sep="\t") + df_overlap = filter_most_overllaping_ppi(df) + ppi_gene = get_ppi_community_gene(pd.read_csv(ppi_outfile, sep="\t")) + dic_dna_gene = get_dna_community_gene(pd.read_csv(community_file, + sep="\t")) + df = update_overlap_df(df_overlap, dic_dna_gene, ppi_gene, iteration) + outstat = ConfigGraph.get_community_file(project, weight, global_weight, + same_gene, "gene", + f"ppi_gene_table_{iteration}_" + f"stat.txt", + "community_gene_vs_protein") + df.to_csv(outstat, sep="\t", index=False) + + +def ppi_stat_launcher(ps: int, weights: List[int], + global_weights: List[int], + same_gene: bool, threshold: int = 700, + iteration: int = 1000, + logging_level: str = "DISABLE"): + """ + Launch the statistical allowing to determine if interaction between \ + genes at dna level as an influence on the interactions at protein level. + + :param ps: The number of processes we want to use. + :param weights: The list of weights of interaction to consider + :param global_weights: The list global weights to consider. if \ + the global weight is equal to 0 then then density figure are calculated \ + by project, else all projcet are merge together and the interaction \ + seen in `global_weight` project are taken into account + :param same_gene: Say if we consider as co-localised exon within the \ + same gene + :param threshold: The minimum threshold needed to consider the interaction + :param iteration: The number of iteration to make + :param logging_level: Level of information to display + """ + ConfigGraph.community_folder.mkdir(exist_ok=True, parents=True) + logging_def(ConfigGraph.community_folder, __file__, logging_level) + logging.info("Checking if gene communities at DNA level have an " + "influence on communities at protein level") + global_weights = list(np.unique(global_weights)) + weights = list(np.unique(weights)) + projects, dic_project = get_projects_name(global_weights) + condition = list(product(projects, weights)) + pool = mp.Pool(processes=min(ps, len(condition))) + logging.debug("Calculating stats...") + processes = [] + for project, weight in condition: + global_weight = dic_project[project] + community_file = ConfigGraph.get_community_file(project, weight, + global_weight, + same_gene, "gene", + f".txt") + args = [community_file, ConfigPPI.fasterdb_ppi, project, + weight, global_weight, same_gene, threshold, iteration] + processes.append(pool.apply_async(ppi_stats_analysis, args)) + results = [p.get(timeout=None) for p in processes] + + +if __name__ == "__main__": + testmod() \ No newline at end of file