diff --git a/src/find_interaction_cluster/ppi_scripts/__init__.py b/src/find_interaction_cluster/ppi_scripts/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..60f3a118d41cf7494b1cd067bf5d649b42ba1e05
--- /dev/null
+++ b/src/find_interaction_cluster/ppi_scripts/__init__.py
@@ -0,0 +1,7 @@
+#!/usr/bin/env python3
+
+# -*- coding: UTF-8 -*-
+
+"""
+Description: Scripts building gene-level protein-protein interaction tables.
+"""
diff --git a/src/find_interaction_cluster/ppi_scripts/__main__.py b/src/find_interaction_cluster/ppi_scripts/__main__.py
new file mode 100644
index 0000000000000000000000000000000000000000..f2f10e03ded5cbaf1644ba3a5f4883f1c0189483
--- /dev/null
+++ b/src/find_interaction_cluster/ppi_scripts/__main__.py
@@ -0,0 +1,25 @@
+#!/usr/bin/env python3
+
+# -*- coding: UTF-8 -*-
+
+"""
+Description: Entry point running every step of the ppi_scripts submodule.
+"""
+
+from .get_ensembl_hg19_gene_ccoordinates import create_protein_gene_file
+from .create_gene_interaction_file import get_protein_interaction_table
+
+
+def launcher_ppi():
+    """
+    Launch every script of this submodule.
+
+    The protein/gene file is created first because \
+    get_protein_interaction_table reads it.
+    """
+    create_protein_gene_file()
+    get_protein_interaction_table()
+
+
+if __name__ == "__main__":
+    launcher_ppi()
diff --git a/src/find_interaction_cluster/ppi_scripts/config_ppi.py b/src/find_interaction_cluster/ppi_scripts/config_ppi.py
new file mode 100644
index 0000000000000000000000000000000000000000..312baea9d862909fa409ab7aed53984c2e50e702
--- /dev/null
+++ b/src/find_interaction_cluster/ppi_scripts/config_ppi.py
@@ -0,0 +1,31 @@
+#!/usr/bin/env python3
+
+# -*- coding: UTF-8 -*-
+
+"""
+Description: This script contains all the variables needed in this \
+submodule.
+"""
+
+from ..config import ConfigGraph
+
+
+class ConfigPPI:
+    """
+    Contains all the paths used in this submodule.
+    """
+    # Folder receiving every file written by this submodule.
+    output_folder = ConfigGraph.output_folder / "ppi_analysis"
+    data = ConfigGraph.data
+    # Input files, all located in the ppi_data folder.
+    bed_ensembl = data / "ppi_data" / "liftover_hg38gene2hg19.bed"
+    ensembl_gene = data / "ppi_data" / "mart_export_hg38_proteins_genes.txt"
+    ppi_string = data / "ppi_data" / "9606.protein.links.v11.0.txt.gz"
+    # Written by create_protein_gene_file and read back by
+    # get_protein_interaction_table.
+    protein_gene = output_folder / "protein_gene.txt"
+    # NOTE(review): not referenced by the scripts visible in this patch.
+    gene_ppi_file = output_folder / "ppi_gene_file.txt"
+    # Written by get_protein_interaction_table.
+    protein_gene_fasterdb = output_folder / "protein_gene_fasterdb.txt"
+    fasterdb_ppi = output_folder / "fasterdb_ppi.txt"
diff --git a/src/find_interaction_cluster/ppi_scripts/create_gene_interaction_file.py b/src/find_interaction_cluster/ppi_scripts/create_gene_interaction_file.py
new file mode 100644
index 0000000000000000000000000000000000000000..59221d693e4fa8d558d4b2018b94f8cc94964bfa
--- /dev/null
+++ b/src/find_interaction_cluster/ppi_scripts/create_gene_interaction_file.py
@@ -0,0 +1,195 @@
+#!/usr/bin/env python3
+
+# -*- coding: UTF-8 -*-
+
+"""
+Description: Create a file of genes having their proteins interacting with \
+each other
+"""
+
+from pathlib import Path
+from ..config import Config
+from .config_ppi import ConfigPPI
+import pandas as pd
+from typing import Dict, Optional, Tuple
+
+
+def load_string_ppi_file(ppi_file: Path) -> pd.DataFrame:
+    """
+    Load the string protein protein interaction file.
+
+    :param ppi_file: A space-separated file containing the protein \
+    protein interactions, with the columns protein1 and protein2.
+    :return: The loaded table, with "9606." removed from the protein \
+    identifiers.
+    """
+    df = pd.read_csv(ppi_file, sep=" ")
+    for column in ("protein1", "protein2"):
+        # Literal replacement: the dot of "9606." must not be read as a
+        # regex wildcard.
+        df[column] = df[column].str.replace("9606.", "", regex=False)
+    return df
+
+
+def load_protein_gene_links(prot_gene_file: Path) -> Tuple[pd.DataFrame, Dict]:
+    """
+    Load the file linking each gene to its hg19 coordinates and encoded \
+    proteins.
+
+    :param prot_gene_file: A file linking each gene to its hg19 \
+    coordinates and encoded proteins.
+    :return: The loaded file and a dictionary of synonyms. Each key is \
+    an upper-cased gene synonym (or the gene name itself when the gene \
+    has no synonym) and each value is the upper-cased gene name.
+    """
+    df = pd.read_csv(prot_gene_file, sep="\t",
+                     dtype={"Gene_stable_ID": str,
+                            "Protein_stable_ID": str,
+                            "Gene_name": str,
+                            "Gene_Synonym": str,
+                            "chr": str,
+                            "start": int,
+                            "stop": int,
+                            "strand": str})
+    dic_synonym = {}
+    # Walking the two columns directly avoids building one Series per row
+    # (df.iloc[i, :]), which is very slow on large tables.
+    for gene_name, synonym in zip(df["Gene_name"], df["Gene_Synonym"]):
+        if pd.isna(gene_name):
+            # No name to map to: skip the row instead of failing on it.
+            continue
+        gene_name = gene_name.upper()
+        # A missing synonym is read as NaN: the gene then maps to itself.
+        key = gene_name if pd.isna(synonym) else synonym.upper()
+        dic_synonym[key] = gene_name
+    return df, dic_synonym
+
+
+def load_fasterdb_gene(bed_gene: Path, dic_synonym: Dict) -> Dict:
+    """
+    Load a bed file containing all fasterDB gene.
+
+    :param bed_gene: A bed file containing all fasterdb gene
+    :param dic_synonym: A dictionary of synonyms used to replace \
+    the old fasterDB names by the new ones
+    :return: A dictionary linking each fasterDB gene name to the list of \
+    its records [chr, start, stop, fasterDB id, gene name, strand]
+    """
+    dic = {}
+    with bed_gene.open("r") as inbed:
+        for line in inbed:
+            line = line.rstrip("\n")
+            if not line:
+                # A blank line holds no gene and cannot be split in fields.
+                continue
+            fields = line.split("\t")
+            gene_name = fields[4].upper()
+            gene_name = dic_synonym.get(gene_name, gene_name)
+            # Several records can share the same name, hence the list.
+            dic.setdefault(gene_name, []).append(
+                [fields[0], int(fields[1]), int(fields[2]),
+                 int(fields[3]), gene_name, fields[5]])
+    return dic
+
+
+def find_fasterdb_id(mseries: pd.Series, dic: Dict) -> Optional[int]:
+    """
+    Find the fasterDB gene id from a line corresponding to an ensembl gene.
+
+    :param mseries: A line containing the name, the synonym and the \
+    coordinates of an ensembl gene
+    :param dic: A dictionary linking each gene name to its coordinates
+    :return: The fasterDB id of the ensembl gene, None if it is not found.
+    """
+    genes = dic.get(mseries['Gene_name'].upper())
+    if genes is None:
+        synonym = mseries['Gene_Synonym']
+        # A missing synonym is read as NaN (a float), not as a string.
+        if not isinstance(synonym, str):
+            return None
+        genes = dic.get(synonym.upper())
+        if genes is None:
+            return None
+    if len(genes) == 1:
+        # Only one candidate: it is returned without checking coordinates.
+        return genes[0][3]
+    # Several fasterDB genes share this name: keep the one having exactly
+    # the same location as the ensembl gene.
+    for gene in genes:
+        if (
+            mseries['chr'] == gene[0]
+            and mseries['start'] == gene[1]
+            and mseries['stop'] == gene[2]
+            and mseries['strand'] == gene[5]
+        ):
+            return gene[3]
+    return None
+
+
+def add_gene_id_column(prot_gene_df: pd.DataFrame, dic: Dict) -> pd.DataFrame:
+    """
+    Add the fasterDB gene id to ensembl gene then only return lines in \
+    the dataframe with a fasterDB gene id.
+
+    :param prot_gene_df: A table containing ensembl gene id. It is left \
+    untouched.
+    :param dic: A dictionary linking gene name to its fasterdb records.
+    :return: A copy of prot_gene_df with a column FasterDB_id and \
+    without the column Gene_Synonym
+    """
+    # Work on a copy so the caller's table does not gain a column.
+    df = prot_gene_df.copy()
+    # A plain list is also valid when the table has no row.
+    df['FasterDB_id'] = [find_fasterdb_id(row, dic)
+                         for _, row in df.iterrows()]
+    df = df.loc[df['FasterDB_id'].notna(), :].copy()
+    df['FasterDB_id'] = df['FasterDB_id'].astype(int)
+    return df.drop(columns=['Gene_Synonym']).drop_duplicates()
+
+
+def add_fasterdb_id_to_ppi_file(ppi_df: pd.DataFrame,
+                                df_fasterdb: pd.DataFrame) -> pd.DataFrame:
+    """
+    Add two columns to the ppi_df: one containing the fasterdb gene_id \
+    of the gene that encode the first interacting protein and the second \
+    containing the fasterdb gene id of the gene encoding the second \
+    interacting protein.
+
+    :param ppi_df: Dataframe of interaction
+    :param df_fasterdb: df with the fasterDB gene id of the genes and \
+    their encoded protein identified by ensembl protein id.
+    :return: The interactions whose two proteins have a fasterDB gene, \
+    with the columns gene1 and gene2.
+    """
+    links = df_fasterdb[["Protein_stable_ID", "FasterDB_id"]].rename(
+        columns={"FasterDB_id": "gene"})
+    # A duplicated (protein, gene) pair would duplicate every interaction
+    # of this protein in the merges below.
+    links = links.drop_duplicates()
+    ppi_df = ppi_df.merge(links, how="left", left_on="protein1",
+                          right_on="Protein_stable_ID")
+    # Both tables now hold Protein_stable_ID and gene: the suffixes turn
+    # them into Protein_stable_ID1/gene1 and Protein_stable_ID2/gene2.
+    ppi_df = ppi_df.merge(links, how="left", left_on="protein2",
+                          right_on="Protein_stable_ID", suffixes=("1", "2"))
+    # dropna/drop return new tables: nothing is modified in place on a
+    # slice (source of SettingWithCopyWarning).
+    ppi_df = ppi_df.dropna(subset=["gene1", "gene2"]).drop(
+        columns=["Protein_stable_ID1", "Protein_stable_ID2"])
+    return ppi_df.astype({"gene1": int, "gene2": int})
+
+
+def get_protein_interaction_table():
+    """
+    Link ensembl genes and their encoded proteins to a fasterDB gene id, \
+    then write the protein/gene table and the gene interaction table.
+    """
+    df_prot_gene, dic_synonyms = \
+        load_protein_gene_links(ConfigPPI.protein_gene)
+    dic_gene = load_fasterdb_gene(Config.bed_gene, dic_synonyms)
+    df_fasterdb_id = add_gene_id_column(df_prot_gene, dic_gene)
+    df_fasterdb_id.to_csv(ConfigPPI.protein_gene_fasterdb, sep="\t",
+                          index=False)
+    ppi_df = load_string_ppi_file(ConfigPPI.ppi_string)
+    ppi_fasterdb = add_fasterdb_id_to_ppi_file(ppi_df, df_fasterdb_id)
+    ppi_fasterdb.to_csv(ConfigPPI.fasterdb_ppi, sep="\t", index=False)
diff --git a/src/find_interaction_cluster/ppi_scripts/get_ensembl_hg19_gene_ccoordinates.py b/src/find_interaction_cluster/ppi_scripts/get_ensembl_hg19_gene_ccoordinates.py
new file mode 100644
index 0000000000000000000000000000000000000000..a2ecaeda470c7c56d08d951d1a9bb7ec3c9f0255
--- /dev/null
+++ b/src/find_interaction_cluster/ppi_scripts/get_ensembl_hg19_gene_ccoordinates.py
@@ -0,0 +1,88 @@
+#!/usr/bin/env python3
+
+# -*- coding: UTF-8 -*-
+
+"""
+Description: The goal of this script is to get only coding gene from \
+ensembl with their protein id and their coordinate in hg19.
+"""
+
+from .config_ppi import ConfigPPI
+import pandas as pd
+from pathlib import Path
+import numpy as np
+
+
+def load_genes(ensembl_file: Path) -> pd.DataFrame:
+    """
+    Load a table containing all the human protein of ensembl database \
+    and the name and location of their genes.
+
+    :param ensembl_file: A file containing all the human protein of \
+    ensembl database and the name and location of their genes.
+    :return: The lines having a protein id, restricted to the columns \
+    Gene_stable_ID, Protein_stable_ID, Gene_name and Gene_Synonym.
+    """
+    df = pd.read_csv(ensembl_file, sep="\t",
+                     dtype={"Gene_stable_ID": str,
+                            "Gene_stable_ID_version": str,
+                            "Protein_stable_ID": str,
+                            "Chromosome-scaffold_name": str,
+                            "Gene_start": int,
+                            "Gene_end": int,
+                            "Strand": str,
+                            "Gene_name": str,
+                            "Gene_Synonym": str})
+    # Lines without protein id are dropped.
+    df = df.loc[df['Protein_stable_ID'].notna(), :]
+    return df[["Gene_stable_ID", "Protein_stable_ID", "Gene_name",
+               "Gene_Synonym"]]
+
+
+def load_bed(ensembl_bed: Path) -> pd.DataFrame:
+    """
+    Load a bed file containing ensembl hg38 gene on hg19 coordinates.
+
+    :param ensembl_bed: A bed file containing ensembl gene
+    :return: The bed lines whose name appears only once, with "chr" \
+    removed from the chromosome names
+    """
+    df = pd.read_csv(ensembl_bed, names=["chr", "start", "stop", "name",
+                                         "score", "strand"], sep="\t")
+    # regex=False: same literal replacement with every pandas version.
+    df['chr'] = df["chr"].str.replace("chr", "", regex=False)
+    df = df.drop_duplicates()
+    # After removing identical lines, a name still present several times
+    # is ambiguous: only the names seen once are kept.
+    names, counts = np.unique(df['name'].values, return_counts=True)
+    return df.loc[df['name'].isin(names[counts == 1]), :]
+
+
+def merge_tables(df_gene: pd.DataFrame, df_bed: pd.DataFrame) -> pd.DataFrame:
+    """
+    Merges `df_gene` and `df_bed` together.
+
+    :param df_gene: table containing all the human protein of ensembl \
+    database and the name and location of their genes.
+    :param df_bed: A bed file containing ensembl gene. It is left \
+    untouched.
+    :return: The merged table, restricted to the genes found in df_bed
+    """
+    # Rename on a copy: renaming in place would silently change the
+    # caller's table.
+    df_bed = df_bed.rename(columns={'name': "Gene_name"})
+    df_gene = df_gene.loc[df_gene["Gene_name"].isin(df_bed["Gene_name"]), :]
+    return df_gene.merge(df_bed, how="left", on="Gene_name")
+
+
+def create_protein_gene_file():
+    """
+    Create a file linking each ensembl protein to its gene and the hg19 \
+    coordinate of this gene.
+    """
+    df_gene = load_genes(ConfigPPI.ensembl_gene)
+    df_bed = load_bed(ConfigPPI.bed_ensembl)
+    df = merge_tables(df_gene, df_bed)
+    # Create the output folder when it does not exist yet.
+    ConfigPPI.protein_gene.parent.mkdir(parents=True, exist_ok=True)
+    df.to_csv(ConfigPPI.protein_gene, sep="\t", index=False)