creation of a file linking protein and the fasterDB gene id encoding them from...

creation of a file linking each protein to the fasterDB id of the gene encoding it, from a file containing Ensembl ids for genes and their proteins in hg19

creation of a file linking protein and the fasterDB gene id encoding them from...
2eac9baa · nfontrod · cb228b23 · 2eac9baa · 2eac9baa · 2eac9baa
Commit 2eac9baa authored Nov 16, 2020 by nfontrod
--- a/src/find_interaction_cluster/ppi_scripts/config_ppi.py
+++ b/src/find_interaction_cluster/ppi_scripts/config_ppi.py
@@ -17,7 +17,7 @@ class ConfigPPI:
    output_folder = ConfigGraph.output_folder / "ppi_analysis"
    data = ConfigGraph.data
    bed_ensembl = data / "ppi_data" / "liftover_hg38gene2hg19.bed"
-    ensembl_gene = data / "ppi_data" / "mart_export_hg38_proteins_genes.txt"
+    ensembl_gene = data / "ppi_data" / "mart_export_hg37_proteins_genes.txt"
    ppi_string = data / "ppi_data" / "9606.protein.links.v11.0.txt.gz"
    protein_gene = output_folder / "protein_gene.txt"
    gene_ppi_file = output_folder / "ppi_gene_file.txt"

--- a/src/find_interaction_cluster/ppi_scripts/create_gene_interaction_file.py
+++ b/src/find_interaction_cluster/ppi_scripts/create_gene_interaction_file.py
@@ -27,43 +27,27 @@ def load_string_ppi_file(ppi_file: Path) -> pd.DataFrame:
    return df


-def load_protein_gene_links(prot_gene_file: Path) -> Tuple[pd.DataFrame, Dict]:
+def load_protein_gene_links(prot_gene_file: Path) -> pd.DataFrame:
    """
    Load the file linking each gene to it's hg19 coordinates and encoded \
    proteins.

    :param prot_gene_file: A file linking each gene to it's hg19 \
    coordinates and encoded proteins.
-    :return: The loaded file and a dictionary of synonyms
-    """
-    df = pd.read_csv(prot_gene_file, sep="\t",
-                     dtype={"Gene_stable_ID": str,
-                            "Protein_stable_ID": str,
-                            "Gene_name": str,
-                            "Gene_Synonym": str,
-                            "chr": str,
-                            "start": int,
-                            "stop": int,
-                            "strand": str})
-    dic_synonym = {}
-    for i in range(df.shape[0]):
-        mseries = df.iloc[i, :]
-        if isinstance(mseries["Gene_Synonym"], float):
-            dic_synonym[mseries["Gene_name"].upper()] = \
-                mseries["Gene_name"].upper()
-        else:
-            dic_synonym[mseries["Gene_Synonym"].upper()] = \
-                mseries["Gene_name"].upper()
-    return df, dic_synonym
+    :return: The loaded file
+    """
+    df = pd.read_csv(prot_gene_file, sep="\t")
+    df.rename({"Chromosome-scaffold_name": "chr", "Gene_start": "start",
+               "Gene_end": "stop", "Strand": "strand"}, axis=1, inplace=True)
+    df['Gene_name'] = df['Gene_name'].str.upper()
+    return df


-def load_fasterdb_gene(bed_gene: Path, dic_synonym: Dict) -> Dict:
+def load_fasterdb_gene(bed_gene: Path) -> Dict:
    """
    Load a bed file containing all fasterDB gene.

    :param bed_gene: A bed file containing all fasterdb gene
-    :param dic_synonym: A dictionary of synonyms used to replace \
-    the old fasterDB names by the new ones
    :return: A dictionary linking each fasterDB gene name to its coordinates
    """
    dic = {}
@@ -71,8 +55,6 @@ def load_fasterdb_gene(bed_gene: Path, dic_synonym: Dict) -> Dict:
        for line in inbed:
            line = line.replace("\n", "").split("\t")
            gene_name = line[4].upper()
-            if gene_name in dic_synonym.keys():
-                gene_name = dic_synonym[gene_name]
            if gene_name not in dic.keys():
                dic[gene_name] = [[line[0], int(line[1]), int(line[2]),
                                   int(line[3]), gene_name, line[5]]]
@@ -93,11 +75,7 @@ def find_fasterdb_id(mseries: pd.Series, dic: Dict) -> Optional[int]:
    :return: The fasterDB id of the ensembl gene.
    """
    if mseries['Gene_name'].upper() not in dic.keys():
-        if isinstance(mseries['Gene_Synonym'], float) or \
-                mseries['Gene_Synonym'].upper() not in dic.keys():
            return None
-        genes = dic[mseries['Gene_Synonym'].upper()]
-    else:
    genes = dic[mseries['Gene_name'].upper()]
    if len(genes) == 1:
        return genes[0][3]
@@ -127,8 +105,7 @@ def add_gene_id_column(prot_gene_df: pd.DataFrame, dic: Dict) -> pd.DataFrame:
                                                     axis=1, dic=dic)
    df = prot_gene_df.loc[-prot_gene_df['FasterDB_id'].isna(), :].copy()
    df['FasterDB_id'] = df['FasterDB_id'].astype(int)
-    df.drop(['Gene_Synonym'], axis=1, inplace=True)
-    return df.drop_duplicates()
+    return df


 def add_fasterdb_id_to_ppi_file(ppi_df: pd.DataFrame,
@@ -164,9 +141,8 @@ def get_protein_interaction_table():
    """
    link ensemble gene and their encoding protein to a fasterDB gene id.
    """
-    df_prot_gene, dic_synonyms = \
-        load_protein_gene_links(ConfigPPI.protein_gene)
-    dic_gene = load_fasterdb_gene(Config.bed_gene, dic_synonyms)
+    df_prot_gene = load_protein_gene_links(ConfigPPI.protein_gene)
+    dic_gene = load_fasterdb_gene(Config.bed_gene)
    df_fasterdb_id = add_gene_id_column(df_prot_gene, dic_gene)
    df_fasterdb_id.to_csv(ConfigPPI.protein_gene_fasterdb, sep="\t",
                          index=False)

--- a/src/find_interaction_cluster/ppi_scripts/get_ensembl_hg19_gene_ccoordinates.py
+++ b/src/find_interaction_cluster/ppi_scripts/get_ensembl_hg19_gene_ccoordinates.py
@@ -4,7 +4,7 @@

 """
 Description: The goal of this script is to get only coding gene from \
-ensemble with their protein id and their coordinate in hg19.
+ensembl..
 """

 from .config_ppi import ConfigPPI
@@ -25,49 +25,16 @@ def load_genes(ensembl_file: Path) -> pd.DataFrame:
    """
    df = pd.read_csv(ensembl_file, sep="\t",
                     dtype={"Gene_stable_ID": str,
-                            "Gene_stable_ID_version": str,
                            "Protein_stable_ID": str,
                            "Chromosome-scaffold_name": str,
                            "Gene_start": int,
                            "Gene_end": int,
                            "Strand": str,
-                            "Gene_name": str,
-                            "Gene_Synonym": str})
+                            "Gene_name": str})
    df = df.loc[-df['Protein_stable_ID'].isna(), :]
-    return df[["Gene_stable_ID", "Protein_stable_ID", "Gene_name",
-               "Gene_Synonym"]]
-
-
-def load_bed(ensembl_bed: Path) -> pd.DataFrame:
-    """
-    Load a bed file containing ensembl hg38 gene on hg19 coordinates.
-
-    :param ensembl_bed: A bed file containing ensembl gene
-    :return: the bed file
-    """
-    df = pd.read_csv(ensembl_bed, names=["chr", "start", "stop", "name",
-                                         "score", "strand"], sep="\t")
-    df['chr'] = df["chr"].str.replace("chr", "")
-    df = df.drop_duplicates()
-    names = np.unique(df['name'].values, return_counts=True)
-    good_names = [cname for i, cname in enumerate(list(names[0]))
-                  if names[1][i] == 1]
-    return df.loc[df['name'].isin(good_names), :]
-
-
-def merge_tables(df_gene: pd.DataFrame, df_bed: pd.DataFrame) -> pd.DataFrame:
-    """
-    Merges `df_gene` and `df_bed`together.
-
-    :param df_gene: table containing all the human protein of ensembl \
-     database and the name and location of their genes.
-    :param df_bed: A bed file containing ensembl gene
-    :return: The merged table
-    """
-    df_gene = df_gene.loc[
-        df_gene["Gene_name"].isin(list(df_bed['name'].unique())), :]
-    df_bed.rename({'name': "Gene_name"}, inplace=True, axis=1)
-    return df_gene.merge(df_bed, how="left", on="Gene_name")
+    df = df.loc[df["Chromosome-scaffold_name"].isin(
+        list(map(str, range(1, 23))) + ["X", "Y"]), :]
+    return df.drop_duplicates()


 def create_protein_gene_file():
@@ -77,7 +44,5 @@ def create_protein_gene_file():
    """

    df_gene = load_genes(ConfigPPI.ensembl_gene)
-    df_bed = load_bed(ConfigPPI.bed_ensembl)
-    df = merge_tables(df_gene, df_bed)
    ConfigPPI.protein_gene.parent.mkdir(parents=True, exist_ok=True)
-    df.to_csv(ConfigPPI.protein_gene, sep="\t", index=False)
+    df_gene.to_csv(ConfigPPI.protein_gene, sep="\t", index=False)