Skip to content
Snippets Groups Projects
Commit 2eac9baa authored by nfontrod's avatar nfontrod
Browse files

creation of a file linking protein and the fasterDB gene id encoding them from...

Creation of a file linking each protein to the fasterDB id of the gene encoding it, from a file containing the Ensembl ids of genes and their proteins in hg19
parent cb228b23
Branches
No related tags found
No related merge requests found
......@@ -17,7 +17,7 @@ class ConfigPPI:
output_folder = ConfigGraph.output_folder / "ppi_analysis"
data = ConfigGraph.data
bed_ensembl = data / "ppi_data" / "liftover_hg38gene2hg19.bed"
ensembl_gene = data / "ppi_data" / "mart_export_hg38_proteins_genes.txt"
ensembl_gene = data / "ppi_data" / "mart_export_hg37_proteins_genes.txt"
ppi_string = data / "ppi_data" / "9606.protein.links.v11.0.txt.gz"
protein_gene = output_folder / "protein_gene.txt"
gene_ppi_file = output_folder / "ppi_gene_file.txt"
......
......@@ -27,43 +27,27 @@ def load_string_ppi_file(ppi_file: Path) -> pd.DataFrame:
return df
def load_protein_gene_links(prot_gene_file: Path) -> Tuple[pd.DataFrame, Dict]:
def load_protein_gene_links(prot_gene_file: Path) -> pd.DataFrame:
"""
Load the file linking each gene to its hg19 coordinates and encoded \
proteins.
:param prot_gene_file: A file linking each gene to its hg19 \
coordinates and encoded proteins.
:return: The loaded file and a dictionary of synonyms
"""
df = pd.read_csv(prot_gene_file, sep="\t",
dtype={"Gene_stable_ID": str,
"Protein_stable_ID": str,
"Gene_name": str,
"Gene_Synonym": str,
"chr": str,
"start": int,
"stop": int,
"strand": str})
dic_synonym = {}
for i in range(df.shape[0]):
mseries = df.iloc[i, :]
if isinstance(mseries["Gene_Synonym"], float):
dic_synonym[mseries["Gene_name"].upper()] = \
mseries["Gene_name"].upper()
else:
dic_synonym[mseries["Gene_Synonym"].upper()] = \
mseries["Gene_name"].upper()
return df, dic_synonym
:return: The loaded file
"""
df = pd.read_csv(prot_gene_file, sep="\t")
df.rename({"Chromosome-scaffold_name": "chr", "Gene_start": "start",
"Gene_end": "stop", "Strand": "strand"}, axis=1, inplace=True)
df['Gene_name'] = df['Gene_name'].str.upper()
return df
def load_fasterdb_gene(bed_gene: Path, dic_synonym: Dict) -> Dict:
def load_fasterdb_gene(bed_gene: Path) -> Dict:
"""
Load a bed file containing all fasterDB gene.
:param bed_gene: A bed file containing all fasterdb gene
:param dic_synonym: A dictionary of synonyms used to replace \
the old fasterDB names by the new ones
:return: A dictionary linking each fasterDB gene name to its coordinates
"""
dic = {}
......@@ -71,8 +55,6 @@ def load_fasterdb_gene(bed_gene: Path, dic_synonym: Dict) -> Dict:
for line in inbed:
line = line.replace("\n", "").split("\t")
gene_name = line[4].upper()
if gene_name in dic_synonym.keys():
gene_name = dic_synonym[gene_name]
if gene_name not in dic.keys():
dic[gene_name] = [[line[0], int(line[1]), int(line[2]),
int(line[3]), gene_name, line[5]]]
......@@ -93,11 +75,7 @@ def find_fasterdb_id(mseries: pd.Series, dic: Dict) -> Optional[int]:
:return: The fasterDB id of the ensembl gene.
"""
if mseries['Gene_name'].upper() not in dic.keys():
if isinstance(mseries['Gene_Synonym'], float) or \
mseries['Gene_Synonym'].upper() not in dic.keys():
return None
genes = dic[mseries['Gene_Synonym'].upper()]
else:
genes = dic[mseries['Gene_name'].upper()]
if len(genes) == 1:
return genes[0][3]
......@@ -127,8 +105,7 @@ def add_gene_id_column(prot_gene_df: pd.DataFrame, dic: Dict) -> pd.DataFrame:
axis=1, dic=dic)
df = prot_gene_df.loc[-prot_gene_df['FasterDB_id'].isna(), :].copy()
df['FasterDB_id'] = df['FasterDB_id'].astype(int)
df.drop(['Gene_Synonym'], axis=1, inplace=True)
return df.drop_duplicates()
return df
def add_fasterdb_id_to_ppi_file(ppi_df: pd.DataFrame,
......@@ -164,9 +141,8 @@ def get_protein_interaction_table():
"""
Link Ensembl genes and the proteins they encode to a fasterDB gene id.
"""
df_prot_gene, dic_synonyms = \
load_protein_gene_links(ConfigPPI.protein_gene)
dic_gene = load_fasterdb_gene(Config.bed_gene, dic_synonyms)
df_prot_gene = load_protein_gene_links(ConfigPPI.protein_gene)
dic_gene = load_fasterdb_gene(Config.bed_gene)
df_fasterdb_id = add_gene_id_column(df_prot_gene, dic_gene)
df_fasterdb_id.to_csv(ConfigPPI.protein_gene_fasterdb, sep="\t",
index=False)
......
......@@ -4,7 +4,7 @@
"""
Description: The goal of this script is to get only coding gene from \
ensemble with their protein id and their coordinate in hg19.
ensembl..
"""
from .config_ppi import ConfigPPI
......@@ -25,49 +25,16 @@ def load_genes(ensembl_file: Path) -> pd.DataFrame:
"""
df = pd.read_csv(ensembl_file, sep="\t",
dtype={"Gene_stable_ID": str,
"Gene_stable_ID_version": str,
"Protein_stable_ID": str,
"Chromosome-scaffold_name": str,
"Gene_start": int,
"Gene_end": int,
"Strand": str,
"Gene_name": str,
"Gene_Synonym": str})
"Gene_name": str})
df = df.loc[-df['Protein_stable_ID'].isna(), :]
return df[["Gene_stable_ID", "Protein_stable_ID", "Gene_name",
"Gene_Synonym"]]
def load_bed(ensembl_bed: Path) -> pd.DataFrame:
"""
Load a bed file containing ensembl hg38 gene on hg19 coordinates.
:param ensembl_bed: A bed file containing ensembl gene
:return: the bed file
"""
df = pd.read_csv(ensembl_bed, names=["chr", "start", "stop", "name",
"score", "strand"], sep="\t")
df['chr'] = df["chr"].str.replace("chr", "")
df = df.drop_duplicates()
names = np.unique(df['name'].values, return_counts=True)
good_names = [cname for i, cname in enumerate(list(names[0]))
if names[1][i] == 1]
return df.loc[df['name'].isin(good_names), :]
def merge_tables(df_gene: pd.DataFrame, df_bed: pd.DataFrame) -> pd.DataFrame:
"""
Merges `df_gene` and `df_bed`together.
:param df_gene: table containing all the human protein of ensembl \
database and the name and location of their genes.
:param df_bed: A bed file containing ensembl gene
:return: The merged table
"""
df_gene = df_gene.loc[
df_gene["Gene_name"].isin(list(df_bed['name'].unique())), :]
df_bed.rename({'name': "Gene_name"}, inplace=True, axis=1)
return df_gene.merge(df_bed, how="left", on="Gene_name")
df = df.loc[df["Chromosome-scaffold_name"].isin(
list(map(str, range(1, 23))) + ["X", "Y"]), :]
return df.drop_duplicates()
def create_protein_gene_file():
......@@ -77,7 +44,5 @@ def create_protein_gene_file():
"""
df_gene = load_genes(ConfigPPI.ensembl_gene)
df_bed = load_bed(ConfigPPI.bed_ensembl)
df = merge_tables(df_gene, df_bed)
ConfigPPI.protein_gene.parent.mkdir(parents=True, exist_ok=True)
df.to_csv(ConfigPPI.protein_gene, sep="\t", index=False)
df_gene.to_csv(ConfigPPI.protein_gene, sep="\t", index=False)
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Please register or sign in to comment