diff --git a/src/find_interaction_cluster/ppi_scripts/config_ppi.py b/src/find_interaction_cluster/ppi_scripts/config_ppi.py index 312baea9d862909fa409ab7aed53984c2e50e702..d61820b4b05874c7d86013efed9c5018f6407623 100644 --- a/src/find_interaction_cluster/ppi_scripts/config_ppi.py +++ b/src/find_interaction_cluster/ppi_scripts/config_ppi.py @@ -17,7 +17,7 @@ class ConfigPPI: output_folder = ConfigGraph.output_folder / "ppi_analysis" data = ConfigGraph.data bed_ensembl = data / "ppi_data" / "liftover_hg38gene2hg19.bed" - ensembl_gene = data / "ppi_data" / "mart_export_hg38_proteins_genes.txt" + ensembl_gene = data / "ppi_data" / "mart_export_hg37_proteins_genes.txt" ppi_string = data / "ppi_data" / "9606.protein.links.v11.0.txt.gz" protein_gene = output_folder / "protein_gene.txt" gene_ppi_file = output_folder / "ppi_gene_file.txt" diff --git a/src/find_interaction_cluster/ppi_scripts/create_gene_interaction_file.py b/src/find_interaction_cluster/ppi_scripts/create_gene_interaction_file.py index 59221d693e4fa8d558d4b2018b94f8cc94964bfa..3c3bebabfea017d47bcfa3ae0cdef0dc53949518 100644 --- a/src/find_interaction_cluster/ppi_scripts/create_gene_interaction_file.py +++ b/src/find_interaction_cluster/ppi_scripts/create_gene_interaction_file.py @@ -27,43 +27,27 @@ def load_string_ppi_file(ppi_file: Path) -> pd.DataFrame: return df -def load_protein_gene_links(prot_gene_file: Path) -> Tuple[pd.DataFrame, Dict]: +def load_protein_gene_links(prot_gene_file: Path) -> pd.DataFrame: """ Load the file linking each gene to it's hg19 coordinates and encoded \ proteins. :param prot_gene_file: A file linking each gene to it's hg19 \ coordinates and encoded proteins. - :return: The loaded file and a dictionary of synonyms + :return: The loaded file """ - df = pd.read_csv(prot_gene_file, sep="\t", - dtype={"Gene_stable_ID": str, - "Protein_stable_ID": str, - "Gene_name": str, - "Gene_Synonym": str, - "chr": str, - "start": int, - "stop": int, - "strand": str}) - dic_synonym = {} - for i in range(df.shape[0]): - mseries = df.iloc[i, :] - if isinstance(mseries["Gene_Synonym"], float): - dic_synonym[mseries["Gene_name"].upper()] = \ - mseries["Gene_name"].upper() - else: - dic_synonym[mseries["Gene_Synonym"].upper()] = \ - mseries["Gene_name"].upper() - return df, dic_synonym - - -def load_fasterdb_gene(bed_gene: Path, dic_synonym: Dict) -> Dict: + df = pd.read_csv(prot_gene_file, sep="\t") + df.rename({"Chromosome-scaffold_name": "chr", "Gene_start": "start", + "Gene_end": "stop", "Strand": "strand"}, axis=1, inplace=True) + df['Gene_name'] = df['Gene_name'].str.upper() + return df + + +def load_fasterdb_gene(bed_gene: Path) -> Dict: """ Load a bed file containing all fasterDB gene. :param bed_gene: A bed file containing all fasterdb gene - :param dic_synonym: A dictionary of synonyms used to replace \ - the old fasterDB names by the new ones :return: A dictionary linking each fasterDB gene name to its coordinates """ dic = {} @@ -71,8 +55,6 @@ def load_fasterdb_gene(bed_gene: Path, dic_synonym: Dict) -> Dict: for line in inbed: line = line.replace("\n", "").split("\t") gene_name = line[4].upper() - if gene_name in dic_synonym.keys(): - gene_name = dic_synonym[gene_name] if gene_name not in dic.keys(): dic[gene_name] = [[line[0], int(line[1]), int(line[2]), int(line[3]), gene_name, line[5]]] @@ -93,12 +75,8 @@ def find_fasterdb_id(mseries: pd.Series, dic: Dict) -> Optional[int]: :return: The fasterDB id of the ensembl gene. """ if mseries['Gene_name'].upper() not in dic.keys(): - if isinstance(mseries['Gene_Synonym'], float) or \ - mseries['Gene_Synonym'].upper() not in dic.keys(): return None - genes = dic[mseries['Gene_Synonym'].upper()] - else: - genes = dic[mseries['Gene_name'].upper()] + genes = dic[mseries['Gene_name'].upper()] if len(genes) == 1: return genes[0][3] elif len(genes) >= 1: @@ -127,8 +105,7 @@ def add_gene_id_column(prot_gene_df: pd.DataFrame, dic: Dict) -> pd.DataFrame: axis=1, dic=dic) df = prot_gene_df.loc[-prot_gene_df['FasterDB_id'].isna(), :].copy() df['FasterDB_id'] = df['FasterDB_id'].astype(int) - df.drop(['Gene_Synonym'], axis=1, inplace=True) - return df.drop_duplicates() + return df def add_fasterdb_id_to_ppi_file(ppi_df: pd.DataFrame, @@ -164,9 +141,8 @@ def get_protein_interaction_table(): """ link ensemble gene and their encoding protein to a fasterDB gene id. """ - df_prot_gene, dic_synonyms = \ - load_protein_gene_links(ConfigPPI.protein_gene) - dic_gene = load_fasterdb_gene(Config.bed_gene, dic_synonyms) + df_prot_gene = load_protein_gene_links(ConfigPPI.protein_gene) + dic_gene = load_fasterdb_gene(Config.bed_gene) df_fasterdb_id = add_gene_id_column(df_prot_gene, dic_gene) df_fasterdb_id.to_csv(ConfigPPI.protein_gene_fasterdb, sep="\t", index=False) diff --git a/src/find_interaction_cluster/ppi_scripts/get_ensembl_hg19_gene_ccoordinates.py b/src/find_interaction_cluster/ppi_scripts/get_ensembl_hg19_gene_ccoordinates.py index a2ecaeda470c7c56d08d951d1a9bb7ec3c9f0255..0f2c9fa94c751d03bde3351dff9a3f5367ab6a3d 100644 --- a/src/find_interaction_cluster/ppi_scripts/get_ensembl_hg19_gene_ccoordinates.py +++ b/src/find_interaction_cluster/ppi_scripts/get_ensembl_hg19_gene_ccoordinates.py @@ -4,7 +4,7 @@ """ Description: The goal of this script is to get only coding gene from \ -ensemble with their protein id and their coordinate in hg19. +ensembl.. """ from .config_ppi import ConfigPPI @@ -25,49 +25,16 @@ def load_genes(ensembl_file: Path) -> pd.DataFrame: """ df = pd.read_csv(ensembl_file, sep="\t", dtype={"Gene_stable_ID": str, - "Gene_stable_ID_version": str, "Protein_stable_ID": str, "Chromosome-scaffold_name": str, "Gene_start": int, "Gene_end": int, "Strand": str, - "Gene_name": str, - "Gene_Synonym": str}) + "Gene_name": str}) df = df.loc[-df['Protein_stable_ID'].isna(), :] - return df[["Gene_stable_ID", "Protein_stable_ID", "Gene_name", - "Gene_Synonym"]] - - -def load_bed(ensembl_bed: Path) -> pd.DataFrame: - """ - Load a bed file containing ensembl hg38 gene on hg19 coordinates. - - :param ensembl_bed: A bed file containing ensembl gene - :return: the bed file - """ - df = pd.read_csv(ensembl_bed, names=["chr", "start", "stop", "name", - "score", "strand"], sep="\t") - df['chr'] = df["chr"].str.replace("chr", "") - df = df.drop_duplicates() - names = np.unique(df['name'].values, return_counts=True) - good_names = [cname for i, cname in enumerate(list(names[0])) - if names[1][i] == 1] - return df.loc[df['name'].isin(good_names), :] - - -def merge_tables(df_gene: pd.DataFrame, df_bed: pd.DataFrame) -> pd.DataFrame: - """ - Merges `df_gene` and `df_bed`together. - - :param df_gene: table containing all the human protein of ensembl \ - database and the name and location of their genes. - :param df_bed: A bed file containing ensembl gene - :return: The merged table - """ - df_gene = df_gene.loc[ - df_gene["Gene_name"].isin(list(df_bed['name'].unique())), :] - df_bed.rename({'name': "Gene_name"}, inplace=True, axis=1) - return df_gene.merge(df_bed, how="left", on="Gene_name") + df = df.loc[df["Chromosome-scaffold_name"].isin( + list(map(str, range(1, 23))) + ["X", "Y"]), :] + return df.drop_duplicates() def create_protein_gene_file(): @@ -77,7 +44,5 @@ def create_protein_gene_file(): """ df_gene = load_genes(ConfigPPI.ensembl_gene) - df_bed = load_bed(ConfigPPI.bed_ensembl) - df = merge_tables(df_gene, df_bed) ConfigPPI.protein_gene.parent.mkdir(parents=True, exist_ok=True) - df.to_csv(ConfigPPI.protein_gene, sep="\t", index=False) + df_gene.to_csv(ConfigPPI.protein_gene, sep="\t", index=False)