From 3a733e5958353871bb3f7182281bd8940276a11c Mon Sep 17 00:00:00 2001 From: alapendr <audrey.lapendry@ens-lyon.fr> Date: Thu, 17 Sep 2020 14:15:56 +0200 Subject: [PATCH] features_interactions.py: modifications to add the distance between genes or exons in pairwise interactions --- .../interactions/features_interactions.py | 84 ++++++++++++++++++- 1 file changed, 83 insertions(+), 1 deletion(-) diff --git a/src/db_utils/interactions/features_interactions.py b/src/db_utils/interactions/features_interactions.py index 6369dbb8..92253d8b 100644 --- a/src/db_utils/interactions/features_interactions.py +++ b/src/db_utils/interactions/features_interactions.py @@ -30,6 +30,7 @@ import re import numpy as np import os.path import sys +import sqlite3 from ..populate_database import populate_df @@ -251,12 +252,91 @@ def filtering_2(df_filter_2: pd.DataFrame): if FILE_1 == FILE_2: df_filter_2["id_project"] = FILE_1 df_filter_2["id2"] = df_filter_2["id"].astype(str) + "_" + \ - df_filter_2["id_project"].astype(str) + df_filter_2["id_project"].astype(str) del df_filter_2["id"] df_filter_2.rename(columns={"id2": "id"}, inplace=True) return df_filter_2 +def get_info_from_database(cnx: sqlite3.Connection, query: str) -> \ + pd.DataFrame: + """ + Get the exons and the genes information from the database, for example: + - for the exons: 1_1 18 28681865 28682388 + - for the genes: 1 18 28645943 28682388 + + :param cnx: connection to the ChIA-PET database + :param query: the SQL query that allows us to get data from the database + :return df_res: the dataframe with the data obtained + """ + cursor = cnx.cursor() + cursor.execute(query) + res = list(cursor.fetchall()) + df_res = pd.DataFrame(res, columns=["ID", "chr", "start", "stop"]) + return df_res + + +def add_info_distance_between_features(df: pd.DataFrame) -> pd.DataFrame: + """ + Calculate the distance between the two genes or the two exons of each + interaction and add it to the result dataframe in a "distance" column. 
+ If the result is NULL it is because the two exons or genes are located on + different chromosomes. + If the result is (null), so NaN, it is because the two exons or genes + have strictly identical coordinates (same start and same stop). + + :param df: Result of the "filtering_2" function + :return df: df with a "distance" column holding the distance, NULL or + (null); see above for more details. + """ + info_exon = get_info_from_database(sqlite3.connect(Config.db_file), + """SELECT id, chromosome, start, stop + FROM cin_exon""") + info_gene = get_info_from_database(sqlite3.connect(Config.db_file), + """SELECT id, chromosome, start, stop + FROM cin_gene""") + if OPT_EXON == "on": + df = df.merge(info_exon, left_on="exon1", right_on="ID") + df = df.rename(columns={'ID': 'ID1', 'chr': 'chr1', 'start': 'start1', + 'stop': 'stop1'}) + df = df.merge(info_exon, left_on="exon2", right_on="ID") + df = df.rename(columns={'ID': 'ID2', 'chr': 'chr2', 'start': 'start2', + 'stop': 'stop2'}) + df.loc[df["start1"] < df["start2"], "distance"] = \ + (df["start2"] - df["stop1"] + 1) + df.loc[df["stop1"] < df["stop2"], "distance"] = \ + (df["start2"] - df["stop1"] + 1) + df.loc[df["start1"] > df["start2"], "distance"] = \ + (df["start1"] - df["stop2"] + 1) + df.loc[df["stop1"] > df["stop2"], "distance"] = \ + (df["start1"] - df["stop2"] + 1) + df.loc[df["chr1"] != df["chr2"], "distance"] = "NULL" + df.drop(["start1", "ID1", "chr1", "stop1", "ID2", "chr2", "start2", + "stop2"], axis='columns', inplace=True) + elif OPT_GENE == "on": + info_gene["ID"] = info_gene["ID"].astype(str) + df = df.merge(info_gene, left_on="gene1", right_on="ID") + df = df.rename(columns={'ID': 'ID1', 'chr': 'chr1', 'start': 'start1', + 'stop': 'stop1'}) + df = df.merge(info_gene, left_on="gene2", right_on="ID") + df = df.rename(columns={'ID': 'ID2', 'chr': 'chr2', 'start': 'start2', + 'stop': 'stop2'}) + df.loc[df["start1"] < df["start2"], "distance"] = \ + (df["start2"] - df["stop1"] + 1) + df.loc[df["stop1"] < 
df["stop2"], "distance"] = \ + (df["start2"] - df["stop1"] + 1) + df.loc[df["start1"] > df["start2"], "distance"] = \ + (df["start1"] - df["stop2"] + 1) + df.loc[df["start1"] > df["start2"], "distance"] = \ + (df["start1"] - df["stop2"] + 1) + df.loc[df["stop1"] > df["stop2"], "distance"] = \ + (df["start1"] - df["stop2"] + 1) + df.loc[df["chr1"] != df["chr2"], "distance"] = "NULL" + df.drop(["start1", "ID1", "chr1", "stop1", "ID2", "chr2", + "start2", "stop2"], axis='columns', inplace=True) + return df + + def create_interaction_table(logging_level: str = "DISABLE"): """ Create the interaction tables. @@ -281,6 +361,8 @@ def create_interaction_table(logging_level: str = "DISABLE"): logging.debug("Sum weight of identical interaction") df = filtering_2(df) logging.debug(df.head()) + df = add_info_distance_between_features(df) + logging.debug(df.head()) df.to_csv(MY_OUT_FILE, index=False, sep="\t", header=True) if OPT_EXON == "on": logging.debug("Filling cin_exon_interaction") -- GitLab