diff --git a/src/db_utils/db_creation.py b/src/db_utils/db_creation.py index ccfed76ee3fd0616cf19ad91ab26ba31d9caa572..079008273fc48ebd3090af963a9a4bbf37237e7a 100755 --- a/src/db_utils/db_creation.py +++ b/src/db_utils/db_creation.py @@ -124,6 +124,7 @@ def create_cin_exon_interaction_table(conn: sqlite3.Connection) -> None: [exon2] VARCHAR(30) NOT NULL, [id_project] INT NOT NULL, [level] VARCHAR(25) NOT NULL, + [distance] VARCHAR(30), PRIMARY KEY ([id]), FOREIGN KEY ([exon1]) REFERENCES cin_exon([id]), FOREIGN KEY ([exon2]) REFERENCES cin_exon([id]), @@ -146,6 +147,7 @@ def create_cin_gene_interaction_table(conn: sqlite3.Connection) -> None: [gene2] INT NOT NULL, [id_project] INT NOT NULL, [level] VARCHAR(25) NOT NULL, + [distance] VARCHAR(30), PRIMARY KEY ([id]), FOREIGN KEY ([gene1]) REFERENCES cin_gene([id]), FOREIGN KEY ([gene2]) REFERENCES cin_gene([id]), diff --git a/src/db_utils/interactions/config_interactions.py b/src/db_utils/interactions/config_interactions.py index 753a9a2c3b2d238b5bd6a45ae8388d030b03812b..517923d9da0d4703462d2e3e981de3feb952e539 100644 --- a/src/db_utils/interactions/config_interactions.py +++ b/src/db_utils/interactions/config_interactions.py @@ -8,6 +8,7 @@ Description: Configuration variables for subfolder interactions. """ from ..config import Config +from pathlib import Path class ConfigInteractions: @@ -30,3 +31,5 @@ class ConfigInteractions: pet_vs_gene_output = chia_pet_interaction / 'intersections_gene' couple_exon = chia_pet_interaction / 'couple_exon' couple_gene = chia_pet_interaction / 'couple_gene' + results = Path(__file__).parents[3] / "results" + db_file = results / 'chia_pet_database.db' diff --git a/src/db_utils/interactions/features_interactions.py b/src/db_utils/interactions/features_interactions.py index 6369dbb85172f6815b964c1184abad4d5ea01a5f..92253d8be30ee66ed26948b8eb2828daa4c5d35f 100644 --- a/src/db_utils/interactions/features_interactions.py +++ b/src/db_utils/interactions/features_interactions.py @@ -30,6 +30,7 @@ import re import numpy as np import os.path import sys +import sqlite3 from ..populate_database import populate_df @@ -251,12 +252,91 @@ def filtering_2(df_filter_2: pd.DataFrame): if FILE_1 == FILE_2: df_filter_2["id_project"] = FILE_1 df_filter_2["id2"] = df_filter_2["id"].astype(str) + "_" + \ - df_filter_2["id_project"].astype(str) + df_filter_2["id_project"].astype(str) del df_filter_2["id"] df_filter_2.rename(columns={"id2": "id"}, inplace=True) return df_filter_2 +def get_info_from_database(cnx: sqlite3.Connection, query: str) -> \ + pd.DataFrame: + """ + Get the exons and the genes information from the database, for example: + - for the exons: 1_1 18 28681865 28682388 + - for the genes: 1 18 28645943 28682388 + + :param cnx: connexion to the ChIA-PET database + :param query: the SQL query that allows us to get data from the database + :return df_res: the dataframe with the data obtained + """ + cursor = cnx.cursor() + cursor.execute(query) + res = list(cursor.fetchall()) + df_res = pd.DataFrame(res, columns=["ID", "chr", "start", "stop"]) + return df_res + + +def add_info_distance_between_features(df: pd.DataFrame) -> pd.DataFrame: + """ + Allows to calculate the distance between the two genes or the two exons + studied in interaction and to add this information in the result dataframe. + If the result is NULL it is because we study two exons or genes located in + the same chromosome. + If the result is (null), so NaN, it is because we study two exons or genes + which have different identifiers, but strictly identical coordinates. + + :param df: Result of the "filtering_2" function + :return df: df with distances added or NULL or (null) see before for more + details. + """ + info_exon = get_info_from_database(sqlite3.connect(Config.db_file), + """SELECT id, chromosome, start, stop + FROM cin_exon""") + info_gene = get_info_from_database(sqlite3.connect(Config.db_file), + """SELECT id, chromosome, start, stop + FROM cin_gene""") + if OPT_EXON == "on": + df = df.merge(info_exon, left_on="exon1", right_on="ID") + df = df.rename(columns={'ID': 'ID1', 'chr': 'chr1', 'start': 'start1', + 'stop': 'stop1'}) + df = df.merge(info_exon, left_on="exon2", right_on="ID") + df = df.rename(columns={'ID': 'ID2', 'chr': 'chr2', 'start': 'start2', + 'stop': 'stop2'}) + df.loc[df["start1"] < df["start2"], "distance"] = \ + (df["start2"] - df["stop1"] + 1) + df.loc[df["stop1"] < df["stop2"], "distance"] = \ + (df["start2"] - df["stop1"] + 1) + df.loc[df["start1"] > df["start2"], "distance"] = \ + (df["start1"] - df["stop2"] + 1) + df.loc[df["stop1"] > df["stop2"], "distance"] = \ + (df["start1"] - df["stop2"] + 1) + df.loc[df["chr1"] != df["chr2"], "distance"] = "NULL" + df.drop(["start1", "ID1", "chr1", "stop1", "ID2", "chr2", "start2", + "stop2"], axis='columns', inplace=True) + elif OPT_GENE == "on": + info_gene["ID"] = info_gene["ID"].astype(str) + df = df.merge(info_gene, left_on="gene1", right_on="ID") + df = df.rename(columns={'ID': 'ID1', 'chr': 'chr1', 'start': 'start1', + 'stop': 'stop1'}) + df = df.merge(info_gene, left_on="gene2", right_on="ID") + df = df.rename(columns={'ID': 'ID2', 'chr': 'chr2', 'start': 'start2', + 'stop': 'stop2'}) + df.loc[df["start1"] < df["start2"], "distance"] = \ + (df["start2"] - df["stop1"] + 1) + df.loc[df["stop1"] < df["stop2"], "distance"] = \ + (df["start2"] - df["stop1"] + 1) + df.loc[df["start1"] > df["start2"], "distance"] = \ + (df["start1"] - df["stop2"] + 1) + df.loc[df["start1"] > df["start2"], "distance"] = \ + (df["start1"] - df["stop2"] + 1) + df.loc[df["stop1"] > df["stop2"], "distance"] = \ + (df["start1"] - df["stop2"] + 1) + df.loc[df["chr1"] != df["chr2"], "distance"] = "NULL" + df.drop(["start1", "ID1", "chr1", "stop1", "ID2", "chr2", + "start2", "stop2"], axis='columns', inplace=True) + return df + + def create_interaction_table(logging_level: str = "DISABLE"): """ Create the interaction tables. @@ -281,6 +361,8 @@ def create_interaction_table(logging_level: str = "DISABLE"): logging.debug("Sum weight of identical interaction") df = filtering_2(df) logging.debug(df.head()) + df = add_info_distance_between_features(df) + logging.debug(df.head()) df.to_csv(MY_OUT_FILE, index=False, sep="\t", header=True) if OPT_EXON == "on": logging.debug("Filling cin_exon_interaction")