From 3a733e5958353871bb3f7182281bd8940276a11c Mon Sep 17 00:00:00 2001
From: alapendr <audrey.lapendry@ens-lyon.fr>
Date: Thu, 17 Sep 2020 14:15:56 +0200
Subject: [PATCH] features_interactions.py: modifications to add the distance
 between genes or exons in pairwise interactions

---
 .../interactions/features_interactions.py     | 84 ++++++++++++++++++-
 1 file changed, 83 insertions(+), 1 deletion(-)

diff --git a/src/db_utils/interactions/features_interactions.py b/src/db_utils/interactions/features_interactions.py
index 6369dbb8..92253d8b 100644
--- a/src/db_utils/interactions/features_interactions.py
+++ b/src/db_utils/interactions/features_interactions.py
@@ -30,6 +30,7 @@ import re
 import numpy as np
 import os.path
 import sys
+import sqlite3
 from ..populate_database import populate_df
 
 
@@ -251,12 +252,91 @@ def filtering_2(df_filter_2: pd.DataFrame):
     if FILE_1 == FILE_2:
         df_filter_2["id_project"] = FILE_1
         df_filter_2["id2"] = df_filter_2["id"].astype(str) + "_" + \
-                             df_filter_2["id_project"].astype(str)
+            df_filter_2["id_project"].astype(str)
         del df_filter_2["id"]
         df_filter_2.rename(columns={"id2": "id"}, inplace=True)
     return df_filter_2
 
 
+def get_info_from_database(cnx: sqlite3.Connection, query: str) -> \
+        pd.DataFrame:
+    """
+    Get the exons or the genes information from the database.
+    - e.g. exon: 1_1 18 28681865 28682388 / gene: 1 18 28645943 28682388
+
+    :param cnx: connection to the ChIA-PET database
+    :param query: SQL query returning (id, chromosome, start, stop) rows
+    :return: dataframe with the columns "ID", "chr", "start" and "stop"
+    """
+    cursor = cnx.cursor()
+    try:
+        res = cursor.execute(query).fetchall()
+    finally:
+        cursor.close()  # release the cursor even if the query fails
+    return pd.DataFrame(res, columns=["ID", "chr", "start", "stop"])
+
+
+def _distance_from_coordinates(df: pd.DataFrame) -> pd.DataFrame:
+    """
+    Fill the "distance" column of df from its chr1, start1, stop1, chr2,
+    start2 and stop2 columns.
+
+    :param df: interactions with the coordinates of their two features
+    :return: df, modified in place
+    """
+    after = (df["start1"] > df["start2"]) | (df["stop1"] > df["stop2"])
+    before = (df["start1"] < df["start2"]) | (df["stop1"] < df["stop2"])
+    # "after" is assigned last: it wins when both masks are true, i.e. when
+    # one feature is nested in the other
+    df.loc[before, "distance"] = df["start2"] - df["stop1"] + 1
+    df.loc[after, "distance"] = df["start1"] - df["stop2"] + 1
+    df.loc[df["chr1"] != df["chr2"], "distance"] = "NULL"
+    return df
+
+
+def add_info_distance_between_features(df: pd.DataFrame) -> pd.DataFrame:
+    """
+    Add a "distance" column giving, for each interaction, the distance
+    between its two exons (OPT_EXON == "on") or, failing that, its two genes
+    (OPT_GENE == "on"). If none of these options is on, df is returned as is.
+
+    The coordinates of the features are read from the cin_exon or cin_gene
+    table of the database Config.db_file. The interactions involving a
+    feature missing from this table are dropped (inner merges).
+
+    The distance is:
+    - "NULL" if the two features are located on different chromosomes,
+    - else "start1 - stop2 + 1" if feature 1 starts or stops after feature 2,
+    - else "start2 - stop1 + 1" if feature 1 starts or stops before feature 2,
+    - else NaN: the two features have an identical start and stop.
+
+    :param df: Result of the "filtering_2" function
+    :return: df with the "distance" column added
+    """
+    if OPT_EXON == "on":
+        feature, table = "exon", "cin_exon"
+    elif OPT_GENE == "on":
+        feature, table = "gene", "cin_gene"
+    else:
+        return df
+    cnx = sqlite3.connect(Config.db_file)
+    try:
+        info = get_info_from_database(
+            cnx, "SELECT id, chromosome, start, stop FROM " + table)
+    finally:
+        cnx.close()  # always release the database handle
+    if feature == "gene":
+        # NOTE(review): presumably gene1/gene2 hold strings - confirm
+        info["ID"] = info["ID"].astype(str)
+    for num in ("1", "2"):
+        # e.g. exon1 is matched to ID, then ID/chr/start/stop get suffix "1"
+        df = df.merge(info, left_on=feature + num, right_on="ID")
+        df = df.rename(columns={col: col + num for col in info.columns})
+    df = _distance_from_coordinates(df)
+    added = [col + num for num in ("1", "2") for col in info.columns]
+    return df.drop(added, axis="columns")
+
+
 def create_interaction_table(logging_level: str = "DISABLE"):
     """
     Create the interaction tables.
@@ -281,6 +361,8 @@ def create_interaction_table(logging_level: str = "DISABLE"):
     logging.debug("Sum weight of identical interaction")
     df = filtering_2(df)
     logging.debug(df.head())
+    df = add_info_distance_between_features(df)
+    logging.debug(df.head())
     df.to_csv(MY_OUT_FILE, index=False, sep="\t", header=True)
     if OPT_EXON == "on":
         logging.debug("Filling cin_exon_interaction")
-- 
GitLab