Merge dev into master

2a804a99 · nfontrod · 2578a449 · f84037e4 · 2a804a99 · 2a804a99
Commit 2a804a99 authored 4 years ago by nfontrod
--- a/src/db_utils/db_creation.py
+++ b/src/db_utils/db_creation.py
@@ -124,6 +124,7 @@ def create_cin_exon_interaction_table(conn: sqlite3.Connection) -> None:
                [exon2] VARCHAR(30) NOT NULL,
                [id_project] INT NOT NULL,
                [level] VARCHAR(25) NOT NULL,
+                [distance] VARCHAR(30),
                PRIMARY KEY ([id]),
                FOREIGN KEY ([exon1]) REFERENCES cin_exon([id]),
                FOREIGN KEY ([exon2]) REFERENCES cin_exon([id]),
@@ -146,6 +147,7 @@ def create_cin_gene_interaction_table(conn: sqlite3.Connection) -> None:
                [gene2] INT NOT NULL,
                [id_project] INT NOT NULL,
                [level] VARCHAR(25) NOT NULL,
+                [distance] VARCHAR(30),
                PRIMARY KEY ([id]),
                FOREIGN KEY ([gene1]) REFERENCES cin_gene([id]),
                FOREIGN KEY ([gene2]) REFERENCES cin_gene([id]),

--- a/src/db_utils/interactions/config_interactions.py
+++ b/src/db_utils/interactions/config_interactions.py
@@ -8,6 +8,7 @@ Description: Configuration variables for subfolder interactions.
 """

 from ..config import Config
+from pathlib import Path


 class ConfigInteractions:
@@ -30,3 +31,5 @@ class ConfigInteractions:
    pet_vs_gene_output = chia_pet_interaction / 'intersections_gene'
    couple_exon = chia_pet_interaction / 'couple_exon'
    couple_gene = chia_pet_interaction / 'couple_gene'
+    results = Path(__file__).parents[3] / "results"
+    db_file = results / 'chia_pet_database.db'
--- a/src/db_utils/interactions/features_interactions.py
+++ b/src/db_utils/interactions/features_interactions.py
@@ -30,6 +30,7 @@ import re
 import numpy as np
 import os.path
 import sys
+import sqlite3
 from ..populate_database import populate_df


@@ -251,12 +252,91 @@ def filtering_2(df_filter_2: pd.DataFrame):
    if FILE_1 == FILE_2:
        df_filter_2["id_project"] = FILE_1
        df_filter_2["id2"] = df_filter_2["id"].astype(str) + "_" + \
-                             df_filter_2["id_project"].astype(str)
+            df_filter_2["id_project"].astype(str)
        del df_filter_2["id"]
        df_filter_2.rename(columns={"id2": "id"}, inplace=True)
    return df_filter_2


+def get_info_from_database(cnx: sqlite3.Connection, query: str) -> \
+        pd.DataFrame:
+    """
+    Get exon or gene information from the database, for example:
+    - for the exons: 1_1	18	28681865	28682388
+    - for the genes: 1	18	28645943	28682388
+
+    :param cnx: connection to the ChIA-PET database
+    :param query: SQL query; its four result columns fill the dataframe
+    :return df_res: dataframe with columns "ID", "chr", "start", "stop"
+    """
+    cursor = cnx.cursor()
+    cursor.execute(query)
+    res = list(cursor.fetchall())
+    df_res = pd.DataFrame(res, columns=["ID", "chr", "start", "stop"])
+    return df_res
+
+
+def add_info_distance_between_features(df: pd.DataFrame) -> pd.DataFrame:
+    """
+    Add a "distance" column for the two interacting exons (OPT_EXON == "on")
+    or genes (OPT_GENE == "on"), using coordinates read from the database.
+    The value is "NULL" when the two features are on different chromosomes,
+    and stays unset (NaN) when they share identical start and stop values.
+    Later assignments overwrite earlier ones where several conditions hold.
+    df is returned unchanged when neither option is on.
+    NOTE(review): the two sqlite3 connections opened here are never closed.
+
+    :param df: Result of the "filtering_2" function
+    :return df: df with the "distance" column added, as described above.
+    """
+    info_exon = get_info_from_database(sqlite3.connect(Config.db_file),
+                                       """SELECT id, chromosome, start, stop 
+                                       FROM cin_exon""")
+    info_gene = get_info_from_database(sqlite3.connect(Config.db_file),
+                                       """SELECT id, chromosome, start, stop 
+                                       FROM cin_gene""")
+    if OPT_EXON == "on":
+        df = df.merge(info_exon, left_on="exon1", right_on="ID")
+        df = df.rename(columns={'ID': 'ID1', 'chr': 'chr1', 'start': 'start1',
+                                'stop': 'stop1'})
+        df = df.merge(info_exon, left_on="exon2", right_on="ID")
+        df = df.rename(columns={'ID': 'ID2', 'chr': 'chr2', 'start': 'start2',
+                                'stop': 'stop2'})
+        df.loc[df["start1"] < df["start2"], "distance"] = \
+            (df["start2"] - df["stop1"] + 1)
+        df.loc[df["stop1"] < df["stop2"], "distance"] = \
+            (df["start2"] - df["stop1"] + 1)
+        df.loc[df["start1"] > df["start2"], "distance"] = \
+            (df["start1"] - df["stop2"] + 1)
+        df.loc[df["stop1"] > df["stop2"], "distance"] = \
+            (df["start1"] - df["stop2"] + 1)
+        df.loc[df["chr1"] != df["chr2"], "distance"] = "NULL"
+        df.drop(["start1", "ID1", "chr1", "stop1", "ID2", "chr2", "start2",
+                 "stop2"], axis='columns', inplace=True)
+    elif OPT_GENE == "on":
+        info_gene["ID"] = info_gene["ID"].astype(str)
+        df = df.merge(info_gene, left_on="gene1", right_on="ID")
+        df = df.rename(columns={'ID': 'ID1', 'chr': 'chr1', 'start': 'start1',
+                                'stop': 'stop1'})
+        df = df.merge(info_gene, left_on="gene2", right_on="ID")
+        df = df.rename(columns={'ID': 'ID2', 'chr': 'chr2', 'start': 'start2',
+                                'stop': 'stop2'})
+        df.loc[df["start1"] < df["start2"], "distance"] = \
+            (df["start2"] - df["stop1"] + 1)
+        df.loc[df["stop1"] < df["stop2"], "distance"] = \
+            (df["start2"] - df["stop1"] + 1)
+        df.loc[df["start1"] > df["start2"], "distance"] = \
+            (df["start1"] - df["stop2"] + 1)
+        df.loc[df["start1"] > df["start2"], "distance"] = \
+            (df["start1"] - df["stop2"] + 1)  # NOTE(review): duplicate
+        df.loc[df["stop1"] > df["stop2"], "distance"] = \
+            (df["start1"] - df["stop2"] + 1)
+        df.loc[df["chr1"] != df["chr2"], "distance"] = "NULL"
+        df.drop(["start1", "ID1", "chr1", "stop1", "ID2", "chr2",
+                 "start2", "stop2"], axis='columns', inplace=True)
+    return df
+
+
 def create_interaction_table(logging_level: str = "DISABLE"):
    """
    Create the interaction tables.
@@ -281,6 +361,8 @@ def create_interaction_table(logging_level: str = "DISABLE"):
    logging.debug("Sum weight of identical interaction")
    df = filtering_2(df)
    logging.debug(df.head())
+    df = add_info_distance_between_features(df)
+    logging.debug(df.head())
    df.to_csv(MY_OUT_FILE, index=False, sep="\t", header=True)
    if OPT_EXON == "on":
        logging.debug("Filling cin_exon_interaction")