Skip to content
Snippets Groups Projects
Commit 2a804a99 authored by nfontrod's avatar nfontrod
Browse files

Merge dev into master

parents 2578a449 f84037e4
No related branches found
No related tags found
No related merge requests found
......@@ -124,6 +124,7 @@ def create_cin_exon_interaction_table(conn: sqlite3.Connection) -> None:
[exon2] VARCHAR(30) NOT NULL,
[id_project] INT NOT NULL,
[level] VARCHAR(25) NOT NULL,
[distance] VARCHAR(30),
PRIMARY KEY ([id]),
FOREIGN KEY ([exon1]) REFERENCES cin_exon([id]),
FOREIGN KEY ([exon2]) REFERENCES cin_exon([id]),
......@@ -146,6 +147,7 @@ def create_cin_gene_interaction_table(conn: sqlite3.Connection) -> None:
[gene2] INT NOT NULL,
[id_project] INT NOT NULL,
[level] VARCHAR(25) NOT NULL,
[distance] VARCHAR(30),
PRIMARY KEY ([id]),
FOREIGN KEY ([gene1]) REFERENCES cin_gene([id]),
FOREIGN KEY ([gene2]) REFERENCES cin_gene([id]),
......
......@@ -8,6 +8,7 @@ Description: Configuration variables for subfolder interactions.
"""
from ..config import Config
from pathlib import Path
class ConfigInteractions:
......@@ -30,3 +31,5 @@ class ConfigInteractions:
pet_vs_gene_output = chia_pet_interaction / 'intersections_gene'
couple_exon = chia_pet_interaction / 'couple_exon'
couple_gene = chia_pet_interaction / 'couple_gene'
results = Path(__file__).parents[3] / "results"
db_file = results / 'chia_pet_database.db'
......@@ -30,6 +30,7 @@ import re
import numpy as np
import os.path
import sys
import sqlite3
from ..populate_database import populate_df
......@@ -251,12 +252,91 @@ def filtering_2(df_filter_2: pd.DataFrame):
if FILE_1 == FILE_2:
df_filter_2["id_project"] = FILE_1
df_filter_2["id2"] = df_filter_2["id"].astype(str) + "_" + \
df_filter_2["id_project"].astype(str)
df_filter_2["id_project"].astype(str)
del df_filter_2["id"]
df_filter_2.rename(columns={"id2": "id"}, inplace=True)
return df_filter_2
def get_info_from_database(cnx: sqlite3.Connection, query: str) -> \
        pd.DataFrame:
    """
    Run a query on the ChIA-PET database and return its rows as a dataframe.

    The query is expected to return four columns, which are labelled
    ID, chr, start and stop in the result, for example:
        - for the exons: 1_1 18 28681865 28682388
        - for the genes: 1 18 28645943 28682388

    :param cnx: connection to the ChIA-PET database
    :param query: the SQL query used to get the data from the database
    :return: the dataframe with the data obtained (empty, but with the \
    four columns, when the query returns no row)
    """
    cursor = cnx.cursor()
    try:
        cursor.execute(query)
        rows = cursor.fetchall()
    finally:
        # Always release the cursor, even when the query fails.
        cursor.close()
    return pd.DataFrame(rows, columns=["ID", "chr", "start", "stop"])
def _add_distance_column(df: pd.DataFrame, info: pd.DataFrame,
                         first: str, second: str) -> pd.DataFrame:
    """
    Add the "distance" column to df from the coordinates stored in info.

    :param df: dataframe of interactions
    :param info: dataframe with the columns ID, chr, start and stop
    :param first: name of the df column holding the first feature id
    :param second: name of the df column holding the second feature id
    :return: df with the "distance" column added
    """
    # Attach the coordinates of both features. These are inner merges, so
    # an interaction whose feature is missing from info is dropped.
    for suffix, key in (("1", first), ("2", second)):
        df = df.merge(info, left_on=key, right_on="ID")
        df = df.rename(columns={"ID": "ID" + suffix,
                                "chr": "chr" + suffix,
                                "start": "start" + suffix,
                                "stop": "stop" + suffix})
    gap_1_to_2 = df["start2"] - df["stop1"] + 1
    gap_2_to_1 = df["start1"] - df["stop2"] + 1
    # The order matters: a later rule overwrites an earlier one for the rows
    # matching both. Rows matching no rule (same start and same stop) keep
    # NaN.
    rules = (
        (df["start1"] < df["start2"], gap_1_to_2),
        (df["stop1"] < df["stop2"], gap_1_to_2),
        (df["start1"] > df["start2"], gap_2_to_1),
        (df["stop1"] > df["stop2"], gap_2_to_1),
    )
    for mask, gap in rules:
        df.loc[mask, "distance"] = gap
    different_chr = df["chr1"] != df["chr2"]
    if different_chr.any():
        # Cast explicitly before writing a string in a numeric column:
        # recent pandas versions no longer upcast silently.
        df["distance"] = df["distance"].astype(object)
        df.loc[different_chr, "distance"] = "NULL"
    return df.drop(["start1", "ID1", "chr1", "stop1", "ID2", "chr2",
                    "start2", "stop2"], axis='columns')


def add_info_distance_between_features(df: pd.DataFrame) -> pd.DataFrame:
    """
    Allows to calculate the distance between the two genes or the two exons
    studied in interaction and to add this information in the result dataframe.
    If the result is NULL it is because we study two exons or genes located
    on different chromosomes.
    If the result is (null), so NaN, it is because we study two exons or genes
    which have different identifiers, but strictly identical coordinates.
    The distance can be negative when the two features overlap.
    Interactions involving an exon or a gene absent from the database are
    removed from the result.

    :param df: Result of the "filtering_2" function
    :return df: df with distances added or NULL or (null) see before for more
    details. df is returned unchanged if neither OPT_EXON nor OPT_GENE is
    "on".
    """
    if OPT_EXON == "on":
        query = """SELECT id, chromosome, start, stop
        FROM cin_exon"""
        first, second = "exon1", "exon2"
    elif OPT_GENE == "on":
        query = """SELECT id, chromosome, start, stop
        FROM cin_gene"""
        first, second = "gene1", "gene2"
    else:
        return df
    # Only the table that is needed is read, through a single connection
    # that is closed afterwards.
    cnx = sqlite3.connect(Config.db_file)
    try:
        info = get_info_from_database(cnx, query)
    finally:
        cnx.close()
    if first == "gene1":
        # Gene ids are cast to str before the merge; presumably gene1 and
        # gene2 hold strings in df (to be confirmed against filtering_2).
        info["ID"] = info["ID"].astype(str)
    return _add_distance_column(df, info, first, second)
def create_interaction_table(logging_level: str = "DISABLE"):
"""
Create the interaction tables.
......@@ -281,6 +361,8 @@ def create_interaction_table(logging_level: str = "DISABLE"):
logging.debug("Sum weight of identical interaction")
df = filtering_2(df)
logging.debug(df.head())
df = add_info_distance_between_features(df)
logging.debug(df.head())
df.to_csv(MY_OUT_FILE, index=False, sep="\t", header=True)
if OPT_EXON == "on":
logging.debug("Filling cin_exon_interaction")
......
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment