Skip to content
Snippets Groups Projects
Commit 4e4d827d authored by nfontrod's avatar nfontrod
Browse files

src/db_utils/interactions/features_interactions.py: mojor modifications to...

src/db_utils/interactions/features_interactions.py: mojor modifications to speedup teh creation of interaction tables
parent 772b8999
No related branches found
No related tags found
No related merge requests found
......@@ -20,20 +20,14 @@ duplicate BED6 file of ChIA-PET data, that is in the following format:
import pandas as pd
from datetime import datetime
from .config_interactions import ConfigInteractions as Config
print("Start:", str(datetime.now()))
def get_id_col(m_series: pd.Series) -> str:
"""
Allows to produce a id sorted, under this format: anchor1$anchor2.
:param m_series: Contains the anchor1 and anchor2.
:return:
"""
return "$".join(sorted([m_series["anchor1"], m_series["anchor2"]]))
import logging
from ...logging_conf import logging_def
from itertools import product
from tqdm import tqdm
from typing import Dict, List
import re
import numpy as np
def work_on_pet():
......@@ -49,13 +43,8 @@ def work_on_pet():
"""
pet = pd.read_csv(Config.chia_pet / "GSM1517080.bed", sep="\t",
header=None)
pet = pet[[3]].drop_duplicates()
pet = pet.iloc[:, 0].str.split(r"-|,", expand=True)
pet.columns = ["anchor1", "anchor2", "weight"]
pet["id"] = pet.apply(get_id_col, axis=1)
pet = pet[["weight", "id"]].groupby(["id"]).sum().reset_index(drop=False)
pet = pet["id"] + "$" + pet["weight"]
pet = pet.str.split("$", expand=True)
pet.drop_duplicates(subset=3, inplace=True)
pet = pet.iloc[:, 3].str.split(r"-|,", expand=True)
pet.columns = ["anchor1", "anchor2", "weight"]
return pet
......@@ -101,24 +90,26 @@ def work_on_intersection():
format is (see the description of this script for more information), e.g.
18 28681 28882 1_1 0 - 18 28682 28782 1:47797..47799-18:28682..28782,2 2 .
100
We want the following output format:
1_1 18:28682..28782 --> which is the ID of the genomic region, e.g. exon
and the coordinates of the anchor matching to this genomic region.
We want a dictionary linking the id of the pet to the region (exon/gene) \
it contains.
:return:
"""
inter = pd.read_csv(Config.pet_vs_gene_output /
"gene_w200_vs_GSM1517080.bed",
sep="\t", header=None)
inter = inter.iloc[:, [3, 6, 7, 8]]
inter.columns = ["id_region", "chr", "start", "end"]
inter["id_anchor"] = inter["chr"].astype("str") + ":" + inter["start"].\
astype("str") + ".." + inter["end"].astype("str")
inter.drop(["chr", "start", "end"], inplace=True, axis=1)
return inter
inter_file = Config.pet_vs_gene_output / "gene_w200_vs_GSM1517080.bed"
dic = {}
with inter_file.open("r") as infile:
for line in infile:
line = line.strip("\n").split("\t")
id_anchor = f"{line[6]}:{line[7]}..{line[8]}"
if id_anchor not in dic.keys():
dic[id_anchor] = [line[3]]
else:
if line[3] not in dic[id_anchor]:
dic[id_anchor].append(line[3])
return dic
def interactions(pet: pd.DataFrame, inter: pd.DataFrame):
def interactions(pet: pd.DataFrame, anchor_dic: Dict[str, List[str]]):
"""
Allows to determine which couples of genomic regions interact, according to
what weight.
......@@ -128,29 +119,35 @@ def interactions(pet: pd.DataFrame, inter: pd.DataFrame):
2.
:param pet:
:param inter:
:param anchor_dic:
:return:
"""
df_final = pd.DataFrame()
for index, row in pet.iterrows():
if index == 50:
break
match_a = inter.loc[inter["id_anchor"] == row[0], :]
match_b = inter.loc[inter["id_anchor"] == row[1], :]
table_a = pd.DataFrame({"id_region": match_a["id_region"],
"id_anchor": match_a["id_anchor"],
"number": [index] * len(match_a)})
table_b = pd.DataFrame({"id_region": match_b["id_region"],
"id_anchor": match_b["id_anchor"],
"number": [index] * len(match_b)})
table_a = table_a.merge(table_b, how="outer", on="number",
suffixes=["_a1", "_a2"])
table_a.drop("number", inplace=True, axis=1)
table_a = table_a.loc[(-table_a["id_anchor_a1"].isna()) &
(-table_a["id_anchor_a2"].isna()), :]
if not table_a.empty:
df_final = pd.concat([df_final, table_a], axis=0,
ignore_index=True)
pet_dic = pet.to_dict("index")
pbar = tqdm(pet_dic.keys())
couples_list = []
pattern = re.compile(r":\S+")
for index in pbar:
anchor1 = pet_dic[index]["anchor1"]
anchor2 = pet_dic[index]["anchor2"]
try:
region1 = anchor_dic[anchor1]
region2 = anchor_dic[anchor2]
couples = list(product(region1, region2))
couples = filtering_1(couples)
clen = len(couples)
if clen == 0:
continue
couples = np.c_[couples, [anchor1] * clen, [anchor2] * clen,
[get_level(anchor1, anchor2, pattern)] * clen]
couples_df = pd.DataFrame(couples, columns=["id_region_a1",
"id_region_a2",
"id_anchor_a1",
"id_anchor_a2",
"level"])
couples_list.append(couples_df)
except KeyError:
continue
df_final = pd.concat(couples_list, axis=0, ignore_index=True)
df_final = df_final.merge(pet, how="left",
left_on=["id_anchor_a1", "id_anchor_a2"],
right_on=["anchor1", "anchor2"])
......@@ -159,51 +156,31 @@ def interactions(pet: pd.DataFrame, inter: pd.DataFrame):
return df_final
def add_level(df_level: pd.DataFrame):
def get_level(anchor1: str, anchor2: str, pattern: re.Pattern) -> str:
"""
Add the level to the previous dataframe result (df_final):
- intrachromosomique: the two genomic regions that interact are in the same
chromosome
- interchromosomique: the two genomic regions that interact are in
different chromosomes
#id_region_a1 id_anchor_a1 id_region_a2 id_anchor_a2 weight level
7832 10:10019900..10020058 16755 11:5834473..5834631 2 interchromosomique
Say if anchor1 and anchor2 are on the same chromosome.
:param df_level:
:param anchor1: The id of an anchor
:param anchor2: The mate of anchor1
:param pattern: A regex pattern
:return:
"""
df_level[["chr_a1", "coordinates_a1"]] = df_level.id_anchor_a1.str.\
split(":", expand=True)
df_level[["chr_a2", "coordinates_a2"]] = df_level.id_anchor_a2.str.\
split(":", expand=True)
df_level.loc[df_level["chr_a1"] == df_level["chr_a2"], "level"] = \
"intrachromosomique"
df_level.loc[df_level["chr_a1"] != df_level["chr_a2"], "level"] = \
"interchromosomique"
df_level.drop(["chr_a1", "coordinates_a1", "chr_a2", "coordinates_a2"],
axis="columns", inplace=True)
return df_level
if re.sub(pattern, "", anchor1) == re.sub(pattern, "", anchor2):
return "intra"
else:
return "inter"
def filtering_1(df_filter_1: pd.DataFrame):
def filtering_1(region_lists: List) -> List:
"""
Filtering of the previous dataframe result (df_level) by removing:
- the genomic regions that interact with itself, e.g.
#id_region_a1 id_anchor_a1 id_region_a2 id_anchor_a2 weight level
7832 10:10019900..10020058 7832 10:10019900..10020088 2 intrachromosomique
Removing common exons.
:param df_filter_1:
:return:
:param region_lists: List of couple of regions
:return: The lists without common regions
"""
df_filter_1.loc[df_filter_1["id_region_a1"] == df_filter_1["id_region_a2"],
"identical_or_not"] = "identical"
df_filter_1.loc[df_filter_1["id_region_a1"] != df_filter_1["id_region_a2"],
"identical_or_not"] = "not"
to_del = df_filter_1[df_filter_1.identical_or_not == "identical"].index.\
tolist()
df_filter_1 = df_filter_1.drop(to_del)
del df_filter_1["identical_or_not"]
return df_filter_1
return [sorted(couple) for couple in region_lists
if couple[0] != couple[1]]
def filtering_2(df_filter_2: pd.DataFrame):
......@@ -219,7 +196,7 @@ def filtering_2(df_filter_2: pd.DataFrame):
:param df_filter_2:
:return:
"""
df_filter_2 = df_filter_2.drop_duplicates()
df_filter_2.drop_duplicates(inplace=True)
df_filter_2["id"] = df_filter_2["id_region_a1"].astype(str) + "$" + \
df_filter_2["id_region_a2"].astype(str)
df_filter_2.drop(["id_anchor_a1", "id_anchor_a2", "id_region_a1",
......@@ -230,25 +207,33 @@ def filtering_2(df_filter_2: pd.DataFrame):
df_filter_2[["id_region_a1", "id_region_a2"]] = df_filter_2.id.str.\
split("$", expand=True)
del df_filter_2["id"]
print(df_filter_2)
print("End:", str(datetime.now()))
return df_filter_2
def create_interaction_table():
def create_interaction_table(logging_level: str = "DISABLE"):
"""
Create the interaction table.
:return: The table of interaction
"""
inter_df = work_on_intersection()
logging_def(Config.chia_pet_interaction, __file__, logging_level)
logging.debug("Creation of intersection between exons and an anchor")
anchor_dic = work_on_intersection()
logging.debug("Getting anchor couples (PET) and weight")
df = work_on_pet()
logging.debug(df.head())
logging.debug("Removing anchor couples (PET) overlapping")
df = del_overlaps(df)
df = interactions(df, inter_df)
df = add_level(df)
df = filtering_1(df)
return filtering_2(df)
logging.debug(df.head())
logging.debug("Linking exons interacting with each other")
df = interactions(df, anchor_dic)
logging.debug(df.head())
logging.debug("Sum weight of identical interaction")
df = filtering_2(df)
logging.debug(df.head())
df.to_csv('test3.txt', index=False, sep="\t")
return df
if __name__ == "__main__":
create_interaction_table()
print(create_interaction_table("DEBUG").head())
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment