Commit 4e4d827d authored by nfontrod

src/db_utils/interactions/features_interactions.py: major modifications to speed up the creation of interaction tables
parent 772b8999
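The speed-up announced in the commit message comes mostly from replacing a per-PET boolean filter over the whole intersection DataFrame with one dictionary lookup per anchor. A minimal sketch of the two access patterns on made-up anchor and region names (the groupby construction of the dictionary is only a stand-in for the file-reading loop introduced in work_on_intersection below):

import pandas as pd

# Hypothetical intersection table: one row per (anchor, region) overlap
inter = pd.DataFrame({
    "id_anchor": ["18:28682..28782", "18:28682..28782", "11:5834473..5834631"],
    "id_region": ["exon_1", "exon_2", "exon_3"],
})
anchor = "18:28682..28782"

# Old pattern: scan the whole DataFrame for every PET anchor (O(n) per lookup)
regions_old = inter.loc[inter["id_anchor"] == anchor, "id_region"].tolist()

# New pattern: build the anchor -> regions mapping once, then use O(1) lookups
anchor_dic = inter.groupby("id_anchor")["id_region"].apply(list).to_dict()
regions_new = anchor_dic[anchor]

assert regions_old == regions_new == ["exon_1", "exon_2"]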
@@ -20,20 +20,14 @@ duplicate BED6 file of ChIA-PET data, that is in the following format:
 import pandas as pd
-from datetime import datetime
 from .config_interactions import ConfigInteractions as Config
-print("Start:", str(datetime.now()))
-def get_id_col(m_series: pd.Series) -> str:
-    """
-    Allows to produce a id sorted, under this format: anchor1$anchor2.
-    :param m_series: Contains the anchor1 and anchor2.
-    :return:
-    """
-    return "$".join(sorted([m_series["anchor1"], m_series["anchor2"]]))
+import logging
+from ...logging_conf import logging_def
+from itertools import product
+from tqdm import tqdm
+from typing import Dict, List
+import re
+import numpy as np
 def work_on_pet():
@@ -49,13 +43,8 @@ def work_on_pet():
     """
     pet = pd.read_csv(Config.chia_pet / "GSM1517080.bed", sep="\t",
                       header=None)
-    pet = pet[[3]].drop_duplicates()
-    pet = pet.iloc[:, 0].str.split(r"-|,", expand=True)
-    pet.columns = ["anchor1", "anchor2", "weight"]
-    pet["id"] = pet.apply(get_id_col, axis=1)
-    pet = pet[["weight", "id"]].groupby(["id"]).sum().reset_index(drop=False)
-    pet = pet["id"] + "$" + pet["weight"]
-    pet = pet.str.split("$", expand=True)
+    pet.drop_duplicates(subset=3, inplace=True)
+    pet = pet.iloc[:, 3].str.split(r"-|,", expand=True)
     pet.columns = ["anchor1", "anchor2", "weight"]
     return pet
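In the rewritten work_on_pet, a single regex split is enough because the name field (column 3) of the ChIA-PET BED file already packs both anchors and the weight into one string. A small sketch, using a Series as a stand-in for that column and the example value quoted in the work_on_intersection docstring below:

import pandas as pd

# Stand-in for column 3 of the ChIA-PET BED file; the value mirrors the
# docstring example "1:47797..47799-18:28682..28782,2"
names = pd.Series(["1:47797..47799-18:28682..28782,2",
                   "1:47797..47799-18:28682..28782,2"])

pet = names.drop_duplicates().str.split(r"-|,", expand=True)
pet.columns = ["anchor1", "anchor2", "weight"]
print(pet)
#           anchor1          anchor2 weight
# 0  1:47797..47799  18:28682..28782      2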
@@ -101,24 +90,26 @@ def work_on_intersection():
     format is (see the description of this script for more information), e.g.
     18 28681 28882 1_1 0 - 18 28682 28782 1:47797..47799-18:28682..28782,2 2 .
     100
-    We want the following output format:
-    1_1 18:28682..28782 --> which is the ID of the genomic region, e.g. exon
-    and the coordinates of the anchor matching to this genomic region.
+    We want a dictionary linking the id of the pet to the region (exon/gene) \
+    it contains.
     :return:
     """
-    inter = pd.read_csv(Config.pet_vs_gene_output /
-                        "gene_w200_vs_GSM1517080.bed",
-                        sep="\t", header=None)
-    inter = inter.iloc[:, [3, 6, 7, 8]]
-    inter.columns = ["id_region", "chr", "start", "end"]
-    inter["id_anchor"] = inter["chr"].astype("str") + ":" + inter["start"].\
-        astype("str") + ".." + inter["end"].astype("str")
-    inter.drop(["chr", "start", "end"], inplace=True, axis=1)
-    return inter
+    inter_file = Config.pet_vs_gene_output / "gene_w200_vs_GSM1517080.bed"
+    dic = {}
+    with inter_file.open("r") as infile:
+        for line in infile:
+            line = line.strip("\n").split("\t")
+            id_anchor = f"{line[6]}:{line[7]}..{line[8]}"
+            if id_anchor not in dic.keys():
+                dic[id_anchor] = [line[3]]
+            else:
+                if line[3] not in dic[id_anchor]:
+                    dic[id_anchor].append(line[3])
+    return dic
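The rewritten work_on_intersection therefore returns a plain dictionary mapping each anchor (rebuilt from columns 6-8 of the bedtools output) to the list of region ids (column 3) it overlaps. A sketch on two hypothetical intersection lines (the second line and the region id 1_2 are invented for illustration; the real file is read line by line exactly as above):

# Two tab-separated lines mimicking the format quoted in the docstring
lines = [
    "18\t28681\t28882\t1_1\t0\t-\t18\t28682\t28782\t"
    "1:47797..47799-18:28682..28782,2\t2\t.\t100",
    "18\t28650\t28850\t1_2\t0\t+\t18\t28682\t28782\t"
    "1:47797..47799-18:28682..28782,2\t2\t.\t100",
]

dic = {}
for raw in lines:
    line = raw.strip("\n").split("\t")
    id_anchor = f"{line[6]}:{line[7]}..{line[8]}"  # e.g. "18:28682..28782"
    if id_anchor not in dic:
        dic[id_anchor] = [line[3]]
    elif line[3] not in dic[id_anchor]:
        dic[id_anchor].append(line[3])

print(dic)  # {'18:28682..28782': ['1_1', '1_2']}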
-def interactions(pet: pd.DataFrame, inter: pd.DataFrame):
+def interactions(pet: pd.DataFrame, anchor_dic: Dict[str, List[str]]):
     """
     Allows to determine which couples of genomic regions interact, according to
     what weight.
@@ -128,29 +119,35 @@ def interactions(pet: pd.DataFrame, inter: pd.DataFrame):
     2.
     :param pet:
-    :param inter:
+    :param anchor_dic:
     :return:
     """
-    df_final = pd.DataFrame()
-    for index, row in pet.iterrows():
-        if index == 50:
-            break
-        match_a = inter.loc[inter["id_anchor"] == row[0], :]
-        match_b = inter.loc[inter["id_anchor"] == row[1], :]
-        table_a = pd.DataFrame({"id_region": match_a["id_region"],
-                                "id_anchor": match_a["id_anchor"],
-                                "number": [index] * len(match_a)})
-        table_b = pd.DataFrame({"id_region": match_b["id_region"],
-                                "id_anchor": match_b["id_anchor"],
-                                "number": [index] * len(match_b)})
-        table_a = table_a.merge(table_b, how="outer", on="number",
-                                suffixes=["_a1", "_a2"])
-        table_a.drop("number", inplace=True, axis=1)
-        table_a = table_a.loc[(-table_a["id_anchor_a1"].isna()) &
-                              (-table_a["id_anchor_a2"].isna()), :]
-        if not table_a.empty:
-            df_final = pd.concat([df_final, table_a], axis=0,
-                                 ignore_index=True)
+    pet_dic = pet.to_dict("index")
+    pbar = tqdm(pet_dic.keys())
+    couples_list = []
+    pattern = re.compile(r":\S+")
+    for index in pbar:
+        anchor1 = pet_dic[index]["anchor1"]
+        anchor2 = pet_dic[index]["anchor2"]
+        try:
+            region1 = anchor_dic[anchor1]
+            region2 = anchor_dic[anchor2]
+            couples = list(product(region1, region2))
+            couples = filtering_1(couples)
+            clen = len(couples)
+            if clen == 0:
+                continue
+            couples = np.c_[couples, [anchor1] * clen, [anchor2] * clen,
+                            [get_level(anchor1, anchor2, pattern)] * clen]
+            couples_df = pd.DataFrame(couples, columns=["id_region_a1",
+                                                        "id_region_a2",
+                                                        "id_anchor_a1",
+                                                        "id_anchor_a2",
+                                                        "level"])
+            couples_list.append(couples_df)
+        except KeyError:
+            continue
+    df_final = pd.concat(couples_list, axis=0, ignore_index=True)
     df_final = df_final.merge(pet, how="left",
                               left_on=["id_anchor_a1", "id_anchor_a2"],
                               right_on=["anchor1", "anchor2"])
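In the new interactions loop, every region overlapping anchor1 is paired with every region overlapping anchor2 through itertools.product, self-pairs are dropped and each couple sorted (the job of filtering_1), and np.c_ appends the anchor ids and the intra/inter level as constant columns. A toy run with invented region ids; the level is hard-coded here, whereas the real code gets it from get_level, defined further down:

from itertools import product
import numpy as np
import pandas as pd

anchor1, anchor2 = "1:47797..47799", "18:28682..28782"
region1 = ["exon_10", "exon_11"]      # regions overlapping anchor1 (invented)
region2 = ["exon_11", "exon_42"]      # regions overlapping anchor2 (invented)

# all couples, minus self-interactions, each couple sorted (role of filtering_1)
couples = [sorted(c) for c in product(region1, region2) if c[0] != c[1]]
clen = len(couples)                   # 3 couples remain

# append constant columns: the two anchor ids and the interaction level
couples = np.c_[couples, [anchor1] * clen, [anchor2] * clen, ["inter"] * clen]
couples_df = pd.DataFrame(couples, columns=["id_region_a1", "id_region_a2",
                                            "id_anchor_a1", "id_anchor_a2",
                                            "level"])
print(couples_df)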
@@ -159,51 +156,31 @@ def interactions(pet: pd.DataFrame, inter: pd.DataFrame):
     return df_final
-def add_level(df_level: pd.DataFrame):
+def get_level(anchor1: str, anchor2: str, pattern: re.Pattern) -> str:
     """
-    Add the level to the previous dataframe result (df_final):
-    - intrachromosomique: the two genomic regions that interact are in the same
-    chromosome
-    - interchromosomique: the two genomic regions that interact are in
-    different chromosomes
-    #id_region_a1 id_anchor_a1 id_region_a2 id_anchor_a2 weight level
-    7832 10:10019900..10020058 16755 11:5834473..5834631 2 interchromosomique
-    :param df_level:
+    Say if anchor1 and anchor2 are on the same chromosome.
+    :param anchor1: The id of an anchor
+    :param anchor2: The mate of anchor1
+    :param pattern: A regex pattern
     :return:
     """
-    df_level[["chr_a1", "coordinates_a1"]] = df_level.id_anchor_a1.str.\
-        split(":", expand=True)
-    df_level[["chr_a2", "coordinates_a2"]] = df_level.id_anchor_a2.str.\
-        split(":", expand=True)
-    df_level.loc[df_level["chr_a1"] == df_level["chr_a2"], "level"] = \
-        "intrachromosomique"
-    df_level.loc[df_level["chr_a1"] != df_level["chr_a2"], "level"] = \
-        "interchromosomique"
-    df_level.drop(["chr_a1", "coordinates_a1", "chr_a2", "coordinates_a2"],
-                  axis="columns", inplace=True)
-    return df_level
+    if re.sub(pattern, "", anchor1) == re.sub(pattern, "", anchor2):
+        return "intra"
+    else:
+        return "inter"
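get_level decides between "intra" and "inter" simply by stripping the ":start..end" part of each anchor id with the pre-compiled r":\S+" pattern and comparing what remains (the chromosome). The anchors on chromosomes 10 and 11 below come from the example in the removed add_level docstring; the chromosome-18 pair is invented:

import re

pattern = re.compile(r":\S+")   # strips ":start..end" from an anchor id

# anchors from the removed add_level docstring: chromosomes 10 and 11 -> "inter"
assert re.sub(pattern, "", "10:10019900..10020058") == "10"
assert re.sub(pattern, "", "11:5834473..5834631") == "11"

# two invented anchors on chromosome 18 compare equal -> "intra"
assert re.sub(pattern, "", "18:28682..28782") == re.sub(pattern, "", "18:30000..30100")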
-def filtering_1(df_filter_1: pd.DataFrame):
+def filtering_1(region_lists: List) -> List:
     """
-    Filtering of the previous dataframe result (df_level) by removing:
-    - the genomic regions that interact with itself, e.g.
-    #id_region_a1 id_anchor_a1 id_region_a2 id_anchor_a2 weight level
-    7832 10:10019900..10020058 7832 10:10019900..10020088 2 intrachromosomique
-    :param df_filter_1:
-    :return:
+    Removing common exons.
+    :param region_lists: List of couple of regions
+    :return: The lists without common regions
     """
-    df_filter_1.loc[df_filter_1["id_region_a1"] == df_filter_1["id_region_a2"],
-                    "identical_or_not"] = "identical"
-    df_filter_1.loc[df_filter_1["id_region_a1"] != df_filter_1["id_region_a2"],
-                    "identical_or_not"] = "not"
-    to_del = df_filter_1[df_filter_1.identical_or_not == "identical"].index.\
-        tolist()
-    df_filter_1 = df_filter_1.drop(to_del)
-    del df_filter_1["identical_or_not"]
-    return df_filter_1
+    return [sorted(couple) for couple in region_lists
+            if couple[0] != couple[1]]
 def filtering_2(df_filter_2: pd.DataFrame):
@@ -219,7 +196,7 @@ def filtering_2(df_filter_2: pd.DataFrame):
     :param df_filter_2:
     :return:
     """
-    df_filter_2 = df_filter_2.drop_duplicates()
+    df_filter_2.drop_duplicates(inplace=True)
     df_filter_2["id"] = df_filter_2["id_region_a1"].astype(str) + "$" + \
         df_filter_2["id_region_a2"].astype(str)
     df_filter_2.drop(["id_anchor_a1", "id_anchor_a2", "id_region_a1",
@@ -230,25 +207,33 @@ def filtering_2(df_filter_2: pd.DataFrame):
     df_filter_2[["id_region_a1", "id_region_a2"]] = df_filter_2.id.str.\
         split("$", expand=True)
     del df_filter_2["id"]
-    print(df_filter_2)
-    print("End:", str(datetime.now()))
     return df_filter_2
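filtering_2 collapses identical interactions by joining the two region ids into a single "$"-separated key before their weights are summed; the aggregation itself sits in the lines elided between the two hunks above, so the groupby below is only an assumption about that hidden part, while the join and split-back steps mirror the visible code:

import pandas as pd

# Hypothetical table where the same couple of regions appears twice
df = pd.DataFrame({"id_region_a1": ["exon_10", "exon_10"],
                   "id_region_a2": ["exon_42", "exon_42"],
                   "weight": [2, 3]})

# join the couple into a single key, as filtering_2 does
df["id"] = df["id_region_a1"].astype(str) + "$" + df["id_region_a2"].astype(str)

# assumed aggregation (the corresponding lines are not shown in this diff)
df = df.groupby("id", as_index=False)["weight"].sum()

# split the key back into two columns, as in the visible part of filtering_2
df[["id_region_a1", "id_region_a2"]] = df.id.str.split("$", expand=True)
del df["id"]
print(df)  # one row: exon_10 / exon_42 with weight 5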
-def create_interaction_table():
+def create_interaction_table(logging_level: str = "DISABLE"):
     """
     Create the interaction table.
     :return: The table of interaction
     """
-    inter_df = work_on_intersection()
+    logging_def(Config.chia_pet_interaction, __file__, logging_level)
+    logging.debug("Creation of intersection between exons and an anchor")
+    anchor_dic = work_on_intersection()
+    logging.debug("Getting anchor couples (PET) and weight")
     df = work_on_pet()
+    logging.debug(df.head())
+    logging.debug("Removing anchor couples (PET) overlapping")
     df = del_overlaps(df)
-    df = interactions(df, inter_df)
-    df = add_level(df)
-    df = filtering_1(df)
-    return filtering_2(df)
+    logging.debug(df.head())
+    logging.debug("Linking exons interacting with each other")
+    df = interactions(df, anchor_dic)
+    logging.debug(df.head())
+    logging.debug("Sum weight of identical interaction")
+    df = filtering_2(df)
+    logging.debug(df.head())
+    df.to_csv('test3.txt', index=False, sep="\t")
+    return df
 if __name__ == "__main__":
-    create_interaction_table()
+    print(create_interaction_table("DEBUG").head())