diff --git a/src/db_utils/interactions/features_interactions.py b/src/db_utils/interactions/features_interactions.py index b63d647babdbca016b5c360a62e02b225a3fdc7c..d6a5898502c10ee597c08bffde23fe31c4b7ac1b 100644 --- a/src/db_utils/interactions/features_interactions.py +++ b/src/db_utils/interactions/features_interactions.py @@ -20,20 +20,14 @@ duplicate BED6 file of ChIA-PET data, that is in the following format: import pandas as pd -from datetime import datetime from .config_interactions import ConfigInteractions as Config - -print("Start:", str(datetime.now())) - - -def get_id_col(m_series: pd.Series) -> str: - """ - Allows to produce a id sorted, under this format: anchor1$anchor2. - - :param m_series: Contains the anchor1 and anchor2. - :return: - """ - return "$".join(sorted([m_series["anchor1"], m_series["anchor2"]])) +import logging +from ...logging_conf import logging_def +from itertools import product +from tqdm import tqdm +from typing import Dict, List +import re +import numpy as np def work_on_pet(): @@ -49,13 +43,8 @@ def work_on_pet(): """ pet = pd.read_csv(Config.chia_pet / "GSM1517080.bed", sep="\t", header=None) - pet = pet[[3]].drop_duplicates() - pet = pet.iloc[:, 0].str.split(r"-|,", expand=True) - pet.columns = ["anchor1", "anchor2", "weight"] - pet["id"] = pet.apply(get_id_col, axis=1) - pet = pet[["weight", "id"]].groupby(["id"]).sum().reset_index(drop=False) - pet = pet["id"] + "$" + pet["weight"] - pet = pet.str.split("$", expand=True) + pet.drop_duplicates(subset=3, inplace=True) + pet = pet.iloc[:, 3].str.split(r"-|,", expand=True) pet.columns = ["anchor1", "anchor2", "weight"] return pet @@ -101,24 +90,26 @@ def work_on_intersection(): format is (see the description of this script for more information), e.g. 18 28681 28882 1_1 0 - 18 28682 28782 1:47797..47799-18:28682..28782,2 2 . 100 - We want the following output format: - 1_1 18:28682..28782 --> which is the ID of the genomic region, e.g. exon - and the coordinates of the anchor matching to this genomic region. + We want a dictionary linking the id of the pet to the region (exon/gene) \ + it contains. :return: """ - inter = pd.read_csv(Config.pet_vs_gene_output / - "gene_w200_vs_GSM1517080.bed", - sep="\t", header=None) - inter = inter.iloc[:, [3, 6, 7, 8]] - inter.columns = ["id_region", "chr", "start", "end"] - inter["id_anchor"] = inter["chr"].astype("str") + ":" + inter["start"].\ - astype("str") + ".." + inter["end"].astype("str") - inter.drop(["chr", "start", "end"], inplace=True, axis=1) - return inter + inter_file = Config.pet_vs_gene_output / "gene_w200_vs_GSM1517080.bed" + dic = {} + with inter_file.open("r") as infile: + for line in infile: + line = line.strip("\n").split("\t") + id_anchor = f"{line[6]}:{line[7]}..{line[8]}" + if id_anchor not in dic.keys(): + dic[id_anchor] = [line[3]] + else: + if line[3] not in dic[id_anchor]: + dic[id_anchor].append(line[3]) + return dic -def interactions(pet: pd.DataFrame, inter: pd.DataFrame): +def interactions(pet: pd.DataFrame, anchor_dic: Dict[str, List[str]]): """ Allows to determine which couples of genomic regions interact, according to what weight. @@ -128,29 +119,35 @@ def interactions(pet: pd.DataFrame, inter: pd.DataFrame): 2. :param pet: - :param inter: + :param anchor_dic: :return: """ - df_final = pd.DataFrame() - for index, row in pet.iterrows(): - if index == 50: - break - match_a = inter.loc[inter["id_anchor"] == row[0], :] - match_b = inter.loc[inter["id_anchor"] == row[1], :] - table_a = pd.DataFrame({"id_region": match_a["id_region"], - "id_anchor": match_a["id_anchor"], - "number": [index] * len(match_a)}) - table_b = pd.DataFrame({"id_region": match_b["id_region"], - "id_anchor": match_b["id_anchor"], - "number": [index] * len(match_b)}) - table_a = table_a.merge(table_b, how="outer", on="number", - suffixes=["_a1", "_a2"]) - table_a.drop("number", inplace=True, axis=1) - table_a = table_a.loc[(-table_a["id_anchor_a1"].isna()) & - (-table_a["id_anchor_a2"].isna()), :] - if not table_a.empty: - df_final = pd.concat([df_final, table_a], axis=0, - ignore_index=True) + pet_dic = pet.to_dict("index") + pbar = tqdm(pet_dic.keys()) + couples_list = [] + pattern = re.compile(r":\S+") + for index in pbar: + anchor1 = pet_dic[index]["anchor1"] + anchor2 = pet_dic[index]["anchor2"] + try: + region1 = anchor_dic[anchor1] + region2 = anchor_dic[anchor2] + couples = list(product(region1, region2)) + couples = filtering_1(couples) + clen = len(couples) + if clen == 0: + continue + couples = np.c_[couples, [anchor1] * clen, [anchor2] * clen, + [get_level(anchor1, anchor2, pattern)] * clen] + couples_df = pd.DataFrame(couples, columns=["id_region_a1", + "id_region_a2", + "id_anchor_a1", + "id_anchor_a2", + "level"]) + couples_list.append(couples_df) + except KeyError: + continue + df_final = pd.concat(couples_list, axis=0, ignore_index=True) df_final = df_final.merge(pet, how="left", left_on=["id_anchor_a1", "id_anchor_a2"], right_on=["anchor1", "anchor2"]) @@ -159,51 +156,31 @@ def interactions(pet: pd.DataFrame, inter: pd.DataFrame): return df_final -def add_level(df_level: pd.DataFrame): +def get_level(anchor1: str, anchor2: str, pattern: re.Pattern) -> str: """ - Add the level to the previous dataframe result (df_final): - - intrachromosomique: the two genomic regions that interact are in the same - chromosome - - interchromosomique: the two genomic regions that interact are in - different chromosomes - #id_region_a1 id_anchor_a1 id_region_a2 id_anchor_a2 weight level - 7832 10:10019900..10020058 16755 11:5834473..5834631 2 interchromosomique + Say if anchor1 and anchor2 are on the same chromosome. - :param df_level: + :param anchor1: The id of an anchor + :param anchor2: The mate of anchor1 + :param pattern: A regex pattern :return: """ - df_level[["chr_a1", "coordinates_a1"]] = df_level.id_anchor_a1.str.\ - split(":", expand=True) - df_level[["chr_a2", "coordinates_a2"]] = df_level.id_anchor_a2.str.\ - split(":", expand=True) - df_level.loc[df_level["chr_a1"] == df_level["chr_a2"], "level"] = \ - "intrachromosomique" - df_level.loc[df_level["chr_a1"] != df_level["chr_a2"], "level"] = \ - "interchromosomique" - df_level.drop(["chr_a1", "coordinates_a1", "chr_a2", "coordinates_a2"], - axis="columns", inplace=True) - return df_level + if re.sub(pattern, "", anchor1) == re.sub(pattern, "", anchor2): + return "intra" + else: + return "inter" -def filtering_1(df_filter_1: pd.DataFrame): +def filtering_1(region_lists: List) -> List: """ - Filtering of the previous dataframe result (df_level) by removing: - - the genomic regions that interact with itself, e.g. - #id_region_a1 id_anchor_a1 id_region_a2 id_anchor_a2 weight level - 7832 10:10019900..10020058 7832 10:10019900..10020088 2 intrachromosomique + Removing common exons. - :param df_filter_1: - :return: + :param region_lists: List of couple of regions + :return: The lists without common regions """ - df_filter_1.loc[df_filter_1["id_region_a1"] == df_filter_1["id_region_a2"], - "identical_or_not"] = "identical" - df_filter_1.loc[df_filter_1["id_region_a1"] != df_filter_1["id_region_a2"], - "identical_or_not"] = "not" - to_del = df_filter_1[df_filter_1.identical_or_not == "identical"].index.\ - tolist() - df_filter_1 = df_filter_1.drop(to_del) - del df_filter_1["identical_or_not"] - return df_filter_1 + + return [sorted(couple) for couple in region_lists + if couple[0] != couple[1]] def filtering_2(df_filter_2: pd.DataFrame): @@ -219,7 +196,7 @@ def filtering_2(df_filter_2: pd.DataFrame): :param df_filter_2: :return: """ - df_filter_2 = df_filter_2.drop_duplicates() + df_filter_2.drop_duplicates(inplace=True) df_filter_2["id"] = df_filter_2["id_region_a1"].astype(str) + "$" + \ df_filter_2["id_region_a2"].astype(str) df_filter_2.drop(["id_anchor_a1", "id_anchor_a2", "id_region_a1", @@ -230,25 +207,33 @@ def filtering_2(df_filter_2: pd.DataFrame): df_filter_2[["id_region_a1", "id_region_a2"]] = df_filter_2.id.str.\ split("$", expand=True) del df_filter_2["id"] - print(df_filter_2) - print("End:", str(datetime.now())) return df_filter_2 -def create_interaction_table(): +def create_interaction_table(logging_level: str = "DISABLE"): """ Create the interaction table. :return: The table of interaction """ - inter_df = work_on_intersection() + logging_def(Config.chia_pet_interaction, __file__, logging_level) + logging.debug("Creation of intersection between exons and an anchor") + anchor_dic = work_on_intersection() + logging.debug("Getting anchor couples (PET) and weight") df = work_on_pet() + logging.debug(df.head()) + logging.debug("Removing anchor couples (PET) overlapping") df = del_overlaps(df) - df = interactions(df, inter_df) - df = add_level(df) - df = filtering_1(df) - return filtering_2(df) + logging.debug(df.head()) + logging.debug("Linking exons interacting with each other") + df = interactions(df, anchor_dic) + logging.debug(df.head()) + logging.debug("Sum weight of identical interaction") + df = filtering_2(df) + logging.debug(df.head()) + df.to_csv('test3.txt', index=False, sep="\t") + return df if __name__ == "__main__": - create_interaction_table() + print(create_interaction_table("DEBUG").head())