diff --git a/src/db_utils/interactions/features_interactions.py b/src/db_utils/interactions/features_interactions.py index d6a5898502c10ee597c08bffde23fe31c4b7ac1b..bc0f819d522f49fdc14f101e1d7b9dbd50e9a98d 100644 --- a/src/db_utils/interactions/features_interactions.py +++ b/src/db_utils/interactions/features_interactions.py @@ -3,7 +3,7 @@ # -*- coding: UTF-8 -*- """ -Description:This script allows to determine which couples of genomic regions +Description: This script allows to determine which couples of genomic regions interact, e.g. exons or genes, that for each ChIA-PET dataset. This script requires the output file produced by the script bedtools.py, which @@ -36,10 +36,11 @@ def work_on_pet(): #chr1 start1 end1 chr1:start1..end1-chr2:start2..end2,weight1-2 weight1-2 . #chr2 start2 end2 chr1:start1..end1-chr2:start2..end2,weight1-2 weight1-2 . We want the following output format: - #chr1:start1..end1 chr2:start2..end2 weight1-2 - 10:100172432..100175026 6:92192672..92194172 2 + #anchor1 anchor2 weight + #chr1:start1..end1 chr2:start2..end2 weight1-2 + 10:100172432..100175026 6:92192672..92194172 2 - :return: + :return: Pet in this format: chr1:start1..end1 chr2:start2..end2 weight1-2 """ pet = pd.read_csv(Config.chia_pet / "GSM1517080.bed", sep="\t", header=None) @@ -52,12 +53,13 @@ def work_on_pet(): def del_overlaps(pet: pd.DataFrame): """ This works on the previous dataframe result (pet). Input format is: - #chr1:start1..end1 chr2:start2..end2 weight1-2 + #anchor1 anchor2 weight + #chr1:start1..end1 chr2:start2..end2 weight1-2 We delete from this dataframe the pet that has overlapping anchors, e.g. - 9:139773532..139778733 9:139778161..139781850 7 + 9:139773532..139778733 9:139778161..139781850 7 - :param pet: - :return: + :param pet: In this format: chr1:start1..end1 chr2:start2..end2 weight1-2 + :return: Pet dataframe without pet that have overlapping anchors """ pet[["chr1", "start1", "space1", "end1"]] = pet["anchor1"].str.\ split(r"[:..]", expand=True) @@ -90,12 +92,14 @@ def work_on_intersection(): format is (see the description of this script for more information), e.g. 18 28681 28882 1_1 0 - 18 28682 28782 1:47797..47799-18:28682..28782,2 2 . 100 - We want a dictionary linking the id of the pet to the region (exon/gene) \ - it contains. + We return a dictionary which link the id of the pet with the region + (exon/gene) it contains, e.g. '17:73176122..73178842': ['19423_1', + '19423_2'] - :return: + :return: A dictionary which link the id of the pet with the region + (exon/gene) it contains. """ - inter_file = Config.pet_vs_gene_output / "gene_w200_vs_GSM1517080.bed" + inter_file = Config.pet_vs_exon_output / "exon_w200_vs_GSM1517080.bed" dic = {} with inter_file.open("r") as infile: for line in infile: @@ -113,14 +117,17 @@ def interactions(pet: pd.DataFrame, anchor_dic: Dict[str, List[str]]): """ Allows to determine which couples of genomic regions interact, according to what weight. - #id_region_a1 id_anchor_a1 id_region_a2 id_anchor_a2 weight - 7832 10:10019900..10020058 16755 11:5834473..5834631 2 - It means that gene 7832 interacts with gene 16755, according to a weight of - 2. - - :param pet: - :param anchor_dic: - :return: + #id_region_1 id_region_2 id_anchor_1 id_anchor_2 level weight + 9815_1 9815_7 1:858942..862596 1:874802..878017 intra 4 + It means that exon 9815_1 interacts with exon 9815_7, according to a weight + of 4 and that the interaction is intrachromosome. + + :param pet: del_overlaps() return = pet dataframe without pet that have + overlapping anchors. + :param anchor_dic: work_on_intersection() return = a dictionary which link + the id of the pet with the region (exon/gene) it contains. + :return: A dataframe with these columns: id_region_1, id_region_2, + id_anchor_1, id_anchor_2, level, weight """ pet_dic = pet.to_dict("index") pbar = tqdm(pet_dic.keys()) @@ -139,17 +146,17 @@ def interactions(pet: pd.DataFrame, anchor_dic: Dict[str, List[str]]): continue couples = np.c_[couples, [anchor1] * clen, [anchor2] * clen, [get_level(anchor1, anchor2, pattern)] * clen] - couples_df = pd.DataFrame(couples, columns=["id_region_a1", - "id_region_a2", - "id_anchor_a1", - "id_anchor_a2", + couples_df = pd.DataFrame(couples, columns=["id_region_1", + "id_region_2", + "id_anchor_1", + "id_anchor_2", "level"]) couples_list.append(couples_df) except KeyError: continue df_final = pd.concat(couples_list, axis=0, ignore_index=True) df_final = df_final.merge(pet, how="left", - left_on=["id_anchor_a1", "id_anchor_a2"], + left_on=["id_anchor_1", "id_anchor_2"], right_on=["anchor1", "anchor2"]) df_final.drop(["anchor1", "anchor2"], axis=1, inplace=True) df_final.rename(columns={2: "weight"}, inplace=True) @@ -158,7 +165,9 @@ def interactions(pet: pd.DataFrame, anchor_dic: Dict[str, List[str]]): def get_level(anchor1: str, anchor2: str, pattern: re.Pattern) -> str: """ - Say if anchor1 and anchor2 are on the same chromosome. + Look if anchor_1 and anchor_2 (so also region_1 and region_2) are on the + same chromosome or not, so if the interaction is intrachromosome or + interchromosome. :param anchor1: The id of an anchor :param anchor2: The mate of anchor1 @@ -173,51 +182,57 @@ def get_level(anchor1: str, anchor2: str, pattern: re.Pattern) -> str: def filtering_1(region_lists: List) -> List: """ - Removing common exons. + Remove pairs of interacting regions that would be the same, e.g. + #id_region_1 id_region_2 weight + 9815_1 9815_1 2 :param region_lists: List of couple of regions - :return: The lists without common regions + :return: Region_lists without pairs of interacting regions that would be + the same """ - return [sorted(couple) for couple in region_lists if couple[0] != couple[1]] def filtering_2(df_filter_2: pd.DataFrame): """ - Filtering of the previous dataframe result (df_filter_1) by adding: - - the weights of the interactions that describe the same interaction, e.g. - #id_region_a1 id_anchor_a1 id_region_a2 id_anchor_a2 weight level - 7832 10:10019900..10020058 16755 11:5834473..5834631 2 interchromosomique - 7832 10:10019900..10020088 16755 11:5834422..5834625 2 interchromosomique - --> #id_region_a1 id_region_a2 weight level - --> 7832 16755 4 interchromosomique + Use the result of the "interactions" function to add the weights of the + same pairs of interactions, e.g. + #id_region_1 id_region_2 id_anchor_1 id_anchor_2 level weight + 9815_1 16755_2 10:10019900..10020058 11:5834473..5834631 inter 2 + 9815_1 16755_2 10:10019900..10020088 11:5834422..5834625 inter 2 + --> #id_region_1 id_region_2 weight level + --> 9815_1 16755_2 4 inter - :param df_filter_2: - :return: + :param df_filter_2: Result of the "interactions" function + :return: df_filter_2 with weights added, when it describes the same pairs + of interactions. """ df_filter_2.drop_duplicates(inplace=True) - df_filter_2["id"] = df_filter_2["id_region_a1"].astype(str) + "$" + \ - df_filter_2["id_region_a2"].astype(str) - df_filter_2.drop(["id_anchor_a1", "id_anchor_a2", "id_region_a1", - "id_region_a2"], axis="columns", inplace=True) + df_filter_2["id"] = df_filter_2["id_region_1"].astype(str) + "$" + \ + df_filter_2["id_region_2"].astype(str) + df_filter_2.drop(["id_anchor_1", "id_anchor_2", "id_region_1", + "id_region_2"], axis="columns", inplace=True) df_filter_2["weight"] = df_filter_2["weight"].astype(int) df_filter_2 = df_filter_2[["weight", "id", "level"]].groupby( ["id", "level"]).sum().reset_index(drop=False) - df_filter_2[["id_region_a1", "id_region_a2"]] = df_filter_2.id.str.\ + df_filter_2[["id_region_1", "id_region_2"]] = df_filter_2.id.str.\ split("$", expand=True) del df_filter_2["id"] + df_filter_2 = df_filter_2.reindex(columns=["id_region_1", "id_region_2", + "weight", "level"]) return df_filter_2 def create_interaction_table(logging_level: str = "DISABLE"): """ - Create the interaction table. + Create the interaction tables. :return: The table of interaction """ logging_def(Config.chia_pet_interaction, __file__, logging_level) - logging.debug("Creation of intersection between exons and an anchor") + logging.debug("Reading of intersection between genomic regions and an " + "anchor") anchor_dic = work_on_intersection() logging.debug("Getting anchor couples (PET) and weight") df = work_on_pet() @@ -225,7 +240,7 @@ def create_interaction_table(logging_level: str = "DISABLE"): logging.debug("Removing anchor couples (PET) overlapping") df = del_overlaps(df) logging.debug(df.head()) - logging.debug("Linking exons interacting with each other") + logging.debug("Linking genomic regions interacting with each other") df = interactions(df, anchor_dic) logging.debug(df.head()) logging.debug("Sum weight of identical interaction") @@ -236,4 +251,4 @@ def create_interaction_table(logging_level: str = "DISABLE"): if __name__ == "__main__": - print(create_interaction_table("DEBUG").head()) + create_interaction_table("DEBUG")