diff --git a/src/db_utils/interactions/features_interactions.py b/src/db_utils/interactions/features_interactions.py index 4f2512d0f13ed72a6ef7cc19ce420466f0ce9b36..b63d647babdbca016b5c360a62e02b225a3fdc7c 100644 --- a/src/db_utils/interactions/features_interactions.py +++ b/src/db_utils/interactions/features_interactions.py @@ -21,9 +21,11 @@ duplicate BED6 file of ChIA-PET data, that is in the following format: import pandas as pd from datetime import datetime +from .config_interactions import ConfigInteractions as Config print("Start:", str(datetime.now())) + def get_id_col(m_series: pd.Series) -> str: """ Allows to produce a id sorted, under this format: anchor1$anchor2. @@ -45,7 +47,8 @@ def work_on_pet(): :return: """ - pet = pd.read_csv("/home/audrey/IE/ChIA_PET_network/data/interactions_files/chia_pet/GSM1517080.bed",sep="\t", header=None) + pet = pd.read_csv(Config.chia_pet / "GSM1517080.bed", sep="\t", + header=None) pet = pet[[3]].drop_duplicates() pet = pet.iloc[:, 0].str.split(r"-|,", expand=True) pet.columns = ["anchor1", "anchor2", "weight"] @@ -57,16 +60,16 @@ def work_on_pet(): return pet -def del_overlaps(): +def del_overlaps(pet: pd.DataFrame): """ This works on the previous dataframe result (pet). Input format is: #chr1:start1..end1 chr2:start2..end2 weight1-2 We delete from this dataframe the pet that has overlapping anchors, e.g. 
9:139773532..139778733 9:139778161..139781850 7 + :param pet: :return: """ - pet = work_on_pet() pet[["chr1", "start1", "space1", "end1"]] = pet["anchor1"].str.\ split(r"[:..]", expand=True) pet[["chr2", "start2", "space2", "end2"]] = pet["anchor2"].str.\ @@ -104,7 +107,9 @@ def work_on_intersection(): :return: """ - inter = pd.read_csv("/home/audrey/IE/ChIA_PET_network/results/interactions/intersections_gene/gene_w200_vs_GSM1517080.bed", sep="\t", header=None) + inter = pd.read_csv(Config.pet_vs_gene_output / + "gene_w200_vs_GSM1517080.bed", + sep="\t", header=None) inter = inter.iloc[:, [3, 6, 7, 8]] inter.columns = ["id_region", "chr", "start", "end"] inter["id_anchor"] = inter["chr"].astype("str") + ":" + inter["start"].\ @@ -113,7 +118,7 @@ def work_on_intersection(): return inter -def interactions(): +def interactions(pet: pd.DataFrame, inter: pd.DataFrame): """ Allows to determine which couples of genomic regions interact, according to what weight. @@ -122,14 +127,14 @@ def interactions(): It means that gene 7832 interacts with gene 16755, according to a weight of 2. 
+ :param pet: + :param inter: :return: """ - pet = del_overlaps() df_final = pd.DataFrame() for index, row in pet.iterrows(): if index == 50: break - inter = work_on_intersection() match_a = inter.loc[inter["id_anchor"] == row[0], :] match_b = inter.loc[inter["id_anchor"] == row[1], :] table_a = pd.DataFrame({"id_region": match_a["id_region"], @@ -154,7 +159,7 @@ def interactions(): return df_final -def add_level(): +def add_level(df_level: pd.DataFrame): """ Add the level to the previous dataframe result (df_final): - intrachromosomique: the two genomic regions that interact are in the same @@ -164,9 +169,9 @@ def add_level(): #id_region_a1 id_anchor_a1 id_region_a2 id_anchor_a2 weight level 7832 10:10019900..10020058 16755 11:5834473..5834631 2 interchromosomique + :param df_level: :return: """ - df_level = interactions() df_level[["chr_a1", "coordinates_a1"]] = df_level.id_anchor_a1.str.\ split(":", expand=True) df_level[["chr_a2", "coordinates_a2"]] = df_level.id_anchor_a2.str.\ @@ -180,16 +185,16 @@ def add_level(): return df_level -def filtering_1(): +def filtering_1(df_filter_1: pd.DataFrame): """ Filtering of the previous dataframe result (df_level) by removing: - the genomic regions that interact with itself, e.g. #id_region_a1 id_anchor_a1 id_region_a2 id_anchor_a2 weight level 7832 10:10019900..10020058 7832 10:10019900..10020088 2 intrachromosomique + :param df_filter_1: :return: """ - df_filter_1 = add_level() df_filter_1.loc[df_filter_1["id_region_a1"] == df_filter_1["id_region_a2"], "identical_or_not"] = "identical" df_filter_1.loc[df_filter_1["id_region_a1"] != df_filter_1["id_region_a2"], @@ -201,7 +206,7 @@ def filtering_1(): return df_filter_1 -def filtering_2(): +def filtering_2(df_filter_2: pd.DataFrame): """ Filtering of the previous dataframe result (df_filter_1) by adding: - the weights of the interactions that describe the same interaction, e.g. 
@@ -211,12 +216,12 @@ def filtering_2(): --> #id_region_a1 id_region_a2 weight level --> 7832 16755 4 interchromosomique + :param df_filter_2: The dataframe returned by filtering_1 :return: """ - df_filter_2 = filtering_1() df_filter_2 = df_filter_2.drop_duplicates() df_filter_2["id"] = df_filter_2["id_region_a1"].astype(str) + "$" + \ - df_filter_2["id_region_a2"].astype(str) + df_filter_2["id_region_a2"].astype(str) df_filter_2.drop(["id_anchor_a1", "id_anchor_a2", "id_region_a1", "id_region_a2"], axis="columns", inplace=True) df_filter_2["weight"] = df_filter_2["weight"].astype(int) @@ -230,5 +235,20 @@ def filtering_2(): return df_filter_2 +def create_interaction_table(): + """ + Create the interaction table by running the pipeline steps in order. + + :return: The interaction table (the output of filtering_2) + """ + inter_df = work_on_intersection() + df = work_on_pet() + df = del_overlaps(df) + df = interactions(df, inter_df) + df = add_level(df) + df = filtering_1(df) + return filtering_2(df) + + if __name__ == "__main__": - filtering_2() + create_interaction_table()