From f6f97b98ab8c9d2c3bac857bef030ac6657517a4 Mon Sep 17 00:00:00 2001 From: alapendr <audrey.lapendry@ens-lyon.fr> Date: Fri, 25 Sep 2020 09:34:52 +0200 Subject: [PATCH] db_utils/interactions/features_interactions.py: removal of completely overlapping anchors, genes and exons --- .../interactions/features_interactions.py | 37 ++++++++++++++++++- 1 file changed, 35 insertions(+), 2 deletions(-) diff --git a/src/db_utils/interactions/features_interactions.py b/src/db_utils/interactions/features_interactions.py index fe99989a..73bcab73 100644 --- a/src/db_utils/interactions/features_interactions.py +++ b/src/db_utils/interactions/features_interactions.py @@ -81,7 +81,8 @@ def del_overlaps(pet: pd.DataFrame): 9:139773532..139778733 9:139778161..139781850 7 :param pet: In this format: chr1:start1..end1 chr2:start2..end2 weight1-2 - :return: Pet dataframe without pet that have overlapping anchors + :return: Pet dataframe without pet that have overlapping anchors (partial + and complete) """ pet[["chr1", "start1", "space1", "end1"]] = pet["anchor1"].str.\ split(r"[:..]", expand=True) @@ -89,6 +90,7 @@ def del_overlaps(pet: pd.DataFrame): split(r"[:..]", expand=True) pet = pet.drop(["anchor1", "anchor2", "space1", "space2"], axis=1) pet.loc[pet["chr1"] != pet["chr2"], "delete"] = "no" + # Removal of a partial overlap pet.loc[(pet["chr1"] == pet["chr2"]) & ((pet["start1"].astype(int) >= pet["start2"].astype(int)) & (pet["start1"].astype(int) <= @@ -98,6 +100,12 @@ def del_overlaps(pet: pd.DataFrame): (pet["end1"].astype(int) <= pet["end2"].astype(int))), "delete"] = "yes" + # Removal of a complete overlap, e.g. anchor1 is fully included in anchor2 + pet.loc[(pet["chr1"] == pet["chr2"]) & ((pet["start1"] >= pet["start2"]) & + (pet["end2"] >= pet["end1"]) | + (pet["start1"] <= pet["start2"]) & + (pet["end2"] <= pet["end1"])), + "delete"] = "yes" to_del = pet[pet.delete == "yes"].index.tolist() pet = pet.drop(to_del) pet["anchor1"] = pet["chr1"] + ":" + pet["start1"] + ".." + pet["end1"] @@ -283,7 +291,8 @@ def add_info_distance_between_features(df: pd.DataFrame) -> pd.DataFrame: If the result is NULL it is because we study two exons or genes located in the same chromosome. If the result is 0, it is because we study two exons or genes which have - different identifiers, but strictly identical coordinates. + different identifiers, but strictly identical coordinates OR two exons or + genes which overlap (partially or completely). :param df: Result of the "filtering_2" function :return df: df with distances added or NULL or (null) see before for more @@ -312,6 +321,18 @@ def add_info_distance_between_features(df: pd.DataFrame) -> pd.DataFrame: (df["start1"] - df["stop2"] + 1) df.loc[(df["start1"] == df["start2"]) & (df["stop1"] == df["stop2"]), "distance"] = 0 + # Removal of a partial overlap + df.loc[(df["chr1"] == df["chr2"]) & ((df["start1"] <= df["start2"]) & + (df["start2"] <= df["stop1"]) | + (df["start1"] <= df["stop2"]) & + (df["stop2"] <= df["stop1"])), + "distance"] = 0 + # Removal of a complete overlap, e.g. exon1 is fully included in exon2 + df.loc[(df["chr1"] == df["chr2"]) & ((df["start1"] >= df["start2"]) & + (df["stop2"] >= df["stop1"]) | + (df["start1"] <= df["start2"]) & + (df["stop2"] <= df["stop1"])), + "distance"] = 0 df.loc[df["chr1"] != df["chr2"], "distance"] = "NULL" df.drop(["start1", "ID1", "chr1", "stop1", "ID2", "chr2", "start2", "stop2"], axis='columns', inplace=True) @@ -335,6 +356,18 @@ def add_info_distance_between_features(df: pd.DataFrame) -> pd.DataFrame: (df["start1"] - df["stop2"] + 1) df.loc[(df["start1"] == df["start2"]) & (df["stop1"] == df["stop2"]), "distance"] = 0 + # Removal of a partial overlap + df.loc[(df["chr1"] == df["chr2"]) & ((df["start1"] <= df["start2"]) & + (df["start2"] <= df["stop1"]) | + (df["start1"] <= df["stop2"]) & + (df["stop2"] <= df["stop1"])), + "distance"] = 0 + # Removal of a complete overlap, e.g. exon1 is fully included in exon2 + df.loc[(df["chr1"] == df["chr2"]) & ((df["start1"] >= df["start2"]) & + (df["stop2"] >= df["stop1"]) | + (df["start1"] <= df["start2"]) & + (df["stop2"] <= df["stop1"])), + "distance"] = 0 df.loc[df["chr1"] != df["chr2"], "distance"] = "NULL" df.drop(["start1", "ID1", "chr1", "stop1", "ID2", "chr2", "start2", "stop2"], axis='columns', inplace=True) -- GitLab