diff --git a/src/db_utils/interactions/features_interactions.py b/src/db_utils/interactions/features_interactions.py index fe99989a2a60f2884e9dd7c4b4aa9f709c944f7f..73bcab7345cbd3d427ae530dd4ca19a56aaa4766 100644 --- a/src/db_utils/interactions/features_interactions.py +++ b/src/db_utils/interactions/features_interactions.py @@ -81,7 +81,8 @@ def del_overlaps(pet: pd.DataFrame): 9:139773532..139778733 9:139778161..139781850 7 :param pet: In this format: chr1:start1..end1 chr2:start2..end2 weight1-2 - :return: Pet dataframe without pet that have overlapping anchors + :return: Pet dataframe without pet that have overlapping anchors (partial + and complete) """ pet[["chr1", "start1", "space1", "end1"]] = pet["anchor1"].str.\ split(r"[:..]", expand=True) @@ -89,6 +90,7 @@ def del_overlaps(pet: pd.DataFrame): split(r"[:..]", expand=True) pet = pet.drop(["anchor1", "anchor2", "space1", "space2"], axis=1) pet.loc[pet["chr1"] != pet["chr2"], "delete"] = "no" + # Removal of a partial overlap pet.loc[(pet["chr1"] == pet["chr2"]) & ((pet["start1"].astype(int) >= pet["start2"].astype(int)) & (pet["start1"].astype(int) <= @@ -98,6 +100,12 @@ def del_overlaps(pet: pd.DataFrame): (pet["end1"].astype(int) <= pet["end2"].astype(int))), "delete"] = "yes" + # Removal of a complete overlap, e.g. anchor1 is fully included in anchor2 + pet.loc[(pet["chr1"] == pet["chr2"]) & ((pet["start1"] >= pet["start2"]) & + (pet["end2"] >= pet["end1"]) | + (pet["start1"] <= pet["start2"]) & + (pet["end2"] <= pet["end1"])), + "delete"] = "yes" to_del = pet[pet.delete == "yes"].index.tolist() pet = pet.drop(to_del) pet["anchor1"] = pet["chr1"] + ":" + pet["start1"] + ".." + pet["end1"] @@ -283,7 +291,8 @@ def add_info_distance_between_features(df: pd.DataFrame) -> pd.DataFrame: If the result is NULL it is because we study two exons or genes located on different chromosomes.
If the result is 0, it is because we study two exons or genes which have - different identifiers, but strictly identical coordinates. + different identifiers, but strictly identical coordinates OR two exons or + genes which overlap (partially or completely). :param df: Result of the "filtering_2" function :return df: df with distances added or NULL or (null) see before for more @@ -312,6 +321,18 @@ def add_info_distance_between_features(df: pd.DataFrame) -> pd.DataFrame: (df["start1"] - df["stop2"] + 1) df.loc[(df["start1"] == df["start2"]) & (df["stop1"] == df["stop2"]), "distance"] = 0 + # Partial overlap: the distance is set to 0 + df.loc[(df["chr1"] == df["chr2"]) & ((df["start1"] <= df["start2"]) & + (df["start2"] <= df["stop1"]) | + (df["start1"] <= df["stop2"]) & + (df["stop2"] <= df["stop1"])), + "distance"] = 0 + # Complete overlap (distance set to 0), e.g. exon1 is fully included in exon2 + df.loc[(df["chr1"] == df["chr2"]) & ((df["start1"] >= df["start2"]) & + (df["stop2"] >= df["stop1"]) | + (df["start1"] <= df["start2"]) & + (df["stop2"] <= df["stop1"])), + "distance"] = 0 df.loc[df["chr1"] != df["chr2"], "distance"] = "NULL" df.drop(["start1", "ID1", "chr1", "stop1", "ID2", "chr2", "start2", "stop2"], axis='columns', inplace=True) @@ -335,6 +356,18 @@ def add_info_distance_between_features(df: pd.DataFrame) -> pd.DataFrame: (df["start1"] - df["stop2"] + 1) df.loc[(df["start1"] == df["start2"]) & (df["stop1"] == df["stop2"]), "distance"] = 0 + # Partial overlap: the distance is set to 0 + df.loc[(df["chr1"] == df["chr2"]) & ((df["start1"] <= df["start2"]) & + (df["start2"] <= df["stop1"]) | + (df["start1"] <= df["stop2"]) & + (df["stop2"] <= df["stop1"])), + "distance"] = 0 + # Complete overlap (distance set to 0), e.g.
exon1 is fully included in exon2 + df.loc[(df["chr1"] == df["chr2"]) & ((df["start1"] >= df["start2"]) & + (df["stop2"] >= df["stop1"]) | + (df["start1"] <= df["start2"]) & + (df["stop2"] <= df["stop1"])), + "distance"] = 0 df.loc[df["chr1"] != df["chr2"], "distance"] = "NULL" df.drop(["start1", "ID1", "chr1", "stop1", "ID2", "chr2", "start2", "stop2"], axis='columns', inplace=True)