diff --git a/src/db_utils/db_creation.py b/src/db_utils/db_creation.py index 079008273fc48ebd3090af963a9a4bbf37237e7a..7a23303153909757527467ce8fc9e20818f4336a 100755 --- a/src/db_utils/db_creation.py +++ b/src/db_utils/db_creation.py @@ -124,7 +124,7 @@ def create_cin_exon_interaction_table(conn: sqlite3.Connection) -> None: [exon2] VARCHAR(30) NOT NULL, [id_project] INT NOT NULL, [level] VARCHAR(25) NOT NULL, - [distance] VARCHAR(30), + [distance] INT, PRIMARY KEY ([id]), FOREIGN KEY ([exon1]) REFERENCES cin_exon([id]), FOREIGN KEY ([exon2]) REFERENCES cin_exon([id]), @@ -147,7 +147,7 @@ def create_cin_gene_interaction_table(conn: sqlite3.Connection) -> None: [gene2] INT NOT NULL, [id_project] INT NOT NULL, [level] VARCHAR(25) NOT NULL, - [distance] VARCHAR(30), + [distance] INT, PRIMARY KEY ([id]), FOREIGN KEY ([gene1]) REFERENCES cin_gene([id]), FOREIGN KEY ([gene2]) REFERENCES cin_gene([id]), diff --git a/src/db_utils/interactions/features_interactions.py b/src/db_utils/interactions/features_interactions.py index 92253d8be30ee66ed26948b8eb2828daa4c5d35f..73bcab7345cbd3d427ae530dd4ca19a56aaa4766 100644 --- a/src/db_utils/interactions/features_interactions.py +++ b/src/db_utils/interactions/features_interactions.py @@ -81,7 +81,8 @@ def del_overlaps(pet: pd.DataFrame): 9:139773532..139778733 9:139778161..139781850 7 :param pet: In this format: chr1:start1..end1 chr2:start2..end2 weight1-2 - :return: Pet dataframe without pet that have overlapping anchors + :return: Pet dataframe without pet that have overlapping anchors (partial + and complete) """ pet[["chr1", "start1", "space1", "end1"]] = pet["anchor1"].str.\ split(r"[:..]", expand=True) @@ -89,6 +90,7 @@ def del_overlaps(pet: pd.DataFrame): split(r"[:..]", expand=True) pet = pet.drop(["anchor1", "anchor2", "space1", "space2"], axis=1) pet.loc[pet["chr1"] != pet["chr2"], "delete"] = "no" + # Removal of a partial overlap pet.loc[(pet["chr1"] == pet["chr2"]) & ((pet["start1"].astype(int) >= pet["start2"].astype(int)) & (pet["start1"].astype(int) <= @@ -98,6 +100,12 @@ def del_overlaps(pet: pd.DataFrame): (pet["end1"].astype(int) <= pet["end2"].astype(int))), "delete"] = "yes" + # Removal of a complete overlap, e.g. anchor1 is fully included in anchor2 + pet.loc[(pet["chr1"] == pet["chr2"]) & ((pet["start1"] >= pet["start2"]) & + (pet["end2"] >= pet["end1"]) | + (pet["start1"] <= pet["start2"]) & + (pet["end2"] <= pet["end1"])), + "delete"] = "yes" to_del = pet[pet.delete == "yes"].index.tolist() pet = pet.drop(to_del) pet["anchor1"] = pet["chr1"] + ":" + pet["start1"] + ".." + pet["end1"] @@ -282,8 +290,9 @@ def add_info_distance_between_features(df: pd.DataFrame) -> pd.DataFrame: studied in interaction and to add this information in the result dataframe. If the result is NULL it is because we study two exons or genes located in the same chromosome. - If the result is (null), so NaN, it is because we study two exons or genes - which have different identifiers, but strictly identical coordinates. + If the result is 0, it is because we study two exons or genes which have + different identifiers, but strictly identical coordinates OR two exons or + genes which overlap (partially or completely). :param df: Result of the "filtering_2" function :return df: df with distances added or NULL or (null) see before for more @@ -310,6 +319,20 @@ def add_info_distance_between_features(df: pd.DataFrame) -> pd.DataFrame: (df["start1"] - df["stop2"] + 1) df.loc[df["stop1"] > df["stop2"], "distance"] = \ (df["start1"] - df["stop2"] + 1) + df.loc[(df["start1"] == df["start2"]) & (df["stop1"] == df["stop2"]), + "distance"] = 0 + # Removal of a partial overlap + df.loc[(df["chr1"] == df["chr2"]) & ((df["start1"] <= df["start2"]) & + (df["start2"] <= df["stop1"]) | + (df["start1"] <= df["stop2"]) & + (df["stop2"] <= df["stop1"])), + "distance"] = 0 + # Removal of a complete overlap, e.g. exon1 is fully included in exon2 + df.loc[(df["chr1"] == df["chr2"]) & ((df["start1"] >= df["start2"]) & + (df["stop2"] >= df["stop1"]) | + (df["start1"] <= df["start2"]) & + (df["stop2"] <= df["stop1"])), + "distance"] = 0 df.loc[df["chr1"] != df["chr2"], "distance"] = "NULL" df.drop(["start1", "ID1", "chr1", "stop1", "ID2", "chr2", "start2", "stop2"], axis='columns', inplace=True) @@ -331,6 +354,20 @@ def add_info_distance_between_features(df: pd.DataFrame) -> pd.DataFrame: (df["start1"] - df["stop2"] + 1) df.loc[df["stop1"] > df["stop2"], "distance"] = \ (df["start1"] - df["stop2"] + 1) + df.loc[(df["start1"] == df["start2"]) & (df["stop1"] == df["stop2"]), + "distance"] = 0 + # Removal of a partial overlap + df.loc[(df["chr1"] == df["chr2"]) & ((df["start1"] <= df["start2"]) & + (df["start2"] <= df["stop1"]) | + (df["start1"] <= df["stop2"]) & + (df["stop2"] <= df["stop1"])), + "distance"] = 0 + # Removal of a complete overlap, e.g. exon1 is fully included in exon2 + df.loc[(df["chr1"] == df["chr2"]) & ((df["start1"] >= df["start2"]) & + (df["stop2"] >= df["stop1"]) | + (df["start1"] <= df["start2"]) & + (df["stop2"] <= df["stop1"])), + "distance"] = 0 df.loc[df["chr1"] != df["chr2"], "distance"] = "NULL" df.drop(["start1", "ID1", "chr1", "stop1", "ID2", "chr2", "start2", "stop2"], axis='columns', inplace=True)