Skip to content
Snippets Groups Projects
Commit 772b8999 authored by nfontrod's avatar nfontrod
Browse files

src/db_utils/interactions/features_interactions.py: Creation of a function...

src/db_utils/interactions/features_interactions.py: Creation of a function create_interaction_table that calls every other function from that file
parent 2904ac24
Branches
No related tags found
No related merge requests found
...@@ -21,9 +21,11 @@ duplicate BED6 file of ChIA-PET data, that is in the following format: ...@@ -21,9 +21,11 @@ duplicate BED6 file of ChIA-PET data, that is in the following format:
import pandas as pd import pandas as pd
from datetime import datetime from datetime import datetime
from .config_interactions import ConfigInteractions as Config
print("Start:", str(datetime.now())) print("Start:", str(datetime.now()))
def get_id_col(m_series: pd.Series) -> str: def get_id_col(m_series: pd.Series) -> str:
""" """
Allows to produce a id sorted, under this format: anchor1$anchor2. Allows to produce a id sorted, under this format: anchor1$anchor2.
...@@ -45,7 +47,8 @@ def work_on_pet(): ...@@ -45,7 +47,8 @@ def work_on_pet():
:return: :return:
""" """
pet = pd.read_csv("/home/audrey/IE/ChIA_PET_network/data/interactions_files/chia_pet/GSM1517080.bed",sep="\t", header=None) pet = pd.read_csv(Config.chia_pet / "GSM1517080.bed", sep="\t",
header=None)
pet = pet[[3]].drop_duplicates() pet = pet[[3]].drop_duplicates()
pet = pet.iloc[:, 0].str.split(r"-|,", expand=True) pet = pet.iloc[:, 0].str.split(r"-|,", expand=True)
pet.columns = ["anchor1", "anchor2", "weight"] pet.columns = ["anchor1", "anchor2", "weight"]
...@@ -57,16 +60,16 @@ def work_on_pet(): ...@@ -57,16 +60,16 @@ def work_on_pet():
return pet return pet
def del_overlaps(): def del_overlaps(pet: pd.DataFrame):
""" """
This works on the previous dataframe result (pet). Input format is: This works on the previous dataframe result (pet). Input format is:
#chr1:start1..end1 chr2:start2..end2 weight1-2 #chr1:start1..end1 chr2:start2..end2 weight1-2
We delete from this dataframe the pet that has overlapping anchors, e.g. We delete from this dataframe the pet that has overlapping anchors, e.g.
9:139773532..139778733 9:139778161..139781850 7 9:139773532..139778733 9:139778161..139781850 7
:param pet:
:return: :return:
""" """
pet = work_on_pet()
pet[["chr1", "start1", "space1", "end1"]] = pet["anchor1"].str.\ pet[["chr1", "start1", "space1", "end1"]] = pet["anchor1"].str.\
split(r"[:..]", expand=True) split(r"[:..]", expand=True)
pet[["chr2", "start2", "space2", "end2"]] = pet["anchor2"].str.\ pet[["chr2", "start2", "space2", "end2"]] = pet["anchor2"].str.\
...@@ -104,7 +107,9 @@ def work_on_intersection(): ...@@ -104,7 +107,9 @@ def work_on_intersection():
:return: :return:
""" """
inter = pd.read_csv("/home/audrey/IE/ChIA_PET_network/results/interactions/intersections_gene/gene_w200_vs_GSM1517080.bed", sep="\t", header=None) inter = pd.read_csv(Config.pet_vs_gene_output /
"gene_w200_vs_GSM1517080.bed",
sep="\t", header=None)
inter = inter.iloc[:, [3, 6, 7, 8]] inter = inter.iloc[:, [3, 6, 7, 8]]
inter.columns = ["id_region", "chr", "start", "end"] inter.columns = ["id_region", "chr", "start", "end"]
inter["id_anchor"] = inter["chr"].astype("str") + ":" + inter["start"].\ inter["id_anchor"] = inter["chr"].astype("str") + ":" + inter["start"].\
...@@ -113,7 +118,7 @@ def work_on_intersection(): ...@@ -113,7 +118,7 @@ def work_on_intersection():
return inter return inter
def interactions(): def interactions(pet: pd.DataFrame, inter: pd.DataFrame):
""" """
Allows to determine which couples of genomic regions interact, according to Allows to determine which couples of genomic regions interact, according to
what weight. what weight.
...@@ -122,14 +127,14 @@ def interactions(): ...@@ -122,14 +127,14 @@ def interactions():
It means that gene 7832 interacts with gene 16755, according to a weight of It means that gene 7832 interacts with gene 16755, according to a weight of
2. 2.
:param pet:
:param inter:
:return: :return:
""" """
pet = del_overlaps()
df_final = pd.DataFrame() df_final = pd.DataFrame()
for index, row in pet.iterrows(): for index, row in pet.iterrows():
if index == 50: if index == 50:
break break
inter = work_on_intersection()
match_a = inter.loc[inter["id_anchor"] == row[0], :] match_a = inter.loc[inter["id_anchor"] == row[0], :]
match_b = inter.loc[inter["id_anchor"] == row[1], :] match_b = inter.loc[inter["id_anchor"] == row[1], :]
table_a = pd.DataFrame({"id_region": match_a["id_region"], table_a = pd.DataFrame({"id_region": match_a["id_region"],
...@@ -154,7 +159,7 @@ def interactions(): ...@@ -154,7 +159,7 @@ def interactions():
return df_final return df_final
def add_level(): def add_level(df_level: pd.DataFrame):
""" """
Add the level to the previous dataframe result (df_final): Add the level to the previous dataframe result (df_final):
- intrachromosomique: the two genomic regions that interact are in the same - intrachromosomique: the two genomic regions that interact are in the same
...@@ -164,9 +169,9 @@ def add_level(): ...@@ -164,9 +169,9 @@ def add_level():
#id_region_a1 id_anchor_a1 id_region_a2 id_anchor_a2 weight level #id_region_a1 id_anchor_a1 id_region_a2 id_anchor_a2 weight level
7832 10:10019900..10020058 16755 11:5834473..5834631 2 interchromosomique 7832 10:10019900..10020058 16755 11:5834473..5834631 2 interchromosomique
:param df_level:
:return: :return:
""" """
df_level = interactions()
df_level[["chr_a1", "coordinates_a1"]] = df_level.id_anchor_a1.str.\ df_level[["chr_a1", "coordinates_a1"]] = df_level.id_anchor_a1.str.\
split(":", expand=True) split(":", expand=True)
df_level[["chr_a2", "coordinates_a2"]] = df_level.id_anchor_a2.str.\ df_level[["chr_a2", "coordinates_a2"]] = df_level.id_anchor_a2.str.\
...@@ -180,16 +185,16 @@ def add_level(): ...@@ -180,16 +185,16 @@ def add_level():
return df_level return df_level
def filtering_1(): def filtering_1(df_filter_1: pd.DataFrame):
""" """
Filtering of the previous dataframe result (df_level) by removing: Filtering of the previous dataframe result (df_level) by removing:
- the genomic regions that interact with itself, e.g. - the genomic regions that interact with itself, e.g.
#id_region_a1 id_anchor_a1 id_region_a2 id_anchor_a2 weight level #id_region_a1 id_anchor_a1 id_region_a2 id_anchor_a2 weight level
7832 10:10019900..10020058 7832 10:10019900..10020088 2 intrachromosomique 7832 10:10019900..10020058 7832 10:10019900..10020088 2 intrachromosomique
:param df_filter_1:
:return: :return:
""" """
df_filter_1 = add_level()
df_filter_1.loc[df_filter_1["id_region_a1"] == df_filter_1["id_region_a2"], df_filter_1.loc[df_filter_1["id_region_a1"] == df_filter_1["id_region_a2"],
"identical_or_not"] = "identical" "identical_or_not"] = "identical"
df_filter_1.loc[df_filter_1["id_region_a1"] != df_filter_1["id_region_a2"], df_filter_1.loc[df_filter_1["id_region_a1"] != df_filter_1["id_region_a2"],
...@@ -201,7 +206,7 @@ def filtering_1(): ...@@ -201,7 +206,7 @@ def filtering_1():
return df_filter_1 return df_filter_1
def filtering_2(): def filtering_2(df_filter_2: pd.DataFrame):
""" """
Filtering of the previous dataframe result (df_filter_1) by adding: Filtering of the previous dataframe result (df_filter_1) by adding:
- the weights of the interactions that describe the same interaction, e.g. - the weights of the interactions that describe the same interaction, e.g.
...@@ -211,12 +216,12 @@ def filtering_2(): ...@@ -211,12 +216,12 @@ def filtering_2():
--> #id_region_a1 id_region_a2 weight level --> #id_region_a1 id_region_a2 weight level
--> 7832 16755 4 interchromosomique --> 7832 16755 4 interchromosomique
:param df_filter_2:
:return: :return:
""" """
df_filter_2 = filtering_1()
df_filter_2 = df_filter_2.drop_duplicates() df_filter_2 = df_filter_2.drop_duplicates()
df_filter_2["id"] = df_filter_2["id_region_a1"].astype(str) + "$" + \ df_filter_2["id"] = df_filter_2["id_region_a1"].astype(str) + "$" + \
df_filter_2["id_region_a2"].astype(str) df_filter_2["id_region_a2"].astype(str)
df_filter_2.drop(["id_anchor_a1", "id_anchor_a2", "id_region_a1", df_filter_2.drop(["id_anchor_a1", "id_anchor_a2", "id_region_a1",
"id_region_a2"], axis="columns", inplace=True) "id_region_a2"], axis="columns", inplace=True)
df_filter_2["weight"] = df_filter_2["weight"].astype(int) df_filter_2["weight"] = df_filter_2["weight"].astype(int)
...@@ -230,5 +235,20 @@ def filtering_2(): ...@@ -230,5 +235,20 @@ def filtering_2():
return df_filter_2 return df_filter_2
def create_interaction_table():
    """
    Build the interaction table by chaining every step of this file.

    The intersection table is loaded first with work_on_intersection.
    The table returned by work_on_pet is then passed, in order, through
    del_overlaps, interactions (together with the intersection table),
    add_level, filtering_1 and filtering_2.

    :return: The table of interaction, i.e. the result of filtering_2
    """
    # Loaded once, before the pet table, and handed to interactions().
    intersections = work_on_intersection()
    pet_table = del_overlaps(work_on_pet())
    with_level = add_level(interactions(pet_table, intersections))
    return filtering_2(filtering_1(with_level))
if __name__ == "__main__": if __name__ == "__main__":
filtering_2() create_interaction_table()
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Please register or sign in to comment