src/db_utils/interactions/features_interactions.py: Creation of a function...

src/db_utils/interactions/features_interactions.py: Creation of a function create_interaction_table that calls every other function from that file

src/db_utils/interactions/features_interactions.py: Creation of a function...
772b8999 · nfontrod · 2904ac24 · 772b8999
Commit 772b8999 authored 5 years ago by nfontrod
--- a/src/db_utils/interactions/features_interactions.py
+++ b/src/db_utils/interactions/features_interactions.py
@@ -21,9 +21,11 @@ duplicate BED6 file of ChIA-PET data, that is in the following format:
 import pandas as pd
 from datetime import datetime
+from .config_interactions import ConfigInteractions as Config
 print("Start:",  str(datetime.now()))
 def get_id_col(m_series: pd.Series) -> str:
    """
    Allows to produce a id sorted, under this format: anchor1$anchor2.
@@ -45,7 +47,8 @@ def work_on_pet():
    :return:
    """
-    pet = pd.read_csv("/home/audrey/IE/ChIA_PET_network/data/interactions_files/chia_pet/GSM1517080.bed",sep="\t", header=None)
+    pet = pd.read_csv(Config.chia_pet / "GSM1517080.bed", sep="\t",
+                      header=None)
    pet = pet[[3]].drop_duplicates()
    pet = pet.iloc[:, 0].str.split(r"-|,", expand=True)
    pet.columns = ["anchor1", "anchor2", "weight"]
@@ -57,16 +60,16 @@ def work_on_pet():
    return pet
-def del_overlaps():
+def del_overlaps(pet: pd.DataFrame):
    """
    This works on the previous dataframe result (pet). Input format is:
    #chr1:start1..end1   chr2:start2..end2   weight1-2
    We delete from this dataframe the pet that has overlapping anchors, e.g.
    9:139773532..139778733  9:139778161..139781850  7
+    :param pet:
    :return:
    """
-    pet = work_on_pet()
    pet[["chr1", "start1", "space1", "end1"]] = pet["anchor1"].str.\
        split(r"[:..]", expand=True)
    pet[["chr2", "start2", "space2", "end2"]] = pet["anchor2"].str.\
@@ -104,7 +107,9 @@ def work_on_intersection():
    :return:
    """
-    inter = pd.read_csv("/home/audrey/IE/ChIA_PET_network/results/interactions/intersections_gene/gene_w200_vs_GSM1517080.bed", sep="\t", header=None)
+    inter = pd.read_csv(Config.pet_vs_gene_output /
+                        "gene_w200_vs_GSM1517080.bed",
+                        sep="\t", header=None)
    inter = inter.iloc[:, [3, 6, 7, 8]]
    inter.columns = ["id_region", "chr", "start", "end"]
    inter["id_anchor"] = inter["chr"].astype("str") + ":" + inter["start"].\
@@ -113,7 +118,7 @@ def work_on_intersection():
    return inter
-def interactions():
+def interactions(pet: pd.DataFrame, inter: pd.DataFrame):
    """
    Allows to determine which couples of genomic regions interact, according to
    what weight.
@@ -122,14 +127,14 @@ def interactions():
    It means that gene 7832 interacts with gene 16755, according to a weight of
    2.
+    :param pet:
+    :param inter:
    :return:
    """
-    pet = del_overlaps()
    df_final = pd.DataFrame()
    for index, row in pet.iterrows():
        if index == 50:
            break
-        inter = work_on_intersection()
        match_a = inter.loc[inter["id_anchor"] == row[0], :]
        match_b = inter.loc[inter["id_anchor"] == row[1], :]
        table_a = pd.DataFrame({"id_region": match_a["id_region"],
@@ -154,7 +159,7 @@ def interactions():
    return df_final
-def add_level():
+def add_level(df_level: pd.DataFrame):
    """
    Add the level to the previous dataframe result (df_final):
    - intrachromosomique: the two genomic regions that interact are in the same
@@ -164,9 +169,9 @@ def add_level():
    #id_region_a1 id_anchor_a1 id_region_a2 id_anchor_a2 weight level
    7832 10:10019900..10020058 16755 11:5834473..5834631 2 interchromosomique
+    :param df_level:
    :return:
    """
-    df_level = interactions()
    df_level[["chr_a1", "coordinates_a1"]] = df_level.id_anchor_a1.str.\
        split(":", expand=True)
    df_level[["chr_a2", "coordinates_a2"]] = df_level.id_anchor_a2.str.\
@@ -180,16 +185,16 @@ def add_level():
    return df_level
-def filtering_1():
+def filtering_1(df_filter_1: pd.DataFrame):
    """
    Filtering of the previous dataframe result (df_level) by removing:
    - the genomic regions that interact with itself, e.g.
    #id_region_a1 id_anchor_a1 id_region_a2 id_anchor_a2 weight level
    7832 10:10019900..10020058 7832 10:10019900..10020088 2 intrachromosomique
+    :param df_filter_1:
    :return:
    """
-    df_filter_1 = add_level()
    df_filter_1.loc[df_filter_1["id_region_a1"] == df_filter_1["id_region_a2"],
                    "identical_or_not"] = "identical"
    df_filter_1.loc[df_filter_1["id_region_a1"] != df_filter_1["id_region_a2"],
@@ -201,7 +206,7 @@ def filtering_1():
    return df_filter_1
-def filtering_2():
+def filtering_2(df_filter_2: pd.DataFrame):
    """
    Filtering of the previous dataframe result (df_filter_1) by adding:
    - the weights of the interactions that describe the same interaction, e.g.
@@ -211,12 +216,12 @@ def filtering_2():
    --> #id_region_a1   id_region_a2    weight  level
    --> 7832    16755   4   interchromosomique
+    :param df_filter_2:
    :return:
    """
-    df_filter_2 = filtering_1()
    df_filter_2 = df_filter_2.drop_duplicates()
    df_filter_2["id"] = df_filter_2["id_region_a1"].astype(str) + "$" + \
-                        df_filter_2["id_region_a2"].astype(str)
+        df_filter_2["id_region_a2"].astype(str)
    df_filter_2.drop(["id_anchor_a1", "id_anchor_a2", "id_region_a1",
                      "id_region_a2"], axis="columns", inplace=True)
    df_filter_2["weight"] = df_filter_2["weight"].astype(int)
@@ -230,5 +235,20 @@ def filtering_2():
    return df_filter_2
+def create_interaction_table():
+    """
+    Create the interaction table by running, in order, every step of this
+    file (from work_on_intersection and work_on_pet up to filtering_2).
+    :return: The table of interactions, as returned by filtering_2
+    """
+    inter_df = work_on_intersection()
+    df = work_on_pet()
+    df = del_overlaps(df)
+    df = interactions(df, inter_df)
+    df = add_level(df)
+    df = filtering_1(df)
+    return filtering_2(df)
 if __name__ == "__main__":
-    filtering_2()
+    create_interaction_table()