interactions: initiation

efadea32 · alapendr · 5f61e237 · efadea32 · efadea32 · efadea32
Commit efadea32 authored 5 years ago by alapendr
--- a/src/db_utils/interactions/bedtools.py
+++ b/src/db_utils/interactions/bedtools.py
+#!/usr/bin/env python3
+
+# -*- coding: utf-8 -*-
+
+"""
+Description: This script requires that ChIA-PET data must be in the following
+format: duplicated BED6, in which one line corresponds to an anchor of a PET
+and two lines in succession correspond to the two anchors of the same PET:
+#chrA startA endA chrA:startA..endA-chrB:startB..endB,weightA-B weightA-B .
+1   712493   714023   1:712493..714023-22:43009914..43011424,2   2   .
+#chrB startB endB chrA:startA..endA-chrB:startB..endB,weightA-B weightA-B .
+22   43009914   43011424   1:712493..714023-22:43009914..43011424,2   2   .
+
+This script allows to launch bedtools slop commands, in order to obtain two
+BED files: one with exons and one with genes, with theirs coordinates set to
+a window that we want to study, e.g. -200;+200.
+
+Then this script allows to launch bedtools intersect commands, in order to
+obtain one file with the list of exons that are matching with anchors of PETs
+and one file with the list of genes that are matching with anchors of PETs.
+These two files are generated for each project of ChIA-PET analyzed.
+"""
+
+import subprocess
+from .config_interactions import ConfigInteractions as Config
+from pathlib import Path
+
+
+def launch_bedtools_slop(target_bed_file: Path, target_window: int,
+                         target_output: Path) -> None:
+    """
+    Launch bedtools slop commands, in order to obtain two BED files: one with \
+    exons and one with genes, with theirs coordinates set to a window that we \
+    want to study, e.g. -200;+200.
+
+    :param target_bed_file: A bed file which contains the coordinates of the \
+    targets that we want to study, e.g. genes.
+    :param target_window: The nucleotide window that we add to ours targets.
+    :param target_output: The output bed file of our target with its changed \
+    coordinates.
+    """
+    subprocess.check_call(f"bedtools slop -i {target_bed_file} " f"-g "
+                          f"{Config.chr_sizes_hg19} " f"-b {target_window} " 
+                          f"> {target_output}", shell=True,
+                          stderr=subprocess.STDOUT)
+
+
+def launch_bedtools_intersect(target_output: Path, output_repository: Path):
+    """
+    Launch bedtools intersect commands, in order to obtain one file with the \
+    list of exons that are matching with anchors of PETs and one file with \
+    the list of genes that are matching with anchors of PETs. These two files \
+    are generated for each project of ChIA-PET analyzed.
+
+    :param target_output: The output bed file of our target with its changed \
+    coordinates.
+    :param output_repository: The output repository in which the bedtools \
+    intersect command result files are stored.
+    """
+    subprocess.check_call(f"for files in `ls {Config.chia_pet}" f"/*` ; do "
+                          f"basename_files=$(basename -s .bed " f"$files) ; "
+                          f"" f"basename_output=$(basename -s .bed "
+                          f"{target_output}) ; " f"bedtools intersect -wo -a "
+                          f"{target_output} " f"-b $files > " 
+                          f"{output_repository}/" "${basename_output}_vs_"
+                          "${basename_files}.bed ; done", shell=True,
+                          stderr=subprocess.STDOUT)
+
+
+def main_bedtools() -> None:
+    """
+    Launch bedtools slop and bedtools intersect commands for exons and genes.
+
+    """
+    launch_bedtools_slop(Config.exon, Config.exon_window, Config.exon_output)
+    launch_bedtools_slop(Config.gene, Config.gene_window, Config.gene_output)
+    launch_bedtools_intersect(Config.exon_output,
+                              Config.pet_vs_exon_output)
+    launch_bedtools_intersect(Config.gene_output,
+                              Config.pet_vs_gene_output)
+
+
+if __name__ == "__main__":
+    main_bedtools()
--- a/src/db_utils/interactions/config_interactions.py
+++ b/src/db_utils/interactions/config_interactions.py
+#!/usr/bin/env python3
+
+# -*- coding: UTF-8 -*-
+
+
+"""
+Description: Configuration variables for subfolder interactions.
+"""
+
+from ..config import Config
+from pathlib import Path
+
+
+class ConfigInteractions:
+    """
+    Configuration variable for this subfolder
+    """
+    exon = Config.data / 'bed' / 'exon.bed'
+    gene = Config.data / 'bed' / 'gene.bed'
+    chr_sizes_hg19 = Config.data / 'interactions_files' / \
+                                   'chr_sizes_hg19_nochr.txt'
+    exon_window = 200
+    gene_window = 200
+    chia_pet_interaction = Config.results / 'interactions'
+    exon_output = chia_pet_interaction / 'targets' / \
+                                         f'exon_w{exon_window}.bed'
+    gene_output = chia_pet_interaction / 'targets' / \
+                                         f'gene_w{gene_window}.bed'
+    chia_pet = Config.data / 'interactions_files' / 'chia_pet'
+    pet_vs_exon_output = chia_pet_interaction / 'intersections_exon'
+    pet_vs_gene_output = chia_pet_interaction / 'intersections_gene'
+
+
+def get_interaction_file(bed_file: Path, region_name: str):
+    """
+    Get the name of the interaction file created from the ``bed_file``.
+
+    :param bed_file: A bed file that was used to create the interaction \
+    file whose name is returned by this function.
+    :param region_name: The  name of the genomic region searched in PETs.
+    :return: The name of the interaction file
+    """
+    bed_name = bed_file.name.replace(".bed", "")
+    return ConfigInteractions.chia_pet_interaction / \
+           f"{bed_name}_{region_name}_interaction.csv"
--- a/src/db_utils/interactions/features_interactions.py
+++ b/src/db_utils/interactions/features_interactions.py
+#!/usr/bin/env python3
+
+# -*- coding: UTF-8 -*-
+
+"""
+Description:This script allows to determine which couples of genomic regions
+interact, e.g. exons or genes, that for each ChIA-PET dataset.
+
+This script requires the output file produced by the script bedtools.py, which
+is the result of the intersection, between the genes or exons BED file with the
+duplicate BED6 file of ChIA-PET data, that is in the following format:
+#Columns of the exons file and then columns of the ChIA-PET data file, e.g.
+18 28681 28882 1_1 0 - 18 28682 28782 1:47797..47799-18:28682..28782,2 2 . 100
+
+#Exons: chr=18  start=28681 end=28882   id=1_1  0=0 strand=-
+#ChIA-PET: chr2=18  start2=28682    end2=28782
+#chr1:start1..end1-chr2:start2..end2,weight1-2=1:47797..47799-18:28682..28782,2
+#weight1-2=2 .=. number of base pairs of overlap between exons and ChIA-PET=100
+"""
+
+
+import pandas as pd
+from datetime import datetime
+
+print("Start:",  str(datetime.now()))
+
+def get_id_col(m_series: pd.Series) -> str:
+    """
+    Allows to produce a id sorted, under this format: anchor1$anchor2.
+
+    :param m_series: Contains the anchor1 and anchor2.
+    :return:
+    """
+    return "$".join(sorted([m_series["anchor1"], m_series["anchor2"]]))
+
+
+def work_on_pet():
+    """
+    This works on the duplicate BED6 files of ChIA-PET data. Input format is:
+    #chr1 start1 end1 chr1:start1..end1-chr2:start2..end2,weight1-2 weight1-2 .
+    #chr2 start2 end2 chr1:start1..end1-chr2:start2..end2,weight1-2 weight1-2 .
+    We want the following output format:
+    #chr1:start1..end1   chr2:start2..end2   weight1-2
+    10:100172432..100175026    6:92192672..92194172  2
+
+    :return:
+    """
+    pet = pd.read_csv("/home/audrey/IE/ChIA_PET_network/data/interactions_files/chia_pet/GSM1517080.bed",sep="\t", header=None)
+    pet = pet[[3]].drop_duplicates()
+    pet = pet.iloc[:, 0].str.split(r"-|,", expand=True)
+    pet.columns = ["anchor1", "anchor2", "weight"]
+    pet["id"] = pet.apply(get_id_col, axis=1)
+    pet = pet[["weight", "id"]].groupby(["id"]).sum().reset_index(drop=False)
+    pet = pet["id"] + "$" + pet["weight"]
+    pet = pet.str.split("$", expand=True)
+    pet.columns = ["anchor1", "anchor2", "weight"]
+    return pet
+
+
+def del_overlaps():
+    """
+    This works on the previous dataframe result (pet). Input format is:
+    #chr1:start1..end1   chr2:start2..end2   weight1-2
+    We delete from this dataframe the pet that has overlapping anchors, e.g.
+    9:139773532..139778733  9:139778161..139781850  7
+
+    :return:
+    """
+    pet = work_on_pet()
+    pet[["chr1", "start1", "space1", "end1"]] = pet["anchor1"].str.\
+        split(r"[:..]", expand=True)
+    pet[["chr2", "start2", "space2", "end2"]] = pet["anchor2"].str.\
+        split(r"[:..]", expand=True)
+    pet = pet.drop(["anchor1", "anchor2", "space1", "space2"], axis=1)
+    pet.loc[pet["chr1"] != pet["chr2"], "delete"] = "no"
+    pet.loc[(pet["chr1"] == pet["chr2"]) & ((pet["start1"].astype(int) >=
+                                             pet["start2"].astype(int)) &
+                                            (pet["start1"].astype(int) <=
+                                             pet["end2"].astype(int)) |
+                                            (pet["end1"].astype(int) >=
+                                             pet["start2"].astype(int)) &
+                                            (pet["end1"].astype(int) <=
+                                             pet["end2"].astype(int))),
+            "delete"] = "yes"
+    to_del = pet[pet.delete == "yes"].index.tolist()
+    pet = pet.drop(to_del)
+    pet["anchor1"] = pet["chr1"] + ":" + pet["start1"] + ".." + pet["end1"]
+    pet["anchor2"] = pet["chr2"] + ":" + pet["start2"] + ".." + pet["end2"]
+    pet.drop(["chr1", "start1", "end1", "chr2", "start2", "end2", "delete"],
+             inplace=True, axis=1)
+    pet = pet[["anchor1", "anchor2", "weight"]]
+    return pet
+
+
+def work_on_intersection():
+    """
+    This works on the output file produced by the script bedtools.py. The input
+    format is (see the description of this script for more information), e.g.
+    18 28681 28882 1_1 0 - 18 28682 28782 1:47797..47799-18:28682..28782,2 2 .
+    100
+    We want the following output format:
+    1_1 18:28682..28782 --> which is the ID of the genomic region, e.g. exon
+    and the coordinates of the anchor matching to this genomic region.
+
+    :return:
+    """
+    inter = pd.read_csv("/home/audrey/IE/ChIA_PET_network/results/interactions/intersections_gene/gene_w200_vs_GSM1517080.bed", sep="\t", header=None)
+    inter = inter.iloc[:, [3, 6, 7, 8]]
+    inter.columns = ["id_region", "chr", "start", "end"]
+    inter["id_anchor"] = inter["chr"].astype("str") + ":" + inter["start"].\
+        astype("str") + ".." + inter["end"].astype("str")
+    inter.drop(["chr", "start", "end"], inplace=True, axis=1)
+    return inter
+
+
+def interactions():
+    """
+    Allows to determine which couples of genomic regions interact, according to
+    what weight.
+    #id_region_a1    id_anchor_a1    id_region_a2    id_anchor_a2    weight
+    7832    10:10019900..10020058 16755   11:5834473..5834631   2
+    It means that gene 7832 interacts with gene 16755, according to a weight of
+    2.
+
+    :return:
+    """
+    pet = del_overlaps()
+    df_final = pd.DataFrame()
+    for index, row in pet.iterrows():
+        if index == 50:
+            break
+        inter = work_on_intersection()
+        match_a = inter.loc[inter["id_anchor"] == row[0], :]
+        match_b = inter.loc[inter["id_anchor"] == row[1], :]
+        table_a = pd.DataFrame({"id_region": match_a["id_region"],
+                                "id_anchor": match_a["id_anchor"],
+                                "number": [index] * len(match_a)})
+        table_b = pd.DataFrame({"id_region": match_b["id_region"],
+                                "id_anchor": match_b["id_anchor"],
+                                "number": [index] * len(match_b)})
+        table_a = table_a.merge(table_b, how="outer", on="number",
+                                suffixes=["_a1", "_a2"])
+        table_a.drop("number", inplace=True, axis=1)
+        table_a = table_a.loc[(-table_a["id_anchor_a1"].isna()) &
+                              (-table_a["id_anchor_a2"].isna()), :]
+        if not table_a.empty:
+            df_final = pd.concat([df_final, table_a], axis=0,
+                                 ignore_index=True)
+    df_final = df_final.merge(pet, how="left",
+                              left_on=["id_anchor_a1", "id_anchor_a2"],
+                              right_on=["anchor1", "anchor2"])
+    df_final.drop(["anchor1", "anchor2"], axis=1, inplace=True)
+    df_final.rename(columns={2: "weight"}, inplace=True)
+    return df_final
+
+
+def add_level():
+    """
+    Add the level to the previous dataframe result (df_final):
+    - intrachromosomique: the two genomic regions that interact are in the same
+    chromosome
+    - interchromosomique: the two genomic regions that interact are in
+    different chromosomes
+    #id_region_a1 id_anchor_a1 id_region_a2 id_anchor_a2 weight level
+    7832 10:10019900..10020058 16755 11:5834473..5834631 2 interchromosomique
+
+    :return:
+    """
+    df_level = interactions()
+    df_level[["chr_a1", "coordinates_a1"]] = df_level.id_anchor_a1.str.\
+        split(":", expand=True)
+    df_level[["chr_a2", "coordinates_a2"]] = df_level.id_anchor_a2.str.\
+        split(":", expand=True)
+    df_level.loc[df_level["chr_a1"] == df_level["chr_a2"], "level"] = \
+        "intrachromosomique"
+    df_level.loc[df_level["chr_a1"] != df_level["chr_a2"], "level"] = \
+        "interchromosomique"
+    df_level.drop(["chr_a1", "coordinates_a1", "chr_a2", "coordinates_a2"],
+                  axis="columns", inplace=True)
+    return df_level
+
+
+def filtering_1():
+    """
+    Filtering of the previous dataframe result (df_level) by removing:
+    - the genomic regions that interact with itself, e.g.
+    #id_region_a1 id_anchor_a1 id_region_a2 id_anchor_a2 weight level
+    7832 10:10019900..10020058 7832 10:10019900..10020088 2 intrachromosomique
+
+    :return:
+    """
+    df_filter_1 = add_level()
+    df_filter_1.loc[df_filter_1["id_region_a1"] == df_filter_1["id_region_a2"],
+                    "identical_or_not"] = "identical"
+    df_filter_1.loc[df_filter_1["id_region_a1"] != df_filter_1["id_region_a2"],
+                    "identical_or_not"] = "not"
+    to_del = df_filter_1[df_filter_1.identical_or_not == "identical"].index.\
+        tolist()
+    df_filter_1 = df_filter_1.drop(to_del)
+    del df_filter_1["identical_or_not"]
+    return df_filter_1
+
+
+def filtering_2():
+    """
+    Filtering of the previous dataframe result (df_filter_1) by adding:
+    - the weights of the interactions that describe the same interaction, e.g.
+    #id_region_a1 id_anchor_a1 id_region_a2 id_anchor_a2 weight level
+    7832 10:10019900..10020058 16755 11:5834473..5834631 2 interchromosomique
+    7832 10:10019900..10020088 16755 11:5834422..5834625 2 interchromosomique
+    --> #id_region_a1   id_region_a2    weight  level
+    --> 7832    16755   4   interchromosomique
+
+    :return:
+    """
+    df_filter_2 = filtering_1()
+    df_filter_2 = df_filter_2.drop_duplicates()
+    df_filter_2["id"] = df_filter_2["id_region_a1"].astype(str) + "$" + \
+                        df_filter_2["id_region_a2"].astype(str)
+    df_filter_2.drop(["id_anchor_a1", "id_anchor_a2", "id_region_a1",
+                      "id_region_a2"], axis="columns", inplace=True)
+    df_filter_2["weight"] = df_filter_2["weight"].astype(int)
+    df_filter_2 = df_filter_2[["weight", "id", "level"]].groupby(
+        ["id", "level"]).sum().reset_index(drop=False)
+    df_filter_2[["id_region_a1", "id_region_a2"]] = df_filter_2.id.str.\
+        split("$", expand=True)
+    del df_filter_2["id"]
+    print(df_filter_2)
+    print("End:", str(datetime.now()))
+    return df_filter_2
+
+
+if __name__ == "__main__":
+    filtering_2()