diff --git a/src/db_utils/interactions/bedtools.py b/src/db_utils/interactions/bedtools.py new file mode 100644 index 0000000000000000000000000000000000000000..69e9a91442f69df57f3389f4edf625bb5a5bfa4f --- /dev/null +++ b/src/db_utils/interactions/bedtools.py @@ -0,0 +1,84 @@ +#!/usr/bin/env python3 + +# -*- coding: utf-8 -*- + +""" +Description: This script requires that ChIA-PET data must be in the following +format: duplicated BED6, in which one line corresponds to an anchor of a PET +and two lines in succession correspond to the two anchors of the same PET: +#chrA startA endA chrA:startA..endA-chrB:startB..endB,weightA-B weightA-B . +1 712493 714023 1:712493..714023-22:43009914..43011424,2 2 . +#chrB startB endB chrA:startA..endA-chrB:startB..endB,weightA-B weightA-B . +22 43009914 43011424 1:712493..714023-22:43009914..43011424,2 2 . + +This script allows to launch bedtools slop commands, in order to obtain two +BED files: one with exons and one with genes, with theirs coordinates set to +a window that we want to study, e.g. -200;+200. + +Then this script allows to launch bedtools intersect commands, in order to +obtain one file with the list of exons that are matching with anchors of PETs +and one file with the list of genes that are matching with anchors of PETs. +These two files are generated for each project of ChIA-PET analyzed. +""" + +import subprocess +from .config_interactions import ConfigInteractions as Config +from pathlib import Path + + +def launch_bedtools_slop(target_bed_file: Path, target_window: int, + target_output: Path) -> None: + """ + Launch bedtools slop commands, in order to obtain two BED files: one with \ + exons and one with genes, with theirs coordinates set to a window that we \ + want to study, e.g. -200;+200. + + :param target_bed_file: A bed file which contains the coordinates of the \ + targets that we want to study, e.g. genes. + :param target_window: The nucleotide window that we add to ours targets. + :param target_output: The output bed file of our target with its changed \ + coordinates. + """ + subprocess.check_call(f"bedtools slop -i {target_bed_file} " f"-g " + f"{Config.chr_sizes_hg19} " f"-b {target_window} " + f"> {target_output}", shell=True, + stderr=subprocess.STDOUT) + + +def launch_bedtools_intersect(target_output: Path, output_repository: Path): + """ + Launch bedtools intersect commands, in order to obtain one file with the \ + list of exons that are matching with anchors of PETs and one file with \ + the list of genes that are matching with anchors of PETs. These two files \ + are generated for each project of ChIA-PET analyzed. + + :param target_output: The output bed file of our target with its changed \ + coordinates. + :param output_repository: The output repository in which the bedtools \ + intersect command result files are stored. + """ + subprocess.check_call(f"for files in `ls {Config.chia_pet}" f"/*` ; do " + f"basename_files=$(basename -s .bed " f"$files) ; " + f"" f"basename_output=$(basename -s .bed " + f"{target_output}) ; " f"bedtools intersect -wo -a " + f"{target_output} " f"-b $files > " + f"{output_repository}/" "${basename_output}_vs_" + "${basename_files}.bed ; done", shell=True, + stderr=subprocess.STDOUT) + + +def main_bedtools() -> None: + """ + Launch bedtools slop and bedtools intersect commands for exons and genes. + + """ + launch_bedtools_slop(Config.exon, Config.exon_window, Config.exon_output) + launch_bedtools_slop(Config.gene, Config.gene_window, Config.gene_output) + launch_bedtools_intersect(Config.exon_output, + Config.pet_vs_exon_output) + launch_bedtools_intersect(Config.gene_output, + Config.pet_vs_gene_output) + + +if __name__ == "__main__": + main_bedtools() diff --git a/src/db_utils/interactions/config_interactions.py b/src/db_utils/interactions/config_interactions.py new file mode 100644 index 0000000000000000000000000000000000000000..78e0398bee63091cacbd5f6d46c7a55bb0d51760 --- /dev/null +++ b/src/db_utils/interactions/config_interactions.py @@ -0,0 +1,45 @@ +#!/usr/bin/env python3 + +# -*- coding: UTF-8 -*- + + +""" +Description: Configuration variables for subfolder interactions. +""" + +from ..config import Config +from pathlib import Path + + +class ConfigInteractions: + """ + Configuration variable for this subfolder + """ + exon = Config.data / 'bed' / 'exon.bed' + gene = Config.data / 'bed' / 'gene.bed' + chr_sizes_hg19 = Config.data / 'interactions_files' / \ + 'chr_sizes_hg19_nochr.txt' + exon_window = 200 + gene_window = 200 + chia_pet_interaction = Config.results / 'interactions' + exon_output = chia_pet_interaction / 'targets' / \ + f'exon_w{exon_window}.bed' + gene_output = chia_pet_interaction / 'targets' / \ + f'gene_w{gene_window}.bed' + chia_pet = Config.data / 'interactions_files' / 'chia_pet' + pet_vs_exon_output = chia_pet_interaction / 'intersections_exon' + pet_vs_gene_output = chia_pet_interaction / 'intersections_gene' + + +def get_interaction_file(bed_file: Path, region_name: str): + """ + Get the name of the interaction file created from the ``bed_file``. + + :param bed_file: A bed file that was used to create the interaction \ + file whose name is returned by this function. + :param region_name: The name of the genomic region searched in PETs. + :return: The name of the interaction file + """ + bed_name = bed_file.name.replace(".bed", "") + return ConfigInteractions.chia_pet_interaction / \ + f"{bed_name}_{region_name}_interaction.csv" diff --git a/src/db_utils/interactions/features_interactions.py b/src/db_utils/interactions/features_interactions.py new file mode 100644 index 0000000000000000000000000000000000000000..4f2512d0f13ed72a6ef7cc19ce420466f0ce9b36 --- /dev/null +++ b/src/db_utils/interactions/features_interactions.py @@ -0,0 +1,234 @@ +#!/usr/bin/env python3 + +# -*- coding: UTF-8 -*- + +""" +Description:This script allows to determine which couples of genomic regions +interact, e.g. exons or genes, that for each ChIA-PET dataset. + +This script requires the output file produced by the script bedtools.py, which +is the result of the intersection, between the genes or exons BED file with the +duplicate BED6 file of ChIA-PET data, that is in the following format: +#Columns of the exons file and then columns of the ChIA-PET data file, e.g. +18 28681 28882 1_1 0 - 18 28682 28782 1:47797..47799-18:28682..28782,2 2 . 100 + +#Exons: chr=18 start=28681 end=28882 id=1_1 0=0 strand=- +#ChIA-PET: chr2=18 start2=28682 end2=28782 +#chr1:start1..end1-chr2:start2..end2,weight1-2=1:47797..47799-18:28682..28782,2 +#weight1-2=2 .=. number of base pairs of overlap between exons and ChIA-PET=100 +""" + + +import pandas as pd +from datetime import datetime + +print("Start:", str(datetime.now())) + +def get_id_col(m_series: pd.Series) -> str: + """ + Allows to produce a id sorted, under this format: anchor1$anchor2. + + :param m_series: Contains the anchor1 and anchor2. + :return: + """ + return "$".join(sorted([m_series["anchor1"], m_series["anchor2"]])) + + +def work_on_pet(): + """ + This works on the duplicate BED6 files of ChIA-PET data. Input format is: + #chr1 start1 end1 chr1:start1..end1-chr2:start2..end2,weight1-2 weight1-2 . + #chr2 start2 end2 chr1:start1..end1-chr2:start2..end2,weight1-2 weight1-2 . + We want the following output format: + #chr1:start1..end1 chr2:start2..end2 weight1-2 + 10:100172432..100175026 6:92192672..92194172 2 + + :return: + """ + pet = pd.read_csv("/home/audrey/IE/ChIA_PET_network/data/interactions_files/chia_pet/GSM1517080.bed",sep="\t", header=None) + pet = pet[[3]].drop_duplicates() + pet = pet.iloc[:, 0].str.split(r"-|,", expand=True) + pet.columns = ["anchor1", "anchor2", "weight"] + pet["id"] = pet.apply(get_id_col, axis=1) + pet = pet[["weight", "id"]].groupby(["id"]).sum().reset_index(drop=False) + pet = pet["id"] + "$" + pet["weight"] + pet = pet.str.split("$", expand=True) + pet.columns = ["anchor1", "anchor2", "weight"] + return pet + + +def del_overlaps(): + """ + This works on the previous dataframe result (pet). Input format is: + #chr1:start1..end1 chr2:start2..end2 weight1-2 + We delete from this dataframe the pet that has overlapping anchors, e.g. + 9:139773532..139778733 9:139778161..139781850 7 + + :return: + """ + pet = work_on_pet() + pet[["chr1", "start1", "space1", "end1"]] = pet["anchor1"].str.\ + split(r"[:..]", expand=True) + pet[["chr2", "start2", "space2", "end2"]] = pet["anchor2"].str.\ + split(r"[:..]", expand=True) + pet = pet.drop(["anchor1", "anchor2", "space1", "space2"], axis=1) + pet.loc[pet["chr1"] != pet["chr2"], "delete"] = "no" + pet.loc[(pet["chr1"] == pet["chr2"]) & ((pet["start1"].astype(int) >= + pet["start2"].astype(int)) & + (pet["start1"].astype(int) <= + pet["end2"].astype(int)) | + (pet["end1"].astype(int) >= + pet["start2"].astype(int)) & + (pet["end1"].astype(int) <= + pet["end2"].astype(int))), + "delete"] = "yes" + to_del = pet[pet.delete == "yes"].index.tolist() + pet = pet.drop(to_del) + pet["anchor1"] = pet["chr1"] + ":" + pet["start1"] + ".." + pet["end1"] + pet["anchor2"] = pet["chr2"] + ":" + pet["start2"] + ".." + pet["end2"] + pet.drop(["chr1", "start1", "end1", "chr2", "start2", "end2", "delete"], + inplace=True, axis=1) + pet = pet[["anchor1", "anchor2", "weight"]] + return pet + + +def work_on_intersection(): + """ + This works on the output file produced by the script bedtools.py. The input + format is (see the description of this script for more information), e.g. + 18 28681 28882 1_1 0 - 18 28682 28782 1:47797..47799-18:28682..28782,2 2 . + 100 + We want the following output format: + 1_1 18:28682..28782 --> which is the ID of the genomic region, e.g. exon + and the coordinates of the anchor matching to this genomic region. + + :return: + """ + inter = pd.read_csv("/home/audrey/IE/ChIA_PET_network/results/interactions/intersections_gene/gene_w200_vs_GSM1517080.bed", sep="\t", header=None) + inter = inter.iloc[:, [3, 6, 7, 8]] + inter.columns = ["id_region", "chr", "start", "end"] + inter["id_anchor"] = inter["chr"].astype("str") + ":" + inter["start"].\ + astype("str") + ".." + inter["end"].astype("str") + inter.drop(["chr", "start", "end"], inplace=True, axis=1) + return inter + + +def interactions(): + """ + Allows to determine which couples of genomic regions interact, according to + what weight. + #id_region_a1 id_anchor_a1 id_region_a2 id_anchor_a2 weight + 7832 10:10019900..10020058 16755 11:5834473..5834631 2 + It means that gene 7832 interacts with gene 16755, according to a weight of + 2. + + :return: + """ + pet = del_overlaps() + df_final = pd.DataFrame() + for index, row in pet.iterrows(): + if index == 50: + break + inter = work_on_intersection() + match_a = inter.loc[inter["id_anchor"] == row[0], :] + match_b = inter.loc[inter["id_anchor"] == row[1], :] + table_a = pd.DataFrame({"id_region": match_a["id_region"], + "id_anchor": match_a["id_anchor"], + "number": [index] * len(match_a)}) + table_b = pd.DataFrame({"id_region": match_b["id_region"], + "id_anchor": match_b["id_anchor"], + "number": [index] * len(match_b)}) + table_a = table_a.merge(table_b, how="outer", on="number", + suffixes=["_a1", "_a2"]) + table_a.drop("number", inplace=True, axis=1) + table_a = table_a.loc[(-table_a["id_anchor_a1"].isna()) & + (-table_a["id_anchor_a2"].isna()), :] + if not table_a.empty: + df_final = pd.concat([df_final, table_a], axis=0, + ignore_index=True) + df_final = df_final.merge(pet, how="left", + left_on=["id_anchor_a1", "id_anchor_a2"], + right_on=["anchor1", "anchor2"]) + df_final.drop(["anchor1", "anchor2"], axis=1, inplace=True) + df_final.rename(columns={2: "weight"}, inplace=True) + return df_final + + +def add_level(): + """ + Add the level to the previous dataframe result (df_final): + - intrachromosomique: the two genomic regions that interact are in the same + chromosome + - interchromosomique: the two genomic regions that interact are in + different chromosomes + #id_region_a1 id_anchor_a1 id_region_a2 id_anchor_a2 weight level + 7832 10:10019900..10020058 16755 11:5834473..5834631 2 interchromosomique + + :return: + """ + df_level = interactions() + df_level[["chr_a1", "coordinates_a1"]] = df_level.id_anchor_a1.str.\ + split(":", expand=True) + df_level[["chr_a2", "coordinates_a2"]] = df_level.id_anchor_a2.str.\ + split(":", expand=True) + df_level.loc[df_level["chr_a1"] == df_level["chr_a2"], "level"] = \ + "intrachromosomique" + df_level.loc[df_level["chr_a1"] != df_level["chr_a2"], "level"] = \ + "interchromosomique" + df_level.drop(["chr_a1", "coordinates_a1", "chr_a2", "coordinates_a2"], + axis="columns", inplace=True) + return df_level + + +def filtering_1(): + """ + Filtering of the previous dataframe result (df_level) by removing: + - the genomic regions that interact with itself, e.g. + #id_region_a1 id_anchor_a1 id_region_a2 id_anchor_a2 weight level + 7832 10:10019900..10020058 7832 10:10019900..10020088 2 intrachromosomique + + :return: + """ + df_filter_1 = add_level() + df_filter_1.loc[df_filter_1["id_region_a1"] == df_filter_1["id_region_a2"], + "identical_or_not"] = "identical" + df_filter_1.loc[df_filter_1["id_region_a1"] != df_filter_1["id_region_a2"], + "identical_or_not"] = "not" + to_del = df_filter_1[df_filter_1.identical_or_not == "identical"].index.\ + tolist() + df_filter_1 = df_filter_1.drop(to_del) + del df_filter_1["identical_or_not"] + return df_filter_1 + + +def filtering_2(): + """ + Filtering of the previous dataframe result (df_filter_1) by adding: + - the weights of the interactions that describe the same interaction, e.g. + #id_region_a1 id_anchor_a1 id_region_a2 id_anchor_a2 weight level + 7832 10:10019900..10020058 16755 11:5834473..5834631 2 interchromosomique + 7832 10:10019900..10020088 16755 11:5834422..5834625 2 interchromosomique + --> #id_region_a1 id_region_a2 weight level + --> 7832 16755 4 interchromosomique + + :return: + """ + df_filter_2 = filtering_1() + df_filter_2 = df_filter_2.drop_duplicates() + df_filter_2["id"] = df_filter_2["id_region_a1"].astype(str) + "$" + \ + df_filter_2["id_region_a2"].astype(str) + df_filter_2.drop(["id_anchor_a1", "id_anchor_a2", "id_region_a1", + "id_region_a2"], axis="columns", inplace=True) + df_filter_2["weight"] = df_filter_2["weight"].astype(int) + df_filter_2 = df_filter_2[["weight", "id", "level"]].groupby( + ["id", "level"]).sum().reset_index(drop=False) + df_filter_2[["id_region_a1", "id_region_a2"]] = df_filter_2.id.str.\ + split("$", expand=True) + del df_filter_2["id"] + print(df_filter_2) + print("End:", str(datetime.now())) + return df_filter_2 + + +if __name__ == "__main__": + filtering_2()