Skip to content
Snippets Groups Projects
Commit efadea32 authored by alapendr's avatar alapendr
Browse files

interactions: initiation

parent 5f61e237
No related branches found
No related tags found
No related merge requests found
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Description: This script requires that ChIA-PET data must be in the following
format: duplicated BED6, in which one line corresponds to an anchor of a PET
and two lines in succession correspond to the two anchors of the same PET:
#chrA startA endA chrA:startA..endA-chrB:startB..endB,weightA-B weightA-B .
1 712493 714023 1:712493..714023-22:43009914..43011424,2 2 .
#chrB startB endB chrA:startA..endA-chrB:startB..endB,weightA-B weightA-B .
22 43009914 43011424 1:712493..714023-22:43009914..43011424,2 2 .
This script allows to launch bedtools slop commands, in order to obtain two
BED files: one with exons and one with genes, with theirs coordinates set to
a window that we want to study, e.g. -200;+200.
Then this script allows to launch bedtools intersect commands, in order to
obtain one file with the list of exons that are matching with anchors of PETs
and one file with the list of genes that are matching with anchors of PETs.
These two files are generated for each project of ChIA-PET analyzed.
"""
import subprocess
from .config_interactions import ConfigInteractions as Config
from pathlib import Path
def launch_bedtools_slop(target_bed_file: Path, target_window: int,
target_output: Path) -> None:
"""
Launch bedtools slop commands, in order to obtain two BED files: one with \
exons and one with genes, with theirs coordinates set to a window that we \
want to study, e.g. -200;+200.
:param target_bed_file: A bed file which contains the coordinates of the \
targets that we want to study, e.g. genes.
:param target_window: The nucleotide window that we add to ours targets.
:param target_output: The output bed file of our target with its changed \
coordinates.
"""
subprocess.check_call(f"bedtools slop -i {target_bed_file} " f"-g "
f"{Config.chr_sizes_hg19} " f"-b {target_window} "
f"> {target_output}", shell=True,
stderr=subprocess.STDOUT)
def launch_bedtools_intersect(target_output: Path, output_repository: Path):
"""
Launch bedtools intersect commands, in order to obtain one file with the \
list of exons that are matching with anchors of PETs and one file with \
the list of genes that are matching with anchors of PETs. These two files \
are generated for each project of ChIA-PET analyzed.
:param target_output: The output bed file of our target with its changed \
coordinates.
:param output_repository: The output repository in which the bedtools \
intersect command result files are stored.
"""
subprocess.check_call(f"for files in `ls {Config.chia_pet}" f"/*` ; do "
f"basename_files=$(basename -s .bed " f"$files) ; "
f"" f"basename_output=$(basename -s .bed "
f"{target_output}) ; " f"bedtools intersect -wo -a "
f"{target_output} " f"-b $files > "
f"{output_repository}/" "${basename_output}_vs_"
"${basename_files}.bed ; done", shell=True,
stderr=subprocess.STDOUT)
def main_bedtools() -> None:
"""
Launch bedtools slop and bedtools intersect commands for exons and genes.
"""
launch_bedtools_slop(Config.exon, Config.exon_window, Config.exon_output)
launch_bedtools_slop(Config.gene, Config.gene_window, Config.gene_output)
launch_bedtools_intersect(Config.exon_output,
Config.pet_vs_exon_output)
launch_bedtools_intersect(Config.gene_output,
Config.pet_vs_gene_output)
if __name__ == "__main__":
main_bedtools()
#!/usr/bin/env python3
# -*- coding: UTF-8 -*-
"""
Description: Configuration variables for subfolder interactions.
"""
from ..config import Config
from pathlib import Path
class ConfigInteractions:
"""
Configuration variable for this subfolder
"""
exon = Config.data / 'bed' / 'exon.bed'
gene = Config.data / 'bed' / 'gene.bed'
chr_sizes_hg19 = Config.data / 'interactions_files' / \
'chr_sizes_hg19_nochr.txt'
exon_window = 200
gene_window = 200
chia_pet_interaction = Config.results / 'interactions'
exon_output = chia_pet_interaction / 'targets' / \
f'exon_w{exon_window}.bed'
gene_output = chia_pet_interaction / 'targets' / \
f'gene_w{gene_window}.bed'
chia_pet = Config.data / 'interactions_files' / 'chia_pet'
pet_vs_exon_output = chia_pet_interaction / 'intersections_exon'
pet_vs_gene_output = chia_pet_interaction / 'intersections_gene'
def get_interaction_file(bed_file: Path, region_name: str):
"""
Get the name of the interaction file created from the ``bed_file``.
:param bed_file: A bed file that was used to create the interaction \
file whose name is returned by this function.
:param region_name: The name of the genomic region searched in PETs.
:return: The name of the interaction file
"""
bed_name = bed_file.name.replace(".bed", "")
return ConfigInteractions.chia_pet_interaction / \
f"{bed_name}_{region_name}_interaction.csv"
#!/usr/bin/env python3
# -*- coding: UTF-8 -*-
"""
Description:This script allows to determine which couples of genomic regions
interact, e.g. exons or genes, that for each ChIA-PET dataset.
This script requires the output file produced by the script bedtools.py, which
is the result of the intersection, between the genes or exons BED file with the
duplicate BED6 file of ChIA-PET data, that is in the following format:
#Columns of the exons file and then columns of the ChIA-PET data file, e.g.
18 28681 28882 1_1 0 - 18 28682 28782 1:47797..47799-18:28682..28782,2 2 . 100
#Exons: chr=18 start=28681 end=28882 id=1_1 0=0 strand=-
#ChIA-PET: chr2=18 start2=28682 end2=28782
#chr1:start1..end1-chr2:start2..end2,weight1-2=1:47797..47799-18:28682..28782,2
#weight1-2=2 .=. number of base pairs of overlap between exons and ChIA-PET=100
"""
import pandas as pd
from datetime import datetime
print("Start:", str(datetime.now()))
def get_id_col(m_series: pd.Series) -> str:
"""
Allows to produce a id sorted, under this format: anchor1$anchor2.
:param m_series: Contains the anchor1 and anchor2.
:return:
"""
return "$".join(sorted([m_series["anchor1"], m_series["anchor2"]]))
def work_on_pet():
"""
This works on the duplicate BED6 files of ChIA-PET data. Input format is:
#chr1 start1 end1 chr1:start1..end1-chr2:start2..end2,weight1-2 weight1-2 .
#chr2 start2 end2 chr1:start1..end1-chr2:start2..end2,weight1-2 weight1-2 .
We want the following output format:
#chr1:start1..end1 chr2:start2..end2 weight1-2
10:100172432..100175026 6:92192672..92194172 2
:return:
"""
pet = pd.read_csv("/home/audrey/IE/ChIA_PET_network/data/interactions_files/chia_pet/GSM1517080.bed",sep="\t", header=None)
pet = pet[[3]].drop_duplicates()
pet = pet.iloc[:, 0].str.split(r"-|,", expand=True)
pet.columns = ["anchor1", "anchor2", "weight"]
pet["id"] = pet.apply(get_id_col, axis=1)
pet = pet[["weight", "id"]].groupby(["id"]).sum().reset_index(drop=False)
pet = pet["id"] + "$" + pet["weight"]
pet = pet.str.split("$", expand=True)
pet.columns = ["anchor1", "anchor2", "weight"]
return pet
def del_overlaps():
"""
This works on the previous dataframe result (pet). Input format is:
#chr1:start1..end1 chr2:start2..end2 weight1-2
We delete from this dataframe the pet that has overlapping anchors, e.g.
9:139773532..139778733 9:139778161..139781850 7
:return:
"""
pet = work_on_pet()
pet[["chr1", "start1", "space1", "end1"]] = pet["anchor1"].str.\
split(r"[:..]", expand=True)
pet[["chr2", "start2", "space2", "end2"]] = pet["anchor2"].str.\
split(r"[:..]", expand=True)
pet = pet.drop(["anchor1", "anchor2", "space1", "space2"], axis=1)
pet.loc[pet["chr1"] != pet["chr2"], "delete"] = "no"
pet.loc[(pet["chr1"] == pet["chr2"]) & ((pet["start1"].astype(int) >=
pet["start2"].astype(int)) &
(pet["start1"].astype(int) <=
pet["end2"].astype(int)) |
(pet["end1"].astype(int) >=
pet["start2"].astype(int)) &
(pet["end1"].astype(int) <=
pet["end2"].astype(int))),
"delete"] = "yes"
to_del = pet[pet.delete == "yes"].index.tolist()
pet = pet.drop(to_del)
pet["anchor1"] = pet["chr1"] + ":" + pet["start1"] + ".." + pet["end1"]
pet["anchor2"] = pet["chr2"] + ":" + pet["start2"] + ".." + pet["end2"]
pet.drop(["chr1", "start1", "end1", "chr2", "start2", "end2", "delete"],
inplace=True, axis=1)
pet = pet[["anchor1", "anchor2", "weight"]]
return pet
def work_on_intersection():
"""
This works on the output file produced by the script bedtools.py. The input
format is (see the description of this script for more information), e.g.
18 28681 28882 1_1 0 - 18 28682 28782 1:47797..47799-18:28682..28782,2 2 .
100
We want the following output format:
1_1 18:28682..28782 --> which is the ID of the genomic region, e.g. exon
and the coordinates of the anchor matching to this genomic region.
:return:
"""
inter = pd.read_csv("/home/audrey/IE/ChIA_PET_network/results/interactions/intersections_gene/gene_w200_vs_GSM1517080.bed", sep="\t", header=None)
inter = inter.iloc[:, [3, 6, 7, 8]]
inter.columns = ["id_region", "chr", "start", "end"]
inter["id_anchor"] = inter["chr"].astype("str") + ":" + inter["start"].\
astype("str") + ".." + inter["end"].astype("str")
inter.drop(["chr", "start", "end"], inplace=True, axis=1)
return inter
def interactions():
"""
Allows to determine which couples of genomic regions interact, according to
what weight.
#id_region_a1 id_anchor_a1 id_region_a2 id_anchor_a2 weight
7832 10:10019900..10020058 16755 11:5834473..5834631 2
It means that gene 7832 interacts with gene 16755, according to a weight of
2.
:return:
"""
pet = del_overlaps()
df_final = pd.DataFrame()
for index, row in pet.iterrows():
if index == 50:
break
inter = work_on_intersection()
match_a = inter.loc[inter["id_anchor"] == row[0], :]
match_b = inter.loc[inter["id_anchor"] == row[1], :]
table_a = pd.DataFrame({"id_region": match_a["id_region"],
"id_anchor": match_a["id_anchor"],
"number": [index] * len(match_a)})
table_b = pd.DataFrame({"id_region": match_b["id_region"],
"id_anchor": match_b["id_anchor"],
"number": [index] * len(match_b)})
table_a = table_a.merge(table_b, how="outer", on="number",
suffixes=["_a1", "_a2"])
table_a.drop("number", inplace=True, axis=1)
table_a = table_a.loc[(-table_a["id_anchor_a1"].isna()) &
(-table_a["id_anchor_a2"].isna()), :]
if not table_a.empty:
df_final = pd.concat([df_final, table_a], axis=0,
ignore_index=True)
df_final = df_final.merge(pet, how="left",
left_on=["id_anchor_a1", "id_anchor_a2"],
right_on=["anchor1", "anchor2"])
df_final.drop(["anchor1", "anchor2"], axis=1, inplace=True)
df_final.rename(columns={2: "weight"}, inplace=True)
return df_final
def add_level():
"""
Add the level to the previous dataframe result (df_final):
- intrachromosomique: the two genomic regions that interact are in the same
chromosome
- interchromosomique: the two genomic regions that interact are in
different chromosomes
#id_region_a1 id_anchor_a1 id_region_a2 id_anchor_a2 weight level
7832 10:10019900..10020058 16755 11:5834473..5834631 2 interchromosomique
:return:
"""
df_level = interactions()
df_level[["chr_a1", "coordinates_a1"]] = df_level.id_anchor_a1.str.\
split(":", expand=True)
df_level[["chr_a2", "coordinates_a2"]] = df_level.id_anchor_a2.str.\
split(":", expand=True)
df_level.loc[df_level["chr_a1"] == df_level["chr_a2"], "level"] = \
"intrachromosomique"
df_level.loc[df_level["chr_a1"] != df_level["chr_a2"], "level"] = \
"interchromosomique"
df_level.drop(["chr_a1", "coordinates_a1", "chr_a2", "coordinates_a2"],
axis="columns", inplace=True)
return df_level
def filtering_1():
"""
Filtering of the previous dataframe result (df_level) by removing:
- the genomic regions that interact with itself, e.g.
#id_region_a1 id_anchor_a1 id_region_a2 id_anchor_a2 weight level
7832 10:10019900..10020058 7832 10:10019900..10020088 2 intrachromosomique
:return:
"""
df_filter_1 = add_level()
df_filter_1.loc[df_filter_1["id_region_a1"] == df_filter_1["id_region_a2"],
"identical_or_not"] = "identical"
df_filter_1.loc[df_filter_1["id_region_a1"] != df_filter_1["id_region_a2"],
"identical_or_not"] = "not"
to_del = df_filter_1[df_filter_1.identical_or_not == "identical"].index.\
tolist()
df_filter_1 = df_filter_1.drop(to_del)
del df_filter_1["identical_or_not"]
return df_filter_1
def filtering_2():
"""
Filtering of the previous dataframe result (df_filter_1) by adding:
- the weights of the interactions that describe the same interaction, e.g.
#id_region_a1 id_anchor_a1 id_region_a2 id_anchor_a2 weight level
7832 10:10019900..10020058 16755 11:5834473..5834631 2 interchromosomique
7832 10:10019900..10020088 16755 11:5834422..5834625 2 interchromosomique
--> #id_region_a1 id_region_a2 weight level
--> 7832 16755 4 interchromosomique
:return:
"""
df_filter_2 = filtering_1()
df_filter_2 = df_filter_2.drop_duplicates()
df_filter_2["id"] = df_filter_2["id_region_a1"].astype(str) + "$" + \
df_filter_2["id_region_a2"].astype(str)
df_filter_2.drop(["id_anchor_a1", "id_anchor_a2", "id_region_a1",
"id_region_a2"], axis="columns", inplace=True)
df_filter_2["weight"] = df_filter_2["weight"].astype(int)
df_filter_2 = df_filter_2[["weight", "id", "level"]].groupby(
["id", "level"]).sum().reset_index(drop=False)
df_filter_2[["id_region_a1", "id_region_a2"]] = df_filter_2.id.str.\
split("$", expand=True)
del df_filter_2["id"]
print(df_filter_2)
print("End:", str(datetime.now()))
return df_filter_2
if __name__ == "__main__":
filtering_2()
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment