Skip to content
Snippets Groups Projects
Commit 19988eab authored by alapendr's avatar alapendr
Browse files

tad_to_communities/tad_to_communities.py: initial commit

parent 7a13ea5a
Branches
No related tags found
No related merge requests found
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Use as input:
. BED files with TAD annotations:
- HiC_TADs_Stein-MCF7-WT.hg19.nochr.bed
- ISO_Costantini.lifted.hg19.nochr.bed
- K562_Lieberman-raw_TADs.hg19.nochr.bed
. BED file with fasterDB annotation of genes
Then a bedtools intersect command is use to obtain the list of annotated genes
present in each TAD.
Finally the data ouput format is:
community nodes genes project, e.g.:
TAD_Stein_MCF7_1 2 13028, 13038 HiC_TADs_Stein-MCF7-WT_gene.bed
Community = the TAD ID - Nodes = the number of TAD genes
Genes = the list of TAD genes - Project = BED file name where the TAD was
identified
"""
import subprocess
from .config import Config
import os
import pandas as pd
def launch_bedtools_intersect():
"""
Launch a bedtools intersect command, between a BED file with TAD
annotations and a BED file with fasterDB annotation of genes, in order to
obtain a file with the list of genes that are matching with TAD. This file
is generated for each TAD annotations analyzed.
"""
Config.intersect_tad_gene.mkdir(exist_ok=True, parents=True)
cmd = f"for tad in `ls {Config.tad}/*` ; " \
f"do basename_tad=`basename $tad .hg19.nochr.bed` ; " \
f"basename_target=`basename {Config.target_gene} .bed` ; " \
f"bedtools intersect -F 0.5 -a $tad -b {Config.target_gene} " \
f"-wa -wb > {Config.intersect_tad_gene}/" \
"${basename_tad}_${basename_target}.bed ; done"
subprocess.check_call(cmd, shell=True,
stderr=subprocess.STDOUT)
def obtain_tad_to_genes(my_file: str):
"""
Allows to obtain a dictionary with this format:
tad_to_gene[TAD_ID] = [list_of_TAD_genes],
e.g.: tad_to_gene[TAD_Stein_MCF7_1] = [13028, 13038]
:param my_file: the file obtain as result of launch_bedtools_intersect()
:return tad_to_gene: a dictionary with this format:
tad_to_gene[TAD_ID] = [list_of_TAD_genes],
e.g.: tad_to_gene[TAD_Stein_MCF7_1] = [13028, 13038]
"""
tad_to_gene = {}
my_file_t = str(Config.intersect_tad_gene) + "/" + my_file
with open(my_file_t, "r") as my_file:
for line in my_file:
lclean = line.strip()
elmt = lclean.split("\t")
if elmt[3] not in tad_to_gene:
tad_to_gene[elmt[3]] = [elmt[-3]]
elif elmt[3] in tad_to_gene:
tad_to_gene[elmt[3]] += [elmt[-3]]
return tad_to_gene
def obtain_final_df(tad_to_gene: dict, my_file: str):
"""
Allows to obtain the output file, with this format:
community nodes genes project, e.g.:
TAD_Stein_MCF7_1 2 13028, 13038 HiC_TADs_Stein-MCF7-WT_gene.bed
Community = the TAD ID - Nodes = the number of TAD genes
Genes = the list of TAD genes - Project = BED file name where the TAD was
identified
:param tad_to_gene: a dictionary with this format:
tad_to_gene[TAD_ID] = [list_of_TAD_genes], see obtain_tad_to_genes()
:param my_file: the file obtain as result of launch_bedtools_intersect()
"""
content = []
for key, value in tad_to_gene.items():
content.append([key, ", ".join(value), len(value)])
df = pd.DataFrame(content, columns=["community", "genes", "nodes"])
df["project"] = my_file
df = df[["community", "nodes", "genes", "project"]]
Config.tad_communities.mkdir(exist_ok=True, parents=True)
my_file_out = str(Config.tad_communities) + "/communities_" + \
my_file.replace("_gene.bed", ".txt")
df.to_csv(my_file_out, sep='\t', index=False)
def main_tad_to_communities():
"""
Allows to launch all the functions of tad_to_communities module:
launch_bedtools_intersect() - obtain_tad_to_genes() and obtain_final_df()
:return:
"""
print("Bedtools intersect step")
launch_bedtools_intersect()
results = os.listdir(Config.intersect_tad_gene)
for my_file in results:
print("Produce output file for:", my_file)
tad_to_gene = obtain_tad_to_genes(my_file)
obtain_final_df(tad_to_gene, my_file)
print("Results are in:", Config.tad_communities)
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Please register or to comment