diff --git a/src/find_interaction_cluster/tad_to_communities/tad_to_communities.py b/src/find_interaction_cluster/tad_to_communities/tad_to_communities.py new file mode 100644 index 0000000000000000000000000000000000000000..2735d158d04e5db0827ae8eb3a16c039b4dabc09 --- /dev/null +++ b/src/find_interaction_cluster/tad_to_communities/tad_to_communities.py @@ -0,0 +1,111 @@ +#!/usr/bin/env python3 + +# -*- coding: utf-8 -*- + + +""" +Use as input: +. BED files with TAD annotations: +- HiC_TADs_Stein-MCF7-WT.hg19.nochr.bed +- ISO_Costantini.lifted.hg19.nochr.bed +- K562_Lieberman-raw_TADs.hg19.nochr.bed +. BED file with fasterDB annotation of genes + +Then a bedtools intersect command is use to obtain the list of annotated genes +present in each TAD. + +Finally the data ouput format is: +community nodes genes project, e.g.: +TAD_Stein_MCF7_1 2 13028, 13038 HiC_TADs_Stein-MCF7-WT_gene.bed +Community = the TAD ID - Nodes = the number of TAD genes +Genes = the list of TAD genes - Project = BED file name where the TAD was +identified +""" + +import subprocess +from .config import Config +import os +import pandas as pd + + +def launch_bedtools_intersect(): + """ + Launch a bedtools intersect command, between a BED file with TAD + annotations and a BED file with fasterDB annotation of genes, in order to + obtain a file with the list of genes that are matching with TAD. This file + is generated for each TAD annotations analyzed. + """ + Config.intersect_tad_gene.mkdir(exist_ok=True, parents=True) + cmd = f"for tad in `ls {Config.tad}/*` ; " \ + f"do basename_tad=`basename $tad .hg19.nochr.bed` ; " \ + f"basename_target=`basename {Config.target_gene} .bed` ; " \ + f"bedtools intersect -F 0.5 -a $tad -b {Config.target_gene} " \ + f"-wa -wb > {Config.intersect_tad_gene}/" \ + "${basename_tad}_${basename_target}.bed ; done" + subprocess.check_call(cmd, shell=True, + stderr=subprocess.STDOUT) + + +def obtain_tad_to_genes(my_file: str): + """ + Allows to obtain a dictionary with this format: + tad_to_gene[TAD_ID] = [list_of_TAD_genes], + e.g.: tad_to_gene[TAD_Stein_MCF7_1] = [13028, 13038] + + :param my_file: the file obtain as result of launch_bedtools_intersect() + :return tad_to_gene: a dictionary with this format: + tad_to_gene[TAD_ID] = [list_of_TAD_genes], + e.g.: tad_to_gene[TAD_Stein_MCF7_1] = [13028, 13038] + """ + tad_to_gene = {} + my_file_t = str(Config.intersect_tad_gene) + "/" + my_file + with open(my_file_t, "r") as my_file: + for line in my_file: + lclean = line.strip() + elmt = lclean.split("\t") + if elmt[3] not in tad_to_gene: + tad_to_gene[elmt[3]] = [elmt[-3]] + elif elmt[3] in tad_to_gene: + tad_to_gene[elmt[3]] += [elmt[-3]] + return tad_to_gene + + +def obtain_final_df(tad_to_gene: dict, my_file: str): + """ + Allows to obtain the output file, with this format: + community nodes genes project, e.g.: + TAD_Stein_MCF7_1 2 13028, 13038 HiC_TADs_Stein-MCF7-WT_gene.bed + Community = the TAD ID - Nodes = the number of TAD genes + Genes = the list of TAD genes - Project = BED file name where the TAD was + identified + + :param tad_to_gene: a dictionary with this format: + tad_to_gene[TAD_ID] = [list_of_TAD_genes], see obtain_tad_to_genes() + :param my_file: the file obtain as result of launch_bedtools_intersect() + """ + content = [] + for key, value in tad_to_gene.items(): + content.append([key, ", ".join(value), len(value)]) + df = pd.DataFrame(content, columns=["community", "genes", "nodes"]) + df["project"] = my_file + df = df[["community", "nodes", "genes", "project"]] + Config.tad_communities.mkdir(exist_ok=True, parents=True) + my_file_out = str(Config.tad_communities) + "/communities_" + \ + my_file.replace("_gene.bed", ".txt") + df.to_csv(my_file_out, sep='\t', index=False) + + +def main_tad_to_communities(): + """ + Allows to launch all the functions of tad_to_communities module: + launch_bedtools_intersect() - obtain_tad_to_genes() and obtain_final_df() + :return: + """ + print("Bedtools intersect step") + launch_bedtools_intersect() + results = os.listdir(Config.intersect_tad_gene) + for my_file in results: + print("Produce output file for:", my_file) + tad_to_gene = obtain_tad_to_genes(my_file) + obtain_final_df(tad_to_gene, my_file) + print("Results are in:", Config.tad_communities)