tad_to_communities/tad_to_communities.py: initial commit

19988eab · alapendr · 7a13ea5a · 19988eab
Commit 19988eab authored 4 years ago by alapendr
--- a/src/find_interaction_cluster/tad_to_communities/tad_to_communities.py
+++ b/src/find_interaction_cluster/tad_to_communities/tad_to_communities.py
+#!/usr/bin/env python3
+
+# -*- coding: utf-8 -*-
+
+
+"""
+Use as input:
+. BED files with TAD annotations:
+- HiC_TADs_Stein-MCF7-WT.hg19.nochr.bed
+- ISO_Costantini.lifted.hg19.nochr.bed
+- K562_Lieberman-raw_TADs.hg19.nochr.bed
+. BED file with fasterDB annotation of genes
+
+Then a bedtools intersect command is use to obtain the list of annotated genes
+present in each TAD.
+
+Finally the data ouput format is:
+community   nodes   genes   project, e.g.:
+TAD_Stein_MCF7_1	2	13028, 13038	HiC_TADs_Stein-MCF7-WT_gene.bed
+Community = the TAD ID - Nodes = the number of TAD genes
+Genes = the list of TAD genes - Project = BED file name where the TAD was
+identified
+"""
+
+import subprocess
+from .config import Config
+import os
+import pandas as pd
+
+
+def launch_bedtools_intersect():
+    """
+    Launch a bedtools intersect command, between a BED file with TAD
+    annotations and a BED file with fasterDB annotation of genes, in order to
+    obtain a file with the list of genes that are matching with TAD. This file
+    is generated for each TAD annotations analyzed.
+    """
+    Config.intersect_tad_gene.mkdir(exist_ok=True, parents=True)
+    cmd = f"for tad in `ls {Config.tad}/*` ; " \
+          f"do basename_tad=`basename $tad .hg19.nochr.bed` ; " \
+          f"basename_target=`basename {Config.target_gene} .bed` ; " \
+          f"bedtools intersect -F 0.5 -a $tad -b {Config.target_gene} " \
+          f"-wa -wb > {Config.intersect_tad_gene}/" \
+          "${basename_tad}_${basename_target}.bed ; done"
+    subprocess.check_call(cmd, shell=True,
+                          stderr=subprocess.STDOUT)
+
+
+def obtain_tad_to_genes(my_file: str):
+    """
+    Allows to obtain a dictionary with this format:
+    tad_to_gene[TAD_ID] = [list_of_TAD_genes],
+    e.g.: tad_to_gene[TAD_Stein_MCF7_1] = [13028, 13038]
+
+    :param my_file: the file obtain as result of launch_bedtools_intersect()
+    :return tad_to_gene: a dictionary with this format:
+    tad_to_gene[TAD_ID] = [list_of_TAD_genes],
+    e.g.: tad_to_gene[TAD_Stein_MCF7_1] = [13028, 13038]
+    """
+    tad_to_gene = {}
+    my_file_t = str(Config.intersect_tad_gene) + "/" + my_file
+    with open(my_file_t, "r") as my_file:
+        for line in my_file:
+            lclean = line.strip()
+            elmt = lclean.split("\t")
+            if elmt[3] not in tad_to_gene:
+                tad_to_gene[elmt[3]] = [elmt[-3]]
+            elif elmt[3] in tad_to_gene:
+                tad_to_gene[elmt[3]] += [elmt[-3]]
+    return tad_to_gene
+
+
+def obtain_final_df(tad_to_gene: dict, my_file: str):
+    """
+    Allows to obtain the output file, with this format:
+    community   nodes   genes   project, e.g.:
+    TAD_Stein_MCF7_1	2	13028, 13038	HiC_TADs_Stein-MCF7-WT_gene.bed
+    Community = the TAD ID - Nodes = the number of TAD genes
+    Genes = the list of TAD genes - Project = BED file name where the TAD was
+    identified
+
+    :param tad_to_gene: a dictionary with this format:
+    tad_to_gene[TAD_ID] = [list_of_TAD_genes], see obtain_tad_to_genes()
+    :param my_file: the file obtain as result of launch_bedtools_intersect()
+    """
+    content = []
+    for key, value in tad_to_gene.items():
+        content.append([key, ", ".join(value), len(value)])
+    df = pd.DataFrame(content, columns=["community", "genes", "nodes"])
+    df["project"] = my_file
+    df = df[["community", "nodes", "genes", "project"]]
+    Config.tad_communities.mkdir(exist_ok=True, parents=True)
+    my_file_out = str(Config.tad_communities) + "/communities_" + \
+        my_file.replace("_gene.bed", ".txt")
+    df.to_csv(my_file_out, sep='\t', index=False)
+
+
+def main_tad_to_communities():
+    """
+    Allows to launch all the functions of tad_to_communities module:
+    launch_bedtools_intersect() - obtain_tad_to_genes() and obtain_final_df()
+    :return:
+    """
+    print("Bedtools intersect step")
+    launch_bedtools_intersect()
+    results = os.listdir(Config.intersect_tad_gene)
+    for my_file in results:
+        print("Produce output file for:", my_file)
+        tad_to_gene = obtain_tad_to_genes(my_file)
+        obtain_final_df(tad_to_gene, my_file)
+    print("Results are in:", Config.tad_communities)