diff --git a/src/find_interaction_cluster/tad_to_communities/tad_to_communities.py b/src/find_interaction_cluster/tad_to_communities/tad_to_communities.py index 2735d158d04e5db0827ae8eb3a16c039b4dabc09..ef51ec27243ba9b290aaeed168cc5562a8b55acc 100644 --- a/src/find_interaction_cluster/tad_to_communities/tad_to_communities.py +++ b/src/find_interaction_cluster/tad_to_communities/tad_to_communities.py @@ -26,9 +26,11 @@ import subprocess from .config import Config import os import pandas as pd +from pathlib import Path +from typing import Dict, List -def launch_bedtools_intersect(): +def launch_bedtools_intersect() -> None: """ Launch a bedtools intersect command, between a BED file with TAD annotations and a BED file with fasterDB annotation of genes, in order to @@ -46,7 +48,27 @@ def launch_bedtools_intersect(): stderr=subprocess.STDOUT) -def obtain_tad_to_genes(my_file: str): +def targeted_bedtools_intersect(tad_file: Path, feature: str = "exon") -> Path: + """ + Launch a bedtools intersect command, between a BED file with TAD + annotations and a BED file with fasterDB annotation of genes or genes or \ + exons. + + :param tad_file: A bed file containing TAD + :param feature: The kind of feature to analyse + :return: The bed file created by bedtools + """ + Config.intersect_tad_gene.mkdir(exist_ok=True, parents=True) + target = Config.target_gene if feature == "gene" else Config.target_exon + basename_tad = tad_file.name.replace(".bed", "") + outfile = Config.intersect_tad_gene / f"{basename_tad}_{feature}.bed" + cmd = f"bedtools intersect -F 0.5 -a {tad_file} -b {target} " \ + f"-wa -wb > {outfile}" + subprocess.check_call(cmd, shell=True, stderr=subprocess.STDOUT) + return outfile + + +def obtain_tad_to_genes(my_file: str) -> Dict[str, List]: """ Allows to obtain a dictionary with this format: tad_to_gene[TAD_ID] = [list_of_TAD_genes], @@ -58,19 +80,18 @@ def obtain_tad_to_genes(my_file: str): e.g.: tad_to_gene[TAD_Stein_MCF7_1] = [13028, 13038] """ tad_to_gene = {} - my_file_t = str(Config.intersect_tad_gene) + "/" + my_file - with open(my_file_t, "r") as my_file: + with open(my_file, "r") as my_file: for line in my_file: lclean = line.strip() elmt = lclean.split("\t") if elmt[3] not in tad_to_gene: tad_to_gene[elmt[3]] = [elmt[-3]] - elif elmt[3] in tad_to_gene: + else: tad_to_gene[elmt[3]] += [elmt[-3]] return tad_to_gene -def obtain_final_df(tad_to_gene: dict, my_file: str): +def obtain_final_df(tad_to_gene: dict, my_file: str, outfile: str) -> None: """ Allows to obtain the output file, with this format: community nodes genes project, e.g.: @@ -82,20 +103,21 @@ def obtain_final_df(tad_to_gene: dict, my_file: str): :param tad_to_gene: a dictionary with this format: tad_to_gene[TAD_ID] = [list_of_TAD_genes], see obtain_tad_to_genes() :param my_file: the file obtain as result of launch_bedtools_intersect() + :param outfile: The output file """ - content = [] - for key, value in tad_to_gene.items(): - content.append([key, ", ".join(value), len(value)]) + content = [ + [key, ", ".join(value), len(value)] + for key, value in tad_to_gene.items() + ] + df = pd.DataFrame(content, columns=["community", "genes", "nodes"]) df["project"] = my_file df = df[["community", "nodes", "genes", "project"]] Config.tad_communities.mkdir(exist_ok=True, parents=True) - my_file_out = str(Config.tad_communities) + "/communities_" + \ - my_file.replace("_gene.bed", ".txt") - df.to_csv(my_file_out, sep='\t', index=False) + df.to_csv(outfile, sep='\t', index=False) -def main_tad_to_communities(): +def main_tad_to_communities() -> None: """ Allows to launch all the functions of tad_to_communities module: launch_bedtools_intersect() - obtain_tad_to_genes() and obtain_final_df() @@ -106,6 +128,46 @@ def main_tad_to_communities(): results = os.listdir(Config.intersect_tad_gene) for my_file in results: print("Produce output file for:", my_file) - tad_to_gene = obtain_tad_to_genes(my_file) - obtain_final_df(tad_to_gene, my_file) + tad_to_gene = obtain_tad_to_genes(str(Config.intersect_tad_gene) + + "/" + my_file) + outfile = str(Config.tad_communities) + "/communities_" + \ + my_file.replace("_gene.bed", ".txt") + obtain_final_df(tad_to_gene, my_file, outfile) print("Results are in:", Config.tad_communities) + + +def removing_chr(tad_file: Path) -> Path: + """ + Remove the heading chr in the first column of the bed file \ + if it is present + :param tad_file: A bed file containing tad + :return: The file containing the tad but without 'chr' + """ + + df = pd.read_csv(tad_file, header=None, sep="\t") + df[0] = df[0].str.replace("chr", "") + outfile = Config.intersect_tad_gene / tad_file.name + df.to_csv(outfile, sep="\t", header=False, index=False) + return outfile + + +def targeted_tad_to_communities(tad_file: str, outfile: str, feature: str + ) -> None: + """ + Create a community file from a bed file containing TADs. + + :param tad_file: A bed file containing TAD + :param outfile: The resulting community file + :param feature: The kind of feature to analyse + """ + outfile = Path(outfile) + if not outfile.is_file(): + tad_file = removing_chr(Path(tad_file)) + bed_out = targeted_bedtools_intersect(tad_file, feature) + dic_gene = obtain_tad_to_genes(str(bed_out)) + obtain_final_df(dic_gene, bed_out.name.replace(".bed", ""), + str(outfile)) + bed_out.unlink() + tad_file.unlink() + else: + print(f"{outfile} already exists !") \ No newline at end of file