diff --git a/src/hic_TAD_caller/create_TAD_HiC_files/calling_tads.py b/src/hic_TAD_caller/create_TAD_HiC_files/calling_tads.py index 1c7750ddc6ae8b40183b27393b730ae6a0ad300d..6b5a45189997b195c6a9ce2fd43975fe13117192 100644 --- a/src/hic_TAD_caller/create_TAD_HiC_files/calling_tads.py +++ b/src/hic_TAD_caller/create_TAD_HiC_files/calling_tads.py @@ -136,13 +136,13 @@ def check_n_save_bed(bed_file: Path, name_file: str, resolution: int) -> Path: :return: The path containing the bed file updated. """ data = pd.read_csv(bed_file, sep="\t") - outfile = bed_file.parent / f"{resolution}_" \ - f"{name_file.replace('.hic', '.bed')}" + outfile = bed_file.parent / \ + f"{resolution}_{name_file.replace('.hic', '_overlapped.bed')}" if outfile.is_file(): return outfile df = data.loc[(data["chr1"] == data["chr2"]) & - (data["x1"] == data["y1"]) & - (data["x2"] == data["y2"]), :] + (data["x1"] == data["y1"]) & + (data["x2"] == data["y2"]), :] if df.shape[0] != data.shape[0]: raise ValueError(f"The 3 first column of the bed file produced from " f"{name_file} should be equal the 3 next columns") @@ -175,6 +175,27 @@ def liftover(tad_file: Path) -> Path: return outfile +def merge_tads(tad_files: Path, name_file: str) -> Path: + """ + + :param tad_files: A simplified TAD file produced from ArrowHead + :param name_file: The file name of the hic file used to produced \ + the file containing TADs. + :return: The file containing merged TADs + """ + merged_tad = tad_files.parent / tad_files.name.replace("_overlapped.bed", + ".bed") + cmd = f"bedtools merge -i {tad_files} > {merged_tad}" + sp.check_call(cmd, shell=True) + df = pd.read_csv(merged_tad, sep="\t", names=["chr", "start", "stop"]) + new_name = name_file.replace("_grch38.hic", "") + df["name"] = [f"{new_name}_{x + 1}" for x in range(df.shape[0])] + df["score"] = ["."] * df.shape[0] + df["strand"] = ["+"] * df.shape[0] + df.to_csv(merged_tad, sep="\t", index=False, header=False) + return merged_tad + + def process_hic_files(exp_table: pd.DataFrame, resolution: int = 10000) -> None: """ @@ -196,11 +217,13 @@ def process_hic_files(exp_table: pd.DataFrame, continue logging.info(" Reformating TADs") tad_file = check_n_save_bed(tad_file, hic_file.name, resolution) + logging.info("Merging tad") + tad_file = merge_tads(tad_file, hic_file.name) logging.info(" LiftOver TADs") liftover_file = liftover(tad_file) logging.info(" Creating communities") final_file = liftover_file.parent / \ - f"communities_{liftover_file.name}" + f"communities_{liftover_file.name}" targeted_tad_to_communities(str(liftover_file), str(final_file), "gene")