diff --git a/bin/hicstuff_bam2pairs.py b/bin/hicstuff_bam2pairs.py
new file mode 100755
index 0000000000000000000000000000000000000000..6697197d3d71d1787948a693bd0dbe75c4c75530
--- /dev/null
+++ b/bin/hicstuff_bam2pairs.py
@@ -0,0 +1,190 @@
+#!/usr/bin/env python
+
+import os, csv, re
+import argparse
+import itertools
+import pysam as ps
+import pandas as pd
+from hicstuff_log import logger
+import hicstuff_io as hio
+import hicstuff_digest as hcd
+from Bio import SeqIO
+
+
+def bam2pairs(bam1, bam2, out_pairs, info_contigs, min_qual=30):
+    """
+    Make a .pairs file from two Hi-C BAM files sorted by read name.
+    Hi-C mates are matched by read identifier. Pairs where at least one
+    read maps with MAPQ below the min_qual threshold are discarded. Pairs
+    are sorted by readID and stored in the upper triangle (mates are
+    flipped so that the first coordinate is always the smaller one).
+
+    Parameters
+    ----------
+    bam1 : str
+        Path to the name-sorted BAM file with aligned Hi-C forward reads.
+    bam2 : str
+        Path to the name-sorted BAM file with aligned Hi-C reverse reads.
+    out_pairs : str
+        Path to the output tab-separated .pairs file with columns
+        readID, chr1, pos1, chr2, pos2, strand1, strand2.
+    info_contigs : str
+        Path to the info contigs file, used for chromosome sizes and order.
+    min_qual : int
+        Minimum mapping quality required to keep a Hi-C pair.
+    """
+    forward = ps.AlignmentFile(bam1, "rb")
+    reverse = ps.AlignmentFile(bam2, "rb")
+
+    # Generate header lines
+    format_version = "## pairs format v1.0\n"
+    sorting = "#sorted: readID\n"
+    cols = "#columns: readID chr1 pos1 chr2 pos2 strand1 strand2\n"
+    # Chromosome order will be identical in info_contigs and pairs files
+    chroms = pd.read_csv(info_contigs, sep="\t").apply(
+        lambda x: "#chromsize: %s %d\n" % (x.contig, x.length), axis=1
+    )
+    with open(out_pairs, "w") as pairs:
+        pairs.writelines([format_version, sorting, cols] + chroms.tolist())
+        pairs_writer = csv.writer(pairs, delimiter="\t")
+        n_reads = {"total": 0, "mapped": 0}
+        # Remember if some read IDs were missing from either file
+        unmatched_reads = 0
+        # Remember if all reads in one BAM file have been read
+        exhausted = [False, False]
+        # Iterate on both BAMs simultaneously
+        end_regex = re.compile(r"/[12]$")
+        for end1, end2 in itertools.zip_longest(forward, reverse):
+            # Remove the end-specific suffix (/1 or /2) if any; guard against
+            # None, which zip_longest yields once one file is exhausted
+            if end1 is not None:
+                end1.query_name = re.sub(end_regex, "", end1.query_name)
+            if end2 is not None:
+                end2.query_name = re.sub(end_regex, "", end2.query_name)
+            # Check if reads pass the mapping quality filter
+            try:
+                end1_passed = end1.mapping_quality >= min_qual
+            # Happens if the end1 BAM file has been exhausted
+            except AttributeError:
+                exhausted[0] = True
+                end1_passed = False
+            try:
+                end2_passed = end2.mapping_quality >= min_qual
+            # Happens if the end2 BAM file has been exhausted
+            except AttributeError:
+                exhausted[1] = True
+                end2_passed = False
+            # Skip unmatched reads until the mates match or one file
+            # has been exhausted
+            while sum(exhausted) == 0 and end1.query_name != end2.query_name:
+                # Fetch the next read, strip its suffix and check the
+                # filter again; count the single-read iteration
+                unmatched_reads += 1
+                n_reads["total"] += 1
+                if end1.query_name < end2.query_name:
+                    try:
+                        end1 = next(forward)
+                        end1.query_name = re.sub(end_regex, "", end1.query_name)
+                        end1_passed = end1.mapping_quality >= min_qual
+                    # If EOF is reached in BAM 1
+                    except (StopIteration, AttributeError):
+                        exhausted[0] = True
+                        end1_passed = False
+                    n_reads["mapped"] += end1_passed
+                elif end1.query_name > end2.query_name:
+                    try:
+                        end2 = next(reverse)
+                        end2.query_name = re.sub(end_regex, "", end2.query_name)
+                        end2_passed = end2.mapping_quality >= min_qual
+                    # If EOF is reached in BAM 2
+                    except (StopIteration, AttributeError):
+                        exhausted[1] = True
+                        end2_passed = False
+                    n_reads["mapped"] += end2_passed
+
+            # 2 reads processed per iteration, unless one file is exhausted
+            n_reads["total"] += 2 - sum(exhausted)
+            n_reads["mapped"] += sum([end1_passed, end2_passed])
+            # Keep only pairs where both reads have good quality
+            if end1_passed and end2_passed:
+                # Flip mates to get the upper triangle
+                if (
+                    end1.reference_id == end2.reference_id
+                    and end1.reference_start > end2.reference_start
+                ) or end1.reference_id > end2.reference_id:
+                    end1, end2 = end2, end1
+                pairs_writer.writerow(
+                    [
+                        end1.query_name,
+                        end1.reference_name,
+                        end1.reference_start + 1,
+                        end2.reference_name,
+                        end2.reference_start + 1,
+                        "-" if end1.is_reverse else "+",
+                        "-" if end2.is_reverse else "+",
+                    ]
+                )
+    if unmatched_reads > 0:
+        logger.warning(
+            "%d reads were only present in one BAM file. Make sure you sorted reads by name before running the pipeline.",
+            unmatched_reads,
+        )
+    logger.info(
+        "{perc_map}% reads (single ends) mapped with Q >= {qual} ({mapped}/{total})".format(
+            total=n_reads["total"],
+            mapped=n_reads["mapped"],
+            perc_map=round(100 * n_reads["mapped"] / n_reads["total"]),
+            qual=min_qual,
+        )
+    )
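+
+# For reference, a minimal sketch of the output this function writes. The
+# header lines mirror the code above; the contig names, read IDs and
+# coordinates are invented for illustration (data columns are tab-separated
+# in the real file):
+#
+#   ## pairs format v1.0
+#   #sorted: readID
+#   #columns: readID chr1 pos1 chr2 pos2 strand1 strand2
+#   #chromsize: seq1 60000
+#   #chromsize: seq2 20000
+#   read_0001  seq1  1024  seq1  30522  +  -
+#   read_0002  seq1  4891  seq2  11712  -  -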
+
+
+if __name__ == "__main__":
+    parser = argparse.ArgumentParser()
+    parser.add_argument("-b1", "--bam1")
+    parser.add_argument("-b2", "--bam2")
+    parser.add_argument("-o", "--out_pairs")
+    parser.add_argument("-x", "--out_idx")
+    parser.add_argument("-i", "--info_contigs")
+    parser.add_argument("-q", "--min_qual")
+    parser.add_argument("-e", "--enzyme")
+    parser.add_argument("-f", "--fasta")
+    args = parser.parse_args()
+
+    bam1 = args.bam1
+    bam2 = args.bam2
+    out_pairs = args.out_pairs
+    out_idx = args.out_idx
+    info_contigs = args.info_contigs
+    min_qual = int(args.min_qual)
+    enzyme = args.enzyme
+    fasta = args.fasta
+
+    # hicstuff enzyme names are case sensitive: map the lower-cased values
+    # passed by the pipeline back to their canonical spelling
+    enzyme_casing = {
+        "hindiii": "HindIII",
+        "dpnii": "DpnII",
+        "bglii": "BglII",
+        "mboi": "MboI",
+        "arima": "Arima",
+    }
+    enzyme = enzyme_casing.get(enzyme, enzyme)
+
+    bam2pairs(bam1, bam2, out_pairs, info_contigs, min_qual)
+
+    restrict_table = {}
+    for record in SeqIO.parse(hio.read_compressed(fasta), "fasta"):
+        # Get the restriction table of the current chromosome
+        restrict_table[record.id] = hcd.get_restriction_table(
+            record.seq, enzyme, circular=False
+        )
+
+    hcd.attribute_fragments(out_pairs, out_idx, restrict_table)
+
+    hio.sort_pairs(
+        out_idx,
+        out_idx + ".sorted",
+        keys=["chr1", "pos1", "chr2", "pos2"],
+        threads=1,
+        tmp_dir=None,
+    )
+    os.rename(out_idx + ".sorted", out_idx)
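+
+    # Example invocation (a sketch; file names are placeholders):
+    #   hicstuff_bam2pairs.py -b1 fwd.bam -b2 rev.bam -o valid.pairs \
+    #       -x valid_idx.pairs -i info_contigs.txt -q 30 -e DpnII -f genome.fa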
diff --git a/conf/hicstuff.config b/conf/hicstuff.config
index f26ae409c03eff638d84ca85a26367592dce3e69..1d23768f7ed5cd5ab53b203d9ea83e31b38453f2 100644
--- a/conf/hicstuff.config
+++ b/conf/hicstuff.config
@@ -8,6 +8,10 @@ process {
 
     withName: 'BOWTIE2_ALIGNMENT' {
         ext.prefix = { "${meta.id}_${meta.chunk}_${meta.mates}" }
+        publishDir = [
+            path: { "${params.outdir}/hicstuff/reads" },
+            mode: 'copy'
+        ]
     }
 
     withName: 'FRAGMENT_ENZYME' {
@@ -17,11 +21,18 @@ process {
         ]
     }
 
+    withName: 'BAM2PAIRS' {
+        publishDir = [
+            path: { "${params.outdir}/hicstuff/pairs" },
+            mode: 'copy'
+        ]
+    }
+
     //**********************************************
     // PREPARE_GENOME
     withName: 'BOWTIE2_BUILD' {
         publishDir = [
-            path: { "${params.outdir}/genome/bowtie2" },
+            path: { "${params.outdir}/genome/bowtie2_index" },
             mode: 'copy'
         ]
     }
@@ -133,7 +144,7 @@ params {
     multiqc_methods_description = null
 
     // Boilerplate options
-    outdir = './results'
+    outdir = null
     tracedir = "${params.outdir}/pipeline_info"
    publish_dir_mode = 'copy'
     email = null
diff --git a/modules/local/hicstuff/bam2pairs.nf b/modules/local/hicstuff/bam2pairs.nf
new file mode 100644
index 0000000000000000000000000000000000000000..4908e9f2fa9667ae77b0f381f43de09f38c7dcf4
--- /dev/null
+++ b/modules/local/hicstuff/bam2pairs.nf
@@ -0,0 +1,23 @@
+process BAM2PAIRS {
+    tag "$info_contigs"
+    label 'process_high'
+
+    conda "conda-forge::python=3.9 conda-forge::biopython=1.80 conda-forge::numpy=1.22.3 conda-forge::matplotlib=3.6.3 conda-forge::pandas=1.5.3"
+    container "lbmc/hicstuff:3.1.3"
+
+    input:
+    val(bam)
+    tuple val(meta), path(info_contigs)
+    val(digestion)
+    tuple val(meta2), path(fasta)
+
+    output:
+    tuple val(meta), path("valid.pairs"), emit: valid_pairs
+    tuple val(meta), path("valid_idx.pairs"), emit: idx_pairs
+
+    script:
+    """
+    hicstuff_bam2pairs.py -b1 ${bam[1]} -b2 ${bam[3]} -o valid.pairs -x valid_idx.pairs -i ${info_contigs} -q 30 -e ${digestion} -f ${fasta}
+    """
+}
diff --git a/subworkflows/local/hicstuff_sub.nf b/subworkflows/local/hicstuff_sub.nf
index 99516b3a8e1818468cd4f62d82bac8cfc143fa68..25f311879cb9fab9cb4e5f24e15c04cefa2d6044 100644
--- a/subworkflows/local/hicstuff_sub.nf
+++ b/subworkflows/local/hicstuff_sub.nf
@@ -1,6 +1,7 @@
 
 include { BOWTIE2_ALIGNMENT } from '../../modules/local/hicstuff/align_bowtie2'
 include { FRAGMENT_ENZYME } from '../../modules/local/hicstuff/fragment_enzyme'
+include { BAM2PAIRS } from '../../modules/local/hicstuff/bam2pairs'
 
 // Paired-end to Single-end
 def pairToSingle(row, mates) {
@@ -48,7 +49,10 @@ workflow HICSTUFF_SUB {
         fasta
     )
 
-    emit:
-    info_contigs = FRAGMENT_ENZYME.out.info_contigs
-    fragments_list = FRAGMENT_ENZYME.out.fragments_list
+    BAM2PAIRS(
+        BOWTIE2_ALIGNMENT.out.bam.collect(),
+        FRAGMENT_ENZYME.out.info_contigs,
+        digestion,
+        fasta
+    )
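+
+    // Note: collect() flattens each (meta, bam) tuple, so BAM2PAIRS receives
+    // a flat list [meta, bam_fwd, meta, bam_rev]; the module's bam[1]/bam[3]
+    // indexing assumes exactly two mate BAMs arriving in this order.
 }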