From 47784de874b3e512a29c2225a40bba9a2597eb0f Mon Sep 17 00:00:00 2001
From: Mia Croiset <mia.croiset@ens-lyon.fr>
Date: Tue, 30 Jan 2024 13:30:40 +0100
Subject: [PATCH] add iterative mode for alignment

---
 bin/hicstuff_iteralign.py           | 352 ++++++++++++++++++++++++++++
 conf/modules.config                 |  11 +
 modules/local/hicstuff/iteralign.nf |  34 +++
 nextflow.config                     |   6 +
 subworkflows/local/hicstuff.nf      |  27 ++-
 workflows/hic.nf                    |   3 +-
 6 files changed, 424 insertions(+), 9 deletions(-)
 create mode 100755 bin/hicstuff_iteralign.py
 create mode 100644 modules/local/hicstuff/iteralign.nf

diff --git a/bin/hicstuff_iteralign.py b/bin/hicstuff_iteralign.py
new file mode 100755
index 0000000..a16b3c8
--- /dev/null
+++ b/bin/hicstuff_iteralign.py
@@ -0,0 +1,352 @@
+#!/usr/bin/env python3
+# coding: utf-8
+"""Iterative alignment
+
+Iteratively aligns reads from a 3C fastq file: reads
+are trimmed with a range-sweeping number of basepairs
+and each read set generated this way is mapped onto
+the reference genome. This may result in a small
+increase of properly mapped reads.
+
+@author: Remi Montagne & cmdoret
+"""
+
+import os
+import sys
+import glob
+import re
+import subprocess as sp
+import pysam as ps
+import shutil as st
+import hicstuff_io as hio
+import contextlib
+import argparse
+from hicstuff_log import logger
+from os.path import join
+
+
+def iterative_align(
+    fq_in,
+    tmp_dir,
+    ref,
+    n_cpu,
+    bam_out,
+    aligner="bowtie2",
+    min_len=20,
+    min_qual=30,
+    read_len=None,
+):
+    """Iterative alignment
+
+    Aligns the reads of fq_in iteratively with bowtie2, minimap2 or bwa. Reads are
+    truncated to their first 20 nucleotides and unmapped reads are extended by 20
+    nucleotides and realigned on each iteration.
+
+    Parameters
+    ----------
+
+    fq_in : str
+        Path to input fastq file to align iteratively.
+    tmp_dir : str
+        Path where temporary files should be written.
+    ref : str
+        Path to the reference genome if Minimap2 is used for alignment.
+        Path to the genome index if Bowtie2/bwa is used for alignment.
+    n_cpu : int
+        The number of CPUs to use for the iterative alignment.
+    bam_out : str
+        Path where the final alignment should be written in BAM format.
+    aligner : str
+        Choose between minimap2, bwa or bowtie2 for the alignment.
+    min_len : int
+        The initial length of the fragments to align.
+    min_qual : int
+        Minimum mapping quality required to keep Hi-C pairs.
+    read_len : int
+        Read length in the fastq file. If set to None, the length of the first read
+        is used. Set this value to the longest read length in the file if you have
+        different read lengths.
+
+    Examples
+    --------
+
+    iterative_align(fq_in='example_for.fastq', ref='example_bt2_index', bam_out='example_for.bam', aligner="bowtie2")
+    iterative_align(fq_in='example_for.fastq', ref='example_genome.fa', bam_out='example_for.bam', aligner="minimap2")
+    """
+    # Set with the names of the unaligned reads:
+    remaining_reads = set()
+    total_reads = 0
+    # Store the path of the BAM containing aligned reads at each iteration.
+    iter_out = []
+
+    # If there is already a file with the same name as the output file,
+    # remove it. Otherwise, ignore.
+    with contextlib.suppress(FileNotFoundError):
+        try:
+            os.remove(bam_out)
+        except IsADirectoryError:
+            logger.error("You need to give the BAM output file, not a folder.")
+            raise
+
+    # Bowtie only accepts uncompressed fastq: uncompress it into a temp file
+    if aligner == "bowtie2" and hio.is_compressed(fq_in):
+        uncomp_path = join(tmp_dir, os.path.basename(fq_in) + ".tmp")
+        with hio.read_compressed(fq_in) as inf:
+            with open(uncomp_path, "w") as uncomp:
+                st.copyfileobj(inf, uncomp)
+    else:
+        uncomp_path = fq_in
+
+    # Throw an error if the index does not exist
+    index = hio.check_fasta_index(ref, mode=aligner)
+    if index is None:
+        logger.error(
+            f"Reference index is missing, please build the {aligner} index first."
+        )
+        sys.exit(1)
+    # Counting reads
+    with hio.read_compressed(uncomp_path) as inf:
+        for _ in inf:
+            total_reads += 1
+    total_reads /= 4
+
+    # Use first read to guess read length if not provided.
+    if read_len is None:
+        with hio.read_compressed(uncomp_path) as inf:
+            # Skip first line (read header)
+            size = inf.readline()
+            # Stripping newline from sequence line.
+            read_len = len(inf.readline().rstrip())
+
+    # Initial length of the fragments to align,
+    # in case reads are shorter than the provided min_len
+    if read_len > min_len:
+        n = min_len
+    else:
+        logger.warning(
+            "min_len is longer than the reads. Iterative mapping will have no effect."
+        )
+        n = read_len
+    logger.info("{0} reads to parse".format(int(total_reads)))
+
+    first_round = True
+    # Iterative alignment per se
+    while n <= read_len:
+        logger.info(
+            "Truncating unaligned reads to {size}bp and mapping{again}.".format(
+                size=int(n), again="" if first_round else " again"
+            )
+        )
+        iter_out += [join(tmp_dir, "trunc_{0}.bam".format(str(n)))]
+        # Generate a temporary input fastq file with the n first nucleotides
+        # of the reads.
+        truncated_reads = truncate_reads(
+            tmp_dir, uncomp_path, remaining_reads, n, first_round
+        )
+
+        # Align the truncated reads on the reference genome
+        temp_alignment = join(tmp_dir, "temp_alignment.bam")
+        map_args = {
+            "fa": ref,
+            "cpus": n_cpu,
+            "fq": truncated_reads,
+            "idx": index,
+            "bam": temp_alignment,
+        }
+        if re.match(r"^(minimap[2]?|mm[2]?)$", aligner, flags=re.IGNORECASE):
+            cmd = "minimap2 -x sr -a -t {cpus} {fa} {fq}".format(**map_args)
+        elif re.match(r"^(bwa)$", aligner, flags=re.IGNORECASE):
+            cmd = "bwa mem -t {cpus} -v 1 {idx} {fq}".format(**map_args)
+        elif re.match(r"^(bowtie[2]?|bt[2]?)$", aligner, flags=re.IGNORECASE):
+            cmd = (
+                "bowtie2 -x {idx} -p {cpus} --quiet --very-sensitive-local -U {fq}"
+            ).format(**map_args)
+        else:
+            raise ValueError(
+                "Unknown aligner. Select bowtie2, minimap2 or bwa."
+            )
+
+        map_process = sp.Popen(cmd, shell=True, stdout=sp.PIPE)
+        sort_process = sp.Popen(
+            "samtools sort -n -@ {cpus} -O BAM -o {bam}".format(**map_args),
+            shell=True,
+            stdin=map_process.stdout,
+        )
+        out, err = sort_process.communicate()
+
+        # Filter the reads: the reads whose truncated end was aligned are written
+        # to the output file.
+        # The reads whose truncated end was not aligned are kept for the next round.
+        remaining_reads = filter_bamfile(temp_alignment, iter_out[-1], min_qual)
+
+        n += 20
+        first_round = False
+
+    # One last round without trimming
+    logger.info(
+        "Trying to map unaligned reads at full length ({0}bp).".format(
+            int(read_len)
+        )
+    )
+
+    truncated_reads = truncate_reads(
+        tmp_dir,
+        infile=uncomp_path,
+        unaligned_set=remaining_reads,
+        trunc_len=n,
+        first_round=first_round,
+    )
+    if aligner == "minimap2" or aligner == "Minimap2":
+        cmd = "minimap2 -x sr -a -t {cpus} {fa} {fq}".format(
+            fa=ref, cpus=n_cpu, fq=truncated_reads
+        )
+    elif aligner == "bwa" or aligner == "Bwa" or aligner == "BWA":
+        cmd = "bwa mem -v 1 -t {cpus} {idx} {fq}".format(
+            idx=index, cpus=n_cpu, fq=truncated_reads
+        )
+    else:
+        cmd = (
+            "bowtie2 -x {idx} -p {cpus} --quiet " "--very-sensitive {fq}"
+        ).format(idx=index, cpus=n_cpu, fq=truncated_reads)
+    map_process = sp.Popen(cmd, shell=True, stdout=sp.PIPE)
+    # Keep reads sorted by name
+    sort_process = sp.Popen(
+        "samtools sort -n -@ {cpus} -O BAM -o {bam}".format(
+            cpus=n_cpu, bam=temp_alignment
+        ),
+        shell=True,
+        stdin=map_process.stdout,
+    )
+    out, err = sort_process.communicate()
+    iter_out += [join(tmp_dir, "trunc_{0}.bam".format(str(n)))]
+    remaining_reads = filter_bamfile(temp_alignment, iter_out[-1], min_qual)
+
+    # Report unaligned reads as well
+    iter_out += [join(tmp_dir, "unaligned.bam")]
+    temp_bam = ps.AlignmentFile(temp_alignment, "rb", check_sq=False)
+    unmapped = ps.AlignmentFile(iter_out[-1], "wb", template=temp_bam)
+    for r in temp_bam:
+        # Do not write supplementary alignments (keeping 1 alignment/read)
+        if r.query_name in remaining_reads and not r.is_supplementary:
+            unmapped.write(r)
+    unmapped.close()
+    temp_bam.close()
+
+    # Merge all aligned reads and unmapped reads into a single bam
+    ps.merge("-n", "-O", "BAM", "-@", str(n_cpu), bam_out, *iter_out)
+    logger.info(
+        "{0} reads aligned / {1} total reads.".format(
+            int(total_reads - len(remaining_reads)), int(total_reads)
+        )
+    )
+
+    return 0
+
+
+def truncate_reads(tmp_dir, infile, unaligned_set, trunc_len, first_round):
+    """Trim read ends
+
+    Writes the n first nucleotides of each sequence in infile to an auxiliary
+    file in the temporary folder.
+
+    Parameters
+    ----------
+
+    tmp_dir : str
+        Path to the temporary folder.
+    infile : str
+        Path to the fastq file to truncate.
+    unaligned_set : set
+        Contains the names of all reads that did not map unambiguously in
+        previous rounds.
+    trunc_len : int
+        The number of basepairs to keep in each truncated sequence.
+    first_round : bool
+        If this is the first round, truncate all reads without checking mapping.
+
+    Returns
+    -------
+
+    str :
+        Path to the output fastq file containing truncated reads.
+    """
+
+    outfile = "{0}/truncated.fastq".format(tmp_dir)
+    with ps.FastxFile(infile, "r") as inf, open(outfile, "w") as outf:
+        for entry in inf:
+            # If the read did not align in previous round or this is the first round
+            if (entry.name in unaligned_set) or first_round:
+                entry.sequence = entry.sequence[:trunc_len]
+                entry.quality = entry.quality[:trunc_len]
+                outf.write(str(entry) + "\n")
+    return outfile
+
+
+def filter_bamfile(temp_alignment, filtered_out, min_qual=30):
+    """Filter alignment BAM files
+
+    Reads all the reads in the input BAM alignment file.
+    Writes reads to the output file if they are aligned with a good
+    quality, otherwise adds their name to a set to stage them for the next round
+    of alignment.
+
+    Parameters
+    ----------
+    temp_alignment : str
+        Path to the input temporary alignment.
+    filtered_out : str
+        Path to the output filtered temporary alignment.
+    min_qual : int
+        Minimum mapping quality required to keep a Hi-C pair.
+
+    Returns
+    -------
+    set:
+        Contains the names of reads that did not align.
+    """
+    # Check the quality and status of each aligned fragment.
+    # Write the ones with good quality in the final output file.
+    # Keep those that do not map unambiguously for the next round.
+
+    unaligned = set()
+    temp_bam = ps.AlignmentFile(temp_alignment, "rb", check_sq=False)
+    outf = ps.AlignmentFile(filtered_out, "wb", template=temp_bam)
+    for r in temp_bam:
+        if r.flag in [0, 16] and r.mapping_quality >= min_qual:
+            outf.write(r)
+        else:
+            unaligned.add(r.query_name)
+
+    logger.info("{0} reads left to map.".format(len(unaligned)))
+    temp_bam.close()
+    outf.close()
+
+    return unaligned
+
+if __name__ == "__main__":
+    parser = argparse.ArgumentParser()
+    parser.add_argument("-r", "--reads")
+    parser.add_argument("-t", "--tmp_dir")
+    parser.add_argument("-g", "--genome")
+    parser.add_argument("-n", "--n_cpu")
+    parser.add_argument("-b", "--bam_out")
+    parser.add_argument("-m", "--min_qual")
+    parser.add_argument("-l", "--read_len")
+    args = parser.parse_args()
+
+    reads = args.reads
+    tmp_dir = args.tmp_dir
+    ref = args.genome
+    n_cpu = args.n_cpu
+    bam_out = args.bam_out
+    min_qual = int(args.min_qual)
+    read_len = args.read_len
+
+    if tmp_dir == 'None':
+        tmp_dir = hio.generate_temp_dir(tmp_dir)
+    if read_len == 'None':
+        read_len = None
+    else:
+        read_len = int(read_len)
+
+    iterative_align(fq_in=reads, tmp_dir=tmp_dir, ref=ref, n_cpu=n_cpu, bam_out=bam_out, min_qual=min_qual, read_len=read_len)
diff --git a/conf/modules.config b/conf/modules.config
index afd5c3e..e2f6984 100644
--- a/conf/modules.config
+++ b/conf/modules.config
@@ -451,4 +451,15 @@ process {
             enabled: params.save_digested
         ]
     }
+
+    withName: 'ITERALIGN' {
+        ext.prefix = { "${meta.id}_${meta.chunk}_${meta.mates}" }
+        ext.args = { [
+            " -t ${params.hicstuff_tmp_dir}",
+            " -n ${params.hicstuff_cpu}",
+            " -m ${params.hicstuff_min_qual}",
+            " -l ${params.hicstuff_read_len}"
+        ].join('').trim()
+        }
+    }
 }
diff --git a/modules/local/hicstuff/iteralign.nf b/modules/local/hicstuff/iteralign.nf
new file mode 100644
index 0000000..82ad24a
--- /dev/null
+++ b/modules/local/hicstuff/iteralign.nf
@@ -0,0 +1,34 @@
+process ITERALIGN {
+    tag "$meta.id"
+    label "process_high"
+
+    conda "conda-forge::python=3.9 conda-forge::biopython=1.80 conda-forge::numpy=1.22.3 conda-forge::matplotlib=3.6.3 conda-forge::pandas=1.5.3"
+    container = "docker.io/lbmc/hicstuff:3.1.3"
+
+    input:
+    tuple val(meta), path(reads)
+    tuple val(meta2), path(index)
+
+    output:
+    tuple val(meta), path ("*iteraligned.bam"), emit: bam
+    path "versions.yml", emit: versions
+
+    script:
+    def prefix = task.ext.prefix ?: "${meta.id}_${meta.mates}"
+    def args = task.ext.args ?: ''
+
+    """
+    INDEX=`find -L ./ -name "*.rev.1.bt2" | sed "s/\\.rev.1.bt2\$//"`
+    [ -z "\$INDEX" ] && INDEX=`find -L ./ -name "*.rev.1.bt2l" | sed "s/\\.rev.1.bt2l\$//"`
+    [ -z "\$INDEX" ] && echo "Bowtie2 index files not found" 1>&2 && exit 1
+
+    hicstuff_iteralign.py -r ${reads} -g \$INDEX -b ${prefix}.bam ${args}
+
+    samtools view -F 2048 -h -@ $task.cpus -O BAM ${prefix}.bam -o ${prefix}_iteraligned.bam
+
+    cat <<-END_VERSIONS > versions.yml
+    "${task.process}":
+        hicstuff: v3.1.3
+    END_VERSIONS
+    """
+}
diff --git a/nextflow.config b/nextflow.config
index 32c983c..3c268a4 100644
--- a/nextflow.config
+++ b/nextflow.config
@@ -161,6 +161,9 @@ params {
     hicstuff_distance_plot = 'false'
     hicstuff_distance_out_plot = 'plot_distance_law.pdf'
     hicstuff_filter_pcr_out_file = 'valid_idx_pcrfree.pairs'
+    hicstuff_read_len = 'None'
+    hicstuff_cpu = 4
+    hicstuff_tmp_dir = 'None'
 
     //Hicstuff optional modules
     filter_event = false
@@ -176,6 +179,9 @@ params {
     cutsite_cpu = 4
     save_digested = false
     cutsite = false
+
+    //Iterative alignment
+    iteralign = false
 }
 
 // Load base.config by default for all pipelines
diff --git a/subworkflows/local/hicstuff.nf b/subworkflows/local/hicstuff.nf
index 576522e..018360b 100644
--- a/subworkflows/local/hicstuff.nf
+++ b/subworkflows/local/hicstuff.nf
@@ -10,6 +10,7 @@ include { DISTANCE_LAW } from '../../modules/local/hicstuff/distance_law'
 include { FILTER_PCR } from '../../modules/local/hicstuff/filter_pcr'
 include { FILTER_PCR_DUP } from './filter_pcr_dup'
 include { HIC_PLOT_MATRIX} from '../../modules/local/hicexplorer/hicPlotMatrix'
+include { ITERALIGN } from '../../modules/local/hicstuff/iteralign'
 
 // Paired-end to Single-end
 def pairToSingle(row, mates) {
@@ -62,11 +63,26 @@ workflow HICSTUFF {
             ch_reads = CUTSITE.out.fastq
         }
     */
-    BOWTIE2_ALIGNMENT(
+    if (params.iteralign && params.cutsite) {
+        error "Error: iteralign and cutsite can't both be true at the same time! Set one of them to false"
+    }
+    else if (params.iteralign){
+        ITERALIGN(
+            ch_reads,
+            index.collect()
+        )
+        ch_bam = ITERALIGN.out.bam
+        ch_versions = ch_versions.mix(ITERALIGN.out.versions)
+    }
+    else {
+        BOWTIE2_ALIGNMENT(
         ch_reads,
         index.collect()
-    )
-    ch_versions = ch_versions.mix(BOWTIE2_ALIGNMENT.out.versions)
+        )
+        ch_bam = BOWTIE2_ALIGNMENT.out.bam
+        ch_versions = ch_versions.mix(BOWTIE2_ALIGNMENT.out.versions)
+    }
+
 
     FRAGMENT_ENZYME(
         digestion,
@@ -79,7 +95,7 @@ workflow HICSTUFF {
     }
     else if (params.filter_pcr_picard){
         FILTER_PCR_DUP(
-            BOWTIE2_ALIGNMENT.out.bam,
+            ch_bam,
             fasta,
             index
         )
@@ -87,9 +103,6 @@ workflow HICSTUFF {
         ch_versions = ch_versions.mix(FILTER_PCR_DUP.out.versions)
     }
     else {
-
-        BOWTIE2_ALIGNMENT.out.bam.set{ ch_bam }
-
         ch_bam.combine(ch_bam)
             .map {
                 meta1, bam1, meta2, bam2 ->
diff --git a/workflows/hic.nf b/workflows/hic.nf
index 9a44393..f1cd351 100644
--- a/workflows/hic.nf
+++ b/workflows/hic.nf
@@ -206,10 +206,9 @@ workflow HIC {
             params.digestion
         )
         ch_reads = CUTSITE.out.fastq
-        ch_reads.view()
+        ch_versions = ch_versions.mix(CUTSITE.out.versions)
    }
 
-
    //
    // SUB-WORFLOW: HiC-Pro
    //
-- 
GitLab
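
Usage note: besides enabling the new mode with `--iteralign` (which cannot be combined with `--cutsite`), the script added in bin/hicstuff_iteralign.py can also be driven outside Nextflow. The sketch below mirrors the keyword arguments its __main__ block passes to iterative_align(); the fastq path, bowtie2 index prefix and output name are hypothetical, bin/ is assumed to be importable, and bowtie2 plus samtools must be on the PATH.

    # Hypothetical example: run the iterative aligner directly (default aligner is bowtie2).
    import hicstuff_io as hio
    from hicstuff_iteralign import iterative_align

    tmp_dir = hio.generate_temp_dir(".")  # scratch directory for truncated reads
    iterative_align(
        fq_in="sample_R1.fastq.gz",           # reads to align iteratively
        tmp_dir=tmp_dir,
        ref="genome_index",                   # bowtie2 index prefix
        n_cpu=4,
        bam_out="sample_R1_iteraligned.bam",  # merged, name-sorted BAM
        min_qual=30,
    )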