From 47784de874b3e512a29c2225a40bba9a2597eb0f Mon Sep 17 00:00:00 2001
From: Mia Croiset <mia.croiset@ens-lyon.fr>
Date: Tue, 30 Jan 2024 13:30:40 +0100
Subject: [PATCH] add iterative mode for alignment

---
 bin/hicstuff_iteralign.py           | 352 ++++++++++++++++++++++++++++
 conf/modules.config                 |  11 +
 modules/local/hicstuff/iteralign.nf |  34 +++
 nextflow.config                     |   6 +
 subworkflows/local/hicstuff.nf      |  27 ++-
 workflows/hic.nf                    |   3 +-
 6 files changed, 424 insertions(+), 9 deletions(-)
 create mode 100755 bin/hicstuff_iteralign.py
 create mode 100644 modules/local/hicstuff/iteralign.nf

diff --git a/bin/hicstuff_iteralign.py b/bin/hicstuff_iteralign.py
new file mode 100755
index 0000000..a16b3c8
--- /dev/null
+++ b/bin/hicstuff_iteralign.py
@@ -0,0 +1,352 @@
+#!/usr/bin/env python3
+# coding: utf-8
+"""Iterative alignment
+
+Aligns iteratively reads from a 3C fastq file: reads
+are trimmed with a range-sweeping number of basepairs
+and each read set generated this way is mapped onto
+the reference genome. This may result in a small
+increase of properly mapped reads.
+
+@author: Remi Montagne & cmdoret
+"""
+
+import os
+import sys
+import glob
+import re
+import subprocess as sp
+import pysam as ps
+import shutil as st
+import hicstuff_io as hio
+import contextlib
+import argparse
+from hicstuff_log import logger
+from os.path import join
+
+
+def iterative_align(
+    fq_in,
+    tmp_dir,
+    ref,
+    n_cpu,
+    bam_out,
+    aligner="bowtie2",
+    min_len=20,
+    min_qual=30,
+    read_len=None,
+):
+    """Iterative alignment
+
+    Aligns reads iteratively reads of fq_in with bowtie2, minimap2 or bwa. Reads are
+    truncated to the 20 first nucleotides and unmapped reads are extended by 20
+    nucleotides and realigned on each iteration.
+
+    Parameters
+    ----------
+
+    fq_in : str
+        Path to input fastq file to align iteratively.
+    tmp_dir : str
+        Path where temporary files should be written.
+    ref : str
+        Path to the reference genome if Minimap2 is used for alignment.
+        Path to the index genome if Bowtie2/bwa is used for alignment.
+    n_cpu : int
+        The number of CPUs to use for the iterative alignment.
+    bam_out : str
+        Path where the final alignment should be written in BAM format.
+    aligner : str
+        Choose between minimap2, bwa or bowtie2 for the alignment.
+    min_len : int
+        The initial length of the fragments to align.
+    min_qual : int
+        Minimum mapping quality required to keep Hi-C pairs.
+    read_len : int
+        Read length in the fasta file. If set to None, the length of the first read
+        is used. Set this value to the longest read length in the file if you have
+        different read lengths.
+
+    Examples
+    --------
+
+    iterative_align(fq_in='example_for.fastq', ref='example_bt2_index', bam_out='example_for.bam', aligner="bowtie2")
+    iterative_align(fq_in='example_for.fastq', ref='example_genome.fa', bam_out='example_for.bam', aligner="minimap2")
+    """
+    # set with the name of the unaligned reads :
+    remaining_reads = set()
+    total_reads = 0
+    # Store path of SAM containing aligned reads at each iteration.
+    iter_out = []
+
+    # If there is already a file with the same name as the output file,
+    # remove it. Otherwise, ignore.
+    with contextlib.suppress(FileNotFoundError):
+        try:
+            os.remove(bam_out)
+        except IsADirectoryError:
+            logger.error("You need to give the BAM output file, not a folder.")
+            raise
+
+    # Bowtie only accepts uncompressed fastq: uncompress it into a temp file
+    if aligner == "bowtie2" and hio.is_compressed(fq_in):
+        uncomp_path = join(tmp_dir, os.path.basename(fq_in) + ".tmp")
+        with hio.read_compressed(fq_in) as inf:
+            with open(uncomp_path, "w") as uncomp:
+                st.copyfileobj(inf, uncomp)
+    else:
+        uncomp_path = fq_in
+
+    # throw error if index does not exist
+    index = hio.check_fasta_index(ref, mode=aligner)
+    if index is None:
+        logger.error(
+            f"Reference index is missing, please build the {aligner} index first."
+        )
+        sys.exit(1)
+    # Counting reads
+    with hio.read_compressed(uncomp_path) as inf:
+        for _ in inf:
+            total_reads += 1
+    total_reads /= 4
+
+    # Use first read to guess read length if not provided.
+    if read_len is None:
+        with hio.read_compressed(uncomp_path) as inf:
+            # Skip first line (read header)
+            size = inf.readline()
+            # Stripping newline from sequence line.
+            read_len = len(inf.readline().rstrip())
+
+    # initial length of the fragments to align
+    # In case reads are shorter than provided min_len
+    if read_len > min_len:
+        n = min_len
+    else:
+        logger.warning(
+            "min_len is longer than the reads. Iterative mapping will have no effect."
+        )
+        n = read_len
+    logger.info("{0} reads to parse".format(int(total_reads)))
+
+    first_round = True
+    # iterative alignment per se
+    while n <= read_len:
+        logger.info(
+            "Truncating unaligned reads to {size}bp and mapping{again}.".format(
+                size=int(n), again="" if first_round else " again"
+            )
+        )
+        iter_out += [join(tmp_dir, "trunc_{0}.bam".format(str(n)))]
+        # Generate a temporary input fastq file with the n first nucleotids
+        # of the reads.
+        truncated_reads = truncate_reads(
+            tmp_dir, uncomp_path, remaining_reads, n, first_round
+        )
+
+        # Align the truncated reads on reference genome
+        temp_alignment = join(tmp_dir, "temp_alignment.bam")
+        map_args = {
+            "fa": ref,
+            "cpus": n_cpu,
+            "fq": truncated_reads,
+            "idx": index,
+            "bam": temp_alignment,
+        }
+        if re.match(r"^(minimap[2]?|mm[2]?)$", aligner, flags=re.IGNORECASE):
+            cmd = "minimap2 -x sr -a -t {cpus} {fa} {fq}".format(**map_args)
+        elif re.match(r"^(bwa)$", aligner, flags=re.IGNORECASE):
+            cmd = "bwa mem -t {cpus} -v 1 {idx} {fq}".format(**map_args)
+        elif re.match(r"^(bowtie[2]?|bt[2]?)$", aligner, flags=re.IGNORECASE):
+            cmd = (
+                "bowtie2 -x {idx} -p {cpus} --quiet --very-sensitive-local -U {fq}"
+            ).format(**map_args)
+        else:
+            raise ValueError(
+                "Unknown aligner. Select bowtie2, minimap2 or bwa."
+            )
+
+        map_process = sp.Popen(cmd, shell=True, stdout=sp.PIPE)
+        sort_process = sp.Popen(
+            "samtools sort -n -@ {cpus} -O BAM -o {bam}".format(**map_args),
+            shell=True,
+            stdin=map_process.stdout,
+        )
+        out, err = sort_process.communicate()
+
+        # filter the reads: the reads whose truncated end was aligned are written
+        # to the output file.
+        # The reads whose truncated end was not aligned are kept for the next round.
+        remaining_reads = filter_bamfile(temp_alignment, iter_out[-1], min_qual)
+
+        n += 20
+        first_round = False
+
+    # one last round without trimming
+    logger.info(
+        "Trying to map unaligned reads at full length ({0}bp).".format(
+            int(read_len)
+        )
+    )
+
+    truncated_reads = truncate_reads(
+        tmp_dir,
+        infile=uncomp_path,
+        unaligned_set=remaining_reads,
+        trunc_len=n,
+        first_round=first_round,
+    )
+    if aligner == "minimap2" or aligner == "Minimap2":
+        cmd = "minimap2 -x sr -a -t {cpus} {fa} {fq}".format(
+            fa=ref, cpus=n_cpu, fq=truncated_reads
+        )
+    elif aligner == "bwa" or aligner == "Bwa" or aligner == "BWA":
+        cmd = "bwa mem -v 1 -t {cpus} {idx} {fq}".format(
+            idx=index, cpus=n_cpu, fq=truncated_reads
+        )
+    else:
+        cmd = (
+            "bowtie2 -x {idx} -p {cpus} --quiet " "--very-sensitive {fq}"
+        ).format(idx=index, cpus=n_cpu, fq=truncated_reads)
+    map_process = sp.Popen(cmd, shell=True, stdout=sp.PIPE)
+    # Keep reads sorted by name
+    sort_process = sp.Popen(
+        "samtools sort -n -@ {cpus} -O BAM -o {bam}".format(
+            cpus=n_cpu, bam=temp_alignment
+        ),
+        shell=True,
+        stdin=map_process.stdout,
+    )
+    out, err = sort_process.communicate()
+    iter_out += [join(tmp_dir, "trunc_{0}.bam".format(str(n)))]
+    remaining_reads = filter_bamfile(temp_alignment, iter_out[-1], min_qual)
+
+    # Report unaligned reads as well
+    iter_out += [join(tmp_dir, "unaligned.bam")]
+    temp_bam = ps.AlignmentFile(temp_alignment, "rb", check_sq=False)
+    unmapped = ps.AlignmentFile(iter_out[-1], "wb", template=temp_bam)
+    for r in temp_bam:
+        # Do not write supplementary alignments (keeping 1 alignment/read)
+        if r.query_name in remaining_reads and not r.is_supplementary:
+            unmapped.write(r)
+    unmapped.close()
+    temp_bam.close()
+
+    # Merge all aligned reads and unmapped reads into a single bam
+    ps.merge("-n", "-O", "BAM", "-@", str(n_cpu), bam_out, *iter_out)
+    logger.info(
+        "{0} reads aligned / {1} total reads.".format(
+            int(total_reads - len(remaining_reads)), int(total_reads)
+        )
+    )
+
+    return 0
+
+
+def truncate_reads(tmp_dir, infile, unaligned_set, trunc_len, first_round):
+    """Trim read ends
+
+    Writes the n first nucleotids of each sequence in infile to an auxiliary.
+    file in the temporary folder.
+
+    Parameters
+    ----------
+
+    tmp_dir : str
+        Path to the temporary folder.
+    infile : str
+        Path to the fastq file to truncate.
+    unaligned_set : set
+        Contains the names of all reads that did not map unambiguously in
+        previous rounds.
+    trunc_len : int
+        The number of basepairs to keep in each truncated sequence.
+    first_round : bool
+        If this is the first round, truncate all reads without checking mapping.
+
+    Returns
+    -------
+
+    str :
+        Path to the output fastq file containing truncated reads.
+    """
+
+    outfile = "{0}/truncated.fastq".format(tmp_dir)
+    with ps.FastxFile(infile, "r") as inf, open(outfile, "w") as outf:
+        for entry in inf:
+            # If the read did not align in previous round or this is the first round
+            if (entry.name in unaligned_set) or first_round:
+                entry.sequence = entry.sequence[:trunc_len]
+                entry.quality = entry.quality[:trunc_len]
+                outf.write(str(entry) + "\n")
+    return outfile
+
+
+def filter_bamfile(temp_alignment, filtered_out, min_qual=30):
+    """Filter alignment BAM files
+
+    Reads all the reads in the input BAM alignment file.
+    Write reads to the output file if they are aligned with a good
+    quality, otherwise add their name in a set to stage them for the next round
+    of alignment.
+
+    Parameters
+    ----------
+    temp_alignment : str
+        Path to the input temporary alignment.
+    outfile : str
+        Path to the output filtered temporary alignment.
+    min_qual : int
+        Minimum mapping quality required to keep a Hi-C pair.
+
+    Returns
+    -------
+    set:
+        Contains the names reads that did not align.
+    """
+    # Check the quality and status of each aligned fragment.
+    # Write the ones with good quality in the final output file.
+    # Keep those that do not map unambiguously for the next round.
+
+    unaligned = set()
+    temp_bam = ps.AlignmentFile(temp_alignment, "rb", check_sq=False)
+    outf = ps.AlignmentFile(filtered_out, "wb", template=temp_bam)
+    for r in temp_bam:
+        if r.flag in [0, 16] and r.mapping_quality >= min_qual:
+            outf.write(r)
+        else:
+            unaligned.add(r.query_name)
+
+    logger.info("{0} reads left to map.".format(len(unaligned)))
+    temp_bam.close()
+    outf.close()
+
+    return unaligned
+
+if __name__ == "__main__":
+    parser = argparse.ArgumentParser()
+    parser.add_argument("-r", "--reads")
+    parser.add_argument("-t", "--tmp_dir")
+    parser.add_argument("-g", "--genome")
+    parser.add_argument("-n", "--n_cpu")
+    parser.add_argument("-b", "--bam_out")
+    parser.add_argument("-m", "--min_qual")
+    parser.add_argument("-l", "--read_len")
+    args = parser.parse_args()
+
+    reads = args.reads
+    tmp_dir = args.tmp_dir
+    ref = args.genome
+    n_cpu = args.n_cpu
+    bam_out = args.bam_out
+    min_qual = int(args.min_qual)
+    read_len = args.read_len
+
+    if tmp_dir == 'None':
+        tmp_dir = hio.generate_temp_dir(tmp_dir)
+    if read_len == 'None':
+        read_len = None
+    else:
+        read_len = int(read_len)
+
+    iterative_align(fq_in=reads, tmp_dir=tmp_dir, ref=ref, n_cpu=n_cpu, bam_out=bam_out, min_qual=min_qual, read_len=read_len)
diff --git a/conf/modules.config b/conf/modules.config
index afd5c3e..e2f6984 100644
--- a/conf/modules.config
+++ b/conf/modules.config
@@ -451,4 +451,15 @@ process {
             enabled: params.save_digested
         ]
     }
+
+    withName: 'ITERALIGN' {
+        ext.prefix = { "${meta.id}_${meta.chunk}_${meta.mates}" }
+        ext.args = { [
+            " -t ${params.hicstuff_tmp_dir}",
+            " -n ${params.hicstuff_cpu}",
+            " -m ${params.hicstuff_min_qual}",
+            " -l ${params.hicstuff_read_len}"
+            ].join('').trim()
+        }
+    }
 }
diff --git a/modules/local/hicstuff/iteralign.nf b/modules/local/hicstuff/iteralign.nf
new file mode 100644
index 0000000..82ad24a
--- /dev/null
+++ b/modules/local/hicstuff/iteralign.nf
@@ -0,0 +1,34 @@
+process ITERALIGN {
+    tag "$meta.id"
+    label "process_high"
+
+    conda "conda-forge::python=3.9 conda-forge::biopython=1.80 conda-forge::numpy=1.22.3 conda-forge::matplotlib=3.6.3 conda-forge::pandas=1.5.3"
+    container = "docker.io/lbmc/hicstuff:3.1.3"
+
+    input:
+    tuple val(meta), path(reads)
+    tuple val(meta2), path(index)
+
+    output:
+    tuple val(meta), path ("*iteraligned.bam"), emit: bam
+    path "versions.yml", emit: versions
+
+    script:
+    def prefix = task.ext.prefix ?: "${meta.id}_${meta.mates}"
+    def args = task.ext.args ?: ''
+
+    """
+    INDEX=`find -L ./ -name "*.rev.1.bt2" | sed "s/\\.rev.1.bt2\$//"`
+    [ -z "\$INDEX" ] && INDEX=`find -L ./ -name "*.rev.1.bt2l" | sed "s/\\.rev.1.bt2l\$//"`
+    [ -z "\$INDEX" ] && echo "Bowtie2 index files not found" 1>&2 && exit 1
+
+    hicstuff_iteralign.py -r ${reads} -g \$INDEX -b ${prefix}.bam ${args}
+
+    samtools view -F 2048 -h -@ $task.cpus -O BAM ${prefix}.bam -o ${prefix}_iteraligned.bam
+
+    cat <<-END_VERSIONS > versions.yml
+    "${task.process}":
+        hicstuff: v3.1.3)
+    END_VERSIONS
+    """
+}
diff --git a/nextflow.config b/nextflow.config
index 32c983c..3c268a4 100644
--- a/nextflow.config
+++ b/nextflow.config
@@ -161,6 +161,9 @@ params {
     hicstuff_distance_plot = 'false'
     hicstuff_distance_out_plot = 'plot_distance_law.pdf'
     hicstuff_filter_pcr_out_file = 'valid_idx_pcrfree.pairs'
+    hicstuff_read_len = 'None'
+    hicstuff_cpu = 4
+    hicstuff_tmp_dir = 'None'
 
     //Hicstuff optional modules
     filter_event = false
@@ -176,6 +179,9 @@ params {
     cutsite_cpu = 4
     save_digested = false
     cutsite = false
+
+    //Iterative alignement
+    iteralign = false
 }
 
 // Load base.config by default for all pipelines
diff --git a/subworkflows/local/hicstuff.nf b/subworkflows/local/hicstuff.nf
index 576522e..018360b 100644
--- a/subworkflows/local/hicstuff.nf
+++ b/subworkflows/local/hicstuff.nf
@@ -10,6 +10,7 @@ include { DISTANCE_LAW } from '../../modules/local/hicstuff/distance_law'
 include { FILTER_PCR } from '../../modules/local/hicstuff/filter_pcr'
 include { FILTER_PCR_DUP } from './filter_pcr_dup'
 include { HIC_PLOT_MATRIX} from '../../modules/local/hicexplorer/hicPlotMatrix'
+include { ITERALIGN } from '../../modules/local/hicstuff/iteralign'
 
 // Paired-end to Single-end
 def pairToSingle(row, mates) {
@@ -62,11 +63,26 @@ workflow HICSTUFF {
         ch_reads = CUTSITE.out.fastq
     } */
 
-    BOWTIE2_ALIGNMENT(
+    if (params.iteralign && params.cutsite) {
+        error "Error: iteralign and cutsite can't both be true at the same time! Set one of them false"
+    }
+    else if (params.iteralign){
+        ITERALIGN(
+            ch_reads,
+            index.collect()
+        )
+        ch_bam = ITERALIGN.out.bam
+        ch_versions = ch_versions.mix(ITERALIGN.out.versions)
+    }
+    else {
+        BOWTIE2_ALIGNMENT(
         ch_reads,
         index.collect()
-    )
-    ch_versions = ch_versions.mix(BOWTIE2_ALIGNMENT.out.versions)
+        )
+        ch_bam = BOWTIE2_ALIGNMENT.out.bam
+        ch_versions = ch_versions.mix(BOWTIE2_ALIGNMENT.out.versions)
+    }
+
 
     FRAGMENT_ENZYME(
         digestion,
@@ -79,7 +95,7 @@ workflow HICSTUFF {
     }
     else if (params.filter_pcr_picard){
         FILTER_PCR_DUP(
-            BOWTIE2_ALIGNMENT.out.bam,
+            ch_bam,
             fasta,
             index
         )
@@ -87,9 +103,6 @@ workflow HICSTUFF {
         ch_versions = ch_versions.mix(FILTER_PCR_DUP.out.versions)
     }
     else {
-
-        BOWTIE2_ALIGNMENT.out.bam.set{ ch_bam }
-
         ch_bam.combine(ch_bam)
         .map {
             meta1, bam1, meta2, bam2 ->
diff --git a/workflows/hic.nf b/workflows/hic.nf
index 9a44393..f1cd351 100644
--- a/workflows/hic.nf
+++ b/workflows/hic.nf
@@ -206,10 +206,9 @@ workflow HIC {
             params.digestion
         )
         ch_reads = CUTSITE.out.fastq
-        ch_reads.view()
+        ch_versions = ch_versions.mix(CUTSITE.out.versions)
     }
 
-
   //
   // SUB-WORFLOW: HiC-Pro
   //
-- 
GitLab