TEST module bam2pairs

9f6f0ce6 · Mia Croiset · f1fe040e · 9f6f0ce6 · 9f6f0ce6 · 9f6f0ce6
Verified Commit 9f6f0ce6 authored 2 years ago by Mia Croiset
--- a/bin/hicstuff_bam2pairs.py
+++ b/bin/hicstuff_bam2pairs.py
+#!/usr/bin/env python
+import os, time, csv, sys, re
+import argparse
+import pysam as ps
+import pandas as pd
+import itertools
+from hicstuff_log import logger
+import hicstuff_io as hio
+import hicstuff_digest as hcd
+from Bio import SeqIO
+def bam2pairs(bam1, bam2, out_pairs, info_contigs, min_qual=30):
+    """
+    Make a .pairs file from two Hi-C bam files sorted by read names.
+    The Hi-C mates are matched by read identifier. Pairs where at least one
+    reads maps with MAPQ below  min_qual threshold are discarded. Pairs are
+    sorted by readID and stored in upper triangle (first pair higher).
+    Parameters
+    ----------
+    bam1 : str
+        Path to the name-sorted BAM file with aligned Hi-C forward reads.
+    bam2 : str
+        Path to the name-sorted BAM file with aligned Hi-C reverse reads.
+    out_pairs : str
+        Path to the output space-separated .pairs file with columns
+        readID, chr1 pos1 chr2 pos2 strand1 strand2
+    info_contigs : str
+        Path to the info contigs file, to get info on chromosome sizes and order.
+    min_qual : int
+        Minimum mapping quality required to keep a Hi-C pair.
+    """
+    forward = ps.AlignmentFile(bam1, "rb")
+    reverse = ps.AlignmentFile(bam2, "rb")
+    # Generate header lines
+    format_version = "## pairs format v1.0\n"
+    sorting = "#sorted: readID\n"
+    cols = "#columns: readID chr1 pos1 chr2 pos2 strand1 strand2\n"
+    # Chromosome order will be identical in info_contigs and pair files
+    chroms = pd.read_csv(info_contigs, sep="\t").apply(
+        lambda x: "#chromsize: %s %d\n" % (x.contig, x.length), axis=1
+    )
+    with open(out_pairs, "w") as pairs:
+        pairs.writelines([format_version, sorting, cols] + chroms.tolist())
+        pairs_writer = csv.writer(pairs, delimiter="\t")
+        n_reads = {"total": 0, "mapped": 0}
+        # Remember if some read IDs were missing from either file
+        unmatched_reads = 0
+        # Remember if all reads in one bam file have been read
+        exhausted = [False, False]
+        # Iterate on both BAM simultaneously
+        end_regex = re.compile(r'/[12]$')
+        for end1, end2 in itertools.zip_longest(forward, reverse):
+            # Remove end-specific suffix if any
+            end1.query_name = re.sub(end_regex, '', end1.query_name)
+            end2.query_name = re.sub(end_regex, '', end2.query_name)
+            # Both file still have reads
+            # Check if reads pass filter
+            try:
+                end1_passed = end1.mapping_quality >= min_qual
+            # Happens if end1 bam file has been exhausted
+            except AttributeError:
+                exhausted[0] = True
+                end1_passed = False
+            try:
+                end2_passed = end2.mapping_quality >= min_qual
+            # Happens if end2 bam file has been exhausted
+            except AttributeError:
+                exhausted[1] = True
+                end2_passed = False
+            # Skip read if mate is not present until they match or reads
+            # have been exhausted
+            while sum(exhausted) == 0 and end1.query_name != end2.query_name:
+                # Get next read and check filters again
+                # Count single-read iteration
+                unmatched_reads += 1
+                n_reads["total"] += 1
+                if end1.query_name < end2.query_name:
+                    try:
+                        end1 = next(forward)
+                        end1_passed = end1.mapping_quality >= min_qual
+                    # If EOF is reached in BAM 1
+                    except (StopIteration, AttributeError):
+                        exhausted[0] = True
+                        end1_passed = False
+                    n_reads["mapped"] += end1_passed
+                elif end1.query_name > end2.query_name:
+                    try:
+                        end2 = next(reverse)
+                        end2_passed = end2.mapping_quality >= min_qual
+                    # If EOF is reached in BAM 2
+                    except (StopIteration, AttributeError):
+                        exhausted[1] = True
+                        end2_passed = False
+                    n_reads["mapped"] += end2_passed
+            # 2 reads processed per iteration, unless one file is exhausted
+            n_reads["total"] += 2 - sum(exhausted)
+            n_reads["mapped"] += sum([end1_passed, end2_passed])
+            # Keep only pairs where both reads have good quality
+            if end1_passed and end2_passed:
+                # Flipping to get upper triangle
+                if (
+                    end1.reference_id == end2.reference_id
+                    and end1.reference_start > end2.reference_start
+                ) or end1.reference_id > end2.reference_id:
+                    end1, end2 = end2, end1
+                pairs_writer.writerow(
+                    [
+                        end1.query_name,
+                        end1.reference_name,
+                        end1.reference_start + 1,
+                        end2.reference_name,
+                        end2.reference_start + 1,
+                        "-" if end1.is_reverse else "+",
+                        "-" if end2.is_reverse else "+",
+                    ]
+                )
+    pairs.close()
+    if unmatched_reads > 0:
+        logger.warning(
+            "%d reads were only present in one BAM file. Make sure you sorted reads by name before running the pipeline.",
+            unmatched_reads,
+        )
+    logger.info(
+        "{perc_map}% reads (single ends) mapped with Q >= {qual} ({mapped}/{total})".format(
+            total=n_reads["total"],
+            mapped=n_reads["mapped"],
+            perc_map=round(100 * n_reads["mapped"] / n_reads["total"]),
+            qual=min_qual,
+        )
+    )
+if __name__ == "__main__":
+    parser = argparse.ArgumentParser()
+    parser.add_argument("-b1", "--bam1")
+    parser.add_argument("-b2", "--bam2")
+    parser.add_argument("-o", "--out_pairs")
+    parser.add_argument("-x", "--out_idx")
+    parser.add_argument("-i", "--info_contigs")
+    parser.add_argument("-q", "--min_qual")
+    parser.add_argument("-e", "--enzyme")
+    parser.add_argument("-f", "--fasta")
+    args = parser.parse_args()
+    bam1 = args.bam1
+    bam2 = args.bam2
+    out_pairs = args.out_pairs
+    out_idx = args.out_idx
+    info_contigs = args.info_contigs
+    min_qual = int(args.min_qual)
+    enzyme = args.enzyme
+    fasta =args.fasta
+    #hicstuff case sensitive enzymes adaptation
+    if enzyme == "hindiii":
+        enzyme = "HindIII"
+    elif enzyme == "dpnii":
+        enzyme = "DpnII"
+    elif enzyme == "bglii":
+        enzyme = "BglII"
+    elif enzyme == "mboi":
+        enzyme = "MboI"
+    elif enzyme == "arima":
+        enzyme = "Arima"
+    bam2pairs(bam1, bam2, out_pairs, info_contigs, min_qual)
+    restrict_table = {}
+    for record in SeqIO.parse(hio.read_compressed(fasta), "fasta"):
+        # Get chromosome restriction table
+        restrict_table[record.id] = hcd.get_restriction_table(
+            record.seq, enzyme, circular=False
+        )
+    hcd.attribute_fragments(out_pairs, out_idx, restrict_table)
+    hio.sort_pairs(
+        out_idx,
+        out_idx + ".sorted",
+        keys=["chr1", "pos1", "chr2", "pos2"],
+        threads=1,
+        tmp_dir=None,
+    )
+    os.rename(out_idx + ".sorted", out_idx)
--- a/conf/hicstuff.config
+++ b/conf/hicstuff.config
@@ -8,6 +8,10 @@ process {
    withName: 'BOWTIE2_ALIGNMENT' {
        ext.prefix = { "${meta.id}_${meta.chunk}_${meta.mates}" }
+        publishDir = [
+            path: { "${params.outdir}/hicstuff/reads"},
+            mode: 'copy'
+        ]
    }
    withName: 'FRAGMENT_ENZYME' {
@@ -17,11 +21,18 @@ process {
        ]
    }
+    withName: 'BAM2PAIRS' {
+        publishDir = [
+            path: { "${params.outdir}/hicstuff/pairs" },
+            mode: 'copy'
+        ]
+    }
    //**********************************************
    // PREPARE_GENOME
    withName: 'BOWTIE2_BUILD' {
        publishDir = [
-            path: { "${params.outdir}/genome/bowtie2" },
+            path: { "${params.outdir}/genome/bowtie2_index" },
            mode: 'copy'
        ]
    }
@@ -133,7 +144,7 @@ params {
    multiqc_methods_description = null
    // Boilerplate options
-    outdir                     = './results'
+    outdir                     = '${params.outdir}'
    tracedir                   = "${params.outdir}/pipeline_info"
    publish_dir_mode           = 'copy'
    email                      = null

--- a/modules/local/hicstuff/bam2pairs.nf
+++ b/modules/local/hicstuff/bam2pairs.nf
+process BAM2PAIRS {
+    tag "$info_contigs"
+    label 'process_high'
+    conda "conda-forge::python=3.9 conda-forge::biopython=1.80 conda-forge::numpy=1.22.3 conda-forge::matplotlib=3.6.3 conda-forge::pandas=1.5.3"
+    container = "lbmc/hicstuff:3.1.3"
+    input:
+    val(bam)
+    tuple val(meta), path(info_contigs)
+    val(digestion)
+    tuple val(meta), path(fasta)
+    output:
+    tuple val(meta), path("valid.pairs"), emit: valid_pairs
+    tuple val(meta), path("valid_idx.pairs"), emit: idx_pairs
+    script:
+    """
+    hicstuff_bam2pairs.py -b1 ${bam[1]} -b2 ${bam[3]} -o valid.pairs -x valid_idx.pairs -i ${info_contigs} -q 30 -e ${digestion} -f ${fasta}
+    """
+}
--- a/subworkflows/local/hicstuff_sub.nf
+++ b/subworkflows/local/hicstuff_sub.nf
 include { BOWTIE2_ALIGNMENT } from '../../modules/local/hicstuff/align_bowtie2'
 include { FRAGMENT_ENZYME } from '../../modules/local/hicstuff/fragment_enzyme'
+include { BAM2PAIRS } from '../../modules/local/hicstuff/bam2pairs'
 // Paired-end to Single-end
 def pairToSingle(row, mates) {
@@ -48,7 +49,10 @@ workflow HICSTUFF_SUB {
        fasta
    )
-    emit:
+    BAM2PAIRS(
-    info_contigs = FRAGMENT_ENZYME.out.info_contigs
+        BOWTIE2_ALIGNMENT.out.bam.collect(),
-    fragments_list = FRAGMENT_ENZYME.out.fragments_list
+        FRAGMENT_ENZYME.out.info_contigs,
+        digestion,
+        fasta
+    )
 }