From b607dc0eee8ef8b98fdc9fb151ca75b4c1d88c4c Mon Sep 17 00:00:00 2001
From: Mia Croiset <mia.croiset@ens-lyon.fr>
Date: Fri, 2 Jun 2023 16:45:26 +0200
Subject: [PATCH] TO FIX markduplicates picard

---
 conf/hicstuff.config                          | 22 ++++++
 .../custom/{ => gatk4}/markduplicates/main.nf |  0
 .../{ => gatk4}/markduplicates/meta.yml       |  0
 .../custom/picard/markduplicates/main.nf      | 59 +++++++++++++++
 .../custom/picard/markduplicates/meta.yml     | 71 +++++++++++++++++++
 .../nf-core/custom/samtools_n/sort/main.nf    | 49 +++++++++++++
 .../nf-core/custom/samtools_n/sort/meta.yml   | 48 +++++++++++++
 subworkflows/local/hicstuff_sub.nf            | 34 +++++++--
 8 files changed, 277 insertions(+), 6 deletions(-)
 rename modules/nf-core/custom/{ => gatk4}/markduplicates/main.nf (100%)
 rename modules/nf-core/custom/{ => gatk4}/markduplicates/meta.yml (100%)
 create mode 100644 modules/nf-core/custom/picard/markduplicates/main.nf
 create mode 100644 modules/nf-core/custom/picard/markduplicates/meta.yml
 create mode 100644 modules/nf-core/custom/samtools_n/sort/main.nf
 create mode 100644 modules/nf-core/custom/samtools_n/sort/meta.yml

diff --git a/conf/hicstuff.config b/conf/hicstuff.config
index 82360ec..3f5cdf6 100644
--- a/conf/hicstuff.config
+++ b/conf/hicstuff.config
@@ -245,6 +245,17 @@ process {
         ]
     }
 
+    withName: 'PICARD_MARKDUPLICATES' {
+        ext.prefix = { "${meta.id}_${meta.chunk}_${meta.mates}" }
+        ext.args = { [
+            "--REMOVE_DUPLICATES true"
+        ].join('').trim() }
+        publishDir = [
+            path: { "${params.outdir}/picard/bam" },
+            mode: 'copy'
+        ]
+    }
+
     withName: 'SAMTOOLS_SORT' {
         ext.prefix = { "${meta.id}_${meta.chunk}_${meta.mates}_sorted" }
         ext.args = { [
@@ -252,6 +263,17 @@ process {
         ].join('').trim() }
     }
 
+    withName: 'SAMTOOLS_SORT_N' {
+        ext.prefix = { "${meta.id}_${meta.chunk}_${meta.mates}_nsorted" }
+        ext.args = { [
+            "-n"
+        ].join('').trim() }
+        publishDir = [
+            path: { "${params.outdir}/gatk4/bam" },
+            mode: 'copy'
+        ]
+    }
+
     withName: 'SAMTOOLS_INDEX' {
         ext.args = { [
             ""
diff --git a/modules/nf-core/custom/markduplicates/main.nf b/modules/nf-core/custom/gatk4/markduplicates/main.nf
similarity index 100%
rename from modules/nf-core/custom/markduplicates/main.nf
rename to modules/nf-core/custom/gatk4/markduplicates/main.nf
diff --git a/modules/nf-core/custom/markduplicates/meta.yml b/modules/nf-core/custom/gatk4/markduplicates/meta.yml
similarity index 100%
rename from modules/nf-core/custom/markduplicates/meta.yml
rename to modules/nf-core/custom/gatk4/markduplicates/meta.yml
diff --git a/modules/nf-core/custom/picard/markduplicates/main.nf b/modules/nf-core/custom/picard/markduplicates/main.nf
new file mode 100644
index 0000000..b89101b
--- /dev/null
+++ b/modules/nf-core/custom/picard/markduplicates/main.nf
@@ -0,0 +1,59 @@
+process PICARD_MARKDUPLICATES {
+    tag "$meta.id"
+    label 'process_medium'
+
+    conda "bioconda::picard=3.0.0"
+    container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ?
+        'https://depot.galaxyproject.org/singularity/picard:3.0.0--hdfd78af_1' :
+        'quay.io/biocontainers/picard:3.0.0--hdfd78af_1' }"
+
+    input:
+    tuple val(meta), path(bam)
+    tuple val(meta2), path(fasta)
+    tuple val(meta3), path(fai)
+
+    output:
+    tuple val(meta), path("*.bam")        , emit: bam
+    tuple val(meta), path("*.bai")        , optional:true, emit: bai
+    tuple val(meta), path("*.metrics.txt"), emit: metrics
+    path  "versions.yml"                  , emit: versions
+
+    when:
+    task.ext.when == null || task.ext.when
+
+    script:
+    def args = task.ext.args ?: ''
+    def prefix = task.ext.prefix ?: "${meta.id}"
+    def avail_mem = 3072
+    if (!task.memory) {
+        log.info '[Picard MarkDuplicates] Available memory not known - defaulting to 3GB. Specify process memory requirements to change this.'
+    } else {
+        avail_mem = (task.memory.mega*0.8).intValue()
+    }
+    """
+    picard \\
+        -Xmx${avail_mem}M \\
+        MarkDuplicates \\
+        $args \\
+        --INPUT $bam \\
+        --OUTPUT ${prefix}.bam \\
+        --REFERENCE_SEQUENCE $fasta \\
+        --METRICS_FILE ${prefix}.MarkDuplicates.metrics.txt
+    cat <<-END_VERSIONS > versions.yml
+    "${task.process}":
+        picard: \$(echo \$(picard MarkDuplicates --version 2>&1) | grep -o 'Version:.*' | cut -f2- -d:)
+    END_VERSIONS
+    """
+
+    stub:
+    def prefix = task.ext.prefix ?: "${meta.id}"
+    """
+    touch ${prefix}.bam
+    touch ${prefix}.bam.bai
+    touch ${prefix}.MarkDuplicates.metrics.txt
+    cat <<-END_VERSIONS > versions.yml
+    "${task.process}":
+        picard: \$(echo \$(picard MarkDuplicates --version 2>&1) | grep -o 'Version:.*' | cut -f2- -d:)
+    END_VERSIONS
+    """
+}
\ No newline at end of file
diff --git a/modules/nf-core/custom/picard/markduplicates/meta.yml b/modules/nf-core/custom/picard/markduplicates/meta.yml
new file mode 100644
index 0000000..e463328
--- /dev/null
+++ b/modules/nf-core/custom/picard/markduplicates/meta.yml
@@ -0,0 +1,71 @@
+name: picard_markduplicates
+description: Locate and tag duplicate reads in a BAM file
+keywords:
+  - markduplicates
+  - pcr
+  - duplicates
+  - bam
+  - sam
+  - cram
+tools:
+  - picard:
+      description: |
+        A set of command line tools (in Java) for manipulating high-throughput sequencing (HTS)
+        data and formats such as SAM/BAM/CRAM and VCF.
+      homepage: https://broadinstitute.github.io/picard/
+      documentation: https://broadinstitute.github.io/picard/
+      licence: ["MIT"]
+input:
+  - meta:
+      type: map
+      description: |
+        Groovy Map containing sample information
+        e.g. [ id:'test', single_end:false ]
+  - bam:
+      type: file
+      description: BAM file
+      pattern: "*.{bam,cram,sam}"
+  - meta2:
+      type: map
+      description: |
+        Groovy Map containing reference information
+        e.g. [ id:'genome' ]
+  - fasta:
+      type: file
+      description: Reference genome fasta file
+      pattern: "*.{fasta,fa}"
+  - meta3:
+      type: map
+      description: |
+        Groovy Map containing reference information
+        e.g. [ id:'genome' ]
+  - fai:
+      type: file
+      description: Reference genome fasta index
+      pattern: "*.{fai}"
+output:
+  - meta:
+      type: map
+      description: |
+        Groovy Map containing sample information
+        e.g. [ id:'test', single_end:false ]
+  - bam:
+      type: file
+      description: BAM file with duplicate reads marked/removed
+      pattern: "*.{bam}"
+  - bai:
+      type: file
+      description: An optional BAM index file. If desired, --CREATE_INDEX must be passed as a flag
+      pattern: "*.{bai}"
+  - metrics:
+      type: file
+      description: Duplicate metrics file generated by picard
+      pattern: "*.{metrics.txt}"
+  - versions:
+      type: file
+      description: File containing software versions
+      pattern: "versions.yml"
+authors:
+  - "@drpatelh"
+  - "@projectoriented"
+  - "@ramprasadn"
\ No newline at end of file
diff --git a/modules/nf-core/custom/samtools_n/sort/main.nf b/modules/nf-core/custom/samtools_n/sort/main.nf
new file mode 100644
index 0000000..59a6e2a
--- /dev/null
+++ b/modules/nf-core/custom/samtools_n/sort/main.nf
@@ -0,0 +1,49 @@
+process SAMTOOLS_SORT_N {
+    tag "$meta.id"
+    label 'process_high' //medium
+
+    conda "bioconda::samtools=1.17"
+    container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ?
+        'https://depot.galaxyproject.org/singularity/samtools:1.17--h00cdaf9_0' :
+        'quay.io/biocontainers/samtools:1.17--h00cdaf9_0' }"
+
+    input:
+    tuple val(meta), path(bam)
+
+    output:
+    tuple val(meta), path("*.bam"), emit: bam
+    tuple val(meta), path("*.csi"), emit: csi, optional: true
+    path  "versions.yml"          , emit: versions
+
+    when:
+    task.ext.when == null || task.ext.when
+
+    script:
+    def args = task.ext.args ?: ''
+    def prefix = task.ext.prefix ?: "${meta.id}"
+    def sort_memory = (task.memory.mega/task.cpus).intValue()
+    if ("$bam" == "${prefix}.bam") error "Input and output names are the same, use \"task.ext.prefix\" to disambiguate!"
+    """
+    samtools sort \\
+        $args \\
+        -@ $task.cpus \\
+        -m ${sort_memory}M \\
+        -o ${prefix}.bam \\
+        -T $prefix \\
+        $bam
+    cat <<-END_VERSIONS > versions.yml
+    "${task.process}":
+        samtools: \$(echo \$(samtools --version 2>&1) | sed 's/^.*samtools //; s/Using.*\$//')
+    END_VERSIONS
+    """
+
+    stub:
+    def prefix = task.ext.prefix ?: "${meta.id}"
+    """
+    touch ${prefix}.bam
+    cat <<-END_VERSIONS > versions.yml
+    "${task.process}":
+        samtools: \$(echo \$(samtools --version 2>&1) | sed 's/^.*samtools //; s/Using.*\$//')
+    END_VERSIONS
+    """
+}
\ No newline at end of file
diff --git a/modules/nf-core/custom/samtools_n/sort/meta.yml b/modules/nf-core/custom/samtools_n/sort/meta.yml
new file mode 100644
index 0000000..1587857
--- /dev/null
+++ b/modules/nf-core/custom/samtools_n/sort/meta.yml
@@ -0,0 +1,48 @@
+name: samtools_sort
+description: Sort SAM/BAM/CRAM file
+keywords:
+  - sort
+  - bam
+  - sam
+  - cram
+tools:
+  - samtools:
+      description: |
+        SAMtools is a set of utilities for interacting with and post-processing
+        short DNA sequence read alignments in the SAM, BAM and CRAM formats, written by Heng Li.
+        These files are generated as output by short read aligners like BWA.
+      homepage: http://www.htslib.org/
+      documentation: http://www.htslib.org/doc/samtools.html
+      doi: 10.1093/bioinformatics/btp352
+      licence: ["MIT"]
+input:
+  - meta:
+      type: map
+      description: |
+        Groovy Map containing sample information
+        e.g. [ id:'test', single_end:false ]
+  - bam:
+      type: file
+      description: BAM/CRAM/SAM file
+      pattern: "*.{bam,cram,sam}"
+output:
+  - meta:
+      type: map
+      description: |
+        Groovy Map containing sample information
+        e.g. [ id:'test', single_end:false ]
+  - bam:
+      type: file
+      description: Sorted BAM/CRAM/SAM file
+      pattern: "*.{bam,cram,sam}"
+  - versions:
+      type: file
+      description: File containing software versions
+      pattern: "versions.yml"
+  - csi:
+      type: file
+      description: BAM index file (optional)
+      pattern: "*.csi"
+authors:
+  - "@drpatelh"
+  - "@ewels"
\ No newline at end of file
diff --git a/subworkflows/local/hicstuff_sub.nf b/subworkflows/local/hicstuff_sub.nf
index 71b0053..69f04de 100644
--- a/subworkflows/local/hicstuff_sub.nf
+++ b/subworkflows/local/hicstuff_sub.nf
@@ -8,9 +8,11 @@ include { BUILD_MATRIX_COOL_ALT } from '../../modules/local/hicstuff/build_matri
 include { FILTER_EVENT } from '../../modules/local/hicstuff/filter_event'
 include { DISTANCE_LAW } from '../../modules/local/hicstuff/distance_law'
 include { FILTER_PCR } from '../../modules/local/hicstuff/filter_pcr'
-include { GATK4_MARKDUPLICATES } from '../../modules/nf-core/custom/markduplicates/main'
+include { GATK4_MARKDUPLICATES } from '../../modules/nf-core/custom/gatk4/markduplicates/main'
 include { SAMTOOLS_SORT } from '../../modules/nf-core/custom/samtools/sort/main'
+include { SAMTOOLS_SORT_N } from '../../modules/nf-core/custom/samtools_n/sort/main'
 include { SAMTOOLS_INDEX } from '../../modules/nf-core/custom/samtools/index/main'
+include { PICARD_MARKDUPLICATES } from '../../modules/nf-core/custom/picard/markduplicates/main'
 
 // Paired-end to Single-end
 def pairToSingle(row, mates) {
@@ -67,16 +69,36 @@ workflow HICSTUFF_SUB {
             BOWTIE2_ALIGNMENT.out.bam
         )
 
+/*         GATK4_MARKDUPLICATES(
+            SAMTOOLS_SORT.out.bam,
+            fasta.map{ it[1] }.collect(),
+            index.map{ it[1] }.collect()
+        )
+
+        SAMTOOLS_SORT_N(
+            GATK4_MARKDUPLICATES.out.bam
+        )
+
         SAMTOOLS_INDEX(
-            SAMTOOLS_SORT.out.bam
+            SAMTOOLS_SORT_N.out.bam
         )
+ */
 
-        GATK4_MARKDUPLICATES(
+        PICARD_MARKDUPLICATES(
             SAMTOOLS_SORT.out.bam,
-            fasta.map{ it[1] }.collect(),
-            index.map{ it[1] }.collect()
+            fasta.collect(),
+            index.collect()
+        )
+
+        SAMTOOLS_SORT_N(
+            PICARD_MARKDUPLICATES.out.bam
         )
-        GATK4_MARKDUPLICATES.out.bam.set{ ch_bam }
+
+        SAMTOOLS_INDEX(
+            SAMTOOLS_SORT_N.out.bam
+        )
+
+        SAMTOOLS_SORT_N.out.bam.set{ ch_bam }
 
     }
     else {
-- 
GitLab