From f8fcb912049ad6a0ae7529aa785ea0d198810667 Mon Sep 17 00:00:00 2001
From: Mia Croiset <mia.croiset@ens-lyon.fr>
Date: Thu, 1 Jun 2023 15:58:27 +0200
Subject: [PATCH] TO FIX markduplicates pcr filter option

---
 conf/base.config                              | 36 +++++++++-
 conf/hicstuff.config                          | 33 +++++++--
 modules/nf-core/custom/markduplicates/main.nf | 54 ++++++++++++++
 .../nf-core/custom/markduplicates/meta.yml    | 72 +++++++++++++++++++
 modules/nf-core/custom/samtools/index/main.nf | 46 ++++++++++++
 .../nf-core/custom/samtools/index/meta.yml    | 53 ++++++++++++++
 modules/nf-core/custom/sort/main.nf           | 49 +++++++++++++
 modules/nf-core/custom/sort/meta.yml          | 48 +++++++++++++
 subworkflows/local/hicstuff_sub.nf            | 36 ++++++++--
 9 files changed, 417 insertions(+), 10 deletions(-)
 create mode 100644 modules/nf-core/custom/markduplicates/main.nf
 create mode 100644 modules/nf-core/custom/markduplicates/meta.yml
 create mode 100644 modules/nf-core/custom/samtools/index/main.nf
 create mode 100644 modules/nf-core/custom/samtools/index/meta.yml
 create mode 100644 modules/nf-core/custom/sort/main.nf
 create mode 100644 modules/nf-core/custom/sort/meta.yml

diff --git a/conf/base.config b/conf/base.config
index 6808dbe..e57c358 100644
--- a/conf/base.config
+++ b/conf/base.config
@@ -40,8 +40,8 @@ process {
         time   = { check_max( 8.h   * task.attempt, 'time'    ) }
     }
     withLabel:process_high {
-        cpus   = { check_max( 12    * task.attempt, 'cpus'    ) }
-        memory = { check_max( 64.GB * task.attempt, 'memory'  ) }
+        cpus   = { check_max( 8    * task.attempt, 'cpus'    ) } //TODO go back to 16 when not local
+        memory = { check_max( 31.GB * task.attempt, 'memory'  ) }//TODO go back to 64 when not local
         time   = { check_max( 16.h  * task.attempt, 'time'    ) }
     }
     withLabel:process_long {
@@ -61,3 +61,35 @@ process {
         cache = false
     }
 }
+// Function to ensure that resource requirements don't go beyond
+// a maximum limit
+def check_max(obj, type) {
+    if (type == 'memory') {
+        try {
+            if (obj.compareTo(params.max_memory as nextflow.util.MemoryUnit) == 1)
+                return params.max_memory as nextflow.util.MemoryUnit
+            else
+                return obj
+        } catch (all) {
+            println "   ### ERROR ###   Max memory '${params.max_memory}' is not valid! Using default value: $obj"
+            return obj
+        }
+    } else if (type == 'time') {
+        try {
+            if (obj.compareTo(params.max_time as nextflow.util.Duration) == 1)
+                return params.max_time as nextflow.util.Duration
+            else
+                return obj
+        } catch (all) {
+            println "   ### ERROR ###   Max time '${params.max_time}' is not valid! Using default value: $obj"
+            return obj
+        }
+    } else if (type == 'cpus') {
+        try {
+            return Math.min( obj, params.max_cpus as int )
+        } catch (all) {
+            println "   ### ERROR ###   Max cpus '${params.max_cpus}' is not valid! Using default value: $obj"
+            return obj
+        }
+    }
+}
diff --git a/conf/hicstuff.config b/conf/hicstuff.config
index 0f27d39..82360ec 100644
--- a/conf/hicstuff.config
+++ b/conf/hicstuff.config
@@ -144,10 +144,10 @@ params {
     hicstuff_filter_pcr_out_file = 'valid_idx_pcrfree.pairs'
 
     //Hicstuff optional modules
-    filter_event = true
-    distance_law = true
-    filter_pcr = true
-    filter_pcr_picard = false
+    filter_event = false
+    distance_law = false
+    filter_pcr = false
+    filter_pcr_picard = true
 }
 
 process {
@@ -234,6 +234,30 @@ process {
         ]
     }
 
+    withName: 'GATK4_MARKDUPLICATES' {
+        ext.prefix = { "${meta.id}_${meta.chunk}_${meta.mates}.bam" }
+        ext.args = { [
+            "--REMOVE_DUPLICATES true"
+        ].join('').trim() }
+        publishDir = [
+            path: { "${params.outdir}/gatk4/bam" },
+            mode: 'copy'
+        ]
+    }
+
+    withName: 'SAMTOOLS_SORT' {
+        ext.prefix = { "${meta.id}_${meta.chunk}_${meta.mates}_sorted" }
+        ext.args = { [
+            ""
+        ].join('').trim() }
+    }
+
+    withName: 'SAMTOOLS_INDEX' {
+        ext.args = { [
+            ""
+        ].join('').trim() }
+    }
+
     withName: 'BUILD_MATRIX' {
         ext.args = params.hicstuff_matrix
         publishDir = [
@@ -350,3 +374,4 @@ profiles {
     test      { includeConfig 'conf/test.config'      }
     test_full { includeConfig 'conf/test_full.config' }
 }
+includeConfig 'base.config'
diff --git a/modules/nf-core/custom/markduplicates/main.nf b/modules/nf-core/custom/markduplicates/main.nf
new file mode 100644
index 0000000..223fa7c
--- /dev/null
+++ b/modules/nf-core/custom/markduplicates/main.nf
@@ -0,0 +1,54 @@
+process GATK4_MARKDUPLICATES {
+    tag "$meta.id"
+    label 'process_medium'
+
+    conda "bioconda::gatk4=4.4.0.0"
+    container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ?
+        'https://depot.galaxyproject.org/singularity/gatk4:4.4.0.0--py36hdfd78af_0':
+        'quay.io/biocontainers/gatk4:4.4.0.0--py36hdfd78af_0' }"
+
+    input:
+    tuple val(meta), path(bam)
+    path  fasta
+    path  fasta_fai
+
+    output:
+    tuple val(meta), path("*cram"),     emit: cram,  optional: true
+    tuple val(meta), path("*bam"),      emit: bam,   optional: true
+    tuple val(meta), path("*.crai"),    emit: crai,  optional: true
+    tuple val(meta), path("*.bai"),     emit: bai,   optional: true
+    tuple val(meta), path("*.metrics"), emit: metrics
+    path "versions.yml",                emit: versions
+
+    when:
+    task.ext.when == null || task.ext.when
+
+    script:
+    def args = task.ext.args ?: ''
+    prefix = task.ext.prefix ?: "${meta.id}"
+    def input_list = bam.collect{"--INPUT $it"}.join(' ')
+    def reference = fasta ? "--REFERENCE_SEQUENCE ${fasta}" : ""
+
+    def avail_mem = 3072
+    if (!task.memory) {
+        log.info '[GATK MarkDuplicates] Available memory not known - defaulting to 3GB. Specify process memory requirements to change this.'
+    } else {
+        avail_mem = (task.memory.mega*0.8).intValue()
+    }
+    """
+    gatk --java-options "-Xmx${avail_mem}M" MarkDuplicates \\
+        $input_list \\
+        --OUTPUT ${prefix} \\
+        --METRICS_FILE ${prefix}.metrics \\
+        --TMP_DIR . \\
+        ${reference} \\
+        $args
+    if  [[ ${prefix} == *.cram ]]&&[[ -f ${prefix}.bai ]]; then
+        mv ${prefix}.bai ${prefix}.crai
+    fi
+    cat <<-END_VERSIONS > versions.yml
+    "${task.process}":
+        gatk4: \$(echo \$(gatk --version 2>&1) | sed 's/^.*(GATK) v//; s/ .*\$//')
+    END_VERSIONS
+    """
+}
\ No newline at end of file
diff --git a/modules/nf-core/custom/markduplicates/meta.yml b/modules/nf-core/custom/markduplicates/meta.yml
new file mode 100644
index 0000000..ae7443d
--- /dev/null
+++ b/modules/nf-core/custom/markduplicates/meta.yml
@@ -0,0 +1,72 @@
+name: gatk4_markduplicates
+description: This tool locates and tags duplicate reads in a BAM or SAM file, where duplicate reads are defined as originating from a single fragment of DNA.
+keywords:
+  - markduplicates
+  - bam
+  - sort
+tools:
+  - gatk4:
+      description:
+        Developed in the Data Sciences Platform at the Broad Institute, the toolkit offers a wide variety of tools
+        with a primary focus on variant discovery and genotyping. Its powerful processing engine
+        and high-performance computing features make it capable of taking on projects of any size.
+      homepage: https://gatk.broadinstitute.org/hc/en-us
+      documentation: https://gatk.broadinstitute.org/hc/en-us/articles/360037052812-MarkDuplicates-Picard-
+      tool_dev_url: https://github.com/broadinstitute/gatk
+      doi: 10.1158/1538-7445.AM2017-3590
+      licence: ["MIT"]
+
+input:
+  - meta:
+      type: map
+      description: |
+        Groovy Map containing sample information
+        e.g. [ id:'test', single_end:false ]
+  - bam:
+      type: file
+      description: Sorted BAM file
+      pattern: "*.{bam}"
+  - fasta:
+      type: file
+      description: Fasta file
+      pattern: "*.{fasta}"
+  - fasta_fai:
+      type: file
+      description: Fasta index file
+      pattern: "*.{fai}"
+
+output:
+  - meta:
+      type: map
+      description: |
+        Groovy Map containing sample information
+        e.g. [ id:'test', single_end:false ]
+  - versions:
+      type: file
+      description: File containing software versions
+      pattern: "versions.yml"
+  - bam:
+      type: file
+      description: Marked duplicates BAM file
+      pattern: "*.{bam}"
+  - cram:
+      type: file
+      description: Marked duplicates CRAM file
+      pattern: "*.{cram}"
+  - bai:
+      type: file
+      description: BAM index file
+      pattern: "*.{bam.bai}"
+  - crai:
+      type: file
+      description: CRAM index file
+      pattern: "*.{cram.crai}"
+  - metrics:
+      type: file
+      description: Duplicate metrics file generated by GATK
+      pattern: "*.{metrics.txt}"
+
+authors:
+  - "@ajodeh-juma"
+  - "@FriederikeHanssen"
+  - "@maxulysse"
\ No newline at end of file
diff --git a/modules/nf-core/custom/samtools/index/main.nf b/modules/nf-core/custom/samtools/index/main.nf
new file mode 100644
index 0000000..05fa975
--- /dev/null
+++ b/modules/nf-core/custom/samtools/index/main.nf
@@ -0,0 +1,46 @@
+process SAMTOOLS_INDEX {
+    tag "$meta.id"
+    label 'process_low'
+
+    conda "bioconda::samtools=1.17"
+    container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ?
+        'https://depot.galaxyproject.org/singularity/samtools:1.17--h00cdaf9_0' :
+        'quay.io/biocontainers/samtools:1.17--h00cdaf9_0' }"
+
+    input:
+    tuple val(meta), path(input)
+
+    output:
+    tuple val(meta), path("*.bai") , optional:true, emit: bai
+    tuple val(meta), path("*.csi") , optional:true, emit: csi
+    tuple val(meta), path("*.crai"), optional:true, emit: crai
+    path  "versions.yml"           , emit: versions
+
+    when:
+    task.ext.when == null || task.ext.when
+
+    script:
+    def args = task.ext.args ?: ''
+    """
+    samtools \\
+        index \\
+        -@ ${task.cpus-1} \\
+        $args \\
+        $input
+    cat <<-END_VERSIONS > versions.yml
+    "${task.process}":
+        samtools: \$(echo \$(samtools --version 2>&1) | sed 's/^.*samtools //; s/Using.*\$//')
+    END_VERSIONS
+    """
+
+    stub:
+    """
+    touch ${input}.bai
+    touch ${input}.crai
+    touch ${input}.csi
+    cat <<-END_VERSIONS > versions.yml
+    "${task.process}":
+        samtools: \$(echo \$(samtools --version 2>&1) | sed 's/^.*samtools //; s/Using.*\$//')
+    END_VERSIONS
+    """
+}
\ No newline at end of file
diff --git a/modules/nf-core/custom/samtools/index/meta.yml b/modules/nf-core/custom/samtools/index/meta.yml
new file mode 100644
index 0000000..6037b9e
--- /dev/null
+++ b/modules/nf-core/custom/samtools/index/meta.yml
@@ -0,0 +1,53 @@
+name: samtools_index
+description: Index SAM/BAM/CRAM file
+keywords:
+  - index
+  - bam
+  - sam
+  - cram
+tools:
+  - samtools:
+      description: |
+        SAMtools is a set of utilities for interacting with and post-processing
+        short DNA sequence read alignments in the SAM, BAM and CRAM formats, written by Heng Li.
+        These files are generated as output by short read aligners like BWA.
+      homepage: http://www.htslib.org/
+      documentation: http://www.htslib.org/doc/samtools.html
+      doi: 10.1093/bioinformatics/btp352
+      licence: ["MIT"]
+input:
+  - meta:
+      type: map
+      description: |
+        Groovy Map containing sample information
+        e.g. [ id:'test', single_end:false ]
+  - bam:
+      type: file
+      description: BAM/CRAM/SAM file
+      pattern: "*.{bam,cram,sam}"
+output:
+  - meta:
+      type: map
+      description: |
+        Groovy Map containing sample information
+        e.g. [ id:'test', single_end:false ]
+  - bai:
+      type: file
+      description: BAM/CRAM/SAM index file
+      pattern: "*.{bai,crai,sai}"
+  - crai:
+      type: file
+      description: BAM/CRAM/SAM index file
+      pattern: "*.{bai,crai,sai}"
+  - csi:
+      type: file
+      description: CSI index file
+      pattern: "*.{csi}"
+  - versions:
+      type: file
+      description: File containing software versions
+      pattern: "versions.yml"
+authors:
+  - "@drpatelh"
+  - "@ewels"
+  - "@maxulysse"
\ No newline at end of file
diff --git a/modules/nf-core/custom/sort/main.nf b/modules/nf-core/custom/sort/main.nf
new file mode 100644
index 0000000..f569257
--- /dev/null
+++ b/modules/nf-core/custom/sort/main.nf
@@ -0,0 +1,49 @@
+process SAMTOOLS_SORT {
+    tag "$meta.id"
+    label 'process_high' //medium
+
+    conda "bioconda::samtools=1.17"
+    container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ?
+        'https://depot.galaxyproject.org/singularity/samtools:1.17--h00cdaf9_0' :
+        'quay.io/biocontainers/samtools:1.17--h00cdaf9_0' }"
+
+    input:
+    tuple val(meta), path(bam)
+
+    output:
+    tuple val(meta), path("*.bam"), emit: bam
+    tuple val(meta), path("*.csi"), emit: csi, optional: true
+    path  "versions.yml"          , emit: versions
+
+    when:
+    task.ext.when == null || task.ext.when
+
+    script:
+    def args = task.ext.args ?: ''
+    def prefix = task.ext.prefix ?: "${meta.id}"
+    def sort_memory = (task.memory.mega/task.cpus).intValue()
+    if ("$bam" == "${prefix}.bam") error "Input and output names are the same, use \"task.ext.prefix\" to disambiguate!"
+    """
+    samtools sort \\
+        $args \\
+        -@ $task.cpus \\
+        -m ${sort_memory}M \\
+        -o ${prefix}.bam \\
+        -T $prefix \\
+        $bam
+    cat <<-END_VERSIONS > versions.yml
+    "${task.process}":
+        samtools: \$(echo \$(samtools --version 2>&1) | sed 's/^.*samtools //; s/Using.*\$//')
+    END_VERSIONS
+    """
+
+    stub:
+    def prefix = task.ext.prefix ?: "${meta.id}"
+    """
+    touch ${prefix}.bam
+    cat <<-END_VERSIONS > versions.yml
+    "${task.process}":
+        samtools: \$(echo \$(samtools --version 2>&1) | sed 's/^.*samtools //; s/Using.*\$//')
+    END_VERSIONS
+    """
+}
\ No newline at end of file
diff --git a/modules/nf-core/custom/sort/meta.yml b/modules/nf-core/custom/sort/meta.yml
new file mode 100644
index 0000000..1587857
--- /dev/null
+++ b/modules/nf-core/custom/sort/meta.yml
@@ -0,0 +1,48 @@
+name: samtools_sort
+description: Sort SAM/BAM/CRAM file
+keywords:
+  - sort
+  - bam
+  - sam
+  - cram
+tools:
+  - samtools:
+      description: |
+        SAMtools is a set of utilities for interacting with and post-processing
+        short DNA sequence read alignments in the SAM, BAM and CRAM formats, written by Heng Li.
+        These files are generated as output by short read aligners like BWA.
+      homepage: http://www.htslib.org/
+      documentation: http://www.htslib.org/doc/samtools.html
+      doi: 10.1093/bioinformatics/btp352
+      licence: ["MIT"]
+input:
+  - meta:
+      type: map
+      description: |
+        Groovy Map containing sample information
+        e.g. [ id:'test', single_end:false ]
+  - bam:
+      type: file
+      description: BAM/CRAM/SAM file
+      pattern: "*.{bam,cram,sam}"
+output:
+  - meta:
+      type: map
+      description: |
+        Groovy Map containing sample information
+        e.g. [ id:'test', single_end:false ]
+  - bam:
+      type: file
+      description: Sorted BAM/CRAM/SAM file
+      pattern: "*.{bam,cram,sam}"
+  - versions:
+      type: file
+      description: File containing software versions
+      pattern: "versions.yml"
+  - csi:
+      type: file
+      description: BAM index file (optional)
+      pattern: "*.csi"
+authors:
+  - "@drpatelh"
+  - "@ewels"
\ No newline at end of file
diff --git a/subworkflows/local/hicstuff_sub.nf b/subworkflows/local/hicstuff_sub.nf
index befd3e5..92c5181 100644
--- a/subworkflows/local/hicstuff_sub.nf
+++ b/subworkflows/local/hicstuff_sub.nf
@@ -8,6 +8,9 @@ include { BUILD_MATRIX_COOL_ALT } from '../../modules/local/hicstuff/build_matri
 include { FILTER_EVENT } from '../../modules/local/hicstuff/filter_event'
 include { DISTANCE_LAW } from '../../modules/local/hicstuff/distance_law'
 include { FILTER_PCR } from '../../modules/local/hicstuff/filter_pcr'
+include { GATK4_MARKDUPLICATES } from '../../modules/nf-core/custom/gatk4/markduplicates/main'
+include { SAMTOOLS_SORT } from '../../modules/nf-core/custom/samtools/sort/main'
+include { SAMTOOLS_INDEX } from '../../modules/nf-core/custom/samtools/index/main'
 
 // Paired-end to Single-end
 def pairToSingle(row, mates) {
@@ -56,11 +59,36 @@ workflow HICSTUFF_SUB {
         fasta
     )
 
+    if (params.filter_pcr && params.filter_pcr_picard ){
+        error "Error: filter_pcr and filter_pcr_picard can't both be true at the same time! Set one of them false in the config file"
+    }
+    else if (params.filter_pcr_picard){
+        SAMTOOLS_SORT(
+            BOWTIE2_ALIGNMENT.out.bam
+        )
+
+        SAMTOOLS_INDEX(
+            SAMTOOLS_SORT.out.bam
+        )
+
+        GATK4_MARKDUPLICATES(
+            SAMTOOLS_SORT.out.bam,
+            fasta.map{ it[1] }.collect(),
+            index.map{ it[1] }.collect()
+        )
+        GATK4_MARKDUPLICATES.out.bam.set{ ch_bam }
+
+    }
+    else {
+
+        BOWTIE2_ALIGNMENT.out.bam.set{ ch_bam }
+
+    }
     BAM2PAIRS(
-        BOWTIE2_ALIGNMENT.out.bam.combine(BOWTIE2_ALIGNMENT.out.bam)
-            .map {
-                meta1, bam1, meta2, bam2 ->
-                    meta1.id == meta2.id && meta1.chunk == meta2.chunk && meta1.mates == "R1" && meta2.mates == "R2" ? [ meta1,  bam1,  meta2, bam2 ] : null
+        ch_bam.combine(ch_bam)
+        .map {
+            meta1, bam1, meta2, bam2 ->
+                meta1.id == meta2.id && meta1.chunk == meta2.chunk && meta1.mates == "R1" && meta2.mates == "R2" ? [ meta1,  bam1,  meta2, bam2 ] : null
         },
         FRAGMENT_ENZYME.out.info_contigs.collect(),
         digestion,
-- 
GitLab