From f8fcb912049ad6a0ae7529aa785ea0d198810667 Mon Sep 17 00:00:00 2001 From: Mia Croiset <mia.croiset@ens-lyon.fr> Date: Thu, 1 Jun 2023 15:58:27 +0200 Subject: [PATCH] TO FIX markduplicates pcr filter option --- conf/base.config | 36 +++++++++- conf/hicstuff.config | 33 +++++++-- modules/nf-core/custom/markduplicates/main.nf | 54 ++++++++++++++ .../nf-core/custom/markduplicates/meta.yml | 72 +++++++++++++++++++ modules/nf-core/custom/samtools/index/main.nf | 46 ++++++++++++ .../nf-core/custom/samtools/index/meta.yml | 53 ++++++++++++++ modules/nf-core/custom/sort/main.nf | 49 +++++++++++++ modules/nf-core/custom/sort/meta.yml | 48 +++++++++++++ subworkflows/local/hicstuff_sub.nf | 36 ++++++++-- 9 files changed, 417 insertions(+), 10 deletions(-) create mode 100644 modules/nf-core/custom/markduplicates/main.nf create mode 100644 modules/nf-core/custom/markduplicates/meta.yml create mode 100644 modules/nf-core/custom/samtools/index/main.nf create mode 100644 modules/nf-core/custom/samtools/index/meta.yml create mode 100644 modules/nf-core/custom/sort/main.nf create mode 100644 modules/nf-core/custom/sort/meta.yml diff --git a/conf/base.config b/conf/base.config index 6808dbe..e57c358 100644 --- a/conf/base.config +++ b/conf/base.config @@ -40,8 +40,8 @@ process { time = { check_max( 8.h * task.attempt, 'time' ) } } withLabel:process_high { - cpus = { check_max( 12 * task.attempt, 'cpus' ) } - memory = { check_max( 64.GB * task.attempt, 'memory' ) } + cpus = { check_max( 8 * task.attempt, 'cpus' ) } //TODO go back to 16 when not local + memory = { check_max( 31.GB * task.attempt, 'memory' ) }//TODO go back to 64 when not local time = { check_max( 16.h * task.attempt, 'time' ) } } withLabel:process_long { @@ -61,3 +61,35 @@ process { cache = false } } +// Function to ensure that resource requirements don't go beyond +// a maximum limit +def check_max(obj, type) { + if (type == 'memory') { + try { + if (obj.compareTo(params.max_memory as nextflow.util.MemoryUnit) == 1) 
+ return params.max_memory as nextflow.util.MemoryUnit + else + return obj + } catch (all) { + println " ### ERROR ### Max memory '${params.max_memory}' is not valid! Using default value: $obj" + return obj + } + } else if (type == 'time') { + try { + if (obj.compareTo(params.max_time as nextflow.util.Duration) == 1) + return params.max_time as nextflow.util.Duration + else + return obj + } catch (all) { + println " ### ERROR ### Max time '${params.max_time}' is not valid! Using default value: $obj" + return obj + } + } else if (type == 'cpus') { + try { + return Math.min( obj, params.max_cpus as int ) + } catch (all) { + println " ### ERROR ### Max cpus '${params.max_cpus}' is not valid! Using default value: $obj" + return obj + } + } +} diff --git a/conf/hicstuff.config b/conf/hicstuff.config index 0f27d39..82360ec 100644 --- a/conf/hicstuff.config +++ b/conf/hicstuff.config @@ -144,10 +144,10 @@ params { hicstuff_filter_pcr_out_file = 'valid_idx_pcrfree.pairs' //Hicstuff optional modules - filter_event = true - distance_law = true - filter_pcr = true - filter_pcr_picard = false + filter_event = false + distance_law = false + filter_pcr = false + filter_pcr_picard = true } process { @@ -234,6 +234,30 @@ process { ] } + withName: 'GATK4_MARKDUPLICATES' { + ext.prefix = { "${meta.id}_${meta.chunk}_${meta.mates}.bam" } + ext.args = { [ + "--REMOVE_DUPLICATES true" + ].join(' ').trim() } + publishDir = [ + path: { "${params.outdir}/gatk4/bam" }, + mode: 'copy' + ] + } + + withName: 'SAMTOOLS_SORT' { + ext.prefix = { "${meta.id}_${meta.chunk}_${meta.mates}_sorted" } + ext.args = { [ + "" + ].join(' ').trim() } + } + + withName: 'SAMTOOLS_INDEX' { + ext.args = { [ + "" + ].join(' ').trim() } + } + withName: 'BUILD_MATRIX' { ext.args = params.hicstuff_matrix publishDir = [ @@ -350,3 +374,4 @@ profiles { test { includeConfig 'conf/test.config' } test_full { includeConfig 'conf/test_full.config' } } +includeConfig 'base.config' diff --git 
a/modules/nf-core/custom/markduplicates/main.nf b/modules/nf-core/custom/markduplicates/main.nf new file mode 100644 index 0000000..223fa7c --- /dev/null +++ b/modules/nf-core/custom/markduplicates/main.nf @@ -0,0 +1,54 @@ +process GATK4_MARKDUPLICATES { + tag "$meta.id" + label 'process_medium' + + conda "bioconda::gatk4=4.4.0.0" + container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? + 'https://depot.galaxyproject.org/singularity/gatk4:4.4.0.0--py36hdfd78af_0': + 'quay.io/biocontainers/gatk4:4.4.0.0--py36hdfd78af_0' }" + + input: + tuple val(meta), path(bam) + path fasta + path fasta_fai + + output: + tuple val(meta), path("*cram"), emit: cram, optional: true + tuple val(meta), path("*bam"), emit: bam, optional: true + tuple val(meta), path("*.crai"), emit: crai, optional: true + tuple val(meta), path("*.bai"), emit: bai, optional: true + tuple val(meta), path("*.metrics"), emit: metrics + path "versions.yml", emit: versions + + when: + task.ext.when == null || task.ext.when + + script: + def args = task.ext.args ?: '' + prefix = task.ext.prefix ?: "${meta.id}" + def input_list = bam.collect{"--INPUT $it"}.join(' ') + def reference = fasta ? "--REFERENCE_SEQUENCE ${fasta}" : "" + + def avail_mem = 3072 + if (!task.memory) { + log.info '[GATK MarkDuplicates] Available memory not known - defaulting to 3GB. Specify process memory requirements to change this.' + } else { + avail_mem = (task.memory.mega*0.8).intValue() + } + """ + gatk --java-options "-Xmx${avail_mem}M" MarkDuplicates \\ + $input_list \\ + --OUTPUT ${prefix} \\ + --METRICS_FILE ${prefix}.metrics \\ + --TMP_DIR . 
\\ + ${reference} \\ + $args + if [[ ${prefix} == *.cram ]]&&[[ -f ${prefix}.bai ]]; then + mv ${prefix}.bai ${prefix}.crai + fi + cat <<-END_VERSIONS > versions.yml + "${task.process}": + gatk4: \$(echo \$(gatk --version 2>&1) | sed 's/^.*(GATK) v//; s/ .*\$//') + END_VERSIONS + """ +} \ No newline at end of file diff --git a/modules/nf-core/custom/markduplicates/meta.yml b/modules/nf-core/custom/markduplicates/meta.yml new file mode 100644 index 0000000..ae7443d --- /dev/null +++ b/modules/nf-core/custom/markduplicates/meta.yml @@ -0,0 +1,72 @@ +name: gatk4_markduplicates +description: This tool locates and tags duplicate reads in a BAM or SAM file, where duplicate reads are defined as originating from a single fragment of DNA. +keywords: + - markduplicates + - bam + - sort +tools: + - gatk4: + description: + Developed in the Data Sciences Platform at the Broad Institute, the toolkit offers a wide variety of tools + with a primary focus on variant discovery and genotyping. Its powerful processing engine + and high-performance computing features make it capable of taking on projects of any size. + homepage: https://gatk.broadinstitute.org/hc/en-us + documentation: https://gatk.broadinstitute.org/hc/en-us/articles/360037052812-MarkDuplicates-Picard- + tool_dev_url: https://github.com/broadinstitute/gatk + doi: 10.1158/1538-7445.AM2017-3590 + licence: ["MIT"] + +input: + - meta: + type: map + description: | + Groovy Map containing sample information + e.g. [ id:'test', single_end:false ] + - bam: + type: file + description: Sorted BAM file + pattern: "*.{bam}" + - fasta: + type: file + description: Fasta file + pattern: "*.{fasta}" + - fasta_fai: + type: file + description: Fasta index file + pattern: "*.{fai}" + +output: + - meta: + type: map + description: | + Groovy Map containing sample information + e.g. 
[ id:'test', single_end:false ] + - versions: + type: file + description: File containing software versions + pattern: "versions.yml" + - bam: + type: file + description: Marked duplicates BAM file + pattern: "*.{bam}" + - cram: + type: file + description: Marked duplicates CRAM file + pattern: "*.{cram}" + - bai: + type: file + description: BAM index file + pattern: "*.{bam.bai}" + - crai: + type: file + description: CRAM index file + pattern: "*.{cram.crai}" + - metrics: + type: file + description: Duplicate metrics file generated by GATK + pattern: "*.{metrics.txt}" + +authors: + - "@ajodeh-juma" + - "@FriederikeHanssen" + - "@maxulysse" \ No newline at end of file diff --git a/modules/nf-core/custom/samtools/index/main.nf b/modules/nf-core/custom/samtools/index/main.nf new file mode 100644 index 0000000..05fa975 --- /dev/null +++ b/modules/nf-core/custom/samtools/index/main.nf @@ -0,0 +1,46 @@ +process SAMTOOLS_INDEX { + tag "$meta.id" + label 'process_low' + + conda "bioconda::samtools=1.17" + container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? 
+ 'https://depot.galaxyproject.org/singularity/samtools:1.17--h00cdaf9_0' : + 'quay.io/biocontainers/samtools:1.17--h00cdaf9_0' }" + + input: + tuple val(meta), path(input) + + output: + tuple val(meta), path("*.bai") , optional:true, emit: bai + tuple val(meta), path("*.csi") , optional:true, emit: csi + tuple val(meta), path("*.crai"), optional:true, emit: crai + path "versions.yml" , emit: versions + + when: + task.ext.when == null || task.ext.when + + script: + def args = task.ext.args ?: '' + """ + samtools \\ + index \\ + -@ ${task.cpus-1} \\ + $args \\ + $input + cat <<-END_VERSIONS > versions.yml + "${task.process}": + samtools: \$(echo \$(samtools --version 2>&1) | sed 's/^.*samtools //; s/Using.*\$//') + END_VERSIONS + """ + + stub: + """ + touch ${input}.bai + touch ${input}.crai + touch ${input}.csi + cat <<-END_VERSIONS > versions.yml + "${task.process}": + samtools: \$(echo \$(samtools --version 2>&1) | sed 's/^.*samtools //; s/Using.*\$//') + END_VERSIONS + """ +} \ No newline at end of file diff --git a/modules/nf-core/custom/samtools/index/meta.yml b/modules/nf-core/custom/samtools/index/meta.yml new file mode 100644 index 0000000..6037b9e --- /dev/null +++ b/modules/nf-core/custom/samtools/index/meta.yml @@ -0,0 +1,53 @@ +name: samtools_index +description: Index SAM/BAM/CRAM file +keywords: + - index + - bam + - sam + - cram +tools: + - samtools: + description: | + SAMtools is a set of utilities for interacting with and post-processing + short DNA sequence read alignments in the SAM, BAM and CRAM formats, written by Heng Li. + These files are generated as output by short read aligners like BWA. + homepage: http://www.htslib.org/ + documentation: http://www.htslib.org/doc/samtools.html + doi: 10.1093/bioinformatics/btp352 + licence: ["MIT"] +input: + - meta: + type: map + description: | + Groovy Map containing sample information + e.g. 
[ id:'test', single_end:false ] + - bam: + type: file + description: BAM/CRAM/SAM file + pattern: "*.{bam,cram,sam}" +output: + - meta: + type: map + description: | + Groovy Map containing sample information + e.g. [ id:'test', single_end:false ] + - bai: + type: file + description: BAM/CRAM/SAM index file + pattern: "*.{bai,crai,sai}" + - crai: + type: file + description: BAM/CRAM/SAM index file + pattern: "*.{bai,crai,sai}" + - csi: + type: file + description: CSI index file + pattern: "*.{csi}" + - versions: + type: file + description: File containing software versions + pattern: "versions.yml" +authors: + - "@drpatelh" + - "@ewels" + - "@maxulysse" \ No newline at end of file diff --git a/modules/nf-core/custom/sort/main.nf b/modules/nf-core/custom/sort/main.nf new file mode 100644 index 0000000..f569257 --- /dev/null +++ b/modules/nf-core/custom/sort/main.nf @@ -0,0 +1,49 @@ +process SAMTOOLS_SORT { + tag "$meta.id" + label 'process_high' //medium + + conda "bioconda::samtools=1.17" + container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? + 'https://depot.galaxyproject.org/singularity/samtools:1.17--h00cdaf9_0' : + 'quay.io/biocontainers/samtools:1.17--h00cdaf9_0' }" + + input: + tuple val(meta), path(bam) + + output: + tuple val(meta), path("*.bam"), emit: bam + tuple val(meta), path("*.csi"), emit: csi, optional: true + path "versions.yml" , emit: versions + + when: + task.ext.when == null || task.ext.when + + script: + def args = task.ext.args ?: '' + def prefix = task.ext.prefix ?: "${meta.id}" + def sort_memory = (task.memory.mega/task.cpus).intValue() + if ("$bam" == "${prefix}.bam") error "Input and output names are the same, use \"task.ext.prefix\" to disambiguate!" 
+ """ + samtools sort \\ + $args \\ + -@ $task.cpus \\ + -m ${sort_memory}M \\ + -o ${prefix}.bam \\ + -T $prefix \\ + $bam + cat <<-END_VERSIONS > versions.yml + "${task.process}": + samtools: \$(echo \$(samtools --version 2>&1) | sed 's/^.*samtools //; s/Using.*\$//') + END_VERSIONS + """ + + stub: + def prefix = task.ext.prefix ?: "${meta.id}" + """ + touch ${prefix}.bam + cat <<-END_VERSIONS > versions.yml + "${task.process}": + samtools: \$(echo \$(samtools --version 2>&1) | sed 's/^.*samtools //; s/Using.*\$//') + END_VERSIONS + """ +} \ No newline at end of file diff --git a/modules/nf-core/custom/sort/meta.yml b/modules/nf-core/custom/sort/meta.yml new file mode 100644 index 0000000..1587857 --- /dev/null +++ b/modules/nf-core/custom/sort/meta.yml @@ -0,0 +1,48 @@ +name: samtools_sort +description: Sort SAM/BAM/CRAM file +keywords: + - sort + - bam + - sam + - cram +tools: + - samtools: + description: | + SAMtools is a set of utilities for interacting with and post-processing + short DNA sequence read alignments in the SAM, BAM and CRAM formats, written by Heng Li. + These files are generated as output by short read aligners like BWA. + homepage: http://www.htslib.org/ + documentation: http://www.htslib.org/doc/samtools.html + doi: 10.1093/bioinformatics/btp352 + licence: ["MIT"] +input: + - meta: + type: map + description: | + Groovy Map containing sample information + e.g. [ id:'test', single_end:false ] + - bam: + type: file + description: BAM/CRAM/SAM file + pattern: "*.{bam,cram,sam}" +output: + - meta: + type: map + description: | + Groovy Map containing sample information + e.g. 
[ id:'test', single_end:false ] + - bam: + type: file + description: Sorted BAM/CRAM/SAM file + pattern: "*.{bam,cram,sam}" + - versions: + type: file + description: File containing software versions + pattern: "versions.yml" + - csi: + type: file + description: BAM index file (optional) + pattern: "*.csi" +authors: + - "@drpatelh" + - "@ewels" \ No newline at end of file diff --git a/subworkflows/local/hicstuff_sub.nf b/subworkflows/local/hicstuff_sub.nf index befd3e5..92c5181 100644 --- a/subworkflows/local/hicstuff_sub.nf +++ b/subworkflows/local/hicstuff_sub.nf @@ -8,6 +8,9 @@ include { BUILD_MATRIX_COOL_ALT } from '../../modules/local/hicstuff/build_matri include { FILTER_EVENT } from '../../modules/local/hicstuff/filter_event' include { DISTANCE_LAW } from '../../modules/local/hicstuff/distance_law' include { FILTER_PCR } from '../../modules/local/hicstuff/filter_pcr' +include { GATK4_MARKDUPLICATES } from '../../modules/nf-core/custom/markduplicates/main' +include { SAMTOOLS_SORT } from '../../modules/nf-core/custom/sort/main' +include { SAMTOOLS_INDEX } from '../../modules/nf-core/custom/samtools/index/main' // Paired-end to Single-end def pairToSingle(row, mates) { @@ -56,11 +59,36 @@ workflow HICSTUFF_SUB { fasta ) + if (params.filter_pcr && params.filter_pcr_picard ){ + error "Error: filter_pcr and filter_pcr_picard can't both be true at the same time! 
Set one of them false in the config file" + } + else if (params.filter_pcr_picard){ + SAMTOOLS_SORT( + BOWTIE2_ALIGNMENT.out.bam + ) + + SAMTOOLS_INDEX( + SAMTOOLS_SORT.out.bam + ) + + GATK4_MARKDUPLICATES( + SAMTOOLS_SORT.out.bam, + fasta.map{ it[1] }.collect(), + index.map{ it[1] }.collect() + ) + GATK4_MARKDUPLICATES.out.bam.set{ ch_bam } + + } + else { + + BOWTIE2_ALIGNMENT.out.bam.set{ ch_bam } + + } BAM2PAIRS( - BOWTIE2_ALIGNMENT.out.bam.combine(BOWTIE2_ALIGNMENT.out.bam) - .map { - meta1, bam1, meta2, bam2 -> - meta1.id == meta2.id && meta1.chunk == meta2.chunk && meta1.mates == "R1" && meta2.mates == "R2" ? [ meta1, bam1, meta2, bam2 ] : null + ch_bam.combine(ch_bam) + .map { + meta1, bam1, meta2, bam2 -> + meta1.id == meta2.id && meta1.chunk == meta2.chunk && meta1.mates == "R1" && meta2.mates == "R2" ? [ meta1, bam1, meta2, bam2 ] : null }, FRAGMENT_ENZYME.out.info_contigs.collect(), digestion, -- GitLab