From b607dc0eee8ef8b98fdc9fb151ca75b4c1d88c4c Mon Sep 17 00:00:00 2001 From: Mia Croiset <mia.croiset@ens-lyon.fr> Date: Fri, 2 Jun 2023 16:45:26 +0200 Subject: [PATCH] TO FIX markduplicates picard --- conf/hicstuff.config | 22 ++++++ .../custom/{ => gatk4}/markduplicates/main.nf | 0 .../{ => gatk4}/markduplicates/meta.yml | 0 .../custom/picard/markduplicates/main.nf | 59 +++++++++++++++ .../custom/picard/markduplicates/meta.yml | 71 +++++++++++++++++++ .../nf-core/custom/samtools_n/sort/main.nf | 49 +++++++++++++ .../nf-core/custom/samtools_n/sort/meta.yml | 48 +++++++++++++ subworkflows/local/hicstuff_sub.nf | 34 +++++++-- 8 files changed, 277 insertions(+), 6 deletions(-) rename modules/nf-core/custom/{ => gatk4}/markduplicates/main.nf (100%) rename modules/nf-core/custom/{ => gatk4}/markduplicates/meta.yml (100%) create mode 100644 modules/nf-core/custom/picard/markduplicates/main.nf create mode 100644 modules/nf-core/custom/picard/markduplicates/meta.yml create mode 100644 modules/nf-core/custom/samtools_n/sort/main.nf create mode 100644 modules/nf-core/custom/samtools_n/sort/meta.yml diff --git a/conf/hicstuff.config b/conf/hicstuff.config index 82360ec..3f5cdf6 100644 --- a/conf/hicstuff.config +++ b/conf/hicstuff.config @@ -245,6 +245,17 @@ process { ] } + withName: 'PICARD_MARKDUPLICATES' { + ext.prefix = { "${meta.id}_${meta.chunk}_${meta.mates}" } + ext.args = { [ + "--REMOVE_DUPLICATES true" + ].join('').trim() } + publishDir = [ + path: { "${params.outdir}/picard/bam" }, + mode: 'copy' + ] + } + withName: 'SAMTOOLS_SORT' { ext.prefix = { "${meta.id}_${meta.chunk}_${meta.mates}_sorted" } ext.args = { [ @@ -252,6 +263,17 @@ process { ].join('').trim() } } + withName: 'SAMTOOLS_SORT_N' { + ext.prefix = { "${meta.id}_${meta.chunk}_${meta.mates}_nsorted" } + ext.args = { [ + "-n" + ].join('').trim() } + publishDir = [ + path: { "${params.outdir}/gatk4/bam" }, + mode: 'copy' + ] + } + withName: 'SAMTOOLS_INDEX' { ext.args = { [ "" diff --git a/modules/nf-core/custom/markduplicates/main.nf b/modules/nf-core/custom/gatk4/markduplicates/main.nf similarity index 100% rename from modules/nf-core/custom/markduplicates/main.nf rename to modules/nf-core/custom/gatk4/markduplicates/main.nf diff --git a/modules/nf-core/custom/markduplicates/meta.yml b/modules/nf-core/custom/gatk4/markduplicates/meta.yml similarity index 100% rename from modules/nf-core/custom/markduplicates/meta.yml rename to modules/nf-core/custom/gatk4/markduplicates/meta.yml diff --git a/modules/nf-core/custom/picard/markduplicates/main.nf b/modules/nf-core/custom/picard/markduplicates/main.nf new file mode 100644 index 0000000..b89101b --- /dev/null +++ b/modules/nf-core/custom/picard/markduplicates/main.nf @@ -0,0 +1,59 @@ +process PICARD_MARKDUPLICATES { + tag "$meta.id" + label 'process_medium' + + conda "bioconda::picard=3.0.0" + container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? + 'https://depot.galaxyproject.org/singularity/picard:3.0.0--hdfd78af_1' : + 'quay.io/biocontainers/picard:3.0.0--hdfd78af_1' }" + + input: + tuple val(meta), path(bam) + tuple val(meta2), path(fasta) + tuple val(meta3), path(fai) + + output: + tuple val(meta), path("*.bam") , emit: bam + tuple val(meta), path("*.bai") , optional:true, emit: bai + tuple val(meta), path("*.metrics.txt"), emit: metrics + path "versions.yml" , emit: versions + + when: + task.ext.when == null || task.ext.when + + script: + def args = task.ext.args ?: '' + def prefix = task.ext.prefix ?: "${meta.id}" + def avail_mem = 3072 + if (!task.memory) { + log.info '[Picard MarkDuplicates] Available memory not known - defaulting to 3GB. Specify process memory requirements to change this.' + } else { + avail_mem = (task.memory.mega*0.8).intValue() + } + """ + picard \\ + -Xmx${avail_mem}M \\ + MarkDuplicates \\ + $args \\ + --INPUT $bam \\ + --OUTPUT ${prefix}.bam \\ + --REFERENCE_SEQUENCE $fasta \\ + --METRICS_FILE ${prefix}.MarkDuplicates.metrics.txt + cat <<-END_VERSIONS > versions.yml + "${task.process}": + picard: \$(echo \$(picard MarkDuplicates --version 2>&1) | grep -o 'Version:.*' | cut -f2- -d:) + END_VERSIONS + """ + + stub: + def prefix = task.ext.prefix ?: "${meta.id}" + """ + touch ${prefix}.bam + touch ${prefix}.bam.bai + touch ${prefix}.MarkDuplicates.metrics.txt + cat <<-END_VERSIONS > versions.yml + "${task.process}": + picard: \$(echo \$(picard MarkDuplicates --version 2>&1) | grep -o 'Version:.*' | cut -f2- -d:) + END_VERSIONS + """ +} \ No newline at end of file diff --git a/modules/nf-core/custom/picard/markduplicates/meta.yml b/modules/nf-core/custom/picard/markduplicates/meta.yml new file mode 100644 index 0000000..e463328 --- /dev/null +++ b/modules/nf-core/custom/picard/markduplicates/meta.yml @@ -0,0 +1,71 @@ +name: picard_markduplicates +description: Locate and tag duplicate reads in a BAM file +keywords: + - markduplicates + - pcr + - duplicates + - bam + - sam + - cram +tools: + - picard: + description: | + A set of command line tools (in Java) for manipulating high-throughput sequencing (HTS) + data and formats such as SAM/BAM/CRAM and VCF. + homepage: https://broadinstitute.github.io/picard/ + documentation: https://broadinstitute.github.io/picard/ + licence: ["MIT"] +input: + - meta: + type: map + description: | + Groovy Map containing sample information + e.g. [ id:'test', single_end:false ] + - bam: + type: file + description: BAM file + pattern: "*.{bam,cram,sam}" + - meta2: + type: map + description: | + Groovy Map containing reference information + e.g. [ id:'genome' ] + - fasta: + type: file + description: Reference genome fasta file + pattern: "*.{fasta,fa}" + - meta3: + type: map + description: | + Groovy Map containing reference information + e.g. [ id:'genome' ] + - fai: + type: file + description: Reference genome fasta index + pattern: "*.{fai}" +output: + - meta: + type: map + description: | + Groovy Map containing sample information + e.g. [ id:'test', single_end:false ] + - bam: + type: file + description: BAM file with duplicate reads marked/removed + pattern: "*.{bam}" + - bai: + type: file + description: An optional BAM index file. If desired, --CREATE_INDEX must be passed as a flag + pattern: "*.{bai}" + - metrics: + type: file + description: Duplicate metrics file generated by picard + pattern: "*.{metrics.txt}" + - versions: + type: file + description: File containing software versions + pattern: "versions.yml" +authors: + - "@drpatelh" + - "@projectoriented" + - "@ramprasadn" \ No newline at end of file diff --git a/modules/nf-core/custom/samtools_n/sort/main.nf b/modules/nf-core/custom/samtools_n/sort/main.nf new file mode 100644 index 0000000..59a6e2a --- /dev/null +++ b/modules/nf-core/custom/samtools_n/sort/main.nf @@ -0,0 +1,49 @@ +process SAMTOOLS_SORT_N { + tag "$meta.id" + label 'process_high' //medium + + conda "bioconda::samtools=1.17" + container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? + 'https://depot.galaxyproject.org/singularity/samtools:1.17--h00cdaf9_0' : + 'quay.io/biocontainers/samtools:1.17--h00cdaf9_0' }" + + input: + tuple val(meta), path(bam) + + output: + tuple val(meta), path("*.bam"), emit: bam + tuple val(meta), path("*.csi"), emit: csi, optional: true + path "versions.yml" , emit: versions + + when: + task.ext.when == null || task.ext.when + + script: + def args = task.ext.args ?: '' + def prefix = task.ext.prefix ?: "${meta.id}" + def sort_memory = (task.memory.mega/task.cpus).intValue() + if ("$bam" == "${prefix}.bam") error "Input and output names are the same, use \"task.ext.prefix\" to disambiguate!" + """ + samtools sort \\ + $args \\ + -@ $task.cpus \\ + -m ${sort_memory}M \\ + -o ${prefix}.bam \\ + -T $prefix \\ + $bam + cat <<-END_VERSIONS > versions.yml + "${task.process}": + samtools: \$(echo \$(samtools --version 2>&1) | sed 's/^.*samtools //; s/Using.*\$//') + END_VERSIONS + """ + + stub: + def prefix = task.ext.prefix ?: "${meta.id}" + """ + touch ${prefix}.bam + cat <<-END_VERSIONS > versions.yml + "${task.process}": + samtools: \$(echo \$(samtools --version 2>&1) | sed 's/^.*samtools //; s/Using.*\$//') + END_VERSIONS + """ +} \ No newline at end of file diff --git a/modules/nf-core/custom/samtools_n/sort/meta.yml b/modules/nf-core/custom/samtools_n/sort/meta.yml new file mode 100644 index 0000000..1587857 --- /dev/null +++ b/modules/nf-core/custom/samtools_n/sort/meta.yml @@ -0,0 +1,48 @@ +name: samtools_sort +description: Sort SAM/BAM/CRAM file +keywords: + - sort + - bam + - sam + - cram +tools: + - samtools: + description: | + SAMtools is a set of utilities for interacting with and post-processing + short DNA sequence read alignments in the SAM, BAM and CRAM formats, written by Heng Li. + These files are generated as output by short read aligners like BWA. + homepage: http://www.htslib.org/ + documentation: http://www.htslib.org/doc/samtools.html + doi: 10.1093/bioinformatics/btp352 + licence: ["MIT"] +input: + - meta: + type: map + description: | + Groovy Map containing sample information + e.g. [ id:'test', single_end:false ] + - bam: + type: file + description: BAM/CRAM/SAM file + pattern: "*.{bam,cram,sam}" +output: + - meta: + type: map + description: | + Groovy Map containing sample information + e.g. [ id:'test', single_end:false ] + - bam: + type: file + description: Sorted BAM/CRAM/SAM file + pattern: "*.{bam,cram,sam}" + - versions: + type: file + description: File containing software versions + pattern: "versions.yml" + - csi: + type: file + description: BAM index file (optional) + pattern: "*.csi" +authors: + - "@drpatelh" + - "@ewels" \ No newline at end of file diff --git a/subworkflows/local/hicstuff_sub.nf b/subworkflows/local/hicstuff_sub.nf index 71b0053..69f04de 100644 --- a/subworkflows/local/hicstuff_sub.nf +++ b/subworkflows/local/hicstuff_sub.nf @@ -8,9 +8,11 @@ include { BUILD_MATRIX_COOL_ALT } from '../../modules/local/hicstuff/build_matri include { FILTER_EVENT } from '../../modules/local/hicstuff/filter_event' include { DISTANCE_LAW } from '../../modules/local/hicstuff/distance_law' include { FILTER_PCR } from '../../modules/local/hicstuff/filter_pcr' -include { GATK4_MARKDUPLICATES } from '../../modules/nf-core/custom/markduplicates/main' +include { GATK4_MARKDUPLICATES } from '../../modules/nf-core/custom/gatk4/markduplicates/main' include { SAMTOOLS_SORT } from '../../modules/nf-core/custom/samtools/sort/main' +include { SAMTOOLS_SORT_N } from '../../modules/nf-core/custom/samtools_n/sort/main' include { SAMTOOLS_INDEX } from '../../modules/nf-core/custom/samtools/index/main' +include { PICARD_MARKDUPLICATES } from '../../modules/nf-core/custom/picard/markduplicates/main' // Paired-end to Single-end def pairToSingle(row, mates) { @@ -67,16 +69,36 @@ workflow HICSTUFF_SUB { BOWTIE2_ALIGNMENT.out.bam ) +/* GATK4_MARKDUPLICATES( + SAMTOOLS_SORT.out.bam, + fasta.map{ it[1] }.collect(), + index.map{ it[1] }.collect() + ) + + SAMTOOLS_SORT_N( + GATK4_MARKDUPLICATES.out.bam + ) + SAMTOOLS_INDEX( - SAMTOOLS_SORT.out.bam + SAMTOOLS_SORT_N.out.bam ) + */ - GATK4_MARKDUPLICATES( + PICARD_MARKDUPLICATES( SAMTOOLS_SORT.out.bam, - fasta.map{ it[1] }.collect(), - index.map{ it[1] }.collect() + fasta.collect(), + index.collect() + ) + + SAMTOOLS_SORT_N( + PICARD_MARKDUPLICATES.out.bam ) - GATK4_MARKDUPLICATES.out.bam.set{ ch_bam } + + SAMTOOLS_INDEX( + SAMTOOLS_SORT_N.out.bam + ) + + SAMTOOLS_SORT_N.out.bam.set{ ch_bam } } else { -- GitLab