Skip to content
Snippets Groups Projects
Verified Commit f8fcb912 authored by Mia Croiset's avatar Mia Croiset
Browse files

TO FIX markduplicates pcr filter option

parent 364489ed
No related branches found
No related tags found
No related merge requests found
...@@ -40,8 +40,8 @@ process { ...@@ -40,8 +40,8 @@ process {
time = { check_max( 8.h * task.attempt, 'time' ) } time = { check_max( 8.h * task.attempt, 'time' ) }
} }
withLabel:process_high { withLabel:process_high {
cpus = { check_max( 12 * task.attempt, 'cpus' ) } cpus = { check_max( 8 * task.attempt, 'cpus' ) } //TODO go back to 16 when not local
memory = { check_max( 64.GB * task.attempt, 'memory' ) } memory = { check_max( 31.GB * task.attempt, 'memory' ) }//TODO go back to 64 when not local
time = { check_max( 16.h * task.attempt, 'time' ) } time = { check_max( 16.h * task.attempt, 'time' ) }
} }
withLabel:process_long { withLabel:process_long {
...@@ -61,3 +61,35 @@ process { ...@@ -61,3 +61,35 @@ process {
cache = false cache = false
} }
} }
// Function to ensure that resource requirements don't go beyond
// a maximum limit
def check_max(obj, type) {
if (type == 'memory') {
try {
if (obj.compareTo(params.max_memory as nextflow.util.MemoryUnit) == 1)
return params.max_memory as nextflow.util.MemoryUnit
else
return obj
} catch (all) {
println " ### ERROR ### Max memory '${params.max_memory}' is not valid! Using default value: $obj"
return obj
}
} else if (type == 'time') {
try {
if (obj.compareTo(params.max_time as nextflow.util.Duration) == 1)
return params.max_time as nextflow.util.Duration
else
return obj
} catch (all) {
println " ### ERROR ### Max time '${params.max_time}' is not valid! Using default value: $obj"
return obj
}
} else if (type == 'cpus') {
try {
return Math.min( obj, params.max_cpus as int )
} catch (all) {
println " ### ERROR ### Max cpus '${params.max_cpus}' is not valid! Using default value: $obj"
return obj
}
}
}
...@@ -144,10 +144,10 @@ params { ...@@ -144,10 +144,10 @@ params {
hicstuff_filter_pcr_out_file = 'valid_idx_pcrfree.pairs' hicstuff_filter_pcr_out_file = 'valid_idx_pcrfree.pairs'
//Hicstuff optional modules //Hicstuff optional modules
filter_event = true filter_event = false
distance_law = true distance_law = false
filter_pcr = true filter_pcr = false
filter_pcr_picard = false filter_pcr_picard = true
} }
process { process {
...@@ -234,6 +234,30 @@ process { ...@@ -234,6 +234,30 @@ process {
] ]
} }
withName: 'GATK4_MARKDUPLICATES' {
ext.prefix = { "${meta.id}_${meta.chunk}_${meta.mates}.bam" }
ext.args = { [
"--REMOVE_DUPLICATES true"
].join('').trim() }
publishDir = [
path: { "${params.outdir}/gatk4/bam" },
mode: 'copy'
]
}
withName: 'SAMTOOLS_SORT' {
ext.prefix = { "${meta.id}_${meta.chunk}_${meta.mates}_sorted" }
ext.args = { [
""
].join('').trim() }
}
withName: 'SAMTOOLS_INDEX' {
ext.args = { [
""
].join('').trim() }
}
withName: 'BUILD_MATRIX' { withName: 'BUILD_MATRIX' {
ext.args = params.hicstuff_matrix ext.args = params.hicstuff_matrix
publishDir = [ publishDir = [
...@@ -350,3 +374,4 @@ profiles { ...@@ -350,3 +374,4 @@ profiles {
test { includeConfig 'conf/test.config' } test { includeConfig 'conf/test.config' }
test_full { includeConfig 'conf/test_full.config' } test_full { includeConfig 'conf/test_full.config' }
} }
includeConfig 'base.config'
process GATK4_MARKDUPLICATES {
tag "$meta.id"
label 'process_medium'
conda "bioconda::gatk4=4.4.0.0"
container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ?
'https://depot.galaxyproject.org/singularity/gatk4:4.4.0.0--py36hdfd78af_0':
'quay.io/biocontainers/gatk4:4.4.0.0--py36hdfd78af_0' }"
input:
tuple val(meta), path(bam)
path fasta
path fasta_fai
output:
tuple val(meta), path("*cram"), emit: cram, optional: true
tuple val(meta), path("*bam"), emit: bam, optional: true
tuple val(meta), path("*.crai"), emit: crai, optional: true
tuple val(meta), path("*.bai"), emit: bai, optional: true
tuple val(meta), path("*.metrics"), emit: metrics
path "versions.yml", emit: versions
when:
task.ext.when == null || task.ext.when
script:
def args = task.ext.args ?: ''
prefix = task.ext.prefix ?: "${meta.id}"
def input_list = bam.collect{"--INPUT $it"}.join(' ')
def reference = fasta ? "--REFERENCE_SEQUENCE ${fasta}" : ""
def avail_mem = 3072
if (!task.memory) {
log.info '[GATK MarkDuplicates] Available memory not known - defaulting to 3GB. Specify process memory requirements to change this.'
} else {
avail_mem = (task.memory.mega*0.8).intValue()
}
"""
gatk --java-options "-Xmx${avail_mem}M" MarkDuplicates \\
$input_list \\
--OUTPUT ${prefix} \\
--METRICS_FILE ${prefix}.metrics \\
--TMP_DIR . \\
${reference} \\
$args
if [[ ${prefix} == *.cram ]]&&[[ -f ${prefix}.bai ]]; then
mv ${prefix}.bai ${prefix}.crai
fi
cat <<-END_VERSIONS > versions.yml
"${task.process}":
gatk4: \$(echo \$(gatk --version 2>&1) | sed 's/^.*(GATK) v//; s/ .*\$//')
END_VERSIONS
"""
}
\ No newline at end of file
name: gatk4_markduplicates
description: This tool locates and tags duplicate reads in a BAM or SAM file, where duplicate reads are defined as originating from a single fragment of DNA.
keywords:
- markduplicates
- bam
- sort
tools:
- gatk4:
description:
Developed in the Data Sciences Platform at the Broad Institute, the toolkit offers a wide variety of tools
with a primary focus on variant discovery and genotyping. Its powerful processing engine
and high-performance computing features make it capable of taking on projects of any size.
homepage: https://gatk.broadinstitute.org/hc/en-us
documentation: https://gatk.broadinstitute.org/hc/en-us/articles/360037052812-MarkDuplicates-Picard-
tool_dev_url: https://github.com/broadinstitute/gatk
doi: 10.1158/1538-7445.AM2017-3590
licence: ["MIT"]
input:
- meta:
type: map
description: |
Groovy Map containing sample information
e.g. [ id:'test', single_end:false ]
- bam:
type: file
description: Sorted BAM file
pattern: "*.{bam}"
- fasta:
type: file
description: Fasta file
pattern: "*.{fasta}"
- fasta_fai:
type: file
description: Fasta index file
pattern: "*.{fai}"
output:
- meta:
type: map
description: |
Groovy Map containing sample information
e.g. [ id:'test', single_end:false ]
- versions:
type: file
description: File containing software versions
pattern: "versions.yml"
- bam:
type: file
description: Marked duplicates BAM file
pattern: "*.{bam}"
- cram:
type: file
description: Marked duplicates CRAM file
pattern: "*.{cram}"
- bai:
type: file
description: BAM index file
pattern: "*.{bam.bai}"
- crai:
type: file
description: CRAM index file
pattern: "*.{cram.crai}"
- metrics:
type: file
description: Duplicate metrics file generated by GATK
pattern: "*.{metrics.txt}"
authors:
- "@ajodeh-juma"
- "@FriederikeHanssen"
- "@maxulysse"
\ No newline at end of file
process SAMTOOLS_INDEX {
tag "$meta.id"
label 'process_low'
conda "bioconda::samtools=1.17"
container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ?
'https://depot.galaxyproject.org/singularity/samtools:1.17--h00cdaf9_0' :
'quay.io/biocontainers/samtools:1.17--h00cdaf9_0' }"
input:
tuple val(meta), path(input)
output:
tuple val(meta), path("*.bai") , optional:true, emit: bai
tuple val(meta), path("*.csi") , optional:true, emit: csi
tuple val(meta), path("*.crai"), optional:true, emit: crai
path "versions.yml" , emit: versions
when:
task.ext.when == null || task.ext.when
script:
def args = task.ext.args ?: ''
"""
samtools \\
index \\
-@ ${task.cpus-1} \\
$args \\
$input
cat <<-END_VERSIONS > versions.yml
"${task.process}":
samtools: \$(echo \$(samtools --version 2>&1) | sed 's/^.*samtools //; s/Using.*\$//')
END_VERSIONS
"""
stub:
"""
touch ${input}.bai
touch ${input}.crai
touch ${input}.csi
cat <<-END_VERSIONS > versions.yml
"${task.process}":
samtools: \$(echo \$(samtools --version 2>&1) | sed 's/^.*samtools //; s/Using.*\$//')
END_VERSIONS
"""
}
\ No newline at end of file
name: samtools_index
description: Index SAM/BAM/CRAM file
keywords:
- index
- bam
- sam
- cram
tools:
- samtools:
description: |
SAMtools is a set of utilities for interacting with and post-processing
short DNA sequence read alignments in the SAM, BAM and CRAM formats, written by Heng Li.
These files are generated as output by short read aligners like BWA.
homepage: http://www.htslib.org/
documentation: http://www.htslib.org/doc/samtools.html
doi: 10.1093/bioinformatics/btp352
licence: ["MIT"]
input:
- meta:
type: map
description: |
Groovy Map containing sample information
e.g. [ id:'test', single_end:false ]
- bam:
type: file
description: BAM/CRAM/SAM file
pattern: "*.{bam,cram,sam}"
output:
- meta:
type: map
description: |
Groovy Map containing sample information
e.g. [ id:'test', single_end:false ]
- bai:
type: file
description: BAM/CRAM/SAM index file
pattern: "*.{bai,crai,sai}"
- crai:
type: file
description: BAM/CRAM/SAM index file
pattern: "*.{bai,crai,sai}"
- csi:
type: file
description: CSI index file
pattern: "*.{csi}"
- versions:
type: file
description: File containing software versions
pattern: "versions.yml"
authors:
- "@drpatelh"
- "@ewels"
- "@maxulysse"
\ No newline at end of file
process SAMTOOLS_SORT {
tag "$meta.id"
label 'process_high' //medium
conda "bioconda::samtools=1.17"
container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ?
'https://depot.galaxyproject.org/singularity/samtools:1.17--h00cdaf9_0' :
'quay.io/biocontainers/samtools:1.17--h00cdaf9_0' }"
input:
tuple val(meta), path(bam)
output:
tuple val(meta), path("*.bam"), emit: bam
tuple val(meta), path("*.csi"), emit: csi, optional: true
path "versions.yml" , emit: versions
when:
task.ext.when == null || task.ext.when
script:
def args = task.ext.args ?: ''
def prefix = task.ext.prefix ?: "${meta.id}"
def sort_memory = (task.memory.mega/task.cpus).intValue()
if ("$bam" == "${prefix}.bam") error "Input and output names are the same, use \"task.ext.prefix\" to disambiguate!"
"""
samtools sort \\
$args \\
-@ $task.cpus \\
-m ${sort_memory}M \\
-o ${prefix}.bam \\
-T $prefix \\
$bam
cat <<-END_VERSIONS > versions.yml
"${task.process}":
samtools: \$(echo \$(samtools --version 2>&1) | sed 's/^.*samtools //; s/Using.*\$//')
END_VERSIONS
"""
stub:
def prefix = task.ext.prefix ?: "${meta.id}"
"""
touch ${prefix}.bam
cat <<-END_VERSIONS > versions.yml
"${task.process}":
samtools: \$(echo \$(samtools --version 2>&1) | sed 's/^.*samtools //; s/Using.*\$//')
END_VERSIONS
"""
}
\ No newline at end of file
name: samtools_sort
description: Sort SAM/BAM/CRAM file
keywords:
- sort
- bam
- sam
- cram
tools:
- samtools:
description: |
SAMtools is a set of utilities for interacting with and post-processing
short DNA sequence read alignments in the SAM, BAM and CRAM formats, written by Heng Li.
These files are generated as output by short read aligners like BWA.
homepage: http://www.htslib.org/
documentation: http://www.htslib.org/doc/samtools.html
doi: 10.1093/bioinformatics/btp352
licence: ["MIT"]
input:
- meta:
type: map
description: |
Groovy Map containing sample information
e.g. [ id:'test', single_end:false ]
- bam:
type: file
description: BAM/CRAM/SAM file
pattern: "*.{bam,cram,sam}"
output:
- meta:
type: map
description: |
Groovy Map containing sample information
e.g. [ id:'test', single_end:false ]
- bam:
type: file
description: Sorted BAM/CRAM/SAM file
pattern: "*.{bam,cram,sam}"
- versions:
type: file
description: File containing software versions
pattern: "versions.yml"
- csi:
type: file
description: BAM index file (optional)
pattern: "*.csi"
authors:
- "@drpatelh"
- "@ewels"
\ No newline at end of file
...@@ -8,6 +8,9 @@ include { BUILD_MATRIX_COOL_ALT } from '../../modules/local/hicstuff/build_matri ...@@ -8,6 +8,9 @@ include { BUILD_MATRIX_COOL_ALT } from '../../modules/local/hicstuff/build_matri
include { FILTER_EVENT } from '../../modules/local/hicstuff/filter_event' include { FILTER_EVENT } from '../../modules/local/hicstuff/filter_event'
include { DISTANCE_LAW } from '../../modules/local/hicstuff/distance_law' include { DISTANCE_LAW } from '../../modules/local/hicstuff/distance_law'
include { FILTER_PCR } from '../../modules/local/hicstuff/filter_pcr' include { FILTER_PCR } from '../../modules/local/hicstuff/filter_pcr'
include { GATK4_MARKDUPLICATES } from '../../modules/nf-core/custom/gatk4/markduplicates/main'
include { SAMTOOLS_SORT } from '../../modules/nf-core/custom/samtools/sort/main'
include { SAMTOOLS_INDEX } from '../../modules/nf-core/custom/samtools/index/main'
// Paired-end to Single-end // Paired-end to Single-end
def pairToSingle(row, mates) { def pairToSingle(row, mates) {
...@@ -56,11 +59,36 @@ workflow HICSTUFF_SUB { ...@@ -56,11 +59,36 @@ workflow HICSTUFF_SUB {
fasta fasta
) )
if (params.filter_pcr && params.filter_pcr_picard ){
error "Error: filter_pcr and filter_pcr_picard can't both be true at the same time! Set one of them false in the config file"
}
else if (params.filter_pcr_picard){
SAMTOOLS_SORT(
BOWTIE2_ALIGNMENT.out.bam
)
SAMTOOLS_INDEX(
SAMTOOLS_SORT.out.bam
)
GATK4_MARKDUPLICATES(
SAMTOOLS_SORT.out.bam,
fasta.map{ it[1] }.collect(),
index.map{ it[1] }.collect()
)
GATK4_MARKDUPLICATES.out.bam.set{ ch_bam }
}
else {
BOWTIE2_ALIGNMENT.out.bam.set{ ch_bam }
}
BAM2PAIRS( BAM2PAIRS(
BOWTIE2_ALIGNMENT.out.bam.combine(BOWTIE2_ALIGNMENT.out.bam) ch_bam.combine(ch_bam)
.map { .map {
meta1, bam1, meta2, bam2 -> meta1, bam1, meta2, bam2 ->
meta1.id == meta2.id && meta1.chunk == meta2.chunk && meta1.mates == "R1" && meta2.mates == "R2" ? [ meta1, bam1, meta2, bam2 ] : null meta1.id == meta2.id && meta1.chunk == meta2.chunk && meta1.mates == "R1" && meta2.mates == "R2" ? [ meta1, bam1, meta2, bam2 ] : null
}, },
FRAGMENT_ENZYME.out.info_contigs.collect(), FRAGMENT_ENZYME.out.info_contigs.collect(),
digestion, digestion,
......
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment