From 94772e1662fd63fc7b0a1b8f417083d44bbe7da5 Mon Sep 17 00:00:00 2001 From: Laurent Modolo <laurent.modolo@ens-lyon.fr> Date: Tue, 1 Jun 2021 10:14:15 +0200 Subject: [PATCH] gatk4: first version of new variant calling workflow --- src/nf_modules/gatk4/main.nf | 265 +++++++++++++++++++++++++++++++++++ 1 file changed, 265 insertions(+) diff --git a/src/nf_modules/gatk4/main.nf b/src/nf_modules/gatk4/main.nf index ca8999ff..2f23b9e2 100644 --- a/src/nf_modules/gatk4/main.nf +++ b/src/nf_modules/gatk4/main.nf @@ -1,6 +1,271 @@ version = "4.2.0.0" container_url = "broadinstitute/gatk:${version}" +include { + index_fasta as samtools_index_fasta; +} from './nf_modules/samtools/main.nf' +include { + index_fasta as picard_index_fasta; + index_bam as picard_index_bam; + mark_duplicate; +} from './nf_modules/picard/main.nf' + +workflow germline_cohort_data_variant_calling { + take: + bam + fasta + main: + // data preparation + mark_duplicate(bam) + picard_index_bam(mark_duplicate.out.bam) + mark_duplicate.out.bam + .join(picard_index_bam.out.index) + set(bam_idx) + picard_index_fasta(fasta) + samtools_index_fasta(fasta) + fasta + .join(picard_index_fasta.out.index) + .join(samtools_index_fasta.out.index) + .set(fasta_idx) + + // variant calling + call_variants_per_sample( + bam_idx, + fasta_idx + ) + call_variants_all_sample( + call_variants_all_sample.out.gvcf, + fasta_idx + ) + emit: + vcf: call_variants_all_sample.out.vcf +} + +/*******************************************************************/ +workflow base_quality_recalibrator{ + take: + bam_idx + fasta_idx + vcf + + main: + index_vcf(vcf) + compute_base_recalibration( + bam_idx, + fasta_idx, + index_vcf.out.vcf_idx + ) + apply_base_recalibration( + bam_idx, + fasta_idx, + compute_base_recalibration.out.table + ) + emit: + bam: apply_base_recalibration.out.bam +} + +process index_vcf { + container = "${container_url}" + label "big_mem_mono_cpus" + tag "$file_id" + input: + tuple val(file_id), path(vcf) + output: + tuple val(file_id), path("${vcf}"), path("*"), emit: vcf_idx + + script: + xmx_memory = "${task.memory}" - ~/\s*GB/ + if (file_id instanceof List){ + file_prefix = file_id[0] + } else { + file_prefix = file_id + } +""" +gatk --java-options "-Xmx${xmx_memory}G" IndexFeatureFile \ + -I ${vcf} +""" +} + +process compute_base_recalibration { + container = "${container_url}" + label "big_mem_mono_cpus" + tag "$file_id" + input: + tuple val(file_id), path(bam), path(bam_idx) + tuple val(ref_id), path(fasta), path(fai), path(dict) + tuple val(vcf_id), path(vcf), path(vcf_idx) + output: + tuple val(file_id), path("${bam.simpleName}.table"), emit: table + + script: + xmx_memory = "${task.memory}" - ~/\s*GB/ + if (file_id instanceof List){ + file_prefix = file_id[0] + } else { + file_prefix = file_id + } + def vcf_cmd = "" + if (vcf instanceof List){ + for (vcf_file in vcf){ + vcf_cmd += "--known-sites ${vcf_file} " + } + } else { + vcf_cmd = "--known-sites ${vcf} " + } +""" + gatk --java-options "-Xmx${xmx_memory}G" BaseRecalibrator \ + -I ${bam} \ + -R ${fasta} \ + ${vcf_cmd} \ + -O ${bam.simpleName}.table +""" +} + +process apply_base_recalibration { + container = "${container_url}" + label "big_mem_mono_cpus" + tag "$file_id" + input: + tuple val(file_id), path(bam), path(bam_idx) + tuple val(ref_id), path(fasta), path(fai), path(dict) + tuple val(table_id), path(table) + output: + tuple val(file_id), path("${bam.simpleName}_recalibrate.bam"), emit: bam + + script: + xmx_memory = "${task.memory}" - ~/\s*GB/ + if (file_id instanceof List){ + file_prefix = file_id[0] + } else { + file_prefix = file_id + } +""" + gatk --java-options "-Xmx${xmx_memory}G" ApplyBQSR \ + -R ${fasta} \ + -I ${bam} \ + --bqsr-recal-file ${table} \ + -O ${bam.simpleName}_recalibrate.bam +""" +} + +/*******************************************************************/ + +process call_variants_per_sample { + container = "${container_url}" + label "big_mem_mono_cpus" + tag "$file_id" + input: + tuple val(file_id), path(bam), path(bam_idx) + tuple val(ref_id), path(fasta), path(fai), path(dict) + output: + tuple val(file_id), path("${bam.simpleName}.gvcf.gz"), emit: gvcf + + script: + xmx_memory = "${task.memory}" - ~/\s*GB/ + if (file_id instanceof List){ + file_prefix = file_id[0] + } else { + file_prefix = file_id + } +""" + gatk --java-options "-Xmx${xmx_memory}G" HaplotypeCaller \ + -R ${fata} \ + -I ${bam} \ + -O ${bam.simpleName}.gvcf.gz \ + -ERC GVCF \ + -G Standard \ + -G AS_Standard +""" +} + +/*******************************************************************/ + +workflow call_variants_all_sample { + take: + gvcf + fasta_idx + + main: + consolidate_gvcf( + gvcf.groupTuple() + ) + genomic_db_call( + consolidate_gvcf.out.gdb, + fasta_idx + ) + emit: + vcf: genomic_db_call.out.vcf +} + +process consolidate_gvcf { + container = "${container_url}" + label "big_mem_mono_cpus" + tag "$file_id" + input: + tuple val(file_id), path(gvcf) + output: + tuple val(file_id), path("${file_prefix}.db"), emit: gdb + + script: + xmx_memory = "${task.memory}" - ~/\s*GB/ + if (file_id instanceof List){ + file_prefix = file_id[0] + } else { + file_prefix = file_id + } + def gvcf_cmd = "" + if (gvcf instanceof List){ + for (gvcf_file in gvcf){ + gvcf_cmd += "--V ${gvcf_file} " + } + } else { + gvcf_cmd = "--V ${gvcf} " + } +""" +mkdir tmp +gatk --java-options "-Xmx${xmx_memory}G" GenomicsDBImport \ + ${gvcf_cmd} \ + --genomicsdb-workspace-path ${file_prefix}.db \ + --tmp-dir=./tmp \ +""" +} + +process genomic_db_call { + container = "${container_url}" + label "big_mem_mono_cpus" + tag "$file_id" + input: + tuple val(file_id), path(db) + tuple val(ref_id), path(fasta), path(fai), path(dict) + output: + tuple val(file_id), path("${db.simple_name}.vcf.gz"), emit: vcf + + script: + xmx_memory = "${task.memory}" - ~/\s*GB/ + if (file_id instanceof List){ + file_prefix = file_id[0] + } else { + file_prefix = file_id + } + def gvcf_cmd = "" + if (gvcf instanceof List){ + for (gvcf_file in gvcf){ + gvcf_cmd += "--V ${gvcf_file} " + } + } else { + gvcf_cmd = "--V ${gvcf} " + } +""" +mkdir tmp +gatk --java-options "-Xmx${xmx_memory}G" GenotypeGVCFs \ + -R ${fasta} \ + -V gendb://${db} \ + -O ${db.simpleName}.vcf.gz \ + --tmp-dir ./tmp +""" +} + +/*******************************************************************/ params.variant_calling = "" params.variant_calling_out = "" process variant_calling { -- GitLab