From bea9b094bcdc50a42ae3d094b9d084f79bd27029 Mon Sep 17 00:00:00 2001
From: Laurent Modolo <laurent.modolo@ens-lyon.fr>
Date: Fri, 5 Mar 2021 11:32:29 +0100
Subject: [PATCH] nf_modules: add gatk4

---
 src/nf_modules/gatk4/main.nf | 265 +++++++++++++++++++++++++++++++++++
 1 file changed, 265 insertions(+)
 create mode 100644 src/nf_modules/gatk4/main.nf

diff --git a/src/nf_modules/gatk4/main.nf b/src/nf_modules/gatk4/main.nf
new file mode 100644
index 00000000..0b419ecd
--- /dev/null
+++ b/src/nf_modules/gatk4/main.nf
@@ -0,0 +1,265 @@
+version = "4.2.0.0" // GATK release tag; interpolated into the container image name below
+container_url = "broadinstitute/gatk:${version}" // image used by every process in this module
+
+// Call variants on one BAM against the reference with GATK4 HaplotypeCaller.
+// Heap is given in MB: "${task.memory}" renders like "8 GB", which -Xmx rejects.
+process variant_calling {
+  container = "${container_url}"
+  label "big_mem_multi_cpus"
+  tag "$file_id"
+  input:
+    tuple val(file_id), path(bam), path(bai)
+    tuple val(ref_id), path(fasta), path(fai), path(dict)
+  output:
+    tuple val(file_id), path("*.vcf"), emit: vcf
+  script:
+"""
+gatk --java-options "-Xmx${task.memory.toMega()}m" HaplotypeCaller \
+  --native-pair-hmm-threads ${task.cpus} \
+  -R ${fasta} \
+  -I ${bam} \
+  -O ${file_id}.vcf
+"""
+}
+
+// Keep only the SNP records of a VCF with GATK4 SelectVariants; emits <file_id>_snp.vcf.
+// Heap is given in MB: "${task.memory}" renders like "8 GB", which -Xmx rejects.
+process filter_snp {
+  container = "${container_url}"
+  label "big_mem_multi_cpus"
+  tag "$file_id"
+  input:
+    tuple val(file_id), path(vcf)
+    tuple val(ref_id), path(fasta), path(fai), path(dict)
+  output:
+    tuple val(file_id), path("*_snp.vcf"), emit: vcf
+  script:
+"""
+gatk --java-options "-Xmx${task.memory.toMega()}m" SelectVariants \
+  -R ${fasta} \
+  -V ${vcf} \
+  --select-type-to-include SNP \
+  -O ${file_id}_snp.vcf
+"""
+}
+
+// Keep only the indel records of a VCF with GATK4 SelectVariants; emits <file_id>_indel.vcf.
+// Heap is given in MB: "${task.memory}" renders like "8 GB", which -Xmx rejects.
+process filter_indels {
+  container = "${container_url}"
+  label "big_mem_multi_cpus"
+  tag "$file_id"
+  input:
+    tuple val(file_id), path(vcf)
+    tuple val(ref_id), path(fasta), path(fai), path(dict)
+  output:
+    tuple val(file_id), path("*_indel.vcf"), emit: vcf
+  script:
+"""
+gatk --java-options "-Xmx${task.memory.toMega()}m" SelectVariants \
+  -R ${fasta} \
+  -V ${vcf} \
+  --select-type-to-include INDEL \
+  -O ${file_id}_indel.vcf
+"""
+}
+
+// Hard-filter expression for SNPs: a record is flagged when any one clause is true.
+high_confidence_snp_filter = "(QD < 2.0) || (FS > 60.0) || (MQ < 40.0) || (MQRankSum < -12.5) || (ReadPosRankSum < -8.0) || (SOR > 4.0)"
+
+// Flag low-confidence SNPs with GATK4 VariantFiltration; heap is passed in MB for -Xmx.
+process high_confidence_snp {
+  container = "${container_url}"
+  label "big_mem_multi_cpus"
+  tag "$file_id"
+  input:
+    tuple val(file_id), path(vcf)
+    tuple val(ref_id), path(fasta), path(fai), path(dict)
+  output:
+    tuple val(file_id), path("*_snp.vcf"), emit: vcf
+  script:
+"""
+gatk --java-options "-Xmx${task.memory.toMega()}m" VariantFiltration \
+  -R ${fasta} \
+  -V ${vcf} \
+  --filter-expression "${high_confidence_snp_filter}" \
+  --filter-name "basic_snp_filter" \
+  -O ${file_id}_filtered_snp.vcf
+"""
+}
+
+// Hard-filter expression for indels: a record is flagged when any one clause is true.
+high_confidence_indel_filter = "QD < 3.0 || FS > 200.0 || ReadPosRankSum < -20.0 || SOR > 10.0"
+
+// Flag low-confidence indels with GATK4 VariantFiltration; heap is passed in MB for -Xmx.
+process high_confidence_indels {
+  container = "${container_url}"
+  label "big_mem_multi_cpus"
+  tag "$file_id"
+  input:
+    tuple val(file_id), path(vcf)
+    tuple val(ref_id), path(fasta), path(fai), path(dict)
+  output:
+    tuple val(file_id), path("*_indel.vcf"), emit: vcf
+  script:
+"""
+gatk --java-options "-Xmx${task.memory.toMega()}m" VariantFiltration \
+  -R ${fasta} \
+  -V ${vcf} \
+  --filter-expression "${high_confidence_indel_filter}" \
+  --filter-name "basic_indel_filter" \
+  -O ${file_id}_filtered_indel.vcf
+"""
+}
+
+// Build the BQSR table for a BAM, using the given SNP and indel VCFs as known sites.
+// The VCFs are staged without an index, which BaseRecalibrator needs, so index them first.
+process recalibrate_snp_table {
+  container = "${container_url}"
+  label "big_mem_multi_cpus"
+  tag "$file_id"
+  input:
+    tuple val(file_id), path(snp_file), path(indel_file), path(bam), path(bam_idx)
+    tuple val(ref_id), path(fasta), path(fai), path(dict)
+  output:
+    tuple val(file_id), path("recal_data_table"), emit: recal_table
+  script:
+"""
+gatk --java-options "-Xmx${task.memory.toMega()}m" IndexFeatureFile -I ${snp_file}
+gatk --java-options "-Xmx${task.memory.toMega()}m" IndexFeatureFile -I ${indel_file}
+gatk --java-options "-Xmx${task.memory.toMega()}m" BaseRecalibrator \
+  -R ${fasta} -I ${bam} \
+  --known-sites ${snp_file} --known-sites ${indel_file} \
+  -O recal_data_table
+"""
+}
+
+// Apply a BQSR table to a BAM with GATK4 ApplyBQSR (replaces GATK3 "PrintReads -BQSR").
+// The table is read from the staged input path; heap is passed in MB for -Xmx.
+process recalibrate_snp {
+  container = "${container_url}"
+  label "big_mem_multi_cpus"
+  tag "$file_id"
+  input:
+    tuple val(file_id), path(snp_file), path(indel_file), path(bam), path(bam_idx)
+    tuple val(table_id), path(recal_data_table)
+    tuple val(ref_id), path(fasta), path(fai), path(dict)
+  output:
+    tuple val(file_id), path("*.bam"), emit: bam
+  script:
+"""
+gatk --java-options "-Xmx${task.memory.toMega()}m" ApplyBQSR \
+  --use-jdk-deflater \
+  --use-jdk-inflater \
+  -R ${fasta} \
+  -I ${bam} \
+  --bqsr-recal-file ${recal_data_table} \
+  -O ${file_id}_recal.bam
+"""
+}
+
+// Produce a per-sample GVCF with GATK4 HaplotypeCaller in -ERC GVCF mode.
+// NOTE(review): no BAM index is staged here; HaplotypeCaller likely needs one - confirm with callers.
+process haplotype_caller {
+  container = "${container_url}"
+  label "big_mem_multi_cpus"
+  tag "$file_id"
+  input:
+    tuple val(file_id), path(bam)
+    tuple val(ref_id), path(fasta), path(fai), path(dict)
+  output:
+    tuple val(file_id), path("*.gvcf"), emit: gvcf
+  script:
+"""
+gatk --java-options "-Xmx${task.memory.toMega()}m" HaplotypeCaller \
+  --native-pair-hmm-threads ${task.cpus} \
+  -R ${fasta} \
+  -I ${bam} \
+  -ERC GVCF \
+  -O ${file_id}.gvcf
+"""
+}
+
+// Genotype a GVCF into a VCF with GATK4 GenotypeGVCFs; emits <file_id>_joint.vcf.
+// Heap is given in MB: "${task.memory}" renders like "8 GB", which -Xmx rejects.
+process gvcf_genotyping {
+  container = "${container_url}"
+  label "big_mem_multi_cpus"
+  tag "$file_id"
+  input:
+    tuple val(file_id), path(gvcf)
+    tuple val(ref_id), path(fasta), path(fai), path(dict)
+  output:
+    tuple val(file_id), path("*.vcf"), emit: vcf
+  script:
+"""
+gatk --java-options "-Xmx${task.memory.toMega()}m" GenotypeGVCFs \
+  -R ${fasta} \
+  -V ${gvcf} \
+  -O ${file_id}_joint.vcf
+"""
+}
+
+// Keep only the SNP records of a genotyped VCF (GATK4 SelectVariants); emits <file_id>_joint_snp.vcf.
+// Heap is given in MB: "${task.memory}" renders like "8 GB", which -Xmx rejects.
+process select_variants_snp {
+  container = "${container_url}"
+  label "big_mem_multi_cpus"
+  tag "$file_id"
+  input:
+    tuple val(file_id), path(vcf)
+    tuple val(ref_id), path(fasta), path(fai), path(dict)
+  output:
+    tuple val(file_id), path("*_joint_snp.vcf"), emit: vcf
+  script:
+"""
+gatk --java-options "-Xmx${task.memory.toMega()}m" SelectVariants \
+  -R ${fasta} \
+  -V ${vcf} \
+  --select-type-to-include SNP \
+  -O ${file_id}_joint_snp.vcf
+"""
+}
+
+// Keep only the indel records of a genotyped VCF (GATK4 SelectVariants); emits <file_id>_joint_indel.vcf.
+// Heap is given in MB: "${task.memory}" renders like "8 GB", which -Xmx rejects.
+process select_variants_indels {
+  container = "${container_url}"
+  label "big_mem_multi_cpus"
+  tag "$file_id"
+  input:
+    tuple val(file_id), path(vcf)
+    tuple val(ref_id), path(fasta), path(fai), path(dict)
+  output:
+    tuple val(file_id), path("*_joint_indel.vcf"), emit: vcf
+  script:
+"""
+gatk --java-options "-Xmx${task.memory.toMega()}m" SelectVariants \
+  -R ${fasta} \
+  -V ${vcf} \
+  --select-type-to-include INDEL \
+  -O ${file_id}_joint_indel.vcf
+"""
+}
+
+// Write the reference with the VCF's variants substituted in (GATK4 FastaAlternateReferenceMaker).
+// Uses the declared inputs only: the output is named after file_id and the reference is ${fasta}.
+// NOTE(review): the VCF is staged without an index; confirm whether this tool requires one.
+process personalized_genome {
+  container = "${container_url}"
+  label "big_mem_mono_cpus"
+  tag "$file_id"
+  input:
+    tuple val(file_id), path(vcf)
+    tuple val(ref_id), path(fasta), path(fai), path(dict)
+  output:
+    tuple val(file_id), path("*_genome.fasta"), emit: fasta
+  script:
+"""
+gatk --java-options "-Xmx${task.memory.toMega()}m" FastaAlternateReferenceMaker \
+  -R ${fasta} \
+  -V ${vcf} \
+  -O ${file_id}_genome.fasta
+"""
+}
+
-- 
GitLab