From 1207cdab57d68dea35ac3ea1c680cb25d53bc0b5 Mon Sep 17 00:00:00 2001
From: Laurent Modolo <laurent.modolo@ens-lyon.fr>
Date: Wed, 17 Feb 2021 10:35:33 +0100
Subject: [PATCH] gatk3: add new processes

---
 src/nf_modules/gatk3/main.nf | 176 ++++++++++++++++++++++++++++++++---
 1 file changed, 164 insertions(+), 12 deletions(-)

diff --git a/src/nf_modules/gatk3/main.nf b/src/nf_modules/gatk3/main.nf
index 3eca0880..cb3656f4 100644
--- a/src/nf_modules/gatk3/main.nf
+++ b/src/nf_modules/gatk3/main.nf
@@ -24,11 +24,11 @@ gatk3 -T HaplotypeCaller \
 
 process filter_snp {
   container = "${container_url}"
-  label "big_mem_mono_cpus"
+  label "big_mem_multi_cpus"
   tag "$file_id"
 
   input:
-    tuple val(file_id), path(variants)
+    tuple val(file_id), path(vcf)
     tuple val(ref_id), path(fasta), path(fai), path(dict)
   output:
     tuple val(file_id), path("*_snp.vcf"), emit: vcf
@@ -37,7 +37,7 @@ process filter_snp {
 gatk3 -T SelectVariants \
   -nct ${task.cpus} \
   -R ${fasta} \
-  -V ${variants} \
+  -V ${vcf} \
   -selectType SNP \
   -o ${file_id}_snp.vcf
 """
@@ -45,11 +45,11 @@ gatk3 -T SelectVariants \
 
 process filter_indels {
   container = "${container_url}"
-  label "big_mem_mono_cpus"
+  label "big_mem_multi_cpus"
   tag "$file_id"
 
   input:
-    tuple val(file_id), path(variants)
+    tuple val(file_id), path(vcf)
     tuple val(ref_id), path(fasta), path(fai), path(dict)
   output:
     tuple val(file_id), path("*_indel.vcf"), emit: vcf
@@ -58,7 +58,7 @@ process filter_indels {
 gatk3 -T SelectVariants \
   -nct ${task.cpus} \
   -R ${fasta} \
-  -V ${variants} \
+  -V ${vcf} \
   -selectType INDEL \
   -o ${file_id}_indel.vcf
 """
@@ -68,11 +68,11 @@ high_confidence_snp_filter = "(QD < 2.0) || (FS > 60.0) || (MQ < 40.0) || (MQRan
 
 process high_confidence_snp {
   container = "${container_url}"
-  label "big_mem_mono_cpus"
+  label "big_mem_multi_cpus"
   tag "$file_id"
 
   input:
-    tuple val(file_id), path(variants)
+    tuple val(file_id), path(vcf)
     tuple val(ref_id), path(fasta), path(fai), path(dict)
   output:
     tuple val(file_id), path("*_snp.vcf"), emit: vcf
@@ -81,7 +81,7 @@ process high_confidence_snp {
 gatk3 -T VariantFiltration \
   -nct ${task.cpus} \
   -R ${fasta} \
-  -V ${variants} \
+  -V ${vcf} \
   --filterExpression "${high_confidence_snp_filter}" \
   --filterName "basic_snp_filter" \
   -o ${file_id}_filtered_snp.vcf
@@ -92,11 +92,11 @@ high_confidence_indel_filter = "QD < 3.0 || FS > 200.0 || ReadPosRankSum < -20.0
 
 process high_confidence_indels {
   container = "${container_url}"
-  label "big_mem_mono_cpus"
+  label "big_mem_multi_cpus"
   tag "$file_id"
 
   input:
-    tuple val(file_id), path(variants)
+    tuple val(file_id), path(vcf)
     tuple val(ref_id), path(fasta), path(fai), path(dict)
   output:
     tuple val(file_id), path("*_indel.vcf"), emit: vcf
@@ -105,9 +105,161 @@ process high_confidence_indels {
 gatk3 -T VariantFiltration \
   -nct ${task.cpus} \
   -R ${fasta} \
-  -V ${variants} \
+  -V ${vcf} \
   --filterExpression "${high_confidence_indel_filter}" \
   --filterName "basic_indel_filter" \
   -o ${file_id}_filtered_indel.vcf
 """
 }
+
+process recalibrate_snp_table {
+  container = "${container_url}"
+  label "big_mem_multi_cpus"
+  tag "$file_id"
+
+  input:
+    tuple val(file_id), path(snp_file), path(indel_file), path(bam), path(bam_idx)
+    tuple val(ref_id), path(fasta), path(fai), path(dict)
+  output:
+    tuple val(file_id), path("recal_data_table"), emit: recal_table
+  script:
+"""
+gatk3 -T BaseRecalibrator \
+  -nct ${task.cpus} \
+  -R ${fasta} \
+  -I ${bam} \
+  -knownSites ${snp_file} \
+  -knownSites ${indel_file} \
+  -o recal_data_table
+"""
+}
+
+process recalibrate_snp {
+  container = "${container_url}"
+  label "big_mem_multi_cpus"
+  tag "$file_id"
+
+  input:
+    tuple val(file_id), path(snp_file), path(indel_file), path(bam), path(bam_idx)
+    tuple val(table_id), path(recal_data_table)
+    tuple val(ref_id), path(fasta), path(fai), path(dict)
+  output:
+    tuple val(file_id), path("*.bam"), emit: bam
+  script:
+"""
+gatk3 -T PrintReads \
+  --use_jdk_deflater \
+  --use_jdk_inflater \
+  -nct ${task.cpus} \
+  -R ${fasta} \
+  -I ${bam} \
+  -BQSR recal_data_table \
+  -o ${file_id}_recal.bam
+"""
+}
+
+process haplotype_caller {
+  container = "${container_url}"
+  label "big_mem_multi_cpus"
+  tag "$file_id"
+
+  input:
+    tuple val(file_id), path(bam)
+    tuple val(ref_id), path(fasta), path(fai), path(dict)
+  output:
+    tuple val(file_id), path("*.gvcf"), emit: gvcf
+  script:
+"""
+gatk3 -T HaplotypeCaller \
+  -nct ${task.cpus} \
+  -R ${fasta} \
+  -I ${bam} \
+  -ERC GVCF \
+  -variant_index_type LINEAR -variant_index_parameter 128000 \
+  -o ${file_id}.gvcf
+"""
+}
+
+process gvcf_genotyping {
+  container = "${container_url}"
+  label "big_mem_multi_cpus"
+  tag "$file_id"
+
+  input:
+    tuple val(file_id), path(gvcf)
+    tuple val(ref_id), path(fasta), path(fai), path(dict)
+  output:
+    tuple val(file_id), path("*.vcf"), emit: vcf
+  script:
+"""
+gatk3 -T GenotypeGVCFs \
+  -nct ${task.cpus} \
+  -R ${fasta} \
+  -V ${gvcf} \
+  -o ${file_id}_joint.vcf
+"""
+}
+
+process select_variants_snp {
+  container = "${container_url}"
+  label "big_mem_multi_cpus"
+  tag "$file_id"
+
+  input:
+    tuple val(file_id), path(vcf)
+    tuple val(ref_id), path(fasta), path(fai), path(dict)
+  output:
+    tuple val(file_id), path("*_joint_snp.vcf"), emit: vcf
+  script:
+"""
+gatk3 -T SelectVariants \
+  -nct ${task.cpus} \
+  -R ${fasta} \
+  -V ${vcf} \
+  -selectType SNP \
+  -o ${file_id}_joint_snp.vcf
+"""
+}
+
+process select_variants_indels {
+  container = "${container_url}"
+  label "big_mem_multi_cpus"
+  tag "$file_id"
+
+  input:
+    tuple val(file_id), path(vcf)
+    tuple val(ref_id), path(fasta), path(fai), path(dict)
+  output:
+    tuple val(file_id), path("*_joint_indel.vcf"), emit: vcf
+  script:
+"""
+gatk3 -T SelectVariants \
+  -nct ${task.cpus} \
+  -R ${fasta} \
+  -V ${vcf} \
+  -selectType INDEL \
+  -o ${file_id}_joint_indel.vcf
+"""
+}
+
+process personalized_genome {
+  container = "${container_url}"
+  label "big_mem_mono_cpus"
+  tag "$file_id"
+
+  input:
+    tuple val(file_id), path(vcf)
+    tuple val(ref_id), path(fasta), path(fai), path(dict)
+  output:
+    tuple val(file_id), path("*_genome.fasta"), emit: fasta
+
+  script:
+  library = pick_library(file_id, library_list)
+"""
+gatk3 -T FastaAlternateReferenceMaker\
+  -R ${reference} \
+  -V ${vcf} \
+  -o ${library}_genome.fasta
+"""
+}
+
-- 
GitLab