diff --git a/src/1_JU28_59vs17_SNP_calling.sh b/src/1_JU28_59vs17_SNP_calling.sh
index be184a5d28a761426da572ccad00f2a05a7b6634..d806b325d9df67c6600287c1d1a09264fdcd0006 100644
--- a/src/1_JU28_59vs17_SNP_calling.sh
+++ b/src/1_JU28_59vs17_SNP_calling.sh
@@ -11,12 +11,14 @@ cd ~/projects/JU28_59vs17_SNP/
 
 # training set analysis
 
-./nextflow src/SNP_calling.nf -c src/SNP_calling.config -profile docker --fasta "data/fasta/DBG2OLC-output2.fasta" --fastq "data/samples/*_{1,2}.fastq.gz" -resume -w ~/data/work_s/ --tumor "[\"s_NG-10944_JU2859_bis_lib169352_5217_1\"]" --normal "[\"s_MR_550_clean\", \"s_MR_350_clean\"]"
+mkdir tests
+cd tests
+../nextflow ../src/SNP_calling.nf -c ../src/SNP_calling.config -profile docker --fasta "../data/fasta/DBG2OLC_output2.fasta" --fastq "../data/samples/*_{1,2}.fastq.gz" -resume -w ~/data/work_s/ --tumor "[\"s_NG-10944_JU2859_bis_lib169352_5217_1\"]" --normal "[\"s_MR_550_clean\", \"s_MR_350_clean\"]" --seq_number 800000
 ~/scripts/sms.sh "SNP done"
 
 # real set analysis
 
-./nextflow src/SNP_calling.nf -c src/SNP_calling.config -profile docker --fasta "data/fasta/DBG2OLC-output2.fasta" --fastq "data/fastq/*_{1,2}.fastq.gz" -resume -w ~/data/work/ --tumor "[\"NG-10944_JU2859_bis_lib169352_5217_1\"]" --normal "[\"MR_550_clean\", \"MR_350_clean\"]"
+./nextflow src/SNP_calling.nf -c src/SNP_calling.config -profile docker --fasta "data/fasta/DBG2OLC_output2.fasta" --fastq "data/fastq/*_{1,2}.fastq.gz" -resume -w ~/data/work/ --tumor "[\"NG-10944_JU2859_bis_lib169352_5217_1\"]" --normal "[\"MR_550_clean\", \"MR_350_clean\"]"
 ~/scripts/sms.sh "SNP done"
 
 ./nextflow src/SNP_calling.nf -c src/SNP_calling.config -profile docker --fasta "data/fasta/DBG2OLC-output2.fasta" --fastq "data/fastq/*_{1,2}.fastq.gz" --sam "results/mapping/sam/*.sam" -resume -w ~/data/work/
diff --git a/src/SNP_calling.config b/src/SNP_calling.config
index 24d288442ac8b9c32b4b06bdca044a2bf80623d5..947ddf2c55fea6b024efa13827b098115fb0bbf7 100644
--- a/src/SNP_calling.config
+++ b/src/SNP_calling.config
@@ -9,6 +9,9 @@ profiles {
       withName: trimming {
         container = "urqt:d62c1f8"
       }
+      withName: filter_fasta {
+        container = "bioawk:1.0"
+      }
       withName: index_fasta {
         container = "bowtie2:2.3.4.1"
       }
@@ -33,6 +36,18 @@ profiles {
       withName: HaplotypeCaller {
         container = "gatk:4.0.8.1"
       }
+      withName: GetPileupSummaries {
+        container = "gatk:4.0.8.1"
+      }
+      withName: CalculateContamination {
+        container = "gatk:4.0.8.1"
+      }
+      withName: CollectSequencingArtifactMetrics {
+        container = "gatk:4.0.8.1"
+      }
+      withName: filter_SNP {
+        container = "gatk:4.0.8.1"
+      }
     }
   }
   sge {
diff --git a/src/SNP_calling.nf b/src/SNP_calling.nf
index 3d040bfced133f1ef6c794a20e4e805e369d3604..514d83db11d7605b722aea7ef2fff608d4019569 100644
--- a/src/SNP_calling.nf
+++ b/src/SNP_calling.nf
@@ -1,7 +1,9 @@
 params.fastq = "$baseDir/data/*.fastq"
 params.fasta = "$baseDir/data/*.fasta"
+params.seq_length = 800000
 log.info "fastq files : ${params.fastq}"
 log.info "fasta files : ${params.fasta}"
+log.info "fasta length to retain : ${params.seq_length}"
 def normal_sample = Eval.me(params.normal)
 def tumor_sample = Eval.me(params.tumor)
 log.info "normal : ${normal_sample}"
@@ -11,11 +13,7 @@ Channel
   .fromPath( params.fasta )
   .ifEmpty { error "Cannot find any fasta files matching: ${params.fasta}" }
   .map { it -> [(it.baseName =~ /([^\.]*)/)[0][1], it]}
-  .into { fasta_file;
-     indel_fasta_file;
-     recalibration_fasta_file;
-     haplotypecaller_fasta_file
-  }
+  .set { fasta_file  }
 Channel
   .fromFilePairs( params.fastq )
   .ifEmpty { error "Cannot find any fastq files matching: ${params.fastq}" }
@@ -61,13 +59,39 @@ UrQt --t 20 --m ${task.cpus} --gz \
 """
 }
 
+process filter_fasta {
+  tag "$fasta_id"
+  cpus 4
+  publishDir "results/fasta/", mode: 'copy'
+
+  input:
+    set fasta_id, file(fasta) from fasta_file
+
+  output:
+    set fasta_idf, "*_filtered.fasta" into filter_fasta_files
+
+  script:
+    fasta_idf = "${fasta_id}_filtered"
+"""
+bioawk -c fastx '{ if(length(\$seq) > $params.seq_length) { print ">"\$name; print \$seq }}' ${fasta} > \
+${fasta_id}_filtered.fasta
+"""
+}
+
+filter_fasta_files.into{
+  filtered_fasta_files;
+  indel_fasta_file;
+  recalibration_fasta_file;
+  haplotypecaller_fasta_file
+}
+
 process index_fasta {
   tag "$file_id"
   cpus 4
   publishDir "results/mapping/index/", mode: 'copy'
 
   input:
-    set file_id, file(fasta) from fasta_file
+    set fasta_id, file(fasta) from filtered_fasta_files
 
   output:
     file "*.index*" into index_files
@@ -133,19 +157,18 @@ sambamba sort -t ${task.cpus} --tmpdir=./tmp -o ${file_id}_sorted.bam ${bam}
 
 sorted_bam_files.into {
   sorted_bam_file_norm;
-  sorted_bam_file_tumor
+  sorted_bam_file_tumor;
 }
 
 collect_sorted_bam_file_norm = sorted_bam_file_norm
   .filter{ normal_sample.contains(it[0]) }
   .map { it -> it[1]}
-  .collect()
+  .buffer( size: normal_sample.size())
   .map { it -> ["normal_sample", it]}
-
 collect_sorted_bam_file_tumor = sorted_bam_file_tumor
   .filter{ tumor_sample.contains(it[0]) }
   .map { it -> it[1]}
-  .collect()
+  .buffer( size: tumor_sample.size())
   .map { it -> ["tumor_sample", it]}
 
 collect_sorted_bam_file = Channel.create()
@@ -154,6 +177,7 @@ collect_sorted_bam_file = Channel.create()
 process merge_bam {
   tag "$file_id"
   cpus 4
+  publishDir "results/mapping/bam/", mode: 'copy'
 
   input:
     set file_id, file(bam) from collect_sorted_bam_file
@@ -200,7 +224,7 @@ index_bam_files.into{
 }
 
 haplotypecaller_fasta_file.into{
-    haplo_fasta_file;
+    final_fasta_file;
     index2_fasta_file
     index3_fasta_file
   }
@@ -237,117 +261,191 @@ samtools faidx ${fasta}
 """
 }
 
-haplotypecaller_bam_files_norm = haplo_bam_files_norm
+final_bam_files_norm = haplo_bam_files_norm
   .filter{ "normal_sample" == it[0] }
-haplotypecaller_bam_files_tumor = haplo_bam_files_tumor
-   .filter{ "tumor_sample" == it[0] }
+final_bam_files_tumor = haplo_bam_files_tumor
+  .filter{ "tumor_sample" == it[0] }
 
 indexed_bam_files.into {
   index_bam_files_norm;
   index_bam_files_tumor
 }
-indexed_bam_files_norm = index_bam_files_norm
+final_indexed_bam_files_norm = index_bam_files_norm
   .filter{ "normal_sample" == it[0] }
-indexed_bam_files_tumor = index_bam_files_tumor
+final_indexed_bam_files_tumor = index_bam_files_tumor
    .filter{ "tumor_sample" == it[0] }
 
+final_bam_files_norm.set{
+  haplotypecaller_bam_files_norm
+}
+final_bam_files_tumor.into{
+  haplotypecaller_bam_files_tumor;
+  artifact_bam_files_tumor;
+  pileup_bam_files_tumor
+}
+final_indexed_bam_files_norm.set{
+  haplo_index_bam_files_norm
+}
+final_indexed_bam_files_tumor.into{
+  haplo_index_bam_files_tumor;
+  artifact_index_bam_files_tumor;
+  pileup_index_bam_files_tumor
+}
+final_fasta_file.into{
+  haplo_fasta_file;
+  artifact_fasta_file;
+  filter_fasta_file
+}
+indexed2_fasta_file.into{
+  haplo_indexed2_fasta_file;
+  artifact_indexed2_fasta_file;
+  filter_indexed2_fasta_file
+}
+indexed3_fasta_file.into{
+  haplo_indexed3_fasta_file;
+  artifact_indexed3_fasta_file;
+  filter_indexed3_fasta_file
+}
+
 process HaplotypeCaller {
-  tag "$file_id"
-  cpus 4
+  tag "$file_id_norm"
+  cpus 10
   publishDir "results/SNP/vcf/", mode: 'copy'
 
   input:
-    set file_id_norm, file(bam_norm) from haplotypecaller_bam_files_norm.collect()
-    set file_ididx_norm, file(bamidx_norm) from indexed_bam_files_norm.collect()
-    set file_id_tumor, file(bam_tumor) from haplotypecaller_bam_files_tumor.collect()
-    set file_ididx_tumor, file(bamidx_tumor) from indexed_bam_files_tumor.collect()
-    set genome_id, file(fasta) from haplo_fasta_file.collect()
-    set genome2_idx, file(fasta2idx) from indexed2_fasta_file.collect()
-    set genome3_idx, file(fasta3idx) from indexed3_fasta_file.collect()
+    set file_id_norm, file(bam_norm) from haplotypecaller_bam_files_norm
+    set file_ididx_norm, file(bamidx_norm) from haplo_index_bam_files_norm
+    set file_id_tumor, file(bam_tumor) from haplotypecaller_bam_files_tumor
+    set file_ididx_tumor, file(bamidx_tumor) from haplo_index_bam_files_tumor
+    set genome_id, file(fasta) from haplo_fasta_file
+    set genome2_idx, file(fasta2idx) from haplo_indexed2_fasta_file
+    set genome3_idx, file(fasta3idx) from haplo_indexed3_fasta_file
 
   output:
-    set file_id, "*.vcf" into vcf_files
-    set file_id, "*.bam" into realigned_bams_files
+    set file_id_norm, "*.vcf" into vcf_files
+    set file_id_norm, "*.vcf.idx" into index_vcf_files
+    set file_id_norm, "*.bam" into realigned_bams_files
+    file "*_mutect2_report.txt" into mutect2_report
 
   script:
 """
-gatk Mutect2 --native-pair-hmm-threads ${task.cpus} -R ${fasta} \
+gatk --java-options "-Xmx32G" Mutect2 --native-pair-hmm-threads ${task.cpus} -R ${fasta} \
 -I ${bam_tumor} -tumor ${file_id_tumor} \
 -I ${bam_norm} -normal ${file_id_norm} \
--O ${file_id}_raw_calls.g.vcf \
--bamout ${file_id}_realigned.bam
+-O ${file_id_norm}_raw_calls.g.vcf \
+-bamout ${file_id_norm}_realigned.bam 2> ${file_id_norm}_mutect2_report.txt
 """
 }
 
+vcf_files.into{
+  pileup_vcf_files;
+  filter_vcf_files
+}
+index_vcf_files.into{
+  pileup_index_vcf_files;
+  filter_index_vcf_files
+}
+
 /*
-process filter_SNP {
-  tag "$file_id"
-  cpus 4
+process GetPileupSummaries {
+  tag "$file_id_norm"
+  cpus 1
   publishDir "results/SNP/vcf/", mode: 'copy'
 
   input:
+    set file_id_norm, file(vcf) from pileup_vcf_files
+    set fileidx_id_norm, file(vcfidx) from pileup_index_vcf_files
+    set file_id_tumor, file(bam_tumor) from pileup_bam_files_tumor
 
   output:
-    set file_id, "*.vcf" into vcf_files_filtered
+    set file_id_tumor, "*.table" into pileup_files
+    file "*_pileup_report.txt" into pileup_report
 
   script:
 """
-gatk --java-options "-Xmx2g" Mutect2 \
--R hg38/Homo_sapiens_assembly38.fasta \
--I tumor.bam \
--I normal.bam \
--tumor HCC1143_tumor \
--normal HCC1143_normal \
--pon resources/chr17_pon.vcf.gz \
---germline-resource resources/chr17_af-only-gnomad_grch38.vcf.gz \
---af-of-alleles-not-in-resource 0.0000025 \
---disable-read-filter MateOnSameContigOrNoMappedMateReadFilter \
--L chr17plus.interval_list \
--O 1_somatic_m2.vcf.gz \
--bamout 2_tumor_normal_m2.bam
-
-gatk Mutect2 \
--R ~/Documents/ref/hg38/Homo_sapiens_assembly38.fasta \
--I HG00190.bam \
--tumor HG00190 \
---disable-read-filter MateOnSameContigOrNoMappedMateReadFilter \
--L chr17plus.interval_list \
--O 3_HG00190.vcf.gz
-
-gatk CreateSomaticPanelOfNormals \
--vcfs 3_HG00190.vcf.gz \
--vcfs 4_NA19771.vcf.gz \
--vcfs 5_HG02759.vcf.gz \
--O 6_threesamplepon.vcf.gz
-
-gatk GetPileupSummaries \
--I tumor.bam \
--V resources/chr17_small_exac_common_3_grch38.vcf.gz \
--O 7_tumor_getpileupsummaries.table
-
-gatk CalculateContamination \
--I 7_tumor_getpileupsummaries.table \
--O 8_tumor_calculatecontamination.table
+gatk --java-options "-Xmx32G" GetPileupSummaries \
+-I ${bam_tumor} \
+-V ${vcf} \
+-O ${file_id_tumor}_pileup.table \
+2> ${file_id_tumor}_pileup_report.txt
+"""
+}
 
-gatk FilterMutectCalls \
--V somatic_m2.vcf.gz \
---contamination-table tumor_calculatecontamination.table \
--O 9_somatic_oncefiltered.vcf.gz
+process CalculateContamination {
+  tag "$file_id_tumor"
+  cpus 1
+  publishDir "results/SNP/vcf/", mode: 'copy'
+
+  input:
+    set file_id_tumor, file(pileup_table) from pileup_files
+
+  output:
+    set file_id_tumor, "*.table" into contamination_files
+    file "*_contamination_report.txt" into contamination_report
+
+  script:
+"""
+gatk --java-options "-Xmx32G" CalculateContamination \
+-I ${pileup_table} \
+-O $file_id_tumor}_contamination.table \
+2> ${file_id_tumor}_contamination_report.txt
+"""
+}
+*/
+
+process CollectSequencingArtifactMetrics {
+  tag "$file_id_tumor"
+  cpus 1
+  publishDir "results/SNP/vcf/", mode: 'copy'
+
+  input:
+    set file_id_tumor, file(bam_tumor) from artifact_bam_files_tumor
+    set genome_id, file(fasta) from artifact_fasta_file
+    set genome2_idx, file(fasta2idx) from artifact_indexed2_fasta_file
+    set genome3_idx, file(fasta3idx) from artifact_indexed3_fasta_file
 
+  output:
+    set file_id_tumor, "*_artifact.*" into artifact_files
+    file "*_artifact_report.txt" into artifact_report
+
+  script:
+"""
 gatk CollectSequencingArtifactMetrics \
--I tumor.bam \
--O 10_tumor_artifact \
-–-FILE_EXTENSION ".txt" \
--R ~/Documents/ref/hg38/Homo_sapiens_assembly38.fasta
-
-gatk FilterByOrientationBias \
--A G/T \
--A C/T \
--V 9_somatic_oncefiltered.vcf.gz \
--P tumor_artifact.pre_adapter_detail_metrics.txt \
--O 11_somatic_twicefiltered.vcf.gz
+-I ${bam_tumor} \
+-O ${file_id_tumor}_artifact \
+-R ${fasta} \
+2> ${file_id_tumor}_artifact_report.txt
+"""
+}
+
+process filter_SNP {
+  tag "$file_id_norm"
+  cpus 1
+  publishDir "results/SNP/vcf/", mode: 'copy'
 
+  input:
+    set file_id_norm, file(vcf) from filter_vcf_files
+    set fileidx_id_norm, file(vcfid) from filter_index_vcf_files
+    set genome_id, file(fasta) from filter_fasta_file
+    set genome2_idx, file(fasta2idx) from filter_indexed2_fasta_file
+    set genome3_idx, file(fasta3idx) from filter_indexed3_fasta_file
+
+  output:
+    set file_id_norm, "*.vcf" into vcf_files_filtered
+    file "*_filter_report.txt" into filter_report
+
+  script:
+"""
+gatk FilterMutectCalls \
+-V ${vcf} \
+-O ${file_id_norm}_filtered.vcf \
+2> ${file_id_norm}_filter_report.txt
+gatk SelectVariants \
+-R ${fasta} \
+--variant ${file_id_norm}_filtered.vcf \
+--exclude-filtered \
+-O ${file_id_norm}_filtered_pass.vcf \
+2>> ${file_id_norm}_filter_report.txt
 """
 }
 
-*/
diff --git a/src/docker_modules/bioawk/1.0/Dockerfile b/src/docker_modules/bioawk/1.0/Dockerfile
new file mode 100644
index 0000000000000000000000000000000000000000..f7ca9803e60926ed90bce0abfe4cf7af90d72672
--- /dev/null
+++ b/src/docker_modules/bioawk/1.0/Dockerfile
@@ -0,0 +1,21 @@
+FROM ubuntu:18.04
+MAINTAINER Laurent Modolo
+
+ENV BIOAWK_VERSION=1.0
+ENV PACKAGES git=1:2.17* \
+   build-essential=12.4* \
+   ca-certificates=20180409 \
+   zlib1g-dev=1:1.2.11* \
+   byacc
+
+RUN apt-get update && \
+    apt-get install -y --no-install-recommends ${PACKAGES} && \
+    apt-get clean
+
+RUN git clone https://github.com/lh3/bioawk.git && \
+  cd bioawk && \
+  git checkout tags/v${BIOAWK_VERSION} && \
+  make && \
+  cd .. && \
+  mv bioawk/bioawk /usr/bin/ && \
+  rm -Rf bioawk
diff --git a/src/docker_modules/bioawk/1.0/docker_init.sh b/src/docker_modules/bioawk/1.0/docker_init.sh
new file mode 100755
index 0000000000000000000000000000000000000000..23c163352f6d380d6dcb0accb6de24d8281af8b6
--- /dev/null
+++ b/src/docker_modules/bioawk/1.0/docker_init.sh
@@ -0,0 +1,2 @@
+#!/bin/sh
+docker build src/docker_modules/bioawk/1.0 -t 'bioawk:1.0'