diff --git a/src/1_JU28_59vs17_SNP_calling.sh b/src/1_JU28_59vs17_SNP_calling.sh index be184a5d28a761426da572ccad00f2a05a7b6634..d806b325d9df67c6600287c1d1a09264fdcd0006 100644 --- a/src/1_JU28_59vs17_SNP_calling.sh +++ b/src/1_JU28_59vs17_SNP_calling.sh @@ -11,12 +11,14 @@ cd ~/projects/JU28_59vs17_SNP/ # training set analysis -./nextflow src/SNP_calling.nf -c src/SNP_calling.config -profile docker --fasta "data/fasta/DBG2OLC-output2.fasta" --fastq "data/samples/*_{1,2}.fastq.gz" -resume -w ~/data/work_s/ --tumor "[\"s_NG-10944_JU2859_bis_lib169352_5217_1\"]" --normal "[\"s_MR_550_clean\", \"s_MR_350_clean\"]" +mkdir tests +cd tests +../nextflow ../src/SNP_calling.nf -c ../src/SNP_calling.config -profile docker --fasta "../data/fasta/DBG2OLC_output2.fasta" --fastq "../data/samples/*_{1,2}.fastq.gz" -resume -w ~/data/work_s/ --tumor "[\"s_NG-10944_JU2859_bis_lib169352_5217_1\"]" --normal "[\"s_MR_550_clean\", \"s_MR_350_clean\"]" --seq_number 800000 ~/scripts/sms.sh "SNP done" # real set analysis -./nextflow src/SNP_calling.nf -c src/SNP_calling.config -profile docker --fasta "data/fasta/DBG2OLC-output2.fasta" --fastq "data/fastq/*_{1,2}.fastq.gz" -resume -w ~/data/work/ --tumor "[\"NG-10944_JU2859_bis_lib169352_5217_1\"]" --normal "[\"MR_550_clean\", \"MR_350_clean\"]" +./nextflow src/SNP_calling.nf -c src/SNP_calling.config -profile docker --fasta "data/fasta/DBG2OLC_output2.fasta" --fastq "data/fastq/*_{1,2}.fastq.gz" -resume -w ~/data/work/ --tumor "[\"NG-10944_JU2859_bis_lib169352_5217_1\"]" --normal "[\"MR_550_clean\", \"MR_350_clean\"]" ~/scripts/sms.sh "SNP done" ./nextflow src/SNP_calling.nf -c src/SNP_calling.config -profile docker --fasta "data/fasta/DBG2OLC-output2.fasta" --fastq "data/fastq/*_{1,2}.fastq.gz" --sam "results/mapping/sam/*.sam" -resume -w ~/data/work/ diff --git a/src/SNP_calling.config b/src/SNP_calling.config index 24d288442ac8b9c32b4b06bdca044a2bf80623d5..947ddf2c55fea6b024efa13827b098115fb0bbf7 100644 --- a/src/SNP_calling.config +++ b/src/SNP_calling.config @@ -9,6 +9,9 @@ profiles { withName: trimming { container = "urqt:d62c1f8" } + withName: filter_fasta { + container = "bioawk:1.0" + } withName: index_fasta { container = "bowtie2:2.3.4.1" } @@ -33,6 +36,18 @@ profiles { withName: HaplotypeCaller { container = "gatk:4.0.8.1" } + withName: GetPileupSummaries { + container = "gatk:4.0.8.1" + } + withName: CalculateContamination { + container = "gatk:4.0.8.1" + } + withName: CollectSequencingArtifactMetrics { + container = "gatk:4.0.8.1" + } + withName: filter_SNP { + container = "gatk:4.0.8.1" + } } } sge { diff --git a/src/SNP_calling.nf b/src/SNP_calling.nf index 3d040bfced133f1ef6c794a20e4e805e369d3604..514d83db11d7605b722aea7ef2fff608d4019569 100644 --- a/src/SNP_calling.nf +++ b/src/SNP_calling.nf @@ -1,7 +1,9 @@ params.fastq = "$baseDir/data/*.fastq" params.fasta = "$baseDir/data/*.fasta" +params.seq_length = 800000 log.info "fastq files : ${params.fastq}" log.info "fasta files : ${params.fasta}" +log.info "fasta length to retain : ${params.seq_length}" def normal_sample = Eval.me(params.normal) def tumor_sample = Eval.me(params.tumor) log.info "normal : ${normal_sample}" @@ -11,11 +13,7 @@ Channel .fromPath( params.fasta ) .ifEmpty { error "Cannot find any fasta files matching: ${params.fasta}" } .map { it -> [(it.baseName =~ /([^\.]*)/)[0][1], it]} - .into { fasta_file; - indel_fasta_file; - recalibration_fasta_file; - haplotypecaller_fasta_file - } + .set { fasta_file } Channel .fromFilePairs( params.fastq ) .ifEmpty { error "Cannot find any fastq files matching: ${params.fastq}" } @@ -61,13 +59,39 @@ UrQt --t 20 --m ${task.cpus} --gz \ """ } +process filter_fasta { + tag "$fasta_id" + cpus 4 + publishDir "results/fasta/", mode: 'copy' + + input: + set fasta_id, file(fasta) from fasta_file + + output: + set fasta_idf, "*_filtered.fasta" into filter_fasta_files + + script: + fasta_idf = "${fasta_id}_filtered" +""" +bioawk -c fastx '{ if(length(\$seq) > $params.seq_length) { print ">"\$name; print \$seq }}' ${fasta} > \ +${fasta_id}_filtered.fasta +""" +} + +filter_fasta_files.into{ + filtered_fasta_files; + indel_fasta_file; + recalibration_fasta_file; + haplotypecaller_fasta_file +} + process index_fasta { tag "$file_id" cpus 4 publishDir "results/mapping/index/", mode: 'copy' input: - set file_id, file(fasta) from fasta_file + set fasta_id, file(fasta) from filtered_fasta_files output: file "*.index*" into index_files @@ -133,19 +157,18 @@ sambamba sort -t ${task.cpus} --tmpdir=./tmp -o ${file_id}_sorted.bam ${bam} sorted_bam_files.into { sorted_bam_file_norm; - sorted_bam_file_tumor + sorted_bam_file_tumor; } collect_sorted_bam_file_norm = sorted_bam_file_norm .filter{ normal_sample.contains(it[0]) } .map { it -> it[1]} - .collect() + .buffer( size: normal_sample.size()) .map { it -> ["normal_sample", it]} - collect_sorted_bam_file_tumor = sorted_bam_file_tumor .filter{ tumor_sample.contains(it[0]) } .map { it -> it[1]} - .collect() + .buffer( size: tumor_sample.size()) .map { it -> ["tumor_sample", it]} collect_sorted_bam_file = Channel.create() @@ -154,6 +177,7 @@ collect_sorted_bam_file = Channel.create() process merge_bam { tag "$file_id" cpus 4 + publishDir "results/mapping/bam/", mode: 'copy' input: set file_id, file(bam) from collect_sorted_bam_file @@ -200,7 +224,7 @@ index_bam_files.into{ } haplotypecaller_fasta_file.into{ - haplo_fasta_file; + final_fasta_file; index2_fasta_file index3_fasta_file } @@ -237,117 +261,191 @@ samtools faidx ${fasta} """ } -haplotypecaller_bam_files_norm = haplo_bam_files_norm +final_bam_files_norm = haplo_bam_files_norm .filter{ "normal_sample" == it[0] } -haplotypecaller_bam_files_tumor = haplo_bam_files_tumor - .filter{ "tumor_sample" == it[0] } +final_bam_files_tumor = haplo_bam_files_tumor + .filter{ "tumor_sample" == it[0] } indexed_bam_files.into { index_bam_files_norm; index_bam_files_tumor } -indexed_bam_files_norm = index_bam_files_norm +final_indexed_bam_files_norm = index_bam_files_norm .filter{ "normal_sample" == it[0] } -indexed_bam_files_tumor = index_bam_files_tumor +final_indexed_bam_files_tumor = index_bam_files_tumor .filter{ "tumor_sample" == it[0] } +final_bam_files_norm.set{ + haplotypecaller_bam_files_norm +} +final_bam_files_tumor.into{ + haplotypecaller_bam_files_tumor; + artifact_bam_files_tumor; + pileup_bam_files_tumor +} +final_indexed_bam_files_norm.set{ + haplo_index_bam_files_norm +} +final_indexed_bam_files_tumor.into{ + haplo_index_bam_files_tumor; + artifact_index_bam_files_tumor; + pileup_index_bam_files_tumor +} +final_fasta_file.into{ + haplo_fasta_file; + artifact_fasta_file; + filter_fasta_file +} +indexed2_fasta_file.into{ + haplo_indexed2_fasta_file; + artifact_indexed2_fasta_file; + filter_indexed2_fasta_file +} +indexed3_fasta_file.into{ + haplo_indexed3_fasta_file; + artifact_indexed3_fasta_file; + filter_indexed3_fasta_file +} + process HaplotypeCaller { - tag "$file_id" - cpus 4 + tag "$file_id_norm" + cpus 10 publishDir "results/SNP/vcf/", mode: 'copy' input: - set file_id_norm, file(bam_norm) from haplotypecaller_bam_files_norm.collect() - set file_ididx_norm, file(bamidx_norm) from indexed_bam_files_norm.collect() - set file_id_tumor, file(bam_tumor) from haplotypecaller_bam_files_tumor.collect() - set file_ididx_tumor, file(bamidx_tumor) from indexed_bam_files_tumor.collect() - set genome_id, file(fasta) from haplo_fasta_file.collect() - set genome2_idx, file(fasta2idx) from indexed2_fasta_file.collect() - set genome3_idx, file(fasta3idx) from indexed3_fasta_file.collect() + set file_id_norm, file(bam_norm) from haplotypecaller_bam_files_norm + set file_ididx_norm, file(bamidx_norm) from haplo_index_bam_files_norm + set file_id_tumor, file(bam_tumor) from haplotypecaller_bam_files_tumor + set file_ididx_tumor, file(bamidx_tumor) from haplo_index_bam_files_tumor + set genome_id, file(fasta) from haplo_fasta_file + set genome2_idx, file(fasta2idx) from haplo_indexed2_fasta_file + set genome3_idx, file(fasta3idx) from haplo_indexed3_fasta_file output: - set file_id, "*.vcf" into vcf_files - set file_id, "*.bam" into realigned_bams_files + set file_id_norm, "*.vcf" into vcf_files + set file_id_norm, "*.vcf.idx" into index_vcf_files + set file_id_norm, "*.bam" into realigned_bams_files + file "*_mutect2_report.txt" into mutect2_report script: """ -gatk Mutect2 --native-pair-hmm-threads ${task.cpus} -R ${fasta} \ +gatk --java-options "-Xmx32G" Mutect2 --native-pair-hmm-threads ${task.cpus} -R ${fasta} \ -I ${bam_tumor} -tumor ${file_id_tumor} \ -I ${bam_norm} -normal ${file_id_norm} \ --O ${file_id}_raw_calls.g.vcf \ --bamout ${file_id}_realigned.bam +-O ${file_id_norm}_raw_calls.g.vcf \ +-bamout ${file_id_norm}_realigned.bam 2> ${file_id_norm}_mutect2_report.txt """ } +vcf_files.into{ + pileup_vcf_files; + filter_vcf_files +} +index_vcf_files.into{ + pileup_index_vcf_files; + filter_index_vcf_files +} + /* -process filter_SNP { - tag "$file_id" - cpus 4 +process GetPileupSummaries { + tag "$file_id_norm" + cpus 1 publishDir "results/SNP/vcf/", mode: 'copy' input: + set file_id_norm, file(vcf) from pileup_vcf_files + set fileidx_id_norm, file(vcfidx) from pileup_index_vcf_files + set file_id_tumor, file(bam_tumor) from pileup_bam_files_tumor output: - set file_id, "*.vcf" into vcf_files_filtered + set file_id_tumor, "*.table" into pileup_files + file "*_pileup_report.txt" into pileup_report script: """ -gatk --java-options "-Xmx2g" Mutect2 \ --R hg38/Homo_sapiens_assembly38.fasta \ --I tumor.bam \ --I normal.bam \ --tumor HCC1143_tumor \ --normal HCC1143_normal \ --pon resources/chr17_pon.vcf.gz \ ---germline-resource resources/chr17_af-only-gnomad_grch38.vcf.gz \ ---af-of-alleles-not-in-resource 0.0000025 \ ---disable-read-filter MateOnSameContigOrNoMappedMateReadFilter \ --L chr17plus.interval_list \ --O 1_somatic_m2.vcf.gz \ --bamout 2_tumor_normal_m2.bam - -gatk Mutect2 \ --R ~/Documents/ref/hg38/Homo_sapiens_assembly38.fasta \ --I HG00190.bam \ --tumor HG00190 \ ---disable-read-filter MateOnSameContigOrNoMappedMateReadFilter \ --L chr17plus.interval_list \ --O 3_HG00190.vcf.gz - -gatk CreateSomaticPanelOfNormals \ --vcfs 3_HG00190.vcf.gz \ --vcfs 4_NA19771.vcf.gz \ --vcfs 5_HG02759.vcf.gz \ --O 6_threesamplepon.vcf.gz - -gatk GetPileupSummaries \ --I tumor.bam \ --V resources/chr17_small_exac_common_3_grch38.vcf.gz \ --O 7_tumor_getpileupsummaries.table - -gatk CalculateContamination \ --I 7_tumor_getpileupsummaries.table \ --O 8_tumor_calculatecontamination.table +gatk --java-options "-Xmx32G" GetPileupSummaries \ +-I ${bam_tumor} \ +-V ${vcf} \ +-O ${file_id_tumor}_pileup.table \ +2> ${file_id_tumor}_pileup_report.txt +""" +} -gatk FilterMutectCalls \ --V somatic_m2.vcf.gz \ ---contamination-table tumor_calculatecontamination.table \ --O 9_somatic_oncefiltered.vcf.gz +process CalculateContamination { + tag "$file_id_tumor" + cpus 1 + publishDir "results/SNP/vcf/", mode: 'copy' + + input: + set file_id_tumor, file(pileup_table) from pileup_files + + output: + set file_id_tumor, "*.table" into contamination_files + file "*_contamination_report.txt" into contamination_report + + script: +""" +gatk --java-options "-Xmx32G" CalculateContamination \ +-I ${pileup_table} \ +-O $file_id_tumor}_contamination.table \ +2> ${file_id_tumor}_contamination_report.txt +""" +} +*/ + +process CollectSequencingArtifactMetrics { + tag "$file_id_tumor" + cpus 1 + publishDir "results/SNP/vcf/", mode: 'copy' + + input: + set file_id_tumor, file(bam_tumor) from artifact_bam_files_tumor + set genome_id, file(fasta) from artifact_fasta_file + set genome2_idx, file(fasta2idx) from artifact_indexed2_fasta_file + set genome3_idx, file(fasta3idx) from artifact_indexed3_fasta_file + output: + set file_id_tumor, "*_artifact.*" into artifact_files + file "*_artifact_report.txt" into artifact_report + + script: +""" gatk CollectSequencingArtifactMetrics \ --I tumor.bam \ --O 10_tumor_artifact \ -–-FILE_EXTENSION ".txt" \ --R ~/Documents/ref/hg38/Homo_sapiens_assembly38.fasta - -gatk FilterByOrientationBias \ --A G/T \ --A C/T \ --V 9_somatic_oncefiltered.vcf.gz \ --P tumor_artifact.pre_adapter_detail_metrics.txt \ --O 11_somatic_twicefiltered.vcf.gz +-I ${bam_tumor} \ +-O ${file_id_tumor}_artifact \ +-R ${fasta} \ +2> ${file_id_tumor}_artifact_report.txt +""" +} + +process filter_SNP { + tag "$file_id_norm" + cpus 1 + publishDir "results/SNP/vcf/", mode: 'copy' + input: + set file_id_norm, file(vcf) from filter_vcf_files + set fileidx_id_norm, file(vcfid) from filter_index_vcf_files + set genome_id, file(fasta) from filter_fasta_file + set genome2_idx, file(fasta2idx) from filter_indexed2_fasta_file + set genome3_idx, file(fasta3idx) from filter_indexed3_fasta_file + + output: + set file_id_norm, "*.vcf" into vcf_files_filtered + file "*_filter_report.txt" into filter_report + + script: +""" +gatk FilterMutectCalls \ +-V ${vcf} \ +-O ${file_id_norm}_filtered.vcf \ +2> ${file_id_norm}_filter_report.txt +gatk SelectVariants \ +-R ${fasta} \ +--variant ${file_id_norm}_filtered.vcf \ +--exclude-filtered \ +-O ${file_id_norm}_filtered_pass.vcf \ +2>> ${file_id_norm}_filter_report.txt """ } -*/ diff --git a/src/docker_modules/bioawk/1.0/Dockerfile b/src/docker_modules/bioawk/1.0/Dockerfile new file mode 100644 index 0000000000000000000000000000000000000000..f7ca9803e60926ed90bce0abfe4cf7af90d72672 --- /dev/null +++ b/src/docker_modules/bioawk/1.0/Dockerfile @@ -0,0 +1,21 @@ +FROM ubuntu:18.04 +MAINTAINER Laurent Modolo + +ENV BIOAWK_VERSION=1.0 +ENV PACKAGES git=1:2.17* \ + build-essential=12.4* \ + ca-certificates=20180409 \ + zlib1g-dev=1:1.2.11* \ + byacc + +RUN apt-get update && \ + apt-get install -y --no-install-recommends ${PACKAGES} && \ + apt-get clean + +RUN git clone https://github.com/lh3/bioawk.git && \ + cd bioawk && \ + git checkout tags/v${BIOAWK_VERSION} && \ + make && \ + cd .. && \ + mv bioawk/bioawk /usr/bin/ && \ + rm -Rf bioawk diff --git a/src/docker_modules/bioawk/1.0/docker_init.sh b/src/docker_modules/bioawk/1.0/docker_init.sh new file mode 100755 index 0000000000000000000000000000000000000000..23c163352f6d380d6dcb0accb6de24d8281af8b6 --- /dev/null +++ b/src/docker_modules/bioawk/1.0/docker_init.sh @@ -0,0 +1,2 @@ +#!/bin/sh +docker build src/docker_modules/bioawk/1.0 -t 'bioawk:1.0'