Skip to content
Snippets Groups Projects
Unverified Commit 36fafd89 authored by Laurent Modolo's avatar Laurent Modolo
Browse files

SNP_calling.nf: add SNP_filter step

parent 58e36e9e
Branches
No related tags found
1 merge request!10Module for BWA GATK sambamba samblaster and bcftools
...@@ -9,6 +9,9 @@ profiles { ...@@ -9,6 +9,9 @@ profiles {
withName: trimming { withName: trimming {
container = "urqt:d62c1f8" container = "urqt:d62c1f8"
} }
withName: filter_fasta {
container = "bioawk:1.0"
}
withName: index_fasta { withName: index_fasta {
container = "bwa:0.7.17" container = "bwa:0.7.17"
} }
...@@ -21,9 +24,6 @@ profiles { ...@@ -21,9 +24,6 @@ profiles {
withName: sort_bam { withName: sort_bam {
container = "sambamba:0.6.7" container = "sambamba:0.6.7"
} }
withName: name_fasta {
container = "samtools:1.7"
}
withName: index_bam { withName: index_bam {
container = "sambamba:0.6.7" container = "sambamba:0.6.7"
} }
...@@ -36,6 +36,18 @@ profiles { ...@@ -36,6 +36,18 @@ profiles {
withName: HaplotypeCaller { withName: HaplotypeCaller {
container = "gatk:4.0.8.1" container = "gatk:4.0.8.1"
} }
withName: GetPileupSummaries {
container = "gatk:4.0.8.1"
}
withName: CalculateContamination {
container = "gatk:4.0.8.1"
}
withName: CollectSequencingArtifactMetrics {
container = "gatk:4.0.8.1"
}
withName: filter_SNP {
container = "gatk:4.0.8.1"
}
} }
} }
sge { sge {
......
params.fastq = "$baseDir/data/*.fastq" params.fastq = "$baseDir/data/*.fastq"
params.fasta = "$baseDir/data/*.fasta" params.fasta = "$baseDir/data/*.fasta"
params.seq_length = 800000
log.info "fastq files : ${params.fastq}" log.info "fastq files : ${params.fastq}"
log.info "fasta files : ${params.fasta}" log.info "fasta files : ${params.fasta}"
log.info "fasta length to retain : ${params.seq_length}"
def normal_sample = Eval.me(params.normal) def normal_sample = Eval.me(params.normal)
def tumor_sample = Eval.me(params.tumor) def tumor_sample = Eval.me(params.tumor)
log.info "normal : ${normal_sample}" log.info "normal : ${normal_sample}"
...@@ -11,11 +13,7 @@ Channel ...@@ -11,11 +13,7 @@ Channel
.fromPath( params.fasta ) .fromPath( params.fasta )
.ifEmpty { error "Cannot find any fasta files matching: ${params.fasta}" } .ifEmpty { error "Cannot find any fasta files matching: ${params.fasta}" }
.map { it -> [(it.baseName =~ /([^\.]*)/)[0][1], it]} .map { it -> [(it.baseName =~ /([^\.]*)/)[0][1], it]}
.into { fasta_file; .set { fasta_file }
indel_fasta_file;
recalibration_fasta_file;
haplotypecaller_fasta_file
}
Channel Channel
.fromFilePairs( params.fastq ) .fromFilePairs( params.fastq )
.ifEmpty { error "Cannot find any fastq files matching: ${params.fastq}" } .ifEmpty { error "Cannot find any fastq files matching: ${params.fastq}" }
...@@ -61,13 +59,39 @@ UrQt --t 20 --m ${task.cpus} --gz \ ...@@ -61,13 +59,39 @@ UrQt --t 20 --m ${task.cpus} --gz \
""" """
} }
process filter_fasta {
tag "$fasta_id"
cpus 4
publishDir "results/fasta/", mode: 'copy'
input:
set fasta_id, file(fasta) from fasta_file
output:
set fasta_idf, "*_filtered.fasta" into filter_fasta_files
script:
fasta_idf = "${fasta_id}_filtered"
"""
bioawk -c fastx '{ if(length(\$seq) > $params.seq_length) { print ">"\$name; print \$seq }}' ${fasta} > \
${fasta_id}_filtered.fasta
"""
}
filter_fasta_files.into{
filtered_fasta_files;
indel_fasta_file;
recalibration_fasta_file;
haplotypecaller_fasta_file
}
process index_fasta { process index_fasta {
tag "$fasta_id" tag "$fasta_id"
cpus 4 cpus 4
publishDir "results/mapping/index/", mode: 'copy' publishDir "results/mapping/index/", mode: 'copy'
input: input:
set fasta_id, file(fasta) from fasta_file set fasta_id, file(fasta) from filtered_fasta_files
output: output:
set fasta_id, "${fasta.baseName}.*" into index_files set fasta_id, "${fasta.baseName}.*" into index_files
...@@ -205,7 +229,7 @@ index_bam_files.into{ ...@@ -205,7 +229,7 @@ index_bam_files.into{
} }
haplotypecaller_fasta_file.into{ haplotypecaller_fasta_file.into{
haplo_fasta_file; final_fasta_file;
index2_fasta_file index2_fasta_file
index3_fasta_file index3_fasta_file
} }
...@@ -242,20 +266,49 @@ samtools faidx ${fasta} ...@@ -242,20 +266,49 @@ samtools faidx ${fasta}
""" """
} }
haplotypecaller_bam_files_norm = haplo_bam_files_norm final_bam_files_norm = haplo_bam_files_norm
.filter{ "normal_sample" == it[0] } .filter{ "normal_sample" == it[0] }
haplotypecaller_bam_files_tumor = haplo_bam_files_tumor final_bam_files_tumor = haplo_bam_files_tumor
.filter{ "tumor_sample" == it[0] } .filter{ "tumor_sample" == it[0] }
indexed_bam_files.into { indexed_bam_files.into {
index_bam_files_norm; index_bam_files_norm;
index_bam_files_tumor index_bam_files_tumor
} }
indexed_bam_files_norm = index_bam_files_norm final_indexed_bam_files_norm = index_bam_files_norm
.filter{ "normal_sample" == it[0] } .filter{ "normal_sample" == it[0] }
indexed_bam_files_tumor = index_bam_files_tumor final_indexed_bam_files_tumor = index_bam_files_tumor
.filter{ "tumor_sample" == it[0] } .filter{ "tumor_sample" == it[0] }
final_bam_files_norm.set{
haplotypecaller_bam_files_norm
}
final_bam_files_tumor.into{
haplotypecaller_bam_files_tumor;
artifact_bam_files_tumor;
pileup_bam_files_tumor
}
final_indexed_bam_files_norm.set{
haplo_index_bam_files_norm
}
final_indexed_bam_files_tumor.into{
haplo_index_bam_files_tumor;
artifact_index_bam_files_tumor;
pileup_index_bam_files_tumor
}
final_fasta_file.into{
haplo_fasta_file
artifact_fasta_file
}
indexed2_fasta_file.into{
haplo_indexed2_fasta_file
artifact_indexed2_fasta_file
}
indexed3_fasta_file.into{
haplo_indexed3_fasta_file;
artifact_indexed3_fasta_file
}
process HaplotypeCaller { process HaplotypeCaller {
tag "$file_id_norm" tag "$file_id_norm"
cpus 10 cpus 10
...@@ -263,21 +316,22 @@ process HaplotypeCaller { ...@@ -263,21 +316,22 @@ process HaplotypeCaller {
input: input:
set file_id_norm, file(bam_norm) from haplotypecaller_bam_files_norm set file_id_norm, file(bam_norm) from haplotypecaller_bam_files_norm
set file_ididx_norm, file(bamidx_norm) from indexed_bam_files_norm set file_ididx_norm, file(bamidx_norm) from haplo_index_bam_files_norm
set file_id_tumor, file(bam_tumor) from haplotypecaller_bam_files_tumor set file_id_tumor, file(bam_tumor) from haplotypecaller_bam_files_tumor
set file_ididx_tumor, file(bamidx_tumor) from indexed_bam_files_tumor set file_ididx_tumor, file(bamidx_tumor) from haplo_index_bam_files_tumor
set genome_id, file(fasta) from haplo_fasta_file set genome_id, file(fasta) from haplo_fasta_file
set genome2_idx, file(fasta2idx) from indexed2_fasta_file set genome2_idx, file(fasta2idx) from haplo_indexed2_fasta_file
set genome3_idx, file(fasta3idx) from indexed3_fasta_file set genome3_idx, file(fasta3idx) from haplo_indexed3_fasta_file
output: output:
set file_id_norm, "*.vcf" into vcf_files set file_id_norm, "*.vcf" into vcf_files
set file_id_norm, "*.vcf.idx" into index_vcf_files
set file_id_norm, "*.bam" into realigned_bams_files set file_id_norm, "*.bam" into realigned_bams_files
set "*_mutect2_report.txt" into mutect2_report file "*_mutect2_report.txt" into mutect2_report
script: script:
""" """
gatk Mutect2 --native-pair-hmm-threads ${task.cpus} -R ${fasta} \ gatk --java-options "-Xmx32G" Mutect2 --native-pair-hmm-threads ${task.cpus} -R ${fasta} \
-I ${bam_tumor} -tumor ${file_id_tumor} \ -I ${bam_tumor} -tumor ${file_id_tumor} \
-I ${bam_norm} -normal ${file_id_norm} \ -I ${bam_norm} -normal ${file_id_norm} \
-O ${file_id_norm}_raw_calls.g.vcf \ -O ${file_id_norm}_raw_calls.g.vcf \
...@@ -285,75 +339,106 @@ gatk Mutect2 --native-pair-hmm-threads ${task.cpus} -R ${fasta} \ ...@@ -285,75 +339,106 @@ gatk Mutect2 --native-pair-hmm-threads ${task.cpus} -R ${fasta} \
""" """
} }
vcf_files.into{
pileup_vcf_files;
filter_vcf_files
}
index_vcf_files.into{
pileup_index_vcf_files;
filter_index_vcf_files
}
/* /*
process filter_SNP { process GetPileupSummaries {
tag "$file_id" tag "$file_id_norm"
cpus 4 cpus 1
publishDir "results/SNP/vcf/", mode: 'copy' publishDir "results/SNP/vcf/", mode: 'copy'
input: input:
set file_id_norm, file(vcf) from pileup_vcf_files
set fileidx_id_norm, file(vcfidx) from pileup_index_vcf_files
set file_id_tumor, file(bam_tumor) from pileup_bam_files_tumor
output: output:
set file_id, "*.vcf" into vcf_files_filtered set file_id_tumor, "*.table" into pileup_files
file "*_pileup_report.txt" into pileup_report
script: script:
""" """
gatk --java-options "-Xmx2g" Mutect2 \ gatk --java-options "-Xmx32G" GetPileupSummaries \
-R hg38/Homo_sapiens_assembly38.fasta \ -I ${bam_tumor} \
-I tumor.bam \ -V ${vcf} \
-I normal.bam \ -O ${file_id_tumor}_pileup.table \
-tumor HCC1143_tumor \ 2> ${file_id_tumor}_pileup_report.txt
-normal HCC1143_normal \ """
-pon resources/chr17_pon.vcf.gz \ }
--germline-resource resources/chr17_af-only-gnomad_grch38.vcf.gz \
--af-of-alleles-not-in-resource 0.0000025 \
--disable-read-filter MateOnSameContigOrNoMappedMateReadFilter \
-L chr17plus.interval_list \
-O 1_somatic_m2.vcf.gz \
-bamout 2_tumor_normal_m2.bam
gatk Mutect2 \
-R ~/Documents/ref/hg38/Homo_sapiens_assembly38.fasta \
-I HG00190.bam \
-tumor HG00190 \
--disable-read-filter MateOnSameContigOrNoMappedMateReadFilter \
-L chr17plus.interval_list \
-O 3_HG00190.vcf.gz
gatk CreateSomaticPanelOfNormals \
-vcfs 3_HG00190.vcf.gz \
-vcfs 4_NA19771.vcf.gz \
-vcfs 5_HG02759.vcf.gz \
-O 6_threesamplepon.vcf.gz
gatk GetPileupSummaries \
-I tumor.bam \
-V resources/chr17_small_exac_common_3_grch38.vcf.gz \
-O 7_tumor_getpileupsummaries.table
gatk CalculateContamination \
-I 7_tumor_getpileupsummaries.table \
-O 8_tumor_calculatecontamination.table
gatk FilterMutectCalls \ process CalculateContamination {
-V somatic_m2.vcf.gz \ tag "$file_id_tumor"
--contamination-table tumor_calculatecontamination.table \ cpus 1
-O 9_somatic_oncefiltered.vcf.gz publishDir "results/SNP/vcf/", mode: 'copy'
input:
set file_id_tumor, file(pileup_table) from pileup_files
output:
set file_id_tumor, "*.table" into contamination_files
file "*_contamination_report.txt" into contamination_report
script:
"""
gatk --java-options "-Xmx32G" CalculateContamination \
-I ${pileup_table} \
-O $file_id_tumor}_contamination.table \
2> ${file_id_tumor}_contamination_report.txt
"""
}
*/
process CollectSequencingArtifactMetrics {
tag "$file_id_tumor"
cpus 1
publishDir "results/SNP/vcf/", mode: 'copy'
input:
set file_id_tumor, file(bam_tumor) from artifact_bam_files_tumor
set genome_id, file(fasta) from artifact_fasta_file
set genome2_idx, file(fasta2idx) from artifact_indexed2_fasta_file
set genome3_idx, file(fasta3idx) from artifact_indexed3_fasta_file
output:
set file_id_tumor, "*_artifact.*" into artifact_files
file "*_artifact_report.txt" into artifact_report
script:
"""
gatk CollectSequencingArtifactMetrics \ gatk CollectSequencingArtifactMetrics \
-I tumor.bam \ -I ${bam_tumor} \
-O 10_tumor_artifact \ -O ${file_id_tumor}_artifact \
–-FILE_EXTENSION ".txt" \ -R ${fasta} \
-R ~/Documents/ref/hg38/Homo_sapiens_assembly38.fasta 2> ${file_id_tumor}_artifact_report.txt
"""
gatk FilterByOrientationBias \ }
-A G/T \
-A C/T \ process filter_SNP {
-V 9_somatic_oncefiltered.vcf.gz \ tag "$file_id_norm"
-P tumor_artifact.pre_adapter_detail_metrics.txt \ cpus 1
-O 11_somatic_twicefiltered.vcf.gz publishDir "results/SNP/vcf/", mode: 'copy'
input:
set file_id_norm, file(vcf) from filter_vcf_files
set fileidx_id_norm, file(vcfid) from filter_index_vcf_files
output:
set file_id_norm, "*.vcf" into vcf_files_filtered
file "*_filter_report.txt" into filter_report
script:
"""
gatk FilterMutectCalls \
-V ${vcf} \
-O ${file_id_norm}_filtered.vcf \
2> ${file_id_norm}_filter_report.txt
""" """
} }
*/
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Please register or to comment