// Somatic SNP-calling pipeline (Nextflow DSL1): fastq adapter removal and
// trimming, BWA mapping, BAM sort/merge/index, and GATK Mutect2 calling
// on matched normal/tumor samples.
// (Removed "Newer"/"Older" navigation text left over from a web capture.)
// Pipeline parameters: input globs for reads and genome(s).
params.fastq = "$baseDir/data/*.fastq"
params.fasta = "$baseDir/data/*.fasta"
log.info "fastq files : ${params.fastq}"
log.info "fasta files : ${params.fasta}"
log.info "fasta length to retain : ${params.seq_length}"
// params.normal / params.tumor are evaluated as Groovy expressions
// (expected: lists of sample ids used to partition the read pairs).
// NOTE(review): Eval.me executes arbitrary code -- confirm these
// parameters only ever come from trusted configuration.
def normal_sample = Eval.me(params.normal)
def tumor_sample = Eval.me(params.tumor)
log.info "normal : ${normal_sample}"
log.info "tumor : ${tumor_sample}"
// FIX: the 'Channel.fromPath' head and '.set { fasta_file }' tail of this
// chain were missing -- the orphaned .ifEmpty/.map lines could not run and
// the 'fasta_file' channel consumed by filter_fasta was never created.
// The .map turns each file into an [id, file] tuple, where id is the
// basename up to the first dot.
Channel
    .fromPath( params.fasta )
    .ifEmpty { error "Cannot find any fasta files matching: ${params.fasta}" }
    .map { it -> [(it.baseName =~ /([^\.]*)/)[0][1], it]}
    .set { fasta_file }
// Paired-end reads, grouped by pair id.
Channel
    .fromFilePairs( params.fastq )
    .ifEmpty { error "Cannot find any fastq files matching: ${params.fastq}" }
    .set { fastq_files }
// Clip Illumina adapter sequences from both mates of each read pair with
// cutadapt (-a/-g for R1, -A/-G for R2). Emits the trimmed gzipped pair
// into 'fastq_files_cut' and cutadapt's stdout as *_cutadapt_report.txt.
process adaptor_removal {
tag "$pair_id"
publishDir "results/fastq/adaptor_removal/", mode: 'copy'
input:
set pair_id, file(reads) from fastq_files
output:
set pair_id, file("*.fastq.gz") into fastq_files_cut
file "*_cutadapt_report.txt" into cut_files_report
script:
"""
cutadapt -a AGATCGGAAGAG -g CTCTTCCGATCT -A AGATCGGAAGAG -G CTCTTCCGATCT \
-o ${pair_id}_cut_R1.fastq.gz -p ${pair_id}_cut_R2.fastq.gz \
${reads[0]} ${reads[1]} > ${pair_id}_cutadapt_report.txt
"""
}
// Quality-trim the adapter-clipped pairs with UrQt (phred cutoff 20,
// ${task.cpus} threads, gzipped output). Emits the trimmed pair into
// 'fastq_files_trim' plus the UrQt stdout report.
// FIX: the process was missing its closing brace, which swallowed the
// following filter_fasta process definition.
process trimming {
tag "${pair_id}"
cpus 4
publishDir "results/fastq/trimming/", mode: 'copy'
input:
set pair_id, file(reads) from fastq_files_cut
output:
set pair_id, file("*.fastq.gz") into fastq_files_trim
file "*_trimming_report.txt" into trimming_files_report
script:
"""
UrQt --t 20 --m ${task.cpus} --gz \
--in ${reads[0]} --inpair ${reads[1]} \
--out ${pair_id}_trim_R1.fastq.gz --outpair ${pair_id}_trim_R2.fastq.gz \
> ${pair_id}_trimming_report.txt
"""
}
// Drop fasta records shorter than params.seq_length using bioawk, writing
// the survivors to ${fasta_id}_filtered.fasta.
// NOTE(review): the output tuple re-labels the genome as
// "${fasta_id}_filtered" (fasta_idf), so every downstream channel carries
// the suffixed id -- confirm this is intentional (bwa index -p uses it as
// the index prefix).
process filter_fasta {
tag "$fasta_id"
cpus 4
publishDir "results/fasta/", mode: 'copy'
input:
set fasta_id, file(fasta) from fasta_file
output:
set fasta_idf, "*_filtered.fasta" into filter_fasta_files
script:
fasta_idf = "${fasta_id}_filtered"
"""
bioawk -c fastx '{ if(length(\$seq) > $params.seq_length) { print ">"\$name; print \$seq }}' ${fasta} > \
${fasta_id}_filtered.fasta
"""
}
// Fan the filtered genome out to its four downstream consumers
// (bwa indexing, indel realignment, recalibration, haplotype calling).
filter_fasta_files.into{
filtered_fasta_files;
indel_fasta_file;
recalibration_fasta_file;
haplotypecaller_fasta_file
}
// Build the BWA index for the filtered genome (prefix = fasta_id) and
// capture bwa's stderr as the report.
// FIX: the 'input:'/'output:'/'script:' section keywords, the script
// heredoc quotes and the closing brace were missing -- restored around the
// surviving directive/IO/command lines.
// NOTE(review): the output glob is "${fasta.baseName}.*" while bwa writes
// files prefixed with ${fasta_id}; these only match if the two strings are
// equal -- confirm.
process index_fasta {
tag "$fasta_id"
publishDir "results/mapping/index/", mode: 'copy'
input:
set fasta_id, file(fasta) from filtered_fasta_files
output:
set fasta_id, "${fasta.baseName}.*" into index_files
file "*_bwa_report.txt" into index_files_report
script:
"""
bwa index -p ${fasta_id} ${fasta} \
&> ${fasta.baseName}_bwa_report.txt
"""
}
// Partition the trimmed pairs into normal / tumor streams by sample id,
// prepend the sample-group label to each tuple, and re-merge them into a
// single channel for mapping.
fastq_files_trim.into {
fastq_files_trim_norm;
fastq_files_trim_tumor
}
collect_fastq_files_trim_norm = fastq_files_trim_norm
.filter{ normal_sample.contains(it[0]) }
.map { it -> ["normal_sample", it[0], it[1]]}
collect_fastq_files_trim_tumor = fastq_files_trim_tumor
.filter{ tumor_sample.contains(it[0]) }
.map { it -> ["tumor_sample", it[0], it[1]]}
// FIX: was 'Channel.create().mix(norm, tumor)'. The queue channel made by
// Channel.create() is never bound or closed, so the mixed channel could
// never reach completion; mix the two real streams directly instead.
collect_fastq_files_trim = collect_fastq_files_trim_norm
.mix(collect_fastq_files_trim_tumor)
// Map each trimmed pair with bwa mem (read group = sample group), mark
// duplicates/mate tags with samblaster, and convert to BAM via sambamba.
// bwa's stderr is kept as the per-pair report.
// FIX: the 'input:' section keyword was missing before the two
// 'set ... from' declarations -- restored.
// index_files.collect() yields a value channel, so the index is reused
// across all mapping tasks.
process mapping_fastq {
tag "$pair_id"
cpus 6
publishDir "results/mapping/bam/", mode: 'copy'
input:
set sample_name, pair_id, file(reads) from collect_fastq_files_trim
set index_id, file(index) from index_files.collect()
output:
set pair_id, "${pair_id}.bam" into bam_files
file "${pair_id}_bwa_report.txt" into mapping_repport_files
script:
"""
bwa mem -t ${task.cpus} -M \
-R '@RG\\tID:${sample_name}\\tSM:${sample_name}\\tPL:Illumina' \
${index_id} ${reads[0]} ${reads[1]} | \
samblaster --addMateTags -M -i /dev/stdin | \
sambamba view -t ${task.cpus} --valid -S -f bam -l 0 /dev/stdin -o ${pair_id}.bam \
2> ${pair_id}_bwa_report.txt
"""
}
// Coordinate-sort each BAM with sambamba, emitting *_sorted.bam.
// FIX: the 'output:'/'script:' section keywords, the script heredoc and
// the closing brace were missing -- restored. A 'cpus' directive is added
// so the ${task.cpus} already used by the command gets more than the
// default single core, matching the sibling processes.
process sort_bam {
tag "$file_id"
cpus 4
input:
set file_id, file(bam) from bam_files
output:
set file_id, "*_sorted.bam" into sorted_bam_files
script:
"""
sambamba sort -t ${task.cpus} --tmpdir=./tmp -o ${file_id}_sorted.bam ${bam}
"""
}
// Split the sorted BAMs into normal / tumor streams, gather each group's
// files into one list (buffer size = number of samples in the group),
// label it, and re-merge for the merge_bam step.
// FIX: the .into{} target list and closing brace were missing -- restored
// with the two channel names consumed just below.
sorted_bam_files.into {
sorted_bam_file_norm;
sorted_bam_file_tumor
}
collect_sorted_bam_file_norm = sorted_bam_file_norm
.filter{ normal_sample.contains(it[0]) }
.map { it -> it[1]}
.buffer( size: normal_sample.size())
.map { it -> ["normal_sample", it]}
collect_sorted_bam_file_tumor = sorted_bam_file_tumor
.filter{ tumor_sample.contains(it[0]) }
.map { it -> it[1]}
.buffer( size: tumor_sample.size())
.map { it -> ["tumor_sample", it]}
// FIX: mix the two streams directly; the previous Channel.create().mix(...)
// included a never-closed queue channel, so the merged channel could never
// complete.
collect_sorted_bam_file = collect_sorted_bam_file_norm
.mix(collect_sorted_bam_file_tumor)
// Merge each group's sorted BAMs into a single per-group BAM named after
// the group label ("normal_sample" / "tumor_sample").
process merge_bam {
tag "$file_id"
cpus 4
input:
set file_id, file(bam) from collect_sorted_bam_file
output:
set file_id, "*.bam" into merged_bam_files
script:
"""
sambamba merge -t ${task.cpus} ${file_id}.bam ${bam}
"""
}
// Fan the merged per-group BAMs out to indexing and to the normal/tumor
// inputs of the variant-calling stage.
merged_bam_files.into{
index_merged_bam_files;
haplo_bam_files_norm;
haplo_bam_files_tumor
}
// Index each merged BAM with sambamba; the output tuple carries the BAM
// plus its .bai (glob "*.bam*").
// FIX: the 'input:'/'output:'/'script:' section keywords, the script
// heredoc and the closing brace were missing -- restored. A 'cpus'
// directive is added for the ${task.cpus} already used by the command.
process index_bam {
tag "$file_id"
publishDir "results/mapping/bam/", mode: 'copy'
cpus 4
input:
set file_id, file(bam) from index_merged_bam_files
output:
set file_id, "*.bam*" into index_bam_files
script:
"""
sambamba index -t ${task.cpus} ${bam}
"""
}
// Duplicate the indexed-BAM channel for its two consumers.
index_bam_files.into{
named_index_bam_files;
indexed_bam_files
}
// Split the filtered genome for dictionary creation, faidx indexing and
// direct use by the callers.
// FIX: added 'final_fasta_file' as a third target -- it is consumed
// further down (final_fasta_file.into{...}) but was never created.
haplotypecaller_fasta_file.into{
index2_fasta_file;
index3_fasta_file;
final_fasta_file
}
// Create the GATK sequence dictionary (*.dict) for the genome; tool
// output is redirected to gatk_output.txt (not published as a channel).
process index2_fasta {
tag "$genome_id"
publishDir "results/fasta/", mode: 'copy'
input:
set genome_id, file(fasta) from index2_fasta_file
output:
set genome_id, "*.dict" into indexed2_fasta_file
script:
"""
gatk CreateSequenceDictionary -R ${fasta} &> gatk_output.txt
"""
}
// Build the samtools faidx index (*.fai) for the genome.
process index3_fasta {
tag "$genome_id"
publishDir "results/fasta/", mode: 'copy'
input:
set genome_id, file(fasta) from index3_fasta_file
output:
set genome_id, "*.fai" into indexed3_fasta_file
script:
"""
samtools faidx ${fasta}
"""
}
// Rename the per-group BAM and indexed-BAM channels before the final
// fan-out to the variant-calling processes.
final_bam_files_norm = haplo_bam_files_norm
final_bam_files_tumor = haplo_bam_files_tumor
indexed_bam_files.into {
index_bam_files_norm;
index_bam_files_tumor
}
final_indexed_bam_files_norm = index_bam_files_norm
final_indexed_bam_files_tumor = index_bam_files_tumor
// (web-capture artifact removed: bare line numbers 283-311 from the
// original source view; no code content.)
// Final fan-out: route the normal BAM to Mutect2 only, and the tumor BAM
// (plus its index) to Mutect2, artifact metrics and pileup summaries.
final_bam_files_norm.set{
haplotypecaller_bam_files_norm
}
final_bam_files_tumor.into{
haplotypecaller_bam_files_tumor;
artifact_bam_files_tumor;
pileup_bam_files_tumor
}
final_indexed_bam_files_norm.set{
haplo_index_bam_files_norm
}
final_indexed_bam_files_tumor.into{
haplo_index_bam_files_tumor;
artifact_index_bam_files_tumor;
pileup_index_bam_files_tumor
}
// NOTE(review): 'final_fasta_file' is consumed here but is not created
// anywhere visible in this file -- confirm it should come from the
// haplotypecaller_fasta_file split above.
final_fasta_file.into{
haplo_fasta_file
artifact_fasta_file
}
indexed2_fasta_file.into{
haplo_indexed2_fasta_file
artifact_indexed2_fasta_file
}
indexed3_fasta_file.into{
haplo_indexed3_fasta_file;
artifact_indexed3_fasta_file
}
// Somatic variant calling: GATK Mutect2 on the tumor BAM against the
// matched normal, emitting the raw VCF (+index), the realigned bamout,
// and the tool's stderr as the report.
// FIX: the 'process X {' header, the 'input:'/'output:'/'script:' section
// keywords, the script heredoc and the closing brace were all missing --
// reconstructed around the surviving directive/IO/command lines. Process
// name chosen from the tool invoked; confirm against the original source.
process mutect2 {
tag "$file_id_norm"
publishDir "results/SNP/vcf/", mode: 'copy'
input:
set file_id_norm, file(bam_norm) from haplotypecaller_bam_files_norm
set file_ididx_norm, file(bamidx_norm) from haplo_index_bam_files_norm
set file_id_tumor, file(bam_tumor) from haplotypecaller_bam_files_tumor
set file_ididx_tumor, file(bamidx_tumor) from haplo_index_bam_files_tumor
set genome_id, file(fasta) from haplo_fasta_file
set genome2_idx, file(fasta2idx) from haplo_indexed2_fasta_file
set genome3_idx, file(fasta3idx) from haplo_indexed3_fasta_file
output:
set file_id_norm, "*.vcf" into vcf_files
set file_id_norm, "*.vcf.idx" into index_vcf_files
set file_id_norm, "*.bam" into realigned_bams_files
file "*_mutect2_report.txt" into mutect2_report
script:
"""
gatk --java-options "-Xmx32G" Mutect2 --native-pair-hmm-threads ${task.cpus} -R ${fasta} \
-I ${bam_tumor} -tumor ${file_id_tumor} \
-I ${bam_norm} -normal ${file_id_norm} \
-O ${file_id_norm}_raw_calls.g.vcf \
-bamout ${file_id_norm}_realigned.bam 2> ${file_id_norm}_mutect2_report.txt
"""
}
// Duplicate the raw VCF (and its index) for the pileup-summary and
// filtering consumers.
vcf_files.into{
pileup_vcf_files;
filter_vcf_files
}
index_vcf_files.into{
pileup_index_vcf_files;
filter_index_vcf_files
}
/*
process GetPileupSummaries {
tag "$file_id_norm"
cpus 1
publishDir "results/SNP/vcf/", mode: 'copy'
input:
set file_id_norm, file(vcf) from pileup_vcf_files
set fileidx_id_norm, file(vcfidx) from pileup_index_vcf_files
set file_id_tumor, file(bam_tumor) from pileup_bam_files_tumor
output:
set file_id_tumor, "*.table" into pileup_files
file "*_pileup_report.txt" into pileup_report
script:
"""
gatk --java-options "-Xmx32G" GetPileupSummaries \
-I ${bam_tumor} \
-V ${vcf} \
-O ${file_id_tumor}_pileup.table \
2> ${file_id_tumor}_pileup_report.txt
"""
}
(web-capture artifact removed: bare line numbers 376-411 from the
original source view; no content. This span lies inside the commented-out
GetPileupSummaries/CalculateContamination region.)
process CalculateContamination {
tag "$file_id_tumor"
cpus 1
publishDir "results/SNP/vcf/", mode: 'copy'
input:
set file_id_tumor, file(pileup_table) from pileup_files
output:
set file_id_tumor, "*.table" into contamination_files
file "*_contamination_report.txt" into contamination_report
script:
"""
gatk --java-options "-Xmx32G" CalculateContamination \
-I ${pileup_table} \
-O ${file_id_tumor}_contamination.table \
2> ${file_id_tumor}_contamination_report.txt
"""
}
*/
// Run GATK (Picard) CollectSequencingArtifactMetrics on the tumor BAM to
// quantify sequencing/FFPE artifacts; metrics files share the
// ${file_id_tumor}_artifact prefix and stderr is kept as the report.
// FIX: the 'script:' keyword and the opening heredoc quotes were missing
// (the closing quotes survived) -- restored.
process CollectSequencingArtifactMetrics {
tag "$file_id_tumor"
cpus 1
publishDir "results/SNP/vcf/", mode: 'copy'
input:
set file_id_tumor, file(bam_tumor) from artifact_bam_files_tumor
set genome_id, file(fasta) from artifact_fasta_file
set genome2_idx, file(fasta2idx) from artifact_indexed2_fasta_file
set genome3_idx, file(fasta3idx) from artifact_indexed3_fasta_file
output:
set file_id_tumor, "*_artifact.*" into artifact_files
file "*_artifact_report.txt" into artifact_report
script:
"""
gatk CollectSequencingArtifactMetrics \
-I ${bam_tumor} \
-O ${file_id_tumor}_artifact \
-R ${fasta} \
2> ${file_id_tumor}_artifact_report.txt
"""
}
// Apply GATK FilterMutectCalls to the raw Mutect2 VCF, producing the
// filtered call set and keeping stderr as the report.
// NOTE(review): newer GATK4 releases require -R for FilterMutectCalls;
// confirm against the GATK version this pipeline targets.
process filter_SNP {
tag "$file_id_norm"
cpus 1
publishDir "results/SNP/vcf/", mode: 'copy'
input:
set file_id_norm, file(vcf) from filter_vcf_files
set fileidx_id_norm, file(vcfid) from filter_index_vcf_files
output:
set file_id_norm, "*.vcf" into vcf_files_filtered
file "*_filter_report.txt" into filter_report
script:
"""
gatk FilterMutectCalls \
-V ${vcf} \
-O ${file_id_norm}_filtered.vcf \
2> ${file_id_norm}_filter_report.txt
"""
}