Newer
Older
// Input globs for the reads and the reference, built from the pipeline
// directory ($baseDir).
params.fastq = "$baseDir/data/*.fastq"
params.fasta = "$baseDir/data/*.fasta"
// Echo the input patterns in the run log.
log.info "fastq files : ${params.fastq}"
log.info "fasta files : ${params.fasta}"
// params.normal / params.tumor are evaluated as Groovy expressions. The results
// are later queried with .contains(<sample id>), so each is presumably a
// collection of sample ids, e.g. '["s1", "s2"]' -- TODO confirm.
// NOTE(review): Eval.me executes whatever Groovy code the parameter contains;
// only run this pipeline with trusted parameter values.
// NOTE(review): no value for params.normal / params.tumor is assigned in the
// visible part of the script -- confirm they are always supplied.
def normal_sample = Eval.me(params.normal)
def tumor_sample = Eval.me(params.tumor)
log.info "normal : ${normal_sample}"
log.info "tumor : ${tumor_sample}"
// Reference fasta files, keyed by the part of the file name before the first
// dot, and duplicated into one channel per consumer.
// The `Channel.fromPath( params.fasta )` head of this chain was missing, which
// left the chain without a receiver; it is restored here to mirror the sam
// channel built from params.sam further down.
Channel
  .fromPath( params.fasta )
  .ifEmpty { error "Cannot find any fasta files matching: ${params.fasta}" }
  .map { it -> [(it.baseName =~ /([^\.]*)/)[0][1], it]}
  .into { fasta_file;
    indel_fasta_file;
    recalibration_fasta_file;
    haplotypecaller_fasta_file
  }
// Paired fastq files matched by params.fastq; the run aborts with an error
// when the pattern matches nothing.
fastq_files = Channel
  .fromFilePairs( params.fastq )
  .ifEmpty { error "Cannot find any fastq files matching: ${params.fastq}" }
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
if (params.sam == "") {
process adaptor_removal {
  // Runs cutadapt on each read pair with the adapter sequences given below
  // and publishes the *_cut_R1/R2 files. The cutadapt report is written in
  // the work directory but is not declared as an output.
  publishDir "results/fastq/adaptor_removal/", mode: 'copy'
  tag "$pair_id"

  input:
    set pair_id, file(reads) from fastq_files

  output:
    set pair_id, "*_cut_R{1,2}.fastq.gz" into fastq_files_cut

  script:
    """
    cutadapt -a AGATCGGAAGAG -g CTCTTCCGATCT -A AGATCGGAAGAG -G CTCTTCCGATCT \
    -o ${pair_id}_cut_R1.fastq.gz -p ${pair_id}_cut_R2.fastq.gz \
    ${reads[0]} ${reads[1]} > ${pair_id}_report.txt
    """
}
process trimming {
  // Runs UrQt on each adapter-trimmed read pair (--t 20, gzip output) and
  // publishes the *_trim_R1/R2 files. The UrQt report stays in the work
  // directory; it is not declared as an output.
  publishDir "results/fastq/trimming/", mode: 'copy'
  cpus 4
  tag "${reads}"

  input:
    set pair_id, file(reads) from fastq_files_cut

  output:
    set pair_id, "*_trim_R{1,2}.fastq.gz" into fastq_files_trim

  script:
    """
    UrQt --t 20 --m ${task.cpus} --gz \
    --in ${reads[0]} --inpair ${reads[1]} \
    --out ${pair_id}_trim_R1.fastq.gz --outpair ${pair_id}_trim_R2.fastq.gz \
    > ${pair_id}_trimming_report.txt
    """
}
process index_fasta {
  // Builds a bwa index for each reference fasta, using the fasta base name
  // as index prefix, and captures bwa's stdout/stderr in a report file.
  publishDir "results/mapping/index/", mode: 'copy'
  cpus 4
  tag "$fasta_id"

  input:
    set fasta_id, file(fasta) from fasta_file

  output:
    set fasta_id, "${fasta.baseName}.*" into index_files
    file "*_bwa_report.txt" into index_files_report

  script:
    """
    bwa index -p ${fasta.baseName} ${fasta} \
    &> ${fasta.baseName}_bwa_report.txt
    """
}
process mapping_fastq {
  // Aligns each trimmed read pair against the bwa index with `bwa mem` and
  // emits the resulting SAM file together with its pair id.
  tag "$reads"
  cpus 4
  publishDir "results/mapping/sam/", mode: 'copy'

  input:
  set pair_id, file(reads) from fastq_files_trim
  // .collect() turns the index channel into a single value reused for every
  // read pair.
  // NOTE(review): this presumably only yields a well-formed [id, files]
  // pair when exactly one fasta was indexed -- confirm.
  // NOTE(review): index_id (name up to the first dot) is used as the bwa
  // prefix, while the index was built with fasta.baseName; these differ for
  // fasta names containing more than one dot -- confirm.
  set index_id, file(index) from index_files.collect()

  output:
  // Emit [pair_id, sam] rather than a bare file: dedup_sam reads
  // `set file_id, file(sam)` from sam_files, and the params.sam branch
  // already emits [id, file] tuples into the same channel.
  set pair_id, "${pair_id}.sam" into sam_files
  file "${pair_id}_bwa_report.txt" into mapping_repport_files

  script:
  """
  bwa mem -t ${task.cpus} \
  ${index_id} ${reads[0]} ${reads[1]} \
  -o ${pair_id}.sam &> ${pair_id}_bwa_report.txt
  """
}
} else {
// Pre-computed SAM files given through params.sam, emitted as [id, file]
// where id is the part of the file name before the first dot.
sam_files = Channel
  .fromPath( params.sam )
  .ifEmpty { error "Cannot find any sam files matching: ${params.sam}" }
  .map { sam -> [(sam.baseName =~ /([^\.]*)/)[0][1], sam]}
}
process dedup_sam {
  // Runs samblaster (with --addMateTags) on each SAM file and emits the
  // resulting <id>_dedup.sam.
  // The input:/output:/script: labels, the script quotes and the closing
  // brace were missing from this definition and are restored here.
  tag "$file_id"
  cpus 4

  input:
  set file_id, file(sam) from sam_files

  output:
  set file_id, "*_dedup.sam*" into dedup_sam_files

  script:
  """
  samblaster --addMateTags -i ${sam} -o ${file_id}_dedup.sam
  """
}
process sam_to_bam {
  // Converts each deduplicated SAM file to BAM with `sambamba view`
  // (compression level 0).
  // The `input:` label was missing and is restored; a tag is added so task
  // names match the sibling processes.
  // NOTE(review): no `cpus` directive is visible here, so ${task.cpus}
  // resolves to the configured default -- confirm.
  tag "$file_id"

  input:
  set file_id, file(sam) from dedup_sam_files

  output:
  set file_id, "*.bam" into dedup_bam_files

  script:
  """
  sambamba view -t ${task.cpus} -S -f bam -l 0 ${sam} -o ${file_id}.bam
  """
}
process sort_bam {
  // Sorts each BAM file with `sambamba sort`, using ./tmp inside the task
  // directory for temporary files, and emits <id>_sorted.bam.
  // The output:/script: labels, the script quotes and the closing brace were
  // missing from this definition and are restored here.
  // NOTE(review): no `cpus` directive is visible here, so ${task.cpus}
  // resolves to the configured default -- confirm.
  tag "$file_id"

  input:
  set file_id, file(bam) from dedup_bam_files

  output:
  set file_id, "*_sorted.bam" into sorted_bam_files

  script:
  """
  sambamba sort -t ${task.cpus} --tmpdir=./tmp -o ${file_id}_sorted.bam ${bam}
  """
}
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
// Split the sorted BAMs into a normal group and a tumor group, each emitted
// as a single [group_name, [bam, ...]] item for merge_bam.
sorted_bam_files.into {
  sorted_bam_files_norm;
  sorted_bam_files_tumor
}

// All BAMs whose id is listed in normal_sample.
def normal_bam_group = sorted_bam_files_norm
  .filter { normal_sample.contains(it[0]) }
  .map { it -> it[1] }
  .collect()
  .map { it -> ["normal_sample", it] }

// All BAMs whose id is listed in tumor_sample.
def tumor_bam_group = sorted_bam_files_tumor
  .filter { tumor_sample.contains(it[0]) }
  .map { it -> it[1] }
  .collect()
  .map { it -> ["tumor_sample", it] }

// The previous code called join() on the two groups and discarded the result.
// join() pairs items by their first element, and "normal_sample" never equals
// "tumor_sample", so the tumor group never reached merge_bam. mix() forwards
// both groups in one channel.
collect_sorted_bam_file = normal_bam_group.mix(tumor_bam_group)
process merge_bam {
  // Merges the BAM files of one group into a single <group>.bam with
  // `sambamba merge`.
  cpus 4
  tag "$file_id"

  input:
    set file_id, file(bam) from collect_sorted_bam_file

  output:
    set file_id, "*.bam" into merged_bam_files

  script:
    """
    sambamba merge -t ${task.cpus} ${file_id}.bam ${bam}
    """
}
process name_bam {
  // Appends an @RG header line (ID/PU/SM = group name) to each merged BAM
  // and publishes the result as <group>_named.bam.
  // NOTE(review): this only edits the header; whether the reads themselves
  // also need RG tags for the downstream caller should be confirmed.
  tag "$file_id"
  cpus 4
  publishDir "results/mapping/bam/", mode: 'copy'

  input:
  set file_id, file(bam) from merged_bam_files

  output:
  set file_id, "*_named.bam" into named_bam_files

  script:
  // `samtools reheader` writes the re-headered BAM to standard output. The
  // previous version copied the input to *_named.bam and let the reheader
  // output go to the task's stdout, so the published file never received
  // the new header. The output is now redirected into the declared file.
  """
  samtools view -H ${bam} > header.sam
  echo "@RG\tID:${file_id}\tLB:library1\tPL:illumina\tPU:${file_id}\tSM:${file_id}" \
  >> header.sam
  samtools reheader header.sam ${bam} > ${file_id}_named.bam
  """
}
// Duplicate the named BAMs: one copy is indexed, the other goes to the
// variant caller.
(index_named_bam_files, haplotypecaller_named_bam_files) = named_bam_files.into(2)
process index_bam {
  // Indexes each named BAM with `sambamba index` and publishes the files
  // matching *.bam* that the task creates.
  // The input:/output:/script: labels, the script quotes and the closing
  // brace were missing from this definition and are restored here.
  // NOTE(review): no `cpus` directive is visible here, so ${task.cpus}
  // resolves to the configured default -- confirm.
  tag "$file_id"
  publishDir "results/mapping/bam/", mode: 'copy'

  input:
  set file_id, file(bam) from index_named_bam_files

  output:
  set file_id, "*.bam*" into indexed_bam_files

  script:
  """
  sambamba index -t ${task.cpus} ${bam}
  """
}
// Triplicate the reference channel: one copy for the variant caller and one
// for each of the two reference-indexing processes.
(haplo_fasta_file, index2_fasta_file, index3_fasta_file) = haplotypecaller_fasta_file.into(3)
process index2_fasta {
  // Runs `gatk CreateSequenceDictionary` on the reference and publishes the
  // resulting *.dict file; gatk's stdout/stderr go to gatk_output.txt.
  publishDir "results/fasta/", mode: 'copy'
  tag "$genome_id"

  input:
    set genome_id, file(fasta) from index2_fasta_file

  output:
    set genome_id, "*.dict" into indexed2_fasta_file

  script:
    """
    gatk CreateSequenceDictionary -R ${fasta} &> gatk_output.txt
    """
}
process index3_fasta {
  // Runs `samtools faidx` on the reference and publishes the resulting
  // *.fai file.
  publishDir "results/fasta/", mode: 'copy'
  tag "$genome_id"

  input:
    set genome_id, file(fasta) from index3_fasta_file

  output:
    set genome_id, "*.fai" into indexed3_fasta_file

  script:
    """
    samtools faidx ${fasta}
    """
}
process HaplotypeCaller {
  // Runs `gatk Mutect2` on the named BAM against the reference, writing the
  // raw calls (<id>_raw_calls.g.vcf) and the realigned BAM.
  // The input:/output:/script: labels, the script quotes and the closing
  // brace were missing from this definition and are restored here; the
  // command itself is unchanged.
  // NOTE(review): each input uses .collect(); if more than one [id, file]
  // pair arrives (e.g. normal + tumor), presumably only the first pair is
  // bound to (file_id, bam) and Mutect2 gets a single -I -- confirm.
  // NOTE(review): -tumor / -normal receive the raw params.tumor /
  // params.normal strings, while name_bam sets SM to the group name
  // (file_id) -- confirm which sample names Mutect2 should be given.
  // NOTE(review): no `cpus` directive is visible here, so ${task.cpus}
  // resolves to the configured default -- confirm.
  tag "$file_id"
  publishDir "results/SNP/vcf/", mode: 'copy'

  input:
  set file_id, file(bam) from haplotypecaller_named_bam_files.collect()
  set file_ididx, file(bamidx) from indexed_bam_files.collect()
  set genome_id, file(fasta) from haplo_fasta_file.collect()
  set genome2_idx, file(fasta2idx) from indexed2_fasta_file.collect()
  set genome3_idx, file(fasta3idx) from indexed3_fasta_file.collect()

  output:
  set file_id, "*.vcf" into vcf_files
  set file_id, "*.bam" into realigned_bams_files

  script:
  """
  gatk Mutect2 --native-pair-hmm-threads ${task.cpus} -R ${fasta} \
  -I ${bam} -tumor ${params.tumor} -normal ${params.normal} \
  -O ${file_id}_raw_calls.g.vcf \
  -bamout ${file_id}_realigned.bam
  """
}
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
/*
process filter_SNP {
tag "$file_id"
cpus 4
publishDir "results/SNP/vcf/", mode: 'copy'
input:
output:
set file_id, "*.vcf" into vcf_files_filtered
script:
"""
gatk --java-options "-Xmx2g" Mutect2 \
-R hg38/Homo_sapiens_assembly38.fasta \
-I tumor.bam \
-I normal.bam \
-tumor HCC1143_tumor \
-normal HCC1143_normal \
-pon resources/chr17_pon.vcf.gz \
--germline-resource resources/chr17_af-only-gnomad_grch38.vcf.gz \
--af-of-alleles-not-in-resource 0.0000025 \
--disable-read-filter MateOnSameContigOrNoMappedMateReadFilter \
-L chr17plus.interval_list \
-O 1_somatic_m2.vcf.gz \
-bamout 2_tumor_normal_m2.bam
gatk Mutect2 \
-R ~/Documents/ref/hg38/Homo_sapiens_assembly38.fasta \
-I HG00190.bam \
-tumor HG00190 \
--disable-read-filter MateOnSameContigOrNoMappedMateReadFilter \
-L chr17plus.interval_list \
-O 3_HG00190.vcf.gz
gatk CreateSomaticPanelOfNormals \
-vcfs 3_HG00190.vcf.gz \
-vcfs 4_NA19771.vcf.gz \
-vcfs 5_HG02759.vcf.gz \
-O 6_threesamplepon.vcf.gz
gatk GetPileupSummaries \
-I tumor.bam \
-V resources/chr17_small_exac_common_3_grch38.vcf.gz \
-O 7_tumor_getpileupsummaries.table
gatk CalculateContamination \
-I 7_tumor_getpileupsummaries.table \
-O 8_tumor_calculatecontamination.table
gatk FilterMutectCalls \
-V somatic_m2.vcf.gz \
--contamination-table tumor_calculatecontamination.table \
-O 9_somatic_oncefiltered.vcf.gz
gatk CollectSequencingArtifactMetrics \
-I tumor.bam \
-O 10_tumor_artifact \
--FILE_EXTENSION ".txt" \
-R ~/Documents/ref/hg38/Homo_sapiens_assembly38.fasta
gatk FilterByOrientationBias \
-A G/T \
-A C/T \
-V 9_somatic_oncefiltered.vcf.gz \
-P tumor_artifact.pre_adapter_detail_metrics.txt \
-O 11_somatic_twicefiltered.vcf.gz
"""
}