Newer
Older
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
#!/usr/bin/env nextflow
nextflow.enable.dsl=2
/*
========================================================================================================================
Bolero
========================================================================================================================
bolero pipeline :
* Pipeline dedicated to transcriptomic analysis of Hepatitis B Virus
* Preprocessing, filtration, alignment, quantification.
****************************************************************
Help Message Definition
****************************************************************
*/
/**
 * Print the pipeline usage text through the Nextflow logger.
 *
 * The text shows the typical launch command and lists the profile option, the
 * mandatory 5'RACE inputs, the reference files, the Guppy basecalling options
 * and the help switches. Takes no arguments; its only effect is the log output.
 */
def helpMessage() {
    // stripIndent() drops the leading whitespace shared by all non-blank lines,
    // so the block can be indented with the function body.
    log.info """
    Usage:
    The typical command for running the pipeline is as follows:
      nextflow ./src/bolero.nf -c ./src/nextflow.config -profile singularity
    Nextflow parameters:
      -profile [str] Configuration profile to use.
          Available: docker, singularity, podman, psmn, ccin2p3
    Mandatory arguments:
      --input [path] Path to the folder containing fast5 files.
          If skip basecalling option enabled, path to fastq files folder.
      --adapt [str] Sequence of 5'RACE adapter.
      --gsp [str] Sequence of gene-specific primer used in 5'RACE amplification step.
    References:
      --genome [file] Path to the fasta file containing the genome.
      --gtf [file] Path to the gtf file containing the genome annotation.
    Nanopore basecalling:
      --skipBC [boolean] Skip basecalling step. If true, give fastq folder as input. Default: true.
      --flowcell [str] Nanopore flowcell. Default = FLO-MIN106.
      --kit [str] Nanopore kit. Default = SQK-PBK004.
      --gpu_mode [boolean] Guppy basecaller configuration. Default: false.
          "gpu" mode is dedicated to NVIDIA Cuda compatible system according to Guppy specifications.
    GPU basecalling parameters:
      --min_qscore [float] Minimum quality score threshold, default = 7.0.
      --gpu_runners_per_device [int] Number of runners per device, default = 32 (refer to guppy manual).
      --num_callers [int] Number of callers, default = 16 (refer to guppy manual).
      --chunks_per_runner [int] Number of chunks per runner, default = 512 (refer to guppy manual).
      --chunk_size [int] Chunk size, default = 1900 (refer to guppy manual).
    Help:
      --help, --h Display this help message.
    """.stripIndent()
}
// Declare both help switches (empty by default); when either one is set,
// print the usage text and stop the run with exit status 0.
params.help = ""
params.h = ""
def helpRequested = params.help || params.h
if (helpRequested) {
    helpMessage()
    exit(0)
}
/*
****************************************************************
Default Parameters
****************************************************************
*/
/* Input parameters: default values */
// Run mode: basecalling is skipped by default and Guppy GPU mode is off.
params.gpu_mode = false
params.skipBC = true
// 5'RACE sequences (adapter and gene-specific primer).
// Alternative, longer adapter kept for reference: "CGACTGGAGCACGAGGACACTGACATGGACTGAAGGAGTAGAAA"
params.adapt = 'CGACTGGAGCACGAGGACACTGA'
params.gsp = 'TTAGGCAGAGGTGAAAAAGTTG'
// Reference files.
params.genome = './data/202201_Full-length_HBV_GTFv3/preCore_XGR.fasta'
params.gtf = './data/202201_Full-length_HBV_GTFv3/20220112_GTF_preCore_FL_HBV_XGR.gtf'
params.transcriptome = './data/202201_Full-length_HBV_GTFv3/20220112_preCore_FL_HBV_XGR_transcripts.fasta'
// Nanopore flowcell and kit identifiers.
params.flowcell = 'FLO-MIN106'
params.kit = 'SQK-PBK004'
// Guppy basecalling settings.
params.min_qscore = 7.0
params.num_callers = 16
params.gpu_runners_per_device = 32
params.chunks_per_runner = 512
params.chunk_size = 1900
/* Output parameters: one sub-folder name per step */
params.basecalling_out = '01_Basecalling/'
params.barcoding_out = '02_barcoding/'
// fastq_out and seqkit_grep_out share the same folder name.
params.fastq_out = '03_fastq/'
params.seqkit_grep_out = '03_fastq/'
params.cutadapt_out = '04_cutadapt/'
params.minimap2_genome_out = '05_minimap2/'
params.start_position_counts_out = '06_start_positions/'
params.pycoQC_out = 'pycoQC/'
/*
****************************************************************
Logs
****************************************************************
*/
// Echo the main run settings to the log.
log.info("fast5/q folder : ${params.input}")
log.info("5'RACE adapter sequence : ${params.adapt}")
// The Guppy GPU setting is only reported when basecalling is not skipped.
if (!params.skipBC) {
    log.info("Guppy basecalling calculation using GPU mode : ${params.gpu_mode}.")
}
log.info("Genome file : ${params.genome}")
log.info("Genome annotation file : ${params.gtf}")
/*
****************************************************************
Channel definitions
****************************************************************
*/
// Validate the mandatory values explicitly: Channel.of() emits whatever value
// it is handed, so the `.ifEmpty` fallbacks below cannot, on their own, detect
// an unset or blank parameter.
if (!params.input) {
    error "No fast5/q folder defined."
}
if (!params.adapt) {
    error "No adapter sequence defined."
}

// Channel carrying the value of --input (the fast5/fastq folder path).
Channel
    .of( params.input )
    .ifEmpty { error "No fast5/q folder defined." }
    .set { input }

// Channel carrying the 5'RACE adapter sequence.
Channel
    .of( params.adapt )
    .ifEmpty { error "No adapter sequence defined." }
    .set { adapt }

// NOTE(review): fromPath() only verifies that the file exists when
// `checkIfExists: true` is passed; as written, `.ifEmpty` fires only when the
// path expression matches nothing. Consider enabling the check once these
// reference files are required by every run.
Channel
    .fromPath( params.genome )
    .ifEmpty { error "No genome defined, a fasta file containing the full length preC RNA from HBV genome." }
    .set { genome }

Channel
    .fromPath( params.gtf )
    .ifEmpty { error "No annotation defined, a gtf file describing transcripts and splice variants." }
    .set { gtf }
// .map( it -> [it.baseName, it])
/*
****************************************************************
Imports
****************************************************************
*/
if(!params.skipBC) {
/* Hardware configuration, if Nvidia CUDA compatible graphic card is installed, use guppy-gpu, else guppy-cpu (much slower)*/
if(params.gpu_mode) {
include { basecall_fast5_gpu } from "./nf_modules/ont-guppy/main.nf"
}
else {
include { basecall_fast5_cpu } from "./nf_modules/ont-guppy/main.nf"
}
}
// TODO: replace concatenate with the seqkit function to allow parallelization:
// include { concatenate } from "./nf_modules/seqkit/main.nf"
include { concatenate } from "./nf_modules/concatenate/main.nf"
include { cut_5pRACE } from "./nf_modules/cutadapt/main.nf"
include { hbv_genome } from "./nf_modules/minimap2/main.nf"
include { seqkit_grep } from "./nf_modules/seqkit/main.nf"
include { sort_bam as sort_bam_genome } from './nf_modules/samtools/main.nf' addParams(sort_bam_out: params.minimap2_genome_out)
include { index_bam as index_bam_genome } from './nf_modules/samtools/main.nf' addParams(index_bam_out: params.minimap2_genome_out)
include { start_position_counts } from "./nf_modules/samtools/main.nf"
// Creation of the NanoSplicer functions:
// include { jwr_check } from "./nf_modules/nanosplicer/main.nf"
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
/*
****************************************************************
Workflow
****************************************************************
*/
workflow {
//######################## BASECALLING ########################
if(params.skipBC) {
concatenate(params.input)
// Replace by seqkit scat to parallelization
}
else {
if(params.gpu_mode) {
basecall_fast5_gpu(input)
concatenate(basecall_fast5_gpu.out.pass)
// Replace by seqkit scat to parallelization
}
else {
basecall_fast5_cpu(input)
concatenate(basecall_fast5_cpu.out.pass)
// Replace by seqkit scat to parallelization
}
}
//####################### PREPROCESSING #######################
/*
seqkit_grep(concatenate.out.merged_fastq, params.adapt, params.gsp)
cut_5pRACE(seqkit_grep.out.filtered_fastq, params.adapt)
//########################## MAPPING ##########################
hbv_genome(cut_5pRACE.out.fastq_cutadapt, genome)
sort_bam_genome(hbv_genome.out.bam)
index_bam_genome(sort_bam_genome.out.sorted_bam.collect())
//###################### QUANTIFICATION #######################
start_position_counts(sort_bam_genome.out.sorted_bam)