Newer
Older
/*
========================================================================================================================
Bolero
========================================================================================================================
bolero pipeline :
* Pipeline dedicated to transcriptomic analysis of Hepatitis B Virus
* Preprocessing, filtration, alignment, quantification.
****************************************************************
Help Message Definition
****************************************************************
*/
/**
 * Print the pipeline usage text through the Nextflow logger.
 *
 * Takes no argument. The function only logs; the caller decides whether to
 * stop the run afterwards. The text is written with a common leading indent
 * that stripIndent() removes, so section titles end up at column 0 and the
 * options stay indented below them.
 */
def helpMessage() {
    log.info"""
    Usage:
        The typical command for running the pipeline is as follows:
            nextflow ./src/bolero.nf -c ./src/nextflow.config -profile singularity

    Nextflow parameters:
        -profile [str]                      Configuration profile to use.
                                            Available: docker, singularity, podman, psmn, ccin2p3

    Mandatory arguments:
        --input [path]                      Path to the folder containing fast5 files.
                                            If skip basecalling option enabled, path to fastq files folder.
        --adapt [str]                       Sequence of the 5'RACE adapter. Default = CGACTGGAGCACGAGGACACTGA.
        --gsp [str]                         Sequence of the gene-specific primer used in 5'RACE amplification step.
                                            Default = TTAGGCAGAGGTGAAAAAGTTG.

    References:
        --genome [file]                     Path to the fasta file containing the genome.
        --gtf [file]                        Path to the gtf file containing the genome annotation.

    Nanopore basecalling:
        --skipBC [boolean]                  Skip basecalling step. If true, give fastq folder as input. Default: true.
        --flowcell [str]                    Nanopore flowcell. Default = FLO-MIN106.
        --kit [str]                         Nanopore kit. Default = SQK-PBK004.
        --kit_barcoding [str]               Nanopore barcoding kit. Default = EXP-PBC001.
                                            An empty string skips the barcoding step.
        --gpu_mode [boolean]                Guppy basecaller configuration. Default: false.
                                            "gpu" mode is dedicated to NVIDIA Cuda compatible system according to Guppy specifications.

    GPU basecalling parameters:
        --min_qscore [float]                Minimum quality score threshold, default = 7.0.
        --gpu_runners_per_device [int]      Number of runner per device, default = 32 (refer to guppy manual).
        --num_callers [int]                 Number of callers, default = 16 (refer to guppy manual).
        --chunks_per_runner [int]           Number of chunks per runner, default = 512 (refer to guppy manual).
        --chunk_size [int]                  Chunk size, default = 1900 (refer to guppy manual).

    Help:
        --help | --h                        Display this help message.
    """.stripIndent()
}
// Help handling: both flags default to an empty (falsy) string so that the
// usage text is only printed when --help or --h is given on the command line.
params.help = ""
params.h = ""
def helpRequested = (params.help || params.h)
if (helpRequested) {
    // Print the usage text and stop before any channel or process is set up.
    helpMessage()
    exit 0
}
/*
****************************************************************
Default Parameters
****************************************************************
*/
// Default values for every pipeline parameter set in this script; each one can
// be overridden on the command line (--name value) or in a config file.
// NOTE(review): --input has no default here although it is read below — confirm
// it is always supplied by the caller.
// --- Run mode -----------------------------------------------------------------
// Skip basecalling and start from fastq folders (see the help text).
params.skipBC = true
// Use the GPU flavour of the Guppy processes instead of the CPU one.
params.gpu_mode = false
// --- 5'RACE filtering patterns --------------------------------------------------
params.adapt = "CGACTGGAGCACGAGGACACTGA" // NOTE(review): presumably a longer alternative adapter kept for reference: "CGACTGGAGCACGAGGACACTGACATGGACTGAAGGAGTAGAAA"
params.gsp = "TTAGGCAGAGGTGAAAAAGTTG"
// --- Reference files -------------------------------------------------------------
// NOTE(review): params.transcriptome is not referenced in the visible part of this script.
params.transcriptome = "./data/202201_Full-length_HBV_GTFv3/20220112_preCore_FL_HBV_XGR_transcripts.fasta"
params.genome = "./data/202201_Full-length_HBV_GTFv3/preCore_XGR.fasta"
params.gtf = "./data/202201_Full-length_HBV_GTFv3/20220112_GTF_preCore_FL_HBV_XGR.gtf"
// --- Basecalling settings (documented in the help text) -----------------------
params.flowcell = "FLO-MIN106"
params.kit = "SQK-PBK004"
params.min_qscore = 7.0
params.gpu_runners_per_device = 32
params.num_callers = 16
params.chunks_per_runner = 512
params.chunk_size = 1900
// The workflow only runs the barcoding step when this value is not an empty string.
params.kit_barcoding = "EXP-PBC001"
// --- Output sub-folders --------------------------------------------------------
// Numbered folder names, one per step. Only minimap2_genome_out is referenced in
// the visible part of this script (passed to the samtools includes via addParams);
// the others are presumably read inside the modules — TODO confirm.
params.basecalling_out = "01_basecalling/"
params.barcoding_out = "02_barcoding/"
// fastq_out and seqkit_grep_out point to the same folder.
params.fastq_out = "03_fastq/"
params.seqkit_grep_out = "03_fastq/"
params.cutadapt_out = "04_cutadapt/"
params.minimap2_genome_out = "05_minimap2/"
params.start_position_counts_out = "06_start_positions/"
params.nanosplicer_out = "07_nanosplicer/"
params.rna_count_out = "08_RNA_count/"
/*
****************************************************************
Logs
****************************************************************
*/
// --input is mandatory and has no default value: stop here with an explicit
// message instead of logging a null folder and building channel paths from it.
if (!params.input) {
    error "No fast5/q folder defined."
}
// Summary of the main settings, printed once at start-up.
log.info "fast5/q folder : ${params.input}"
log.info "5'RACE adapter sequence : ${params.adapt}"
// The basecalling mode is only relevant when basecalling is actually run.
if (!params.skipBC) {
    log.info "Guppy basecalling calculation using GPU mode : ${params.gpu_mode}."
}
log.info "Genome file : ${params.genome}"
log.info "Genome annotation file : ${params.gtf}"
/*
****************************************************************
Channel definitions
****************************************************************
*/
// Channel `input`: built from the raw --input value with Channel.of().
// NOTE(review): it is unclear from here whether ifEmpty can ever fire when
// --input is missing, since the parameter value itself is handed to Channel.of() — confirm.
Channel
.of( params.input )
.ifEmpty { error "No fast5/q folder defined." }
.set { input }
// NOTE(review): the lines from here down to the comment terminator look like the
// tail of a commented-out adapter/gsp channel block whose opening marker and
// first lines are not visible in this view. Confirm against the full file
// before relying on `adapt` or `gsp` channels.
.ifEmpty { error "No adapter sequence defined." }
.set { adapt }
Channel
.fromPath( params.gsp )
.ifEmpty { error "No adapter sequence defined." }
.set { gsp }
*/
// Reference genome (fasta): the run is aborted when the path matches no file.
Channel.fromPath(params.genome)
    .ifEmpty { error "No genome defined, a fasta file containing the full length preC RNA from HBV genome." }
    .set { genome }

// Genome annotation (gtf): same guard as for the genome.
Channel.fromPath(params.gtf)
    .ifEmpty { error "No annotation defined, a gtf file describing transcripts and splice variants." }
    .set { gtf }
// Channel `barcodes`: one [name, directory] tuple per directory matched by the
// glob built from --input followed by "*/"; the name is the directory baseName.
// NOTE(review): params.input is concatenated as-is, so whether it ends with "/"
// changes what the glob matches — confirm the expected form of --input.
Channel
.fromPath(params.input+'*/', type: 'dir')
.map(it -> [it.baseName, it])
.set{barcodes}
// NOTE(review): the next line has no visible `Channel` receiver in this view; it
// looks like a leftover of a removed or commented-out channel using a hard-coded
// path — confirm against the full file.
.fromPath('/home/alia/pipelines/bolero/results/01_Basecalling/pass/', type: 'dir')
// Sequencing summary file exposed as channel `ss`. The location used to be a
// hard-coded absolute path; it is now a parameter (same default value) so that
// it can be overridden with --sequencing_summary on other machines.
// In the visible part of this script `ss` is only mentioned by the
// commented-out control_bam call.
params.sequencing_summary = '/home/alia/pipelines/bolero/results/01_Basecalling/sequencing_summary.txt'
Channel
    .fromPath(params.sequencing_summary)
    .set { ss }
/*
****************************************************************
Imports
****************************************************************
*/
if(!params.skipBC) {
/* Hardware configuration, if Nvidia CUDA compatible graphic card is installed, use guppy-gpu, else guppy-cpu (much slower)*/
if(params.gpu_mode) {
include { basecall_fast5_gpu } from "./nf_modules/ont-guppy/main.nf"
include { barcoding_gpu } from "./nf_modules/ont-guppy/main.nf"
}
else {
include { basecall_fast5_cpu } from "./nf_modules/ont-guppy/main.nf"
include { barcoding_cpu } from "./nf_modules/ont-guppy/main.nf"
include { control_basecalling } from "./nf_modules/pycoqc/main.nf"
include { control_bam } from "./nf_modules/pycoqc/main.nf"
include { concatenate } from "./nf_modules/seqkit/main.nf"
include { cut_5pRACE } from "./nf_modules/cutadapt/main.nf"
include { hbv_genome } from "./nf_modules/minimap2/main.nf"
include { seqkit_grep } from "./nf_modules/seqkit/main.nf"
include { sort_bam } from './nf_modules/samtools/main.nf' addParams(sort_bam_out: params.minimap2_genome_out)
include { index_bam } from './nf_modules/samtools/main.nf' addParams(index_bam_out: params.minimap2_genome_out)
include { sort_index_bam } from './nf_modules/samtools/main.nf' addParams(indexed_bam_out: params.minimap2_genome_out)
include { start_position_counts } from "./nf_modules/samtools/main.nf"
include { start_position_individuals } from "./nf_modules/start_positions/main.nf"
include { jwr_checker } from "./nf_modules/nanosplicer/main.nf"
include { junctions_nanosplicer } from "./nf_modules/junction_nanosplicer/main.nf"
include { rna_count } from "./nf_modules/rna_count/main.nf"
/*
****************************************************************
Workflow
****************************************************************
*/
workflow {
//######################## BASECALLING ########################
if(params.skipBC) { // we take fastq files as input and skip basecalling
concatenate(barcodes)
// TODO: this part still needs to be adapted
else { // we take fast5 files as input and proceed to basecalling with guppy
basecall_fast5_gpu(input)
if(params.kit_barcoding != ""){
barcoding_gpu(basecall_fast5_gpu.out.pass)
barcoding_gpu.out.barcodes
.flatten()
.map{it -> [it.name, it]}
.set{tuples_barcode}
concatenate(tuples_barcode)
}
else{
concatenate(basecall_fast5_gpu.out.pass)
control_basecalling(basecall_fast5_gpu.out.sequencing_summary)
basecall_fast5_cpu(input)
if(params.kit_barcoding != ""){
barcoding_cpu(pass)
barcoding_cpu.out.barcodes
.flatten()
.map{it -> [it.name, it]}
.set{tuples_barcode}
concatenate(tuples_barcode)
}
else{
concatenate(pass)
}
//control_basecalling(basecall_fast5_cpu.out.sequencing_summary)
// Filtering: seqkit_grep looks for the 5'RACE adapter and the gsp patterns in the reads to keep only mature RNAs
seqkit_grep(concatenate.out.merged_fastq, params.adapt, params.gsp)
//########################## MAPPING ##########################
hbv_genome(cut_5pRACE.out.fastq_cutadapt, genome.collect())
sort_index_bam.out.indexed_bam
.flatten()
.filter(~/.*bam$/)
.collect()
.set{bam_path}
//control_bam(ss, bam_path)
//###################### START POSITIONS #######################
start_position_counts(sort_index_bam.out.indexed_bam)
start_position_individuals(start_position_counts.out.count)
//###################### SPLICE VARIANTS ######################
jwr_checker(sort_index_bam.out.indexed_bam)
start_position_individuals.out.classification_of_reads
.combine(jwr_checker.out.nanosplicer_jwr, by: 0)
.set{files_for_nanosplicer}
junctions_nanosplicer(files_for_nanosplicer)
//######################### RNA COUNT #########################
junctions_nanosplicer.out.identified_SPvariants
.combine(start_position_individuals.out.classification_of_reads, by: 0)
.set{files_for_rna_count}
rna_count(files_for_rna_count)