#!/usr/bin/env nextflow

nextflow.enable.dsl=2 // syntax extension DSL2

/*
========================================================================================================================
                                                        Bolero
========================================================================================================================
 bolero pipeline :
  * Pipeline dedicated to transcriptomic analysis of Hepatitis B Virus
  * Preprocessing, filtration, alignment, quantification.

 ****************************************************************
                      Help Message Definition
 ****************************************************************
*/

/*
 * Print the command-line usage of the pipeline through the Nextflow logger.
 * Takes no argument and returns nothing useful; it is only called for its
 * logging side effect when --help / --h is given.
 */
def helpMessage() {
    log.info"""
    Usage:
    The typical command for running the pipeline is as follows:

      nextflow ./src/bolero.nf -c ./src/nextflow.config -profile singularity

    Nextflow parameters:
      -profile [str]                  Configuration profile to use.
                                      Available: docker, singularity, podman, psmn, ccin2p3

    Mandatory arguments:
      --input [path]                  Path to the folder containing fast5 files.
                                      If skip basecalling option enabled, path to fastq files folder.
      --adapt [file]                  Path to the txt/fasta file containing the sequence of 5'RACE adapter.
      --gsp [file]                    Path to the txt/fasta file containing the sequence of gene-specific primer used in 5'RACE amplification step.

    References:
      --genome [file]                 Path to the fasta file containing the genome.
      --gtf [file]                    Path to the gtf file containing the genome annotation.

    Nanopore basecalling:
      --skipBC [boolean]              Skip basecalling step. If true, give fastq folder as input. Default: true.
      --flowcell [str]                Nanopore flowcell. Default = FLO-MIN106.
      --kit [str]                     Nanopore kit. Default = SQK-PBK004.
      --gpu_mode [boolean]            Guppy basecaller configuration. Default: false.
                                      "gpu" mode is dedicated to NVIDIA Cuda compatible system according to Guppy specifications.

    Nanopore barcoding:
      --kit_barcoding                 Nanopore barcoding kit.
      --config_file                   Nanopore configuration file.

    GPU basecalling parameters:
      --min_qscore [float]            Minimum quality score threshold, default = 7.0.
      --gpu_runners_per_device [int]  Number of runner per device, default = 32 (refer to guppy manual).
      --num_callers [int]             Number of callers, default = 16 (refer to guppy manual).
      --chunks_per_runner [int]       Number of chunks per runner, default = 512 (refer to guppy manual).
      --chunk_size [int]              Chunck size, default = 1900 (refer to guppy manual).

    Help:
      --help | --h                    Display this help message.

    """.stripIndent()
}

// Show help message and stop before anything else is evaluated.
params.help = ""
params.h = ""

if (params.help || params.h) {
    helpMessage()
    exit 0
}

/*
 ****************************************************************
                      Default Parameters
 ****************************************************************
*/

/* Params in */
// Empty by default so that a missing --input is detected by the explicit check below.
params.input = ""
params.skipBC = true
params.gpu_mode = false
params.adapt = "CGACTGGAGCACGAGGACACTGA" // "CGACTGGAGCACGAGGACACTGACATGGACTGAAGGAGTAGAAA" //
params.gsp = "TTAGGCAGAGGTGAAAAAGTTG"
params.transcriptome = "./data/202201_Full-length_HBV_GTFv3/20220112_preCore_FL_HBV_XGR_transcripts.fasta"
params.genome = "./data/202201_Full-length_HBV_GTFv3/preCore_XGR.fasta"
params.gtf = "./data/202201_Full-length_HBV_GTFv3/20220112_GTF_preCore_FL_HBV_XGR.gtf"
params.flowcell = "FLO-MIN106"
params.kit = "SQK-PBK004"
params.min_qscore = 7.0
params.gpu_runners_per_device = 32
params.num_callers = 16
params.chunks_per_runner = 512
params.chunk_size = 1900
params.config_file = ""
params.kit_barcoding = ""

/* Params out */
params.basecalling_out = "01_basecalling/"
params.barcoding_out = "02_barcoding/"
params.fastq_out = "03_fastq/"
params.seqkit_grep_out = "03_fastq/"
params.porechop_out = "03_fastq/"
params.cutadapt_out = "04_cutadapt/"
params.minimap2_genome_out = "05_minimap2/"
params.filtered_bam_out = "05_minimap2/"
params.start_position_counts_out = "06_start_positions/"
params.nanosplicer_out = "07_nanosplicer/"
params.rna_count_out = "08_RNA_count/"
params.rna_qc_out = "09_quality_control/"

/*
 ****************************************************************
                      Parameter validation
 ****************************************************************
*/

// Channel.of() always emits its argument (even null), so an `ifEmpty` guard on
// the input channel can never fire. Check the mandatory parameter explicitly
// instead, before it is concatenated into the glob pattern used for `barcodes`.
if (!params.input) {
    error "No fast5/q folder defined."
}

/*
 ****************************************************************
                              Logs
 ****************************************************************
*/

log.info "fast5/q folder : ${params.input}"
log.info "5'RACE adapter sequence : ${params.adapt}"
log.info "Gene specific primer : ${params.gsp}"
if (!params.skipBC) log.info "Guppy basecalling calculation using GPU mode : ${params.gpu_mode}."
log.info "Genome file : ${params.genome}"
log.info "Genome annotation file : ${params.gtf}"

/*
 ****************************************************************
                      Channel definitions
 ****************************************************************
*/

// Input folder as a single value (fed to the basecalling processes).
Channel
    .of( params.input )
    .set { input }

Channel
    .fromPath( params.genome )
    .ifEmpty { error "No genome defined, a fasta file containing the full length preC RNA from HBV genome." }
    .set { genome }

// NOTE(review): `gtf` is not consumed by the workflow below — confirm whether it is still needed.
Channel
    .fromPath( params.gtf )
    .ifEmpty { error "No annotation defined, a gtf file describing transcripts and splice variants." }
    .set { gtf }

// One [directory base name, directory path] tuple per directory matching <input>*/ .
// params.input is concatenated as-is: it is expected to end with '/' so that
// the pattern targets its sub-folders.
Channel
    .fromPath( params.input + '*/', type: 'dir' )
    .map { it -> [it.baseName, it] }
    .set { barcodes }

/*
 ****************************************************************
                          Imports
 ****************************************************************
*/

// Guppy processes are included unconditionally and exactly once each:
// including a process does not run it, and including `barcoding_cpu` both in a
// conditional branch and at top level made the CPU basecalling mode include
// the same name twice. The choice between GPU (Nvidia CUDA compatible graphic
// card) and CPU (much slower) flavours is made in the workflow from
// params.gpu_mode.
include { basecall_fast5_gpu } from "./nf_modules/ont-guppy/main.nf"
include { barcoding_gpu } from "./nf_modules/ont-guppy/main.nf"
include { basecall_fast5_cpu } from "./nf_modules/ont-guppy/main.nf"
include { barcoding_cpu } from "./nf_modules/ont-guppy/main.nf"

include { control_basecalling } from "./nf_modules/pycoqc/main.nf"
include { control_bam } from "./nf_modules/pycoqc/main.nf"
include { concatenate } from "./nf_modules/seqkit/main.nf"
include { cut_5pRACE } from "./nf_modules/cutadapt/main.nf"
include { hbv_genome } from "./nf_modules/minimap2/main.nf"
include { seqkit_grep } from "./nf_modules/seqkit/main.nf"
include { sort_bam } from './nf_modules/samtools/main.nf' addParams(sort_bam_out: params.minimap2_genome_out)
include { index_bam } from './nf_modules/samtools/main.nf' addParams(index_bam_out: params.minimap2_genome_out)
include { sort_index_bam } from './nf_modules/samtools/main.nf' addParams(indexed_bam_out: params.minimap2_genome_out)
include { filter_as } from './nf_modules/samtools/main.nf'
include { start_position_counts } from "./nf_modules/samtools/main.nf"
include { start_position_individuals } from "./nf_modules/start_positions/main.nf"
include { jwr_checker } from "./nf_modules/nanosplicer/main.nf"
include { junctions_nanosplicer } from "./nf_modules/junction_nanosplicer/main.nf"
include { rna_count } from "./nf_modules/rna_count/main.nf"
include { porechop } from "./nf_modules/porechop/main.nf"

/*
 ****************************************************************
                          Workflow
 ****************************************************************
*/

/*
 * Main workflow: optional basecalling / barcoding, then preprocessing,
 * mapping, start position identification, splice variant identification
 * and RNA counting. Exactly one `concatenate` call is reached per run.
 */
workflow {

    if (params.skipBC) {
        // We take fastq files as input and skip basecalling.
        concatenate(barcodes)
    }
    else {
        // We take fast5 files as input and proceed to basecalling with guppy.
        if (params.gpu_mode) {
            basecall_fast5_gpu(input)
            if (params.kit_barcoding != "") {
                barcoding_gpu(basecall_fast5_gpu.out.pass)
                // One [name, path] tuple per item emitted by the barcoding step.
                barcoding_gpu.out.barcodes
                    .flatten()
                    .map { it -> [it.name, it] }
                    .set { tuples_barcode }
                concatenate(tuples_barcode)
            }
            else {
                // No barcoding kit: everything is handled under the fixed id "Sample".
                basecall_fast5_gpu.out.pass
                    .map { it -> ["Sample", it] }
                    .set { tuple_sample }
                concatenate(tuple_sample)
            }
        }
        else {
            basecall_fast5_cpu(input)
            if (params.kit_barcoding != "") {
                barcoding_cpu(basecall_fast5_cpu.out.pass)
                // One [name, path] tuple per item emitted by the barcoding step.
                barcoding_cpu.out.barcodes
                    .flatten()
                    .map { it -> [it.name, it] }
                    .set { tuples_barcode }
                concatenate(tuples_barcode)
            }
            else {
                // No barcoding kit: everything is handled under the fixed id "Sample".
                basecall_fast5_cpu.out.pass
                    .map { it -> ["Sample", it] }
                    .set { tuple_sample }
                concatenate(tuple_sample)
            }
        }
    }

    //####################### PREPROCESSING #######################

    // Filtration (seqkit_grep looks for the 5'RACE and the gsp patterns in the
    // reads to keep only mature RNAs)
    seqkit_grep(concatenate.out.merged_fastq, params.adapt, params.gsp)

    // Trimming with porechop
    porechop(seqkit_grep.out.filtered_fastq)

    // Cut of the 5'RACE sequence
    cut_5pRACE(porechop.out.porechoped_fastq, params.adapt)
    //cut_5pRACE(seqkit_grep.out.filtered_fastq, params.adapt)

    //########################## MAPPING ##########################

    // genome.collect() is passed alongside every fastq handed to the mapping step.
    hbv_genome(cut_5pRACE.out.fastq_cutadapt, genome.collect())

    // Filter
    filter_as(hbv_genome.out.bam)

    // Index
    sort_index_bam(filter_as.out.filtered_bam)

    // Quality control: needs the sequencing summary, which is only produced
    // when basecalling was run by this pipeline.
    if (!params.skipBC) {
        if (params.gpu_mode) {
            control_bam(basecall_fast5_gpu.out.sequencing_summary.collect(), sort_index_bam.out.indexed_bam)
        }
        else {
            control_bam(basecall_fast5_cpu.out.sequencing_summary.collect(), sort_index_bam.out.indexed_bam)
        }
    }

    //###################### START POSITIONS #######################

    // Identification of start positions
    start_position_counts(sort_index_bam.out.indexed_bam)

    // Identification of RNA
    start_position_individuals(start_position_counts.out.count)

    //###################### SPLICE VARIANTS ######################

    // Identification of splicing junction sites
    jwr_checker(sort_index_bam.out.indexed_bam)

    // Pair read classification and junction results on their first element.
    start_position_individuals.out.classification_of_reads
        .combine(jwr_checker.out.nanosplicer_jwr, by: 0)
        .set { files_for_nanosplicer }

    // Identification of variants
    junctions_nanosplicer(files_for_nanosplicer)

    //######################### RNA COUNT #########################

    // Pair identified variants and read classification on their first element.
    junctions_nanosplicer.out.identified_SPvariants
        .combine(start_position_individuals.out.classification_of_reads, by: 0)
        .set { files_for_rna_count }

    // Variants count
    rna_count(files_for_rna_count)
}