bolero.nf

#!/usr/bin/env nextflow

nextflow.enable.dsl=2

/*
========================================================================================================================
                                                      Bolero
========================================================================================================================

bolero pipeline :
 * Pipeline dedicated to transcriptomic analysis of Hepatitis B Virus 
 * Preprocessing, filtration, alignment, quantification.

 ****************************************************************
                      Help Message Definition
 ****************************************************************
*/

// Print the pipeline usage text through the Nextflow logger.
// Takes no argument; the text lists the command line options read from `params`
// in this script. Called when `--help` or `--h` is given.
def helpMessage() {
    log.info"""
    Usage:
    The typical command for running the pipeline is as follows:

      nextflow ./src/bolero.nf -c ./src/nextflow.config -profile singularity

    Nextflow parameters:
      -profile [str]                  Configuration profile to use.
                                      Available: docker, singularity, podman, psmn, ccin2p3

    Mandatory arguments:
      --input [path]                  Path to the folder containing fast5 files. 
                                      If skip basecalling option enabled, path to fastq files folder.
      --adapt [str]                   Sequence of 5'RACE adapter.
      --gsp [str]                     Sequence of gene-specific primer used in 5'RACE amplification step.
      
    References:
      --genome [file]                 Path to the fasta file containing the genome.
      --gtf [file]                    Path to the gtf file containing the genome annotation.

    Nanopore basecalling:
      --skipBC [boolean]              Skip basecalling step. If true, give fastq folder as input. Default: true.
      --flowcell [str]                Nanopore flowcell. Default = FLO-MIN106.
      --kit [str]                     Nanopore kit. Default = SQK-PBK004.
      --gpu_mode [boolean]            Guppy basecaller configuration. Default: false.
                                      "gpu" mode is dedicated to NVIDIA Cuda compatible system according to Guppy specifications.
    
    GPU basecalling parameters:
      --min_qscore [float]            Minimum quality score threshold, default = 7.0.
      --gpu_runners_per_device [int]  Number of runners per device, default = 32 (refer to guppy manual).
      --num_callers [int]             Number of callers, default = 16 (refer to guppy manual).
      --chunks_per_runner [int]       Number of chunks per runner, default = 512 (refer to guppy manual).
      --chunk_size [int]              Chunk size, default = 1900 (refer to guppy manual).

    Help:
      --help, --h                     Display this help message.
    
    """.stripIndent()
}

// Show help message

// Both flags default to an empty (falsy) string, so reading them below is
// always safe even when neither was given on the command line.
params.help = ""
params.h = ""

// Either spelling of the flag prints the usage text and ends the run with status 0.
def wants_help = params.help || params.h
if (wants_help) {
    helpMessage()
    exit 0
}

/*
 ****************************************************************
                      Default Parameters
 ****************************************************************
*/

/* params in */

// Mandatory, no usable default. Declared (empty) so that reading `params.input`
// without `--input` on the command line does not hit an undefined parameter.
params.input = ""

// Basecalling switches
params.skipBC = true
params.gpu_mode = false

// 5'RACE adapter and gene-specific primer sequences
// (longer alternative adapter kept for reference: CGACTGGAGCACGAGGACACTGACATGGACTGAAGGAGTAGAAA)
params.adapt = "CGACTGGAGCACGAGGACACTGA"
params.gsp = "TTAGGCAGAGGTGAAAAAGTTG"

// Reference files
params.transcriptome = "./data/202201_Full-length_HBV_GTFv3/20220112_preCore_FL_HBV_XGR_transcripts.fasta"
params.genome = "./data/202201_Full-length_HBV_GTFv3/preCore_XGR.fasta"
params.gtf = "./data/202201_Full-length_HBV_GTFv3/20220112_GTF_preCore_FL_HBV_XGR.gtf"

// Basecaller settings (see the help message for their meaning)
params.flowcell = "FLO-MIN106"
params.kit = "SQK-PBK004"
params.min_qscore = 7.0
params.gpu_runners_per_device = 32
params.num_callers = 16
params.chunks_per_runner = 512
params.chunk_size = 1900

/* Params out */

// Output sub-folder names, one per pipeline step.
// NOTE(review): `fastq_out` and `seqkit_grep_out` share the same folder — confirm this is intended.
params.basecalling_out = "01_Basecalling/"
params.barcoding_out = "02_barcoding/"
params.fastq_out = "03_fastq/"
params.seqkit_grep_out = "03_fastq/"
params.cutadapt_out = "04_cutadapt/"
params.minimap2_genome_out = "05_minimap2/"
params.start_position_counts_out = "06_start_positions/"
params.pycoQC_out = "pycoQC/"

/*
 ****************************************************************
                              Logs
 ****************************************************************
*/

// Report the main run settings before any process is launched.
log.info("fast5/q folder : ${params.input}")
log.info("5'RACE adapter sequence : ${params.adapt}")
// The GPU/CPU mode only matters when basecalling is actually performed.
if (!params.skipBC) {
    log.info("Guppy basecalling calculation using GPU mode : ${params.gpu_mode}.")
}
log.info("Genome file : ${params.genome}")
log.info("Genome annotation file : ${params.gtf}")

/*
 ****************************************************************
                        Channel definitions
 ****************************************************************
*/

// `Channel.of` emits whatever value it is handed, so an `ifEmpty` guard placed
// after it cannot detect a missing parameter. Validate the values up front instead.
if (!params.input) {
    error "No fast5/q folder defined."
}
if (!params.adapt) {
    error "No adapter sequence defined."
}

// Input folder (fast5, or fastq when basecalling is skipped)
Channel
    .of( params.input )
    .set { input }

// 5'RACE adapter sequence
Channel
    .of( params.adapt )
    .set { adapt }

// NOTE(review): for a literal (non-glob) path `fromPath` emits the path even when
// the file does not exist unless `checkIfExists: true` is set, so the `ifEmpty`
// guards below presumably only fire for glob patterns without a match — consider
// enabling `checkIfExists` once the mapping steps are switched back on.

// Reference genome (fasta)
Channel
    .fromPath( params.genome )
    .ifEmpty { error "No genome defined, a fasta file containing the full length preC RNA from HBV genome." }
    .set { genome }

// Genome annotation (gtf)
Channel
    .fromPath( params.gtf )
    .ifEmpty { error "No annotation defined, a gtf file describing transcripts and splice variants." }
    .set { gtf }

// .map( it -> [it.baseName, it])

/*
 ****************************************************************
                          Imports
 ****************************************************************
*/

if(!params.skipBC) {
  /* Hardware configuration, if Nvidia CUDA compatible graphic card is installed, use guppy-gpu, else guppy-cpu (much slower)*/
  if(params.gpu_mode) {
    include { basecall_fast5_gpu } from "./nf_modules/ont-guppy/main.nf"
  }
  else {
    include { basecall_fast5_cpu } from "./nf_modules/ont-guppy/main.nf"
  }
}

// TODO: replace `concatenate` with the seqkit-based implementation to allow parallelization:
// include { concatenate } from "./nf_modules/seqkit/main.nf"
include { concatenate } from "./nf_modules/concatenate/main.nf"

include { cut_5pRACE } from "./nf_modules/cutadapt/main.nf"
include { hbv_genome } from "./nf_modules/minimap2/main.nf"
include { seqkit_grep } from "./nf_modules/seqkit/main.nf"
include { sort_bam as sort_bam_genome } from './nf_modules/samtools/main.nf' addParams(sort_bam_out: params.minimap2_genome_out)
include { index_bam as index_bam_genome } from './nf_modules/samtools/main.nf' addParams(index_bam_out: params.minimap2_genome_out)
include { start_position_counts } from "./nf_modules/samtools/main.nf"

// NanoSplicer functions (still to be created):
// include { jwr_check } from "./nf_modules/nanosplicer/main.nf"

/*
 ****************************************************************
                          Workflow
 ****************************************************************
*/

workflow {

  //######################## BASECALLING ########################

  // Pick what is handed to `concatenate`: the user-supplied folder when
  // basecalling is skipped, otherwise the `pass` output of the basecalling
  // process matching the selected mode (GPU or CPU).
  def reads_source = params.input
  if (!params.skipBC) {
    if (params.gpu_mode) {
      basecall_fast5_gpu(input)
      reads_source = basecall_fast5_gpu.out.pass
    }
    else {
      basecall_fast5_cpu(input)
      reads_source = basecall_fast5_cpu.out.pass
    }
  }

  // Replace by seqkit scat to parallelization
  concatenate(reads_source)

  //####################### PREPROCESSING #######################

  // Downstream steps are currently disabled; kept as-is for later re-activation.
  /*
  seqkit_grep(concatenate.out.merged_fastq, params.adapt, params.gsp)
  cut_5pRACE(seqkit_grep.out.filtered_fastq, params.adapt)

  //########################## MAPPING ##########################

  hbv_genome(cut_5pRACE.out.fastq_cutadapt, genome)
  sort_bam_genome(hbv_genome.out.bam)
  index_bam_genome(sort_bam_genome.out.sorted_bam.collect())

  //###################### QUANTIFICATION #######################

  start_position_counts(sort_bam_genome.out.sorted_bam)
  */

}