Add orelob.nf, pipeline to analyse 3pRACE-Nanopore RNAseq.

260af0c7 · Xavier Grand · 61b176c8 · 260af0c7 · 260af0c7 · 260af0c7
Commit 260af0c7 authored Feb 20, 2024 by Xavier Grand
--- a/src/.docker_modules/r-bolero/1.1/ggplot_theme_Publication-2.R
+++ b/src/.docker_modules/r-bolero/1.1/ggplot_theme_Publication-2.R
--- a/src/.docker_modules/samtools/1.17/Dockerfile
+++ b/src/.docker_modules/samtools/1.17/Dockerfile
--- a/src/.docker_modules/seqkit/2.4.0/Dockerfile
+++ b/src/.docker_modules/seqkit/2.4.0/Dockerfile
--- a/src/nf_modules/junction_nanosplicer/main.nf
+++ b/src/nf_modules/junction_nanosplicer/main.nf
--- a/src/nf_modules/nanosplicer/main.nf
+++ b/src/nf_modules/nanosplicer/main.nf
--- a/src/nf_modules/ont-guppy/main.nf
+++ b/src/nf_modules/ont-guppy/main.nf
--- a/src/nf_modules/pychopper/main.nf
+++ b/src/nf_modules/pychopper/main.nf
--- a/src/nf_modules/pycoqc/main.nf
+++ b/src/nf_modules/pycoqc/main.nf
--- a/src/nf_modules/rna_count/main.nf
+++ b/src/nf_modules/rna_count/main.nf
--- a/src/nf_modules/start_positions/main.nf
+++ b/src/nf_modules/start_positions/main.nf
--- a/src/orelob.nf
+++ b/src/orelob.nf
+#!/usr/bin/env nextflow
+
+nextflow.enable.dsl=2
+//syntax extension DSL2
+
+/*
+========================================================================================================================
+                                                      Orelob
+========================================================================================================================
+
+orelob pipeline :
+ * Pipeline dedicated to transcription termination analysis of Hepatitis B Virus from nanopore seq
+ * Preprocessing, filtration, alignment, quantification.
+
+ ****************************************************************
+                      Help Message Definition
+ ****************************************************************
+*/
+
+/**
+ * Print the command-line usage of the pipeline through `log.info`.
+ *
+ * The text lists the mandatory arguments, the reference files, the Guppy
+ * basecalling / barcoding options and the help flags. It takes no argument
+ * and returns nothing useful; it is called when --help or --h is set.
+ */
+def helpMessage() {
+    log.info"""
+    Usage:
+    The typical command for running the pipeline is as follows:
+
+      nextflow ./src/orelob.nf -c ./src/nextflow.config -profile singularity
+
+    Nextflow parameters:
+      -profile [str]                  Configuration profile to use.
+                                      Available: docker, singularity, podman, psmn, ccin2p3
+
+    Mandatory arguments:
+      --input [path]                  Path to the folder containing fast5 files.
+                                      If skip basecalling option enabled, path to fastq files folder.
+      --adapt [file]                  Sequence of 3'RACE adapter.
+      --gsp [file]                    Sequence of gene-specific primer used in 3'RACE amplification step.
+
+    References:
+      --genome [file]                 Path to the fasta file containing the genome.
+      --gtf [file]                    Path to the gtf file containing the genome annotation.
+
+    Nanopore basecalling:
+      --skipBC [boolean]              Skip basecalling step. If true, give fastq folder as input. Default: true.
+      --flowcell [str]                Nanopore flowcell. Default = FLO-MIN106.
+      --kit [str]                     Nanopore kit. Default = SQK-PBK004.
+      --gpu_mode [boolean]            Guppy basecaller configuration. Default: false.
+                                      "gpu" mode is dedicated to NVIDIA Cuda compatible system according to Guppy specifications.
+
+    Nanopore barcoding:
+      --kit_barcoding                 Nanopore barcoding kit.
+      --config_file                   Nanopore configuration file.
+
+    GPU basecalling parameters:
+      --min_qscore [float]            Minimum quality score threshold, default = 7.0.
+      --gpu_runners_per_device [int]  Number of runners per device, default = 32 (refer to guppy manual).
+      --num_callers [int]             Number of callers, default = 16 (refer to guppy manual).
+      --chunks_per_runner [int]       Number of chunks per runner, default = 512 (refer to guppy manual).
+      --chunk_size [int]              Chunk size, default = 1900 (refer to guppy manual).
+
+    Help:
+      --help | --h                    Display this help message.
+
+    """.stripIndent()
+}
+
+// Help flags. Both default to an empty (falsy) string, so the usage text is
+// printed only when one of them is set to a truthy value (e.g. `--help`).
+
+params.h = ""
+params.help = ""
+
+// Print the usage and stop before anything else is set up.
+if (params.h || params.help) {
+    helpMessage()
+    exit 0
+}
+
+/*
+ ****************************************************************
+                      Default Parameters
+ ****************************************************************
+*/
+
+/* Params in */
+
+// --input is mandatory and has no usable default. It is declared here so that
+// reading `params.input` further down does not hit an undefined parameter.
+params.input = null
+params.skipBC = true
+params.gpu_mode = false
+params.adapt = ""
+params.gsp = "" 
+// NOTE(review): the two reference defaults below are absolute paths on the
+// author's machine; on any other system --genome and --gtf must be provided.
+params.genome = "/home/xavier/Data/Genome/202201_Full-length_HBV_GTFv3/20230516_HBV_FL_preCore_reference.fasta"
+params.gtf = "/home/xavier/Data/Genome/202201_Full-length_HBV_GTFv3/20230516_GTF_preCore_FL_HBV_XGR.gtf"
+
+// Guppy basecalling / barcoding settings (see the help message for their meaning).
+params.flowcell = "FLO-MIN106"
+params.kit = "SQK-PBK004"
+params.min_qscore = 7.0
+params.gpu_runners_per_device = 32
+params.num_callers = 16
+params.chunks_per_runner = 512
+params.chunk_size = 1900
+params.config_file = ""
+params.kit_barcoding = ""
+
+
+/* Params out */
+
+// Output sub-folders, numbered in pipeline order. Several steps deliberately
+// share a folder (03_fastq/ and 05_minimap2/).
+params.basecalling_out = "01_basecalling/"
+params.barcoding_out = "02_barcoding/"
+params.fastq_out = "03_fastq/"
+params.seqkit_grep_out = "03_fastq/"
+params.porechop_out = "03_fastq/"
+params.cutadapt_out = "04_cutadapt/"
+params.minimap2_genome_out = "05_minimap2/"
+params.filtered_bam_out = "05_minimap2/"
+params.start_position_counts_out = "06_start_positions/"
+params.nanosplicer_out = "07_nanosplicer/"
+params.rna_count_out = "08_RNA_count/"
+params.rna_qc_out = "09_quality_control/"
+
+/*
+ ****************************************************************
+                              Logs
+ ****************************************************************
+*/
+
+// Echo the main run settings at start-up.
+log.info "fast5/q folder : ${params.input}"
+log.info "3'RACE adapter sequence : ${params.adapt}"
+log.info "Gene specific primer : ${params.gsp}"
+// The Guppy mode is only relevant when basecalling is actually performed.
+if (!params.skipBC) {
+    log.info "Guppy basecalling calculation using GPU mode : ${params.gpu_mode}."
+}
+log.info "Genome file : ${params.genome}"
+log.info "Genome annotation file : ${params.gtf}"
+
+/*
+ ****************************************************************
+                        Channel definitions
+ ****************************************************************
+*/
+
+// --input is mandatory. A value channel built with Channel.of() is never
+// empty, so the missing-parameter case has to be tested explicitly.
+if (!params.input) {
+    error("No fast5/q folder defined.")
+}
+
+// Input folder as given on the command line (fast5 folder for basecalling).
+Channel
+    .of( params.input )
+    .set { input }
+
+// Reference genome; checkIfExists stops the run immediately when the fasta
+// file cannot be found instead of failing later in the mapping step.
+Channel
+    .fromPath( params.genome, checkIfExists: true )
+    .ifEmpty { error "No genome defined, a fasta file containing the full length preC RNA from HBV genome." }
+    .set { genome }
+
+// Genome annotation.
+Channel
+    .fromPath( params.gtf )
+    .ifEmpty { error "No annotation defined, a gtf file describing transcripts and splice variants." }
+    .set { gtf }
+
+// One [folder name, folder path] tuple per sub-folder of --input.
+// Trailing slashes are stripped first so that the glob is the same whether or
+// not --input was given with a final '/'.
+Channel
+    .fromPath(params.input.toString().replaceAll('/+$', '') + '/*/', type: 'dir')
+    .map { sample_dir -> [sample_dir.baseName, sample_dir] }
+    .set { barcodes }
+
+
+/*
+ ****************************************************************
+                          Imports
+ ****************************************************************
+*/
+
+// Guppy processes are only needed when basecalling is performed, and only the
+// flavour (GPU or CPU) matching --gpu_mode is imported.
+// barcoding_cpu is imported in the CPU branch only: an additional
+// unconditional include would declare the same process name twice whenever
+// skipBC is false and gpu_mode is false.
+if(!params.skipBC) {
+  /* Hardware configuration, if Nvidia CUDA compatible graphic card is installed, use guppy-gpu, else guppy-cpu (much slower)*/
+  if(params.gpu_mode) {
+    include { basecall_fast5_gpu } from "./nf_modules/ont-guppy/main.nf"
+    include { barcoding_gpu } from "./nf_modules/ont-guppy/main.nf"
+  }
+  else {
+    include { basecall_fast5_cpu } from "./nf_modules/ont-guppy/main.nf"
+    include { barcoding_cpu } from "./nf_modules/ont-guppy/main.nf"
+  }
+}
+
+include { control_basecalling } from "./nf_modules/pycoqc/main.nf"
+include { control_bam } from "./nf_modules/pycoqc/main.nf"
+include { concatenate } from "./nf_modules/seqkit/main.nf"
+include { cut_5pRACE } from "./nf_modules/cutadapt/main.nf"
+include { hbv_genome } from "./nf_modules/minimap2/main.nf"
+include { seqkit_grep } from "./nf_modules/seqkit/main.nf"
+include { sort_bam } from './nf_modules/samtools/main.nf' addParams(sort_bam_out: params.minimap2_genome_out)
+include { index_bam } from './nf_modules/samtools/main.nf' addParams(index_bam_out: params.minimap2_genome_out)
+include { sort_index_bam } from './nf_modules/samtools/main.nf' addParams(indexed_bam_out: params.minimap2_genome_out)
+include { filter_as } from './nf_modules/samtools/main.nf'
+include { start_position_counts } from "./nf_modules/samtools/main.nf"
+include { start_position_individuals } from "./nf_modules/start_positions/main.nf"
+include { jwr_checker } from "./nf_modules/nanosplicer/main.nf"
+include { junctions_nanosplicer } from "./nf_modules/junction_nanosplicer/main.nf"
+include { rna_count } from "./nf_modules/rna_count/main.nf"
+
+include { porechop } from "./nf_modules/porechop/main.nf"
+include { trimmming_pychopper } from "./nf_modules/pychopper/main.nf"
+
+/*
+ ****************************************************************
+                          Workflow
+ ****************************************************************
+*/
+
+workflow {
+
+  // Outputs of the basecalling process; stays null when basecalling is skipped.
+  def basecalled = null
+  // Channel of [name, path] tuples handed to `concatenate`.
+  def reads_by_sample = null
+
+  //######################## INPUT READS ########################
+
+  if (params.skipBC) {
+    // fastq input: basecalling is skipped and the sub-folders of --input are used directly
+    reads_by_sample = barcodes
+  }
+  else {
+    // fast5 input: basecalling with guppy (GPU or CPU flavour)
+    if (params.gpu_mode) {
+      basecall_fast5_gpu(input)
+      basecalled = basecall_fast5_gpu.out
+    }
+    else {
+      basecall_fast5_cpu(input)
+      basecalled = basecall_fast5_cpu.out
+    }
+
+    if (params.kit_barcoding != "") {
+      // Demultiplexing: one [name, path] tuple per element emitted by the barcoding step
+      def demultiplexed = null
+      if (params.gpu_mode) {
+        barcoding_gpu(basecalled.pass)
+        demultiplexed = barcoding_gpu.out.barcodes
+      }
+      else {
+        barcoding_cpu(basecalled.pass)
+        demultiplexed = barcoding_cpu.out.barcodes
+      }
+      reads_by_sample = demultiplexed
+        .flatten()
+        .map { barcode_dir -> [barcode_dir.name, barcode_dir] }
+    }
+    else {
+      // No barcoding kit: every passed read goes to a single sample named "Sample"
+      reads_by_sample = basecalled.pass
+        .map { passed_reads -> ["Sample", passed_reads] }
+    }
+  }
+
+  concatenate(reads_by_sample)
+
+  //####################### PREPROCESSING #######################
+
+  // Filtering: seqkit_grep is given the RACE adapter (--adapt) and the
+  // gene-specific primer (--gsp); per the original author's note it keeps
+  // only the reads of mature RNAs carrying both patterns.
+  seqkit_grep(concatenate.out.merged_fastq, params.adapt, params.gsp)
+
+  // Trimming with porechop
+  porechop(seqkit_grep.out.filtered_fastq)
+
+  // Alternative trimming with pychopper (disabled)
+  //trimmming_pychopper(seqkit_grep.out.filtered_fastq)
+
+  // Removal of the RACE adapter sequence (--adapt) from the trimmed reads.
+  // The two commented calls are the variants for pychopper / untrimmed input.
+  cut_5pRACE(porechop.out.porechoped_fastq, params.adapt)
+  //cut_5pRACE(trimmming_pychopper.out.pychoped_fastq, params.adapt)
+  //cut_5pRACE(seqkit_grep.out.filtered_fastq, params.adapt)
+
+  //########################## MAPPING ##########################
+
+  hbv_genome(cut_5pRACE.out.fastq_cutadapt, genome.collect())
+
+  // Alignment filtering
+  filter_as(hbv_genome.out.bam)
+
+  // Sorting and indexing
+  sort_index_bam(filter_as.out.filtered_bam)
+
+  // Quality control: relies on the sequencing summary written by the
+  // basecaller, so it only runs when basecalling was performed.
+  if (params.skipBC == false) {
+    control_bam(basecalled.sequencing_summary.collect(), sort_index_bam.out.indexed_bam)
+  }
+
+  //###################### START POSITIONS #######################
+
+  // Identification of start positions
+  start_position_counts(sort_index_bam.out.indexed_bam)
+
+  // Identification of RNA
+  start_position_individuals(start_position_counts.out.count)
+
+  //###################### SPLICE VARIANTS #######################
+
+  // Identification of splicing junction sites
+  jwr_checker(sort_index_bam.out.indexed_bam)
+
+  // Pair, on their first element, the read classification with the junction output
+  def nanosplicer_inputs = start_position_individuals.out.classification_of_reads
+    .combine(jwr_checker.out.nanosplicer_jwr, by: 0)
+
+  // Identification of variants
+  junctions_nanosplicer(nanosplicer_inputs)
+
+  //######################### RNA COUNT ##########################
+
+  // Pair, on their first element, the identified variants with the read classification
+  def rna_count_inputs = junctions_nanosplicer.out.identified_SPvariants
+    .combine(start_position_individuals.out.classification_of_reads, by: 0)
+
+  // Variants count
+  rna_count(rna_count_inputs)
+
+}
+
+// End message:
+
+// Completion handler: prints a short execution summary when the run
+// succeeded, or the error report and exit status when it failed.
+workflow.onComplete {
+
+   println ( workflow.success ? """
+       Orelob execution summary
+       ---------------------------
+       Completed at	: ${workflow.complete}
+       Duration    	: ${workflow.duration}
+       Success     	: ${workflow.success}
+       workDir     	: ${workflow.workDir}
+       exit status 	: ${workflow.exitStatus}
+       """ : """
+       Failed: ${workflow.errorReport}
+       exit status : ${workflow.exitStatus}
+       """
+   )
+}