From 6a48c301c4e7d7f77f57d2df8962a6afb8965c35 Mon Sep 17 00:00:00 2001
From: aliarifki <aliarifki@outlook.fr>
Date: Wed, 14 Jun 2023 10:10:44 +0200
Subject: [PATCH] Ajout des barcodes

---
 src/bolero.nf                               | 45 ++++++++++-----------
 src/nf_modules/cutadapt/main.nf             |  8 ++--
 src/nf_modules/junction_nanosplicer/main.nf | 17 ++++----
 src/nf_modules/minimap2/main.nf             | 11 +++--
 src/nf_modules/nanosplicer/main.nf          | 26 ++++++++++++
 src/nf_modules/rna_count/main.nf            | 16 ++++----
 src/nf_modules/samtools/main.nf             | 24 ++++++-----
 src/nf_modules/seqkit/main.nf               | 39 ++++++++++--------
 src/nf_modules/start_positions/main.nf      | 19 +++++----
 9 files changed, 124 insertions(+), 81 deletions(-)
 create mode 100644 src/nf_modules/nanosplicer/main.nf

diff --git a/src/bolero.nf b/src/bolero.nf
index 3ff3c53..c9851d4 100755
--- a/src/bolero.nf
+++ b/src/bolero.nf
@@ -129,9 +129,9 @@ Channel
     .set { input }
 
 Channel
-  .of( params.adapt )
-  .ifEmpty { error "No adapter sequence defined." }
-  .set { adapt }
+    .of( params.adapt )
+    .ifEmpty { error "No adapter sequence defined." }
+    .set { adapt }
 
 Channel
     .fromPath( params.genome )
@@ -143,7 +143,10 @@ Channel
     .ifEmpty { error "No annotation defined, a gtf file describing transcripts and splice variants." }
     .set { gtf }
 
-// .map( it -> [it.baseName, it])
+Channel  // builds the barcodes channel (presumably one input sub-directory per barcode — confirm)
+    .fromPath(params.input+'*/', type: 'dir')  // directories matching the glob params.input + '*/'
+    .map(it -> [it.baseName, it])  // emit [directory base name, directory] pairs
+    .set{barcodes}  // passed to concatenate in the workflow when params.skipBC is set
 
 /*
  ****************************************************************
@@ -161,10 +164,8 @@ if(!params.skipBC) {
   }
 }
 
-// Replace concatenate by seqkit fct to parallelization:
+include { barecode } from "./nf_modules/barecode/main.nf" 
 include { concatenate } from "./nf_modules/seqkit/main.nf"
-//include { concatenate } from "./nf_modules/concatenate/main.nf"
-
 include { cut_5pRACE } from "./nf_modules/cutadapt/main.nf"
 include { hbv_genome } from "./nf_modules/minimap2/main.nf"
 include { seqkit_grep } from "./nf_modules/seqkit/main.nf"
@@ -178,9 +179,6 @@ include { junctions_nanosplicer } from "./nf_modules/junction_nanosplicer/main.n
 include { rna_count } from "./nf_modules/rna_count/main.nf"
 
 
-// creation des fonctions NanoSplicer:
-// include { jwr_check } from "./nf_modules/nanosplicer/main.nf"
-
 /*
  ****************************************************************
                           Workflow
@@ -189,42 +187,41 @@ include { rna_count } from "./nf_modules/rna_count/main.nf"
 
 workflow {
 
+
   //######################## BASECALLING ########################
 
-  if(params.skipBC) {
-    concatenate(params.input)
-    // Replace by seqkit scat to parallelization
+  if(params.skipBC) { // we take fastq files as input and skip basecalling
+    concatenate(barcodes)
   }
-  else {
+
+  // TODO: this branch still needs to be adapted (presumably to the per-barcode tuples that concatenate now expects)
+  else { // we take fast5 files as input and proceed to basecalling with guppy
     if(params.gpu_mode) {
       basecall_fast5_gpu(input)
       concatenate(basecall_fast5_gpu.out.pass)
-      // Replace by seqkit scat to parallelization
     }
     else {
       basecall_fast5_cpu(input)
       concatenate(basecall_fast5_cpu.out.pass)
-      // Replace by seqkit scat to parallelization
     }
   }
+
+
+
   //####################### PREPROCESSING #######################
-  
-  
-  
+    
+
   //Filtration (seqkit_grep looks for the 5'RACE and the gsp patterns in the reads to keep only mature ARNs)
   seqkit_grep(concatenate.out.merged_fastq, params.adapt, params.gsp)
   
   //Cut of the 5'RACE sequence
   cut_5pRACE(seqkit_grep.out.filtered_fastq, params.adapt)
 
-  
-
   //########################## MAPPING ##########################
 
-  
-  hbv_genome(cut_5pRACE.out.fastq_cutadapt, genome)
+  hbv_genome(cut_5pRACE.out.fastq_cutadapt, genome.collect())
+
   sort_index_bam(hbv_genome.out.bam)
-  // index_bam(sort_bam_genome.out.sorted_bam.collect())
 
   //###################### START POSITIONS #######################
 
diff --git a/src/nf_modules/cutadapt/main.nf b/src/nf_modules/cutadapt/main.nf
index b7ffd3b..7c72b44 100755
--- a/src/nf_modules/cutadapt/main.nf
+++ b/src/nf_modules/cutadapt/main.nf
@@ -4,23 +4,23 @@ container_url = "xgrand/cutadapt:${version}"
 process cut_5pRACE {
   container = "${container_url}"
   label "small_mem_mono_cpus"
-  tag "cutadapt"
+  tag "${barcode}"  // each task is tagged with the barcode carried in its input tuple
   
   if (params.cutadapt_out != "") {
     publishDir "results/${params.cutadapt_out}", mode: 'copy'
   }
 
   input:
-  path(fastq)
+  tuple val(barcode), path(fastq)  // barcode identifier and the FASTQ file handed to cutadapt
   val(adapt)
 
   output:
-  path("*_cut_*"), emit: fastq_cutadapt
+  tuple val(barcode), path("${barcode}_merged_porechoped_cut_fastq.fastq"), emit: fastq_cutadapt  // barcode is forwarded together with the file written by cutadapt -o
 
   """
   cutadapt -e 0.2 -g ${adapt} \
    --revcomp \
-   -o "merged_porechoped_cut_fastq.fastq" \
+   -o "${barcode}_merged_porechoped_cut_fastq.fastq" \
    ${fastq}
   """
 }
\ No newline at end of file
diff --git a/src/nf_modules/junction_nanosplicer/main.nf b/src/nf_modules/junction_nanosplicer/main.nf
index 9f0a209..337ed6f 100644
--- a/src/nf_modules/junction_nanosplicer/main.nf
+++ b/src/nf_modules/junction_nanosplicer/main.nf
@@ -5,23 +5,24 @@ params.nanosplicer_out = ""
 process junctions_nanosplicer{
   container = "${container_url}"
   label "small_mem_mono_cpus"
-  tag "identification de variants d'épissage"
+  tag "${barcode}"  // each task is tagged with the barcode value
   if (params.nanosplicer_out != "") {
     publishDir "results/${params.nanosplicer_out}", mode: 'copy'
   }
 
   input:
-    path(txt)
-    path(csv)
+    tuple val(barcode), path(txt)  // txt is passed to the R script as -c
+    tuple val(barcode), path(csv)  // csv is passed to the R script as -j; NOTE(review): barcode is bound by both input tuples and nothing here checks that they match — confirm the caller pairs the two channels per barcode
 
   output:
-    path("Rplots.pdf")
-    path("JWR_check_parsed.csv")
-    path("*.png")
-    path("identified_SPvariants.csv"), emit: identified_SPvariants
+    path("${barcode}/JWR_check_parsed.csv")  // expected in the ${barcode}/ directory the script cd's into
+    tuple val(barcode), path("${barcode}/${barcode}_identified_SPvariants.csv"), emit: identified_SPvariants  // identified_SPvariants.csv after the script's final mv adds the barcode prefix
 
   script:
     """
-    Rscript /Junctions_NanoSplicer.R -c ${txt} -j ${csv}
+    mkdir ${barcode}
+    cd ${barcode}/
+    Rscript /Junctions_NanoSplicer.R -c ../${txt} -j ../${csv}
+    mv identified_SPvariants.csv ${barcode}_identified_SPvariants.csv
     """
 }
\ No newline at end of file
diff --git a/src/nf_modules/minimap2/main.nf b/src/nf_modules/minimap2/main.nf
index 5e101b7..91c9193 100755
--- a/src/nf_modules/minimap2/main.nf
+++ b/src/nf_modules/minimap2/main.nf
@@ -89,22 +89,25 @@ params.mapping_hbv_genome = "-ax splice --secondary=no -G 1650 -u n --eqx"
 process hbv_genome {
   container = "${container_url}"
   label "big_mem_multi_cpus"
+  tag "${barcode}"  // each task is tagged with the barcode value
   if (params.minimap2_genome_out != "") {
     publishDir "results/${params.minimap2_genome_out}", mode: 'copy'
   }
 
   input:
-  path(fastq)
+  tuple val(barcode), path(fastq)  // barcode identifier and the reads given to minimap2
   path(genome)
 
   output:
-  path("*"), emit: bam
+  tuple val(barcode), path("${barcode}/${barcode}_res.bam"), emit: bam  // BAM written by the minimap2 | samtools view pipeline inside the ${barcode}/ directory
 
   script:
   memory = "${task.memory}" - ~/\s*GB/
   memory = memory.toInteger() / (task.cpus + 1.0)
   """
-  minimap2 ${params.mapping_hbv_genome} -t${task.cpus} -K ${memory} ${genome} ${fastq} |
-    samtools view -Shb - > res.bam
+  mkdir ${barcode}
+  cd ${barcode}/
+  minimap2 ${params.mapping_hbv_genome} -t ${task.cpus} -K ${memory} ../${genome} ../${fastq} |
+    samtools view -Shb - > ${barcode}_res.bam
   """
 }
\ No newline at end of file
diff --git a/src/nf_modules/nanosplicer/main.nf b/src/nf_modules/nanosplicer/main.nf
new file mode 100644
index 0000000..71908d7
--- /dev/null
+++ b/src/nf_modules/nanosplicer/main.nf
@@ -0,0 +1,26 @@
+version = "1.0"
+container_url = "xgrand/nanosplicer:${version}"  // container image used by jwr_checker
+
+params.nanosplicer_out = ""  // when non-empty, outputs are copied to results/<nanosplicer_out>
+process jwr_checker {  // runs /NanoSplicer/bin/JWR_checker.py on the BAM file of one barcode per task
+  container = "${container_url}"
+  label "big_mem_multi_cpus"
+  tag "${barcode}"
+  if (params.nanosplicer_out != "") {
+    publishDir "results/${params.nanosplicer_out}", mode: 'copy'
+  }
+
+  input:
+    tuple val(barcode), path(bam), path(index)  // index is staged but never referenced in the script; presumably the BAM index picked up implicitly by the tool — confirm
+
+  output:
+    tuple val(barcode), path("${barcode}/${barcode}_JWR_check.h5.csv"), emit: nanosplicer_jwr  // only the .h5.csv file is captured; presumably written by --output_csv next to the .h5 file — confirm
+
+  script:
+    """
+    mkdir ${barcode}
+    cd ${barcode}/
+    python3 /NanoSplicer/bin/JWR_checker.py --output_csv ../${bam} ${barcode}_JWR_check.h5 
+    """
+}
+
diff --git a/src/nf_modules/rna_count/main.nf b/src/nf_modules/rna_count/main.nf
index a2ae2ce..06afba6 100644
--- a/src/nf_modules/rna_count/main.nf
+++ b/src/nf_modules/rna_count/main.nf
@@ -5,22 +5,24 @@ params.rna_count_out = ""
 process rna_count{
   container = "${container_url}"
   label "small_mem_mono_cpus"
-  tag "RNA quantification"
+  tag "${barcode}"  // each task is tagged with the barcode value
   if (params.rna_count_out != "") {
     publishDir "results/${params.rna_count_out}", mode: 'copy'
   }
 
   input:
-    path(spvariants)
-    path(classification)
+    tuple val(barcode), path(spvariants)  // spvariants is passed to the R script as -s
+    tuple val(barcode), path(classification)  // classification is passed to the R script as -c; NOTE(review): barcode is bound by both input tuples and nothing here checks that they match — confirm the caller pairs the two channels per barcode
 
   output:
-    path("*.csv")
-    path("*.pdf")
-    path("*.png")
+    path("${barcode}/*.csv")  // all three globs are collected from the ${barcode}/ directory the script runs in
+    path("${barcode}/*.pdf")
+    path("${barcode}/*.png")
 
   script:
     """
-    Rscript /HBV_RNAs_count.R -s ${spvariants} -c ${classification}
+    mkdir ${barcode}
+    cd ${barcode}/
+    Rscript /HBV_RNAs_count.R -s ../${spvariants} -c ../${classification}
     """
 }
diff --git a/src/nf_modules/samtools/main.nf b/src/nf_modules/samtools/main.nf
index 0b48cd5..d44804b 100755
--- a/src/nf_modules/samtools/main.nf
+++ b/src/nf_modules/samtools/main.nf
@@ -24,21 +24,23 @@ samtools sort -@ ${task.cpus} ${bam} -O BAM -o ${bam.simpleName}_sorted.bam
 
 params.start_position_counts_out = ""
 process start_position_counts {
-    tag "Start positions count"
+    tag "${barcode}"  // each task is tagged with the barcode value
     label "big_mem_multi_cpus"
     publishDir "results/${params.start_position_counts_out}", mode: 'copy'
 
     input:
-        tuple path(bam), path(index)
+        tuple val(barcode), path(bam), path(index)  // index is staged but not referenced in the script
 
     output:
-        path "*", emit: count
+        tuple val(barcode), path("${barcode}/${barcode}_start_positions_counts.txt"), emit: count  // sorted fields 1 and 4 of the samtools view -F 260 output
 
     script:
 """
-samtools view -F 260 ${bam} |
+mkdir ${barcode}
+cd ${barcode}/
+samtools view -F 260 ../${bam} |
   cut -f 1,4 |
-  sort > Start_positions_counts.txt
+  sort > ${barcode}_start_positions_counts.txt
 """
 }
 
@@ -67,20 +69,22 @@ params.indexed_bam_out =""
 process sort_index_bam {
   container = "${container_url}"
   label "big_mem_multi_cpus"
-  tag "sorting"
+  tag "${barcode}"  // each task is tagged with the barcode value
   if (params.indexed_bam_out != "") {
     publishDir "results/${params.indexed_bam_out}", mode: 'copy'
   }
 
   input:
-    path(bam)
+    tuple val(barcode), path(bam)  // barcode identifier and the BAM file to sort
 
   output:
-    tuple path("*sorted.bam"), path("*.bai"), emit: indexed_bam
+    tuple val(barcode), path("${barcode}/*sorted.bam"), path("${barcode}/*.bai"), emit: indexed_bam  // matches ${barcode}_sorted.bam from samtools sort and the .bai expected from samtools index
 
   script:
 """
-samtools sort -@ ${task.cpus} ${bam} -o ${bam.simpleName}_sorted.bam
-samtools index -@ ${task.cpus} ${bam.simpleName}_sorted.bam
+mkdir ${barcode}
+cd ${barcode}/
+samtools sort -@ ${task.cpus} ../${bam} -o ${barcode}_sorted.bam
+samtools index -@ ${task.cpus} ${barcode}_sorted.bam
 """
 }
\ No newline at end of file
diff --git a/src/nf_modules/seqkit/main.nf b/src/nf_modules/seqkit/main.nf
index e6d7e6b..683c924 100755
--- a/src/nf_modules/seqkit/main.nf
+++ b/src/nf_modules/seqkit/main.nf
@@ -29,35 +29,37 @@ params.seqkit_grep_out = ""
 process seqkit_grep {
   container = "${container_url}"
   label "small_mem_multi_cpus"
-  tag "Filter_reads"
+  tag "${barcode}"  // each task is tagged with the barcode value
   if (params.seqkit_grep_out != "") {
     publishDir "results/${params.seqkit_grep_out}", mode: 'copy'
   }
   
   input:
-    path(fastq)
+    tuple val(barcode), path(fastq)  // barcode identifier and the reads to filter
     val(adapt)
     val(gsp)
 
   output:
-    path("filtered_5RACE_GSP.fastq"), emit: filtered_fastq
-    path("seq_stats.csv")
-    path("*.txt")
-    path("filtered_5RACE.fastq")
+    tuple val(barcode), path("${barcode}/${barcode}_filtered_5RACE_GSP.fastq"), emit: filtered_fastq  // reads kept after both seqkit grep passes (adapt pattern, then gsp pattern)
+    path("${barcode}/*.csv")  // matches ${barcode}_seq_stats.csv written by the script
+    path("${barcode}/*.txt")  // matches mismatch.txt, adapt.txt and gsp.txt written by the script
+    path("${barcode}/${barcode}_filtered_5RACE.fastq")  // reads kept after the first (adapt pattern) seqkit grep pass
 
   script:
     lgadapt = Math.round(adapt.size().div(10))
     lggsp = Math.round(gsp.size().div(10))
     """
+    mkdir ${barcode}
+    cd ${barcode}/
     echo "mismatch allowed to 5'RACE adapter:  ${lgadapt}" > mismatch.txt
     echo "mismatch allowed to Gene Specific primer:  ${lggsp}" >> mismatch.txt
     echo ${adapt} > adapt.txt
     echo ${gsp} > gsp.txt
-    seqkit grep -i -f adapt.txt -m ${lgadapt} ${fastq} -o filtered_5RACE.fastq -j ${task.cpus}
-    seqkit grep -i -f gsp.txt -m ${lggsp} filtered_5RACE.fastq -o filtered_5RACE_GSP.fastq -j ${task.cpus}
-    seqkit stats ${fastq} -T -j ${task.cpus} > seq_stats.csv
-    seqkit stats filtered_5RACE.fastq -T -j ${task.cpus} | tail -n1 >> seq_stats.csv
-    seqkit stats filtered_5RACE_GSP.fastq -T -j ${task.cpus} | tail -n1 >> seq_stats.csv
+    seqkit grep -i -f adapt.txt -m ${lgadapt} ../${fastq} -o ${barcode}_filtered_5RACE.fastq -j ${task.cpus}
+    seqkit grep -i -f gsp.txt -m ${lggsp} ${barcode}_filtered_5RACE.fastq -o ${barcode}_filtered_5RACE_GSP.fastq -j ${task.cpus}
+    seqkit stats ../${fastq} -T -j ${task.cpus} > ${barcode}_seq_stats.csv
+    seqkit stats ${barcode}_filtered_5RACE.fastq -T -j ${task.cpus} | tail -n1 >> ${barcode}_seq_stats.csv
+    seqkit stats ${barcode}_filtered_5RACE_GSP.fastq -T -j ${task.cpus} | tail -n1 >> ${barcode}_seq_stats.csv
     """
 }
 
@@ -65,21 +67,24 @@ params.fastq_out = ""
 process concatenate {
   container = "${container_url}"
   label "big_mem_multi_cpus"
-  tag "Concatenate_reads"
+  tag "${barcode}"  // each task is tagged with the barcode value
   if (params.fastq_out != "") {
     publishDir "results/${params.fastq_out}", mode: 'copy'
   }
 
   input:
-    path fastq
+    tuple val(barcode), path(fastq)  // the script first renames the staged input to path_<name>, presumably so that mkdir ${barcode} cannot collide with an input of the same name — confirm
 
   output:
-    path "merged.fastq.gz", emit: merged_fastq
+    tuple val(barcode), path("${barcode}/${barcode}_merged.fastq.gz"), emit: merged_fastq  // seqkit scat output after gzip, inside the ${barcode}/ directory
 
   script:
     """
-    path=\$(readlink -f ${fastq})
-    seqkit scat -j ${task.cpus} -f \${path} --gz-only > merged.fastq
-    gzip merged.fastq
+    mv ${fastq} path_${fastq}
+    mkdir ${barcode}
+    cd ${barcode}/
+    path=\$(readlink -f ../path_${fastq})
+    seqkit scat -j ${task.cpus} -f \${path} --gz-only > ${barcode}_merged.fastq
+    gzip ${barcode}_merged.fastq
     """
 }
\ No newline at end of file
diff --git a/src/nf_modules/start_positions/main.nf b/src/nf_modules/start_positions/main.nf
index 27b8ebf..49f14e6 100644
--- a/src/nf_modules/start_positions/main.nf
+++ b/src/nf_modules/start_positions/main.nf
@@ -5,21 +5,26 @@ params.start_position_counts_out =""
 process start_position_individuals{
   container = "${container_url}"
   label "small_mem_mono_cpus"
-  tag "start positions"
+  tag "${barcode}"  // each task is tagged with the barcode value
   if (params.start_position_counts_out != "") {
     publishDir "results/${params.start_position_counts_out}", mode: 'copy'
   }
   input:
-    path(start_position_counts)
+    tuple val(barcode), path(start_position_counts)  // start_position_counts is passed to the R script as -i
 
   output:
-    path("Rplots.pdf")
-    path("*.png")
-    path("Count_reads_per_promoter.tsv")
-    path("classification_of_reads_per_RNA.txt"), emit: classification_of_reads
+    path("${barcode}/*.pdf")  // matches ${barcode}_Rplots.pdf renamed by the script
+    path("${barcode}/*.png")
+    path("${barcode}/*.tsv")  // matches ${barcode}_count_reads_per_promoter.tsv renamed by the script
+    tuple val(barcode), path("${barcode}/${barcode}_classification_of_reads_per_RNA.txt"), emit: classification_of_reads  // classification file after the script's mv adds the barcode prefix
 
   script:
     """
-    Rscript /Start_positions.R -i ${start_position_counts}
+    mkdir ${barcode}
+    cd ${barcode}/
+    Rscript /Start_positions.R -i ../${start_position_counts}
+    mv classification_of_reads_per_RNA.txt ${barcode}_classification_of_reads_per_RNA.txt
+    mv Count_reads_per_promoter.tsv ${barcode}_count_reads_per_promoter.tsv
+    mv Rplots.pdf ${barcode}_Rplots.pdf
     """
 }
\ No newline at end of file
-- 
GitLab