Ajout des barcodes

6a48c301 · aliarifki · f2dca093 · 6a48c301 · 6a48c301 · 6a48c301
Commit 6a48c301 authored 1 year ago by aliarifki
--- a/src/bolero.nf
+++ b/src/bolero.nf
@@ -129,9 +129,9 @@ Channel
    .set { input }

 Channel
-  .of( params.adapt )
-  .ifEmpty { error "No adapter sequence defined." }
-  .set { adapt }
+    .of( params.adapt )
+    .ifEmpty { error "No adapter sequence defined." }
+    .set { adapt }

 Channel
    .fromPath( params.genome )
@@ -143,7 +143,10 @@ Channel
    .ifEmpty { error "No annotation defined, a gtf file describing transcripts and splice variants." }
    .set { gtf }

-// .map( it -> [it.baseName, it])
+Channel
+    .fromPath(params.input+'*/', type: 'dir')
+    .map(it -> [it.baseName, it])
+    .set{barcodes}

 /*
 ****************************************************************
@@ -161,10 +164,8 @@ if(!params.skipBC) {
  }
 }

-// Replace concatenate by seqkit fct to parallelization:
+include { barecode } from "./nf_modules/barecode/main.nf" 
 include { concatenate } from "./nf_modules/seqkit/main.nf"
-//include { concatenate } from "./nf_modules/concatenate/main.nf"
-
 include { cut_5pRACE } from "./nf_modules/cutadapt/main.nf"
 include { hbv_genome } from "./nf_modules/minimap2/main.nf"
 include { seqkit_grep } from "./nf_modules/seqkit/main.nf"
@@ -178,9 +179,6 @@ include { junctions_nanosplicer } from "./nf_modules/junction_nanosplicer/main.n
 include { rna_count } from "./nf_modules/rna_count/main.nf"


-// creation des fonctions NanoSplicer:
-// include { jwr_check } from "./nf_modules/nanosplicer/main.nf"
-
 /*
 ****************************************************************
                          Workflow
@@ -189,42 +187,41 @@ include { rna_count } from "./nf_modules/rna_count/main.nf"

 workflow {

+
  //######################## BASECALLING ########################

-  if(params.skipBC) {
-    concatenate(params.input)
-    // Replace by seqkit scat to parallelization
+  if(params.skipBC) { // we take fastq files as input and skip basecalling
+    concatenate(barcodes)
  }
-  else {
+
+  // TODO: the fast5/basecalling branch below still has to be adapted to the per-barcode input
+  else { // we take fast5 files as input and proceed to basecalling with guppy
    if(params.gpu_mode) {
      basecall_fast5_gpu(input)
      concatenate(basecall_fast5_gpu.out.pass)
-      // Replace by seqkit scat to parallelization
    }
    else {
      basecall_fast5_cpu(input)
      concatenate(basecall_fast5_cpu.out.pass)
-      // Replace by seqkit scat to parallelization
    }
  }
+
+
+
  //####################### PREPROCESSING #######################
-  
-  
-  
+    
+
  //Filtration (seqkit_grep looks for the 5'RACE and the gsp patterns in the reads to keep only mature ARNs)
  seqkit_grep(concatenate.out.merged_fastq, params.adapt, params.gsp)
  
  //Cut of the 5'RACE sequence
  cut_5pRACE(seqkit_grep.out.filtered_fastq, params.adapt)

-  
-
  //########################## MAPPING ##########################

-  
-  hbv_genome(cut_5pRACE.out.fastq_cutadapt, genome)
+  hbv_genome(cut_5pRACE.out.fastq_cutadapt, genome.collect())
+
  sort_index_bam(hbv_genome.out.bam)
-  // index_bam(sort_bam_genome.out.sorted_bam.collect())

  //###################### START POSITIONS #######################


--- a/src/nf_modules/cutadapt/main.nf
+++ b/src/nf_modules/cutadapt/main.nf
@@ -4,23 +4,23 @@ container_url = "xgrand/cutadapt:${version}"
 process cut_5pRACE {
  container = "${container_url}"
  label "small_mem_mono_cpus"
-  tag "cutadapt"
+  tag "${barcode}"
  
  if (params.cutadapt_out != "") {
    publishDir "results/${params.cutadapt_out}", mode: 'copy'
  }

  input:
-  path(fastq)
+  tuple val(barcode), path(fastq)
  val(adapt)

  output:
-  path("*_cut_*"), emit: fastq_cutadapt
+  tuple val(barcode), path("${barcode}_merged_porechoped_cut_fastq.fastq"), emit: fastq_cutadapt

  """
  cutadapt -e 0.2 -g ${adapt} \
   --revcomp \
-   -o "merged_porechoped_cut_fastq.fastq" \
+   -o "${barcode}_merged_porechoped_cut_fastq.fastq" \
   ${fastq}
  """
 }
\ No newline at end of file
--- a/src/nf_modules/junction_nanosplicer/main.nf
+++ b/src/nf_modules/junction_nanosplicer/main.nf
@@ -5,23 +5,24 @@ params.nanosplicer_out = ""
 process junctions_nanosplicer{
  container = "${container_url}"
  label "small_mem_mono_cpus"
-  tag "identification de variants d'épissage"
+  tag "${barcode}"
  if (params.nanosplicer_out != "") {
    publishDir "results/${params.nanosplicer_out}", mode: 'copy'
  }

  input:
-    path(txt)
-    path(csv)
+    tuple val(barcode), path(txt)
+    tuple val(barcode), path(csv)

  output:
-    path("Rplots.pdf")
-    path("JWR_check_parsed.csv")
-    path("*.png")
-    path("identified_SPvariants.csv"), emit: identified_SPvariants
+    path("${barcode}/JWR_check_parsed.csv")
+    tuple val(barcode), path("${barcode}/${barcode}_identified_SPvariants.csv"), emit: identified_SPvariants

  script:
    """
-    Rscript /Junctions_NanoSplicer.R -c ${txt} -j ${csv}
+    mkdir ${barcode}
+    cd ${barcode}/
+    Rscript /Junctions_NanoSplicer.R -c ../${txt} -j ../${csv}
+    mv identified_SPvariants.csv ${barcode}_identified_SPvariants.csv
    """
 }
\ No newline at end of file
--- a/src/nf_modules/minimap2/main.nf
+++ b/src/nf_modules/minimap2/main.nf
@@ -89,22 +89,25 @@ params.mapping_hbv_genome = "-ax splice --secondary=no -G 1650 -u n --eqx"
 process hbv_genome {
  container = "${container_url}"
  label "big_mem_multi_cpus"
+  tag "${barcode}"
  if (params.minimap2_genome_out != "") {
    publishDir "results/${params.minimap2_genome_out}", mode: 'copy'
  }

  input:
-  path(fastq)
+  tuple val(barcode), path(fastq)
  path(genome)

  output:
-  path("*"), emit: bam
+  tuple val(barcode), path("${barcode}/${barcode}_res.bam"), emit: bam

  script:
  memory = "${task.memory}" - ~/\s*GB/
  memory = memory.toInteger() / (task.cpus + 1.0)
  """
-  minimap2 ${params.mapping_hbv_genome} -t${task.cpus} -K ${memory} ${genome} ${fastq} |
-    samtools view -Shb - > res.bam
+  mkdir ${barcode}
+  cd ${barcode}/
+  minimap2 ${params.mapping_hbv_genome} -t ${task.cpus} -K ${memory} ../${genome} ../${fastq} |
+    samtools view -Shb - > ${barcode}_res.bam
  """
 }
\ No newline at end of file
--- a/src/nf_modules/nanosplicer/main.nf
+++ b/src/nf_modules/nanosplicer/main.nf
+version = "1.0"
+container_url = "xgrand/nanosplicer:${version}"
+
+params.nanosplicer_out = ""
+process jwr_checker {
+  container = "${container_url}"
+  label "big_mem_multi_cpus"
+  tag "${barcode}"
+  if (params.nanosplicer_out != "") {
+    publishDir "results/${params.nanosplicer_out}", mode: 'copy'
+  }
+
+  input:
+    tuple val(barcode), path(bam), path(index)
+
+  output:
+    tuple val(barcode), path("${barcode}/${barcode}_JWR_check.h5.csv"), emit: nanosplicer_jwr
+
+  script:
+    """
+    mkdir ${barcode}
+    cd ${barcode}/
+    python3 /NanoSplicer/bin/JWR_checker.py --output_csv ../${bam} ${barcode}_JWR_check.h5 
+    """
+}
+
--- a/src/nf_modules/rna_count/main.nf
+++ b/src/nf_modules/rna_count/main.nf
@@ -5,22 +5,24 @@ params.rna_count_out = ""
 process rna_count{
  container = "${container_url}"
  label "small_mem_mono_cpus"
-  tag "RNA quantification"
+  tag "${barcode}"
  if (params.rna_count_out != "") {
    publishDir "results/${params.rna_count_out}", mode: 'copy'
  }

  input:
-    path(spvariants)
-    path(classification)
+    tuple val(barcode), path(spvariants)
+    tuple val(barcode), path(classification)

  output:
-    path("*.csv")
-    path("*.pdf")
-    path("*.png")
+    path("${barcode}/*.csv")
+    path("${barcode}/*.pdf")
+    path("${barcode}/*.png")

  script:
    """
-    Rscript /HBV_RNAs_count.R -s ${spvariants} -c ${classification}
+    mkdir ${barcode}
+    cd ${barcode}/
+    Rscript /HBV_RNAs_count.R -s ../${spvariants} -c ../${classification}
    """
 }
--- a/src/nf_modules/samtools/main.nf
+++ b/src/nf_modules/samtools/main.nf
@@ -24,21 +24,23 @@ samtools sort -@ ${task.cpus} ${bam} -O BAM -o ${bam.simpleName}_sorted.bam

 params.start_position_counts_out = ""
 process start_position_counts {
-    tag "Start positions count"
+    tag "${barcode}"
    label "big_mem_multi_cpus"
    publishDir "results/${params.start_position_counts_out}", mode: 'copy'

    input:
-        tuple path(bam), path(index)
+        tuple val(barcode), path(bam), path(index)

    output:
-        path "*", emit: count
+        tuple val(barcode), path("${barcode}/${barcode}_start_positions_counts.txt"), emit: count

    script:
 """
-samtools view -F 260 ${bam} |
+mkdir ${barcode}
+cd ${barcode}/
+samtools view -F 260 ../${bam} |
  cut -f 1,4 |
-  sort > Start_positions_counts.txt
+  sort > ${barcode}_start_positions_counts.txt
 """
 }

@@ -67,20 +69,22 @@ params.indexed_bam_out =""
 process sort_index_bam {
  container = "${container_url}"
  label "big_mem_multi_cpus"
-  tag "sorting"
+  tag "${barcode}"
  if (params.indexed_bam_out != "") {
    publishDir "results/${params.indexed_bam_out}", mode: 'copy'
  }

  input:
-    path(bam)
+    tuple val(barcode), path(bam)

  output:
-    tuple path("*sorted.bam"), path("*.bai"), emit: indexed_bam
+    tuple val(barcode), path("${barcode}/*sorted.bam"), path("${barcode}/*.bai"), emit: indexed_bam

  script:
 """
-samtools sort -@ ${task.cpus} ${bam} -o ${bam.simpleName}_sorted.bam
-samtools index -@ ${task.cpus} ${bam.simpleName}_sorted.bam
+mkdir ${barcode}
+cd ${barcode}/
+samtools sort -@ ${task.cpus} ../${bam} -o ${barcode}_sorted.bam
+samtools index -@ ${task.cpus} ${barcode}_sorted.bam
 """
 }
\ No newline at end of file
--- a/src/nf_modules/seqkit/main.nf
+++ b/src/nf_modules/seqkit/main.nf
@@ -29,35 +29,37 @@ params.seqkit_grep_out = ""
 process seqkit_grep {
  container = "${container_url}"
  label "small_mem_multi_cpus"
-  tag "Filter_reads"
+  tag "${barcode}"
  if (params.seqkit_grep_out != "") {
    publishDir "results/${params.seqkit_grep_out}", mode: 'copy'
  }
  
  input:
-    path(fastq)
+    tuple val(barcode), path(fastq)
    val(adapt)
    val(gsp)

  output:
-    path("filtered_5RACE_GSP.fastq"), emit: filtered_fastq
-    path("seq_stats.csv")
-    path("*.txt")
-    path("filtered_5RACE.fastq")
+    tuple val(barcode), path("${barcode}/${barcode}_filtered_5RACE_GSP.fastq"), emit: filtered_fastq
+    path("${barcode}/*.csv")
+    path("${barcode}/*.txt")
+    path("${barcode}/${barcode}_filtered_5RACE.fastq")

  script:
    lgadapt = Math.round(adapt.size().div(10))
    lggsp = Math.round(gsp.size().div(10))
    """
+    mkdir ${barcode}
+    cd ${barcode}/
    echo "mismatch allowed to 5'RACE adapter:  ${lgadapt}" > mismatch.txt
    echo "mismatch allowed to Gene Specific primer:  ${lggsp}" >> mismatch.txt
    echo ${adapt} > adapt.txt
    echo ${gsp} > gsp.txt
-    seqkit grep -i -f adapt.txt -m ${lgadapt} ${fastq} -o filtered_5RACE.fastq -j ${task.cpus}
-    seqkit grep -i -f gsp.txt -m ${lggsp} filtered_5RACE.fastq -o filtered_5RACE_GSP.fastq -j ${task.cpus}
-    seqkit stats ${fastq} -T -j ${task.cpus} > seq_stats.csv
-    seqkit stats filtered_5RACE.fastq -T -j ${task.cpus} | tail -n1 >> seq_stats.csv
-    seqkit stats filtered_5RACE_GSP.fastq -T -j ${task.cpus} | tail -n1 >> seq_stats.csv
+    seqkit grep -i -f adapt.txt -m ${lgadapt} ../${fastq} -o ${barcode}_filtered_5RACE.fastq -j ${task.cpus}
+    seqkit grep -i -f gsp.txt -m ${lggsp} ${barcode}_filtered_5RACE.fastq -o ${barcode}_filtered_5RACE_GSP.fastq -j ${task.cpus}
+    seqkit stats ../${fastq} -T -j ${task.cpus} > ${barcode}_seq_stats.csv
+    seqkit stats ${barcode}_filtered_5RACE.fastq -T -j ${task.cpus} | tail -n1 >> ${barcode}_seq_stats.csv
+    seqkit stats ${barcode}_filtered_5RACE_GSP.fastq -T -j ${task.cpus} | tail -n1 >> ${barcode}_seq_stats.csv
    """
 }

@@ -65,21 +67,24 @@ params.fastq_out = ""
 process concatenate {
  container = "${container_url}"
  label "big_mem_multi_cpus"
-  tag "Concatenate_reads"
+  tag "${barcode}"
  if (params.fastq_out != "") {
    publishDir "results/${params.fastq_out}", mode: 'copy'
  }

  input:
-    path fastq
+    tuple val(barcode), path(fastq)

  output:
-    path "merged.fastq.gz", emit: merged_fastq
+    tuple val(barcode), path("${barcode}/${barcode}_merged.fastq.gz"), emit: merged_fastq

  script:
    """
-    path=\$(readlink -f ${fastq})
-    seqkit scat -j ${task.cpus} -f \${path} --gz-only > merged.fastq
-    gzip merged.fastq
+    mv ${fastq} path_${fastq}
+    mkdir ${barcode}
+    cd ${barcode}/
+    path=\$(readlink -f ../path_${fastq})
+    seqkit scat -j ${task.cpus} -f \${path} --gz-only > ${barcode}_merged.fastq
+    gzip ${barcode}_merged.fastq
    """
 }
\ No newline at end of file
--- a/src/nf_modules/start_positions/main.nf
+++ b/src/nf_modules/start_positions/main.nf
@@ -5,21 +5,26 @@ params.start_position_counts_out =""
 process start_position_individuals{
  container = "${container_url}"
  label "small_mem_mono_cpus"
-  tag "start positions"
+  tag "${barcode}"
  if (params.start_position_counts_out != "") {
    publishDir "results/${params.start_position_counts_out}", mode: 'copy'
  }
  input:
-    path(start_position_counts)
+    tuple val(barcode), path(start_position_counts)

  output:
-    path("Rplots.pdf")
-    path("*.png")
-    path("Count_reads_per_promoter.tsv")
-    path("classification_of_reads_per_RNA.txt"), emit: classification_of_reads
+    path("${barcode}/*.pdf")
+    path("${barcode}/*.png")
+    path("${barcode}/*.tsv")
+    tuple val(barcode), path("${barcode}/${barcode}_classification_of_reads_per_RNA.txt"), emit: classification_of_reads

  script:
    """
-    Rscript /Start_positions.R -i ${start_position_counts}
+    mkdir ${barcode}
+    cd ${barcode}/
+    Rscript /Start_positions.R -i ../${start_position_counts}
+    mv classification_of_reads_per_RNA.txt ${barcode}_classification_of_reads_per_RNA.txt
+    mv Count_reads_per_promoter.tsv ${barcode}_count_reads_per_promoter.tsv
+    mv Rplots.pdf ${barcode}_Rplots.pdf
    """
 }
\ No newline at end of file