From c3401f6e4a2311bc70d78164ab49f613422a4e3d Mon Sep 17 00:00:00 2001
From: Emmanuel Labaronne <emmanuel.labaronne@ens-lyon.fr>
Date: Thu, 24 Sep 2020 18:08:27 +0200
Subject: [PATCH] src/RNAseq.nf : add dedup option

---
 src/RNAseq.config |  4 ++++
 src/RNAseq.nf     | 60 +++++++++++++++++++++++++++++++++++++++++++----
 2 files changed, 60 insertions(+), 4 deletions(-)

diff --git a/src/RNAseq.config b/src/RNAseq.config
index 83560a81..c8f39d48 100644
--- a/src/RNAseq.config
+++ b/src/RNAseq.config
@@ -92,6 +92,10 @@ profiles {
         cpus = 4
         container = "lbmc/hisat2:2.1.0"
       }
+      withName: dedup_genome {
+        container = "lbmc/umi_tools:1.0.0"
+        cpus = 1
+      }
       withName: sort_bam {
         container = "lbmc/samtools:1.7"
         cpus = 1
diff --git a/src/RNAseq.nf b/src/RNAseq.nf
index 96ef0945..8022c44a 100644
--- a/src/RNAseq.nf
+++ b/src/RNAseq.nf
@@ -8,12 +8,16 @@ params.do_fastqc = true
 params.script_cov = "src/norm_coverage.sh"
 params.filter = "data/filter/human_rRNA_tRNA/*.bt2"
 params.index_genome = "data/genome/*.ht2"
+params.do_dedup = true
 
 log.info "input raw : ${params.fastq_raw}"
 log.info "outut directory : ${params.output}"
-log.info "do fastqc ? : ${params.do_fastqc}"
 log.info "filter index files : ${params.filter}"
 log.info "genome index : ${params.index_genome}"
+log.info ""
+log.info "do fastqc ? : ${params.do_fastqc}"
+log.info "do deduplication ? : ${params.do_dedup}"
+log.info ""
 
 Channel
    .fromFilePairs(params.fastq_raw)
@@ -75,7 +79,6 @@ process trimming {
            ${reads[0]} ${reads[1]} > ${file_id}_first_report.txt
 
   cutadapt -j ${task.cpus} \
-           -a "A{100}" \
            -u -10 \
            -o ${file_id}_cut_R1.fastq.gz \
            ${file_id}_tmp_R1.fastq.gz \
@@ -183,7 +186,7 @@ process hisat2_human {
 
   output:
     file "*.fastq.gz" into HISAT_UNALIGNED
-    set file_id, "*_sorted.{bam,bam.bai}" into HISAT_ALIGNED
+    set file_id, "*.{bam,bam.bai}" into HISAT_ALIGNED
     file "*.txt" into HISAT_LOG
 
   script:
@@ -211,7 +214,56 @@ fi
 """
 }
 
-//sorted_bam_files.into{sorted_bam_htseq; sorted_bam_coverage}
+HISAT_ALIGNED.into{HISAT_ALIGNED_FASTQC;
+                   HISAT_ALIGNED_DEDUP}
+
+
+/* Fastqc of filtred reads */
+
+process fastqc_genome {
+  tag "$file_id"
+  publishDir "${params.output}/00_fastqc/genome/", mode: 'copy'
+
+  input:
+  set file_id, file(reads) from HISAT_ALIGNED_FASTQC
+
+  output:
+  set file_id, "*.{html,zip}" into OUTPUT_FASTQC_GENOME
+
+  when:
+  params.do_fastqc
+
+    """
+    fastqc ${file_id}* -t ${task.cpus}
+    """
+}
+
+/* Deduplication of reads */
+
+process dedup_genome {
+  tag "$file_id"
+  publishDir "${params.output}/03_hisat2/dedup/", mode: 'copy'
+
+  input:
+  set file_id, file(bam) from HISAT_ALIGNED_DEDUP
+
+  output:
+  set file_id, "*.bam" into DEDUP_GENOME
+  file "*.log" into DEDUP_LOG
+
+  when:
+  params.do_dedup
+
+    """
+    umi_tools dedup -I ${bam}[0] \
+    -S ${file_id}.dedup.bam \
+    2> dedup.log
+    """
+}
+
+if (! params.do_dedup) {
+  HISAT_ALIGNED_DEDUP.into{DEDUP_GENOME}
+}
 
 /*                   HTseq
 
-- 
GitLab