From bbc74d28baf8569ef60f7c4ce61c0857cd620c3b Mon Sep 17 00:00:00 2001
From: Emmanuel Labaronne <emmanuel.labaronne@ens-lyon.fr>
Date: Fri, 25 Sep 2020 12:35:52 +0200
Subject: [PATCH] src/RNAseq.nf : adapt htseq and add post genomic alignement

---
 src/RNAseq.config |   4 ++
 src/RNAseq.nf     | 108 ++++++++++++++++++++++++++++++++++------------
 2 files changed, 84 insertions(+), 28 deletions(-)

diff --git a/src/RNAseq.config b/src/RNAseq.config
index d15b85fd..a4be61a6 100644
--- a/src/RNAseq.config
+++ b/src/RNAseq.config
@@ -104,6 +104,10 @@ profiles {
         container = "lbmc/htseq:0.11.2"
         cpus = 1
       }
+      withName: hisat2_postGenomic {
+        cpus = 4
+        container = "lbmc/hisat2:2.1.0"
+      }
       withName: multiqc {
         container = "ewels/multiqc:1.9"
         cpus = 1
diff --git a/src/RNAseq.nf b/src/RNAseq.nf
index e96e2fe5..87242145 100644
--- a/src/RNAseq.nf
+++ b/src/RNAseq.nf
@@ -5,18 +5,24 @@
 params.fastq_raw = "data/fastq/*{_R1,_R2}.fastq.gz"
 params.output = "results"
 params.do_fastqc = true
-params.script_cov = "src/norm_coverage.sh"
+//params.script_cov = "src/norm_coverage.sh"
 params.filter = "data/filter/human_rRNA_tRNA/*.bt2"
 params.index_genome = "data/genome/*.ht2"
 params.do_dedup = true
+params.gtf = "data/annotation/*.gtf"
+params.index_postgenome = "data/post_genome/*.ht2"
+params.do_postgenome = true
 
 log.info "input raw : ${params.fastq_raw}"
 log.info "outut directory : ${params.output}"
 log.info "filter index files : ${params.filter}"
 log.info "genome index : ${params.index_genome}"
+log.info "gtf file : ${params.gtf}"
+log.info "post-genome index : ${params.index_postgenome}"
 log.info ""
 log.info "do fastqc ? : ${params.do_fastqc}"
 log.info "do deduplication ? : ${params.do_dedup}"
+log.info "do post genome alignement ? : ${params.do_postgenome}"
 log.info ""
 
 Channel
@@ -35,6 +41,16 @@ Channel
    .ifEmpty { error "Cannot find any index files matching: ${params.index_genome}" }
    .set { GENOME_INDEX }
 
+Channel
+   .fromPath( params.gtf )
+   .ifEmpty { error "Cannot find any gtf file matching: ${params.gtf}" }
+   .set { GTF_FILE }
+
+Channel
+   .fromPath ( params.index_genome )
+   .ifEmpty { error "Cannot find any index files matching: ${params.index_genome}" }
+   .set { POSTGENOME_INDEX }
+
 /* Fastqc of raw input */
 
 process fastqc_raw {
@@ -66,7 +82,9 @@ process trimming {
 
   output:
   set file_id, "*cut_{R1,R2}.fastq.gz" into CUTADAPT_OUTPUT
-  file "*.txt" into CUTADAPT_LOG
+  file "*first_report.txt" into CUTADAPT_LOG
+  file "*{second,third}_report.txt" into CUTADAPT_LOG_2
+
 
   script:
   """
@@ -79,7 +97,7 @@ process trimming {
            ${reads[0]} ${reads[1]} > ${file_id}_first_report.txt
 
   cutadapt -j ${task.cpus} \
-           -u -10 \
+           -a "A{100}" \
            -o ${file_id}_cut_R1.fastq.gz \
            ${file_id}_tmp_R1.fastq.gz \
               > ${file_id}_second_report.txt
@@ -140,13 +158,13 @@ process rRNA_removal {
 bowtie2 --sensitive -p ${task.cpus} -x ${index_id} \
 -1 ${reads[0]} -2 ${reads[1]} --no-unal \
 --un-conc-gz ${file_id}_R%.fastq.gz 2> \
-${file_id}_bowtie2_report.txt | samtools view -bS - \
+${file_id}_filter.txt | samtools view -bS - \
 | samtools sort -@ ${task.cpus} -o ${file_id}.filter.bam \
             && samtools index ${file_id}.filter.bam \
             && samtools idxstats ${file_id}.filter.bam  > \
                ${file_id}.filter.stats
 
-if grep -q "Error " ${file_id}_bowtie2_report.txt; then
+if grep -q "Error " ${file_id}_filter.txt; then
   exit 1
 fi
 """
@@ -185,7 +203,7 @@ process hisat2_human {
     file index from GENOME_INDEX.toList()
 
   output:
-    file "*.fastq.gz" into HISAT_UNALIGNED
+    set file_id, "*.fastq.gz" into HISAT_UNALIGNED
     set file_id, "*.{bam,bam.bai}" into HISAT_ALIGNED
     file "*.txt" into HISAT_LOG
 
@@ -203,12 +221,12 @@ hisat2 -x ${index_id} \
        -2 ${fastq_filtred[1]} \
        --un-conc-gz ${file_id}_notaligned_R%.fastq.gz \
        --rna-strandness 'F' \
-       2> ${file_id}_hisat2_hg38.txt \
+       2> ${file_id}.txt \
 | samtools view -bS -F 4 - \
 | samtools sort -@ ${task.cpus} -o ${file_id}.bam \
 && samtools index ${file_id}.bam
 
-if grep -q "ERR" ${file_id}_hisat2_hg38.txt; then
+if grep -q "ERR" ${file_id}.txt; then
   exit 1
 fi
 """
@@ -248,7 +266,7 @@ process dedup_genome {
   set file_id, file(bam) from HISAT_ALIGNED_DEDUP
 
   output:
-  set file_id, "*.bam" into DEDUP_GENOME
+  set file_id, "*.{bam, bai}" into DEDUP_GENOME
   file "*.log" into DEDUP_LOG
 
   when:
@@ -265,19 +283,16 @@ if (! params.do_dedup) {
   HISAT_ALIGNED_DEDUP.into{DEDUP_GENOME}
 }
 
-
-
-
-/*                   HTseq
+/*                   HTseq */
 
 process sort_bam {
   tag "$file_id"
 
   input:
-    set file_id, file(bam) from sorted_bam_htseq
+    set file_id, file(bam) from DEDUP_GENOME
 
   output:
-    set file_id, "*_htseq.bam" into sorted_bam_files_2
+    set file_id, "*_htseq.bam" into SORTED_NAME_GENOME
 
   script:
 """
@@ -285,24 +300,16 @@ samtools sort -@ ${task.cpus} -n -O BAM -o ${file_id}_htseq.bam ${bam[0]}
 """
 }
 
-params.gtf = "$baseDir/data/annotation/*.gtf"
-log.info "gtf files : ${params.gtf}"
-
-Channel
-  .fromPath( params.gtf )
-  .ifEmpty { error "Cannot find any gtf file matching: ${params.gtf}" }
-  .set { gtf_file }
-
 process counting {
   tag "$file_id"
   publishDir "${params.output}/04_HTseq/", mode: 'copy'
 
   input:
-  set file_id, file(bam) from sorted_bam_files_2
-  file gtf from gtf_file.toList()
+  set file_id, file(bam) from SORTED_NAME_GENOME
+  file gtf from GTF_FILE.toList()
 
   output:
-  file "*.count" into count_files
+  file "*.count" into HTSEQ_COUNT
 
   script:
 """
@@ -322,11 +329,55 @@ htseq-count ${bam[0]} ${gtf} \
             -t exon \
             -i gene_id \
             -f bam \
-> ${file_id}_exon.count
+> ${file_id}.count
 
 """
 }
 
+/* HISAT2 POST_GENOMIC */
+
+process hisat2_postGenomic {
+  tag "$file_id"
+  publishDir "${params.output}/05_post_genome_hisat2/", mode: 'copy'
+
+  input:
+    set file_id, file(fastq_unaligned) from HISAT_UNALIGNED
+    file index2 from POSTGENOME_INDEX.toList()
+
+  output:
+    set file_id, "*.{bam,bam.bai}" into POSTGENOME_ALIGNED
+    file "*.txt" into POSTGENOME_LOG
+
+  when:
+  params.do_postgenome
+
+  script:
+  index2_id = index2[0]
+  for (index2_file in index2) {
+    if (index2_file =~ /.*\.1\.ht2/ && !(index2_file =~ /.*\.rev\.1\.ht2/)) {
+        index2_id = ( index2_file =~ /(.*)\.1\.ht2/)[0][1]
+    }
+  }
+"""
+hisat2 -x ${index2_id} \
+       -p ${task.cpus} \
+       -1 ${fastq_unaligned[0]} \
+       -2 ${fastq_unaligned[1]} \
+       --rna-strandness 'F' \
+       2> ${file_id}.txt \
+| samtools view -bS -F 4 - \
+| samtools sort -@ ${task.cpus} -o ${file_id}.bam \
+&& samtools index ${file_id}.bam
+
+if grep -q "ERR" ${file_id}.txt; then
+  exit 1
+fi
+"""
+}
+
+POSTGENOME_ALIGNED.into{POSTGENOME_ALIGNED_FASTQC;
+                   POSTGENOME_ALIGNED_DEDUP}
+/*
 Channel
    .fromFilePairs(params.script_cov)
    .ifEmpty { error "Cannot find any file matching: ${params.script_cov}" }
@@ -368,7 +419,8 @@ process multiqc {
   file ('fastqc/*') from OUTPUT_FASTQC_FILTER.collect().ifEmpty([])
   file ('*') from HISAT_LOG.collect().ifEmpty([])
   file ('fastqc/*') from OUTPUT_FASTQC_GENOME.collect().ifEmpty([])
-
+  file ('*') from HTSEQ_COUNT.collect().ifEmpty([])
+  file ('*') from POSTGENOME_LOG.collect().ifEmpty([])
 
   output:
   file "multiqc_report.html" into multiqc_report
-- 
GitLab