src/RNAseq.nf : adapt htseq and add post genomic alignement

bbc74d28 · elabaron · a27d714c · bbc74d28 · bbc74d28
Commit bbc74d28 authored Sep 25, 2020 by elabaron
--- a/src/RNAseq.config
+++ b/src/RNAseq.config
@@ -104,6 +104,10 @@ profiles {
        container = "lbmc/htseq:0.11.2"
        cpus = 1
      }
+      withName: hisat2_postGenomic {
+        cpus = 4
+        container = "lbmc/hisat2:2.1.0"
+      }
      withName: multiqc {
        container = "ewels/multiqc:1.9"
        cpus = 1

--- a/src/RNAseq.nf
+++ b/src/RNAseq.nf
@@ -5,18 +5,24 @@
 params.fastq_raw = "data/fastq/*{_R1,_R2}.fastq.gz"
 params.output = "results"
 params.do_fastqc = true
-params.script_cov = "src/norm_coverage.sh"
+//params.script_cov = "src/norm_coverage.sh"
 params.filter = "data/filter/human_rRNA_tRNA/*.bt2"
 params.index_genome = "data/genome/*.ht2"
 params.do_dedup = true
+params.gtf = "data/annotation/*.gtf"
+params.index_postgenome = "data/post_genome/*.ht2"
+params.do_postgenome = true

 log.info "input raw : ${params.fastq_raw}"
 log.info "outut directory : ${params.output}"
 log.info "filter index files : ${params.filter}"
 log.info "genome index : ${params.index_genome}"
+log.info "gtf file : ${params.gtf}"
+log.info "post-genome index : ${params.index_postgenome}"
 log.info ""
 log.info "do fastqc ? : ${params.do_fastqc}"
 log.info "do deduplication ? : ${params.do_dedup}"
+log.info "do post genome alignement ? : ${params.do_postgenome}"
 log.info ""

 Channel
@@ -35,6 +41,16 @@ Channel
   .ifEmpty { error "Cannot find any index files matching: ${params.index_genome}" }
   .set { GENOME_INDEX }

+Channel
+   .fromPath( params.gtf )
+   .ifEmpty { error "Cannot find any gtf file matching: ${params.gtf}" }
+   .set { GTF_FILE }
+
+Channel
+   .fromPath ( params.index_genome )
+   .ifEmpty { error "Cannot find any index files matching: ${params.index_genome}" }
+   .set { POSTGENOME_INDEX }
+
 /* Fastqc of raw input */

 process fastqc_raw {
@@ -66,7 +82,9 @@ process trimming {

  output:
  set file_id, "*cut_{R1,R2}.fastq.gz" into CUTADAPT_OUTPUT
-  file "*.txt" into CUTADAPT_LOG
+  file "*first_report.txt" into CUTADAPT_LOG
+  file "*{second,third}_report.txt" into CUTADAPT_LOG_2
+

  script:
  """
@@ -79,7 +97,7 @@ process trimming {
           ${reads[0]} ${reads[1]} > ${file_id}_first_report.txt

  cutadapt -j ${task.cpus} \
-           -u -10 \
+           -a "A{100}" \
           -o ${file_id}_cut_R1.fastq.gz \
           ${file_id}_tmp_R1.fastq.gz \
              > ${file_id}_second_report.txt
@@ -140,13 +158,13 @@ process rRNA_removal {
 bowtie2 --sensitive -p ${task.cpus} -x ${index_id} \
 -1 ${reads[0]} -2 ${reads[1]} --no-unal \
 --un-conc-gz ${file_id}_R%.fastq.gz 2> \
-${file_id}_bowtie2_report.txt | samtools view -bS - \
+${file_id}_filter.txt | samtools view -bS - \
 | samtools sort -@ ${task.cpus} -o ${file_id}.filter.bam \
            && samtools index ${file_id}.filter.bam \
            && samtools idxstats ${file_id}.filter.bam  > \
               ${file_id}.filter.stats

-if grep -q "Error " ${file_id}_bowtie2_report.txt; then
+if grep -q "Error " ${file_id}_filter.txt; then
  exit 1
 fi
 """
@@ -185,7 +203,7 @@ process hisat2_human {
    file index from GENOME_INDEX.toList()

  output:
-    file "*.fastq.gz" into HISAT_UNALIGNED
+    set file_id, "*.fastq.gz" into HISAT_UNALIGNED
    set file_id, "*.{bam,bam.bai}" into HISAT_ALIGNED
    file "*.txt" into HISAT_LOG

@@ -203,12 +221,12 @@ hisat2 -x ${index_id} \
       -2 ${fastq_filtred[1]} \
       --un-conc-gz ${file_id}_notaligned_R%.fastq.gz \
       --rna-strandness 'F' \
-       2> ${file_id}_hisat2_hg38.txt \
+       2> ${file_id}.txt \
 | samtools view -bS -F 4 - \
 | samtools sort -@ ${task.cpus} -o ${file_id}.bam \
 && samtools index ${file_id}.bam

-if grep -q "ERR" ${file_id}_hisat2_hg38.txt; then
+if grep -q "ERR" ${file_id}.txt; then
  exit 1
 fi
 """
@@ -248,7 +266,7 @@ process dedup_genome {
  set file_id, file(bam) from HISAT_ALIGNED_DEDUP

  output:
-  set file_id, "*.bam" into DEDUP_GENOME
+  set file_id, "*.{bam, bai}" into DEDUP_GENOME
  file "*.log" into DEDUP_LOG

  when:
@@ -265,19 +283,16 @@ if (! params.do_dedup) {
  HISAT_ALIGNED_DEDUP.into{DEDUP_GENOME}
 }

-
-
-
-/*                   HTseq
+/*                   HTseq */

 process sort_bam {
  tag "$file_id"

  input:
-    set file_id, file(bam) from sorted_bam_htseq
+    set file_id, file(bam) from DEDUP_GENOME

  output:
-    set file_id, "*_htseq.bam" into sorted_bam_files_2
+    set file_id, "*_htseq.bam" into SORTED_NAME_GENOME

  script:
 """
@@ -285,24 +300,16 @@ samtools sort -@ ${task.cpus} -n -O BAM -o ${file_id}_htseq.bam ${bam[0]}
 """
 }

-params.gtf = "$baseDir/data/annotation/*.gtf"
-log.info "gtf files : ${params.gtf}"
-
-Channel
-  .fromPath( params.gtf )
-  .ifEmpty { error "Cannot find any gtf file matching: ${params.gtf}" }
-  .set { gtf_file }
-
 process counting {
  tag "$file_id"
  publishDir "${params.output}/04_HTseq/", mode: 'copy'

  input:
-  set file_id, file(bam) from sorted_bam_files_2
-  file gtf from gtf_file.toList()
+  set file_id, file(bam) from SORTED_NAME_GENOME
+  file gtf from GTF_FILE.toList()

  output:
-  file "*.count" into count_files
+  file "*.count" into HTSEQ_COUNT

  script:
 """
@@ -322,11 +329,55 @@ htseq-count ${bam[0]} ${gtf} \
            -t exon \
            -i gene_id \
            -f bam \
-> ${file_id}_exon.count
+> ${file_id}.count

 """
 }

+/* HISAT2 POST_GENOMIC */
+
+process hisat2_postGenomic {
+  tag "$file_id"
+  publishDir "${params.output}/05_post_genome_hisat2/", mode: 'copy'
+
+  input:
+    set file_id, file(fastq_unaligned) from HISAT_UNALIGNED
+    file index2 from POSTGENOME_INDEX.toList()
+
+  output:
+    set file_id, "*.{bam,bam.bai}" into POSTGENOME_ALIGNED
+    file "*.txt" into POSTGENOME_LOG
+
+  when:
+  params.do_postgenome
+
+  script:
+  index2_id = index2[0]
+  for (index2_file in index2) {
+    if (index2_file =~ /.*\.1\.ht2/ && !(index2_file =~ /.*\.rev\.1\.ht2/)) {
+        index2_id = ( index2_file =~ /(.*)\.1\.ht2/)[0][1]
+    }
+  }
+"""
+hisat2 -x ${index2_id} \
+       -p ${task.cpus} \
+       -1 ${fastq_unaligned[0]} \
+       -2 ${fastq_unaligned[1]} \
+       --rna-strandness 'F' \
+       2> ${file_id}.txt \
+| samtools view -bS -F 4 - \
+| samtools sort -@ ${task.cpus} -o ${file_id}.bam \
+&& samtools index ${file_id}.bam
+
+if grep -q "ERR" ${file_id}.txt; then
+  exit 1
+fi
+"""
+}
+
+POSTGENOME_ALIGNED.into{POSTGENOME_ALIGNED_FASTQC;
+                   POSTGENOME_ALIGNED_DEDUP}
+/*
 Channel
   .fromFilePairs(params.script_cov)
   .ifEmpty { error "Cannot find any file matching: ${params.script_cov}" }
@@ -368,7 +419,8 @@ process multiqc {
  file ('fastqc/*') from OUTPUT_FASTQC_FILTER.collect().ifEmpty([])
  file ('*') from HISAT_LOG.collect().ifEmpty([])
  file ('fastqc/*') from OUTPUT_FASTQC_GENOME.collect().ifEmpty([])
-
+  file ('*') from HTSEQ_COUNT.collect().ifEmpty([])
+  file ('*') from POSTGENOME_LOG.collect().ifEmpty([])

  output:
  file "multiqc_report.html" into multiqc_report