add normalized coverage in the pipeline

8635caa8 · elabaron · 7fa9c529 · 8635caa8 · 8635caa8 · 8635caa8
Commit 8635caa8 authored 4 years ago by elabaron
--- a/src/RNAseq.config
+++ b/src/RNAseq.config
@@ -71,6 +71,16 @@ profiles {
          time = "12h"
          queue = 'CLG6242deb384A,CLG6242deb384C,CLG5218deb192A,CLG5218deb192B,CLG5218deb192C,CLG5218deb192D,SLG6142deb384A,SLG6142deb384B,SLG6142deb384C,SLG6142deb384D'
      }
+      withName: coverage_postgenome {
+          container = "lbmc/deeptools:3.0.2"
+          executor = "sge"
+          clusterOptions = "-cwd -V"
+          memory = "20GB"
+          cpus = 16
+  	  penv = 'openmp16'
+          time = "12h"
+          queue = 'CLG6242deb384A,CLG6242deb384C,CLG5218deb192A,CLG5218deb192B,CLG5218deb192C,CLG5218deb192D,SLG6142deb384A,SLG6142deb384B,SLG6142deb384C,SLG6142deb384D'
+      }
      withName: fastqc_genome {
          container = "lbmc/fastqc:0.11.5"
          executor = "sge"
@@ -91,6 +101,16 @@ profiles {
          time = "12h"
          queue = 'CLG6242deb384A,CLG6242deb384C,CLG5218deb192A,CLG5218deb192B,CLG5218deb192C,CLG5218deb192D,SLG6142deb384A,SLG6142deb384B,SLG6142deb384C,SLG6142deb384D'
      }
+      withName: hisat2_postGenomic {
+          container = "lbmc/hisat2:2.1.0"
+          executor = "sge"
+          clusterOptions = "-cwd -V"
+          memory = "20GB"
+          cpus = 16
+  	  penv = 'openmp16'
+          time = "12h"
+          queue = 'CLG6242deb384A,CLG6242deb384C,CLG5218deb192A,CLG5218deb192B,CLG5218deb192C,CLG5218deb192D,SLG6142deb384A,SLG6142deb384B,SLG6142deb384C,SLG6142deb384D'
+      }
      withName: dedup_postgenome {
          container = "lbmc/hisat2:2.1.0"
          executor = "sge"
@@ -137,6 +157,16 @@ profiles {
          time = "12h"
          queue = 'CLG6242deb384A,CLG6242deb384C,CLG5218deb192A,CLG5218deb192B,CLG5218deb192C,CLG5218deb192D,SLG6142deb384A,SLG6142deb384B,SLG6142deb384C,SLG6142deb384D'
      }
+      withName: fastqc_postgenome {
+          container = "lbmc/fastqc:0.11.5"
+          executor = "sge"
+          clusterOptions = "-cwd -V"
+          cpus = 4
+          penv = 'openmp4'
+          memory = "20GB"
+          time = "12h"
+          queue = 'CLG6242deb384A,CLG6242deb384C,CLG5218deb192A,CLG5218deb192B,CLG5218deb192C,CLG5218deb192D,SLG6142deb384A,SLG6142deb384B,SLG6142deb384C,SLG6142deb384D'
+      }
    }
  }
  docker {

--- a/src/RNAseq.nf
+++ b/src/RNAseq.nf
@@ -153,7 +153,7 @@ process trimming {
              > ${file_id}_second_report.txt

  cutadapt -j ${task.cpus} \
-         -u -13 \
+         -u -60 \
         -o ${file_id}_cut_R2.fastq.gz \
         ${file_id}_tmp_R2.fastq.gz \
           > ${file_id}_third_report.txt
@@ -305,7 +305,8 @@ fi

 HISAT_ALIGNED.into{HISAT_ALIGNED_FASTQC;
                   HISAT_ALIGNED_DEDUP;
-                   HISAT_ALIGNED_COV}
+                   HISAT_ALIGNED_COV;
+                   HISAT_ALIGNED_COV_2}

 ////////////////////////////
 /* Fastqc of genome reads */
@@ -329,27 +330,6 @@ process fastqc_genome {
    """
 }

-
-//////////////
-/* Coverage */
-//////////////
-
-process coverage {
- tag "$file_id"
- publishDir "${params.output}/03_hisat2/coverage/", mode: 'copy'
-
- input:
- set file_id, file(bam) from HISAT_ALIGNED_COV
-
- output:
- file "*.bigwig" into COVERAGE_OUTPUT
-
- """
- bamCoverage -p ${task.cpus} --binSize 1 -b ${bam[0]} -o ${file_id}.bigwig
-
- """
-}
-
 ////////////////////////////
 /* Deduplication of reads */
 ////////////////////////////
@@ -516,7 +496,8 @@ fi
 }

 POSTGENOME_ALIGNED.into{POSTGENOME_ALIGNED_FASTQC;
-                   POSTGENOME_ALIGNED_DEDUP}
+                   POSTGENOME_ALIGNED_DEDUP;
+                   POSTGENOME_ALIGNED_COV}

 ////////////////////////////
 /* Deduplication of reads */
@@ -574,6 +555,55 @@ process fastqc_postgenome {
 //////////////////////////// POST PROCESS //////////////////////////////
 ////////////////////////////////////////////////////////////////////////

+//////////////
+/* Coverage */
+//////////////
+
+process coverage_postgenome {
+ tag "$file_id"
+ publishDir "${params.output}/07_coverage/", mode: 'copy'
+
+ input:
+ set file_id, file(bam) from HISAT_ALIGNED_COV
+ set file_id_2, file(post_genome) from POSTGENOME_ALIGNED_COV
+
+ output:
+ file "*.bigwig" into COVERAGE_OUTPUT
+
+ when:
+ params.do_postgenome
+
+ shell:
+ '''
+ genome=$(samtools view !{bam[0]} | awk '{print $1}' | sort | uniq | wc -l)
+ postgenome=$(samtools view !{post_genome[0]} | awk '{print $1}' | sort | uniq | wc -l)
+ total=$(($genome + $postgenome))
+ factor=$(awk -v c=$total 'BEGIN {print 1000000/c}')
+ bamCoverage -p !{task.cpus} --binSize 1 -b !{bam[0]} -o !{file_id}_genome.bigwig
+ bamCoverage -p !{task.cpus} --binSize 1 -b !{post_genome[0]} -o !{file_id}_postgenome.bigwig
+ '''
+}
+
+process coverage {
+ tag "$file_id"
+ publishDir "${params.output}/07_coverage/", mode: 'copy'
+
+ input:
+ set file_id, file(bam) from HISAT_ALIGNED_COV_2
+
+ output:
+ file "*.bigwig" into COVERAGE_OUTPUT_2
+
+ when:
+ params.do_postgenome==false
+ shell:
+ '''
+ genome=$(samtools view !{bam[0]} | awk '{print $1} | sort | uniq | wc -l)
+ factor=$(awk -v c=$genome 'BEGIN {print 1000000/c}')
+ bamCoverage -p !{task.cpus} --binSize 1 -b !{bam[0]} -o !{file_id}.bigwig
+ '''
+}
+

 /////////////
 /* MultiQC */
@@ -585,16 +615,16 @@ process multiqc {
  publishDir "${params.output}/multiqc", mode: 'copy'

  input:
-  file ('fastqc/*') from OUTPUT_FASTQC_RAW.collect().ifEmpty([])
+//  file ('fastqc/*') from OUTPUT_FASTQC_RAW.collect().ifEmpty([])
  file ('*') from CUTADAPT_LOG.collect().ifEmpty([])
  file ('fastqc/*') from OUTPUT_FASTQC_CUT.collect().ifEmpty([])
  file ('*') from FILTER_LOG.collect().ifEmpty([])
-  file ('fastqc/*') from OUTPUT_FASTQC_FILTER.collect().ifEmpty([])
+//  file ('fastqc/*') from OUTPUT_FASTQC_FILTER.collect().ifEmpty([])
  file ('*') from HISAT_LOG.collect().ifEmpty([])
-  file ('fastqc/*') from OUTPUT_FASTQC_GENOME.collect().ifEmpty([])
+//  file ('fastqc/*') from OUTPUT_FASTQC_GENOME.collect().ifEmpty([])
  file ('*') from HTSEQ_COUNT.collect().ifEmpty([])
  file ('*') from POSTGENOME_LOG.collect().ifEmpty([])
-  file ('fastqc/*') from OUTPUT_FASTQC_POSTGENOME.collect().ifEmpty([])
+//  file ('fastqc/*') from OUTPUT_FASTQC_POSTGENOME.collect().ifEmpty([])
  file ('*') from RNASEQC_OUTPUT.collect().ifEmpty([])

  output:

--- a/src/RibosomeProfiling.nf
+++ b/src/RibosomeProfiling.nf
 /*
-*	RNAseq Analysis pipeline
+*	Ribosome Profiling Analysis pipeline
 */

 //////////////////////////////////////////////////////
@@ -281,8 +281,8 @@ hisat2 -x ${index_id} \
       --trim3 1\
       2> ${file_id}_genome.txt \
 | samtools view -bS -F 4 - \
-| samtools sort -@ ${task.cpus} -o ${file_id}.bam \
-&& samtools index ${file_id}.bam
+| samtools sort -@ ${task.cpus} -o ${file_id}_genome.bam \
+&& samtools index ${file_id}_genome.bam

 if grep -q "ERR" ${file_id}.txt; then
  exit 1
@@ -292,7 +292,8 @@ fi

 HISAT_ALIGNED.into{HISAT_ALIGNED_FASTQC;
                   HISAT_ALIGNED_DEDUP;
-                   HISAT_ALIGNED_COV}
+                   HISAT_ALIGNED_COV;
+                   HISAT_ALIGNED_COV_2}

 ////////////////////////////
 /* Fastqc of genome reads */
@@ -316,112 +317,95 @@ process fastqc_genome {
    """
 }

-//////////////
-/* Coverage */
-//////////////
-
-process coverage {
- tag "$file_id"
- publishDir "${params.output}/03_hisat2/coverage/", mode: 'copy'
-
- input:
- set file_id, file(bam) from HISAT_ALIGNED_COV
-
- output:
- file "*.bigwig" into COVERAGE_OUTPUT
-
- """
- bamCoverage -p ${task.cpus} --binSize 1 -b ${bam[0]} -o ${file_id}.bigwig
- """
-}

 ////////////////////////////
 /* Deduplication of reads */
 ////////////////////////////

 if (params.do_dedup) {
-  process dedup_genome {
-    tag "$file_id"
-    publishDir "${params.output}/03_hisat2/dedup/", mode: 'copy'
-
-    input:
-    set file_id, file(bam) from HISAT_ALIGNED_DEDUP
-
-    output:
-    set file_id, "*.{bam, bai}" into DEDUP_GENOME
-    file "*.log" into DEDUP_LOG
-
-    when:
-    params.do_dedup
-    
-    shell:
-      '''
-      samtools view -h !{bam[0]} | awk '!seen[substr($1, length($1)-5) $3 $4 $10]++' | samtools view -bS -o !{file_id}_dedup.bam
-      input=$(samtools view -h !{bam[0]} | wc -l)
-      output=$(samtools view -h !{bam[0]} | awk '!seen[substr($1, length($1)-5) $3 $4 $10]++' | wc -l)
-      diff=$(($input - $output))
-      per=$(($diff * 100 / $input))
-      echo "Input : $input reads" > !{file_id}_dedup.log
-      echo "Output : $output reads" >> !{file_id}_dedup.log
-      echo "$per % duplicated reads" >> !{file_id}_dedup.log
-      '''
-  }
+process dedup_genome {
+tag "$file_id"
+publishDir "${params.output}/03_hisat2/dedup/", mode: 'copy'
+
+input:
+set file_id, file(bam) from HISAT_ALIGNED_DEDUP
+
+output:
+set file_id, "*.{bam, bai}" into DEDUP_GENOME
+file "*.log" into DEDUP_LOG
+
+when:
+params.do_dedup
+
+shell:
+'''
+samtools view -h !{bam[0]} | awk '!seen[substr($1, length($1)-5) $3 $4 $10]++' | samtools view -bS -o !{file_id}_dedup.bam
+samtools index !{file_id}_dedup.bam
+input=$(samtools view -h !{bam[0]} | wc -l)
+output=$(samtools view -h !{bam[0]} | awk '!seen[substr($1, length($1)-5) $3 $4 $10]++' | wc -l)
+diff=$(($input - $output))
+per=$(($diff * 100 / $input))
+echo "Input : $input reads" > !{file_id}_dedup.log
+echo "Output : $output reads" >> !{file_id}_dedup.log
+echo "$per % duplicated reads" >> !{file_id}_dedup.log
+'''
+}
 } else {
-  HISAT_ALIGNED_DEDUP.set{DEDUP_GENOME}
+HISAT_ALIGNED_DEDUP.set{DEDUP_GENOME}
 }

 DEDUP_GENOME.into{DEDUP_GENOME_HTSEQ;
-                  DEDUP_GENOME_RNASEQC
-                }
+	  DEDUP_GENOME_RNASEQC
+	}

 ///////////
 /* HTseq */
 ///////////

 process sort_bam {
-  tag "$file_id"
+tag "$file_id"

-  input:
-    set file_id, file(bam) from DEDUP_GENOME_HTSEQ
+input:
+set file_id, file(bam) from DEDUP_GENOME_HTSEQ

-  output:
-    set file_id, "*_htseq.bam" into SORTED_NAME_GENOME
+output:
+set file_id, "*_htseq.bam" into SORTED_NAME_GENOME

-  script:
+script:
 """
 samtools sort -@ ${task.cpus} -n -O BAM -o ${file_id}_htseq.bam ${bam[0]}
 """
 }

 process counting {
-  tag "$file_id"
-  publishDir "${params.output}/04_HTseq/", mode: 'copy'
+tag "$file_id"
+publishDir "${params.output}/04_HTseq/", mode: 'copy'

-  input:
-  set file_id, file(bam) from SORTED_NAME_GENOME
-  file gtf from GTF_FILE.toList()
+input:
+set file_id, file(bam) from SORTED_NAME_GENOME
+file gtf from GTF_FILE.toList()

-  output:
-  file "*.count" into HTSEQ_COUNT
+output:
+file "*.count" into HTSEQ_COUNT

-  script:
+script:
 """
 htseq-count ${bam[0]} ${gtf} \
-            --mode=intersection-nonempty \
-            -a 10 \
-            -s yes \
-            -t CDS \
-            -i gene_id \
-            -f bam \
+    --mode=intersection-nonempty \
+    -a 10 \
+    -s yes \
+    -t CDS \
+    -i gene_id \
+    -f bam \
 > ${file_id}_CDS.count

 htseq-count ${bam[0]} ${gtf} \
-            --mode=intersection-nonempty \
-            -a 10 \
-            -s yes \
-            -t exon \
-            -i gene_id \
-            -f bam \
+    --mode=intersection-nonempty \
+    -a 10 \
+    -s yes \
+    -t exon \
+    -i gene_id \
+    -f bam \
 > ${file_id}.count

 """
@@ -432,17 +416,17 @@ htseq-count ${bam[0]} ${gtf} \
 ///////////////

 process rnaseq_qc {
-  tag "$file_id"
-  publishDir "${params.output}/06_RNAseqQC/", mode: 'copy'
+tag "$file_id"
+publishDir "${params.output}/06_RNAseqQC/", mode: 'copy'

-  input:
-    set file_id, file(bam) from DEDUP_GENOME_RNASEQC
-    file (gtf) from GTF_COLLAPSE.collect()
+input:
+set file_id, file(bam) from DEDUP_GENOME_RNASEQC
+file (gtf) from GTF_COLLAPSE.collect()

-  output:
-    file "*" into RNASEQC_OUTPUT
+output:
+file "*" into RNASEQC_OUTPUT

-  script:
+script:
 """
 rnaseqc ${gtf} ${bam[0]} -s ${file_id} ./
 """
@@ -457,49 +441,50 @@ rnaseqc ${gtf} ${bam[0]} -s ${file_id} ./
 /////////////////////////

 process hisat2_postGenomic {
-  tag "$file_id"
-  publishDir "${params.output}/05_post_genome_hisat2/", mode: 'copy'
+tag "$file_id"
+publishDir "${params.output}/05_post_genome_hisat2/", mode: 'copy'

-  input:
-    set file_id, file(fastq_unaligned) from FILTER_FASTQ_POSTGENOME
-    file index2 from POSTGENOME_INDEX.collect()
+input:
+set file_id, file(fastq_unaligned) from FILTER_FASTQ_POSTGENOME
+file index2 from POSTGENOME_INDEX.collect()

-  output:
-    set file_id, "*.{bam,bam.bai}" into POSTGENOME_ALIGNED
-    file "*_postgenome.txt" into POSTGENOME_LOG
+output:
+set file_id, "*.{bam,bam.bai}" into POSTGENOME_ALIGNED
+file "*_postgenome.txt" into POSTGENOME_LOG

-  when:
-  params.do_postgenome
+when:
+params.do_postgenome

-  script:
-  index2_id = index2[0]
-  for (index2_file in index2) {
-    if (index2_file =~ /.*\.1\.ht2/ && !(index2_file =~ /.*\.rev\.1\.ht2/)) {
-        index2_id = ( index2_file =~ /(.*)\.1\.ht2/)[0][1]
-    }
-  }
+script:
+index2_id = index2[0]
+for (index2_file in index2) {
+if (index2_file =~ /.*\.1\.ht2/ && !(index2_file =~ /.*\.rev\.1\.ht2/)) {
+index2_id = ( index2_file =~ /(.*)\.1\.ht2/)[0][1]
+}
+}
 """
 hisat2 -x ${index2_id} \
-       -p ${task.cpus} \
-       -U ${fastq_unaligned[0]} \
-       --rna-strandness ${params.strand} \
-       --dta\
-       --no-softclip\
-       --trim3 1\
-       --trim5 1\
-       2> ${file_id}_postgenome.txt \
+-p ${task.cpus} \
+-U ${fastq_unaligned[0]} \
+--rna-strandness ${params.strand} \
+--dta \
+--no-softclip \
+--trim3 1 \
+--trim5 1 \
+2> ${file_id}_postgenome.txt \
 | samtools view -bS -F 4 -F 256 - \
-| samtools sort -@ ${task.cpus} -o ${file_id}.bam \
-&& samtools index ${file_id}.bam
+| samtools sort -@ ${task.cpus} -o ${file_id}_postgenome.bam \
+&& samtools index ${file_id}_postgenome.bam

 if grep -q "ERR" ${file_id}_postgenome.txt; then
-  exit 1
+exit 1
 fi
 """
 }

 POSTGENOME_ALIGNED.into{POSTGENOME_ALIGNED_FASTQC;
-                   POSTGENOME_ALIGNED_DEDUP}
+		POSTGENOME_ALIGNED_DEDUP;
+		POSTGENOME_ALIGNED_COV}

 ////////////////////////////
 /* Deduplication of reads */
@@ -523,6 +508,7 @@ if (params.do_dedup){
    shell:
    '''
    samtools view -h !{bam[0]} | awk '!seen[substr($1, length($1)-5) $3 $4 $10]++' | samtools view -bS -o !{file_id}_dedup.bam
+    samtools index !{file_id}_dedup.bam
    input=$(samtools view -h !{bam[0]} | wc -l)
    output=$(samtools view -h !{bam[0]} | awk '!seen[substr($1, length($1)-5) $3 $4 $10]++' | wc -l)
    diff=$(($input - $output))
@@ -560,6 +546,54 @@ process fastqc_postgenome {
 ////////////////////////////////////////////////////////////////////////


+//////////////
+/* Coverage */
+//////////////
+
+process coverage_postgenome {
+ tag "$file_id"
+ publishDir "${params.output}/07_coverage/", mode: 'copy'
+
+ input:
+ set file_id, file(bam) from HISAT_ALIGNED_COV
+ set file_id_2, file(post_genome) from POSTGENOME_ALIGNED_COV 
+
+ output:
+ file "*.bigwig" into COVERAGE_OUTPUT
+
+ when:
+ params.do_postgenome
+
+ shell:
+ '''
+ genome=$(samtools view !{bam[0]} | awk '{print $1}' | sort | uniq | wc -l)
+ postgenome=$(samtools view !{post_genome[0]} | awk '{print $1}' | sort | uniq | wc -l)
+ total=$(($genome + $postgenome))
+ factor=$(awk -v c=$total 'BEGIN {print 1000000/c}')
+ bamCoverage -p !{task.cpus} --binSize 1 -b !{bam[0]} -o !{file_id}_genome.bigwig
+ bamCoverage -p !{task.cpus} --binSize 1 -b !{post_genome[0]} -o !{file_id}_postgenome.bigwig
+ '''
+}
+
+process coverage {
+ tag "$file_id"
+ publishDir "${params.output}/07_coverage/", mode: 'copy'
+
+ input:
+ set file_id, file(bam) from HISAT_ALIGNED_COV_2
+
+ output:
+ file "*.bigwig" into COVERAGE_OUTPUT_2
+ 
+ when:
+ params.do_postgenome==false
+ shell:
+ '''
+ genome=$(samtools view !{bam[0]} | awk '{print $1} | sort | uniq | wc -l)
+ factor=$(awk -v c=$genome 'BEGIN {print 1000000/c}')
+ bamCoverage -p !{task.cpus} --binSize 1 -b !{bam[0]} -o !{file_id}.bigwig
+ '''
+}
 /////////////
 /* MultiQC */
 /////////////
@@ -570,16 +604,16 @@ process multiqc {
  publishDir "${params.output}/multiqc", mode: 'copy'

  input:
-  file ('fastqc/*') from OUTPUT_FASTQC_RAW.collect().ifEmpty([])
+//  file ('fastqc/*') from OUTPUT_FASTQC_RAW.collect().ifEmpty([])
  file ('*') from CUTADAPT_LOG.collect().ifEmpty([])
  file ('fastqc/*') from OUTPUT_FASTQC_CUT.collect().ifEmpty([])
  file ('*') from FILTER_LOG.collect().ifEmpty([])
-  file ('fastqc/*') from OUTPUT_FASTQC_FILTER.collect().ifEmpty([])
+//  file ('fastqc/*') from OUTPUT_FASTQC_FILTER.collect().ifEmpty([])
  file ('*') from HISAT_LOG.collect().ifEmpty([])
-  file ('fastqc/*') from OUTPUT_FASTQC_GENOME.collect().ifEmpty([])
+//  file ('fastqc/*') from OUTPUT_FASTQC_GENOME.collect().ifEmpty([])
  file ('*') from HTSEQ_COUNT.collect().ifEmpty([])
  file ('*') from POSTGENOME_LOG.collect().ifEmpty([])
-  file ('fastqc/*') from OUTPUT_FASTQC_POSTGENOME.collect().ifEmpty([])
+//  file ('fastqc/*') from OUTPUT_FASTQC_POSTGENOME.collect().ifEmpty([])
  file ('*') from RNASEQC_OUTPUT.collect().ifEmpty([])

  output: