From aa52ea7efda8887f5112cf168f44fbe35afe4114 Mon Sep 17 00:00:00 2001
From: elabaron <emmanuel.labaronne@ens-lyon.fr>
Date: Tue, 10 Nov 2020 16:45:02 +0100
Subject: [PATCH] src/RibosomeProfiling.nf : fix coverage normalisation

---
 src/RNAseq.config        |  22 +++++-
 src/RibosomeProfiling.nf | 141 +++++++++++++++++++++++++++------------
 2 files changed, 118 insertions(+), 45 deletions(-)

diff --git a/src/RNAseq.config b/src/RNAseq.config
index 1ec50488..5a4cf78f 100644
--- a/src/RNAseq.config
+++ b/src/RNAseq.config
@@ -61,6 +61,15 @@ profiles {
           time = "12h"
           queue = 'CLG6242deb384A,CLG6242deb384C,CLG5218deb192A,CLG5218deb192B,CLG5218deb192C,CLG5218deb192D,SLG6142deb384A,SLG6142deb384B,SLG6142deb384C,SLG6142deb384D'
       }
+      withName: calc_scalingFactor{
+          container = "lbmc/hisat2:2.1.0"
+          executor = "sge"
+          clusterOptions = "-cwd -V"
+          memory = "20GB"
+          cpus = 1
+          time = "12h"
+          queue = 'CLG6242deb384A,CLG6242deb384C,CLG5218deb192A,CLG5218deb192B,CLG5218deb192C,CLG5218deb192D,SLG6142deb384A,SLG6142deb384B,SLG6142deb384C,SLG6142deb384D'
+      }
       withName: coverage {
           container = "lbmc/deeptools:3.0.2"
           executor = "sge"
@@ -71,7 +80,16 @@ profiles {
           time = "12h"
           queue = 'CLG6242deb384A,CLG6242deb384C,CLG5218deb192A,CLG5218deb192B,CLG5218deb192C,CLG5218deb192D,SLG6142deb384A,SLG6142deb384B,SLG6142deb384C,SLG6142deb384D'
       }
-      withName: coverage_postgenome {
+      withName: calc_scalingFactor_with_postgenome{
+          container = "lbmc/hisat2:2.1.0"
+          executor = "sge"
+          clusterOptions = "-cwd -V"
+          memory = "20GB"
+          cpus = 1
+          time = "12h"
+          queue = 'CLG6242deb384A,CLG6242deb384C,CLG5218deb192A,CLG5218deb192B,CLG5218deb192C,CLG5218deb192D,SLG6142deb384A,SLG6142deb384B,SLG6142deb384C,SLG6142deb384D'
+      }
+      withName: coverage_with_postgenome {
           container = "lbmc/deeptools:3.0.2"
           executor = "sge"
           clusterOptions = "-cwd -V"
@@ -101,7 +119,7 @@ profiles {
           time = "12h"
           queue = 'CLG6242deb384A,CLG6242deb384C,CLG5218deb192A,CLG5218deb192B,CLG5218deb192C,CLG5218deb192D,SLG6142deb384A,SLG6142deb384B,SLG6142deb384C,SLG6142deb384D'
       }
-      withName: hisat2_postGenomic {
+      withName: hisat2_postgenome {
           container = "lbmc/hisat2:2.1.0"
           executor = "sge"
           clusterOptions = "-cwd -V"
diff --git a/src/RibosomeProfiling.nf b/src/RibosomeProfiling.nf
index 0f002d48..bb19b1c5 100644
--- a/src/RibosomeProfiling.nf
+++ b/src/RibosomeProfiling.nf
@@ -440,7 +440,7 @@ rnaseqc ${gtf} ${bam[0]} -s ${file_id} ./
 /* HISAT2 POST_GENOMIC */
 /////////////////////////
 
-process hisat2_postGenomic {
+process hisat2_postgenome {
 tag "$file_id"
 publishDir "${params.output}/05_post_genome_hisat2/", mode: 'copy'
 
@@ -550,50 +550,105 @@ process fastqc_postgenome {
 /* Coverage */
 //////////////
 
-process coverage_postgenome {
- tag "$file_id"
- publishDir "${params.output}/07_coverage/", mode: 'copy'
-
- input:
- set file_id, file(bam) from HISAT_ALIGNED_COV
- set file_id_2, file(post_genome) from POSTGENOME_ALIGNED_COV 
-
- output:
- file "*.bigwig" into COVERAGE_OUTPUT
-
- when:
- params.do_postgenome
-
- shell:
- '''
- genome=$(samtools view !{bam[0]} | awk '{print $1}' | sort | uniq | wc -l)
- postgenome=$(samtools view !{post_genome[0]} | awk '{print $1}' | sort | uniq | wc -l)
- total=$(($genome + $postgenome))
- factor=$(awk -v c=$total 'BEGIN {print 1000000/c}')
- bamCoverage -p !{task.cpus} --binSize 1 -b !{bam[0]} -o !{file_id}_genome.bigwig
- bamCoverage -p !{task.cpus} --binSize 1 -b !{post_genome[0]} -o !{file_id}_postgenome.bigwig
- '''
-}
+if (params.do_postgenome){
+        HISAT_ALIGNED_COV.join(POSTGENOME_ALIGNED_COV)
+                         .into{COVERAGE_MERGED_VALUES;COVERAGE_MERGED_BAM}
+
+        process calc_scalingFactor_with_postgenome{
+         tag "$file_id"
+
+         input:
+         set file_id, file(genome), file(post_genome) from COVERAGE_MERGED_VALUES
+
+         output:
+         set file_id, env(genome), env(postgenome), env(factor) into COVERAGE_MERGED_FACTOR
+
+         shell:
+         '''
+         genome=$(samtools view !{genome[0]} | awk '{print $1}' | sort | uniq | wc -l)
+         postgenome=$(samtools view !{post_genome[0]} | awk '{print $1}' | sort | uniq | wc -l)
+         total=$(($genome + $postgenome))
+         factor=$(awk -v c=$total 'BEGIN {print 1000000/c}')
+         '''
+        }
+
+        COVERAGE_MERGED_FACTOR.join(COVERAGE_MERGED_BAM)
+                              .set{COVERAGE_MERGED_BIGWIG}
+
+        process coverage_with_postgenome{
+         tag "$file_id"
+         publishDir "${params.output}/07_coverage/", mode: 'copy'
+
+         input:
+         set file_id, val(genome), val(postgenome), val (factor),  file(genome_bam), file(post_genome_bam) from COVERAGE_MERGED_BIGWIG
+
+         output:
+         file "*.bigwig" into COVERAGE_OUTPUT
+         file "*.txt" into COVERAGE_LOG
+
+         shell:
+         '''
+         total=$((!{genome} + !{postgenome}))
+         echo "genome aligment : !{genome} counts" >> !{file_id}.txt
+         echo "postgenome aligment : !{postgenome} counts" >> !{file_id}.txt
+         echo "total counts : $total" >> !{file_id}.txt
+         echo "scaling factor : !{factor}" >> !{file_id}.txt
+         if [ !{genome} -gt 0 ]
+         then 
+           bamCoverage -p !{task.cpus} --binSize 1  --scaleFactor !{factor} -b !{genome_bam[0]} -o !{file_id}_genome.bigwig
+         fi
+         if [ !{postgenome} -gt 0 ]
+         then
+           bamCoverage -p !{task.cpus} --binSize 1  --scaleFactor !{factor} -b !{post_genome_bam[0]} -o !{file_id}_postgenome.bigwig
+         fi
+         '''
+        }
+} else {
+        HISAT_ALIGNED_COV.into{COVERAGE_MERGED_VALUES;COVERAGE_MERGED_BAM}
+
+        process calc_scalingFactor{
+         tag "$file_id"
+
+         input:
+         set file_id, file(genome) from COVERAGE_MERGED_VALUES
+
+         output:
+         set file_id, env(genome), env(factor) into COVERAGE_MERGED_FACTOR
+
+         shell:
+         '''
+         genome=$(samtools view !{genome[0]} | awk '{print $1}' | sort | uniq | wc -l)
+         factor=$(awk -v c=$genome 'BEGIN {print 1000000/c}')
+         '''
+        }
 
-process coverage {
- tag "$file_id"
- publishDir "${params.output}/07_coverage/", mode: 'copy'
-
- input:
- set file_id, file(bam) from HISAT_ALIGNED_COV_2
-
- output:
- file "*.bigwig" into COVERAGE_OUTPUT_2
- 
- when:
- params.do_postgenome==false
- shell:
- '''
- genome=$(samtools view !{bam[0]} | awk '{print $1}' | sort | uniq | wc -l)
- factor=$(awk -v c=$genome 'BEGIN {print 1000000/c}')
- bamCoverage -p !{task.cpus} --binSize 1 -b !{bam[0]} -o !{file_id}.bigwig
- '''
+        COVERAGE_MERGED_FACTOR.join(COVERAGE_MERGED_BAM)
+                              .set{COVERAGE_MERGED_BIGWIG}
+
+
+        process coverage {
+         tag "$file_id"
+         publishDir "${params.output}/07_coverage/", mode: 'copy'
+
+         input:
+         set file_id, val(genome), val (factor),  file(genome_bam) from COVERAGE_MERGED_BIGWIG
+
+         output:
+         file "*.bigwig" into COVERAGE_OUTPUT
+         file "*.txt" into COVERAGE_LOG
+
+         shell:
+         '''
+         echo "genome aligment : !{genome} counts" >> !{file_id}.txt
+         echo "scaling factor : !{factor}" >> !{file_id}.txt
+         if [ !{genome} -gt 0 ]
+         then 
+           bamCoverage -p !{task.cpus} --binSize 1  --scaleFactor !{factor} -b !{genome_bam[0]} -o !{file_id}_genome.bigwig
+         fi
+         '''
+        }
 }
+
 /////////////
 /* MultiQC */
 /////////////
-- 
GitLab