Skip to content
Snippets Groups Projects
Commit 8635caa8 authored by elabaron's avatar elabaron
Browse files

add normalized coverage in the pipeline

parent 7fa9c529
Branches
No related tags found
No related merge requests found
......@@ -71,6 +71,16 @@ profiles {
time = "12h"
queue = 'CLG6242deb384A,CLG6242deb384C,CLG5218deb192A,CLG5218deb192B,CLG5218deb192C,CLG5218deb192D,SLG6142deb384A,SLG6142deb384B,SLG6142deb384C,SLG6142deb384D'
}
withName: coverage_postgenome {
container = "lbmc/deeptools:3.0.2"
executor = "sge"
clusterOptions = "-cwd -V"
memory = "20GB"
cpus = 16
penv = 'openmp16'
time = "12h"
queue = 'CLG6242deb384A,CLG6242deb384C,CLG5218deb192A,CLG5218deb192B,CLG5218deb192C,CLG5218deb192D,SLG6142deb384A,SLG6142deb384B,SLG6142deb384C,SLG6142deb384D'
}
withName: fastqc_genome {
container = "lbmc/fastqc:0.11.5"
executor = "sge"
......@@ -91,6 +101,16 @@ profiles {
time = "12h"
queue = 'CLG6242deb384A,CLG6242deb384C,CLG5218deb192A,CLG5218deb192B,CLG5218deb192C,CLG5218deb192D,SLG6142deb384A,SLG6142deb384B,SLG6142deb384C,SLG6142deb384D'
}
withName: hisat2_postGenomic {
container = "lbmc/hisat2:2.1.0"
executor = "sge"
clusterOptions = "-cwd -V"
memory = "20GB"
cpus = 16
penv = 'openmp16'
time = "12h"
queue = 'CLG6242deb384A,CLG6242deb384C,CLG5218deb192A,CLG5218deb192B,CLG5218deb192C,CLG5218deb192D,SLG6142deb384A,SLG6142deb384B,SLG6142deb384C,SLG6142deb384D'
}
withName: dedup_postgenome {
container = "lbmc/hisat2:2.1.0"
executor = "sge"
......@@ -137,6 +157,16 @@ profiles {
time = "12h"
queue = 'CLG6242deb384A,CLG6242deb384C,CLG5218deb192A,CLG5218deb192B,CLG5218deb192C,CLG5218deb192D,SLG6142deb384A,SLG6142deb384B,SLG6142deb384C,SLG6142deb384D'
}
withName: fastqc_postgenome {
container = "lbmc/fastqc:0.11.5"
executor = "sge"
clusterOptions = "-cwd -V"
cpus = 4
penv = 'openmp4'
memory = "20GB"
time = "12h"
queue = 'CLG6242deb384A,CLG6242deb384C,CLG5218deb192A,CLG5218deb192B,CLG5218deb192C,CLG5218deb192D,SLG6142deb384A,SLG6142deb384B,SLG6142deb384C,SLG6142deb384D'
}
}
}
docker {
......
......@@ -153,7 +153,7 @@ process trimming {
> ${file_id}_second_report.txt
cutadapt -j ${task.cpus} \
-u -13 \
-u -60 \
-o ${file_id}_cut_R2.fastq.gz \
${file_id}_tmp_R2.fastq.gz \
> ${file_id}_third_report.txt
......@@ -305,7 +305,8 @@ fi
HISAT_ALIGNED.into{HISAT_ALIGNED_FASTQC;
HISAT_ALIGNED_DEDUP;
HISAT_ALIGNED_COV}
HISAT_ALIGNED_COV;
HISAT_ALIGNED_COV_2}
////////////////////////////
/* Fastqc of genome reads */
......@@ -329,27 +330,6 @@ process fastqc_genome {
"""
}
//////////////
/* Coverage */
//////////////
process coverage {
tag "$file_id"
publishDir "${params.output}/03_hisat2/coverage/", mode: 'copy'
input:
set file_id, file(bam) from HISAT_ALIGNED_COV
output:
file "*.bigwig" into COVERAGE_OUTPUT
"""
bamCoverage -p ${task.cpus} --binSize 1 -b ${bam[0]} -o ${file_id}.bigwig
"""
}
////////////////////////////
/* Deduplication of reads */
////////////////////////////
......@@ -516,7 +496,8 @@ fi
}
POSTGENOME_ALIGNED.into{POSTGENOME_ALIGNED_FASTQC;
POSTGENOME_ALIGNED_DEDUP}
POSTGENOME_ALIGNED_DEDUP;
POSTGENOME_ALIGNED_COV}
////////////////////////////
/* Deduplication of reads */
......@@ -574,6 +555,55 @@ process fastqc_postgenome {
//////////////////////////// POST PROCESS //////////////////////////////
////////////////////////////////////////////////////////////////////////
//////////////
/* Coverage */
//////////////
process coverage_postgenome {
tag "$file_id"
publishDir "${params.output}/07_coverage/", mode: 'copy'
input:
set file_id, file(bam) from HISAT_ALIGNED_COV
set file_id_2, file(post_genome) from POSTGENOME_ALIGNED_COV
output:
file "*.bigwig" into COVERAGE_OUTPUT
when:
params.do_postgenome
shell:
'''
genome=$(samtools view !{bam[0]} | awk '{print $1}' | sort | uniq | wc -l)
postgenome=$(samtools view !{post_genome[0]} | awk '{print $1}' | sort | uniq | wc -l)
total=$(($genome + $postgenome))
factor=$(awk -v c=$total 'BEGIN {print 1000000/c}')
bamCoverage -p !{task.cpus} --binSize 1 -b !{bam[0]} -o !{file_id}_genome.bigwig
bamCoverage -p !{task.cpus} --binSize 1 -b !{post_genome[0]} -o !{file_id}_postgenome.bigwig
'''
}
process coverage {
tag "$file_id"
publishDir "${params.output}/07_coverage/", mode: 'copy'
input:
set file_id, file(bam) from HISAT_ALIGNED_COV_2
output:
file "*.bigwig" into COVERAGE_OUTPUT_2
when:
params.do_postgenome==false
shell:
'''
genome=$(samtools view !{bam[0]} | awk '{print $1} | sort | uniq | wc -l)
factor=$(awk -v c=$genome 'BEGIN {print 1000000/c}')
bamCoverage -p !{task.cpus} --binSize 1 -b !{bam[0]} -o !{file_id}.bigwig
'''
}
/////////////
/* MultiQC */
......@@ -585,16 +615,16 @@ process multiqc {
publishDir "${params.output}/multiqc", mode: 'copy'
input:
file ('fastqc/*') from OUTPUT_FASTQC_RAW.collect().ifEmpty([])
// file ('fastqc/*') from OUTPUT_FASTQC_RAW.collect().ifEmpty([])
file ('*') from CUTADAPT_LOG.collect().ifEmpty([])
file ('fastqc/*') from OUTPUT_FASTQC_CUT.collect().ifEmpty([])
file ('*') from FILTER_LOG.collect().ifEmpty([])
file ('fastqc/*') from OUTPUT_FASTQC_FILTER.collect().ifEmpty([])
// file ('fastqc/*') from OUTPUT_FASTQC_FILTER.collect().ifEmpty([])
file ('*') from HISAT_LOG.collect().ifEmpty([])
file ('fastqc/*') from OUTPUT_FASTQC_GENOME.collect().ifEmpty([])
// file ('fastqc/*') from OUTPUT_FASTQC_GENOME.collect().ifEmpty([])
file ('*') from HTSEQ_COUNT.collect().ifEmpty([])
file ('*') from POSTGENOME_LOG.collect().ifEmpty([])
file ('fastqc/*') from OUTPUT_FASTQC_POSTGENOME.collect().ifEmpty([])
// file ('fastqc/*') from OUTPUT_FASTQC_POSTGENOME.collect().ifEmpty([])
file ('*') from RNASEQC_OUTPUT.collect().ifEmpty([])
output:
......
/*
* RNAseq Analysis pipeline
* Ribosome Profiling Analysis pipeline
*/
//////////////////////////////////////////////////////
......@@ -281,8 +281,8 @@ hisat2 -x ${index_id} \
--trim3 1\
2> ${file_id}_genome.txt \
| samtools view -bS -F 4 - \
| samtools sort -@ ${task.cpus} -o ${file_id}.bam \
&& samtools index ${file_id}.bam
| samtools sort -@ ${task.cpus} -o ${file_id}_genome.bam \
&& samtools index ${file_id}_genome.bam
if grep -q "ERR" ${file_id}.txt; then
exit 1
......@@ -292,7 +292,8 @@ fi
HISAT_ALIGNED.into{HISAT_ALIGNED_FASTQC;
HISAT_ALIGNED_DEDUP;
HISAT_ALIGNED_COV}
HISAT_ALIGNED_COV;
HISAT_ALIGNED_COV_2}
////////////////////////////
/* Fastqc of genome reads */
......@@ -316,112 +317,95 @@ process fastqc_genome {
"""
}
//////////////
/* Coverage */
//////////////
process coverage {
tag "$file_id"
publishDir "${params.output}/03_hisat2/coverage/", mode: 'copy'
input:
set file_id, file(bam) from HISAT_ALIGNED_COV
output:
file "*.bigwig" into COVERAGE_OUTPUT
"""
bamCoverage -p ${task.cpus} --binSize 1 -b ${bam[0]} -o ${file_id}.bigwig
"""
}
////////////////////////////
/* Deduplication of reads */
////////////////////////////
if (params.do_dedup) {
process dedup_genome {
tag "$file_id"
publishDir "${params.output}/03_hisat2/dedup/", mode: 'copy'
input:
set file_id, file(bam) from HISAT_ALIGNED_DEDUP
output:
set file_id, "*.{bam, bai}" into DEDUP_GENOME
file "*.log" into DEDUP_LOG
when:
params.do_dedup
shell:
'''
samtools view -h !{bam[0]} | awk '!seen[substr($1, length($1)-5) $3 $4 $10]++' | samtools view -bS -o !{file_id}_dedup.bam
input=$(samtools view -h !{bam[0]} | wc -l)
output=$(samtools view -h !{bam[0]} | awk '!seen[substr($1, length($1)-5) $3 $4 $10]++' | wc -l)
diff=$(($input - $output))
per=$(($diff * 100 / $input))
echo "Input : $input reads" > !{file_id}_dedup.log
echo "Output : $output reads" >> !{file_id}_dedup.log
echo "$per % duplicated reads" >> !{file_id}_dedup.log
'''
}
process dedup_genome {
tag "$file_id"
publishDir "${params.output}/03_hisat2/dedup/", mode: 'copy'
input:
set file_id, file(bam) from HISAT_ALIGNED_DEDUP
output:
set file_id, "*.{bam, bai}" into DEDUP_GENOME
file "*.log" into DEDUP_LOG
when:
params.do_dedup
shell:
'''
samtools view -h !{bam[0]} | awk '!seen[substr($1, length($1)-5) $3 $4 $10]++' | samtools view -bS -o !{file_id}_dedup.bam
samtools index !{file_id}_dedup.bam
input=$(samtools view -h !{bam[0]} | wc -l)
output=$(samtools view -h !{bam[0]} | awk '!seen[substr($1, length($1)-5) $3 $4 $10]++' | wc -l)
diff=$(($input - $output))
per=$(($diff * 100 / $input))
echo "Input : $input reads" > !{file_id}_dedup.log
echo "Output : $output reads" >> !{file_id}_dedup.log
echo "$per % duplicated reads" >> !{file_id}_dedup.log
'''
}
} else {
HISAT_ALIGNED_DEDUP.set{DEDUP_GENOME}
HISAT_ALIGNED_DEDUP.set{DEDUP_GENOME}
}
DEDUP_GENOME.into{DEDUP_GENOME_HTSEQ;
DEDUP_GENOME_RNASEQC
}
DEDUP_GENOME_RNASEQC
}
///////////
/* HTseq */
///////////
process sort_bam {
tag "$file_id"
tag "$file_id"
input:
set file_id, file(bam) from DEDUP_GENOME_HTSEQ
input:
set file_id, file(bam) from DEDUP_GENOME_HTSEQ
output:
set file_id, "*_htseq.bam" into SORTED_NAME_GENOME
output:
set file_id, "*_htseq.bam" into SORTED_NAME_GENOME
script:
script:
"""
samtools sort -@ ${task.cpus} -n -O BAM -o ${file_id}_htseq.bam ${bam[0]}
"""
}
process counting {
tag "$file_id"
publishDir "${params.output}/04_HTseq/", mode: 'copy'
tag "$file_id"
publishDir "${params.output}/04_HTseq/", mode: 'copy'
input:
set file_id, file(bam) from SORTED_NAME_GENOME
file gtf from GTF_FILE.toList()
input:
set file_id, file(bam) from SORTED_NAME_GENOME
file gtf from GTF_FILE.toList()
output:
file "*.count" into HTSEQ_COUNT
output:
file "*.count" into HTSEQ_COUNT
script:
script:
"""
htseq-count ${bam[0]} ${gtf} \
--mode=intersection-nonempty \
-a 10 \
-s yes \
-t CDS \
-i gene_id \
-f bam \
--mode=intersection-nonempty \
-a 10 \
-s yes \
-t CDS \
-i gene_id \
-f bam \
> ${file_id}_CDS.count
htseq-count ${bam[0]} ${gtf} \
--mode=intersection-nonempty \
-a 10 \
-s yes \
-t exon \
-i gene_id \
-f bam \
--mode=intersection-nonempty \
-a 10 \
-s yes \
-t exon \
-i gene_id \
-f bam \
> ${file_id}.count
"""
......@@ -432,17 +416,17 @@ htseq-count ${bam[0]} ${gtf} \
///////////////
process rnaseq_qc {
tag "$file_id"
publishDir "${params.output}/06_RNAseqQC/", mode: 'copy'
tag "$file_id"
publishDir "${params.output}/06_RNAseqQC/", mode: 'copy'
input:
set file_id, file(bam) from DEDUP_GENOME_RNASEQC
file (gtf) from GTF_COLLAPSE.collect()
input:
set file_id, file(bam) from DEDUP_GENOME_RNASEQC
file (gtf) from GTF_COLLAPSE.collect()
output:
file "*" into RNASEQC_OUTPUT
output:
file "*" into RNASEQC_OUTPUT
script:
script:
"""
rnaseqc ${gtf} ${bam[0]} -s ${file_id} ./
"""
......@@ -457,49 +441,50 @@ rnaseqc ${gtf} ${bam[0]} -s ${file_id} ./
/////////////////////////
process hisat2_postGenomic {
tag "$file_id"
publishDir "${params.output}/05_post_genome_hisat2/", mode: 'copy'
tag "$file_id"
publishDir "${params.output}/05_post_genome_hisat2/", mode: 'copy'
input:
set file_id, file(fastq_unaligned) from FILTER_FASTQ_POSTGENOME
file index2 from POSTGENOME_INDEX.collect()
input:
set file_id, file(fastq_unaligned) from FILTER_FASTQ_POSTGENOME
file index2 from POSTGENOME_INDEX.collect()
output:
set file_id, "*.{bam,bam.bai}" into POSTGENOME_ALIGNED
file "*_postgenome.txt" into POSTGENOME_LOG
output:
set file_id, "*.{bam,bam.bai}" into POSTGENOME_ALIGNED
file "*_postgenome.txt" into POSTGENOME_LOG
when:
params.do_postgenome
when:
params.do_postgenome
script:
index2_id = index2[0]
for (index2_file in index2) {
if (index2_file =~ /.*\.1\.ht2/ && !(index2_file =~ /.*\.rev\.1\.ht2/)) {
index2_id = ( index2_file =~ /(.*)\.1\.ht2/)[0][1]
}
}
script:
index2_id = index2[0]
for (index2_file in index2) {
if (index2_file =~ /.*\.1\.ht2/ && !(index2_file =~ /.*\.rev\.1\.ht2/)) {
index2_id = ( index2_file =~ /(.*)\.1\.ht2/)[0][1]
}
}
"""
hisat2 -x ${index2_id} \
-p ${task.cpus} \
-U ${fastq_unaligned[0]} \
--rna-strandness ${params.strand} \
--dta\
--no-softclip\
--trim3 1\
--trim5 1\
2> ${file_id}_postgenome.txt \
-p ${task.cpus} \
-U ${fastq_unaligned[0]} \
--rna-strandness ${params.strand} \
--dta \
--no-softclip \
--trim3 1 \
--trim5 1 \
2> ${file_id}_postgenome.txt \
| samtools view -bS -F 4 -F 256 - \
| samtools sort -@ ${task.cpus} -o ${file_id}.bam \
&& samtools index ${file_id}.bam
| samtools sort -@ ${task.cpus} -o ${file_id}_postgenome.bam \
&& samtools index ${file_id}_postgenome.bam
if grep -q "ERR" ${file_id}_postgenome.txt; then
exit 1
exit 1
fi
"""
}
POSTGENOME_ALIGNED.into{POSTGENOME_ALIGNED_FASTQC;
POSTGENOME_ALIGNED_DEDUP}
POSTGENOME_ALIGNED_DEDUP;
POSTGENOME_ALIGNED_COV}
////////////////////////////
/* Deduplication of reads */
......@@ -523,6 +508,7 @@ if (params.do_dedup){
shell:
'''
samtools view -h !{bam[0]} | awk '!seen[substr($1, length($1)-5) $3 $4 $10]++' | samtools view -bS -o !{file_id}_dedup.bam
samtools index !{file_id}_dedup.bam
input=$(samtools view -h !{bam[0]} | wc -l)
output=$(samtools view -h !{bam[0]} | awk '!seen[substr($1, length($1)-5) $3 $4 $10]++' | wc -l)
diff=$(($input - $output))
......@@ -560,6 +546,54 @@ process fastqc_postgenome {
////////////////////////////////////////////////////////////////////////
//////////////
/* Coverage */
//////////////
process coverage_postgenome {
tag "$file_id"
publishDir "${params.output}/07_coverage/", mode: 'copy'
input:
set file_id, file(bam) from HISAT_ALIGNED_COV
set file_id_2, file(post_genome) from POSTGENOME_ALIGNED_COV
output:
file "*.bigwig" into COVERAGE_OUTPUT
when:
params.do_postgenome
shell:
'''
genome=$(samtools view !{bam[0]} | awk '{print $1}' | sort | uniq | wc -l)
postgenome=$(samtools view !{post_genome[0]} | awk '{print $1}' | sort | uniq | wc -l)
total=$(($genome + $postgenome))
factor=$(awk -v c=$total 'BEGIN {print 1000000/c}')
bamCoverage -p !{task.cpus} --binSize 1 -b !{bam[0]} -o !{file_id}_genome.bigwig
bamCoverage -p !{task.cpus} --binSize 1 -b !{post_genome[0]} -o !{file_id}_postgenome.bigwig
'''
}
process coverage {
tag "$file_id"
publishDir "${params.output}/07_coverage/", mode: 'copy'
input:
set file_id, file(bam) from HISAT_ALIGNED_COV_2
output:
file "*.bigwig" into COVERAGE_OUTPUT_2
when:
params.do_postgenome==false
shell:
'''
genome=$(samtools view !{bam[0]} | awk '{print $1} | sort | uniq | wc -l)
factor=$(awk -v c=$genome 'BEGIN {print 1000000/c}')
bamCoverage -p !{task.cpus} --binSize 1 -b !{bam[0]} -o !{file_id}.bigwig
'''
}
/////////////
/* MultiQC */
/////////////
......@@ -570,16 +604,16 @@ process multiqc {
publishDir "${params.output}/multiqc", mode: 'copy'
input:
file ('fastqc/*') from OUTPUT_FASTQC_RAW.collect().ifEmpty([])
// file ('fastqc/*') from OUTPUT_FASTQC_RAW.collect().ifEmpty([])
file ('*') from CUTADAPT_LOG.collect().ifEmpty([])
file ('fastqc/*') from OUTPUT_FASTQC_CUT.collect().ifEmpty([])
file ('*') from FILTER_LOG.collect().ifEmpty([])
file ('fastqc/*') from OUTPUT_FASTQC_FILTER.collect().ifEmpty([])
// file ('fastqc/*') from OUTPUT_FASTQC_FILTER.collect().ifEmpty([])
file ('*') from HISAT_LOG.collect().ifEmpty([])
file ('fastqc/*') from OUTPUT_FASTQC_GENOME.collect().ifEmpty([])
// file ('fastqc/*') from OUTPUT_FASTQC_GENOME.collect().ifEmpty([])
file ('*') from HTSEQ_COUNT.collect().ifEmpty([])
file ('*') from POSTGENOME_LOG.collect().ifEmpty([])
file ('fastqc/*') from OUTPUT_FASTQC_POSTGENOME.collect().ifEmpty([])
// file ('fastqc/*') from OUTPUT_FASTQC_POSTGENOME.collect().ifEmpty([])
file ('*') from RNASEQC_OUTPUT.collect().ifEmpty([])
output:
......
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Please register or to comment