diff --git a/src/RNAseq.config b/src/RNAseq.config index 7dfd997fab1adb881f634d57e3ca0fca5e55cfac..2554406dcc815664eedd25e11cf372edb90cf845 100644 --- a/src/RNAseq.config +++ b/src/RNAseq.config @@ -71,6 +71,16 @@ profiles { time = "12h" queue = 'CLG6242deb384A,CLG6242deb384C,CLG5218deb192A,CLG5218deb192B,CLG5218deb192C,CLG5218deb192D,SLG6142deb384A,SLG6142deb384B,SLG6142deb384C,SLG6142deb384D' } + withName: coverage_postgenome { + container = "lbmc/deeptools:3.0.2" + executor = "sge" + clusterOptions = "-cwd -V" + memory = "20GB" + cpus = 16 + penv = 'openmp16' + time = "12h" + queue = 'CLG6242deb384A,CLG6242deb384C,CLG5218deb192A,CLG5218deb192B,CLG5218deb192C,CLG5218deb192D,SLG6142deb384A,SLG6142deb384B,SLG6142deb384C,SLG6142deb384D' + } withName: fastqc_genome { container = "lbmc/fastqc:0.11.5" executor = "sge" @@ -91,6 +101,16 @@ profiles { time = "12h" queue = 'CLG6242deb384A,CLG6242deb384C,CLG5218deb192A,CLG5218deb192B,CLG5218deb192C,CLG5218deb192D,SLG6142deb384A,SLG6142deb384B,SLG6142deb384C,SLG6142deb384D' } + withName: hisat2_postGenomic { + container = "lbmc/hisat2:2.1.0" + executor = "sge" + clusterOptions = "-cwd -V" + memory = "20GB" + cpus = 16 + penv = 'openmp16' + time = "12h" + queue = 'CLG6242deb384A,CLG6242deb384C,CLG5218deb192A,CLG5218deb192B,CLG5218deb192C,CLG5218deb192D,SLG6142deb384A,SLG6142deb384B,SLG6142deb384C,SLG6142deb384D' + } withName: dedup_postgenome { container = "lbmc/hisat2:2.1.0" executor = "sge" @@ -137,6 +157,16 @@ profiles { time = "12h" queue = 'CLG6242deb384A,CLG6242deb384C,CLG5218deb192A,CLG5218deb192B,CLG5218deb192C,CLG5218deb192D,SLG6142deb384A,SLG6142deb384B,SLG6142deb384C,SLG6142deb384D' } + withName: fastqc_postgenome { + container = "lbmc/fastqc:0.11.5" + executor = "sge" + clusterOptions = "-cwd -V" + cpus = 4 + penv = 'openmp4' + memory = "20GB" + time = "12h" + queue = 'CLG6242deb384A,CLG6242deb384C,CLG5218deb192A,CLG5218deb192B,CLG5218deb192C,CLG5218deb192D,SLG6142deb384A,SLG6142deb384B,SLG6142deb384C,SLG6142deb384D' + } } } docker { diff --git a/src/RNAseq.nf b/src/RNAseq.nf index 53edf63dc37a6dafc7445be99a78db25ee79cb6f..62971cedaaaaa7e0e320bd1dfc86857a0b37e8ff 100644 --- a/src/RNAseq.nf +++ b/src/RNAseq.nf @@ -153,7 +153,7 @@ process trimming { > ${file_id}_second_report.txt cutadapt -j ${task.cpus} \ - -u -13 \ + -u -60 \ -o ${file_id}_cut_R2.fastq.gz \ ${file_id}_tmp_R2.fastq.gz \ > ${file_id}_third_report.txt @@ -305,7 +305,8 @@ fi HISAT_ALIGNED.into{HISAT_ALIGNED_FASTQC; HISAT_ALIGNED_DEDUP; - HISAT_ALIGNED_COV} + HISAT_ALIGNED_COV; + HISAT_ALIGNED_COV_2} //////////////////////////// /* Fastqc of genome reads */ @@ -329,27 +330,6 @@ process fastqc_genome { """ } - -////////////// -/* Coverage */ -////////////// - -process coverage { - tag "$file_id" - publishDir "${params.output}/03_hisat2/coverage/", mode: 'copy' - - input: - set file_id, file(bam) from HISAT_ALIGNED_COV - - output: - file "*.bigwig" into COVERAGE_OUTPUT - - """ - bamCoverage -p ${task.cpus} --binSize 1 -b ${bam[0]} -o ${file_id}.bigwig - - """ -} - //////////////////////////// /* Deduplication of reads */ //////////////////////////// @@ -516,7 +496,8 @@ fi } POSTGENOME_ALIGNED.into{POSTGENOME_ALIGNED_FASTQC; - POSTGENOME_ALIGNED_DEDUP} + POSTGENOME_ALIGNED_DEDUP; + POSTGENOME_ALIGNED_COV} //////////////////////////// /* Deduplication of reads */ @@ -574,6 +555,55 @@ process fastqc_postgenome { //////////////////////////// POST PROCESS ////////////////////////////// //////////////////////////////////////////////////////////////////////// +////////////// +/* Coverage */ +////////////// + +process coverage_postgenome { + tag "$file_id" + publishDir "${params.output}/07_coverage/", mode: 'copy' + + input: + set file_id, file(bam) from HISAT_ALIGNED_COV + set file_id_2, file(post_genome) from POSTGENOME_ALIGNED_COV + + output: + file "*.bigwig" into COVERAGE_OUTPUT + + when: + params.do_postgenome + + shell: + ''' + genome=$(samtools view !{bam[0]} | awk '{print $1}' | sort | uniq | wc -l) + postgenome=$(samtools view !{post_genome[0]} | awk '{print $1}' | sort | uniq | wc -l) + total=$(($genome + $postgenome)) + factor=$(awk -v c=$total 'BEGIN {print 1000000/c}') + bamCoverage -p !{task.cpus} --binSize 1 -b !{bam[0]} -o !{file_id}_genome.bigwig + bamCoverage -p !{task.cpus} --binSize 1 -b !{post_genome[0]} -o !{file_id}_postgenome.bigwig + ''' +} + +process coverage { + tag "$file_id" + publishDir "${params.output}/07_coverage/", mode: 'copy' + + input: + set file_id, file(bam) from HISAT_ALIGNED_COV_2 + + output: + file "*.bigwig" into COVERAGE_OUTPUT_2 + + when: + params.do_postgenome==false + shell: + ''' + genome=$(samtools view !{bam[0]} | awk '{print $1} | sort | uniq | wc -l) + factor=$(awk -v c=$genome 'BEGIN {print 1000000/c}') + bamCoverage -p !{task.cpus} --binSize 1 -b !{bam[0]} -o !{file_id}.bigwig + ''' +} + ///////////// /* MultiQC */ @@ -585,16 +615,16 @@ process multiqc { publishDir "${params.output}/multiqc", mode: 'copy' input: - file ('fastqc/*') from OUTPUT_FASTQC_RAW.collect().ifEmpty([]) +// file ('fastqc/*') from OUTPUT_FASTQC_RAW.collect().ifEmpty([]) file ('*') from CUTADAPT_LOG.collect().ifEmpty([]) file ('fastqc/*') from OUTPUT_FASTQC_CUT.collect().ifEmpty([]) file ('*') from FILTER_LOG.collect().ifEmpty([]) - file ('fastqc/*') from OUTPUT_FASTQC_FILTER.collect().ifEmpty([]) +// file ('fastqc/*') from OUTPUT_FASTQC_FILTER.collect().ifEmpty([]) file ('*') from HISAT_LOG.collect().ifEmpty([]) - file ('fastqc/*') from OUTPUT_FASTQC_GENOME.collect().ifEmpty([]) +// file ('fastqc/*') from OUTPUT_FASTQC_GENOME.collect().ifEmpty([]) file ('*') from HTSEQ_COUNT.collect().ifEmpty([]) file ('*') from POSTGENOME_LOG.collect().ifEmpty([]) - file ('fastqc/*') from OUTPUT_FASTQC_POSTGENOME.collect().ifEmpty([]) +// file ('fastqc/*') from OUTPUT_FASTQC_POSTGENOME.collect().ifEmpty([]) file ('*') from RNASEQC_OUTPUT.collect().ifEmpty([]) output: diff --git a/src/RibosomeProfiling.nf b/src/RibosomeProfiling.nf index b65679ec0f30939ce00e31e48761233b2f28ed5c..82a96198400f63cde3efe78ccda1736677cd5ce3 100644 --- a/src/RibosomeProfiling.nf +++ b/src/RibosomeProfiling.nf @@ -1,5 +1,5 @@ /* -* RNAseq Analysis pipeline +* Ribosome Profiling Analysis pipeline */ ////////////////////////////////////////////////////// @@ -281,8 +281,8 @@ hisat2 -x ${index_id} \ --trim3 1\ 2> ${file_id}_genome.txt \ | samtools view -bS -F 4 - \ -| samtools sort -@ ${task.cpus} -o ${file_id}.bam \ -&& samtools index ${file_id}.bam +| samtools sort -@ ${task.cpus} -o ${file_id}_genome.bam \ +&& samtools index ${file_id}_genome.bam if grep -q "ERR" ${file_id}.txt; then exit 1 @@ -292,7 +292,8 @@ fi HISAT_ALIGNED.into{HISAT_ALIGNED_FASTQC; HISAT_ALIGNED_DEDUP; - HISAT_ALIGNED_COV} + HISAT_ALIGNED_COV; + HISAT_ALIGNED_COV_2} //////////////////////////// /* Fastqc of genome reads */ @@ -316,112 +317,95 @@ process fastqc_genome { """ } -////////////// -/* Coverage */ -////////////// - -process coverage { - tag "$file_id" - publishDir "${params.output}/03_hisat2/coverage/", mode: 'copy' - - input: - set file_id, file(bam) from HISAT_ALIGNED_COV - - output: - file "*.bigwig" into COVERAGE_OUTPUT - - """ - bamCoverage -p ${task.cpus} --binSize 1 -b ${bam[0]} -o ${file_id}.bigwig - """ -} //////////////////////////// /* Deduplication of reads */ //////////////////////////// if (params.do_dedup) { - process dedup_genome { - tag "$file_id" - publishDir "${params.output}/03_hisat2/dedup/", mode: 'copy' - - input: - set file_id, file(bam) from HISAT_ALIGNED_DEDUP - - output: - set file_id, "*.{bam, bai}" into DEDUP_GENOME - file "*.log" into DEDUP_LOG - - when: - params.do_dedup - - shell: - ''' - samtools view -h !{bam[0]} | awk '!seen[substr($1, length($1)-5) $3 $4 $10]++' | samtools view -bS -o !{file_id}_dedup.bam - input=$(samtools view -h !{bam[0]} | wc -l) - output=$(samtools view -h !{bam[0]} | awk '!seen[substr($1, length($1)-5) $3 $4 $10]++' | wc -l) - diff=$(($input - $output)) - per=$(($diff * 100 / $input)) - echo "Input : $input reads" > !{file_id}_dedup.log - echo "Output : $output reads" >> !{file_id}_dedup.log - echo "$per % duplicated reads" >> !{file_id}_dedup.log - ''' - } +process dedup_genome { +tag "$file_id" +publishDir "${params.output}/03_hisat2/dedup/", mode: 'copy' + +input: +set file_id, file(bam) from HISAT_ALIGNED_DEDUP + +output: +set file_id, "*.{bam, bai}" into DEDUP_GENOME +file "*.log" into DEDUP_LOG + +when: +params.do_dedup + +shell: +''' +samtools view -h !{bam[0]} | awk '!seen[substr($1, length($1)-5) $3 $4 $10]++' | samtools view -bS -o !{file_id}_dedup.bam +samtools index !{file_id}_dedup.bam +input=$(samtools view -h !{bam[0]} | wc -l) +output=$(samtools view -h !{bam[0]} | awk '!seen[substr($1, length($1)-5) $3 $4 $10]++' | wc -l) +diff=$(($input - $output)) +per=$(($diff * 100 / $input)) +echo "Input : $input reads" > !{file_id}_dedup.log +echo "Output : $output reads" >> !{file_id}_dedup.log +echo "$per % duplicated reads" >> !{file_id}_dedup.log +''' +} } else { - HISAT_ALIGNED_DEDUP.set{DEDUP_GENOME} +HISAT_ALIGNED_DEDUP.set{DEDUP_GENOME} } DEDUP_GENOME.into{DEDUP_GENOME_HTSEQ; - DEDUP_GENOME_RNASEQC - } + DEDUP_GENOME_RNASEQC + } /////////// /* HTseq */ /////////// process sort_bam { - tag "$file_id" +tag "$file_id" - input: - set file_id, file(bam) from DEDUP_GENOME_HTSEQ +input: +set file_id, file(bam) from DEDUP_GENOME_HTSEQ - output: - set file_id, "*_htseq.bam" into SORTED_NAME_GENOME +output: +set file_id, "*_htseq.bam" into SORTED_NAME_GENOME - script: +script: """ samtools sort -@ ${task.cpus} -n -O BAM -o ${file_id}_htseq.bam ${bam[0]} """ } process counting { - tag "$file_id" - publishDir "${params.output}/04_HTseq/", mode: 'copy' +tag "$file_id" +publishDir "${params.output}/04_HTseq/", mode: 'copy' - input: - set file_id, file(bam) from SORTED_NAME_GENOME - file gtf from GTF_FILE.toList() +input: +set file_id, file(bam) from SORTED_NAME_GENOME +file gtf from GTF_FILE.toList() - output: - file "*.count" into HTSEQ_COUNT +output: +file "*.count" into HTSEQ_COUNT - script: +script: """ htseq-count ${bam[0]} ${gtf} \ - --mode=intersection-nonempty \ - -a 10 \ - -s yes \ - -t CDS \ - -i gene_id \ - -f bam \ + --mode=intersection-nonempty \ + -a 10 \ + -s yes \ + -t CDS \ + -i gene_id \ + -f bam \ > ${file_id}_CDS.count htseq-count ${bam[0]} ${gtf} \ - --mode=intersection-nonempty \ - -a 10 \ - -s yes \ - -t exon \ - -i gene_id \ - -f bam \ + --mode=intersection-nonempty \ + -a 10 \ + -s yes \ + -t exon \ + -i gene_id \ + -f bam \ > ${file_id}.count """ @@ -432,17 +416,17 @@ htseq-count ${bam[0]} ${gtf} \ /////////////// process rnaseq_qc { - tag "$file_id" - publishDir "${params.output}/06_RNAseqQC/", mode: 'copy' +tag "$file_id" +publishDir "${params.output}/06_RNAseqQC/", mode: 'copy' - input: - set file_id, file(bam) from DEDUP_GENOME_RNASEQC - file (gtf) from GTF_COLLAPSE.collect() +input: +set file_id, file(bam) from DEDUP_GENOME_RNASEQC +file (gtf) from GTF_COLLAPSE.collect() - output: - file "*" into RNASEQC_OUTPUT +output: +file "*" into RNASEQC_OUTPUT - script: +script: """ rnaseqc ${gtf} ${bam[0]} -s ${file_id} ./ """ @@ -457,49 +441,50 @@ rnaseqc ${gtf} ${bam[0]} -s ${file_id} ./ ///////////////////////// process hisat2_postGenomic { - tag "$file_id" - publishDir "${params.output}/05_post_genome_hisat2/", mode: 'copy' +tag "$file_id" +publishDir "${params.output}/05_post_genome_hisat2/", mode: 'copy' - input: - set file_id, file(fastq_unaligned) from FILTER_FASTQ_POSTGENOME - file index2 from POSTGENOME_INDEX.collect() +input: +set file_id, file(fastq_unaligned) from FILTER_FASTQ_POSTGENOME +file index2 from POSTGENOME_INDEX.collect() - output: - set file_id, "*.{bam,bam.bai}" into POSTGENOME_ALIGNED - file "*_postgenome.txt" into POSTGENOME_LOG +output: +set file_id, "*.{bam,bam.bai}" into POSTGENOME_ALIGNED +file "*_postgenome.txt" into POSTGENOME_LOG - when: - params.do_postgenome +when: +params.do_postgenome - script: - index2_id = index2[0] - for (index2_file in index2) { - if (index2_file =~ /.*\.1\.ht2/ && !(index2_file =~ /.*\.rev\.1\.ht2/)) { - index2_id = ( index2_file =~ /(.*)\.1\.ht2/)[0][1] - } - } +script: +index2_id = index2[0] +for (index2_file in index2) { +if (index2_file =~ /.*\.1\.ht2/ && !(index2_file =~ /.*\.rev\.1\.ht2/)) { +index2_id = ( index2_file =~ /(.*)\.1\.ht2/)[0][1] +} +} """ hisat2 -x ${index2_id} \ - -p ${task.cpus} \ - -U ${fastq_unaligned[0]} \ - --rna-strandness ${params.strand} \ - --dta\ - --no-softclip\ - --trim3 1\ - --trim5 1\ - 2> ${file_id}_postgenome.txt \ +-p ${task.cpus} \ +-U ${fastq_unaligned[0]} \ +--rna-strandness ${params.strand} \ +--dta \ +--no-softclip \ +--trim3 1 \ +--trim5 1 \ +2> ${file_id}_postgenome.txt \ | samtools view -bS -F 4 -F 256 - \ -| samtools sort -@ ${task.cpus} -o ${file_id}.bam \ -&& samtools index ${file_id}.bam +| samtools sort -@ ${task.cpus} -o ${file_id}_postgenome.bam \ +&& samtools index ${file_id}_postgenome.bam if grep -q "ERR" ${file_id}_postgenome.txt; then - exit 1 +exit 1 fi """ } POSTGENOME_ALIGNED.into{POSTGENOME_ALIGNED_FASTQC; - POSTGENOME_ALIGNED_DEDUP} + POSTGENOME_ALIGNED_DEDUP; + POSTGENOME_ALIGNED_COV} //////////////////////////// /* Deduplication of reads */ @@ -523,6 +508,7 @@ if (params.do_dedup){ shell: ''' samtools view -h !{bam[0]} | awk '!seen[substr($1, length($1)-5) $3 $4 $10]++' | samtools view -bS -o !{file_id}_dedup.bam + samtools index !{file_id}_dedup.bam input=$(samtools view -h !{bam[0]} | wc -l) output=$(samtools view -h !{bam[0]} | awk '!seen[substr($1, length($1)-5) $3 $4 $10]++' | wc -l) diff=$(($input - $output)) @@ -560,6 +546,54 @@ process fastqc_postgenome { //////////////////////////////////////////////////////////////////////// +////////////// +/* Coverage */ +////////////// + +process coverage_postgenome { + tag "$file_id" + publishDir "${params.output}/07_coverage/", mode: 'copy' + + input: + set file_id, file(bam) from HISAT_ALIGNED_COV + set file_id_2, file(post_genome) from POSTGENOME_ALIGNED_COV + + output: + file "*.bigwig" into COVERAGE_OUTPUT + + when: + params.do_postgenome + + shell: + ''' + genome=$(samtools view !{bam[0]} | awk '{print $1}' | sort | uniq | wc -l) + postgenome=$(samtools view !{post_genome[0]} | awk '{print $1}' | sort | uniq | wc -l) + total=$(($genome + $postgenome)) + factor=$(awk -v c=$total 'BEGIN {print 1000000/c}') + bamCoverage -p !{task.cpus} --binSize 1 -b !{bam[0]} -o !{file_id}_genome.bigwig + bamCoverage -p !{task.cpus} --binSize 1 -b !{post_genome[0]} -o !{file_id}_postgenome.bigwig + ''' +} + +process coverage { + tag "$file_id" + publishDir "${params.output}/07_coverage/", mode: 'copy' + + input: + set file_id, file(bam) from HISAT_ALIGNED_COV_2 + + output: + file "*.bigwig" into COVERAGE_OUTPUT_2 + + when: + params.do_postgenome==false + shell: + ''' + genome=$(samtools view !{bam[0]} | awk '{print $1} | sort | uniq | wc -l) + factor=$(awk -v c=$genome 'BEGIN {print 1000000/c}') + bamCoverage -p !{task.cpus} --binSize 1 -b !{bam[0]} -o !{file_id}.bigwig + ''' +} ///////////// /* MultiQC */ ///////////// @@ -570,16 +604,16 @@ process multiqc { publishDir "${params.output}/multiqc", mode: 'copy' input: - file ('fastqc/*') from OUTPUT_FASTQC_RAW.collect().ifEmpty([]) +// file ('fastqc/*') from OUTPUT_FASTQC_RAW.collect().ifEmpty([]) file ('*') from CUTADAPT_LOG.collect().ifEmpty([]) file ('fastqc/*') from OUTPUT_FASTQC_CUT.collect().ifEmpty([]) file ('*') from FILTER_LOG.collect().ifEmpty([]) - file ('fastqc/*') from OUTPUT_FASTQC_FILTER.collect().ifEmpty([]) +// file ('fastqc/*') from OUTPUT_FASTQC_FILTER.collect().ifEmpty([]) file ('*') from HISAT_LOG.collect().ifEmpty([]) - file ('fastqc/*') from OUTPUT_FASTQC_GENOME.collect().ifEmpty([]) +// file ('fastqc/*') from OUTPUT_FASTQC_GENOME.collect().ifEmpty([]) file ('*') from HTSEQ_COUNT.collect().ifEmpty([]) file ('*') from POSTGENOME_LOG.collect().ifEmpty([]) - file ('fastqc/*') from OUTPUT_FASTQC_POSTGENOME.collect().ifEmpty([]) +// file ('fastqc/*') from OUTPUT_FASTQC_POSTGENOME.collect().ifEmpty([]) file ('*') from RNASEQC_OUTPUT.collect().ifEmpty([]) output: