diff --git a/src/RibosomeProfiling.nf b/src/RibosomeProfiling.nf
index faeb37f2402504642be76281ab7a57462073c3a0..289ca05c0dff928c5dcd3090de54c780ca23aad2 100644
--- a/src/RibosomeProfiling.nf
+++ b/src/RibosomeProfiling.nf
@@ -1,58 +1,194 @@
 /*
-* RibosomeProfiling Analysis pipeline
+* RNAseq Analysis pipeline
 */
 
-/* Trimming */
+//////////////////////////////////////////////////////
+//                    PARAMETERS                    //
+//////////////////////////////////////////////////////
+
+////////////////////////
+// PARAMS : FILE PATH //
+////////////////////////
+
+params.fastq_raw = "data/fastq/*.fastq.gz"
 params.output = "results"
-params.fastq_raw = "${params.output}/00_demultiplexing/*.fastq.gz"
+params.filter = "data/filter/human_rRNA_tRNA/*.bt2"
+params.index_genome = "data/genome/*.ht2"
+params.gtf = "data/annotation/*.gtf"
+params.gtf_collapse = "data/annotation/*.gtf"
+params.index_postgenome = "data/post_genome/*.ht2"
+
+//////////////////////
+// PARAMS : OPTIONS //
+//////////////////////
+
+params.do_fastqc = true
+params.do_dedup = true
+params.do_postgenome = true
+
+///////////////////////
+// LIBRARIES OPTIONS //
+///////////////////////
+
+params.adaptorR1 = "AGATCGGAAGAGCACACGTCTGAACTCCAGTCA"
+params.strand = "F"
+
+
+//////////////
+// LOG INFO //
+//////////////
+
+log.info "input raw                     : ${params.fastq_raw}"
+log.info "output directory              : ${params.output}"
+log.info "filter index files            : ${params.filter}"
+log.info "genome index                  : ${params.index_genome}"
+log.info "gtf file                      : ${params.gtf}"
+log.info "collapsed gtf file for rnaseqc: ${params.gtf_collapse}"
+log.info "post-genome index             : ${params.index_postgenome}"
+log.info ""
+log.info "adaptor sequence              : ${params.adaptorR1}"
+log.info "strand of the library         : ${params.strand}"
+log.info ""
+log.info "do fastqc ?                   : ${params.do_fastqc}"
+log.info "do deduplication ?            : ${params.do_dedup}"
+log.info "do post genome alignment ?    : ${params.do_postgenome}"
+log.info ""
+
+//////////////
+// CHANNELS //
+//////////////
+
+Channel
+  .fromPath(params.fastq_raw)
+  .ifEmpty { error "Cannot find any file matching: ${params.fastq_raw}" }
+  .map { it -> [(it.baseName =~ /([^\.]*)/)[0][1], it]}
+  .into {INPUT_FASTQC_RAW;
+         INPUT_CUTADAPT}
+
+Channel
+  .fromPath( params.filter )
+  .ifEmpty { error "Cannot find any index files matching: ${params.filter}" }
+  .set { FILTER_INDEX }
+
+Channel
+  .fromPath ( params.index_genome )
+  .ifEmpty { error "Cannot find any index files matching: ${params.index_genome}" }
+  .set { GENOME_INDEX }
+
+Channel
+  .fromPath( params.gtf )
+  .ifEmpty { error "Cannot find any gtf file matching: ${params.gtf}" }
+  .set { GTF_FILE }
+
+Channel
+  .fromPath( params.gtf_collapse )
+  .ifEmpty { error "Cannot find any gtf file matching: ${params.gtf_collapse}" }
+  .set { GTF_COLLAPSE }
+
 Channel
-  .fromPath( params.fastq_raw )
-  .ifEmpty { error "Cannot find any files matching: ${params.fastq_raw}" }
-  .map { it -> [(it.baseName =~ /([^\.]*)/)[0][1], it]}
-  .set { fastq_raw_flow }
+  .fromPath ( params.index_postgenome )
+  .ifEmpty { error "Cannot find any index files matching: ${params.index_postgenome}" }
+  .set { POSTGENOME_INDEX }
+
+//////////////////////////////////////////////////////
+//                     PROCESS                      //
+//////////////////////////////////////////////////////
+
+////////////////////////////////////////////////////////////////////////
+//////////////////////////// PRE PROCESS ///////////////////////////////
+////////////////////////////////////////////////////////////////////////
+
+
+/////////////////////////
+/* Fastqc of raw input */
+/////////////////////////
+
+process fastqc_raw {
+  tag "$file_id"
+  publishDir "${params.output}/00_fastqc/raw/", mode: 'copy'
+
+  input:
+  set file_id, file(reads) from INPUT_FASTQC_RAW
+
+  output:
+  file "*_fastqc.{html,zip}" into OUTPUT_FASTQC_RAW
+
+  when:
+  params.do_fastqc
+
+  """
+  fastqc ${file_id}* -t ${task.cpus}
+  """
+}
+
+///////////////////////
+/* Trimming adaptors */
+///////////////////////
 
 process trimming {
   tag "$file_id"
-  publishDir "${params.output}/01_trimming/", mode: 'copy'
+  publishDir "${params.output}/01_cutadapt/", mode: 'copy'
+  echo true
 
   input:
-  set file_id, file(fastq_raw) from fastq_raw_flow
+  set file_id, file(reads) from INPUT_CUTADAPT
 
   output:
-  set file_id, "*_cut.fastq.gz" into fastq_trim_filt
-  file "*.txt" into log_trim
+  set file_id, "*cut.fastq.gz" into CUTADAPT_OUTPUT
+  file "*first_report.txt" into CUTADAPT_LOG
+
   script:
 """
-  cutadapt -a AGATCGGAAGAGCACACGTCTGAACTCCAGTCAC -m 20\
-  -o ${file_id}_cut.fastq.gz \
-  ${fastq_raw} > ${file_id}_report.txt
+  cutadapt -j ${task.cpus} \
+  -a ${params.adaptorR1} \
+  -o ${file_id}_cut.fastq.gz \
+  --minimum-length 15 \
+  ${reads[0]} > ${file_id}_first_report.txt
 """
 }
 
-/* rRNA and tRNA filtering */
+CUTADAPT_OUTPUT.into{CUTADAPT_OUTPUT_FASTQC;
+                     CUTADAPT_OUTPUT_FILTER}
 
-params.indexrRNA = "results/human_rRNA_tRNA/*.bt2"
-log.info "index files rRNA : ${params.indexrRNA}"
+/////////////////////////
+/* Fastqc of cut reads */
+/////////////////////////
 
-Channel
-  .fromPath( params.indexrRNA )
-  .ifEmpty { error "Cannot find any index files matching: ${params.indexrRNA}" }
-  .set { rRNA_index_files }
+process fastqc_cut {
+  tag "$file_id"
+  publishDir "${params.output}/00_fastqc/cut/", mode: 'copy'
+
+  input:
+  set file_id, file(reads) from CUTADAPT_OUTPUT_FASTQC
+
+  output:
+  file "*.{html,zip}" into OUTPUT_FASTQC_CUT
+
+  when:
+  params.do_fastqc
+
+  """
+  fastqc ${file_id}* -t ${task.cpus}
+  """
+}
+
+/////////////////////////////
+/* rRNA and tRNA filtering */
+/////////////////////////////
 
 process rRNA_removal {
   tag "$file_id"
   publishDir "${params.output}/02_rRNA_depletion/", mode: 'copy'
 
   input:
-  set file_id, file(reads) from fastq_trim_filt
-  file index from rRNA_index_files.toList()
+  set file_id, file(reads) from CUTADAPT_OUTPUT_FILTER
+  file index from FILTER_INDEX.toList()
 
   output:
-  set file_id, "*.fastq.gz" into rRNA_removed_reads
-  file "*.txt" into bowtie_report
+  set file_id, "*.fastq.gz" into FILTER_FASTQ
+  set file_id, "*.bam*" into FILTER_BAM
+  file "*.{txt,stats}" into FILTER_LOG
 
   script:
   index_id = index[0]
@@ -62,40 +198,69 @@ process rRNA_removal {
   }
 }
 """
-zcat ${reads} | bowtie2 --sensitive -p ${task.cpus} -x ${index_id} \
--U - --un-gz ${file_id}_mRNA.fastq.gz 2> \
-${file_id}_bowtie2_report.txt > /dev/null
-
-if grep -q "Error " ${file_id}_bowtie2_report.txt; then
+bowtie2 --sensitive -p ${task.cpus} -x ${index_id} \
+-U ${reads[0]} --no-unal \
+--un-gz ${file_id}_filter.fastq.gz 2> \
+${file_id}_filter.txt | samtools view -bS - \
+| samtools sort -@ ${task.cpus} -o ${file_id}.filter.bam \
+  && samtools index ${file_id}.filter.bam \
+  && samtools idxstats ${file_id}.filter.bam > \
+  ${file_id}.filter.stats
+
+if grep -q "Error " ${file_id}_filter.txt; then
   exit 1
 fi
 """
 }
 
+FILTER_FASTQ.into{FILTER_FASTQ_FASTQC;
+                  FILTER_FASTQ_HISAT;
+                  FILTER_FASTQ_POSTGENOME
+                  }
 
-/* mapping against human genome with hisat2 */
+//////////////////////////////
+/* Fastqc of filtered reads */
+//////////////////////////////
 
-params.index_hg38 = "/media/adminmanu/Stockage/HISAT2_index_hg38_tran/*.ht2"
+process fastqc_filter {
+  tag "$file_id"
+  publishDir "${params.output}/00_fastqc/filter/", mode: 'copy'
 
-log.info "index : ${params.index_hg38}"
+  input:
+  set file_id, file(reads) from FILTER_FASTQ_FASTQC
 
+  output:
+  set file_id, "*.{html,zip}" into OUTPUT_FASTQC_FILTER
 
-Channel
-  .fromPath ( params.index_hg38 )
-  .ifEmpty { error "Cannot find any index files matching: ${params.index_hg38}" }
-  .set { index_file_hg38 }
+  when:
+  params.do_fastqc
 
-process hisat2_human {
-  tag "$file_id"
+  """
+  fastqc ${file_id}* -t ${task.cpus}
+  """
+}
+
+
+///////////////////////////////////////////////////////////////////
+//////////////////////////// GENOME ////////////////////////////////
+///////////////////////////////////////////////////////////////////
 
+///////////////////////////////////////////////
+/* mapping against human genome with hisat2 */
+///////////////////////////////////////////////
+
+process hisat2_genome {
+  tag "$file_id"
+  publishDir "${params.output}/03_hisat2/", mode: 'copy'
 
   input:
-  set file_id, file(fastq_filtred) from rRNA_removed_reads
-  file index from index_file_hg38.toList()
+  set file_id, file(fastq_filtred) from FILTER_FASTQ_HISAT
+  file index from GENOME_INDEX.toList()
 
   output:
-  set file_id, "*.fastq.gz" into reads_non_aligned_hg38
-  set file_id, "*.bam" into reads_aligned_hg38
-  file "*.txt" into hisat_report
+  set file_id, "*_notaligned.fastq.gz" into HISAT_UNALIGNED
+  set file_id, "*.{bam,bam.bai}" into HISAT_ALIGNED
+  file "*.txt" into HISAT_LOG
 
   script:
   index_id = index[0]
@@ -105,79 +270,110 @@ process hisat2_human {
   }
 }
 """
-hisat2 -x ${index_id} -p ${task.cpus} \
--U ${fastq_filtred} --un-gz ${file_id}_notaligned.fastq.gz \
---end-to-end --rna-strandness 'F' \
-2> ${file_id}_hisat2_hg38.txt | samtools view -bS -F 4 -o ${file_id}.bam
-
-if grep -q "Error " ${file_id}_hisat2_hg38.txt; then
+hisat2 -x ${index_id} \
+  -p ${task.cpus} \
+  -U ${fastq_filtred[0]} \
+  --un-gz ${file_id}_notaligned.fastq.gz \
+  --rna-strandness ${params.strand} \
+  --dta \
+  2> ${file_id}_genome.txt \
+| samtools view -bS -F 4 - \
+| samtools sort -@ ${task.cpus} -o ${file_id}.bam \
+&& samtools index ${file_id}.bam
+
+if grep -q "ERR" ${file_id}_genome.txt; then
   exit 1
 fi
 """
 }
 
-process save_hisat {
+HISAT_ALIGNED.into{HISAT_ALIGNED_FASTQC;
+                   HISAT_ALIGNED_DEDUP}
+
+////////////////////////////
+/* Fastqc of genome reads */
+////////////////////////////
+
+process fastqc_genome {
   tag "$file_id"
-  publishDir "${params.output}/03_hisat2/", mode: 'copy'
+  publishDir "${params.output}/00_fastqc/genome/", mode: 'copy'
 
   input:
-  set file_id, file(fastq) from reads_non_aligned_hg38
+  set file_id, file(reads) from HISAT_ALIGNED_FASTQC
 
   output:
-  file "*" into saved_hisat
+  set file_id, "*.{html,zip}" into OUTPUT_FASTQC_GENOME
 
-  script:
-"""
-cat ${fastq} > ${file_id}_nonhuman.fastq.gz
-"""
+  when:
+  params.do_fastqc
+
+  """
+  fastqc ${file_id}* -t ${task.cpus}
+  """
 }
 
-/* sorting */
+////////////////////////////
+/* Deduplication of reads */
+////////////////////////////
 
-process index_bam {
+process dedup_genome {
   tag "$file_id"
-  publishDir "${params.output}/03_hisat2/", mode: 'copy'
+  publishDir "${params.output}/03_hisat2/dedup/", mode: 'copy'
 
   input:
-  set file_id, file(bam) from reads_aligned_hg38
-  file report from hisat_report
+  set file_id, file(bam) from HISAT_ALIGNED_DEDUP
 
   output:
-  set file_id, "*_sorted.{bam,bam.bai}" into sorted_bam_files
-  file "*.txt" into report_hisat2
+  set file_id, "*.{bam,bai}" into DEDUP_GENOME
+  file "*.log" into DEDUP_LOG
 
-  script:
-"""
-samtools sort -@ ${task.cpus} -O BAM -o ${file_id}_sorted.bam ${bam}
-samtools index ${file_id}_sorted.bam
+  when:
+  params.do_dedup
 
-cat ${report} > ${file_id}_hg38_hisat2.txt
-"""
+  """
+  umi_tools dedup -I ${bam[0]} \
+  -S ${file_id}.dedup.bam \
+  2> ${file_id}_dedup.log
+  """
 }
 
+if (! params.do_dedup) {
+  HISAT_ALIGNED_DEDUP.into{DEDUP_GENOME}
+}
 
-/* HTseq */
+DEDUP_GENOME.into{DEDUP_GENOME_HTSEQ;
+                  DEDUP_GENOME_RNASEQC
+                  }
 
-params.gtf = "$baseDir/data/annotation/*.gtf"
-log.info "gtf files : ${params.gtf}"
+///////////
+/* HTseq */
+///////////
 
-Channel
-  .fromPath( params.gtf )
-  .ifEmpty { error "Cannot find any gtf file matching: ${params.gtf}" }
-  .set { gtf_file }
+process sort_bam {
+  tag "$file_id"
+
+  input:
+  set file_id, file(bam) from DEDUP_GENOME_HTSEQ
+
+  output:
+  set file_id, "*_htseq.bam" into SORTED_NAME_GENOME
+
+  script:
+"""
+samtools sort -@ ${task.cpus} -n -O BAM -o ${file_id}_htseq.bam ${bam[0]}
+"""
+}
 
 process counting {
   tag "$file_id"
   publishDir "${params.output}/04_HTseq/", mode: 'copy'
-  errorStrategy 'retry'
-  maxRetries 2
 
   input:
-  set file_id, file(bam) from sorted_bam_files
-  file gtf from gtf_file.toList()
+  set file_id, file(bam) from SORTED_NAME_GENOME
+  file gtf from GTF_FILE.toList()
 
   output:
-  file "*.count" into count_files
+  file "*.count" into HTSEQ_COUNT
 
   script:
 """
@@ -187,7 +383,6 @@ htseq-count ${bam[0]} ${gtf} \
   -s yes \
   -t CDS \
   -i gene_id \
-  -r pos \
   -f bam \
 > ${file_id}_CDS.count
 
@@ -197,9 +392,159 @@ htseq-count ${bam[0]} ${gtf} \
   -s yes \
   -t exon \
   -i gene_id \
-  -r pos \
   -f bam \
-> ${file_id}_exon.count
+> ${file_id}.count
+
 """
 }
+
+///////////////
+/* RNASEQ QC */
+///////////////
+
+process rnaseq_qc {
+  tag "$file_id"
+  publishDir "${params.output}/06_RNAseqQC/", mode: 'copy'
+
+  input:
+  set file_id, file(bam) from DEDUP_GENOME_RNASEQC
+  file (gtf) from GTF_COLLAPSE.collect()
+
+  output:
+  file "*" into RNASEQC_OUTPUT
+
+  script:
+"""
+rnaseqc ${gtf} ${bam[0]} -s ${file_id} ./
+"""
+}
+
+////////////////////////////////////////////////////////////////////////
+//////////////////////////// POST GENOME ///////////////////////////////
+////////////////////////////////////////////////////////////////////////
+
+/////////////////////////
+/* HISAT2 POST_GENOMIC */
+/////////////////////////
+
+process hisat2_postGenomic {
+  tag "$file_id"
+  publishDir "${params.output}/05_post_genome_hisat2/", mode: 'copy'
+
+  input:
+  set file_id, file(fastq_unaligned) from FILTER_FASTQ_POSTGENOME
+  file index2 from POSTGENOME_INDEX.collect()
+
+  output:
+  set file_id, "*.{bam,bam.bai}" into POSTGENOME_ALIGNED
+  file "*_postgenome.txt" into POSTGENOME_LOG
+
+  when:
+  params.do_postgenome
+
+  script:
+  index2_id = index2[0]
+  for (index2_file in index2) {
+    if (index2_file =~ /.*\.1\.ht2/ && !(index2_file =~ /.*\.rev\.1\.ht2/)) {
+      index2_id = ( index2_file =~ /(.*)\.1\.ht2/)[0][1]
+    }
+  }
+"""
+hisat2 -x ${index2_id} \
+  -p ${task.cpus} \
+  -U ${fastq_unaligned[0]} \
+  --rna-strandness ${params.strand} \
+  --dta \
+  2> ${file_id}_postgenome.txt \
+| samtools view -bS -F 4 - \
+| samtools sort -@ ${task.cpus} -o ${file_id}.bam \
+&& samtools index ${file_id}.bam
+
+if grep -q "ERR" ${file_id}_postgenome.txt; then
+  exit 1
+fi
+"""
+}
+
+POSTGENOME_ALIGNED.into{POSTGENOME_ALIGNED_FASTQC;
+                        POSTGENOME_ALIGNED_DEDUP}
+
+////////////////////////////
+/* Deduplication of reads */
+////////////////////////////
+
+process dedup_postgenome {
+  tag "$file_id"
+  publishDir "${params.output}/05_post_genome_hisat2/dedup/", mode: 'copy'
+
+  input:
+  set file_id, file(bam) from POSTGENOME_ALIGNED_DEDUP
+
+  output:
+  set file_id, "*.{bam,bai}" into DEDUP_POSTGENOME
+  file "*.log" into DEDUP_POSTGENOME_LOG
+
+  when:
+  params.do_dedup
+
+  """
+  umi_tools dedup -I ${bam[0]} \
+  -S ${file_id}.dedup.bam \
+  2> ${file_id}_dedup.log
+  """
+}
+
+process fastqc_postgenome {
+  tag "$file_id"
+  publishDir "${params.output}/00_fastqc/postgenome/", mode: 'copy'
+
+  input:
+  set file_id, file(reads) from POSTGENOME_ALIGNED_FASTQC
+
+  output:
+  set file_id, "*.{html,zip}" into OUTPUT_FASTQC_POSTGENOME
+
+  when:
+  params.do_fastqc
+
+  """
+  fastqc ${file_id}* -t ${task.cpus}
+  """
+}
+
+////////////////////////////////////////////////////////////////////////
+//////////////////////////// POST PROCESS //////////////////////////////
+////////////////////////////////////////////////////////////////////////
+
+
+/////////////
+/* MultiQC */
+/////////////
+
+
+process multiqc {
+  tag "multiqc"
+  publishDir "${params.output}/multiqc", mode: 'copy'
+
+  input:
+  file ('fastqc/*') from OUTPUT_FASTQC_RAW.collect().ifEmpty([])
+  file ('*') from CUTADAPT_LOG.collect().ifEmpty([])
+  file ('fastqc/*') from OUTPUT_FASTQC_CUT.collect().ifEmpty([])
+  file ('*') from FILTER_LOG.collect().ifEmpty([])
+  file ('fastqc/*') from OUTPUT_FASTQC_FILTER.collect().ifEmpty([])
+  file ('*') from HISAT_LOG.collect().ifEmpty([])
+  file ('fastqc/*') from OUTPUT_FASTQC_GENOME.collect().ifEmpty([])
+  file ('*') from HTSEQ_COUNT.collect().ifEmpty([])
+  file ('*') from POSTGENOME_LOG.collect().ifEmpty([])
+  file ('fastqc/*') from OUTPUT_FASTQC_POSTGENOME.collect().ifEmpty([])
+  file ('*') from RNASEQC_OUTPUT.collect().ifEmpty([])
+
+  output:
+  file "multiqc_report.html" into multiqc_report
+  file "multiqc_data"
+
+  script:
+"""
+multiqc ./
 """
 }
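
For reference, a minimal sketch of how the updated pipeline might be launched from the command line. The parameter names come from the params block added in this change, and the values shown are simply the declared defaults (the paths are placeholders to adjust to the actual data layout):

  nextflow run src/RibosomeProfiling.nf \
    --fastq_raw 'data/fastq/*.fastq.gz' \
    --filter 'data/filter/human_rRNA_tRNA/*.bt2' \
    --index_genome 'data/genome/*.ht2' \
    --gtf 'data/annotation/*.gtf' \
    --strand 'F' \
    -resume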