Select Git revision
Forked from
LBMC / nextflow
Source project has a limited visibility.
RNAseq.nf 13.32 KiB
/*
* RNAseq Analysis pipeline
*/
//////////////////////////////////////////////////////
// PARAMETERS //
//////////////////////////////////////////////////////
///////////////////////
// PARMS : FILE PATH //
///////////////////////
params.fastq_raw = "data/fastq/*{_R1,_R2}.fastq.gz"
params.output = "results"
params.filter = "data/filter/human_rRNA_tRNA/*.bt2"
params.index_genome = "data/genome/*.ht2"
params.gtf = "data/annotation/*.gtf"
params.gtf_collapse = "data/annotation/*.gtf"
params.index_postgenome = "data/post_genome/*.ht2"
/////////////////////
// PARMS : OPTIONS //
/////////////////////
params.do_fastqc = true
params.do_dedup = true
params.do_postgenome = true
////////////////////////
// ADAPTORS SEQUENCES //
////////////////////////
params.adaptorR1 = "AGATCGGAAGAGCACACGTCTGAACTCCAGTCA"
params.adaptorR2 = "AGATCGGAAGAGCGTCGTGTAGGGAAAGAGTGT"
//////////////
// LOG INFO //
//////////////
log.info "input raw : ${params.fastq_raw}"
log.info "outut directory : ${params.output}"
log.info "filter index files : ${params.filter}"
log.info "genome index : ${params.index_genome}"
log.info "gtf file : ${params.gtf}"
log.info "collapsed gtf file for rnaseqc: ${params.gtf_collapse}"
log.info "post-genome index : ${params.index_postgenome}"
log.info ""
log.info "do fastqc ? : ${params.do_fastqc}"
log.info "do deduplication ? : ${params.do_dedup}"
log.info "do post genome alignement ? : ${params.do_postgenome}"
log.info ""
//////////////
// CHANNELS //
//////////////
Channel
.fromFilePairs(params.fastq_raw)
.ifEmpty { error "Cannot find any file matching: ${params.fastq_raw}" }
.into {INPUT_FASTQC_RAW;
INPUT_CUTADAPT}
Channel
.fromPath( params.filter )
.ifEmpty { error "Cannot find any index files matching: ${params.filter}" }
.set { FILTER_INDEX }
Channel
.fromPath ( params.index_genome )
.ifEmpty { error "Cannot find any index files matching: ${params.index_genome}" }
.set { GENOME_INDEX }
Channel
.fromPath( params.gtf )
.ifEmpty { error "Cannot find any gtf file matching: ${params.gtf}" }
.set { GTF_FILE }
Channel
.fromPath( params.gtf_collapse )
.ifEmpty { error "Cannot find any gtf file matching: ${params.gtf_collapse}" }
.set { GTF_COLLAPSE }
Channel
.fromPath ( params.index_postgenome )
.ifEmpty { error "Cannot find any index files matching: ${params.index_postgenome}" }
.set { POSTGENOME_INDEX }
//////////////////////////////////////////////////////
// PROCESS //
//////////////////////////////////////////////////////
////////////////////////////////////////////////////////////////////////
//////////////////////////// PRE PROCESS ///////////////////////////////
////////////////////////////////////////////////////////////////////////
/////////////////////////
/* Fastqc of raw input */
/////////////////////////
process fastqc_raw {
tag "$file_id"
publishDir "${params.output}/00_fastqc/raw/", mode: 'copy'
input:
set file_id, file(reads) from INPUT_FASTQC_RAW
output:
file "*_fastqc.{html,zip}" into OUTPUT_FASTQC_RAW
when:
params.do_fastqc
"""
fastqc ${file_id}* -t ${task.cpus}
"""
}
///////////////////////
/* Trimming adaptors */
///////////////////////
process trimming {
tag "$file_id"
publishDir "${params.output}/01_cutadapt/", mode: 'copy'
echo true
input:
set file_id, file(reads) from INPUT_CUTADAPT
output:
set file_id, "*cut_{R1,R2}.fastq.gz" into CUTADAPT_OUTPUT
file "*first_report.txt" into CUTADAPT_LOG
file "*{second,third}_report.txt" into CUTADAPT_LOG_2
script:
"""
cutadapt -j ${task.cpus} \
-a "AGATCGGAAGAGCACACGTCTGAACTCCAGTCA" \
-A "AGATCGGAAGAGCGTCGTGTAGGGAAAGAGTGT" \
-o ${file_id}_tmp_R1.fastq.gz \
-p ${file_id}_tmp_R2.fastq.gz \
--minimum-length 70 \
${reads[0]} ${reads[1]} > ${file_id}_first_report.txt
cutadapt -j ${task.cpus} \
-a "A{100}" \
-o ${file_id}_cut_R1.fastq.gz \
${file_id}_tmp_R1.fastq.gz \
> ${file_id}_second_report.txt
cutadapt -j ${task.cpus} \
-u -13 \
-o ${file_id}_cut_R2.fastq.gz \
${file_id}_tmp_R2.fastq.gz \
> ${file_id}_third_report.txt
"""
}
CUTADAPT_OUTPUT.into{CUTADAPT_OUTPUT_FASTQC;
CUTADAPT_OUTPUT_FILTER}
/////////////////////////
/* Fastqc of raw input */
/////////////////////////
process fastqc_cut {
tag "$file_id"
publishDir "${params.output}/00_fastqc/cut/", mode: 'copy'
input:
set file_id, file(reads) from CUTADAPT_OUTPUT_FASTQC
output:
file "*.{html,zip}" into OUTPUT_FASTQC_CUT
when:
params.do_fastqc
"""
fastqc ${file_id}* -t ${task.cpus}
"""
}
/////////////////////////////
/* rRNA and tRNA filtering */
/////////////////////////////
process rRNA_removal {
tag "$file_id"
publishDir "${params.output}/02_rRNA_depletion/", mode: 'copy'
input:
set file_id, file(reads) from CUTADAPT_OUTPUT_FILTER
file index from FILTER_INDEX.toList()
output:
set file_id, "*.fastq.gz" into FILTER_FASTQ
set file_id, "*.bam*" into FILTER_BAM
file "*.{txt,stats}" into FILTER_LOG
script:
index_id = index[0]
for (index_file in index) {
if (index_file =~ /.*\.1\.bt2/ && !(index_file =~ /.*\.rev\.1\.bt2/)) {
index_id = ( index_file =~ /(.*)\.1\.bt2/)[0][1]
}
}
"""
bowtie2 --sensitive -p ${task.cpus} -x ${index_id} \
-1 ${reads[0]} -2 ${reads[1]} --no-unal \
--un-conc-gz ${file_id}_R%.fastq.gz 2> \
${file_id}_filter.txt | samtools view -bS - \
| samtools sort -@ ${task.cpus} -o ${file_id}.filter.bam \
&& samtools index ${file_id}.filter.bam \
&& samtools idxstats ${file_id}.filter.bam > \
${file_id}.filter.stats
if grep -q "Error " ${file_id}_filter.txt; then
exit 1
fi
"""
}
FILTER_FASTQ.into{FILTER_FASTQ_FASTQC;
FILTER_FASTQ_HISAT;
FILTER_FASTQ_POSTGENOME
}
/////////////////////////////
/* Fastqc of filtred reads */
/////////////////////////////
process fastqc_filter {
tag "$file_id"
publishDir "${params.output}/00_fastqc/filter/", mode: 'copy'
input:
set file_id, file(reads) from FILTER_FASTQ_FASTQC
output:
set file_id, "*.{html,zip}" into OUTPUT_FASTQC_FILTER
when:
params.do_fastqc
"""
fastqc ${file_id}* -t ${task.cpus}
"""
}
///////////////////////////////////////////////////////////////////
//////////////////////////// GENOME ///////////////////////////////
///////////////////////////////////////////////////////////////////
///////////////////////////////////////////////
/* mapping against human genome with hisat2 */
///////////////////////////////////////////////
process hisat2_human {
tag "$file_id"
publishDir "${params.output}/03_hisat2/", mode: 'copy'
input:
set file_id, file(fastq_filtred) from FILTER_FASTQ_HISAT
file index from GENOME_INDEX.toList()
output:
set file_id, "*_notaligned_R{1,2}.fastq.gz" into HISAT_UNALIGNED
set file_id, "*.{bam,bam.bai}" into HISAT_ALIGNED
file "*.txt" into HISAT_LOG
script:
index_id = index[0]
for (index_file in index) {
if (index_file =~ /.*\.1\.ht2/ && !(index_file =~ /.*\.rev\.1\.ht2/)) {
index_id = ( index_file =~ /(.*)\.1\.ht2/)[0][1]
}
}
"""
hisat2 -x ${index_id} \
-p ${task.cpus} \
-1 ${fastq_filtred[0]} \
-2 ${fastq_filtred[1]} \
--un-conc-gz ${file_id}_notaligned_R%.fastq.gz \
--rna-strandness 'FR' \
--dta \
2> ${file_id}_genome.txt \
| samtools view -bS -F 4 - \
| samtools sort -@ ${task.cpus} -o ${file_id}.bam \
&& samtools index ${file_id}.bam
if grep -q "ERR" ${file_id}.txt; then
exit 1
fi
"""
}
HISAT_ALIGNED.into{HISAT_ALIGNED_FASTQC;
HISAT_ALIGNED_DEDUP}
////////////////////////////
/* Fastqc of genome reads */
////////////////////////////
process fastqc_genome {
tag "$file_id"
publishDir "${params.output}/00_fastqc/genome/", mode: 'copy'
input:
set file_id, file(reads) from HISAT_ALIGNED_FASTQC
output:
set file_id, "*.{html,zip}" into OUTPUT_FASTQC_GENOME
when:
params.do_fastqc
"""
fastqc ${file_id}* -t ${task.cpus}
"""
}
////////////////////////////
/* Deduplication of reads */
////////////////////////////
process dedup_genome {
tag "$file_id"
publishDir "${params.output}/03_hisat2/dedup/", mode: 'copy'
input:
set file_id, file(bam) from HISAT_ALIGNED_DEDUP
output:
set file_id, "*.{bam, bai}" into DEDUP_GENOME
file "*.log" into DEDUP_LOG
when:
params.do_dedup
"""
umi_tools dedup -I ${bam}[0] \
-S ${file_id}.dedup.bam \
2> dedup.log
"""
}
if (! params.do_dedup) {
HISAT_ALIGNED_DEDUP.into{DEDUP_GENOME}
}
DEDUP_GENOME.into{DEDUP_GENOME_HTSEQ;
DEDUP_GENOME_RNASEQC
}
///////////
/* HTseq */
///////////
process sort_bam {
tag "$file_id"
input:
set file_id, file(bam) from DEDUP_GENOME_HTSEQ
output:
set file_id, "*_htseq.bam" into SORTED_NAME_GENOME
script:
"""
samtools sort -@ ${task.cpus} -n -O BAM -o ${file_id}_htseq.bam ${bam[0]}
"""
}
process counting {
tag "$file_id"
publishDir "${params.output}/04_HTseq/", mode: 'copy'
input:
set file_id, file(bam) from SORTED_NAME_GENOME
file gtf from GTF_FILE.toList()
output:
file "*.count" into HTSEQ_COUNT
script:
"""
htseq-count ${bam[0]} ${gtf} \
--mode=intersection-nonempty \
-a 10 \
-s yes \
-t CDS \
-i gene_id \
-f bam \
> ${file_id}_CDS.count
htseq-count ${bam[0]} ${gtf} \
--mode=intersection-nonempty \
-a 10 \
-s yes \
-t exon \
-i gene_id \
-f bam \
> ${file_id}.count
"""
}
///////////////
/* RNASEQ QC */
///////////////
process rnaseq_qc {
tag "$file_id"
publishDir "${params.output}/06_RNAseqQC/", mode: 'copy'
input:
set file_id, file(bam) from DEDUP_GENOME_RNASEQC
file (gtf) from GTF_COLLAPSE.collect()
output:
file "*" into RNASEQC_OUTPUT
script:
"""
rnaseqc ${gtf} ${bam[0]} -s ${file_id} ./ \
--stranded 'FR'
"""
}
////////////////////////////////////////////////////////////////////////
//////////////////////////// POST GENOME ///////////////////////////////
////////////////////////////////////////////////////////////////////////
/////////////////////////
/* HISAT2 POST_GENOMIC */
/////////////////////////
process hisat2_postGenomic {
tag "$file_id"
publishDir "${params.output}/05_post_genome_hisat2/", mode: 'copy'
input:
set file_id, file(fastq_unaligned) from FILTER_FASTQ_POSTGENOME
file index2 from POSTGENOME_INDEX.collect()
output:
set file_id, "*.{bam,bam.bai}" into POSTGENOME_ALIGNED
file "*_postgenome.txt" into POSTGENOME_LOG
when:
params.do_postgenome
script:
index2_id = index2[0]
for (index2_file in index2) {
if (index2_file =~ /.*\.1\.ht2/ && !(index2_file =~ /.*\.rev\.1\.ht2/)) {
index2_id = ( index2_file =~ /(.*)\.1\.ht2/)[0][1]
}
}
"""
hisat2 -x ${index2_id} \
-p ${task.cpus} \
-1 ${fastq_unaligned[0]} \
-2 ${fastq_unaligned[1]} \
--rna-strandness 'FR' \
--dta\
2> ${file_id}_postgenome.txt \
| samtools view -bS -F 4 - \
| samtools sort -@ ${task.cpus} -o ${file_id}.bam \
&& samtools index ${file_id}.bam
if grep -q "ERR" ${file_id}_postgenome.txt; then
exit 1
fi
"""
}
POSTGENOME_ALIGNED.into{POSTGENOME_ALIGNED_FASTQC;
POSTGENOME_ALIGNED_DEDUP}
////////////////////////////
/* Deduplication of reads */
////////////////////////////
process dedup_postgenome {
tag "$file_id"
publishDir "${params.output}/05_post_genome_hisat2/dedup/", mode: 'copy'
input:
set file_id, file(bam) from POSTGENOME_ALIGNED_DEDUP
output:
set file_id, "*.{bam, bai}" into DEDUP_POSTGENOME
file "*.log" into DEDUP_POSTGENOME_LOG
when:
params.do_dedup
"""
umi_tools dedup -I ${bam}[0] \
-S ${file_id}.dedup.bam \
2> dedup.log
"""
}
process fastqc_postgenome {
tag "$file_id"
publishDir "${params.output}/00_fastqc/postgenome/", mode: 'copy'
input:
set file_id, file(reads) from POSTGENOME_ALIGNED_FASTQC
output:
set file_id, "*.{html,zip}" into OUTPUT_FASTQC_POSTGENOME
when:
params.do_fastqc
"""
fastqc ${file_id}* -t ${task.cpus}
"""
}
////////////////////////////////////////////////////////////////////////
//////////////////////////// POST PROCESS //////////////////////////////
////////////////////////////////////////////////////////////////////////
/////////////
/* MultiQC */
/////////////
process multiqc {
tag "multiqc"
publishDir "${params.output}/multiqc", mode: 'copy'
input:
file ('fastqc/*') from OUTPUT_FASTQC_RAW.collect().ifEmpty([])
file ('*') from CUTADAPT_LOG.collect().ifEmpty([])
file ('fastqc/*') from OUTPUT_FASTQC_CUT.collect().ifEmpty([])
file ('*') from FILTER_LOG.collect().ifEmpty([])
file ('fastqc/*') from OUTPUT_FASTQC_FILTER.collect().ifEmpty([])
file ('*') from HISAT_LOG.collect().ifEmpty([])
file ('fastqc/*') from OUTPUT_FASTQC_GENOME.collect().ifEmpty([])
file ('*') from HTSEQ_COUNT.collect().ifEmpty([])
file ('*') from POSTGENOME_LOG.collect().ifEmpty([])
file ('fastqc/*') from OUTPUT_FASTQC_POSTGENOME.collect().ifEmpty([])
file ('*') from RNASEQC_OUTPUT.collect().ifEmpty([])
output:
file "multiqc_report.html" into multiqc_report
file "multiqc_data"
script:
"""
multiqc ./
"""
}