From bbc74d28baf8569ef60f7c4ce61c0857cd620c3b Mon Sep 17 00:00:00 2001 From: Emmanuel Labaronne <emmanuel.labaronne@ens-lyon.fr> Date: Fri, 25 Sep 2020 12:35:52 +0200 Subject: [PATCH] src/RNAseq.nf : adapt htseq and add post genomic alignement --- src/RNAseq.config | 4 ++ src/RNAseq.nf | 108 ++++++++++++++++++++++++++++++++++------------ 2 files changed, 84 insertions(+), 28 deletions(-) diff --git a/src/RNAseq.config b/src/RNAseq.config index d15b85fd..a4be61a6 100644 --- a/src/RNAseq.config +++ b/src/RNAseq.config @@ -104,6 +104,10 @@ profiles { container = "lbmc/htseq:0.11.2" cpus = 1 } + withName: hisat2_postGenomic { + cpus = 4 + container = "lbmc/hisat2:2.1.0" + } withName: multiqc { container = "ewels/multiqc:1.9" cpus = 1 diff --git a/src/RNAseq.nf b/src/RNAseq.nf index e96e2fe5..87242145 100644 --- a/src/RNAseq.nf +++ b/src/RNAseq.nf @@ -5,18 +5,24 @@ params.fastq_raw = "data/fastq/*{_R1,_R2}.fastq.gz" params.output = "results" params.do_fastqc = true -params.script_cov = "src/norm_coverage.sh" +//params.script_cov = "src/norm_coverage.sh" params.filter = "data/filter/human_rRNA_tRNA/*.bt2" params.index_genome = "data/genome/*.ht2" params.do_dedup = true +params.gtf = "data/annotation/*.gtf" +params.index_postgenome = "data/post_genome/*.ht2" +params.do_postgenome = true log.info "input raw : ${params.fastq_raw}" log.info "outut directory : ${params.output}" log.info "filter index files : ${params.filter}" log.info "genome index : ${params.index_genome}" +log.info "gtf file : ${params.gtf}" +log.info "post-genome index : ${params.index_postgenome}" log.info "" log.info "do fastqc ? : ${params.do_fastqc}" log.info "do deduplication ? : ${params.do_dedup}" +log.info "do post genome alignement ? : ${params.do_postgenome}" log.info "" Channel @@ -35,6 +41,16 @@ Channel .ifEmpty { error "Cannot find any index files matching: ${params.index_genome}" } .set { GENOME_INDEX } +Channel + .fromPath( params.gtf ) + .ifEmpty { error "Cannot find any gtf file matching: ${params.gtf}" } + .set { GTF_FILE } + +Channel + .fromPath ( params.index_genome ) + .ifEmpty { error "Cannot find any index files matching: ${params.index_genome}" } + .set { POSTGENOME_INDEX } + /* Fastqc of raw input */ process fastqc_raw { @@ -66,7 +82,9 @@ process trimming { output: set file_id, "*cut_{R1,R2}.fastq.gz" into CUTADAPT_OUTPUT - file "*.txt" into CUTADAPT_LOG + file "*first_report.txt" into CUTADAPT_LOG + file "*{second,third}_report.txt" into CUTADAPT_LOG_2 + script: """ @@ -79,7 +97,7 @@ process trimming { ${reads[0]} ${reads[1]} > ${file_id}_first_report.txt cutadapt -j ${task.cpus} \ - -u -10 \ + -a "A{100}" \ -o ${file_id}_cut_R1.fastq.gz \ ${file_id}_tmp_R1.fastq.gz \ > ${file_id}_second_report.txt @@ -140,13 +158,13 @@ process rRNA_removal { bowtie2 --sensitive -p ${task.cpus} -x ${index_id} \ -1 ${reads[0]} -2 ${reads[1]} --no-unal \ --un-conc-gz ${file_id}_R%.fastq.gz 2> \ -${file_id}_bowtie2_report.txt | samtools view -bS - \ +${file_id}_filter.txt | samtools view -bS - \ | samtools sort -@ ${task.cpus} -o ${file_id}.filter.bam \ && samtools index ${file_id}.filter.bam \ && samtools idxstats ${file_id}.filter.bam > \ ${file_id}.filter.stats -if grep -q "Error " ${file_id}_bowtie2_report.txt; then +if grep -q "Error " ${file_id}_filter.txt; then exit 1 fi """ @@ -185,7 +203,7 @@ process hisat2_human { file index from GENOME_INDEX.toList() output: - file "*.fastq.gz" into HISAT_UNALIGNED + set file_id, "*.fastq.gz" into HISAT_UNALIGNED set file_id, "*.{bam,bam.bai}" into HISAT_ALIGNED file "*.txt" into HISAT_LOG @@ -203,12 +221,12 @@ hisat2 -x ${index_id} \ -2 ${fastq_filtred[1]} \ --un-conc-gz ${file_id}_notaligned_R%.fastq.gz \ --rna-strandness 'F' \ - 2> ${file_id}_hisat2_hg38.txt \ + 2> ${file_id}.txt \ | samtools view -bS -F 4 - \ | samtools sort -@ ${task.cpus} -o ${file_id}.bam \ && samtools index ${file_id}.bam -if grep -q "ERR" ${file_id}_hisat2_hg38.txt; then +if grep -q "ERR" ${file_id}.txt; then exit 1 fi """ @@ -248,7 +266,7 @@ process dedup_genome { set file_id, file(bam) from HISAT_ALIGNED_DEDUP output: - set file_id, "*.bam" into DEDUP_GENOME + set file_id, "*.{bam, bai}" into DEDUP_GENOME file "*.log" into DEDUP_LOG when: @@ -265,19 +283,16 @@ if (! params.do_dedup) { HISAT_ALIGNED_DEDUP.into{DEDUP_GENOME} } - - - -/* HTseq +/* HTseq */ process sort_bam { tag "$file_id" input: - set file_id, file(bam) from sorted_bam_htseq + set file_id, file(bam) from DEDUP_GENOME output: - set file_id, "*_htseq.bam" into sorted_bam_files_2 + set file_id, "*_htseq.bam" into SORTED_NAME_GENOME script: """ @@ -285,24 +300,16 @@ samtools sort -@ ${task.cpus} -n -O BAM -o ${file_id}_htseq.bam ${bam[0]} """ } -params.gtf = "$baseDir/data/annotation/*.gtf" -log.info "gtf files : ${params.gtf}" - -Channel - .fromPath( params.gtf ) - .ifEmpty { error "Cannot find any gtf file matching: ${params.gtf}" } - .set { gtf_file } - process counting { tag "$file_id" publishDir "${params.output}/04_HTseq/", mode: 'copy' input: - set file_id, file(bam) from sorted_bam_files_2 - file gtf from gtf_file.toList() + set file_id, file(bam) from SORTED_NAME_GENOME + file gtf from GTF_FILE.toList() output: - file "*.count" into count_files + file "*.count" into HTSEQ_COUNT script: """ @@ -322,11 +329,55 @@ htseq-count ${bam[0]} ${gtf} \ -t exon \ -i gene_id \ -f bam \ -> ${file_id}_exon.count +> ${file_id}.count """ } +/* HISAT2 POST_GENOMIC */ + +process hisat2_postGenomic { + tag "$file_id" + publishDir "${params.output}/05_post_genome_hisat2/", mode: 'copy' + + input: + set file_id, file(fastq_unaligned) from HISAT_UNALIGNED + file index2 from POSTGENOME_INDEX.toList() + + output: + set file_id, "*.{bam,bam.bai}" into POSTGENOME_ALIGNED + file "*.txt" into POSTGENOME_LOG + + when: + params.do_postgenome + + script: + index2_id = index2[0] + for (index2_file in index2) { + if (index2_file =~ /.*\.1\.ht2/ && !(index2_file =~ /.*\.rev\.1\.ht2/)) { + index2_id = ( index2_file =~ /(.*)\.1\.ht2/)[0][1] + } + } +""" +hisat2 -x ${index2_id} \ + -p ${task.cpus} \ + -1 ${fastq_unaligned[0]} \ + -2 ${fastq_unaligned[1]} \ + --rna-strandness 'F' \ + 2> ${file_id}.txt \ +| samtools view -bS -F 4 - \ +| samtools sort -@ ${task.cpus} -o ${file_id}.bam \ +&& samtools index ${file_id}.bam + +if grep -q "ERR" ${file_id}.txt; then + exit 1 +fi +""" +} + +POSTGENOME_ALIGNED.into{POSTGENOME_ALIGNED_FASTQC; + POSTGENOME_ALIGNED_DEDUP} +/* Channel .fromFilePairs(params.script_cov) .ifEmpty { error "Cannot find any file matching: ${params.script_cov}" } @@ -368,7 +419,8 @@ process multiqc { file ('fastqc/*') from OUTPUT_FASTQC_FILTER.collect().ifEmpty([]) file ('*') from HISAT_LOG.collect().ifEmpty([]) file ('fastqc/*') from OUTPUT_FASTQC_GENOME.collect().ifEmpty([]) - + file ('*') from HTSEQ_COUNT.collect().ifEmpty([]) + file ('*') from POSTGENOME_LOG.collect().ifEmpty([]) output: file "multiqc_report.html" into multiqc_report -- GitLab