From 5c3896e28cacffdf444006981eed9e75b1198ec8 Mon Sep 17 00:00:00 2001 From: Emmanuel Labaronne <emmanuel.labaronne@ens-lyon.fr> Date: Thu, 12 Nov 2020 12:56:01 +0100 Subject: [PATCH] src/RibosomeProfiling.nf : genome mapping with star --- src/RNAseq.config | 6 ++- src/RibosomeProfiling.nf | 88 +++++++++++++++++++++++----------------- 2 files changed, 56 insertions(+), 38 deletions(-) diff --git a/src/RNAseq.config b/src/RNAseq.config index 00c35fde..66b33150 100644 --- a/src/RNAseq.config +++ b/src/RNAseq.config @@ -211,9 +211,13 @@ profiles { container = "lbmc/bowtie2:2.3.4.1" cpus = 8 } + withName: index_genome { + container = "lbmc/star:2.7.3a" + cpus = 16 + } withName: hisat2_genome { + container = "lbmc/star:2.7.3a" cpus = 8 - container = "lbmc/hisat2:2.1.0" } withName: fastqc_genome { container = "lbmc/fastqc:0.11.5" diff --git a/src/RibosomeProfiling.nf b/src/RibosomeProfiling.nf index bb19b1c5..768ab274 100644 --- a/src/RibosomeProfiling.nf +++ b/src/RibosomeProfiling.nf @@ -14,6 +14,7 @@ params.fastq_raw = "data/fastq/*.fastq.gz" params.output = "results" params.filter = "data/filter/human_rRNA_tRNA/*.bt2" params.index_genome = "data/genome/*.ht2" +params.fasta_genome = "data/genome/*.fasta" params.gtf = "data/annotation/*.gtf" params.gtf_collapse = "data/annotation/*.gtf" params.index_postgenome = "data/post_genome/*.ht2" @@ -41,7 +42,7 @@ params.strand = "F" log.info "input raw : ${params.fastq_raw}" log.info "outut directory : ${params.output}" log.info "filter index files : ${params.filter}" -log.info "genome index : ${params.index_genome}" +log.info "genome fasta : ${params.fasta_genome}" log.info "gtf file : ${params.gtf}" log.info "collapsed gtf file for rnaseqc: ${params.gtf_collapse}" log.info "post-genome index : ${params.index_postgenome}" @@ -72,13 +73,14 @@ Channel Channel .fromPath ( params.index_genome ) - .ifEmpty { error "Cannot find any index files matching: ${params.index_genome}" } - .set { GENOME_INDEX } + .ifEmpty { error "Cannot find any index files matching: ${params.fasta_genome}" } + .set { INDEX_GENOME } Channel .fromPath( params.gtf ) .ifEmpty { error "Cannot find any gtf file matching: ${params.gtf}" } - .set { GTF_FILE } + .into { GTF_FILE ; + GTF_INDEXING} Channel .fromPath( params.gtf_collapse ) @@ -248,46 +250,58 @@ process fastqc_filter { /////////////////////////////////////////////// /* mapping against human genome with hisat2 */ /////////////////////////////////////////////// +/*process index_genome { + tag "$fasta.baseName" + publishDir "results/star_index/", mode: 'copy' + + input: + file fasta from FASTA_GENOME + file annotation from GTF_INDEXING + + output: + file "*" into INDEX_FILES + + script: +""" +STAR --runThreadN ${task.cpus} --runMode genomeGenerate \ +--genomeDir ./ \ +--genomeFastaFiles ${fasta} \ +--sjdbGTFfile ${annotation} \ +--genomeSAindexNbases 14 # min(14, log2(GenomeLength)/2 - 1) +""" +}*/ process hisat2_genome { tag "$file_id" publishDir "${params.output}/03_hisat2/", mode: 'copy' input: - set file_id, file(fastq_filtred) from FILTER_FASTQ_HISAT - file index from GENOME_INDEX.toList() + set file_id, file(reads) from FILTER_FASTQ_HISAT + file index from INDEX_GENOME.collect() output: - set file_id, "*_notaligned.fastq.gz" into HISAT_UNALIGNED - set file_id, "*.{bam,bam.bai}" into HISAT_ALIGNED - file "*.txt" into HISAT_LOG + set file_id, "*Aligned.sortedByCoord.out.bam" into HISAT_ALIGNED + file "*.out" into HISAT_LOG + set file_id, "*Aligned.toTranscriptome.out.bam" into ALIGNED_TRANSCRIPTOME script: - index_id = index[0] - for (index_file in index) { - if (index_file =~ /.*\.1\.ht2/ && !(index_file =~ /.*\.rev\.1\.ht2/)) { - index_id = ( index_file =~ /(.*)\.1\.ht2/)[0][1] - } - } -""" -hisat2 -x ${index_id} \ - -p ${task.cpus} \ - -U ${fastq_filtred[0]} \ - --un-gz ${file_id}_notaligned.fastq.gz \ - --rna-strandness ${params.strand} \ - --dta \ - --no-softclip \ - --trim5 1\ - --trim3 1\ - 2> ${file_id}_genome.txt \ -| samtools view -bS -F 4 - \ -| samtools sort -@ ${task.cpus} -o ${file_id}_genome.bam \ -&& samtools index ${file_id}_genome.bam - -if grep -q "ERR" ${file_id}.txt; then - exit 1 -fi -""" + """ + mkdir -p index + mv ${index} index/ + + STAR --outFilterType BySJout \ + --runThreadN ${task.cpus} \ + --outFilterMismatchNmax 2 \ + --genomeDir index\ + --readFilesIn ${reads} \ + --readFilesCommand zcat \ + --outFileNamePrefix ${file_id} \ + --outSAMtype BAM SortedByCoordinate \ + --quantMode TranscriptomeSAM GeneCounts \ + --outFilterMultimapNmax 1 \ + --outFilterMatchNmin 16 \ + --alignEndsType EndToEnd + """ } HISAT_ALIGNED.into{HISAT_ALIGNED_FASTQC; @@ -504,7 +518,7 @@ if (params.do_dedup){ when: params.do_dedup - + shell: ''' samtools view -h !{bam[0]} | awk '!seen[substr($1, length($1)-5) $3 $4 $10]++' | samtools view -bS -o !{file_id}_dedup.bam @@ -594,7 +608,7 @@ if (params.do_postgenome){ echo "total counts : $total" >> !{file_id}.txt echo "scaling factor : !{factor}" >> !{file_id}.txt if [ !{genome} -gt 0 ] - then + then bamCoverage -p !{task.cpus} --binSize 1 --scaleFactor !{factor} -b !{genome_bam[0]} -o !{file_id}_genome.bigwig fi if [ !{postgenome} -gt 0 ] @@ -642,7 +656,7 @@ if (params.do_postgenome){ echo "genome aligment : !{genome} counts" >> !{file_id}.txt echo "scaling factor : !{factor}" >> !{file_id}.txt if [ !{genome} -gt 0 ] - then + then bamCoverage -p !{task.cpus} --binSize 1 --scaleFactor !{factor} -b !{genome_bam[0]} -o !{file_id}_genome.bigwig fi ''' -- GitLab