diff --git a/src/RNAseq_illumina.nf b/src/RNAseq_illumina.nf new file mode 100644 index 0000000000000000000000000000000000000000..b1e9338131a42e14ff91c15ca55ca9c26300ec88 --- /dev/null +++ b/src/RNAseq_illumina.nf @@ -0,0 +1,188 @@ +/* +* RNAseq Analysis pipeline +*/ + +params.fastq_raw = "data/demultiplexed/*{_R1,_R2}.fastq.gz" +params.output = "results" + +Channel + .fromFilePairs(params.fastq_raw) + .ifEmpty { error "Cannot find any file matching: ${params.fastq_raw}" } + .set {fastq_raw_channel} + +/* Trimming by quality */ + +process trimming { + tag "$file_id" + cpus 4 + publishDir "${params.output}/01_cutadapt/", mode: 'copy' + echo true + + input: + set file_id, file(reads) from fastq_raw_channel + + output: + set file_id, "*cut_{R1,R2}.fastq.gz" into fastq_files_cut + file "*.txt" into rapport_UrQt + + script: + """ + cutadapt -a AGATCGGAAGAGCACACGTCTGAACTCCAGTCA -A AGATCGGAAGAGCGTCGTGTAGGGAAAGAGTGT \ + --minimum-length 50 \ + -o ${file_id}_cut_R1.fastq.gz -p ${file_id}_cut_R2.fastq.gz \ + ${reads[0]} ${reads[1]} > ${file_id}_report.txt + + """ +} + + +/* rRNA and tRNA filtering */ + +params.indexrRNA = "/Xnfs/lbmcdb/Ricci_team/shared_data/genomes/human_rRNA_tRNA/*.bt2" + +log.info "index files : ${params.indexrRNA}" + +Channel + .fromPath( params.indexrRNA ) + .ifEmpty { error "Cannot find any index files matching: ${params.indexrRNA}" } + .set { rRNA_index_files } + +process rRNA_removal { + tag "$file_id" + cpus 8 + publishDir "${params.output}/02_rRNA_depletion/", mode: 'copy' + + input: + set file_id, file(reads) from fastq_files_cut + file index from rRNA_index_files.toList() + + output: + set file_id, "*.fastq.gz" into rRNA_removed_files + file "*.txt" into bowtie_report + + script: + index_id = index[0] + for (index_file in index) { + if (index_file =~ /.*\.1\.bt2/ && !(index_file =~ /.*\.rev\.1\.bt2/)) { + index_id = ( index_file =~ /(.*)\.1\.bt2/)[0][1] + } + } +""" +bowtie2 --sensitive -p ${task.cpus} -x ${index_id} \ +-1 ${reads[0]} -2 ${reads[1]} --un-conc-gz ${file_id}_R%.fastq.gz 2> \ +${file_id}_bowtie2_report.txt > /dev/null + +if grep -q "Error " ${file_id}_bowtie2_report.txt; then + exit 1 +fi +""" +} + +/* mapping against human genome with hisat2 */ + +params.index_hg38 = "/media/adminmanu/Stockage/HISAT2_index_hg38_tran/*.ht2" + +log.info "index : ${params.index_hg38}" + + +Channel + .fromPath ( params.index_hg38 ) + .ifEmpty { error "Cannot find any index files matching: ${params.index_hg38}" } + .set { index_file_hg38 } + +process hisat2_human { + tag "$file_id" + publishDir "${params.output}/03_hisat2/", mode: 'copy' + errorStrategy 'finish' + + input: + set file_id, file(fastq_filtred) from rRNA_removed_files + file index from index_file_hg38.toList() + + output: + file "*.fastq.gz" into reads_non_aligned_hg38 + set file_id, "*_sorted.{bam,bam.bai}" into sorted_bam_files + file "*.txt" into hisat_report + + script: + index_id = index[0] + for (index_file in index) { + if (index_file =~ /.*\.1\.ht2/ && !(index_file =~ /.*\.rev\.1\.ht2/)) { + index_id = ( index_file =~ /(.*)\.1\.ht2/)[0][1] + } + } +""" +hisat2 -x ${index_id} -p ${task.cpus} \ +-1 ${fastq_filtred[0]} -2 ${fastq_filtred[1]} \ +--un-conc-gz ${file_id}_notaligned_R%.fastq.gz \ +--rna-strandness 'F' 2> ${file_id}_hisat2_hg38.txt | samtools view -bS -F 4 -o ${file_id}.bam + +if grep -q "ERR" ${file_id}_hisat2_hg38.txt; then + exit 1 +fi + +samtools sort -@ ${task.cpus} -O BAM -o ${file_id}_sorted.bam ${file_id}.bam +samtools index ${file_id}_sorted.bam + +""" +} + + +/* HTseq */ + +process sort_bam { + tag "$file_id" + + input: + set file_id, file(bam) from sorted_bam_files + + output: + set file_id, "*_htseq.bam" into sorted_bam_files_2 + + script: +""" +samtools sort -@ ${task.cpus} -n -O BAM -o ${file_id}_htseq.bam ${bam[0]} +""" +} + +params.gtf = "$baseDir/data/annotation/*.gtf" +log.info "gtf files : ${params.gtf}" + +Channel + .fromPath( params.gtf ) + .ifEmpty { error "Cannot find any gtf file matching: ${params.gtf}" } + .set { gtf_file } + +process counting { + tag "$file_id" + publishDir "${params.output}/04_HTseq/", mode: 'copy' + + input: + set file_id, file(bam) from sorted_bam_files_2 + file gtf from gtf_file.toList() + + output: + file "*.count" into count_files + + script: +""" +htseq-count ${bam[0]} ${gtf} \ + --mode=intersection-nonempty \ + -a 10 \ + -s yes \ + -t CDS \ + -i gene_id \ + -f bam \ +> ${file_id}_CDS.count + +htseq-count ${bam[0]} ${gtf} \ + --mode=intersection-nonempty \ + -a 10 \ + -s yes \ + -t exon \ + -i gene_id \ + -f bam \ +> ${file_id}_exon.count + +""" +}