diff --git a/src/RibosomeProfiling.config b/src/RibosomeProfiling.config new file mode 100644 index 0000000000000000000000000000000000000000..6837f9cc1217343f7067a3a7a0eff8019c7391c0 --- /dev/null +++ b/src/RibosomeProfiling.config @@ -0,0 +1,112 @@ +profiles { + sge { + process{ + withName: trimming { + beforeScript = "source $baseDir/.conda_psmn.sh" + conda = "$baseDir/.conda_envs/cutadapt_2.1" + executor = "sge" + clusterOptions = "-cwd -V" + cpus = 1 + memory = "20GB" + time = "12h" + queue = 'monointeldeb128,monointeldeb48,h48-E5-2670deb128,h6-E5-2667v4deb128' + } + withName: rRNA_removal { + beforeScript = "source $baseDir/.conda_psmn.sh" + conda = "$baseDir/.conda_envs/bowtie2_2.3.4.1" + executor = "sge" + clusterOptions = "-cwd -V" + cpus = 16 + memory = "30GB" + time = "24h" + queue = 'E5-2670deb128A,E5-2670deb128B,E5-2670deb128C,E5-2670deb128D,E5-2670deb128E,E5-2670deb128F' + penv = 'openmp16' + } + withName: hisat2_human { + beforeScript = "source /usr/share/lmod/lmod/init/bash; module use ~/privatemodules" + module = "hisat2/2.1.0:samtools/1.7" + executor = "sge" + clusterOptions = "-cwd -V" + memory = "20GB" + cpus = 16 + time = "12h" + queue = 'E5-2670deb128A,E5-2670deb128B,E5-2670deb128C,E5-2670deb128D,E5-2670deb128E,E5-2670deb128F' + penv = 'openmp16' + } + withName: sort_bam { + beforeScript = "source $baseDir/.conda_psmn.sh" + conda = "$baseDir/.conda_envs/samtools_1.7" + executor = "sge" + clusterOptions = "-cwd -V" + cpus = 1 + memory = "20GB" + time = "12h" + queue = 'monointeldeb128,monointeldeb48,h48-E5-2670deb128,h6-E5-2667v4deb128' + } + withName: index_bam { + beforeScript = "source $baseDir/.conda_psmn.sh" + conda = "$baseDir/.conda_envs/samtools_1.7" + executor = "sge" + clusterOptions = "-cwd -V" + cpus = 1 + memory = "20GB" + time = "12h" + queue = 'monointeldeb128,monointeldeb48,h48-E5-2670deb128,h6-E5-2667v4deb128' + } + withName: dedup { + beforeScript = "source /usr/share/lmod/lmod/init/bash; module use ~/privatemodules" + module = "umi_tools/1.0.0" + executor = "sge" + clusterOptions = "-cwd -V" + cpus = 1 + memory = "20GB" + time = "12h" + queue = 'monointeldeb128,monointeldeb48,h48-E5-2670deb128,h6-E5-2667v4deb128' + } + withName: counting { + beforeScript = "source /usr/share/lmod/lmod/init/bash; module use ~/privatemodules" + module = "htseq/0.11.2" + executor = "sge" + clusterOptions = "-cwd -V" + cpus = 1 + memory = "20GB" + time = "12h" + queue = 'monointeldeb128,monointeldeb48,h48-E5-2670deb128,h6-E5-2667v4deb128' + } + } + } + docker { + docker.temp = 'auto' + docker.enabled = true + process { + withName: adaptor_removal { + container = "lbmc/cutadapt:2.1" + cpus = 1 + } + withName: rRNA_removal { + container = "lbmc/bowtie2:2.3.4.1" + cpus = 4 + } + withName: hisat2_human { + cpus = 4 + container = "lbmc/hisat2:2.1.0" + } + withName: sort_bam { + container = "lbmc/samtools:1.7" + cpus = 1 + } + withName: index_bam { + container = "lbmc/samtools:1.7" + cpus = 1 + } + withName: dedup { + container = "lbmc/umi_tools:1.0.0" + cpus = 1 + } + withName: counting { + container = "lbmc/htseq:0.11.2" + cpus = 1 + } + } + } +} diff --git a/src/RibosomeProfiling.nf b/src/RibosomeProfiling.nf new file mode 100644 index 0000000000000000000000000000000000000000..15e02098a462d9a5c7126218b3364106ee53da4d --- /dev/null +++ b/src/RibosomeProfiling.nf @@ -0,0 +1,211 @@ +/* +* RibosomeProfiling Analysis pipeline +*/ + +/* Trimming */ +params.output = "results" +params.fastq_raw = "${params.output}/00_demultiplexing/*.fastq.gz" + +Channel + .fromPath( params.fastq_raw ) + .ifEmpty { error "Cannot find any files matching: ${params.fastq_raw}" } + .map { it -> [(it.baseName =~ /([^\.]*)/)[0][1], it]} + .set { fastq_raw_flow } + + +process trimming { + tag "$file_id" + publishDir "${params.output}/01_trimming/", mode: 'copy' + + input: + set file_id, file(fastq_raw) from fastq_raw_flow + + output: + set file_id, "*_cut.fastq.gz" into fastq_trim_filt + file "*.txt" into log_trim + + script: + """ + cutadapt -a AGATCGGAAGAGCACACGTCTGAACTCCAGTCAC -m 20\ + -o ${file_id}_cut.fastq.gz \ + ${fastq_raw} > ${file_id}_report.txt + """ +} + +/* rRNA and tRNA filtering */ + +params.indexrRNA = "results/human_rRNA_tRNA/*.bt2" +log.info "index files rRNA : ${params.indexrRNA}" + +Channel + .fromPath( params.indexrRNA ) + .ifEmpty { error "Cannot find any index files matching: ${params.indexrRNA}" } + .set { rRNA_index_files } + +process rRNA_removal { + tag "$file_id" + publishDir "${params.output}/02_rRNA_depletion/", mode: 'copy' + + input: + set file_id, file(reads) from fastq_trim_filt + file index from rRNA_index_files.toList() + + output: + set file_id, "*.fastq.gz" into rRNA_removed_reads + file "*.txt" into bowtie_report + + script: +""" +zcat ${reads} | bowtie2 --sensitive -p ${task.cpus} -x human_rRNA_tRNA \ +-U - --un-gz ${file_id}_mRNA.fastq.gz 2> \ +${file_id}_bowtie2_report.txt > /dev/null + +if grep -q "Error " ${file_id}_bowtie2_report.txt; then + exit 1 +fi +""" +} + +/* mapping against human genome with hisat2 */ + +params.index_hg38 = "/media/adminmanu/Stockage/HISAT2_index_hg38_tran/*.ht2" + +log.info "index : ${params.index_hg38}" + + +Channel + .fromPath ( params.index_hg38 ) + .ifEmpty { error "Cannot find any hg38 index files matching: ${params.index_hg38}" } + .set { index_file_hg38 } + +process hisat2_human { + tag "$file_id" + + + input: + set file_id, file(fastq_filtred) from rRNA_removed_reads + file index from index_file_hg38.toList() + + output: + set file_id, "*.fastq.gz" into reads_non_aligned_hg38 + set file_id, "*.bam" into reads_aligned_hg38 + file "*.txt" into hisat_report + + script: +""" +hisat2 -x genome_tran -p ${task.cpus} \ +-U ${fastq_filtred} --un-gz ${file_id}_notaligned_hg38.fastq.gz \ +--end-to-end --rna-strandness 'F' \ +2> ${file_id}_hisat2_hg38.txt | samtools view -bS -F 4 -o ${file_id}.bam + +""" +} + +/* sorting */ + +process index_bam { + tag "$file_id" + publishDir "${params.output}/03_hisat2_hg38/", mode: 'copy' + + input: + set file_id, file(bam) from reads_aligned_hg38 + + output: + set file_id, "*_sorted.{bam,bam.bai}" into sorted_bam_files + + script: +""" +samtools sort -@ ${task.cpus} -O BAM -o ${file_id}_sorted.bam ${bam} +samtools index ${file_id}_sorted.bam +""" +} + +sorted_bam_files.into{for_dedup;for_htseq} + +/* deduplicating reads */ + +params.dedup_options = "" + +process dedup { + tag "$file_id" + + input: + set file_id, file(bam) from for_dedup + + output: + set file_id, "*dedup.bam" into dedup_bam + file "*.txt" into dedup_report + + script: +""" +umi_tools dedup -I ${bam[0]} \ + ${params.dedup_options} \ + -S ${file_id}_dedup.bam > report.txt +""" +} + +process sort_bam { + tag "$file_id" + publishDir "${params.output}/03_hisat2_hg38_dedup/", mode: 'copy' + + input: + set file_id, file(bam) from dedup_bam + file dedup from dedup_report + + output: + set file_id, "*_sorted.{bam,bam.bai}" into sorted_bam_files_2 + file "*.txt" into report_dedup + + script: +""" +samtools sort -@ ${task.cpus} -O BAM -o ${file_id}_sorted.bam ${bam} +samtools index ${file_id}_sorted.bam +cat ${dedup} > ${file_id}_dedup_report.txt +""" +} + +/* HTseq */ + +params.gtf = "$baseDir/data/annotation/*.gtf" +log.info "gtf files : ${params.gtf}" + +Channel + .fromPath( params.gtf ) + .ifEmpty { error "Cannot find any gtf file matching: ${params.gtf}" } + .set { gtf_file } + +process counting { + tag "$file_id" + publishDir "${params.output}/04_HTseq/", mode: 'copy' + + input: + set file_id, file(bam) from for_htseq + file gtf from gtf_file.toList() + + output: + file "*.count" into count_files + + script: +""" +htseq-count ${bam[0]} ${gtf} \ + --mode=intersection-nonempty \ + -a 10 \ + -s yes \ + -t CDS \ + -i gene_id \ + -r pos \ + -f bam \ +> ${file_id}_CDS.count + +htseq-count ${bam[0]} ${gtf} \ + --mode=intersection-nonempty \ + -a 10 \ + -s yes \ + -t exon \ + -i gene_id \ + -r pos \ + -f bam \ +> ${file_id}_exon.count + +""" +}