Add PASeq pipeline

ac3dbc46 · elabaron · f99e684b · ac3dbc46 · ac3dbc46
Commit ac3dbc46 authored 5 years ago by elabaron
--- a/src/PASseq.config
+++ b/src/PASseq.config
+profiles {
+  // Cluster profile: every process is submitted through the SGE executor.
+  // Each selector states, in this order: scheduler settings (executor, qsub
+  // options, queues, parallel environment), requested resources, then the
+  // software environment (conda prefix or lmod modules) loaded before the task.
+  sge {
+    process{
+      // Single-core job; tools come from the conda prefix cutadapt_2.1.
+      withName: trimming {
+        executor = "sge"
+        clusterOptions = "-cwd -V"
+        queue = 'monointeldeb128,monointeldeb48,h48-E5-2670deb128,h6-E5-2667v4deb128'
+        cpus = 1
+        memory = "20GB"
+        time = "12h"
+        beforeScript = "source $baseDir/.conda_psmn.sh"
+        conda = "$baseDir/.conda_envs/cutadapt_2.1"
+      }
+      // 16-core job in the openmp16 parallel environment; conda prefix bowtie2_2.3.4.1.
+      withName: rRNA_removal {
+        executor = "sge"
+        clusterOptions = "-cwd -V"
+        queue = 'E5-2670deb128A,E5-2670deb128B,E5-2670deb128C,E5-2670deb128D,E5-2670deb128E,E5-2670deb128F'
+        penv = 'openmp16'
+        cpus = 16
+        memory = "30GB"
+        time = "24h"
+        beforeScript = "source $baseDir/.conda_psmn.sh"
+        conda = "$baseDir/.conda_envs/bowtie2_2.3.4.1"
+      }
+      // 16-core job in the openmp16 parallel environment; hisat2 and samtools
+      // are loaded as environment modules from ~/privatemodules.
+      withName: hisat2_human {
+        executor = "sge"
+        clusterOptions = "-cwd -V"
+        queue = 'E5-2670deb128A,E5-2670deb128B,E5-2670deb128C,E5-2670deb128D,E5-2670deb128E,E5-2670deb128F'
+        penv = 'openmp16'
+        cpus = 16
+        memory = "20GB"
+        time = "12h"
+        beforeScript = "source /usr/share/lmod/lmod/init/bash; module use ~/privatemodules"
+        module = "hisat2/2.1.0:samtools/1.7"
+      }
+      // Single-core job; conda prefix samtools_1.7.
+      withName: sort_bam {
+        executor = "sge"
+        clusterOptions = "-cwd -V"
+        queue = 'monointeldeb128,monointeldeb48,h48-E5-2670deb128,h6-E5-2667v4deb128'
+        cpus = 1
+        memory = "20GB"
+        time = "12h"
+        beforeScript = "source $baseDir/.conda_psmn.sh"
+        conda = "$baseDir/.conda_envs/samtools_1.7"
+      }
+      // Single-core job; conda prefix samtools_1.7.
+      withName: index_bam {
+        executor = "sge"
+        clusterOptions = "-cwd -V"
+        queue = 'monointeldeb128,monointeldeb48,h48-E5-2670deb128,h6-E5-2667v4deb128'
+        cpus = 1
+        memory = "20GB"
+        time = "12h"
+        beforeScript = "source $baseDir/.conda_psmn.sh"
+        conda = "$baseDir/.conda_envs/samtools_1.7"
+      }
+      // Single-core job; umi_tools is loaded as an environment module.
+      withName: dedup {
+        executor = "sge"
+        clusterOptions = "-cwd -V"
+        queue = 'monointeldeb128,monointeldeb48,h48-E5-2670deb128,h6-E5-2667v4deb128'
+        cpus = 1
+        memory = "20GB"
+        time = "12h"
+        beforeScript = "source /usr/share/lmod/lmod/init/bash; module use ~/privatemodules"
+        module = "umi_tools/1.0.0"
+      }
+      // Single-core job; htseq is loaded as an environment module.
+      withName: counting {
+        executor = "sge"
+        clusterOptions = "-cwd -V"
+        queue = 'monointeldeb128,monointeldeb48,h48-E5-2670deb128,h6-E5-2667v4deb128'
+        cpus = 1
+        memory = "20GB"
+        time = "12h"
+        beforeScript = "source /usr/share/lmod/lmod/init/bash; module use ~/privatemodules"
+        module = "htseq/0.11.2"
+      }
+    }
+  }
+  // Container profile: Docker is enabled and each process is mapped to the
+  // image that ships its tool, together with the number of CPUs it may use.
+  docker {
+    docker.temp = 'auto'
+    docker.enabled = true
+    process {
+      // The selector has to match the process name declared in PASseq.nf
+      // ("trimming"); a name that matches no process is ignored, which would
+      // leave the cutadapt step without any container.
+      withName: trimming {
+        container = "lbmc/cutadapt:2.1"
+        cpus = 1
+      }
+      withName: rRNA_removal {
+        container = "lbmc/bowtie2:2.3.4.1"
+        cpus = 4
+      }
+      // NOTE(review): the hisat2_human script pipes into `samtools view`;
+      // confirm that this image also provides samtools.
+      withName: hisat2_human {
+        cpus = 4
+        container = "lbmc/hisat2:2.1.0"
+      }
+      withName: sort_bam {
+        container = "lbmc/samtools:1.7"
+        cpus = 1
+      }
+      withName: index_bam {
+        container = "lbmc/samtools:1.7"
+        cpus = 1
+      }
+      withName: dedup {
+        container = "lbmc/umi_tools:1.0.0"
+        cpus = 1
+      }
+      withName: counting {
+        container = "lbmc/htseq:0.11.2"
+        cpus = 1
+      }
+    }
+  }
+}
--- a/src/PASseq.nf
+++ b/src/PASseq.nf
+/*
+*	PAS-seq analysis pipeline
+*/
+
+/* Trimming */
+// Root directory under which every step publishes its results.
+params.output = "results"
+// Glob selecting the raw FASTQ files handed to the trimming step.
+params.fastq_raw = "${params.output}/00_demultiplexing/*.fastq.gz"
+
+// Emits one [sample id, fastq file] pair per matched file; the sample id is
+// the file's base name truncated at its first dot. Aborts when nothing matches.
+Channel
+  .fromPath( params.fastq_raw )
+  .ifEmpty { error "Cannot find any files matching: ${params.fastq_raw}" }
+  .map { fastq ->
+    def prefix = (fastq.baseName =~ /([^\.]*)/)[0][1]
+    [prefix, fastq]
+  }
+  .set { fastq_raw_flow }
+
+
+// Trims the poly(A) + adapter sequence from each raw FASTQ with cutadapt.
+// Emits the trimmed reads keyed by sample id and, separately, the cutadapt
+// report captured from stdout. Both are copied to 01_trimming/.
+process trimming {
+  publishDir "${params.output}/01_trimming/", mode: 'copy'
+  tag "$file_id"
+
+  input:
+  set file_id, file(raw_reads) from fastq_raw_flow
+
+  output:
+  set file_id, "*_cut.fastq.gz" into fastq_trim_filt
+  file "*.txt" into log_trim
+
+  script:
+  // -a: 3' adapter to remove, -m 15: minimum read length kept,
+  // -u 2: remove the first two bases of every read.
+  """
+  cutadapt -a AAAAAAAAAAAAAAAAAAAAAGATCGGAAGAGCGGTTCAGCAGGAATGCCGAG -m 15\
+  -u 2 -o ${file_id}_cut.fastq.gz \
+  ${raw_reads} > ${file_id}_report.txt
+  """
+}
+
+/* rRNA and tRNA filtering */
+
+// Glob matching the bowtie2 index files used for rRNA/tRNA depletion.
+params.indexrRNA = "results/human_rRNA_tRNA/*.bt2"
+log.info "index files rRNA : ${params.indexrRNA}"
+
+// Channel of the index files; the run aborts if the glob matches nothing.
+rRNA_index_files = Channel
+  .fromPath( params.indexrRNA )
+  .ifEmpty { error "Cannot find any index files matching: ${params.indexrRNA}" }
+
+// Aligns the trimmed reads against the rRNA/tRNA bowtie2 index and keeps only
+// the reads that do NOT align (--un-gz); the alignments themselves are sent
+// to /dev/null. The bowtie2 log (stderr) is emitted as the report.
+process rRNA_removal {
+  publishDir "${params.output}/02_rRNA_depletion/", mode: 'copy'
+  tag "$file_id"
+
+  input:
+  set file_id, file(trimmed_reads) from fastq_trim_filt
+  file index_files from rRNA_index_files.toList()
+
+  output:
+  set file_id, "*.fastq.gz" into rRNA_removed_reads
+  file "*.txt" into bowtie_report
+
+  script:
+  // The index prefix is hard-coded as "human_rRNA_tRNA": the files selected by
+  // params.indexrRNA must carry that base name.
+  // The trailing grep makes the task fail when the bowtie2 log contains "Error ".
+"""
+zcat ${trimmed_reads} | bowtie2 --sensitive -p ${task.cpus} -x human_rRNA_tRNA \
+-U - --un-gz ${file_id}_mRNA.fastq.gz 2> \
+${file_id}_bowtie2_report.txt > /dev/null
+
+if grep -q "Error " ${file_id}_bowtie2_report.txt; then
+  exit 1
+fi
+"""
+}
+
+/*	mapping against human genome with hisat2 */
+
+// Glob matching the HISAT2 index files of the human genome (hg38).
+params.index_hg38 = "/media/adminmanu/Stockage/HISAT2_index_hg38_tran/*.ht2"
+
+log.info "index : ${params.index_hg38}"
+
+// Channel of the index files; the run aborts if the glob matches nothing.
+index_file_hg38 = Channel
+  .fromPath ( params.index_hg38 )
+  .ifEmpty { error "Cannot find any hg38 index files matching: ${params.index_hg38}" }
+
+// Maps the rRNA-depleted reads on the human genome with HISAT2.
+// Emits three things: the reads that failed to align (gzipped FASTQ), a BAM
+// restricted to mapped reads, and the HISAT2 log captured from stderr.
+// Nothing is published here; the BAM is published once sorted downstream.
+process hisat2_human {
+  tag "$file_id"
+
+  input:
+    set file_id, file(filtered_reads) from rRNA_removed_reads
+    file index_files from index_file_hg38.toList()
+
+  output:
+    set file_id, "*.fastq.gz" into reads_non_aligned_hg38
+    set file_id, "*.bam" into reads_aligned_hg38
+    file "*.txt" into hisat_report
+
+  script:
+  // The index prefix is hard-coded as "genome_tran": the files selected by
+  // params.index_hg38 must carry that base name.
+  // samtools -F 4 drops unmapped records before the BAM is written.
+"""
+hisat2 -x genome_tran -p ${task.cpus} \
+-U ${filtered_reads} --un-gz ${file_id}_notaligned_hg38.fastq.gz \
+--end-to-end  --rna-strandness 'F' \
+2> ${file_id}_hisat2_hg38.txt | samtools view -bS -F 4 -o ${file_id}.bam
+
+"""
+}
+
+/*                   sorting                             */
+
+// Sorts the HISAT2 BAM and builds its index (the process does both, despite
+// its name). The sorted BAM and its .bai are emitted together as one item
+// keyed by sample id and copied to 03_hisat2_hg38/.
+process index_bam {
+  publishDir "${params.output}/03_hisat2_hg38/", mode: 'copy'
+  tag "$file_id"
+
+  input:
+    set file_id, file(aligned_bam) from reads_aligned_hg38
+
+  output:
+    set file_id, "*_sorted.{bam,bam.bai}" into sorted_bam_files
+
+  script:
+"""
+samtools sort -@ ${task.cpus} -O BAM -o ${file_id}_sorted.bam ${aligned_bam}
+samtools index ${file_id}_sorted.bam
+"""
+}
+
+// Duplicate the sorted BAM channel: one copy feeds deduplication, the other counting.
+sorted_bam_files.into { for_dedup; for_htseq }
+
+/*                   deduplicating reads                            */
+
+// Extra command-line options inserted verbatim into the `umi_tools dedup` call.
+params.dedup_options = ""
+
+// Removes duplicated reads with umi_tools. Emits the deduplicated BAM keyed
+// by sample id and the umi_tools log (stdout) as report.txt.
+process dedup {
+  tag "$file_id"
+
+  input:
+  set file_id, file(sorted_bam) from for_dedup
+
+  output:
+  set file_id, "*dedup.bam" into dedup_bam
+  file "*.txt" into dedup_report
+
+  script:
+  // sorted_bam carries the BAM and its .bai; element 0 is used as the BAM.
+"""
+umi_tools dedup -I ${sorted_bam[0]} \
+                ${params.dedup_options} \
+                -S ${file_id}_dedup.bam > report.txt
+"""
+}
+
+// Sorts and indexes the deduplicated BAM, and copies the umi_tools report to
+// a file named after the sample. Results are copied to 03_hisat2_hg38_dedup/.
+process sort_bam {
+  publishDir "${params.output}/03_hisat2_hg38_dedup/", mode: 'copy'
+  tag "$file_id"
+
+  input:
+    set file_id, file(deduped) from dedup_bam
+    file umi_report from dedup_report
+
+  output:
+    set file_id, "*_sorted.{bam,bam.bai}" into sorted_bam_files_2
+    file "*.txt" into report_dedup
+
+  script:
+"""
+samtools sort -@ ${task.cpus} -O BAM -o ${file_id}_sorted.bam ${deduped}
+samtools index ${file_id}_sorted.bam
+cat ${umi_report} > ${file_id}_dedup_report.txt
+"""
+}
+
+/*                   HTseq                            */
+
+// Glob matching the GTF annotation file(s) used for read counting.
+params.gtf = "$baseDir/data/annotation/*.gtf"
+log.info "gtf files : ${params.gtf}"
+
+// Channel of the annotation files; the run aborts if the glob matches nothing.
+gtf_file = Channel
+  .fromPath( params.gtf )
+  .ifEmpty { error "Cannot find any gtf file matching: ${params.gtf}" }
+
+// Counts reads per gene with htseq-count, once over CDS features and once
+// over exon features, writing <sample>_CDS.count and <sample>_exon.count.
+// Both tables are copied to 04_HTseq/.
+// NOTE(review): the input channel for_htseq is a copy of the sorted BAMs taken
+// before deduplication; confirm that counting non-deduplicated reads is intended.
+process counting {
+  publishDir "${params.output}/04_HTseq/", mode: 'copy'
+  tag "$file_id"
+
+  input:
+  set file_id, file(alignments) from for_htseq
+  file annotation from gtf_file.toList()
+
+  output:
+  file "*.count" into count_files
+
+  script:
+  // alignments carries the BAM and its .bai; element 0 is used as the BAM.
+  // The two calls differ only in the feature type (-t) and the output file.
+"""
+htseq-count ${alignments[0]} ${annotation} \
+            --mode=intersection-nonempty \
+            -a 10 \
+            -s yes \
+            -t CDS \
+            -i gene_id \
+            -r pos \
+            -f bam \
+> ${file_id}_CDS.count
+
+htseq-count ${alignments[0]} ${annotation} \
+            --mode=intersection-nonempty \
+            -a 10 \
+            -s yes \
+            -t exon \
+            -i gene_id \
+            -r pos \
+            -f bam \
+> ${file_id}_exon.count
+
+"""
+}