From 6f2eec1ed16f5ef0c42d2ad346551125f4bc0227 Mon Sep 17 00:00:00 2001
From: Laurent Modolo <laurent.modolo@ens-lyon.fr>
Date: Thu, 23 Aug 2018 15:26:57 +0200
Subject: [PATCH] add pipeline to create training dataset

---
 src/training_dataset.config |  70 +++++++++++
 src/training_dataset.nf     | 236 ++++++++++++++++++++++++++++++++++++
 2 files changed, 306 insertions(+)
 create mode 100644 src/training_dataset.config
 create mode 100644 src/training_dataset.nf

diff --git a/src/training_dataset.config b/src/training_dataset.config
new file mode 100644
index 0000000..5dc8127
--- /dev/null
+++ b/src/training_dataset.config
@@ -0,0 +1,70 @@
+profiles {
+  docker {
+    docker.temp = 'auto'
+    docker.enabled = true
+    process {
+      $build_synthetic_bed {
+        container = "bedtools:2.25.0"
+      }
+      $fasta_from_bed {
+        container = "bedtools:2.25.0"
+      }
+      $index_fasta {
+        container = "bowtie2:2.3.4.1"
+      }
+      $mapping_fastq_paired {
+        container = "bowtie2:2.3.4.1"
+      }
+      $bam_2_fastq_paired {
+        container = "samtools:1.7"
+      }
+      $mapping_fastq_single {
+        container = "bowtie2:2.3.4.1"
+      }
+      $bam_2_fastq_single {
+        container = "samtools:1.7"
+      }
+    }
+  }
+  sge {
+    process {
+      $build_synthetic_bed {
+        beforeScript = "module purge; module load BEDtools/2.25.0"
+        executor = "sge"
+        cpus = 1
+        memory = "5GB"
+        time = "6h"
+        queueSize = 1000
+        pollInterval = '60sec'
+        queue = 'h6-E5-2667v4deb128'
+        penv = 'openmp8'
+      }
+      $fasta_from_bed {
+        beforeScript = "module purge; module load BEDtools/2.25.0"
+        executor = "sge"
+        cpus = 1
+        memory = "5GB"
+        time = "6h"
+        queueSize = 1000
+        pollInterval = '60sec'
+        queue = 'h6-E5-2667v4deb128'
+        penv = 'openmp8'
+      }
+      $index_fasta {
+        beforeScript = "module purge; module load Bowtie2/2.3.4.1"
+      }
+      $mapping_fastq_paired {
+        beforeScript = "module purge; module load SAMtools/1.7; module load Bowtie2/2.3.4.1"
+      }
+      $bam_2_fastq_paired {
+        beforeScript = "module purge; module load SAMtools/1.7"
+      }
+      $mapping_fastq_single {
+        beforeScript = "module purge; module load SAMtools/1.7; module load Bowtie2/2.3.4.1"
+      }
+      $bam_2_fastq_single {
+        beforeScript = "module purge; module load SAMtools/1.7"
+      }
+    }
+  }
+}
diff --git a/src/training_dataset.nf b/src/training_dataset.nf
new file mode 100644
index 0000000..c8c7e66
--- /dev/null
+++ b/src/training_dataset.nf
@@ -0,0 +1,236 @@
+/*
+small pipeline to build a training dataset from whole genome data
+
+input:
+- fasta
+- fastq
+- chromosome
+- start position
+- stop position
+
+output:
+- the fasta of the selected region
+- the fastq of the reads mapping to it
+- the filtered bam of those reads
+*/
+
+params.fastq_paired = ""
+params.fastq_single = ""
+
+log.info "fasta files : ${params.fasta}"
+log.info "fastq paired files : ${params.fastq_paired}"
+log.info "fastq single files : ${params.fastq_single}"
+log.info "chromosome : ${params.chromosome}"
+log.info "start position : ${params.start}"
+log.info "stop position : ${params.stop}"
+
+
+Channel
+  .fromPath( params.fasta )
+  .ifEmpty { error "Cannot find any fasta files matching: ${params.fasta}" }
+  .set { fasta_file }
+
+
+// write the selected region as a one-line bed file
+process build_synthetic_bed {
+  tag "${chromosome}:${start}-${stop}"
+  cpus 4
+
+  input:
+    val chromosome from params.chromosome
+    val start from params.start
+    val stop from params.stop
+
+  output:
+    file "*.bed" into bed_files
+
+  script:
+"""
+echo "${chromosome}\t${start}\t${stop}" > synthetic.bed
+"""
+}
+
+// extract the fasta sequence of the region described in the bed file
+process fasta_from_bed {
+  tag "${fasta.baseName}"
+  cpus 4
+  publishDir "results/training/fasta/", mode: 'copy'
+
+  input:
+    file fasta from fasta_file
+    file bed from bed_files
+
+  output:
+    file "*.fasta" into fasta_files_extracted
+
+  script:
+"""
+bedtools getfasta -name \
+-fi ${fasta} -bed ${bed} -fo ${fasta.baseName}_S.fasta
+"""
+}
+
+process index_fasta {
+  tag "$fasta.baseName"
+  cpus 4
+  publishDir "results/training/mapping/index/", mode: 'copy'
+
+  input:
+    file fasta from fasta_files_extracted
+
+  output:
+    file "*.index*" into index_files
+    file "*_report.txt" into indexing_report
+
+  script:
+"""
+bowtie2-build --threads ${task.cpus} ${fasta} ${fasta.baseName}.index &> ${fasta.baseName}_bowtie2_report.txt
+
+if grep -q "Error" ${fasta.baseName}_bowtie2_report.txt; then
+  exit 1
+fi
+"""
+}
+
+// the index is needed by both mapping processes, so duplicate the channel
+index_files.into { index_files_paired; index_files_single }
+
+if ( params.fastq_paired != "" ) {
+  Channel
+    .fromFilePairs( params.fastq_paired )
+    .ifEmpty { error "Cannot find any fastq files matching: ${params.fastq_paired}" }
+    .set { fastq_files_paired }
+
+  process mapping_fastq_paired {
+    tag "$pair_id"
+    cpus 4
+
+    input:
+      set pair_id, file(reads) from fastq_files_paired
+      file index from index_files_paired.collect()
+
+    output:
+      set pair_id, "*.bam" into bam_files_paired
+      file "*_report.txt" into mapping_report_paired
+
+    script:
+    // recover the index prefix from the name of the *.1.bt2 file
+    index_id = index[0]
+    for (index_file in index) {
+      if (index_file =~ /.*\.1\.bt2/ && !(index_file =~ /.*\.rev\.1\.bt2/)) {
+        index_id = ( index_file =~ /(.*)\.1\.bt2/)[0][1]
+      }
+    }
+    """
+    bowtie2 --very-sensitive -p ${task.cpus} -x ${index_id} \
+    -1 ${reads[0]} -2 ${reads[1]} 2> \
+    ${pair_id}_bowtie2_report.txt | \
+    samtools view -Sb - > ${pair_id}.bam
+
+    if grep -q "Error" ${pair_id}_bowtie2_report.txt; then
+      exit 1
+    fi
+    """
+  }
+
+  bam_files_paired.into { bam_files_paired_fa; bam_files_paired_ba }
+
+  process bam_2_fastq_paired {
+    tag "$file_id"
+    publishDir "results/training/fastq/", mode: 'copy'
+
+    input:
+      set file_id, file(bam) from bam_files_paired_fa
+
+    output:
+      set file_id, "*.fastq" into fastq_files_extracted_paired
+
+    script:
+    """
+    samtools fastq -1 ${file_id}_SR1.fastq -2 ${file_id}_SR2.fastq -f 0x2 ${bam}
+    """
+  }
+
+  process filter_bam_paired {
+    tag "$file_id"
+    publishDir "results/training/bams/", mode: 'copy'
+    cpus 4
+
+    input:
+      set file_id, file(bam) from bam_files_paired_ba
+
+    output:
+      set file_id, "*.bam" into filtered_bam_files_paired
+
+    script:
+    // keep only the reads mapped in a proper pair (flag 0x2)
+    """
+    samtools view -@ ${task.cpus} -hb ${bam} -f 0x2 > ${file_id}_S.bam
+    """
+  }
+}
+
+
+if ( params.fastq_single != "" ) {
+  Channel
+    .fromPath( params.fastq_single )
+    .ifEmpty { error "Cannot find any fastq files matching: ${params.fastq_single}" }
+    .map { it -> [(it.baseName =~ /([^\.]*)/)[0][1], it] }
+    .set { fastq_files_single }
+
+  process mapping_fastq_single {
+    tag "$file_id"
+    cpus 4
+
+    input:
+      set file_id, file(reads) from fastq_files_single
+      file index from index_files_single.collect()
+
+    output:
+      set file_id, "*.bam" into bam_files_single
+      file "*_report.txt" into mapping_report_single
+
+    script:
+    // recover the index prefix from the name of the *.1.bt2 file
+    index_id = index[0]
+    for (index_file in index) {
+      if (index_file =~ /.*\.1\.bt2/ && !(index_file =~ /.*\.rev\.1\.bt2/)) {
+        index_id = ( index_file =~ /(.*)\.1\.bt2/)[0][1]
+      }
+    }
+    """
+    bowtie2 --very-sensitive -p ${task.cpus} -x ${index_id} \
+    -U ${reads} 2> \
+    ${file_id}_bowtie2_report.txt | \
+    samtools view -Sb - > ${file_id}.bam
+
+    if grep -q "Error" ${file_id}_bowtie2_report.txt; then
+      exit 1
+    fi
+    """
+  }
+
+  bam_files_single.into { bam_files_single_fa; bam_files_single_ba }
+
+  process bam_2_fastq_single {
+    tag "$file_id"
+    publishDir "results/training/fastq/", mode: 'copy'
+
+    input:
+      set file_id, file(bam) from bam_files_single_fa
+
+    output:
+      set file_id, "*.fastq" into fastq_files_extracted_single
+
+    script:
+    // single-end reads are never flagged 0x2 (mapped in proper pair),
+    // so keep the mapped reads (-F 0x4) instead
+    """
+    samtools fastq -F 0x4 ${bam} > ${file_id}_S.fastq
+    """
+  }
+
+  process filter_bam_single {
+    tag "$file_id"
+    publishDir "results/training/bams/", mode: 'copy'
+    cpus 4
+
+    input:
+      set file_id, file(bam) from bam_files_single_ba
+
+    output:
+      set file_id, "*_S.bam" into filtered_bam_files_single
+
+    script:
+    // keep only the mapped reads (-F 0x4 excludes the unmapped flag)
+    """
+    samtools view -@ ${task.cpus} -hb ${bam} -F 0x4 > ${file_id}_S.bam
+    """
+  }
+}

--
GitLab
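For reference, a hypothetical invocation of this pipeline is sketched below.
The file paths, glob pattern and region coordinates are placeholders, not
part of the patch; --fasta, --chromosome, --start and --stop must be
supplied on the command line (they have no defaults), plus at least one of
--fastq_paired / --fastq_single:

  # paired-end mode, running in the docker profile defined in the config
  nextflow run src/training_dataset.nf -c src/training_dataset.config \
    -profile docker \
    --fasta "data/genome.fasta" \
    --fastq_paired "data/*_R{1,2}.fastq" \
    --chromosome "X" --start 10000 --stop 50000

The region fasta is then published under results/training/fasta/, the
bowtie2 index under results/training/mapping/index/, the extracted fastq
under results/training/fastq/ and the filtered bam under
results/training/bams/.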