Compare revisions

ceca3ce0 · ceca3ce0 · 94be868e · ceca3ce0 · ceca3ce0 · ceca3ce0
--- a/src/nf_modules/ucsc/main.nf
+++ b/src/nf_modules/ucsc/main.nf
+// SPDX-FileCopyrightText: 2022 Laurent Modolo <laurent.modolo@ens-lyon.fr>
+//
+// SPDX-License-Identifier: AGPL-3.0-or-later
+
+version = "407"
+container_url = "lbmc/ucsc:${version}"
+
+include {
+  index_fasta
+} from './../samtools/main'
+
+params.bedgraph_to_bigwig = ""
+params.bedgraph_to_bigwig_out = ""
+// Convert a bedGraph file into a bigWig file with UCSC bedGraphToBigWig.
+//
+// input:
+//   - tuple (file_id, bg): sample identifier and bedGraph file to convert
+//   - tuple (bed_id, bed): BED file whose 1st and 3rd columns are written to
+//     chromsize.txt as the chromosome name / chromosome size table
+// output:
+//   - bw: tuple (file_id, <bg simpleName>_norm.bw)
+//
+// params.bedgraph_to_bigwig holds extra command-line options for
+// bedGraphToBigWig; params.bedgraph_to_bigwig_out, when non-empty, is the
+// sub-directory of results/ where the bigWig is published.
+process bedgraph_to_bigwig {
+  container = "${container_url}"
+  label "big_mem_mono_cpus"
+  tag "${file_id}"
+  if (params.bedgraph_to_bigwig_out != "") {
+    publishDir "results/${params.bedgraph_to_bigwig_out}", mode: 'copy'
+  }
+
+  input:
+  tuple val(file_id), path(bg)
+  // distinct name so the BED identifier does not overwrite the sample file_id
+  tuple val(bed_id), path(bed)
+
+  output:
+  tuple val(file_id), path("*.bw"), emit: bw
+
+  script:
+"""
+# exported so that sort really uses byte-wise (C locale) ordering
+export LC_COLLATE=C
+# transform bed file of start-stop chromosome size to stop chromosome size
+awk -v OFS="\\t" '{print \$1, \$3}' ${bed} > chromsize.txt
+
+# sort into a real file first, then hand that file to bedGraphToBigWig
+sort -T ./ -k1,1 -k2,2n ${bg} > sorted_bedgraph.tmp
+bedGraphToBigWig ${params.bedgraph_to_bigwig} \
+  sorted_bedgraph.tmp \
+  chromsize.txt \
+  ${bg.simpleName}_norm.bw
+"""
+}
+
+params.wig_to_bedgraph = ""
+params.wig_to_bedgraph_out = ""
+// Convert wig files to bedGraph by chaining the wig_to_bigwig workflow and
+// the bigwig_to_bedgraph process.
+//
+// take: fasta (passed to wig_to_bigwig), wig (files to convert)
+// emit: bg, the `bg` output of bigwig_to_bedgraph
+workflow wig_to_bedgraph {
+  take:
+    fasta
+    wig
+  main:
+    wig_to_bigwig(fasta, wig)
+    bigwig_to_bedgraph(wig_to_bigwig.out.bw)
+  emit:
+    bg = bigwig_to_bedgraph.out.bg
+}
+
+// Two-file variant of wig_to_bedgraph: chains the wig2_to_bigwig2 workflow
+// and the bigwig2_to_bedgraph2 process.
+//
+// take: fasta (passed to wig2_to_bigwig2), wig (items to convert)
+// emit: bg, the `bg` output of bigwig2_to_bedgraph2
+workflow wig2_to_bedgraph2 {
+  take:
+    fasta
+    wig
+  main:
+    wig2_to_bigwig2(fasta, wig)
+    bigwig2_to_bedgraph2(wig2_to_bigwig2.out.bw)
+  emit:
+    bg = bigwig2_to_bedgraph2.out.bg
+}
+
+params.bigwig_to_bedgraph = ""
+params.bigwig_to_bedgraph_out = ""
+// Convert one bigWig file to bedGraph with UCSC bigWigToBedGraph.
+//
+// input:  tuple (file_id, bw)
+// output: bg, tuple (file_id, <bw simpleName>.bg)
+// Results are published under results/<params.bigwig_to_bedgraph_out> when
+// that parameter is not empty.
+process bigwig_to_bedgraph {
+  container = "${container_url}"
+  label "big_mem_mono_cpus"
+  tag "${file_id}"
+  if (params.bigwig_to_bedgraph_out != "") {
+    publishDir "results/${params.bigwig_to_bedgraph_out}", mode: 'copy'
+  }
+
+  input:
+  tuple val(file_id), path(bw)
+
+  output:
+  tuple val(file_id), path("*.bg"), emit: bg
+
+  script:
+  // output file name derived from the input bigWig
+  def bedgraph = "${bw.simpleName}.bg"
+"""
+bigWigToBedGraph ${bw} ${bedgraph}
+"""
+}
+
+params.bigwig2_to_bedgraph2 = ""
+params.bigwig2_to_bedgraph2_out = ""
+// Convert a pair of bigWig files to bedGraph with UCSC bigWigToBedGraph.
+//
+// input:  tuple (file_id, bw_a, bw_b)
+// output: bg, tuple (file_id, <bw_a simpleName>.bg, <bw_b simpleName>.bg)
+//
+// Publication directory: results/<params.bigwig2_to_bedgraph2_out> when set.
+// For backward compatibility the process still falls back to
+// params.bigwig_to_bedgraph_out, which it used before.
+process bigwig2_to_bedgraph2 {
+  container = "${container_url}"
+  label "big_mem_mono_cpus"
+  tag "${file_id}"
+  if (params.bigwig2_to_bedgraph2_out != "") {
+    publishDir "results/${params.bigwig2_to_bedgraph2_out}", mode: 'copy'
+  } else if (params.bigwig_to_bedgraph_out != "") {
+    publishDir "results/${params.bigwig_to_bedgraph_out}", mode: 'copy'
+  }
+
+  input:
+  tuple val(file_id), path(bw_a), path(bw_b)
+
+  output:
+  tuple val(file_id), path("${bw_a.simpleName}.bg"), path("${bw_b.simpleName}.bg"), emit: bg
+
+  script:
+"""
+bigWigToBedGraph ${bw_a} ${bw_a.simpleName}.bg
+bigWigToBedGraph ${bw_b} ${bw_b.simpleName}.bg
+"""
+}
+
+params.bigwig_to_wig = ""
+params.bigwig_to_wig_out = ""
+// Convert one bigWig file to wig in two steps: bigWigToBedGraph produces an
+// intermediate bedGraph, which bedgraph_to_wig.pl turns into a wig file
+// (called with --step 10).
+//
+// input:  tuple (file_id, bw)
+// output: wig, tuple (file_id, <bw simpleName>.wig)
+process bigwig_to_wig {
+  container = "${container_url}"
+  label "big_mem_mono_cpus"
+  tag "${file_id}"
+  if (params.bigwig_to_wig_out != "") {
+    publishDir "results/${params.bigwig_to_wig_out}", mode: 'copy'
+  }
+
+  input:
+  tuple val(file_id), path(bw)
+
+  output:
+  tuple val(file_id), path("*.wig"), emit: wig
+
+  script:
+  // common prefix of the intermediate bedGraph and the final wig
+  def prefix = bw.simpleName
+"""
+bigWigToBedGraph ${bw} ${prefix}.bg
+bedgraph_to_wig.pl --bedgraph ${prefix}.bg --wig ${prefix}.wig --step 10
+"""
+}
+
+params.bigwig2_to_wig2 = ""
+params.bigwig2_to_wig2_out = ""
+// Convert a pair of bigWig files to wig. Each file goes through
+// bigWigToBedGraph and then bedgraph_to_wig.pl (called with --step 10).
+//
+// input:  tuple (file_id, bw_a, bw_b)
+// output: wig, tuple (file_id, <bw_a simpleName>.wig, <bw_b simpleName>.wig)
+//
+// Publication directory: results/<params.bigwig2_to_wig2_out> when set.
+// For backward compatibility the process still falls back to
+// params.bigwig_to_wig_out, which it used before.
+process bigwig2_to_wig2 {
+  container = "${container_url}"
+  label "big_mem_mono_cpus"
+  tag "${file_id}"
+  if (params.bigwig2_to_wig2_out != "") {
+    publishDir "results/${params.bigwig2_to_wig2_out}", mode: 'copy'
+  } else if (params.bigwig_to_wig_out != "") {
+    publishDir "results/${params.bigwig_to_wig_out}", mode: 'copy'
+  }
+
+  input:
+  tuple val(file_id), path(bw_a), path(bw_b)
+
+  output:
+  tuple val(file_id), path("${bw_a.simpleName}.wig"), path("${bw_b.simpleName}.wig"), emit: wig
+
+  script:
+"""
+bigWigToBedGraph ${bw_a} ${bw_a.simpleName}.bg
+bedgraph_to_wig.pl --bedgraph ${bw_a.simpleName}.bg --wig ${bw_a.simpleName}.wig --step 10
+bigWigToBedGraph ${bw_b} ${bw_b.simpleName}.bg
+bedgraph_to_wig.pl --bedgraph ${bw_b.simpleName}.bg --wig ${bw_b.simpleName}.wig --step 10
+"""
+}
+
+params.wig_to_bigwig = ""
+params.wig_to_bigwig_out = ""
+
+// Convert wig files to bigWig. The fasta is first passed to index_fasta
+// (included from the samtools module); the resulting index is handed to
+// wig_to_bigwig_sub together with the wig files.
+//
+// take: fasta, wig
+// emit: bw, the `bw` output of wig_to_bigwig_sub
+workflow wig_to_bigwig {
+  take:
+    fasta
+    wig
+  main:
+    index_fasta(fasta)
+    wig_to_bigwig_sub(wig, index_fasta.out.index)
+  emit:
+    bw = wig_to_bigwig_sub.out.bw
+}
+
+// Convert one wig file to bigWig with UCSC wigToBigWig (-clip).
+// The chromosome size table is built from the first two columns of the
+// fasta index.
+//
+// input:
+//   - tuple (file_id, w): identifier and wig file
+//   - tuple (idx_id, fasta_idx): fasta index file
+// output:
+//   - bw: tuple (file_id, <w simpleName>.bw)
+//
+// Publication directory: results/<params.wig_to_bigwig_out> when set. The
+// process used to key publication on params.bigwig_to_wig_out (the reverse
+// conversion's parameter); that is kept only as a fallback so existing
+// configurations keep publishing.
+process wig_to_bigwig_sub {
+  container = "${container_url}"
+  label "big_mem_mono_cpus"
+  tag "${file_id}"
+  if (params.wig_to_bigwig_out != "") {
+    publishDir "results/${params.wig_to_bigwig_out}", mode: 'copy'
+  } else if (params.bigwig_to_wig_out != "") {
+    publishDir "results/${params.bigwig_to_wig_out}", mode: 'copy'
+  }
+
+  input:
+  tuple val(file_id), path(w)
+  tuple val(idx_id), path(fasta_idx)
+
+  output:
+  tuple val(file_id), path("${w.simpleName}.bw"), emit: bw
+
+  script:
+"""
+cut -f 1,2 ${fasta_idx} > ${fasta_idx.simpleName}.sizes
+wigToBigWig -clip ${w} ${fasta_idx.simpleName}.sizes ${w.simpleName}.bw
+"""
+}
+
+params.wig2_to_bigwig2 = ""
+params.wig2_to_bigwig2_out = ""
+
+// Two-file variant of wig_to_bigwig: the fasta is passed to index_fasta
+// (included from the samtools module) and the resulting index is handed to
+// wig2_to_bigwig2_sub together with the wig items.
+//
+// take: fasta, wigs
+// emit: bw, the `bw` output of wig2_to_bigwig2_sub
+workflow wig2_to_bigwig2 {
+  take:
+    fasta
+    wigs
+  main:
+    index_fasta(fasta)
+    wig2_to_bigwig2_sub(wigs, index_fasta.out.index)
+  emit:
+    bw = wig2_to_bigwig2_sub.out.bw
+}
+
+// Convert a pair of wig files to bigWig with UCSC wigToBigWig (-clip).
+// The chromosome size table is built from the first two columns of the
+// fasta index and shared by both conversions.
+//
+// input:
+//   - tuple (file_id, w_a, w_b): identifier and the two wig files
+//   - tuple (idx_id, fasta_idx): fasta index file
+// output:
+//   - bw: tuple (file_id, <w_a simpleName>.bw, <w_b simpleName>.bw)
+//
+// Publication directory: results/<params.wig2_to_bigwig2_out> when set. The
+// process used to key publication on params.bigwig_to_wig_out (the reverse
+// conversion's parameter); that is kept only as a fallback so existing
+// configurations keep publishing.
+process wig2_to_bigwig2_sub {
+  container = "${container_url}"
+  label "big_mem_mono_cpus"
+  tag "${file_id}"
+  if (params.wig2_to_bigwig2_out != "") {
+    publishDir "results/${params.wig2_to_bigwig2_out}", mode: 'copy'
+  } else if (params.bigwig_to_wig_out != "") {
+    publishDir "results/${params.bigwig_to_wig_out}", mode: 'copy'
+  }
+
+  input:
+  tuple val(file_id), path(w_a), path(w_b)
+  tuple val(idx_id), path(fasta_idx)
+
+  output:
+  tuple val(file_id), path("${w_a.simpleName}.bw"), path("${w_b.simpleName}.bw"), emit: bw
+
+  script:
+"""
+cut -f 1,2 ${fasta_idx} > ${fasta_idx.simpleName}.sizes
+wigToBigWig -clip ${w_a} ${fasta_idx.simpleName}.sizes ${w_a.simpleName}.bw
+wigToBigWig -clip ${w_b} ${fasta_idx.simpleName}.sizes ${w_b.simpleName}.bw
+"""
+}
\ No newline at end of file
--- a/src/nf_modules/urqt/main.nf
+++ b/src/nf_modules/urqt/main.nf
+// SPDX-FileCopyrightText: 2022 Laurent Modolo <laurent.modolo@ens-lyon.fr>
+//
+// SPDX-License-Identifier: AGPL-3.0-or-later
+
+version = "d62c1f8"
+container_url = "lbmc/urqt:${version}"
+
+trim_quality = "20"
+
+params.trimming = "--t 20"
+// Trim reads with UrQt, for single-end or paired-end input.
+//
+// input:
+//   - tuple (file_id, reads): identifier (a value or a list whose first
+//     element is used as file prefix) and one or two fastq files
+// output:
+//   - fastq:  tuple (file_id, trimmed fastq.gz file(s))
+//   - report: UrQt standard output, saved as <prefix>_trimming_report.txt
+//
+// params.trimming holds the UrQt options placed first on the command line.
+process trimming {
+  container = "${container_url}"
+  label "big_mem_multi_cpus"
+  tag "${file_id}"
+
+  input:
+  tuple val(file_id), path(reads)
+
+  output:
+  // emit the declared input id; the glob covers both the paired
+  // (_trim_R1/_trim_R2) and the single-end (_trim) output names
+  tuple val(file_id), path("*_trim*.fastq.gz"), emit: fastq
+  path "*_report.txt", emit: report
+
+  script:
+  if (file_id instanceof List){
+    file_prefix = file_id[0]
+  } else {
+    file_prefix = file_id
+  }
+  if (reads.size() == 2)
+"""
+UrQt ${params.trimming} --m ${task.cpus} --gz \
+  --in ${reads[0]} --inpair ${reads[1]} \
+  --out ${file_prefix}_trim_R1.fastq.gz --outpair ${file_prefix}_trim_R2.fastq.gz \
+  > ${file_prefix}_trimming_report.txt
+"""
+  else
+"""
+UrQt ${params.trimming} --m ${task.cpus} --gz \
+  --in ${reads[0]} \
+  --out ${file_prefix}_trim.fastq.gz \
+  > ${file_prefix}_trimming_report.txt
+"""
+}
\ No newline at end of file
--- a/sge_modules @ 94be868e
+++ b/sge_modules @ 94be868e
-Subproject commit 94be868ea503b4810b110b35520d61f129035967
--- a/src/solution_RNASeq.nf
+++ b/src/solution_RNASeq.nf
+// SPDX-FileCopyrightText: 2022 Laurent Modolo <laurent.modolo@ens-lyon.fr>
+//
+// SPDX-License-Identifier: AGPL-3.0-or-later
+
+nextflow.enable.dsl=2
+
+include { fastp } from "./nf_modules/fastp/main.nf"
+include { fasta_from_bed } from "./nf_modules/bedtools/main.nf"
+include { index_fasta; mapping_fastq } from './nf_modules/kallisto/main.nf' addParams(mapping_fastq_out: "quantification/")
+
+
+params.fastq = "data/fastq/*_{1,2}.fastq"
+
+log.info "fastq files: ${params.fastq}"
+log.info "fasta file : ${params.fasta}"
+log.info "bed file : ${params.bed}"
+
+// fastq files grouped by the glob in params.fastq; size: -1 places no
+// constraint on the number of files per group
+fastq_files = channel.fromFilePairs( params.fastq, size: -1)
+
+// (simpleName, file) tuples for the fasta input; abort when nothing matches
+fasta_files = channel
+  .fromPath( params.fasta )
+  .ifEmpty { error "Cannot find any fasta files matching: ${params.fasta}" }
+  .map { fasta -> [fasta.simpleName, fasta] }
+
+// (simpleName, file) tuples for the bed input; abort when nothing matches
+bed_files = channel
+  .fromPath( params.bed )
+  .ifEmpty { error "Cannot find any bed files matching: ${params.bed}" }
+  .map { bed -> [bed.simpleName, bed] }
+
+// Entry workflow. Runs, in order:
+//   1. fastp on the fastq files,
+//   2. fasta_from_bed (bedtools module) on the fasta and bed channels,
+//   3. index_fasta (kallisto module) on the extracted fasta,
+//   4. mapping_fastq (kallisto module) on the fastp output.
+workflow {
+  fastp(fastq_files)
+  fasta_from_bed(fasta_files, bed_files)
+  index_fasta(fasta_from_bed.out.fasta)
+  // collect() gathers the index emissions into a single item, presumably so
+  // the same index is reused for every fastq item
+  mapping_fastq(index_fasta.out.index.collect(), fastp.out.fastq)
+}
--- a/src/training_dataset.config
+++ b/src/training_dataset.config
+// SPDX-FileCopyrightText: 2022 Laurent Modolo <laurent.modolo@ens-lyon.fr>
+//
+// SPDX-License-Identifier: AGPL-3.0-or-later
+
+profiles {
+  docker {
+    docker.temp = "auto"
+    docker.enabled = true
+    process {
+      withName: build_synthetic_bed {
+        container = "lbmc/bedtools:2.25.0"
+        cpus = 1
+      }
+      withName: fasta_from_bed {
+        container = "lbmc/bedtools:2.25.0"
+        cpus = 1
+      }
+      withName: index_fasta {
+        container = "lbmc/bowtie2:2.3.4.1"
+        cpus = 4
+      }
+      withName: mapping_fastq_paired {
+        container = "lbmc/bowtie2:2.3.4.1"
+        cpus = 4
+      }
+      withName: bam_2_fastq_paired {
+        container = "lbmc/samtools:1.7"
+        cpus = 4
+      }
+      withName: filter_bam_paired {
+        container = "lbmc/samtools:1.7"
+        cpus = 4
+      }
+      withName: sort_bam_paired {
+        container = "lbmc/samtools:1.7"
+        cpus = 4
+      }
+      withName: index_bam_paired {
+        container = "lbmc/samtools:1.7"
+        cpus = 4
+      }
+      withName: mapping_fastq_single {
+        container = "lbmc/bowtie2:2.3.4.1"
+        cpus = 4
+      }
+      withName: bam_2_fastq_single {
+        container = "lbmc/samtools:1.7"
+        cpus = 4
+      }
+      withName: filter_bam_single {
+        container = "lbmc/samtools:1.7"
+        cpus = 4
+      }
+      withName: sort_bam_single {
+        container = "lbmc/samtools:1.7"
+        cpus = 4
+      }
+      withName: index_bam_single {
+        container = "lbmc/samtools:1.7"
+        cpus = 4
+      }
+    }
+  }
+  singularity {
+    singularity.enabled = true
+    singularity.cacheDir = "./bin/"
+    process {
+      withName: build_synthetic_bed {
+        container = "lbmc/bedtools:2.25.0"
+        cpus = 1
+      }
+      withName: fasta_from_bed {
+        container = "lbmc/bedtools:2.25.0"
+        cpus = 1
+      }
+      withName: index_fasta {
+        container = "lbmc/bowtie2:2.3.4.1"
+        cpus = 4
+      }
+      withName: mapping_fastq_single {
+        container = "lbmc/bowtie2:2.3.4.1"
+        cpus = 4
+      }
+      withName: mapping_fastq_paired {
+        container = "lbmc/bowtie2:2.3.4.1"
+        cpus = 4
+      }
+      withName: bam_2_fastq_paired {
+        container = "lbmc/samtools:1.7"
+        cpus = 4
+      }
+      withName: filter_bam_paired {
+        container = "lbmc/samtools:1.7"
+        cpus = 4
+      }
+      withName: sort_bam_paired {
+        container = "lbmc/samtools:1.7"
+        cpus = 4
+      }
+      withName: index_bam_paired {
+        container = "lbmc/samtools:1.7"
+        cpus = 4
+      }
+      withName: bam_2_fastq_single {
+        container = "lbmc/samtools:1.7"
+        cpus = 4
+      }
+      withName: filter_bam_single {
+        container = "lbmc/samtools:1.7"
+        cpus = 4
+      }
+      withName: sort_bam_single {
+        container = "lbmc/samtools:1.7"
+        cpus = 4
+      }
+      withName: index_bam_single {
+        container = "lbmc/samtools:1.7"
+        cpus = 4
+      }
+    }
+  }
+  psmn {
+    process{
+      withName: build_synthetic_bed {
+        beforeScript = "source $baseDir/.conda_psmn.sh"
+        conda = "$baseDir/.conda_envs/bedtools_2.25.0"
+        executor = "sge"
+        clusterOptions = "-m e -cwd -V"
+        cpus = 1
+        memory = "20GB"
+        time = "12h"
+        queue = "monointeldeb128"
+      }
+      withName: fasta_from_bed {
+        beforeScript = "source $baseDir/.conda_psmn.sh"
+        conda = "$baseDir/.conda_envs/bedtools_2.25.0"
+        executor = "sge"
+        clusterOptions = "-m e -cwd -V"
+        cpus = 1
+        memory = "20GB"
+        time = "12h"
+        queue = "monointeldeb128"
+      }
+      withName: index_fasta {
+        beforeScript = "source $baseDir/.conda_psmn.sh"
+        conda = "$baseDir/.conda_envs/bowtie2_2.3.4.1"
+        executor = "sge"
+        clusterOptions = "-m e -cwd -V"
+        cpus = 32
+        memory = "20GB"
+        time = "12h"
+        queue = "CLG6242deb384A,CLG6242deb384C,CLG5218deb192A,CLG5218deb192B,CLG5218deb192C,CLG5218deb192D,SLG5118deb96,SLG6142deb384A,SLG6142deb384B,SLG6142deb384C,SLG6142deb384D"
+        penv = "openmp32"
+      }
+      withName: mapping_fastq_paired {
+        beforeScript = "source $baseDir/.conda_psmn.sh"
+        conda = "$baseDir/.conda_envs/bowtie2_2.3.4.1"
+        executor = "sge"
+        clusterOptions = "-m e -cwd -V"
+        cpus = 32
+        memory = "30GB"
+        time = "24h"
+        queue = "CLG6242deb384A,CLG6242deb384C,CLG5218deb192A,CLG5218deb192B,CLG5218deb192C,CLG5218deb192D,SLG5118deb96,SLG6142deb384A,SLG6142deb384B,SLG6142deb384C,SLG6142deb384D"
+        penv = "openmp32"
+      }
+      withName: bam_2_fastq_paired {
+        beforeScript = "source $baseDir/.conda_psmn.sh"
+        conda = "$baseDir/.conda_envs/samtools_1.7"
+        executor = "sge"
+        clusterOptions = "-m e -cwd -V"
+        cpus = 32
+        memory = "30GB"
+        time = "24h"
+        queue = "CLG6242deb384A,CLG6242deb384C,CLG5218deb192A,CLG5218deb192B,CLG5218deb192C,CLG5218deb192D,SLG5118deb96,SLG6142deb384A,SLG6142deb384B,SLG6142deb384C,SLG6142deb384D"
+        penv = "openmp32"
+      }
+      withName: sort_bam_paired {
+        beforeScript = "source $baseDir/.conda_psmn.sh"
+        conda = "$baseDir/.conda_envs/samtools_1.7"
+        executor = "sge"
+        clusterOptions = "-m e -cwd -V"
+        cpus = 32
+        memory = "30GB"
+        time = "24h"
+        queue = "CLG6242deb384A,CLG6242deb384C,CLG5218deb192A,CLG5218deb192B,CLG5218deb192C,CLG5218deb192D,SLG5118deb96,SLG6142deb384A,SLG6142deb384B,SLG6142deb384C,SLG6142deb384D"
+        penv = "openmp32"
+      }
+      withName: index_bam_paired {
+        beforeScript = "source $baseDir/.conda_psmn.sh"
+        conda = "$baseDir/.conda_envs/samtools_1.7"
+        executor = "sge"
+        clusterOptions = "-m e -cwd -V"
+        cpus = 32
+        memory = "30GB"
+        time = "24h"
+        queue = "CLG6242deb384A,CLG6242deb384C,CLG5218deb192A,CLG5218deb192B,CLG5218deb192C,CLG5218deb192D,SLG5118deb96,SLG6142deb384A,SLG6142deb384B,SLG6142deb384C,SLG6142deb384D"
+        penv = "openmp32"
+      }
+      withName: mapping_fastq_single {
+        beforeScript = "source $baseDir/.conda_psmn.sh"
+        conda = "$baseDir/.conda_envs/bowtie2_2.3.4.1"
+        executor = "sge"
+        clusterOptions = "-m e -cwd -V"
+        cpus = 32
+        memory = "30GB"
+        time = "24h"
+        queue = "CLG6242deb384A,CLG6242deb384C,CLG5218deb192A,CLG5218deb192B,CLG5218deb192C,CLG5218deb192D,SLG5118deb96,SLG6142deb384A,SLG6142deb384B,SLG6142deb384C,SLG6142deb384D"
+        penv = "openmp32"
+      }
+      withName: bam_2_fastq_single {
+        beforeScript = "source $baseDir/.conda_psmn.sh"
+        conda = "$baseDir/.conda_envs/samtools_1.7"
+        executor = "sge"
+        clusterOptions = "-m e -cwd -V"
+        cpus = 32
+        memory = "30GB"
+        time = "24h"
+        queue = "CLG6242deb384A,CLG6242deb384C,CLG5218deb192A,CLG5218deb192B,CLG5218deb192C,CLG5218deb192D,SLG5118deb96,SLG6142deb384A,SLG6142deb384B,SLG6142deb384C,SLG6142deb384D"
+        penv = "openmp32"
+      }
+      withName: sort_bam_single {
+        beforeScript = "source $baseDir/.conda_psmn.sh"
+        conda = "$baseDir/.conda_envs/samtools_1.7"
+        executor = "sge"
+        clusterOptions = "-m e -cwd -V"
+        cpus = 32
+        memory = "30GB"
+        time = "24h"
+        queue = "CLG6242deb384A,CLG6242deb384C,CLG5218deb192A,CLG5218deb192B,CLG5218deb192C,CLG5218deb192D,SLG5118deb96,SLG6142deb384A,SLG6142deb384B,SLG6142deb384C,SLG6142deb384D"
+        penv = "openmp32"
+      }
+      withName: index_bam_single {
+        beforeScript = "source $baseDir/.conda_psmn.sh"
+        conda = "$baseDir/.conda_envs/samtools_1.7"
+        executor = "sge"
+        clusterOptions = "-m e -cwd -V"
+        cpus = 32
+        memory = "30GB"
+        time = "24h"
+        queue = "CLG6242deb384A,CLG6242deb384C,CLG5218deb192A,CLG5218deb192B,CLG5218deb192C,CLG5218deb192D,SLG5118deb96,SLG6142deb384A,SLG6142deb384B,SLG6142deb384C,SLG6142deb384D"
+        penv = "openmp32"
+      }
+    }
+  }
+  ccin2p3 {
+    singularity.enabled = true
+    singularity.cacheDir = "$baseDir/.singularity_in2p3/"
+    singularity.runOptions = "--bind /pbs,/sps,/scratch"
+    // NOTE(review): this process scope only configures fasta_from_bed, and
+    // the process scope that follows declares a fasta_from_bed selector with
+    // the same settings; this first scope looks redundant. Confirm before
+    // removing.
+    process{
+      withName: fasta_from_bed {
+        container = "lbmc/bedtools:2.25.0"
+        scratch = true
+        stageInMode = "copy"
+        stageOutMode = "rsync"
+        executor = "sge"
+        clusterOptions = "-P P_lbmc -l os=cl7 -l sps=1 -r n"
+        cpus = 1
+        queue = "huge"
+      }
+    }
+    process{
+      withName: build_synthetic_bed {
+        container = "lbmc/bedtools:2.25.0"
+        scratch = true
+        stageInMode = "copy"
+        stageOutMode = "rsync"
+        executor = "sge"
+        clusterOptions = "-P P_lbmc -l os=cl7 -l sps=1 -r n"
+        cpus = 1
+        queue = "huge"
+      }
+      withName: fasta_from_bed {
+        container = "lbmc/bedtools:2.25.0"
+        scratch = true
+        stageInMode = "copy"
+        stageOutMode = "rsync"
+        executor = "sge"
+        clusterOptions = "-P P_lbmc -l os=cl7 -l sps=1 -r n"
+        cpus = 1
+        queue = "huge"
+      }
+      withName: index_fasta {
+        container = "lbmc/bowtie2:2.3.4.1"
+        scratch = true
+        stageInMode = "copy"
+        stageOutMode = "rsync"
+        executor = "sge"
+        clusterOptions = "-P P_lbmc -l os=cl7 -l sps=1 -r n"
+        cpus = 1
+        queue = "huge"
+      }
+      withName: mapping_fastq_paired {
+        container = "lbmc/bowtie2:2.3.4.1"
+        scratch = true
+        stageInMode = "copy"
+        stageOutMode = "rsync"
+        executor = "sge"
+        clusterOptions = "-P P_lbmc -l os=cl7 -l sps=1 -r n"
+        cpus = 1
+        queue = "huge"
+      }
+      withName: bam_2_fastq_paired {
+        container = "lbmc/samtools:1.7"
+        scratch = true
+        stageInMode = "copy"
+        stageOutMode = "rsync"
+        executor = "sge"
+        clusterOptions = "-P P_lbmc -l os=cl7 -l sps=1 -r n"
+        cpus = 1
+        queue = "huge"
+      }
+      withName: sort_bam_paired {
+        container = "lbmc/samtools:1.7"
+        scratch = true
+        stageInMode = "copy"
+        stageOutMode = "rsync"
+        executor = "sge"
+        clusterOptions = "-P P_lbmc -l os=cl7 -l sps=1 -r n"
+        cpus = 1
+        queue = "huge"
+      }
+      withName: index_bam_paired {
+        container = "lbmc/samtools:1.7"
+        scratch = true
+        stageInMode = "copy"
+        stageOutMode = "rsync"
+        executor = "sge"
+        clusterOptions = "-P P_lbmc -l os=cl7 -l sps=1 -r n"
+        cpus = 1
+        queue = "huge"
+      }
+      withName: mapping_fastq_single {
+        container = "lbmc/bowtie2:2.3.4.1"
+        scratch = true
+        stageInMode = "copy"
+        stageOutMode = "rsync"
+        executor = "sge"
+        clusterOptions = "-P P_lbmc -l os=cl7 -l sps=1 -r n"
+        cpus = 1
+        queue = "huge"
+      }
+      withName: bam_2_fastq_single {
+        container = "lbmc/samtools:1.7"
+        scratch = true
+        stageInMode = "copy"
+        stageOutMode = "rsync"
+        executor = "sge"
+        clusterOptions = "-P P_lbmc -l os=cl7 -l sps=1 -r n"
+        cpus = 1
+        queue = "huge"
+      }
+      withName: sort_bam_single {
+        container = "lbmc/samtools:1.7"
+        scratch = true
+        stageInMode = "copy"
+        stageOutMode = "rsync"
+        executor = "sge"
+        clusterOptions = "-P P_lbmc -l os=cl7 -l sps=1 -r n"
+        cpus = 1
+        queue = "huge"
+      }
+      withName: index_bam_single {
+        container = "lbmc/samtools:1.7"
+        scratch = true
+        stageInMode = "copy"
+        stageOutMode = "rsync"
+        executor = "sge"
+        clusterOptions = "-P P_lbmc -l os=cl7 -l sps=1 -r n"
+        cpus = 1
+        queue = "huge"
+      }
+    }
+  }
+}
--- a/src/training_dataset.nf
+++ b/src/training_dataset.nf
+// SPDX-FileCopyrightText: 2022 Laurent Modolo <laurent.modolo@ens-lyon.fr>
+//
+// SPDX-License-Identifier: AGPL-3.0-or-later
+
+/*
+small pipeline to build a training dataset from whole genome data
+
+input:
+- fasta
+- fastq
+- chromosome
+- start position
+- stop position
+
+output:
+- sort fasta
+- sort fastq
+
+example for paired-end data:
+./nextflow src/training_dataset.nf -c src/training_dataset.config --fasta "data/genome.fa" --fastq_paired "data/*_R{1,2}.fastq.gz" --chromosome "X" --start 5305683 --stop 5333928 -resume
+
+example for single-end data:
+./nextflow src/training_dataset.nf -c src/training_dataset.config --fasta "data/genome.fa" --fastq_single  "data/*_R1.fastq.gz"  --chromosome "X" --start 5305683 --stop 5333928 -resume
+
+*/
+
+// Both fastq inputs are optional: an empty string disables the matching
+// (paired-end or single-end) branch of the pipeline.
+params.fastq_paired = ""
+params.fastq_single = ""
+
+log.info "fasta files : ${params.fasta}"
+log.info "fastq paired files : ${params.fastq_paired}"
+log.info "fastq single files : ${params.fastq_single}"
+log.info "chromosome : ${params.chromosome}"
+log.info "start position : ${params.start}"
+log.info "stop position : ${params.stop}"
+
+
+// Genome fasta file(s); abort with an explicit message when nothing matches.
+Channel
+  .fromPath( params.fasta )
+  .ifEmpty { error "Cannot find any fasta files matching: ${params.fasta}" }
+  .set { fasta_file }
+
+
+// Write synthetic.bed, a one-line BED file holding the region given by
+// params.chromosome, params.start and params.stop.
+//
+// output: the *.bed file, sent to the bed_files channel
+process build_synthetic_bed {
+  tag "${chromosome}:${start}-${stop}"
+  cpus 4
+
+  input:
+  val chromosome from params.chromosome
+  val start from params.start
+  val stop from params.stop
+
+  output:
+  file "*.bed" into bed_files
+
+  script:
+"""
+echo "${chromosome}\t${start}\t${stop}" > synthetic.bed
+"""
+}
+
+// Extract the region described by the bed file from the fasta with
+// `bedtools getfasta`. The result is written to s<fasta baseName>.fasta,
+// published to results/training/fasta/ and sent to fasta_files_extracted.
+process fasta_from_bed {
+  tag "${fasta.baseName}"
+  cpus 4
+  publishDir "results/training/fasta/", mode: 'copy'
+
+  input:
+  file fasta from fasta_file
+  file bed from bed_files
+  // NOTE(review): chromosome is declared as an input but is not referenced
+  // in the script below
+  val chromosome from params.chromosome
+
+  output:
+  file "*.fasta" into fasta_files_extracted
+
+  script:
+"""
+bedtools getfasta \
+-fi ${fasta} -bed ${bed} -fo s${fasta.baseName}.fasta
+"""
+}
+
+// Build a bowtie2 index named <fasta baseName>.index from the extracted
+// fasta. bowtie2-build output (stdout and stderr) goes to
+// <fasta baseName>_bowtie2_report.txt; the task exits with status 1 when
+// that report contains "Error".
+//
+// output: index files into index_files, report into indexing_report
+process index_fasta {
+  tag "$fasta.baseName"
+  cpus 4
+  publishDir "results/training/mapping/index/", mode: 'copy'
+
+  input:
+    file fasta from fasta_files_extracted
+
+  output:
+    file "*.index*" into index_files
+    file "*_report.txt" into indexing_report
+
+  script:
+"""
+bowtie2-build --threads ${task.cpus} ${fasta} ${fasta.baseName}.index &> ${fasta.baseName}_bowtie2_report.txt
+
+if grep -q "Error" ${fasta.baseName}_bowtie2_report.txt; then
+  exit 1
+fi
+"""
+}
+
+if ( params.fastq_paired != "" ) {
+  // Paired fastq files matching params.fastq_paired; abort when none match.
+  Channel
+    .fromFilePairs( params.fastq_paired )
+    .ifEmpty { error "Cannot find any fastq files matching: ${params.fastq_paired}" }
+    .set { fastq_files_paired }
+
+  // Map paired-end reads with bowtie2 (--very-sensitive) and convert the
+  // alignments to BAM with samtools view. bowtie2 stderr is saved as
+  // <pair_id>_bowtie2_report.txt; the task exits with status 1 when that
+  // report contains "Error".
+  //
+  // output: (pair_id, bam) into bam_files_paired, report into mapping_report
+  process mapping_fastq_paired {
+    tag "$pair_id"
+    cpus 4
+
+    input:
+    set pair_id, file(reads) from fastq_files_paired
+    file index from index_files.collect()
+
+    output:
+    set pair_id, "*.bam" into bam_files_paired
+    file "*_report.txt" into mapping_report
+
+    script:
+    // Derive the index prefix passed to -x: take the file ending in .1.bt2
+    // (but not .rev.1.bt2) and strip that suffix; falls back to the first
+    // index file when no such file is found.
+    index_id = index[0]
+    for (index_file in index) {
+      if (index_file =~ /.*\.1\.bt2/ && !(index_file =~ /.*\.rev\.1\.bt2/)) {
+          index_id = ( index_file =~ /(.*)\.1\.bt2/)[0][1]
+      }
+    }
+  """
+  bowtie2 --very-sensitive -p ${task.cpus} -x ${index_id} \
+  -1 ${reads[0]} -2 ${reads[1]} 2> \
+  ${pair_id}_bowtie2_report.txt | \
+  samtools view -Sb - > ${pair_id}.bam
+
+  if grep -q "Error" ${pair_id}_bowtie2_report.txt; then
+    exit 1
+  fi
+  """
+  }
+
+  // Duplicate the BAM channel: one copy feeds the fastq extraction, the
+  // other feeds the BAM filtering.
+  bam_files_paired.into{ bam_files_paired_fa; bam_files_paired_ba}
+
+  // Write the reads of each BAM back to fastq with `samtools fastq`, as
+  // s<file_id>_R1.fastq and s<file_id>_R2.fastq. -F 0x4 excludes reads
+  // flagged as unmapped. Files are published to results/training/fastq/.
+  process bam_2_fastq_paired {
+    tag "$file_id"
+    publishDir "results/training/fastq/", mode: 'copy'
+
+    input:
+      set file_id, file(bam) from bam_files_paired_fa
+
+    output:
+      set file_id, "*.fastq" into fastq_files_extracted
+    script:
+  """
+  samtools fastq -1 s${file_id}_R1.fastq -2 s${file_id}_R2.fastq -F 0x4 ${bam}
+  """
+  }
+
+  // Keep only reads not flagged as unmapped (-F 0x4) and write them to
+  // f<file_id>.bam.
+  process filter_bam_paired {
+    tag "$file_id"
+    cpus 4
+
+    input:
+      set file_id, file(bam) from bam_files_paired_ba
+      // NOTE(review): bed is staged but not referenced in the script below;
+      // confirm whether a region filter was intended
+      file bed from bed_files
+
+    output:
+      set file_id, "*.bam" into filtered_bam_files_paired
+    script:
+  """
+  samtools view -@ ${task.cpus} -hb ${bam} -F 0x4 > f${file_id}.bam
+  """
+  }
+
+  // Sort the filtered BAM with `samtools sort` into s<file_id>.bam and
+  // publish it to results/training/bams/.
+  process sort_bam_paired {
+    tag "$file_id"
+    publishDir "results/training/bams/", mode: 'copy'
+    cpus 4
+
+    input:
+      set file_id, file(bam) from filtered_bam_files_paired
+
+    output:
+      set file_id, "*.bam" into sorted_bam_files_paired
+
+    script:
+  """
+  samtools sort -@ ${task.cpus} -O BAM -o s${file_id}.bam ${bam}
+  """
+  }
+
+  // Index the sorted BAM with `samtools index`; files matching *.bam* are
+  // collected as output and published to results/training/bams/.
+  process index_bam_paired {
+    tag "$file_id"
+    publishDir "results/training/bams/", mode: 'copy'
+
+    input:
+      set file_id, file(bam) from sorted_bam_files_paired
+
+    output:
+      set file_id, "*.bam*" into indexed_bam_file_paired
+
+    script:
+  """
+  samtools index ${bam}
+  """
+  }
+}
+
+
+if ( params.fastq_single != "" ) {
+  // Single-end fastq files matching params.fastq_single, emitted as
+  // (id, file) where id is the part of the baseName before the first dot;
+  // abort when none match.
+  Channel
+    .fromPath( params.fastq_single )
+    .ifEmpty { error "Cannot find any fastq files matching: ${params.fastq_single}" }
+    .map { it -> [(it.baseName =~ /([^\.]*)/)[0][1], it]}
+    .set { fastq_files_single }
+
+  // Map single-end reads with bowtie2 (--very-sensitive) and convert the
+  // alignments to BAM with samtools view. bowtie2 stderr is saved as
+  // <file_id>_bowtie2_report.txt; the task exits with status 1 when that
+  // report contains "Error".
+  //
+  // output: (file_id, bam) into bam_files_single, report into mapping_report
+  process mapping_fastq_single {
+    tag "$file_id"
+    cpus 4
+
+    input:
+    set file_id, file(reads) from fastq_files_single
+    file index from index_files.collect()
+
+    output:
+    set file_id, "*.bam" into bam_files_single
+    file "*_report.txt" into mapping_report
+
+    script:
+    // Derive the index prefix passed to -x: take the file ending in .1.bt2
+    // (but not .rev.1.bt2) and strip that suffix; falls back to the first
+    // index file when no such file is found.
+    index_id = index[0]
+    for (index_file in index) {
+      if (index_file =~ /.*\.1\.bt2/ && !(index_file =~ /.*\.rev\.1\.bt2/)) {
+          index_id = ( index_file =~ /(.*)\.1\.bt2/)[0][1]
+      }
+    }
+  """
+  bowtie2 --very-sensitive -p ${task.cpus} -x ${index_id} \
+  -U ${reads} 2> \
+  ${file_id}_bowtie2_report.txt | \
+  samtools view -Sb - > ${file_id}.bam
+
+  if grep -q "Error" ${file_id}_bowtie2_report.txt; then
+    exit 1
+  fi
+  """
+  }
+
+  // Duplicate the BAM channel: one copy feeds the fastq extraction, the
+  // other feeds the BAM filtering.
+  bam_files_single.into{ bam_files_single_fa; bam_files_single_ba}
+
+  // Write the reads of each BAM back to fastq with `samtools fastq`, as
+  // s<file_id>.fastq. -F 0x4 excludes reads flagged as unmapped.
+  // Files are published to results/training/fastq/, like the paired-end
+  // counterpart bam_2_fastq_paired.
+  process bam_2_fastq_single {
+    tag "$file_id"
+    publishDir "results/training/fastq/", mode: 'copy'
+
+    input:
+      set file_id, file(bam) from bam_files_single_fa
+
+    output:
+      set file_id, "*.fastq" into fastq_files_extracted
+    script:
+  """
+  samtools fastq -0 s${file_id}.fastq -F 0x4 ${bam}
+  """
+  }
+
+  // Keep only reads not flagged as unmapped (-F 0x4) and write them to
+  // f<file_id>.bam.
+  process filter_bam_single {
+    tag "$file_id"
+    cpus 4
+
+    input:
+      set file_id, file(bam) from bam_files_single_ba
+      // NOTE(review): bed is staged but not referenced in the script below;
+      // confirm whether a region filter was intended
+      file bed from bed_files
+
+    output:
+      set file_id, "*.bam" into filtered_bam_files_single
+    script:
+  """
+  samtools view -@ ${task.cpus} -hb ${bam} -F 0x4 > f${file_id}.bam
+  """
+  }
+
+  // Sort the filtered BAM with `samtools sort` into s<file_id>.bam and
+  // publish it to results/training/bams/.
+  process sort_bam_single {
+    tag "$file_id"
+    publishDir "results/training/bams/", mode: 'copy'
+    cpus 4
+
+    input:
+      set file_id, file(bam) from filtered_bam_files_single
+
+    output:
+      set file_id, "*.bam" into sorted_bam_files_single
+
+    script:
+  """
+  samtools sort -@ ${task.cpus} -O BAM -o s${file_id}.bam ${bam}
+  """
+  }
+
+  // Index the sorted BAM with `samtools index`; files matching *.bam* are
+  // collected as output and published to results/training/bams/.
+  process index_bam_single {
+    tag "$file_id"
+    publishDir "results/training/bams/", mode: 'copy'
+
+    input:
+      set file_id, file(bam) from sorted_bam_files_single
+
+    output:
+      set file_id, "*.bam*" into indexed_bam_file_single
+
+    script:
+  """
+  samtools index ${bam}
+  """
+  }
+}
+
No results found