From 6f2eec1ed16f5ef0c42d2ad346551125f4bc0227 Mon Sep 17 00:00:00 2001
From: Laurent Modolo <laurent.modolo@ens-lyon.fr>
Date: Thu, 23 Aug 2018 15:26:57 +0200
Subject: [PATCH] add pipeline to create training dataset

---
 src/training_dataset.config |  70 +++++++++++
 src/training_dataset.nf     | 236 ++++++++++++++++++++++++++++++++++++
 2 files changed, 306 insertions(+)
 create mode 100644 src/training_dataset.config
 create mode 100644 src/training_dataset.nf

diff --git a/src/training_dataset.config b/src/training_dataset.config
new file mode 100644
index 0000000..5dc8127
--- /dev/null
+++ b/src/training_dataset.config
@@ -0,0 +1,70 @@
+profiles {
+  docker {
+    docker.temp = 'auto'
+    docker.enabled = true
+    process {
+      $build_synthetic_bed {
+        container = "bedtools:2.25.0"
+      }
+      $fasta_from_bed {
+        container = "bedtools:2.25.0"
+      }
+      $index_fasta {
+        container = "bowtie2:2.3.4.1"
+      }
+      $mapping_fastq_paired {
+        container = "bowtie2:2.3.4.1"
+      }
+      $bam_2_fastq_paired {
+        container = "samtools:1.7"
+      }
+      $mapping_fastq_single {
+        container = "bowtie2:2.3.4.1"
+      }
+      $bam_2_fastq_single {
+        container = "samtools:1.7"
+      }
+    }
+  }
+  sge {
+    process {
+      $build_synthetic_bed {
+        beforeScript = "module purge; module load BEDtools/2.25.0"
+        executor = "sge"
+        cpus = 1
+        memory = "5GB"
+        time = "6h"
+        queueSize = 1000
+        pollInterval = '60sec'
+        queue = 'h6-E5-2667v4deb128'
+        penv = 'openmp8'
+      }
+      $fasta_from_bed {
+        beforeScript = "module purge; module load BEDtools/2.25.0"
+        executor = "sge"
+        cpus = 1
+        memory = "5GB"
+        time = "6h"
+        queueSize = 1000
+        pollInterval = '60sec'
+        queue = 'h6-E5-2667v4deb128'
+        penv = 'openmp8'
+      }
+      $index_fasta {
+        beforeScript = "module purge; module load Bowtie2/2.3.4.1"
+      }
+      $mapping_fastq_paired {
+        beforeScript = "module purge; module load SAMtools/1.7; module load Bowtie2/2.3.4.1"
+      }
+      $bam_2_fastq_paired {
+        beforeScript = "module purge; module load SAMtools/1.7"
+      }
+      $mapping_fastq_single {
+        beforeScript = "module purge; module load SAMtools/1.7; module load Bowtie2/2.3.4.1"
+      }
+      $bam_2_fastq_single {
+        beforeScript = "module purge; module load SAMtools/1.7"
+      }
+    }
+  }
+}
diff --git a/src/training_dataset.nf b/src/training_dataset.nf
new file mode 100644
index 0000000..c8c7e66
--- /dev/null
+++ b/src/training_dataset.nf
@@ -0,0 +1,236 @@
+/*
+small pipeline to build a training dataset from whole genome data
+
+input:
+- fasta
+- fastq
+- chromosome
+- start position
+- stop position
+
+output:
+- the fasta of the selected region
+- the fastq of the reads mapping to it
+- the filtered bam of those reads
+*/
+
+params.fastq_paired = ""
+params.fastq_single = ""
+
+log.info "fasta files : ${params.fasta}"
+log.info "fastq paired files : ${params.fastq_paired}"
+log.info "fastq single files : ${params.fastq_single}"
+log.info "chromosome : ${params.chromosome}"
+log.info "start position : ${params.start}"
+log.info "stop position : ${params.stop}"
+
+
+Channel
+  .fromPath( params.fasta )
+  .ifEmpty { error "Cannot find any fasta files matching: ${params.fasta}" }
+  .set { fasta_file }
+
+
+// write the selected region as a one-line bed file
+process build_synthetic_bed {
+  tag "${chromosome}:${start}-${stop}"
+  cpus 4
+
+  input:
+    val chromosome from params.chromosome
+    val start from params.start
+    val stop from params.stop
+
+  output:
+    file "*.bed" into bed_files
+
+  script:
+"""
+echo "${chromosome}\t${start}\t${stop}" > synthetic.bed
+"""
+}
+
+// extract the fasta sequence of the region described in the bed file
+process fasta_from_bed {
+  tag "${fasta.baseName}"
+  cpus 4
+  publishDir "results/training/fasta/", mode: 'copy'
+
+  input:
+    file fasta from fasta_file
+    file bed from bed_files
+
+  output:
+    file "*.fasta" into fasta_files_extracted
+
+  script:
+"""
+bedtools getfasta -name \
+-fi ${fasta} -bed ${bed} -fo ${fasta.baseName}_S.fasta
+"""
+}
+
+process index_fasta {
+  tag "$fasta.baseName"
+  cpus 4
+  publishDir "results/training/mapping/index/", mode: 'copy'
+
+  input:
+    file fasta from fasta_files_extracted
+
+  output:
+    file "*.index*" into index_files
+    file "*_report.txt" into indexing_report
+
+  script:
+"""
+bowtie2-build --threads ${task.cpus} ${fasta} ${fasta.baseName}.index &> ${fasta.baseName}_bowtie2_report.txt
+
+if grep -q "Error" ${fasta.baseName}_bowtie2_report.txt; then
+  exit 1
+fi
+"""
+}
+
+// the index is needed by both mapping processes, so duplicate the channel
+index_files.into { index_files_paired; index_files_single }
+
+if ( params.fastq_paired != "" ) {
+  Channel
+    .fromFilePairs( params.fastq_paired )
+    .ifEmpty { error "Cannot find any fastq files matching: ${params.fastq_paired}" }
+    .set { fastq_files_paired }
+
+  process mapping_fastq_paired {
+    tag "$pair_id"
+    cpus 4
+
+    input:
+      set pair_id, file(reads) from fastq_files_paired
+      file index from index_files_paired.collect()
+
+    output:
+      set pair_id, "*.bam" into bam_files_paired
+      file "*_report.txt" into mapping_report_paired
+
+    script:
+    // recover the index prefix from the name of the *.1.bt2 file
+    index_id = index[0]
+    for (index_file in index) {
+      if (index_file =~ /.*\.1\.bt2/ && !(index_file =~ /.*\.rev\.1\.bt2/)) {
+        index_id = ( index_file =~ /(.*)\.1\.bt2/)[0][1]
+      }
+    }
+    """
+    bowtie2 --very-sensitive -p ${task.cpus} -x ${index_id} \
+    -1 ${reads[0]} -2 ${reads[1]} 2> \
+    ${pair_id}_bowtie2_report.txt | \
+    samtools view -Sb - > ${pair_id}.bam
+
+    if grep -q "Error" ${pair_id}_bowtie2_report.txt; then
+      exit 1
+    fi
+    """
+  }
+
+  bam_files_paired.into { bam_files_paired_fa; bam_files_paired_ba }
+
+  process bam_2_fastq_paired {
+    tag "$file_id"
+    publishDir "results/training/fastq/", mode: 'copy'
+
+    input:
+      set file_id, file(bam) from bam_files_paired_fa
+
+    output:
+      set file_id, "*.fastq" into fastq_files_extracted_paired
+
+    script:
+    """
+    samtools fastq -1 ${file_id}_SR1.fastq -2 ${file_id}_SR2.fastq -f 0x2 ${bam}
+    """
+  }
+
+  process filter_bam_paired {
+    tag "$file_id"
+    publishDir "results/training/bams/", mode: 'copy'
+    cpus 4
+
+    input:
+      set file_id, file(bam) from bam_files_paired_ba
+
+    output:
+      set file_id, "*.bam" into filtered_bam_files_paired
+
+    script:
+    // keep only the reads mapped in a proper pair (flag 0x2)
+    """
+    samtools view -@ ${task.cpus} -hb ${bam} -f 0x2 > ${file_id}_S.bam
+    """
+  }
+}
+
+
+if ( params.fastq_single != "" ) {
+  Channel
+    .fromPath( params.fastq_single )
+    .ifEmpty { error "Cannot find any fastq files matching: ${params.fastq_single}" }
+    .map { it -> [(it.baseName =~ /([^\.]*)/)[0][1], it] }
+    .set { fastq_files_single }
+
+  process mapping_fastq_single {
+    tag "$file_id"
+    cpus 4
+
+    input:
+      set file_id, file(reads) from fastq_files_single
+      file index from index_files_single.collect()
+
+    output:
+      set file_id, "*.bam" into bam_files_single
+      file "*_report.txt" into mapping_report_single
+
+    script:
+    // recover the index prefix from the name of the *.1.bt2 file
+    index_id = index[0]
+    for (index_file in index) {
+      if (index_file =~ /.*\.1\.bt2/ && !(index_file =~ /.*\.rev\.1\.bt2/)) {
+        index_id = ( index_file =~ /(.*)\.1\.bt2/)[0][1]
+      }
+    }
+    """
+    bowtie2 --very-sensitive -p ${task.cpus} -x ${index_id} \
+    -U ${reads} 2> \
+    ${file_id}_bowtie2_report.txt | \
+    samtools view -Sb - > ${file_id}.bam
+
+    if grep -q "Error" ${file_id}_bowtie2_report.txt; then
+      exit 1
+    fi
+    """
+  }
+
+  bam_files_single.into { bam_files_single_fa; bam_files_single_ba }
+
+  process bam_2_fastq_single {
+    tag "$file_id"
+    publishDir "results/training/fastq/", mode: 'copy'
+
+    input:
+      set file_id, file(bam) from bam_files_single_fa
+
+    output:
+      set file_id, "*.fastq" into fastq_files_extracted_single
+
+    script:
+    // single-end reads are never flagged 0x2 (mapped in proper pair),
+    // so keep the mapped reads (-F 0x4) instead
+    """
+    samtools fastq -F 0x4 ${bam} > ${file_id}_S.fastq
+    """
+  }
+
+  process filter_bam_single {
+    tag "$file_id"
+    publishDir "results/training/bams/", mode: 'copy'
+    cpus 4
+
+    input:
+      set file_id, file(bam) from bam_files_single_ba
+
+    output:
+      set file_id, "*_S.bam" into filtered_bam_files_single
+
+    script:
+    // keep only the mapped reads (-F 0x4 excludes the unmapped flag)
+    """
+    samtools view -@ ${task.cpus} -hb ${bam} -F 0x4 > ${file_id}_S.bam
+    """
+  }
+}

--
GitLab
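For reference, a hypothetical invocation of this pipeline is sketched below.
The file paths, glob pattern and region coordinates are placeholders, not
part of the patch; --fasta, --chromosome, --start and --stop must be
supplied on the command line (they have no defaults), plus at least one of
--fastq_paired / --fastq_single:

  # paired-end mode, running in the docker profile defined in the config
  nextflow run src/training_dataset.nf -c src/training_dataset.config \
    -profile docker \
    --fasta "data/genome.fasta" \
    --fastq_paired "data/*_R{1,2}.fastq" \
    --chromosome "X" --start 10000 --stop 50000

The region fasta is then published under results/training/fasta/, the
bowtie2 index under results/training/mapping/index/, the extracted fastq
under results/training/fastq/ and the filtered bam under
results/training/bams/.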