Skip to content
Snippets Groups Projects
Verified Commit 6f2eec1e authored by Laurent Modolo's avatar Laurent Modolo
Browse files

add pipeline to create training dataset

parent 3e38d39c
No related branches found
No related tags found
No related merge requests found
profiles {
docker {
  // Profile that runs every process of the pipeline inside a Docker container.
  // 'auto' asks Nextflow to create a fresh temporary directory for each
  // container and mount it as the container's /tmp.
  docker.temp = 'auto'
  docker.enabled = true
  process {
    // One entry per process name; the image provides the tool that the
    // process script calls.
    $build_synthetic_bed {
      container = "bedtools:2.25.0"
    }
    $fasta_from_bed {
      container = "bedtools:2.25.0"
    }
    $index_fasta {
      container = "bowtie2:2.3.4.1"
    }
    $mapping_fastq_paired {
      container = "bowtie2:2.3.4.1"
    }
    $bam_2_fastq_paired {
      container = "samtools:1.7"
    }
    $mapping_fastq_single {
      container = "bowtie2:2.3.4.1"
    }
    $bam_2_fastq_single {
      container = "samtools:1.7"
    }
    // filter_bam_paired / filter_bam_single run `samtools view`; they had no
    // container entry, so with docker.enabled they had no image to run in.
    $filter_bam_paired {
      container = "samtools:1.7"
    }
    $filter_bam_single {
      container = "samtools:1.7"
    }
    // NOTE(review): mapping_fastq_* pipe bowtie2 into samtools, so the
    // "bowtie2:2.3.4.1" image presumably ships samtools as well - confirm.
  }
}
sge {
  // Profile for an SGE cluster where tools are provided through
  // environment modules loaded in each task's beforeScript.
  process {
    $build_synthetic_bed {
      beforeScript = "module purge; module load BEDtools/2.25.0"
      executor = "sge"
      cpus = 1
      memory = "5GB"
      time = "6h"
      // NOTE(review): queueSize and pollInterval are documented as
      // executor-scope settings; confirm they take effect from here.
      queueSize = 1000
      pollInterval = '60sec'
      queue = 'h6-E5-2667v4deb128'
      penv = 'openmp8'
    }
    $fasta_from_bed {
      beforeScript = "module purge; module load BEDtools/2.25.0"
      executor = "sge"
      cpus = 1
      memory = "5GB"
      time = "6h"
      queueSize = 1000
      pollInterval = '60sec'
      queue = 'h6-E5-2667v4deb128'
      penv = 'openmp8'
    }
    // NOTE(review): the processes below only load modules; they set no
    // executor/queue, unlike the two above - confirm this is intended.
    $index_fasta {
      beforeScript = "module purge; module load Bowtie2/2.3.4.1"
    }
    $mapping_fastq_paired {
      beforeScript = "module purge; module load SAMtools/1.7; module load Bowtie2/2.3.4.1"
    }
    $bam_2_fastq_paired {
      beforeScript = "module purge; module load SAMtools/1.7"
    }
    $mapping_fastq_single {
      beforeScript = "module purge; module load SAMtools/1.7; module load Bowtie2/2.3.4.1"
    }
    $bam_2_fastq_single {
      beforeScript = "module purge; module load SAMtools/1.7"
    }
    // filter_bam_paired / filter_bam_single call `samtools view` but had no
    // module load, so samtools was not put on PATH for them.
    $filter_bam_paired {
      beforeScript = "module purge; module load SAMtools/1.7"
    }
    $filter_bam_single {
      beforeScript = "module purge; module load SAMtools/1.7"
    }
  }
}
}
/*
small pipeline to build a training dataset from whole genome data

input (command-line parameters):
- --fasta        : reference fasta file(s)
- --fastq_paired : glob of paired-end fastq files (optional)
- --fastq_single : glob of single-end fastq files (optional)
- --chromosome   : chromosome of the region to extract
- --start        : start position of the region
- --stop         : stop position of the region

output (published under results/training/):
- fasta/ : fasta restricted to the requested region
- fastq/ : fastq of the reads mapped on this region
- bams/  : filtered alignments of these reads
*/

// Empty string means "no reads of this kind": the matching branch of the
// pipeline is skipped.
params.fastq_paired = ""
params.fastq_single = ""

log.info "fasta files : ${params.fasta}"
log.info "fastq paired files : ${params.fastq_paired}"
log.info "fastq single files : ${params.fastq_single}"
log.info "chromosome : ${params.chromosome}"
log.info "start position : ${params.start}"
log.info "stop position : ${params.stop}"

// Reference fasta file(s); abort early when the path matches nothing.
// The message used to say "index files" although this channel holds fasta files.
Channel
  .fromPath( params.fasta )
  .ifEmpty { error "Cannot find any fasta files matching: ${params.fasta}" }
  .set { fasta_file }
/*
Write a one-line, three-column BED file (chromosome, start, stop) describing
the region requested through params.chromosome / params.start / params.stop.
*/
process build_synthetic_bed {
  tag "${chromosome}:${start}-${stop}"
  // A single `echo` needs one core; asking for 4 only delays scheduling and
  // fails on the local executor when fewer than 4 CPUs are available.
  cpus 1

  input:
  val chromosome from params.chromosome
  val start from params.start
  val stop from params.stop

  output:
  file "*.bed" into bed_files

  script:
  // The \t escapes are turned into real tab characters by Groovy before the
  // command reaches the shell.
  """
  echo "${chromosome}\t${start}\t${stop}" > synthetic.bed
  """
}
/*
Extract from the reference fasta the region(s) listed in the BED file and
publish the result as <fasta baseName>_S.fasta in results/training/fasta/.
*/
process fasta_from_bed {
  tag "${fasta.baseName}"
  // The bedtools command below is given no thread option, so request one core.
  cpus 1
  publishDir "results/training/fasta/", mode: 'copy'

  input:
  file fasta from fasta_file
  file bed from bed_files

  output:
  file "*.fasta" into fasta_files_extracted

  script:
  // NOTE(review): -name labels sequences with the BED name column, but the
  // BED written by build_synthetic_bed has only three columns - confirm the
  // resulting fasta header is the intended one.
  """
  bedtools getfasta -name \
  -fi ${fasta} -bed ${bed} -fo ${fasta.baseName}_S.fasta
  """
}
/*
Build a bowtie2 index from the extracted fasta.
The index files and the bowtie2-build log are copied to
results/training/mapping/index/.
*/
process index_fasta {
  publishDir "results/training/mapping/index/", mode: 'copy'
  cpus 4
  tag "${fasta.baseName}"

  input:
  file fasta from fasta_files_extracted

  output:
  file "*.index*" into index_files
  file "*_report.txt" into indexing_report

  script:
  // stdout and stderr both go to the report; the task fails when the
  // report contains the word "Error".
  """
  bowtie2-build --threads ${task.cpus} ${fasta} ${fasta.baseName}.index &> ${fasta.baseName}_bowtie2_report.txt
  if grep -q "Error" ${fasta.baseName}_bowtie2_report.txt; then
    exit 1
  fi
  """
}
if ( params.fastq_paired != "" ) {
// Paired-end reads grouped as (pair_id, [read1, read2]); stop when the
// pattern matches no file.
fastq_files_paired = Channel
  .fromFilePairs( params.fastq_paired )
  .ifEmpty { error "Cannot find any fastq files matching: ${params.fastq_paired}" }
/*
Align each read pair on the bowtie2 index and convert the alignments to BAM.
The bowtie2 stderr is kept in <pair_id>_bowtie2_report.txt.
*/
process mapping_fastq_paired {
  cpus 4
  tag "${pair_id}"

  input:
  set pair_id, file(reads) from fastq_files_paired
  // NOTE(review): index_files is also read by mapping_fastq_single when both
  // fastq parameters are given - confirm one channel can feed both.
  file index from index_files.collect()

  output:
  set pair_id, "*.bam" into bam_files_paired
  file "*_report.txt" into mapping_report

  script:
  // The index prefix is the name of the forward "<prefix>.1.bt2" file without
  // its suffix ("<prefix>.rev.1.bt2" is ignored). The last such file wins;
  // without any, the first staged index file is used as is.
  def forward_files = index.findAll { candidate ->
    candidate =~ /.*\.1\.bt2/ && !(candidate =~ /.*\.rev\.1\.bt2/)
  }
  index_id = forward_files ? (forward_files.last() =~ /(.*)\.1\.bt2/)[0][1] : index[0]
  """
  bowtie2 --very-sensitive -p ${task.cpus} -x ${index_id} \
  -1 ${reads[0]} -2 ${reads[1]} 2> \
  ${pair_id}_bowtie2_report.txt | \
  samtools view -Sb - > ${pair_id}.bam
  if grep -q "Error" ${pair_id}_bowtie2_report.txt; then
    exit 1
  fi
  """
}

// Duplicate the BAM channel: one copy for fastq extraction, one for BAM filtering.
bam_files_paired.into {
  bam_files_paired_fa
  bam_files_paired_ba
}
/*
Convert the properly paired reads (flag 0x2) of a BAM back to two fastq files,
<file_id>_SR1.fastq and <file_id>_SR2.fastq, copied to results/training/fastq/.
*/
process bam_2_fastq_paired {
  publishDir "results/training/fastq/", mode: 'copy'
  tag "${file_id}"

  input:
  set file_id, file(bam) from bam_files_paired_fa

  output:
  set file_id, "*.fastq" into fastq_files_extracted

  script:
  """
  samtools fastq -1 ${file_id}_SR1.fastq -2 ${file_id}_SR2.fastq -f 0x2 ${bam}
  """
}
/*
Keep only the properly paired alignments (flag 0x2) of a paired-end BAM and
publish them as <file_id>_S.bam in results/training/bams/.
*/
process filter_bam_paired {
  tag "$file_id"
  publishDir "results/training/bams/", mode: 'copy'
  cpus 4

  input:
  set file_id, file(bam) from bam_files_paired_ba
  // Staged in the work directory but not referenced by the command below.
  file bed from bed_files

  output:
  // Restricted to the file written below, like the "*_S.bam" pattern used by
  // filter_bam_single.
  set file_id, "*_S.bam" into filtered_bam_files

  script:
  // All options come before the input file: arguments after the BAM are only
  // read as options when getopt permutes them, otherwise they are treated as
  // regions.
  """
  samtools view -@ ${task.cpus} -hb -f 0x2 ${bam} > ${file_id}_S.bam
  """
}
}
if ( params.fastq_single != "" ) {
// Single-end reads as (file_id, fastq), where file_id is the part of the
// file name before its first dot; stop when the pattern matches no file.
fastq_files_single = Channel
  .fromPath( params.fastq_single )
  .ifEmpty { error "Cannot find any fastq files matching: ${params.fastq_single}" }
  .map { fastq -> [(fastq.baseName =~ /([^\.]*)/)[0][1], fastq] }
/*
Align each single-end fastq on the bowtie2 index and convert the alignments
to BAM. The bowtie2 stderr is kept in <file_id>_bowtie2_report.txt.
*/
process mapping_fastq_single {
  cpus 4
  tag "${file_id}"

  input:
  set file_id, file(reads) from fastq_files_single
  // NOTE(review): index_files is also read by mapping_fastq_paired when both
  // fastq parameters are given - confirm one channel can feed both.
  file index from index_files.collect()

  output:
  set file_id, "*.bam" into bam_files_single
  file "*_report.txt" into mapping_report

  script:
  // The index prefix is the name of the forward "<prefix>.1.bt2" file without
  // its suffix ("<prefix>.rev.1.bt2" is ignored). The last such file wins;
  // without any, the first staged index file is used as is.
  def forward_files = index.findAll { candidate ->
    candidate =~ /.*\.1\.bt2/ && !(candidate =~ /.*\.rev\.1\.bt2/)
  }
  index_id = forward_files ? (forward_files.last() =~ /(.*)\.1\.bt2/)[0][1] : index[0]
  """
  bowtie2 --very-sensitive -p ${task.cpus} -x ${index_id} \
  -U ${reads} 2> \
  ${file_id}_bowtie2_report.txt | \
  samtools view -Sb - > ${file_id}.bam
  if grep -q "Error" ${file_id}_bowtie2_report.txt; then
    exit 1
  fi
  """
}

// Duplicate the BAM channel: one copy for fastq extraction, one for BAM filtering.
bam_files_single.into {
  bam_files_single_fa
  bam_files_single_ba
}
/*
Convert the mapped reads of a single-end BAM back to fastq and publish
<file_id>_S.fastq in results/training/fastq/.
*/
process bam_2_fastq_single {
  tag "$file_id"
  publishDir "results/training/fastq/", mode: 'copy'

  input:
  set file_id, file(bam) from bam_files_single_fa

  output:
  set file_id, "*.fastq" into fastq_files_extracted

  script:
  // Single-end alignments never carry the "properly paired" flag, so the
  // former `-f 0x2` filter discarded every read; `-F 0x4` keeps the mapped
  // ones instead. Reads with neither READ1 nor READ2 set are not singletons:
  // samtools sends them to stdout (not to the -s file), hence the redirection.
  """
  samtools fastq -F 0x4 ${bam} > ${file_id}_S.fastq
  """
}
/*
Keep only the mapped alignments of a single-end BAM and publish them as
<file_id>_S.bam in results/training/bams/.
*/
process filter_bam_single {
  tag "$file_id"
  publishDir "results/training/bams/", mode: 'copy'
  cpus 4

  input:
  set file_id, file(bam) from bam_files_single_ba
  // Staged in the work directory but not referenced by the command below.
  file bed from bed_files

  output:
  set file_id, "*_S.bam" into filtered_bam_files

  script:
  // Single-end alignments never carry the "properly paired" flag, so the
  // former `-f 0x2` filter produced an empty BAM; `-F 0x4` drops the unmapped
  // reads instead. Options are placed before the input file so they are never
  // mistaken for regions.
  """
  samtools view -@ ${task.cpus} -hb -F 0x4 ${bam} > ${file_id}_S.bam
  """
}
}
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment