Skip to content
GitLab
Explore
Sign in
Primary navigation
Search or go to…
Project
nextflow
Manage
Activity
Members
Labels
Plan
Issues
Issue boards
Milestones
Wiki
Code
Merge requests
Repository
Branches
Commits
Tags
Repository graph
Compare revisions
Snippets
Build
Pipelines
Jobs
Pipeline schedules
Artifacts
Deploy
Releases
Model registry
Operate
Environments
Monitor
Incidents
Analyze
Value stream analytics
Contributor analytics
CI/CD analytics
Repository analytics
Model experiments
Help
Help
Support
GitLab documentation
Compare GitLab plans
Community forum
Contribute to GitLab
Provide feedback
Keyboard shortcuts
?
Snippets
Groups
Projects
Show more breadcrumbs
nfontrod
nextflow
Commits
6f2eec1e
Verified
Commit
6f2eec1e
authored
6 years ago
by
Laurent Modolo
Browse files
Options
Downloads
Patches
Plain Diff
add pipeline to create training dataset
parent
3e38d39c
No related branches found
Branches containing commit
No related tags found
No related merge requests found
Changes
2
Hide whitespace changes
Inline
Side-by-side
Showing
2 changed files
src/training_dataset.config
+70
-0
70 additions, 0 deletions
src/training_dataset.config
src/training_dataset.nf
+236
-0
236 additions, 0 deletions
src/training_dataset.nf
with
306 additions
and
0 deletions
src/training_dataset.config
0 → 100644
+
70
−
0
View file @
6f2eec1e
/*
 * Execution profiles for the training-dataset pipeline.
 *
 *  - docker: runs each process in the container image listed for it.
 *  - sge   : loads the matching environment modules before each task; only
 *            the two bedtools processes carry explicit SGE resource settings.
 *
 * Every process that calls an external tool needs an entry in both profiles.
 * filter_bam_paired / filter_bam_single run samtools, so they use the same
 * image / module as the bam_2_fastq_* processes.
 */
profiles {
  docker {
    docker.temp = 'auto'
    docker.enabled = true
    process {
      $build_synthetic_bed {
        container = "bedtools:2.25.0"
      }
      $fasta_from_bed {
        container = "bedtools:2.25.0"
      }
      $index_fasta {
        container = "bowtie2:2.3.4.1"
      }
      // NOTE(review): the mapping scripts pipe bowtie2 into samtools, so this
      // image presumably ships samtools as well -- confirm.
      $mapping_fastq_paired {
        container = "bowtie2:2.3.4.1"
      }
      $bam_2_fastq_paired {
        container = "samtools:1.7"
      }
      $filter_bam_paired {
        container = "samtools:1.7"
      }
      $mapping_fastq_single {
        container = "bowtie2:2.3.4.1"
      }
      $bam_2_fastq_single {
        container = "samtools:1.7"
      }
      $filter_bam_single {
        container = "samtools:1.7"
      }
    }
  }
  sge {
    process {
      // NOTE(review): queueSize and pollInterval are documented as settings of
      // the `executor` scope; confirm they have any effect when set per process.
      $build_synthetic_bed {
        beforeScript = "module purge; module load BEDtools/2.25.0"
        executor = "sge"
        cpus = 1
        memory = "5GB"
        time = "6h"
        queueSize = 1000
        pollInterval = '60sec'
        queue = 'h6-E5-2667v4deb128'
        penv = 'openmp8'
      }
      $fasta_from_bed {
        beforeScript = "module purge; module load BEDtools/2.25.0"
        executor = "sge"
        cpus = 1
        memory = "5GB"
        time = "6h"
        queueSize = 1000
        pollInterval = '60sec'
        queue = 'h6-E5-2667v4deb128'
        penv = 'openmp8'
      }
      // NOTE(review): the processes below set no executor/queue, so they do
      // not inherit the SGE settings above -- confirm this is intended.
      $index_fasta {
        beforeScript = "module purge; module load Bowtie2/2.3.4.1"
      }
      $mapping_fastq_paired {
        beforeScript = "module purge; module load SAMtools/1.7; module load Bowtie2/2.3.4.1"
      }
      $bam_2_fastq_paired {
        beforeScript = "module purge; module load SAMtools/1.7"
      }
      $filter_bam_paired {
        beforeScript = "module purge; module load SAMtools/1.7"
      }
      $mapping_fastq_single {
        beforeScript = "module purge; module load SAMtools/1.7; module load Bowtie2/2.3.4.1"
      }
      $bam_2_fastq_single {
        beforeScript = "module purge; module load SAMtools/1.7"
      }
      $filter_bam_single {
        beforeScript = "module purge; module load SAMtools/1.7"
      }
    }
  }
}
This diff is collapsed.
Click to expand it.
src/training_dataset.nf
0 → 100644
+
236
−
0
View file @
6f2eec1e
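/*
small pipeline to build a training dataset from whole genome data

input:
- fasta: reference sequence(s) (params.fasta)
- fastq: paired-end reads (params.fastq_paired) and/or single-end reads (params.fastq_single)
- chromosome (params.chromosome)
- start position (params.start)
- stop position (params.stop)

output:
- fasta restricted to the chromosome:start-stop region (results/training/fasta/)
- fastq of the reads that map to that region (results/training/fastq/)
*/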
// Read inputs are optional: leaving either one as the empty string skips the
// corresponding branch further down in the script.
params.fastq_single = ""
params.fastq_paired = ""

// Print the run parameters once at start-up so they end up in the log.
log.info("fasta files : ${params.fasta}")
log.info("fastq paired files : ${params.fastq_paired}")
log.info("fastq single files : ${params.fastq_single}")
log.info("chromosome : ${params.chromosome}")
log.info("start position : ${params.start}")
log.info("stop position : ${params.stop}")
// Channel of reference FASTA file(s) matched by --fasta.
// The pipeline stops with an explicit message when the glob matches nothing;
// the message names FASTA files because that is what this glob selects.
Channel
  .fromPath(params.fasta)
  .ifEmpty { error "Cannot find any fasta files matching: ${params.fasta}" }
  .set { fasta_file }
/*
 * Writes a one-line BED file (synthetic.bed) describing the requested region:
 * chromosome, start and stop separated by tabs.
 */
process build_synthetic_bed {
  cpus 4
  tag "${chromosome}:${start}-${stop}"

  input:
    val chromosome from params.chromosome
    val start from params.start
    val stop from params.stop

  output:
    file("*.bed") into bed_files

  script:
  // The \t escapes sit in a Groovy string, so the shell receives real tabs.
  """
  echo "${chromosome}\t${start}\t${stop}" > synthetic.bed
  """
}
/*
 * Extracts the BED region from the reference FASTA with `bedtools getfasta`
 * and publishes the result as <reference baseName>_S.fasta.
 */
process fasta_from_bed {
  cpus 4
  tag "${fasta.baseName}"
  publishDir "results/training/fasta/", mode: 'copy'

  input:
    file(fasta) from fasta_file
    file(bed) from bed_files

  output:
    file("*.fasta") into fasta_files_extracted

  script:
  // NOTE(review): `-name` takes the FASTA header from the BED name column,
  // while the BED built by build_synthetic_bed only has three columns --
  // confirm the resulting header is the one expected downstream.
  """
  bedtools getfasta -name \
  -fi ${fasta} -bed ${bed} -fo ${fasta.baseName}_S.fasta
  """
}
/*
 * Builds a bowtie2 index for the extracted FASTA.
 * Emits the index files (`*.index*`) and the bowtie2-build log
 * (`*_report.txt`); the index files are also published.
 */
process index_fasta {
  cpus 4
  tag "$fasta.baseName"
  publishDir "results/training/mapping/index/", mode: 'copy'

  input:
    file(fasta) from fasta_files_extracted

  output:
    file("*.index*") into index_files
    file("*_report.txt") into indexing_report

  script:
  // The log is scanned for "Error" on top of the tool's exit status, so a run
  // that reports an error in its output still fails the task.
  """
  bowtie2-build --threads ${task.cpus} ${fasta} ${fasta.baseName}.index &> ${fasta.baseName}_bowtie2_report.txt
  if grep -q "Error" ${fasta.baseName}_bowtie2_report.txt; then
    exit 1
  fi
  """
}
/*
 * Paired-end branch, enabled when --fastq_paired is set.
 *
 *   mapping_fastq_paired : bowtie2 alignment of each read pair on the index
 *   bam_2_fastq_paired   : properly-paired reads written back to FASTQ
 *   filter_bam_paired    : BAM restricted to properly-paired reads
 *
 * Output channels carry a `_paired` suffix so that they do not clash with the
 * single-end branch when both read types are given in the same run.
 */
if (params.fastq_paired != "") {

  Channel
    .fromFilePairs(params.fastq_paired)
    .ifEmpty { error "Cannot find any fastq files matching: ${params.fastq_paired}" }
    .set { fastq_files_paired }

  process mapping_fastq_paired {
    tag "$pair_id"
    cpus 4

    input:
    set pair_id, file(reads) from fastq_files_paired
    // NOTE(review): the single-end branch also reads index_files; if both
    // branches run, index_files should be split upstream (e.g. with `into`).
    file index from index_files.collect()

    output:
    set pair_id, "*.bam" into bam_files_paired
    file "*_report.txt" into mapping_report_paired

    script:
    // Derive the bowtie2 index prefix from the forward `.1.bt2` file (the
    // `.rev.1.bt2` file is skipped); fall back to the first index file.
    // `def` keeps these variables local to the task.
    def index_id = index[0]
    for (def index_file in index) {
      if (index_file =~ /.*\.1\.bt2/ && !(index_file =~ /.*\.rev\.1\.bt2/)) {
        index_id = (index_file =~ /(.*)\.1\.bt2/)[0][1]
      }
    }
    // bowtie2's stderr goes to the report; the report is then scanned for
    // "Error" because the pipe hides bowtie2's own exit status.
    """
    bowtie2 --very-sensitive -p ${task.cpus} -x ${index_id} \
    -1 ${reads[0]} -2 ${reads[1]} 2> \
    ${pair_id}_bowtie2_report.txt | \
    samtools view -Sb - > ${pair_id}.bam

    if grep -q "Error" ${pair_id}_bowtie2_report.txt; then
      exit 1
    fi
    """
  }

  // Each BAM feeds two processes, so the channel is duplicated.
  bam_files_paired.into { bam_files_paired_fa; bam_files_paired_ba }

  process bam_2_fastq_paired {
    tag "$file_id"
    publishDir "results/training/fastq/", mode: 'copy'

    input:
    set file_id, file(bam) from bam_files_paired_fa

    output:
    set file_id, "*.fastq" into fastq_files_extracted_paired

    script:
    // -f 0x2 keeps only reads flagged as properly paired.
    """
    samtools fastq -1 ${file_id}_SR1.fastq -2 ${file_id}_SR2.fastq -f 0x2 ${bam}
    """
  }

  process filter_bam_paired {
    tag "$file_id"
    publishDir "results/training/bams/", mode: 'copy'
    cpus 4

    // No BED input here: the script never used it, and bed_files is already
    // consumed by fasta_from_bed.
    input:
    set file_id, file(bam) from bam_files_paired_ba

    output:
    set file_id, "*_S.bam" into filtered_bam_files_paired

    script:
    """
    samtools view -@ ${task.cpus} -hb ${bam} -f 0x2 > ${file_id}_S.bam
    """
  }
}
/*
 * Single-end branch, enabled when --fastq_single is set.
 *
 *   mapping_fastq_single : bowtie2 alignment of each FASTQ on the index
 *   bam_2_fastq_single   : mapped reads written back to FASTQ
 *   filter_bam_single    : BAM restricted to mapped reads
 *
 * Reads are selected with `-F 0x4` (not unmapped). The "properly paired" flag
 * (0x2) is never set on unpaired alignments, so filtering on it would discard
 * every read. Output channels carry a `_single` suffix so that they do not
 * clash with the paired-end branch when both read types are given.
 */
if (params.fastq_single != "") {

  // Pair every FASTQ with an id: its base name up to the first dot.
  Channel
    .fromPath(params.fastq_single)
    .ifEmpty { error "Cannot find any fastq files matching: ${params.fastq_single}" }
    .map { it -> [(it.baseName =~ /([^\.]*)/)[0][1], it] }
    .set { fastq_files_single }

  process mapping_fastq_single {
    tag "$file_id"
    cpus 4

    input:
    set file_id, file(reads) from fastq_files_single
    // NOTE(review): the paired-end branch also reads index_files; if both
    // branches run, index_files should be split upstream (e.g. with `into`).
    file index from index_files.collect()

    output:
    set file_id, "*.bam" into bam_files_single
    file "*_report.txt" into mapping_report_single

    script:
    // Derive the bowtie2 index prefix from the forward `.1.bt2` file (the
    // `.rev.1.bt2` file is skipped); fall back to the first index file.
    // `def` keeps these variables local to the task.
    def index_id = index[0]
    for (def index_file in index) {
      if (index_file =~ /.*\.1\.bt2/ && !(index_file =~ /.*\.rev\.1\.bt2/)) {
        index_id = (index_file =~ /(.*)\.1\.bt2/)[0][1]
      }
    }
    // bowtie2's stderr goes to the report; the report is then scanned for
    // "Error" because the pipe hides bowtie2's own exit status.
    """
    bowtie2 --very-sensitive -p ${task.cpus} -x ${index_id} \
    -U ${reads} 2> \
    ${file_id}_bowtie2_report.txt | \
    samtools view -Sb - > ${file_id}.bam

    if grep -q "Error" ${file_id}_bowtie2_report.txt; then
      exit 1
    fi
    """
  }

  // Each BAM feeds two processes, so the channel is duplicated.
  bam_files_single.into { bam_files_single_fa; bam_files_single_ba }

  process bam_2_fastq_single {
    tag "$file_id"
    publishDir "results/training/fastq/", mode: 'copy'

    input:
    set file_id, file(bam) from bam_files_single_fa

    output:
    set file_id, "*.fastq" into fastq_files_extracted_single

    script:
    // Unpaired reads carry neither READ1 nor READ2, so they are taken from
    // samtools' standard output rather than from the `-s` singleton file.
    """
    samtools fastq -F 0x4 ${bam} > ${file_id}_S.fastq
    """
  }

  process filter_bam_single {
    tag "$file_id"
    publishDir "results/training/bams/", mode: 'copy'
    cpus 4

    // No BED input here: the script never used it, and bed_files is already
    // consumed by fasta_from_bed.
    input:
    set file_id, file(bam) from bam_files_single_ba

    output:
    set file_id, "*_S.bam" into filtered_bam_files_single

    script:
    """
    samtools view -@ ${task.cpus} -hb ${bam} -F 0x4 > ${file_id}_S.bam
    """
  }
}
This diff is collapsed.
Click to expand it.
Preview
0%
Loading
Try again
or
attach a new file
.
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Save comment
Cancel
Please
register
or
sign in
to comment