Skip to content
Snippets Groups Projects
Commit 6a48c301 authored by aliarifki's avatar aliarifki
Browse files

Ajout des barcodes

parent f2dca093
No related branches found
No related tags found
No related merge requests found
......@@ -129,9 +129,9 @@ Channel
.set { input }
// Builds the `adapt` channel from params.adapt and aborts with an error
// message when the channel turns out to be empty.
// NOTE(review): the .of/.ifEmpty/.set chain is listed twice here; this looks
// like the before/after of a whitespace-only change — confirm only one copy
// exists in the real file.
Channel
.of( params.adapt )
.ifEmpty { error "No adapter sequence defined." }
.set { adapt }
.of( params.adapt )
.ifEmpty { error "No adapter sequence defined." }
.set { adapt }
Channel
.fromPath( params.genome )
......@@ -143,7 +143,10 @@ Channel
.ifEmpty { error "No annotation defined, a gtf file describing transcripts and splice variants." }
.set { gtf }
// .map( it -> [it.baseName, it])
// `barcodes` channel: one [directory base name, directory] pair for every
// directory matched by params.input + '*/'.
Channel
  .fromPath( params.input + '*/', type: 'dir' )
  .map { dir -> [ dir.baseName, dir ] }
  .set { barcodes }
/*
****************************************************************
......@@ -161,10 +164,8 @@ if(!params.skipBC) {
}
}
// Replace concatenate with the seqkit-based function to allow parallelization:
include { barecode } from "./nf_modules/barecode/main.nf"
include { concatenate } from "./nf_modules/seqkit/main.nf"
//include { concatenate } from "./nf_modules/concatenate/main.nf"
include { cut_5pRACE } from "./nf_modules/cutadapt/main.nf"
include { hbv_genome } from "./nf_modules/minimap2/main.nf"
include { seqkit_grep } from "./nf_modules/seqkit/main.nf"
......@@ -178,9 +179,6 @@ include { junctions_nanosplicer } from "./nf_modules/junction_nanosplicer/main.n
include { rna_count } from "./nf_modules/rna_count/main.nf"
// NanoSplicer function definitions:
// include { jwr_check } from "./nf_modules/nanosplicer/main.nf"
/*
****************************************************************
Workflow
......@@ -189,42 +187,41 @@ include { rna_count } from "./nf_modules/rna_count/main.nf"
workflow {
//######################## BASECALLING ########################
if(params.skipBC) {
concatenate(params.input)
// Replace with seqkit scat to enable parallelization
if(params.skipBC) { // we take fastq files as input and skip basecalling
concatenate(barcodes)
}
else {
// this still needs to be adapted
else { // we take fast5 files as input and proceed to basecalling with guppy
if(params.gpu_mode) {
basecall_fast5_gpu(input)
concatenate(basecall_fast5_gpu.out.pass)
// Replace by seqkit scat to parallelization
}
else {
basecall_fast5_cpu(input)
concatenate(basecall_fast5_cpu.out.pass)
// Replace by seqkit scat to parallelization
}
}
//####################### PREPROCESSING #######################
// Filtering (seqkit_grep searches the reads for the 5'RACE and GSP patterns to keep only mature RNAs)
seqkit_grep(concatenate.out.merged_fastq, params.adapt, params.gsp)
// Trim the 5'RACE sequence
cut_5pRACE(seqkit_grep.out.filtered_fastq, params.adapt)
//########################## MAPPING ##########################
hbv_genome(cut_5pRACE.out.fastq_cutadapt, genome)
hbv_genome(cut_5pRACE.out.fastq_cutadapt, genome.collect())
sort_index_bam(hbv_genome.out.bam)
// index_bam(sort_bam_genome.out.sorted_bam.collect())
//###################### START POSITIONS #######################
......
......@@ -4,23 +4,23 @@ container_url = "xgrand/cutadapt:${version}"
// cut_5pRACE: runs cutadapt on the FASTQ of one barcode, passing the `adapt`
// sequence through `-g` together with `-e 0.2` and `--revcomp`, and emits the
// result as `fastq_cutadapt`.
// Results are published under results/<params.cutadapt_out> only when that
// parameter is not the empty string.
// NOTE(review): this listing interleaves pre- and post-change lines (two `tag`
// lines, two fastq inputs, two outputs, two `-o` options). The barcode-aware
// variants appear to be the current ones — confirm against the real file.
process cut_5pRACE {
container = "${container_url}"
label "small_mem_mono_cpus"
tag "cutadapt"
tag "${barcode}"
if (params.cutadapt_out != "") {
publishDir "results/${params.cutadapt_out}", mode: 'copy'
}
input:
path(fastq)
// barcode: sample identifier carried along with the reads; fastq: reads to trim
tuple val(barcode), path(fastq)
// adapt: sequence handed to cutadapt's -g option
val(adapt)
output:
path("*_cut_*"), emit: fastq_cutadapt
tuple val(barcode), path("${barcode}_merged_porechoped_cut_fastq.fastq"), emit: fastq_cutadapt
"""
cutadapt -e 0.2 -g ${adapt} \
--revcomp \
-o "merged_porechoped_cut_fastq.fastq" \
-o "${barcode}_merged_porechoped_cut_fastq.fastq" \
${fastq}
"""
}
\ No newline at end of file
......@@ -5,23 +5,24 @@ params.nanosplicer_out = ""
// junctions_nanosplicer: runs /Junctions_NanoSplicer.R on a `txt` file (-c)
// and a `csv` file (-j) inside a per-barcode sub-directory, then renames
// identified_SPvariants.csv with the barcode prefix and emits it as
// `identified_SPvariants`.
// Results are published under results/<params.nanosplicer_out> only when that
// parameter is not the empty string.
// NOTE(review): this listing interleaves pre- and post-change lines (duplicate
// tags, inputs, outputs and Rscript calls) — confirm which ones are current.
// NOTE(review): the two input tuples each bind `barcode` from a separate
// channel; nothing here checks that both deliveries belong to the same
// barcode — verify that the caller pairs them.
process junctions_nanosplicer{
container = "${container_url}"
label "small_mem_mono_cpus"
tag "identification de variants d'épissage"
tag "${barcode}"
if (params.nanosplicer_out != "") {
publishDir "results/${params.nanosplicer_out}", mode: 'copy'
}
input:
path(txt)
path(csv)
tuple val(barcode), path(txt)
tuple val(barcode), path(csv)
output:
path("Rplots.pdf")
path("JWR_check_parsed.csv")
path("*.png")
path("identified_SPvariants.csv"), emit: identified_SPvariants
path("${barcode}/JWR_check_parsed.csv")
tuple val(barcode), path("${barcode}/${barcode}_identified_SPvariants.csv"), emit: identified_SPvariants
script:
// The R script runs from inside ${barcode}/, hence the ../ prefix on both inputs.
"""
Rscript /Junctions_NanoSplicer.R -c ${txt} -j ${csv}
mkdir ${barcode}
cd ${barcode}/
Rscript /Junctions_NanoSplicer.R -c ../${txt} -j ../${csv}
mv identified_SPvariants.csv ${barcode}_identified_SPvariants.csv
"""
}
\ No newline at end of file
......@@ -89,22 +89,25 @@ params.mapping_hbv_genome = "-ax splice --secondary=no -G 1650 -u n --eqx"
// hbv_genome: maps the reads of one barcode against `genome` with minimap2
// (options taken from params.mapping_hbv_genome) and pipes the output through
// `samtools view -Shb` into a BAM file, emitted as `bam`.
// Results are published under results/<params.minimap2_genome_out> only when
// that parameter is not the empty string.
// NOTE(review): this listing interleaves pre- and post-change lines (duplicate
// fastq inputs, outputs and minimap2/samtools commands) — confirm which ones
// are current.
process hbv_genome {
container = "${container_url}"
label "big_mem_multi_cpus"
tag "${barcode}"
if (params.minimap2_genome_out != "") {
publishDir "results/${params.minimap2_genome_out}", mode: 'copy'
}
input:
path(fastq)
tuple val(barcode), path(fastq)
path(genome)
output:
path("*"), emit: bam
tuple val(barcode), path("${barcode}/${barcode}_res.bam"), emit: bam
script:
// Remove the "GB" suffix from the task memory string, then divide the number
// by (cpus + 1). The quotient is what gets passed to minimap2 -K below.
// NOTE(review): the value reaches -K without a unit suffix — confirm that
// minimap2 interprets it as intended.
memory = "${task.memory}" - ~/\s*GB/
memory = memory.toInteger() / (task.cpus + 1.0)
"""
minimap2 ${params.mapping_hbv_genome} -t${task.cpus} -K ${memory} ${genome} ${fastq} |
samtools view -Shb - > res.bam
mkdir ${barcode}
cd ${barcode}/
minimap2 ${params.mapping_hbv_genome} -t ${task.cpus} -K ${memory} ../${genome} ../${fastq} |
samtools view -Shb - > ${barcode}_res.bam
"""
}
\ No newline at end of file
version = "1.0"
container_url = "xgrand/nanosplicer:${version}"
params.nanosplicer_out = ""
// jwr_checker: runs /NanoSplicer/bin/JWR_checker.py with --output_csv on the
// BAM of one barcode, working inside a per-barcode sub-directory, and emits
// ${barcode}/${barcode}_JWR_check.h5.csv as `nanosplicer_jwr`.
// Results are published under results/<params.nanosplicer_out> only when that
// parameter is not the empty string.
process jwr_checker {
container = "${container_url}"
label "big_mem_multi_cpus"
tag "${barcode}"
if (params.nanosplicer_out != "") {
publishDir "results/${params.nanosplicer_out}", mode: 'copy'
}
input:
// `index` is declared as an input but never referenced in the script below.
// NOTE(review): presumably it is staged so the tool finds it next to the BAM — confirm.
tuple val(barcode), path(bam), path(index)
output:
tuple val(barcode), path("${barcode}/${barcode}_JWR_check.h5.csv"), emit: nanosplicer_jwr
script:
// The command runs from inside ${barcode}/, hence the ../ prefix on the BAM.
"""
mkdir ${barcode}
cd ${barcode}/
python3 /NanoSplicer/bin/JWR_checker.py --output_csv ../${bam} ${barcode}_JWR_check.h5
"""
}
......@@ -5,22 +5,24 @@ params.rna_count_out = ""
// rna_count: runs /HBV_RNAs_count.R with the splice-variant file (-s) and the
// read-classification file (-c) of one barcode inside a per-barcode
// sub-directory, and collects the resulting csv, pdf and png files.
// Results are published under results/<params.rna_count_out> only when that
// parameter is not the empty string.
// NOTE(review): this listing interleaves pre- and post-change lines (duplicate
// tags, inputs, outputs and Rscript calls) — confirm which ones are current.
// NOTE(review): the two input tuples each bind `barcode` from a separate
// channel; nothing here checks that both deliveries belong to the same
// barcode — verify that the caller pairs them.
process rna_count{
container = "${container_url}"
label "small_mem_mono_cpus"
tag "RNA quantification"
tag "${barcode}"
if (params.rna_count_out != "") {
publishDir "results/${params.rna_count_out}", mode: 'copy'
}
input:
path(spvariants)
path(classification)
tuple val(barcode), path(spvariants)
tuple val(barcode), path(classification)
output:
path("*.csv")
path("*.pdf")
path("*.png")
path("${barcode}/*.csv")
path("${barcode}/*.pdf")
path("${barcode}/*.png")
script:
// The R script runs from inside ${barcode}/, hence the ../ prefix on both inputs.
"""
Rscript /HBV_RNAs_count.R -s ${spvariants} -c ${classification}
mkdir ${barcode}
cd ${barcode}/
Rscript /HBV_RNAs_count.R -s ../${spvariants} -c ../${classification}
"""
}
......@@ -24,21 +24,23 @@ samtools sort -@ ${task.cpus} ${bam} -O BAM -o ${bam.simpleName}_sorted.bam
params.start_position_counts_out = ""
// start_position_counts: filters the BAM of one barcode with
// `samtools view -F 260`, keeps columns 1 and 4 with `cut`, sorts the lines
// and writes them to a text file emitted as `count`.
// NOTE(review): this listing interleaves pre- and post-change lines (duplicate
// tags, inputs, outputs, samtools and sort lines) — confirm which ones are
// current.
process start_position_counts {
tag "Start positions count"
tag "${barcode}"
label "big_mem_multi_cpus"
// NOTE(review): unlike the sibling processes, publishDir is not guarded by an
// emptiness check on params.start_position_counts_out — confirm this is intended.
publishDir "results/${params.start_position_counts_out}", mode: 'copy'
input:
tuple path(bam), path(index)
// `index` is declared as an input but never referenced in the script below.
tuple val(barcode), path(bam), path(index)
output:
path "*", emit: count
tuple val(barcode), path("${barcode}/${barcode}_start_positions_counts.txt"), emit: count
script:
"""
samtools view -F 260 ${bam} |
mkdir ${barcode}
cd ${barcode}/
samtools view -F 260 ../${bam} |
cut -f 1,4 |
sort > Start_positions_counts.txt
sort > ${barcode}_start_positions_counts.txt
"""
}
......@@ -67,20 +69,22 @@ params.indexed_bam_out =""
// sort_index_bam: sorts the BAM of one barcode with `samtools sort`, indexes
// the sorted file with `samtools index`, and emits the sorted BAM together
// with its .bai index as `indexed_bam`.
// Results are published under results/<params.indexed_bam_out> only when that
// parameter is not the empty string.
// NOTE(review): this listing interleaves pre- and post-change lines (duplicate
// tags, inputs, outputs and samtools commands) — confirm which ones are current.
process sort_index_bam {
container = "${container_url}"
label "big_mem_multi_cpus"
tag "sorting"
tag "${barcode}"
if (params.indexed_bam_out != "") {
publishDir "results/${params.indexed_bam_out}", mode: 'copy'
}
input:
path(bam)
tuple val(barcode), path(bam)
output:
tuple path("*sorted.bam"), path("*.bai"), emit: indexed_bam
tuple val(barcode), path("${barcode}/*sorted.bam"), path("${barcode}/*.bai"), emit: indexed_bam
script:
// The barcode-aware commands run from inside ${barcode}/, hence the ../ prefix
// on the input BAM.
"""
samtools sort -@ ${task.cpus} ${bam} -o ${bam.simpleName}_sorted.bam
samtools index -@ ${task.cpus} ${bam.simpleName}_sorted.bam
mkdir ${barcode}
cd ${barcode}/
samtools sort -@ ${task.cpus} ../${bam} -o ${barcode}_sorted.bam
samtools index -@ ${task.cpus} ${barcode}_sorted.bam
"""
}
\ No newline at end of file
......@@ -29,35 +29,37 @@ params.seqkit_grep_out = ""
// seqkit_grep: two-step read filter for one barcode.
//   1. `seqkit grep` with the `adapt` pattern produces the *_filtered_5RACE fastq.
//   2. `seqkit grep` with the `gsp` pattern on that result produces the
//      *_filtered_5RACE_GSP fastq, emitted as `filtered_fastq`.
// `seqkit stats` is then run on the input and on both filtered files, and the
// rows are gathered in one seq_stats csv. The mismatch settings and the two
// patterns are written to txt files.
// Results are published under results/<params.seqkit_grep_out> only when that
// parameter is not the empty string.
// NOTE(review): this listing interleaves pre- and post-change lines (duplicate
// tags, inputs, outputs and seqkit commands) — confirm which ones are current.
process seqkit_grep {
container = "${container_url}"
label "small_mem_multi_cpus"
tag "Filter_reads"
tag "${barcode}"
if (params.seqkit_grep_out != "") {
publishDir "results/${params.seqkit_grep_out}", mode: 'copy'
}
input:
path(fastq)
tuple val(barcode), path(fastq)
val(adapt)
val(gsp)
output:
path("filtered_5RACE_GSP.fastq"), emit: filtered_fastq
path("seq_stats.csv")
path("*.txt")
path("filtered_5RACE.fastq")
tuple val(barcode), path("${barcode}/${barcode}_filtered_5RACE_GSP.fastq"), emit: filtered_fastq
path("${barcode}/*.csv")
path("${barcode}/*.txt")
path("${barcode}/${barcode}_filtered_5RACE.fastq")
script:
// Allowed mismatches (seqkit grep -m) = pattern length / 10, rounded.
lgadapt = Math.round(adapt.size().div(10))
lggsp = Math.round(gsp.size().div(10))
"""
mkdir ${barcode}
cd ${barcode}/
echo "mismatch allowed to 5'RACE adapter: ${lgadapt}" > mismatch.txt
echo "mismatch allowed to Gene Specific primer: ${lggsp}" >> mismatch.txt
echo ${adapt} > adapt.txt
echo ${gsp} > gsp.txt
seqkit grep -i -f adapt.txt -m ${lgadapt} ${fastq} -o filtered_5RACE.fastq -j ${task.cpus}
seqkit grep -i -f gsp.txt -m ${lggsp} filtered_5RACE.fastq -o filtered_5RACE_GSP.fastq -j ${task.cpus}
seqkit stats ${fastq} -T -j ${task.cpus} > seq_stats.csv
seqkit stats filtered_5RACE.fastq -T -j ${task.cpus} | tail -n1 >> seq_stats.csv
seqkit stats filtered_5RACE_GSP.fastq -T -j ${task.cpus} | tail -n1 >> seq_stats.csv
seqkit grep -i -f adapt.txt -m ${lgadapt} ../${fastq} -o ${barcode}_filtered_5RACE.fastq -j ${task.cpus}
seqkit grep -i -f gsp.txt -m ${lggsp} ${barcode}_filtered_5RACE.fastq -o ${barcode}_filtered_5RACE_GSP.fastq -j ${task.cpus}
seqkit stats ../${fastq} -T -j ${task.cpus} > ${barcode}_seq_stats.csv
seqkit stats ${barcode}_filtered_5RACE.fastq -T -j ${task.cpus} | tail -n1 >> ${barcode}_seq_stats.csv
seqkit stats ${barcode}_filtered_5RACE_GSP.fastq -T -j ${task.cpus} | tail -n1 >> ${barcode}_seq_stats.csv
"""
}
......@@ -65,21 +67,24 @@ params.fastq_out = ""
// concatenate: merges the reads found under the `fastq` path of one barcode
// with `seqkit scat` (--gz-only), gzips the merged file and emits it as
// `merged_fastq`.
// Results are published under results/<params.fastq_out> only when that
// parameter is not the empty string.
// NOTE(review): this listing interleaves pre- and post-change lines (duplicate
// tags, inputs, outputs and seqkit/gzip commands) — confirm which ones are
// current.
process concatenate {
container = "${container_url}"
label "big_mem_multi_cpus"
tag "Concatenate_reads"
tag "${barcode}"
if (params.fastq_out != "") {
publishDir "results/${params.fastq_out}", mode: 'copy'
}
input:
path fastq
tuple val(barcode), path(fastq)
output:
path "merged.fastq.gz", emit: merged_fastq
tuple val(barcode), path("${barcode}/${barcode}_merged.fastq.gz"), emit: merged_fastq
script:
// The staged input is renamed to path_<name> before `mkdir ${barcode}`.
// NOTE(review): presumably this is because the staged directory carries the
// same name as the barcode and would clash with the new output directory —
// confirm against the caller.
"""
path=\$(readlink -f ${fastq})
seqkit scat -j ${task.cpus} -f \${path} --gz-only > merged.fastq
gzip merged.fastq
mv ${fastq} path_${fastq}
mkdir ${barcode}
cd ${barcode}/
path=\$(readlink -f ../path_${fastq})
seqkit scat -j ${task.cpus} -f \${path} --gz-only > ${barcode}_merged.fastq
gzip ${barcode}_merged.fastq
"""
}
\ No newline at end of file
......@@ -5,21 +5,26 @@ params.start_position_counts_out =""
// start_position_individuals: runs /Start_positions.R on the start-position
// counts of one barcode inside a per-barcode sub-directory, then prefixes the
// classification txt, the per-promoter tsv and Rplots.pdf with the barcode.
// The classification file is emitted as `classification_of_reads`.
// Results are published under results/<params.start_position_counts_out> only
// when that parameter is not the empty string.
// NOTE(review): this listing interleaves pre- and post-change lines (duplicate
// tags, inputs, outputs and Rscript calls) — confirm which ones are current.
process start_position_individuals{
container = "${container_url}"
label "small_mem_mono_cpus"
tag "start positions"
tag "${barcode}"
if (params.start_position_counts_out != "") {
publishDir "results/${params.start_position_counts_out}", mode: 'copy'
}
input:
path(start_position_counts)
tuple val(barcode), path(start_position_counts)
output:
path("Rplots.pdf")
path("*.png")
path("Count_reads_per_promoter.tsv")
path("classification_of_reads_per_RNA.txt"), emit: classification_of_reads
path("${barcode}/*.pdf")
path("${barcode}/*.png")
path("${barcode}/*.tsv")
tuple val(barcode), path("${barcode}/${barcode}_classification_of_reads_per_RNA.txt"), emit: classification_of_reads
script:
// The R script runs from inside ${barcode}/, hence the ../ prefix on the input.
"""
Rscript /Start_positions.R -i ${start_position_counts}
mkdir ${barcode}
cd ${barcode}/
Rscript /Start_positions.R -i ../${start_position_counts}
mv classification_of_reads_per_RNA.txt ${barcode}_classification_of_reads_per_RNA.txt
mv Count_reads_per_promoter.tsv ${barcode}_count_reads_per_promoter.tsv
mv Rplots.pdf ${barcode}_Rplots.pdf
"""
}
\ No newline at end of file
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment