From 6a48c301c4e7d7f77f57d2df8962a6afb8965c35 Mon Sep 17 00:00:00 2001 From: aliarifki <aliarifki@outlook.fr> Date: Wed, 14 Jun 2023 10:10:44 +0200 Subject: [PATCH] Ajout des barcodes --- src/bolero.nf | 45 ++++++++++----------- src/nf_modules/cutadapt/main.nf | 8 ++-- src/nf_modules/junction_nanosplicer/main.nf | 17 ++++---- src/nf_modules/minimap2/main.nf | 11 +++-- src/nf_modules/nanosplicer/main.nf | 26 ++++++++++++ src/nf_modules/rna_count/main.nf | 16 ++++---- src/nf_modules/samtools/main.nf | 24 ++++++----- src/nf_modules/seqkit/main.nf | 39 ++++++++++-------- src/nf_modules/start_positions/main.nf | 19 +++++---- 9 files changed, 124 insertions(+), 81 deletions(-) create mode 100644 src/nf_modules/nanosplicer/main.nf diff --git a/src/bolero.nf b/src/bolero.nf index 3ff3c53..c9851d4 100755 --- a/src/bolero.nf +++ b/src/bolero.nf @@ -129,9 +129,9 @@ Channel .set { input } Channel - .of( params.adapt ) - .ifEmpty { error "No adapter sequence defined." } - .set { adapt } + .of( params.adapt ) + .ifEmpty { error "No adapter sequence defined." } + .set { adapt } Channel .fromPath( params.genome ) @@ -143,7 +143,10 @@ Channel .ifEmpty { error "No annotation defined, a gtf file describing transcripts and splice variants." } .set { gtf } -// .map( it -> [it.baseName, it]) +Channel + .fromPath(params.input+'*/', type: 'dir') + .map(it -> [it.baseName, it]) + .set{barcodes} /* **************************************************************** @@ -161,10 +164,8 @@ if(!params.skipBC) { } } -// Replace concatenate by seqkit fct to parallelization: +include { barecode } from "./nf_modules/barecode/main.nf" include { concatenate } from "./nf_modules/seqkit/main.nf" -//include { concatenate } from "./nf_modules/concatenate/main.nf" - include { cut_5pRACE } from "./nf_modules/cutadapt/main.nf" include { hbv_genome } from "./nf_modules/minimap2/main.nf" include { seqkit_grep } from "./nf_modules/seqkit/main.nf" @@ -178,9 +179,6 @@ include { junctions_nanosplicer } from "./nf_modules/junction_nanosplicer/main.n include { rna_count } from "./nf_modules/rna_count/main.nf" -// creation des fonctions NanoSplicer: -// include { jwr_check } from "./nf_modules/nanosplicer/main.nf" - /* **************************************************************** Workflow @@ -189,42 +187,41 @@ include { rna_count } from "./nf_modules/rna_count/main.nf" workflow { + //######################## BASECALLING ######################## - if(params.skipBC) { - concatenate(params.input) - // Replace by seqkit scat to parallelization + if(params.skipBC) { // we take fastq files as input and skip basecalling + concatenate(barcodes) } - else { + + //il reste à adapter ça + else { // we take fast5 files as input and proceed to basecalling with guppy if(params.gpu_mode) { basecall_fast5_gpu(input) concatenate(basecall_fast5_gpu.out.pass) - // Replace by seqkit scat to parallelization } else { basecall_fast5_cpu(input) concatenate(basecall_fast5_cpu.out.pass) - // Replace by seqkit scat to parallelization } } + + + //####################### PREPROCESSING ####################### - - - + + //Filtration (seqkit_grep looks for the 5'RACE and the gsp patterns in the reads to keep only mature ARNs) seqkit_grep(concatenate.out.merged_fastq, params.adapt, params.gsp) //Cut of the 5'RACE sequence cut_5pRACE(seqkit_grep.out.filtered_fastq, params.adapt) - - //########################## MAPPING ########################## - - hbv_genome(cut_5pRACE.out.fastq_cutadapt, genome) + hbv_genome(cut_5pRACE.out.fastq_cutadapt, genome.collect()) + sort_index_bam(hbv_genome.out.bam) - // index_bam(sort_bam_genome.out.sorted_bam.collect()) //###################### START POSITIONS ####################### diff --git a/src/nf_modules/cutadapt/main.nf b/src/nf_modules/cutadapt/main.nf index b7ffd3b..7c72b44 100755 --- a/src/nf_modules/cutadapt/main.nf +++ b/src/nf_modules/cutadapt/main.nf @@ -4,23 +4,23 @@ container_url = "xgrand/cutadapt:${version}" process cut_5pRACE { container = "${container_url}" label "small_mem_mono_cpus" - tag "cutadapt" + tag "${barcode}" if (params.cutadapt_out != "") { publishDir "results/${params.cutadapt_out}", mode: 'copy' } input: - path(fastq) + tuple val(barcode), path(fastq) val(adapt) output: - path("*_cut_*"), emit: fastq_cutadapt + tuple val(barcode), path("${barcode}_merged_porechoped_cut_fastq.fastq"), emit: fastq_cutadapt """ cutadapt -e 0.2 -g ${adapt} \ --revcomp \ - -o "merged_porechoped_cut_fastq.fastq" \ + -o "${barcode}_merged_porechoped_cut_fastq.fastq" \ ${fastq} """ } \ No newline at end of file diff --git a/src/nf_modules/junction_nanosplicer/main.nf b/src/nf_modules/junction_nanosplicer/main.nf index 9f0a209..337ed6f 100644 --- a/src/nf_modules/junction_nanosplicer/main.nf +++ b/src/nf_modules/junction_nanosplicer/main.nf @@ -5,23 +5,24 @@ params.nanosplicer_out = "" process junctions_nanosplicer{ container = "${container_url}" label "small_mem_mono_cpus" - tag "identification de variants d'épissage" + tag "${barcode}" if (params.nanosplicer_out != "") { publishDir "results/${params.nanosplicer_out}", mode: 'copy' } input: - path(txt) - path(csv) + tuple val(barcode), path(txt) + tuple val(barcode), path(csv) output: - path("Rplots.pdf") - path("JWR_check_parsed.csv") - path("*.png") - path("identified_SPvariants.csv"), emit: identified_SPvariants + path("${barcode}/JWR_check_parsed.csv") + tuple val(barcode), path("${barcode}/${barcode}_identified_SPvariants.csv"), emit: identified_SPvariants script: """ - Rscript /Junctions_NanoSplicer.R -c ${txt} -j ${csv} + mkdir ${barcode} + cd ${barcode}/ + Rscript /Junctions_NanoSplicer.R -c ../${txt} -j ../${csv} + mv identified_SPvariants.csv ${barcode}_identified_SPvariants.csv """ } \ No newline at end of file diff --git a/src/nf_modules/minimap2/main.nf b/src/nf_modules/minimap2/main.nf index 5e101b7..91c9193 100755 --- a/src/nf_modules/minimap2/main.nf +++ b/src/nf_modules/minimap2/main.nf @@ -89,22 +89,25 @@ params.mapping_hbv_genome = "-ax splice --secondary=no -G 1650 -u n --eqx" process hbv_genome { container = "${container_url}" label "big_mem_multi_cpus" + tag "${barcode}" if (params.minimap2_genome_out != "") { publishDir "results/${params.minimap2_genome_out}", mode: 'copy' } input: - path(fastq) + tuple val(barcode), path(fastq) path(genome) output: - path("*"), emit: bam + tuple val(barcode), path("${barcode}/${barcode}_res.bam"), emit: bam script: memory = "${task.memory}" - ~/\s*GB/ memory = memory.toInteger() / (task.cpus + 1.0) """ - minimap2 ${params.mapping_hbv_genome} -t${task.cpus} -K ${memory} ${genome} ${fastq} | - samtools view -Shb - > res.bam + mkdir ${barcode} + cd ${barcode}/ + minimap2 ${params.mapping_hbv_genome} -t ${task.cpus} -K ${memory} ../${genome} ../${fastq} | + samtools view -Shb - > ${barcode}_res.bam """ } \ No newline at end of file diff --git a/src/nf_modules/nanosplicer/main.nf b/src/nf_modules/nanosplicer/main.nf new file mode 100644 index 0000000..71908d7 --- /dev/null +++ b/src/nf_modules/nanosplicer/main.nf @@ -0,0 +1,26 @@ +version = "1.0" +container_url = "xgrand/nanosplicer:${version}" + +params.nanosplicer_out = "" +process jwr_checker { + container = "${container_url}" + label "big_mem_multi_cpus" + tag "${barcode}" + if (params.nanosplicer_out != "") { + publishDir "results/${params.nanosplicer_out}", mode: 'copy' + } + + input: + tuple val(barcode), path(bam), path(index) + + output: + tuple val(barcode), path("${barcode}/${barcode}_JWR_check.h5.csv"), emit: nanosplicer_jwr + + script: + """ + mkdir ${barcode} + cd ${barcode}/ + python3 /NanoSplicer/bin/JWR_checker.py --output_csv ../${bam} ${barcode}_JWR_check.h5 + """ +} + diff --git a/src/nf_modules/rna_count/main.nf b/src/nf_modules/rna_count/main.nf index a2ae2ce..06afba6 100644 --- a/src/nf_modules/rna_count/main.nf +++ b/src/nf_modules/rna_count/main.nf @@ -5,22 +5,24 @@ params.rna_count_out = "" process rna_count{ container = "${container_url}" label "small_mem_mono_cpus" - tag "RNA quantification" + tag "${barcode}" if (params.rna_count_out != "") { publishDir "results/${params.rna_count_out}", mode: 'copy' } input: - path(spvariants) - path(classification) + tuple val(barcode), path(spvariants) + tuple val(barcode), path(classification) output: - path("*.csv") - path("*.pdf") - path("*.png") + path("${barcode}/*.csv") + path("${barcode}/*.pdf") + path("${barcode}/*.png") script: """ - Rscript /HBV_RNAs_count.R -s ${spvariants} -c ${classification} + mkdir ${barcode} + cd ${barcode}/ + Rscript /HBV_RNAs_count.R -s ../${spvariants} -c ../${classification} """ } diff --git a/src/nf_modules/samtools/main.nf b/src/nf_modules/samtools/main.nf index 0b48cd5..d44804b 100755 --- a/src/nf_modules/samtools/main.nf +++ b/src/nf_modules/samtools/main.nf @@ -24,21 +24,23 @@ samtools sort -@ ${task.cpus} ${bam} -O BAM -o ${bam.simpleName}_sorted.bam params.start_position_counts_out = "" process start_position_counts { - tag "Start positions count" + tag "${barcode}" label "big_mem_multi_cpus" publishDir "results/${params.start_position_counts_out}", mode: 'copy' input: - tuple path(bam), path(index) + tuple val(barcode), path(bam), path(index) output: - path "*", emit: count + tuple val(barcode), path("${barcode}/${barcode}_start_positions_counts.txt"), emit: count script: """ -samtools view -F 260 ${bam} | +mkdir ${barcode} +cd ${barcode}/ +samtools view -F 260 ../${bam} | cut -f 1,4 | - sort > Start_positions_counts.txt + sort > ${barcode}_start_positions_counts.txt """ } @@ -67,20 +69,22 @@ params.indexed_bam_out ="" process sort_index_bam { container = "${container_url}" label "big_mem_multi_cpus" - tag "sorting" + tag "${barcode}" if (params.indexed_bam_out != "") { publishDir "results/${params.indexed_bam_out}", mode: 'copy' } input: - path(bam) + tuple val(barcode), path(bam) output: - tuple path("*sorted.bam"), path("*.bai"), emit: indexed_bam + tuple val(barcode), path("${barcode}/*sorted.bam"), path("${barcode}/*.bai"), emit: indexed_bam script: """ -samtools sort -@ ${task.cpus} ${bam} -o ${bam.simpleName}_sorted.bam -samtools index -@ ${task.cpus} ${bam.simpleName}_sorted.bam +mkdir ${barcode} +cd ${barcode}/ +samtools sort -@ ${task.cpus} ../${bam} -o ${barcode}_sorted.bam +samtools index -@ ${task.cpus} ${barcode}_sorted.bam """ } \ No newline at end of file diff --git a/src/nf_modules/seqkit/main.nf b/src/nf_modules/seqkit/main.nf index e6d7e6b..683c924 100755 --- a/src/nf_modules/seqkit/main.nf +++ b/src/nf_modules/seqkit/main.nf @@ -29,35 +29,37 @@ params.seqkit_grep_out = "" process seqkit_grep { container = "${container_url}" label "small_mem_multi_cpus" - tag "Filter_reads" + tag "${barcode}" if (params.seqkit_grep_out != "") { publishDir "results/${params.seqkit_grep_out}", mode: 'copy' } input: - path(fastq) + tuple val(barcode), path(fastq) val(adapt) val(gsp) output: - path("filtered_5RACE_GSP.fastq"), emit: filtered_fastq - path("seq_stats.csv") - path("*.txt") - path("filtered_5RACE.fastq") + tuple val(barcode), path("${barcode}/${barcode}_filtered_5RACE_GSP.fastq"), emit: filtered_fastq + path("${barcode}/*.csv") + path("${barcode}/*.txt") + path("${barcode}/${barcode}_filtered_5RACE.fastq") script: lgadapt = Math.round(adapt.size().div(10)) lggsp = Math.round(gsp.size().div(10)) """ + mkdir ${barcode} + cd ${barcode}/ echo "mismatch allowed to 5'RACE adapter: ${lgadapt}" > mismatch.txt echo "mismatch allowed to Gene Specific primer: ${lggsp}" >> mismatch.txt echo ${adapt} > adapt.txt echo ${gsp} > gsp.txt - seqkit grep -i -f adapt.txt -m ${lgadapt} ${fastq} -o filtered_5RACE.fastq -j ${task.cpus} - seqkit grep -i -f gsp.txt -m ${lggsp} filtered_5RACE.fastq -o filtered_5RACE_GSP.fastq -j ${task.cpus} - seqkit stats ${fastq} -T -j ${task.cpus} > seq_stats.csv - seqkit stats filtered_5RACE.fastq -T -j ${task.cpus} | tail -n1 >> seq_stats.csv - seqkit stats filtered_5RACE_GSP.fastq -T -j ${task.cpus} | tail -n1 >> seq_stats.csv + seqkit grep -i -f adapt.txt -m ${lgadapt} ../${fastq} -o ${barcode}_filtered_5RACE.fastq -j ${task.cpus} + seqkit grep -i -f gsp.txt -m ${lggsp} ${barcode}_filtered_5RACE.fastq -o ${barcode}_filtered_5RACE_GSP.fastq -j ${task.cpus} + seqkit stats ../${fastq} -T -j ${task.cpus} > ${barcode}_seq_stats.csv + seqkit stats ${barcode}_filtered_5RACE.fastq -T -j ${task.cpus} | tail -n1 >> ${barcode}_seq_stats.csv + seqkit stats ${barcode}_filtered_5RACE_GSP.fastq -T -j ${task.cpus} | tail -n1 >> ${barcode}_seq_stats.csv """ } @@ -65,21 +67,24 @@ params.fastq_out = "" process concatenate { container = "${container_url}" label "big_mem_multi_cpus" - tag "Concatenate_reads" + tag "${barcode}" if (params.fastq_out != "") { publishDir "results/${params.fastq_out}", mode: 'copy' } input: - path fastq + tuple val(barcode), path(fastq) output: - path "merged.fastq.gz", emit: merged_fastq + tuple val(barcode), path("${barcode}/${barcode}_merged.fastq.gz"), emit: merged_fastq script: """ - path=\$(readlink -f ${fastq}) - seqkit scat -j ${task.cpus} -f \${path} --gz-only > merged.fastq - gzip merged.fastq + mv ${fastq} path_${fastq} + mkdir ${barcode} + cd ${barcode}/ + path=\$(readlink -f ../path_${fastq}) + seqkit scat -j ${task.cpus} -f \${path} --gz-only > ${barcode}_merged.fastq + gzip ${barcode}_merged.fastq """ } \ No newline at end of file diff --git a/src/nf_modules/start_positions/main.nf b/src/nf_modules/start_positions/main.nf index 27b8ebf..49f14e6 100644 --- a/src/nf_modules/start_positions/main.nf +++ b/src/nf_modules/start_positions/main.nf @@ -5,21 +5,26 @@ params.start_position_counts_out ="" process start_position_individuals{ container = "${container_url}" label "small_mem_mono_cpus" - tag "start positions" + tag "${barcode}" if (params.start_position_counts_out != "") { publishDir "results/${params.start_position_counts_out}", mode: 'copy' } input: - path(start_position_counts) + tuple val(barcode), path(start_position_counts) output: - path("Rplots.pdf") - path("*.png") - path("Count_reads_per_promoter.tsv") - path("classification_of_reads_per_RNA.txt"), emit: classification_of_reads + path("${barcode}/*.pdf") + path("${barcode}/*.png") + path("${barcode}/*.tsv") + tuple val(barcode), path("${barcode}/${barcode}_classification_of_reads_per_RNA.txt"), emit: classification_of_reads script: """ - Rscript /Start_positions.R -i ${start_position_counts} + mkdir ${barcode} + cd ${barcode}/ + Rscript /Start_positions.R -i ../${start_position_counts} + mv classification_of_reads_per_RNA.txt ${barcode}_classification_of_reads_per_RNA.txt + mv Count_reads_per_promoter.tsv ${barcode}_count_reads_per_promoter.tsv + mv Rplots.pdf ${barcode}_Rplots.pdf """ } \ No newline at end of file -- GitLab