diff --git a/src/.docker_modules/r-bolero/1.0/HBV_RNAs_count.R b/src/.docker_modules/r-bolero/1.0/HBV_RNAs_count.R index 16d67f3f8dd8a9161ff7006c410bda1291d45e63..16d8b1d8cbbc7aff4d56551a04e78cb060980aea 100644 --- a/src/.docker_modules/r-bolero/1.0/HBV_RNAs_count.R +++ b/src/.docker_modules/r-bolero/1.0/HBV_RNAs_count.R @@ -13,7 +13,9 @@ option_list = list( make_option(c("-s", "--SPvariants"), type="character", default=NULL, help="input identified SP variants table (.csv)", metavar="character"), make_option(c("-c", "--classification"), type="character", default=NULL, - help="input classification of reads file (.txt)", metavar="character")) + help="input classification of reads file (.txt)", metavar="character"), + make_option(c("-b", "--barcode"), type="character", default=NULL, + help="input barcode", metavar="character")) opt_parser = OptionParser(option_list=option_list) opt = parse_args(opt_parser) @@ -85,7 +87,7 @@ ggplot(countSP, aes(x = "percent", scale_fill_manual(values = countSP$teinte) + labs(fill = "spliced-variants") -ggsave(file = "SP_proportion_camembert.png", +ggsave(file = paste0(opt$barcode, "_SP_proportion_piechart.png"), scale = 2, width = 1920, height = 1080, @@ -100,7 +102,7 @@ ggplot(countSP, aes(x = nom, y = proportion, fill = nom)) + xlab(label = "spliced-variants") + ylab(label = "percent") -ggsave(file = "SP_proportion.png", +ggsave(file = paste0(opt$barcode, "_SP_proportion.png"), scale = 2, width = 1920, height = 1080, @@ -127,7 +129,7 @@ count_species <- df_species %>% count(species) count_species <- dplyr::mutate(count_species, percent = (as.numeric(n)/sum(as.numeric(n))*100)) #print(count_species) -write.table(df_species, file = "All_reads_identified.csv", +write.table(df_species, file = paste0(opt$barcode, "_all_reads_identified.csv"), sep = "\t", quote = FALSE, row.names = FALSE) # Null dataset: @@ -165,7 +167,7 @@ count_species_SPxx <- dplyr::mutate(count_species_SPxx, percent=(as.numeric(n)/sum(as.numeric(n))*100)) 
#print(count_species_SPxx) # save the tab: -write.csv(count_species_SPxx, file = "Count_canonical_species_SPxx.csv") +write.csv(count_species_SPxx, file = paste0(opt$barcode, "_count_canonical_species_SPxx.csv")) # prepare to plot: count_species_SPxx <- dplyr::inner_join(palette_complete, @@ -176,7 +178,7 @@ count_species_SPxx <- dplyr::inner_join(palette_complete, count_species_SPxx$nom <- factor(count_species_SPxx$nom, levels = all_species_name) # Save: -write.csv(count_species, file = "Count_species.csv") +write.csv(count_species, file = paste0(opt$barcode, "_count_species.csv")) # RNA species composition all species: count_species <- dplyr::inner_join(palette_complete, count_species, @@ -194,7 +196,7 @@ ggplot(count_species, labs(fill = "RNA species & spliced-variants") + xlab(label = "RNA species & spliced-variants") -ggsave(file = "Count_RNAs_species.png", +ggsave(file = paste0(opt$barcode, "_count_RNAs_species.png"), scale = 2, width = 1920, height = 1080, @@ -220,7 +222,7 @@ ggplot(count_species_clear, labs(fill = "RNA species & spliced-variants") + xlab(label = "RNA species & spliced-variants") -ggsave(file = "Count_RNAs_species_clear.png", +ggsave(file = paste0(opt$barcode, "_count_RNAs_species_clear.png"), scale = 2, width = 1920, height = 1080, @@ -246,7 +248,7 @@ ggplot(count_clear, aes(x = "percent", scale_fill_manual(values = count_clear$teinte) + labs(fill = "spliced-variants") -ggsave(file = "SP_clear_proportion_camembert.png", +ggsave(file = paste0(opt$barcode, "_SP_clear_proportion_piechart.png"), scale = 2, width = 1920, height = 1080, @@ -263,7 +265,7 @@ ggplot(count_clear, aes(x = nom, xlab(label = "spliced-variants") + ylab(label = "percent") -ggsave(file = "SP_clear_proportion.png", +ggsave(file = paste0(opt$barcode, "_SP_clear_proportion.png"), scale = 2, width = 1920, height = 1080, @@ -281,7 +283,7 @@ ggplot(count_species_SPxx, aes(x = "species", ylab(label = "TSS usage") + xlab(label = "percent") -ggsave(file = 
"Count_RNAs_species_camembert.png", +ggsave(file = paste0(opt$barcode, "_count_RNAs_species_piechart.png"), scale = 2, width = 1920, height = 1080, diff --git a/src/.docker_modules/r-bolero/1.0/Junctions_NanoSplicer.R b/src/.docker_modules/r-bolero/1.0/Junctions_NanoSplicer.R index 5567a378b0c61e716092c77a7702f2360b2bf2b2..0685a2199e4d9c4ea60464b46dceabd6cb2d07a6 100644 --- a/src/.docker_modules/r-bolero/1.0/Junctions_NanoSplicer.R +++ b/src/.docker_modules/r-bolero/1.0/Junctions_NanoSplicer.R @@ -15,7 +15,9 @@ option_list = list( make_option(c("-c", "--classification"), type="character", default="./classification.txt", help="input classification or reads file (.txt)", metavar="character"), make_option(c("-j", "--jwr"), type="character", default=NULL, - help="input nanosplicer results table (.csv)", metavar="character")) + help="input nanosplicer results table (.csv)", metavar="character"), + make_option(c("-b", "--barcode"), type="character", default=NULL, + help="input barcode", metavar="character")) opt_parser = OptionParser(option_list=option_list) opt = parse_args(opt_parser) reads_pos <- read.table(opt$classification, @@ -113,7 +115,7 @@ df$acceptor_site <- sapply(df$pg_acceptor, assignation_acceptor) df <- dplyr::mutate(df, junction = paste0(donor_site, acceptor_site)) -write.table(df, file = "JWR_check_parsed.csv", row.names = FALSE, sep = "\t") +write.table(df, file = paste0(opt$barcode, "_JWR_check_parsed.csv"), row.names = FALSE, sep = "\t") duplicated2 <- function(x){ if (sum(dup <- duplicated(x))==0) @@ -293,107 +295,8 @@ SP_variant_unique <- df_SPvariants %>% select(id, SP_name) SP_variant_unique <- SP_variant_unique[!duplicated(SP_variant_unique$id),] # distinct(SP_variant_unique, id) write.table(df_SPvariants, - "identified_SPvariants.csv", + paste0(opt$barcode, "_identified_SPvariants.csv"), row.names = FALSE, sep = "\t", quote = FALSE) -ggplot(df, aes(x=pg_donor)) + - geom_histogram(aes(y=after_stat(density)),color="darkblue", fill="lightblue") + 
- geom_density(alpha=.2, fill="lightblue") + - geom_vline(aes(xintercept=median(pg_donor)), - color="blue", linetype="dashed", linewidth=1) + - geom_vline(aes(xintercept=quantile(pg_donor, 0.025)), - linetype="dashed", linewidth=0.25) + - geom_vline(aes(xintercept=quantile(pg_donor, 0.975)), - linetype="dashed", linewidth=0.25) + - geom_vline(aes(xintercept=quantile(pg_donor, 0.01)), - color="green", linetype="dashed", linewidth=0.25) + - geom_vline(aes(xintercept=quantile(pg_donor, 0.99)), - color="green", linetype="dashed", linewidth=0.25) + - geom_vline(aes(xintercept=(median(pg_donor)+sd(pg_donor))), - color="red", linewidth=0.5) + - geom_vline(aes(xintercept=(median(pg_donor)-sd(pg_donor))), - color="red", linewidth=0.5) + - scale_x_continuous(breaks=c(min(df$pg_donor), - quantile(df$pg_donor, 0.025), - quantile(df$pg_donor, 0.005), - median(df$pg_donor)-sd(df$pg_donor), - median(df$pg_donor), - median(df$pg_donor)+sd(df$pg_donor), - quantile(df$pg_donor, 0.975), - quantile(df$pg_donor, 0.995), - max(df$pg_donor)), - label = c(min(df$pg_donor), - floor(quantile(df$pg_donor, 0.025)), - floor(quantile(df$pg_donor, 0.005)), - round(median(df$pg_donor)-sd(df$pg_donor)), - median(df$pg_donor), - round(median(df$pg_donor)+sd(df$pg_donor)), - floor(quantile(df$pg_donor, 0.975))+1, - round(quantile(df$pg_donor, 0.995))+1, - max(df$pg_donor))) + - theme(axis.text.x = element_text(angle = 45)) - -ggsave(filename = "Donor_curve.png", - device = "png", - scale = 1, - width = 1920, - height = 1080, - units = "px", - dpi = 320) - -ggplot(df, aes(x=pg_acceptor)) + - geom_histogram(aes(y=after_stat(density)),color="red", fill="darksalmon") + - geom_density(alpha=.2, fill="darksalmon") + - geom_vline(aes(xintercept=median(pg_acceptor)), - color="red", linetype="dashed", linewidth=1) + - geom_vline(aes(xintercept=quantile(pg_acceptor, 0.025)), - linetype="dashed", linewidth=0.25) + - geom_vline(aes(xintercept=quantile(pg_acceptor, 0.975)), - linetype="dashed", linewidth=0.25) + 
- geom_vline(aes(xintercept=quantile(pg_acceptor, 0.005)), - color="green", linetype="dashed", linewidth=0.25) + - geom_vline(aes(xintercept=quantile(pg_acceptor, 0.995)), - color="green", linetype="dashed", linewidth=0.25) + - geom_vline(aes(xintercept=(median(pg_acceptor)+sd(pg_acceptor))), - color="blue", linewidth=0.5) + - geom_vline(aes(xintercept=(median(pg_acceptor)-sd(pg_acceptor))), - color="blue", linewidth=0.5) + - scale_x_continuous(breaks=c(min(df$pg_acceptor), - quantile(df$pg_acceptor, 0.025), - quantile(df$pg_acceptor, 0.005), - median(df$pg_acceptor)-sd(df$pg_acceptor), - median(df$pg_acceptor), - median(df$pg_acceptor)+sd(df$pg_acceptor), - quantile(df$pg_acceptor, 0.975), - quantile(df$pg_acceptor, 0.995), - max(df$pg_acceptor)), - label = c(min(df$pg_acceptor), - floor(quantile(df$pg_acceptor, 0.025)), - floor(quantile(df$pg_acceptor, 0.005)), - round(median(df$pg_acceptor)-sd(df$pg_acceptor)), - median(df$pg_acceptor), - round(median(df$pg_acceptor)+sd(df$pg_acceptor)), - floor(quantile(df$pg_acceptor, 0.975))+1, - floor(quantile(df$pg_acceptor, 0.995))+1, - max(df$pg_acceptor))) + - theme(axis.text.x = element_text(angle = 45)) - -ggsave(filename = "Acceptor_curve.png", - device = "png", - scale = 1, - width = 1920, - height = 1080, - units = "px", - dpi = 320) - -# Graphs and tests: - -# sink("test_shapiro.txt") -# print("Normality test: Shapiro-Wilk") -# print("Donor site:") -# print(shapiro.test(df$pg_donor)) -# print("Acceptor site:") -# print(shapiro.test(df$pg_acceptor)) -# sink() \ No newline at end of file diff --git a/src/.docker_modules/r-bolero/1.0/Start_positions.R b/src/.docker_modules/r-bolero/1.0/Start_positions.R index 830c52e07598df6019bacf72c72128f99688a3d9..fcffb8ef479643092c27285ef0706dc6fbdcc87c 100644 --- a/src/.docker_modules/r-bolero/1.0/Start_positions.R +++ b/src/.docker_modules/r-bolero/1.0/Start_positions.R @@ -14,7 +14,9 @@ conflict_prefer("lag", "dplyr") # Load Start_positions_count files: option_list = list( 
make_option(c("-i", "--input"), type="character", default=NULL, - help="input start position file (.txt)", metavar="character") + help="input start position file (.txt)", metavar="character"), + make_option(c("-b", "--barcode"), type="character", default=NULL, + help="input barcode", metavar="character") ) opt_parser = OptionParser(option_list=option_list) @@ -109,7 +111,7 @@ sam_bc01$promoter <- sapply(sam_bc01$start_position, classify_reads) write.table(sam_bc01, - file = "classification_of_reads_per_RNA.txt", + file = paste0(opt$barcode, "_classification_of_reads_per_RNA.txt"), quote = FALSE, sep = "\t", row.names = FALSE) @@ -164,7 +166,7 @@ abs_count_reads <- cbind(c(as.vector(promoters),"total"), abs_count_reads) colnames(abs_count_reads) <- c("promoter", "read_number") write.table(abs_count_reads, - file = "Count_reads_per_promoter.tsv", + file = paste0(opt$barcode, "_count_reads_per_promoter.tsv"), quote = FALSE, sep = "\t", row.names = FALSE) @@ -201,7 +203,7 @@ plot_camembert <- function(barcode, df, tot) { print(camembert) - ggsave(filename = paste0("./Reads_start_promoters_", barcode, "_camembert.png"), + ggsave(filename = paste0("./", opt$barcode, "_reads_start_promoters_piechart.png"), plot = last_plot(), scale = 1, width = 1920, diff --git a/src/bolero.nf b/src/bolero.nf index 3ff3c53e85c17a21141a0f132b670b91291db2d2..0e7e3fa1f410be53b1172ce7118491d028742ee4 100755 --- a/src/bolero.nf +++ b/src/bolero.nf @@ -31,8 +31,8 @@ def helpMessage() { Mandatory arguments: --input [path] Path to the folder containing fast5 files. If skip basecalling option enabled, path to fastq files folder. - --adapt [str] Sequence of 5'RACE adapter. - --gsp [str] Sequence of gene-specific primer used in 5'RACE amplification step. + --adapt [file] Path to the txt/fasta file containing the sequence of 5'RACE adapter. + --gsp [file] Path to the txt/fasta file containing the sequence of gene-specific primer used in 5'RACE amplification step. 
References: --genome [file] Path to the fasta file containing the genome. @@ -128,10 +128,17 @@ Channel .ifEmpty { error "No fast5/q folder defined." } .set { input } +/* +Channel + .fromPath( params.adapt ) + .ifEmpty { error "No adapter sequence defined." } + .set { adapt } + Channel - .of( params.adapt ) - .ifEmpty { error "No adapter sequence defined." } - .set { adapt } + .fromPath( params.gsp ) + .ifEmpty { error "No adapter sequence defined." } + .set { gsp } +*/ Channel .fromPath( params.genome ) @@ -143,7 +150,10 @@ Channel .ifEmpty { error "No annotation defined, a gtf file describing transcripts and splice variants." } .set { gtf } -// .map( it -> [it.baseName, it]) +Channel + .fromPath(params.input+'*/', type: 'dir') + .map(it -> [it.baseName, it]) + .set{barcodes} /* **************************************************************** @@ -161,10 +171,7 @@ if(!params.skipBC) { } } -// Replace concatenate by seqkit fct to parallelization: include { concatenate } from "./nf_modules/seqkit/main.nf" -//include { concatenate } from "./nf_modules/concatenate/main.nf" - include { cut_5pRACE } from "./nf_modules/cutadapt/main.nf" include { hbv_genome } from "./nf_modules/minimap2/main.nf" include { seqkit_grep } from "./nf_modules/seqkit/main.nf" @@ -178,9 +185,6 @@ include { junctions_nanosplicer } from "./nf_modules/junction_nanosplicer/main.n include { rna_count } from "./nf_modules/rna_count/main.nf" -// creation des fonctions NanoSplicer: -// include { jwr_check } from "./nf_modules/nanosplicer/main.nf" - /* **************************************************************** Workflow @@ -191,40 +195,38 @@ workflow { //######################## BASECALLING ######################## - if(params.skipBC) { - concatenate(params.input) - // Replace by seqkit scat to parallelization + if(params.skipBC) { // we take fastq files as input and skip basecalling + concatenate(barcodes) } - else { + + //il reste à adapter ça + else { // we take fast5 files as input and proceed 
to basecalling with guppy if(params.gpu_mode) { basecall_fast5_gpu(input) concatenate(basecall_fast5_gpu.out.pass) - // Replace by seqkit scat to parallelization } else { basecall_fast5_cpu(input) concatenate(basecall_fast5_cpu.out.pass) - // Replace by seqkit scat to parallelization } } + + + //####################### PREPROCESSING ####################### - - - + + //Filtration (seqkit_grep looks for the 5'RACE and the gsp patterns in the reads to keep only mature ARNs) seqkit_grep(concatenate.out.merged_fastq, params.adapt, params.gsp) //Cut of the 5'RACE sequence cut_5pRACE(seqkit_grep.out.filtered_fastq, params.adapt) - - //########################## MAPPING ########################## - - hbv_genome(cut_5pRACE.out.fastq_cutadapt, genome) + hbv_genome(cut_5pRACE.out.fastq_cutadapt, genome.collect()) + sort_index_bam(hbv_genome.out.bam) - // index_bam(sort_bam_genome.out.sorted_bam.collect()) //###################### START POSITIONS ####################### diff --git a/src/nextflow.config b/src/nextflow.config index 0ffe3409875cf9b59d9374ab076491e99395affd..698aee8ce7792caf41a9726e7cc8c11f67424dbd 100755 --- a/src/nextflow.config +++ b/src/nextflow.config @@ -18,7 +18,7 @@ profiles { docker.enabled = true process { errorStrategy = 'finish' - memory = '16GB' + memory = '12GB' withLabel: big_mem_mono_cpus { cpus = 1 } @@ -47,7 +47,7 @@ profiles { podman.enabled = true process { errorStrategy = 'finish' - memory = '16GB' + memory = '12GB' withLabel: big_mem_mono_cpus { cpus = 1 } @@ -72,13 +72,45 @@ profiles { } } } + pollux { + singularity.enabled = true + singularity.cacheDir = "./bin/" + singularity.bind = "/home" + process { + errorStrategy = 'finish' + memory = '32GB' + withLabel: big_mem_mono_cpus { + cpus = 1 + } + withLabel: big_mem_multi_cpus { + cpus = 16 + } + withLabel: small_mem_mono_cpus { + cpus = 1 + memory = '2GB' + } + withLabel: small_mem_multi_cpus { + cpus = 8 + memory = '2GB' + } + withLabel: mid_mem_mono_cpus { + cpus = 1 + memory = '8GB' + } 
+ withLabel: mid_mem_multi_cpus { + cpus = 8 + memory = '8GB' + } + } + } + singularity { singularity.enabled = true singularity.cacheDir = "./bin/" singularity.bind = "/home" process { errorStrategy = 'finish' - memory = '16GB' + memory = '12GB' withLabel: big_mem_mono_cpus { cpus = 1 } diff --git a/src/nf_modules/cutadapt/main.nf b/src/nf_modules/cutadapt/main.nf index b7ffd3b37ab7288d518cb350524e07881458a5b0..7c72b448f940835231dab6dc00b05bbfb75cb7bb 100755 --- a/src/nf_modules/cutadapt/main.nf +++ b/src/nf_modules/cutadapt/main.nf @@ -4,23 +4,23 @@ container_url = "xgrand/cutadapt:${version}" process cut_5pRACE { container = "${container_url}" label "small_mem_mono_cpus" - tag "cutadapt" + tag "${barcode}" if (params.cutadapt_out != "") { publishDir "results/${params.cutadapt_out}", mode: 'copy' } input: - path(fastq) + tuple val(barcode), path(fastq) val(adapt) output: - path("*_cut_*"), emit: fastq_cutadapt + tuple val(barcode), path("${barcode}_merged_porechoped_cut_fastq.fastq"), emit: fastq_cutadapt """ cutadapt -e 0.2 -g ${adapt} \ --revcomp \ - -o "merged_porechoped_cut_fastq.fastq" \ + -o "${barcode}_merged_porechoped_cut_fastq.fastq" \ ${fastq} """ } \ No newline at end of file diff --git a/src/nf_modules/junction_nanosplicer/main.nf b/src/nf_modules/junction_nanosplicer/main.nf index 9f0a209ea7f5d158f1d0e277ad30294ead8b4d69..fb391396b8bee9ee39bee4a80818d7db4947e586 100644 --- a/src/nf_modules/junction_nanosplicer/main.nf +++ b/src/nf_modules/junction_nanosplicer/main.nf @@ -5,23 +5,23 @@ params.nanosplicer_out = "" process junctions_nanosplicer{ container = "${container_url}" label "small_mem_mono_cpus" - tag "identification de variants d'épissage" + tag "${barcode}" if (params.nanosplicer_out != "") { publishDir "results/${params.nanosplicer_out}", mode: 'copy' } input: - path(txt) - path(csv) + tuple val(barcode), path(txt) + tuple val(barcode), path(csv) output: - path("Rplots.pdf") - path("JWR_check_parsed.csv") - path("*.png") - 
path("identified_SPvariants.csv"), emit: identified_SPvariants + path("${barcode}/${barcode}_JWR_check_parsed.csv") + tuple val(barcode), path("${barcode}/${barcode}_identified_SPvariants.csv"), emit: identified_SPvariants script: """ - Rscript /Junctions_NanoSplicer.R -c ${txt} -j ${csv} + mkdir ${barcode} + cd ${barcode}/ + Rscript /Junctions_NanoSplicer.R -c ../${txt} -j ../${csv} -b ${barcode} """ } \ No newline at end of file diff --git a/src/nf_modules/minimap2/main.nf b/src/nf_modules/minimap2/main.nf index 5e101b78394ec6a73c34017d532df408400274f2..91c91931b131b820a327686623a614cd26b4ab7a 100755 --- a/src/nf_modules/minimap2/main.nf +++ b/src/nf_modules/minimap2/main.nf @@ -89,22 +89,25 @@ params.mapping_hbv_genome = "-ax splice --secondary=no -G 1650 -u n --eqx" process hbv_genome { container = "${container_url}" label "big_mem_multi_cpus" + tag "${barcode}" if (params.minimap2_genome_out != "") { publishDir "results/${params.minimap2_genome_out}", mode: 'copy' } input: - path(fastq) + tuple val(barcode), path(fastq) path(genome) output: - path("*"), emit: bam + tuple val(barcode), path("${barcode}/${barcode}_res.bam"), emit: bam script: memory = "${task.memory}" - ~/\s*GB/ memory = memory.toInteger() / (task.cpus + 1.0) """ - minimap2 ${params.mapping_hbv_genome} -t${task.cpus} -K ${memory} ${genome} ${fastq} | - samtools view -Shb - > res.bam + mkdir ${barcode} + cd ${barcode}/ + minimap2 ${params.mapping_hbv_genome} -t ${task.cpus} -K ${memory} ../${genome} ../${fastq} | + samtools view -Shb - > ${barcode}_res.bam """ } \ No newline at end of file diff --git a/src/nf_modules/nanosplicer/main.nf b/src/nf_modules/nanosplicer/main.nf new file mode 100644 index 0000000000000000000000000000000000000000..71908d7e556563781f6c44d6a8483c51488a3bd6 --- /dev/null +++ b/src/nf_modules/nanosplicer/main.nf @@ -0,0 +1,26 @@ +version = "1.0" +container_url = "xgrand/nanosplicer:${version}" + +params.nanosplicer_out = "" +process jwr_checker { + container = "${container_url}" + label 
"big_mem_multi_cpus" + tag "${barcode}" + if (params.nanosplicer_out != "") { + publishDir "results/${params.nanosplicer_out}", mode: 'copy' + } + + input: + tuple val(barcode), path(bam), path(index) + + output: + tuple val(barcode), path("${barcode}/${barcode}_JWR_check.h5.csv"), emit: nanosplicer_jwr + + script: + """ + mkdir ${barcode} + cd ${barcode}/ + python3 /NanoSplicer/bin/JWR_checker.py --output_csv ../${bam} ${barcode}_JWR_check.h5 + """ +} + diff --git a/src/nf_modules/ont-guppy/main.nf b/src/nf_modules/ont-guppy/main.nf index b4ea29f9fbdf61b947cbc6f0d8f6ba16f0365184..f6fddc94e1b44bc71d9ef11e3eb76c876b2d9aa5 100644 --- a/src/nf_modules/ont-guppy/main.nf +++ b/src/nf_modules/ont-guppy/main.nf @@ -39,8 +39,10 @@ process basecall_fast5_gpu { """ echo "Start basecalling using GPUs." # guppy_basecaller --print_workflows +find -type f -name "*.fast5" > allfast5files.txt guppy_basecaller --compress_fastq \ -i ${fast5_folder} \ + --input_file_list allfast5files.txt \ -s . \ --flowcell ${params.flowcell} \ --kit ${params.kit} \ @@ -82,8 +84,10 @@ process basecall_fast5_cpu { script: """ echo "Start basecalling using CPUs." +find ${fast5_folder} -type f -name "*.fast5" > allfast5files.txt guppy_basecaller --compress_fastq \ - -i ${fast5_folder} \ + -i / \ + --input_file_list allfast5files.txt \ -s . 
\ --cpu_threads_per_caller ${params.cpu_threads_per_caller} \ --num_callers ${params.num_callers} \ diff --git a/src/nf_modules/rna_count/main.nf b/src/nf_modules/rna_count/main.nf index a2ae2ce641efe1f9829f2d6712720cb9dea743af..06afba66cb2379922e83b700cbe43fab3b21b9c8 100644 --- a/src/nf_modules/rna_count/main.nf +++ b/src/nf_modules/rna_count/main.nf @@ -5,22 +5,24 @@ params.rna_count_out = "" process rna_count{ container = "${container_url}" label "small_mem_mono_cpus" - tag "RNA quantification" + tag "${barcode}" if (params.rna_count_out != "") { publishDir "results/${params.rna_count_out}", mode: 'copy' } input: - path(spvariants) - path(classification) + tuple val(barcode), path(spvariants) + tuple val(barcode), path(classification) output: - path("*.csv") - path("*.pdf") - path("*.png") + path("${barcode}/*.csv") + path("${barcode}/*.pdf") + path("${barcode}/*.png") script: """ - Rscript /HBV_RNAs_count.R -s ${spvariants} -c ${classification} + mkdir ${barcode} + cd ${barcode}/ + Rscript /HBV_RNAs_count.R -s ../${spvariants} -c ../${classification} -b ${barcode} """ } diff --git a/src/nf_modules/samtools/main.nf b/src/nf_modules/samtools/main.nf index 0b48cd511948cf0319496c7088dc2473fd7d40b2..d44804b71bb4d0b3953ca1cf078e78882393d943 100755 --- a/src/nf_modules/samtools/main.nf +++ b/src/nf_modules/samtools/main.nf @@ -24,21 +24,23 @@ samtools sort -@ ${task.cpus} ${bam} -O BAM -o ${bam.simpleName}_sorted.bam params.start_position_counts_out = "" process start_position_counts { - tag "Start positions count" + tag "${barcode}" label "big_mem_multi_cpus" publishDir "results/${params.start_position_counts_out}", mode: 'copy' input: - tuple path(bam), path(index) + tuple val(barcode), path(bam), path(index) output: - path "*", emit: count + tuple val(barcode), path("${barcode}/${barcode}_start_positions_counts.txt"), emit: count script: """ -samtools view -F 260 ${bam} | +mkdir ${barcode} +cd ${barcode}/ +samtools view -F 260 ../${bam} | cut -f 1,4 | - sort > 
Start_positions_counts.txt + sort > ${barcode}_start_positions_counts.txt """ } @@ -67,20 +69,22 @@ params.indexed_bam_out ="" process sort_index_bam { container = "${container_url}" label "big_mem_multi_cpus" - tag "sorting" + tag "${barcode}" if (params.indexed_bam_out != "") { publishDir "results/${params.indexed_bam_out}", mode: 'copy' } input: - path(bam) + tuple val(barcode), path(bam) output: - tuple path("*sorted.bam"), path("*.bai"), emit: indexed_bam + tuple val(barcode), path("${barcode}/*sorted.bam"), path("${barcode}/*.bai"), emit: indexed_bam script: """ -samtools sort -@ ${task.cpus} ${bam} -o ${bam.simpleName}_sorted.bam -samtools index -@ ${task.cpus} ${bam.simpleName}_sorted.bam +mkdir ${barcode} +cd ${barcode}/ +samtools sort -@ ${task.cpus} ../${bam} -o ${barcode}_sorted.bam +samtools index -@ ${task.cpus} ${barcode}_sorted.bam """ } \ No newline at end of file diff --git a/src/nf_modules/seqkit/main.nf b/src/nf_modules/seqkit/main.nf index e6d7e6b2396f486301b08d03328e05b5dd264685..683c924fe37a357baa16860c4520bcb800e16144 100755 --- a/src/nf_modules/seqkit/main.nf +++ b/src/nf_modules/seqkit/main.nf @@ -29,35 +29,37 @@ params.seqkit_grep_out = "" process seqkit_grep { container = "${container_url}" label "small_mem_multi_cpus" - tag "Filter_reads" + tag "${barcode}" if (params.seqkit_grep_out != "") { publishDir "results/${params.seqkit_grep_out}", mode: 'copy' } input: - path(fastq) + tuple val(barcode), path(fastq) val(adapt) val(gsp) output: - path("filtered_5RACE_GSP.fastq"), emit: filtered_fastq - path("seq_stats.csv") - path("*.txt") - path("filtered_5RACE.fastq") + tuple val(barcode), path("${barcode}/${barcode}_filtered_5RACE_GSP.fastq"), emit: filtered_fastq + path("${barcode}/*.csv") + path("${barcode}/*.txt") + path("${barcode}/${barcode}_filtered_5RACE.fastq") script: lgadapt = Math.round(adapt.size().div(10)) lggsp = Math.round(gsp.size().div(10)) """ + mkdir ${barcode} + cd ${barcode}/ echo "mismatch allowed to 5'RACE adapter: 
${lgadapt}" > mismatch.txt echo "mismatch allowed to Gene Specific primer: ${lggsp}" >> mismatch.txt echo ${adapt} > adapt.txt echo ${gsp} > gsp.txt - seqkit grep -i -f adapt.txt -m ${lgadapt} ${fastq} -o filtered_5RACE.fastq -j ${task.cpus} - seqkit grep -i -f gsp.txt -m ${lggsp} filtered_5RACE.fastq -o filtered_5RACE_GSP.fastq -j ${task.cpus} - seqkit stats ${fastq} -T -j ${task.cpus} > seq_stats.csv - seqkit stats filtered_5RACE.fastq -T -j ${task.cpus} | tail -n1 >> seq_stats.csv - seqkit stats filtered_5RACE_GSP.fastq -T -j ${task.cpus} | tail -n1 >> seq_stats.csv + seqkit grep -i -f adapt.txt -m ${lgadapt} ../${fastq} -o ${barcode}_filtered_5RACE.fastq -j ${task.cpus} + seqkit grep -i -f gsp.txt -m ${lggsp} ${barcode}_filtered_5RACE.fastq -o ${barcode}_filtered_5RACE_GSP.fastq -j ${task.cpus} + seqkit stats ../${fastq} -T -j ${task.cpus} > ${barcode}_seq_stats.csv + seqkit stats ${barcode}_filtered_5RACE.fastq -T -j ${task.cpus} | tail -n1 >> ${barcode}_seq_stats.csv + seqkit stats ${barcode}_filtered_5RACE_GSP.fastq -T -j ${task.cpus} | tail -n1 >> ${barcode}_seq_stats.csv """ } @@ -65,21 +67,24 @@ params.fastq_out = "" process concatenate { container = "${container_url}" label "big_mem_multi_cpus" - tag "Concatenate_reads" + tag "${barcode}" if (params.fastq_out != "") { publishDir "results/${params.fastq_out}", mode: 'copy' } input: - path fastq + tuple val(barcode), path(fastq) output: - path "merged.fastq.gz", emit: merged_fastq + tuple val(barcode), path("${barcode}/${barcode}_merged.fastq.gz"), emit: merged_fastq script: """ - path=\$(readlink -f ${fastq}) - seqkit scat -j ${task.cpus} -f \${path} --gz-only > merged.fastq - gzip merged.fastq + mv ${fastq} path_${fastq} + mkdir ${barcode} + cd ${barcode}/ + path=\$(readlink -f ../path_${fastq}) + seqkit scat -j ${task.cpus} -f \${path} --gz-only > ${barcode}_merged.fastq + gzip ${barcode}_merged.fastq """ } \ No newline at end of file diff --git a/src/nf_modules/start_positions/main.nf 
b/src/nf_modules/start_positions/main.nf index 27b8ebf35ef66e41db2de2a114008883e26e04d6..668d50a5726464764d78c9d73061b12452631263 100644 --- a/src/nf_modules/start_positions/main.nf +++ b/src/nf_modules/start_positions/main.nf @@ -5,21 +5,24 @@ params.start_position_counts_out ="" process start_position_individuals{ container = "${container_url}" label "small_mem_mono_cpus" - tag "start positions" + tag "${barcode}" if (params.start_position_counts_out != "") { publishDir "results/${params.start_position_counts_out}", mode: 'copy' } input: - path(start_position_counts) + tuple val(barcode), path(start_position_counts) output: - path("Rplots.pdf") - path("*.png") - path("Count_reads_per_promoter.tsv") - path("classification_of_reads_per_RNA.txt"), emit: classification_of_reads + path("${barcode}/*.pdf") + path("${barcode}/*.png") + path("${barcode}/*.tsv") + tuple val(barcode), path("${barcode}/${barcode}_classification_of_reads_per_RNA.txt"), emit: classification_of_reads script: """ - Rscript /Start_positions.R -i ${start_position_counts} + mkdir ${barcode} + cd ${barcode}/ + Rscript /Start_positions.R -i ../${start_position_counts} -b ${barcode} + mv Rplots.pdf ${barcode}_Rplots.pdf """ } \ No newline at end of file