diff --git a/src/main.nf b/src/main.nf
index a8d444b0af842b20a3b02cfdb9ed521127e36e9a..72cba21fde16bbf3a1a625efba3e25891f269d49 100644
--- a/src/main.nf
+++ b/src/main.nf
@@ -7,6 +7,7 @@ Testing pipeline for marseq scRNASeq analysis
 params.kmer_size = 12
 params.bootstrap = 10
 
+include { SUBSAMPLE_READ } from "./modules/sample_reads"
 include { SPLIT } from "./modules/split"
 include { FASTKMERS } from "./modules/fastkmers"
 include { MERGEKMER } from "./modules/mergekmer"
@@ -27,7 +28,8 @@ Channel.fromPath( file(params.csv) )
 Channel.fromPath(params.csv).set{params_csv}
 
 workflow {
-  SPLIT(fastq.r1.mix(fastq.r2))
+  SUBSAMPLE_READ(fastq.r1.mix(fastq.r2))
+  SPLIT(SUBSAMPLE_READ.out.fastq)
   FASTKMERS(SPLIT.out.fastq.transpose())
   MERGEKMER(FASTKMERS.out.csv.groupTuple())
   COLLATEKMER(MERGEKMER.out.csv.map{it -> [it[0].specie, it[1]] }.groupTuple())
diff --git a/src/modules/sample_reads.nf b/src/modules/sample_reads.nf
new file mode 100644
index 0000000000000000000000000000000000000000..859e9c295a8e3243e760dc6a28e2109d4d4fa0c2
--- /dev/null
+++ b/src/modules/sample_reads.nf
@@ -0,0 +1,116 @@
+/*
+ * SUBSAMPLE_READ: count the reads of every input FASTQ, then subsample each
+ * file with seqtk.
+ *
+ * take: fastq   - channel of [ meta, fastq ] tuples
+ * emit: fastq   - [ meta, subsampled fastq.gz ] tuples from SAMPLE_READS
+ *       version - versions.yml files of COUNT_READS and SAMPLE_READS
+ */
+workflow SUBSAMPLE_READ {
+  take:
+    fastq
+  main:
+    COUNT_READS(fastq)
+
+    // COUNT_READS prints a line count; a FASTQ record spans 4 lines.
+    COUNT_READS.out.fastq
+      .map { meta, n_lines, reads -> [meta, reads, n_lines.trim().toLong().intdiv(4)] }
+      .set { counted }
+
+    // NOTE(review): target depth is assumed to be the smallest read count
+    // over all inputs, so that every file ends at the same depth - confirm.
+    counted
+      .map { meta, reads, n_reads -> n_reads }
+      .min()
+      .set { target_size }
+
+    counted
+      .combine(target_size)
+      .map { meta, reads, n_reads, target -> [meta, reads, target] }
+      .set { fastq_to_sample }
+
+    SAMPLE_READS(fastq_to_sample)
+  emit:
+    fastq = SAMPLE_READS.out.reads
+    version = SAMPLE_READS.out.versions.mix(COUNT_READS.out.versions)
+}
+
+/*
+ * COUNT_READS: print the line count of the gzipped input FASTQ on stdout and
+ * pass the input file through as an output.
+ */
+process COUNT_READS {
+  tag "$meta.id"
+  label 'small_mem_mono_cpus'
+
+  container "lbmc/alpine:3.17"
+
+  input:
+    tuple val(meta), path(fastq)
+
+  output:
+    // includeInputs: the task writes no fastq.gz of its own, and staged inputs
+    // are otherwise left out of output globs.
+    tuple val(meta), stdout, path("*.fastq.gz", includeInputs: true), emit: fastq
+    path "versions.yml", emit: versions
+
+  // NOTE(review): versions.yml reports `split`, which this script never runs -
+  // confirm which tool/version should be recorded here.
+  script:
+  """
+  zcat ${fastq} | wc -l
+
+  cat <<-END_VERSIONS > versions.yml
+  "${task.process}":
+      split: v1.35.0
+  END_VERSIONS
+  """
+}
+
+/*
+ * SAMPLE_READS: run `seqtk sample` on each input FASTQ with `sample_size`
+ * and write the result gzipped as <prefix>_<input file name>.
+ * A fixed seed (-s100) is appended unless task.ext.args already sets -s.
+ */
+process SAMPLE_READS {
+  tag "$meta.id"
+  label 'small_mem_mono_cpus'
+
+  container "quay.io/biocontainers/seqtk:1.3--h5bf99c6_3"
+
+  input:
+    tuple val(meta), path(reads), val(sample_size)
+
+  output:
+    tuple val(meta), path("*.fastq.gz"), emit: reads
+    path "versions.yml", emit: versions
+
+  when:
+  task.ext.when == null || task.ext.when
+
+  script:
+  def args = task.ext.args ?: ''
+  def prefix = task.ext.prefix ?: "${meta.id}"
+  if (!(args ==~ /.*-s[0-9]+.*/)) {
+    args += " -s100"
+  }
+  if ( !sample_size ) {
+    error "SEQTK/SAMPLE must have a sample_size value included"
+  }
+  """
+  printf "%s\\n" $reads | while read f;
+  do
+    seqtk \\
+      sample \\
+      $args \\
+      \$f \\
+      $sample_size \\
+      | gzip --no-name > ${prefix}_\$(basename \$f)
+  done
+
+  cat <<-END_VERSIONS > versions.yml
+  "${task.process}":
+      seqtk: \$(echo \$(seqtk 2>&1) | sed 's/^.*Version: //; s/ .*\$//')
+  END_VERSIONS
+  """
+}