diff --git a/src/modules/sample_reads.nf b/src/modules/sample_reads.nf index 859e9c295a8e3243e760dc6a28e2109d4d4fa0c2..6bce0b3ba53e13c562448f1ab67493ec114ed9a4 100644 --- a/src/modules/sample_reads.nf +++ b/src/modules/sample_reads.nf @@ -3,13 +3,45 @@ workflow SUBSAMPLE_READ { fastq main: COUNT_READS(fastq) - COUNT_READS.out.fastq - .groupTuple(by: [2, 3], sort: true) - .view() - SAMPLE_READS(fastq_to_sample) + SORT_READS( + COUNT_READS.out.count + .map{ + it -> + [ + it[0].specie, + it[1] + ] + } + .groupTuple() + ) + SAMPLE_READS(SORT_READS.out.count + .cross( + fastq + .map{ + item -> + [ + item[0].specie, + item + ] + } + ) + .map{ + it -> + [ + it[1][1][0], + it[1][1][1], + it[0][1] + ] + } + .join( + COUNT_READS.out.count + ) + ) emit: - fastq: SAMPLE_READS.out.reads - version: SAMPLE_READS.out.version.mix(COUNT_READS.version) + fastq = SAMPLE_READS.out.reads + version = COUNT_READS.out.versions + .mix(SAMPLE_READS.out.versions) + .mix(SORT_READS.out.versions) } process COUNT_READS { @@ -22,13 +54,37 @@ process COUNT_READS { tuple val(meta), path(fastq) output: - tuple val(meta), stdout, path("*.fastq.gz"), emit: fastq + tuple val(meta), path("${meta.id}.csv"), emit: count path "versions.yml" , emit: versions script: def args = task.ext.args ?: '' """ - zcat ${fastq} | wc -l + zcat ${fastq} | wc -l > ${meta.id}.csv + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + split: v1.35.0 + END_VERSIONS + """ +} + +process SORT_READS { + tag "$specie" + label 'small_mem_mono_cpus' + container "lbmc/alpine:3.17" + + input: + tuple val(specie), path(count) + + output: + tuple val(specie), path("count.csv"), emit: count + path "versions.yml" , emit: versions + + script: + def args = task.ext.args ?: '' + """ + cat ${count} | sort -u -k1n | head -n 1 > count.csv cat <<-END_VERSIONS > versions.yml "${task.process}": @@ -40,14 +96,15 @@ process COUNT_READS { process SAMPLE_READS { tag "$meta.id" label 'small_mem_mono_cpus' + debug true - container "quay.io/biocontainers/seqtk:1.3--h5bf99c6_3" + container "lbmc/alpine:3.17" input: - tuple val(meta), path(reads), val(sample_size) + tuple val(meta), path(reads), path(sample_size), path(read_number) output: - tuple val(meta), path("*.fastq.gz"), emit: reads + tuple val(meta), path("sample_${reads}"), emit: reads path "versions.yml" , emit: versions when: @@ -63,15 +120,11 @@ process SAMPLE_READS { error "SEQTK/SAMPLE must have a sample_size value included" } """ - printf "%s\\n" $reads | while read f; - do - seqtk \\ - sample \\ - $args \\ - \$f \\ - $sample_size \\ - | gzip --no-name > ${prefix}_\$(basename \$f) - done + if [\$(cat $sample_size | tr -d '\\n') -eq \$(cat $read_number | tr -d '\\n')]; then + ln -s ${reads} sample_${reads} + else + zcat ${reads} | head -n \$(cat $sample_size | tr -d '\\n') | gzip -c > sample_${reads} + fi cat <<-END_VERSIONS > versions.yml "${task.process}":