Skip to content
Snippets Groups Projects
sample_reads.nf 3.21 KiB
Newer Older
// Down-sample every FASTQ of a species to the smallest line count observed
// for that species.
//
// take:
//   fastq   - channel of [ meta, reads ]; meta must expose `specie` (and `id`,
//             which the processes use for tags and file names).
// emit:
//   fastq   - channel of [ meta, sampled reads ] from SAMPLE_READS
//   version - versions.yml files of the three processes, mixed together
workflow SUBSAMPLE_READ {
    take:
      fastq

    main:
      // One count file per sample: [ meta, "<id>.csv" ]
      COUNT_READS(fastq)

      // Gather the count files of each species and keep the smallest count:
      // [ specie, [ csv, csv, ... ] ] -> [ specie, count.csv ]
      SORT_READS(
        COUNT_READS.out.count
            .map { counted -> [ counted[0].specie, counted[1] ] }
            .groupTuple()
      )

      // Key every input sample by its species so it can be paired with the
      // per-species minimum.
      fastq_by_specie = fastq
        .map { item -> [ item[0].specie, item ] }

      // cross() is not commutative: the per-species channel (one item per key)
      // must be the source. Each emitted pair is
      // [ [ specie, count.csv ], [ specie, [ meta, reads ] ] ].
      sample_input = SORT_READS.out.count
        .cross(fastq_by_specie)
        .map { pair ->
            def min_count = pair[0][1]
            def sample    = pair[1][1]
            [ sample[0], sample[1], min_count ]
        }
        // Append the sample's own count file (joined on meta):
        // [ meta, reads, count.csv, "<id>.csv" ]
        .join(COUNT_READS.out.count)

      SAMPLE_READS(sample_input)

    emit:
      fastq = SAMPLE_READS.out.reads
      version = COUNT_READS.out.versions
        .mix(SAMPLE_READS.out.versions)
        .mix(SORT_READS.out.versions)
}

// Count the lines of a gzip-compressed reads file.
//
// input : tuple (meta, fastq) - meta must expose `id`
// output: count    - tuple (meta, "<meta.id>.csv") holding the `wc -l` result
//         versions - versions.yml
//
// The value written is a LINE count (`wc -l`), not a record count.
// NOTE(review): the versions entry is labelled `split` although the script
// only calls zcat and wc - confirm the intended tool name.
process COUNT_READS {
    tag "$meta.id"
    label 'small_mem_mono_cpus'

    container "lbmc/alpine:3.17"

    input:
    tuple val(meta), path(fastq)

    output:
    tuple val(meta), path("${meta.id}.csv"), emit: count 
    path "versions.yml"           , emit: versions

    script:
    """
    zcat ${fastq} | wc -l > ${meta.id}.csv

    cat <<-END_VERSIONS > versions.yml
    "${task.process}":
        split: v1.35.0
    END_VERSIONS
    """
}

// Reduce the count files of one species to the smallest count.
//
// input : tuple (specie, count) - count is the staged count file(s) of the species
// output: count    - tuple (specie, "count.csv") holding the minimum value
//         versions - versions.yml
//
// NOTE(review): the versions entry is labelled `split` although the script
// only calls cat, sort and head - confirm the intended tool name.
process SORT_READS {
    tag "$specie"
    label 'small_mem_mono_cpus'
    container "lbmc/alpine:3.17"

    input:
    tuple val(specie), path(count)

    output:
    tuple val(specie), path("count.csv"), emit: count
    path "versions.yml"           , emit: versions

    script:
    // Numeric ascending sort of all counts; the first line is the minimum.
    """
    cat ${count} | sort -u -k1n | head -n 1 > count.csv

    cat <<-END_VERSIONS > versions.yml
    "${task.process}":
        split: v1.35.0
    END_VERSIONS
    """
}

// Truncate a gzip-compressed reads file to a target number of lines.
//
// input : tuple (meta, reads, sample_size, read_number)
//           reads       - gzip-compressed reads file
//           sample_size - file holding the target line count
//           read_number - file holding the line count of `reads`
// output: reads    - tuple (meta, "sample_<reads>") with the first
//                    `sample_size` lines of `reads`, gzip-compressed
//         versions - versions.yml
//
// The task fails when the target exceeds the available lines or when the
// written file does not contain exactly the target number of lines.
//
// NOTE(review): no container is declared here and the versions stanza calls
// `seqtk`, which the script itself never uses - confirm the intended image
// and the tool that should be reported.
process SAMPLE_READS {
    tag "$meta.id"
    label 'small_mem_mono_cpus'

    input:
    tuple val(meta), path(reads), path(sample_size), path(read_number)

    output:
    tuple val(meta), path("sample_${reads}"), emit: reads
    path "versions.yml"                , emit: versions

    when:
    task.ext.when == null || task.ext.when

    script:
    if ( !sample_size ) {
        error "SAMPLE_READS must have a sample_size value included"
    }
    """
    SAMPLE_SIZE=\$(cat ${sample_size} | tr -d '\\n')
    CURRENT_READ_NUMBER=\$(cat ${read_number} | tr -d '\\n')

    # The target can never be larger than what the file holds.
    if [ "\$SAMPLE_SIZE" -gt "\$CURRENT_READ_NUMBER" ]; then
        echo "sample size \$SAMPLE_SIZE exceeds the \$CURRENT_READ_NUMBER lines available in ${reads}" >&2
        exit 1
    fi

    # Keep the first SAMPLE_SIZE lines; when the file is already at the target
    # size this reproduces it in full.
    zcat ${reads} | head -n "\$SAMPLE_SIZE" | gzip -c > sample_${reads}

    # Verify the written file really has the requested number of lines.
    READ_NUMBER=\$(zcat sample_${reads} | wc -l | tr -d '\\n')
    if [ "\$SAMPLE_SIZE" -ne "\$READ_NUMBER" ]; then
        echo "sample_${reads} holds \$READ_NUMBER lines, expected \$SAMPLE_SIZE" >&2
        exit 1
    fi

    cat <<-END_VERSIONS > versions.yml
    "${task.process}":
        seqtk: \$(echo \$(seqtk 2>&1) | sed 's/^.*Version: //; s/ .*\$//')
    END_VERSIONS
    """
}