From e5216e2fb7d5139a11da997fb53aa70a6173da38 Mon Sep 17 00:00:00 2001 From: nservant <nicolas.servant@curie.fr> Date: Wed, 2 Sep 2020 22:02:01 +0200 Subject: [PATCH] add --fastq_chunks_size parameter --- CHANGELOG.md | 1 + bin/merge_statfiles.py | 2 +- docs/usage.md | 38 +++++++++++++++++++++----------------- main.nf | 32 +++++++++++++++++--------------- nextflow.config | 11 ++++++----- nextflow_schema.json | 42 ++++++++++++++++++++++++------------------ 6 files changed, 70 insertions(+), 56 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index b72dc20..719ea08 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -8,6 +8,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 ### `Added` * Template update for nf-core/tools v1.10.2 +* Add the `--fastq_chunks_size` to specify the number of reads per chunks if split_fastq is true ### `Fixed` diff --git a/bin/merge_statfiles.py b/bin/merge_statfiles.py index 469cacd..dc11bf7 100755 --- a/bin/merge_statfiles.py +++ b/bin/merge_statfiles.py @@ -48,7 +48,7 @@ if __name__ == "__main__": if not line.startswith("#"): lsp = line.strip().split("\t") data = map(num, lsp[1:len(lsp)]) - template[str(lsp[0])] = data + template[str(lsp[0])] = list(data) if len(template) == 0: print("Cannot find template files !") diff --git a/docs/usage.md b/docs/usage.md index d4af9fd..11b0653 100644 --- a/docs/usage.md +++ b/docs/usage.md @@ -517,75 +517,79 @@ normalization. Default: 0.1 ## Inputs/Outputs -### `--splitFastq` +### `--split_fastq` By default, the nf-core Hi-C pipeline expects one read pairs per sample. However, for large Hi-C data processing single fastq files can be very time consuming. -The `--splitFastq` option allows to automatically split input read pairs -into chunks of reads. In this case, all chunks will be processed in parallel +The `--split_fastq` option allows to automatically split input read pairs +into chunks of reads of size `--fastq_chunks_size` (Default : 20000000). 
In this case, all chunks will be processed in parallel and merged before generating the contact maps, thus leading to a significant increase of processing performance. ```bash ---splitFastq '[Number of reads per chunk]' +--split_fastq --fastq_chunks_size '[numeric]' ``` -### `--saveReference` +### `--save_reference` If specified, annotation files automatically generated from the `--fasta` file are exported in the results folder. Default: false ```bash ---saveReference +--save_reference ``` -### `--saveAlignedIntermediates` +### `--save_aligned_intermediates` If specified, all intermediate mapping files are saved and exported in the results folder. Default: false ```bash ---saveReference +--save_aligned_intermediates ``` -### `--saveInteractionBAM` +### `--save_interaction_bam` If specified, write a BAM file with all classified reads (valid paires, dangling end, self-circle, etc.) and its tags. +```bash +--save_interaction_bam +``` + ## Skip options -### `--skipMaps` +### `--skip_maps` If defined, the workflow stops with the list of valid interactions, and the genome-wide maps are not built. Usefult for capture-C analysis. Default: false ```bash ---skipMaps +--skip_maps ``` -### `--skipIce` +### `--skip_ice` If defined, the ICE normalization is not run on the raw contact maps. Default: false ```bash ---skipIce +--skip_ice ``` -### `--skipCool` +### `--skip_cool` If defined, cooler files are not generated. Default: false ```bash ---skipCool +--skip_cool ``` -### `--skipMultiQC` +### `--skip_multiQC` If defined, the MultiQC report is not generated. Default: false ```bash ---skipMultiQC +--skip_multiQC ``` diff --git a/main.nf b/main.nf index 5763152..495a1fe 100644 --- a/main.nf +++ b/main.nf @@ -34,7 +34,8 @@ def helpMessage() { --save_reference [bool] Save reference genome to output folder. Default: False Alignments - --split_fastq [int] Size of read chuncks to use to speed up the workflow. 
Default: None + --split_fastq [bool] Split fastq files in reads chunks to speed up computation. Default: None + --fastq_chunks_size [int] Size of read chunks if split_fastq is true. Default: 20000000 --save_aligned_intermediates [bool] Save intermediates alignment files. Default: False --bwt2_opts_end2end [str] Options for bowtie2 end-to-end mappinf (first mapping step). See hic.config for default. --bwt2_opts_trimmed [str] Options for bowtie2 mapping after ligation site trimming. See hic.config for default. @@ -46,14 +47,14 @@ def helpMessage() { --rm_dup [bool] Remove duplicates. Default: true Contacts calling - --min_restriction_fragment_size [int] Minimum size of restriction fragments to consider. Default: None - --max_restriction_fragment_size [int] Maximum size of restriction fragments to consider. Default: None - --min_insert_size [int] Minimum insert size of mapped reads to consider. Default: None - --max_insert_size [int] Maximum insert size of mapped reads to consider. Default: None + --min_restriction_fragment_size [int] Minimum size of restriction fragments to consider. Default: 0 + --max_restriction_fragment_size [int] Maximum size of restriction fragments to consider. Default: 0 + --min_insert_size [int] Minimum insert size of mapped reads to consider. Default: 0 + --max_insert_size [int] Maximum insert size of mapped reads to consider. Default: 0 --save_interaction_bam [bool] Save BAM file with interaction tags (dangling-end, self-circle, etc.). Default: False --dnase [bool] Run DNase Hi-C mode. All options related to restriction fragments are not considered. Default: False - --min_cis_dist [int] Minimum intra-chromosomal distance to consider. Default: None + --min_cis_dist [int] Minimum intra-chromosomal distance to consider. Default: 0 Contact maps --bin_size [int] Bin size for contact maps (comma separated). 
Default: '1000000,500000' @@ -163,7 +164,7 @@ if (params.input_paths){ if ( params.split_fastq ){ raw_reads_full = raw_reads.concat( raw_reads_2 ) - raw_reads = raw_reads_full.splitFastq( by: params.split_fastq , file: true) + raw_reads = raw_reads_full.splitFastq( by: params.fastq_chunks_size, file: true) }else{ raw_reads = raw_reads.concat( raw_reads_2 ).dump(tag: "data") } @@ -239,6 +240,8 @@ if(workflow.revision) summary['Pipeline Release'] = workflow.revision summary['Run Name'] = custom_runName ?: workflow.runName summary['Input'] = params.input summary['splitFastq'] = params.split_fastq +if (params.split_fastq) + summary['Read chunks Size'] = params.fastq_chunks_size summary['Fasta Ref'] = params.fasta summary['Restriction Motif']= params.restriction_site summary['Ligation Motif'] = params.ligation_site @@ -647,12 +650,12 @@ if (!params.dnase){ } def opts = "" - if ("$params.min_cis_dist".isInteger()) opts="${opts} -d ${params.min_cis_dist}" - if ("$params.min_insert_size".isInteger()) opts="${opts} -s ${params.min_insert_size}" - if ("$params.max_insert_size".isInteger()) opts="${opts} -l ${params.max_insert_size}" - if ("$params.min_restriction_fragment_size".isInteger()) opts="${opts} -t ${params.min_restriction_fragment_size}" - if ("$params.max_restriction_fragment_size".isInteger()) opts="${opts} -m ${params.max_restriction_fragment_size}" - if (params.save_interaction_bam) opts="${opts} --sam" + opts += params.min_cis_dist > 0 ? " -d ${params.min_cis_dist}" : '' + opts += params.min_insert_size > 0 ? " -s ${params.min_insert_size}" : '' + opts += params.max_insert_size > 0 ? " -l ${params.max_insert_size}" : '' + opts += params.min_restriction_fragment_size > 0 ? " -t ${params.min_restriction_fragment_size}" : '' + opts += params.max_restriction_fragment_size > 0 ? " -m ${params.max_restriction_fragment_size}" : '' + opts += params.save_interaction_bam ? 
" --sam" : '' prefix = pe_bam.toString() - ~/.bam/ """ mapped_2hic_fragments.py -f ${frag_file} -r ${pe_bam} --all ${opts} @@ -680,8 +683,7 @@ else{ sample = sample.toString() - ~/(\.[0-9]+)$/ } - def opts = "" - if ("$params.min_cis_dist".isInteger()) opts="${opts} -d ${params.min_cis_dist}" + opts = params.min_cis_dist > 0 ? " -d ${params.min_cis_dist}" : '' prefix = pe_bam.toString() - ~/.bam/ """ mapped_2hic_dnase.py -r ${pe_bam} ${opts} diff --git a/nextflow.config b/nextflow.config index edb8038..c765a4a 100644 --- a/nextflow.config +++ b/nextflow.config @@ -17,6 +17,7 @@ params { genome = false input_paths = false split_fastq = false + fastq_chunks_size = 20000000 chromosome_size = false restriction_fragments = false skip_maps = false @@ -34,12 +35,12 @@ params { // Digestion Hi-C restriction_site = 'A^AGCTT' ligation_site = 'AAGCTAGCTT' - min_restriction_fragment_size = false - max_restriction_fragment_size = false - min_insert_size = false - max_insert_size = false + min_restriction_fragment_size = 0 + max_restriction_fragment_size = 0 + min_insert_size = 0 + max_insert_size = 0 dnase = false - min_cis_dist = false + min_cis_dist = 0 rm_dup = true rm_singleton = true rm_multi = true diff --git a/nextflow_schema.json b/nextflow_schema.json index ed2f701..9071bd2 100644 --- a/nextflow_schema.json +++ b/nextflow_schema.json @@ -27,10 +27,16 @@ "default": "undefined" }, "split_fastq": { - "type": "number", - "description": "Split the reads into chunks before running. 
Specify the number of reads per chuncks as --split_fastq 20000000.", - "fa_icon": "fas fa-dna" + "type": "boolean", + "description": "Split the reads into chunks before running the pipeline", + "fa_icon": "fas fa-dna", + "default": "false" }, + "fastq_chunks_size":{ + "type": "integer", + "description": "Read number per chunks if split_fastq is used", + "default": "20000000" + }, "single_end": { "type": "boolean", "description": "Specifies that the input is single-end reads.", @@ -178,29 +184,29 @@ "fa_icon": "fas fa-signature", "properties": { "min_cis_dist": { - "type": "string", - "default": "undefined", - "description": "Minimum distance between loci to consider. Useful for --dnase mode to remove spurious ligation products" + "type": "integer", + "default": "0", + "description": "Minimum distance between loci to consider. Useful for --dnase mode to remove spurious ligation products. Only values > 0 are considered" }, "max_insert_size": { - "type": "string", - "default": "undefined", - "description": "Maximum fragment size to consider" + "type": "integer", + "default": "0", + "description": "Maximum fragment size to consider. Only values > 0 are considered" }, "min_insert_size": { - "type": "string", - "default": "undefined", - "description": "Minimum fragment size to consider" + "type": "integer", + "default": "0", + "description": "Minimum fragment size to consider. Only values > 0 are considered" }, "max_restriction_fragment_size": { - "type": "string", - "default": "undefined", - "description": "Maximum restriction fragment size to consider" + "type": "integer", + "default": "0", + "description": "Maximum restriction fragment size to consider. Only values > 0 are considered" }, "min_restriction_fragment_size": { - "type": "string", - "default": "undefined", - "description": "Minimum restriction fragment size to consider" + "type": "integer", + "default": "0", + "description": "Minimum restriction fragment size to consider. 
Only values > 0 are considered" } } }, -- GitLab