diff --git a/CHANGELOG.md b/CHANGELOG.md index 66b31d195e8a3c933d47b359fca83daf7cacb42b..5b2e2884cf8c16665dafae05dab0e331c5b436ac 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -5,9 +5,18 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 ## v1.3.0dev +* New `--keep_multi` and `--keep_dups` options. Default: false * Template update for nf-core/tools v1.11 * Minor fix to summary log messages in pipeline header +### `Fixed` + +* `--min_mapq` is ignored if `--keep_multi` is used + +### `Deprecated` + +* `--rm_dup` and `--rm_multi` are replaced by `--keep_dups` and `--keep_multi` + ## v1.2.2 - 2020-09-02 ### `Added` diff --git a/main.nf b/main.nf index 1e4842027b3fb9c31405afb779a050ff4089e2d0..a6a92f2cb1c8ec157c19f51d8d63d639e3fdb8ff 100644 --- a/main.nf +++ b/main.nf @@ -22,48 +22,49 @@ def helpMessage() { Mandatory arguments: --input [file] Path to input data (must be surrounded with quotes) + --genome [str] Name of iGenomes reference -profile [str] Configuration profile to use. Can use multiple (comma separated) Available: conda, docker, singularity, awsbatch, test and more. References If not specified in the configuration file or you wish to overwrite any of the references. - --genome [str] Name of iGenomes reference --bwt2_index [file] Path to Bowtie2 index --fasta [file] Path to Fasta reference + + Digestion Hi-C If not specified in the configuration file or you wish to set up specific digestion protocol + --ligation_site [str] Ligation motifs to trim (comma separated). Default: 'AAGCTAGCTT' + --restriction_site [str] Cutting motif(s) of restriction enzyme(s) (comma separated). Default: 'A^AGCTT' --chromosome_size [file] Path to chromosome size file --restriction_fragments [file] Path to restriction fragment file (bed) --save_reference [bool] Save reference genome to output folder. Default: False + DNase Hi-C + --dnase [bool] Run DNase Hi-C mode. All options related to restriction fragments are not considered.
Default: False + --min_cis_dist [int] Minimum intra-chromosomal distance to consider. Default: None + Alignments - --split_fastq [bool] Split fastq files in reads chunks to speed up computation. Default: false - --fastq_chunks_size [int] Size of read chunks if split_fastq is true. Default: 20000000 - --save_aligned_intermediates [bool] Save intermediates alignment files. Default: False --bwt2_opts_end2end [str] Options for bowtie2 end-to-end mappinf (first mapping step). See hic.config for default. --bwt2_opts_trimmed [str] Options for bowtie2 mapping after ligation site trimming. See hic.config for default. --min_mapq [int] Minimum mapping quality values to consider. Default: 10 - --restriction_site [str] Cutting motif(s) of restriction enzyme(s) (comma separated). Default: 'A^AGCTT' - --ligation_site [str] Ligation motifs to trim (comma separated). Default: 'AAGCTAGCTT' - --rm_singleton [bool] Remove singleton reads. Default: true - --rm_multi [bool] Remove multi-mapped reads. Default: true - --rm_dup [bool] Remove duplicates. Default: true - - Contacts calling - --min_restriction_fragment_size [int] Minimum size of restriction fragments to consider. Default: 0 - --max_restriction_fragment_size [int] Maximum size of restriction fragments to consider. Default: 0 - --min_insert_size [int] Minimum insert size of mapped reads to consider. Default: 0 - --max_insert_size [int] Maximum insert size of mapped reads to consider. Default: 0 - --save_interaction_bam [bool] Save BAM file with interaction tags (dangling-end, self-circle, etc.). Default: False + --keep_multi [bool] Keep multi-mapped reads (--min_mapq is ignored). Default: false + --keep_dups [bool] Keep duplicates. Default: false + --save_aligned_intermediates [bool] Save intermediates alignment files. Default: False + --split_fastq [bool] Split fastq files in reads chunks to speed up computation. Default: false + --fastq_chunks_size [int] Size of read chunks if split_fastq is true. 
Default: 20000000 - --dnase [bool] Run DNase Hi-C mode. All options related to restriction fragments are not considered. Default: False - --min_cis_dist [int] Minimum intra-chromosomal distance to consider. Default: 0 + Valid Pairs Detection + --min_restriction_fragment_size [int] Minimum size of restriction fragments to consider. Default: None + --max_restriction_fragment_size [int] Maximum size of restriction fragments to consider. Default: None + --min_insert_size [int] Minimum insert size of mapped reads to consider. Default: None + --max_insert_size [int] Maximum insert size of mapped reads to consider. Default: None + --save_interaction_bam [bool] Save BAM file with interaction tags (dangling-end, self-circle, etc.). Default: False Contact maps - --bin_size [int] Bin size for contact maps (comma separated). Default: '1000000,500000' + --bin_size [str] Bin size for contact maps (comma separated). Default: '1000000,500000' --ice_max_iter [int] Maximum number of iteration for ICE normalization. Default: 100 --ice_filter_low_count_perc [float] Percentage of low counts columns/rows to filter before ICE normalization. Default: 0.02 --ice_filter_high_count_perc [float] Percentage of high counts columns/rows to filter before ICE normalization. Default: 0 --ice_eps [float] Convergence criteria for ICE normalization. Default: 0.1 - Workflow --skip_maps [bool] Skip generation of contact maps. Useful for capture-C. Default: False --skip_ice [bool] Skip ICE normalization. 
Default: False @@ -243,17 +244,20 @@ summary['splitFastq'] = params.split_fastq if (params.split_fastq) summary['Read chunks Size'] = params.fastq_chunks_size summary['Fasta Ref'] = params.fasta -summary['Restriction Motif']= params.restriction_site -summary['Ligation Motif'] = params.ligation_site -summary['DNase Mode'] = params.dnase -summary['Remove Dup'] = params.rm_dup -summary['Remove MultiHits'] = params.rm_multi +if (params.restriction_site){ + summary['Restriction Motif']= params.restriction_site + summary['Ligation Motif'] = params.ligation_site + summary['Min Fragment Size']= ("$params.min_restriction_fragment_size".isInteger() ? params.min_restriction_fragment_size : 'None') + summary['Max Fragment Size']= ("$params.max_restriction_fragment_size".isInteger() ? params.max_restriction_fragment_size : 'None') + summary['Min Insert Size'] = ("$params.min_insert_size".isInteger() ? params.min_insert_size : 'None') + summary['Max Insert Size'] = ("$params.max_insert_size".isInteger() ? params.max_insert_size : 'None') +}else{ + summary['DNase Mode'] = params.dnase + summary['Min CIS dist'] = ("$params.min_cis_dist".isInteger() ? 
params.min_cis_dist : 'None') +} summary['Min MAPQ'] = params.min_mapq -summary['Min Fragment Size']= params.min_restriction_fragment_size -summary['Max Fragment Size']= params.max_restriction_fragment_size -summary['Min Insert Size'] = params.min_insert_size -summary['Max Insert Size'] = params.max_insert_size -summary['Min CIS dist'] = params.min_cis_dist +summary['Keep Duplicates'] = params.keep_dups +summary['Keep Multihits'] = params.keep_multi summary['Maps resolution'] = params.bin_size summary['Max Resources'] = "$params.max_memory memory, $params.max_cpus cpus, $params.max_time time per job" if (workflow.containerEngine) summary['Container'] = "$workflow.containerEngine - $workflow.container" @@ -526,7 +530,7 @@ if (!params.dnase){ set val(oname), file("${prefix}.mapstat") into all_mapstat script: - sample = prefix.toString() - ~/(_R1|_R2|_val_1|_val_2|_1$|_2)/ + sample = prefix.toString() - ~/(_R1|_R2|_val_1|_val_2|_1|_2)/ tag = prefix.toString() =~/_R1|_val_1|_1/ ? "R1" : "R2" oname = prefix.toString() - ~/(\.[0-9]+)$/ """ @@ -590,7 +594,7 @@ process combine_mapped_files{ saveAs: {filename -> filename.indexOf(".pairstat") > 0 ? "stats/$filename" : "$filename"} input: - set val(sample), file(aligned_bam) from bwt2_merged_bam.groupTuple() + set val(sample), file(aligned_bam) from bwt2_merged_bam.groupTuple().dump(tag:'bams') output: set val(sample), file("${sample}_bwt2pairs.bam") into paired_bam @@ -604,9 +608,11 @@ process combine_mapped_files{ oname = sample.toString() - ~/(\.[0-9]+)$/ def opts = "-t" - opts = params.rm_singleton ? "${opts}" : "--single ${opts}" - opts = params.rm_multi ? 
"${opts}" : "--multi ${opts}" - if ("$params.min_mapq".isInteger()) opts="${opts} -q ${params.min_mapq}" + if (params.keep_multi) { + opts="${opts} --multi" + }else if (params.min_mapq){ + opts="${opts} -q ${params.min_mapq}" + } """ mergeSAM.py -f ${r1_bam} -r ${r2_bam} -o ${sample}_bwt2pairs.bam ${opts} """ @@ -705,7 +711,7 @@ process remove_duplicates { file("stats/") into all_mergestat script: - if ( params.rm_dup ){ + if ( ! params.keep_dups ){ """ mkdir -p stats/${sample} diff --git a/nextflow.config b/nextflow.config index 14a4f91ac1ede1e0d1e63b33f5556b345ed53d86..32fe8f8b44e16135abcd70a20c5a875de9577aae 100644 --- a/nextflow.config +++ b/nextflow.config @@ -41,9 +41,8 @@ params { max_insert_size = 0 dnase = false min_cis_dist = 0 - rm_dup = true - rm_singleton = true - rm_multi = true + keep_dups = false + keep_multi = false bin_size = '1000000,500000' ice_max_iter = 100 ice_filer_low_count_perc = 0.02 diff --git a/nextflow_schema.json b/nextflow_schema.json index 84bb558fd02002db37df4755daa2dff36d09d89f..4f0113f0b9fdeaf76cb841a78704b6cd37ce32a0 100644 --- a/nextflow_schema.json +++ b/nextflow_schema.json @@ -5,11 +5,11 @@ "description": "Analysis of Chromosome Conformation Capture data (Hi-C)", "type": "object", "definitions": { - "input_output_options": { - "title": "Input/output options", + "mandatory_arguments": { + "title": "Mandatory arguments", "type": "object", "fa_icon": "fas fa-terminal", - "description": "Define where the pipeline should find input data and save output data.", + "description": "Mandatory arguments to run the pipeline", "required": [ "input" ], @@ -26,22 +26,11 @@ "description": "Input FastQ files for test only", "default": "undefined" }, - "split_fastq": { - "type": "boolean", - "description": "Split the reads into chunks before running the pipelne", - "fa_icon": "fas fa-dna", - "default": "false" - }, - "fastq_chunks_size":{ - "type": "integer", - "description": "Read number per chunks if split_fastq is used", - "default": 
"20000000" - }, - "single_end": { - "type": "boolean", - "description": "Specifies that the input is single-end reads.", - "fa_icon": "fas fa-align-center", - "help_text": "By default, the pipeline expects paired-end data. If you have single-end data, you need to specify `--single_end` on the command line when you launch the pipeline. A normal glob pattern, enclosed in quotation marks, can then be used for `--input`. For example:\n\n```bash\n--single_end --input '*.fastq'\n```\n\nIt is not possible to run a mixture of single-end and paired-end files in one run." + "genome": { + "type": "string", + "description": "Name of iGenomes reference.", + "fa_icon": "fas fa-book", + "help_text": "If using a reference genome configured in the pipeline using iGenomes, use this parameter to give the ID for the reference. This is then used to build the full paths for all required reference genome files e.g. `--genome GRCh38`.\n\nSee the [nf-core website docs](https://nf-co.re/usage/reference_genomes) for more details." }, "outdir": { "type": "string", @@ -64,12 +53,6 @@ "fa_icon": "fas fa-dna", "description": "Options for the reference genome indices used to align reads.", "properties": { - "genome": { - "type": "string", - "description": "Name of iGenomes reference.", - "fa_icon": "fas fa-book", - "help_text": "If using a reference genome configured in the pipeline using iGenomes, use this parameter to give the ID for the reference. This is then used to build the full paths for all required reference genome files e.g. `--genome GRCh38`.\n\nSee the [nf-core website docs](https://nf-co.re/usage/reference_genomes) for more details." - }, "fasta": { "type": "string", "fa_icon": "fas fa-font", @@ -94,6 +77,24 @@ "type": "string", "description": "Full path to directory containing Bowtie index including base name. i.e. 
`/path/to/index/base`.", "fa_icon": "far fa-file-alt" + } + } + }, + "digestion_hi_c": { + "title": "Digestion Hi-C", + "type": "object", + "description": "Parameters for protocols based on restriction enzyme", + "default": "", + "properties": { + "restriction_site": { + "type": "string", + "default": "'A^AGCTT'", + "description": "Restriction motifs used during digestion. Several motifs (comma separated) can be provided." + }, + "ligation_site": { + "type": "string", + "default": "'AAGCTAGCTT'", + "description": "Expected motif after DNA ligation. Several motifs (comma separated) can be provided." }, "chromosome_size": { "type": "string", @@ -115,41 +116,40 @@ } } }, - "data_processing_options": { - "title": "Data processing", + "dnase_hi_c": { + "title": "DNase Hi-C", "type": "object", - "description": "Parameters for Hi-C data processing", + "description": "Parameters for protocols based on DNase digestion", "default": "", - "fa_icon": "fas fa-bahai", "properties": { "dnase": { "type": "boolean", "description": "For Hi-C protocols which are not based on enzyme digestion such as DNase Hi-C" }, - "restriction_site": { - "type": "string", - "default": "'A^AGCTT'", - "description": "Restriction motifs used during digestion. Several motifs (comma separated) can be provided." - }, - "ligation_site": { - "type": "string", - "default": "'AAGCTAGCTT", - "description": "Expected motif after DNA ligation. Several motifs (comma separated) can be provided." - }, - "rm_dup": { - "type": "boolean", - "description": "Remove duplicates", - "default": true - }, - "rm_multi": { + "min_cis_dist": { + "type": "integer", + "default": "0", + "description": "Minimum distance between loci to consider. Useful for --dnase mode to remove spurious ligation products.
Only values > 0 are considered" + } + } + }, + "alignments": { + "title": "Alignments", + "type": "object", + "description": "Parameters for reads alignments", + "default": "", + "fa_icon": "fas fa-bahai", + "properties": { + "split_fastq": { "type": "boolean", - "description": "Remove multi-mapped reads", - "default": true + "description": "Split the reads into chunks before running the pipeline", + "fa_icon": "fas fa-dna", + "default": "false" }, - "rm_singleton": { - "type": "boolean", - "description": "Remove singleton", - "default": true + "fastq_chunks_size": { + "type": "integer", + "description": "Read number per chunks if split_fastq is used", + "default": "20000000" }, "min_mapq": { "type": "integer", @@ -166,27 +166,28 @@ "default": "'--very-sensitive -L 20 --score-min L,-0.6,-0.2 --end-to-end --reorder'", "description": "Option for trimmed reads mapping" }, - "save_interaction_bam": { - "type": "boolean", - "description": "Save a BAM file where all reads are flagged by their interaction classes" - }, "save_aligned_intermediates": { "type": "boolean", "description": "Save all BAM files during two-steps mapping" } } }, - "contacts_calling_options": { - "title": "Contacts calling", + "valid_pairs_detection": { + "title": "Valid Pairs Detection", "type": "object", "description": "Options to call significant interactions", "default": "", "fa_icon": "fas fa-signature", "properties": { - "min_cis_dist": { - "type": "integer", - "default": "O", - "description": "Minimum distance between loci to consider. Useful for --dnase mode to remove spurious ligation products. Only values > 0 are considered" + "keep_dups": { + "type": "string", + "description": "Keep duplicated reads", + "default": "False" + }, + "keep_multi": { + "type": "string", + "description": "Keep multi-aligned reads", + "default": "False" }, "max_insert_size": { "type": "integer", @@ -207,6 +208,10 @@ "type": "integer", "default": "0", "description": "Minimum restriction fragment size to consider.
Only values > 0 are considered" + }, + "save_interaction_bam": { + "type": "boolean", + "description": "Save a BAM file where all reads are flagged by their interaction classes" } } }, @@ -435,16 +440,22 @@ }, "allOf": [ { - "$ref": "#/definitions/input_output_options" + "$ref": "#/definitions/mandatory_arguments" }, { "$ref": "#/definitions/reference_genome_options" }, { - "$ref": "#/definitions/data_processing_options" + "$ref": "#/definitions/digestion_hi_c" + }, + { + "$ref": "#/definitions/dnase_hi_c" + }, + { + "$ref": "#/definitions/alignments" }, { - "$ref": "#/definitions/contacts_calling_options" + "$ref": "#/definitions/valid_pairs_detection" }, { "$ref": "#/definitions/contact_maps_options" @@ -462,4 +473,4 @@ "$ref": "#/definitions/institutional_config_options" } ] -} +} \ No newline at end of file