From c9223c8d1f7b0600c01957855cb63dd538182242 Mon Sep 17 00:00:00 2001 From: nservant <nicolas.servant@curie.fr> Date: Sat, 28 Nov 2020 14:04:49 +0100 Subject: [PATCH] [MODIF] new digestion parameter --- CHANGELOG.md | 1 + conf/test.config | 10 ++---- docs/usage.md | 86 +++++++++++++++++++++++++++----------------- main.nf | 14 ++++++-- nextflow.config | 60 +++++++++++++++++++++---------- nextflow_schema.json | 5 +++ 6 files changed, 113 insertions(+), 63 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 19d8c62..3063cbd 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -5,6 +5,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 ## v1.3.0dev +* New `--digestion` parameter to automatically set the restriction_site and ligation_site motifs * New `--keep_multi` and `keep_dup` options. Default: false * Template update for nf-core/tools v1.11 * Minor fix to summary log messages in pipeline header diff --git a/conf/test.config b/conf/test.config index 2ab8e57..5b3ac0e 100644 --- a/conf/test.config +++ b/conf/test.config @@ -24,14 +24,8 @@ config_profile_name = 'Hi-C test data from Schalbetter et al. (2017)' // Annotations fasta = 'https://github.com/nf-core/test-datasets/raw/hic/reference/W303_SGD_2015_JRIU00000000.fsa' - restriction_site = 'A^AGCTT' - ligation_site = 'AAGCTAGCTT' - - min_mapq = 2 - rm_dup = true - rm_singleton = true - rm_multi = true - + digestion = 'hindiii' + min_mapq = 20 min_restriction_fragment_size = 100 max_restriction_fragment_size = 100000 min_insert_size = 100 diff --git a/docs/usage.md b/docs/usage.md index 31eabe9..cbaab42 100644 --- a/docs/usage.md +++ b/docs/usage.md @@ -188,6 +188,31 @@ We recommend adding the following line to your environment to limit this NXF_OPTS='-Xms1g -Xmx4g' ``` +## Use case + +### Hi-C digestion protocol + +Here is an command line example for standard DpnII digestion protocols. +Alignment will be performed on the `mm10` genome with default paramters. +Multi-hits will not be considered and duplicates will be removed. +Note that by default, no filters are applied on DNA and restriction fragment sizes. + + +```bash +nextflow run main.nf --input './*_R{1,2}.fastq.gz' --genome 'mm10' --digestion 'dnpii' +``` + +### DNase Hi-C protocol + +Here is an command line example for DNase protocol. +Alignment will be performed on the `mm10` genome with default paramters. +Multi-hits will not be considered and duplicates will be removed. +Contacts involving fragments separated by less than 1000bp will be discarded. + +``` +nextflow run main.nf --input './*_R{1,2}.fastq.gz' --genome 'mm10' --dnase --min_cis 1000 +``` + ## Inputs ### `--input` @@ -207,16 +232,7 @@ notation to specify read pairs. If left unspecified, a default pattern is used: `data/*{1,2}.fastq.gz` -By default, the pipeline expects paired-end data. If you have single-end data, -you need to specify `--single_end` on the command line when you launch the pipeline. -A normal glob pattern, enclosed in quotation marks, can then be used for `--input`. -For example: - -```bash ---single_end --input '*.fastq' -``` - -It is not possible to run a mixture of single-end and paired-end files in one run. +Note that the Hi-C data analysis requires paired-end data. ## Reference genomes @@ -302,7 +318,7 @@ Note that the `--restriction_site` parameter is mandatory to create this file. ## Hi-C specific options -The following options are defined in the `hicpro.config` file, and can be +The following options are defined in the `nextflow.config` file, and can be updated either using a custom configuration file (see `-c` option) or using command line parameter. @@ -345,17 +361,28 @@ Minimum mapping quality. Reads with lower quality are discarded. Default: 10 ### Digestion Hi-C +#### `--digestion` + +This parameter allows to automatically set the `--restriction_site` and +`--ligation_site` parameter according to the restriction enzyme you used. +Available keywords are 'hindiii', 'dpnii', 'mboi', 'arima'. + +```bash +--digestion 'hindiii' +``` + #### `--restriction_site` -Restriction motif(s) for Hi-C digestion protocol. The restriction motif(s) -is(are) used to generate the list of restriction fragments. +If the restriction enzyme is not available through the `--digestion` +parameter, you can also defined manually the restriction motif(s) for +Hi-C digestion protocol. +The restriction motif(s) is(are) used to generate the list of restriction fragments. The precise cutting site of the restriction enzyme has to be specified using the '^' character. Default: 'A^AGCTT' Here are a few examples: * MboI: ^GATC * DpnII: ^GATC -* BglII: A^GATCT * HindIII: A^AGCTT * ARIMA kit: ^GATC,G^ANTC @@ -383,7 +410,7 @@ Exemple of the ARIMA kit: GATCGATC,GANTGATC,GANTANTC,GATCANTC #### `--min_restriction_fragment_size` Minimum size of restriction fragments to consider for the Hi-C processing. -Default: '' +Default: '0' - no filter ```bash --min_restriction_fragment_size '[numeric]' @@ -392,7 +419,7 @@ Default: '' #### `--max_restriction_fragment_size` Maximum size of restriction fragments to consider for the Hi-C processing. -Default: '' +Default: '0' - no filter ```bash --max_restriction_fragment_size '[numeric]' @@ -401,7 +428,7 @@ Default: '' #### `--min_insert_size` Minimum reads insert size. Shorter 3C products are discarded. -Default: '' +Default: '0' - no filter ```bash --min_insert_size '[numeric]' @@ -410,7 +437,7 @@ Default: '' #### `--max_insert_size` Maximum reads insert size. Longer 3C products are discarded. -Default: '' +Default: '0' - no filter ```bash --max_insert_size '[numeric]' @@ -434,36 +461,29 @@ to remove spurious ligation products. #### `--min_cis_dist` Filter short range contact below the specified distance. -Mainly useful for DNase Hi-C. Default: '' +Mainly useful for DNase Hi-C. Default: '0' ```bash --min_cis_dist '[numeric]' ``` -#### `--rm_singleton` - -If specified, singleton reads are discarded at the mapping step. - -```bash ---rm_singleton -``` - -#### `--rm_dup` +#### `--keep_dups` -If specified, duplicates reads are discarded before building contact maps. +If specified, duplicates reads are not discarded before building contact maps. ```bash ---rm_dup +--keep_dups ``` -#### `--rm_multi` +#### `--keep_multi` -If specified, reads that aligned multiple times on the genome are discarded. +If specified, reads that aligned multiple times on the genome are not discarded. Note the default mapping options are based on random hit assignment, meaning that only one position is kept per read. +Note that in this case the `--min_mapq` parameter is ignored. ```bash ---rm_multi +--keep_multi ``` ## Genome-wide contact maps diff --git a/main.nf b/main.nf index fd773e9..fe719af 100644 --- a/main.nf +++ b/main.nf @@ -31,8 +31,9 @@ def helpMessage() { --fasta [file] Path to Fasta reference Digestion Hi-C If not specified in the configuration file or you wish to set up specific digestion protocol - --ligation_site [str] Ligation motifs to trim (comma separated). Default: 'AAGCTAGCTT' - --restriction_site [str] Cutting motif(s) of restriction enzyme(s) (comma separated). Default: 'A^AGCTT' + --digestion [str] Digestion Hi-C. Name of restriction enzyme used for digestion pre-configuration. Default: 'hindiii' + --ligation_site [str] Ligation motifs to trim (comma separated) if not available in --digestion. Default: false + --restriction_site [str] Cutting motif(s) of restriction enzyme(s) (comma separated) if not available in --digestion. Default: false --chromosome_size [file] Path to chromosome size file --restriction_fragments [file] Path to restriction fragment file (bed) --save_reference [bool] Save reference genome to output folder. Default: False @@ -101,9 +102,15 @@ if (params.genomes && params.genome && !params.genomes.containsKey(params.genome exit 1, "The provided genome '${params.genome}' is not available in the iGenomes file. Currently the available genomes are ${params.genomes.keySet().join(", ")}" } +if (params.digest && params.digestion && !params.digest.containsKey(params.digestion)) { + exit 1, "Unknown digestion protocol. Currently, the available digestion options are ${params.digest.keySet().join(", ")}. Please set manually the '--restriction_site' and '--ligation_site' parameters." +} +params.restriction_site = params.digestion ? params.digest[ params.digestion ].restriction_site ?: false : false +params.ligation_site = params.digestion ? params.digest[ params.digestion ].ligation_site ?: false : false + // Check Digestion or DNase Hi-C mode if (!params.dnase && !params.ligation_site) { - exit 1, "Ligation motif not found. For DNase Hi-C, please use '--dnase' option" + exit 1, "Ligation motif not found. Please either use the `--digestion` parameters or specify the `--restriction_site` and `--ligation_site`. For DNase Hi-C, please use '--dnase' option" } // Reference index path configuration @@ -245,6 +252,7 @@ if (params.split_fastq) summary['Read chunks Size'] = params.fastq_chunks_size summary['Fasta Ref'] = params.fasta if (params.restriction_site){ + summary['Digestion'] = params.digestion summary['Restriction Motif']= params.restriction_site summary['Ligation Motif'] = params.ligation_site summary['Min Fragment Size']= ("$params.min_restriction_fragment_size".isInteger() ? params.min_restriction_fragment_size : 'None') diff --git a/nextflow.config b/nextflow.config index b5089d5..3a75a34 100644 --- a/nextflow.config +++ b/nextflow.config @@ -8,49 +8,71 @@ // Global default params, used in configs params { - // Workflow flags + // Inputs / outputs genome = false input = "data/*{1,2}.fastq.gz" - outdir = './results' genome = false input_paths = false - split_fastq = false - fastq_chunks_size = 20000000 chromosome_size = false restriction_fragments = false - skip_maps = false - skip_ice = false - skip_cool = false - skip_multiqc = false save_reference = false - save_interaction_bam = false - save_aligned_intermediates = false - - bwt2_opts_end2end = '--very-sensitive -L 30 --score-min L,-0.6,-0.2 --end-to-end --reorder' - bwt2_opts_trimmed = '--very-sensitive -L 20 --score-min L,-0.6,-0.2 --end-to-end --reorder' - min_mapq = 10 - + // Digestion Hi-C - restriction_site = 'A^AGCTT' - ligation_site = 'AAGCTAGCTT' + digestion = false + digest { + 'hindiii'{ + restriction_site='A^AGCTT' + ligation_site='AAGCTAGCTT' + } + 'mboi' { + restriction_site='^GATC' + ligation_site='GATCGATC' + } + 'dpnii' { + restriction_site='^GATC' + ligation_site='GATCGATC' + } + 'arima' { + restriction_site='^GATC,G^ANT' + ligation_site='GATCGATC,GATCGANT,GANTGATC,GANTGANT' + } + } min_restriction_fragment_size = 0 max_restriction_fragment_size = 0 min_insert_size = 0 max_insert_size = 0 + + // Dnase Hi-C dnase = false min_cis_dist = 0 + + // Mapping + split_fastq = false + fastq_chunks_size = 20000000 + save_interaction_bam = false + save_aligned_intermediates = false + bwt2_opts_end2end = '--very-sensitive -L 30 --score-min L,-0.6,-0.2 --end-to-end --reorder' + bwt2_opts_trimmed = '--very-sensitive -L 20 --score-min L,-0.6,-0.2 --end-to-end --reorder' keep_dups = false keep_multi = false + min_mapq = 10 + + // Contact maps bin_size = '1000000,500000' ice_max_iter = 100 ice_filer_low_count_perc = 0.02 ice_filer_high_count_perc = 0 ice_eps = 0.1 - - publish_dir_mode = 'copy' + // Workflow + skip_maps = false + skip_ice = false + skip_cool = false + skip_multiqc = false + // Boilerplate options + publish_dir_mode = 'copy' multiqc_config = false name = false email = false diff --git a/nextflow_schema.json b/nextflow_schema.json index 4f0113f..c0733c3 100644 --- a/nextflow_schema.json +++ b/nextflow_schema.json @@ -86,6 +86,11 @@ "description": "Parameters for protocols based on restriction enzyme", "default": "", "properties": { + "digestion": { + "type": "string", + "default": "hindiii", + "description": "Name of restriction enzyme to automatically set the restriction_site and ligation_site options" + }, "restriction_site": { "type": "string", "default": "'A^AGCTT'", -- GitLab