From c9223c8d1f7b0600c01957855cb63dd538182242 Mon Sep 17 00:00:00 2001
From: nservant <nicolas.servant@curie.fr>
Date: Sat, 28 Nov 2020 14:04:49 +0100
Subject: [PATCH] [MODIF] new digestion parameter

---
 CHANGELOG.md         |  1 +
 conf/test.config     | 10 ++----
 docs/usage.md        | 86 +++++++++++++++++++++++++++-----------------
 main.nf              | 14 ++++++--
 nextflow.config      | 60 +++++++++++++++++++++----------
 nextflow_schema.json |  5 +++
 6 files changed, 113 insertions(+), 63 deletions(-)

diff --git a/CHANGELOG.md b/CHANGELOG.md
index 19d8c62..3063cbd 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -5,6 +5,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
 
 ## v1.3.0dev
 
+* New `--digestion` parameter to automatically set the restriction_site and ligation_site motifs
 * New `--keep_multi` and `keep_dup` options. Default: false
 * Template update for nf-core/tools v1.11
 * Minor fix to summary log messages in pipeline header
diff --git a/conf/test.config b/conf/test.config
index 2ab8e57..5b3ac0e 100644
--- a/conf/test.config
+++ b/conf/test.config
@@ -24,14 +24,8 @@ config_profile_name = 'Hi-C test data from Schalbetter et al. (2017)'
 
   // Annotations
   fasta = 'https://github.com/nf-core/test-datasets/raw/hic/reference/W303_SGD_2015_JRIU00000000.fsa'
-  restriction_site = 'A^AGCTT'
-  ligation_site = 'AAGCTAGCTT'
-  
-  min_mapq = 2
-  rm_dup = true
-  rm_singleton = true
-  rm_multi = true
-
+  digestion = 'hindiii'
+  min_mapq = 20
   min_restriction_fragment_size = 100
   max_restriction_fragment_size = 100000
   min_insert_size = 100
diff --git a/docs/usage.md b/docs/usage.md
index 31eabe9..cbaab42 100644
--- a/docs/usage.md
+++ b/docs/usage.md
@@ -188,6 +188,31 @@ We recommend adding the following line to your environment to limit this
 NXF_OPTS='-Xms1g -Xmx4g'
 ```
 
+## Use case
+
+### Hi-C digestion protocol
+
+Here is an command line example for standard DpnII digestion protocols.
+Alignment will be performed on the `mm10` genome with default paramters.
+Multi-hits will not be considered and duplicates will be removed.
+Note that by default, no filters are applied on DNA and restriction fragment sizes. 
+
+
+```bash
+nextflow run main.nf --input './*_R{1,2}.fastq.gz' --genome 'mm10' --digestion 'dnpii'
+```
+
+### DNase Hi-C protocol
+
+Here is an command line example for DNase protocol.
+Alignment will be performed on the `mm10` genome with default paramters.
+Multi-hits will not be considered and duplicates will be removed.
+Contacts involving fragments separated by less than 1000bp will be discarded.
+
+```
+nextflow run main.nf --input './*_R{1,2}.fastq.gz' --genome 'mm10' --dnase --min_cis 1000
+```
+
 ## Inputs
 
 ### `--input`
@@ -207,16 +232,7 @@ notation to specify read pairs.
 
 If left unspecified, a default pattern is used: `data/*{1,2}.fastq.gz`
 
-By default, the pipeline expects paired-end data. If you have single-end data,
-you need to specify `--single_end` on the command line when you launch the pipeline.
-A normal glob pattern, enclosed in quotation marks, can then be used for `--input`.
-For example:
-
-```bash
---single_end --input '*.fastq'
-```
-
-It is not possible to run a mixture of single-end and paired-end files in one run.
+Note that the Hi-C data analysis requires paired-end data.
 
 ## Reference genomes
 
@@ -302,7 +318,7 @@ Note that the `--restriction_site` parameter is mandatory to create this file.
 
 ## Hi-C specific options
 
-The following options are defined in the `hicpro.config` file, and can be
+The following options are defined in the `nextflow.config` file, and can be
 updated either using a custom configuration file (see `-c` option) or using
 command line parameter.
 
@@ -345,17 +361,28 @@ Minimum mapping quality. Reads with lower quality are discarded. Default: 10
 
 ### Digestion Hi-C
 
+#### `--digestion`
+
+This parameter allows to automatically set the `--restriction_site` and
+`--ligation_site` parameter according to the restriction enzyme you used.
+Available keywords are  'hindiii', 'dpnii', 'mboi', 'arima'.
+
+```bash
+--digestion 'hindiii'
+```
+
 #### `--restriction_site`
 
-Restriction motif(s) for Hi-C digestion protocol. The restriction motif(s)
-is(are) used to generate the list of restriction fragments.
+If the restriction enzyme is not available through the `--digestion`
+parameter, you can also defined manually the restriction motif(s) for
+Hi-C digestion protocol.
+The restriction motif(s) is(are) used to generate the list of restriction fragments.
 The precise cutting site of the restriction enzyme has to be specified using
 the '^' character. Default: 'A^AGCTT'
 Here are a few examples:
 
 * MboI: ^GATC
 * DpnII: ^GATC
-* BglII: A^GATCT
 * HindIII: A^AGCTT
 * ARIMA kit: ^GATC,G^ANTC
 
@@ -383,7 +410,7 @@ Exemple of the ARIMA kit: GATCGATC,GANTGATC,GANTANTC,GATCANTC
 #### `--min_restriction_fragment_size`
 
 Minimum size of restriction fragments to consider for the Hi-C processing.
-Default: ''
+Default: '0' - no filter
 
 ```bash
 --min_restriction_fragment_size '[numeric]'
@@ -392,7 +419,7 @@ Default: ''
 #### `--max_restriction_fragment_size`
 
 Maximum size of restriction fragments to consider for the Hi-C processing.
-Default: ''
+Default: '0' - no filter
 
 ```bash
 --max_restriction_fragment_size '[numeric]'
@@ -401,7 +428,7 @@ Default: ''
 #### `--min_insert_size`
 
 Minimum reads insert size. Shorter 3C products are discarded.
-Default: ''
+Default: '0' - no filter
 
 ```bash
 --min_insert_size '[numeric]'
@@ -410,7 +437,7 @@ Default: ''
 #### `--max_insert_size`
 
 Maximum reads insert size. Longer 3C products are discarded.
-Default: ''
+Default: '0' - no filter
 
 ```bash
 --max_insert_size '[numeric]'
@@ -434,36 +461,29 @@ to remove spurious ligation products.
 #### `--min_cis_dist`
 
 Filter short range contact below the specified distance.
-Mainly useful for DNase Hi-C. Default: ''
+Mainly useful for DNase Hi-C. Default: '0'
 
 ```bash
 --min_cis_dist '[numeric]'
 ```
 
-#### `--rm_singleton`
-
-If specified, singleton reads are discarded at the mapping step.
-
-```bash
---rm_singleton
-```
-
-#### `--rm_dup`
+#### `--keep_dups`
 
-If specified, duplicates reads are discarded before building contact maps.
+If specified, duplicates reads are not discarded before building contact maps.
 
 ```bash
---rm_dup
+--keep_dups
 ```
 
-#### `--rm_multi`
+#### `--keep_multi`
 
-If specified, reads that aligned multiple times on the genome are discarded.
+If specified, reads that aligned multiple times on the genome are not discarded.
 Note the default mapping options are based on random hit assignment, meaning
 that only one position is kept per read.
+Note that in this case the `--min_mapq` parameter is ignored.
 
 ```bash
---rm_multi
+--keep_multi
 ```
 
 ## Genome-wide contact maps
diff --git a/main.nf b/main.nf
index fd773e9..fe719af 100644
--- a/main.nf
+++ b/main.nf
@@ -31,8 +31,9 @@ def helpMessage() {
       --fasta [file]                            Path to Fasta reference
 
     Digestion Hi-C                              If not specified in the configuration file or you wish to set up specific digestion protocol
-      --ligation_site [str]                     Ligation motifs to trim (comma separated). Default: 'AAGCTAGCTT'
-      --restriction_site [str]                  Cutting motif(s) of restriction enzyme(s) (comma separated). Default: 'A^AGCTT'
+      --digestion [str]                         Digestion Hi-C. Name of restriction enzyme used for digestion pre-configuration. Default: 'hindiii'
+      --ligation_site [str]                     Ligation motifs to trim (comma separated) if not available in --digestion. Default: false
+      --restriction_site [str]                  Cutting motif(s) of restriction enzyme(s) (comma separated) if not available in --digestion. Default: false
       --chromosome_size [file]                  Path to chromosome size file
       --restriction_fragments [file]            Path to restriction fragment file (bed)
       --save_reference [bool]                   Save reference genome to output folder. Default: False
@@ -101,9 +102,15 @@ if (params.genomes && params.genome && !params.genomes.containsKey(params.genome
     exit 1, "The provided genome '${params.genome}' is not available in the iGenomes file. Currently the available genomes are ${params.genomes.keySet().join(", ")}"
 }
 
+if (params.digest && params.digestion && !params.digest.containsKey(params.digestion)) {
+   exit 1, "Unknown digestion protocol. Currently, the available digestion options are ${params.digest.keySet().join(", ")}. Please set manually the '--restriction_site' and '--ligation_site' parameters."
+}
+params.restriction_site = params.digestion ? params.digest[ params.digestion ].restriction_site ?: false : false
+params.ligation_site = params.digestion ? params.digest[ params.digestion ].ligation_site ?: false : false
+
 // Check Digestion or DNase Hi-C mode
 if (!params.dnase && !params.ligation_site) {
-   exit 1, "Ligation motif not found. For DNase Hi-C, please use '--dnase' option"
+   exit 1, "Ligation motif not found. Please either use the `--digestion` parameters or specify the `--restriction_site` and `--ligation_site`. For DNase Hi-C, please use '--dnase' option"
 }
 
 // Reference index path configuration
@@ -245,6 +252,7 @@ if (params.split_fastq)
    summary['Read chunks Size'] = params.fastq_chunks_size
 summary['Fasta Ref']        = params.fasta
 if (params.restriction_site){
+   summary['Digestion']        = params.digestion
    summary['Restriction Motif']= params.restriction_site
    summary['Ligation Motif']   = params.ligation_site
    summary['Min Fragment Size']= ("$params.min_restriction_fragment_size".isInteger() ? params.min_restriction_fragment_size : 'None')
diff --git a/nextflow.config b/nextflow.config
index b5089d5..3a75a34 100644
--- a/nextflow.config
+++ b/nextflow.config
@@ -8,49 +8,71 @@
 // Global default params, used in configs
 params {
 
-  // Workflow flags
+  // Inputs / outputs
   genome = false
   input = "data/*{1,2}.fastq.gz"
-
   outdir = './results'
   genome = false
   input_paths = false
-  split_fastq = false
-  fastq_chunks_size = 20000000
   chromosome_size = false
   restriction_fragments = false
-  skip_maps = false
-  skip_ice = false
-  skip_cool = false
-  skip_multiqc = false
   save_reference = false
-  save_interaction_bam = false
-  save_aligned_intermediates = false
-
-  bwt2_opts_end2end = '--very-sensitive -L 30 --score-min L,-0.6,-0.2 --end-to-end --reorder'
-  bwt2_opts_trimmed = '--very-sensitive -L 20 --score-min L,-0.6,-0.2 --end-to-end --reorder'
-  min_mapq = 10
-
+ 
   // Digestion Hi-C
-  restriction_site = 'A^AGCTT'
-  ligation_site = 'AAGCTAGCTT'
+  digestion = false
+  digest {
+    'hindiii'{
+       restriction_site='A^AGCTT'
+       ligation_site='AAGCTAGCTT'
+    }
+    'mboi' {
+       restriction_site='^GATC'
+       ligation_site='GATCGATC'
+    }
+    'dpnii' {
+       restriction_site='^GATC'
+       ligation_site='GATCGATC'
+    }
+    'arima' {
+       restriction_site='^GATC,G^ANT'
+       ligation_site='GATCGATC,GATCGANT,GANTGATC,GANTGANT'
+    }
+  }
   min_restriction_fragment_size = 0
   max_restriction_fragment_size = 0
   min_insert_size = 0
   max_insert_size = 0
+
+  // Dnase Hi-C
   dnase = false
   min_cis_dist = 0
+
+  // Mapping
+  split_fastq = false
+  fastq_chunks_size = 20000000
+  save_interaction_bam = false
+  save_aligned_intermediates = false
+  bwt2_opts_end2end = '--very-sensitive -L 30 --score-min L,-0.6,-0.2 --end-to-end --reorder'
+  bwt2_opts_trimmed = '--very-sensitive -L 20 --score-min L,-0.6,-0.2 --end-to-end --reorder'
   keep_dups = false
   keep_multi = false
+  min_mapq = 10
+
+  // Contact maps
   bin_size = '1000000,500000'
   ice_max_iter = 100
   ice_filer_low_count_perc = 0.02
   ice_filer_high_count_perc =  0
   ice_eps = 0.1
-  
-  publish_dir_mode = 'copy'
 
+  // Workflow
+  skip_maps = false
+  skip_ice = false
+  skip_cool = false
+  skip_multiqc = false
+  
   // Boilerplate options
+  publish_dir_mode = 'copy'
   multiqc_config = false
   name = false
   email = false
diff --git a/nextflow_schema.json b/nextflow_schema.json
index 4f0113f..c0733c3 100644
--- a/nextflow_schema.json
+++ b/nextflow_schema.json
@@ -86,6 +86,11 @@
             "description": "Parameters for protocols based on restriction enzyme",
             "default": "",
             "properties": {
+                "digestion": {
+                    "type": "string",
+                    "default": "hindiii",
+                    "description": "Name of restriction enzyme to automatically set the restriction_site and ligation_site options"
+                },
                 "restriction_site": {
                     "type": "string",
                     "default": "'A^AGCTT'",
-- 
GitLab