From ac3967c475ee6719e2728237a8275cd443a6a233 Mon Sep 17 00:00:00 2001
From: nservant <nicolas.servant@curie.fr>
Date: Fri, 27 Nov 2020 17:40:05 +0100
Subject: [PATCH] [MODIF] update input parameters

---
 CHANGELOG.md         |   9 +++
 main.nf              |  78 +++++++++++++-----------
 nextflow.config      |   5 +-
 nextflow_schema.json | 141 +++++++++++++++++++++++--------------------
 4 files changed, 129 insertions(+), 104 deletions(-)

diff --git a/CHANGELOG.md b/CHANGELOG.md
index 66b31d1..5b2e288 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -5,9 +5,18 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
 
 ## v1.3.0dev
 
+* New `--keep_multi` and `--keep_dups` options. Default: false
 * Template update for nf-core/tools v1.11
 * Minor fix to summary log messages in pipeline header
 
+### `Fixed`
+
+* `min_mapq` is ignored if `--keep_multi` is used
+
+### `Deprecated`
+
+* `--rm_dup` and `--rm_multi` are replaced by `--keep_dups` and `--keep_multi`
+
 ## v1.2.2 - 2020-09-02
 
 ### `Added`
diff --git a/main.nf b/main.nf
index 1e48420..a6a92f2 100644
--- a/main.nf
+++ b/main.nf
@@ -22,48 +22,49 @@ def helpMessage() {
 
     Mandatory arguments:
       --input [file]                            Path to input data (must be surrounded with quotes)
+      --genome [str]                            Name of iGenomes reference
       -profile [str]                            Configuration profile to use. Can use multiple (comma separated)
                                                 Available: conda, docker, singularity, awsbatch, test and more.
 
     References                                  If not specified in the configuration file or you wish to overwrite any of the references.
-      --genome [str]                            Name of iGenomes reference
       --bwt2_index [file]                       Path to Bowtie2 index
       --fasta [file]                            Path to Fasta reference
+
+    Digestion Hi-C                              If not specified in the configuration file or you wish to set up specific digestion protocol
+      --ligation_site [str]                     Ligation motifs to trim (comma separated). Default: 'AAGCTAGCTT'
+      --restriction_site [str]                  Cutting motif(s) of restriction enzyme(s) (comma separated). Default: 'A^AGCTT'
       --chromosome_size [file]                  Path to chromosome size file
       --restriction_fragments [file]            Path to restriction fragment file (bed)
       --save_reference [bool]                   Save reference genome to output folder. Default: False
 
+    DNase Hi-C
+      --dnase [bool]                            Run DNase Hi-C mode. All options related to restriction fragments are not considered. Default: False
+      --min_cis_dist [int]                      Minimum intra-chromosomal distance to consider. Default: None 
+
     Alignments
-      --split_fastq [bool]                      Split fastq files in reads chunks to speed up computation. Default: false
-      --fastq_chunks_size [int]                 Size of read chunks if split_fastq is true. Default: 20000000
-      --save_aligned_intermediates [bool]       Save intermediates alignment files. Default: False
       --bwt2_opts_end2end [str]                 Options for bowtie2 end-to-end mappinf (first mapping step). See hic.config for default.
       --bwt2_opts_trimmed [str]                 Options for bowtie2 mapping after ligation site trimming. See hic.config for default.
       --min_mapq [int]                          Minimum mapping quality values to consider. Default: 10
-      --restriction_site [str]                  Cutting motif(s) of restriction enzyme(s) (comma separated). Default: 'A^AGCTT'
-      --ligation_site [str]                     Ligation motifs to trim (comma separated). Default: 'AAGCTAGCTT'
-      --rm_singleton [bool]                     Remove singleton reads. Default: true
-      --rm_multi [bool]                         Remove multi-mapped reads. Default: true
-      --rm_dup [bool]                           Remove duplicates. Default: true
-
-    Contacts calling
-      --min_restriction_fragment_size [int]     Minimum size of restriction fragments to consider. Default: 0
-      --max_restriction_fragment_size [int]     Maximum size of restriction fragments to consider. Default: 0
-      --min_insert_size [int]                   Minimum insert size of mapped reads to consider. Default: 0
-      --max_insert_size [int]                   Maximum insert size of mapped reads to consider. Default: 0
-      --save_interaction_bam [bool]             Save BAM file with interaction tags (dangling-end, self-circle, etc.). Default: False
+      --keep_multi [bool]                       Keep multi-mapped reads (--min_mapq is ignored). Default: false
+      --keep_dups [bool]                        Keep duplicates. Default: false
+      --save_aligned_intermediates [bool]       Save intermediates alignment files. Default: False
+      --split_fastq [bool]                      Split fastq files in reads chunks to speed up computation. Default: false
+      --fastq_chunks_size [int]                 Size of read chunks if split_fastq is true. Default: 20000000
 
-      --dnase [bool]                            Run DNase Hi-C mode. All options related to restriction fragments are not considered. Default: False
-      --min_cis_dist [int]                      Minimum intra-chromosomal distance to consider. Default: 0
+    Valid Pairs Detection
+      --min_restriction_fragment_size [int]     Minimum size of restriction fragments to consider. Default: None
+      --max_restriction_fragment_size [int]     Maximum size of restriction fragments to consider. Default: None
+      --min_insert_size [int]                   Minimum insert size of mapped reads to consider. Default: None
+      --max_insert_size [int]                   Maximum insert size of mapped reads to consider. Default: None
+      --save_interaction_bam [bool]             Save BAM file with interaction tags (dangling-end, self-circle, etc.). Default: False
 
     Contact maps
-      --bin_size [int]                          Bin size for contact maps (comma separated). Default: '1000000,500000'
+      --bin_size [str]                          Bin size for contact maps (comma separated). Default: '1000000,500000'
       --ice_max_iter [int]                      Maximum number of iteration for ICE normalization. Default: 100
       --ice_filter_low_count_perc [float]       Percentage of low counts columns/rows to filter before ICE normalization. Default: 0.02
       --ice_filter_high_count_perc [float]      Percentage of high counts columns/rows to filter before ICE normalization. Default: 0
       --ice_eps [float]                         Convergence criteria for ICE normalization. Default: 0.1
 
-
     Workflow
       --skip_maps [bool]                        Skip generation of contact maps. Useful for capture-C. Default: False
       --skip_ice [bool]                         Skip ICE normalization. Default: False
@@ -243,17 +244,20 @@ summary['splitFastq']       = params.split_fastq
 if (params.split_fastq)
    summary['Read chunks Size'] = params.fastq_chunks_size
 summary['Fasta Ref']        = params.fasta
-summary['Restriction Motif']= params.restriction_site
-summary['Ligation Motif']   = params.ligation_site
-summary['DNase Mode']       = params.dnase
-summary['Remove Dup']       = params.rm_dup
-summary['Remove MultiHits'] = params.rm_multi
+if (params.restriction_site){
+   summary['Restriction Motif']= params.restriction_site
+   summary['Ligation Motif']   = params.ligation_site
+   summary['Min Fragment Size']= ("$params.min_restriction_fragment_size".isInteger() ? params.min_restriction_fragment_size : 'None')
+   summary['Max Fragment Size']= ("$params.max_restriction_fragment_size".isInteger() ? params.max_restriction_fragment_size : 'None')
+   summary['Min Insert Size']  = ("$params.min_insert_size".isInteger() ? params.min_insert_size : 'None')
+   summary['Max Insert Size']  = ("$params.max_insert_size".isInteger() ? params.max_insert_size : 'None')
+}else{
+   summary['DNase Mode']    = params.dnase
+   summary['Min CIS dist']  = ("$params.min_cis_dist".isInteger() ? params.min_cis_dist : 'None')
+}
 summary['Min MAPQ']         = params.min_mapq
-summary['Min Fragment Size']= params.min_restriction_fragment_size
-summary['Max Fragment Size']= params.max_restriction_fragment_size
-summary['Min Insert Size']  = params.min_insert_size
-summary['Max Insert Size']  = params.max_insert_size
-summary['Min CIS dist']     = params.min_cis_dist
+summary['Keep Duplicates']  = params.keep_dups
+summary['Keep Multihits']   = params.keep_multi
 summary['Maps resolution']  = params.bin_size
 summary['Max Resources']    = "$params.max_memory memory, $params.max_cpus cpus, $params.max_time time per job"
 if (workflow.containerEngine) summary['Container'] = "$workflow.containerEngine - $workflow.container"
@@ -526,7 +530,7 @@ if (!params.dnase){
       set val(oname), file("${prefix}.mapstat") into all_mapstat
 
       script:
-      sample = prefix.toString() - ~/(_R1|_R2|_val_1|_val_2|_1$|_2)/
+      sample = prefix.toString() - ~/(_R1|_R2|_val_1|_val_2|_1|_2)/
       tag = prefix.toString() =~/_R1|_val_1|_1/ ? "R1" : "R2"
       oname = prefix.toString() - ~/(\.[0-9]+)$/
       """
@@ -590,7 +594,7 @@ process combine_mapped_files{
    	      saveAs: {filename -> filename.indexOf(".pairstat") > 0 ? "stats/$filename" : "$filename"}
 
    input:
-   set val(sample), file(aligned_bam) from bwt2_merged_bam.groupTuple()
+   set val(sample), file(aligned_bam) from bwt2_merged_bam.groupTuple().dump(tag:'bams')
 
    output:
    set val(sample), file("${sample}_bwt2pairs.bam") into paired_bam
@@ -604,9 +608,11 @@ process combine_mapped_files{
    oname = sample.toString() - ~/(\.[0-9]+)$/
 
    def opts = "-t"
-   opts = params.rm_singleton ? "${opts}" : "--single ${opts}"
-   opts = params.rm_multi ? "${opts}" : "--multi ${opts}"
-   if ("$params.min_mapq".isInteger()) opts="${opts} -q ${params.min_mapq}"
+   if (params.keep_multi) {
+     opts="${opts} --multi"
+   }else if (params.min_mapq){
+     opts="${opts} -q ${params.min_mapq}"
+   }
    """
    mergeSAM.py -f ${r1_bam} -r ${r2_bam} -o ${sample}_bwt2pairs.bam ${opts}
    """
@@ -705,7 +711,7 @@ process remove_duplicates {
    file("stats/") into all_mergestat
 
    script:
-   if ( params.rm_dup ){
+   if ( ! params.keep_dups ){
    """
    mkdir -p stats/${sample}
 
diff --git a/nextflow.config b/nextflow.config
index 14a4f91..32fe8f8 100644
--- a/nextflow.config
+++ b/nextflow.config
@@ -41,9 +41,8 @@ params {
   max_insert_size = 0
   dnase = false
   min_cis_dist = 0
-  rm_dup = true
-  rm_singleton = true
-  rm_multi = true
+  keep_dups = false
+  keep_multi = false
   bin_size = '1000000,500000'
   ice_max_iter = 100
   ice_filer_low_count_perc = 0.02
diff --git a/nextflow_schema.json b/nextflow_schema.json
index 84bb558..4f0113f 100644
--- a/nextflow_schema.json
+++ b/nextflow_schema.json
@@ -5,11 +5,11 @@
     "description": "Analysis of Chromosome Conformation Capture data (Hi-C)",
     "type": "object",
     "definitions": {
-        "input_output_options": {
-            "title": "Input/output options",
+        "mandatory_arguments": {
+            "title": "Mandatory arguments",
             "type": "object",
             "fa_icon": "fas fa-terminal",
-            "description": "Define where the pipeline should find input data and save output data.",
+            "description": "Mandatory arguments to run the pipeline",
             "required": [
                 "input"
             ],
@@ -26,22 +26,11 @@
                     "description": "Input FastQ files for test only",
                     "default": "undefined"
                 },
-                "split_fastq": {
-                    "type": "boolean",
-                    "description": "Split the reads into chunks before running the pipelne",
-                    "fa_icon": "fas fa-dna",
-		    "default": "false"
-                },
-		"fastq_chunks_size":{
-		    "type": "integer",
-		    "description": "Read number per chunks if split_fastq is used",
-		    "default": "20000000"
-		},
-                "single_end": {
-                    "type": "boolean",
-                    "description": "Specifies that the input is single-end reads.",
-                    "fa_icon": "fas fa-align-center",
-                    "help_text": "By default, the pipeline expects paired-end data. If you have single-end data, you need to specify `--single_end` on the command line when you launch the pipeline. A normal glob pattern, enclosed in quotation marks, can then be used for `--input`. For example:\n\n```bash\n--single_end --input '*.fastq'\n```\n\nIt is not possible to run a mixture of single-end and paired-end files in one run."
+                "genome": {
+                    "type": "string",
+                    "description": "Name of iGenomes reference.",
+                    "fa_icon": "fas fa-book",
+                    "help_text": "If using a reference genome configured in the pipeline using iGenomes, use this parameter to give the ID for the reference. This is then used to build the full paths for all required reference genome files e.g. `--genome GRCh38`.\n\nSee the [nf-core website docs](https://nf-co.re/usage/reference_genomes) for more details."
                 },
                 "outdir": {
                     "type": "string",
@@ -64,12 +53,6 @@
             "fa_icon": "fas fa-dna",
             "description": "Options for the reference genome indices used to align reads.",
             "properties": {
-                "genome": {
-                    "type": "string",
-                    "description": "Name of iGenomes reference.",
-                    "fa_icon": "fas fa-book",
-                    "help_text": "If using a reference genome configured in the pipeline using iGenomes, use this parameter to give the ID for the reference. This is then used to build the full paths for all required reference genome files e.g. `--genome GRCh38`.\n\nSee the [nf-core website docs](https://nf-co.re/usage/reference_genomes) for more details."
-                },
                 "fasta": {
                     "type": "string",
                     "fa_icon": "fas fa-font",
@@ -94,6 +77,24 @@
                     "type": "string",
                     "description": "Full path to directory containing Bowtie index including base name. i.e. `/path/to/index/base`.",
                     "fa_icon": "far fa-file-alt"
+                }
+            }
+        },
+        "digestion_hi_c": {
+            "title": "Digestion Hi-C",
+            "type": "object",
+            "description": "Parameters for protocols based on restriction enzyme",
+            "default": "",
+            "properties": {
+                "restriction_site": {
+                    "type": "string",
+                    "default": "'A^AGCTT'",
+                    "description": "Restriction motifs used during digestion. Several motifs (comma separated) can be provided."
+                },
+                "ligation_site": {
+                    "type": "string",
+                    "default": "'AAGCTAGCTT'",
+                    "description": "Expected motif after DNA ligation.  Several motifs (comma separated) can be provided."
                 },
                 "chromosome_size": {
                     "type": "string",
@@ -115,41 +116,40 @@
                 }
             }
         },
-        "data_processing_options": {
-            "title": "Data processing",
+        "dnase_hi_c": {
+            "title": "DNase Hi-C",
             "type": "object",
-            "description": "Parameters for Hi-C data processing",
+            "description": "Parameters for protocols based on DNase digestion",
             "default": "",
-            "fa_icon": "fas fa-bahai",
             "properties": {
                 "dnase": {
                     "type": "boolean",
                     "description": "For Hi-C protocols which are not based on enzyme digestion such as DNase Hi-C"
                 },
-                "restriction_site": {
-                    "type": "string",
-                    "default": "'A^AGCTT'",
-                    "description": "Restriction motifs used during digestion. Several motifs (comma separated) can be provided."
-                },
-                "ligation_site": {
-                    "type": "string",
-                    "default": "'AAGCTAGCTT",
-                    "description": "Expected motif after DNA ligation.  Several motifs (comma separated) can be provided."
-                },
-                "rm_dup": {
-                    "type": "boolean",
-                    "description": "Remove duplicates",
-                    "default": true
-                },
-                "rm_multi": {
+                "min_cis_dist": {
+                    "type": "integer",
+                    "default": "0",
+                    "description": "Minimum distance between loci to consider. Useful for --dnase mode to remove spurious ligation products. Only values > 0 are considered"
+                }
+            }
+        },
+        "alignments": {
+            "title": "Alignments",
+            "type": "object",
+            "description": "Parameters for read alignments",
+            "default": "",
+            "fa_icon": "fas fa-bahai",
+            "properties": {
+                "split_fastq": {
                     "type": "boolean",
-                    "description": "Remove multi-mapped reads",
-                    "default": true
+                    "description": "Split the reads into chunks before running the pipeline",
+                    "fa_icon": "fas fa-dna",
+                    "default": "false"
                 },
-                "rm_singleton": {
-                    "type": "boolean",
-                    "description": "Remove singleton",
-                    "default": true
+                "fastq_chunks_size": {
+                    "type": "integer",
+                    "description": "Read number per chunks if split_fastq is used",
+                    "default": "20000000"
                 },
                 "min_mapq": {
                     "type": "integer",
@@ -166,27 +166,28 @@
                     "default": "'--very-sensitive -L 20 --score-min L,-0.6,-0.2 --end-to-end --reorder'",
                     "description": "Option for trimmed reads mapping"
                 },
-                "save_interaction_bam": {
-                    "type": "boolean",
-                    "description": "Save a BAM file where all reads are flagged by their interaction classes"
-                },
                 "save_aligned_intermediates": {
                     "type": "boolean",
                     "description": "Save all BAM files during two-steps mapping"
                 }
             }
         },
-        "contacts_calling_options": {
-            "title": "Contacts calling",
+        "valid_pairs_detection": {
+            "title": "Valid Pairs Detection",
             "type": "object",
             "description": "Options to call significant interactions",
             "default": "",
             "fa_icon": "fas fa-signature",
             "properties": {
-                "min_cis_dist": {
-                    "type": "integer",
-                    "default": "O",
-                    "description": "Minimum distance between loci to consider. Useful for --dnase mode to remove spurious ligation products. Only values > 0 are considered"
+                "keep_dups": {
+                    "type": "boolean",
+                    "description": "Keep duplicated reads",
+                    "default": false
+                },
+                "keep_multi": {
+                    "type": "boolean",
+                    "description": "Keep multi-aligned reads",
+                    "default": false
                 },
                 "max_insert_size": {
                     "type": "integer",
@@ -207,6 +208,10 @@
                     "type": "integer",
                     "default": "0",
                     "description": "Minimum restriction fragment size to consider. Only values > 0 are considered"
+                },
+                "save_interaction_bam": {
+                    "type": "boolean",
+                    "description": "Save a BAM file where all reads are flagged by their interaction classes"
                 }
             }
         },
@@ -435,16 +440,22 @@
     },
     "allOf": [
         {
-            "$ref": "#/definitions/input_output_options"
+            "$ref": "#/definitions/mandatory_arguments"
         },
         {
             "$ref": "#/definitions/reference_genome_options"
         },
         {
-            "$ref": "#/definitions/data_processing_options"
+            "$ref": "#/definitions/digestion_hi_c"
+        },
+        {
+            "$ref": "#/definitions/dnase_hi_c"
+        },
+        {
+            "$ref": "#/definitions/alignments"
         },
         {
-            "$ref": "#/definitions/contacts_calling_options"
+            "$ref": "#/definitions/valid_pairs_detection"
         },
         {
             "$ref": "#/definitions/contact_maps_options"
@@ -462,4 +473,4 @@
             "$ref": "#/definitions/institutional_config_options"
         }
     ]
-}
+}
\ No newline at end of file
-- 
GitLab