From e5216e2fb7d5139a11da997fb53aa70a6173da38 Mon Sep 17 00:00:00 2001
From: nservant <nicolas.servant@curie.fr>
Date: Wed, 2 Sep 2020 22:02:01 +0200
Subject: [PATCH] add --fastq_chunks_size parameter

---
 CHANGELOG.md           |  1 +
 bin/merge_statfiles.py |  2 +-
 docs/usage.md          | 38 +++++++++++++++++++++-----------------
 main.nf                | 32 +++++++++++++++++---------------
 nextflow.config        | 11 ++++++-----
 nextflow_schema.json   | 42 ++++++++++++++++++++++++------------------
 6 files changed, 70 insertions(+), 56 deletions(-)

diff --git a/CHANGELOG.md b/CHANGELOG.md
index b72dc20..719ea08 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -8,6 +8,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
 ### `Added`
 
 * Template update for nf-core/tools v1.10.2
+* Add the `--fastq_chunks_size` parameter to specify the number of reads per chunk when `--split_fastq` is true
 
 ### `Fixed`
 
diff --git a/bin/merge_statfiles.py b/bin/merge_statfiles.py
index 469cacd..dc11bf7 100755
--- a/bin/merge_statfiles.py
+++ b/bin/merge_statfiles.py
@@ -48,7 +48,7 @@ if __name__ == "__main__":
                 if not line.startswith("#"):
                     lsp = line.strip().split("\t")
                     data = map(num, lsp[1:len(lsp)])
-                    template[str(lsp[0])] = data
+                    template[str(lsp[0])] = list(data)
                 
         if len(template) == 0:
             print("Cannot find template files !")
diff --git a/docs/usage.md b/docs/usage.md
index d4af9fd..11b0653 100644
--- a/docs/usage.md
+++ b/docs/usage.md
@@ -517,75 +517,79 @@ normalization. Default: 0.1
 
 ## Inputs/Outputs
 
-### `--splitFastq`
+### `--split_fastq`
 
 By default, the nf-core Hi-C pipeline expects one read pairs per sample.
 However, for large Hi-C data processing single fastq files can be very
 time consuming.
-The `--splitFastq` option allows to automatically split input read pairs
-into chunks of reads. In this case, all chunks will be processed in parallel
+The `--split_fastq` option allows to automatically split input read pairs
+into chunks of `--fastq_chunks_size` reads (default: 20000000). In this case, all chunks will be processed in parallel
 and merged before generating the contact maps, thus leading to a significant
 increase of processing performance.
 
 ```bash
---splitFastq '[Number of reads per chunk]'
+--split_fastq --fastq_chunks_size '[numeric]'
 ```
 
-### `--saveReference`
+### `--save_reference`
 
 If specified, annotation files automatically generated from the `--fasta` file
 are exported in the results folder. Default: false
 
 ```bash
---saveReference
+--save_reference
 ```
 
-### `--saveAlignedIntermediates`
+### `--save_aligned_intermediates`
 
 If specified, all intermediate mapping files are saved and exported in the
 results folder. Default: false
 
 ```bash
---saveReference
+--save_aligned_intermediates
 ```
 
-### `--saveInteractionBAM`
+### `--save_interaction_bam`
 
 If specified, write a BAM file with all classified reads (valid paires,
 dangling end, self-circle, etc.) and its tags.
 
+```bash
+--save_interaction_bam
+```
+
 ## Skip options
 
-### `--skipMaps`
+### `--skip_maps`
 
 If defined, the workflow stops with the list of valid interactions, and the
 genome-wide maps are not built. Usefult for capture-C analysis. Default: false
 
 ```bash
---skipMaps
+--skip_maps
 ```
 
-### `--skipIce`
+### `--skip_ice`
 
 If defined, the ICE normalization is not run on the raw contact maps.
 Default: false
 
 ```bash
---skipIce
+--skip_ice
 ```
 
-### `--skipCool`
+### `--skip_cool`
 
 If defined, cooler files are not generated. Default: false
 
 ```bash
---skipCool
+--skip_cool
 ```
 
-### `--skipMultiQC`
+### `--skip_multiQC`
 
 If defined, the MultiQC report is not generated. Default: false
 
 ```bash
---skipMultiQC
+--skip_multiQC
 ```
diff --git a/main.nf b/main.nf
index 5763152..495a1fe 100644
--- a/main.nf
+++ b/main.nf
@@ -34,7 +34,8 @@ def helpMessage() {
       --save_reference [bool]                   Save reference genome to output folder. Default: False
 
     Alignments
-      --split_fastq [int]                       Size of read chuncks to use to speed up the workflow. Default: None
+      --split_fastq [bool]                      Split fastq files in reads chunks to speed up computation. Default: None
+      --fastq_chunks_size [int]                 Size of read chunks if split_fastq is true. Default: 20000000
       --save_aligned_intermediates [bool]       Save intermediates alignment files. Default: False 
       --bwt2_opts_end2end [str]                 Options for bowtie2 end-to-end mappinf (first mapping step). See hic.config for default.
       --bwt2_opts_trimmed [str]                 Options for bowtie2 mapping after ligation site trimming. See hic.config for default.
@@ -46,14 +47,14 @@ def helpMessage() {
       --rm_dup [bool]                           Remove duplicates. Default: true
  
     Contacts calling
-      --min_restriction_fragment_size [int]     Minimum size of restriction fragments to consider. Default: None
-      --max_restriction_fragment_size [int]     Maximum size of restriction fragments to consider. Default: None
-      --min_insert_size [int]                   Minimum insert size of mapped reads to consider. Default: None
-      --max_insert_size [int]                   Maximum insert size of mapped reads to consider. Default: None
+      --min_restriction_fragment_size [int]     Minimum size of restriction fragments to consider. Default: 0
+      --max_restriction_fragment_size [int]     Maximum size of restriction fragments to consider. Default: 0
+      --min_insert_size [int]                   Minimum insert size of mapped reads to consider. Default: 0
+      --max_insert_size [int]                   Maximum insert size of mapped reads to consider. Default: 0
       --save_interaction_bam [bool]             Save BAM file with interaction tags (dangling-end, self-circle, etc.). Default: False
 
       --dnase [bool]                            Run DNase Hi-C mode. All options related to restriction fragments are not considered. Default: False
-      --min_cis_dist [int]                      Minimum intra-chromosomal distance to consider. Default: None
+      --min_cis_dist [int]                      Minimum intra-chromosomal distance to consider. Default: 0
 
     Contact maps
       --bin_size [int]                          Bin size for contact maps (comma separated). Default: '1000000,500000'
@@ -163,7 +164,7 @@ if (params.input_paths){
 
 if ( params.split_fastq ){
    raw_reads_full = raw_reads.concat( raw_reads_2 )
-   raw_reads = raw_reads_full.splitFastq( by: params.split_fastq , file: true)
+   raw_reads = raw_reads_full.splitFastq( by: params.fastq_chunks_size, file: true)
  }else{
    raw_reads = raw_reads.concat( raw_reads_2 ).dump(tag: "data")
 }
@@ -239,6 +240,8 @@ if(workflow.revision) summary['Pipeline Release'] = workflow.revision
 summary['Run Name']         = custom_runName ?: workflow.runName
 summary['Input']            = params.input
 summary['splitFastq']       = params.split_fastq
+if (params.split_fastq)
+   summary['Read chunks Size'] = params.fastq_chunks_size
 summary['Fasta Ref']        = params.fasta
 summary['Restriction Motif']= params.restriction_site
 summary['Ligation Motif']   = params.ligation_site
@@ -647,12 +650,12 @@ if (!params.dnase){
       }
 
       def opts = ""
-      if ("$params.min_cis_dist".isInteger()) opts="${opts} -d ${params.min_cis_dist}"
-      if ("$params.min_insert_size".isInteger()) opts="${opts} -s ${params.min_insert_size}"
-      if ("$params.max_insert_size".isInteger()) opts="${opts} -l ${params.max_insert_size}"
-      if ("$params.min_restriction_fragment_size".isInteger()) opts="${opts} -t ${params.min_restriction_fragment_size}"
-      if ("$params.max_restriction_fragment_size".isInteger()) opts="${opts} -m ${params.max_restriction_fragment_size}"
-      if (params.save_interaction_bam) opts="${opts} --sam"
+      opts += params.min_cis_dist > 0 ? " -d ${params.min_cis_dist}" : ''
+      opts += params.min_insert_size > 0 ?  " -s ${params.min_insert_size}" : ''
+      opts += params.max_insert_size > 0 ? " -l ${params.max_insert_size}" : ''
+      opts += params.min_restriction_fragment_size > 0 ? " -t ${params.min_restriction_fragment_size}" : ''
+      opts += params.max_restriction_fragment_size > 0 ? " -m ${params.max_restriction_fragment_size}" : ''
+      opts += params.save_interaction_bam ? " --sam" : ''
       prefix = pe_bam.toString() - ~/.bam/
       """
       mapped_2hic_fragments.py -f ${frag_file} -r ${pe_bam} --all ${opts}
@@ -680,8 +683,7 @@ else{
          sample = sample.toString() - ~/(\.[0-9]+)$/
       }
 
-      def opts = ""
-      if ("$params.min_cis_dist".isInteger()) opts="${opts} -d ${params.min_cis_dist}"
+      opts = params.min_cis_dist > 0 ? " -d ${params.min_cis_dist}" : ''
       prefix = pe_bam.toString() - ~/.bam/
       """
       mapped_2hic_dnase.py -r ${pe_bam} ${opts}
diff --git a/nextflow.config b/nextflow.config
index edb8038..c765a4a 100644
--- a/nextflow.config
+++ b/nextflow.config
@@ -17,6 +17,7 @@ params {
   genome = false
   input_paths = false
   split_fastq = false
+  fastq_chunks_size = 20000000
   chromosome_size = false
   restriction_fragments = false
   skip_maps = false
@@ -34,12 +35,12 @@ params {
   // Digestion Hi-C
   restriction_site = 'A^AGCTT'
   ligation_site = 'AAGCTAGCTT'
-  min_restriction_fragment_size = false
-  max_restriction_fragment_size = false
-  min_insert_size = false
-  max_insert_size = false
+  min_restriction_fragment_size = 0
+  max_restriction_fragment_size = 0
+  min_insert_size = 0
+  max_insert_size = 0
   dnase = false
-  min_cis_dist = false
+  min_cis_dist = 0
   rm_dup = true
   rm_singleton = true
   rm_multi = true
diff --git a/nextflow_schema.json b/nextflow_schema.json
index ed2f701..9071bd2 100644
--- a/nextflow_schema.json
+++ b/nextflow_schema.json
@@ -27,10 +27,16 @@
                     "default": "undefined"
                 },
                 "split_fastq": {
-                    "type": "number",
-                    "description": "Split the reads into chunks before running. Specify the number of reads per chuncks as --split_fastq 20000000.",
-                    "fa_icon": "fas fa-dna"
+                    "type": "boolean",
+                    "description": "Split the reads into chunks before running the pipeline",
+                    "fa_icon": "fas fa-dna",
+		    "default": "false"
                 },
+		"fastq_chunks_size":{
+		    "type": "integer",
+		    "description": "Number of reads per chunk if split_fastq is used",
+		    "default": "20000000"
+		},
                 "single_end": {
                     "type": "boolean",
                     "description": "Specifies that the input is single-end reads.",
@@ -178,29 +184,29 @@
             "fa_icon": "fas fa-signature",
             "properties": {
                 "min_cis_dist": {
-                    "type": "string",
-                    "default": "undefined",
-                    "description": "Minimum distance between loci to consider. Useful for --dnase mode to remove spurious ligation products"
+                    "type": "integer",
+                    "default": "0",
+                    "description": "Minimum distance between loci to consider. Useful for --dnase mode to remove spurious ligation products. Only values > 0 are considered"
                 },
                 "max_insert_size": {
-                    "type": "string",
-                    "default": "undefined",
-                    "description": "Maximum fragment size to consider"
+                    "type": "integer",
+                    "default": "0",
+                    "description": "Maximum fragment size to consider. Only values > 0 are considered"
                 },
                 "min_insert_size": {
-                    "type": "string",
-                    "default": "undefined",
-                    "description": "Minimum fragment size to consider"
+                    "type": "integer",
+                    "default": "0",
+                    "description": "Minimum fragment size to consider. Only values > 0 are considered"
                 },
                 "max_restriction_fragment_size": {
-                    "type": "string",
-                    "default": "undefined",
-                    "description": "Maximum restriction fragment size to consider"
+                    "type": "integer",
+                    "default": "0",
+                    "description": "Maximum restriction fragment size to consider. Only values > 0 are considered"
                 },
                 "min_restriction_fragment_size": {
-                    "type": "string",
-                    "default": "undefined",
-                    "description": "Minimum restriction fragment size to consider"
+                    "type": "integer",
+                    "default": "0",
+                    "description": "Minimum restriction fragment size to consider. Only values > 0 are considered"
                 }
             }
         },
-- 
GitLab