add --fastq_chunks_size parameter

e5216e2f · nservant · 0af0b36b · e5216e2f · e5216e2f · e5216e2f
Commit e5216e2f authored 4 years ago by nservant
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -8,6 +8,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
 ### `Added`

 * Template update for nf-core/tools v1.10.2
+* Add the `--fastq_chunks_size` parameter to specify the number of reads per chunk when `--split_fastq` is true

 ### `Fixed`


--- a/bin/merge_statfiles.py
+++ b/bin/merge_statfiles.py
@@ -48,7 +48,7 @@ if __name__ == "__main__":
                if not line.startswith("#"):
                    lsp = line.strip().split("\t")
                    data = map(num, lsp[1:len(lsp)])
-                    template[str(lsp[0])] = data
+                    template[str(lsp[0])] = list(data)
                
        if len(template) == 0:
            print("Cannot find template files !")

--- a/docs/usage.md
+++ b/docs/usage.md
@@ -517,75 +517,79 @@ normalization. Default: 0.1

 ## Inputs/Outputs

-### `--splitFastq`
+### `--split_fastq`

 By default, the nf-core Hi-C pipeline expects one read pairs per sample.
 However, for large Hi-C data processing single fastq files can be very
 time consuming.
-The `--splitFastq` option allows to automatically split input read pairs
-into chunks of reads. In this case, all chunks will be processed in parallel
+The `--split_fastq` option automatically splits input read pairs
+into chunks of `--fastq_chunks_size` reads (default: 20000000). In this case, all chunks will be processed in parallel
 and merged before generating the contact maps, thus leading to a significant
 increase of processing performance.

 ```bash
--splitFastq '[Number of reads per chunk]'
+--split_fastq --fastq_chunks_size '[numeric]'
 ```

-### `--saveReference`
+### `--save_reference`

 If specified, annotation files automatically generated from the `--fasta` file
 are exported in the results folder. Default: false

 ```bash
--saveReference
+--save_reference
 ```

-### `--saveAlignedIntermediates`
+### `--save_aligned_intermediates`

 If specified, all intermediate mapping files are saved and exported in the
 results folder. Default: false

 ```bash
--saveReference
+--save_aligned_intermediates
 ```

-### `--saveInteractionBAM`
+### `--save_interaction_bam`

 If specified, write a BAM file with all classified reads (valid paires,
 dangling end, self-circle, etc.) and its tags.

+```bash
+--save_interaction_bam
+```
+
 ## Skip options

-### `--skipMaps`
+### `--skip_maps`

 If defined, the workflow stops with the list of valid interactions, and the
 genome-wide maps are not built. Usefult for capture-C analysis. Default: false

 ```bash
--skipMaps
+--skip_maps
 ```

-### `--skipIce`
+### `--skip_ice`

 If defined, the ICE normalization is not run on the raw contact maps.
 Default: false

 ```bash
--skipIce
+--skip_ice
 ```

-### `--skipCool`
+### `--skip_cool`

 If defined, cooler files are not generated. Default: false

 ```bash
--skipCool
+--skip_cool
 ```

-### `--skipMultiQC`
+### `--skip_multiQC`

 If defined, the MultiQC report is not generated. Default: false

 ```bash
--skipMultiQC
+--skip_multiQC
 ```
--- a/main.nf
+++ b/main.nf
@@ -34,7 +34,8 @@ def helpMessage() {
      --save_reference [bool]                   Save reference genome to output folder. Default: False

    Alignments
-      --split_fastq [int]                       Size of read chuncks to use to speed up the workflow. Default: None
+      --split_fastq [bool]                      Split fastq files into read chunks to speed up computation. Default: False
+      --fastq_chunks_size [int]                 Size of read chunks if split_fastq is true. Default: 20000000
      --save_aligned_intermediates [bool]       Save intermediates alignment files. Default: False 
      --bwt2_opts_end2end [str]                 Options for bowtie2 end-to-end mappinf (first mapping step). See hic.config for default.
      --bwt2_opts_trimmed [str]                 Options for bowtie2 mapping after ligation site trimming. See hic.config for default.
@@ -46,14 +47,14 @@ def helpMessage() {
      --rm_dup [bool]                           Remove duplicates. Default: true
 
    Contacts calling
-      --min_restriction_fragment_size [int]     Minimum size of restriction fragments to consider. Default: None
-      --max_restriction_fragment_size [int]     Maximum size of restriction fragments to consider. Default: None
-      --min_insert_size [int]                   Minimum insert size of mapped reads to consider. Default: None
-      --max_insert_size [int]                   Maximum insert size of mapped reads to consider. Default: None
+      --min_restriction_fragment_size [int]     Minimum size of restriction fragments to consider. Default: 0
+      --max_restriction_fragment_size [int]     Maximum size of restriction fragments to consider. Default: 0
+      --min_insert_size [int]                   Minimum insert size of mapped reads to consider. Default: 0
+      --max_insert_size [int]                   Maximum insert size of mapped reads to consider. Default: 0
      --save_interaction_bam [bool]             Save BAM file with interaction tags (dangling-end, self-circle, etc.). Default: False

      --dnase [bool]                            Run DNase Hi-C mode. All options related to restriction fragments are not considered. Default: False
-      --min_cis_dist [int]                      Minimum intra-chromosomal distance to consider. Default: None
+      --min_cis_dist [int]                      Minimum intra-chromosomal distance to consider. Default: 0

    Contact maps
      --bin_size [int]                          Bin size for contact maps (comma separated). Default: '1000000,500000'
@@ -163,7 +164,7 @@ if (params.input_paths){

 if ( params.split_fastq ){
   raw_reads_full = raw_reads.concat( raw_reads_2 )
-   raw_reads = raw_reads_full.splitFastq( by: params.split_fastq , file: true)
+   raw_reads = raw_reads_full.splitFastq( by: params.fastq_chunks_size, file: true)
 }else{
   raw_reads = raw_reads.concat( raw_reads_2 ).dump(tag: "data")
 }
@@ -239,6 +240,8 @@ if(workflow.revision) summary['Pipeline Release'] = workflow.revision
 summary['Run Name']         = custom_runName ?: workflow.runName
 summary['Input']            = params.input
 summary['splitFastq']       = params.split_fastq
+if (params.split_fastq)
+   summary['Read chunks Size'] = params.fastq_chunks_size
 summary['Fasta Ref']        = params.fasta
 summary['Restriction Motif']= params.restriction_site
 summary['Ligation Motif']   = params.ligation_site
@@ -647,12 +650,12 @@ if (!params.dnase){
      }

      def opts = ""
-      if ("$params.min_cis_dist".isInteger()) opts="${opts} -d ${params.min_cis_dist}"
-      if ("$params.min_insert_size".isInteger()) opts="${opts} -s ${params.min_insert_size}"
-      if ("$params.max_insert_size".isInteger()) opts="${opts} -l ${params.max_insert_size}"
-      if ("$params.min_restriction_fragment_size".isInteger()) opts="${opts} -t ${params.min_restriction_fragment_size}"
-      if ("$params.max_restriction_fragment_size".isInteger()) opts="${opts} -m ${params.max_restriction_fragment_size}"
-      if (params.save_interaction_bam) opts="${opts} --sam"
+      opts += params.min_cis_dist > 0 ? " -d ${params.min_cis_dist}" : ''
+      opts += params.min_insert_size > 0 ?  " -s ${params.min_insert_size}" : ''
+      opts += params.max_insert_size > 0 ? " -l ${params.max_insert_size}" : ''
+      opts += params.min_restriction_fragment_size > 0 ? " -t ${params.min_restriction_fragment_size}" : ''
+      opts += params.max_restriction_fragment_size > 0 ? " -m ${params.max_restriction_fragment_size}" : ''
+      opts += params.save_interaction_bam ? " --sam" : ''
      prefix = pe_bam.toString() - ~/.bam/
      """
      mapped_2hic_fragments.py -f ${frag_file} -r ${pe_bam} --all ${opts}
@@ -680,8 +683,7 @@ else{
         sample = sample.toString() - ~/(\.[0-9]+)$/
      }

-      def opts = ""
-      if ("$params.min_cis_dist".isInteger()) opts="${opts} -d ${params.min_cis_dist}"
+      opts = params.min_cis_dist > 0 ? " -d ${params.min_cis_dist}" : ''
      prefix = pe_bam.toString() - ~/.bam/
      """
      mapped_2hic_dnase.py -r ${pe_bam} ${opts}

--- a/nextflow.config
+++ b/nextflow.config
@@ -17,6 +17,7 @@ params {
  genome = false
  input_paths = false
  split_fastq = false
+  fastq_chunks_size = 20000000
  chromosome_size = false
  restriction_fragments = false
  skip_maps = false
@@ -34,12 +35,12 @@ params {
  // Digestion Hi-C
  restriction_site = 'A^AGCTT'
  ligation_site = 'AAGCTAGCTT'
-  min_restriction_fragment_size = false
-  max_restriction_fragment_size = false
-  min_insert_size = false
-  max_insert_size = false
+  min_restriction_fragment_size = 0
+  max_restriction_fragment_size = 0
+  min_insert_size = 0
+  max_insert_size = 0
  dnase = false
-  min_cis_dist = false
+  min_cis_dist = 0
  rm_dup = true
  rm_singleton = true
  rm_multi = true

--- a/nextflow_schema.json
+++ b/nextflow_schema.json
@@ -27,10 +27,16 @@
                    "default": "undefined"
                },
                "split_fastq": {
-                    "type": "number",
-                    "description": "Split the reads into chunks before running. Specify the number of reads per chuncks as --split_fastq 20000000.",
-                    "fa_icon": "fas fa-dna"
+                    "type": "boolean",
+                    "description": "Split the reads into chunks before running the pipeline",
+                    "fa_icon": "fas fa-dna",
+		    "default": "false"
                },
+		"fastq_chunks_size":{
+		    "type": "integer",
+		    "description": "Number of reads per chunk if split_fastq is used",
+		    "default": "20000000"
+		},
                "single_end": {
                    "type": "boolean",
                    "description": "Specifies that the input is single-end reads.",
@@ -178,29 +184,29 @@
            "fa_icon": "fas fa-signature",
            "properties": {
                "min_cis_dist": {
-                    "type": "string",
-                    "default": "undefined",
-                    "description": "Minimum distance between loci to consider. Useful for --dnase mode to remove spurious ligation products"
+                    "type": "integer",
+                    "default": "0",
+                    "description": "Minimum distance between loci to consider. Useful for --dnase mode to remove spurious ligation products. Only values > 0 are considered"
                },
                "max_insert_size": {
-                    "type": "string",
-                    "default": "undefined",
-                    "description": "Maximum fragment size to consider"
+                    "type": "integer",
+                    "default": "0",
+                    "description": "Maximum fragment size to consider. Only values > 0 are considered"
                },
                "min_insert_size": {
-                    "type": "string",
-                    "default": "undefined",
-                    "description": "Minimum fragment size to consider"
+                    "type": "integer",
+                    "default": "0",
+                    "description": "Minimum fragment size to consider. Only values > 0 are considered"
                },
                "max_restriction_fragment_size": {
-                    "type": "string",
-                    "default": "undefined",
-                    "description": "Maximum restriction fragment size to consider"
+                    "type": "integer",
+                    "default": "0",
+                    "description": "Maximum restriction fragment size to consider. Only values > 0 are considered"
                },
                "min_restriction_fragment_size": {
-                    "type": "string",
-                    "default": "undefined",
-                    "description": "Minimum restriction fragment size to consider"
+                    "type": "integer",
+                    "default": "0",
+                    "description": "Minimum restriction fragment size to consider. Only values > 0 are considered"
                }
            }
        },