Merge pull request #76 from nservant/dev

add --fastq_chunks_size parameter

Merge pull request #76 from nservant/dev
add --fastq_chunks_size parameter
fcabe2f2 · Nicolas Servant · GitHub · 49e88497 · e5216e2f · fcabe2f2
Unverified Commit fcabe2f2 authored 4 years ago by Nicolas Servant Committed by GitHub 4 years ago
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -8,6 +8,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
 ### `Added`
 * Template update for nf-core/tools v1.10.2
+* Add the `--fastq_chunks_size` parameter to specify the number of reads per chunk when `--split_fastq` is set
 ### `Fixed`

--- a/bin/merge_statfiles.py
+++ b/bin/merge_statfiles.py
@@ -48,7 +48,7 @@ if __name__ == "__main__":
                if not line.startswith("#"):
                    lsp = line.strip().split("\t")
                    data = map(num, lsp[1:len(lsp)])
-                    template[str(lsp[0])] = data
+                    template[str(lsp[0])] = list(data)
        if len(template) == 0:
            print("Cannot find template files !")

--- a/docs/usage.md
+++ b/docs/usage.md
@@ -517,75 +517,79 @@ normalization. Default: 0.1
 ## Inputs/Outputs
-### `--splitFastq`
+### `--split_fastq`
 By default, the nf-core Hi-C pipeline expects one read pairs per sample.
 However, for large Hi-C data processing single fastq files can be very
 time consuming.
-The `--splitFastq` option allows to automatically split input read pairs
+The `--split_fastq` option automatically splits input read pairs
-into chunks of reads. In this case, all chunks will be processed in parallel
+into chunks of reads of size `--fastq_chunks_size` (Default: 20000000). In this case, all chunks will be processed in parallel
 and merged before generating the contact maps, thus leading to a significant
 increase of processing performance.
 ```bash
--splitFastq '[Number of reads per chunk]'
+--split_fastq --fastq_chunks_size '[numeric]'
 ```
-### `--saveReference`
+### `--save_reference`
 If specified, annotation files automatically generated from the `--fasta` file
 are exported in the results folder. Default: false
 ```bash
--saveReference
+--save_reference
 ```
-### `--saveAlignedIntermediates`
+### `--save_aligned_intermediates`
 If specified, all intermediate mapping files are saved and exported in the
 results folder. Default: false
 ```bash
--saveReference
+--save_aligned_intermediates
 ```
-### `--saveInteractionBAM`
+### `--save_interaction_bam`
 If specified, write a BAM file with all classified reads (valid paires,
 dangling end, self-circle, etc.) and its tags.
+```bash
+--save_interaction_bam
+```
 ## Skip options
-### `--skipMaps`
+### `--skip_maps`
 If defined, the workflow stops with the list of valid interactions, and the
 genome-wide maps are not built. Usefult for capture-C analysis. Default: false
 ```bash
--skipMaps
+--skip_maps
 ```
-### `--skipIce`
+### `--skip_ice`
 If defined, the ICE normalization is not run on the raw contact maps.
 Default: false
 ```bash
--skipIce
+--skip_ice
 ```
-### `--skipCool`
+### `--skip_cool`
 If defined, cooler files are not generated. Default: false
 ```bash
--skipCool
+--skip_cool
 ```
-### `--skipMultiQC`
+### `--skip_multiQC`
 If defined, the MultiQC report is not generated. Default: false
 ```bash
--skipMultiQC
+--skip_multiQC
 ```
--- a/main.nf
+++ b/main.nf
@@ -34,7 +34,8 @@ def helpMessage() {
      --save_reference [bool]                   Save reference genome to output folder. Default: False
    Alignments
-      --split_fastq [int]                       Size of read chuncks to use to speed up the workflow. Default: None
+      --split_fastq [bool]                      Split fastq files in reads chunks to speed up computation. Default: None
+      --fastq_chunks_size [int]                 Size of read chunks if split_fastq is true. Default: 20000000
      --save_aligned_intermediates [bool]       Save intermediates alignment files. Default: False 
      --bwt2_opts_end2end [str]                 Options for bowtie2 end-to-end mappinf (first mapping step). See hic.config for default.
      --bwt2_opts_trimmed [str]                 Options for bowtie2 mapping after ligation site trimming. See hic.config for default.
@@ -46,14 +47,14 @@ def helpMessage() {
      --rm_dup [bool]                           Remove duplicates. Default: true
    Contacts calling
-      --min_restriction_fragment_size [int]     Minimum size of restriction fragments to consider. Default: None
+      --min_restriction_fragment_size [int]     Minimum size of restriction fragments to consider. Default: 0
-      --max_restriction_fragment_size [int]     Maximum size of restriction fragments to consider. Default: None
+      --max_restriction_fragment_size [int]     Maximum size of restriction fragments to consider. Default: 0
-      --min_insert_size [int]                   Minimum insert size of mapped reads to consider. Default: None
+      --min_insert_size [int]                   Minimum insert size of mapped reads to consider. Default: 0
-      --max_insert_size [int]                   Maximum insert size of mapped reads to consider. Default: None
+      --max_insert_size [int]                   Maximum insert size of mapped reads to consider. Default: 0
      --save_interaction_bam [bool]             Save BAM file with interaction tags (dangling-end, self-circle, etc.). Default: False
      --dnase [bool]                            Run DNase Hi-C mode. All options related to restriction fragments are not considered. Default: False
-      --min_cis_dist [int]                      Minimum intra-chromosomal distance to consider. Default: None
+      --min_cis_dist [int]                      Minimum intra-chromosomal distance to consider. Default: 0
    Contact maps
      --bin_size [int]                          Bin size for contact maps (comma separated). Default: '1000000,500000'
@@ -163,7 +164,7 @@ if (params.input_paths){
 if ( params.split_fastq ){
   raw_reads_full = raw_reads.concat( raw_reads_2 )
-   raw_reads = raw_reads_full.splitFastq( by: params.split_fastq , file: true)
+   raw_reads = raw_reads_full.splitFastq( by: params.fastq_chunks_size, file: true)
 }else{
   raw_reads = raw_reads.concat( raw_reads_2 ).dump(tag: "data")
 }
@@ -239,6 +240,8 @@ if(workflow.revision) summary['Pipeline Release'] = workflow.revision
 summary['Run Name']         = custom_runName ?: workflow.runName
 summary['Input']            = params.input
 summary['splitFastq']       = params.split_fastq
+if (params.split_fastq)
+   summary['Read chunks Size'] = params.fastq_chunks_size
 summary['Fasta Ref']        = params.fasta
 summary['Restriction Motif']= params.restriction_site
 summary['Ligation Motif']   = params.ligation_site
@@ -647,12 +650,12 @@ if (!params.dnase){
      }
      def opts = ""
-      if ("$params.min_cis_dist".isInteger()) opts="${opts} -d ${params.min_cis_dist}"
+      opts += params.min_cis_dist > 0 ? " -d ${params.min_cis_dist}" : ''
-      if ("$params.min_insert_size".isInteger()) opts="${opts} -s ${params.min_insert_size}"
+      opts += params.min_insert_size > 0 ?  " -s ${params.min_insert_size}" : ''
-      if ("$params.max_insert_size".isInteger()) opts="${opts} -l ${params.max_insert_size}"
+      opts += params.max_insert_size > 0 ? " -l ${params.max_insert_size}" : ''
-      if ("$params.min_restriction_fragment_size".isInteger()) opts="${opts} -t ${params.min_restriction_fragment_size}"
+      opts += params.min_restriction_fragment_size > 0 ? " -t ${params.min_restriction_fragment_size}" : ''
-      if ("$params.max_restriction_fragment_size".isInteger()) opts="${opts} -m ${params.max_restriction_fragment_size}"
+      opts += params.max_restriction_fragment_size > 0 ? " -m ${params.max_restriction_fragment_size}" : ''
-      if (params.save_interaction_bam) opts="${opts} --sam"
+      opts += params.save_interaction_bam ? " --sam" : ''
      prefix = pe_bam.toString() - ~/.bam/
      """
      mapped_2hic_fragments.py -f ${frag_file} -r ${pe_bam} --all ${opts}
@@ -680,8 +683,7 @@ else{
         sample = sample.toString() - ~/(\.[0-9]+)$/
      }
-      def opts = ""
+      opts = params.min_cis_dist > 0 ? " -d ${params.min_cis_dist}" : ''
-      if ("$params.min_cis_dist".isInteger()) opts="${opts} -d ${params.min_cis_dist}"
      prefix = pe_bam.toString() - ~/.bam/
      """
      mapped_2hic_dnase.py -r ${pe_bam} ${opts}

--- a/nextflow.config
+++ b/nextflow.config
@@ -17,6 +17,7 @@ params {
  genome = false
  input_paths = false
  split_fastq = false
+  fastq_chunks_size = 20000000
  chromosome_size = false
  restriction_fragments = false
  skip_maps = false
@@ -34,12 +35,12 @@ params {
  // Digestion Hi-C
  restriction_site = 'A^AGCTT'
  ligation_site = 'AAGCTAGCTT'
-  min_restriction_fragment_size = false
+  min_restriction_fragment_size = 0
-  max_restriction_fragment_size = false
+  max_restriction_fragment_size = 0
-  min_insert_size = false
+  min_insert_size = 0
-  max_insert_size = false
+  max_insert_size = 0
  dnase = false
-  min_cis_dist = false
+  min_cis_dist = 0
  rm_dup = true
  rm_singleton = true
  rm_multi = true

--- a/nextflow_schema.json
+++ b/nextflow_schema.json
@@ -27,10 +27,16 @@
                    "default": "undefined"
                },
                "split_fastq": {
-                    "type": "number",
+                    "type": "boolean",
-                    "description": "Split the reads into chunks before running. Specify the number of reads per chuncks as --split_fastq 20000000.",
+                    "description": "Split the reads into chunks before running the pipeline",
-                    "fa_icon": "fas fa-dna"
+                    "fa_icon": "fas fa-dna",
+		    "default": "false"
                },
+		"fastq_chunks_size":{
+		    "type": "integer",
+		    "description": "Read number per chunks if split_fastq is used",
+		    "default": "20000000"
+		},
                "single_end": {
                    "type": "boolean",
                    "description": "Specifies that the input is single-end reads.",
@@ -178,29 +184,29 @@
            "fa_icon": "fas fa-signature",
            "properties": {
                "min_cis_dist": {
-                    "type": "string",
+                    "type": "integer",
-                    "default": "undefined",
+                    "default": "0",
-                    "description": "Minimum distance between loci to consider. Useful for --dnase mode to remove spurious ligation products"
+                    "description": "Minimum distance between loci to consider. Useful for --dnase mode to remove spurious ligation products. Only values > 0 are considered"
                },
                "max_insert_size": {
-                    "type": "string",
+                    "type": "integer",
-                    "default": "undefined",
+                    "default": "0",
-                    "description": "Maximum fragment size to consider"
+                    "description": "Maximum fragment size to consider. Only values > 0 are considered"
                },
                "min_insert_size": {
-                    "type": "string",
+                    "type": "integer",
-                    "default": "undefined",
+                    "default": "0",
-                    "description": "Minimum fragment size to consider"
+                    "description": "Minimum fragment size to consider. Only values > 0 are considered"
                },
                "max_restriction_fragment_size": {
-                    "type": "string",
+                    "type": "integer",
-                    "default": "undefined",
+                    "default": "0",
-                    "description": "Maximum restriction fragment size to consider"
+                    "description": "Maximum restriction fragment size to consider. Only values > 0 are considered"
                },
                "min_restriction_fragment_size": {
-                    "type": "string",
+                    "type": "integer",
-                    "default": "undefined",
+                    "default": "0",
-                    "description": "Minimum restriction fragment size to consider"
+                    "description": "Minimum restriction fragment size to consider. Only values > 0 are considered"
                }
            }
        },