Skip to content
Snippets Groups Projects
Commit e5216e2f authored by nservant's avatar nservant
Browse files

add --fastq_chunks_size parameter

parent 0af0b36b
Branches
Tags
No related merge requests found
......@@ -8,6 +8,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
### `Added`
* Template update for nf-core/tools v1.10.2
* Add the `--fastq_chunks_size` parameter to specify the number of reads per chunk if `--split_fastq` is true
### `Fixed`
......
......@@ -48,7 +48,7 @@ if __name__ == "__main__":
if not line.startswith("#"):
lsp = line.strip().split("\t")
data = map(num, lsp[1:len(lsp)])
template[str(lsp[0])] = data
template[str(lsp[0])] = list(data)
if len(template) == 0:
print("Cannot find template files !")
......
......@@ -517,75 +517,79 @@ normalization. Default: 0.1
## Inputs/Outputs
### `--splitFastq`
### `--split_fastq`
By default, the nf-core Hi-C pipeline expects one read pair per sample.
However, for large Hi-C datasets, processing a single pair of fastq files can be very
time consuming.
The `--splitFastq` option allows to automatically split input read pairs
into chunks of reads. In this case, all chunks will be processed in parallel
The `--split_fastq` option automatically splits input read pairs
into chunks of `--fastq_chunks_size` reads (Default: 20000000). In this case, all chunks will be processed in parallel
and merged before generating the contact maps, thus leading to a significant
increase of processing performance.
```bash
--splitFastq '[Number of reads per chunk]'
--split_fastq --fastq_chunks_size '[numeric]'
```
### `--saveReference`
### `--save_reference`
If specified, annotation files automatically generated from the `--fasta` file
are exported in the results folder. Default: false
```bash
--saveReference
--save_reference
```
### `--saveAlignedIntermediates`
### `--save_aligned_intermediates`
If specified, all intermediate mapping files are saved and exported in the
results folder. Default: false
```bash
--saveReference
--save_aligned_intermediates
```
### `--saveInteractionBAM`
### `--save_interaction_bam`
If specified, write a BAM file with all classified reads (valid pairs,
dangling end, self-circle, etc.) and its tags.
```bash
--save_interaction_bam
```
## Skip options
### `--skipMaps`
### `--skip_maps`
If defined, the workflow stops with the list of valid interactions, and the
genome-wide maps are not built. Useful for capture-C analysis. Default: false
```bash
--skipMaps
--skip_maps
```
### `--skipIce`
### `--skip_ice`
If defined, the ICE normalization is not run on the raw contact maps.
Default: false
```bash
--skipIce
--skip_ice
```
### `--skipCool`
### `--skip_cool`
If defined, cooler files are not generated. Default: false
```bash
--skipCool
--skip_cool
```
### `--skipMultiQC`
### `--skip_multiQC`
If defined, the MultiQC report is not generated. Default: false
```bash
--skipMultiQC
--skip_multiQC
```
......@@ -34,7 +34,8 @@ def helpMessage() {
--save_reference [bool] Save reference genome to output folder. Default: False
Alignments
--split_fastq [int] Size of read chuncks to use to speed up the workflow. Default: None
--split_fastq [bool] Split fastq files in reads chunks to speed up computation. Default: None
--fastq_chunks_size [int] Size of read chunks if split_fastq is true. Default: 20000000
--save_aligned_intermediates [bool] Save intermediates alignment files. Default: False
--bwt2_opts_end2end [str] Options for bowtie2 end-to-end mappinf (first mapping step). See hic.config for default.
--bwt2_opts_trimmed [str] Options for bowtie2 mapping after ligation site trimming. See hic.config for default.
......@@ -46,14 +47,14 @@ def helpMessage() {
--rm_dup [bool] Remove duplicates. Default: true
Contacts calling
--min_restriction_fragment_size [int] Minimum size of restriction fragments to consider. Default: None
--max_restriction_fragment_size [int] Maximum size of restriction fragments to consider. Default: None
--min_insert_size [int] Minimum insert size of mapped reads to consider. Default: None
--max_insert_size [int] Maximum insert size of mapped reads to consider. Default: None
--min_restriction_fragment_size [int] Minimum size of restriction fragments to consider. Default: 0
--max_restriction_fragment_size [int] Maximum size of restriction fragments to consider. Default: 0
--min_insert_size [int] Minimum insert size of mapped reads to consider. Default: 0
--max_insert_size [int] Maximum insert size of mapped reads to consider. Default: 0
--save_interaction_bam [bool] Save BAM file with interaction tags (dangling-end, self-circle, etc.). Default: False
--dnase [bool] Run DNase Hi-C mode. All options related to restriction fragments are not considered. Default: False
--min_cis_dist [int] Minimum intra-chromosomal distance to consider. Default: None
--min_cis_dist [int] Minimum intra-chromosomal distance to consider. Default: 0
Contact maps
--bin_size [int] Bin size for contact maps (comma separated). Default: '1000000,500000'
......@@ -163,7 +164,7 @@ if (params.input_paths){
if ( params.split_fastq ){
raw_reads_full = raw_reads.concat( raw_reads_2 )
raw_reads = raw_reads_full.splitFastq( by: params.split_fastq , file: true)
raw_reads = raw_reads_full.splitFastq( by: params.fastq_chunks_size, file: true)
}else{
raw_reads = raw_reads.concat( raw_reads_2 ).dump(tag: "data")
}
......@@ -239,6 +240,8 @@ if(workflow.revision) summary['Pipeline Release'] = workflow.revision
summary['Run Name'] = custom_runName ?: workflow.runName
summary['Input'] = params.input
summary['splitFastq'] = params.split_fastq
if (params.split_fastq)
summary['Read chunks Size'] = params.fastq_chunks_size
summary['Fasta Ref'] = params.fasta
summary['Restriction Motif']= params.restriction_site
summary['Ligation Motif'] = params.ligation_site
......@@ -647,12 +650,12 @@ if (!params.dnase){
}
def opts = ""
if ("$params.min_cis_dist".isInteger()) opts="${opts} -d ${params.min_cis_dist}"
if ("$params.min_insert_size".isInteger()) opts="${opts} -s ${params.min_insert_size}"
if ("$params.max_insert_size".isInteger()) opts="${opts} -l ${params.max_insert_size}"
if ("$params.min_restriction_fragment_size".isInteger()) opts="${opts} -t ${params.min_restriction_fragment_size}"
if ("$params.max_restriction_fragment_size".isInteger()) opts="${opts} -m ${params.max_restriction_fragment_size}"
if (params.save_interaction_bam) opts="${opts} --sam"
opts += params.min_cis_dist > 0 ? " -d ${params.min_cis_dist}" : ''
opts += params.min_insert_size > 0 ? " -s ${params.min_insert_size}" : ''
opts += params.max_insert_size > 0 ? " -l ${params.max_insert_size}" : ''
opts += params.min_restriction_fragment_size > 0 ? " -t ${params.min_restriction_fragment_size}" : ''
opts += params.max_restriction_fragment_size > 0 ? " -m ${params.max_restriction_fragment_size}" : ''
opts += params.save_interaction_bam ? " --sam" : ''
prefix = pe_bam.toString() - ~/.bam/
"""
mapped_2hic_fragments.py -f ${frag_file} -r ${pe_bam} --all ${opts}
......@@ -680,8 +683,7 @@ else{
sample = sample.toString() - ~/(\.[0-9]+)$/
}
def opts = ""
if ("$params.min_cis_dist".isInteger()) opts="${opts} -d ${params.min_cis_dist}"
opts = params.min_cis_dist > 0 ? " -d ${params.min_cis_dist}" : ''
prefix = pe_bam.toString() - ~/.bam/
"""
mapped_2hic_dnase.py -r ${pe_bam} ${opts}
......
......@@ -17,6 +17,7 @@ params {
genome = false
input_paths = false
split_fastq = false
fastq_chunks_size = 20000000
chromosome_size = false
restriction_fragments = false
skip_maps = false
......@@ -34,12 +35,12 @@ params {
// Digestion Hi-C
restriction_site = 'A^AGCTT'
ligation_site = 'AAGCTAGCTT'
min_restriction_fragment_size = false
max_restriction_fragment_size = false
min_insert_size = false
max_insert_size = false
min_restriction_fragment_size = 0
max_restriction_fragment_size = 0
min_insert_size = 0
max_insert_size = 0
dnase = false
min_cis_dist = false
min_cis_dist = 0
rm_dup = true
rm_singleton = true
rm_multi = true
......
......@@ -27,10 +27,16 @@
"default": "undefined"
},
"split_fastq": {
"type": "number",
"description": "Split the reads into chunks before running. Specify the number of reads per chuncks as --split_fastq 20000000.",
"fa_icon": "fas fa-dna"
"type": "boolean",
"description": "Split the reads into chunks before running the pipeline",
"fa_icon": "fas fa-dna",
"default": "false"
},
"fastq_chunks_size":{
"type": "integer",
"description": "Number of reads per chunk if split_fastq is used",
"default": "20000000"
},
"single_end": {
"type": "boolean",
"description": "Specifies that the input is single-end reads.",
......@@ -178,29 +184,29 @@
"fa_icon": "fas fa-signature",
"properties": {
"min_cis_dist": {
"type": "string",
"default": "undefined",
"description": "Minimum distance between loci to consider. Useful for --dnase mode to remove spurious ligation products"
"type": "integer",
"default": "0",
"description": "Minimum distance between loci to consider. Useful for --dnase mode to remove spurious ligation products. Only values > 0 are considered"
},
"max_insert_size": {
"type": "string",
"default": "undefined",
"description": "Maximum fragment size to consider"
"type": "integer",
"default": "0",
"description": "Maximum fragment size to consider. Only values > 0 are considered"
},
"min_insert_size": {
"type": "string",
"default": "undefined",
"description": "Minimum fragment size to consider"
"type": "integer",
"default": "0",
"description": "Minimum fragment size to consider. Only values > 0 are considered"
},
"max_restriction_fragment_size": {
"type": "string",
"default": "undefined",
"description": "Maximum restriction fragment size to consider"
"type": "integer",
"default": "0",
"description": "Maximum restriction fragment size to consider. Only values > 0 are considered"
},
"min_restriction_fragment_size": {
"type": "string",
"default": "undefined",
"description": "Minimum restriction fragment size to consider"
"type": "integer",
"default": "0",
"description": "Minimum restriction fragment size to consider. Only values > 0 are considered"
}
}
},
......
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Please register or to comment