diff --git a/.nf-core-lint.yml b/.nf-core-lint.yml deleted file mode 100644 index a24fddf163b17b87225a0073de318213abb59c16..0000000000000000000000000000000000000000 --- a/.nf-core-lint.yml +++ /dev/null @@ -1,3 +0,0 @@ -files_unchanged: - - .github/ISSUE_TEMPLATE/bug_report.md - - .github/PULL_REQUEST_TEMPLATE.md diff --git a/.nf-core.yml b/.nf-core.yml index a4978789d27e9e96e75d1f534409692d9aef16e3..3805dc81c144cd8f7bf7e49106934a313cb7667a 100644 --- a/.nf-core.yml +++ b/.nf-core.yml @@ -1,3 +1 @@ repository_type: pipeline -repository_type: pipeline -repository_type: pipeline diff --git a/CITATIONS.md b/CITATIONS.md index da253d248988442de777df56482ff1333cd7dce5..b5ba945a42bf069488594e576beba9fa95929c2f 100644 --- a/CITATIONS.md +++ b/CITATIONS.md @@ -1,5 +1,10 @@ # nf-core/hic: Citations +## [HiC-Pro](https://genomebiology.biomedcentral.com/articles/10.1186/s13059-015-0831-x) + +> Servant N, Varoquaux N, Lajoie BR, Viara E, Chen C, Vert JP, Dekker J, Heard E, Barillot E. Genome Biology 2015, 16:259 doi: [10.1186/s13059-015-0831-x](https://dx.doi.org/10.1186/s13059-015-0831-x) + + ## [nf-core](https://pubmed.ncbi.nlm.nih.gov/32055031/) > Ewels PA, Peltzer A, Fillinger S, Patel H, Alneberg J, Wilm A, Garcia MU, Di Tommaso P, Nahnsen S. The nf-core framework for community-curated bioinformatics pipelines. Nat Biotechnol. 2020 Mar;38(3):276-278. doi: 10.1038/s41587-020-0439-x. PubMed PMID: 32055031. diff --git a/README.md b/README.md index 3dd2b4b3fe48f429a3918f596631749fec36dd95..984759cb1339a8cef364f457a3e6c71652ed1674 100644 --- a/README.md +++ b/README.md @@ -16,20 +16,30 @@ ## Introduction -<!-- TODO nf-core: Write a 1-2 sentence summary of what data the pipeline is for and what it does --> **nf-core/hic** is a bioinformatics best-practice analysis pipeline for Analysis of Chromosome Conformation Capture data (Hi-C). The pipeline is built using [Nextflow](https://www.nextflow.io), a workflow tool to run tasks across multiple compute infrastructures in a very portable manner. It uses Docker/Singularity containers making installation trivial and results highly reproducible. The [Nextflow DSL2](https://www.nextflow.io/docs/latest/dsl2.html) implementation of this pipeline uses one container per process which makes it much easier to maintain and update software dependencies. Where possible, these processes have been submitted to and installed from [nf-core/modules](https://github.com/nf-core/modules) in order to make them available to all nf-core pipelines, and to everyone within the Nextflow community! -<!-- TODO nf-core: Add full-sized test dataset and amend the paragraph below if applicable --> On release, automated continuous integration tests run the pipeline on a full-sized dataset on the AWS cloud infrastructure. This ensures that the pipeline runs on AWS, has sensible resource allocation defaults set to run on real-world datasets, and permits the persistent storage of results to benchmark between pipeline releases and other analysis sources. The results obtained from the full-sized test can be viewed on the [nf-core website](https://nf-co.re/hic/results). ## Pipeline summary -<!-- TODO nf-core: Fill in short bullet-pointed list of the default steps in the pipeline --> +1. Read QC ([`FastQC`](https://www.bioinformatics.babraham.ac.uk/projects/fastqc/)) +2. Hi-C data processing + 1. with [`HiC-Pro`](https://github.com/nservant/HiC-Pro) + 1. Mapping using a two steps strategy to rescue reads spanning the ligation + sites ([`bowtie2`](http://bowtie-bio.sourceforge.net/bowtie2/index.shtml)) + 2. Detection of valid interaction products + 3. Duplicates removal + 4. Generate raw and normalized contact maps ([`iced`](https://github.com/hiclib/iced)) +3. Create genome-wide contact maps at various resolutions ([`cooler`](https://github.com/open2c/cooler)) +4. Contact maps normalization using balancing algorithm ([`cooler`](https://github.com/open2c/cooler)) +5. Export to various contact maps formats ([`HiC-Pro`](https://github.com/nservant/HiC-Pro), [`cooler`](https://github.com/open2c/cooler)) +6. Quality controls ([`HiC-Pro`](https://github.com/nservant/HiC-Pro), [`HiCExplorer`](https://github.com/deeptools/HiCExplorer)) +7. Compartments calling ([`cooltools`](https://cooltools.readthedocs.io/en/latest/)) +8. TADs calling ([`HiCExplorer`](https://github.com/deeptools/HiCExplorer), [`cooltools`](https://cooltools.readthedocs.io/en/latest/)) +9. Quality control report ([`MultiQC`](https://multiqc.info/)) -1. Read QC ([`FastQC`](https://www.bioinformatics.babraham.ac.uk/projects/fastqc/)) -2. Present QC for raw reads ([`MultiQC`](http://multiqc.info/)) ## Quick Start @@ -49,10 +59,8 @@ On release, automated continuous integration tests run the pipeline on a full-si 4. Start running your own analysis! - <!-- TODO nf-core: Update the example "typical command" below used to run the pipeline --> - ```console - nextflow run nf-core/hic -profile <docker/singularity/podman/shifter/charliecloud/conda/institute> --input samplesheet.csv --genome GRCh37 + nextflow run nf-core/hic -profile <docker/singularity/podman/shifter/charliecloud/conda/institute> --input samplesheet.csv --genome GRCh37 --digestion 'dpnii' ``` ## Documentation @@ -63,10 +71,6 @@ The nf-core/hic pipeline comes with documentation about the pipeline [usage](htt nf-core/hic was originally written by Nicolas Servant. -We thank the following people for their extensive assistance in the development of this pipeline: - -<!-- TODO nf-core: If applicable, make list of people who have also contributed --> - ## Contributions and Support If you would like to contribute to this pipeline, please see the [contributing guidelines](.github/CONTRIBUTING.md). @@ -75,10 +79,9 @@ For further information or help, don't hesitate to get in touch on the [Slack `# ## Citations -<!-- TODO nf-core: Add citation for pipeline after first release. Uncomment lines below and update Zenodo doi and badge at the top of this file. --> -<!-- If you use nf-core/hic for your analysis, please cite it using the following doi: [10.5281/zenodo.XXXXXX](https://doi.org/10.5281/zenodo.XXXXXX) --> -<!-- TODO nf-core: Add bibliography of tools and data used in your pipeline --> +If you use nf-core/hic for your analysis, please cite it using the following doi: doi: [10.5281/zenodo.2669513](https://doi.org/10.5281/zenodo.2669513) + An extensive list of references for the tools used by the pipeline can be found in the [`CITATIONS.md`](CITATIONS.md) file. You can cite the `nf-core` publication as follows: diff --git a/assets/multiqc_config.yaml b/assets/multiqc_config.yml similarity index 100% rename from assets/multiqc_config.yaml rename to assets/multiqc_config.yml diff --git a/bin/check_samplesheet.py b/bin/check_samplesheet.py index 9f6aaa35eeecaa04a4f33b9b8bdf0aa303936362..73d28f10252c72d7a6ffcec3f4b8742e40330b0e 100755 --- a/bin/check_samplesheet.py +++ b/bin/check_samplesheet.py @@ -1,6 +1,5 @@ #!/usr/bin/env python -# TODO nf-core: Update the script to check the samplesheet # This script is based on the example at: https://raw.githubusercontent.com/nf-core/test-datasets/viralrecon/samplesheet/samplesheet_test_illumina_amplicon.csv import os @@ -38,7 +37,6 @@ def print_error(error, context="Line", context_str=""): sys.exit(1) -# TODO nf-core: Update the check_samplesheet function def check_samplesheet(file_in, file_out): """ This function checks that the samplesheet follows the following structure: @@ -57,7 +55,6 @@ def check_samplesheet(file_in, file_out): ## Check header MIN_COLS = 2 - # TODO nf-core: Update the column names for the input samplesheet HEADER = ["sample", "fastq_1", "fastq_2"] header = [x.strip('"') for x in fin.readline().strip().split(",")] if header[: len(HEADER)] != HEADER: diff --git a/conf/base.config b/conf/base.config index 2e7db0c1db87efe04da28a0914de2e9130a1c112..7d29ffef7f7f03cc82e52b19ce8abe9fb3935330 100644 --- a/conf/base.config +++ b/conf/base.config @@ -10,10 +10,9 @@ process { - // TODO nf-core: Check the defaults for all processes cpus = { check_max( 1 * task.attempt, 'cpus' ) } - memory = { check_max( 6.GB * task.attempt, 'memory' ) } - time = { check_max( 4.h * task.attempt, 'time' ) } + memory = { check_max( 8.GB * task.attempt, 'memory' ) } + time = { check_max( 12.h * task.attempt, 'time' ) } errorStrategy = { task.exitStatus in [143,137,104,134,139] ? 'retry' : 'finish' } maxRetries = 1 @@ -24,21 +23,20 @@ process { // These labels are used and recognised by default in DSL2 files hosted on nf-core/modules. // If possible, it would be nice to keep the same label naming convention when // adding in your local modules too. - // TODO nf-core: Customise requirements for specific processes. // See https://www.nextflow.io/docs/latest/config.html#config-process-selectors withLabel:process_low { cpus = { check_max( 2 * task.attempt, 'cpus' ) } - memory = { check_max( 12.GB * task.attempt, 'memory' ) } + memory = { check_max( 4.GB * task.attempt, 'memory' ) } time = { check_max( 4.h * task.attempt, 'time' ) } } withLabel:process_medium { cpus = { check_max( 6 * task.attempt, 'cpus' ) } - memory = { check_max( 36.GB * task.attempt, 'memory' ) } + memory = { check_max( 8.GB * task.attempt, 'memory' ) } time = { check_max( 8.h * task.attempt, 'time' ) } } withLabel:process_high { cpus = { check_max( 12 * task.attempt, 'cpus' ) } - memory = { check_max( 72.GB * task.attempt, 'memory' ) } + memory = { check_max( 64.GB * task.attempt, 'memory' ) } time = { check_max( 16.h * task.attempt, 'time' ) } } withLabel:process_long { diff --git a/conf/modules.config b/conf/modules.config index 763ba3f5eb8dfeb64543febbf4c679d5ea7d4fa7..a934ac12d19f93e090e323e3d5bff585b878d1db 100644 --- a/conf/modules.config +++ b/conf/modules.config @@ -99,7 +99,7 @@ process { withName: 'GET_VALID_INTERACTION' { publishDir = [ path: { "${params.outdir}/hicpro/valid_pairs" }, - saveAs: { filename -> filename.equals('versions.yml') ? null : filename }, + saveAs: { filename -> filename.equals('versions.yml') ? null : filename}, mode: 'copy', enabled: params.save_pairs_intermediates ] diff --git a/docs/output.md b/docs/output.md index e2d35a1b9c9b5bd3b6c2f7948a3cf5bb469e0c62..8b3fd0a40579b5ee19f107acdf6f531a8d98702f 100644 --- a/docs/output.md +++ b/docs/output.md @@ -3,66 +3,275 @@ ## Introduction This document describes the output produced by the pipeline. Most of the plots are taken from the MultiQC report, which summarises results at the end of the pipeline. - The directories listed below will be created in the results directory after the pipeline has finished. All paths are relative to the top-level results directory. -<!-- TODO nf-core: Write this documentation describing your workflow's output --> - ## Pipeline overview -The pipeline is built using [Nextflow](https://www.nextflow.io/) and processes data using the following steps: +The pipeline is built using [Nextflow](https://www.nextflow.io/) +and processes data using the following steps: -* [FastQC](#fastqc) - Raw read QC -* [MultiQC](#multiqc) - Aggregate report describing results and QC from the whole pipeline -* [Pipeline information](#pipeline-information) - Report metrics generated during the workflow execution +* [HiC-Pro](#hicpro) + * [Reads alignment](#reads-alignment) + * [Valid pairs detection](#valid-pairs-detection) + * [Duplicates removal](#duplicates-removal) + * [Contact maps](#hicpro-contact-maps) +* [Hi-C contact maps](#hic-contact-maps) +* [Downstream analysis](#downstream-analysis) + * [Distance decay](#distance-decay) + * [Compartments calling](#compartments-calling) + * [TADs calling](#tads-calling) +* [MultiQC](#multiqc) - aggregate report and quality controls, describing +results of the whole pipeline +* [Export](#exprot) - additionnal export for compatibility with downstream +analysis tool and visualization -### FastQC +## HiC-Pro -<details markdown="1"> -<summary>Output files</summary> +The current version is mainly based on the +[HiC-Pro](https://github.com/nservant/HiC-Pro) pipeline. +For details about the workflow, see +[Servant et al. 2015](https://genomebiology.biomedcentral.com/articles/10.1186/s13059-015-0831-x) -* `fastqc/` - * `*_fastqc.html`: FastQC report containing quality metrics. - * `*_fastqc.zip`: Zip archive containing the FastQC report, tab-delimited data file and plot images. +### Reads alignment -</details> +Using Hi-C data, each reads mate has to be independantly aligned on the +reference genome. +The current workflow implements a two steps mapping strategy. First, the reads +are aligned using an end-to-end aligner. +Second, reads spanning the ligation junction are trimmmed from their 3' end, +and aligned back on the genome. +Aligned reads for both fragment mates are then paired in a single paired-end +BAM file. +Singletons are discarded, and multi-hits are filtered according to the +configuration parameters (`--rm-multi`). +Note that if the `--dnase` mode is activated, HiC-Pro will skip the second +mapping step. -[FastQC](http://www.bioinformatics.babraham.ac.uk/projects/fastqc/) gives general quality metrics about your sequenced reads. It provides information about the quality score distribution across your reads, per base sequence content (%A/T/G/C), adapter contamination and overrepresented sequences. For further reading and documentation see the [FastQC help pages](http://www.bioinformatics.babraham.ac.uk/projects/fastqc/Help/). +**Output directory: `results/hicpro/mapping`** - +* `*bwt2pairs.bam` - final BAM file with aligned paired data +* `*.pairstat` - mapping statistics - +if `--saveAlignedIntermediates` is specified, additional mapping file results +are available ; - +* `*.bam` - Aligned reads (R1 and R2) from end-to-end alignment +* `*_unmap.fastq` - Unmapped reads after end-to-end alignment +* `*_trimmed.fastq` - Trimmed reads after end-to-end alignment +* `*_trimmed.bam` - Alignment of trimmed reads +* `*bwt2merged.bam` - merged BAM file after the two-steps alignment +* `*.mapstat` - mapping statistics per read mate -> **NB:** The FastQC plots displayed in the MultiQC report shows _untrimmed_ reads. They may contain adapter sequence and potentially regions with low quality. +Usually, a high fraction of reads is expected to be aligned on the genome +(80-90%). Among them, we usually observed a few percent (around 10%) of step 2 +aligned reads. Those reads are chimeric fragments for which we detect a +ligation junction. An abnormal level of chimeric reads can reflect a ligation +issue during the library preparation. +The fraction of singleton or multi-hits depends on the genome complexity and +the fraction of unmapped reads. The fraction of singleton is usually close to +the sum of unmapped R1 and R2 reads, as it is unlikely that both mates from the +same pair were unmapped. -### MultiQC +### Valid pairs detection with HiC-Pro -<details markdown="1"> -<summary>Output files</summary> +Each aligned reads can be assigned to one restriction fragment according to the +reference genome and the digestion protocol. -* `multiqc/` - * `multiqc_report.html`: a standalone HTML file that can be viewed in your web browser. - * `multiqc_data/`: directory containing parsed statistics from the different tools used in the pipeline. - * `multiqc_plots/`: directory containing static images from the report in various formats. +Invalid pairs are classified as follow: -</details> +* Dangling end, i.e. unligated fragments (both reads mapped on the same +restriction fragment) +* Self circles, i.e. fragments ligated on themselves (both reads mapped on the +same restriction fragment in inverted orientation) +* Religation, i.e. ligation of juxtaposed fragments +* Filtered pairs, i.e. any pairs that do not match the filtering criteria on +inserts size, restriction fragments size +* Dumped pairs, i.e. any pairs for which we were not able to reconstruct the +ligation product. -[MultiQC](http://multiqc.info) is a visualization tool that generates a single HTML report summarising all samples in your project. Most of the pipeline QC results are visualised in the report and further statistics are available in the report data directory. +Only valid pairs involving two different restriction fragments are used to +build the contact maps. +Duplicated valid pairs associated to PCR artefacts are discarded +(see `--rm_dup`). -Results generated by MultiQC collate pipeline QC from supported tools e.g. FastQC. The pipeline has special steps which also allow the software versions to be reported in the MultiQC output for future traceability. For more information about how to use MultiQC reports, see <http://multiqc.info>. +In case of Hi-C protocols that do not require a restriction enzyme such as +DNase Hi-C or micro Hi-C, the assignment to a restriction is not possible +(see `--dnase`). +Short range interactions that are likely to be spurious ligation products +can thus be discarded using the `--min_cis_dist` parameter. -### Pipeline information +**Output directory: `results/hicpro/valid_pairs`** -<details markdown="1"> -<summary>Output files</summary> +* `*.validPairs` - List of valid ligation products +* `*.DEpairs` - List of dangling-end products +* `*.SCPairs` - List of self-circle products +* `*.REPairs` - List of religation products +* `*.FiltPairs` - List of filtered pairs +* `*RSstat` - Statitics of number of read pairs falling in each category -* `pipeline_info/` - * Reports generated by Nextflow: `execution_report.html`, `execution_timeline.html`, `execution_trace.txt` and `pipeline_dag.dot`/`pipeline_dag.svg`. - * Reports generated by the pipeline: `pipeline_report.html`, `pipeline_report.txt` and `software_versions.tsv`. - * Reformatted samplesheet files used as input to the pipeline: `samplesheet.valid.csv`. +The validPairs are stored using a simple tab-delimited text format ; + +```bash +read name / chr_reads1 / pos_reads1 / strand_reads1 / chr_reads2 / pos_reads2 / +strand_reads2 / fragment_size / res frag name R1 / res frag R2 / mapping qual R1 +/ mapping qual R2 [/ allele_specific_tag] +``` + +The ligation efficiency can be assessed using the filtering of valid and +invalid pairs. As the ligation is a random process, 25% of each valid ligation +class is expected. In the same way, a high level of dangling-end or self-circle +read pairs is associated with a low quality experiment, and reveals a problem +during the digestion, fill-in or ligation steps. + +In the context of Hi-C protocol without restriction enzyme, this analysis step +is skipped. The aligned pairs are therefore directly used to generate the +contact maps. A filter of the short range contact (typically <1kb) is +recommanded as this pairs are likely to be self ligation products. + +### Duplicates removal + +Note that validPairs file are generated per reads chunck. +These files are then merged in the allValidPairs file, and duplicates are +removed if the `--rm_dup` parameter is used. + +**Output directory: `results/hicpro/valid_pairs`** + +* `*allValidPairs` - combined valid pairs from all read chunks +* `*mergestat` - statistics about duplicates removal and valid pairs information + +Additional quality controls such as fragment size distribution can be extracted +from the list of valid interaction products. +We usually expect to see a distribution centered around 300 pb which correspond +to the paired-end insert size commonly used. +The fraction of dplicates is also presented. A high level of duplication +indicates a poor molecular complexity and a potential PCR bias. +Finaly, an important metric is to look at the fraction of intra and +inter-chromosomal interactions, as well as long range (>20kb) versus short +range (<20kb) intra-chromosomal interactions. + +### Contact maps + +Intra et inter-chromosomal contact maps are build for all specified resolutions. +The genome is splitted into bins of equal size. Each valid interaction is +associated with the genomic bins to generate the raw maps. +In addition, Hi-C data can contain several sources of biases which has to be +corrected. +The HiC-Pro workflow uses the [ìced](https://github.com/hiclib/iced) and +[Varoquaux and Servant, 2018](http://joss.theoj.org/papers/10.21105/joss.01286) +python package which proposes a fast implementation of the original ICE +normalization algorithm (Imakaev et al. 2012), making the assumption of equal +visibility of each fragment. + +Importantly, the HiC-Pro maps are generated only if the `--hicpro_maps` option +is specified on the command line. + +**Output directory: `results/hicpro/matrix`** + +* `*.matrix` - genome-wide contact maps +* `*_iced.matrix` - genome-wide iced contact maps + +The contact maps are generated for all specified resolutions +(see `--bin_size` argument). +A contact map is defined by : + +* A list of genomic intervals related to the specified resolution (BED format). +* A matrix, stored as standard triplet sparse format (i.e. list format). + +Based on the observation that a contact map is symmetric and usually sparse, +only non-zero values are stored for half of the matrix. The user can specified +if the 'upper', 'lower' or 'complete' matrix has to be stored. The 'asis' +option allows to store the contacts as they are observed from the valid pairs +files. + +```bash + A B 10 + A C 23 + B C 24 + (...) +``` + +This format is memory efficient, and is compatible with several software for +downstream analysis. + +## Hi-C contact maps + +Contact maps are usually stored as simple txt (`HiC-Pro`), .hic (`Juicer/Juicebox`) and .(m)cool (`cooler/Higlass`) formats. +Note that .cool and .hic format are compressed and usually much more efficient that the txt format. +In the current workflow, we propose to use the `cooler` format as a standard to build the raw and normalized maps +after valid pairs detection as it is used by several downstream analysis and visualization tools. + +Raw contact maps are therefore in **`results/contact_maps/raw`** which contains the different maps in `txt` and `cool` formats, at various resolutions. +Normalized contact maps are stored in **`results/contact_maps/norm`** which contains the different maps in `txt`, `cool`, and `mcool` format. + +Note that `txt` contact maps generated with `cooler` are identical to those generated by `HiC-Pro`. +However, differences can be observed on the normalized contact maps as the balancing algorithm is not the same. -</details> +## Downstream analysis + +Downstream analysis are performed from `cool` files at specified resolution. + +### Distance decay + +The distance decay plot shows the relationship between contact frequencies and genomic distance. It gives a good indication of the compaction of the genome. +According to the organism, the slope of the curve should fit the expectation of polymer physics models. + +The results generated with the `HiCExplorer hicPlotDistVsCounts` tool (plot and table) are available in the **`results/dist_decay/`** folder. + +### Compartments calling + +Compartments calling is one of the most common analysis which aims at detecting A (open, active) / B (close, inactive) compartments. +In the first studies on the subject, the compartments were called at high/medium resolution (1000000 to 250000) which is enough to call A/B comparments. +Analysis at higher resolution has shown that these two main types of compartments can be further divided into compartments subtypes. + +Although different methods have been proposed for compartment calling, the standard remains the eigen vector decomposition from the normalized correlation maps. +Here, we use the implementation available in the [`cooltools`](https://cooltools.readthedocs.io/en/lates) package. + +Results are available in **`results/compartments/`** folder and includes : + +* `*cis.vecs.tsv`: eigenvectors decomposition along the genome +* `*cis.lam.txt`: eigenvalues associated with the eigenvectors + +### TADs calling + +TADs has been described as functional units of the genome. +While contacts between genes and regulatority elements can occur within a single TADs, contacts between TADs are much less frequent, mainly due to the presence of insulation protein (such as CTCF) at their boundaries. Looking at Hi-C maps, TADs look like triangles around the diagonal. According to the contact map resolutions, TADs appear as hierarchical structures with a median size around 1Mb (in mammals), as well as smaller structures usually called sub-TADs of smaller size. + +TADs calling remains a challenging task, and even if many methods have been proposed in the last decade, little overlap have been found between their results. + +Currently, the pipeline proposes two approaches : + +* Insulation score using the [`cooltools`](https://cooltools.readthedocs.io/en/latest/cli.html#cooltools-diamond-insulation) package. Results are availabe in **`results/tads/insulation`**. +* [`HiCExplorer TADs calling`](https://hicexplorer.readthedocs.io/en/latest/content/tools/hicFindTADs.html). Results are available at **`results/tads/hicexplorer`**. + +Usually, TADs results are presented as simple BED files, or bigWig files, with the position of boundaries along the genome. + +## MultiQC + +[MultiQC](http://multiqc.info) is a visualisation tool that generates a single +HTML report summarising all samples in your project. Most of the pipeline QC +results are visualised in the report and further statistics are available in +within the report data directory. + +The pipeline has special steps which allow the software versions used to be +reported in the MultiQC output for future traceability. + +**Output files:** + +* `multiqc/` + * `multiqc_report.html`: a standalone HTML file that can be viewed in your web browser. + * `multiqc_data/`: directory containing parsed statistics from the different tools used in the pipeline. + * `multiqc_plots/`: directory containing static images from the report in various formats. + +## Pipeline information [Nextflow](https://www.nextflow.io/docs/latest/tracing.html) provides excellent functionality for generating various reports relevant to the running and execution of the pipeline. This will allow you to troubleshoot errors with the running of the pipeline, and also provide you with other information such as launch commands, run times and resource usage. + +**Output files:** + +* `pipeline_info/` + * Reports generated by Nextflow: `execution_report.html`, `execution_timeline.html`, + `execution_trace.txt` and `pipeline_dag.dot`/`pipeline_dag.svg`. + * Reports generated by the pipeline: `pipeline_report.html`, + `pipeline_report.txt` and `software_versions.csv`. + * Documentation for interpretation of results in HTML format: + `results_description.html`. diff --git a/docs/usage.md b/docs/usage.md index 79b33ec2f74790ac27c2c883fe932dfa222f63e6..800d44713563554482d79b8e165d06514ad921e3 100644 --- a/docs/usage.md +++ b/docs/usage.md @@ -6,65 +6,20 @@ ## Introduction -<!-- TODO nf-core: Add documentation about anything specific to running your pipeline. For general topics, please point to (and add to) the main nf-core website. --> - -## Samplesheet input - -You will need to create a samplesheet with information about the samples you would like to analyse before running the pipeline. Use this parameter to specify its location. It has to be a comma-separated file with 3 columns, and a header row as shown in the examples below. - -```console ---input '[path to samplesheet file]' -``` - -### Multiple runs of the same sample - -The `sample` identifiers have to be the same when you have re-sequenced the same sample more than once e.g. to increase sequencing depth. The pipeline will concatenate the raw reads before performing any downstream analysis. Below is an example for the same sample sequenced across 3 lanes: - -```console -sample,fastq_1,fastq_2 -CONTROL_REP1,AEG588A1_S1_L002_R1_001.fastq.gz,AEG588A1_S1_L002_R2_001.fastq.gz -CONTROL_REP1,AEG588A1_S1_L003_R1_001.fastq.gz,AEG588A1_S1_L003_R2_001.fastq.gz -CONTROL_REP1,AEG588A1_S1_L004_R1_001.fastq.gz,AEG588A1_S1_L004_R2_001.fastq.gz -``` - -### Full samplesheet - -The pipeline will auto-detect whether a sample is single- or paired-end using the information provided in the samplesheet. The samplesheet can have as many columns as you desire, however, there is a strict requirement for the first 3 columns to match those defined in the table below. - -A final samplesheet file consisting of both single- and paired-end data may look something like the one below. This is for 6 samples, where `TREATMENT_REP3` has been sequenced twice. - -```console -sample,fastq_1,fastq_2 -CONTROL_REP1,AEG588A1_S1_L002_R1_001.fastq.gz,AEG588A1_S1_L002_R2_001.fastq.gz -CONTROL_REP2,AEG588A2_S2_L002_R1_001.fastq.gz,AEG588A2_S2_L002_R2_001.fastq.gz -CONTROL_REP3,AEG588A3_S3_L002_R1_001.fastq.gz,AEG588A3_S3_L002_R2_001.fastq.gz -TREATMENT_REP1,AEG588A4_S4_L003_R1_001.fastq.gz, -TREATMENT_REP2,AEG588A5_S5_L003_R1_001.fastq.gz, -TREATMENT_REP3,AEG588A6_S6_L003_R1_001.fastq.gz, -TREATMENT_REP3,AEG588A6_S6_L004_R1_001.fastq.gz, -``` - -| Column | Description | -|----------------|----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------| -| `sample` | Custom sample name. This entry will be identical for multiple sequencing libraries/runs from the same sample. Spaces in sample names are automatically converted to underscores (`_`). | -| `fastq_1` | Full path to FastQ file for Illumina short reads 1. File has to be gzipped and have the extension ".fastq.gz" or ".fq.gz". | -| `fastq_2` | Full path to FastQ file for Illumina short reads 2. File has to be gzipped and have the extension ".fastq.gz" or ".fq.gz". | - -An [example samplesheet](../assets/samplesheet.csv) has been provided with the pipeline. - ## Running the pipeline The typical command for running the pipeline is as follows: -```console -nextflow run nf-core/hic --input samplesheet.csv --genome GRCh37 -profile docker +```bash +nextflow run nf-core/hic --input '*_R{1,2}.fastq.gz' -profile docker ``` -This will launch the pipeline with the `docker` configuration profile. See below for more information about profiles. +This will launch the pipeline with the `docker` configuration profile. +See below for more information about profiles. Note that the pipeline will create the following files in your working directory: -```console +```bash work # Directory containing the nextflow working files results # Finished results (configurable, see below) .nextflow_log # Log file from Nextflow @@ -75,210 +30,673 @@ results # Finished results (configurable, see below) When you run the above command, Nextflow automatically pulls the pipeline code from GitHub and stores it as a cached version. When running the pipeline after this, it will always use the cached version if available - even if the pipeline has been updated since. To make sure that you're running the latest version of the pipeline, make sure that you regularly update the cached version of the pipeline: -```console +```bash nextflow pull nf-core/hic ``` ### Reproducibility -It is a good idea to specify a pipeline version when running the pipeline on your data. This ensures that a specific version of the pipeline code and software are used when you run your pipeline. If you keep using the same tag, you'll be running the same version of the pipeline, even if there have been changes to the code since. +It's a good idea to specify a pipeline version when running the pipeline on your data. This ensures that a specific version of the pipeline code and software are used when you run your pipeline. If you keep using the same tag, you'll be running the same version of the pipeline, even if there have been changes to the code since. -First, go to the [nf-core/hic releases page](https://github.com/nf-core/hic/releases) and find the latest version number - numeric only (eg. `1.3.1`). Then specify this when running the pipeline with `-r` (one hyphen) - eg. `-r 1.3.1`. +First, go to the +[nf-core/hic releases page](https://github.com/nf-core/hic/releases) and find +the latest version number - numeric only (eg. `1.3.1`). +Then specify this when running the pipeline with `-r` (one hyphen) +eg. `-r 1.3.1`. -This version number will be logged in reports when you run the pipeline, so that you'll know what you used when you look back in the future. +This version number will be logged in reports when you run the pipeline, so +that you'll know what you used when you look back in the future. + +### Automatic resubmission + +Each step in the pipeline has a default set of requirements for number of CPUs, +memory and time. For most of the steps in the pipeline, if the job exits with +an error code of `143` (exceeded requested resources) it will automatically +resubmit with higher requests (2 x original, then 3 x original). If it still +fails after three times then the pipeline is stopped. ## Core Nextflow arguments -> **NB:** These options are part of Nextflow and use a _single_ hyphen (pipeline parameters use a double-hyphen). +> **NB:** These options are part of Nextflow and use a _single_ hyphen +(pipeline parameters use a double-hyphen). ### `-profile` -Use this parameter to choose a configuration profile. Profiles can give configuration presets for different compute environments. +Use this parameter to choose a configuration profile. Profiles can give +configuration presets for different compute environments. -Several generic profiles are bundled with the pipeline which instruct the pipeline to use software packaged using different methods (Docker, Singularity, Podman, Shifter, Charliecloud, Conda) - see below. When using Biocontainers, most of these software packaging methods pull Docker containers from quay.io e.g [FastQC](https://quay.io/repository/biocontainers/fastqc) except for Singularity which directly downloads Singularity images via https hosted by the [Galaxy project](https://depot.galaxyproject.org/singularity/) and Conda which downloads and installs software locally from [Bioconda](https://bioconda.github.io/). +Several generic profiles are bundled with the pipeline which instruct the pipeline to use software packaged using different methods (Docker, Singularity, Podman, Shifter, Charliecloud, Conda) - see below. -> We highly recommend the use of Docker or Singularity containers for full pipeline reproducibility, however when this is not possible, Conda is also supported. +> We highly recommend the use of Docker or Singularity containers for full +pipeline reproducibility, however when this is not possible, Conda is also supported. -The pipeline also dynamically loads configurations from [https://github.com/nf-core/configs](https://github.com/nf-core/configs) when it runs, making multiple config profiles for various institutional clusters available at run time. For more information and to see if your system is available in these configs please see the [nf-core/configs documentation](https://github.com/nf-core/configs#documentation). +The pipeline also dynamically loads configurations from +[https://github.com/nf-core/configs](https://github.com/nf-core/configs) +when it runs, making multiple config profiles for various institutional +clusters available at run time. +For more information and to see if your system is available in these +configs please see +the [nf-core/configs documentation](https://github.com/nf-core/configs#documentation). -Note that multiple profiles can be loaded, for example: `-profile test,docker` - the order of arguments is important! -They are loaded in sequence, so later profiles can overwrite earlier profiles. +Note that multiple profiles can be loaded, for example: `-profile test,docker` - +the order of arguments is important! +They are loaded in sequence, so later profiles can overwrite +earlier profiles. -If `-profile` is not specified, the pipeline will run locally and expect all software to be installed and available on the `PATH`. This is _not_ recommended. +If `-profile` is not specified, the pipeline will run locally and +expect all software to be +installed and available on the `PATH`. This is _not_ recommended. * `docker` - * A generic configuration profile to be used with [Docker](https://docker.com/) + * A generic configuration profile to be used with [Docker](https://docker.com/) + * Pulls software from Docker Hub: [`nfcore/hic`](https://hub.docker.com/r/nfcore/hic/) * `singularity` - * A generic configuration profile to be used with [Singularity](https://sylabs.io/docs/) + * A generic configuration profile to be used with [Singularity](https://sylabs.io/docs/) + * Pulls software from Docker Hub: [`nfcore/hic`](https://hub.docker.com/r/nfcore/hic/) * `podman` - * A generic configuration profile to be used with [Podman](https://podman.io/) + * A generic configuration profile to be used with [Podman](https://podman.io/) + * Pulls software from Docker Hub: [`nfcore/hic`](https://hub.docker.com/r/nfcore/hic/) * `shifter` - * A generic configuration profile to be used with [Shifter](https://nersc.gitlab.io/development/shifter/how-to-use/) + * A generic configuration profile to be used with [Shifter](https://nersc.gitlab.io/development/shifter/how-to-use/) + * Pulls software from Docker Hub: [`nfcore/hic`](https://hub.docker.com/r/nfcore/hic/) * `charliecloud` - * A generic configuration profile to be used with [Charliecloud](https://hpc.github.io/charliecloud/) + * A generic configuration profile to be used with [Charliecloud](https://hpc.github.io/charliecloud/) + * Pulls software from Docker Hub: [`nfcore/hic`](https://hub.docker.com/r/nfcore/hic/) * `conda` - * A generic configuration profile to be used with [Conda](https://conda.io/docs/). Please only use Conda as a last resort i.e. when it's not possible to run the pipeline with Docker, Singularity, Podman, Shifter or Charliecloud. + * Please only use Conda as a last resort i.e. when it's not possible to run the pipeline with Docker, Singularity, Podman, Shifter or Charliecloud. + * A generic configuration profile to be used with [Conda](https://conda.io/docs/) + * Pulls most software from [Bioconda](https://bioconda.github.io/) * `test` - * A profile with a complete configuration for automated testing - * Includes links to test data so needs no other parameters + * A profile with a complete configuration for automated testing + * Includes links to test data so needs no other parameters ### `-resume` -Specify this when restarting a pipeline. Nextflow will used cached results from any pipeline steps where the inputs are the same, continuing from where it got to previously. - -You can also supply a run name to resume a specific run: `-resume [run-name]`. Use the `nextflow log` command to show previous run names. +Specify this when restarting a pipeline. Nextflow will used cached results from +any pipeline steps where the inputs are the same, continuing from where it got +to previously. +You can also supply a run name to resume a specific run: `-resume [run-name]`. +Use the `nextflow log` command to show previous run names. ### `-c` -Specify the path to a specific config file (this is a core Nextflow command). See the [nf-core website documentation](https://nf-co.re/usage/configuration) for more information. +Specify the path to a specific config file (this is a core Nextflow command). +See the [nf-core website documentation](https://nf-co.re/usage/configuration) +for more information. + +#### Custom resource requests + +Each step in the pipeline has a default set of requirements for number of CPUs, +memory and time. For most of the steps in the pipeline, if the job exits with +an error code of `143` (exceeded requested resources) it will automatically resubmit +with higher requests (2 x original, then 3 x original). If it still fails after three +times then the pipeline is stopped. + +Whilst these default requirements will hopefully work for most people with most data, +you may find that you want to customise the compute resources that the pipeline requests. +You can do this by creating a custom config file. For example, to give the workflow +process `star` 32GB of memory, you could use the following config: + +```nextflow +process { + withName: star { + memory = 32.GB + } +} +``` + +To find the exact name of a process you wish to modify the compute resources, check the live-status of a nextflow run displayed on your terminal or check the nextflow error for a line like so: `Error executing process > 'bowtie2_end_to_end'`. In this case the name to specify in the custom config file is `bowtie2_end_to_end`. + +See the main [Nextflow documentation](https://www.nextflow.io/docs/latest/config.html) for more information. -## Custom configuration +If you are likely to be running `nf-core` pipelines regularly it may be a good idea to request that your custom config file is uploaded to the `nf-core/configs` git repository. Before you do this please can you test that the config file works with your pipeline of choice using the `-c` parameter (see definition above). You can then create a pull request to the `nf-core/configs` repository with the addition of your config file, associated documentation file (see examples in [`nf-core/configs/docs`](https://github.com/nf-core/configs/tree/master/docs)), and amending [`nfcore_custom.config`](https://github.com/nf-core/configs/blob/master/nfcore_custom.config) to include your custom profile. -### Resource requests +If you have any questions or issues please send us a message on +[Slack](https://nf-co.re/join/slack) on the +[`#configs` channel](https://nfcore.slack.com/channels/configs). -Whilst the default requirements set within the pipeline will hopefully work for most people and with most input data, you may find that you want to customise the compute resources that the pipeline requests. Each step in the pipeline has a default set of requirements for number of CPUs, memory and time. For most of the steps in the pipeline, if the job exits with any of the error codes specified [here](https://github.com/nf-core/rnaseq/blob/4c27ef5610c87db00c3c5a3eed10b1d161abf575/conf/base.config#L18) it will automatically be resubmitted with higher requests (2 x original, then 3 x original). If it still fails after the third attempt then the pipeline execution is stopped. +### Running in the background -For example, if the nf-core/rnaseq pipeline is failing after multiple re-submissions of the `STAR_ALIGN` process due to an exit code of `137` this would indicate that there is an out of memory issue: +Nextflow handles job submissions and supervises the running jobs. +The Nextflow process must run until the pipeline is finished. -```console -[62/149eb0] NOTE: Process `RNASEQ:ALIGN_STAR:STAR_ALIGN (WT_REP1)` terminated with an error exit status (137) -- Execution is retried (1) -Error executing process > 'RNASEQ:ALIGN_STAR:STAR_ALIGN (WT_REP1)' +The Nextflow `-bg` flag launches Nextflow in the background, detached from your terminal +so that the workflow does not stop if you log out of your session. The logs are +saved to a file. -Caused by: - Process `RNASEQ:ALIGN_STAR:STAR_ALIGN (WT_REP1)` terminated with an error exit status (137) +Alternatively, you can use `screen` / `tmux` or similar tool to create a detached +session which you can log back into at a later time. +Some HPC setups also allow you to run nextflow within a cluster job submitted +your job scheduler (from where it submits more jobs). -Command executed: - STAR \ - --genomeDir star \ - --readFilesIn WT_REP1_trimmed.fq.gz \ - --runThreadN 2 \ - --outFileNamePrefix WT_REP1. \ - <TRUNCATED> +#### Nextflow memory requirements -Command exit status: - 137 +In some cases, the Nextflow Java virtual machines can start to request a +large amount of memory. +We recommend adding the following line to your environment to limit this +(typically in `~/.bashrc` or `~./bash_profile`): + +```bash +NXF_OPTS='-Xms1g -Xmx4g' +``` -Command output: - (empty) +## Use case -Command error: - .command.sh: line 9: 30 Killed STAR --genomeDir star --readFilesIn WT_REP1_trimmed.fq.gz --runThreadN 2 --outFileNamePrefix WT_REP1. <TRUNCATED> -Work dir: - /home/pipelinetest/work/9d/172ca5881234073e8d76f2a19c88fb +### Hi-C digestion protocol -Tip: you can replicate the issue by changing to the process work dir and entering the command `bash .command.run` +Here is an command line example for standard DpnII digestion protocols. +Alignment will be performed on the `mm10` genome with default parameters. +Multi-hits will not be considered and duplicates will be removed. +Note that by default, no filters are applied on DNA and restriction fragment sizes. + +```bash +nextflow run main.nf --input './*_R{1,2}.fastq.gz' --genome 'mm10' --digestion 'dnpii' ``` -To bypass this error you would need to find exactly which resources are set by the `STAR_ALIGN` process. The quickest way is to search for `process STAR_ALIGN` in the [nf-core/rnaseq Github repo](https://github.com/nf-core/rnaseq/search?q=process+STAR_ALIGN). We have standardised the structure of Nextflow DSL2 pipelines such that all module files will be present in the `modules/` directory and so based on the search results the file we want is `modules/nf-core/software/star/align/main.nf`. If you click on the link to that file you will notice that there is a `label` directive at the top of the module that is set to [`label process_high`](https://github.com/nf-core/rnaseq/blob/4c27ef5610c87db00c3c5a3eed10b1d161abf575/modules/nf-core/software/star/align/main.nf#L9). The [Nextflow `label`](https://www.nextflow.io/docs/latest/process.html#label) directive allows us to organise workflow processes in separate groups which can be referenced in a configuration file to select and configure subset of processes having similar computing requirements. The default values for the `process_high` label are set in the pipeline's [`base.config`](https://github.com/nf-core/rnaseq/blob/4c27ef5610c87db00c3c5a3eed10b1d161abf575/conf/base.config#L33-L37) which in this case is defined as 72GB. Providing you haven't set any other standard nf-core parameters to __cap__ the [maximum resources](https://nf-co.re/usage/configuration#max-resources) used by the pipeline then we can try and bypass the `STAR_ALIGN` process failure by creating a custom config file that sets at least 72GB of memory, in this case increased to 100GB. The custom config below can then be provided to the pipeline via the [`-c`](#-c) parameter as highlighted in previous sections. +### DNase Hi-C protocol -```nextflow -process { - withName: STAR_ALIGN { - memory = 100.GB - } -} +Here is an command line example for DNase protocol. +Alignment will be performed on the `mm10` genome with default paramters. +Multi-hits will not be considered and duplicates will be removed. +Contacts involving fragments separated by less than 1000bp will be discarded. + +```bash +nextflow run main.nf --input './*_R{1,2}.fastq.gz' --genome 'mm10' --dnase --min_cis 1000 ``` -> **NB:** We specify just the process name i.e. `STAR_ALIGN` in the config file and not the full task name string that is printed to screen in the error message or on the terminal whilst the pipeline is running i.e. `RNASEQ:ALIGN_STAR:STAR_ALIGN`. You may get a warning suggesting that the process selector isn't recognised but you can ignore that if the process name has been specified correctly. This is something that needs to be fixed upstream in core Nextflow. +## Inputs -### Tool-specific options +### `--input` -For the ultimate flexibility, we have implemented and are using Nextflow DSL2 modules in a way where it is possible for both developers and users to change tool-specific command-line arguments (e.g. providing an additional command-line argument to the `STAR_ALIGN` process) as well as publishing options (e.g. saving files produced by the `STAR_ALIGN` process that aren't saved by default by the pipeline). In the majority of instances, as a user you won't have to change the default options set by the pipeline developer(s), however, there may be edge cases where creating a simple custom config file can improve the behaviour of the pipeline if for example it is failing due to a weird error that requires setting a tool-specific parameter to deal with smaller / larger genomes. +Use this to specify the location of your input FastQ files. For example: -The command-line arguments passed to STAR in the `STAR_ALIGN` module are a combination of: +```bash +--input 'path/to/data/sample_*_{1,2}.fastq' +``` -* Mandatory arguments or those that need to be evaluated within the scope of the module, as supplied in the [`script`](https://github.com/nf-core/rnaseq/blob/4c27ef5610c87db00c3c5a3eed10b1d161abf575/modules/nf-core/software/star/align/main.nf#L49-L55) section of the module file. +Please note the following requirements: -* An [`options.args`](https://github.com/nf-core/rnaseq/blob/4c27ef5610c87db00c3c5a3eed10b1d161abf575/modules/nf-core/software/star/align/main.nf#L56) string of non-mandatory parameters that is set to be empty by default in the module but can be overwritten when including the module in the sub-workflow / workflow context via the `addParams` Nextflow option. +1. The path must be enclosed in quotes +2. The path must have at least one `*` wildcard character +3. When using the pipeline with paired end data, the path must use `{1,2}` +notation to specify read pairs. -The nf-core/rnaseq pipeline has a sub-workflow (see [terminology](https://github.com/nf-core/modules#terminology)) specifically to align reads with STAR and to sort, index and generate some basic stats on the resulting BAM files using SAMtools. At the top of this file we import the `STAR_ALIGN` module via the Nextflow [`include`](https://github.com/nf-core/rnaseq/blob/4c27ef5610c87db00c3c5a3eed10b1d161abf575/subworkflows/nf-core/align_star.nf#L10) keyword and by default the options passed to the module via the `addParams` option are set as an empty Groovy map [here](https://github.com/nf-core/rnaseq/blob/4c27ef5610c87db00c3c5a3eed10b1d161abf575/subworkflows/nf-core/align_star.nf#L5); this in turn means `options.args` will be set to empty by default in the module file too. This is an intentional design choice and allows us to implement well-written sub-workflows composed of a chain of tools that by default run with the bare minimum parameter set for any given tool in order to make it much easier to share across pipelines and to provide the flexibility for users and developers to customise any non-mandatory arguments. +If left unspecified, a default pattern is used: `data/*{1,2}.fastq.gz` -When including the sub-workflow above in the main pipeline workflow we use the same `include` statement, however, we now have the ability to overwrite options for each of the tools in the sub-workflow including the [`align_options`](https://github.com/nf-core/rnaseq/blob/4c27ef5610c87db00c3c5a3eed10b1d161abf575/workflows/rnaseq.nf#L225) variable that will be used specifically to overwrite the optional arguments passed to the `STAR_ALIGN` module. In this case, the options to be provided to `STAR_ALIGN` have been assigned sensible defaults by the developer(s) in the pipeline's [`modules.config`](https://github.com/nf-core/rnaseq/blob/4c27ef5610c87db00c3c5a3eed10b1d161abf575/conf/modules.config#L70-L74) and can be accessed and customised in the [workflow context](https://github.com/nf-core/rnaseq/blob/4c27ef5610c87db00c3c5a3eed10b1d161abf575/workflows/rnaseq.nf#L201-L204) too before eventually passing them to the sub-workflow as a Groovy map called `star_align_options`. These options will then be propagated from `workflow -> sub-workflow -> module`. +Note that the Hi-C data analysis requires paired-end data. -As mentioned at the beginning of this section it may also be necessary for users to overwrite the options passed to modules to be able to customise specific aspects of the way in which a particular tool is executed by the pipeline. Given that all of the default module options are stored in the pipeline's `modules.config` as a [`params` variable](https://github.com/nf-core/rnaseq/blob/4c27ef5610c87db00c3c5a3eed10b1d161abf575/conf/modules.config#L24-L25) it is also possible to overwrite any of these options via a custom config file. +## Reference genomes -Say for example we want to append an additional, non-mandatory parameter (i.e. `--outFilterMismatchNmax 16`) to the arguments passed to the `STAR_ALIGN` module. Firstly, we need to copy across the default `args` specified in the [`modules.config`](https://github.com/nf-core/rnaseq/blob/4c27ef5610c87db00c3c5a3eed10b1d161abf575/conf/modules.config#L71) and create a custom config file that is a composite of the default `args` as well as the additional options you would like to provide. This is very important because Nextflow will overwrite the default value of `args` that you provide via the custom config. +The pipeline config files come bundled with paths to the illumina iGenomes reference +index files. If running with docker or AWS, the configuration is set up to use the +[AWS-iGenomes](https://ewels.github.io/AWS-iGenomes/) resource. -As you will see in the example below, we have: +### `--genome` (using iGenomes) -* appended `--outFilterMismatchNmax 16` to the default `args` used by the module. -* changed the default `publish_dir` value to where the files will eventually be published in the main results directory. -* appended `'bam':''` to the default value of `publish_files` so that the BAM files generated by the process will also be saved in the top-level results directory for the module. Note: `'out':'log'` means any file/directory ending in `out` will now be saved in a separate directory called `my_star_directory/log/`. +There are many different species supported in the iGenomes references. To run +the pipeline, you must specify which to use with the `--genome` flag. -```nextflow -params { - modules { - 'star_align' { - args = "--quantMode TranscriptomeSAM --twopassMode Basic --outSAMtype BAM Unsorted --readFilesCommand zcat --runRNGseed 0 --outFilterMultimapNmax 20 --alignSJDBoverhangMin 1 --outSAMattributes NH HI AS NM MD --quantTranscriptomeBan Singleend --outFilterMismatchNmax 16" - publish_dir = "my_star_directory" - publish_files = ['out':'log', 'tab':'log', 'bam':''] - } - } -} +You can find the keys to specify the genomes in the +[iGenomes config file](../conf/igenomes.config). + +### `--fasta` + +If you prefer, you can specify the full path to your reference genome when you +run the pipeline: + +```bash +--fasta '[path to Fasta reference]' ``` -### Updating containers +### `--bwt2_index` -The [Nextflow DSL2](https://www.nextflow.io/docs/latest/dsl2.html) implementation of this pipeline uses one container per process which makes it much easier to maintain and update software dependencies. If for some reason you need to use a different version of a particular tool with the pipeline then you just need to identify the `process` name and override the Nextflow `container` definition for that process using the `withName` declaration. For example, in the [nf-core/viralrecon](https://nf-co.re/viralrecon) pipeline a tool called [Pangolin](https://github.com/cov-lineages/pangolin) has been used during the COVID-19 pandemic to assign lineages to SARS-CoV-2 genome sequenced samples. Given that the lineage assignments change quite frequently it doesn't make sense to re-release the nf-core/viralrecon everytime a new version of Pangolin has been released. However, you can override the default container used by the pipeline by creating a custom config file and passing it as a command-line argument via `-c custom.config`. +The bowtie2 indexes are required to align the data with the HiC-Pro workflow. If the +`--bwt2_index` is not specified, the pipeline will either use the igenome +bowtie2 indexes (see `--genome` option) or build the indexes on-the-fly +(see `--fasta` option) -1. Check the default version used by the pipeline in the module file for [Pangolin](https://github.com/nf-core/viralrecon/blob/a85d5969f9025409e3618d6c280ef15ce417df65/modules/nf-core/software/pangolin/main.nf#L14-L19) -2. Find the latest version of the Biocontainer available on [Quay.io](https://quay.io/repository/biocontainers/pangolin?tag=latest&tab=tags) -3. Create the custom config accordingly: +```bash +--bwt2_index '[path to bowtie2 index]' +``` - * For Docker: +### `--chromosome_size` + +The Hi-C pipeline will also requires a two-columns text file with the +chromosome name and its size (tab separated). +If not specified, this file will be automatically created by the pipeline. +In the latter case, the `--fasta` reference genome has to be specified. + +```bash + chr1 249250621 + chr2 243199373 + chr3 198022430 + chr4 191154276 + chr5 180915260 + chr6 171115067 + chr7 159138663 + chr8 146364022 + chr9 141213431 + chr10 135534747 + (...) +``` - ```nextflow - process { - withName: PANGOLIN { - container = 'quay.io/biocontainers/pangolin:3.0.5--pyhdfd78af_0' - } - } - ``` +```bash +--chromosome_size '[path to chromosome size file]' +``` - * For Singularity: +### `--restriction_fragments` + +Finally, Hi-C experiments based on restriction enzyme digestion requires a BED +file with coordinates of restriction fragments. + +```bash + chr1 0 16007 HIC_chr1_1 0 + + chr1 16007 24571 HIC_chr1_2 0 + + chr1 24571 27981 HIC_chr1_3 0 + + chr1 27981 30429 HIC_chr1_4 0 + + chr1 30429 32153 HIC_chr1_5 0 + + chr1 32153 32774 HIC_chr1_6 0 + + chr1 32774 37752 HIC_chr1_7 0 + + chr1 37752 38369 HIC_chr1_8 0 + + chr1 38369 38791 HIC_chr1_9 0 + + chr1 38791 39255 HIC_chr1_10 0 + + (...) +``` - ```nextflow - process { - withName: PANGOLIN { - container = 'https://depot.galaxyproject.org/singularity/pangolin:3.0.5--pyhdfd78af_0' - } - } - ``` +If not specified, this file will be automatically created by the pipline. +In this case, the `--fasta` reference genome will be used. +Note that the `digestion` or `--restriction_site` parameter is mandatory to create this file. - * For Conda: +## Hi-C specific options - ```nextflow - process { - withName: PANGOLIN { - conda = 'bioconda::pangolin=3.0.5' - } - } - ``` +The following options are defined in the `nextflow.config` file, and can be +updated either using a custom configuration file (see `-c` option) or using +command line parameter. -> **NB:** If you wish to periodically update individual tool-specific results (e.g. Pangolin) generated by the pipeline then you must ensure to keep the `work/` directory otherwise the `-resume` ability of the pipeline will be compromised and it will restart from scratch. +### HiC-pro mapping -### nf-core/configs +The reads mapping is currently based on the two-steps strategy implemented in +the HiC-pro pipeline. The idea is to first align reads from end-to-end. +Reads that do not aligned are then trimmed at the ligation site, and their 5' +end is re-aligned to the reference genome. +Note that the default option are quite stringent, and can be updated according +to the reads quality or the reference genome. -In most cases, you will only need to create a custom config as a one-off but if you and others within your organisation are likely to be running nf-core pipelines regularly and need to use the same settings regularly it may be a good idea to request that your custom config file is uploaded to the `nf-core/configs` git repository. Before you do this please can you test that the config file works with your pipeline of choice using the `-c` parameter. You can then create a pull request to the `nf-core/configs` repository with the addition of your config file, associated documentation file (see examples in [`nf-core/configs/docs`](https://github.com/nf-core/configs/tree/master/docs)), and amending [`nfcore_custom.config`](https://github.com/nf-core/configs/blob/master/nfcore_custom.config) to include your custom profile. +#### `--bwt2_opts_end2end` -See the main [Nextflow documentation](https://www.nextflow.io/docs/latest/config.html) for more information about creating your own configuration files. +Bowtie2 alignment option for end-to-end mapping. +Default: '--very-sensitive -L 30 --score-min L,-0.6,-0.2 --end-to-end +--reorder' -If you have any questions or issues please send us a message on [Slack](https://nf-co.re/join/slack) on the [`#configs` channel](https://nfcore.slack.com/channels/configs). +```bash +--bwt2_opts_end2end '[Options for bowtie2 step1 mapping on full reads]' +``` -## Running in the background +#### `--bwt2_opts_trimmed` -Nextflow handles job submissions and supervises the running jobs. The Nextflow process must run until the pipeline is finished. +Bowtie2 alignment option for trimmed reads mapping (step 2). +Default: '--very-sensitive -L 20 --score-min L,-0.6,-0.2 --end-to-end +--reorder' -The Nextflow `-bg` flag launches Nextflow in the background, detached from your terminal so that the workflow does not stop if you log out of your session. The logs are saved to a file. +```bash +--bwt2_opts_trimmed '[Options for bowtie2 step2 mapping on trimmed reads]' +``` -Alternatively, you can use `screen` / `tmux` or similar tool to create a detached session which you can log back into at a later time. -Some HPC setups also allow you to run nextflow within a cluster job submitted your job scheduler (from where it submits more jobs). +#### `--min_mapq` -## Nextflow memory requirements +Minimum mapping quality. Reads with lower quality are discarded. Default: 10 -In some cases, the Nextflow Java virtual machines can start to request a large amount of memory. -We recommend adding the following line to your environment to limit this (typically in `~/.bashrc` or `~./bash_profile`): +```bash +--min_mapq '[Minimum quality value]' +``` -```console -NXF_OPTS='-Xms1g -Xmx4g' +### Digestion Hi-C + +#### `--digestion` + +This parameter allows to automatically set the `--restriction_site` and +`--ligation_site` parameter according to the restriction enzyme you used. +Available keywords are 'hindiii', 'dpnii', 'mboi', 'arima'. + +```bash +--digestion 'hindiii' +``` + +#### `--restriction_site` + +If the restriction enzyme is not available through the `--digestion` +parameter, you can also defined manually the restriction motif(s) for +Hi-C digestion protocol. +The restriction motif(s) is(are) used to generate the list of restriction fragments. +The precise cutting site of the restriction enzyme has to be specified using +the '^' character. Default: 'A^AGCTT' +Here are a few examples: + +* MboI: ^GATC +* DpnII: ^GATC +* HindIII: A^AGCTT +* ARIMA kit: ^GATC,G^ANTC + +Note that multiples restriction motifs can be provided (comma-separated) and +that 'N' base are supported. + +```bash +--restriction_size '[Cutting motif]' +``` + +#### `--ligation_site` + +Ligation motif after reads ligation. This motif is used for reads trimming and +depends on the fill in strategy. +Note that multiple ligation sites can be specified (comma separated) and that +'N' base is interpreted and replaced by 'A','C','G','T'. +Default: 'AAGCTAGCTT' + +```bash +--ligation_site '[Ligation motif]' +``` + +Exemple of the ARIMA kit: GATCGATC,GANTGATC,GANTANTC,GATCANTC + +### DNAse Hi-C + +#### `--dnase` + +In DNAse Hi-C mode, all options related to digestion Hi-C +(see previous section) are ignored. +In this case, it is highly recommanded to use the `--min_cis_dist` parameter +to remove spurious ligation products. + +```bash +--dnase' +``` + +### HiC-pro processing + +#### `--min_restriction_fragment_size` + +Minimum size of restriction fragments to consider for the Hi-C processing. +Default: '0' - no filter + +```bash +--min_restriction_fragment_size '[numeric]' +``` + +#### `--max_restriction_fragment_size` + +Maximum size of restriction fragments to consider for the Hi-C processing. +Default: '0' - no filter + +```bash +--max_restriction_fragment_size '[numeric]' +``` + +#### `--min_insert_size` + +Minimum reads insert size. Shorter 3C products are discarded. +Default: '0' - no filter + +```bash +--min_insert_size '[numeric]' +``` + +#### `--max_insert_size` + +Maximum reads insert size. Longer 3C products are discarded. +Default: '0' - no filter + +```bash +--max_insert_size '[numeric]' +``` + +#### `--min_cis_dist` + +Filter short range contact below the specified distance. +Mainly useful for DNase Hi-C. Default: '0' + +```bash +--min_cis_dist '[numeric]' +``` + +#### `--keep_dups` + +If specified, duplicates reads are not discarded before building contact maps. + +```bash +--keep_dups +``` + +#### `--keep_multi` + +If specified, reads that aligned multiple times on the genome are not discarded. +Note the default mapping options are based on random hit assignment, meaning +that only one position is kept per read. +Note that in this case the `--min_mapq` parameter is ignored. + +```bash +--keep_multi +``` + +## Genome-wide contact maps + +Once the list of valid pairs is available, the standard is now to move on the `cooler` +framework to build the raw and balanced contact maps in txt and (m)cool formats. + +### `--bin_size` + +Resolution of contact maps to generate (comma separated). +Default:'1000000,500000' + +```bash +--bins_size '[string]' +``` + +### `--res_zoomify` + +Define the maximum resolution to reach when zoomify the cool contact maps. +Default:'5000' + +```bash +--res_zoomify '[string]' +``` + +### HiC-Pro contact maps + +By default, the contact maps are now generated with the `cooler` framework. +However, for backward compatibility, the raw and normalized maps can still be generated +by HiC-pro if the `--hicpro_maps` parameter is set. + +#### `--hicpro_maps` + +If specified, the raw and ICE normalized contact maps will be generated by HiC-Pro. + +```bash +--hicpro_maps +``` + +#### `--ice_max_iter` + +Maximum number of iteration for ICE normalization. +Default: 100 + +```bash +--ice_max_iter '[numeric]' +``` + +#### `--ice_filer_low_count_perc` + +Define which pourcentage of bins with low counts should be force to zero. +Default: 0.02 + +```bash +--ice_filter_low_count_perc '[numeric]' +``` + +#### `--ice_filer_high_count_perc` + +Define which pourcentage of bins with low counts should be discarded before +normalization. Default: 0 + +```bash +--ice_filter_high_count_perc '[numeric]' +``` + +#### `--ice_eps` + +The relative increment in the results before declaring convergence for ICE +normalization. Default: 0.1 + +```bash +--ice_eps '[numeric]' +``` + +## Downstream analysis + +### Additional quality controls + +#### `--res_dist_decay` + +Generates distance vs Hi-C counts plots at a given resolution using `HiCExplorer`. +Several resolution can be specified (comma separeted). Default: '250000' + +```bash +--res_dist_decay '[string]' +``` + +### Compartment calling + +Call open/close compartments for each chromosome, using the `cooltools` command. + +#### `--res_compartments` + +Resolution to call the chromosome compartments (comma separated). +Default: '250000' + +```bash +--res_compartments '[string]' +``` + +### TADs calling + +#### `--tads_caller` + +TADs calling can be performed using different approaches. +Currently available options are `insulation` and `hicexplorer`. +Note that all options can be specified (comma separated). +Default: 'insulation' + +```bash +--tads_caller '[string]' +``` + +#### `--res_tads` + +Resolution to run the TADs calling analysis (comma separated). +Default: '40000,20000' + +```bash +--res_tads '[string]' +``` + +## Inputs/Outputs + +### `--split_fastq` + +By default, the nf-core Hi-C pipeline expects one read pairs per sample. +However, for large Hi-C data processing single fastq files can be very +time consuming. +The `--split_fastq` option allows to automatically split input read pairs +into chunks of reads of size `--fastq_chunks_size` (Default : 20000000). In this case, all chunks will be processed in parallel +and merged before generating the contact maps, thus leading to a significant +increase of processing performance. + +```bash +--split_fastq --fastq_chunks_size '[numeric]' +``` + +### `--save_reference` + +If specified, annotation files automatically generated from the `--fasta` file +are exported in the results folder. Default: false + +```bash +--save_reference +``` + +### `--save_aligned_intermediates` + +If specified, all intermediate mapping files are saved and exported in the +results folder. Default: false + +```bash +--save_aligned_inermediates +``` + +### `--save_interaction_bam` + +If specified, write a BAM file with all classified reads (valid paires, +dangling end, self-circle, etc.) and its tags. + +```bash +--save_interaction_bam +``` + +## Skip options + +### `--skip_maps` + +If defined, the workflow stops with the list of valid interactions, and the +genome-wide maps are not built. Usefult for capture-C analysis. Default: false + +```bash +--skip_maps +``` + +### `--skip_balancing` + +If defined, the contact maps normalization is not run on the raw contact maps. +Default: false + +```bash +--skip_balancing +``` + +### `--skip_cool` + +If defined, cooler files are not generated. Default: false + +```bash +--skip_cool +``` + +### `skip_dist_decay` + +Do not run distance decay plots. Default: false + +```bash +--skip_dist_decay +``` + +### `skip_compartments` + +Do not call compartments. Default: false + +```bash +--skip_compartments +``` + +### `skip_tads` + +Do not call TADs. Default: false + +```bash +--skip_tads +``` + +### `--skip_multiQC` + +If defined, the MultiQC report is not generated. Default: false + +```bash +--skip_multiQC ``` diff --git a/lib/WorkflowMain.groovy b/lib/WorkflowMain.groovy index 92eb1766e1d7763ccc40bcefc92b56a28544a4b1..d7085e909f4863c6112cbfe13aeed085cdad4d2d 100755 --- a/lib/WorkflowMain.groovy +++ b/lib/WorkflowMain.groovy @@ -9,9 +9,8 @@ class WorkflowMain { // public static String citation(workflow) { return "If you use ${workflow.manifest.name} for your analysis please cite:\n\n" + - // TODO nf-core: Add Zenodo DOI for pipeline after first release - //"* The pipeline\n" + - //" https://doi.org/10.5281/zenodo.XXXXXXX\n\n" + + "* The pipeline\n" + + " https://doi.org/10.5281/zenodo.2669513\n\n" + "* The nf-core framework\n" + " https://doi.org/10.1038/s41587-020-0439-x\n\n" + "* Software dependencies\n" + diff --git a/modules.json b/modules.json index ef01adba6142fe35bf09a9bb4450705ff63138af..53a88ac19cc1c065bb8d6d2f1b1b29040d5ddc44 100644 --- a/modules.json +++ b/modules.json @@ -4,7 +4,7 @@ "repos": { "nf-core/modules": { "bowtie2/align": { - "git_sha": "e745e167c1020928ef20ea1397b6b4d230681b4d" + "git_sha": "61f68913fefc20241ceccb671b104230b2d775d7" }, "bowtie2/build": { "git_sha": "e745e167c1020928ef20ea1397b6b4d230681b4d" @@ -16,10 +16,10 @@ "git_sha": "e745e167c1020928ef20ea1397b6b4d230681b4d" }, "custom/getchromsizes": { - "git_sha": "950700bcdc0e9a2b6883d40d2c51c6fc435cd714" + "git_sha": "213403187932dbbdd936a04474cc8cd8abae7a08" }, "fastqc": { - "git_sha": "e745e167c1020928ef20ea1397b6b4d230681b4d" + "git_sha": "49b18b1639f4f7104187058866a8fab33332bdfe" } } } diff --git a/modules/nf-core/modules/bowtie2/align/main.nf b/modules/nf-core/modules/bowtie2/align/main.nf index 7e8a9659bd965a852223eb160bbe3e696600070b..c233f955d10b8812541b80725939b4d1768e0d32 100644 --- a/modules/nf-core/modules/bowtie2/align/main.nf +++ b/modules/nf-core/modules/bowtie2/align/main.nf @@ -2,10 +2,10 @@ process BOWTIE2_ALIGN { tag "$meta.id" label 'process_high' - conda (params.enable_conda ? 'bioconda::bowtie2=2.4.4 bioconda::samtools=1.14 conda-forge::pigz=2.6' : null) + conda (params.enable_conda ? 'bioconda::bowtie2=2.4.4 bioconda::samtools=1.15.1 conda-forge::pigz=2.6' : null) container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? - 'https://depot.galaxyproject.org/singularity/mulled-v2-ac74a7f02cebcfcc07d8e8d1d750af9c83b4d45a:4d235f41348a00533f18e47c9669f1ecb327f629-0' : - 'quay.io/biocontainers/mulled-v2-ac74a7f02cebcfcc07d8e8d1d750af9c83b4d45a:4d235f41348a00533f18e47c9669f1ecb327f629-0' }" + 'https://depot.galaxyproject.org/singularity/mulled-v2-ac74a7f02cebcfcc07d8e8d1d750af9c83b4d45a:1744f68fe955578c63054b55309e05b41c37a80d-0' : + 'quay.io/biocontainers/mulled-v2-ac74a7f02cebcfcc07d8e8d1d750af9c83b4d45a:1744f68fe955578c63054b55309e05b41c37a80d-0' }" input: tuple val(meta), path(reads) @@ -29,6 +29,8 @@ process BOWTIE2_ALIGN { def unaligned = save_unaligned ? "--un-gz ${prefix}.unmapped.fastq.gz" : '' """ INDEX=`find -L ./ -name "*.rev.1.bt2" | sed 's/.rev.1.bt2//'` + [ -z "\$INDEX" ] && INDEX=`find -L ./ -name "*.rev.1.bt2l" | sed 's/.rev.1.bt2l//'` + [ -z "\$INDEX" ] && echo "BT2 index files not found" 1>&2 && exit 1 bowtie2 \\ -x \$INDEX \\ -U $reads \\ @@ -49,6 +51,8 @@ process BOWTIE2_ALIGN { def unaligned = save_unaligned ? "--un-conc-gz ${prefix}.unmapped.fastq.gz" : '' """ INDEX=`find -L ./ -name "*.rev.1.bt2" | sed 's/.rev.1.bt2//'` + [ -z "\$INDEX" ] && INDEX=`find -L ./ -name "*.rev.1.bt2l" | sed 's/.rev.1.bt2l//'` + [ -z "\$INDEX" ] && echo "BT2 index files not found" 1>&2 && exit 1 bowtie2 \\ -x \$INDEX \\ -1 ${reads[0]} \\ diff --git a/modules/nf-core/modules/custom/getchromsizes/main.nf b/modules/nf-core/modules/custom/getchromsizes/main.nf index bbcfa9becd254aa507089a34c93ccbf35f78aaa4..0eabf3a4c3cdd9a862154859e1241990052dc2d4 100644 --- a/modules/nf-core/modules/custom/getchromsizes/main.nf +++ b/modules/nf-core/modules/custom/getchromsizes/main.nf @@ -2,10 +2,10 @@ process CUSTOM_GETCHROMSIZES { tag "$fasta" label 'process_low' - conda (params.enable_conda ? "bioconda::samtools=1.15" : null) + conda (params.enable_conda ? "bioconda::samtools=1.15.1" : null) container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? - 'https://depot.galaxyproject.org/singularity/samtools:1.15--h1170115_1' : - 'quay.io/biocontainers/samtools:1.15--h1170115_1' }" + 'https://depot.galaxyproject.org/singularity/samtools:1.15.1--h1170115_0' : + 'quay.io/biocontainers/samtools:1.15.1--h1170115_0' }" input: path fasta diff --git a/modules/nf-core/modules/fastqc/main.nf b/modules/nf-core/modules/fastqc/main.nf index ed6b8c50b1fb6bfa64e5acb72a536328bd8ca88a..05730368b2d43e0eaac6b13a69f07ed54d1ed2cb 100644 --- a/modules/nf-core/modules/fastqc/main.nf +++ b/modules/nf-core/modules/fastqc/main.nf @@ -44,4 +44,16 @@ process FASTQC { END_VERSIONS """ } + + stub: + def prefix = task.ext.prefix ?: "${meta.id}" + """ + touch ${prefix}.html + touch ${prefix}.zip + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + fastqc: \$( fastqc --version | sed -e "s/FastQC v//g" ) + END_VERSIONS + """ } diff --git a/nextflow.config b/nextflow.config index c6ed56da47d4429d9c1005ee34acc1ce126f557c..2cd1618f2231064f95ff654bdd15ad3e37d22de8 100644 --- a/nextflow.config +++ b/nextflow.config @@ -9,7 +9,6 @@ // Global default params, used in configs params { - // TODO nf-core: Specify your pipeline's command line flags // Input options input = null @@ -233,7 +232,7 @@ manifest { description = 'Analysis of Chromosome Conformation Capture data (Hi-C)' mainScript = 'main.nf' nextflowVersion = '!>=21.04.0' - version = '1.3.0' + version = '1.4.0dev' } // Function to ensure that resource requirements don't go beyond diff --git a/nextflow_schema.json b/nextflow_schema.json index 7fe34b7c68bcc906fbaa437aab40a53ae0d41bfc..f7f61b7caa7f166028b6c8e08b034cefd70819ae 100644 --- a/nextflow_schema.json +++ b/nextflow_schema.json @@ -20,12 +20,6 @@ "description": "Input FastQ files.", "help_text": "Use this to specify the location of your input FastQ files. For example:\n\n```bash\n--input 'path/to/data/sample_*_{1,2}.fastq'\n```\n\nPlease note the following requirements:\n\n1. The path must be enclosed in quotes\n2. The path must have at least one `*` wildcard character\n3. When using the pipeline with paired end data, the path must use `{1,2}` notation to specify read pairs.\n\nIf left unspecified, a default pattern is used: `data/*{1,2}.fastq.gz`" }, - "input_paths": { - "type": "string", - "hidden": true, - "description": "Input FastQ files for test only", - "default": "undefined" - }, "outdir": { "type": "string", "description": "The output directory where the results will be saved.", diff --git a/workflows/hic.nf b/workflows/hic.nf index 4f705df847bf3b6ddbb9a486e562f85858e05e23..0940deaa91a5627ac25d07101dd9287bd23ebd4a 100644 --- a/workflows/hic.nf +++ b/workflows/hic.nf @@ -85,7 +85,7 @@ ch_map_res = ch_map_res.unique() ======================================================================================== */ -ch_multiqc_config = file("$projectDir/assets/multiqc_config.yaml", checkIfExists: true) +ch_multiqc_config = file("$projectDir/assets/multiqc_config.yml", checkIfExists: true) ch_multiqc_custom_config = params.multiqc_config ? Channel.fromPath(params.multiqc_config) : Channel.empty() /*