diff --git a/.github/CONTRIBUTING.md b/.github/CONTRIBUTING.md index b88d0f43654c3a170c0cf930e212fa316dda28e2..929f493c0b09f80151a8816b167840ea8eca9f30 100644 --- a/.github/CONTRIBUTING.md +++ b/.github/CONTRIBUTING.md @@ -1,47 +1,61 @@ # nf-core/hic: Contributing Guidelines -Hi there! Many thanks for taking an interest in improving nf-core/hic. +Hi there! +Many thanks for taking an interest in improving nf-core/hic. -We try to manage the required tasks for nf-core/hic using GitHub issues, you probably came to this page when creating one. Please use the pre-filled template to save time. +We try to manage the required tasks for nf-core/hic using GitHub issues, you probably came to this page when creating one. +Please use the pre-filled template to save time. -However, don't be put off by this template - other more general issues and suggestions are welcome! Contributions to the code are even more welcome ;) - -> If you need help using or modifying nf-core/hic then the best place to ask is on the pipeline channel on [Slack](https://nf-core-invite.herokuapp.com/). +However, don't be put off by this template - other more general issues and suggestions are welcome! +Contributions to the code are even more welcome ;) +> If you need help using or modifying nf-core/hic then the best place to ask is on the nf-core +Slack [#hic](https://nfcore.slack.com/channels/hic) channel ([join our Slack here](https://nf-co.re/join/slack)). ## Contribution workflow -If you'd like to write some code for nf-core/hic, the standard workflow -is as follows: -1. Check that there isn't already an issue about your idea in the - [nf-core/hic issues](https://github.com/nf-core/hic/issues) to avoid - duplicating work. +If you'd like to write some code for nf-core/hic, the standard workflow is as follows: + +1. Check that there isn't already an issue about your idea in the [nf-core/hic issues](https://github.com/nf-core/hic/issues) to avoid duplicating work * If there isn't one already, please create one so that others know you're working on this -2. Fork the [nf-core/hic repository](https://github.com/nf-core/hic) to your GitHub account +2. [Fork](https://help.github.com/en/github/getting-started-with-github/fork-a-repo) the [nf-core/hic repository](https://github.com/nf-core/hic) to your GitHub account 3. Make the necessary changes / additions within your forked repository -4. Submit a Pull Request against the `dev` branch and wait for the code to be reviewed and merged. - -If you're not used to this workflow with git, you can start with some [basic docs from GitHub](https://help.github.com/articles/fork-a-repo/) or even their [excellent interactive tutorial](https://try.github.io/). +4. Submit a Pull Request against the `dev` branch and wait for the code to be reviewed and merged +If you're not used to this workflow with git, you can start with some [docs from GitHub](https://help.github.com/en/github/collaborating-with-issues-and-pull-requests) or even their [excellent `git` resources](https://try.github.io/). ## Tests -When you create a pull request with changes, [Travis CI](https://travis-ci.org/) will run automatic tests. + +When you create a pull request with changes, [GitHub Actions](https://github.com/features/actions) will run automatic tests. Typically, pull-requests are only fully reviewed when these tests are passing, though of course we can help out before then. 
There are typically two types of tests that run: ### Lint Tests -The nf-core has a [set of guidelines](http://nf-co.re/guidelines) which all pipelines must adhere to. + +`nf-core` has a [set of guidelines](https://nf-co.re/developers/guidelines) which all pipelines must adhere to. To enforce these and ensure that all pipelines stay in sync, we have developed a helper tool which runs checks on the pipeline code. This is in the [nf-core/tools repository](https://github.com/nf-core/tools) and once installed can be run locally with the `nf-core lint <pipeline-directory>` command. If any failures or warnings are encountered, please follow the listed URL for more documentation. ### Pipeline Tests -Each nf-core pipeline should be set up with a minimal set of test-data. -Travis CI then runs the pipeline on this data to ensure that it exists successfully. + +Each `nf-core` pipeline should be set up with a minimal set of test-data. +`GitHub Actions` then runs the pipeline on this data to ensure that it exits successfully. If there are any failures then the automated tests fail. -These tests are run both with the latest available version of Nextflow and also the minimum required version that is stated in the pipeline code. +These tests are run both with the latest available version of `Nextflow` and also the minimum required version that is stated in the pipeline code. + +## Patch + +:warning: Only in the unlikely and regretful event of a release happening with a bug. + +* On your own fork, make a new branch `patch` based on `upstream/master`. +* Fix the bug, and bump version (X.Y.Z+1). +* A PR should be made on `master` from patch to directly address this particular bug. ## Getting help -For further information/help, please consult the [nf-core/hic documentation](https://github.com/nf-core/hic#documentation) and don't hesitate to get in touch on the pipeline channel on [Slack](https://nf-core-invite.herokuapp.com/). +For further information/help, please consult the [nf-core/hic documentation](https://nf-co.re/nf-core/hic/docs) and +don't hesitate to get in touch on the nf-core Slack [#hic](https://nfcore.slack.com/channels/hic) channel +([join our Slack here](https://nf-co.re/join/slack)). + diff --git a/.github/ISSUE_TEMPLATE/bug_report.md b/.github/ISSUE_TEMPLATE/bug_report.md index 8112c95f4b2e8d2c3aeda5b539529608fe9213e8..2b9203377a6365822d0f13f6a59f2496ae717fb1 100644 --- a/.github/ISSUE_TEMPLATE/bug_report.md +++ b/.github/ISSUE_TEMPLATE/bug_report.md @@ -1,31 +1,42 @@ +# nf-core/hic bug report + Hi there! -Thanks for telling us about a problem with the pipeline. Please delete this text and anything that's not relevant from the template below: +Thanks for telling us about a problem with the pipeline. +Please delete this text and anything that's not relevant from the template below: + +## Describe the bug -#### Describe the bug A clear and concise description of what the bug is. -#### Steps to reproduce +## Steps to reproduce + Steps to reproduce the behaviour: + 1. Command line: `nextflow run ...` 2. See error: _Please provide your error message_ -#### Expected behaviour +## Expected behaviour + A clear and concise description of what you expected to happen. -#### System: - - Hardware: [e.g. HPC, Desktop, Cloud...] - - Executor: [e.g. slurm, local, awsbatch...] - - OS: [e.g. CentOS Linux, macOS, Linux Mint...] - - Version [e.g. 7, 10.13.6, 18.3...] +## System + +- Hardware: <!-- [e.g. HPC, Desktop, Cloud...] --> +- Executor: <!-- [e.g. slurm, local, awsbatch...] --> +- OS: <!-- [e.g.
CentOS Linux, macOS, Linux Mint...] --> +- Version <!-- [e.g. 7, 10.13.6, 18.3...] --> + +## Nextflow Installation + +- Version: <!-- [e.g. 19.10.0] --> + +## Container engine -#### Nextflow Installation: - - Version: [e.g. 0.31.0] +- Engine: <!-- [e.g. Conda, Docker or Singularity] --> +- version: <!-- [e.g. 1.0.0] --> +- Image tag: <!-- [e.g. nfcore/hic:1.0.0] --> -#### Container engine: - - Engine: [e.g. Conda, Docker or Singularity] - - version: [e.g. 1.0.0] - - Image tag: [e.g. nfcore/hic:1.0.0] +## Additional context -#### Additional context Add any other context about the problem here. diff --git a/.github/ISSUE_TEMPLATE/feature_request.md b/.github/ISSUE_TEMPLATE/feature_request.md index 1f025b779cf127b420c972c1e385e4efcdd56321..57fa7f7f41368f73726974ef548162c957b7fd7d 100644 --- a/.github/ISSUE_TEMPLATE/feature_request.md +++ b/.github/ISSUE_TEMPLATE/feature_request.md @@ -1,16 +1,24 @@ +# nf-core/hic feature request + Hi there! -Thanks for suggesting a new feature for the pipeline! Please delete this text and anything that's not relevant from the template below: +Thanks for suggesting a new feature for the pipeline! +Please delete this text and anything that's not relevant from the template below: + +## Is your feature request related to a problem? Please describe -#### Is your feature request related to a problem? Please describe. A clear and concise description of what the problem is. + Ex. I'm always frustrated when [...] -#### Describe the solution you'd like +## Describe the solution you'd like + A clear and concise description of what you want to happen. -#### Describe alternatives you've considered +## Describe alternatives you've considered + A clear and concise description of any alternative solutions or features you've considered. -#### Additional context +## Additional context + Add any other context about the feature request here. diff --git a/.github/PULL_REQUEST_TEMPLATE.md b/.github/PULL_REQUEST_TEMPLATE.md index 473c41dcc78813e0ed7867d92322be210da06b0c..50d7959aa9d49a9bc51a14c172917c904d2bafb9 100644 --- a/.github/PULL_REQUEST_TEMPLATE.md +++ b/.github/PULL_REQUEST_TEMPLATE.md @@ -1,15 +1,19 @@ -Many thanks to contributing to nf-core/hic! +# nf-core/hic pull request -Please fill in the appropriate checklist below (delete whatever is not relevant). These are the most common things requested on pull requests (PRs). +Many thanks for contributing to nf-core/hic! + +Please fill in the appropriate checklist below (delete whatever is not relevant). +These are the most common things requested on pull requests (PRs). ## PR checklist - - [ ] This comment contains a description of changes (with reason) - - [ ] If you've fixed a bug or added code that should be tested, add tests! - - [ ] If necessary, also make a PR on the [nf-core/hic branch on the nf-core/test-datasets repo]( https://github.com/nf-core/test-datasets/pull/new/nf-core/hic) - - [ ] Ensure the test suite passes (`nextflow run . -profile test,docker`). - - [ ] Make sure your code lints (`nf-core lint .`). - - [ ] Documentation in `docs` is updated - - [ ] `CHANGELOG.md` is updated - - [ ] `README.md` is updated - -**Learn more about contributing:** https://github.com/nf-core/hic/tree/master/.github/CONTRIBUTING.md + +- [ ] This comment contains a description of changes (with reason) +- [ ] If you've fixed a bug or added code that should be tested, add tests! 
+- [ ] If necessary, also make a PR on the [nf-core/hic branch on the nf-core/test-datasets repo](https://github.com/nf-core/test-datasets/pull/new/nf-core/hic) +- [ ] Ensure the test suite passes (`nextflow run . -profile test,docker`). +- [ ] Make sure your code lints (`nf-core lint .`). +- [ ] Documentation in `docs` is updated +- [ ] `CHANGELOG.md` is updated +- [ ] `README.md` is updated + +**Learn more about contributing:** [CONTRIBUTING.md](https://github.com/nf-core/hic/tree/master/.github/CONTRIBUTING.md) \ No newline at end of file diff --git a/.github/markdownlint.yml b/.github/markdownlint.yml index e052a635aa7c2787e741207a069d9a400358ca6c..96b12a70398f6870ef306f4d8a5afcebc8f96ba8 100644 --- a/.github/markdownlint.yml +++ b/.github/markdownlint.yml @@ -1,9 +1,5 @@ # Markdownlint configuration file default: true, line-length: false -no-multiple-blanks: 0 -blanks-around-headers: false -blanks-around-lists: false -header-increment: false no-duplicate-header: siblings_only: true diff --git a/.github/workflows/branch.yml b/.github/workflows/branch.yml new file mode 100644 index 0000000000000000000000000000000000000000..e95804c7cb51f306a7b2bf2028149c64358af705 --- /dev/null +++ b/.github/workflows/branch.yml @@ -0,0 +1,16 @@ +name: nf-core branch protection +# This workflow is triggered on PRs to master branch on the repository +# It fails when someone tries to make a PR against the nf-core `master` branch instead of `dev` +on: + pull_request: + branches: + - master + +jobs: + test: + runs-on: ubuntu-18.04 + steps: + # PRs are only ok if coming from an nf-core `dev` branch or a fork `patch` branch + - name: Check PRs + run: | + { [[ $(git remote get-url origin) == *nf-core/hic ]] && [[ ${GITHUB_HEAD_REF} = "dev" ]]; } || [[ ${GITHUB_HEAD_REF} == "patch" ]] diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml new file mode 100644 index 0000000000000000000000000000000000000000..34410af1c7acb71904ab68cd8cd99e2475776381 --- /dev/null +++ b/.github/workflows/ci.yml @@ -0,0 +1,30 @@ +name: nf-core CI +# This workflow is triggered on pushes and PRs to the repository. +# It runs the pipeline with the minimal test dataset to check that it completes without any syntax errors +on: [push, pull_request] + +jobs: + test: + env: + NXF_VER: ${{ matrix.nxf_ver }} + NXF_ANSI_LOG: false + runs-on: ubuntu-latest + strategy: + matrix: + # Nextflow versions: check pipeline minimum and current latest + nxf_ver: ['19.10.0', ''] + steps: + - uses: actions/checkout@v2 + - name: Install Nextflow + run: | + wget -qO- get.nextflow.io | bash + sudo mv nextflow /usr/local/bin/ + - name: Pull docker image + run: | + docker pull nfcore/hic:dev + docker tag nfcore/hic:dev nfcore/hic:dev + - name: Run pipeline with test data + run: | + # nf-core: You can customise CI pipeline run tests as required + # (eg. adding multiple test runs with different parameters) + nextflow run ${GITHUB_WORKSPACE} -profile test,docker diff --git a/.github/workflows/linting.yml b/.github/workflows/linting.yml new file mode 100644 index 0000000000000000000000000000000000000000..1e0827a800dcd520582e8f89d2325cbce15a6b12 --- /dev/null +++ b/.github/workflows/linting.yml @@ -0,0 +1,50 @@ +name: nf-core linting +# This workflow is triggered on pushes and PRs to the repository. 
+# It runs the `nf-core lint` and markdown lint tests to ensure that the code meets the nf-core guidelines +on: + push: + pull_request: + release: + types: [published] + +jobs: + Markdown: + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@v2 + - uses: actions/setup-node@v1 + with: + node-version: '10' + - name: Install markdownlint + run: npm install -g markdownlint-cli + - name: Run Markdownlint + run: markdownlint ${GITHUB_WORKSPACE} -c ${GITHUB_WORKSPACE}/.github/markdownlint.yml + YAML: + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@v1 + - uses: actions/setup-node@v1 + with: + node-version: '10' + - name: Install yaml-lint + run: npm install -g yaml-lint + - name: Run yaml-lint + run: yamllint $(find ${GITHUB_WORKSPACE} -type f -name "*.yml") + nf-core: + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@v2 + - name: Install Nextflow + run: | + wget -qO- get.nextflow.io | bash + sudo mv nextflow /usr/local/bin/ + - uses: actions/setup-python@v1 + with: + python-version: '3.6' + architecture: 'x64' + - name: Install dependencies + run: | + python -m pip install --upgrade pip + pip install nf-core + - name: Run nf-core lint + run: nf-core lint ${GITHUB_WORKSPACE} diff --git a/.gitignore b/.gitignore index 5b54e3e6c257de1e963395161372e1a2ca110fe7..6354f3708fa7c35477f398801673e469c12726ea 100644 --- a/.gitignore +++ b/.gitignore @@ -3,5 +3,6 @@ work/ data/ results/ .DS_Store -tests/test_data +tests/ +testing/ *.pyc diff --git a/.travis.yml b/.travis.yml deleted file mode 100644 index 2dd43f74a0f277205eeb13f0e52c970038c2c400..0000000000000000000000000000000000000000 --- a/.travis.yml +++ /dev/null @@ -1,41 +0,0 @@ -sudo: required -language: python -jdk: openjdk8 -services: docker -python: '3.6' -cache: pip -matrix: - fast_finish: true - -before_install: - - '[ $TRAVIS_PULL_REQUEST = "false" ] || [ $TRAVIS_BRANCH != "master" ] || ([ $TRAVIS_PULL_REQUEST_SLUG = $TRAVIS_REPO_SLUG ] && ([ $TRAVIS_PULL_REQUEST_BRANCH = "dev" ] || [ $TRAVIS_PULL_REQUEST_BRANCH = "patch" ]))' - # Pull the docker image first so the test doesn't wait for this - - docker pull nfcore/hic:dev - # Fake the tag locally so that the pipeline runs properly - # Looks weird when this is :dev to :dev, but makes sense when testing code for a release (:dev to :1.0.1) - - docker tag nfcore/hic:dev nfcore/hic:dev - -install: - # Install Nextflow - - mkdir /tmp/nextflow && cd /tmp/nextflow - - wget -qO- get.nextflow.io | bash - - sudo ln -s /tmp/nextflow/nextflow /usr/local/bin/nextflow - # Install nf-core/tools - - pip install --upgrade pip - - pip install nf-core - # Reset - - mkdir ${TRAVIS_BUILD_DIR}/tests && cd ${TRAVIS_BUILD_DIR}/tests - # Install markdownlint-cli - - sudo apt-get install npm && npm install -g markdownlint-cli - -env: - - NXF_VER='19.04.0' # Specify a minimum NF version that should be tested and work - - NXF_VER='' # Plus: get the latest NF version and check that it works - -script: - # Lint the pipeline code - - nf-core lint ${TRAVIS_BUILD_DIR} - # Lint the documentation - - markdownlint ${TRAVIS_BUILD_DIR} -c ${TRAVIS_BUILD_DIR}/.github/markdownlint.yml - # Run the pipeline with the test profile - - nextflow run ${TRAVIS_BUILD_DIR} -profile test,docker diff --git a/CHANGELOG.md b/CHANGELOG.md index 2019e42289ac51ac9dbbb3a87c6502a4ee5ed796..6480cc485d2d8788bfd933848d6ec5772f217f18 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,20 +1,77 @@ # nf-core/hic: Changelog -## v1.1.1 +The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/) +and this project 
adheres to [Semantic Versioning](http://semver.org/spec/v2.0.0.html). + +## v1.2.0dev - 2020-05-12 + +### `Added` + +* Bump v1.2.0dev +* Merge template nf-core 1.9 +* Move some options to snake_case +* Update python scripts for python3 +* Update conda environment file + * python base `2.7.15` > `3.7.6` + * pip `19.1` > `20.0.1` + * scipy `1.2.1` > `1.4.1` + * numpy `1.16.3` > `1.18.1` + * bx-python `0.8.2` > `0.8.8` + * pysam `0.15.2` > `0.15.4` + * cooler `0.8.5` > `0.8.6` + * multiqc `1.7` > `1.8` + * iced `0.5.1` > `0.5.6` + * *_New_* pymdown-extensions `7.1` + * *_New_* hicexplorer `3.4.3` + * *_New_* bioconductor-hitc `1.32.0` + * *_New_* r-optparse `1.6.6` + * *_New_* ucsc-bedgraphtobigwig `377` + * *_New_* cooltools `0.3.2` + * *_New_* fanc `0.8.30` + * *_Removed_* r-markdown + +### `Fixed` + +* Sort output of `get_valid_interaction` process as the input files of `remove_duplicates` +are expected to be sorted (sort -m) + +### `Deprecated` + +* Command line options converted to `snake_case`: + * `--skipMaps` > `--skip_maps` + * `--skipIce` > `--skip_ice` + * `--skipCool` > `--skip_cool` + * `--skipMultiQC` > `--skip_multiqc` + * `--saveReference` > `--save_reference` + * `--saveAlignedIntermediates` > `--save_aligned_intermediates` + * `--saveInteractionBAM` > `--save_interaction_bam` + +## v1.1.1 - 2020-04-02 + +### `Fixed` * Fix bug in tag. Remove '[' ## v1.1.0 - 2019-10-15 +### `Added` + * Update hicpro2higlass with `-p` parameter * Support 'N' base motif in restriction/ligation sites * Support multiple restriction enzymes/ligattion sites (comma separated) ([#31](https://github.com/nf-core/hic/issues/31)) * Add --saveInteractionBAM option * Add DOI ([#29](https://github.com/nf-core/hic/issues/29)) -* Fix bug for reads extension _1/_2 ([#30](https://github.com/nf-core/hic/issues/30)) * Update manual ([#28](https://github.com/nf-core/hic/issues/28)) -## v1.0 - 2019-05-06 +### `Fixed` + +* Fix bug for reads extension `_1`/`_2` ([#30](https://github.com/nf-core/hic/issues/30)) + +## v1.0 - [2019-05-06] + +Initial release of nf-core/hic, created with the [nf-core](http://nf-co.re/) template. + +### `Added` First version of nf-core Hi-C pipeline which is a Nextflow implementation of the [HiC-Pro pipeline](https://github.com/nservant/HiC-Pro/). diff --git a/CODE_OF_CONDUCT.md b/CODE_OF_CONDUCT.md index a977481246a45a03d3b03a99439e7dc1d4d3b7f1..496ad3b59f0bc2e34e2a69f8d3b4cc760be51616 100644 --- a/CODE_OF_CONDUCT.md +++ b/CODE_OF_CONDUCT.md @@ -56,7 +56,7 @@ project may be further defined and clarified by project maintainers. Instances of abusive, harassing, or otherwise unacceptable behavior may be reported by contacting the project team on -[Slack](https://nf-core-invite.herokuapp.com/). The project team will review +[Slack](https://nf-co.re/join/slack). The project team will review and investigate all complaints, and will respond in a way that it deems appropriate to the circumstances. The project team is obligated to maintain confidentiality with regard to the reporter of an incident.
Further details diff --git a/Dockerfile b/Dockerfile index 4714783d6d4c757834980a200f109612ab56cd48..cbb686fa3407b15196c7673932d1b8e9db6f721a 100644 --- a/Dockerfile +++ b/Dockerfile @@ -1,10 +1,14 @@ -FROM nfcore/base:1.7 +FROM nfcore/base:1.9 LABEL authors="Nicolas Servant" \ - description="Docker image containing all requirements for nf-core/hic pipeline" + description="Docker image containing all software requirements for the nf-core/hic pipeline" ## Install gcc for pip iced install RUN apt-get update && apt-get install -y gcc g++ && apt-get clean -y COPY environment.yml / RUN conda env create -f /environment.yml && conda clean -a -ENV PATH /opt/conda/envs/nf-core-hic-1.1.1dev/bin:$PATH +ENV PATH /opt/conda/envs/nf-core-hic-1.2.0dev/bin:$PATH + +# Dump the details of the installed packages to a file for posterity +RUN conda env export --name nf-core-hic-1.2.0dev > nf-core-hic-1.2.0dev.yml + diff --git a/README.md b/README.md index def8c35381f7dc28007dbf53de53bbde7a093b43..be3889dd90f80bc31850ce697468ffb6208be3d3 100644 --- a/README.md +++ b/README.md @@ -2,12 +2,12 @@ **Analysis of Chromosome Conformation Capture data (Hi-C)**. -[](https://travis-ci.com/nf-core/hic) -[](https://www.nextflow.io/) +[](https://github.com/nf-core/hic/actions) +[](https://github.com/nf-core/hic/actions) +[](https://www.nextflow.io/) [](http://bioconda.github.io/) [](https://hub.docker.com/r/nfcore/hic) - [](https://doi.org/10.5281/zenodo.2669513) @@ -43,9 +43,10 @@ sites (bowtie2) i. Install [`nextflow`](https://nf-co.re/usage/installation) -ii. Install one of [`docker`](https://docs.docker.com/engine/installation/), -[`singularity`](https://www.sylabs.io/guides/3.0/user-guide/) or -[`conda`](https://conda.io/miniconda.html) +ii. Install either [`Docker`](https://docs.docker.com/engine/installation/) +or [`Singularity`](https://www.sylabs.io/guides/3.0/user-guide/) +for full pipeline reproducibility (please only use [`Conda`](https://conda.io/miniconda.html) +as a last resort; see [docs](https://nf-co.re/usage/configuration#basic-configuration-profiles)) iii. Download the pipeline and test it on a minimal dataset with a single command @@ -53,6 +54,12 @@ iii. Download the pipeline and test it on a minimal dataset with a single comman nextflow run nf-core/hic -profile test,<docker/singularity/conda/institute> ``` +> Please check [nf-core/configs](https://github.com/nf-core/configs#documentation) +to see if a custom config file to run nf-core pipelines already exists for your Institute. +If so, you can simply use `-profile <institute>` in your command. +This will enable either `docker` or `singularity` and set the appropriate execution +settings for your local compute environment. + iv. Start running your own analysis! ```bash @@ -63,8 +70,8 @@ See [usage docs](docs/usage.md) for all of the available options when running th ## Documentation -The nf-core/hic pipeline comes with documentation about the pipeline, found in -the `docs/` directory: +The nf-core/hic pipeline comes with documentation about the pipeline, +found in the `docs/` directory: 1. [Installation](https://nf-co.re/usage/installation) 2. Pipeline configuration @@ -75,27 +82,34 @@ the `docs/` directory: 4. [Output and how to interpret the results](docs/output.md) 5. [Troubleshooting](https://nf-co.re/usage/troubleshooting) -## Contributions and Support - -If you would like to contribute to this pipeline, please see the -[contributing guidelines](.github/CONTRIBUTING.md). 
- For further information or help, don't hesitate to get in touch on [Slack](https://nfcore.slack.com/channels/hic). You can join with [this invite](https://nf-co.re/join/slack). - ## Credits nf-core/hic was originally written by Nicolas Servant. +## Contributions and Support + +If you would like to contribute to this pipeline, please see the [contributing guidelines](.github/CONTRIBUTING.md). + +For further information or help, don't hesitate to get in touch on +[Slack](https://nfcore.slack.com/channels/hic) (you can join with +[this invite](https://nf-co.re/join/slack)). + ## Citation If you use nf-core/hic for your analysis, please cite it using the following doi: [10.5281/zenodo.2669513](https://doi.org/10.5281/zenodo.2669513) -You can cite the `nf-core` pre-print as follows: -Ewels PA, Peltzer A, Fillinger S, Alneberg JA, Patel H, Wilm A, Garcia MU, Di -Tommaso P, Nahnsen S. **nf-core: Community curated bioinformatics pipelines**. -*bioRxiv*. 2019. p. 610741. -[doi: 10.1101/610741](https://www.biorxiv.org/content/10.1101/610741v1). +You can cite the `nf-core` publication as follows: + +> **The nf-core framework for community-curated bioinformatics pipelines.** +> +> Philip Ewels, Alexander Peltzer, Sven Fillinger, Harshil Patel, Johannes Alneberg, +Andreas Wilm, Maxime Ulysse Garcia, Paolo Di Tommaso & Sven Nahnsen. +> +> _Nat Biotechnol._ 2020 Feb 13. +doi:[10.1038/s41587-020-0439-x](https://dx.doi.org/10.1038/s41587-020-0439-x). +> ReadCube: [Full Access Link](https://rdcu.be/b1GjZ) diff --git a/assets/email_template.html b/assets/email_template.html index bf19807e4a21c86f969a7ed5570b479574d9e4ee..177bccd2d802d3a11e2e6ca606e72ac19968126b 100644 --- a/assets/email_template.html +++ b/assets/email_template.html @@ -11,6 +11,8 @@ <body> <div style="font-family: Helvetica, Arial, sans-serif; padding: 30px; max-width: 800px; margin: 0 auto;"> +<img src="cid:nfcorepipelinelogo"> + <h1>nf-core/hic v${version}</h1> <h2>Run Name: $runName</h2> diff --git a/assets/email_template.txt b/assets/email_template.txt index 6c85add607a47589da20df83c6892bcfe5e04f1d..a951c5e7f965fa5829707fc84f4351495995190f 100644 --- a/assets/email_template.txt +++ b/assets/email_template.txt @@ -1,6 +1,12 @@ -======================================== - nf-core/hic v${version} -======================================== +---------------------------------------------------- + ,--./,-. + ___ __ __ __ ___ /,-._.--~\\ + |\\ | |__ __ / ` / \\ |__) |__ } { + | \\| | \\__, \\__/ | \\ |___ \\`-._,-`-, + `._,._,' + nf-core/hic v${version} +---------------------------------------------------- + Run Name: $runName <% if (success){ diff --git a/assets/multiqc_config.yaml b/assets/multiqc_config.yaml index d425b46caa3f6d032a2e5ed340788e583214d447..41468cab303a5894aa01e0823790b22cb44c95cd 100644 --- a/assets/multiqc_config.yaml +++ b/assets/multiqc_config.yaml @@ -3,7 +3,9 @@ report_comment: > analysis pipeline. For information about how to interpret these results, please see the <a href="https://github.com/nf-core/hic" target="_blank">documentation</a>. 
report_section_order: - nf-core/hic-software-versions: + software_versions: order: -1000 + nf-core-hic-summary: + order: -1001 export_plots: true diff --git a/assets/nf-core-hic_logo.png b/assets/nf-core-hic_logo.png new file mode 100644 index 0000000000000000000000000000000000000000..6b364161664e70224fac3a83fb9f02ed0acbd9f8 Binary files /dev/null and b/assets/nf-core-hic_logo.png differ diff --git a/assets/sendmail_template.txt b/assets/sendmail_template.txt index 2d6712200607cb62f31be950cfe4c54e5ca1838a..9afc48067351f78cb54af40a3d76180866edf729 100644 --- a/assets/sendmail_template.txt +++ b/assets/sendmail_template.txt @@ -8,6 +8,23 @@ Content-Type: text/html; charset=utf-8 $email_html +--nfcoremimeboundary +Content-Type: image/png;name="nf-core-hic_logo.png" +Content-Transfer-Encoding: base64 +Content-ID: <nfcorepipelinelogo> +Content-Disposition: inline; filename="nf-core-hic_logo.png" + +<% out << new File("$baseDir/assets/nf-core-hic_logo.png"). + bytes. + encodeBase64(). + toString(). + tokenize( '\n' )*. + toList()*. + collate( 76 )*. + collect { it.join() }. + flatten(). + join( '\n' ) %> + <% if (mqcFile){ def mqcFileObj = new File("$mqcFile") diff --git a/bin/__pycache__/scrape_software_versions.cpython-36.pyc b/bin/__pycache__/scrape_software_versions.cpython-36.pyc deleted file mode 100644 index 07062ad2b5afae9e2bc03e055e9776da9b619a1c..0000000000000000000000000000000000000000 Binary files a/bin/__pycache__/scrape_software_versions.cpython-36.pyc and /dev/null differ diff --git a/bin/digest_genome.py b/bin/digest_genome.py index ac6d8da3d9f6faa3e9c0960fbecac52ea30da61a..2c29a49e1cf174f12142f78627fd799b83da2788 100755 --- a/bin/digest_genome.py +++ b/bin/digest_genome.py @@ -26,48 +26,47 @@ RE_cutsite = { def find_re_sites(filename, sequences, offset): - infile = open(filename) - chr_id = None - big_str = "" - indices = [] - all_indices = [] - contig_names = [] - c = 0 - for line in infile: - c += 1 - if line.startswith(">"): - print line.split()[0][1:], "..." - # If this is not the first chromosome, find the indices and append - # them to the list - if chr_id is not None: - for rs in range(len(sequences)): - pattern = "(?=%s)" % sequences[rs].lower() - indices += [m.start() + offset[rs] - for m in re.finditer(pattern, big_str)] - indices.sort() - all_indices.append(indices) - indices = [] - - # This is a new chromosome. Empty the sequence string, and add the - # correct chrom id - big_str = "" - chr_id = line.split()[0][1:] - if chr_id in contig_names: - print "The fasta file contains several instance of", - print chr_id, ". Exit." 
- sys.exit(-1) - contig_names.append(chr_id) - else: - # As long as we don't change chromosomes, continue reading the - # file, and appending the sequences - big_str += line.lower().strip() - # Add the indices for the last chromosome - for rs in range(len(sequences)): - pattern = "(?=%s)" % sequences[rs].lower() - indices += [m.start() + offset[rs] - for m in re.finditer(pattern, big_str)] - indices.sort() - all_indices.append(indices) + with open(filename, 'r') as infile: + chr_id = None + big_str = "" + indices = [] + all_indices = [] + contig_names = [] + c = 0 + for line in infile: + c += 1 + if line.startswith(">"): + print("{}...".format(line.split()[0][1:])) + # If this is not the first chromosome, find the indices and append + # them to the list + if chr_id is not None: + for rs in range(len(sequences)): + pattern = "(?={})".format(sequences[rs].lower()) + indices += [m.start() + offset[rs]\ + for m in re.finditer(pattern, big_str)] + indices.sort() + all_indices.append(indices) + indices = [] + + # This is a new chromosome. Empty the sequence string, and add the + # correct chrom id + big_str = "" + chr_id = line.split()[0][1:] + if chr_id in contig_names: + print("The fasta file contains several instance of {}. Exit.".format(chr_id)) + sys.exit(-1) + contig_names.append(chr_id) + else: + # As long as we don't change chromosomes, continue reading the + # file, and appending the sequences + big_str += line.lower().strip() + # Add the indices for the last chromosome + for rs in range(len(sequences)): + pattern = "(?={})".format(sequences[rs].lower()) + indices += [m.start() + offset[rs] + for m in re.finditer(pattern, big_str)] + indices.sort() + all_indices.append(indices) return contig_names, all_indices @@ -76,27 +75,27 @@ def find_chromsomose_lengths(reference_filename): chromosome_lengths = [] chromosome_names = [] length = None - infile = open(reference_filename) - for line in infile: - if line.startswith(">"): - chromosome_names.append(line[1:].strip()) - if length is not None: - chromosome_lengths.append(length) - length = 0 - else: - length += len(line.strip()) - chromosome_lengths.append(length) + with open(reference_filename, 'r') as infile: + for line in infile: + if line.startswith(">"): + chromosome_names.append(line[1:].strip()) + if length is not None: + chromosome_lengths.append(length) + length = 0 + else: + length += len(line.strip()) + chromosome_lengths.append(length) return chromosome_names, np.array(chromosome_lengths) def replaceN(cs): npos = int(cs.find('N')) cseql = [] - if npos!= -1: + if npos != -1: for nuc in ["A","C","G","T"]: tmp = cs.replace('N', nuc, 1) tmpl = replaceN(tmp) - if type(tmpl)==list: + if type(tmpl) == list: cseql = cseql + tmpl else: cseql.append(tmpl) @@ -138,15 +137,15 @@ if __name__ == "__main__": offpos = int(cseq.find('^')) if offpos == -1: - print "Unable to detect offset for", cseq - print "Please, use '^' to specified the cutting position,", - print "i.e A^GATCT for HindIII digestion" + print("Unable to detect offset for {}. 
Please, use '^' to specify the cutting position,\ + i.e A^GATCT for HindIII digestion.".format(cseq)) sys.exit(-1) for nuc in list(set(cs)): - if nuc != 'A' and nuc != 'C' and nuc != 'G' and nuc != 'T' and nuc != 'N' and nuc != '^': - print "Find unexpected character ['",nuc,"']in restriction motif" - print "Note that multiple motifs should be separated by a space (not a comma !)" + if nuc not in ['A','T','G','C','N','^']: + print("Find unexpected character ['{}']in restriction motif".format(nuc)) + print("Note that multiple motifs should be separated by a space (not a comma !)") + sys.exit(-1) offset.append(offpos) @@ -166,9 +165,9 @@ if __name__ == "__main__": if out is None: out = os.path.splitext(filename)[0] + "_fragments.bed" - print "Analyzing", filename - print "Restriction site(s)", ",".join(sequences) - print "Offset(s)", ','.join(str(x) for x in offset) + print("Analyzing", filename) + print("Restriction site(s)", ",".join(sequences)) + print("Offset(s)", ','.join(str(x) for x in offset)) # Read fasta file and look for rs per chromosome contig_names, all_indices = find_re_sites(filename, sequences, offset=offset) @@ -183,17 +182,14 @@ if __name__ == "__main__": valid_fragments.append(valid_fragments_chr) # Write results - print "Writing to", out, "..." - outfile = open(out, "w") - for chrom_name, indices in zip(contig_names, valid_fragments): - frag_id = 0 - for begin, end in indices: - # allow to remove cases where the enzyme cut at - # the first position of the chromosome - if end > begin: - frag_id += 1 - frag_name = "HIC_%s_%d" % (chrom_name, frag_id) - outfile.write( - "%s\t%d\t%d\t%s\t0\t+\n" % (chrom_name, begin, - end, frag_name)) - outfile.close() + print("Writing to {} ...".format(out)) + with open(out, 'w') as outfile: + for chrom_name, indices in zip(contig_names, valid_fragments): + frag_id = 0 + for begin, end in indices: + # allow to remove cases where the enzyme cut at + # the first position of the chromosome + if end > begin: + frag_id += 1 + frag_name = "HIC_{}_{}".format(str(chrom_name), int(frag_id)) + outfile.write("{}\t{}\t{}\t{}\t0\t+\n".format(str(chrom_name), int(begin), int(end), str(frag_name))) diff --git a/bin/ice b/bin/ice deleted file mode 100755 index 10f5f224a6064961a04ac2c09bc5b29286bf5484..0000000000000000000000000000000000000000 --- a/bin/ice +++ /dev/null @@ -1,124 +0,0 @@ -#! 
/usr/bin/env python - -import sys -import argparse -import numpy as np -from scipy import sparse - -import iced -from iced.io import loadtxt, savetxt - - -parser = argparse.ArgumentParser("ICE normalization") -parser.add_argument('filename', - metavar='File to load', - type=str, - help='Path to file of contact counts to load') -parser.add_argument("--results_filename", - "-r", - type=str, - default=None, - help="results_filename") -parser.add_argument("--filtering_perc", "-f", - type=float, - default=None, - help="Percentage of reads to filter out") -parser.add_argument("--filter_low_counts_perc", - type=float, - default=0.02, - help="Percentage of reads to filter out") -parser.add_argument("--filter_high_counts_perc", - type=float, - default=0, - help="Percentage of reads to filter out") -parser.add_argument("--remove-all-zeros-loci", default=False, - action="store_true", - help="If provided, all non-interacting loci will be " - "removed prior to the filtering strategy chosen.") -parser.add_argument("--max_iter", "-m", default=100, type=int, - help="Maximum number of iterations") -parser.add_argument("--eps", "-e", default=0.1, type=float, - help="Precision") -parser.add_argument("--dense", "-d", default=False, action="store_true") -parser.add_argument("--output-bias", "-b", default=False, help="Output the bias vector") -parser.add_argument("--verbose", "-v", default=False) - - -args = parser.parse_args() -filename = args.filename - -# Deprecating filtering_perc option -filter_low_counts = None -if "--filtering_perc" in sys.argv: - DeprecationWarning( - "Option '--filtering_perc' is deprecated. Please use " - "'--filter_low_counts_perc' instead.'") - # And print it again because deprecation warnings are not displayed for - # recent versions of python - print "--filtering_perc is deprecated. Please use filter_low_counts_perc" - print "instead. This option will be removed in ice 0.3" - filter_low_counts = args.filtering_perc -if "--filter_low_counts_perc" in sys.argv and "--filtering_perc" in sys.argv: - raise Warning("This two options are incompatible") -if "--filtering_perc" is None and "--filter_low_counts_perc" not in sys.argv: - filter_low_counts_perc = 0.02 -elif args.filter_low_counts_perc is not None: - filter_low_counts_perc = args.filter_low_counts_perc - -if args.verbose: - print("Using iced version %s" % iced.__version__) - print "Loading files..." - -# Loads file as i, j, counts -i, j, data = loadtxt(filename).T - -# Detecting whether the file is 0 or 1 based. -if min(i.min(), j.min()) == 0: - index_base = 0 - N = max(i.max(), j.max()) + 1 - counts = sparse.coo_matrix((data, (i, j)), shape=(N, N), dtype=float) -else: - index_base = 1 - N = max(i.max(), j.max()) - counts = sparse.coo_matrix((data, (i - 1, j - 1)), shape=(N, N), dtype=float) - -if args.dense: - counts = np.array(counts.todense()) -else: - counts = sparse.csr_matrix(counts) - -if args.verbose: - print "Normalizing..." 
- -if filter_low_counts_perc != 0: - counts = iced.filter.filter_low_counts(counts, - percentage=filter_low_counts_perc, - remove_all_zeros_loci=args.remove_all_zeros_loci, - copy=False, sparsity=False, verbose=args.verbose) -if args.filter_high_counts_perc != 0: - counts = iced.filter.filter_high_counts( - counts, - percentage=args.filter_high_counts_perc, - copy=False) - -counts, bias = iced.normalization.ICE_normalization( - counts, max_iter=args.max_iter, copy=False, - verbose=args.verbose, eps=args.eps, output_bias=True) - -if args.results_filename is None: - results_filename = ".".join( - filename.split(".")[:-1]) + "_normalized." + filename.split(".")[-1] -else: - results_filename = args.results_filename - -counts = sparse.coo_matrix(counts) - -if args.verbose: - print "Writing results..." - -savetxt( - results_filename, counts.col + index_base, counts.row + index_base, counts.data) - - -if args.output_bias: - np.savetxt(results_filename + ".biases", bias) diff --git a/bin/mapped_2hic_dnase.py b/bin/mapped_2hic_dnase.py index 36c5a605d0001de3775bb70e7934d06be7145797..dd023b0023e0c0a7aa4780bcc04289e467ed877b 100755 --- a/bin/mapped_2hic_dnase.py +++ b/bin/mapped_2hic_dnase.py @@ -21,14 +21,14 @@ import pysam def usage(): """Usage function""" - print "Usage : python mapped_2hic_dnase.py" - print "-r/--mappedReadsFile <BAM/SAM file of mapped reads>" - print "[-o/--outputDir] <Output directory. Default is current directory>" - print "[-d/--minCisDist] <Minimum distance between intrachromosomal contact to consider>" - print "[-g/--gtag] <Genotype tag. If specified, this tag will be reported in the valid pairs output for allele specific classification>" - print "[-a/--all] <Write all additional output files, with information about the discarded reads (self-circle, dangling end, etc.)>" - print "[-v/--verbose] <Verbose>" - print "[-h/--help] <Help>" + print("Usage : python mapped_2hic_dnase.py") + print("-r/--mappedReadsFile <BAM/SAM file of mapped reads>") + print("[-o/--outputDir] <Output directory. Default is current directory>") + print("[-d/--minCisDist] <Minimum distance between intrachromosomal contact to consider>") + print("[-g/--gtag] <Genotype tag. 
If specified, this tag will be reported in the valid pairs output for allele specific classification>") + print("[-a/--all] <Write all additional output files, with information about the discarded reads (self-circle, dangling end, etc.)>") + print("[-v/--verbose] <Verbose>") + print("[-h/--help] <Help>") return @@ -78,11 +78,11 @@ def get_read_pos(read, st="start"): list of aligned reads """ if st == "middle": - pos = read.pos + int(read.alen/2) + pos = read.reference_start + int(read.alen/2) elif st =="start": pos = get_read_start(read) elif st == "left": - pos = read.pos + pos = read.reference_start return pos @@ -92,9 +92,9 @@ def get_read_start(read): Return the 5' end of the read """ if read.is_reverse: - pos = read.pos + read.alen -1 + pos = read.reference_start + read.alen -1 else: - pos = read.pos + pos = read.reference_start return pos @@ -108,20 +108,16 @@ def get_ordered_reads(read1, read2): read1 = [AlignedRead] read2 = [AlignedRead] """ - if read1.tid == read2.tid: + if read1.reference_id == read2.reference_id: if get_read_pos(read1) < get_read_pos(read2): - r1 = read1 - r2 = read2 + r1, r2 = read1, read2 else: - r1 = read2 - r2 = read1 + r1, r2 = read2, read1 else: - if read1.tid < read2.tid: - r1 = read1 - r2 = read2 + if read1.reference_id < read2.reference_id: + r1, r2 = read1, read2 else: - r1 = read2 - r2 = read1 + r1, r2 = read2, read1 return r1, r2 @@ -134,7 +130,7 @@ def isIntraChrom(read1, read2): read2 : [AlignedRead] """ - if read1.tid == read2.tid: + if read1.reference_id == read2.reference_id: return True else: return False @@ -187,7 +183,7 @@ def get_cis_dist(read1, read2): def get_read_tag(read, tag): - for t in read.tags: + for t in read.get_tags(): if t[0] == tag: return t[1] return None @@ -229,11 +225,11 @@ if __name__ == "__main__": # Verbose mode if verbose: - print "## overlapMapped2HiCFragments.py" - print "## mappedReadsFile=", mappedReadsFile - print "## minCisDist=", minDist - print "## allOuput=", allOutput - print "## verbose=", verbose, "\n" + print("## overlapMapped2HiCFragments.py") + print("## mappedReadsFile=", mappedReadsFile) + print("## minCisDist=", minDist) + print("## allOuput=", allOutput) + print("## verbose={}\n".format(verbose)) # Initialize variables reads_counter = 0 @@ -271,7 +267,7 @@ if __name__ == "__main__": # Read the SAM/BAM file if verbose: - print "## Opening SAM/BAM file '", mappedReadsFile, "'..." 
+ print("## Opening SAM/BAM file {} ...".format(mappedReadsFile)) samfile = pysam.Samfile(mappedReadsFile, "rb") # Reads are 0-based too (for both SAM and BAM format) @@ -286,7 +282,7 @@ if __name__ == "__main__": if read.is_read1: r1 = read if not r1.is_unmapped: - r1_chrom = samfile.getrname(r1.tid) + r1_chrom = samfile.get_reference_name(r1.reference_id) else: r1_chrom = None @@ -294,11 +290,11 @@ if __name__ == "__main__": elif read.is_read2: r2 = read if not r2.is_unmapped: - r2_chrom = samfile.getrname(r2.tid) + r2_chrom = samfile.get_reference_name(r2.reference_id) else: r2_chrom = None - if isIntraChrom(r1,r2): + if isIntraChrom(r1, r2): dist = get_cis_dist(r1, r2) else: dist = None @@ -368,8 +364,8 @@ if __name__ == "__main__": ##reorient reads to ease duplicates removal or1, or2 = get_ordered_reads(r1, r2) - or1_chrom = samfile.getrname(or1.tid) - or2_chrom = samfile.getrname(or2.tid) + or1_chrom = samfile.get_reference_name(or1.reference_id) + or2_chrom = samfile.get_reference_name(or2.reference_id) ##reset as tag now that the reads are oriented r1as = get_read_tag(or1, gtag) @@ -378,7 +374,7 @@ if __name__ == "__main__": htag = str(r1as)+"-"+str(r2as) cur_handler.write( - or1.qname + "\t" + + or1.query_name + "\t" + or1_chrom + "\t" + str(get_read_pos(or1)+1) + "\t" + str(get_read_strand(or1)) + "\t" + @@ -394,7 +390,7 @@ if __name__ == "__main__": elif r2.is_unmapped and not r1.is_unmapped: cur_handler.write( - r1.qname + "\t" + + r1.query_name + "\t" + r1_chrom + "\t" + str(get_read_pos(r1)+1) + "\t" + str(get_read_strand(r1)) + "\t" + @@ -408,7 +404,7 @@ if __name__ == "__main__": "*" + "\n") elif r1.is_unmapped and not r2.is_unmapped: cur_handler.write( - r2.qname + "\t" + + r2.query_name + "\t" + "*" + "\t" + "*" + "\t" + "*" + "\t" + @@ -422,7 +418,7 @@ if __name__ == "__main__": str(r2.mapping_quality) + "\n") if (reads_counter % 100000 == 0 and verbose): - print "##", reads_counter + print("##", reads_counter) # Close handler handle_valid.close() @@ -432,33 +428,28 @@ if __name__ == "__main__": handle_filt.close() # Write stats file - handle_stat = open(outputDir + '/' + baseReadsFile + '.RSstat', 'w') - handle_stat.write("## Hi-C processing - no restriction fragments\n") - handle_stat.write("Valid_interaction_pairs\t" + str(valid_counter) + "\n") - handle_stat.write( - "Valid_interaction_pairs_FF\t" + str(valid_counter_FF) + "\n") - handle_stat.write( - "Valid_interaction_pairs_RR\t" + str(valid_counter_RR) + "\n") - handle_stat.write( - "Valid_interaction_pairs_RF\t" + str(valid_counter_RF) + "\n") - handle_stat.write( - "Valid_interaction_pairs_FR\t" + str(valid_counter_FR) + "\n") - handle_stat.write("Single-end_pairs\t" + str(single_counter) + "\n") - handle_stat.write("Filtered_pairs\t" + str(filt_counter) + "\n") - handle_stat.write("Dumped_pairs\t" + str(dump_counter) + "\n") + with open(outputDir + '/' + baseReadsFile + '.RSstat', 'w') as handle_stat: + handle_stat.write("## Hi-C processing - no restriction fragments\n") + handle_stat.write("Valid_interaction_pairs\t" + str(valid_counter) + "\n") + handle_stat.write("Valid_interaction_pairs_FF\t" + str(valid_counter_FF) + "\n") + handle_stat.write("Valid_interaction_pairs_RR\t" + str(valid_counter_RR) + "\n") + handle_stat.write("Valid_interaction_pairs_RF\t" + str(valid_counter_RF) + "\n") + handle_stat.write("Valid_interaction_pairs_FR\t" + str(valid_counter_FR) + "\n") + handle_stat.write("Single-end_pairs\t" + str(single_counter) + "\n") + handle_stat.write("Filtered_pairs\t" + str(filt_counter) + "\n") + 
handle_stat.write("Dumped_pairs\t" + str(dump_counter) + "\n") ## Write AS report - if gtag is not None: - handle_stat.write("## ======================================\n") - handle_stat.write("## Allele specific information\n") - handle_stat.write("Valid_pairs_from_ref_genome_(1-1)\t" + str(G1G1_ascounter) + "\n") - handle_stat.write("Valid_pairs_from_ref_genome_with_one_unassigned_mate_(0-1/1-0)\t" + str(UG1_ascounter+G1U_ascounter) + "\n") - handle_stat.write("Valid_pairs_from_alt_genome_(2-2)\t" + str(G2G2_ascounter) + "\n") - handle_stat.write("Valid_pairs_from_alt_genome_with_one_unassigned_mate_(0-2/2-0)\t" + str(UG2_ascounter+G2U_ascounter) + "\n") - handle_stat.write("Valid_pairs_from_alt_and_ref_genome_(1-2/2-1)\t" + str(G1G2_ascounter+G2G1_ascounter) + "\n") - handle_stat.write("Valid_pairs_with_both_unassigned_mated_(0-0)\t" + str(UU_ascounter) + "\n") - handle_stat.write("Valid_pairs_with_at_least_one_conflicting_mate_(3-)\t" + str(CF_ascounter) + "\n") - - handle_stat.close() + if gtag is not None: + handle_stat.write("## ======================================\n") + handle_stat.write("## Allele specific information\n") + handle_stat.write("Valid_pairs_from_ref_genome_(1-1)\t" + str(G1G1_ascounter) + "\n") + handle_stat.write("Valid_pairs_from_ref_genome_with_one_unassigned_mate_(0-1/1-0)\t" + str(UG1_ascounter+G1U_ascounter) + "\n") + handle_stat.write("Valid_pairs_from_alt_genome_(2-2)\t" + str(G2G2_ascounter) + "\n") + handle_stat.write("Valid_pairs_from_alt_genome_with_one_unassigned_mate_(0-2/2-0)\t" + str(UG2_ascounter+G2U_ascounter) + "\n") + handle_stat.write("Valid_pairs_from_alt_and_ref_genome_(1-2/2-1)\t" + str(G1G2_ascounter+G2G1_ascounter) + "\n") + handle_stat.write("Valid_pairs_with_both_unassigned_mated_(0-0)\t" + str(UU_ascounter) + "\n") + handle_stat.write("Valid_pairs_with_at_least_one_conflicting_mate_(3-)\t" + str(CF_ascounter) + "\n") + diff --git a/bin/mapped_2hic_fragments.py b/bin/mapped_2hic_fragments.py index d4790ee3114a071cff3131543159d9124bbab1c6..e823ee02cce862b704c2b6939d1642db579665be 100755 --- a/bin/mapped_2hic_fragments.py +++ b/bin/mapped_2hic_fragments.py @@ -12,7 +12,6 @@ Script to keep only valid 3C products - DE and SC are removed Output is : readname / """ - import time import getopt import sys @@ -24,20 +23,20 @@ from bx.intervals.intersection import Intersecter, Interval def usage(): """Usage function""" - print "Usage : python mapped_2hic_fragments.py" - print "-f/--fragmentFile <Restriction fragment file GFF3>" - print "-r/--mappedReadsFile <BAM/SAM file of mapped reads>" - print "[-o/--outputDir] <Output directory. Default is current directory>" - print "[-s/--shortestInsertSize] <Shortest insert size of mapped reads to consider>" - print "[-l/--longestInsertSize] <Longest insert size of mapped reads to consider>" - print "[-t/--shortestFragmentLength] <Shortest restriction fragment length to consider>" - print "[-m/--longestFragmentLength] <Longest restriction fragment length to consider>" - print "[-d/--minCisDist] <Minimum distance between intrachromosomal contact to consider>" - print "[-g/--gtag] <Genotype tag. 
If specified, this tag will be reported in the valid pairs output for allele specific classification>" - print "[-a/--all] <Write all additional output files, with information about the discarded reads (self-circle, dangling end, etc.)>" - print "[-S/--sam] <Output an additional SAM file with flag 'CT' for pairs classification>" - print "[-v/--verbose] <Verbose>" - print "[-h/--help] <Help>" + print("Usage : python mapped_2hic_fragments.py") + print("-f/--fragmentFile <Restriction fragment file GFF3>") + print("-r/--mappedReadsFile <BAM/SAM file of mapped reads>") + print("[-o/--outputDir] <Output directory. Default is current directory>") + print("[-s/--shortestInsertSize] <Shortest insert size of mapped reads to consider>") + print("[-l/--longestInsertSize] <Longest insert size of mapped reads to consider>") + print("[-t/--shortestFragmentLength] <Shortest restriction fragment length to consider>") + print("[-m/--longestFragmentLength] <Longest restriction fragment length to consider>") + print("[-d/--minCisDist] <Minimum distance between intrachromosomal contact to consider>") + print("[-g/--gtag] <Genotype tag. If specified, this tag will be reported in the valid pairs output for allele specific classification>") + print("[-a/--all] <Write all additional output files, with information about the discarded reads (self-circle, dangling end, etc.)>") + print("[-S/--sam] <Output an additional SAM file with flag 'CT' for pairs classification>") + print("[-v/--verbose] <Verbose>") + print("[-h/--help] <Help>") return @@ -67,7 +66,7 @@ def timing(function, *args): """ startTime = time.time() result = function(*args) - print '%s function took %0.3f ms' % (function.func_name, (time.time() - startTime) * 1000) + print('{} function took {:.3f}ms'.format(function.__name__, (time.time() - startTime) * 1000)) return result @@ -96,8 +95,7 @@ def isIntraChrom(read1, read2): """ if read1.tid == read2.tid: return True - else: - return False + return False def get_cis_dist(read1, read2): @@ -114,8 +112,7 @@ def get_cis_dist(read1, read2): if not read1.is_unmapped and not read2.is_unmapped: ## Contact distances can be calculated for intrachromosomal reads only if isIntraChrom(read1, read2): - r1pos = get_read_pos(read1) - r2pos = get_read_pos(read2) + r1pos, r2pos = get_read_pos(read1), get_read_pos(read2) dist = abs(r1pos - r2pos) return dist @@ -138,11 +135,11 @@ def get_read_pos(read, st="start"): """ if st == "middle": - pos = read.pos + int(read.alen/2) + pos = read.reference_start + int(read.alen/2) elif st =="start": pos = get_read_start(read) elif st == "left": - pos = read.pos + pos = read.reference_start return pos @@ -152,9 +149,9 @@ def get_read_start(read): Return the 5' end of the read """ if read.is_reverse: - pos = read.pos + read.alen -1 + pos = read.reference_start + read.alen -1 else: - pos = read.pos + pos = read.reference_start return pos def get_ordered_reads(read1, read2): @@ -178,18 +175,14 @@ def get_ordered_reads(read1, read2): """ if read1.tid == read2.tid: if get_read_pos(read1) < get_read_pos(read2): - r1 = read1 - r2 = read2 + r1, r2 = read1, read2 else: - r1 = read2 - r2 = read1 + r1, r2 = read2, read1 else: if read1.tid < read2.tid: - r1 = read1 - r2 = read2 + r1, r2 = read1, read2 else: - r1 = read2 - r2 = read1 + r1, r2 = read2, read1 return r1, r2 @@ -206,46 +199,44 @@ def load_restriction_fragment(in_file, minfragsize=None, maxfragsize=None, verbo """ resFrag = {} if verbose: - print "## Loading Restriction File Intervals '", in_file, "'..." 
- + print("## Loading Restriction File Intervals {} ...".format(in_file)) bed_handle = open(in_file) nline = 0 nfilt = 0 for line in bed_handle: - nline +=1 - bedtab = line.split("\t") - try: - chromosome, start, end, name = bedtab[:4] - except ValueError: - print "Warning : wrong input format in line", nline,". Not a BED file !?" - continue + nline += 1 + bedtab = line.split("\t") + try: + chromosome, start, end, name = bedtab[:4] + except ValueError: + print("Warning : wrong input format in line {}. Not a BED file ?!".format(nline)) + continue # BED files are zero-based as Intervals objects - start = int(start) # + 1 - end = int(end) - fragl = abs(end - start) - name = name.strip() - - ## Discard fragments outside the size range - filt=False - if minfragsize != None and int(fragl) < int(minfragsize): - nfilt+=1 - filt=True - elif maxfragsize != None and int(fragl) > int(maxfragsize): - nfilt+=1 - filt=True + start = int(start) # + 1 + end = int(end) + fragl = abs(end - start) + name = name.strip() + + ## Discard fragments outside the size range + filt = False + if minfragsize != None and int(fragl) < int(minfragsize): + nfilt += 1 + filt = True + elif maxfragsize != None and int(fragl) > int(maxfragsize): + nfilt += 1 + filt = True - if chromosome in resFrag: - tree = resFrag[chromosome] - tree.add_interval(Interval(start, end, value={'name': name, 'filter': filt})) - else: - tree = Intersecter() - tree.add_interval(Interval(start, end, value={'name': name, 'filter': filt})) - resFrag[chromosome] = tree + if chromosome in resFrag: + tree = resFrag[chromosome] + tree.add_interval(Interval(start, end, value={'name': name, 'filter': filt})) + else: + tree = Intersecter() + tree.add_interval(Interval(start, end, value={'name': name, 'filter': filt})) + resFrag[chromosome] = tree if nfilt > 0: - print "Warning : ", nfilt ,"fragment(s) outside of range and discarded. ", nline - nfilt, " remaining." - + print("Warning : {} fragment(s) outside of range and discarded. 
{} remaining.".format(nfilt, nline - nfilt)) bed_handle.close() return resFrag @@ -260,22 +251,22 @@ def get_overlapping_restriction_fragment(resFrag, chrom, read): read = the read to intersect [AlignedRead] """ - # Get read position (middle or 5' end) + # Get read position (middle or start) pos = get_read_pos(read, st="middle") if chrom in resFrag: # Overlap with the position of the read (zero-based) resfrag = resFrag[chrom].find(pos, pos+1) if len(resfrag) > 1: - print "Warning : ", len(resfrag), " restriction fragments found for ", read.qname, "- skipped" + print("Warning : {} restictions fragments found for {} -skipped".format(len(resfrag), read.query_name)) return None elif len(resfrag) == 0: - print "Warning - no restriction fragments for ", read.qname ," at ", chrom, ":", pos + print("Warning - no restriction fragments for {} at {} : {}".format(read.query_name, chrom, pos)) return None else: return resfrag[0] else: - print "Warning - no restriction fragments for ", read.qname," at ", chrom, ":", pos + print("Warning - no restriction fragments for {} at {} : {}".format(read.qname, chrom, pos)) return None @@ -301,11 +292,11 @@ def is_religation(read1, read2, frag1, frag2): Check the orientation of reads -><- """ - ret=False + ret = False if are_contiguous_fragments(frag1, frag2, read1.tid, read2.tid): #r1, r2 = get_ordered_reads(read1, read2) #if get_read_strand(r1) == "+" and get_read_strand(r2) == "-": - ret=True + ret = True return ret @@ -374,8 +365,8 @@ def get_PE_fragment_size(read1, read2, resFrag1, resFrag2, interactionType): read1 : [AlignedRead] read2 : [AlignedRead] - resfrag1 = restrictin fragment overlapping the R1 read [interval] - resfrag1 = restrictin fragment overlapping the R1 read [interval] + resfrag1 = restriction fragment overlapping the R1 read [interval] + resfrag1 = restriction fragment overlapping the R1 read [interval] interactionType : Type of interaction from get_interaction_type() [str] """ @@ -463,7 +454,7 @@ def get_interaction_type(read1, read1_chrom, resfrag1, read2, def get_read_tag(read, tag): - for t in read.tags: + for t in read.get_tags(): if t[0] == tag: return t[1] return None @@ -520,16 +511,16 @@ if __name__ == "__main__": # Verbose mode if verbose: - print "## overlapMapped2HiCFragments.py" - print "## mappedReadsFile=", mappedReadsFile - print "## fragmentFile=", fragmentFile - print "## minInsertSize=", minInsertSize - print "## maxInsertSize=", maxInsertSize - print "## minFragSize=", minFragSize - print "## maxFragSize=", maxFragSize - print "## allOuput=", allOutput - print "## SAM ouput=", samOut - print "## verbose=", verbose, "\n" + print("## overlapMapped2HiCFragments.py") + print("## mappedReadsFile=", mappedReadsFile) + print("## fragmentFile=", fragmentFile) + print("## minInsertSize=", minInsertSize) + print("## maxInsertSize=", maxInsertSize) + print("## minFragSize=", minFragSize) + print("## maxFragSize=", maxFragSize) + print("## allOuput=", allOutput) + print("## SAM ouput=", samOut) + print("## verbose={}\n".format(verbose)) # Initialize variables reads_counter = 0 @@ -576,7 +567,7 @@ if __name__ == "__main__": # Read the SAM/BAM file if verbose: - print "## Opening SAM/BAM file '", mappedReadsFile, "'..." + print("## Opening SAM/BAM file {} ...".format(mappedReadsFile)) samfile = pysam.Samfile(mappedReadsFile, "rb") if samOut: @@ -585,7 +576,7 @@ if __name__ == "__main__": # Reads are 0-based too (for both SAM and BAM format) # Loop on all reads if verbose: - print "## Classifying Interactions ..." 
+ print("## Classifying Interactions ...") for read in samfile.fetch(until_eof=True): reads_counter += 1 @@ -596,7 +587,7 @@ if __name__ == "__main__": if read.is_read1: r1 = read if not r1.is_unmapped: - r1_chrom = samfile.getrname(r1.tid) + r1_chrom = samfile.get_reference_name(r1.tid) r1_resfrag = get_overlapping_restriction_fragment(resFrag, r1_chrom, r1) else: r1_resfrag = None @@ -606,7 +597,7 @@ if __name__ == "__main__": elif read.is_read2: r2 = read if not r2.is_unmapped: - r2_chrom = samfile.getrname(r2.tid) + r2_chrom = samfile.get_reference_name(r2.tid) r2_resfrag = get_overlapping_restriction_fragment(resFrag, r2_chrom, r2) else: r2_resfrag = None @@ -706,8 +697,8 @@ if __name__ == "__main__": if not r1.is_unmapped and not r2.is_unmapped: ##reorient reads to ease duplicates removal or1, or2 = get_ordered_reads(r1, r2) - or1_chrom = samfile.getrname(or1.tid) - or2_chrom = samfile.getrname(or2.tid) + or1_chrom = samfile.get_reference_name(or1.tid) + or2_chrom = samfile.get_reference_name(or2.tid) ##reset as tag now that the reads are oriented r1as = get_read_tag(or1, gtag) @@ -734,7 +725,7 @@ if __name__ == "__main__": or2_fragname = 'None' cur_handler.write( - or1.qname + "\t" + + or1.query_name + "\t" + or1_chrom + "\t" + str(get_read_pos(or1)+1) + "\t" + str(get_read_strand(or1)) + "\t" + @@ -753,7 +744,7 @@ if __name__ == "__main__": r1_fragname = r1_resfrag.value['name'] cur_handler.write( - r1.qname + "\t" + + r1.query_name + "\t" + r1_chrom + "\t" + str(get_read_pos(r1)+1) + "\t" + str(get_read_strand(r1)) + "\t" + @@ -770,7 +761,7 @@ if __name__ == "__main__": r2_fragname = r2_resfrag.value['name'] cur_handler.write( - r2.qname + "\t" + + r2.query_name + "\t" + "*" + "\t" + "*" + "\t" + "*" + "\t" + @@ -791,7 +782,7 @@ if __name__ == "__main__": handle_sam.write(r2) if (reads_counter % 100000 == 0 and verbose): - print "##", reads_counter + print("##", reads_counter) # Close handler handle_valid.close() @@ -808,14 +799,10 @@ if __name__ == "__main__": handle_stat = open(outputDir + '/' + baseReadsFile + '.RSstat', 'w') handle_stat.write("## Hi-C processing\n") handle_stat.write("Valid_interaction_pairs\t" + str(valid_counter) + "\n") - handle_stat.write( - "Valid_interaction_pairs_FF\t" + str(valid_counter_FF) + "\n") - handle_stat.write( - "Valid_interaction_pairs_RR\t" + str(valid_counter_RR) + "\n") - handle_stat.write( - "Valid_interaction_pairs_RF\t" + str(valid_counter_RF) + "\n") - handle_stat.write( - "Valid_interaction_pairs_FR\t" + str(valid_counter_FR) + "\n") + handle_stat.write("Valid_interaction_pairs_FF\t" + str(valid_counter_FF) + "\n") + handle_stat.write("Valid_interaction_pairs_RR\t" + str(valid_counter_RR) + "\n") + handle_stat.write("Valid_interaction_pairs_RF\t" + str(valid_counter_RF) + "\n") + handle_stat.write("Valid_interaction_pairs_FR\t" + str(valid_counter_FR) + "\n") handle_stat.write("Dangling_end_pairs\t" + str(de_counter) + "\n") handle_stat.write("Religation_pairs\t" + str(re_counter) + "\n") handle_stat.write("Self_Cycle_pairs\t" + str(sc_counter) + "\n") @@ -839,4 +826,3 @@ if __name__ == "__main__": if samOut: samfile.close() - diff --git a/bin/markdown_to_html.py b/bin/markdown_to_html.py new file mode 100755 index 0000000000000000000000000000000000000000..57cc4263fe4182373949388b5aa88e20d60a3c70 --- /dev/null +++ b/bin/markdown_to_html.py @@ -0,0 +1,100 @@ +#!/usr/bin/env python +from __future__ import print_function +import argparse +import markdown +import os +import sys + +def convert_markdown(in_fn): + input_md = open(in_fn, 
mode="r", encoding="utf-8").read() + html = markdown.markdown( + "[TOC]\n" + input_md, + extensions = [ + 'pymdownx.extra', + 'pymdownx.b64', + 'pymdownx.highlight', + 'pymdownx.emoji', + 'pymdownx.tilde', + 'toc' + ], + extension_configs = { + 'pymdownx.b64': { + 'base_path': os.path.dirname(in_fn) + }, + 'pymdownx.highlight': { + 'noclasses': True + }, + 'toc': { + 'title': 'Table of Contents' + } + } + ) + return html + +def wrap_html(contents): + header = """<!DOCTYPE html><html> + <head> + <link rel="stylesheet" href="https://stackpath.bootstrapcdn.com/bootstrap/4.3.1/css/bootstrap.min.css" integrity="sha384-ggOyR0iXCbMQv3Xipma34MD+dH/1fQ784/j6cY/iJTQUOhcWr7x9JvoRxT2MZw1T" crossorigin="anonymous"> + <style> + body { + font-family: -apple-system,BlinkMacSystemFont,"Segoe UI",Roboto,"Helvetica Neue",Arial,"Noto Sans",sans-serif,"Apple Color Emoji","Segoe UI Emoji","Segoe UI Symbol","Noto Color Emoji"; + padding: 3em; + margin-right: 350px; + max-width: 100%; + } + .toc { + position: fixed; + right: 20px; + width: 300px; + padding-top: 20px; + overflow: scroll; + height: calc(100% - 3em - 20px); + } + .toctitle { + font-size: 1.8em; + font-weight: bold; + } + .toc > ul { + padding: 0; + margin: 1rem 0; + list-style-type: none; + } + .toc > ul ul { padding-left: 20px; } + .toc > ul > li > a { display: none; } + img { max-width: 800px; } + pre { + padding: 0.6em 1em; + } + h2 { + + } + </style> + </head> + <body> + <div class="container"> + """ + footer = """ + </div> + </body> + </html> + """ + return header + contents + footer + + +def parse_args(args=None): + parser = argparse.ArgumentParser() + parser.add_argument('mdfile', type=argparse.FileType('r'), nargs='?', + help='File to convert. Defaults to stdin.') + parser.add_argument('-o', '--out', type=argparse.FileType('w'), + default=sys.stdout, + help='Output file name. 
Defaults to stdout.') + return parser.parse_args(args) + +def main(args=None): + args = parse_args(args) + converted_md = convert_markdown(args.mdfile.name) + html = wrap_html(converted_md) + args.out.write(html) + +if __name__ == '__main__': + sys.exit(main()) diff --git a/bin/markdown_to_html.r b/bin/markdown_to_html.r deleted file mode 100755 index abe1335070d84f0d9a17dae7b6d482341f7f59a8..0000000000000000000000000000000000000000 --- a/bin/markdown_to_html.r +++ /dev/null @@ -1,51 +0,0 @@ -#!/usr/bin/env Rscript - -# Command line argument processing -args = commandArgs(trailingOnly=TRUE) -if (length(args) < 2) { - stop("Usage: markdown_to_html.r <input.md> <output.html>", call.=FALSE) -} -markdown_fn <- args[1] -output_fn <- args[2] - -# Load / install packages -if (!require("markdown")) { - install.packages("markdown", dependencies=TRUE, repos='http://cloud.r-project.org/') - library("markdown") -} - -base_css_fn <- getOption("markdown.HTML.stylesheet") -base_css <- readChar(base_css_fn, file.info(base_css_fn)$size) -custom_css <- paste(base_css, " -body { - padding: 3em; - margin-right: 350px; - max-width: 100%; -} -#toc { - position: fixed; - right: 20px; - width: 300px; - padding-top: 20px; - overflow: scroll; - height: calc(100% - 3em - 20px); -} -#toc_header { - font-size: 1.8em; - font-weight: bold; -} -#toc > ul { - padding-left: 0; - list-style-type: none; -} -#toc > ul ul { padding-left: 20px; } -#toc > ul > li > a { display: none; } -img { max-width: 800px; } -") - -markdownToHTML( - file = markdown_fn, - output = output_fn, - stylesheet = custom_css, - options = c('toc', 'base64_images', 'highlight_code') -) diff --git a/bin/mergeSAM.py b/bin/mergeSAM.py index fdf0c67dfc24f161266c48506bdfda6b3eb7c899..12917b16277a0a768269f611cd13422bccbe98a1 100755 --- a/bin/mergeSAM.py +++ b/bin/mergeSAM.py @@ -19,20 +19,19 @@ import sys import os import re import pysam -from itertools import izip def usage(): """Usage function""" - print "Usage : python mergeSAM.py" - print "-f/--forward <forward read mapped file>" - print "-r/--reverse <reverse read mapped file>" - print "[-o/--output] <Output file. Default is stdin>" - print "[-s/--single] <report singleton>" - print "[-m/--multi] <report multiple hits>" - print "[-q/--qual] <minimum reads mapping quality>" - print "[-t/--stat] <generate a stat file>" - print "[-v/--verbose] <Verbose>" - print "[-h/--help] <Help>" + print("Usage : python mergeSAM.py") + print("-f/--forward <forward read mapped file>") + print("-r/--reverse <reverse read mapped file>") + print("[-o/--output] <Output file. 
Default is stdin>") + print("[-s/--single] <report singleton>") + print("[-m/--multi] <report multiple hits>") + print("[-q/--qual] <minimum reads mapping quality>") + print("[-t/--stat] <generate a stat file>") + print("[-v/--verbose] <Verbose>") + print("[-h/--help] <Help>") return @@ -53,37 +52,36 @@ def get_args(): def is_unique_bowtie2(read): - ret = False - if not read.is_unmapped and read.has_tag('AS'): - if read.has_tag('XS'): - primary = read.get_tag('AS') - secondary = read.get_tag('XS') - if (primary > secondary): - ret = True - else: - ret = True - - return ret + ret = False + if not read.is_unmapped and read.has_tag('AS'): + if read.has_tag('XS'): + primary = read.get_tag('AS') + secondary = read.get_tag('XS') + if (primary > secondary): + ret = True + else: + ret = True + return ret ## Remove everything after "/" or " " in read's name def get_read_name(read): - name = read.qname + name = read.query_name #return name.split("/",1)[0] return re.split('/| ', name)[0] def sam_flag(read1, read2, hr1, hr2): + + f1 = read1.flag + f2 = read2.flag - f1 = read1.flag - f2 = read2.flag - - if r1.is_unmapped == False: - r1_chrom = hr1.getrname(r1.tid) - else: - r1_chrom="*" - if r2.is_unmapped == False: - r2_chrom = hr2.getrname(r2.tid) - else: - r2_chrom="*" + if r1.is_unmapped == False: + r1_chrom = hr1.get_reference_name(r1.reference_id) + else: + r1_chrom = "*" + if r2.is_unmapped == False: + r2_chrom = hr2.get_reference_name(r2.reference_id) + else: + r2_chrom="*" ##Relevant bitwise flags (flag in an 11-bit binary number) @@ -101,226 +99,221 @@ def sam_flag(read1, read2, hr1, hr2): ##Output example: a paired-end read that aligns to the reverse strand ##and is the first mate in the pair will have flag 83 (= 64 + 16 + 2 + 1) - if f1 & 0x4: - f1 = f1 | 0x8 + if f1 & 0x4: + f1 = f1 | 0x8 - if f2 & 0x4: - f2 = f2 | 0x8 + if f2 & 0x4: + f2 = f2 | 0x8 - if (not (f1 & 0x4) and not (f2 & 0x4)): + if (not (f1 & 0x4) and not (f2 & 0x4)): ##The flag should now indicate this is paired-end data - f1 = f1 | 0x1 - f1 = f1 | 0x2 - f2 = f2 | 0x1 - f2 = f2 | 0x2 + f1 = f1 | 0x1 + f1 = f1 | 0x2 + f2 = f2 | 0x1 + f2 = f2 | 0x2 ##Indicate if the pair is on the reverse strand - if f1 & 0x10: - f2 = f2 | 0x20 + if f1 & 0x10: + f2 = f2 | 0x20 - if f2 & 0x10: - f1 = f1 | 0x20 + if f2 & 0x10: + f1 = f1 | 0x20 ##Is this first or the second pair? - f1 = f1 | 0x40 - f2 = f2 | 0x80 + f1 = f1 | 0x40 + f2 = f2 | 0x80 ##Insert the modified bitwise flags into the reads - read1.flag = f1 - read2.flag = f2 - - ##Determine the RNEXT and PNEXT values (i.e. the positional values of a read's pair) - #RNEXT - if r1_chrom == r2_chrom: - read1.rnext = r1.tid - read2.rnext = r1.tid - else: - read1.rnext = r2.tid - read2.rnext = r1.tid - - #PNEXT - read1.pnext = read2.pos - read2.pnext = read1.pos - - return(read1, read2) + read1.flag = f1 + read2.flag = f2 + + ##Determine the RNEXT and PNEXT values (i.e. 
the positional values of a read's pair) + #RNEXT + if r1_chrom == r2_chrom: + read1.next_reference_id = r1.reference_id + read2.next_reference_id = r1.reference_id + else: + read1.next_reference_id = r2.reference_id + read2.next_reference_id = r1.reference_id + #PNEXT + read1.next_reference_start = read2.reference_start + read2.next_reference_start = read1.reference_start + + return(read1, read2) if __name__ == "__main__": ## Read command line arguments - opts = get_args() - inputFile = None - outputFile = None - mapq = None - report_single = False - report_multi = False - verbose = False - stat = False - output = "-" - - if len(opts) == 0: - usage() - sys.exit() - - for opt, arg in opts: - if opt in ("-h", "--help"): - usage() - sys.exit() - elif opt in ("-f", "--forward"): - R1file = arg - elif opt in ("-r", "--reverse"): - R2file = arg - elif opt in ("-o", "--output"): - output = arg - elif opt in ("-q", "--qual"): - mapq = arg - elif opt in ("-s", "--single"): - report_single = True - elif opt in ("-m", "--multi"): - report_multi = True - elif opt in ("-t", "--stat"): - stat = True - elif opt in ("-v", "--verbose"): - verbose = True - else: - assert False, "unhandled option" + opts = get_args() + inputFile = None + outputFile = None + mapq = None + report_single = False + report_multi = False + verbose = False + stat = False + output = "-" + + if len(opts) == 0: + usage() + sys.exit() + + for opt, arg in opts: + if opt in ("-h", "--help"): + usage() + sys.exit() + elif opt in ("-f", "--forward"): + R1file = arg + elif opt in ("-r", "--reverse"): + R2file = arg + elif opt in ("-o", "--output"): + output = arg + elif opt in ("-q", "--qual"): + mapq = arg + elif opt in ("-s", "--single"): + report_single = True + elif opt in ("-m", "--multi"): + report_multi = True + elif opt in ("-t", "--stat"): + stat = True + elif opt in ("-v", "--verbose"): + verbose = True + else: + assert False, "unhandled option" ## Verbose mode - if verbose: - print "## mergeBAM.py" - print "## forward=", R1file - print "## reverse=", R2file - print "## output=", output - print "## min mapq=", mapq - print "## report_single=", report_single - print "## report_multi=", report_multi - print "## verbose=", verbose + if verbose: + print("## mergeBAM.py") + print("## forward=", R1file) + print("## reverse=", R2file) + print("## output=", output) + print("## min mapq=", mapq) + print("## report_single=", report_single) + print("## report_multi=", report_multi) + print("## verbose=", verbose) ## Initialize variables - tot_pairs_counter = 0 - multi_pairs_counter = 0 - uniq_pairs_counter = 0 - unmapped_pairs_counter = 0 - lowq_pairs_counter = 0 - multi_singles_counter = 0 - uniq_singles_counter = 0 - lowq_singles_counter = 0 + tot_pairs_counter = 0 + multi_pairs_counter = 0 + uniq_pairs_counter = 0 + unmapped_pairs_counter = 0 + lowq_pairs_counter = 0 + multi_singles_counter = 0 + uniq_singles_counter = 0 + lowq_singles_counter = 0 #local_counter = 0 - paired_reads_counter = 0 - singleton_counter = 0 - reads_counter = 0 - r1 = None - r2 = None + paired_reads_counter = 0 + singleton_counter = 0 + reads_counter = 0 + r1 = None + r2 = None ## Reads are 0-based too (for both SAM and BAM format) ## Loop on all reads - if verbose: - print "## Merging forward and reverse tags ..." 
- - with pysam.Samfile(R1file, "rb") as hr1, pysam.Samfile(R2file, "rb") as hr2: - if output == "-": - outfile = pysam.AlignmentFile(output, "w", template=hr1) - else: - outfile = pysam.AlignmentFile(output, "wb", template=hr1) - for r1, r2 in izip(hr1.fetch(until_eof=True), hr2.fetch(until_eof=True)): - reads_counter +=1 + if verbose: + print("## Merging forward and reverse tags ...") + with pysam.Samfile(R1file, "rb") as hr1, pysam.Samfile(R2file, "rb") as hr2: + if output == "-": + outfile = pysam.AlignmentFile(output, "w", template=hr1) + else: + outfile = pysam.AlignmentFile(output, "wb", template=hr1) + for r1, r2 in zip(hr1.fetch(until_eof=True), hr2.fetch(until_eof=True)): + reads_counter +=1 #print r1 #print r2 #print hr1.getrname(r1.tid) #print hr2.getrname(r2.tid) - if (reads_counter % 1000000 == 0 and verbose): - print "##", reads_counter + if (reads_counter % 1000000 == 0 and verbose): + print("##", reads_counter) - if get_read_name(r1) == get_read_name(r2): + if get_read_name(r1) == get_read_name(r2): ## both unmapped - if r1.is_unmapped == True and r2.is_unmapped == True: - unmapped_pairs_counter += 1 - continue + if r1.is_unmapped == True and r2.is_unmapped == True: + unmapped_pairs_counter += 1 + continue ## both mapped - elif r1.is_unmapped == False and r2.is_unmapped == False: + elif r1.is_unmapped == False and r2.is_unmapped == False: ## quality - if mapq != None and (r1.mapping_quality < int(mapq) or r2.mapping_quality < int(mapq)): - lowq_pairs_counter += 1 - continue + if mapq != None and (r1.mapping_quality < int(mapq) or r2.mapping_quality < int(mapq)): + lowq_pairs_counter += 1 + continue ## Unique mapping - if is_unique_bowtie2(r1) == True and is_unique_bowtie2(r2) == True: - uniq_pairs_counter += 1 - else: - multi_pairs_counter += 1 - if report_multi == False: - continue + if is_unique_bowtie2(r1) == True and is_unique_bowtie2(r2) == True: + uniq_pairs_counter += 1 + else: + multi_pairs_counter += 1 + if report_multi == False: + continue # one end mapped, other is not - else: - singleton_counter += 1 - if report_single == False: - continue - if r1.is_unmapped == False: ## first end is mapped, second is not + else: + singleton_counter += 1 + if report_single == False: + continue + if r1.is_unmapped == False: ## first end is mapped, second is not ## quality - if mapq != None and (r1.mapping_quality < int(mapq)): - lowq_singles_counter += 1 - continue + if mapq != None and (r1.mapping_quality < int(mapq)): + lowq_singles_counter += 1 + continue ## Unique mapping - if is_unique_bowtie2(r1) == True: - uniq_singles_counter += 1 - else: - multi_singles_counter += 1 - if report_multi == False: - continue - else: ## second end is mapped, first is not + if is_unique_bowtie2(r1) == True: + uniq_singles_counter += 1 + else: + multi_singles_counter += 1 + if report_multi == False: + continue + else: ## second end is mapped, first is not ## quality - if mapq != None and (r2.mapping_quality < int(mapq)): - lowq_singles_counter += 1 - continue + if mapq != None and (r2.mapping_quality < int(mapq)): + lowq_singles_counter += 1 + continue ## Unique mapping - if is_unique_bowtie2(r2) == True: - uniq_singles_counter += 1 - else: - multi_singles_counter += 1 - if report_multi == False: - continue + if is_unique_bowtie2(r2) == True: + uniq_singles_counter += 1 + else: + multi_singles_counter += 1 + if report_multi == False: + continue - tot_pairs_counter += 1 - (r1, r2) = sam_flag(r1,r2, hr1, hr2) + tot_pairs_counter += 1 + (r1, r2) = sam_flag(r1,r2, hr1, hr2) #print 
hr1.getrname(r1.tid) #print hr2.getrname(r2.tid) #print r1 #print r2 ## Write output - outfile.write(r1) - outfile.write(r2) - - else: - print "Forward and reverse reads not paired. Check that BAM files have the same read names and are sorted." - sys.exit(1) - - if stat: - if output == '-': - statfile = "pairing.stat" - else: - statfile = re.sub('\.bam$', '.pairstat', output) - handle_stat = open(statfile, 'w') - - handle_stat.write("Total_pairs_processed\t" + str(reads_counter) + "\t" + str(round(float(reads_counter)/float(reads_counter)*100,3)) + "\n") - handle_stat.write("Unmapped_pairs\t" + str(unmapped_pairs_counter) + "\t" + str(round(float(unmapped_pairs_counter)/float(reads_counter)*100,3)) + "\n") - handle_stat.write("Low_qual_pairs\t" + str(lowq_pairs_counter) + "\t" + str(round(float(lowq_pairs_counter)/float(reads_counter)*100,3)) + "\n") - handle_stat.write("Unique_paired_alignments\t" + str(uniq_pairs_counter) + "\t" + str(round(float(uniq_pairs_counter)/float(reads_counter)*100,3)) + "\n") - handle_stat.write("Multiple_pairs_alignments\t" + str(multi_pairs_counter) + "\t" + str(round(float(multi_pairs_counter)/float(reads_counter)*100,3)) + "\n") - handle_stat.write("Pairs_with_singleton\t" + str(singleton_counter) + "\t" + str(round(float(singleton_counter)/float(reads_counter)*100,3)) + "\n") - handle_stat.write("Low_qual_singleton\t" + str(lowq_singles_counter) + "\t" + str(round(float(lowq_singles_counter)/float(reads_counter)*100,3)) + "\n") - handle_stat.write("Unique_singleton_alignments\t" + str(uniq_singles_counter) + "\t" + str(round(float(uniq_singles_counter)/float(reads_counter)*100,3)) + "\n") - handle_stat.write("Multiple_singleton_alignments\t" + str(multi_singles_counter) + "\t" + str(round(float(multi_singles_counter)/float(reads_counter)*100,3)) + "\n") - handle_stat.write("Reported_pairs\t" + str(tot_pairs_counter) + "\t" + str(round(float(tot_pairs_counter)/float(reads_counter)*100,3)) + "\n") - handle_stat.close() - - hr1.close() - hr2.close() - outfile.close() + outfile.write(r1) + outfile.write(r2) + + else: + print("Forward and reverse reads not paired. 
Check that BAM files have the same read names and are sorted.") + sys.exit(1) + + if stat: + if output == '-': + statfile = "pairing.stat" + else: + statfile = re.sub('\.bam$', '.pairstat', output) + with open(statfile, 'w') as handle_stat: + handle_stat.write("Total_pairs_processed\t" + str(reads_counter) + "\t" + str(round(float(reads_counter)/float(reads_counter)*100,3)) + "\n") + handle_stat.write("Unmapped_pairs\t" + str(unmapped_pairs_counter) + "\t" + str(round(float(unmapped_pairs_counter)/float(reads_counter)*100,3)) + "\n") + handle_stat.write("Low_qual_pairs\t" + str(lowq_pairs_counter) + "\t" + str(round(float(lowq_pairs_counter)/float(reads_counter)*100,3)) + "\n") + handle_stat.write("Unique_paired_alignments\t" + str(uniq_pairs_counter) + "\t" + str(round(float(uniq_pairs_counter)/float(reads_counter)*100,3)) + "\n") + handle_stat.write("Multiple_pairs_alignments\t" + str(multi_pairs_counter) + "\t" + str(round(float(multi_pairs_counter)/float(reads_counter)*100,3)) + "\n") + handle_stat.write("Pairs_with_singleton\t" + str(singleton_counter) + "\t" + str(round(float(singleton_counter)/float(reads_counter)*100,3)) + "\n") + handle_stat.write("Low_qual_singleton\t" + str(lowq_singles_counter) + "\t" + str(round(float(lowq_singles_counter)/float(reads_counter)*100,3)) + "\n") + handle_stat.write("Unique_singleton_alignments\t" + str(uniq_singles_counter) + "\t" + str(round(float(uniq_singles_counter)/float(reads_counter)*100,3)) + "\n") + handle_stat.write("Multiple_singleton_alignments\t" + str(multi_singles_counter) + "\t" + str(round(float(multi_singles_counter)/float(reads_counter)*100,3)) + "\n") + handle_stat.write("Reported_pairs\t" + str(tot_pairs_counter) + "\t" + str(round(float(tot_pairs_counter)/float(reads_counter)*100,3)) + "\n") + hr1.close() + hr2.close() + outfile.close() diff --git a/bin/merge_statfiles.py b/bin/merge_statfiles.py index ab3d078c657e632471a47b4bc990aa16998cc781..469cacd81b597e296eb3eb5b4acdc500028be927 100755 --- a/bin/merge_statfiles.py +++ b/bin/merge_statfiles.py @@ -1,8 +1,8 @@ #!/usr/bin/env python -## HiC-Pro -## Copyright (c) 2015 Institut Curie -## Author(s): Nicolas Servant, Eric Viara +## nf-core-hic +## Copyright (c) 2020 Institut Curie +## Author(s): Nicolas Servant ## Contact: nicolas.servant@curie.fr ## This software is distributed without any guarantee under the terms of the BSD-3 licence. ## See the LICENCE file for details @@ -36,13 +36,13 @@ if __name__ == "__main__": if li > 0: if args.verbose: - print "## merge_statfiles.py" - print "## Merging "+ str(li)+" files" + print("## merge_statfiles.py") + print("## Merging "+ str(li)+" files") ## Reading first file to get the template template = OrderedDict() if args.verbose: - print "## Use "+infiles[0]+" as template" + print("## Use "+infiles[0]+" as template") with open(infiles[0]) as f: for line in f: if not line.startswith("#"): @@ -51,17 +51,17 @@ if __name__ == "__main__": template[str(lsp[0])] = data if len(template) == 0: - print "Cannot find template files !" 
+ print("Cannot find template files !") sys.exit(1) ## Int are counts / Float are percentage - for fidx in xrange(1, li): + for fidx in list(range(1, li)): with open(infiles[fidx]) as f: for line in f: if not line.startswith("#"): lsp = line.strip().split("\t") if lsp[0] in template: - for i in xrange(1, len(lsp)): + for i in list(range(1, len(lsp))): if isinstance(num(lsp[i]), int): template[lsp[0]][i-1] += num(lsp[i]) else: @@ -77,6 +77,6 @@ if __name__ == "__main__": sys.stdout.write("\n") else: - print "No files to merge - stop" + print("No files to merge - stop") sys.exit(1) diff --git a/bin/scrape_software_versions.py b/bin/scrape_software_versions.py index 7a38feec0135d37268ff82fa0c92dab46c84ac6a..d5f4c5c0095a2006ce4e7a876dc1ac849c020c3a 100755 --- a/bin/scrape_software_versions.py +++ b/bin/scrape_software_versions.py @@ -22,11 +22,19 @@ results['MultiQC'] = '<span style="color:#999999;\">N/A</span>' # Search each file using its regex for k, v in regexes.items(): - with open(v[0]) as x: - versions = x.read() - match = re.search(v[1], versions) - if match: - results[k] = "v{}".format(match.group(1)) + try: + with open(v[0]) as x: + versions = x.read() + match = re.search(v[1], versions) + if match: + results[k] = "v{}".format(match.group(1)) + except IOError: + results[k] = False + +# Remove software set to false in results +for k in list(results): + if not results[k]: + del(results[k]) # Remove software set to false in results for k in results: diff --git a/conf/awsbatch.config b/conf/awsbatch.config deleted file mode 100644 index 14af5866f5c6c18db7e8d6b93b40da8ea8311721..0000000000000000000000000000000000000000 --- a/conf/awsbatch.config +++ /dev/null @@ -1,18 +0,0 @@ -/* - * ------------------------------------------------- - * Nextflow config file for running on AWS batch - * ------------------------------------------------- - * Base config needed for running with -profile awsbatch - */ -params { - config_profile_name = 'AWSBATCH' - config_profile_description = 'AWSBATCH Cloud Profile' - config_profile_contact = 'Alexander Peltzer (@apeltzer)' - config_profile_url = 'https://aws.amazon.com/de/batch/' -} - -aws.region = params.awsregion -process.executor = 'awsbatch' -process.queue = params.awsqueue -executor.awscli = '/home/ec2-user/miniconda/bin/aws' -params.tracedir = './' diff --git a/conf/base.config b/conf/base.config index 142439f8b3dc15acbb94cc84062abed73b1a46cd..157dd9548a110b9f2f710d3072850608fa9c2de5 100644 --- a/conf/base.config +++ b/conf/base.config @@ -10,73 +10,37 @@ */ process { - - // Check the defaults for all processes + // nf-core: Check the defaults for all processes cpus = { check_max( 1 * task.attempt, 'cpus' ) } - memory = { check_max( 8.GB * task.attempt, 'memory' ) } - time = { check_max( 2.h * task.attempt, 'time' ) } + memory = { check_max( 7.GB * task.attempt, 'memory' ) } + time = { check_max( 4.h * task.attempt, 'time' ) } errorStrategy = { task.exitStatus in [143,137,104,134,139] ? 
'retry' : 'finish' } maxRetries = 1 maxErrors = '-1' - // Process-specific resource requirements - withName:makeBowtie2Index { - cpus = { check_max( 1, 'cpus' ) } - memory = { check_max( 10.GB * task.attempt, 'memory' ) } - time = { check_max( 12.h * task.attempt, 'time' ) } - } - withName:bowtie2_end_to_end { - cpus = { check_max( 4, 'cpus' ) } + withLabel:process_low { + cpus = { check_max( 1 * task.attempt, 'cpus' ) } memory = { check_max( 4.GB * task.attempt, 'memory' ) } - time = { check_max( 5.h * task.attempt, 'time' ) } + time = { check_max( 6.h * task.attempt, 'time' ) } } - withName:bowtie2_on_trimmed_reads { - cpus = { check_max( 4, 'cpus' ) } - memory = { check_max( 4.GB * task.attempt, 'memory' ) } - time = { check_max( 5.h * task.attempt, 'time' ) } - } - withName:merge_mapping_steps { - cpus = { check_max( 4, 'cpus' ) } + withLabel:process_medium { + cpus = { check_max( 4 * task.attempt, 'cpus' ) } memory = { check_max( 8.GB * task.attempt, 'memory' ) } - time = { check_max( 5.h * task.attempt, 'time' ) } - } - withName:trim_reads { - cpus = { check_max (1, 'cpus')} - memory = { check_max( 1.GB * task.attempt, 'memory' ) } - time = { check_max( 5.h * task.attempt, 'time' ) } + time = { check_max( 8.h * task.attempt, 'time' ) } } - withName:combine_mapped_files { - cpus = { check_max( 1, 'cpus' ) } - memory = { check_max( 4.GB * task.attempt, 'memory' ) } - time = { check_max( 5.h * task.attempt, 'time' ) } - } - withName:get_valid_interaction { - cpus = { check_max( 1, 'cpus' ) } - memory = { check_max( 4.GB * task.attempt, 'memory' ) } - time = { check_max( 5.h * task.attempt, 'time' ) } + withLabel:process_high { + cpus = { check_max( 8 * task.attempt, 'cpus' ) } + memory = { check_max( 64.GB * task.attempt, 'memory' ) } + time = { check_max( 10.h * task.attempt, 'time' ) } } - withName:build_contact_maps { - cpus = { check_max( 1, 'cpus' ) } - memory = { check_max( 6.GB * task.attempt, 'memory' ) } - time = { check_max( 5.h * task.attempt, 'time' ) } + withLabel:process_long { + time = { check_max( 20.h * task.attempt, 'time' ) } } - withName:run_ice { - cpus = { check_max( 1, 'cpus' ) } - memory = { check_max( 10.GB * task.attempt, 'memory' ) } - time = { check_max( 5.h * task.attempt, 'time' ) } + withLabel:process_highmem { + memory = { check_max( 12.GB * task.attempt, 'memory' ) } } - withName:generate_cool { - cpus = { check_max( 2, 'cpus' ) } - memory = { check_max( 16.GB * task.attempt, 'memory' ) } - time = { check_max( 4.h * task.attempt, 'time' ) } + withName:get_software_versions { + cache = false } } - -params { - // Defaults only, expecting to be overwritten - max_memory = 8.GB - max_cpus = 4 - max_time = 24.h - igenomes_base = 's3://ngi-igenomes/igenomes/' -} diff --git a/conf/igenomes.config b/conf/igenomes.config index 92ad32389c6646cae0feea95e5e0a3bceeba909e..1ba2588593f4e1940dc0bf3a3380f0114a71684e 100644 --- a/conf/igenomes.config +++ b/conf/igenomes.config @@ -11,92 +11,152 @@ params { // illumina iGenomes reference file paths genomes { 'GRCh37' { - fasta = "${params.igenomes_base}/Homo_sapiens/Ensembl/GRCh37/Sequence/WholeGenomeFasta/genome.fa" - bowtie2 = "${params.igenomes_base}/Homo_sapiens/Ensembl/GRCh37/Sequence/Bowtie2Index/genome" + fasta = "${params.igenomes_base}/Homo_sapiens/Ensembl/GRCh37/Sequence/WholeGenomeFasta/genome.fa" + bowtie2 = "${params.igenomes_base}/Homo_sapiens/Ensembl/GRCh37/Sequence/Bowtie2Index/" + } + 'GRCh38' { + fasta = "${params.igenomes_base}/Homo_sapiens/NCBI/GRCh38/Sequence/WholeGenomeFasta/genome.fa" + bowtie2 = 
"${params.igenomes_base}/Homo_sapiens/NCBI/GRCh38/Sequence/Bowtie2Index/" } 'GRCm38' { - fasta = "${params.igenomes_base}/Mus_musculus/Ensembl/GRCm38/Sequence/WholeGenomeFasta/genome.fa" - bowtie2 = "${params.igenomes_base}/Mus_musculus/Ensembl/GRCm38/Sequence/Bowtie2Index/genome" + fasta = "${params.igenomes_base}/Mus_musculus/Ensembl/GRCm38/Sequence/WholeGenomeFasta/genome.fa" + bowtie2 = "${params.igenomes_base}/Mus_musculus/Ensembl/GRCm38/Sequence/Bowtie2Index/" } 'TAIR10' { - fasta = "${params.igenomes_base}/Arabidopsis_thaliana/Ensembl/TAIR10/Sequence/WholeGenomeFasta/genome.fa" - bowtie2 = "${params.igenomes_base}/Arabidopsis_thaliana/Ensembl/TAIR10/Sequence/Bowtie2Index/genome" + fasta = "${params.igenomes_base}/Arabidopsis_thaliana/Ensembl/TAIR10/Sequence/WholeGenomeFasta/genome.fa" + bowtie2 = "${params.igenomes_base}/Arabidopsis_thaliana/Ensembl/TAIR10/Sequence/Bowtie2Index/" } 'EB2' { - fasta = "${params.igenomes_base}/Bacillus_subtilis_168/Ensembl/EB2/Sequence/WholeGenomeFasta/genome.fa" - bowtie2 = "${params.igenomes_base}/Bacillus_subtilis_168/Ensembl/EB2/Sequence/Bowtie2Index/genome" + fasta = "${params.igenomes_base}/Bacillus_subtilis_168/Ensembl/EB2/Sequence/WholeGenomeFasta/genome.fa" + bowtie2 = "${params.igenomes_base}/Bacillus_subtilis_168/Ensembl/EB2/Sequence/Bowtie2Index/" } 'UMD3.1' { - fasta = "${params.igenomes_base}/Bos_taurus/Ensembl/UMD3.1/Sequence/WholeGenomeFasta/genome.fa" - bowtie2 = "${params.igenomes_base}/Bos_taurus/Ensembl/UMD3.1/Sequence/Bowtie2Index/genome" + fasta = "${params.igenomes_base}/Bos_taurus/Ensembl/UMD3.1/Sequence/WholeGenomeFasta/genome.fa" + bowtie2 = "${params.igenomes_base}/Bos_taurus/Ensembl/UMD3.1/Sequence/Bowtie2Index/" } 'WBcel235' { - fasta = "${params.igenomes_base}/Caenorhabditis_elegans/Ensembl/WBcel235/Sequence/WholeGenomeFasta/genome.fa" - bowtie2 = "${params.igenomes_base}/Caenorhabditis_elegans/Ensembl/WBcel235/Sequence/Bowtie2Index/genome" + fasta = "${params.igenomes_base}/Caenorhabditis_elegans/Ensembl/WBcel235/Sequence/WholeGenomeFasta/genome.fa" + bowtie2 = "${params.igenomes_base}/Caenorhabditis_elegans/Ensembl/WBcel235/Sequence/Bowtie2Index/" } 'CanFam3.1' { - fasta = "${params.igenomes_base}/Canis_familiaris/Ensembl/CanFam3.1/Sequence/WholeGenomeFasta/genome.fa" - bowtie2 = "${params.igenomes_base}/Canis_familiaris/Ensembl/CanFam3.1/Sequence/Bowtie2Index/genome" + fasta = "${params.igenomes_base}/Canis_familiaris/Ensembl/CanFam3.1/Sequence/WholeGenomeFasta/genome.fa" + bowtie2 = "${params.igenomes_base}/Canis_familiaris/Ensembl/CanFam3.1/Sequence/Bowtie2Index/" } 'GRCz10' { - fasta = "${params.igenomes_base}/Danio_rerio/Ensembl/GRCz10/Sequence/WholeGenomeFasta/genome.fa" - bowtie2 = "${params.igenomes_base}/Danio_rerio/Ensembl/GRCz10/Sequence/Bowtie2Index/genome" + fasta = "${params.igenomes_base}/Danio_rerio/Ensembl/GRCz10/Sequence/WholeGenomeFasta/genome.fa" + bowtie2 = "${params.igenomes_base}/Danio_rerio/Ensembl/GRCz10/Sequence/Bowtie2Index/" } 'BDGP6' { - fasta = "${params.igenomes_base}/Drosophila_melanogaster/Ensembl/BDGP6/Sequence/WholeGenomeFasta/genome.fa" - bowtie2 = "${params.igenomes_base}/Drosophila_melanogaster/Ensembl/BDGP6/Sequence/Bowtie2Index/genome" + fasta = "${params.igenomes_base}/Drosophila_melanogaster/Ensembl/BDGP6/Sequence/WholeGenomeFasta/genome.fa" + bowtie2 = "${params.igenomes_base}/Drosophila_melanogaster/Ensembl/BDGP6/Sequence/Bowtie2Index/" } 'EquCab2' { - fasta = "${params.igenomes_base}/Equus_caballus/Ensembl/EquCab2/Sequence/WholeGenomeFasta/genome.fa" - bowtie2 = 
"${params.igenomes_base}/Equus_caballus/Ensembl/EquCab2/Sequence/Bowtie2Index/genome" + fasta = "${params.igenomes_base}/Equus_caballus/Ensembl/EquCab2/Sequence/WholeGenomeFasta/genome.fa" + bowtie2 = "${params.igenomes_base}/Equus_caballus/Ensembl/EquCab2/Sequence/Bowtie2Index/" } 'EB1' { - fasta = "${params.igenomes_base}/Escherichia_coli_K_12_DH10B/Ensembl/EB1/Sequence/WholeGenomeFasta/genome.fa" - bowtie2 = "${params.igenomes_base}/Escherichia_coli_K_12_DH10B/Ensembl/EB1/Sequence/Bowtie2Index/genome" + fasta = "${params.igenomes_base}/Escherichia_coli_K_12_DH10B/Ensembl/EB1/Sequence/WholeGenomeFasta/genome.fa" + bowtie2 = "${params.igenomes_base}/Escherichia_coli_K_12_DH10B/Ensembl/EB1/Sequence/Bowtie2Index/" } 'Galgal4' { - fasta = "${params.igenomes_base}/Gallus_gallus/Ensembl/Galgal4/Sequence/WholeGenomeFasta/genome.fa" - bowtie2 = "${params.igenomes_base}/Gallus_gallus/Ensembl/Galgal4/Sequence/Bowtie2Index/genome" + fasta = "${params.igenomes_base}/Gallus_gallus/Ensembl/Galgal4/Sequence/WholeGenomeFasta/genome.fa" + bowtie2 = "${params.igenomes_base}/Gallus_gallus/Ensembl/Galgal4/Sequence/Bowtie2Index/" } 'Gm01' { - fasta = "${params.igenomes_base}/Glycine_max/Ensembl/Gm01/Sequence/WholeGenomeFasta/genome.fa" - bowtie2 = "${params.igenomes_base}/Glycine_max/Ensembl/Gm01/Sequence/Bowtie2Index/genome" + fasta = "${params.igenomes_base}/Glycine_max/Ensembl/Gm01/Sequence/WholeGenomeFasta/genome.fa" + bowtie2 = "${params.igenomes_base}/Glycine_max/Ensembl/Gm01/Sequence/Bowtie2Index/" } 'Mmul_1' { - fasta = "${params.igenomes_base}/Macaca_mulatta/Ensembl/Mmul_1/Sequence/WholeGenomeFasta/genome.fa" - bowtie2 = "${params.igenomes_base}/Macaca_mulatta/Ensembl/Mmul_1/Sequence/Bowtie2Index/genome" + fasta = "${params.igenomes_base}/Macaca_mulatta/Ensembl/Mmul_1/Sequence/WholeGenomeFasta/genome.fa" + bowtie2 = "${params.igenomes_base}/Macaca_mulatta/Ensembl/Mmul_1/Sequence/Bowtie2Index/" } 'IRGSP-1.0' { - fasta = "${params.igenomes_base}/Oryza_sativa_japonica/Ensembl/IRGSP-1.0/Sequence/WholeGenomeFasta/genome.fa" - bowtie2 = "${params.igenomes_base}/Oryza_sativa_japonica/Ensembl/IRGSP-1.0/Sequence/Bowtie2Index/genome" + fasta = "${params.igenomes_base}/Oryza_sativa_japonica/Ensembl/IRGSP-1.0/Sequence/WholeGenomeFasta/genome.fa" + bowtie2 = "${params.igenomes_base}/Oryza_sativa_japonica/Ensembl/IRGSP-1.0/Sequence/Bowtie2Index/" } 'CHIMP2.1.4' { - fasta = "${params.igenomes_base}/Pan_troglodytes/Ensembl/CHIMP2.1.4/Sequence/WholeGenomeFasta/genome.fa" - bowtie2 = "${params.igenomes_base}/Pan_troglodytes/Ensembl/CHIMP2.1.4/Sequence/Bowtie2Index/genome" + fasta = "${params.igenomes_base}/Pan_troglodytes/Ensembl/CHIMP2.1.4/Sequence/WholeGenomeFasta/genome.fa" + bowtie2 = "${params.igenomes_base}/Pan_troglodytes/Ensembl/CHIMP2.1.4/Sequence/Bowtie2Index/" } 'Rnor_6.0' { - fasta = "${params.igenomes_base}/Rattus_norvegicus/Ensembl/Rnor_6.0/Sequence/WholeGenomeFasta/genome.fa" - bowtie2 = "${params.igenomes_base}/Rattus_norvegicus/Ensembl/Rnor_6.0/Sequence/Bowtie2Index/genome" + fasta = "${params.igenomes_base}/Rattus_norvegicus/Ensembl/Rnor_6.0/Sequence/WholeGenomeFasta/genome.fa" + bowtie2 = "${params.igenomes_base}/Rattus_norvegicus/Ensembl/Rnor_6.0/Sequence/Bowtie2Index/" } 'R64-1-1' { - fasta = "${params.igenomes_base}/Saccharomyces_cerevisiae/Ensembl/R64-1-1/Sequence/WholeGenomeFasta/genome.fa" - bowtie2 = "${params.igenomes_base}/Saccharomyces_cerevisiae/Ensembl/R64-1-1/Sequence/Bowtie2Index/genome" + fasta = 
"${params.igenomes_base}/Saccharomyces_cerevisiae/Ensembl/R64-1-1/Sequence/WholeGenomeFasta/genome.fa" + bowtie2 = "${params.igenomes_base}/Saccharomyces_cerevisiae/Ensembl/R64-1-1/Sequence/Bowtie2Index/" } 'EF2' { - fasta = "${params.igenomes_base}/Schizosaccharomyces_pombe/Ensembl/EF2/Sequence/WholeGenomeFasta/genome.fa" - bowtie2 = "${params.igenomes_base}/Schizosaccharomyces_pombe/Ensembl/EF2/Sequence/Bowtie2Index/genome" + fasta = "${params.igenomes_base}/Schizosaccharomyces_pombe/Ensembl/EF2/Sequence/WholeGenomeFasta/genome.fa" + bowtie2 = "${params.igenomes_base}/Schizosaccharomyces_pombe/Ensembl/EF2/Sequence/Bowtie2Index/" } 'Sbi1' { - fasta = "${params.igenomes_base}/Sorghum_bicolor/Ensembl/Sbi1/Sequence/WholeGenomeFasta/genome.fa" - bowtie2 = "${params.igenomes_base}/Sorghum_bicolor/Ensembl/Sbi1/Sequence/Bowtie2Index/genome" + fasta = "${params.igenomes_base}/Sorghum_bicolor/Ensembl/Sbi1/Sequence/WholeGenomeFasta/genome.fa" + bowtie2 = "${params.igenomes_base}/Sorghum_bicolor/Ensembl/Sbi1/Sequence/Bowtie2Index/" } 'Sscrofa10.2' { - fasta = "${params.igenomes_base}/Sus_scrofa/Ensembl/Sscrofa10.2/Sequence/WholeGenomeFasta/genome.fa" - bowtie2 = "${params.igenomes_base}/Sus_scrofa/Ensembl/Sscrofa10.2/Sequence/Bowtie2Index/genome" + fasta = "${params.igenomes_base}/Sus_scrofa/Ensembl/Sscrofa10.2/Sequence/WholeGenomeFasta/genome.fa" + bowtie2 = "${params.igenomes_base}/Sus_scrofa/Ensembl/Sscrofa10.2/Sequence/Bowtie2Index/" } 'AGPv3' { - fasta = "${params.igenomes_base}/Zea_mays/Ensembl/AGPv3/Sequence/WholeGenomeFasta/genome.fa" - bowtie2 = "${params.igenomes_base}/Zea_mays/Ensembl/AGPv3/Sequence/Bowtie2Index/genome" + fasta = "${params.igenomes_base}/Zea_mays/Ensembl/AGPv3/Sequence/WholeGenomeFasta/genome.fa" + bowtie2 = "${params.igenomes_base}/Zea_mays/Ensembl/AGPv3/Sequence/Bowtie2Index/" + } + 'hg38' { + fasta = "${params.igenomes_base}/Homo_sapiens/UCSC/hg38/Sequence/WholeGenomeFasta/genome.fa" + bowtie2 = "${params.igenomes_base}/Homo_sapiens/UCSC/hg38/Sequence/Bowtie2Index/" + } + 'hg19' { + fasta = "${params.igenomes_base}/Homo_sapiens/UCSC/hg19/Sequence/WholeGenomeFasta/genome.fa" + bowtie2 = "${params.igenomes_base}/Homo_sapiens/UCSC/hg19/Sequence/Bowtie2Index/" + } + 'mm10' { + fasta = "${params.igenomes_base}/Mus_musculus/UCSC/mm10/Sequence/WholeGenomeFasta/genome.fa" + bowtie2 = "${params.igenomes_base}/Mus_musculus/UCSC/mm10/Sequence/Bowtie2Index/" + } + 'bosTau8' { + fasta = "${params.igenomes_base}/Bos_taurus/UCSC/bosTau8/Sequence/WholeGenomeFasta/genome.fa" + bowtie2 = "${params.igenomes_base}/Bos_taurus/UCSC/bosTau8/Sequence/Bowtie2Index/" + } + 'ce10' { + fasta = "${params.igenomes_base}/Caenorhabditis_elegans/UCSC/ce10/Sequence/WholeGenomeFasta/genome.fa" + bowtie2 = "${params.igenomes_base}/Caenorhabditis_elegans/UCSC/ce10/Sequence/Bowtie2Index/" + } + 'canFam3' { + fasta = "${params.igenomes_base}/Canis_familiaris/UCSC/canFam3/Sequence/WholeGenomeFasta/genome.fa" + bowtie2 = "${params.igenomes_base}/Canis_familiaris/UCSC/canFam3/Sequence/Bowtie2Index/" + } + 'danRer10' { + fasta = "${params.igenomes_base}/Danio_rerio/UCSC/danRer10/Sequence/WholeGenomeFasta/genome.fa" + bowtie2 = "${params.igenomes_base}/Danio_rerio/UCSC/danRer10/Sequence/Bowtie2Index/" + } + 'dm6' { + fasta = "${params.igenomes_base}/Drosophila_melanogaster/UCSC/dm6/Sequence/WholeGenomeFasta/genome.fa" + bowtie2 = "${params.igenomes_base}/Drosophila_melanogaster/UCSC/dm6/Sequence/Bowtie2Index/" + } + 'equCab2' { + fasta = 
"${params.igenomes_base}/Equus_caballus/UCSC/equCab2/Sequence/WholeGenomeFasta/genome.fa" + bowtie2 = "${params.igenomes_base}/Equus_caballus/UCSC/equCab2/Sequence/Bowtie2Index/" + } + 'galGal4' { + fasta = "${params.igenomes_base}/Gallus_gallus/UCSC/galGal4/Sequence/WholeGenomeFasta/genome.fa" + bowtie2 = "${params.igenomes_base}/Gallus_gallus/UCSC/galGal4/Sequence/Bowtie2Index/" + } + 'panTro4' { + fasta = "${params.igenomes_base}/Pan_troglodytes/UCSC/panTro4/Sequence/WholeGenomeFasta/genome.fa" + bowtie2 = "${params.igenomes_base}/Pan_troglodytes/UCSC/panTro4/Sequence/Bowtie2Index/" + } + 'rn6' { + fasta = "${params.igenomes_base}/Rattus_norvegicus/UCSC/rn6/Sequence/WholeGenomeFasta/genome.fa" + bowtie2 = "${params.igenomes_base}/Rattus_norvegicus/UCSC/rn6/Sequence/Bowtie2Index/" + } + 'sacCer3' { + fasta = "${params.igenomes_base}/Saccharomyces_cerevisiae/UCSC/sacCer3/Sequence/WholeGenomeFasta/genome.fa" + bowtie2 = "${params.igenomes_base}/Saccharomyces_cerevisiae/UCSC/sacCer3/Sequence/Bowtie2Index/" + } + 'susScr3' { + fasta = "${params.igenomes_base}/Sus_scrofa/UCSC/susScr3/Sequence/WholeGenomeFasta/genome.fa" + bowtie2 = "${params.igenomes_base}/Sus_scrofa/UCSC/susScr3/Sequence/Bowtie2Index/" } } } diff --git a/conf/test.config b/conf/test.config index 00c47f85cd86d5d5a05ce3123024a37a9cc9e466..39a2bba88d6da893f0f3ba97f397e77488556873 100644 --- a/conf/test.config +++ b/conf/test.config @@ -4,12 +4,12 @@ * ------------------------------------------------- * Defines bundled input files and everything required * to run a fast and simple test. Use as follows: - * nextflow run nf-core/hic -profile test + * nextflow run nf-core/hic -profile test,<docker/singularity> */ params { - config_profile_name = 'Hi-C test data from Schalbetter et al. (2017)' +config_profile_name = 'Hi-C test data from Schalbetter et al. 
(2017)' config_profile_description = 'Minimal test dataset to check pipeline function' // Limit resources so that this can run on Travis @@ -26,8 +26,17 @@ params { fasta = 'https://github.com/nf-core/test-datasets/raw/hic/reference/W303_SGD_2015_JRIU00000000.fsa' restriction_site = 'A^AGCTT' ligation_site = 'AAGCTAGCTT' - min_mapq = 0 + + min_mapq = 2 + rm_dup = true + rm_singleton = true + rm_multi = true + min_restriction_fragment_size = 100 + max_restriction_fragment_size = 100000 + min_insert_size = 100 + max_insert_size = 600 + // Options - skipCool = true + skip_cool = true } diff --git a/docs/images/nf-core-hic_logo.png b/docs/images/nf-core-hic_logo.png new file mode 100644 index 0000000000000000000000000000000000000000..e5fead372861ff430d7f1428e15dad9b045523e8 Binary files /dev/null and b/docs/images/nf-core-hic_logo.png differ diff --git a/docs/usage.md b/docs/usage.md index f1cd3a56a220a077cdd835c4d1e8e87e13cf726e..66d19457950bbd8a5038bad785862b5f558cb640 100644 --- a/docs/usage.md +++ b/docs/usage.md @@ -2,10 +2,11 @@ ## Table of contents -* [Introduction](#general-nextflow-info) +* [Table of contents](#table-of-contents) +* [Introduction](#introduction) * [Running the pipeline](#running-the-pipeline) -* [Updating the pipeline](#updating-the-pipeline) -* [Reproducibility](#reproducibility) + * [Updating the pipeline](#updating-the-pipeline) + * [Reproducibility](#reproducibility) * [Main arguments](#main-arguments) * [`-profile`](#-profile-single-dash) * [`awsbatch`](#awsbatch) @@ -67,17 +68,21 @@ * [Other command line parameters](#other-command-line-parameters) * [`--outdir`](#--outdir) * [`--email`](#--email) + * [`--email_on_fail`](#--email_on_fail) + * [`--max_multiqc_email_size`](#--max_multiqc_email_size) * [`-name`](#-name-single-dash) * [`-resume`](#-resume-single-dash) * [`-c`](#-c-single-dash) * [`--custom_config_version`](#--custom_config_version) + * [`--custom_config_base`](#--custom_config_base) * [`--max_memory`](#--max_memory) * [`--max_time`](#--max_time) * [`--max_cpus`](#--max_cpus) * [`--plaintext_email`](#--plaintext_email) + * [`--monochrome_logs`](#--monochrome_logs) * [`--multiqc_config`](#--multiqc_config) -## General Nextflow info +## Introduction Nextflow handles job submissions on SLURM or other environments, and supervises running the jobs. Thus the Nextflow process must run until the pipeline is @@ -134,6 +139,12 @@ software are used when you run your pipeline. If you keep using the same tag, you'll be running the same version of the pipeline, even if there have been changes to the code since. +It's a good idea to specify a pipeline version when running the pipeline on +your data. This ensures that a specific version of the pipeline code and +software are used when you run your pipeline. If you keep using the same tag, +you'll be running the same version of the pipeline, even if there have been +changes to the code since. + First, go to the [nf-core/hic releases page](https://github.com/nf-core/hic/releases) and find the latest version number - numeric only (eg. `1.3.1`). @@ -148,24 +159,38 @@ that you'll know what you used when you look back in the future. ### `-profile` Use this parameter to choose a configuration profile. Profiles can give -configuration presets for different compute environments. Note that multiple -profiles can be loaded, for example: `-profile docker` - the order of arguments -is important! +configuration presets for different compute environments. 
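For illustration, two typical ways of combining these profiles (sketches only; use whichever profile names your system actually provides):

```bash
# Quick functional check with the bundled test data, using Docker for the software stack
nextflow run nf-core/hic -profile test,docker

# The same check on a system where Singularity is available instead
nextflow run nf-core/hic -profile test,singularity
```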
-If `-profile` is not specified at all the pipeline will be run locally and -expects all software to be installed and available on the `PATH`. +Several generic profiles are bundled with the pipeline which instruct +the pipeline to use software packaged using different methods +(Docker, Singularity, Conda) - see below. + +> We highly recommend the use of Docker or Singularity containers for full +pipeline reproducibility, however when this is not possible, Conda is also supported. + +The pipeline also dynamically loads configurations from +[https://github.com/nf-core/configs](https://github.com/nf-core/configs) when it runs, +making multiple config profiles for various institutional clusters available at run time. +For more information and to see if your system is available in these configs please see +the [nf-core/configs documentation](https://github.com/nf-core/configs#documentation). + +Note that multiple profiles can be loaded, for example: `-profile test,docker` - the order +of arguments is important! +They are loaded in sequence, so later profiles can overwrite earlier profiles. + +If `-profile` is not specified, the pipeline will run locally and expect all software to be +installed and available on the `PATH`. This is _not_ recommended. -* `awsbatch` - * A generic configuration profile to be used with AWS Batch. -* `conda` - * A generic configuration profile to be used with [conda](https://conda.io/docs/) - * Pulls most software from [Bioconda](https://bioconda.github.io/) * `docker` * A generic configuration profile to be used with [Docker](http://docker.com/) * Pulls software from dockerhub: [`nfcore/hic`](http://hub.docker.com/r/nfcore/hic/) * `singularity` * A generic configuration profile to be used with [Singularity](http://singularity.lbl.gov/) * Pulls software from DockerHub: [`nfcore/hic`](http://hub.docker.com/r/nfcore/hic/) +* `conda` + * Please only use Conda as a last resort i.e. when it's not possible to run the pipeline with Docker or Singularity. + * A generic configuration profile to be used with [Conda](https://conda.io/docs/) + * Pulls most software from [Bioconda](https://bioconda.github.io/) * `test` * A profile with a complete configuration for automated testing * Includes links to test data so needs no other parameters @@ -187,15 +212,24 @@ notation to specify read pairs. If left unspecified, a default pattern is used: `data/*{1,2}.fastq.gz` -## Reference genomes and annotation files +### `--single_end` + +By default, the pipeline expects paired-end data. If you have single-end data, you need to specify `--single_end` on the command line when you launch the pipeline. A normal glob pattern, enclosed in quotation marks, can then be used for `--reads`. For example: + +```bash +--single_end --reads '*.fastq' +``` + +It is not possible to run a mixture of single-end and paired-end files in one run. + +## Reference genomes -The pipeline config files come bundled with paths to the illumina iGenomes -reference index files. If running with docker or AWS, the configuration is -set up to use the [AWS-iGenomes](https://ewels.github.io/AWS-iGenomes/) -resource. +The pipeline config files come bundled with paths to the illumina iGenomes reference index files. If running with docker or AWS, the configuration is set up to use the [AWS-iGenomes](https://ewels.github.io/AWS-iGenomes/) resource. ### `--genome` (using iGenomes) +There are 31 different species supported in the iGenomes references. To run the pipeline, you must specify which to use with the `--genome` flag. 
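For example, a run against one of the bundled iGenomes references might look like this (GRCh37 is one of the keys defined in `conf/igenomes.config`; the read glob is the pipeline's default naming pattern):

```bash
# Paired-end Hi-C reads mapped against the iGenomes GRCh37 build
nextflow run nf-core/hic --reads '*_R{1,2}.fastq.gz' --genome GRCh37 -profile docker
```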
+ There are 31 different species supported in the iGenomes references. To run the pipeline, you must specify which to use with the `--genome` flag. @@ -607,31 +641,18 @@ fails after three times then the pipeline is stopped. ### Custom resource requests -Wherever process-specific requirements are set in the pipeline, the default -value can be changed by creating a custom config file. -See the files hosted at -[`nf-core/configs`](https://github.com/nf-core/configs/tree/master/conf) -for examples. - -If you are likely to be running `nf-core` pipelines regularly it may be a good -idea to request that your custom config file is uploaded to the -`nf-core/configs` git repository. Before you do this please can you test that -the config file works with your pipeline of choice using the `-c` parameter -(see definition below). You can then create a pull request to the -`nf-core/configs` repository with the addition of your config file, associated -documentation file (see examples in -[`nf-core/configs/docs`](https://github.com/nf-core/configs/tree/master/docs)), -and amending [`nfcore_custom.config`](https://github.com/nf-core/configs/blob/master/nfcore_custom.config) -to include your custom profile. - -If you have any questions or issues please send us a message on -[`Slack`](https://nf-core-invite.herokuapp.com/). +Wherever process-specific requirements are set in the pipeline, the default value +can be changed by creating a custom config file. See the files hosted +at [`nf-core/configs`](https://github.com/nf-core/configs/tree/master/conf) for examples. + +If you have any questions or issues please send us a message on [Slack](https://nf-co.re/join/slack). ## AWS Batch specific parameters -Running the pipeline on AWS Batch requires a couple of specific parameters to -be set according to your AWS Batch configuration. Please use the `-awsbatch` -profile and then specify all of the following parameters. +Running the pipeline on AWS Batch requires a couple of specific parameters to be +set according to your AWS Batch configuration. Please use +[`-profile awsbatch`](https://github.com/nf-core/configs/blob/master/conf/awsbatch.config) +and then specify all of the following parameters. ### `--awsqueue` @@ -639,6 +660,13 @@ The JobQueue that you intend to use on AWS Batch. ### `--awsregion` +The AWS region in which to run your job. Default is set to `eu-west-1` but can be adjusted to your needs. + +### `--awscli` + +The [AWS CLI](https://www.nextflow.io/docs/latest/awscloud.html#aws-cli-installation) +path in your custom AMI. Default: `/home/ec2-user/miniconda/bin/aws`. + The AWS region to run your job in. Default is set to `eu-west-1` but can be adjusted to your needs. @@ -656,11 +684,20 @@ The output directory where the results will be saved. Set this parameter to your e-mail address to get a summary e-mail with details of the run sent to you when the workflow exits. If set in your user config file -(`~/.nextflow/config`) then you don't need to speicfy this on the command line -for every run. +(`~/.nextflow/config`) then you don't need to specify this on the command line for every run. + +### `--email_on_fail` + +This works exactly as with `--email`, except emails are only sent if the workflow is not successful. + +### `--max_multiqc_email_size` + +Threshold size for MultiQC report to be attached in notification email. If file generated by pipeline exceeds the threshold, it will not be attached (Default: 25MB). ### `-name` +Name for the pipeline run. 
If not specified, Nextflow will automatically generate a random mnemonic. + Name for the pipeline run. If not specified, Nextflow will automatically generate a random mnemonic. @@ -690,15 +727,35 @@ Note - you can use this to override pipeline defaults. ### `--custom_config_version` -Provide git commit id for custom Institutional configs hosted at -`nf-core/configs`. This was implemented for reproducibility purposes. -Default is set to `master`. +Provide git commit id for custom Institutional configs hosted at `nf-core/configs`. +This was implemented for reproducibility purposes. Default: `master`. ```bash ## Download and use config file with following git commid id --custom_config_version d52db660777c4bf36546ddb188ec530c3ada1b96 ``` +### `--custom_config_base` + +If you're running offline, nextflow will not be able to fetch the institutional config files +from the internet. If you don't need them, then this is not a problem. If you do need them, +you should download the files from the repo and tell nextflow where to find them with the +`custom_config_base` option. For example: + +```bash +## Download and unzip the config files +cd /path/to/my/configs +wget https://github.com/nf-core/configs/archive/master.zip +unzip master.zip + +## Run the pipeline +cd /path/to/my/data +nextflow run /path/to/pipeline/ --custom_config_base /path/to/my/configs/configs-master/ +``` + +> Note that the nf-core/tools helper package has a `download` command to download all required pipeline +> files + singularity containers + institutional configs in one go for you, to make this process easier. + ### `--max_memory` Use to set a top-limit for the default memory requirement for each process. @@ -718,6 +775,10 @@ Should be a string in the format integer-unit. eg. `--max_cpus 1` Set to receive plain-text e-mails instead of HTML formatted. +### `--monochrome_logs` + +Set to disable colourful command line output and live life in monochrome. + ### `--multiqc_config` Specify a path to a custom MultiQC configuration file. 
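Putting a few of these options together, a hypothetical run that caps the default resource ceilings and supplies a custom MultiQC configuration could look like the following (the config path and resource values are placeholders; the integer-unit strings follow the format described above):

```bash
nextflow run nf-core/hic --reads '*_R{1,2}.fastq.gz' -profile singularity \
    --max_memory '32.GB' --max_cpus 8 --max_time '48.h' \
    --multiqc_config /path/to/custom_multiqc_config.yaml
```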
diff --git a/environment.yml b/environment.yml index 271f3f5273246d63738d15c6f49cc7c07d1fdb94..f9c9548db518221c85c86f621049a0931fb73e65 100644 --- a/environment.yml +++ b/environment.yml @@ -1,21 +1,29 @@ # You can use this file to create a conda environment for this pipeline: # conda env create -f environment.yml -name: nf-core-hic-1.1.1dev +name: nf-core-hic-1.2.0dev channels: - conda-forge - bioconda - defaults dependencies: - - python=2.7.15 - - pip=19.1 - - scipy=1.2.1 - - numpy=1.16.3 - - r-markdown=0.9 - - bx-python=0.8.2 - - pysam=0.15.2 - - cooler=0.8.5 - - bowtie2=2.3.5 - - samtools=1.9 - - multiqc=1.7 + - conda-forge::python=3.7.6 + - conda-forge::pip=20.0.1 + - conda-forge::scipy=1.4.1 + - conda-forge::numpy=1.18.1 + - bioconda::iced=0.5.6 + - bioconda::bx-python=0.8.8 + - bioconda::pysam=0.15.4 + - conda-forge::pymdown-extensions=7.1 + - bioconda::cooler=0.8.6 + - bioconda::bowtie2=2.3.5 + - bioconda::samtools=1.9 + - bioconda::multiqc=1.8 + +## Dev tools + - bioconda::hicexplorer=3.4.3 + - bioconda::bioconductor-hitc=1.32.0 + - conda-forge::r-optparse=1.6.6 + - bioconda::ucsc-bedgraphtobigwig=377 - pip: - - iced==0.5.1 + - cooltools==0.3.2 + - fanc==0.8.30 \ No newline at end of file diff --git a/main.nf b/main.nf index d3632984e3085b131d696006d0097368b567bd9a..7eb43154af1206fab562a759ac27a87faacb86ed 100644 --- a/main.nf +++ b/main.nf @@ -9,7 +9,6 @@ ---------------------------------------------------------------------------------------- */ - def helpMessage() { // Add to this help message with new command line parameters log.info nfcoreHeader() @@ -22,62 +21,64 @@ def helpMessage() { nextflow run nf-core/hic --reads '*_R{1,2}.fastq.gz' -profile conda Mandatory arguments: - --reads Path to input data (must be surrounded with quotes) - -profile Configuration profile to use. Can use multiple (comma separated) - Available: conda, docker, singularity, awsbatch, test and more. - - References If not specified in the configuration file or you wish to overwrite any of the references. - --genome Name of iGenomes reference - --bwt2_index Path to Bowtie2 index - --fasta Path to Fasta reference - --chromosome_size Path to chromosome size file - --restriction_fragments Path to restriction fragment file (bed) - --saveReference Save reference genome to output folder. Default: False - --saveAlignedIntermediates Save intermediates alignment files. Default: False + --reads [file] Path to input data (must be surrounded with quotes) + -profile [str] Configuration profile to use. Can use multiple (comma separated) + Available: conda, docker, singularity, awsbatch, test and more. + + References If not specified in the configuration file or you wish to overwrite any of the references. + --genome [str] Name of iGenomes reference + --bwt2_index [file] Path to Bowtie2 index + --fasta [file] Path to Fasta reference + --chromosome_size [file] Path to chromosome size file + --restriction_fragments [file] Path to restriction fragment file (bed) + --save_reference [bool] Save reference genome to output folder. Default: False + --save_aligned_intermediates [bool] Save intermediates alignment files. Default: False Alignments - --bwt2_opts_end2end Options for bowtie2 end-to-end mappinf (first mapping step). See hic.config for default. - --bwt2_opts_trimmed Options for bowtie2 mapping after ligation site trimming. See hic.config for default. - --min_mapq Minimum mapping quality values to consider. Default: 10 - --restriction_site Cutting motif(s) of restriction enzyme(s) (comma separated). 
Default: 'A^AGCTT'
-      --ligation_site                    Ligation motifs to trim (comma separated). Default: 'AAGCTAGCTT'
-      --rm_singleton                     Remove singleton reads. Default: true
-      --rm_multi                         Remove multi-mapped reads. Default: true
-      --rm_dup                           Remove duplicates. Default: true
+      --bwt2_opts_end2end [str]          Options for bowtie2 end-to-end mapping (first mapping step). See hic.config for default.
+      --bwt2_opts_trimmed [str]          Options for bowtie2 mapping after ligation site trimming. See hic.config for default.
+      --min_mapq [int]                   Minimum mapping quality values to consider. Default: 10
+      --restriction_site [str]           Cutting motif(s) of restriction enzyme(s) (comma separated). Default: 'A^AGCTT'
+      --ligation_site [str]              Ligation motifs to trim (comma separated). Default: 'AAGCTAGCTT'
+      --rm_singleton [bool]              Remove singleton reads. Default: true
+      --rm_multi [bool]                  Remove multi-mapped reads. Default: true
+      --rm_dup [bool]                    Remove duplicates. Default: true

    Contacts calling
-      --min_restriction_fragment_size          Minimum size of restriction fragments to consider. Default: None
-      --max_restriction_fragment_size          Maximum size of restriction fragments to consider. Default: None
-      --min_insert_size                        Minimum insert size of mapped reads to consider. Default: None
-      --max_insert_size                        Maximum insert size of mapped reads to consider. Default: None
-      --saveInteractionBAM                     Save BAM file with interaction tags (dangling-end, self-circle, etc.). Default: False
+      --min_restriction_fragment_size [int]    Minimum size of restriction fragments to consider. Default: None
+      --max_restriction_fragment_size [int]    Maximum size of restriction fragments to consider. Default: None
+      --min_insert_size [int]                  Minimum insert size of mapped reads to consider. Default: None
+      --max_insert_size [int]                  Maximum insert size of mapped reads to consider. Default: None
+      --save_interaction_bam [bool]            Save BAM file with interaction tags (dangling-end, self-circle, etc.). Default: False

-      --dnase                            Run DNase Hi-C mode. All options related to restriction fragments are not considered. Default: False
-      --min_cis_dist                     Minimum intra-chromosomal distance to consider. Default: None
+      --dnase [bool]                     Run DNase Hi-C mode. All options related to restriction fragments are not considered. Default: False
+      --min_cis_dist [int]               Minimum intra-chromosomal distance to consider. Default: None

    Contact maps
-      --bin_size                               Bin size for contact maps (comma separated). Default: '1000000,500000'
-      --ice_max_iter                           Maximum number of iteration for ICE normalization. Default: 100
-      --ice_filter_low_count_perc              Percentage of low counts columns/rows to filter before ICE normalization. Default: 0.02
-      --ice_filter_high_count_perc             Percentage of high counts columns/rows to filter before ICE normalization. Default: 0
-      --ice_eps                                Convergence criteria for ICE normalization. Default: 0.1
+      --bin_size [int]                         Bin size for contact maps (comma separated). Default: '1000000,500000'
+      --ice_max_iter [int]                     Maximum number of iterations for ICE normalization. Default: 100
+      --ice_filter_low_count_perc [float]      Percentage of low counts columns/rows to filter before ICE normalization. Default: 0.02
+      --ice_filter_high_count_perc [float]     Percentage of high counts columns/rows to filter before ICE normalization. Default: 0
+      --ice_eps [float]                        Convergence criteria for ICE normalization. Default: 0.1

    Workflow
-      --skipMaps                         Skip generation of contact maps. Useful for capture-C. Default: False
-      --skipIce                          Skip ICE normalization. Default: False
-      --skipCool                         Skip generation of cool files. Default: False
-      --skipMultiQC                      Skip MultiQC.
Default: False
+      --skip_maps [bool]                 Skip generation of contact maps. Useful for capture-C. Default: False
+      --skip_ice [bool]                  Skip ICE normalization. Default: False
+      --skip_cool [bool]                 Skip generation of cool files. Default: False
+      --skip_multiqc [bool]              Skip MultiQC. Default: False

    Other
-      --splitFastq                       Size of read chuncks to use to speed up the workflow. Default: None
-      --outdir                           The output directory where the results will be saved. Default: './results'
-      --email                            Set this parameter to your e-mail address to get a summary e-mail with details of the run sent to you when the workflow exits. Default: None
-      -name                              Name for the pipeline run. If not specified, Nextflow will automatically generate a random mnemonic. Default: None
+      --split_fastq [bool]               Size of read chunks to use to speed up the workflow. Default: None
+      --outdir [file]                    The output directory where the results will be saved. Default: './results'
+      --email [email]                    Set this parameter to your e-mail address to get a summary e-mail with details of the run sent to you when the workflow exits. Default: None
+      --email_on_fail [email]            Same as --email, except only send mail if the workflow is not successful
+      --max_multiqc_email_size [str]     Threshold size for MultiQC report to be attached in notification email. If file generated by pipeline exceeds the threshold, it will not be attached (Default: 25MB)
+      -name [str]                        Name for the pipeline run. If not specified, Nextflow will automatically generate a random mnemonic. Default: None

    AWSBatch
-      --awsqueue                         The AWSBatch JobQueue that needs to be set when running on AWSBatch
-      --awsregion                        The AWS Region for your AWS Batch job to run on
+      --awsqueue [str]                   The AWSBatch JobQueue that needs to be set when running on AWSBatch
+      --awsregion [str]                  The AWS Region for your AWS Batch job to run on
    """.stripIndent()
}
@@ -105,28 +106,27 @@ if (!params.dnase && !params.ligation_site) {
params.bwt2_index = params.genome ? params.genomes[ params.genome ].bowtie2 ?: false : false
params.fasta = params.genome ? params.genomes[ params.genome ].fasta ?: false : false
-
// Has the run name been specified by the user?
//  this has the bonus effect of catching both -name and --name
custom_runName = params.name
-if( !(workflow.runName ==~ /[a-z]+_[a-z]+/) ){
-  custom_runName = workflow.runName
+if (!(workflow.runName ==~ /[a-z]+_[a-z]+/)) {
+    custom_runName = workflow.runName
}
-
-if( workflow.profile == 'awsbatch') {
-  // AWSBatch sanity checking
-  if (!params.awsqueue || !params.awsregion) exit 1, "Specify correct --awsqueue and --awsregion parameters on AWSBatch!"
-  // Check outdir paths to be S3 buckets if running on AWSBatch
-  // related: https://github.com/nextflow-io/nextflow/issues/813
-  if (!params.outdir.startsWith('s3:')) exit 1, "Outdir not on S3 - specify S3 Bucket to run on AWSBatch!"
-  // Prevent trace files to be stored on S3 since S3 does not support rolling files.
-  if (workflow.tracedir.startsWith('s3:')) exit 1, "Specify a local tracedir or run without trace! S3 cannot be used for tracefiles."
+if (workflow.profile.contains('awsbatch')) {
+    // AWSBatch sanity checking
+    if (!params.awsqueue || !params.awsregion) exit 1, "Specify correct --awsqueue and --awsregion parameters on AWSBatch!"
+    // Check outdir paths to be S3 buckets if running on AWSBatch
+    // related: https://github.com/nextflow-io/nextflow/issues/813
+    if (!params.outdir.startsWith('s3:')) exit 1, "Outdir not on S3 - specify S3 Bucket to run on AWSBatch!"
+    // Prevent trace files to be stored on S3 since S3 does not support rolling files.
+ if (params.tracedir.startsWith('s3:')) exit 1, "Specify a local tracedir or run without trace! S3 cannot be used for tracefiles." } // Stage config files -ch_multiqc_config = Channel.fromPath(params.multiqc_config) -ch_output_docs = Channel.fromPath("$baseDir/docs/output.md") +ch_multiqc_config = file("$baseDir/assets/multiqc_config.yaml", checkIfExists: true) +ch_multiqc_custom_config = params.multiqc_config ? Channel.fromPath(params.multiqc_config, checkIfExists: true) : Channel.empty() +ch_output_docs = file("$baseDir/docs/output.md", checkIfExists: true) /********************************************************** * SET UP CHANNELS @@ -135,7 +135,6 @@ ch_output_docs = Channel.fromPath("$baseDir/docs/output.md") /* * input read files */ - if (params.readPaths){ raw_reads = Channel.create() @@ -155,7 +154,7 @@ if (params.readPaths){ .separate( raw_reads, raw_reads_2 ) { a -> [tuple(a[0], a[1][0]), tuple(a[0], a[1][1])] } } -if ( params.splitFastq ){ +if ( params.split_fastq ){ raw_reads_full = raw_reads.concat( raw_reads_2 ) raw_reads = raw_reads_full.splitFastq( by: params.splitFastq , file: true) }else{ @@ -194,7 +193,6 @@ else { } // Chromosome size - if ( params.chromosome_size ){ Channel.fromPath( params.chromosome_size , checkIfExists: true) .into {chromosome_size; chromosome_size_cool} @@ -225,10 +223,6 @@ else { // Resolutions for contact maps map_res = Channel.from( params.bin_size.tokenize(',') ) -// Stage config files -ch_multiqc_config = Channel.fromPath(params.multiqc_config) -ch_output_docs = Channel.fromPath("$baseDir/docs/output.md") - /********************************************************** * SET UP LOGS */ @@ -239,7 +233,7 @@ def summary = [:] if(workflow.revision) summary['Pipeline Release'] = workflow.revision summary['Run Name'] = custom_runName ?: workflow.runName summary['Reads'] = params.reads -summary['splitFastq'] = params.splitFastq +summary['splitFastq'] = params.split_fastq summary['Fasta Ref'] = params.fasta summary['Restriction Motif']= params.restriction_site summary['Ligation Motif'] = params.ligation_site @@ -247,12 +241,11 @@ summary['DNase Mode'] = params.dnase summary['Remove Dup'] = params.rm_dup summary['Min MAPQ'] = params.min_mapq summary['Min Fragment Size']= params.min_restriction_fragment_size -summary['Max Fragment Size']= params.max_restriction_framgnet_size +summary['Max Fragment Size']= params.max_restriction_fragment_size summary['Min Insert Size'] = params.min_insert_size summary['Max Insert Size'] = params.max_insert_size summary['Min CIS dist'] = params.min_cis_dist summary['Maps resolution'] = params.bin_size - summary['Max Memory'] = params.max_memory summary['Max CPUs'] = params.max_cpus summary['Max Time'] = params.max_time @@ -286,9 +279,10 @@ log.info "-\033[2m--------------------------------------------------\033[0m-" // Check the hostnames against configured profiles checkHostname() -def create_workflow_summary(summary) { - def yaml_file = workDir.resolve('workflow_summary_mqc.yaml') - yaml_file.text = """ +Channel.from(summary.collect{ [it.key, it.value] }) + .map { k,v -> "<dt>$k</dt><dd><samp>${v ?: '<span style=\"color:#999999;\">N/A</a>'}</samp></dd>" } + .reduce { a, b -> return [a, b].join("\n ") } + .map { x -> """ id: 'nf-core-hic-summary' description: " - this information is collected when the pipeline is started." 
    section_name: 'nf-core/hic Workflow Summary'
@@ -296,17 +290,15 @@ def create_workflow_summary(summary) {
    plot_type: 'html'
    data: |
        <dl class=\"dl-horizontal\">
-${summary.collect { k,v -> "            <dt>$k</dt><dd><samp>${v ?: '<span style=\"color:#999999;\">N/A</a>'}</samp></dd>" }.join("\n")}
+            $x
        </dl>
-    """.stripIndent()
-
-   return yaml_file
-}
-
+    """.stripIndent() }
+   .set { ch_workflow_summary }

/*
 * Parse software version numbers
 */
+
process get_software_versions {
    publishDir "${params.outdir}/pipeline_info", mode: 'copy',
        saveAs: {filename ->
@@ -330,6 +322,25 @@ process get_software_versions {
    """
}

+def create_workflow_summary(summary) {
+
+    def yaml_file = workDir.resolve('workflow_summary_mqc.yaml')
+    yaml_file.text  = """
+    id: 'nf-core-hic-summary'
+    description: " - this information is collected when the pipeline is started."
+    section_name: 'nf-core/hic Workflow Summary'
+    section_href: 'https://github.com/nf-core/hic'
+    plot_type: 'html'
+    data: |
+        <dl class=\"dl-horizontal\">
+${summary.collect { k,v -> "            <dt>$k</dt><dd><samp>${v ?: '<span style=\"color:#999999;\">N/A</a>'}</samp></dd>" }.join("\n")}
+        </dl>
+    """.stripIndent()
+
+   return yaml_file
+}
+
+
/****************************************************
 * PRE-PROCESSING
@@ -338,8 +349,9 @@ process get_software_versions {
if(!params.bwt2_index && params.fasta){
    process makeBowtie2Index {
        tag "$bwt2_base"
-        publishDir path: { params.saveReference ? "${params.outdir}/reference_genome" : params.outdir },
-                   saveAs: { params.saveReference ? it : null }, mode: 'copy'
+        label 'process_highmem'
+        publishDir path: { params.save_reference ? "${params.outdir}/reference_genome" : params.outdir },
+                   saveAs: { params.save_reference ? it : null }, mode: 'copy'

        input:
        file fasta from fasta_for_index
@@ -361,8 +373,9 @@ if(!params.bwt2_index && params.fasta){
if(!params.chromosome_size && params.fasta){
    process makeChromSize {
        tag "$fasta"
-        publishDir path: { params.saveReference ? "${params.outdir}/reference_genome" : params.outdir },
-                   saveAs: { params.saveReference ? it : null }, mode: 'copy'
+        label 'process_low'
+        publishDir path: { params.save_reference ? "${params.outdir}/reference_genome" : params.outdir },
+                   saveAs: { params.save_reference ? it : null }, mode: 'copy'

        input:
        file fasta from fasta_for_chromsize
@@ -380,9 +393,10 @@ if(!params.chromosome_size && params.fasta){
    process getRestrictionFragments {
-        tag "$fasta - ${params.restriction_site}"
-        publishDir path: { params.saveReference ? "${params.outdir}/reference_genome" : params.outdir },
-                   saveAs: { params.saveReference ? it : null }, mode: 'copy'
+        tag "$fasta ${params.restriction_site}"
+        label 'process_low'
+        publishDir path: { params.save_reference ? "${params.outdir}/reference_genome" : params.outdir },
+                   saveAs: { params.save_reference ? it : null }, mode: 'copy'

        input:
        file fasta from fasta_for_resfrag
@@ -407,8 +421,9 @@ if(!params.restriction_fragments && params.fasta && !params.dnase){
process bowtie2_end_to_end {
    tag "$prefix"
-    publishDir path: { params.saveAlignedIntermediates ? "${params.outdir}/mapping" : params.outdir },
-               saveAs: { params.saveAlignedIntermediates ? it : null }, mode: 'copy'
+    label 'process_medium'
+    publishDir path: { params.save_aligned_intermediates ? "${params.outdir}/mapping" : params.outdir },
+               saveAs: { params.save_aligned_intermediates ?
it : null }, mode: 'copy' input: set val(sample), file(reads) from raw_reads @@ -445,8 +460,9 @@ process bowtie2_end_to_end { process trim_reads { tag "$prefix" - publishDir path: { params.saveAlignedIntermediates ? "${params.outdir}/mapping" : params.outdir }, - saveAs: { params.saveAlignedIntermediates ? it : null }, mode: 'copy' + label 'process_low' + publishDir path: { params.save_aligned_intermediates ? "${params.outdir}/mapping" : params.outdir }, + saveAs: { params.save_aligned_intermediates ? it : null }, mode: 'copy' when: !params.dnase @@ -467,8 +483,9 @@ process trim_reads { process bowtie2_on_trimmed_reads { tag "$prefix" - publishDir path: { params.saveAlignedIntermediates ? "${params.outdir}/mapping" : params.outdir }, - saveAs: { params.saveAlignedIntermediates ? it : null }, mode: 'copy' + label 'process_medium' + publishDir path: { params.save_aligned_intermediates ? "${params.outdir}/mapping" : params.outdir }, + saveAs: { params.save_aligned_intermediates ? it : null }, mode: 'copy' when: !params.dnase @@ -494,8 +511,9 @@ process bowtie2_on_trimmed_reads { if (!params.dnase){ process merge_mapping_steps{ tag "$sample = $bam1 + $bam2" - publishDir path: { params.saveAlignedIntermediates ? "${params.outdir}/mapping" : params.outdir }, - saveAs: { params.saveAlignedIntermediates ? it : null }, mode: 'copy' + label 'process_medium' + publishDir path: { params.save_aligned_intermediates ? "${params.outdir}/mapping" : params.outdir }, + saveAs: { params.save_aligned_intermediates ? it : null }, mode: 'copy' input: set val(prefix), file(bam1), file(bam2) from end_to_end_bam.join( trimmed_bam ) @@ -534,8 +552,9 @@ if (!params.dnase){ }else{ process dnase_mapping_stats{ tag "$sample = $bam1" - publishDir path: { params.saveAlignedIntermediates ? "${params.outdir}/mapping" : params.outdir }, - saveAs: { params.saveAlignedIntermediates ? it : null }, mode: 'copy' + label 'process_medium' + publishDir path: { params.save_aligned_intermediates ? "${params.outdir}/mapping" : params.outdir }, + saveAs: { params.save_aligned_intermediates ? it : null }, mode: 'copy' input: set val(prefix), file(bam1) from end_to_end_bam @@ -561,10 +580,9 @@ if (!params.dnase){ } } -println(bwt2_merged_bam) - process combine_mapped_files{ tag "$sample = $r1_prefix + $r2_prefix" + label 'process_low' publishDir "${params.outdir}/mapping", mode: 'copy', saveAs: {filename -> filename.indexOf(".pairstat") > 0 ? "stats/$filename" : "$filename"} @@ -599,6 +617,7 @@ process combine_mapped_files{ if (!params.dnase){ process get_valid_interaction{ tag "$sample" + label 'process_low' publishDir "${params.outdir}/hic_results/data", mode: 'copy', saveAs: {filename -> filename.indexOf("*stat") > 0 ? 
"stats/$filename" : "$filename"} @@ -616,7 +635,7 @@ if (!params.dnase){ set val(sample), file("*RSstat") into all_rsstat script: - if (params.splitFastq){ + if (params.split_fastq){ sample = sample.toString() - ~/(\.[0-9]+)$/ } @@ -626,15 +645,18 @@ if (!params.dnase){ if ("$params.max_insert_size".isInteger()) opts="${opts} -l ${params.max_insert_size}" if ("$params.min_restriction_fragment_size".isInteger()) opts="${opts} -t ${params.min_restriction_fragment_size}" if ("$params.max_restriction_fragment_size".isInteger()) opts="${opts} -m ${params.max_restriction_fragment_size}" - if (params.saveInteractionBAM) opts="${opts} --sam" + if (params.save_interaction_bam) opts="${opts} --sam" + prefix = pe_bam.toString() - ~/.bam/ """ mapped_2hic_fragments.py -f ${frag_file} -r ${pe_bam} --all ${opts} + sort -T /tmp/ -k2,2V -k3,3n -k5,5V -k6,6n -o ${prefix}.validPairs ${prefix}.validPairs """ } } else{ process get_valid_interaction_dnase{ tag "$sample" + label 'process_low' publishDir "${params.outdir}/hic_results/data", mode: 'copy', saveAs: {filename -> filename.indexOf("*stat") > 0 ? "stats/$filename" : "$filename"} @@ -647,14 +669,16 @@ else{ set val(sample), file("*RSstat") into all_rsstat script: - if (params.splitFastq){ + if (params.split_fastq){ sample = sample.toString() - ~/(\.[0-9]+)$/ } def opts = "" if ("$params.min_cis_dist".isInteger()) opts="${opts} -d ${params.min_cis_dist}" + prefix = pe_bam.toString() - ~/.bam/ """ mapped_2hic_dnase.py -r ${pe_bam} ${opts} + sort -T /tmp/ -k2,2V -k3,3n -k5,5V -k6,6n -o ${prefix}.validPairs ${prefix}.validPairs """ } } @@ -666,6 +690,7 @@ else{ process remove_duplicates { tag "$sample" + label 'process_highmem' publishDir "${params.outdir}/hic_results/data", mode: 'copy', saveAs: {filename -> filename.indexOf("*stat") > 0 ? 
"stats/$sample/$filename" : "$filename"} @@ -712,6 +737,7 @@ process remove_duplicates { process merge_sample { tag "$ext" + label 'process_low' publishDir "${params.outdir}/hic_results/stats/${sample}", mode: 'copy' input: @@ -731,13 +757,13 @@ process merge_sample { """ } - process build_contact_maps{ tag "$sample - $mres" + label 'process_highmem' publishDir "${params.outdir}/hic_results/matrix/raw", mode: 'copy' when: - !params.skipMaps + !params.skip_maps input: set val(sample), file(vpairs), val(mres) from all_valid_pairs.combine(map_res) @@ -759,10 +785,11 @@ process build_contact_maps{ process run_ice{ tag "$rmaps" + label 'process_highmem' publishDir "${params.outdir}/hic_results/matrix/iced", mode: 'copy' when: - !params.skipMaps && !params.skipIce + !params.skip_maps && !params.skip_ice input: file(rmaps) from raw_maps @@ -787,10 +814,11 @@ process run_ice{ */ process generate_cool{ tag "$sample" + label 'process_medium' publishDir "${params.outdir}/export/cool", mode: 'copy' when: - !params.skipCool + !params.skip_cool input: set val(sample), file(vpairs) from all_valid_pairs_4cool @@ -810,13 +838,15 @@ process generate_cool{ * STEP 6 - MultiQC */ process multiqc { + label 'process_low' publishDir "${params.outdir}/MultiQC", mode: 'copy' when: - !params.skipMultiQC + !params.skip_multiqc input: file multiqc_config from ch_multiqc_config + file (mqc_custom_config) from ch_multiqc_custom_config.collect().ifEmpty([]) file ('input_*/*') from all_mstats.concat(all_mergestat).collect() file ('software_versions/*') from software_versions_yaml file workflow_summary from create_workflow_summary(summary) @@ -833,8 +863,6 @@ process multiqc { """ } - - /* * STEP 7 - Output Description HTML */ @@ -849,12 +877,10 @@ process output_documentation { script: """ - markdown_to_html.r $output_docs results_description.html + markdown_to_html.py $output_docs -o results_description.html """ } - - /* * Completion e-mail notification */ @@ -863,8 +889,8 @@ workflow.onComplete { // Set up the e-mail variables def subject = "[nf-core/hic] Successful: $workflow.runName" - if(!workflow.success){ - subject = "[nf-core/hic] FAILED: $workflow.runName" + if (!workflow.success) { + subject = "[nf-core/hic] FAILED: $workflow.runName" } def email_fields = [:] email_fields['version'] = workflow.manifest.version @@ -882,10 +908,9 @@ workflow.onComplete { email_fields['summary']['Date Completed'] = workflow.complete email_fields['summary']['Pipeline script file path'] = workflow.scriptFile email_fields['summary']['Pipeline script hash ID'] = workflow.scriptId - if(workflow.repository) email_fields['summary']['Pipeline repository Git URL'] = workflow.repository - if(workflow.commitId) email_fields['summary']['Pipeline repository Git Commit'] = workflow.commitId - if(workflow.revision) email_fields['summary']['Pipeline Git branch/tag'] = workflow.revision - if(workflow.container) email_fields['summary']['Docker image'] = workflow.container + if (workflow.repository) email_fields['summary']['Pipeline repository Git URL'] = workflow.repository + if (workflow.commitId) email_fields['summary']['Pipeline repository Git Commit'] = workflow.commitId + if (workflow.revision) email_fields['summary']['Pipeline Git branch/tag'] = workflow.revision email_fields['summary']['Nextflow Version'] = workflow.nextflow.version email_fields['summary']['Nextflow Build'] = workflow.nextflow.build email_fields['summary']['Nextflow Compile Timestamp'] = workflow.nextflow.timestamp @@ -895,8 +920,8 @@ workflow.onComplete { def mqc_report = 
null try { if (workflow.success) { - mqc_report = multiqc_report.getVal() - if (mqc_report.getClass() == ArrayList){ + mqc_report = ch_multiqc_report.getVal() + if (mqc_report.getClass() == ArrayList) { log.warn "[nf-core/hic] Found multiple reports from process 'multiqc', will use only one" mqc_report = mqc_report[0] } @@ -905,6 +930,12 @@ workflow.onComplete { log.warn "[nf-core/hic] Could not attach MultiQC report to summary email" } + // Check if we are only sending emails on failure + email_address = params.email + if (!params.email && params.email_on_fail && !workflow.success) { + email_address = params.email_on_fail + } + // Render the TXT template def engine = new groovy.text.GStringTemplateEngine() def tf = new File("$baseDir/assets/email_template.txt") @@ -917,67 +948,67 @@ workflow.onComplete { def email_html = html_template.toString() // Render the sendmail template - def smail_fields = [ email: params.email, subject: subject, email_txt: email_txt, email_html: email_html, baseDir: "$baseDir", mqcFile: mqc_report, mqcMaxSize: params.maxMultiqcEmailFileSize.toBytes() ] + def smail_fields = [ email: email_address, subject: subject, email_txt: email_txt, email_html: email_html, baseDir: "$baseDir", mqcFile: mqc_report, mqcMaxSize: params.max_multiqc_email_size.toBytes() ] def sf = new File("$baseDir/assets/sendmail_template.txt") def sendmail_template = engine.createTemplate(sf).make(smail_fields) def sendmail_html = sendmail_template.toString() // Send the HTML e-mail - if (params.email) { + if (email_address) { try { - if( params.plaintext_email ){ throw GroovyException('Send plaintext e-mail, not HTML') } - // Try to send HTML e-mail using sendmail - [ 'sendmail', '-t' ].execute() << sendmail_html - log.info "[nf-core/hic] Sent summary e-mail to $params.email (sendmail)" + if (params.plaintext_email) { throw GroovyException('Send plaintext e-mail, not HTML') } + // Try to send HTML e-mail using sendmail + [ 'sendmail', '-t' ].execute() << sendmail_html + log.info "[nf-core/hic] Sent summary e-mail to $email_address (sendmail)" } catch (all) { - // Catch failures and try with plaintext - [ 'mail', '-s', subject, params.email ].execute() << email_txt - log.info "[nf-core/hic] Sent summary e-mail to $params.email (mail)" + // Catch failures and try with plaintext + [ 'mail', '-s', subject, email_address ].execute() << email_txt + log.info "[nf-core/hic] Sent summary e-mail to $email_address (mail)" } } // Write summary e-mail HTML to a file - def output_d = new File( "${params.outdir}/pipeline_info/" ) - if( !output_d.exists() ) { - output_d.mkdirs() + def output_d = new File("${params.outdir}/pipeline_info/") + if (!output_d.exists()) { + output_d.mkdirs() } - def output_hf = new File( output_d, "pipeline_report.html" ) + def output_hf = new File(output_d, "pipeline_report.html") output_hf.withWriter { w -> w << email_html } - def output_tf = new File( output_d, "pipeline_report.txt" ) + def output_tf = new File(output_d, "pipeline_report.txt") output_tf.withWriter { w -> w << email_txt } - c_reset = params.monochrome_logs ? '' : "\033[0m"; - c_purple = params.monochrome_logs ? '' : "\033[0;35m"; c_green = params.monochrome_logs ? '' : "\033[0;32m"; + c_purple = params.monochrome_logs ? '' : "\033[0;35m"; c_red = params.monochrome_logs ? '' : "\033[0;31m"; + c_reset = params.monochrome_logs ? 
'' : "\033[0m"; if (workflow.stats.ignoredCount > 0 && workflow.success) { - log.info "${c_purple}Warning, pipeline completed, but with errored process(es) ${c_reset}" - log.info "${c_red}Number of ignored errored process(es) : ${workflow.stats.ignoredCount} ${c_reset}" - log.info "${c_green}Number of successfully ran process(es) : ${workflow.stats.succeedCount} ${c_reset}" + log.info "-${c_purple}Warning, pipeline completed, but with errored process(es) ${c_reset}-" + log.info "-${c_red}Number of ignored errored process(es) : ${workflow.stats.ignoredCount} ${c_reset}-" + log.info "-${c_green}Number of successfully ran process(es) : ${workflow.stats.succeedCount} ${c_reset}-" } - if(workflow.success){ - log.info "${c_purple}[nf-core/hic]${c_green} Pipeline completed successfully${c_reset}" + if (workflow.success) { + log.info "-${c_purple}[nf-core/hic]${c_green} Pipeline completed successfully${c_reset}-" } else { checkHostname() - log.info "${c_purple}[nf-core/hic]${c_red} Pipeline completed with errors${c_reset}" + log.info "-${c_purple}[nf-core/hic]${c_red} Pipeline completed with errors${c_reset}-" } } -def nfcoreHeader(){ +def nfcoreHeader() { // Log colors ANSI codes - c_reset = params.monochrome_logs ? '' : "\033[0m"; - c_dim = params.monochrome_logs ? '' : "\033[2m"; c_black = params.monochrome_logs ? '' : "\033[0;30m"; - c_green = params.monochrome_logs ? '' : "\033[0;32m"; - c_yellow = params.monochrome_logs ? '' : "\033[0;33m"; c_blue = params.monochrome_logs ? '' : "\033[0;34m"; - c_purple = params.monochrome_logs ? '' : "\033[0;35m"; c_cyan = params.monochrome_logs ? '' : "\033[0;36m"; + c_dim = params.monochrome_logs ? '' : "\033[2m"; + c_green = params.monochrome_logs ? '' : "\033[0;32m"; + c_purple = params.monochrome_logs ? '' : "\033[0;35m"; + c_reset = params.monochrome_logs ? '' : "\033[0m"; c_white = params.monochrome_logs ? '' : "\033[0;37m"; + c_yellow = params.monochrome_logs ? '' : "\033[0;33m"; return """ -${c_dim}--------------------------------------------------${c_reset}- ${c_green},--.${c_black}/${c_green},-.${c_reset} @@ -985,21 +1016,21 @@ def nfcoreHeader(){ ${c_blue} |\\ | |__ __ / ` / \\ |__) |__ ${c_yellow}} {${c_reset} ${c_blue} | \\| | \\__, \\__/ | \\ |___ ${c_green}\\`-._,-`-,${c_reset} ${c_green}`._,._,\'${c_reset} - ${c_purple} nf-core/atacseq v${workflow.manifest.version}${c_reset} + ${c_purple} nf-core/hic v${workflow.manifest.version}${c_reset} -${c_dim}--------------------------------------------------${c_reset}- """.stripIndent() } -def checkHostname(){ +def checkHostname() { def c_reset = params.monochrome_logs ? '' : "\033[0m" def c_white = params.monochrome_logs ? '' : "\033[0;37m" def c_red = params.monochrome_logs ? '' : "\033[1;91m" def c_yellow_bold = params.monochrome_logs ? 
'' : "\033[1;93m" - if(params.hostnames){ + if (params.hostnames) { def hostname = "hostname".execute().text.trim() params.hostnames.each { prof, hnames -> hnames.each { hname -> - if(hostname.contains(hname) && !workflow.profile.contains(prof)){ + if (hostname.contains(hname) && !workflow.profile.contains(prof)) { log.error "====================================================\n" + " ${c_red}WARNING!${c_reset} You are running with `-profile $workflow.profile`\n" + " but your machine hostname is ${c_white}'$hostname'${c_reset}\n" + diff --git a/nextflow.config b/nextflow.config index 5d69802ee9402cae44e25e6880d3353e4d236561..7ad9a223c8d0a40e16fdc7727f78c217fac76364 100644 --- a/nextflow.config +++ b/nextflow.config @@ -9,38 +9,57 @@ params { // Workflow flags - // Specify your pipeline's command line flags - reads = "*{1,2}.fastq.gz" + genome = false + reads = "data/*{1,2}.fastq.gz" + single_end = false + outdir = './results' genome = false readPaths = false chromosome_size = false restriction_fragments = false - skipMaps = false - skipIce = false - skipCool = false - skipMultiQC = false + skip_maps = false + skip_ice = false + skip_cool = false + skip_multiqc = false + save_reference = false + save_interaction_bam = false + save_aligned_intermediates = false + dnase = false + rm_dup = false + rm_singleton = false + rm_multi = false + min_restriction_fragment_size = false + max_restriction_fragment_size = false + min_insert_size = false + max_insert_size = false + min_cis_dist = false // Boilerplate options + multiqc_config = false name = false - multiqc_config = "$baseDir/assets/multiqc_config.yaml" email = false - maxMultiqcEmailFileSize = 25.MB + email_on_fail = false + max_multiqc_email_size = 25.MB plaintext_email = false monochrome_logs = false help = false - igenomes_base = "./iGenomes" + igenomes_base = 's3://ngi-igenomes/igenomes/' tracedir = "${params.outdir}/pipeline_info" - awsqueue = false - awsregion = 'eu-west-1' - igenomesIgnore = false + igenomes_ignore = false + custom_config_version = 'master' custom_config_base = "https://raw.githubusercontent.com/nf-core/configs/${params.custom_config_version}" hostnames = false config_profile_description = false config_profile_contact = false config_profile_url = false + + // Defaults only, expecting to be overwritten + max_memory = 24.GB + max_cpus = 8 + max_time = 240.h } // Container slug. Stable releases should specify release tag! @@ -62,19 +81,33 @@ includeConfig 'conf/hicpro.config' // Create profiles profiles { - awsbatch { includeConfig 'conf/awsbatch.config' } conda { process.conda = "$baseDir/environment.yml" } debug { process.beforeScript = 'echo $HOSTNAME' } - docker { docker.enabled = true } - singularity { singularity.enabled = true } + docker { + docker.enabled = true + // Avoid this error: + // WARNING: Your kernel does not support swap limit capabilities or the cgroup is not mounted. Memory limited without swap. + // Testing this in nf-core after discussion here https://github.com/nf-core/tools/pull/351 + // once this is established and works well, nextflow might implement this behavior as new default. 
+ docker.runOptions = '-u \$(id -u):\$(id -g)' + } + singularity { + singularity.enabled = true + singularity.autoMounts = true + } test { includeConfig 'conf/test.config' } } // Load igenomes.config if required -if(!params.igenomesIgnore){ +if (!params.igenomes_ignore) { includeConfig 'conf/igenomes.config' } +// Export this variable to prevent local Python libraries from conflicting with those in the container +env { + PYTHONNOUSERSITE = 1 +} + // Capture exit codes from upstream processes when piping process.shell = ['/bin/bash', '-euo', 'pipefail'] @@ -101,16 +134,16 @@ manifest { homePage = 'https://github.com/nf-core/hic' description = 'Analysis of Chromosome Conformation Capture data (Hi-C)' mainScript = 'main.nf' - nextflowVersion = '>=19.04.0' - version = '1.1.1dev' + nextflowVersion = '>=19.10.0' + version = '1.2.0dev' } // Function to ensure that resource requirements don't go beyond // a maximum limit def check_max(obj, type) { - if(type == 'memory'){ + if (type == 'memory') { try { - if(obj.compareTo(params.max_memory as nextflow.util.MemoryUnit) == 1) + if (obj.compareTo(params.max_memory as nextflow.util.MemoryUnit) == 1) return params.max_memory as nextflow.util.MemoryUnit else return obj @@ -118,9 +151,9 @@ def check_max(obj, type) { println " ### ERROR ### Max memory '${params.max_memory}' is not valid! Using default value: $obj" return obj } - } else if(type == 'time'){ + } else if (type == 'time') { try { - if(obj.compareTo(params.max_time as nextflow.util.Duration) == 1) + if (obj.compareTo(params.max_time as nextflow.util.Duration) == 1) return params.max_time as nextflow.util.Duration else return obj @@ -128,7 +161,7 @@ def check_max(obj, type) { println " ### ERROR ### Max time '${params.max_time}' is not valid! Using default value: $obj" return obj } - } else if(type == 'cpus'){ + } else if (type == 'cpus') { try { return Math.min( obj, params.max_cpus as int ) } catch (all) {
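For context on the `check_max` function shown in the last hunk: in nf-core pipelines it is conventionally called from the per-process resource declarations (typically `conf/base.config`, which is not part of this diff), where requests scale with `task.attempt` on retries and are capped against `--max_cpus`, `--max_memory` and `--max_time`. A minimal sketch under that assumption, with illustrative base values and one of the labels introduced in this diff:

```groovy
// conf/base.config-style sketch (illustrative values, not taken from this diff)
process {
    cpus   = { check_max( 2 * task.attempt, 'cpus' ) }        // grows with task.attempt, capped by --max_cpus
    memory = { check_max( 8.GB * task.attempt, 'memory' ) }   // capped by --max_memory
    time   = { check_max( 4.h * task.attempt, 'time' ) }      // capped by --max_time

    withLabel:process_highmem {
        memory = { check_max( 24.GB * task.attempt, 'memory' ) }
    }
}
```

When a process is retried, Nextflow re-evaluates these closures with the incremented `task.attempt`, and `check_max` clamps the result to the limits defined in `nextflow.config`.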