diff --git a/results/.gitignore b/results/.gitignore deleted file mode 100644 index 72e8ffc0db8aad71a934dd11e5968bd5109e54b4..0000000000000000000000000000000000000000 --- a/results/.gitignore +++ /dev/null @@ -1 +0,0 @@ -* diff --git a/src/.docker_modules/bedops/2.4.39/Dockerfile b/src/.docker_modules/bedops/2.4.39/Dockerfile new file mode 100644 index 0000000000000000000000000000000000000000..2586c39b0c8f6fc0be6f145b0e72419c9c62f300 --- /dev/null +++ b/src/.docker_modules/bedops/2.4.39/Dockerfile @@ -0,0 +1,2 @@ +FROM quay.io/biocontainers/bedops:2.4.39--hc9558a2_0 +MAINTAINER Laurent Modolo diff --git a/src/.docker_modules/bedops/2.4.39/docker_init.sh b/src/.docker_modules/bedops/2.4.39/docker_init.sh new file mode 100755 index 0000000000000000000000000000000000000000..a50d06b132b1d69bf8e7d18c55fe05b4a5a15b7b --- /dev/null +++ b/src/.docker_modules/bedops/2.4.39/docker_init.sh @@ -0,0 +1,4 @@ +#!/bin/sh +docker pull lbmc/bedops:2.4.39 +docker build src/.docker_modules/bedops/2.4.39 -t 'lbmc/bedops:2.4.39' +docker push lbmc/bedops:2.4.39 diff --git a/src/.docker_modules/crossmap/0.4.1/Dockerfile b/src/.docker_modules/crossmap/0.4.1/Dockerfile new file mode 100644 index 0000000000000000000000000000000000000000..bc988bb81d9f6fb83699092f8c68c5b476489a94 --- /dev/null +++ b/src/.docker_modules/crossmap/0.4.1/Dockerfile @@ -0,0 +1,2 @@ +FROM quay.io/biocontainers/crossmap:0.4.1--pyh5ca1d4c_0 +MAINTAINER Laurent Modolo diff --git a/src/.docker_modules/crossmap/0.4.1/docker_init.sh b/src/.docker_modules/crossmap/0.4.1/docker_init.sh new file mode 100755 index 0000000000000000000000000000000000000000..8bf250e04c90965f0be29ff62eb3bd54c08e81cc --- /dev/null +++ b/src/.docker_modules/crossmap/0.4.1/docker_init.sh @@ -0,0 +1,4 @@ +#!/bin/sh +docker pull lbmc/crossmap:0.4.1 +docker build src/.docker_modules/crossmap/0.4.1/ -t 'lbmc/crossmap:0.4.1' +docker push lbmc/crossmap:0.4.1 diff --git a/src/.docker_modules/g2gtools/0.2.7/Dockerfile 
b/src/.docker_modules/g2gtools/0.2.7/Dockerfile new file mode 100644 index 0000000000000000000000000000000000000000..6268b9a83d5fbd548e94384a92eeb548d5bc46a2 --- /dev/null +++ b/src/.docker_modules/g2gtools/0.2.7/Dockerfile @@ -0,0 +1,12 @@ +FROM jcrist/alpine-conda:4.6.8 + +RUN /opt/conda/bin/conda config --add channels r \ + && /opt/conda/bin/conda config --add channels bioconda \ + && /opt/conda/bin/conda install --yes \ + -c kbchoi g2gtools pycparser setuptools\ + nomkl \ + && /opt/conda/bin/conda clean -afy \ + && find /opt/conda/ -follow -type f -name '*.a' -delete \ + && find /opt/conda/ -follow -type f -name '*.pyc' -delete \ + && find /opt/conda/ -follow -type f -name '*.js.map' -delete + diff --git a/src/.docker_modules/g2gtools/0.2.7/docker_init.sh b/src/.docker_modules/g2gtools/0.2.7/docker_init.sh new file mode 100755 index 0000000000000000000000000000000000000000..2da2d65ca6ecc154956f98dd2bc980f70311c41c --- /dev/null +++ b/src/.docker_modules/g2gtools/0.2.7/docker_init.sh @@ -0,0 +1,4 @@ +#!/bin/sh +docker pull lbmc/g2gtools:0.2.7 +docker build src/.docker_modules/g2gtools/0.2.7 -t 'lbmc/g2gtools:0.2.7' +docker push lbmc/g2gtools:0.2.7 diff --git a/src/.docker_modules/g2gtools/0.2.8/Dockerfile b/src/.docker_modules/g2gtools/0.2.8/Dockerfile new file mode 100644 index 0000000000000000000000000000000000000000..c57b061fd1380ba0c432ba515ce7247044e0e957 --- /dev/null +++ b/src/.docker_modules/g2gtools/0.2.8/Dockerfile @@ -0,0 +1,11 @@ +FROM python:3.8-alpine +MAINTAINER Laurent Modolo + +ENV G2GTOOLS_VERSION=0.2.8 + +RUN apk add --update --no-cache bash musl-dev linux-headers g++ cmake make build-base bzip2-dev zlib-dev xz-dev autoconf \ + && wget https://github.com/churchill-lab/g2gtools/archive/v${G2GTOOLS_VERSION}.tar.gz \ + && tar -xvf v${G2GTOOLS_VERSION}.tar.gz \ + && cd g2gtools-${G2GTOOLS_VERSION} \ + && pip install numpy \ + && make install diff --git a/src/.docker_modules/g2gtools/0.2.8/docker_init.sh 
b/src/.docker_modules/g2gtools/0.2.8/docker_init.sh new file mode 100755 index 0000000000000000000000000000000000000000..99cbd49ff63c77fb957b0193c4596029257b2de7 --- /dev/null +++ b/src/.docker_modules/g2gtools/0.2.8/docker_init.sh @@ -0,0 +1,4 @@ +#!/bin/sh +docker pull lbmc/g2gtools:0.2.8 +docker build src/.docker_modules/g2gtools/0.2.8 -t 'lbmc/g2gtools:0.2.8' +docker push lbmc/g2gtools:0.2.8 diff --git a/src/.docker_modules/gffread/0.11.8/Dockerfile b/src/.docker_modules/gffread/0.11.8/Dockerfile new file mode 100644 index 0000000000000000000000000000000000000000..cbb5f60b09e834d05a50d9b4e2d4975577700673 --- /dev/null +++ b/src/.docker_modules/gffread/0.11.8/Dockerfile @@ -0,0 +1,16 @@ +FROM alpine:3.12 +MAINTAINER Laurent Modolo + +ENV GFFREAD_VERSION=0.11.8 +ENV PACKAGES make \ + g++ \ + bash \ + perl + +RUN apk update && \ + apk add ${PACKAGES} && \ +wget http://ccb.jhu.edu/software/stringtie/dl/gffread-${GFFREAD_VERSION}.tar.gz && \ +tar -xvf gffread-${GFFREAD_VERSION}.tar.gz && \ +cd gffread-${GFFREAD_VERSION}/ && \ +make && \ +cp gffread /usr/bin/ diff --git a/src/.docker_modules/gffread/0.11.8/docker_init.sh b/src/.docker_modules/gffread/0.11.8/docker_init.sh new file mode 100755 index 0000000000000000000000000000000000000000..44c18612cbc9b8d5c093d980848bfc03d1b2f1e6 --- /dev/null +++ b/src/.docker_modules/gffread/0.11.8/docker_init.sh @@ -0,0 +1,4 @@ +#!/bin/sh +docker pull lbmc/gffread:0.11.8 +docker build src/.docker_modules/gffread/0.11.8 -t 'lbmc/gffread:0.11.8' +docker push lbmc/gffread:0.11.8 diff --git a/src/.docker_modules/kallistobustools/0.39.3/Dockerfile b/src/.docker_modules/kallistobustools/0.39.3/Dockerfile new file mode 100644 index 0000000000000000000000000000000000000000..c993e745ee294fc3ddb4601f850dd401b1f6b176 --- /dev/null +++ b/src/.docker_modules/kallistobustools/0.39.3/Dockerfile @@ -0,0 +1,31 @@ +FROM python:3.8-alpine +MAINTAINER Laurent Modolo + +ENV B_VERSION=0.39.3 +ENV K_VERSION=0.46.1 + +RUN apk add --update --no-cache bash 
musl-dev linux-headers g++ cmake make build-base hdf5 hdf5-dev zlib-dev autoconf && \ +wget https://github.com/BUStools/bustools/archive/v${B_VERSION}.tar.gz && \ +tar xvf v${B_VERSION}.tar.gz && \ +cd bustools-${B_VERSION} && \ +mkdir build && \ +cd build && \ +cmake .. && \ +sed -i -e 's/"Common\.hpp"/"Common\.hpp"\n#include <cmath>/g' ../src/bustools_whitelist.h && \ +sed -i 's/pow/std::pow/g' ../src/bustools_whitelist.cpp && \ +make && \ +make install && \ +wget https://github.com/pachterlab/kallisto/archive/v${K_VERSION}.tar.gz && \ +tar xvf v${K_VERSION}.tar.gz && \ +cd kallisto-${K_VERSION} && \ +mkdir build && \ +cd build && \ +cmake .. && \ +make && \ +make install && \ +wget https://github.com/BUStools/getting_started/releases/download/getting_started/t2g.py && \ +chmod +x t2g.py && \ +mv t2g.py /usr/local/bin/ && \ +rm -R kallisto* bustools* v${K_VERSION}.tar.gz v${B_VERSION}.tar.gz + +CMD ["sh"] diff --git a/src/.docker_modules/kallistobustools/0.39.3/docker_init.sh b/src/.docker_modules/kallistobustools/0.39.3/docker_init.sh new file mode 100755 index 0000000000000000000000000000000000000000..5cdd8c44773d7eb1f21bf5f500e3556f978ecdf9 --- /dev/null +++ b/src/.docker_modules/kallistobustools/0.39.3/docker_init.sh @@ -0,0 +1,4 @@ +#!/bin/sh +docker pull lbmc/kallistobustools:0.39.3 +docker build src/.docker_modules/kallistobustools/0.39.3 -t 'lbmc/kallistobustools:0.39.3' +docker push lbmc/kallistobustools:0.39.3 diff --git a/src/.docker_modules/r/3.5.3/Dockerfile b/src/.docker_modules/r-base/3.5.3/Dockerfile similarity index 100% rename from src/.docker_modules/r/3.5.3/Dockerfile rename to src/.docker_modules/r-base/3.5.3/Dockerfile diff --git a/src/.docker_modules/r-base/3.5.3/docker_init.sh b/src/.docker_modules/r-base/3.5.3/docker_init.sh new file mode 100755 index 0000000000000000000000000000000000000000..f62473c54ae30f85f36293fb74b42fc255062a46 --- /dev/null +++ b/src/.docker_modules/r-base/3.5.3/docker_init.sh @@ -0,0 +1,4 @@ +#!/bin/sh +docker pull 
lbmc/r:3.5.3 +docker build src/.docker_modules/r-base/3.5.3 -t 'lbmc/r:3.5.3' +docker push lbmc/r:3.5.3 diff --git a/src/.docker_modules/r/3.6.2/Dockerfile b/src/.docker_modules/r-base/3.6.2/Dockerfile similarity index 100% rename from src/.docker_modules/r/3.6.2/Dockerfile rename to src/.docker_modules/r-base/3.6.2/Dockerfile diff --git a/src/.docker_modules/r/3.6.2/docker_init.sh b/src/.docker_modules/r-base/3.6.2/docker_init.sh similarity index 50% rename from src/.docker_modules/r/3.6.2/docker_init.sh rename to src/.docker_modules/r-base/3.6.2/docker_init.sh index 78d50296b2f41e54690986e4d7c61a8b7f4c4419..d1e6e8183e95ba787d244ae17f13d0f6e97eefba 100755 --- a/src/.docker_modules/r/3.6.2/docker_init.sh +++ b/src/.docker_modules/r-base/3.6.2/docker_init.sh @@ -1,4 +1,4 @@ #!/bin/sh docker pull lbmc/r-base:3.6.2 -docker build src/.docker_modules/r/3.6.2 -t 'lbmc/r-base:3.6.2' +docker build src/.docker_modules/r-base/3.6.2 -t 'lbmc/r-base:3.6.2' docker push lbmc/r-base:3.6.2 diff --git a/src/.docker_modules/r/4.0.0/Dockerfile b/src/.docker_modules/r-base/4.0.0/Dockerfile similarity index 100% rename from src/.docker_modules/r/4.0.0/Dockerfile rename to src/.docker_modules/r-base/4.0.0/Dockerfile diff --git a/src/.docker_modules/r/4.0.0/docker_init.sh b/src/.docker_modules/r-base/4.0.0/docker_init.sh similarity index 50% rename from src/.docker_modules/r/4.0.0/docker_init.sh rename to src/.docker_modules/r-base/4.0.0/docker_init.sh index 2f7d058612659d3b56e6d4b22e0c94798c01ccac..fe24f44d1733d3a3cce32eb516ddcd7d8ae50930 100755 --- a/src/.docker_modules/r/4.0.0/docker_init.sh +++ b/src/.docker_modules/r-base/4.0.0/docker_init.sh @@ -1,4 +1,4 @@ #!/bin/sh docker pull lbmc/r-base:4.0.0 -docker build src/.docker_modules/r/4.0.0 -t 'lbmc/r-base:4.0.0' +docker build src/.docker_modules/r-base/4.0.0 -t 'lbmc/r-base:4.0.0' docker push lbmc/r-base:4.0.0 diff --git a/src/.docker_modules/r-base/4.0.2/Dockerfile b/src/.docker_modules/r-base/4.0.2/Dockerfile new file mode 100644 
index 0000000000000000000000000000000000000000..f9ad39e2fe3ad59963ccdadb916a1d182ab91452 --- /dev/null +++ b/src/.docker_modules/r-base/4.0.2/Dockerfile @@ -0,0 +1,33 @@ +FROM alpine:3.12.0 +MAINTAINER Lauret Modolo + +ENV R_PKGS R=~4.0.2 \ + R-mathlib=~4.0.2 \ + R-dev=~4.0.2 \ + R-doc=~4.0.2 + +ENV R_DEPS g++ \ + libxml2-dev \ + make \ + cmake \ + linux-headers \ + cairo-dev \ + libxmu-dev \ + pango-dev \ + perl \ + tiff-dev \ + icu-dev \ + libjpeg-turbo \ + pcre-dev \ + readline-dev \ + libexecinfo-dev \ + file \ + ttf-linux-libertine \ + git + +RUN echo "http://ftp.acc.umu.se/mirror/alpinelinux.org/v3.11/main" > /etc/apk/repositories \ +&& echo "http://ftp.acc.umu.se/mirror/alpinelinux.org/v3.11/community" >> /etc/apk/repositories \ +&& sed -i -e 's/v[[:digit:]]\..*\//edge\//g' /etc/apk/repositories \ +&& apk add --update --no-cache ${R_PKGS} ${R_DEPS} + +CMD ["R", "--no-save"] diff --git a/src/.docker_modules/r-base/4.0.2/docker_init.sh b/src/.docker_modules/r-base/4.0.2/docker_init.sh new file mode 100755 index 0000000000000000000000000000000000000000..16f69fd3450819f58f071fb6f4ea06ee15e38e27 --- /dev/null +++ b/src/.docker_modules/r-base/4.0.2/docker_init.sh @@ -0,0 +1,4 @@ +#!/bin/sh +docker pull lbmc/r-base:4.0.2 +docker build src/.docker_modules/r/4.0.2 -t 'lbmc/r-base:4.0.2' +docker push lbmc/r-base:4.0.2 diff --git a/src/.docker_modules/r/3.5.3/docker_init.sh b/src/.docker_modules/r/3.5.3/docker_init.sh deleted file mode 100755 index ce78559716b93f92c3c602cf242dd1d42a1e6220..0000000000000000000000000000000000000000 --- a/src/.docker_modules/r/3.5.3/docker_init.sh +++ /dev/null @@ -1,4 +0,0 @@ -#!/bin/sh -docker pull lbmc/r:3.5.3 -docker build src/.docker_modules/r/3.5.3 -t 'lbmc/r:3.5.3' -docker push lbmc/r:3.5.3 diff --git a/src/.docker_modules/sabre/039a55e/Dockerfile b/src/.docker_modules/sabre/039a55e/Dockerfile new file mode 100644 index 0000000000000000000000000000000000000000..f769d43f51f93c949411beea29a540ce2f01a030 --- /dev/null +++ 
b/src/.docker_modules/sabre/039a55e/Dockerfile @@ -0,0 +1,20 @@ +FROM alpine:3.12.0 +MAINTAINER Lauret Modolo + +ENV SABRE_VERSION=039a55e + +ENV SABRE_DEPS g++ bash make zlib-dev git + +RUN echo "http://ftp.acc.umu.se/mirror/alpinelinux.org/v3.11/main" > /etc/apk/repositories \ +&& echo "http://ftp.acc.umu.se/mirror/alpinelinux.org/v3.11/community" >> /etc/apk/repositories \ +&& sed -i -e 's/v[[:digit:]]\..*\//edge\//g' /etc/apk/repositories \ +&& apk add --update --no-cache ${SABRE_DEPS} \ +&& git clone https://github.com/najoshi/sabre.git \ +&& cd sabre \ +&& git checkout $SABRE_VERSION \ +&& make \ +&& mv sabre /usr/bin \ +&& chmod +x /usr/bin/sabre + + +CMD ["bash"] diff --git a/src/.docker_modules/sabre/039a55e/docker_init.sh b/src/.docker_modules/sabre/039a55e/docker_init.sh new file mode 100755 index 0000000000000000000000000000000000000000..fc0f318f612a582b7a56691d27cd5454b2b3370b --- /dev/null +++ b/src/.docker_modules/sabre/039a55e/docker_init.sh @@ -0,0 +1,4 @@ +#!/bin/sh +docker pull lbmc/sabre:039a55e +docker build src/.docker_modules/sabre/039a55e -t 'lbmc/sabre:039a55e' +docker push lbmc/sabre:039a55e diff --git a/src/.docker_modules/ucsc/400/Dockerfile b/src/.docker_modules/ucsc/400/Dockerfile new file mode 100644 index 0000000000000000000000000000000000000000..6409f862f0c1b51d853597cd4404bb55f03137c2 --- /dev/null +++ b/src/.docker_modules/ucsc/400/Dockerfile @@ -0,0 +1,27 @@ +FROM debian:jessie +MAINTAINER Laurent Modolo + +ENV PACKAGES apt-utils \ + curl \ + build-essential \ + libssl-dev \ + libpng-dev \ + uuid-dev \ + libmysqlclient-dev \ + procps \ + rsync + + +RUN apt-get update && \ + apt-get install -y ${PACKAGES} + +ENV UCSC_VERSION=400 + +RUN curl -k -L http://hgdownload.soe.ucsc.edu/admin/exe/userApps.v${UCSC_VERSION}.src.tgz -o userApps.v${UCSC_VERSION}.src.tgz &&\ +tar xvf userApps.v${UCSC_VERSION}.src.tgz &&\ +cd userApps/ && \ +make &&\ +cd .. 
&&\ +mv userApps/bin/* /usr/bin/ &&\ +rm -R userApps.v${UCSC_VERSION}.src.tgz &&\ +rm -R userApps diff --git a/src/.docker_modules/ucsc/400/docker_init.sh b/src/.docker_modules/ucsc/400/docker_init.sh new file mode 100755 index 0000000000000000000000000000000000000000..83c2161652164d7ccecaf82b4ca25babde445599 --- /dev/null +++ b/src/.docker_modules/ucsc/400/docker_init.sh @@ -0,0 +1,4 @@ +#!/bin/sh +docker pull lbmc/ucsc:400 +docker build src/.docker_modules/ucsc/400/ -t 'lbmc/ucsc:400' +docker push lbmc/ucsc:400 diff --git a/src/RNAseq.config b/src/RNAseq.config index cc96d3ab627b41e6983a96e3a6765d754b9d9ae3..214ff793cdf70f1d8bd5f38691edeb7263245b31 100644 --- a/src/RNAseq.config +++ b/src/RNAseq.config @@ -1,85 +1,170 @@ profiles { - sge { + psmn { + singularity.enabled = true + singularity.cacheDir = "$baseDir/.singularity_psmn/" + singularity.runOptions = "--bind /Xnfs,/scratch" process{ - withName: trimming { - beforeScript = "source $baseDir/.conda_psmn.sh" - conda = "$baseDir/.conda_envs/cutadapt_2.4" + withName: fastqc_raw { + container = "lbmc/fastqc:0.11.5" executor = "sge" clusterOptions = "-cwd -V" - cpus = 1 + cpus = 4 + penv = 'openmp4' memory = "20GB" time = "12h" - queue = 'CLG6242deb384A,CLG6242deb384C,CLG5218deb192A,CLG5218deb192B,CLG5218deb192C,CLG5218deb192D,SLG5118deb96,SLG6142deb384A,SLG6142deb384B,SLG6142deb384C,SLG6142deb384D' + queue = 'CLG6242deb384A,CLG6242deb384C,CLG5218deb192A,CLG5218deb192B,CLG5218deb192C,CLG5218deb192D,SLG6142deb384A,SLG6142deb384B,SLG6142deb384C,SLG6142deb384D' + } + withName: trimming { + container = "lbmc/cutadapt:2.1" + executor = "sge" + clusterOptions = "-cwd -V" + cpus = 1 + memory = "20GB" + time = "12h" + queue = 'CLG6242deb384A,CLG6242deb384C,CLG5218deb192A,CLG5218deb192B,CLG5218deb192C,CLG5218deb192D,SLG6142deb384A,SLG6142deb384B,SLG6142deb384C,SLG6142deb384D' + } + withName: fastqc_cut { + container = "lbmc/fastqc:0.11.5" + executor = "sge" + clusterOptions = "-cwd -V" + cpus = 4 + penv = 'openmp4' + 
memory = "20GB" + time = "12h" + queue = 'CLG6242deb384A,CLG6242deb384C,CLG5218deb192A,CLG5218deb192B,CLG5218deb192C,CLG5218deb192D,SLG6142deb384A,SLG6142deb384B,SLG6142deb384C,SLG6142deb384D' } withName: rRNA_removal { - beforeScript = "source $baseDir/.conda_psmn.sh" - conda = "$baseDir/.conda_envs/bowtie2_2.3.4.1" - executor = "sge" - clusterOptions = "-cwd -V" - queue = 'CLG6242deb384A,CLG6242deb384C,CLG5218deb192A,CLG5218deb192B,CLG5218deb192C,CLG5218deb192D,SLG5118deb96,SLG6142deb384A,SLG6142deb384B,SLG6142deb384C,SLG6142deb384D' - penv = 'openmp8' - cpus = 8 + container = "lbmc/bowtie2:2.3.4.1" + executor = "sge" + clusterOptions = "-cwd -V" + queue = 'CLG6242deb384A,CLG6242deb384C,CLG5218deb192A,CLG5218deb192B,CLG5218deb192C,CLG5218deb192D,SLG6142deb384A,SLG6142deb384B,SLG6142deb384C,SLG6142deb384D' + penv = 'openmp8' + cpus = 8 } - withName: hisat2_human { - beforeScript = "source $baseDir/.conda_psmn.sh" - conda = "$baseDir/.conda_envs/hisat2_2.1.0" - executor = "sge" - clusterOptions = "-cwd -V" - memory = "20GB" - cpus = 16 - penv = 'openmp16' - time = "12h" - queue = 'CLG6242deb384A,CLG6242deb384C,CLG5218deb192A,CLG5218deb192B,CLG5218deb192C,CLG5218deb192D,SLG5118deb96,SLG6142deb384A,SLG6142deb384B,SLG6142deb384C,SLG6142deb384D' - penv = 'openmp16' + withName: fastqc_filter { + container = "lbmc/fastqc:0.11.5" + executor = "sge" + clusterOptions = "-cwd -V" + cpus = 4 + penv = 'openmp4' + memory = "20GB" + time = "12h" + queue = 'CLG6242deb384A,CLG6242deb384C,CLG5218deb192A,CLG5218deb192B,CLG5218deb192C,CLG5218deb192D,SLG6142deb384A,SLG6142deb384B,SLG6142deb384C,SLG6142deb384D' + } + withName: hisat2_genome { + container = "lbmc/hisat2:2.1.0" + executor = "sge" + clusterOptions = "-cwd -V" + memory = "20GB" + cpus = 16 + penv = 'openmp16' + time = "12h" + queue = 'CLG6242deb384A,CLG6242deb384C,CLG5218deb192A,CLG5218deb192B,CLG5218deb192C,CLG5218deb192D,SLG6142deb384A,SLG6142deb384B,SLG6142deb384C,SLG6142deb384D' + penv = 'openmp16' + } + withName: 
fastqc_genome { + container = "lbmc/fastqc:0.11.5" + executor = "sge" + clusterOptions = "-cwd -V" + cpus = 4 + penv = 'openmp4' + memory = "20GB" + time = "12h" + queue = 'CLG6242deb384A,CLG6242deb384C,CLG5218deb192A,CLG5218deb192B,CLG5218deb192C,CLG5218deb192D,SLG6142deb384A,SLG6142deb384B,SLG6142deb384C,SLG6142deb384D' + } + withName: dedup_genome { + container = "lbmc/umi_tools:1.0.0" + executor = "sge" + clusterOptions = "-cwd -V" + cpus = 1 + memory = "20GB" + time = "12h" + queue = 'CLG6242deb384A,CLG6242deb384C,CLG5218deb192A,CLG5218deb192B,CLG5218deb192C,CLG5218deb192D,SLG6142deb384A,SLG6142deb384B,SLG6142deb384C,SLG6142deb384D' + } + withName: dedup_postgenome { + container = "lbmc/umi_tools:1.0.0" + executor = "sge" + clusterOptions = "-cwd -V" + cpus = 1 + memory = "20GB" + time = "12h" + queue = 'CLG6242deb384A,CLG6242deb384C,CLG5218deb192A,CLG5218deb192B,CLG5218deb192C,CLG5218deb192D,SLG6142deb384A,SLG6142deb384B,SLG6142deb384C,SLG6142deb384D' } withName: sort_bam { - beforeScript = "source $baseDir/.conda_psmn.sh" - conda = "$baseDir/.conda_envs/hisat2_2.1.0" - executor = "sge" - clusterOptions = "-cwd -V" - cpus = 1 - memory = "20GB" - time = "12h" - queue = 'CLG6242deb384A,CLG6242deb384C,CLG5218deb192A,CLG5218deb192B,CLG5218deb192C,CLG5218deb192D,SLG5118deb96,SLG6142deb384A,SLG6142deb384B,SLG6142deb384C,SLG6142deb384D' + container = "lbmc/hisat2:2.1.0" + executor = "sge" + clusterOptions = "-cwd -V" + cpus = 8 + penv = 'openmp8' + memory = "20GB" + time = "12h" + queue = 'CLG6242deb384A,CLG6242deb384C,CLG5218deb192A,CLG5218deb192B,CLG5218deb192C,CLG5218deb192D,SLG6142deb384A,SLG6142deb384B,SLG6142deb384C,SLG6142deb384D' } withName: counting { - beforeScript = "source /usr/share/lmod/lmod/init/bash; module use ~/privatemodules; module purge; module load htseq/0.11.2" - executor = "sge" - clusterOptions = "-cwd -V" - cpus = 1 - memory = "20GB" - time = "12h" - queue = 
'CLG6242deb384A,CLG6242deb384C,CLG5218deb192A,CLG5218deb192B,CLG5218deb192C,CLG5218deb192D,SLG5118deb96,SLG6142deb384A,SLG6142deb384B,SLG6142deb384C,SLG6142deb384D' + container = "lbmc/htseq:0.11.2" + executor = "sge" + clusterOptions = "-cwd -V" + cpus = 1 + memory = "20GB" + time = "12h" + queue = 'CLG6242deb384A,CLG6242deb384C,CLG5218deb192A,CLG5218deb192B,CLG5218deb192C,CLG5218deb192D,SLG6142deb384A,SLG6142deb384B,SLG6142deb384C,SLG6142deb384D' + } + withName: multiqc { + container = "ewels/multiqc:1.9" + executor = "sge" + clusterOptions = "-cwd -V" + cpus = 1 + memory = "20GB" + time = "12h" + queue = 'CLG6242deb384A,CLG6242deb384C,CLG5218deb192A,CLG5218deb192B,CLG5218deb192C,CLG5218deb192D,SLG6142deb384A,SLG6142deb384B,SLG6142deb384C,SLG6142deb384D' + } + withName: rnaseq_qc { + container = "gcr.io/broad-cga-aarong-gtex/rnaseqc:2.3.6" + executor = "sge" + clusterOptions = "-cwd -V" + cpus = 1 + memory = "20GB" + time = "12h" + queue = 'CLG6242deb384A,CLG6242deb384C,CLG5218deb192A,CLG5218deb192B,CLG5218deb192C,CLG5218deb192D,SLG6142deb384A,SLG6142deb384B,SLG6142deb384C,SLG6142deb384D' } - withName: coverage{ - beforeScript = "source /usr/share/lmod/lmod/init/bash; module use ~/privatemodules" - module = "deeptools/3.0.2" - executor = "sge" - clusterOptions = "-cwd -V" - cpus = 16 - memory = "30GB" - time = "24h" - penv = 'openmp16' - queue = 'CLG6242deb384A,CLG6242deb384C,CLG5218deb192A,CLG5218deb192B,CLG5218deb192C,CLG5218deb192D,SLG5118deb96,SLG6142deb384A,SLG6142deb384B,SLG6142deb384C,SLG6142deb384D' - } } } docker { docker.temp = 'auto' docker.enabled = true process { + withName: fastqc_raw { + container = "lbmc/fastqc:0.11.5" + cpus = 4 + } + withName: fastqc_cut { + container = "lbmc/fastqc:0.11.5" + cpus = 4 + } + withName: fastqc_filter { + container = "lbmc/fastqc:0.11.5" + cpus = 4 + } withName: adaptor_removal { container = "lbmc/cutadapt:2.1" - cpus = 1 + cpus = 4 } withName: rRNA_removal { container = "lbmc/bowtie2:2.3.4.1" - cpus = 4 + cpus = 8 
} - withName: hisat2_human { - cpus = 4 + withName: hisat2_genome { + cpus = 8 container = "lbmc/hisat2:2.1.0" } + withName: fastqc_genome { + container = "lbmc/fastqc:0.11.5" + cpus = 4 + } + withName: dedup_genome { + container = "lbmc/umi_tools:1.0.0" + cpus = 1 + } withName: sort_bam { container = "lbmc/samtools:1.7" cpus = 1 @@ -88,6 +173,30 @@ profiles { container = "lbmc/htseq:0.11.2" cpus = 1 } + withName: rnaseq_qc { + container = "gcr.io/broad-cga-aarong-gtex/rnaseqc:latest" + cpus = 1 + } + withName: hisat2_postGenomic { + cpus = 4 + container = "lbmc/hisat2:2.1.0" + } + withName: dedup_postgenome { + container = "lbmc/umi_tools:1.0.0" + cpus = 1 + } + withName: fastqc_postgenome { + container = "lbmc/fastqc:0.11.5" + cpus = 4 + } + withName: multiqc { + container = "lbmc/multiqc:1.7" + cpus = 1 + } + withName: coverage_genome { + container = "lbmc/deeptools:3.0.2" + cpus = 8 + } } } } diff --git a/src/RNAseq.nf b/src/RNAseq.nf index bb673adc9c1551d8154586cd34e0e3594c7d04a7..ebe2ecdf33d82b64275882220671bd471dbc541d 100644 --- a/src/RNAseq.nf +++ b/src/RNAseq.nf @@ -2,68 +2,205 @@ * RNAseq Analysis pipeline */ -params.fastq_raw = "data/demultiplexed/*{_R1,_R2}.fastq.gz" -params.output = "results" -params.script_cov = "src/norm_coverage.sh" +////////////////////////////////////////////////////// +// PARAMETERS // +////////////////////////////////////////////////////// -log.info "script for coverage : ${script_cov}" +/////////////////////// +// PARMS : FILE PATH // +/////////////////////// +params.fastq_raw = "data/fastq/*{_R1,_R2}.fastq.gz" +params.output = "results" +params.filter = "data/filter/human_rRNA_tRNA/*.bt2" +params.index_genome = "data/genome/*.ht2" +params.gtf = "data/annotation/*.gtf" +params.gtf_collapse = "data/annotation/*.gtf" +params.index_postgenome = "data/post_genome/*.ht2" + +///////////////////// +// PARMS : OPTIONS // +///////////////////// + +params.do_fastqc = true +params.do_dedup = true +params.do_postgenome = true + 
+/////////////////////// +// LIBRARIES OPTIONS // +/////////////////////// + +params.adaptorR1 = "AGATCGGAAGAGCACACGTCTGAACTCCAGTCA" +params.adaptorR2 = "AGATCGGAAGAGCGTCGTGTAGGGAAAGAGTGT" +params.strand = "FR" + + +////////////// +// LOG INFO // +////////////// + +log.info "input raw : ${params.fastq_raw}" +log.info "outut directory : ${params.output}" +log.info "filter index files : ${params.filter}" +log.info "genome index : ${params.index_genome}" +log.info "gtf file : ${params.gtf}" +log.info "collapsed gtf file for rnaseqc: ${params.gtf_collapse}" +log.info "post-genome index : ${params.index_postgenome}" +log.info "" +log.info "do fastqc ? : ${params.do_fastqc}" +log.info "do deduplication ? : ${params.do_dedup}" +log.info "do post genome alignement ? : ${params.do_postgenome}" +log.info "" + +////////////// +// CHANNELS // +////////////// Channel .fromFilePairs(params.fastq_raw) .ifEmpty { error "Cannot find any file matching: ${params.fastq_raw}" } - .set {fastq_raw_channel} + .into {INPUT_FASTQC_RAW; + INPUT_CUTADAPT} + +Channel + .fromPath( params.filter ) + .ifEmpty { error "Cannot find any index files matching: ${params.filter}" } + .set { FILTER_INDEX } + +Channel + .fromPath ( params.index_genome ) + .ifEmpty { error "Cannot find any index files matching: ${params.index_genome}" } + .set { GENOME_INDEX } + +Channel + .fromPath( params.gtf ) + .ifEmpty { error "Cannot find any gtf file matching: ${params.gtf}" } + .set { GTF_FILE } + +Channel + .fromPath( params.gtf_collapse ) + .ifEmpty { error "Cannot find any gtf file matching: ${params.gtf_collapse}" } + .set { GTF_COLLAPSE } + +Channel + .fromPath ( params.index_postgenome ) + .ifEmpty { error "Cannot find any index files matching: ${params.index_postgenome}" } + .set { POSTGENOME_INDEX } + +////////////////////////////////////////////////////// +// PROCESS // +////////////////////////////////////////////////////// + +//////////////////////////////////////////////////////////////////////// 
+//////////////////////////// PRE PROCESS /////////////////////////////// +//////////////////////////////////////////////////////////////////////// + -/* Trimming by quality */ +///////////////////////// +/* Fastqc of raw input */ +///////////////////////// + +process fastqc_raw { + tag "$file_id" + publishDir "${params.output}/00_fastqc/raw/", mode: 'copy' + + input: + set file_id, file(reads) from INPUT_FASTQC_RAW + + output: + file "*_fastqc.{html,zip}" into OUTPUT_FASTQC_RAW + + when: + params.do_fastqc + + """ + fastqc ${file_id}* -t ${task.cpus} + """ +} + +/////////////////////// +/* Trimming adaptors */ +/////////////////////// process trimming { tag "$file_id" - cpus 4 publishDir "${params.output}/01_cutadapt/", mode: 'copy' echo true input: - set file_id, file(reads) from fastq_raw_channel + set file_id, file(reads) from INPUT_CUTADAPT output: - set file_id, "*cut_{R1,R2}.fastq.gz" into fastq_files_cut - file "*.txt" into rapport_UrQt + set file_id, "*cut_{R1,R2}.fastq.gz" into CUTADAPT_OUTPUT + file "*first_report.txt" into CUTADAPT_LOG + file "*{second,third}_report.txt" into CUTADAPT_LOG_2 + script: """ - cutadapt -a AGATCGGAAGAGCACACGTCTGAACTCCAGTCA -A AGATCGGAAGAGCGTCGTGTAGGGAAAGAGTGT \ - -o ${file_id}_cut_R1.fastq.gz -p ${file_id}_tmp_R2.fastq.gz \ - ${reads[0]} ${reads[1]} > ${file_id}_report.txt - - cutadapt -u -14 -o ${file_id}_cut_R2.fastq.gz ${file_id}_tmp_R2.fastq.gz \ - > ${file_id}_cut_report.txt + cutadapt -j ${task.cpus} \ + -a ${params.adaptorR1} \ + -A ${params.adaptorR2} \ + -o ${file_id}_tmp_R1.fastq.gz \ + -p ${file_id}_tmp_R2.fastq.gz \ + --minimum-length 70 \ + ${reads[0]} ${reads[1]} > ${file_id}_first_report.txt + + cutadapt -j ${task.cpus} \ + -a "A{100}" \ + -o ${file_id}_cut_R1.fastq.gz \ + ${file_id}_tmp_R1.fastq.gz \ + > ${file_id}_second_report.txt + + cutadapt -j ${task.cpus} \ + -u -13 \ + -o ${file_id}_cut_R2.fastq.gz \ + ${file_id}_tmp_R2.fastq.gz \ + > ${file_id}_third_report.txt """ } 
+CUTADAPT_OUTPUT.into{CUTADAPT_OUTPUT_FASTQC; + CUTADAPT_OUTPUT_FILTER} -/* rRNA and tRNA filtering */ +///////////////////////// +/* Fastqc of raw input */ +///////////////////////// + +process fastqc_cut { + tag "$file_id" + publishDir "${params.output}/00_fastqc/cut/", mode: 'copy' -params.indexrRNA = "/Xnfs/lbmcdb/Ricci_team/shared_data/genomes/human_rRNA_tRNA/*.bt2" + input: + set file_id, file(reads) from CUTADAPT_OUTPUT_FASTQC + + output: + file "*.{html,zip}" into OUTPUT_FASTQC_CUT -log.info "index files : ${params.indexrRNA}" + when: + params.do_fastqc -Channel - .fromPath( params.indexrRNA ) - .ifEmpty { error "Cannot find any index files matching: ${params.indexrRNA}" } - .set { rRNA_index_files } + """ + fastqc ${file_id}* -t ${task.cpus} + """ +} + +///////////////////////////// +/* rRNA and tRNA filtering */ +///////////////////////////// process rRNA_removal { tag "$file_id" - cpus 8 publishDir "${params.output}/02_rRNA_depletion/", mode: 'copy' input: - set file_id, file(reads) from fastq_files_cut - file index from rRNA_index_files.toList() + set file_id, file(reads) from CUTADAPT_OUTPUT_FILTER + file index from FILTER_INDEX.toList() output: - set file_id, "*.fastq.gz" into rRNA_removed_files - file "*.txt" into bowtie_report + set file_id, "*.fastq.gz" into FILTER_FASTQ + set file_id, "*.bam*" into FILTER_BAM + file "*.{txt,stats}" into FILTER_LOG script: index_id = index[0] @@ -74,40 +211,68 @@ process rRNA_removal { } """ bowtie2 --sensitive -p ${task.cpus} -x ${index_id} \ --1 ${reads[0]} -2 ${reads[1]} --un-conc-gz ${file_id}_R%.fastq.gz 2> \ -${file_id}_bowtie2_report.txt > /dev/null - -if grep -q "Error " ${file_id}_bowtie2_report.txt; then +-1 ${reads[0]} -2 ${reads[1]} --no-unal \ +--un-conc-gz ${file_id}_R%.fastq.gz 2> \ +${file_id}_filter.txt | samtools view -bS - \ +| samtools sort -@ ${task.cpus} -o ${file_id}.filter.bam \ + && samtools index ${file_id}.filter.bam \ + && samtools idxstats ${file_id}.filter.bam > \ + 
${file_id}.filter.stats + +if grep -q "Error " ${file_id}_filter.txt; then exit 1 fi """ } +FILTER_FASTQ.into{FILTER_FASTQ_FASTQC; + FILTER_FASTQ_HISAT; + FILTER_FASTQ_POSTGENOME + } -/* mapping against human genome with hisat2 */ +///////////////////////////// +/* Fastqc of filtred reads */ +///////////////////////////// -params.index_hg38 = "/media/adminmanu/Stockage/HISAT2_index_hg38_tran/*.ht2" +process fastqc_filter { + tag "$file_id" + publishDir "${params.output}/00_fastqc/filter/", mode: 'copy' -log.info "index : ${params.index_hg38}" + input: + set file_id, file(reads) from FILTER_FASTQ_FASTQC + output: + set file_id, "*.{html,zip}" into OUTPUT_FASTQC_FILTER -Channel - .fromPath ( params.index_hg38 ) - .ifEmpty { error "Cannot find any index files matching: ${params.index_hg38}" } - .set { index_file_hg38 } + when: + params.do_fastqc + + """ + fastqc ${file_id}* -t ${task.cpus} + """ +} -process hisat2_human { + +/////////////////////////////////////////////////////////////////// +//////////////////////////// GENOME /////////////////////////////// +/////////////////////////////////////////////////////////////////// + + +/////////////////////////////////////////////// +/* mapping against human genome with hisat2 */ +/////////////////////////////////////////////// + +process hisat2_genome { tag "$file_id" publishDir "${params.output}/03_hisat2/", mode: 'copy' - errorStrategy 'finish' input: - set file_id, file(fastq_filtred) from rRNA_removed_files - file index from index_file_hg38.toList() + set file_id, file(fastq_filtred) from FILTER_FASTQ_HISAT + file index from GENOME_INDEX.toList() output: - file "*.fastq.gz" into reads_non_aligned_hg38 - set file_id, "*_sorted.{bam,bam.bai}" into sorted_bam_files - file "*.txt" into hisat_report + set file_id, "*_notaligned_R{1,2}.fastq.gz" into HISAT_UNALIGNED + set file_id, "*.{bam,bam.bai}" into HISAT_ALIGNED + file "*.txt" into HISAT_LOG script: index_id = index[0] @@ -117,33 +282,98 @@ process hisat2_human { } } 
""" -hisat2 -x ${index_id} -p ${task.cpus} \ --1 ${fastq_filtred[0]} -2 ${fastq_filtred[1]} \ ---un-conc-gz ${file_id}_notaligned_R%.fastq.gz \ ---rna-strandness 'F' 2> ${file_id}_hisat2_hg38.txt | samtools view -bS -F 4 -o ${file_id}.bam - -if grep -q "ERR" ${file_id}_hisat2_hg38.txt; then +hisat2 -x ${index_id} \ + -p ${task.cpus} \ + -1 ${fastq_filtred[0]} \ + -2 ${fastq_filtred[1]} \ + --un-conc-gz ${file_id}_notaligned_R%.fastq.gz \ + --rna-strandness ${params.strand} \ + --dta \ + --no-softclip\ + --trim3 1\ + --trim5 1\ + 2> ${file_id}_genome.txt \ +| samtools view -bS -F 4 - \ +| samtools sort -@ ${task.cpus} -o ${file_id}.bam \ +&& samtools index ${file_id}.bam + +if grep -q "ERR" ${file_id}.txt; then exit 1 fi +""" +} -samtools sort -@ ${task.cpus} -O BAM -o ${file_id}_sorted.bam ${file_id}.bam -samtools index ${file_id}_sorted.bam +HISAT_ALIGNED.into{HISAT_ALIGNED_FASTQC; + HISAT_ALIGNED_DEDUP} -""" +//////////////////////////// +/* Fastqc of genome reads */ +//////////////////////////// + +process fastqc_genome { + tag "$file_id" + publishDir "${params.output}/00_fastqc/genome/", mode: 'copy' + + input: + set file_id, file(reads) from HISAT_ALIGNED_FASTQC + + output: + set file_id, "*.{html,zip}" into OUTPUT_FASTQC_GENOME + + when: + params.do_fastqc + + """ + fastqc ${file_id}* -t ${task.cpus} + """ } -sorted_bam_files.into{sorted_bam_htseq; sorted_bam_coverage} +//////////////////////////// +/* Deduplication of reads */ +//////////////////////////// + +if (params.do_dedup) { + process dedup_genome { + tag "$file_id" + publishDir "${params.output}/03_hisat2/dedup/", mode: 'copy' + + input: + set file_id, file(bam) from HISAT_ALIGNED_DEDUP -/* HTseq */ + output: + set file_id, "*.{bam, bai}" into DEDUP_GENOME + file "*.log" into DEDUP_LOG + + when: + params.do_dedup + + """ + umi_tools dedup -I ${bam[0]} \ + -S ${file_id}.dedup.bam \ + --paired \ + 2> ${file_id}_dedup.log + """ + } +} else { + HISAT_ALIGNED_DEDUP.set{DEDUP_GENOME} +} + 
+DEDUP_GENOME.into{DEDUP_GENOME_HTSEQ; + DEDUP_GENOME_RNASEQC + } + +/////////// +/* HTseq */ +/////////// process sort_bam { tag "$file_id" input: - set file_id, file(bam) from sorted_bam_htseq + set file_id, file(bam) from DEDUP_GENOME_HTSEQ output: - set file_id, "*_htseq.bam" into sorted_bam_files_2 + set file_id, "*_htseq.bam" into SORTED_NAME_GENOME script: """ @@ -151,24 +381,16 @@ samtools sort -@ ${task.cpus} -n -O BAM -o ${file_id}_htseq.bam ${bam[0]} """ } -params.gtf = "$baseDir/data/annotation/*.gtf" -log.info "gtf files : ${params.gtf}" - -Channel - .fromPath( params.gtf ) - .ifEmpty { error "Cannot find any gtf file matching: ${params.gtf}" } - .set { gtf_file } - process counting { tag "$file_id" publishDir "${params.output}/04_HTseq/", mode: 'copy' input: - set file_id, file(bam) from sorted_bam_files_2 - file gtf from gtf_file.toList() + set file_id, file(bam) from SORTED_NAME_GENOME + file gtf from GTF_FILE.toList() output: - file "*.count" into count_files + file "*.count" into HTSEQ_COUNT script: """ @@ -188,32 +410,167 @@ htseq-count ${bam[0]} ${gtf} \ -t exon \ -i gene_id \ -f bam \ -> ${file_id}_exon.count +> ${file_id}.count """ } -Channel - .fromFilePairs(params.script_cov) - .ifEmpty { error "Cannot find any file matching: ${params.script_cov}" } - .set {script_channel} +/////////////// +/* RNASEQ QC */ +/////////////// + +process rnaseq_qc { + tag "$file_id" + publishDir "${params.output}/06_RNAseqQC/", mode: 'copy' + + input: + set file_id, file(bam) from DEDUP_GENOME_RNASEQC + file (gtf) from GTF_COLLAPSE.collect() + + output: + file "*" into RNASEQC_OUTPUT -process coverage { + script: +""" +rnaseqc ${gtf} ${bam[0]} -s ${file_id} ./ \ +--stranded 'FR' +""" +} + +//////////////////////////////////////////////////////////////////////// +//////////////////////////// POST GENOME /////////////////////////////// +//////////////////////////////////////////////////////////////////////// + +///////////////////////// +/* HISAT2 POST_GENOMIC */ 
+///////////////////////// + +process hisat2_postGenomic { tag "$file_id" - publishDir "${params.output}/05_coverage/", mode: 'copy' + publishDir "${params.output}/05_post_genome_hisat2/", mode: 'copy' + + input: + set file_id, file(fastq_unaligned) from FILTER_FASTQ_POSTGENOME + file index2 from POSTGENOME_INDEX.collect() + + output: + set file_id, "*.{bam,bam.bai}" into POSTGENOME_ALIGNED + file "*_postgenome.txt" into POSTGENOME_LOG + + when: + params.do_postgenome + + script: + index2_id = index2[0] + for (index2_file in index2) { + if (index2_file =~ /.*\.1\.ht2/ && !(index2_file =~ /.*\.rev\.1\.ht2/)) { + index2_id = ( index2_file =~ /(.*)\.1\.ht2/)[0][1] + } + } +""" +hisat2 -x ${index2_id} \ + -p ${task.cpus} \ + -1 ${fastq_unaligned[0]} \ + -2 ${fastq_unaligned[1]} \ + --rna-strandness ${params.strand} \ + --dta\ + --no-softclip\ + --trim3 1\ + --trim5 1\ + 2> ${file_id}_postgenome.txt \ +| samtools view -bS -F 4 - \ +| samtools sort -@ ${task.cpus} -o ${file_id}.bam \ +&& samtools index ${file_id}.bam + +if grep -q "ERR" ${file_id}_postgenome.txt; then + exit 1 +fi +""" +} + +POSTGENOME_ALIGNED.into{POSTGENOME_ALIGNED_FASTQC; + POSTGENOME_ALIGNED_DEDUP} + +//////////////////////////// +/* Deduplication of reads */ +//////////////////////////// +if (params.do_dedup) { + process dedup_postgenome { + tag "$file_id" + publishDir "${params.output}/05_post_genome_hisat2/dedup/", mode: 'copy' + + input: + set file_id, file(bam) from POSTGENOME_ALIGNED_DEDUP + + output: + set file_id, "*.{bam, bai}" into DEDUP_POSTGENOME + file "*.log" into DEDUP_POSTGENOME_LOG + + when: + params.do_dedup + + """ + umi_tools dedup -I ${bam[0]} \ + -S ${file_id}.dedup.bam \ + --paired \ + 2> ${file_id}_dedup.log + """ + } +} else { + +} + +process fastqc_postgenome { + tag "$file_id" + publishDir "${params.output}/00_fastqc/postgenome/", mode: 'copy' + + input: + set file_id, file(reads) from POSTGENOME_ALIGNED_FASTQC + + output: + set file_id, "*.{html,zip}" into 
OUTPUT_FASTQC_POSTGENOME + + when: + params.do_fastqc + + """ + fastqc ${file_id}* -t ${task.cpus} + """ +} + +//////////////////////////////////////////////////////////////////////// +//////////////////////////// POST PROCESS ////////////////////////////// +//////////////////////////////////////////////////////////////////////// + + +///////////// +/* MultiQC */ +///////////// + + +process multiqc { + tag "multiqc" + publishDir "${params.output}/multiqc", mode: 'copy' input: - set file_id, file(bam) from sorted_bam_coverage - set script from script_channel.collect() + file ('fastqc/*') from OUTPUT_FASTQC_RAW.collect().ifEmpty([]) + file ('*') from CUTADAPT_LOG.collect().ifEmpty([]) + file ('fastqc/*') from OUTPUT_FASTQC_CUT.collect().ifEmpty([]) + file ('*') from FILTER_LOG.collect().ifEmpty([]) + file ('fastqc/*') from OUTPUT_FASTQC_FILTER.collect().ifEmpty([]) + file ('*') from HISAT_LOG.collect().ifEmpty([]) + file ('fastqc/*') from OUTPUT_FASTQC_GENOME.collect().ifEmpty([]) + file ('*') from HTSEQ_COUNT.collect().ifEmpty([]) + file ('*') from POSTGENOME_LOG.collect().ifEmpty([]) + file ('fastqc/*') from OUTPUT_FASTQC_POSTGENOME.collect().ifEmpty([]) + file ('*') from RNASEQC_OUTPUT.collect().ifEmpty([]) output: - file "*.bw" into coverage_files + file "multiqc_report.html" into multiqc_report + file "multiqc_data" script: """ -bash ${script} -b ${bam} \ - -o {file_id}.bw \ - --binSize 1 \ - -p ${cpus} 8 +multiqc ./ """ } diff --git a/src/RNAseq_illumina.nf b/src/RNAseq_illumina.nf deleted file mode 100644 index 8c70bd8968aff39c48ecd9c3aa05a1bc481d2694..0000000000000000000000000000000000000000 --- a/src/RNAseq_illumina.nf +++ /dev/null @@ -1,218 +0,0 @@ -/* -* RNAseq Analysis pipeline -*/ - -params.fastq_raw = "data/demultiplexed/*{_R1,_R2}.fastq.gz" -params.output = "results" -params.script_cov = "src/norm_coverage.sh" - -log.info "script for coverage : ${params.script_cov}" - - -Channel - .fromFilePairs(params.fastq_raw) - .ifEmpty { error "Cannot find any 
file matching: ${params.fastq_raw}" } - .set {fastq_raw_channel} - -/* Trimming by quality */ - -process trimming { - tag "$file_id" - cpus 4 - publishDir "${params.output}/01_cutadapt/", mode: 'copy' - echo true - - input: - set file_id, file(reads) from fastq_raw_channel - - output: - set file_id, "*cut_{R1,R2}.fastq.gz" into fastq_files_cut - file "*.txt" into rapport_UrQt - - script: - """ - cutadapt -a AGATCGGAAGAGCACACGTCTGAACTCCAGTCA -A AGATCGGAAGAGCGTCGTGTAGGGAAAGAGTGT \ - --minimum-length 50 \ - -o ${file_id}_cut_R1.fastq.gz -p ${file_id}_cut_R2.fastq.gz \ - ${reads[0]} ${reads[1]} > ${file_id}_report.txt - - """ -} - - -/* rRNA and tRNA filtering */ - -params.indexrRNA = "/Xnfs/lbmcdb/Ricci_team/shared_data/genomes/human_rRNA_tRNA/*.bt2" - -log.info "index files : ${params.indexrRNA}" - -Channel - .fromPath( params.indexrRNA ) - .ifEmpty { error "Cannot find any index files matching: ${params.indexrRNA}" } - .set { rRNA_index_files } - -process rRNA_removal { - tag "$file_id" - cpus 8 - publishDir "${params.output}/02_rRNA_depletion/", mode: 'copy' - - input: - set file_id, file(reads) from fastq_files_cut - file index from rRNA_index_files.toList() - - output: - set file_id, "*.fastq.gz" into rRNA_removed_files - file "*.txt" into bowtie_report - - script: - index_id = index[0] - for (index_file in index) { - if (index_file =~ /.*\.1\.bt2/ && !(index_file =~ /.*\.rev\.1\.bt2/)) { - index_id = ( index_file =~ /(.*)\.1\.bt2/)[0][1] - } - } -""" -bowtie2 --sensitive -p ${task.cpus} -x ${index_id} \ --1 ${reads[0]} -2 ${reads[1]} --un-conc-gz ${file_id}_R%.fastq.gz 2> \ -${file_id}_bowtie2_report.txt > /dev/null - -if grep -q "Error " ${file_id}_bowtie2_report.txt; then - exit 1 -fi -""" -} - -/* mapping against human genome with hisat2 */ - -params.index_hg38 = "/media/adminmanu/Stockage/HISAT2_index_hg38_tran/*.ht2" - -log.info "index : ${params.index_hg38}" - - -Channel - .fromPath ( params.index_hg38 ) - .ifEmpty { error "Cannot find any index files 
matching: ${params.index_hg38}" } - .set { index_file_hg38 } - -process hisat2_human { - tag "$file_id" - publishDir "${params.output}/03_hisat2/", mode: 'copy' - errorStrategy 'finish' - - input: - set file_id, file(fastq_filtred) from rRNA_removed_files - file index from index_file_hg38.toList() - - output: - file "*.fastq.gz" into reads_non_aligned_hg38 - set file_id, "*_sorted.{bam,bam.bai}" into sorted_bam_files - file "*.txt" into hisat_report - - script: - index_id = index[0] - for (index_file in index) { - if (index_file =~ /.*\.1\.ht2/ && !(index_file =~ /.*\.rev\.1\.ht2/)) { - index_id = ( index_file =~ /(.*)\.1\.ht2/)[0][1] - } - } -""" -hisat2 -x ${index_id} -p ${task.cpus} \ --1 ${fastq_filtred[0]} -2 ${fastq_filtred[1]} \ ---un-conc-gz ${file_id}_notaligned_R%.fastq.gz \ ---rna-strandness 'F' 2> ${file_id}_hisat2_hg38.txt | samtools view -bS -F 4 -o ${file_id}.bam - -if grep -q "ERR" ${file_id}_hisat2_hg38.txt; then - exit 1 -fi - -samtools sort -@ ${task.cpus} -O BAM -o ${file_id}_sorted.bam ${file_id}.bam -samtools index ${file_id}_sorted.bam - -""" -} - -sorted_bam_files.into{sorted_bam_htseq; sorted_bam_coverage} - -/* HTseq */ - -process sort_bam { - tag "$file_id" - - input: - set file_id, file(bam) from sorted_bam_htseq - - output: - set file_id, "*_htseq.bam" into sorted_bam_files_2 - - script: -""" -samtools sort -@ ${task.cpus} -n -O BAM -o ${file_id}_htseq.bam ${bam[0]} -""" -} - -params.gtf = "$baseDir/data/annotation/*.gtf" -log.info "gtf files : ${params.gtf}" - -Channel - .fromPath( params.gtf ) - .ifEmpty { error "Cannot find any gtf file matching: ${params.gtf}" } - .set { gtf_file } - -process counting { - tag "$file_id" - publishDir "${params.output}/04_HTseq/", mode: 'copy' - - input: - set file_id, file(bam) from sorted_bam_files_2 - file gtf from gtf_file.toList() - - output: - file "*.count" into count_files - - script: -""" -htseq-count ${bam[0]} ${gtf} \ - --mode=intersection-nonempty \ - -a 10 \ - -s yes \ - -t CDS \ - -i 
gene_id \ - -f bam \ -> ${file_id}_CDS.count - -htseq-count ${bam[0]} ${gtf} \ - --mode=intersection-nonempty \ - -a 10 \ - -s yes \ - -t exon \ - -i gene_id \ - -f bam \ -> ${file_id}_exon.count - -""" -} - -Channel - .fromPath(params.script_cov) - .ifEmpty { error "Cannot find any file matching: ${params.script_cov}" } - .set {script_channel} - -process coverage { - tag "$file_id" - publishDir "${params.output}/05_coverage/", mode: 'copy' - - input: - set file_id, file(bam) from sorted_bam_coverage - set script from script_channel.collect() - - output: - file "*.bw" into coverage_files - - script: -""" -bash ${script} -b ${bam[0]} \ - -o ${file_id}.bw \ - --binSize 1 \ - -p ${task.cpus} -""" -} diff --git a/src/RibosomeProfiling.nf b/src/RibosomeProfiling.nf index faeb37f2402504642be76281ab7a57462073c3a0..48ce6e55c45388e684f6a5efb984f197dd5e3600 100644 --- a/src/RibosomeProfiling.nf +++ b/src/RibosomeProfiling.nf @@ -1,58 +1,194 @@ /* -* RibosomeProfiling Analysis pipeline +* RNAseq Analysis pipeline */ -/* Trimming */ +////////////////////////////////////////////////////// +// PARAMETERS // +////////////////////////////////////////////////////// + +/////////////////////// +// PARMS : FILE PATH // +/////////////////////// + +params.fastq_raw = "data/fastq/*.fastq.gz" params.output = "results" -params.fastq_raw = "${params.output}/00_demultiplexing/*.fastq.gz" +params.filter = "data/filter/human_rRNA_tRNA/*.bt2" +params.index_genome = "data/genome/*.ht2" +params.gtf = "data/annotation/*.gtf" +params.gtf_collapse = "data/annotation/*.gtf" +params.index_postgenome = "data/post_genome/*.ht2" + +///////////////////// +// PARMS : OPTIONS // +///////////////////// + +params.do_fastqc = true +params.do_dedup = true +params.do_postgenome = true + +/////////////////////// +// LIBRARIES OPTIONS // +/////////////////////// + +params.adaptorR1 = "AGATCGGAAGAGCACACGTCTGAACTCCAGTCA" +params.strand = "F" + + +////////////// +// LOG INFO // +////////////// + +log.info "input raw 
: ${params.fastq_raw}" +log.info "outut directory : ${params.output}" +log.info "filter index files : ${params.filter}" +log.info "genome index : ${params.index_genome}" +log.info "gtf file : ${params.gtf}" +log.info "collapsed gtf file for rnaseqc: ${params.gtf_collapse}" +log.info "post-genome index : ${params.index_postgenome}" +log.info "" +log.info "adaptor sequence : ${params.adaptorR1}" +log.info "strand of the library : ${params.strand}" +log.info "" +log.info "do fastqc ? : ${params.do_fastqc}" +log.info "do deduplication ? : ${params.do_dedup}" +log.info "do post genome alignement ? : ${params.do_postgenome}" +log.info "" + +////////////// +// CHANNELS // +////////////// + +Channel + .fromPath(params.fastq_raw) + .ifEmpty { error "Cannot find any file matching: ${params.fastq_raw}" } + .map { it -> [(it.baseName =~ /([^\.]*)/)[0][1], it]} + .into {INPUT_FASTQC_RAW; + INPUT_CUTADAPT} + +Channel + .fromPath( params.filter ) + .ifEmpty { error "Cannot find any index files matching: ${params.filter}" } + .set { FILTER_INDEX } + +Channel + .fromPath ( params.index_genome ) + .ifEmpty { error "Cannot find any index files matching: ${params.index_genome}" } + .set { GENOME_INDEX } + +Channel + .fromPath( params.gtf ) + .ifEmpty { error "Cannot find any gtf file matching: ${params.gtf}" } + .set { GTF_FILE } + +Channel + .fromPath( params.gtf_collapse ) + .ifEmpty { error "Cannot find any gtf file matching: ${params.gtf_collapse}" } + .set { GTF_COLLAPSE } Channel - .fromPath( params.fastq_raw ) - .ifEmpty { error "Cannot find any files matching: ${params.fastq_raw}" } - .map { it -> [(it.baseName =~ /([^\.]*)/)[0][1], it]} - .set { fastq_raw_flow } + .fromPath ( params.index_postgenome ) + .ifEmpty { error "Cannot find any index files matching: ${params.index_postgenome}" } + .set { POSTGENOME_INDEX } + +////////////////////////////////////////////////////// +// PROCESS // +////////////////////////////////////////////////////// + 
+//////////////////////////////////////////////////////////////////////// +//////////////////////////// PRE PROCESS /////////////////////////////// +//////////////////////////////////////////////////////////////////////// +///////////////////////// +/* Fastqc of raw input */ +///////////////////////// + +process fastqc_raw { + tag "$file_id" + publishDir "${params.output}/00_fastqc/raw/", mode: 'copy' + + input: + set file_id, file(reads) from INPUT_FASTQC_RAW + + output: + file "*_fastqc.{html,zip}" into OUTPUT_FASTQC_RAW + + when: + params.do_fastqc + + """ + fastqc ${file_id}* -t ${task.cpus} + """ +} + +/////////////////////// +/* Trimming adaptors */ +/////////////////////// + process trimming { tag "$file_id" - publishDir "${params.output}/01_trimming/", mode: 'copy' + publishDir "${params.output}/01_cutadapt/", mode: 'copy' + echo true input: - set file_id, file(fastq_raw) from fastq_raw_flow + set file_id, file(reads) from INPUT_CUTADAPT output: - set file_id, "*_cut.fastq.gz" into fastq_trim_filt - file "*.txt" into log_trim + set file_id, "*cut.fastq.gz" into CUTADAPT_OUTPUT + file "*first_report.txt" into CUTADAPT_LOG + script: """ - cutadapt -a AGATCGGAAGAGCACACGTCTGAACTCCAGTCAC -m 20\ - -o ${file_id}_cut.fastq.gz \ - ${fastq_raw} > ${file_id}_report.txt + cutadapt -j ${task.cpus} \ + -a ${params.adaptorR1} \ + -o ${file_id}_cut.fastq.gz \ + --minimum-length 15 \ + ${reads[0]} > ${file_id}_first_report.txt """ } -/* rRNA and tRNA filtering */ +CUTADAPT_OUTPUT.into{CUTADAPT_OUTPUT_FASTQC; + CUTADAPT_OUTPUT_FILTER} -params.indexrRNA = "results/human_rRNA_tRNA/*.bt2" -log.info "index files rRNA : ${params.indexrRNA}" +///////////////////////// +/* Fastqc of raw input */ +///////////////////////// -Channel - .fromPath( params.indexrRNA ) - .ifEmpty { error "Cannot find any index files matching: ${params.indexrRNA}" } - .set { rRNA_index_files } +process fastqc_cut { + tag "$file_id" + publishDir "${params.output}/00_fastqc/cut/", mode: 'copy' + + input: + 
set file_id, file(reads) from CUTADAPT_OUTPUT_FASTQC + + output: + file "*.{html,zip}" into OUTPUT_FASTQC_CUT + + when: + params.do_fastqc + + """ + fastqc ${file_id}* -t ${task.cpus} + """ +} + +///////////////////////////// +/* rRNA and tRNA filtering */ +///////////////////////////// process rRNA_removal { tag "$file_id" publishDir "${params.output}/02_rRNA_depletion/", mode: 'copy' input: - set file_id, file(reads) from fastq_trim_filt - file index from rRNA_index_files.toList() + set file_id, file(reads) from CUTADAPT_OUTPUT_FILTER + file index from FILTER_INDEX.toList() output: - set file_id, "*.fastq.gz" into rRNA_removed_reads - file "*.txt" into bowtie_report + set file_id, "*.fastq.gz" into FILTER_FASTQ + set file_id, "*.bam*" into FILTER_BAM + file "*.{txt,stats}" into FILTER_LOG script: index_id = index[0] @@ -62,40 +198,69 @@ process rRNA_removal { } } """ -zcat ${reads} | bowtie2 --sensitive -p ${task.cpus} -x ${index_id} \ --U - --un-gz ${file_id}_mRNA.fastq.gz 2> \ -${file_id}_bowtie2_report.txt > /dev/null - -if grep -q "Error " ${file_id}_bowtie2_report.txt; then +bowtie2 --sensitive -p ${task.cpus} -x ${index_id} \ +-U ${reads[0]} --no-unal \ +--un-gz ${file_id}_filter.fastq.gz 2> \ +${file_id}_filter.txt | samtools view -bS - \ +| samtools sort -@ ${task.cpus} -o ${file_id}.filter.bam \ + && samtools index ${file_id}.filter.bam \ + && samtools idxstats ${file_id}.filter.bam > \ + ${file_id}.filter.stats + +if grep -q "Error " ${file_id}_filter.txt; then exit 1 fi """ } +FILTER_FASTQ.into{FILTER_FASTQ_FASTQC; + FILTER_FASTQ_HISAT; + FILTER_FASTQ_POSTGENOME + } -/* mapping against human genome with hisat2 */ +///////////////////////////// +/* Fastqc of filtred reads */ +///////////////////////////// -params.index_hg38 = "/media/adminmanu/Stockage/HISAT2_index_hg38_tran/*.ht2" +process fastqc_filter { + tag "$file_id" + publishDir "${params.output}/00_fastqc/filter/", mode: 'copy' -log.info "index : ${params.index_hg38}" + input: + set file_id, 
file(reads) from FILTER_FASTQ_FASTQC + output: + set file_id, "*.{html,zip}" into OUTPUT_FASTQC_FILTER -Channel - .fromPath ( params.index_hg38 ) - .ifEmpty { error "Cannot find any index files matching: ${params.index_hg38}" } - .set { index_file_hg38 } + when: + params.do_fastqc + + """ + fastqc ${file_id}* -t ${task.cpus} + """ +} + + +/////////////////////////////////////////////////////////////////// +//////////////////////////// GENOME /////////////////////////////// +/////////////////////////////////////////////////////////////////// -process hisat2_human { - tag "$file_id" +/////////////////////////////////////////////// +/* mapping against human genome with hisat2 */ +/////////////////////////////////////////////// + +process hisat2_genome { + tag "$file_id" + publishDir "${params.output}/03_hisat2/", mode: 'copy' input: - set file_id, file(fastq_filtred) from rRNA_removed_reads - file index from index_file_hg38.toList() + set file_id, file(fastq_filtred) from FILTER_FASTQ_HISAT + file index from GENOME_INDEX.toList() output: - set file_id, "*.fastq.gz" into reads_non_aligned_hg38 - set file_id, "*.bam" into reads_aligned_hg38 - file "*.txt" into hisat_report + set file_id, "*_notaligned.fastq.gz" into HISAT_UNALIGNED + set file_id, "*.{bam,bam.bai}" into HISAT_ALIGNED + file "*.txt" into HISAT_LOG script: index_id = index[0] @@ -105,79 +270,113 @@ process hisat2_human { } } """ -hisat2 -x ${index_id} -p ${task.cpus} \ --U ${fastq_filtred} --un-gz ${file_id}_notaligned.fastq.gz \ ---end-to-end --rna-strandness 'F' \ -2> ${file_id}_hisat2_hg38.txt | samtools view -bS -F 4 -o ${file_id}.bam - -if grep -q "Error " ${file_id}_hisat2_hg38.txt; then +hisat2 -x ${index_id} \ + -p ${task.cpus} \ + -U ${fastq_filtred[0]} \ + --un-gz ${file_id}_notaligned.fastq.gz \ + --rna-strandness ${params.strand} \ + --dta \ + --no-softclip \ + --trim5 1\ + --trim3 1\ + 2> ${file_id}_genome.txt \ +| samtools view -bS -F 4 - \ +| samtools sort -@ ${task.cpus} -o ${file_id}.bam \ 
+&& samtools index ${file_id}.bam + +if grep -q "ERR" ${file_id}.txt; then exit 1 fi """ } -process save_hisat { +HISAT_ALIGNED.into{HISAT_ALIGNED_FASTQC; + HISAT_ALIGNED_DEDUP} + +//////////////////////////// +/* Fastqc of genome reads */ +//////////////////////////// + +process fastqc_genome { tag "$file_id" - publishDir "${params.output}/03_hisat2/", mode: 'copy' + publishDir "${params.output}/00_fastqc/genome/", mode: 'copy' input: - set file_id, file(fastq) from reads_non_aligned_hg38 + set file_id, file(reads) from HISAT_ALIGNED_FASTQC output: - file "*" into saved_hisat + set file_id, "*.{html,zip}" into OUTPUT_FASTQC_GENOME - script: -""" -cat ${fastq} > ${file_id}_nonhuman.fastq.gz -""" + when: + params.do_fastqc + + """ + fastqc ${file_id}* -t ${task.cpus} + """ } -/* sorting */ +//////////////////////////// +/* Deduplication of reads */ +//////////////////////////// + +if (params.do_dedup) { + process dedup_genome { + tag "$file_id" + publishDir "${params.output}/03_hisat2/dedup/", mode: 'copy' + + input: + set file_id, file(bam) from HISAT_ALIGNED_DEDUP + + output: + set file_id, "*.{bam, bai}" into DEDUP_GENOME + file "*.log" into DEDUP_LOG + + when: + params.do_dedup + + """ + umi_tools dedup -I ${bam[0]} \ + -S ${file_id}.dedup.bam \ + > ${file_id}_dedup.log + """ + } +} else { + HISAT_ALIGNED_DEDUP.set{DEDUP_GENOME} +} + +DEDUP_GENOME.into{DEDUP_GENOME_HTSEQ; + DEDUP_GENOME_RNASEQC + } + +/////////// +/* HTseq */ +/////////// -process index_bam { +process sort_bam { tag "$file_id" - publishDir "${params.output}/03_hisat2/", mode: 'copy' input: - set file_id, file(bam) from reads_aligned_hg38 - file report from hisat_report + set file_id, file(bam) from DEDUP_GENOME_HTSEQ output: - set file_id, "*_sorted.{bam,bam.bai}" into sorted_bam_files - file "*.txt" into report_hisat2 + set file_id, "*_htseq.bam" into SORTED_NAME_GENOME script: """ -samtools sort -@ ${task.cpus} -O BAM -o ${file_id}_sorted.bam ${bam} -samtools index ${file_id}_sorted.bam - -cat 
${report} > ${file_id}_hg38_hisat2.txt +samtools sort -@ ${task.cpus} -n -O BAM -o ${file_id}_htseq.bam ${bam[0]} """ } - -/* HTseq */ - -params.gtf = "$baseDir/data/annotation/*.gtf" -log.info "gtf files : ${params.gtf}" - -Channel - .fromPath( params.gtf ) - .ifEmpty { error "Cannot find any gtf file matching: ${params.gtf}" } - .set { gtf_file } - process counting { tag "$file_id" publishDir "${params.output}/04_HTseq/", mode: 'copy' - errorStrategy 'retry' - maxRetries 2 input: - set file_id, file(bam) from sorted_bam_files - file gtf from gtf_file.toList() + set file_id, file(bam) from SORTED_NAME_GENOME + file gtf from GTF_FILE.toList() output: - file "*.count" into count_files + file "*.count" into HTSEQ_COUNT script: """ @@ -187,7 +386,6 @@ htseq-count ${bam[0]} ${gtf} \ -s yes \ -t CDS \ -i gene_id \ - -r pos \ -f bam \ > ${file_id}_CDS.count @@ -197,9 +395,166 @@ htseq-count ${bam[0]} ${gtf} \ -s yes \ -t exon \ -i gene_id \ - -r pos \ -f bam \ -> ${file_id}_exon.count +> ${file_id}.count + +""" +} + +/////////////// +/* RNASEQ QC */ +/////////////// + +process rnaseq_qc { + tag "$file_id" + publishDir "${params.output}/06_RNAseqQC/", mode: 'copy' + + input: + set file_id, file(bam) from DEDUP_GENOME_RNASEQC + file (gtf) from GTF_COLLAPSE.collect() + + output: + file "*" into RNASEQC_OUTPUT + + script: +""" +rnaseqc ${gtf} ${bam[0]} -s ${file_id} ./ +""" +} + +//////////////////////////////////////////////////////////////////////// +//////////////////////////// POST GENOME /////////////////////////////// +//////////////////////////////////////////////////////////////////////// + +///////////////////////// +/* HISAT2 POST_GENOMIC */ +///////////////////////// + +process hisat2_postGenomic { + tag "$file_id" + publishDir "${params.output}/05_post_genome_hisat2/", mode: 'copy' + + input: + set file_id, file(fastq_unaligned) from FILTER_FASTQ_POSTGENOME + file index2 from POSTGENOME_INDEX.collect() + + output: + set file_id, "*.{bam,bam.bai}" into 
POSTGENOME_ALIGNED + file "*_postgenome.txt" into POSTGENOME_LOG + + when: + params.do_postgenome + + script: + index2_id = index2[0] + for (index2_file in index2) { + if (index2_file =~ /.*\.1\.ht2/ && !(index2_file =~ /.*\.rev\.1\.ht2/)) { + index2_id = ( index2_file =~ /(.*)\.1\.ht2/)[0][1] + } + } +""" +hisat2 -x ${index2_id} \ + -p ${task.cpus} \ + -U ${fastq_unaligned[0]} \ + --rna-strandness ${params.strand} \ + --dta\ + --no-softclip\ + --trim3 1\ + --trim5 1\ + 2> ${file_id}_postgenome.txt \ +| samtools view -bS -F 4 -F 256 - \ +| samtools sort -@ ${task.cpus} -o ${file_id}.bam \ +&& samtools index ${file_id}.bam + +if grep -q "ERR" ${file_id}_postgenome.txt; then + exit 1 +fi +""" +} + +POSTGENOME_ALIGNED.into{POSTGENOME_ALIGNED_FASTQC; + POSTGENOME_ALIGNED_DEDUP} + +//////////////////////////// +/* Deduplication of reads */ +//////////////////////////// + +if (params.do_dedup){ + process dedup_postgenome { + tag "$file_id" + publishDir "${params.output}/05_post_genome_hisat2/dedup/", mode: 'copy' + + input: + set file_id, file(bam) from POSTGENOME_ALIGNED_DEDUP + output: + set file_id, "*.{bam, bai}" into DEDUP_POSTGENOME + file "*.log" into DEDUP_POSTGENOME_LOG + + when: + params.do_dedup + + """ + umi_tools dedup -I ${bam[0]} \ + -S ${file_id}.dedup.bam \ + > ${file_id}_dedup.log + """ + } +} else { + +} + +process fastqc_postgenome { + tag "$file_id" + publishDir "${params.output}/00_fastqc/postgenome/", mode: 'copy' + + input: + set file_id, file(reads) from POSTGENOME_ALIGNED_FASTQC + + output: + set file_id, "*.{html,zip}" into OUTPUT_FASTQC_POSTGENOME + + when: + params.do_fastqc + + """ + fastqc ${file_id}* -t ${task.cpus} + """ +} + +//////////////////////////////////////////////////////////////////////// +//////////////////////////// POST PROCESS ////////////////////////////// +//////////////////////////////////////////////////////////////////////// + + +///////////// +/* MultiQC */ +///////////// + + +process multiqc { + tag "multiqc" + 
publishDir "${params.output}/multiqc", mode: 'copy' + + input: + file ('fastqc/*') from OUTPUT_FASTQC_RAW.collect().ifEmpty([]) + file ('*') from CUTADAPT_LOG.collect().ifEmpty([]) + file ('fastqc/*') from OUTPUT_FASTQC_CUT.collect().ifEmpty([]) + file ('*') from FILTER_LOG.collect().ifEmpty([]) + file ('fastqc/*') from OUTPUT_FASTQC_FILTER.collect().ifEmpty([]) + file ('*') from HISAT_LOG.collect().ifEmpty([]) + file ('fastqc/*') from OUTPUT_FASTQC_GENOME.collect().ifEmpty([]) + file ('*') from HTSEQ_COUNT.collect().ifEmpty([]) + file ('*') from POSTGENOME_LOG.collect().ifEmpty([]) + file ('fastqc/*') from OUTPUT_FASTQC_POSTGENOME.collect().ifEmpty([]) + file ('*') from RNASEQC_OUTPUT.collect().ifEmpty([]) + + output: + file "multiqc_report.html" into multiqc_report + file "multiqc_data" + + script: +""" +multiqc ./ """ } diff --git a/src/collapse_annotation.py b/src/collapse_annotation.py new file mode 100644 index 0000000000000000000000000000000000000000..cba9e5ef29309f8da27a07da8a4fa4eb0897c20d --- /dev/null +++ b/src/collapse_annotation.py @@ -0,0 +1,287 @@ +#!/usr/bin/env python3 +# Author: Francois Aguet +import numpy as np +import pandas as pd +from collections import defaultdict +from bx.intervals.intersection import IntervalTree +import argparse +import os +import gzip + + +class Exon: + def __init__(self, exon_id, number, transcript, start_pos, end_pos): + self.id = exon_id + self.number = int(number) + self.transcript = transcript + self.start_pos = start_pos + self.end_pos = end_pos + +class Transcript: + def __init__(self, transcript_id, transcript_name, transcript_type, gene, start_pos, end_pos): + self.id = transcript_id + self.name = transcript_name + self.type = transcript_type + self.gene = gene + self.start_pos = start_pos + self.end_pos = end_pos + self.exons = [] + +class Gene: + def __init__(self, gene_id, gene_name, gene_type, chrom, strand, start_pos, end_pos): + self.id = gene_id + self.name = gene_name + self.biotype = gene_type + 
self.chr = chrom + self.strand = strand + self.start_pos = start_pos + self.end_pos = end_pos + self.transcripts = [] + +class Annotation: + def __init__(self, gtfpath): + """Parse GTF and construct gene/transcript/exon hierarchy""" + + if gtfpath.endswith('.gtf.gz'): + opener = gzip.open(gtfpath, 'rt') + else: + opener = open(gtfpath, 'r') + + self.genes = [] + with opener as gtf: + for row in gtf: + row = row.strip().split('\t') + + if row[0][0]=='#': continue # skip header + + chrom = row[0] + annot_type = row[2] + start_pos = int(row[3]) + end_pos = int(row[4]) + strand = row[6] + + attributes = defaultdict() + for a in row[8].replace('"', '').replace('_biotype', '_type').split(';')[:-1]: + kv = a.strip().split(' ') + if kv[0]!='tag': + attributes[kv[0]] = kv[1] + else: + attributes.setdefault('tags', []).append(kv[1]) + + if annot_type=='gene': + assert 'gene_id' in attributes + if 'gene_name' not in attributes: + attributes['gene_name'] = attributes['gene_id'] + gene_id = attributes['gene_id'] + g = Gene(gene_id, attributes['gene_name'], attributes['gene_type'], chrom, strand, start_pos, end_pos) + g.source = row[1] + g.phase = row[7] + g.attributes_string = row[8].replace('_biotype', '_type') + self.genes.append(g) + + elif annot_type=='transcript': + assert 'transcript_id' in attributes + if 'transcript_name' not in attributes: + attributes['transcript_name'] = attributes['transcript_id'] + transcript_id = attributes['transcript_id'] + t = Transcript(attributes.pop('transcript_id'), attributes.pop('transcript_name'), + attributes.pop('transcript_type'), g, start_pos, end_pos) + t.attributes = attributes + g.transcripts.append(t) + + elif annot_type=='exon': + if 'exon_id' in attributes: + e = Exon(attributes['exon_id'], attributes['exon_number'], t, start_pos, end_pos) + else: + e = Exon(str(len(t.exons)+1), len(t.exons)+1, t, start_pos, end_pos) + t.exons.append(e) + + if np.mod(len(self.genes),1000)==0: + print('Parsing GTF: {0:d} genes 
processed\r'.format(len(self.genes)), end='\r') + print('Parsing GTF: {0:d} genes processed\r'.format(len(self.genes))) + + self.genes = np.array(self.genes) + + +def interval_union(intervals): + """ + Returns the union of all intervals in the input list + intervals: list of tuples or 2-element lists + """ + intervals.sort(key=lambda x: x[0]) + union = [intervals[0]] + for i in intervals[1:]: + if i[0] <= union[-1][1]: # overlap w/ previous + if i[1] > union[-1][1]: # only extend if larger + union[-1][1] = i[1] + else: + union.append(i) + return union + + +def subtract_segment(a, b): + """ + Subtract segment a from segment b, + return 'a' if no overlap + """ + if a[0]>=b[0] and a[0]<=b[1] and a[1]>b[1]: + return (b[1]+1,a[1]) + elif a[0]<b[0] and a[1]>=b[0] and a[1]<=b[1]: + return (a[0], b[0]-1) + elif a[0]<b[0] and a[1]>b[1]: + return [(a[0],b[0]-1), (b[1]+1,a[1])] + elif a[0]>=b[0] and a[1]<=b[1]: + return [] + else: + return a + + +def add_transcript_attributes(attributes_string): + """ + Adds transcript attributes if they were missing + (see https://www.gencodegenes.org/pages/data_format.html) + 'status' fields were dropped in Gencode 26 and later + """ + # GTF specification + if 'gene_status' in attributes_string: + attribute_order = ['gene_id', 'transcript_id', 'gene_type', 'gene_status', 'gene_name', + 'transcript_type', 'transcript_status', 'transcript_name'] + add_list = ['transcript_id', 'transcript_type', 'transcript_status', 'transcript_name'] + else: + attribute_order = ['gene_id', 'transcript_id', 'gene_type', 'gene_name', 'transcript_type', 'transcript_name'] + add_list = ['transcript_id', 'transcript_type', 'transcript_name'] + if 'level' in attributes_string: + attribute_order += ['level'] + + attr = attributes_string.strip(';').split('; ') + req = [] + opt = [] + for k in attr: + if k.split()[0] in attribute_order: + req.append(k) + else: + opt.append(k) + attr_dict = {i.split()[0]:i.split()[1].replace(';','') for i in req} + if 'gene_name' not 
in attr_dict: + attr_dict['gene_name'] = attr_dict['gene_id'] + if 'transcript_id' not in attr_dict: + attr_dict['transcript_id'] = attr_dict['gene_id'] + for k in add_list: + if k not in attr_dict: + attr_dict[k] = attr_dict[k.replace('transcript', 'gene')] + + return '; '.join([k+' '+attr_dict[k] for k in attribute_order] + opt)+';' + + +def collapse_annotation(annot, transcript_gtf, collapsed_gtf, blacklist=set(), collapse_only=False): + """ + Collapse transcripts into a single gene model; remove overlapping intervals + """ + + exclude = set(['retained_intron', 'readthrough_transcript']) + + # 1) 1st pass: collapse each gene, excluding blacklisted transcript types + merged_coord_dict = {} + for g in annot.genes: + exon_coords = [] + for t in g.transcripts: + if (t.id not in blacklist) and (t.type!='retained_intron') and (('tags' not in t.attributes) or len(set(t.attributes['tags']).intersection(exclude))==0): + for e in t.exons: + exon_coords.append([e.start_pos, e.end_pos]) + if exon_coords: + merged_coord_dict[g.id] = interval_union(exon_coords) + + if not collapse_only: + # 2) build interval tree with merged domains + interval_trees = defaultdict() + for g in annot.genes: + if g.id in merged_coord_dict: + for i in merged_coord_dict[g.id]: + # half-open intervals [a,b) + interval_trees.setdefault(g.chr, IntervalTree()).add(i[0], i[1]+1, [i, g.id]) + + # 3) query intervals of each gene, remove overlaps + new_coord_dict = {} + for g in annot.genes: + if g.id in merged_coord_dict: + new_intervals = [] + for i in merged_coord_dict[g.id]: # loop merged exons + ints = interval_trees[g.chr].find(i[0], i[1]+1) + # remove self + ints = [r[0] for r in ints if r[1]!=g.id] + m = set([tuple(i)]) + for v in ints: + m = [subtract_segment(mx, v) for mx in m] + # flatten + m0 = [] + for k in m: + if isinstance(k, tuple): + m0.append(k) + else: + m0.extend(k) + m = m0 + new_intervals.extend(m) + if new_intervals: + new_coord_dict[g.id] = new_intervals + + # 4) remove genes 
containing single-base exons only + for g in annot.genes: + if g.id in new_coord_dict: + exon_lengths = np.array([i[1]-i[0]+1 for i in new_coord_dict[g.id]]) + if np.all(exon_lengths==1): + new_coord_dict.pop(g.id) + else: + new_coord_dict = merged_coord_dict + + # 5) write to GTF + if transcript_gtf.endswith('.gtf.gz'): + opener = gzip.open(transcript_gtf, 'rt') + else: + opener = open(transcript_gtf, 'r') + + with open(collapsed_gtf, 'w') as output_gtf, opener as input_gtf: + # copy header + for line in input_gtf: + if line[:2]=='##' or line[:2]=='#!': + output_gtf.write(line) + comment = line[:2] + else: + break + output_gtf.write(comment+'collapsed version generated by GTEx pipeline\n') + for g in annot.genes: + if g.id in new_coord_dict: + start_pos = str(np.min([i[0] for i in new_coord_dict[g.id]])) + end_pos = str(np.max([i[1] for i in new_coord_dict[g.id]])) + if 'transcript_id' in g.attributes_string: + attr = g.attributes_string + else: + attr = add_transcript_attributes(g.attributes_string) + output_gtf.write('\t'.join([g.chr, g.source, 'gene', start_pos, end_pos, '.', g.strand, g.phase, attr])+'\n') + output_gtf.write('\t'.join([g.chr, g.source, 'transcript', start_pos, end_pos, '.', g.strand, g.phase, attr])+'\n') + if g.strand=='-': + new_coord_dict[g.id] = new_coord_dict[g.id][::-1] + for k,i in enumerate(new_coord_dict[g.id], 1): + output_gtf.write('\t'.join([ + g.chr, g.source, 'exon', str(i[0]), str(i[1]), '.', g.strand, g.phase, + attr+' exon_id "'+g.id+'_{0:d}"; exon_number {0:d};'.format(k)])+'\n') + + +if __name__=='__main__': + + parser = argparse.ArgumentParser(description='Collapse isoforms into single transcript per gene and remove overlapping intervals between genes') + parser.add_argument('transcript_gtf', help='Transcript annotation in GTF format') + parser.add_argument('output_gtf', help='Name of the output file') + parser.add_argument('--transcript_blacklist', help='List of transcripts to exclude (e.g., unannotated readthroughs)') + 
parser.add_argument('--collapse_only', action='store_true', help='') + args = parser.parse_args() + + annotation = Annotation(args.transcript_gtf) + + if args.transcript_blacklist: + blacklist_df = pd.read_csv(args.transcript_blacklist, sep='\t') + blacklist = set(blacklist_df[blacklist_df.columns[0]].values) + else: + blacklist = set() + + print('Collapsing transcripts') + collapse_annotation(annotation, args.transcript_gtf, args.output_gtf, blacklist=blacklist, collapse_only=args.collapse_only) diff --git a/src/nextflow_template.sh b/src/nextflow_template.sh new file mode 100644 index 0000000000000000000000000000000000000000..65397d4cec674635f5844814464d8e1f16210f74 --- /dev/null +++ b/src/nextflow_template.sh @@ -0,0 +1,20 @@ +#! /bin/bash + +set -e + +nextflow src/RNAseq.nf -c src/RNAseq.config \ + -profile psmn\ + -resume\ + --do_fastqc true\ + --do_dedup true\ + --do_postgenome true\ + --adaptorR1 "AGATCGGAAGAGCACACGTCTGAACTCCAGTCA"\ + --adaptorR2 "AGATCGGAAGAGCGTCGTGTAGGGAAAGAGTGT"\ + --strand "FR"\ + --fastq_raw "data/fastq/*{_R1,_R2}_short.fastq.gz"\ + --output "results"\ + --filter "data/filter/*.bt2"\ + --index_genome "data/genome/*.ht2"\ + --gtf "data/annotation/gencode.v28.annotation_v3.gtf"\ + --gtf_collapse "data/annotation/gencode.v28.annotation_v3_collapse.gtf"\ + --index_postgenome "data/post_genome/*.ht2" diff --git a/src/nf_modules/multiqc/multiqc_paired.config b/src/nf_modules/multiqc/multiqc_paired.config index c8fe4a1d9c3cc85287d80a6e578b780178ac6dd6..2d786124c88e7f59cb39fdb1d9dfabb82fa14ba2 100644 --- a/src/nf_modules/multiqc/multiqc_paired.config +++ b/src/nf_modules/multiqc/multiqc_paired.config @@ -34,14 +34,21 @@ profiles { singularity.runOptions = "--bind /Xnfs,/scratch" process{ withName: fastqc_fastq { + container = "lbmc/fastqc:0.11.5" + executor = "sge" + clusterOptions = "-cwd -V" + cpus = 1 + memory = "5GB" + time = "6h" + queue = "monointeldeb128" + } + withName: multiqc { container = "lbmc/multiqc:1.7" executor = "sge" 
clusterOptions = "-cwd -V" cpus = 1 memory = "5GB" time = "6h" - queueSize = 1.70 - pollInterval = "60sec" queue = "monointeldeb128" } } @@ -57,8 +64,7 @@ profiles { stageInMode = "copy" stageOutMode = "rsync" executor = "sge" - clusterOptions = "-P P_lbmc -l os=cl7 -l sps=1 -r n\ - " + clusterOptions = "-P P_lbmc -l os=cl7 -l sps=1 -r n" cpus = 1 queue = "huge" } @@ -68,8 +74,7 @@ profiles { stageInMode = "copy" stageOutMode = "rsync" executor = "sge" - clusterOptions = "-P P_lbmc -l os=cl7 -l sps=1 -r n\ - " + clusterOptions = "-P P_lbmc -l os=cl7 -l sps=1 -r n" cpus = 1 queue = "huge" } diff --git a/src/nf_modules/multiqc/multiqc_single.config b/src/nf_modules/multiqc/multiqc_single.config index 641a65e3d359b581b9860b548fe71bfa55a6a0e7..75f2f9aa8579b4fbf9ce7d99dceba0359e467db5 100644 --- a/src/nf_modules/multiqc/multiqc_single.config +++ b/src/nf_modules/multiqc/multiqc_single.config @@ -33,14 +33,21 @@ profiles { singularity.runOptions = "--bind /Xnfs,/scratch" process{ withName: fastqc_fastq { + container = "lbmc/fastqc:0.11.5" + executor = "sge" + clusterOptions = "-cwd -V" + cpus = 1 + memory = "5GB" + time = "6h" + queue = "monointeldeb128" + } + withName: multiqc { container = "lbmc/multiqc:1.7" executor = "sge" clusterOptions = "-cwd -V" cpus = 1 memory = "5GB" time = "6h" - queueSize = 1.70 - pollInterval = "60sec" queue = "monointeldeb128" } } @@ -56,8 +63,7 @@ profiles { stageInMode = "copy" stageOutMode = "rsync" executor = "sge" - clusterOptions = "-P P_lbmc -l os=cl7 -l sps=1 -r n\ - " + clusterOptions = "-P P_lbmc -l os=cl7 -l sps=1 -r n" cpus = 1 queue = "huge" } @@ -67,8 +73,7 @@ profiles { stageInMode = "copy" stageOutMode = "rsync" executor = "sge" - clusterOptions = "-P P_lbmc -l os=cl7 -l sps=1 -r n\ - " + clusterOptions = "-P P_lbmc -l os=cl7 -l sps=1 -r n" cpus = 1 queue = "huge" } diff --git a/src/norm_coverage.sh b/src/norm_coverage.sh index b5485369626b715bfa02ba86e6cb36b1696af1b3..abb3bcf65bf38117778f27c21da7d651ffb11a28 100644 --- 
a/src/norm_coverage.sh +++ b/src/norm_coverage.sh @@ -41,3 +41,5 @@ factor=$(echo "1000000/($hg38)" | bc -l) echo "hg38 counts : $hg38" echo "scaling factor : $factor" echo "bamCoverage -p ${cpus} --scaleFactor ${factor} --binSize ${binSize} -b ${bam} -o ${output}" + +bamCoverage -p ${cpus} --scaleFactor ${factor} --binSize ${binSize} -b ${bam} -o ${output}