From 9fc49679be65d6da815660d0e56fda36819dde58 Mon Sep 17 00:00:00 2001 From: xgrand <xavier.grand@inserm.fr> Date: Tue, 30 May 2023 16:57:31 +0200 Subject: [PATCH] Add Dockerfile and scripts to fusion_parser module to arriba_fusion.nf --- .../fusion_parser/1.0/Dockerfile | 16 ++ .../fusion_parser/1.0/docker_init.sh | 5 + .../fusion_parser/1.0/fusion_parser.R | 163 ++++++++++++++++++ .../fusion_parser/1.0/install_pkgs.R | 26 +++ src/arriba_fusion.nf | 9 +- src/nf_modules/arriba/main.nf | 2 +- src/nf_modules/fusion_parser/main.nf | 25 +++ 7 files changed, 243 insertions(+), 3 deletions(-) create mode 100644 src/.docker_modules/fusion_parser/1.0/Dockerfile create mode 100755 src/.docker_modules/fusion_parser/1.0/docker_init.sh create mode 100644 src/.docker_modules/fusion_parser/1.0/fusion_parser.R create mode 100644 src/.docker_modules/fusion_parser/1.0/install_pkgs.R create mode 100644 src/nf_modules/fusion_parser/main.nf diff --git a/src/.docker_modules/fusion_parser/1.0/Dockerfile b/src/.docker_modules/fusion_parser/1.0/Dockerfile new file mode 100644 index 0000000..4ef9e45 --- /dev/null +++ b/src/.docker_modules/fusion_parser/1.0/Dockerfile @@ -0,0 +1,16 @@ +FROM rocker/r-base:4.2.3 +LABEL MAINTAINER "Xavier Grand <xavier.grand@ens-lyon.fr>" + +RUN apt update && apt-get install -y apt-transport-https +RUN apt install -y libxml2-dev libfontconfig1-dev libssl-dev \ + libcurl4-openssl-dev libharfbuzz-dev \ + libfribidi-dev libfreetype6-dev libpng-dev \ + libtiff5-dev libjpeg-dev + +## copy Rscript files +COPY ./*.R . + +RUN Rscript install_pkgs.R + +# command to run on container start +CMD [ "bash" ] diff --git a/src/.docker_modules/fusion_parser/1.0/docker_init.sh b/src/.docker_modules/fusion_parser/1.0/docker_init.sh new file mode 100755 index 0000000..0fa64b8 --- /dev/null +++ b/src/.docker_modules/fusion_parser/1.0/docker_init.sh @@ -0,0 +1,5 @@ +#!/bin/sh +# docker pull xgrand/fusion_parser:1.0 +docker build src/.docker_modules/fusion_parser/1.0 -t 'xgrand/fusion_parser:1.0' +docker push xgrand/fusion_parser:1.0 +# docker buildx build --platform linux/amd64,linux/arm64 -t "xgrand/fusion_parser:1.0" --push src/.docker_modules/fusion_parser/1.0 \ No newline at end of file diff --git a/src/.docker_modules/fusion_parser/1.0/fusion_parser.R b/src/.docker_modules/fusion_parser/1.0/fusion_parser.R new file mode 100644 index 0000000..cbc30e6 --- /dev/null +++ b/src/.docker_modules/fusion_parser/1.0/fusion_parser.R @@ -0,0 +1,163 @@ +#!/usr/bin/env Rscript +# list.of.packages <- c("optparse", "dplyr", "tidyr", "stringr", "BiocManager", +# "tidyverse", "tibble", "purrr", "furrr", "future", "vsn", +# "pacman") +# new.packages <- list.of.packages[!(list.of.packages %in% installed.packages()[,"Package"])] +# +# if (length(new.packages) > 0) { +# install.packages(new.packages, dependencies = TRUE, repos = "https://cran.r-project.org") +# } +# +# list.of.BiocPackages <- c("DESeq2", "BiocParallel") +# new.BiocPackages <- list.of.BiocPackages[!(list.of.BiocPackages %in% installed.packages()[,"Package"])] +# +# if (length(new.BiocPackages) > 0) { +# BiocManager::install(new.BiocPackages, dependencies = TRUE, ask = FALSE) +# } + +pacman::p_load(optparse, tidyverse, stringr, DESeq2, tibble, BiocParallel, + furrr, future, future.batchtools, vsn) + +# Option parser and loading data: +option_list = list( + make_option(c("-f", "--fusion"), type="character", default=NULL, + help="fusions folder path", metavar="character"), + make_option(c("-c", "--count"), type="character", default=NULL, + help="htseq counts folder path.", metavar="character"), + make_option(c("-d", "--design"), type="character", default=NULL, + help="path to design table", metavar="character"), + make_option(c("-t", "--threads"), type="integer", default=4, + help="number of threads", metavar="integer"), + make_option(c("-m", "--memory"), type="integer", default=8, + help="memory in Gbytes", metavar="integer") +) + +opt_parser = OptionParser(option_list=option_list) +opt = parse_args(opt_parser) + +# Number of threads for DESeq2: +register(MulticoreParam(opt$threads)) + +# Set the number of threads/workers to use +plan(multisession, workers = opt$threads) + +# Set memory limits for parallel workers +options(future.globals.maxSize = ((opt$memory*1000)*1024^2)) + +# read design input file: +design <- read.table(file = opt$design, header = TRUE, row.names = NULL) +list_sample <- as.list(design$sample) + +# Function to parse all file from a list: +parse_fusion <- function(ech) { + # read fusion file: + fusion_file <- paste0(ech, "_fusions.tsv") + df1 <- read.table(file = paste0(opt$fusion, fusion_file)) + colnames(df1) <- c("gene1","gene2","strand1", + "strand2","breakpoint1","breakpoint2","site1", + "site2","type","split_reads1","split_reads2", + "discordant_mates","coverage1","coverage2","confidence", + "reading_frame","tags","retained_protein_domains", + "closest_genomic_breakpoint1", + "closest_genomic_breakpoint2","gene_id1","gene_id2", + "transcript_id1","transcript_id2","direction1", + "direction2","filters","fusion_transcript", + "peptide_sequence","read_identifiers") + # filter fusion file: + df1 <- filter(df1, type == 'deletion/read-through' | + type == "deletion/read-through/5'-5'" | + type == "deletion/read-through/3'-3'") + df1 <- df1[df1$strand1 == df1$strand2,] + df1 <- df1 %>% filter(strand1 %in% c("+/+", "-/-")) + + df1 <- df1 %>% select(gene1, gene2, strand1, split_reads1, split_reads2, + discordant_mates, type, gene_id1, gene_id2) + + htseq_file <- paste0(ech, ".tsv") + htseq_df <- read.table(file = paste0(opt$count, htseq_file), header = FALSE) + colnames(htseq_df) <- c("gene_id", "count") + + df1 <- left_join(df1, htseq_df, by = join_by(gene_id1 == gene_id), + suffix = c(".x", ".y")) + colnames(df1)[10] <- "count1" + df1 <- left_join(df1, htseq_df, by = join_by(gene_id2 == gene_id), + suffix = c(".x", ".y")) + colnames(df1)[11] <- "count2" + colnames(df1)[3] <- "strand" + + df1 <- df1 %>% dplyr::mutate(ID = paste0(gene1, "_", gene2)) + + df2 <- df1 %>% dplyr::group_by(ID) %>% + dplyr::summarise(reads1=sum(split_reads1), + reads2=sum(split_reads2), + discordant=sum(discordant_mates)) %>% + dplyr::rowwise() %>% + dplyr::mutate(reads_total = sum(reads1, + reads2, + discordant) + ) %>% + dplyr::select(ID, reads_total) + + colnames(df2)[2] <- ech + + df1 <- df1 %>% select(all_of(c("ID", "gene_id1", "gene_id2", + "split_reads1", "split_reads2", + "discordant_mates", "type", "count1", "count2", + "strand"))) + + write.table(df1, file = paste0(opt$output, ech, "_parsed_fusion.csv"), + sep = "\t", col.names = TRUE, row.names = FALSE, quote = FALSE) + + df2$ID <- as.factor(df2$ID) + return(df2) +} + +# all_results <- map(list_sample, parse_fusion) +all_results <- furrr::future_map(list_sample, parse_fusion) +gc() +concat_res <- purrr::reduce(all_results, full_join, by = "ID") +concat_res[is.na(concat_res)] <- 0 + +# DESeq2: +## Prepare the dataset: +deseq_df <- as.matrix(concat_res %>% column_to_rownames(var = "ID")) +exp_design <- design %>% column_to_rownames(var = "sample") +exp_design$condition <- as.factor(exp_design$condition) +exp_design$rep <- as.factor((exp_design$rep)) + +dds <- DESeq2::DESeqDataSetFromMatrix(countData=deseq_df, + colData=exp_design, + design=~condition + rep, + tidy = FALSE) + +## Filtering: +keep <- rowSums(counts(dds) > 1) >= 3 +dds <- dds[keep,] + +dds <- DESeq2::DESeq(dds) + +# Plots: +pdf(paste0(opt$output, "rplot.pdf")) +res <- results(dds) +plotMA(res) #, ylim=c(-2,2)) +plotDispEsts(dds) + +# this gives log2(n + 1) +# ntd <- normTransform(dds) +# meanSdPlot(assay(ntd)) +dev.off() + +# save results: +save_deseq <- function(comparaison, dds) { + tmp <- as.data.frame(DESeq2::results(dds, contrast=c("condition",comparaison))) + tmp <- tibble::rownames_to_column(tmp, "ID") + write.table(tmp, + file = paste0(opt$output, comparaison[1], "_", comparaison[2], ".tsv"), + row.names = FALSE, + dec = ",", + sep = "\t", + qmethod = "double" + ) +} + +sapply(list_comparaison, save_deseq, dds) diff --git a/src/.docker_modules/fusion_parser/1.0/install_pkgs.R b/src/.docker_modules/fusion_parser/1.0/install_pkgs.R new file mode 100644 index 0000000..37f282d --- /dev/null +++ b/src/.docker_modules/fusion_parser/1.0/install_pkgs.R @@ -0,0 +1,26 @@ +#!/usr/bin/env Rscript +list.of.packages <- c("optparse", "dplyr", "tidyr", "stringr", "BiocManager", + "tidyverse", "tibble", "purrr", "furrr", "future", "vsn", + "pacman", "gargle", "ids", "systemfonts", "textshaping", + "googledrive", "googlesheets4", "httr", "ragg", "rvest", + "xml2", "textshaping", "ragg") + +new.packages <- list.of.packages[!(list.of.packages %in% +installed.packages()[,"Package"])] + +if (length(new.packages) > 0) { + install.packages(new.packages, + dependencies = TRUE, + repos = "https://cran.r-project.org") +} + +list.of.BiocPackages <- c("DESeq2", "BiocParallel") +new.BiocPackages <- list.of.BiocPackages[!(list.of.BiocPackages %in% + installed.packages()[,"Package"])] + +if (length(new.BiocPackages) > 0) { + BiocManager::install(new.BiocPackages, dependencies = TRUE, ask = FALSE) +} + +pacman::p_load(optparse, tidyverse, stringr, DESeq2, tibble, BiocParallel, + furrr, future, future.batchtools, vsn) \ No newline at end of file diff --git a/src/arriba_fusion.nf b/src/arriba_fusion.nf index 52b2f5c..0d5c7db 100644 --- a/src/arriba_fusion.nf +++ b/src/arriba_fusion.nf @@ -74,6 +74,7 @@ params.bam = "" params.genome = "/home/xavier/Data/Genome/hg19/Homo_sapiens.GRCh37.dna.primary_assembly.fa" params.gtf = "/home/xavier/Data/Genome/hg19/Homo_sapiens.GRCh37.87.gtf" params.index = "/home/xavier/Data/Genome/hg19/STAR" +params.htseq_param = "yes" /* Params out */ params.fastp_out = "02_fastp" @@ -103,6 +104,7 @@ if(params.index) { } else { index_list = "" } +log.info "htseq param: -s ${params.htseq_param}" /* **************************************************************** @@ -157,6 +159,7 @@ include { filter_bam_quality } from './nf_modules/samtools/main.nf' include { index_with_gtf } from './nf_modules/star/main_2.7.8a.nf' include { mapping2fusion } from './nf_modules/star/main_2.7.8a.nf' include { index_bam } from './nf_modules/samtools/main.nf' +include { htseq_count } from './nf_modules/htseq/main.nf' addParams(htseq_out: '09_htseq_count', htseq_param: "${params.htseq_param}" ) include { arriba } from "./nf_modules/arriba/main.nf" include { draw_fusions } from "./nf_modules/arriba/main.nf" @@ -176,13 +179,13 @@ workflow { if(params.fastq != ""){ fastp(fastq_files) - fastqc_raw(fastq_files) + /* fastqc_raw(fastq_files) fastqc_preprocessed(fastp.out.fastq) multiqc(fastqc_raw.out.report) .mix( fastqc_preprocessed.out.report ).collect() - + */ /* **************************************************************** Mapping @@ -205,6 +208,7 @@ workflow { filter_bam_quality(mapping2fusion.out.bam) index_bam(filter_bam_quality.out.bam.collect()) + htseq_count(index_bam.out.bam_idx.collect(), gtf_file.collect()) /* **************************************************************** @@ -217,6 +221,7 @@ workflow { } else { index_bam(bam_files.collect()) + htseq_count(index_bam.out.bam_idx.collect(), gtf_file.collect()) arriba(index_bam.out.bam_idx.collect(), gtf_file.collect(), genome_file.collect()) draw_fusions(arriba.out.fusions, index_bam.out.bam_idx, gtf_file) } diff --git a/src/nf_modules/arriba/main.nf b/src/nf_modules/arriba/main.nf index be1630a..9ef3413 100644 --- a/src/nf_modules/arriba/main.nf +++ b/src/nf_modules/arriba/main.nf @@ -49,7 +49,7 @@ process draw_fusions{ script: """ -draw_fusions.R \ +Rscript /usr/local/bin/arriba_v${version}/draw_fusions.R \ --fusions=${fusions} \ --alignments=${bam} \ --output=${fusion_id}_fusions.pdf \ diff --git a/src/nf_modules/fusion_parser/main.nf b/src/nf_modules/fusion_parser/main.nf new file mode 100644 index 0000000..ceac664 --- /dev/null +++ b/src/nf_modules/fusion_parser/main.nf @@ -0,0 +1,25 @@ +version = "1.0" +container_url = "xgrand/fusion_parser:${version}" + +process parsefusion{ + container = "${container_url}" + label "big_mem_multi_cpus" + tag "${bam_id}" + if (params.fusion_out != "") { + publishDir "results/${params.fusion_out}", mode: 'copy' + } + + input: + path(fusion) + path(count) + path(design) + + output: + tuple val(bam_id), path("*.tsv"), emit: dfg + + script: + memory = "${task.memory}" - ~/\s*GB/ +""" +Rscript fusion_parser.R --fusion ${fusion} --count ${count} --design ${design} --threads ${task.cpus} --memory ${memory} +""" +} \ No newline at end of file -- GitLab