diff --git a/DESCRIPTION b/DESCRIPTION index 05e16f685a195d182ddd1f4ebe8bb6b2fedbe395..6401dbb9f0600f2ab6d2edd27b96fb690f4e8008 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -18,7 +18,9 @@ Imports: gplots (>= 3.1.3), pheatmap (>= 1.0.12), stringr (>= 1.4.0), - viridis (>= 0.6.2) + viridis (>= 0.6.2), + tximport (>= 1.22.0), + fs (>= 1.5.2) LazyData: true Encoding: UTF-8 Roxygen: list(markdown = TRUE) diff --git a/R/load_matrix.R b/R/load_matrix.R index bb51c52d9501fea09536e4f42072b1014cdbd335..6dd9fb88523481108d0875aa98671f3a7fdaa78c 100644 --- a/R/load_matrix.R +++ b/R/load_matrix.R @@ -1,3 +1,65 @@ +#' Return a string indicating the file type of count file given in path +#' +#' @param path The path file linking to either htseq count file or kallisto +#' count files +#' +#' @return A string indicating the path +#' @import fs +file_type <- function(path) { + hd5_files <- fs::dir_ls(path = path, glob = "*abundance.h5", recurse = 1) + if (length(hd5_files) > 0) { + return("kallisto") + } else { + return("htseq") + } +} + +#' Build the deseq2 count matrix from design file +#' +#' @param design_table A dataframe with the column sample and +#' condition. Other columns corresponding to cofactors can be present +#' @param path The path were the count file are stored +#' @param my_formula The formula to use in DESeq2 +#' @param tx2gene_path A two column file, the first contains transcript \ +#' id and the other gene id + +#' @return A DESeqDataset object filtered +#' @import DESeq2 +#' @import tximport +load_matrix_kallisto <- function(design_table, path, my_formula, tx2gene_path) { + files <- file.path(path, design_table$prefix, "abundance.h5") + names(files) <- paste0(design_table$sample) + if (!is.null(tx2gene_path) & nchar(tx2gene_path) > 0) { + tx2gene_df <- read.table(tx2gene_path, header = FALSE) + txi <- tximport::tximport(files, + type = "kallisto", txOut = FALSE, + tx2gene = tx2gene_df + ) + } else { + txi <- tximport::tximport(files, type = "kallisto", txOut = TRUE) + } + sample_table <- design_table + rownames(sample_table) <- colnames(txi$counts) + sample_table <- sample_table[ + , + !names(design_table) %in% c("prefix", "sample") + ] + tmp_cn <- colnames(sample_table) + dds_input <- DESeq2::DESeqDataSetFromTximport( + txi, sample_table, + design = as.formula(my_formula) + ) + dds <- DESeq2::DESeq(dds_input) + if (length(tmp_cn) <= 1) { + ac <- "" + } else { + ac <- tmp_cn[2:length(tmp_cn)] + } + res <- list(dds = dds, acol = ac) + return(res) +} + + #' Get the filename of interest from a prefix #' #' @param prefix A file prefix used to get the filename of interest @@ -30,7 +92,7 @@ find_file <- function(prefix, path) { #' @return A DESeqDataset object filtered #' @import DESeq2 #' @import dplyr -load_matrix <- function(design_table, path, my_formula) { +load_matrix_htseq <- function(design_table, path, my_formula) { selected_samples <- design_table$prefix selected_files <- as.vector(sapply(selected_samples, find_file, path = path @@ -68,6 +130,28 @@ load_matrix <- function(design_table, path, my_formula) { return(res) } +#' Build the deseq2 count matrix from design file +#' +#' @param design_table A dataframe with the column sample and +#' condition. Other columns corresponding to cofactors can be present +#' @param path The path were the count file are stored +#' @param my_formula The formula to use in DESeq2 +#' @param tx2gene_path A two column file, the first contains transcript \ +#' id and the other gene id + +#' @return A DESeqDataset object filtered +#' @import DESeq2 +#' @import tximport +load_matrix <- function(design_table, path, my_formula, tx2gene_path) { + res <- file_type(path) + if (res == "htseq") { + return(load_matrix_htseq(design_table, path, my_formula)) + } else { + return(load_matrix_kallisto(design_table, path, my_formula, + tx2gene_path)) + } +} + #' Filter DESeq2 object on expressed gene if needed #' #' @param dds A dds object @@ -115,15 +199,17 @@ filter_dds <- function(dds, filter_list, min_expression, addition_col, design = as.formula(my_formula) ) if (norm_kind != "") { - df <- read.table(norm_kind, h=T, sep="\t") + df <- read.table(norm_kind, h = T, sep = "\t") vec <- df$size_factor names(vec) <- df$sample - if(!all(colnames(cts_2) %in% names(vec))) { + if (!all(colnames(cts_2) %in% names(vec))) { bad <- colnames(cts_2)[!(colnames(cts_2) %in% names(vec))] bad <- paste(bad, collapse = " ") - stop(paste("Some samples (", bad ,") are not defined in", norm, - "but are", - "present in design file !")) + stop(paste( + "Some samples (", bad, ") are not defined in", norm, + "but are", + "present in design file !" + )) } vec <- vec[colnames(cts_2)] sizeFactors(dds_input) <- vec @@ -169,6 +255,8 @@ get_matrices <- function(dds, output_folder, gene_names) { #' condition. Other columns corresponding to cofactors can be present #' @param path The path were the count file are stored #' @param my_formula The formula to use in DESeq2 +#' @param tx2gene_path A two column file, the first contains transcript \ +#' id and the other gene id #' @param filter_list A vector containing the list of genes to keep in the #' analysis. To keep all gene use a NULL object #' @param min_expression The minimum average required expression across @@ -182,10 +270,10 @@ get_matrices <- function(dds, output_folder, gene_names) { #' @return The dds object corresponding to a DESeqDatase object obtained after #' running the DESeq function. #' @export -build_count_matrices <- function(design_table, path, my_formula, filter_list, - min_expression, output_folder, gene_names, - norm) { - res <- load_matrix(design_table, path, my_formula) +build_count_matrices <- function(design_table, path, my_formula, tx2gene_path, + filter_list, min_expression, output_folder, + gene_names, norm) { + res <- load_matrix(design_table, path, my_formula, tx2gene_path) dds <- filter_dds( res$dds, filter_list, min_expression, res$acol, my_formula, norm diff --git a/R/main.R b/R/main.R index bd314adb9e3c88e5e568e6ab4aeef762dff3b728..066f4478432ae11889723653e6a10240b83435a5 100644 --- a/R/main.R +++ b/R/main.R @@ -10,6 +10,8 @@ #' condition. Other columns corresponding to cofactors can be present #' @param path The path were the count file are stored #' @param my_formula The formula to use in DESeq2 (default '~ condition') +#' @param tx2gene_path A two column file, the first contains transcript \ +#' id and the other gene id #' @param filter_list A vector containing the list of genes to keep in the #' analysis. To keep all gene use a NULL object, (default NULL) #' @param min_expression The minimum average required expression across @@ -39,7 +41,8 @@ #' #' @export run_deseq2 <- function(design_table, path, my_contrasts, - my_formula = "~ condition", filter_list = NULL, + my_formula = "~ condition", tx2gene_path = "", + filter_list = NULL, min_expression = 2, output_folder = ".", basemean_threshold = 0, lfc_threshold = 0, gene_names = NULL, norm = "") { @@ -47,7 +50,7 @@ run_deseq2 <- function(design_table, path, my_contrasts, stop(paste0("The output folder ", output_folder, " does not exit")) } dds <- build_count_matrices( - design_table, path, my_formula, filter_list, + design_table, path, my_formula, tx2gene_path, filter_list, min_expression, output_folder, gene_names, norm ) @@ -104,7 +107,8 @@ cli_run_deseq2 <- function() { filter_list <- read.table(cli$filter, h = F)$V1 } tmp <- run_deseq2( - design_table, cli$path, my_contrasts, cli$formula, filter_list, + design_table, cli$path, my_contrasts, cli$formula, cli$tx2gene, + filter_list, cli$gene_expression_threshold, cli$output, cli$basemean_threshold, cli$lfc_threshold, gene_names, cli$norm diff --git a/R/parser.R b/R/parser.R index 8aaac3598c4f030e49c72ff15db89c1cdf23c310..3eb75a38feb5e55ce2489407a51d929027fa7377 100644 --- a/R/parser.R +++ b/R/parser.R @@ -17,7 +17,7 @@ cli_function <- function() { ) p <- add_argument(p, "path", type = "character", - help = paste0("The folder where the tsv file are stored (default .)"), + help = paste0("The folder where the tsv/h5 files are stored (default .)"), default = "." ) p <- add_argument(p, "formula", @@ -60,22 +60,36 @@ cli_function <- function() { log2fc", default = 0 ) p <- add_argument(p, "--gene_names", - short = "-c", type = "character", - help = paste0("Optional: A file containing the correspondance between geneIDs and gene names", - "The first colomn contains geneIDs and the header is 'gene'", - "The second colomn contains corresponding gene names and the header is 'gene_name"), - default = "" + short = "-c", type = "character", + help = paste0( + "Optional: A file containing the correspondance between geneIDs and gene names", + "The first colomn contains geneIDs and the header is 'gene'", + "The second colomn contains corresponding gene names and the header is 'gene_name" + ), + default = "" ) p <- add_argument(p, "--norm", - short = "-n", type = "character", - help = paste0("A two column dataframe containing the", - "columns sample and size_factor. Sample", - " column must correspond to the samples", - " name given in design file. Size_factor", - "correspond to the scaling value to apply", - "to count data in deseq2"), - default = "") - + short = "-n", type = "character", + help = paste0( + "A two column dataframe containing the", + "columns sample and size_factor. Sample", + " column must correspond to the samples", + " name given in design file. Size_factor", + "correspond to the scaling value to apply", + "to count data in deseq2" + ), + default = "" + ) + p <- add_argument(p, "--tx2gene", + short = "-t", type = "integer", + help = paste0( + "A two column dataframe without header. ", + "The first column corresponds to transcript id. ", + "The second column correspond to gene id. This file ", + "is only use when the path points to kalisto count file" + ), + default = "" + ) # Parse de command line arguments diff --git a/Readme.md b/Readme.md index 9929e7aa29d1dfb8236305967bf7514ddd81acd6..a03a3c07a22856b3058efd79bbe4bb6ae69ae132 100644 --- a/Readme.md +++ b/Readme.md @@ -20,6 +20,8 @@ Imports: * pheatmap (>= 1.0.12), * stringr (>= 1.4.0), * viridis (>= 0.6.2) +* tximport (>= 1.22.0), +* fs (>= 1.5.2) The pandoc package must be installed. On ubuntu run: @@ -86,7 +88,7 @@ Wrapper to perform DESeq2 enrichment analysis positional arguments: design A file containing the design of interest - path The folder where the tsv file are + path The folder where the tsv/h5 files are stored (default .) formula The design formula for DESeq2 contrasts The comparisons to perform in the @@ -134,6 +136,12 @@ optional arguments: Size_factorcorrespond to the scaling value to applyto count data in deseq2 [default: ] +-t, --tx2gene A two column dataframe without + header. The first column corresponds + to transcript id. The second column + correspond to gene id. This file is + only use when the path points to + kalisto count file [default: ] ``` The design parameter must point to a file with the following structure: @@ -156,6 +164,29 @@ The columns must be `tab-separated`. Note that you can add as many other columns as you want to include every covariate you might need. +The `path` parameter corresponds to a folder: + +- either containing direct tsv counts file from HTseq counts like this + +``` +path_folder/ +├── sample1.tsv +├── sample2.tsv +└── sample3.tsv +``` + +- or container folder whose name **must match the prefix column** in the design file given in input, for example: + +``` +path_folder/ +├── prefix1/ +| └── abundance.h5 +├── prefix2/ +| └── abundance.h5 +└── condition3.tsv prefix3/ + └── abundance.h5 +``` + The `formula`parameter can be: "~ condition". You can add any covariate as you want is they are present inside the `design` table. For example " ~ condition + cov1 + cov2". diff --git a/docker/Dockerfile b/docker/Dockerfile index c4d753352347e28574d27e6d6961eba21b9924c6..d1970c1e66eed38a9875c08350b9086ae7e4ffa1 100644 --- a/docker/Dockerfile +++ b/docker/Dockerfile @@ -9,7 +9,9 @@ RUN apt install -y pandoc libfontconfig1-dev libcurl4-openssl-dev \ libpng-dev libtiff5-dev libjpeg-dev procps RUN R -e "install.packages(c('devtools', 'testthat', 'roxygen2', 'BiocManager', 'plotly'))" && \ - R -e "BiocManager::install('DESeq2')" + R -e "BiocManager::install('DESeq2')" && \ + R -e "BiocManager::install('tximport')" && \ + R -e "BiocManager::install('rhdf5')" COPY *.R .