From f27b7bde670f87c71dc11c16d71a73f563b5e5ab Mon Sep 17 00:00:00 2001 From: Fontrodona Nicolas <nicolas.fontrodona@ens-lyon.fr> Date: Wed, 18 Oct 2023 11:28:47 +0200 Subject: [PATCH] add --norm parameter --- R/load_matrix.R | 29 ++++++++++++++++++++--------- R/main.R | 6 +++--- R/parser.R | 10 +++++++--- Readme.md | 21 +++++++++++++++++++++ 4 files changed, 51 insertions(+), 15 deletions(-) diff --git a/R/load_matrix.R b/R/load_matrix.R index adef723..bb51c52 100644 --- a/R/load_matrix.R +++ b/R/load_matrix.R @@ -77,7 +77,8 @@ load_matrix <- function(design_table, path, my_formula) { #' samples to keep the gene #' @param addition_col Additional design columns #' @param my_formula The formula to use in DESeq2 -#' @param norm_kind Add a parameter used to change deseq2 normalisationto cpm +#' @param norm_kind A file containing the size factors to apply to each sample +#' or a "" value #' #' @return A DESeqDataset object filtered #' @import DESeq2 @@ -88,7 +89,7 @@ filter_dds <- function(dds, filter_list, min_expression, addition_col, "([\\+\\*\\:\\-\\^]|%in%)" )[[1]]) if (is.null(filter_list) || filter_list == "") { - if (min_expression == 0 && norm_kind != "CPM") { + if (min_expression == 0 && norm_kind == "") { return(dds) } else { cts <- DESeq2::counts(dds, norm = F) @@ -113,10 +114,19 @@ filter_dds <- function(dds, filter_list, min_expression, addition_col, countData = cts_2, colData = coldata, design = as.formula(my_formula) ) - if (norm_kind == "CPM") { - csum <- colSums(cts_2) - fac <- colSums(cts_2) / 1e6 - sizeFactors(dds_input) <- fac + if (norm_kind != "") { + df <- read.table(norm_kind, h=T, sep="\t") + vec <- df$size_factor + names(vec) <- df$sample + if(!all(colnames(cts_2) %in% names(vec))) { + bad <- colnames(cts_2)[!(colnames(cts_2) %in% names(vec))] + bad <- paste(bad, collapse = " ") + stop(paste("Some samples (", bad ,") are not defined in", norm, + "but are", + "present in design file !")) + } + vec <- vec[colnames(cts_2)] + sizeFactors(dds_input) <- vec } dds <- DESeq2::DESeq(dds_input) return(dds) @@ -166,18 +176,19 @@ get_matrices <- function(dds, output_folder, gene_names) { #' @param output_folder Folder were the matrices will be produced #' @param gene_names A dataframe with the columns gene_id and gene_name, #' (default NULL). -#' @param norm_kind Add a parameter used to change deseq2 normalisationto cpm +#' @param norm A file containing the size factors to apply to each sample +#' or a NULL value #' #' @return The dds object corresponding to a DESeqDatase object obtained after #' running the DESeq function. #' @export build_count_matrices <- function(design_table, path, my_formula, filter_list, min_expression, output_folder, gene_names, - norm_kind) { + norm) { res <- load_matrix(design_table, path, my_formula) dds <- filter_dds( res$dds, filter_list, min_expression, res$acol, - my_formula, norm_kind + my_formula, norm ) get_matrices(dds, output_folder, gene_names) return(dds) diff --git a/R/main.R b/R/main.R index 87a7c53..bd314ad 100644 --- a/R/main.R +++ b/R/main.R @@ -23,7 +23,8 @@ #' (default 0) #' @param gene_names A dataframe with the columns gene_id and gene_name, #' (default NULL). -#' @param norm Add a parameter used to change deseq2 normalisationto cpm +#' @param norm A file containing the size factors to apply to each sample +#' or a "" value #' #' @return A list containing for each comparison given in my_contrasts #' (keys of the list) a sublist containing: @@ -41,7 +42,7 @@ run_deseq2 <- function(design_table, path, my_contrasts, my_formula = "~ condition", filter_list = NULL, min_expression = 2, output_folder = ".", basemean_threshold = 0, lfc_threshold = 0, - gene_names = NULL, norm = "default") { + gene_names = NULL, norm = "") { if (!dir.exists(output_folder)) { stop(paste0("The output folder ", output_folder, " does not exit")) } @@ -102,7 +103,6 @@ cli_run_deseq2 <- function() { } else { filter_list <- read.table(cli$filter, h = F)$V1 } - tmp <- run_deseq2( design_table, cli$path, my_contrasts, cli$formula, filter_list, cli$gene_expression_threshold, cli$output, diff --git a/R/parser.R b/R/parser.R index 09036f6..8aaac35 100644 --- a/R/parser.R +++ b/R/parser.R @@ -68,9 +68,13 @@ cli_function <- function() { ) p <- add_argument(p, "--norm", short = "-n", type = "character", - help = paste0("The kind of norm to use, CPM or default", - "to use the default normalisation"), - default = "default") + help = paste0("A two column dataframe containing the", + "columns sample and size_factor. Sample", + " column must correspond to the samples", + " name given in design file. Size_factor", + "correspond to the scaling value to apply", + "to count data in deseq2"), + default = "") diff --git a/Readme.md b/Readme.md index 92f36c9..d49dbb7 100644 --- a/Readme.md +++ b/Readme.md @@ -127,6 +127,13 @@ optional arguments: second colomn contains corresponding gene names and the header is 'gene_name [default: ] + -n, --norm A two column dataframe containing + thecolumns sample and size_factor. + Sample column must correspond to the + samples name given in design file. + Size_factorcorrespond to the scaling + value to applyto count data in + deseq2 [default: ] ``` The design parameter must point to a file with the following structure: @@ -163,6 +170,20 @@ The gene_names parameter must have the following structure: 1. The `gene`columns must contain the ENSEMBL gene_IDs. 2. The `gene_name` column must contain the name of the gene corresponding to ENSEMBL gene_ID. +Finally the `norm` parameter is an **optional** parameter that can be provided to use another normalization than the DESeq2 method. In most cases you won't need it. This parameter takes a 2 columns file with the following format: + +``` +sample size_factor +276_DMSO_DMSO_0 1 +277_DMSO_DMSO_0 2 +``` + +Note that the header must be provided. + +- The `sample` column must corresponds to samples also defined in the *sample* column of the design file. +- The `size_factor` column correspond to a factor used to normalize sample in DESeq2. They will be used in the sizeFactors function of DESeq2. + + ## With the function run_deseq2 You can directly use the function run_deseq2 with R: -- GitLab