From f27b7bde670f87c71dc11c16d71a73f563b5e5ab Mon Sep 17 00:00:00 2001
From: Fontrodona Nicolas <nicolas.fontrodona@ens-lyon.fr>
Date: Wed, 18 Oct 2023 11:28:47 +0200
Subject: [PATCH] add --norm parameter

---
 R/load_matrix.R | 29 ++++++++++++++++++++---------
 R/main.R        |  6 +++---
 R/parser.R      | 10 +++++++---
 Readme.md       | 21 +++++++++++++++++++++
 4 files changed, 51 insertions(+), 15 deletions(-)

diff --git a/R/load_matrix.R b/R/load_matrix.R
index adef723..bb51c52 100644
--- a/R/load_matrix.R
+++ b/R/load_matrix.R
@@ -77,7 +77,8 @@ load_matrix <- function(design_table, path, my_formula) {
 #' samples to keep the gene
 #' @param addition_col Additional design columns
 #' @param my_formula The formula to use in DESeq2
-#' @param norm_kind Add a parameter used to change deseq2 normalisationto cpm
+#' @param norm_kind A file containing the size factors to apply to each sample
+#' or a "" value
 #'
 #' @return A DESeqDataset object filtered
 #' @import DESeq2
@@ -88,7 +89,7 @@ filter_dds <- function(dds, filter_list, min_expression, addition_col,
         "([\\+\\*\\:\\-\\^]|%in%)"
     )[[1]])
     if (is.null(filter_list) || filter_list == "") {
-        if (min_expression == 0 && norm_kind != "CPM") {
+        if (min_expression == 0 && norm_kind == "") {
             return(dds)
         } else {
             cts <- DESeq2::counts(dds, norm = F)
@@ -113,10 +114,19 @@ filter_dds <- function(dds, filter_list, min_expression, addition_col,
         countData = cts_2, colData = coldata,
         design = as.formula(my_formula)
     )
-    if (norm_kind == "CPM") {
-        csum <- colSums(cts_2)
-        fac <- colSums(cts_2) / 1e6
-        sizeFactors(dds_input) <- fac
+    if (norm_kind != "") {
+        df <- read.table(norm_kind, h=T, sep="\t")
+        vec <- df$size_factor
+        names(vec) <- df$sample
+        if(!all(colnames(cts_2) %in% names(vec))) {
+            bad <- colnames(cts_2)[!(colnames(cts_2) %in% names(vec))]
+            bad <- paste(bad, collapse = " ")
+            stop(paste("Some samples (", bad ,") are not defined in", norm,
+            "but are",
+            "present in design file !"))
+        }
+        vec <- vec[colnames(cts_2)]
+        sizeFactors(dds_input) <- vec
     }
     dds <- DESeq2::DESeq(dds_input)
     return(dds)
@@ -166,18 +176,19 @@ get_matrices <- function(dds, output_folder, gene_names) {
 #' @param output_folder Folder were the matrices will be produced
 #' @param gene_names A dataframe with the columns gene_id and gene_name,
 #' (default NULL).
-#' @param norm_kind Add a parameter used to change deseq2 normalisationto cpm
+#' @param norm A file containing the size factors to apply to each sample
+#' or a NULL value
 #'
 #' @return The dds object corresponding to a DESeqDatase object obtained after
 #' running the DESeq function.
 #' @export
 build_count_matrices <- function(design_table, path, my_formula, filter_list,
                                  min_expression, output_folder, gene_names,
-                                 norm_kind) {
+                                 norm) {
     res <- load_matrix(design_table, path, my_formula)
     dds <- filter_dds(
         res$dds, filter_list, min_expression, res$acol,
-        my_formula, norm_kind
+        my_formula, norm
     )
     get_matrices(dds, output_folder, gene_names)
     return(dds)
diff --git a/R/main.R b/R/main.R
index 87a7c53..bd314ad 100644
--- a/R/main.R
+++ b/R/main.R
@@ -23,7 +23,8 @@
 #' (default 0)
 #' @param gene_names A dataframe with the columns gene_id and gene_name,
 #' (default NULL).
-#' @param norm Add a parameter used to change deseq2 normalisationto cpm
+#' @param norm A file containing the size factors to apply to each sample
+#' or a "" value
 #'
 #' @return A list containing for each comparison given in my_contrasts
 #' (keys of the list) a sublist containing:
@@ -41,7 +42,7 @@ run_deseq2 <- function(design_table, path, my_contrasts,
                        my_formula = "~ condition", filter_list = NULL,
                        min_expression = 2, output_folder = ".",
                        basemean_threshold = 0, lfc_threshold = 0,
-                       gene_names = NULL, norm = "default") {
+                       gene_names = NULL, norm = "") {
     if (!dir.exists(output_folder)) {
         stop(paste0("The output folder ", output_folder, " does not exit"))
     }
@@ -102,7 +103,6 @@ cli_run_deseq2 <- function() {
     } else {
         filter_list <- read.table(cli$filter, h = F)$V1
     }
-
     tmp <- run_deseq2(
         design_table, cli$path, my_contrasts, cli$formula, filter_list,
         cli$gene_expression_threshold, cli$output,
diff --git a/R/parser.R b/R/parser.R
index 09036f6..8aaac35 100644
--- a/R/parser.R
+++ b/R/parser.R
@@ -68,9 +68,13 @@ cli_function <- function() {
     )
     p <- add_argument(p, "--norm",
                       short = "-n", type = "character",
-                      help = paste0("The kind of norm to use, CPM or default",
-                                    "to use the default normalisation"),
-                        default = "default")
+                      help = paste0("A two column dataframe containing the",
+                                    "columns sample and size_factor. Sample",
+                                    " column must correspond to the samples",
+                                    " name given in design file. Size_factor",
+                                    "correspond to the scaling value to apply",
+                                    "to count data in deseq2"),
+                      default = "")
 
 
 
diff --git a/Readme.md b/Readme.md
index 92f36c9..d49dbb7 100644
--- a/Readme.md
+++ b/Readme.md
@@ -127,6 +127,13 @@ optional arguments:
                                    second colomn contains corresponding
                                    gene names and the header is
                                    'gene_name [default: ]
+  -n, --norm                       A two column dataframe containing
+                                   thecolumns sample and size_factor.
+                                   Sample column must correspond to the
+                                   samples name given in design file.
+                                   Size_factorcorrespond to the scaling
+                                   value to applyto count data in
+                                   deseq2 [default: ]
 ```
 
 The design parameter must point to a file with the following structure:
@@ -163,6 +170,20 @@ The gene_names parameter must have the following structure:
 1. The `gene`columns must contain the ENSEMBL gene_IDs.
 2. The `gene_name` column must contain the name of the gene corresponding to ENSEMBL gene_ID.
 
+Finally the `norm` parameter is an **optional** parameter that can be provided to use another normalization than the DESeq2 method. In most cases you won't need it. This parameter takes a 2 columns file with the following format:
+
+```
+sample  size_factor
+276_DMSO_DMSO_0 1
+277_DMSO_DMSO_0 2
+```
+
+Note that the header must be provided.
+
+- The `sample` column must corresponds to samples also defined in the *sample* column of the design file.
+- The `size_factor` column correspond to a factor used to normalize sample in DESeq2. They will be used in the sizeFactors function of DESeq2.
+
+
 ## With the function run_deseq2
 
 You can directly use the function run_deseq2 with R:
-- 
GitLab