From f86c33b1405e20e70c52cf6a77e9ffd223724f93 Mon Sep 17 00:00:00 2001
From: Fontrodona Nicolas <nicolas.fontrodona@ens-lyon.fr>
Date: Thu, 28 Sep 2023 12:31:57 +0200
Subject: [PATCH] Add norm to be able to use CPM normalisation: It's NOT
 advised to do CPM normalisation

---
 R/load_matrix.R | 18 +++++++++++++-----
 R/main.R        |  7 ++++---
 R/parser.R      |  7 +++++++
 3 files changed, 24 insertions(+), 8 deletions(-)

diff --git a/R/load_matrix.R b/R/load_matrix.R
index 9681ee6..adef723 100644
--- a/R/load_matrix.R
+++ b/R/load_matrix.R
@@ -26,7 +26,7 @@ find_file <- function(prefix, path) {
 #' condition. Other columns corresponding to cofactors can be present
 #' @param path The path were the count file are stored
 #' @param my_formula The formula to use in DESeq2
-#'
+
 #' @return A DESeqDataset object filtered
 #' @import DESeq2
 #' @import dplyr
@@ -77,17 +77,18 @@ load_matrix <- function(design_table, path, my_formula) {
 #' samples to keep the gene
 #' @param addition_col Additional design columns
 #' @param my_formula The formula to use in DESeq2
+#' @param norm_kind Add a parameter used to change deseq2 normalisationto cpm
 #'
 #' @return A DESeqDataset object filtered
 #' @import DESeq2
 filter_dds <- function(dds, filter_list, min_expression, addition_col,
-                       my_formula) {
+                       my_formula, norm_kind) {
     columns <- unique(str_split(
         str_replace_all(my_formula, "[~\\s]", ""),
         "([\\+\\*\\:\\-\\^]|%in%)"
     )[[1]])
     if (is.null(filter_list) || filter_list == "") {
-        if (min_expression == 0) {
+        if (min_expression == 0 && norm_kind != "CPM") {
             return(dds)
         } else {
             cts <- DESeq2::counts(dds, norm = F)
@@ -112,6 +113,11 @@ filter_dds <- function(dds, filter_list, min_expression, addition_col,
         countData = cts_2, colData = coldata,
         design = as.formula(my_formula)
     )
+    if (norm_kind == "CPM") {
+        csum <- colSums(cts_2)
+        fac <- colSums(cts_2) / 1e6
+        sizeFactors(dds_input) <- fac
+    }
     dds <- DESeq2::DESeq(dds_input)
     return(dds)
 }
@@ -160,16 +166,18 @@ get_matrices <- function(dds, output_folder, gene_names) {
 #' @param output_folder Folder were the matrices will be produced
 #' @param gene_names A dataframe with the columns gene_id and gene_name,
 #' (default NULL).
+#' @param norm_kind Add a parameter used to change deseq2 normalisationto cpm
 #'
 #' @return The dds object corresponding to a DESeqDatase object obtained after
 #' running the DESeq function.
 #' @export
 build_count_matrices <- function(design_table, path, my_formula, filter_list,
-                                 min_expression, output_folder, gene_names) {
+                                 min_expression, output_folder, gene_names,
+                                 norm_kind) {
     res <- load_matrix(design_table, path, my_formula)
     dds <- filter_dds(
         res$dds, filter_list, min_expression, res$acol,
-        my_formula
+        my_formula, norm_kind
     )
     get_matrices(dds, output_folder, gene_names)
     return(dds)
diff --git a/R/main.R b/R/main.R
index 13752bb..87a7c53 100644
--- a/R/main.R
+++ b/R/main.R
@@ -23,6 +23,7 @@
 #' (default 0)
 #' @param gene_names A dataframe with the columns gene_id and gene_name,
 #' (default NULL).
+#' @param norm Add a parameter used to change deseq2 normalisationto cpm
 #'
 #' @return A list containing for each comparison given in my_contrasts
 #' (keys of the list) a sublist containing:
@@ -40,14 +41,14 @@ run_deseq2 <- function(design_table, path, my_contrasts,
                        my_formula = "~ condition", filter_list = NULL,
                        min_expression = 2, output_folder = ".",
                        basemean_threshold = 0, lfc_threshold = 0,
-                       gene_names = NULL) {
+                       gene_names = NULL, norm = "default") {
     if (!dir.exists(output_folder)) {
         stop(paste0("The output folder ", output_folder, " does not exit"))
     }
     dds <- build_count_matrices(
         design_table, path, my_formula, filter_list,
         min_expression, output_folder,
-        gene_names
+        gene_names, norm
     )
     create_plots(dds, output_folder)
     results <- de_analyzes(
@@ -106,6 +107,6 @@ cli_run_deseq2 <- function() {
         design_table, cli$path, my_contrasts, cli$formula, filter_list,
         cli$gene_expression_threshold, cli$output,
         cli$basemean_threshold, cli$lfc_threshold,
-        gene_names
+        gene_names, cli$norm
     )
 }
diff --git a/R/parser.R b/R/parser.R
index 830f48c..09036f6 100644
--- a/R/parser.R
+++ b/R/parser.R
@@ -66,6 +66,13 @@ cli_function <- function() {
                                     "The second colomn contains corresponding gene names and the header is 'gene_name"),
                       default = ""
     )
+    p <- add_argument(p, "--norm",
+                      short = "-n", type = "character",
+                      help = paste0("The kind of norm to use, CPM or default",
+                                    "to use the default normalisation"),
+                        default = "default")
+
+
 
     # Parse de command line arguments
     argv <- argparser::parse_args(p)
-- 
GitLab