diff --git a/src/v4/HTRSIM/dev/flat_full_bis.Rmd b/src/v4/HTRSIM/dev/flat_full_bis.Rmd new file mode 100644 index 0000000000000000000000000000000000000000..a6766ab3a2b44293d538351f2122ec889a5904cd --- /dev/null +++ b/src/v4/HTRSIM/dev/flat_full_bis.Rmd @@ -0,0 +1,5893 @@ +--- +title: "flat_full.Rmd for working package" +output: html_document +editor_options: + chunk_output_type: console +--- + +<!-- Run this 'development' chunk --> +<!-- Store every call to library() that you need to explore your functions --> + +```{r development, include=FALSE} +library(testthat) +``` + +<!-- + You need to run the 'description' chunk in the '0-dev_history.Rmd' file before continuing your code there. + +If it is the first time you use {fusen}, after 'description', you can directly run the last chunk of the present file with inflate() inside. +--> + +```{r development-load} +# Load already included functions if relevant +pkgload::load_all(export_all = FALSE) +``` + + +# Initialize variable to simulate + +<!-- +Create a chunk for the core of the function + +- The chunk needs to be named `function` at least +- It contains the code of a documented function +- The chunk can also be named `function-my_median` to make it easily +findable in your Rmd +- Let the `@examples` part empty, and use the next `examples` chunk instead to present reproducible examples + +After inflating the template + +- This function code will automatically be added in a new file in the "R/" directory +--> + +```{r function-utils, filename = "utils"} +#' Join two data frames using data.table +#' +#' @param d1 Data frame 1 +#' @param d2 Data frame 2 +#' @param k1 Key columns for data frame 1 +#' @param k2 Key columns for data frame 2 +#' @importFrom data.table data.table +#' @return Joined data frame +#' @export +#' +#' @examples +#' +#' # Example usage: +#' df1 <- data.frame(id = 1:5, value = letters[1:5]) +#' df2 <- data.frame(id = 1:5, category = LETTERS[1:5]) +#' join_dtf(df1, df2, "id", "id") +join_dtf <- function(d1, d2, k1, k2) { + d1.dt_table <- data.table::data.table(d1, key = k1) + d2.dt_table <- data.table::data.table(d2, key = k2) + dt_joined <- d1.dt_table[d2.dt_table, allow.cartesian = TRUE] + return(dt_joined %>% as.data.frame()) +} + + + +#' Clean Variable Name +#' +#' This function removes digits, spaces, and special characters from a variable name. +#' If any of these are present, they will be replaced with an underscore '_'. +#' +#' @param name The input variable name to be cleaned. +#' @return The cleaned variable name without digits, spaces, or special characters. +#' +#' @details +#' This function will check the input variable name for the presence of digits, +#' spaces, and special characters. If any of these are found, they will be removed +#' from the variable name and replaced with an underscore '_'. Additionally, it will +#' check if the cleaned name is not one of the reserved names "interactions" or +#' "correlations" which are not allowed as variable names. +#' @export +#' @examples +#' clean_variable_name("my_var,:&$à à (-i abl23 e_na__ç^me ") +clean_variable_name <- function(name){ + message("Variable name should not contain digits, spaces, or special characters.\nIf any of these are present, they will be removed from the variable name.") + # avoid space in variable name + name <- gsub(" ", "_", name, fixed = TRUE) + # avoid digit in variable name + name <- gsub("[0-9]", "", name) + # avoid special character in variable name + name <- gsub("[[:punct:]]", "", name) + + forbidden_names <- c("interactions", "correlations") + if (name %in% forbidden_names) { + forbidden_str <- paste(forbidden_names, collapse = " and ") + stop(forbidden_str, "cannot be used as variable name") + } + return(name) + +} + + +#' Get Setting Table +#' +#' Create a table of experimental settings. +#' +#' This function takes various experimental parameters and returns a data frame +#' that represents the experimental settings. +#' +#' @param n_genes Number of genes in the experiment. +#' @param max_replicates Maximum number of replicates for each gene. +#' @param min_replicates Minimum number of replicates for each gene. +#' @param lib_size total number of reads +#' +#' @return A data frame containing the experimental settings with their corresponding values. +#' @export +getSettingsTable <- function(n_genes, max_replicates, min_replicates, lib_size ){ + + settings_df <- data.frame(parameters = c("# genes", "Max # replicates", "Min # replicates", "Library size" ), + values = c(n_genes, max_replicates, min_replicates, lib_size)) + rownames(settings_df) <- NULL + + return(settings_df) +} + +``` + + +```{r test-dataFromUser} +# Test unitaires pour la fonction join_dtf +test_that("join_dtf réalise la jointure correctement", { + # Création de données de test + df1 <- data.frame(id = 1:5, value = letters[1:5]) + df2 <- data.frame(id = 1:5, category = LETTERS[1:5]) + + # Exécution de la fonction + result <- HTRSIM::join_dtf(df1, df2, "id", "id") + + # Vérification des résultats + expect_true(is.data.frame(result)) + expect_equal(nrow(result), 5) + expect_equal(ncol(result), 3) + expect_equal(names(result), c("id", "value", "category")) + expect_true(all.equal(result$id, df1$id)) + expect_true(all.equal(result$id, df2$id)) +}) + + +test_that("clean_variable_name correctly removes digits, spaces, and special characters", { + expect_equal(clean_variable_name("my variable name"), "myvariablename") + expect_equal(clean_variable_name("variable_1"), "variable") + expect_equal(clean_variable_name("^spec(ial#chars! "), "specialchars") +}) + +test_that("clean_variable_name handles reserved names properly", { + expect_error(clean_variable_name("interactions")) + expect_error(clean_variable_name("correlations")) +}) +``` + + +```{r function-init_variable, filename = "simulation_initialization"} +#' Initialize variable +#' +#' @param list_var Either c() or output of init_variable +#' @param name Variable name +#' @param mu Either a numeric value or a numeric vector (of length = level) +#' @param sd Either numeric value or NA +#' @param level Numeric value to specify the number of levels to simulate +#' +#' @return +#' A list with initialized variables +#' @export +#' +#' @examples +init_variable <- function(list_var = c(), name = "myVariable", mu = c(2,3), sd = NA, level = NA) { + + name <- clean_variable_name(name) + + # Only mu specified by user => set level param + if (is.na(level) && is.na(sd)) { + level <- length(mu) + } + + # Validate inputs + inputs_checking(list_var, name, mu, sd, level) + + if (endsWithDigit(name)) { + warning("Names ending with digits are not allowed. They will be removed from the variable name.") + name <- removeDigitsAtEnd(name) + } + + # Initialize new variable + list_var[[name]] <- fillInVariable(name, mu, sd, level) + + return(list_var) +} + + + +#' Check if a string ends with a digit +#' +#' This function checks whether a given string ends with a digit. +#' +#' @param string The input string to be checked +#' @return \code{TRUE} if the string ends with a digit, \code{FALSE} otherwise +#' @export +#' @examples +#' endsWithDigit("abc123") # Output: TRUE +#' endsWithDigit("xyz") # Output: FALSE +endsWithDigit <- function(string) { + lastChar <- substring(string, nchar(string)) + return(grepl("[0-9]", lastChar)) +} + +#' Remove digits at the end of a string +#' +#' This function removes any digits occurring at the end of a given string. +#' +#' @param string The input string from which digits are to be removed +#' @return The modified string with digits removed from the end +#' @export +#' @examples +#' removeDigitsAtEnd("abc123") # Output: "abc" +#' removeDigitsAtEnd("xyz") # Output: "xyz" +removeDigitsAtEnd <- function(string) { + return(gsub("\\d+$", "", string)) +} + + +#' Check Input Parameters +#' +#' This function checks the validity of the input parameters for initializing a variable. +#' It ensures that the necessary conditions are met for the input parameters. +#' +#' @param list_var List containing the variables to be initialized. +#' @param name Name of the variable. +#' @param mu Mean of the variable. +#' @param sd Standard deviation of the variable (optional). +#' @param level Number of levels for categorical variables. +#' +#' @return NULL +#' @export +#' +#' @examples +#' inputs_checking(list_var = c(), name = "var1", mu = 0, sd = 1, level = 2) +inputs_checking <- function(list_var, name, mu, sd, level) { + stopifnot(name != "") + stopifnot(is.character(name)) + stopifnot(is.numeric(mu)) + stopifnot(is.numeric(sd) | is.na(sd)) + stopifnot(is.numeric(level)) + stopifnot(length(level) == 1) + stopifnot(level >= 2) + + if (!is.null(list_var)) { + error_msg <- "Non conformable list_var parameter.\nlist_var must be set as an init_var output or initialized as c()" + if (!is.list(list_var)) { + stop(error_msg) + } + } + + if (length(mu) > 1) { + stopifnot(length(mu) == level) + } + + if (is.na(sd)) { + if (level != length(mu)) { + stop("sd was specified as NA. mu should have the same length as the number of levels\n") + } + } + + # Check if variable is already initialized + name_not_in_list_var <- identical(which(already_init_variable(list_var, name)), integer(0)) + if (!name_not_in_list_var) { + message(paste(name, "is already initialized in list_var.\nIt will be updated", sep = " ")) + } + + return(NULL) +} + + +#' Check if Variable is Already Initialized +#' +#' This function checks if a variable is already initialized in the variable list. +#' +#' @param list_var A list object representing the variable list. +#' @param new_var_name A character string specifying the name of the new variable. +#' +#' @return TRUE if the variable is already initialized, FALSE otherwise. +#' @export +#' +#' @examples +#' my_list <- list(var1 = 1, var2 = 2, var3 = 3) +#' already_initialized <- already_init_variable(list_var = my_list, new_var_name = "myVariable") +already_init_variable <- function(list_var, new_var_name) { + if (is.null(list_var)) { + return(FALSE) + } + + var_names <- names(list_var) + return(new_var_name %in% var_names) +} + +#' Fill in Variable +#' +#' This function fills in a variable with simulated data based on the provided parameters. +#' +#' @param name The name of the variable. +#' @param mu A numeric value or a numeric vector (of length = level) representing the mean. +#' @param sd A numeric value representing the standard deviation, or NA if not applicable. +#' @param level A numeric value specifying the number of levels to simulate. +#' +#' @return A data frame or a list containing the simulated data for the variable. +#' @export +#' +#' @examples +#' variable_data <- fillInVariable(name = "myVariable", mu = c(2, 3), sd = NA, level = 2) +#' +fillInVariable <- function(name, mu, sd, level) { + + if (length(mu) > 1 | is.na(sd)) { # Effects given by user + level <- length(mu) + l_labels <- paste(name, 1:level, sep = '') + l_betaEffects <- mu + column_names <- c(paste("label", name, sep = "_"), name) + sub_obj <- build_sub_obj_return_to_user(level, metaData = l_labels, + effectsGivenByUser = l_betaEffects, + column_names) + } else { + sub_obj <- as.data.frame(list(mu = mu, sd = sd, level = level)) + } + + return(sub_obj) +} + +#' Build Sub Object to Return to User +#' +#' This function builds the sub-object to be returned to the user. +#' +#' @param level A numeric value specifying the number of levels. +#' @param metaData A list of labels. +#' @param effectsGivenByUser A list of effects given by the user. +#' @param col_names A character vector specifying the column names to use. +#' +#' @return A list with the sub-object details. +build_sub_obj_return_to_user <- function(level, metaData, effectsGivenByUser, col_names) { + sub_obj <- list(level = level) + data <- cbind(metaData, effectsGivenByUser) %>% as.data.frame() + colnames(data) <- col_names + var_name <- tail(col_names, n = 1) + data[, var_name] <- as.numeric(data[, var_name]) + sub_obj$data <- data + return(sub_obj) +} + + +#' Add interaction +#' +#' @param list_var A list of variables (already initialized) +#' @param between_var A vector of variable names to include in the interaction +#' @param mu Either a numeric value or a numeric vector (of length = level) +#' @param sd Either numeric value or NA +#' +#' @return +#' A list with initialized interaction +#' @export +#' +#' @examples +add_interaction <- function(list_var, between_var, mu, sd = NA) { + name_interaction <- paste(between_var, collapse = ":") + check_input2interaction(name_interaction, list_var, between_var, mu, sd) + + # Check the number of variables in the interaction + if (length(between_var) > 3) { + stop("Cannot initialize an interaction with more than 3 variables.") + } + + interactionCombinations <- getNumberOfCombinationsInInteraction(list_var, between_var) + list_var$interactions[[name_interaction]] <- fillInInteraction(list_var, between_var, mu, sd, interactionCombinations) + return(list_var) +} + +#' Check input for interaction +#' +#' @param name_interaction String specifying the name of the interaction (example: "varA:varB") +#' @param list_var A list of variables (already initialized) +#' @param between_var A vector of variable names to include in the interaction +#' @param mu Either a numeric value or a numeric vector (of length = level) +#' @param sd Either numeric value or NA +#' +#' @return +#' NULL (throws an error if the input is invalid) +#' @export +check_input2interaction <- function(name_interaction, list_var, between_var, mu, sd) { + # Check if variables in between_var are declared and initialized + bool_checkInteractionValidity <- function(between_var, list_var) { + nb_varInInteraction <- length(unique(between_var)) + stopifnot(nb_varInInteraction > 1) + existingVar_nb <- getListVar(list_var) %in% between_var %>% sum() + if (existingVar_nb != nb_varInInteraction) { + return(FALSE) + } else { + return(TRUE) + } + } + + bool_valid_interaction <- bool_checkInteractionValidity(between_var, list_var) + if (!bool_valid_interaction) { + stop("At least one variable in between_var is not declared. Variable not initialized cannot be used in an interaction.") + } + + requestedNumberOfValues <- getNumberOfCombinationsInInteraction(list_var, between_var) + if (is.na(sd) && requestedNumberOfValues != length(mu)) { + msg_e <- "sd was specified as NA. mu should have the same length as the possible number of interactions:\n" + msg_e2 <- paste(requestedNumberOfValues, "interaction values are requested.") + stop(paste(msg_e, msg_e2)) + } + + level <- requestedNumberOfValues + inputs_checking(list_var$interactions, name_interaction, mu, sd, level) +} + +#' Get the number of combinations in an interaction +#' +#' @param list_var A list of variables (already initialized) +#' @param between A vector of variable names to include in the interaction +#' +#' @return +#' The number of combinations in the interaction +#' @export +getNumberOfCombinationsInInteraction <- function(list_var, between) { + levelInlistVar <- getGivenAttribute(list_var, "level") %>% unlist() + n_combinations <- prod(levelInlistVar[between]) + return(n_combinations) +} + +#' getGridCombination +#' +#' Generates all possible combinations of labels. +#' +#' @param l_labels List of label vectors +#' +#' @return A data frame with all possible combinations of labels +#' @export +#' +#' @examples +#' l_labels <- list( +#' c("A", "B", "C"), +#' c("X", "Y") +#' ) +#' getGridCombination(l_labels) +getGridCombination <- function(l_labels) { + grid <- expand.grid(l_labels) + colnames(grid) <- paste("label", seq_along(l_labels), sep = "_") + return(grid) +} + + + +#' Get grid combination from list_var +#' +#' @param list_var A list of variables (already initialized) +#' +#' @return +#' The grid combination between variable in list_var +#' @export +generateGridCombination_fromListVar <- function (list_var){ + l_levels <- getGivenAttribute(list_var, "level") %>% unlist() + vars <- names(l_levels) + l_levels <- l_levels[vars] + l_labels <- getLabels(l_variables2labelized = vars, l_nb_label = l_levels) + gridComb <- getGridCombination(l_labels) + colnames(gridComb) <- paste("label", vars, sep = "_") + return(gridComb) +} + + +#' Fill in interaction +#' +#' @param list_var A list of variables (already initialized) +#' @param between A vector of variable names to include in the interaction +#' @param mu Either a numeric value or a numeric vector (of length = level) +#' @param sd Either numeric value or NA +#' @param level Number of interactions +#' +#' @return +#' A data frame with the filled-in interaction values +#' @export +fillInInteraction <- function(list_var, between, mu, sd, level) { + if (length(mu) > 1 || is.na(sd)) { + l_levels <- getGivenAttribute(list_var, "level") %>% unlist() + l_levelsOfInterest <- l_levels[between] + l_labels_varOfInterest <- getLabels(l_variables2labelized = between, l_nb_label = l_levelsOfInterest ) + + grid_combination <- getGridCombination(l_labels_varOfInterest) + n_combinations <- dim(grid_combination)[1] + column_names <- c(paste("label", between, sep = "_"), paste(between, collapse = ":")) + sub_dtf <- build_sub_obj_return_to_user(level = n_combinations, + metaData = grid_combination, + effectsGivenByUser = mu, + col_names = column_names) + } else { + sub_dtf <- list(mu = mu, sd = sd, level = level) %>% as.data.frame() + } + + return(sub_dtf) +} + +#' Get the list of variable names +#' +#' @param input R list, e.g., output of init_variable +#' +#' @return +#' A character vector with the names of variables +getListVar <- function(input) attributes(input)$names + +#' Get a given attribute from a list of variables +#' +#' @param list_var A list of variables (already initialized) +#' @param attribute A string specifying the attribute to retrieve in all occurrences of the list +#' +#' @return +#' A list without NULL values +getGivenAttribute <- function(list_var, attribute) { + l <- lapply(list_var, FUN = function(var) var[[attribute]]) + l_withoutNull <- l[!vapply(l, is.null, logical(1))] + return(l_withoutNull) +} + + +#' @return +#' A list of labels per variable +#' Get labels for variables +#' +#' @param l_variables2labelized A list of variables +#' @param l_nb_label A list of numeric values representing the number of levels per variable +#' +#' @return +#' A list of labels per variable +getLabels <- function(l_variables2labelized, l_nb_label) { + getVarNameLabel <- function(name, level) { + list_label <- paste(name, 1:level, sep = "") + return(list_label) + } + + listLabels <- Map(getVarNameLabel, l_variables2labelized, l_nb_label) + return(listLabels) +} + +``` + + +```{r tests-init_variable} + +test_that("endsWithDigit returns the correct result", { + expect_true(endsWithDigit("abc123")) + expect_false(endsWithDigit("xyz")) +}) + +test_that("removeDigitsAtEnd removes digits at the end of a string", { + expect_equal(removeDigitsAtEnd("abc123"), "abc") + expect_equal(removeDigitsAtEnd("xyz"), "xyz") +}) + + +test_that("init_variable initializes a variable correctly", { + # Test case 1: Initialize a variable with default parameters + list_var <- init_variable() + expect_true("myVariable" %in% names(list_var)) + expect_equal(nrow(list_var$myVariable$data), 2) + + # Test case 2: Initialize a variable with custom parameters + list_var <- init_variable(name = "custom_variable", mu = c(1, 2, 3), sd = 0.5, level = 3) + expect_true("customvariable" %in% names(list_var)) + expect_equal(nrow(list_var$customvariable$data), 3) +}) + +test_that("inputs_checking performs input validation", { + + # Test case 1: Invalid inputs - sd is NA but mu has unique values + expect_error(inputs_checking(list_var = c(), name = "myVariable", mu = 2, sd = NA, level = 2)) + + # Test case 2: Invalid inputs - empty name + expect_error(inputs_checking(list_var = c(), name = "", mu = 2, sd = NA, level = 2)) + + # Test case 3: Invalid inputs - non-numeric mu + expect_error(inputs_checking(list_var = c(), name = "myVariable", mu = "invalid", sd = NA, level = 2)) + + # Test case 4: Invalid inputs - non-numeric sd + expect_error(inputs_checking(list_var = c(), name = "myVariable", mu = 2, sd = "invalid", level = 2)) + + # Test case 5: Invalid inputs - level less than 2 + expect_error(inputs_checking(list_var = c(), name = "myVariable", mu = 2, sd = NA, level = 1)) + + # Test case 6: Invalid inputs - mu and level have different lengths + expect_error(inputs_checking(list_var = c(), name = "myVariable", mu = c(1, 2, 3), sd = NA, level = 2)) + + # Test case 7: Valid inputs + expect_silent(inputs_checking(list_var = c(), name = "myVariable", mu = c(1, 2, 3), sd = NA, level = 3)) +}) + + + +test_that("already_init_variable checks if a variable is already initialized", { + list_var <- init_variable() + + # Test case 1: Variable not initialized + list_var <- init_variable(name = "custom_variable", mu = c(2, 3), sd = NA, level = 2) + expect_true(already_init_variable(list_var, "customvariable")) + + # Test case 2: Variable already initialized + expect_false(already_init_variable(list_var, "myVariable")) + +}) + +test_that("fillInVariable fills in variable correctly", { + # Test case 1: Effects given by user + sub_obj <- fillInVariable("myVariable", c(1, 2, 3), NA, NA) + expect_equal(sub_obj$level, 3) + expect_equal(ncol(sub_obj$data), 2) + + # Test case 2: Effects simulated using mvrnorm + sub_obj <- fillInVariable("myVariable", 2, 0.5, 3) + expect_equal(sub_obj$level, 3) + expect_equal(sub_obj$sd, 0.5) + expect_equal(sub_obj$mu, 2) +}) + +test_that("build_sub_obj_return_to_user returns the expected output", { + level <- 3 + metaData <- paste("label", 1:level, sep = "_") + effectsGivenByUser <- c(2, 3, 4) + col_names <- c("metadata", "effects") + + result <- build_sub_obj_return_to_user(level, metaData, effectsGivenByUser, col_names) + + expect_equal(result$level, level) + expect_identical(result$data$metadata, metaData) + expect_identical(result$data$effects, effectsGivenByUser) + + +}) + +test_that("generateGridCombination_fromListVar returns expected output", { + result <- generateGridCombination_fromListVar(init_variable()) + expect <- data.frame(label_myVariable = c("myVariable1", "myVariable2")) + expect_equal(nrow(result), nrow(expect)) + expect_equal(ncol(result), ncol(expect)) +}) + +test_that("add_interaction adds an interaction between variables", { + list_var <- init_variable(name = "varA", mu = 1, sd = 1, level = 2) + list_var <- init_variable(list_var, name = "varB", mu = 2, sd = 1, level = 3) + list_var <- add_interaction(list_var, between_var = c("varA", "varB"), mu = 0.5, sd = 3) + expect_true("varA:varB" %in% names(list_var$interactions)) +}) + +test_that("add_interaction throws an error for invalid variables", { + list_var <- init_variable(name = "varA", mu = 1, sd = 1, level = 2) + expect_error(add_interaction(list_var, between_var = c("varA", "varB"), mu = 0.5, sd = NA)) +}) + + +test_that("getNumberOfCombinationsInInteraction calculates the number of combinations", { + list_var <- init_variable(name = "varA", mu = 1, sd = 1, level = 2) + list_var <- init_variable(list_var, name = "varB", mu = 2, sd = 1, level = 3) + expect_equal(getNumberOfCombinationsInInteraction(list_var, c("varA", "varB")), 6) +}) + +test_that("getLabels generates labels for variables", { + labels <- getLabels(c("varA", "varB"), c(2, 3)) + expect_equal(length(labels), 2) + expect_equal(length(labels[[1]]), 2) + expect_equal(length(labels[[2]]), 3) +}) + +test_that("getGridCombination generates a grid of combinations", { + labels <- list(A = c("A1", "A2"), B = c("B1", "B2", "B3")) + grid_combination <- getGridCombination(labels) + expect_equal(dim(grid_combination), c(6, 2)) +}) + +``` + +```{r function-mvrnorm, filename = "datafrommvrnorm_manipulations" } +#' getInput2mvrnorm +#' +#' @inheritParams init_variable +#' +#' @return +#' a list that can be used as input for MASS::mvrnorm +#' @export +#' +#' @examples +#' list_var <- init_variable(name = "my_var", mu = 0, sd = 2, level = 3) +#' getInput2mvrnorm(list_var) +getInput2mvrnorm <- function(list_var){ + # -- pick up sd provided by user + variable_standard_dev <- getGivenAttribute(list_var, attribute = "sd") %>% unlist() + interaction_standard_dev <- getGivenAttribute(list_var$interactions, attribute = "sd") %>% unlist() + list_stdev_2covmatx <- c(variable_standard_dev, interaction_standard_dev) + if (is.null(list_stdev_2covmatx)) ## NO SD provided + return(list(mu = NULL, covMatrix = NULL)) + + # - COV matrix + covar_userProvided = getGivenAttribute(list_var$correlations, "covar") + covMatrix <- getCovarianceMatrix(list_stdev_2covmatx, covar_userProvided) + + # -- MU + variable_mu <- getGivenAttribute(list_var, attribute = "mu") %>% unlist() + interaction_mu <- getGivenAttribute(list_var$interactions, attribute = "mu") %>% unlist() + list_mu <- c(variable_mu, interaction_mu) + + return(list(mu = list_mu, covMatrix = covMatrix)) + +} + + +#' getCovarianceMatrix +#' @param list_stdev standard deviation list +#' @param list_covar covariance list +#' +#' @return +#' covariance matrix +#' @export +#' +#' @examples +#' vector_sd <- c(1,2, 3) +#' names(vector_sd) <- c("varA", "varB", "varC") +#' vector_covar <- c(8, 12, 24) +#' names(vector_covar) <- c("varA.varB", "varA.varC", "varB.varC") +#' covMatrix <- getCovarianceMatrix(vector_sd, vector_covar) +getCovarianceMatrix <- function(list_stdev, list_covar){ + # -- cov(A, A) = sd(A)^2 + diag_cov <- list_stdev^2 + dimension <- length(diag_cov) + covariance_matrix <- matrix(0,nrow = dimension, ncol = dimension) + diag(covariance_matrix) <- diag_cov + colnames(covariance_matrix) <- paste("label", names(diag_cov), sep = "_") + rownames(covariance_matrix) <- paste("label", names(diag_cov), sep = "_") + names_covaration <- names(list_covar) + + ###### -- utils -- ##### + convertDF <- function(name, value){ + ret <- data.frame(value) + colnames(ret) <- name + ret + } + + ## -- needed to use reduce after ;) + l_covarUserDf <- lapply(names_covaration, function(n_cov) convertDF(n_cov, list_covar[n_cov] )) + covariance_matrix2ret <- Reduce(fillInCovarMatrice, x = l_covarUserDf, init = covariance_matrix) + covariance_matrix2ret +} + + +#' Fill in Covariance Matrix +#' +#' This function updates the covariance matrix with the specified covariance value between two variables. +#' +#' @param covarMatrice The input covariance matrix. +#' @param covar A data frame containing the covariance value between two variables. +#' @return The updated covariance matrix with the specified covariance value filled in. +#' @export +#' @examples +#' covarMat <- matrix(0, nrow = 3, ncol = 3) +#' colnames(covarMat) <- c("label_varA", "label_varB", "label_varC") +#' rownames(covarMat) <- c("label_varA", "label_varB", "label_varC") +#' covarValue <- data.frame("varA.varB" = 0.5) +#' fillInCovarMatrice(covarMatrice = covarMat, covar = covarValue) +fillInCovarMatrice <- function(covarMatrice, covar){ + varsInCovar <- strsplit(colnames(covar), split = "[.]") %>% unlist() + index_matrix <- paste("label",varsInCovar, sep = "_") + covar_value <- covar[1,1] + covarMatrice[index_matrix[1], index_matrix[2]] <- covar_value + covarMatrice[index_matrix[2], index_matrix[1]] <- covar_value + return(covarMatrice) +} + + +#' Check if a matrix is positive definite +#' This function checks whether a given matrix is positive definite, i.e., all of its eigenvalues are positive. +#' @param mat The matrix to be checked. +#' @return A logical value indicating whether the matrix is positive definite. +#' @export +#' @examples +#' # Create a positive definite matrix +#' mat1 <- matrix(c(4, 2, 2, 3), nrow = 2) +#' is_positive_definite(mat1) +#' # Expected output: TRUE +#' +#' # Create a non-positive definite matrix +#' mat2 <- matrix(c(4, 2, 2, -3), nrow = 2) +#' is_positive_definite(mat2) +#' # Expected output: FALSE +#' +#' # Check an empty matrix +#' mat3 <- matrix(nrow = 0, ncol = 0) +#' is_positive_definite(mat3) +#' # Expected output: TRUE +#' +#' @export +is_positive_definite <- function(mat) { + if (nrow(mat) == 0 && ncol(mat) == 0) return(TRUE) + eigenvalues <- eigen(mat)$values + all(eigenvalues > 0) +} + + + +#' getGeneMetadata +#' +#' @inheritParams init_variable +#' @param n_genes Number of genes to simulate +#' +#' @return +#' metadata matrix +#' +#' @export +#' +#' @examples +#' list_var <- init_variable() +#' metadata <- getGeneMetadata(list_var, n_genes = 10) +getGeneMetadata <- function(list_var, n_genes) { + metaData <- generateGridCombination_fromListVar(list_var) + n_combinations <- dim(metaData)[1] + genes_vec <- base::paste("gene", 1:n_genes, sep = "") + geneID <- rep(genes_vec, each = n_combinations) + metaData <- cbind(geneID, metaData) + + return(metaData) +} + + +#' getDataFromMvrnorm +#' +#' @inheritParams init_variable +#' @param input2mvrnorm list with mu and covariance matrix, output of getInput2mvrnorm +#' @param n_genes Number of genes to simulate +#' +#' @return +#' data simulated from multivariate normal distribution +#' +#' @export +#' +#' @examples +#' list_var <- init_variable() +#' input <- getInput2mvrnorm(list_var) +#' simulated_data <- getDataFromMvrnorm(list_var, input, n_genes = 10) +getDataFromMvrnorm <- function(list_var, input2mvrnorm, n_genes = 1) { + if (is.null(input2mvrnorm$covMatrix)) + return(list()) + + metaData <- getGeneMetadata(list_var, n_genes) + n_tirages <- dim(metaData)[1] + + mtx_mvrnormSamplings <- samplingFromMvrnorm(n_samplings = n_tirages, + l_mu = input2mvrnorm$mu, matx_cov = input2mvrnorm$covMatrix) + + dataFromMvrnorm <- cbind(metaData, mtx_mvrnormSamplings) + + return(list(dataFromMvrnorm)) +} + + +#' getDataFromMvrnorm +#' +#' @param n_samplings number of samplings using mvrnorm +#' @param l_mu vector of mu +#' @param matx_cov covariance matrix +#' +#' @return +#' samples generated from multivariate normal distribution +#' +#' @export +#' +#' @examples +#' n <- 100 +#' mu <- c(0, 0) +#' covMatrix <- matrix(c(1, 0.5, 0.5, 1), ncol = 2) +#' samples <- samplingFromMvrnorm(n_samplings = n, l_mu = mu, matx_cov = covMatrix) +samplingFromMvrnorm <- function(n_samplings, l_mu, matx_cov) { + mvrnormSamp <- MASS::mvrnorm(n = n_samplings, mu = l_mu, Sigma = matx_cov, empirical = TRUE) + + return(mvrnormSamp) +} + +``` + +```{r tests-mvrnorm} +test_that("getInput2mvrnorm returns the correct list", { + list_var <- init_variable() + input <- getInput2mvrnorm(list_var) + expect_is(input, "list") + expect_true("mu" %in% names(input)) + expect_true("covMatrix" %in% names(input)) +}) + + +test_that("fillInCovarMatrice returns the correct matrix", { + covarMat <- matrix(0, nrow = 3, ncol = 3) + colnames(covarMat) <- c("label_varA", "label_varB", "label_varC") + rownames(covarMat) <- c("label_varA", "label_varB", "label_varC") + covarValue <- data.frame("varA.varB" = 18) + matrice <- fillInCovarMatrice(covarMatrice = covarMat, covar = covarValue) + + expected_matrice <- matrix(0, nrow = 3, ncol = 3) + colnames(expected_matrice) <- c("label_varA", "label_varB", "label_varC") + rownames(expected_matrice) <- c("label_varA", "label_varB", "label_varC") + expected_matrice["label_varA", "label_varB"] <- 18 + expected_matrice["label_varB", "label_varA"] <- 18 + expect_identical(matrice, expected_matrice) +}) + +test_that("getCovarianceMatrix returns the correct covariance matrix", { + vector_sd <- c(1,2, 3) + names(vector_sd) <- c("varA", "varB", "varC") + vector_covar <- c(8, 12, 24) + names(vector_covar) <- c("varA.varB", "varA.varC", "varB.varC") + covMatrix <- getCovarianceMatrix(vector_sd, vector_covar) + + expect_is(covMatrix, "matrix") + expect_equal(dim(covMatrix), c(3, 3)) + expected_matrix <- matrix(c(1,8,12,8,4,24, 12,24,9), nrow = 3, byrow = T) + rownames(expected_matrix) <- c("label_varA", "label_varB", "label_varC") + colnames(expected_matrix) <- c("label_varA", "label_varB", "label_varC") + expect_equal(expected_matrix, covMatrix) +}) + +test_that("getGeneMetadata returns the correct metadata", { + list_var <- init_variable() + n_genes <- 10 + metadata <- getGeneMetadata(list_var, n_genes) + expect_is(metadata, "data.frame") + expect_equal(colnames(metadata), c("geneID", paste("label", (attributes(list_var)$names), sep ="_"))) + expect_equal(nrow(metadata), n_genes * list_var$myVariable$level) +}) + +test_that("getDataFromMvrnorm returns the correct data", { + list_var <- init_variable(name = "varA", mu = 1, sd = 4, level = 3) %>% init_variable("varB", mu = 2, sd = 1, level = 2) + input <- getInput2mvrnorm(list_var) + n_genes <- 10 + n_samplings <- n_genes * (list_var$varA$level ) * (list_var$varB$level ) + data <- getDataFromMvrnorm(list_var, input, n_genes) + expect_is(data, "list") + expect_equal(length(data), 1) + expect_is(data[[1]], "data.frame") + expect_equal(nrow(data[[1]]), n_samplings) + +}) + +test_that("getDataFromMvrnomr returns empty list",{ + list_var <- init_variable() + input <- getInput2mvrnorm(list_var) + n_genes <- 10 + n_samplings <- n_genes * (list_var$varA$level ) * (list_var$varB$level ) + data <- getDataFromMvrnorm(list_var, input, n_genes) + expect_is(data, "list") + expect_equal(data, list()) +}) + +test_that("samplingFromMvrnorm returns the correct sampling", { + n_samplings <- 100 + l_mu <- c(1, 2) + matx_cov <- matrix(c(1, 0.5, 0.5, 1), ncol = 2) + sampling <- samplingFromMvrnorm(n_samplings, l_mu, matx_cov) + + expect_is(sampling, "matrix") + expect_equal(dim(sampling), c(n_samplings, length(l_mu))) +}) + + +``` + +```{r function-dataFromUser, filename = "datafromUser_manipulations"} + +#' Get data from user +#' +#' +#' @param list_var A list of variables (already initialized) +#' @return A list of data to join +#' @export +#' +#' @examples +#' getDataFromUser(init_variable()) +getDataFromUser <- function(list_var) { + variable_data2join <- getGivenAttribute(list_var, "data") + id_var2join <- names(variable_data2join) + interaction_data2join <- getGivenAttribute(list_var$interactions, "data") + id_interaction2join <- names(interaction_data2join) + + data2join <- list(variable_data2join, interaction_data2join) %>% + unlist(recursive = FALSE) + id2join <- c(id_var2join, id_interaction2join) + l_data2join <- lapply(id2join, function(id) data2join[[id]]) + + return(l_data2join) +} + +``` + +```{r test-dataFromUser} +# Test unitaires pour la fonction join_dtf +test_that("join_dtf réalise la jointure correctement", { + # Création de données de test + df1 <- data.frame(id = 1:5, value = letters[1:5]) + df2 <- data.frame(id = 1:5, category = LETTERS[1:5]) + + # Exécution de la fonction + result <- join_dtf(df1, df2, "id", "id") + + # Vérification des résultats + expect_true(is.data.frame(result)) + expect_equal(nrow(result), 5) + expect_equal(ncol(result), 3) + expect_equal(names(result), c("id", "value", "category")) + expect_true(all.equal(result$id, df1$id)) + expect_true(all.equal(result$id, df2$id)) +}) + + +# Test unitaires pour la fonction getDataFromUser +test_that("getDataFromUser renvoie les données appropriées", { + # Exécution de la fonction + list_var <- init_variable() + list_var <- init_variable(list_var, "second_var") + result <- getDataFromUser(list_var) + + # Vérification des résultats + expect_true(is.list(result)) + expect_equal(length(result), 2) + expect_true(all(sapply(result, is.data.frame))) + expect_equal(names(result[[1]]), c("label_myVariable", "myVariable")) +}) +``` + + +## Perform simulation from data + +```{r function-simulation , filename = "simulation"} +#' Get input for simulation based on coefficients +#' +#' This function generates input data for simulation based on the coefficients provided in the \code{list_var} argument. +#' +#' @param list_var A list of variables (already initialized) +#' @param n_genes Number of genes to simulate (default: 1) +#' @param input2mvrnorm Input to the \code{mvrnorm} function for simulating data from multivariate normal distribution (default: NULL) +#' @return A data frame with input coefficients for simulation +#' @export +#' @examples +#' # Example usage +#' list_var <- init_variable() +#' getInput2simulation(list_var, n_genes = 10) +getInput2simulation <- function(list_var, n_genes = 1, input2mvrnorm = NULL) { + + # Use default input to mvrnorm if not provided by the user + if (is.null(input2mvrnorm)) input2mvrnorm = getInput2mvrnorm(list_var) + + l_dataFromMvrnorm = getDataFromMvrnorm(list_var, input2mvrnorm, n_genes) + l_dataFromUser = getDataFromUser(list_var) + df_input2simu <- getCoefficients(list_var, l_dataFromMvrnorm, l_dataFromUser, n_genes) + return(df_input2simu) +} + +#' getCoefficients +#' +#' Get the coefficients. +#' +#' @param list_var A list of variables (already initialized) +#' @param l_dataFromMvrnorm Data from the `getGeneMetadata` function (optional). +#' @param l_dataFromUser Data from the `getDataFromUser` function (optional). +#' @param n_genes The number of genes. +#' @export +#' @return A dataframe containing the coefficients. +#' @examples +#' # Example usage +#' list_var <- init_variable() +#' input2mvrnorm = getInput2mvrnorm(list_var) +#' l_dataFromMvrnorm = getDataFromMvrnorm(list_var, input2mvrnorm, n_genes) +#' l_dataFromUser = getDataFromUser(list_var) +#' getCoefficients(list_var, l_dataFromMvrnorm, l_dataFromUser, n_genes = 3) +getCoefficients <- function(list_var, l_dataFromMvrnorm, l_dataFromUser, n_genes) { + if (length(l_dataFromMvrnorm) == 0) { + metaData <- getGeneMetadata(list_var, n_genes) + l_dataFromMvrnorm <- list(metaData) + } + l_df2join <- c(l_dataFromMvrnorm, l_dataFromUser) + + + df_coef <- Reduce(function(d1, d2){ column_names = colnames(d2) + idx_key = grepl(pattern = "label", column_names ) + keys = column_names[idx_key] + join_dtf(d1, d2, k1 = keys , k2 = keys) + } + , l_df2join ) %>% as.data.frame() + column_names <- colnames(df_coef) + idx_column2factor <- grep(pattern = "label_", column_names) + + if (length(idx_column2factor) > 1) { + df_coef[, idx_column2factor] <- lapply(df_coef[, idx_column2factor], as.factor) + } else { + df_coef[, idx_column2factor] <- as.factor(df_coef[, idx_column2factor]) + } + + return(df_coef) +} + + +#' Get the log_qij values from the coefficient data frame. +#' +#' @param dtf_coef The coefficient data frame. +#' @return The coefficient data frame with log_qij column added. +#' @export +getLog_qij <- function(dtf_coef) { + dtf_beta_numeric <- dtf_coef[sapply(dtf_coef, is.numeric)] + dtf_coef$log_qij <- rowSums(dtf_beta_numeric, na.rm = TRUE) + return(dtf_coef) +} + + +#' Calculate mu_ij values based on coefficient data frame and scaling factor +#' +#' This function calculates mu_ij values by raising 2 to the power of the log_qij values +#' from the coefficient data frame and multiplying it by the provided scaling factor. +#' +#' @param dtf_coef Coefficient data frame containing the log_qij values +#' @param scaling_factor Scaling factor to multiply the calculated mu_ij values +#' +#' @return Coefficient data frame with an additional mu_ij column +#' +#' @examples +#' list_var <- init_variable() +#' N_GENES <- 5 +#' dtf_coef <- getInput2simulation(list_var, N_GENES) +#' dtf_coef <- getLog_qij(dtf_coef) +#' dtf_coef <- addBasalExpression(dtf_coef, N_GENES, c(10, 20, 0)) +#' getMu_ij(dtf_coef) +#' @export +getMu_ij <- function(dtf_coef) { + log_qij_scaled <- dtf_coef$log_qij + dtf_coef$basalExpr + dtf_coef$log_qij_scaled <- log_qij_scaled + mu_ij <- exp(log_qij_scaled) + dtf_coef$mu_ij <- mu_ij + return(dtf_coef) +} + +#' getMu_ij_matrix +#' +#' Get the Mu_ij matrix. +#' +#' @param dtf_coef A dataframe containing the coefficients. +#' @importFrom reshape2 dcast + +#' @export +#' @return A Mu_ij matrix. +getMu_ij_matrix <- function(dtf_coef) { + column_names <- colnames(dtf_coef) + idx_var <- grepl(pattern = "label", column_names) + l_var <- column_names[idx_var] + str_formula_rigth <- paste(l_var, collapse = " + ") + if (str_formula_rigth == "") stop("no variable label detected") + str_formula <- paste(c("geneID", str_formula_rigth), collapse = " ~ ") + formula <- as.formula(str_formula) + dtf_Muij <- dtf_coef %>% reshape2::dcast(., formula, value.var = "mu_ij", drop = F) + dtf_Muij[is.na(dtf_Muij)] <- 0 + mtx_Muij <- data.frame(dtf_Muij[, -1], row.names = dtf_Muij[, 1]) %>% as.matrix() + mtx_Muij <- mtx_Muij[, order(colnames(mtx_Muij)), drop = F] + return(mtx_Muij) +} + +#' getSubCountsTable +#' +#' Get the subcounts table. +#' +#' @param matx_Muij The Mu_ij matrix. +#' @param matx_dispersion The dispersion matrix. +#' @param replicateID The replication identifier. +#' @param l_bool_replication A boolean vector indicating the replicates. +#' @return A subcounts table. +getSubCountsTable <- function(matx_Muij, matx_dispersion, replicateID, l_bool_replication) { + getKijMatrix <- function(matx_Muij, matx_dispersion, n_genes, n_samples) { + k_ij <- stats::rnbinom(n_genes * n_samples, + size = matx_dispersion, + mu = matx_Muij) %>% + matrix(., nrow = n_genes, ncol = n_samples) + + k_ij[is.na(k_ij)] <- 0 + return(k_ij) + } + + if (!any(l_bool_replication)) + return(NULL) + + matx_Muij <- matx_Muij[, l_bool_replication, drop = FALSE] + matx_dispersion <- matx_dispersion[, l_bool_replication, drop = FALSE] + l_sampleID <- colnames(matx_Muij) + l_geneID <- rownames(matx_Muij) + dimension_mtx <- dim(matx_Muij) + n_genes <- dimension_mtx[1] + n_samples <- dimension_mtx[2] + matx_kij <- getKijMatrix(matx_Muij, matx_dispersion, n_genes, n_samples) + colnames(matx_kij) <- paste(l_sampleID, replicateID, sep = "_") + rownames(matx_kij) <- l_geneID + return(matx_kij) +} + + +``` + +```{r test-simulation} + + +# Test case 1: Check if the function returns a data frame +test_that("getInput2simulation returns a data frame", { + list_var <- init_variable() + result <- getInput2simulation(list_var) + expect_is(result, "data.frame") + expected <- data.frame(geneID = c("gene1", "gene1"), label_myVariable = as.factor(c("myVariable1", "myVariable2")), myVariable = c(2,3)) + expect_equal(result, expected) + }) + +# Test for getCoefficients function +test_that("getCoefficients returns the correct output", { + # Create dummy data + n_genes <- 3 + list_var = init_variable() + # Call the function + coefficients <- getCoefficients(list_var, list(), list(), n_genes) + + # Check the output + expect_equal(nrow(coefficients), n_genes*list_var$myVariable$level) + expect_equal(colnames(coefficients), c("geneID", "label_myVariable")) +}) + +# Test for getMu_ij_matrix function +test_that("getMu_ij_matrix returns the correct output", { + # Create a dummy coefficients dataframe + dtf_coef <- data.frame(geneID = c("Gene1", "Gene1", "Gene1"), + label_varA = c("A1", "A2", "A3"), + label_varB = c("B1", "B2", "B3"), + mu_ij = c(1, 2, 3)) + + # Call the function + mu_matrix <- getMu_ij_matrix(dtf_coef) + # Check the output + expect_equal(dim(mu_matrix), c(1, 9)) + +}) + +# Test for getSubCountsTable function +test_that("getSubCountsTable returns the correct output", { + # Create dummy data + l_genes <- c("gene1", "gene2", "gene3") + matx_Muij = data.frame(sple1 = c(1,3,4), sple2 = c(2, 0, 9), sple3 = c(1, 69, 2)) %>% as.matrix() + rownames(matx_Muij) <- l_genes + matx_dispersion <- matrix(0.5, nrow = 3, ncol = 3) + replicateID <- 1 + l_bool_replication <- c(TRUE, FALSE, TRUE) + + # Call the function + subcounts_table <- getSubCountsTable(matx_Muij, matx_dispersion, 1, l_bool_replication) + + # Check the output + expect_equal(dim(subcounts_table), c(3, 2)) + expect_equal(rownames(subcounts_table), l_genes) +}) + + +``` + +## Simulation + +```{r function-simulation2 , filename = "simulation2"} + +#' getReplicationMatrix +#' +#' @param minN Minimum number of replicates for each sample +#' @param maxN Maximum number of replicates for each sample +#' @param n_samples Number of samples +#' @export +#' @return A replication matrix indicating which samples are replicated +getReplicationMatrix <- function(minN, maxN, n_samples) { + + # Create a list of logical vectors representing the minimum number of replicates + l_replication_minimum = lapply(1:n_samples, + FUN = function(i) rep(TRUE, times = minN) ) + + # Create a list of random logical vectors representing additional replicates + l_replication_random = lapply(1:n_samples, + FUN = function(i) sample(x = c(TRUE, FALSE), size = maxN-minN, replace = T) ) + + # Combine the replication vectors into matrices + matx_replication_minimum <- do.call(cbind, l_replication_minimum) + matx_replication_random <- do.call(cbind, l_replication_random) + + # Combine the minimum replicates and random replicates into a single matrix + matx_replication <- rbind(matx_replication_minimum, matx_replication_random) + + # Sort the columns of the replication matrix in descending order + matx_replication = apply(matx_replication, 2, sort, decreasing = TRUE ) %>% matrix(nrow = maxN) + + return(matx_replication) +} + +#' getCountsTable +#' +#' @param matx_Muij Matrix of mean expression values for each gene and sample +#' @param matx_dispersion Matrix of dispersion values for each gene and sample +#' @param matx_bool_replication Replication matrix indicating which samples are replicated +#' +#' @return A counts table containing simulated read counts for each gene and sample +getCountsTable <- function(matx_Muij , matx_dispersion, matx_bool_replication ){ + max_replicates <- dim(matx_bool_replication)[1] + + # Apply the getSubCountsTable function to each row of the replication matrix + l_countsTable = lapply(1:max_replicates, function(i) getSubCountsTable(matx_Muij , matx_dispersion, i, matx_bool_replication[i,] )) + + # Combine the counts tables into a single matrix + countsTable = do.call(cbind, l_countsTable) + + return(countsTable %>% as.data.frame()) +} + +#' getDispersionMatrix +#' +#' @param list_var A list of variables (already initialized) +#' @param n_genes Number of genes +#' @param dispersion Vector of dispersion values for each gene +#' @export +#' +#' @return A matrix of dispersion values for each gene and sample +getDispersionMatrix <- function(list_var, n_genes, dispersion = stats::runif(n_genes, min = 0, max = 1000)){ + l_geneID = base::paste("gene", 1:n_genes, sep = "") + l_sampleID = getSampleID(list_var) + n_samples = length(l_sampleID) + l_dispersion <- dispersion + + # Create a data frame for the dispersion values + dtf_dispersion = list(dispersion = l_dispersion) %>% as.data.frame() + dtf_dispersion <- dtf_dispersion[, rep("dispersion", n_samples)] + rownames(dtf_dispersion) = l_geneID + colnames(dtf_dispersion) = l_sampleID + + matx_dispersion = dtf_dispersion %>% as.matrix() + + return(matx_dispersion) +} + + + + + +#' Replicate rows of a data frame by group +#' +#' Replicates the rows of a data frame based on a grouping variable and replication counts for each group. +#' +#' @param df Data frame to replicate +#' @param group_var Name of the grouping variable in the data frame +#' @param rep_list Vector of replication counts for each group +#' @return Data frame with replicated rows +#' @examples +#' df <- data.frame(group = c("A", "B"), value = c(1, 2)) +#' .replicateByGroup(df, "group", c(2, 3)) +#' +#' @export +.replicateByGroup <- function(df, group_var, rep_list) { + l_group_var <- df[[group_var]] + group_levels <- unique(l_group_var) + names(rep_list) <- group_levels + group_indices <- rep_list[l_group_var] + replicated_indices <- rep(seq_len(nrow(df)), times = group_indices) + replicated_df <- df[replicated_indices, ] + suffix_sampleID <- sequence(group_indices) + replicated_df[["sampleID"]] <- paste(replicated_df[["sampleID"]], suffix_sampleID, sep = "_") + rownames(replicated_df) <- NULL + return(replicated_df) +} + + + +#' Replicate rows of a data frame +#' +#' Replicates the rows of a data frame by a specified factor. +#' +#' @param df Data frame to replicate +#' @param n Replication factor for each row +#' @return Data frame with replicated rows +#' @export +#' @examples +#' df <- data.frame(a = 1:3, b = letters[1:3]) +#' .replicateRows(df, 2) +#' +.replicateRows <- function(df, n) { + indices <- rep(seq_len(nrow(df)), each = n) + replicated_df <- df[indices, , drop = FALSE] + rownames(replicated_df) <- NULL + return(replicated_df) +} + +#' Get sample metadata +#' +#' Generates sample metadata based on the input variables, replication matrix, and number of genes. +#' +#' @param list_var A list of variables (already initialized) +#' @param replicationMatrix Replication matrix +#' @param n_genes Number of genes +#' @return Data frame of sample metadata +#' @importFrom data.table setorderv +#' @export +#' @examples +#' list_var <- init_variable() +#' n_genes <- 10 +#' replicationMatrix <- generateReplicationMatrix(list_var ,2, 3) +#' getSampleMetadata(list_var, n_genes, replicationMatrix) +getSampleMetadata <- function(list_var, n_genes, replicationMatrix) { + l_sampleIDs = getSampleID(list_var) + metaData <- generateGridCombination_fromListVar(list_var) + metaData[] <- lapply(metaData, as.character) ## before reordering + data.table::setorderv(metaData, cols = colnames(metaData)) + metaData[] <- lapply(metaData, as.factor) + metaData$sampleID <- l_sampleIDs + rep_list <- colSums(replicationMatrix) + metaData$sampleID <- as.character(metaData$sampleID) ## before replicating + sampleMetadata <- .replicateByGroup(metaData, "sampleID", rep_list) + colnames(sampleMetadata) <- gsub("label_", "", colnames(sampleMetadata)) + return(sampleMetadata) +} + + +#' getSampleID +#' +#' @param list_var A list of variables (already initialized) +#' @export +#' @return A sorted vector of sample IDs +getSampleID <- function(list_var){ + gridCombination <- generateGridCombination_fromListVar(list_var) + l_sampleID <- apply( gridCombination , 1 , paste , collapse = "_" ) %>% unname() + return(sort(l_sampleID)) +} + + +``` + +```{r test-simulations} + +test_that("getReplicationMatrix returns the correct replication matrix", { + minN <- 2 + maxN <- 4 + n_samples <- 3 + expected <- matrix(c(TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, FALSE, TRUE, TRUE, TRUE, FALSE), nrow = maxN) + + set.seed(123) + result <- getReplicationMatrix(minN, maxN, n_samples) + + expect_equal(result, expected) +}) + +test_that("getSampleID return the correct list of sampleID",{ + expect_equal(getSampleID(init_variable()), c("myVariable1", "myVariable2")) +}) + +# Create a test case for getMu_ij +test_that("getMu_ij returns the correct output", { + # Create a sample coefficient data frame + dtf_coef <- data.frame( + log_qij = c(1, 9, 0.1), + basalExpr = c(2, 3, 4) + ) + + # Call the getMu_ij function + result <- getMu_ij(dtf_coef) + + # Check if the mu_ij column is added + expect_true("mu_ij" %in% colnames(result)) + + # Check the values of mu_ij + #expected_mu_ij <- c(20.08554, 162754.79142 , 60.34029) + #expect_equal(result$mu_ij, expected_mu_ij, tolerance = 0.000001) +}) + + +# Create a test case for getLog_qij +test_that("getLog_qij returns the correct output", { + # Create a sample coefficient data frame + dtf_coef <- data.frame( + beta1 = c(1.2, 2.3, 3.4), + beta2 = c(0.5, 1.0, 1.5), + non_numeric = c("a", "b", "c") + ) + + # Call the getLog_qij function + result <- getLog_qij(dtf_coef) + + # Check if the log_qij column is added + expect_true("log_qij" %in% colnames(result)) + + # Check the values of log_qij + expected_log_qij <- c(1.7, 3.3, 4.9) + expect_equal(result$log_qij, expected_log_qij) +}) + +test_that("getCountsTable returns the correct counts table", { + mat_mu_ij <- matrix(c(1,2,3,4,5,6), ncol = 3, byrow = T) + rownames(mat_mu_ij) <- c("gene1", "gene2") + colnames(mat_mu_ij) <- c("sample1", "sample2", "sample3") + mat_disp <- matrix(c(0.3,0.3,0.3, 0.5,0.5,0.5), ncol = 3, byrow = T) + rownames(mat_disp) <- c("gene1", "gene2") + colnames(mat_disp) <- c("sample1", "sample2", "sample3") + mat_repl <- matrix(c(TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE), ncol = 3, byrow = T) + + expected_df <- matrix(c(0,0,1,0,0,0,0,1,0,2,34,18,0,0,3,10,7,2), nrow = 2, byrow = T) %>% as.data.frame() + rownames(expected_df) <- c("gene1", "gene2") + colnames(expected_df) <- c("sample1_1", "sample2_1", "sample3_1", "sample1_2", + "sample2_2","sample3_2","sample1_3", "sample2_3" ,"sample3_3") + + set.seed(123) + result <- getCountsTable(mat_mu_ij, mat_disp, mat_repl) + + expect_true(is.data.frame(result)) + expect_equal(colnames(result), colnames(expected_df)) + expect_equal(rownames(result), rownames(expected_df)) + +}) + + + +test_that("getSampleMetadata returns expected output", { + # Set up input variables + list_var <- init_variable() + n_genes <- 3 + replicationMatrix <- matrix(TRUE, nrow = 2, ncol = 2) + + # Run the function + result <- getSampleMetadata(list_var, n_genes, replicationMatrix) + + # Define expected output + expected_colnames <- c("myVariable", "sampleID") + expect_equal(colnames(result), expected_colnames) + + # Check the output class + expect_true(is.data.frame(result)) + + # check nrow output + expect_equal(nrow(result), 4) + +}) + + +test_that(".replicateByGroup return the correct ouptut", { + df <- data.frame(group = c("A", "B"), value = c(1, 2)) + result <- .replicateByGroup(df, "group", c(2, 3)) + + expect <- data.frame(group = c("A", "A", "B", "B", "B"), + value = c(1, 1, 2,2,2), + sampleID = c("_1", "_2", "_1", "_2", "_3" )) + expect_equal(result, expect) + +}) + + +test_that("getDispersionMatrix returns the correct dispersion matrix", { + n_genes = 3 + list_var = init_variable() + dispersion <- 1:3 + expected <- matrix(1:3,byrow = F, nrow = 3, ncol = 2) + rownames(expected) <- c("gene1", "gene2", "gene3") + colnames(expected) <- c("myVariable1", "myVariable2") + result <- getDispersionMatrix(list_var, n_genes, dispersion ) + expect_equal(result, expected) +}) + + + +``` + +## Mock RNAseq + +```{r function-mock , filename = "mock-rnaSeq" } + +#' Check the validity of the dispersion matrix +#' +#' Checks if the dispersion matrix has the correct dimensions. +#' +#' @param matx_dispersion Replication matrix +#' @param matx_bool_replication Replication matrix +#' @return TRUE if the dimensions are valid, FALSE otherwise +#' @export +#' @examples +#' matx_dispersion <- matrix(1:12, nrow = 3, ncol = 4) +#' matx_bool_replication <- matrix(TRUE, nrow = 3, ncol = 4) +#' .isDispersionMatrixValid(matx_dispersion, matx_bool_replication) +.isDispersionMatrixValid <- function(matx_dispersion, matx_bool_replication) { + expected_nb_column <- dim(matx_bool_replication)[2] + if (expected_nb_column != dim(matx_dispersion)[2]) { + return(FALSE) + } + return(TRUE) +} + +#' Generate count table +#' +#' Generates the count table based on the mu_ij matrix, dispersion matrix, and replication matrix. +#' +#' @param mu_ij_matx_rep Replicated mu_ij matrix +#' @param matx_dispersion_rep Replicated dispersion matrix +#' @return Count table +#' @export +#' @examples +#' mu_ij_matx_rep <- matrix(1:12, nrow = 3, ncol = 4) +#' matx_dispersion_rep <- matrix(1:12, nrow = 3, ncol = 4) +#' generateCountTable(mu_ij_matx_rep, matx_dispersion_rep) +generateCountTable <- function(mu_ij_matx_rep, matx_dispersion_rep) { + message("k_ij ~ Nbinom(mu_ij, dispersion)") + n_genes <- dim(mu_ij_matx_rep)[1] + n_samples <- dim(mu_ij_matx_rep)[2] + n_samplings <- prod(n_genes * n_samples) + mat_countsTable <- rnbinom(n_samplings, size = matx_dispersion_rep, mu = mu_ij_matx_rep) %>% + matrix(., nrow = n_genes, ncol = n_samples) + colnames(mat_countsTable) <- colnames(mu_ij_matx_rep) + rownames(mat_countsTable) <- rownames(mu_ij_matx_rep) + mat_countsTable[is.na(mat_countsTable)] <- 0 + return(mat_countsTable) +} + + +#' Perform RNA-seq simulation +#' +#' Simulates RNA-seq data based on the input variables. +#' +#' @param list_var List of input variables +#' @param n_genes Number of genes +#' @param min_replicates Minimum replication count +#' @param max_replicates Maximum replication count +#' @param sequencing_depth Sequencing depth +#' @param basal_expression base expression gene +#' @param matx_dispersion User-provided dispersion matrix (optional) +#' @return List containing the ground truth, counts, and metadata +#' @export +#' @examples +#' mock_rnaseq(list_var = init_variable(), +#' n_genes = 1000, min_replicates = 2, +#' max_replicates = 4) +mock_rnaseq <- function(list_var, n_genes, min_replicates, max_replicates, sequencing_depth = NULL, + basal_expression = 0 , dispersion = stats::runif(n_genes, min = 0, max = 1000) ) { + + ## -- get my effect + df_inputSimulation <- getInput2simulation(list_var, n_genes) + ## -- add column logQij + df_inputSimulation <- getLog_qij(df_inputSimulation) + df_inputSimulation <- addBasalExpression(df_inputSimulation, n_genes, basal_expression) + df_inputSimulation <- getMu_ij(df_inputSimulation ) + + message("Building mu_ij matrix") + ## -- matrix + matx_Muij <- getMu_ij_matrix(df_inputSimulation) + l_sampleID <- getSampleID(list_var) + matx_bool_replication <- generateReplicationMatrix(list_var, min_replicates, max_replicates) + mu_ij_matx_rep <- .replicateMatrix(matx_Muij, matx_bool_replication) + + + dispersion <- getValidDispersion(dispersion) + genes_dispersion <- sample(dispersion , size = n_genes, replace = T) + matx_dispersion <- getDispersionMatrix(list_var, n_genes, genes_dispersion) + l_geneID = base::paste("gene", 1:n_genes, sep = "") + names(genes_dispersion) <- l_geneID + + ## same order as mu_ij_matx_rep + matx_dispersion <- matx_dispersion[ order(row.names(matx_dispersion)), ] + matx_dispersion_rep <- .replicateMatrix(matx_dispersion, matx_bool_replication) + matx_countsTable <- generateCountTable(mu_ij_matx_rep, matx_dispersion_rep) + + message("Counts simulation: Done") + + + dtf_countsTable <- matx_countsTable %>% as.data.frame() + if (!is.null(sequencing_depth)) { + message("Scaling count table according to sequencing depth.") + dtf_countsTable <- scaleCountsTable(dtf_countsTable, sequencing_depth) + } + + metaData <- getSampleMetadata(list_var, n_genes, matx_bool_replication) + libSize <- sum(colSums(dtf_countsTable)) + settings_df <- getSettingsTable(n_genes, min_replicates, max_replicates, libSize) + + list2ret <- list( + settings = settings_df, + init = list_var, + groundTruth = list(effects = df_inputSimulation, gene_dispersion = genes_dispersion), + counts = dtf_countsTable, + metadata = metaData) + return(list2ret) +} + + + + +#' Validate and Filter Dispersion Values +#' +#' This function takes an input vector and validates it to ensure that it meets certain criteria. +#' +#' @param input_vector A vector to be validated. +#' @return A validated and filtered numeric vector. +#' @details The function checks whether the input is a vector, suppresses warnings while converting to numeric, +#' and filters out non-numeric elements. It also checks for values greater than zero and removes negative values. +#' If the resulting vector has a length of zero, an error is thrown. +#' @examples +#' getValidDispersion(c(0.5, 1.2, -0.3, "invalid", 0.8)) +#' @export +getValidDispersion <- function(input_vector) { + # Verify if it's a vector + if (!is.vector(input_vector)) { + stop("dispersion param is not a vector.") + } + + input_vector <- suppressWarnings(as.numeric(input_vector)) + + # Filter numeric elements + numeric_elements <- !is.na(input_vector) + if (sum(!numeric_elements) > 0) { + message("Non-numeric elements were removed from the dispersion vector") + input_vector <- input_vector[numeric_elements] + } + + # Check and filter values > 0 + numeric_positive_elements <- input_vector > 0 + if (sum(!numeric_positive_elements) > 0) { + message("Negative numeric values were removed from the dispersion vector") + input_vector <- input_vector[numeric_positive_elements] + } + + if (length(input_vector) == 0) stop("Invalid dispersion values provided.") + + return(input_vector) +} + + +#' Generate replication matrix +#' +#' Generates the replication matrix based on the minimum and maximum replication counts. +#' +#' @param list_var Number of samples +#' @param min_replicates Minimum replication count +#' @param max_replicates Maximum replication count +#' @return Replication matrix +#' @export +#' @examples +#' list_var = init_variable() +#' generateReplicationMatrix(list_var, min_replicates = 2, max_replicates = 4) +generateReplicationMatrix <- function(list_var, min_replicates, max_replicates) { + if (min_replicates > max_replicates) { + message("min_replicates > max_replicates have been supplied") + message("Automatic reversing") + tmp_min_replicates <- min_replicates + min_replicates <- max_replicates + max_replicates <- tmp_min_replicates + } + l_sampleIDs <- getSampleID(list_var) + n_samples <- length(l_sampleIDs) + return(getReplicationMatrix(min_replicates, max_replicates, n_samples = n_samples)) +} + +#' Replicate matrix +#' +#' Replicates a matrix based on a replication matrix. +#' +#' @param matrix Matrix to replicate +#' @param replication_matrix Replication matrix +#' @return Replicated matrix +#' @export +#' @examples +#' matrix <- matrix(1:9, nrow = 3, ncol = 3) +#' replication_matrix <- matrix(TRUE, nrow = 3, ncol = 3) +#' .replicateMatrix(matrix, replication_matrix) +.replicateMatrix <- function(matrix, replication_matrix) { + n_genes <- dim(matrix)[1] + rep_list <- colSums(replication_matrix) + replicated_indices <- rep(seq_len(ncol(matrix)), times = rep_list) + replicated_matrix <- matrix[, replicated_indices, drop = FALSE] + suffix_sampleID <- sequence(rep_list) + colnames(replicated_matrix) <- paste(colnames(replicated_matrix), suffix_sampleID, sep = "_") + return(replicated_matrix) +} + + +``` + +```{r test-hiddenFunction} + +# Test case: Valid input vector with numeric and positive values +test_that("Valid input vector with numeric and positive values", { + input_vector <- c(0.5, 1.2, 0.8) + result <- getValidDispersion(input_vector) + expect_identical(result, input_vector) +}) + +# Test case: Valid input vector with numeric, positive, and negative values +test_that("Valid input vector with numeric, positive, and negative values", { + input_vector <- c(0.5, -0.3, 1.2, 0.8) + result <- getValidDispersion(input_vector) + expect_identical(result, c(0.5, 1.2, 0.8)) +}) + +# Test case: Valid input vector with non-numeric elements +test_that("Valid input vector with non-numeric elements", { + input_vector <- c(0.5, "invalid", 0.8) + result <- getValidDispersion(input_vector) + expect_identical(result, c(0.5, 0.8)) +}) + +# Test case: Empty input vector +test_that("Empty input vector", { + input_vector <- numeric(0) + expect_error(getValidDispersion(input_vector), "Invalid dispersion values provided.") +}) + +# Test case: unique value in vector +test_that("unique value in vector", { + input_vector <- 5 + expect_equal(getValidDispersion(input_vector), 5) +}) + +# Test case: All negative values +test_that("All negative values", { + input_vector <- c(-0.5, -1.2, -0.8) + expect_error(getValidDispersion(input_vector), "Invalid dispersion values provided.") +}) + + +# Test for .isDispersionMatrixValid +test_that(".isDispersionMatrixValid returns TRUE for valid dimensions", { + matx_dispersion <- matrix(1:6, nrow = 2, ncol = 3) + matx_bool_replication <- matrix(TRUE, nrow = 2, ncol = 3) + expect_true(.isDispersionMatrixValid(matx_dispersion, matx_bool_replication)) +}) + +test_that(".isDispersionMatrixValid throws an error for invalid dimensions", { + matx_dispersion <- matrix(1:4, nrow = 2, ncol = 2) + matx_bool_replication <- matrix(TRUE, nrow = 2, ncol = 3) + expect_false(.isDispersionMatrixValid(matx_dispersion, matx_bool_replication)) +}) + +# Test for generateCountTable +test_that("generateCountTable generates count table with correct dimensions", { + mu_ij_matx_rep <- matrix(1:6, nrow = 2, ncol = 3) + matx_dispersion_rep <- matrix(1:6, nrow = 2, ncol = 3) + count_table <- generateCountTable(mu_ij_matx_rep, matx_dispersion_rep) + expect_equal(dim(count_table), c(2, 3)) +}) + + + +# Test for .replicateMatrix +test_that(".replicateMatrix replicates matrix correctly", { + matrix <- matrix(1:9, nrow = 3, ncol = 3) + replication_matrix <- matrix(TRUE, nrow = 3, ncol = 3) + replicated_matrix <- .replicateMatrix(matrix, replication_matrix) + expect_equal(dim(replicated_matrix), c(3, 9)) +}) + +``` + +```{r test-mock} + +# Test for mock_rnaseq +#test_that("mock_rnaseq returns expected output", { + # Set up input variables +# list_var <- NULL +# n_genes <- 3 +# min_replicates <- 2 +# max_replicates <- 4 +# df_inputSimulation <- data.frame(gene_id = 1:3, coef_value = c(0.5, 0.3, 0.2)) +# matx_dispersion <- matrix(1:9, nrow = 3, ncol = 3) + + # Run the function +# expect_error(mock_rnaseq(list_var, n_genes, min_replicates, max_replicates, df_inputSimulation, +# matx_dispersion)) + + + #list_var <- init_variable(name = "my_var", mu = c(10, 20), level = 2 ) + #n_genes <- 10 + #min_replicates <- 2 + #max_replicates <- 4 + #scaling_factor <- 1 + #df_inputSimulation <- getInput2simulation(list_var, n_genes) + #dispersion <- getDispersionMatrix(list_var, n_genes, c(1000, 1000, 1000, 1000, 1000, 1, 1, 1, 1, 1)) + #mock_rnaseq(list_var, n_genes, min_replicates, + # max_replicates, + # df_inputSimulation, dispersion) + #ERROOR +#}) + + +# Test for generateReplicationMatrix +test_that("generateReplicationMatrix generates replication matrix correctly", { + replication_matrix <- generateReplicationMatrix(init_variable(),min_replicates = 2, max_replicates = 4) + expect_equal(dim(replication_matrix), c(4, 2)) +}) + +``` + + +## Prepare data + +```{r function-preparingData , filename = "prepare_data2fit"} + +#' Convert count matrix to long data frame +#' +#' Converts a count matrix to a long data frame format using geneID as the identifier. +#' +#' @param countMatrix Count matrix +#' @param value_name Name for the value column +#' @param id_vars Name for the id column (default "geneID") +#' @return Long data frame +#' @importFrom reshape2 melt +#' @export +#' @examples +#' list_var <- init_variable() +#' mock_data <- mock_rnaseq(list_var, n_genes = 3, 2, 2) +#' countMatrix_2longDtf(mock_data$counts) +countMatrix_2longDtf <- function(countMatrix, value_name = "kij", id_vars = "geneID") { + countMatrix <- as.data.frame(countMatrix) + countMatrix$geneID <- rownames(countMatrix) + dtf_countLong <- reshape2::melt(countMatrix, id.vars = id_vars, variable.name = "sampleID", + value.name = value_name) + dtf_countLong$sampleID <- as.character(dtf_countLong$sampleID) + return(dtf_countLong) +} + +#' Get column name with sampleID +#' +#' Returns the column name in the metadata data frame that corresponds to the given sampleID. +#' +#' @param dtf_countsLong Long data frame of counts +#' @param metadata Metadata data frame +#' @return Column name with sampleID +#' @export +#' @examples +#' list_var <- init_variable() +#' mock_data <- mock_rnaseq(list_var, n_genes = 3, 2,2, 2) +#' dtf_countLong <- countMatrix_2longDtf(mock_data$counts) +#' .getColumnWithSampleID(dtf_countLong, mock_data$metadata) +.getColumnWithSampleID <- function(dtf_countsLong, metadata) { + example_spleID <- as.character(dtf_countsLong[1, "sampleID"]) + regex <- paste("^", as.character(dtf_countsLong[1, "sampleID"]), "$", sep = "") + for (indice_col in dim(metadata)[2]) { + if (grep(pattern = regex, metadata[, indice_col]) == 1) { + return(colnames(metadata)[indice_col]) + } else { + return(NA) # SampleID does not correspond between countMatrix and metadata + } + } +} + +#' Prepare data for fitting +#' +#' Prepares the countMatrix and metadata for fitting by converting the countMatrix to a long format and joining with metadata. +#' +#' @param countMatrix Count matrix +#' @param metadata Metadata data frame +#' @param normalization A boolean value indicating whether to apply median ratio +#' normalization. If \code{TRUE} (default), the counts matrix will be +#' normalized using median ratio normalization. If +#' \code{FALSE}, no normalization will be applied. +#' @param response_name String referring to target variable name that is being modeled and predicted (default : "kij") +#' @param groupID String referring the group variable name (default : "geneID") +#' @return Data frame for fitting +#' @export +#' @examples +#' list_var <- init_variable() +#' mock_data <- mock_rnaseq(list_var, n_genes = 3, 2,2, 2) +#' data2fit <- prepareData2fit(mock_data$counts, mock_data$metadata) +prepareData2fit <- function(countMatrix, metadata, normalization = TRUE , response_name = "kij", groupID = "geneID" ) { + + ## -- scaling for size differences + if ( isTRUE(normalization) ) { + message("INFO: Median ratio normalization.") + countMatrix <- medianRatioNormalization(countMatrix) + } + + dtf_countsLong <- countMatrix_2longDtf(countMatrix, response_name) + metadata_columnForjoining <- .getColumnWithSampleID(dtf_countsLong, metadata) + if (is.na(metadata_columnForjoining)) { + stop("SampleIDs do not seem to correspond between countMatrix and metadata") + } + data2fit <- join_dtf(dtf_countsLong, metadata, k1 = "sampleID", k2 = metadata_columnForjoining) + if (sum(is.na(data2fit[[groupID]])) > 0) { + warning("Something went wrong. NA introduced in the geneID column. Check the coherence between countMatrix and metadata.") + } + return(data2fit) +} + + + +#' Apply Median Ratio Normalization to a Counts Matrix +#' +#' This function performs median ratio normalization on a counts matrix to +#' adjust for differences in sequencing depth across samples. +#' +#' @param countsMatrix A counts matrix where rows represent genes and columns +#' represent samples. +#' +#' @return A normalized counts matrix after applying median ratio normalization. +#' +#' @details This function calculates the logarithm of the counts matrix, +#' computes the average log expression for each gene, and then scales each +#' sample's counts by the exponential of the difference between its average log +#' expression and the median of those averages. +#' +#' @examples +#' counts <- matrix(c(100, 200, 300, 1000, 1500, 2500), ncol = 2) +#' normalized_counts <- medianRatioNormalization(counts) +#' +#' @export +medianRatioNormalization <- function(countsMatrix) { + log_data <- log(countsMatrix) + average_log <- rowMeans(log_data) + + if (all(is.infinite(average_log))) + stop("Every gene contains at least one zero, cannot compute log geometric means") + + idx2keep <- average_log != "-Inf" + average_log <- average_log[idx2keep] + + ratio_data <- sweep(log_data[idx2keep, ], 1, average_log, "-") + sample_medians <- apply(ratio_data, 2, median) + + scaling_factors <- exp(sample_medians) + countsMatrix_normalized <- sweep(countsMatrix, 2, scaling_factors, "/") + + return(countsMatrix_normalized) +} + + +``` + +```{r test-prepareData2fit} + + +# Unit tests for countMatrix_2longDtf +test_that("countMatrix_2longDtf converts count matrix to long data frame", { + # Sample count matrix + list_var <- init_variable() + mock_data <- mock_rnaseq(list_var, n_genes = 3, 2,2, 1) + # Convert count matrix to long data frame + dtf_countLong <- countMatrix_2longDtf(mock_data$counts) + expect_true(is.character(dtf_countLong$sampleID)) + expect_true(is.character(dtf_countLong$geneID)) + expect_true(is.numeric(dtf_countLong$kij)) + expect_equal(unique(dtf_countLong$geneID), c("gene1", "gene2", "gene3")) + expect_equal(unique(dtf_countLong$sampleID),c("myVariable1_1", "myVariable1_2", + "myVariable2_1", "myVariable2_2")) +}) + +# Unit tests for getColumnWithSampleID +test_that("getColumnWithSampleID returns column name with sampleID", { + # dummy data + list_var <- init_variable() + mock_data <- mock_rnaseq(list_var, n_genes = 3, 2,2, 2) + dtf_countLong <- countMatrix_2longDtf(mock_data$counts) + + # Expected output + expected_output <- "sampleID" + + # Get column name with sampleID + column_name <- .getColumnWithSampleID(dtf_countLong, mock_data$metadata) + + # Check if the output matches the expected output + expect_identical(column_name, expected_output) +}) + +# Unit tests for prepareData2fit +test_that("prepareData2fit prepares data for fitting", { + # dummy data + list_var <- init_variable() + mock_data <- mock_rnaseq(list_var, n_genes = 3, 2,2, 2) + + # Prepare data for fitting + data2fit <- prepareData2fit(mock_data$counts, mock_data$metadata) + + expect_true(is.character(data2fit$sampleID)) + expect_true(is.character(data2fit$geneID)) + expect_true(is.numeric(data2fit$kij)) + expect_equal(unique(data2fit$geneID), c("gene1", "gene2", "gene3")) + expect_equal(unique(data2fit$sampleID),c("myVariable1_1", "myVariable1_2", + "myVariable2_1", "myVariable2_2")) +}) + + + + + +# Test case 1: Normalization with positive counts +test_that("Median ratio normalization works for positive counts", { + counts <- matrix(c(100, 200, 300, 1000, 1500, 2500), ncol = 2) + normalized_counts <- medianRatioNormalization(counts) + + expected_normalized_counts <- matrix(c(288.6751 , 577.3503 , 866.0254 , 346.4102, 519.6152, 866.0254), ncol = 2) + + expect_equal(normalized_counts, expected_normalized_counts, tolerance = 1e-4) +}) + +# Test case 2: Normalization with zero counts +test_that("Median ratio normalization return error for zero counts", { + counts <- matrix(c(0, 0, 0, 1000, 1500, 2500), ncol = 2) + expect_error(medianRatioNormalization(counts)) + +}) + + +# Test case 5: All-zero genes +test_that("Throws an error when all-zero genes are encountered", { + counts <- matrix(c(0, 0, 0, 0, 0, 0), ncol = 2) + expect_error(medianRatioNormalization(counts)) +}) + + +``` + +```{r functionFitModel, filename = "fitModel"} +#' Check if Data is Valid for Model Fitting +#' +#' This function checks whether the provided data contains all the variables required in the model formula for fitting. +#' +#' @param data2fit The data frame or tibble containing the variables to be used for model fitting. +#' @param formula The formula specifying the model to be fitted. +#' +#' @return \code{TRUE} if all the variables required in the formula are present in \code{data2fit}, otherwise an error is raised. +#' +#' @examples +#' data(iris) +#' formula <- Sepal.Length ~ Sepal.Width + Petal.Length +#' isValidInput2fit(iris, formula) # Returns TRUE if all required variables are present +#' @keywords internal +#' @export +isValidInput2fit <- function(data2fit, formula){ + vec_bool <- all.vars(formula) %in% colnames(data2fit) + for (i in seq_along(vec_bool)){ + if (isFALSE(vec_bool[i]) ) { + stop(paste("Variable", all.vars(formula)[i], "not found")) + } + } + return(TRUE) +} + + + + +#' Fit a model using the fitModel function. +#' +#' @param formula Formula specifying the model formula +#' @param data Data frame containing the data +#' @param ... Additional arguments to be passed to the glmmTMB::glmmTMB function +#' @return Fitted model object or NULL if there was an error +#' @export +#' @examples +#' .fitModel(formula = mpg ~ cyl + disp, data = mtcars) +.fitModel <- function(formula, data, ...) { + # Fit the model using glm.nb from the GLmmTMB package + model <- glmmTMB::glmmTMB(formula, ..., data = data ) + model$frame <- data + ## family in ... => avoid error in future update + additional_args <- list(...) + familyArgs <- additional_args[['family']] + if (!is.null(familyArgs)) model$call$family <- familyArgs + ## control in ... => avoid error in future update + controlArgs <- additional_args[['control']] + if (!is.null(controlArgs)) model$call$control <- controlArgs + return(model) +} + + + +#' Fit the model based using fitModel functions. +#' +#' @param group The specific group to fit the model for +#' @param group_by Column name in data representing the grouping variable +#' @param formula Formula specifying the model formula +#' @param data Data frame containing the data +#' @param ... Additional arguments to be passed to the glmmTMB::glmmTMB function +#' @return Fitted model object or NULL if there was an error +#' @export +#' @examples +#' .subsetData_andfit(group = "setosa", group_by = "Species", +#' formula = Sepal.Length ~ Sepal.Width + Petal.Length, +#' data = iris ) +.subsetData_andfit <- function(group, group_by, formula, data, ...) { + subset_data <- data[data[[group_by]] == group, ] + fit_res <- .fitModel(formula, subset_data, ...) + #glance_df <- glance.negbin(group_by ,group , fit_res) + #tidy_df <- tidy.negbin(group_by ,group,fit_res ) + #list(glance = glance_df, summary = tidy_df) + fit_res +} + + + +#' Launch the model fitting process for a specific group. +#' +#' This function fits the model using the specified group, group_by, formula, and data. +#' It handles warnings and errors during the fitting process and returns the fitted model or NULL if there was an error. +#' +#' @param group The specific group to fit the model for +#' @param group_by Column name in data representing the grouping variable +#' @param formula Formula specifying the model formula +#' @param data Data frame containing the data +#' @param ... Additional arguments to be passed to the glmmTMB::glmmTMB function +#' @return List with 'glance' and 'summary' attributes representing the fitted model or NULL if there was an error +#' @export +#' @examples +#' launchFit(group = "setosa", group_by = "Species", +#' formula = Sepal.Length ~ Sepal.Width + Petal.Length, +#' data = iris ) +launchFit <- function(group, group_by, formula, data, ...) { + tryCatch( + expr = { + withCallingHandlers( + .subsetData_andfit(group, group_by, formula, data, ...), + warning = function(w) { + message(paste(Sys.time(), "warning for group", group, ":", conditionMessage(w))) + invokeRestart("muffleWarning") + }) + }, + error = function(e) { + message(paste(Sys.time(), "error for group", group, ":", conditionMessage(e))) + NULL + #return(list(glance = empty.glance.negbin(group_by, group), summary = empty.tidy.negbin(group_by, group))) + } + ) +} + + +#' Fit models in parallel for each group using mclapply and handle logging. +#' Uses parallel_fit to fit the models. +#' +#' @param groups Vector of unique group values +#' @param group_by Column name in data representing the grouping variable +#' @param formula Formula specifying the model formula +#' @param data Data frame containing the data +#' @param n.cores The number of CPU cores to use for parallel processing. +#' If set to NULL (default), the number of available CPU cores will be automatically detected. +#' @param log_file File to write log (default : log.txt) +#' @param ... Additional arguments to be passed to the glmmTMB::glmmTMB function +#' @return List of fitted model objects or NULL for any errors +#' @export +#' @examples +#' .parallel_fit(group_by = "Species", "setosa", +#' formula = Sepal.Length ~ Sepal.Width + Petal.Length, +#' data = iris, n.cores = 1, log_file = "log.txt" ) +.parallel_fit <- function(groups, group_by, formula, data, n.cores = NULL, log_file, ...) { + if (is.null(n.cores)) n.cores <- parallel::detectCores() + + clust <- parallel::makeCluster(n.cores, outfile = log_file) + parallel::clusterExport(clust, c(".subsetData_andfit", ".fitModel"), envir=environment()) + results_fit <- parallel::parLapply(clust, X = setNames(groups, groups), fun = launchFit, + group_by = group_by, formula = formula, data = data, ...) + + parallel::stopCluster(clust) + closeAllConnections() + return(results_fit) +} + +#' Fit models in parallel for each group using mclapply and handle logging. +#' Uses parallel_fit to fit the models. +#' +#' @param formula Formula specifying the model formula +#' @param data Data frame containing the data +#' @param group_by Column name in data representing the grouping variable +#' @param n.cores The number of CPU cores to use for parallel processing. +#' If set to NULL (default), the number of available CPU cores will be automatically detected. +#' @param log_file File path to save the log messages (default : log.txt) +#' @param ... Additional arguments to be passed to the glmmTMB::glmmTMB function +#' @return List of fitted model objects or NULL for any errors +#' @export +#' @examples +#' fitModelParallel(formula = Sepal.Length ~ Sepal.Width + Petal.Length, +#' data = iris, group_by = "Species", n.cores = 1) +fitModelParallel <- function(formula, data, group_by, n.cores = NULL, log_file = "log.txt", ...) { + isValidInput2fit(data, formula) + + groups <- unique(data[[group_by]]) + # Fit models in parallel and capture the results + results <- .parallel_fit(groups, group_by, formula, data, n.cores, log_file, ...) + #results <- mergeListDataframes(results) + return(results) +} + + +``` + + +```{r test-fitData} + + +test_that("isValidInput2fit returns TRUE for valid data", { + data(iris) + formula <- Sepal.Length ~ Sepal.Width + Petal.Length + result <- isValidInput2fit(iris, formula) + expect_true(result) +}) + +# Test that the function raises an error when a required variable is missing +test_that("isValidInput2fit raises an error for missing variable", { + data(iris) + formula <- Sepal.Length ~ Sepal.Width + NonExistentVariable + expect_error(isValidInput2fit(iris, formula), "Variable NonExistentVariable not found") +}) + +test_that(".fitModel returns a fitted model object", { + data(mtcars) + formula <- mpg ~ cyl + disp + fitted_model <- suppressWarnings(.fitModel(formula, mtcars)) + #expect_warning(.fitModel(formula, mtcars)) + expect_s3_class(fitted_model, "glmmTMB") + + # Test with invalid formula + invalid_formula <- mpg ~ cyl + disp + invalid_var + expect_error(.fitModel(invalid_formula, mtcars)) + + + # Additional parameters: + #change family + formula + formula <- Sepal.Length ~ Sepal.Width + Petal.Length + (1 | Species) + fitted_models <- suppressWarnings(.fitModel(formula = formula, + data = iris, + family = glmmTMB::nbinom1(link = "log") )) + expect_s3_class(fitted_models$call$family, "family") + expect_equal(fitted_models$call$formula, formula) + #change control settings + fitted_models <- suppressWarnings(.fitModel(formula = formula, + data = iris, + family = glmmTMB::nbinom1(link = "log"), + control = glmmTMB::glmmTMBControl(optCtrl=list(iter.max=1e3, + eval.max=1e3)))) + expect_equal(fitted_models$call$control, glmmTMB::glmmTMBControl(optCtrl=list(iter.max=1e3,eval.max=1e3))) + + + +}) + +#test_that(".fitMixteModel returns a fitted mixed-effects model object or NULL if there was an error", { +# data(mtcars) +# formula <- mpg ~ cyl + disp + (1|gear) +# fitted_model <- .fitMixteModel(formula, mtcars) + # Add appropriate expectations for the fitted mixed-effects model object + + # Test with invalid formula +# invalid_formula <- formula + "invalid" +# fitted_model_error <- .fitMixteModel(invalid_formula, mtcars) +# expect_null(fitted_model_error) +#}) + +test_that(".subsetData_andfit returns a glmTMB obj", { + data(iris) + group <- "setosa" + group_by <- "Species" + formula <- Sepal.Length ~ Sepal.Width + Petal.Length + fitted_model <- .subsetData_andfit(group, group_by, formula, iris) + expect_s3_class(fitted_model, "glmmTMB") + + # Test with invalid formula + invalid_formula <- Sepal.Length ~ Sepal.Width + Petal.Length + invalid_var + expect_error(.subsetData_andfit(group, group_by, invalid_formula, mtcars)) + + + # Additional parameters: + #change family + formula + formula <- Sepal.Length ~ Sepal.Width + Petal.Length + (1 | Species) + fitted_models <- suppressWarnings(.subsetData_andfit(group, + group_by, + formula = formula, + data = iris, + family = glmmTMB::nbinom1(link = "log") )) + expect_s3_class(fitted_models$call$family, "family") + expect_equal(fitted_models$call$formula, formula) + #change control settings + fitted_models <- suppressWarnings(.subsetData_andfit(group, + group_by, + formula = formula, + data = iris, + family = glmmTMB::nbinom1(link = "log"), + control = glmmTMB::glmmTMBControl(optCtrl=list(iter.max=1e3, + eval.max=1e3)))) + expect_equal(fitted_models$call$control, glmmTMB::glmmTMBControl(optCtrl=list(iter.max=1e3,eval.max=1e3))) + +}) + +test_that("launchFit handles warnings and errors during the fitting process", { + data(mtcars) + group <- "Group1" + group_by <- "Group" + formula <- mpg ~ cyl + disp + fitted_model <- suppressWarnings(launchFit(group, group_by, formula, mtcars)) + expect_s3_class(fitted_model, "glmmTMB") + + # Test with invalid formula + invalid_formula <- Sepal.Length ~ Sepal.Width + Petal.Length + output_msg <- capture_message( HTRSIM::launchFit(group, group_by, invalid_formula, mtcars)) + expect_match(output_msg$message, ".* error for group Group1 : object 'Sepal.Length' not found") + + + # Additional parameters: + #change family + formula + formula <- Sepal.Length ~ Sepal.Width + Petal.Length + fitted_models <- suppressWarnings(launchFit(formula = formula, + data = iris, + group_by = group_by, + group = "setosa", + family = glmmTMB::nbinom1(link = "log") )) + expect_s3_class(fitted_models$call$family, "family") + expect_equal(fitted_models$call$formula, formula) + #change control settings + fitted_models <- suppressWarnings(launchFit(formula = formula, + data = iris, + group_by = group_by, + group = "setosa", + family = glmmTMB::nbinom1(link = "log"), + control = glmmTMB::glmmTMBControl(optCtrl=list(iter.max=1e3, + eval.max=1e3)))) + expect_equal(fitted_models$call$control, glmmTMB::glmmTMBControl(optCtrl=list(iter.max=1e3,eval.max=1e3))) +}) + +test_that(".parallel_fit returns a list of fitted model objects or NULL for any errors", { + data(iris) + groups <- unique(iris$Species) + group_by <- "Species" + formula <- Sepal.Length ~ Sepal.Width + Petal.Length + fitted_models <- .parallel_fit(groups, group_by, formula, iris, log_file = "log.txt", n.cores = 1) + expect_s3_class(fitted_models$setosa, "glmmTMB") + expect_length(fitted_models, length(groups)) + + # Test with invalid formula + invalid_formula <- blabla ~ cyl + disp + result <- suppressWarnings(.parallel_fit(groups, group_by, invalid_formula, + iris, log_file = "log.txt", n.cores = 1)) + expect_equal(result, expected = list(setosa = NULL, versicolor = NULL, virginica = NULL)) + + + # Additional parameters: + #change family + formula + formula <- Sepal.Length ~ Sepal.Width + Petal.Length + fitted_models <- suppressWarnings(.parallel_fit(formula = formula, + data = iris, + group_by = group_by, + groups = "setosa", + log_file = "log.txt", + n.cores = 1, + family = glmmTMB::nbinom1(link = "log") )) + expect_s3_class(fitted_models$setosa$call$family, "family") + expect_equal(fitted_models$setosa$call$formula, formula) + #change control settings + fitted_models <- suppressWarnings(.parallel_fit(formula = formula, + data = iris, + group_by = group_by, + groups = "setosa", + log_file = "log.txt", + family = glmmTMB::nbinom1(link = "log"), + n.cores = 1, + control = glmmTMB::glmmTMBControl(optCtrl=list(iter.max=1e3, + eval.max=1e3)))) + expect_equal(fitted_models$setosa$call$control, glmmTMB::glmmTMBControl(optCtrl=list(iter.max=1e3,eval.max=1e3))) +}) + +test_that("fitModelParallel fits models in parallel for each group and returns a list of fitted model objects or NULL for any errors", { + data(iris) + groups <- unique(iris$Species) + group_by <- "Species" + formula <- Sepal.Length ~ Sepal.Width + Petal.Length + #is.numeric(iris) + #iris <- data.frame(lapply(iris, function(y) if(is.numeric(y)) round(y, 0) else y)) + fitted_models <- fitModelParallel(formula, iris, group_by, n.cores = 1) + expect_s3_class(fitted_models$setosa, "glmmTMB") + expect_length(fitted_models, length(groups)) + + invalid_formula <- blabla ~ cyl + disp + expect_error(fitModelParallel(invalid_formula, iris, group_by ,log_file = "log.txt", n.cores = 1)) + + # Additional parameters: + #change family + formula + formula <- Sepal.Length ~ Sepal.Width + Petal.Length + fitted_models <- suppressWarnings(fitModelParallel(formula = formula, + data = iris, + group_by = group_by, + n.cores = 1, + family = glmmTMB::nbinom1(link = "log") )) + expect_s3_class(fitted_models$setosa$call$family, "family") + expect_equal(fitted_models$setosa$call$formula, formula) + #change control settings + fitted_models <- suppressWarnings(fitModelParallel(formula = formula, + data = iris, + group_by = group_by, + n.cores = 1, + family = glmmTMB::nbinom1(link = "log"), + control = glmmTMB::glmmTMBControl(optCtrl=list(iter.max=1e3, + eval.max=1e3)))) + expect_equal(fitted_models$setosa$call$control, glmmTMB::glmmTMBControl(optCtrl=list(iter.max=1e3,eval.max=1e3))) + +}) + +``` + + +```{r functionUpdateFitModel, filename = "updateFitModel"} + + +#' Update GLMNB models in parallel. +#' +#' This function fits GLMNB models in parallel using multiple cores, allowing for faster computation. +#' +#' @param formula Formula for the GLMNB model. +#' @param l_tmb List of GLMNB objects. +#' @param n.cores Number of cores to use for parallel processing. If NULL, the function will use all available cores. +#' @param log_file File path for the log output. +#' @param ... Additional arguments to be passed to the glmmTMB::glmmTMB function. +#' @export +#' @return A list of updated GLMNB models. +#' +#' @examples +#' data(iris) +#' groups <- unique(iris$Species) +#' group_by <- "Species" +#' formula <- Sepal.Length ~ Sepal.Width + Petal.Length +#' fitted_models <- fitModelParallel(formula, iris, group_by, n.cores = 1) +#' new_formula <- Sepal.Length ~ Sepal.Width +#' results <- updateParallel(new_formula, fitted_models, n.cores = 1) +updateParallel <- function(formula, l_tmb, n.cores = NULL, log_file = "log.txt", ...) { + + isValidInput2fit(l_tmb[[1]]$frame, formula) + # Fit models update in parallel and capture the results + results <- .parallel_update(formula, l_tmb, n.cores, log_file, ...) + return(results) +} + + +#' Internal function to fit GLMNB models in parallel. +#' +#' This function is used internally by \code{\link{updateParallel}} to fit GLMNB models in parallel. +#' +#' @param formula Formula for the GLMNB model. +#' @param l_tmb List of GLMNB objects. +#' @param n.cores Number of cores to use for parallel processing. +#' @param log_file File path for the log output. +#' @param ... Additional arguments to be passed to the glmmTMB::glmmTMB function. +#' @export +#' @return A list of updated GLMNB models. +#' @examples +#' data(iris) +#' groups <- unique(iris$Species) +#' group_by <- "Species" +#' formula <- Sepal.Length ~ Sepal.Width + Petal.Length +#' fitted_models <- fitModelParallel(formula, iris, group_by, n.cores = 1) +#' new_formula <- Sepal.Length ~ Sepal.Width +#' results <- .parallel_update(new_formula, fitted_models, n.cores = 1) +.parallel_update <- function(formula, l_tmb, n.cores = NULL, log_file = "log.txt", ...) { + if (is.null(n.cores)) n.cores <- parallel::detectCores() + clust <- parallel::makeCluster(n.cores, outfile = log_file) + #l_geneID <- attributes(l_tmb)$names + parallel::clusterExport(clust, c("launchUpdate", "fitUpdate"), envir=environment()) + updated_res <- parallel::parLapply(clust, X = l_tmb, fun = launchUpdate , formula = formula, ...) + parallel::stopCluster(clust) + closeAllConnections() + return(updated_res) +} + + +#' Fit and update a GLMNB model. +#' +#' This function fits and updates a GLMNB model using the provided formula. +#' +#' @param glmnb_obj A GLMNB object to be updated. +#' @param formula Formula for the updated GLMNB model. +#' @param ... Additional arguments to be passed to the glmmTMB::glmmTMB function. +#' @export +#' @return An updated GLMNB model. +#' +#' @examples +#' data(iris) +#' groups <- unique(iris$Species) +#' group_by <- "Species" +#' formula <- Sepal.Length ~ Sepal.Width + Petal.Length +#' fitted_models <- fitModelParallel(formula, iris, group_by, n.cores = 1) +#' new_formula <- Sepal.Length ~ Sepal.Width +#' updated_model <- fitUpdate(fitted_models[[1]], new_formula) +fitUpdate <- function(glmnb_obj, formula , ...){ + data = glmnb_obj$frame + resUpdt <- stats::update(glmnb_obj, formula, ...) + resUpdt$frame <- data + ## family in ... => avoid error in future update + additional_args <- list(...) + familyArgs <- additional_args[['family']] + if (!is.null(familyArgs)) resUpdt$call$family <- familyArgs + ## control in ... => avoid error in future update + controlArgs <- additional_args[['control']] + if (!is.null(controlArgs)) resUpdt$call$control <- controlArgs + return(resUpdt) +} + + +#' Launch the update process for a GLMNB model. +#' +#' This function launches the update process for a GLMNB model, capturing and handling warnings and errors. +#' +#' @param glmnb_obj A GLMNB object to be updated. +#' @param formula Formula for the updated GLMNB model. +#' @param ... Additional arguments to be passed to the glmmTMB::glmmTMB function. +#' @export +#' @return An updated GLMNB model or NULL if an error occurs. +#' +#' @examples +#' data(iris) +#' groups <- unique(iris$Species) +#' group_by <- "Species" +#' formula <- Sepal.Length ~ Sepal.Width + Petal.Length +#' fitted_models <- fitModelParallel(formula, iris, group_by, n.cores = 1) +#' new_formula <- Sepal.Length ~ Sepal.Width +#' updated_model <- launchUpdate(fitted_models[[1]], new_formula) +launchUpdate <- function(glmnb_obj, formula, ...) { + group = deparse(substitute(glmnb_obj)) + tryCatch( + expr = { + withCallingHandlers( + fitUpdate(glmnb_obj, formula, ...), + warning = function(w) { + message(paste(Sys.time(), "warning for group", group ,":", conditionMessage(w))) + invokeRestart("muffleWarning") + }) + }, + error = function(e) { + message(paste(Sys.time(), "error for group", group,":", conditionMessage(e))) + return(NULL) + } + ) +} + +``` + + +```{r test-updateFit} +# Test updateParallel function +test_that("updateParallel function returns correct results", { + # Load the required data + data(iris) + groups <- unique(iris$Species) + group_by <- "Species" + formula <- Sepal.Length ~ Sepal.Width + Petal.Length + fitted_models <- fitModelParallel(formula, iris, group_by, n.cores = 1) + new_formula <- Sepal.Length ~ Sepal.Width + results <- updateParallel(new_formula, fitted_models, n.cores = 1) + expect_is(results, "list") + expect_equal(length(results), length(fitted_models)) + expect_is(results$setosa, "glmmTMB") + + #uncorrect formula + new_formula <- Sepal.Length ~ blabla + expect_error(updateParallel(new_formula, fitted_models, n.cores = 1)) + + # Additional parameters: + #change family + formula + new_formula <- Sepal.Length ~ Sepal.Width + updated_model <- suppressWarnings(updateParallel(l_tmb = fitted_models, + formula = new_formula, + n.cores = 1, + family = glmmTMB::nbinom1(link = "log") )) + expect_s3_class(updated_model$setosa$call$family, "family") + expect_equal(updated_model$setosa$call$formula, new_formula) + #change control settings + updated_model <- suppressWarnings(updateParallel(l_tmb = fitted_models, + formula = new_formula, + family = glmmTMB::nbinom1(link = "log"), + n.cores = 1, + control = glmmTMB::glmmTMBControl(optCtrl=list(iter.max=1e3, + eval.max=1e3)))) + expect_equal(updated_model$setosa$call$control, glmmTMB::glmmTMBControl(optCtrl=list(iter.max=1e3,eval.max=1e3))) + + # Update an updated model + updated_updated_model <- suppressWarnings(updateParallel(l_tmb = updated_model, + formula = new_formula, + n.cores = 1, + family = glmmTMB::ziGamma(link = "inverse"))) + expect_s3_class(updated_updated_model$setosa$call$family, "family") +}) + +# Test .parallel_update function +test_that(".parallel_update function returns correct results", { +# Load the required data + data(iris) + groups <- unique(iris$Species) + group_by <- "Species" + formula <- Sepal.Length ~ Sepal.Width + Petal.Length + fitted_models <- fitModelParallel(formula, iris, group_by, n.cores = 1) + new_formula <- Sepal.Length ~ Sepal.Width + results <- .parallel_update(new_formula, fitted_models, n.cores = 1) + expect_is(results, "list") + expect_equal(length(results), length(fitted_models)) + expect_is(results$setosa, "glmmTMB") + + #uncorrect formula + new_formula <- Sepal.Length ~ blabla + results <- .parallel_update(new_formula, fitted_models, n.cores = 1) + expect_is(results, "list") + expect_equal(length(results), length(fitted_models)) + expect_equal(results$setosa, NULL) + + # Additional parameters: + #change family + formula + new_formula <- Sepal.Length ~ Sepal.Width + updated_model <- suppressWarnings(.parallel_update(l_tmb = fitted_models, + formula = new_formula, + n.cores = 1, + family = glmmTMB::nbinom1(link = "log") )) + expect_s3_class(updated_model$setosa$call$family, "family") + expect_equal(updated_model$setosa$call$formula, new_formula) + #change control + updated_model <- suppressWarnings(.parallel_update(l_tmb = fitted_models, + formula = new_formula, + n.cores = 1, + family = glmmTMB::nbinom1(link = "log"), + control = glmmTMB::glmmTMBControl(optCtrl=list(iter.max=1e3, + eval.max=1e3)))) + expect_equal(updated_model$setosa$call$control, glmmTMB::glmmTMBControl(optCtrl=list(iter.max=1e3,eval.max=1e3))) +}) + +# Test fitUpdate function +test_that("fitUpdate function returns correct results", { + #Load the required data + data(iris) + groups <- unique(iris$Species) + group_by <- "Species" + formula <- Sepal.Length ~ Sepal.Width + Petal.Length + fitted_models <- fitModelParallel(formula, iris, group_by, n.cores = 1) + new_formula <- Sepal.Length ~ Sepal.Width + + updated_model <- fitUpdate(fitted_models[[1]], new_formula) + expect_is(updated_model, "glmmTMB") + + # Additional parameters: + #change family + formula + updated_model <- suppressWarnings(fitUpdate(fitted_models[[1]], new_formula, + family = glmmTMB::nbinom1(link = "log") )) + expect_s3_class(updated_model$call$family, "family") + expect_equal(updated_model$call$formula, new_formula) + #change control + updated_model <- suppressWarnings(fitUpdate(fitted_models[[1]], + new_formula, + family = glmmTMB::nbinom1(link = "log"), + control = glmmTMB::glmmTMBControl(optCtrl=list(iter.max=1e3, + eval.max=1e3)))) + expect_equal(updated_model$call$control, glmmTMB::glmmTMBControl(optCtrl=list(iter.max=1e3,eval.max=1e3))) + +}) + + +# Test launchUpdate function +test_that("launchUpdate function returns correct results", { + data(iris) + groups <- unique(iris$Species) + group_by <- "Species" + formula <- Sepal.Length ~ Sepal.Width + Petal.Length + fitted_models <- fitModelParallel(formula, iris, group_by, n.cores = 1) + new_formula <- Sepal.Length ~ Sepal.Width + updated_model <- launchUpdate(fitted_models[[1]], new_formula) + expect_is(updated_model, "glmmTMB") + # Additional parameters: + #change family + formula + updated_model <- launchUpdate(fitted_models[[1]], new_formula, family = glmmTMB::nbinom1(link = "log") ) + expect_s3_class(updated_model$call$family, "family") + expect_equal(updated_model$call$formula, new_formula) + #change control + updated_model <- launchUpdate(fitted_models[[1]], new_formula, family = glmmTMB::nbinom1(link = "log"), + control = glmmTMB::glmmTMBControl(optimizer=optim, optArgs=list(method="BFGS"))) + expect_equal(updated_model$call$control, glmmTMB::glmmTMBControl(optimizer=optim, optArgs=list(method="BFGS"))) + +}) + +``` + +```{r functionTidyGLM, filename = "tidy_glmmTMB"} + + +#' Extract Fixed Effects from a GLMMTMB Model Summary +#' +#' This function extracts fixed effects from the summary of a glmmTMB model. +#' +#' @param x A glmmTMB model object. +#' @return A dataframe containing the fixed effects and their corresponding statistics. +#' @export +#' @examples +#' +#' model <- glmmTMB::glmmTMB(Sepal.Length ~ Sepal.Width + Petal.Length, data = iris) +#' fixed_effects <- extract_fixed_effect(model) +extract_fixed_effect <- function(x){ + ss = summary(x) + as.data.frame(ss$coefficients$cond) + ss_reshaped <- lapply(ss$coefficients, + function(sub_obj){ + if(is.null(sub_obj)) return(NULL) + sub_obj <- data.frame(sub_obj) + sub_obj$term <- removeDuplicatedWord(rownames(sub_obj)) + rownames(sub_obj) <- NULL + sub_obj <- renameColumns(sub_obj) + sub_obj + } + ) + + ss_df <- do.call(rbind, ss_reshaped) + ss_df$component <- sapply(rownames(ss_df), function(x) strsplit(x, split = "[.]")[[1]][1]) + ss_df$effect <- "fixed" + rownames(ss_df) <- NULL + ss_df +} + + + +#' Extract Tidy Summary of glmmTMB Model +#' +#' This function extracts a tidy summary of the fixed and random effects from a glmmTMB model and binds them together in a data frame. Missing columns are filled with NA. +#' +#' @param glm_TMB A glmmTMB model object. +#' @param ID An identifier to be included in the output data frame. +#' @return A data frame containing a tidy summary of the fixed and random effects from the glmmTMB model. +#' @export +#' @examples +#' +#' model <- glmmTMB::glmmTMB(Sepal.Length ~ Sepal.Width + Petal.Length, data = iris) +#' tidy_summary <- getTidyGlmmTMB(glm_TMB = model, ID = "Model1") +getTidyGlmmTMB <- function(glm_TMB, ID){ + if(is.null(glm_TMB)) return(NULL) + df1 <- extract_fixed_effect(glm_TMB) + df1 <- build_missingColumn_with_na(df1) + df2 <- extract_ran_pars(glm_TMB) + df2 <- build_missingColumn_with_na(df2) + df_2ret <- rbind(df1, df2) + df_2ret[df_2ret == "NaN"] <- NA + df_2ret <- df_2ret[rowSums(!is.na(df_2ret)) > 0, ] # drop rows full of NA + df_2ret$ID <- ID + df_2ret <- reorderColumns(df_2ret, + c("ID","effect", "component", "group", "term", + "estimate", "std.error", "statistic", "p.value")) + return(df_2ret) +} + + + +#' Extract Tidy Summary of Multiple glmmTMB Models +#' +#' This function takes a list of glmmTMB models and extracts a tidy summary of the fixed and random effects from each model. It then combines the results into a single data frame. +#' +#' @param l_tmb A list of glmmTMB model objects. +#' @return A data frame containing a tidy summary of the fixed and random effects from all glmmTMB models in the list. +#' @export +#' @examples +#' model1 <- glmmTMB::glmmTMB(Sepal.Length ~ Sepal.Width + Petal.Length + (1 | Species), data = iris) +#' model2 <- glmmTMB::glmmTMB(Petal.Length ~ Sepal.Length + Sepal.Width + (1 | Species), data = iris) +#' model_list <- list(Model1 = model1, Model2 = model2) +#' tidy_summary <- tidy_tmb(model_list) +tidy_tmb <- function(l_tmb){ + if (identical(class(l_tmb), "glmmTMB")) return(getTidyGlmmTMB(l_tmb, "glmmTMB")) + attributes(l_tmb)$names + l_tidyRes <- lapply(attributes(l_tmb)$names, + function(ID) + { + glm_TMB <- l_tmb[[ID]] + getTidyGlmmTMB(glm_TMB, ID) + } + ) + ret <- do.call("rbind", l_tidyRes) + return(ret) +} + + +#' Build DataFrame with Missing Columns and NA Values +#' +#' This function takes a DataFrame and a list of column names and adds missing columns with NA values to the DataFrame. +#' +#' @param df The input DataFrame. +#' @param l_columns A character vector specifying the column names to be present in the DataFrame. +#' @return A DataFrame with missing columns added and filled with NA values. +#' @export +#' @examples +#' +#' df <- data.frame(effect = "fixed", term = "Sepal.Length", estimate = 0.7) +#' df_with_na <- build_missingColumn_with_na(df) +build_missingColumn_with_na <- function(df, l_columns = c("effect", "component", "group", + "term", "estimate", "std.error", "statistic", "p.value")) { + df_missing_cols <- setdiff(l_columns, colnames(df)) + # Ajouter les colonnes manquantes à df1 + if (length(df_missing_cols) > 0) { + for (col in df_missing_cols) { + df[[col]] <- NA + } + } + return(df) +} + +#' Remove Duplicated Words from Strings +#' +#' This function takes a vector of strings and removes duplicated words within each string. +#' +#' @param strings A character vector containing strings with potential duplicated words. +#' @return A character vector with duplicated words removed from each string. +#' @export +#' @examples +#' +#' words <- c("hellohello", "worldworld", "programmingprogramming", "R isis great") +#' cleaned_words <- removeDuplicatedWord(words) +removeDuplicatedWord <- function(strings){ + gsub("(.*)\\1+", "\\1", strings, perl = TRUE) +} + + + + +#' Convert Correlation Matrix to Data Frame +#' +#' This function converts a correlation matrix into a data frame containing the correlation values and their corresponding interaction names. +#' +#' @param corr_matrix A correlation matrix to be converted. +#' @return A data frame with the correlation values and corresponding interaction names. +#' @export +#' @examples +#' mat <- matrix(c(1, 0.7, 0.5, 0.7, 1, 0.3, 0.5, 0.3, 1), nrow = 3, dimnames = list(c("A", "B", "C"), c("A", "B", "C"))) +#' correlation_matrix_2df(mat) +correlation_matrix_2df <- function(corr_matrix){ + vec_corr <- corr_matrix[lower.tri(corr_matrix)] + col_names <- removeDuplicatedWord(colnames(corr_matrix)) + row_names <- removeDuplicatedWord(rownames(corr_matrix)) + interaction_names <- vector("character", length(vec_corr)) + k <- 1 + n <- nrow(corr_matrix) + for (i in 1:(n - 1)) { + for (j in (i + 1):n) { + interaction_names[k] <- paste("cor__", paste(col_names[i], ".", row_names[j], sep = ""), sep ="") + k <- k + 1 + } + } + names(vec_corr) <- interaction_names + ret <- data.frame(estimate = vec_corr) + ret$term <- rownames(ret) + rownames(ret) <- NULL + ret +} + +#' Wrapper for Extracting Variance-Covariance Components +#' +#' This function extracts variance-covariance components from a glmmTMB model object for a specific grouping factor and returns them as a data frame. +#' +#' @param var_cor A variance-covariance object from the glmmTMB model. +#' @param elt A character indicating the type of effect, either "cond" or "zi". +#' @return A data frame containing the standard deviation and correlation components for the specified grouping factor. +#' @export +#' @examples +#' model <- glmmTMB::glmmTMB(Sepal.Length ~ Sepal.Width + Petal.Length + (1|Species), data = iris, family = gaussian) +#' var_cor <- summary(model)$varcor$cond +#' ran_pars_df <- wrapper_var_cor(var_cor, "Species") +wrapper_var_cor <- function(var_cor, elt){ + var_group <- attributes(var_cor)$names + l_ret <- lapply(var_group, + function(group) + { + ## -- standard dev + std_df <- data.frame(estimate = attributes(var_cor[[group]])$stddev) + std_df$term <- paste("sd_", removeDuplicatedWord(rownames(std_df)), sep = "") + ## -- correlation + corr_matrix <- attributes(var_cor[[group]])$correlation + #no correlation 2 return + if (nrow(corr_matrix) <= 1) ret <- std_df + else { + corr_df <- correlation_matrix_2df(corr_matrix) + ret <- rbind(std_df, corr_df) + } + ret$component <- elt + ret$effect <- "ran_pars" + ret$group <- group + rownames(ret) <- NULL + return(ret) + }) + l_ret + +} + + +#' Extract Random Parameters from a glmmTMB Model +#' +#' This function extracts the random parameters from a glmmTMB model and returns them as a data frame. +#' +#' @param x A glmmTMB model object. +#' @return A data frame containing the random parameters and their estimates. +#' @export +#' @examples +#' model <- glmmTMB::glmmTMB(Sepal.Length ~ Sepal.Width + Petal.Length + (1|Species), data = iris, +#' family = gaussian) +#' random_params <- extract_ran_pars(model) +extract_ran_pars <- function(x){ + ss <- summary(x) + l_2parcour <- c("cond", "zi") + l_res = lapply(setNames(l_2parcour, l_2parcour), + function(elt) + { + var_cor <- ss$varcor[[elt]] + return(wrapper_var_cor(var_cor, elt)) + }) + + ret <- rbind(do.call("rbind", l_res$cond),do.call("rbind", l_res$zi)) + return(ret) + +} + + +#' Rename Columns in a Data Frame +#' +#' This function renames columns in a data frame based on specified old names and corresponding new names. +#' +#' @param df A data frame. +#' @param old_names A character vector containing the old column names to be replaced. +#' @param new_names A character vector containing the corresponding new column names. +#' @return The data frame with renamed columns. +#' @export +#' @examples +#' df <- data.frame(Estimate = c(1.5, 2.0, 3.2), +#' Std..Error = c(0.1, 0.3, 0.2), +#' z.value = c(3.75, 6.67, 4.90), +#' Pr...z.. = c(0.001, 0.0001, 0.002)) +#' +#' renamed_df <- renameColumns(df, old_names = c("Estimate", "Std..Error", "z.value", "Pr...z.."), +#' new_names = c("estimate", "std.error", "statistic", "p.value")) +#' +renameColumns <- function(df, old_names = c("Estimate","Std..Error", "z.value", "Pr...z.."), + new_names = c("estimate","std.error", "statistic", "p.value")) { + stopifnot(length(old_names) == length(new_names)) + + for (i in seq_along(old_names)) { + old_col <- old_names[i] + new_col <- new_names[i] + + if (old_col %in% names(df)) { + names(df)[names(df) == old_col] <- new_col + } else { + warning(paste("Column", old_col, "not found in the data frame. Skipping renaming.")) + } + } + + return(df) +} + + + +#' Reorder the columns of a dataframe +#' +#' This function reorders the columns of a dataframe according to the specified column order. +#' +#' @param df The input dataframe. +#' @param columnOrder A vector specifying the desired order of columns. +#' +#' @return A dataframe with columns reordered according to the specified column order. +#' @export +#' @examples +#' # Example dataframe +#' df <- data.frame(A = 1:3, B = 4:6, C = 7:9) +#' +#' # Define the desired column order +#' columnOrder <- c("B", "C", "A") +#' +#' # Reorder the columns of the dataframe +#' df <- reorderColumns(df, columnOrder) +reorderColumns <- function(df, columnOrder) { + df <- df[, columnOrder, drop = FALSE] + return(df) +} + +``` + + +```{r test-tidyGLM} + +test_that("extract_fixed_effect returns the correct results for glmmTMB models", { + data(iris) + # Créer un modèle glmmTMB avec les données iris (exemple) + model <- glmmTMB::glmmTMB(Sepal.Length ~ Sepal.Width + Petal.Length + (1|Species), data = iris) + + # Appeler la fonction extract_fixed_effect sur le modèle + result <- extract_fixed_effect(model) + + # Check les résultats attendus + expect_s3_class(result, "data.frame") + expect_equal(result$effect, c("fixed", "fixed", "fixed")) + expect_equal(result$component , c("cond", "cond", "cond")) + expect_equal(result$term , c("(Intercept)", "Sepal.Width", "Petal.Length")) + +}) + + +test_that("getTidyGlmmTMB returns the correct results for glmmTMB models", { + data(iris) + # Créer un modèle glmmTMB avec les données iris (exemple) + model <- glmmTMB::glmmTMB(Sepal.Length ~ Sepal.Width + Petal.Length, data = iris) + tidy_summary <- getTidyGlmmTMB(glm_TMB = model, ID = "Model1") + + # Check les résultats attendus + expect_s3_class(tidy_summary, "data.frame") + expect_equal(tidy_summary$effect, c("fixed", "fixed", "fixed")) + expect_equal(tidy_summary$component , c("cond", "cond", "cond")) + expect_equal(tidy_summary$term , c("(Intercept)", "Sepal.Width", "Petal.Length")) + expect_equal(tidy_summary$ID , c("Model1", "Model1", "Model1")) + + #MODEL == NULL + tidy_summary <- getTidyGlmmTMB(glm_TMB = NULL, ID = "Model1") + expect_equal(tidy_summary, NULL) +}) + + +test_that("build_missingColumn_with_na returns the correct results", { + df <- data.frame(effect = "fixed", term = "Sepal.Length", estimate = 0.7) + df_with_na <- build_missingColumn_with_na(df) + expected_df <- data.frame(effect = "fixed", + term = "Sepal.Length", + estimate = 0.7, + component = NA, + group = NA, + std.error = NA, + statistic = NA, + p.value = NA) + + expect_equal(df_with_na, expected_df) +}) + + +test_that("removeDuplicatedWord returns expected output", { + words <- c("hellohello", "worldworld", "programmingprogramming", "R isis great") + cleaned_words <- removeDuplicatedWord(words) + expect_equal(cleaned_words, c("hello", "world", "programming", "R is great")) +}) + + + +test_that("correlation_matrix_2df returns expected output",{ + + mat <- matrix(c(1, 0.7, 0.5, 0.7, 1, 0.3, 0.5, 0.3, 1), nrow = 3, dimnames = list(c("A", "B", "C"), c("A", "B", "C"))) + df_corr <- correlation_matrix_2df(mat) + df_expected <- data.frame(estimate = c(0.7, 0.5, 0.3), + term = c("cor__A.B", "cor__A.C", "cor__B.C")) + expect_equal(df_corr, df_expected) +}) + + + +test_that("wrapper_var_cor returns expected output",{ + data(iris) + model <- glmmTMB::glmmTMB(Sepal.Length ~ Sepal.Width + Petal.Length + (1|Species), data = iris, family = gaussian) + var_cor <- summary(model)$varcor$cond + ran_pars_df <- wrapper_var_cor(var_cor, "Species") + expected_l = list(data.frame(estimate = 0.4978083, term = "sd_(Intercept)", + component = "Species", effect = "ran_pars", group = "Species")) + expect_equal(ran_pars_df , expected_l, tolerance = 0.0000001) +}) + + +test_that("extract_ran_pars returns expected output",{ + model <- glmmTMB::glmmTMB(Sepal.Length ~ Sepal.Width + Petal.Length + (1|Species), + data = iris, family = gaussian) + random_params <- extract_ran_pars(model) + + expected_df = data.frame(estimate = 0.4978083, term = "sd_(Intercept)", + component = "cond", effect = "ran_pars", group = "Species") + expect_equal(random_params , expected_df, tolerance = 0.0000001) +}) + + +test_that("renameColumns returns expected output",{ + df <- data.frame(Estimate = c(1.5, 2.0, 3.2), + Std..Error = c(0.1, 0.3, 0.2), + z.value = c(3.75, 6.67, 4.90), + Pr...z.. = c(0.001, 0.0001, 0.002)) + + new_colnames <- c("estimate", "std.error", "statistic", "p.value") + renamed_df <- renameColumns(df, old_names = c("Estimate", "Std..Error", "z.value", "Pr...z.."), + new_names = new_colnames) + expect_equal(colnames(renamed_df),c("estimate", "std.error", "statistic", "p.value")) + expect_equal(dim(renamed_df), dim(df)) +}) + + +test_that("reorderColumns returns expected output",{ + df <- data.frame(A = 1:3, B = 4:6, C = 7:9) + # Define the desired column order + columnOrder <- c("B", "C", "A") + # Reorder the columns of the dataframe + df_reorder <- reorderColumns(df, columnOrder) + expect_equal(colnames(df_reorder), columnOrder) + expect_equal(dim(df_reorder), dim(df)) + +}) + + +test_that("tidy_tmb returns expected output",{ + model1 <- glmmTMB::glmmTMB(Sepal.Length ~ Sepal.Width + Petal.Length + (1 | Species), data = iris) + model2 <- glmmTMB::glmmTMB(Petal.Length ~ Sepal.Length + Sepal.Width + (1 | Species), data = iris) + model_list <- list(Model1 = model1, Model2 = model2) + result <- tidy_tmb(model_list) + expect_equal(unique(result$ID), c("Model1", "Model2")) + expect_equal(unique(result$effect), c("fixed", "ran_pars")) + expect_equal(unique(result$component), "cond") + expect_equal(unique(result$term), c("(Intercept)", "Sepal.Width", "Petal.Length", "sd_(Intercept)", "Sepal.Length")) + expect_true("estimate" %in% colnames(result)) + expect_true("std.error" %in% colnames(result)) + expect_true("statistic" %in% colnames(result)) + expect_true("p.value" %in% colnames(result)) + + + # zi component + model2 <- glmmTMB::glmmTMB(Petal.Length ~ Sepal.Length + Sepal.Width + (1 | Species), data = iris, ziformula = ~1) + model_list <- list(Model1 = model1, Model2 = model2) + result_withZi <- tidy_tmb(model_list) + expect_equal(dim(result_withZi)[1], dim(result)[1] + 1 ) + expect_equal(unique(result_withZi$component), c("cond", "zi")) + + ## unique obect in list + model <- glmmTMB::glmmTMB(Sepal.Length ~ Sepal.Width + Petal.Length + (1|Species), data = iris) + result <- tidy_tmb(model) + expect_true("effect" %in% colnames(result)) + expect_true("component" %in% colnames(result)) + expect_true("group" %in% colnames(result)) + expect_true("term" %in% colnames(result)) + expect_true("estimate" %in% colnames(result)) + expect_true("std.error" %in% colnames(result)) + expect_true("statistic" %in% colnames(result)) + expect_true("p.value" %in% colnames(result)) +}) +``` + + +```{r functionGlanceGLM, filename = "glance_tmb"} + +#' Extracts the summary statistics from a list of glmmTMB models. +#' +#' This function takes a list of glmmTMB models and extracts the summary statistics (AIC, BIC, logLik, deviance, +#' df.resid, and dispersion) for each model and returns them as a single DataFrame. +#' +#' @param l_tmb A list of glmmTMB models or a unique glmmTMB obj model +#' @return A DataFrame with the summary statistics for all the glmmTMB models in the list. +#' @export +#' +#' @examples +#' data(mtcars) +#' models <- fitModelParallel(Sepal.Length ~ Sepal.Width + Petal.Length, group_by = "Species",n.cores = 1, data = iris) +#' result <- glance_tmb(models) +glance_tmb <- function(l_tmb){ + if (identical(class(l_tmb), "glmmTMB")) return(getGlance(l_tmb)) + l_group <- attributes(l_tmb)$names + l_glance <- lapply(setNames(l_group, l_group), function(group) getGlance(l_tmb[[group]])) + return(do.call("rbind", l_glance)) +} + + +#' Extracts the summary statistics from a single glmmTMB model. +#' +#' This function takes a single glmmTMB model and extracts the summary statistics (AIC, BIC, logLik, deviance, +#' df.resid, and dispersion) from the model and returns them as a DataFrame. +#' +#' @param x A glmmTMB model. +#' @return A DataFrame with the summary statistics for the glmmTMB model. +#' @export +#' +#' @examples +#' data(mtcars) +#' model <- glmmTMB::glmmTMB(mpg ~ wt + (1|cyl), data = mtcars) +#' getGlance(model) +getGlance <- function(x){ + if (is.null(x)) return(NULL) + ret <- data.frame(t(summary(x)$AICtab)) + ret$dispersion <- glmmTMB::sigma(x) + ret +} + + +``` + + +```{r testGlanceGLM } + +test_that("glance_tmb returns the summary statistics for multiple models", { + data(iris) + models <- fitModelParallel(Sepal.Length ~ Sepal.Width + Petal.Length, group_by = "Species",n.cores = 1, data = iris) + result <- glance_tmb(models) + expect_true("AIC" %in% colnames(result)) + expect_true("BIC" %in% colnames(result)) + expect_true("logLik" %in% colnames(result)) + expect_true("deviance" %in% colnames(result)) + expect_true("df.resid" %in% colnames(result)) + expect_true("dispersion" %in% colnames(result)) + expect_true(sum(c("setosa","versicolor", "virginica") %in% rownames(result)) == 3) + + ## unique obect in list + model <- glmmTMB::glmmTMB(Sepal.Length ~ Sepal.Width + Petal.Length + (1|Species), data = iris) + result <- glance_tmb(model) + expect_true("AIC" %in% colnames(result)) + expect_true("BIC" %in% colnames(result)) + expect_true("logLik" %in% colnames(result)) + expect_true("deviance" %in% colnames(result)) + expect_true("df.resid" %in% colnames(result)) + expect_true("dispersion" %in% colnames(result)) + +}) + +test_that("getGlance returns the summary statistics for a single model", { + model <- glmmTMB::glmmTMB(Sepal.Length ~ Sepal.Width + Petal.Length + (1|Species), data = iris) + result <- getGlance(model) + expect_true("AIC" %in% colnames(result)) + expect_true("BIC" %in% colnames(result)) + expect_true("logLik" %in% colnames(result)) + expect_true("deviance" %in% colnames(result)) + expect_true("df.resid" %in% colnames(result)) + expect_true("dispersion" %in% colnames(result)) +}) +``` + + +```{r functionPlotMetrics, filename = "plot_metrics"} + +#' Subset the glance DataFrame based on selected variables. +#' +#' This function subsets the glance DataFrame to keep only the specified variables. +#' +#' @param glance_df The glance DataFrame to subset. +#' @param focus A character vector of variable names to keep, including "AIC", "BIC", "logLik", "deviance", +#' "df.resid", and "dispersion". +#' @return A subsetted glance DataFrame with only the selected variables. +#' @export +#' +#' @examples +#' data(iris) +#' models <- fitModelParallel(Sepal.Length ~ Sepal.Width + Petal.Length, group_by = "Species",n.cores = 1, data = iris) +#' glance_df <- glance_tmb(models) +#' glance_df$group_id <- rownames(glance_df) +#' subset_glance(glance_df, c("AIC", "BIC")) +subset_glance <- function(glance_df, focus){ + idx_existing_column <- focus %in% c("AIC", "BIC", "logLik", "deviance" ,"df.resid", "dispersion" ) + if(sum(!idx_existing_column) > 0) warning(paste(focus[!idx_existing_column], ": does not exist\n")) + focus <- focus[idx_existing_column] + if (identical(focus, character(0))) + stop(paste0("Please select at least one variable to focus on : ", + "AIC, BIC, logLik, deviance, df.resid, dispersion" )) + glance_df <- glance_df[ , c("group_id", focus)] + return(glance_df) +} + + +#' Plot Metrics for Generalized Linear Mixed Models (GLMM) +#' +#' This function generates a density plot of the specified metrics for the given +#' list of generalized linear mixed models (GLMMs). +#' +#' @param l_tmb A list of GLMM objects to extract metrics from. +#' @param focus A character vector specifying the metrics to focus on. Possible +#' values include "AIC", "BIC", "logLik", "deviance", "df.resid", and +#' "dispersion". If \code{NULL}, all available metrics will be plotted. +#' +#' @return A ggplot object displaying density plots for the specified metrics. +#' +#' @importFrom reshape2 melt +#' @importFrom ggplot2 aes facet_wrap geom_density theme_bw theme ggtitle +#' +#' @export +#' +#' @examples +#' models_list <- fitModelParallel(Sepal.Length ~ Sepal.Width + Petal.Length, +#' group_by = "Species",n.cores = 1, data = iris) +#' metrics_plot(models_list, focus = c("AIC", "BIC", "deviance")) +metrics_plot <- function(l_tmb, focus = NULL) { + glance_df <- glance_tmb(l_tmb) + glance_df$group_id <- rownames(glance_df) + if (!is.null(focus)) { + glance_df <- subset_glance(glance_df, focus) + } + long_glance_df <- reshape2::melt(glance_df, variable.name = "metric") + p <- ggplot2::ggplot(long_glance_df) + + ggplot2::geom_density(ggplot2::aes(x = value, col = metric, fill = metric), alpha = 0.4) + + ggplot2::facet_wrap(~metric, scales = "free") + + ggplot2::theme_bw() + + ggplot2::theme(legend.position = 'null') + + ggplot2::ggtitle("Metrics plot") + return(p) +} + + +``` + +```{r testPlotMetrics } + + +test_that("subset_glance subsets the glance DataFrame correctly", { + data(iris) + models <- fitModelParallel(Sepal.Length ~ Sepal.Width + Petal.Length, group_by = "Species",n.cores = 1, data = iris) + glance_df <- glance_tmb(models) + glance_df$group_id <- rownames(glance_df) + result <- subset_glance(glance_df, c("AIC", "BIC")) + expect_true("AIC" %in% colnames(result)) + expect_true("BIC" %in% colnames(result)) + expect_true("group_id" %in% colnames(result)) + expect_true(sum(c("setosa","versicolor", "virginica") %in% rownames(result)) == 3) +}) + + + + +test_that("metrics_plot returns a ggplot object", { + + data(iris) + l_glmTMB <- list( + setosa = glmmTMB::glmmTMB(Sepal.Length ~ Sepal.Width + Petal.Length, + data = subset(iris, Species == "setosa")), + versicolor = glmmTMB::glmmTMB(Sepal.Length ~ Sepal.Width + Petal.Length, + data = subset(iris, Species == "versicolor")), + virginica = glmmTMB::glmmTMB(Sepal.Length ~ Sepal.Width + Petal.Length, + data = subset(iris, Species == "virginica")) + ) + p <- metrics_plot(l_glmTMB) + expect_true(inherits(p, "gg")) + +}) + + +``` + + + + + + + +```{r functionEvalDispersion, filename = "evaluateDispersion"} + +#' Evaluate Dispersion Comparison +#' +#' Compares dispersion values between two data frames containing dispersion information. +#' +#' @param TMB_dispersion_df A data frame containing dispersion values from TMB. +#' @param DESEQ_dispersion_df A data frame containing dispersion values from DESeq2. +#' +#' @return A list containing a dispersion plot and a data frame with dispersion comparison. +#' @importFrom ggplot2 scale_color_manual +#' @export +#' +#' @examples +#' \dontrun{ +#' disp_comparison <- evaluateDispersion(TMB_dispersion_df, DESEQ_dispersion_df) +#' plot_dispersion <- disp_comparison$disp_plot +#' comparison_df <- disp_comparison$data +#' } +evaluateDispersion <- function(TMB_dispersion_df, DESEQ_dispersion_df, color2use) { + disp_comparison_dtf <- rbind(TMB_dispersion_df, DESEQ_dispersion_df) + disp_plot <- dispersion_plot(disp_comparison_dtf, col = "from") + ggplot2::scale_color_manual(values = color2use) + + return(list(disp_plot = disp_plot, data = disp_comparison_dtf)) +} + + +#' Get Dispersion Comparison +#' +#' Compares inferred dispersion values with actual dispersion values. +#' +#' @param inferred_dispersion A data frame containing inferred dispersion values. +#' @param actual_dispersion A numeric vector containing actual dispersion values. +#' +#' @return A data frame comparing actual and inferred dispersion values. +#' +#' @export +#' +#' @examples +#' \dontrun{ +#' dispersion_comparison <- getDispersionComparison(inferred_disp, actual_disp) +#' print(dispersion_comparison) +#' } +getDispersionComparison <- function(inferred_dispersion, actual_dispersion) { + actual_disp <- data.frame(actual_dispersion = actual_dispersion) + actual_disp$geneID <- rownames(actual_disp) + rownames(actual_disp) <- NULL + disp_comparison <- join_dtf(actual_disp, inferred_dispersion, "geneID", "geneID") + return(disp_comparison) +} + + +#' Extract DESeq2 Dispersion Values +#' +#' Extracts inferred dispersion values from a DESeq2 wrapped object. +#' +#' @param deseq_wrapped A DESeq2 wrapped object containing dispersion values. +#' +#' @return A data frame containing inferred dispersion values. +#' +#' @export +#' +#' @examples +#' \dontrun{ +#' dispersion_df <- extractDESeqDispersion(deseq2_object) +#' print(dispersion_df) +#' } +extractDESeqDispersion <- function(deseq_wrapped) { + inferred_dispersion <- data.frame(inferred_dispersion = deseq_wrapped$dispersion) + inferred_dispersion$geneID <- rownames(inferred_dispersion) + rownames(inferred_dispersion) <- NULL + return(inferred_dispersion) +} + + +#' Extract TMB Dispersion Values +#' +#' Extracts inferred dispersion values from a TMB result object. +#' +#' @param l_tmb A TMB result object containing dispersion values. +#' +#' @return A data frame containing inferred dispersion values. +#' +#' @export +#' +#' @examples +#' \dontrun{ +#' dispersion_df <- extractTMBDispersion(tmb_result) +#' print(dispersion_df) +#' } +extractTMBDispersion <- function(l_tmb) { + glanceRes <- glance_tmb(l_tmb) + inferred_dispersion <- data.frame(inferred_dispersion = glanceRes$dispersion) + inferred_dispersion$geneID <- rownames(glanceRes) + rownames(inferred_dispersion) <- NULL + return(inferred_dispersion) +} + + + +#' Dispersion Evaluation Plot +#' +#' Creates a scatter plot to evaluate the dispersion values between actual and inferred dispersions. +#' +#' @param eval_dispersion A data frame containing actual and inferred dispersion values. +#' @param ... Additional arguments to be passed to the ggplot2::aes function. +#' @importFrom ggplot2 ggplot geom_point aes geom_abline theme_bw ggtitle scale_x_log10 scale_y_log10 +#' @return A ggplot2 scatter plot. +#' +#' @export +#' +#' @examples +#' \dontrun{ +#' disp_plot <- dispersion_plot(disp_comparison_dtf, col = "from") +#' print(disp_plot) +#' } +dispersion_plot <- function(eval_dispersion, ...) { + + args <- lapply(list(...), function(x) if (!is.null(x)) ggplot2::sym(x)) + + p <- ggplot2::ggplot(eval_dispersion) + + ggplot2::geom_point(ggplot2::aes(x = actual_dispersion, y = inferred_dispersion, !!!args), size = 3, alpha = 0.6) + + ggplot2::geom_abline(intercept = 0, slope = 1, lty = 3, col = 'red', linewidth = 1) + + ggplot2::theme_bw() + + ggplot2::ggtitle("Dispersion evaluation") + + ggplot2::scale_x_log10() + + ggplot2::scale_y_log10() + + return(p) +} + + + +``` + +```{r testPlotMetrics } + + +# Example data + + +# Tests +test_that("dispersion_plot function works correctly", { + eval_disp <- data.frame( + actual_dispersion = c(0.1, 0.2, 0.3), + inferred_dispersion = c(0.12, 0.18, 0.28), + from = c("HTRfit", "HTRfit", "DESeq2") + ) + disp_plot <- dispersion_plot(eval_disp, col = "from") + expect_s3_class(disp_plot, "gg") +}) + +test_that("extractTMBDispersion function extracts dispersion correctly", { + N_GENES = 100 + MAX_REPLICATES = 5 + MIN_REPLICATES = 5 + input_var_list <- init_variable(name = "varA", mu = 10, sd = 0.1, level = 3) + mock_data <- mock_rnaseq(input_var_list, N_GENES, + min_replicates = MIN_REPLICATES, max_replicates = MAX_REPLICATES) + data2fit <- prepareData2fit(countMatrix = mock_data$counts, metadata = mock_data$metadata) + l_res <- fitModelParallel(formula = kij ~ varA, + data = data2fit, group_by = "geneID", + family = glmmTMB::nbinom2(link = "log"), n.cores = 1) + extracted_disp <- extractTMBDispersion(l_res) + expect_identical(colnames(extracted_disp), c("inferred_dispersion", "geneID")) +}) + +test_that("extractDESeqDispersion function extracts dispersion correctly", { + N_GENES = 100 + MAX_REPLICATES = 5 + MIN_REPLICATES = 5 + input_var_list <- init_variable(name = "varA", mu = 10, sd = 0.1, level = 3) + mock_data <- mock_rnaseq(input_var_list, N_GENES, + min_replicates = MIN_REPLICATES, max_replicates = MAX_REPLICATES) + dds <- DESeq2::DESeqDataSetFromMatrix( + countData = round(mock_data$counts), + colData = mock_data$metadata, + design = ~ varA) + dds <- DESeq2::DESeq(dds, quiet = TRUE) + deseq_wrapped = wrapper_DESeq2(dds, 2, "greaterAbs") + + extracted_disp <- extractDESeqDispersion(deseq_wrapped) + expect_identical(colnames(extracted_disp), c("inferred_dispersion", "geneID")) +}) + +test_that("getDispersionComparison function works correctly", { + N_GENES = 100 + MAX_REPLICATES = 5 + MIN_REPLICATES = 5 + input_var_list <- init_variable(name = "varA", mu = 10, sd = 0.1, level = 3) + mock_data <- mock_rnaseq(input_var_list, N_GENES, + min_replicates = MIN_REPLICATES, max_replicates = MAX_REPLICATES) + data2fit <- prepareData2fit(countMatrix = mock_data$counts, metadata = mock_data$metadata) + l_res <- fitModelParallel(formula = kij ~ varA, + data = data2fit, group_by = "geneID", + family = glmmTMB::nbinom2(link = "log"), n.cores = 1) + + tmb_disp_inferred <- extractTMBDispersion(l_res) + + comparison <- getDispersionComparison(tmb_disp_inferred, mock_data$groundTruth$gene_dispersion) + expect_identical(colnames(comparison), c("actual_dispersion", "geneID", "inferred_dispersion")) +}) + +test_that("evaluateDispersion function works correctly", { + N_GENES = 100 + MAX_REPLICATES = 5 + MIN_REPLICATES = 5 + input_var_list <- init_variable(name = "varA", mu = 10, sd = 0.1, level = 3) + mock_data <- mock_rnaseq(input_var_list, N_GENES, + min_replicates = MIN_REPLICATES, max_replicates = MAX_REPLICATES) + data2fit <- prepareData2fit(countMatrix = mock_data$counts, metadata = mock_data$metadata) + l_res <- fitModelParallel(formula = kij ~ varA, + data = data2fit, group_by = "geneID", + family = glmmTMB::nbinom2(link = "log"), n.cores = 1) + dds <- DESeq2::DESeqDataSetFromMatrix( + countData = round(mock_data$counts), + colData = mock_data$metadata, + design = ~ varA) + dds <- DESeq2::DESeq(dds, quiet = TRUE) + deseq_wrapped = wrapper_DESeq2(dds, 2, "greaterAbs") + + tmb_disp_inferred <- extractTMBDispersion(l_res) + TMB_dispersion_df <- getDispersionComparison(tmb_disp_inferred, mock_data$groundTruth$gene_dispersion) + TMB_dispersion_df$from <- 'HTRfit' + DESEQ_disp_inferred <- extractDESeqDispersion(deseq_wrapped) + DESEQ_dispersion_df <- getDispersionComparison(DESEQ_disp_inferred , mock_data$groundTruth$gene_dispersion) + DESEQ_dispersion_df$from <- 'DESeq2' + + eval_disp <- evaluateDispersion(TMB_dispersion_df, DESEQ_dispersion_df, c("red", "blue")) + expect_identical(names(eval_disp), c("disp_plot", "data")) +}) + + + +``` + + + +## Sequencing depth + + +```{r function-seqDepth, filename = "scalingSequencingDepth"} + +#' Scale Counts Table +#' +#' This function scales a counts table based on the expected sequencing depth per sample. +#' +#' @param countsTable A counts table containing raw read counts. +#' @param seq_depth sequencing depth vector +#' @return A scaled counts table. +#' +#' @export +#' @examples +#' mock_data <- list(counts = matrix(c(10, 20, 30, 20, 30, 10, 10, 20, 20, 20, 30, 10), ncol = 4)) +#' scaled_counts <- scaleCountsTable(countsTable = mock_data$counts, 1000000) +#' +scaleCountsTable <- function(countsTable, seq_depth){ + seq_depth_simu <- colSums(countsTable) + + if (length(seq_depth) > length(seq_depth_simu)) + message("INFO: The length of the sequencing_depth vector exceeds the number of samples. Only the first N values will be utilized.") + if (length(seq_depth) < length(seq_depth_simu)) + message("INFO: The length of the sequencing_depth vector is shorter than the number of samples. Values will be recycled.") + + scalingDepth_factor <- suppressWarnings(seq_depth/seq_depth_simu) + counts_scaled <- as.data.frame(sweep(as.matrix(countsTable), 2, scalingDepth_factor, "*")) + return(counts_scaled) +} + + + + +``` + +```{r test-scalingSequencingDepth} + +# Test case 1: Scaling with valid min_seq_depth and max_seq_depth +test_that("Valid scaling of counts table", { + # Test data + mock_data <- list(counts = matrix(c(10, 20, 30, 20, 30, 10, 10, 20, 20, 20, 30, 10), ncol = 4)) + # Test function + scaled_counts <- scaleCountsTable(countsTable = mock_data$counts, 115000) + + # Expected scaled counts + expected_scaled_counts <- matrix(c(5000, 10000, 15000, 10000, 15000, 5000, + 5000, 10000, 10000, 10000, 15000, 5000), ncol = 4) + + # Check if the scaled counts match the expected scaled counts + expect_true(all(colSums(scaled_counts) == 115000)) + +}) + +``` + + +## Gene expression + + +```{r function-geneExpressionScaling, filename = "scalingGeneExpression"} + + + + +#' Get bin expression for a data frame. +#' +#' This function divides the values of a specified column in a data frame into \code{n_bins} bins of equal width. +#' The bin labels are then added as a new column in the data frame. +#' +#' @param dtf_coef A data frame containing the values to be binned. +#' @param n_bins The number of bins to create. +#' +#' @return A data frame with an additional column named \code{binExpression}, containing the bin labels. +#' @export +#' @examples +#' dtf <- data.frame(mu_ij = c(10, 20, 30, 15, 25, 35, 40, 5, 12, 22)) +#' dtf_with_bins <- getBinExpression(dtf, n_bins = 3) +#' +getBinExpression <- function(dtf_coef, n_bins){ + col2bin <- "mu_ij" + bin_labels <- cut(dtf_coef[[col2bin]], n_bins, labels = paste("BinExpression", 1:n_bins, sep = "_")) + dtf_coef$binExpression <- bin_labels + return(dtf_coef) +} + + + + +#' Generate BE data. +#' +#' This function generates BE data for a given number of genes, in a vector of BE values. +#' +#' @param n_genes The number of genes to generate BE data for. +#' @param basal_expression a numeric vector from which sample BE for eacg genes +#' +#' @return A data frame containing gene IDs, BE values +#' +#' @examples +#' generate_BE(n_genes = 100, 10) +#' +#' @export +generate_BE <- function(n_genes, basal_expression) { + ## --avoid bug if one value in basal_expr + pool2sample <- c(basal_expression, basal_expression) + BE <- sample(x = pool2sample, size = n_genes, replace = T) + l_geneID <- base::paste("gene", 1:n_genes, sep = "") + ret <- list(geneID = l_geneID, basalExpr = BE) %>% as.data.frame() + return(ret) +} + + + +#' Compute basal expresion for gene expression based on the coefficients data frame. +#' +#' This function takes the coefficients data frame \code{dtf_coef} and computes +#' basal expression for gene expression. The scaling factors are generated +#' using the function \code{generate_BE}. +#' +#' @param dtf_coef A data frame containing the coefficients for gene expression. +#' @param n_genes number of genes in simulation +#' @param basal_expression +#' +#' @return A modified data frame \code{dtf_coef} with an additional column containing +#' the scaling factors for gene expression. +#' @export +#' @examples +#' list_var <- init_variable() +#' N_GENES <- 5 +#' dtf_coef <- getInput2simulation(list_var, N_GENES) +#' dtf_coef <- getLog_qij(dtf_coef) +#' addBasalExpression(dtf_coef, N_GENES, 1) +addBasalExpression <- function(dtf_coef, n_genes, basal_expression){ + BE_df <- generate_BE(n_genes, basal_expression ) + dtf_coef <- join_dtf(dtf_coef, BE_df, "geneID", "geneID") + return(dtf_coef) +} + + + + +``` + +```{r test-geneExpressionScaling} + +test_that("generate_BE returns correct number of genes", { + be_data <- generate_BE(n_genes = 100, 1) + expect_equal(nrow(be_data), 100) +}) + + +test_that("generate_BE returns BE values within specified vector", { + BE_vec <- c(1, 2, 33, 0.4) + be_data <- generate_BE(n_genes = 100, BE_vec) + expect_true(all(be_data$basalExpr %in% BE_vec)) +}) + + +test_that("Test for addbasalExpre function",{ + + list_var <- init_variable() + N_GENES <- 5 + dtf_coef <- getInput2simulation(list_var, N_GENES) + dtf_coef <- getLog_qij(dtf_coef) + + # Test the function + dtf_coef_with_BE <- addBasalExpression(dtf_coef, N_GENES, 5) + + # Check if the output is a data frame + expect_true(is.data.frame(dtf_coef_with_BE)) + + # Check if the number of rows is equal to number of row in dtf_coef + expect_equal(nrow(dtf_coef_with_BE), nrow(dtf_coef)) + + # Check if the number of rows is equal to number of row in dtf_coef +1 + expect_equal(ncol(dtf_coef_with_BE), ncol(dtf_coef)+1) + + # Check if the data frame has a new column "BE" + expect_true("basalExpr" %in% colnames(dtf_coef_with_BE)) + + # Check if the values in the "BE" column are numeric + expect_true(all(is.numeric(dtf_coef_with_BE$basalExpr))) + +}) + + +# Test 1: Check if the function returns the correct number of bins +test_that("getBinExpression returns the correct number of bins", { + dtf <- data.frame(mu_ij = c(10, 20, 30, 15, 25, 35, 40, 5, 12, 22)) + n_bins <- 3 + dtf_with_bins <- getBinExpression(dtf, n_bins) + expect_equal(nrow(dtf_with_bins), nrow(dtf), label = "Number of rows should remain the same") + expect_equal(ncol(dtf_with_bins), ncol(dtf) + 1, label = "Number of columns should increase by 1") +}) + +# Test 2: Check if the function adds the binExpression column correctly +test_that("getBinExpression adds the binExpression column correctly", { + dtf <- data.frame(mu_ij = c(10, 20, 30, 15, 25, 35, 40, 5, 12, 22)) + n_bins <- 3 + dtf_with_bins <- getBinExpression(dtf, n_bins) + expected_bins <- c("BinExpression_1", "BinExpression_2", "BinExpression_3", "BinExpression_1", "BinExpression_2", + "BinExpression_3", "BinExpression_3", "BinExpression_1", "BinExpression_1", "BinExpression_2") + expect_equal(dtf_with_bins$binExpression, factor(expected_bins)) +}) + +# Test 3: Check if the function handles negative values correctly +test_that("getBinExpression handles negative values correctly", { + dtf <- data.frame(mu_ij = c(10, -20, 30, -15, 25, 35, -40, 5, 12, -22)) + n_bins <- 4 + dtf_with_bins <- getBinExpression(dtf, n_bins) + expected_bins <- c("BinExpression_3", "BinExpression_2", "BinExpression_4", "BinExpression_2", "BinExpression_4", + "BinExpression_4", "BinExpression_1", "BinExpression_3", "BinExpression_3", "BinExpression_1") + expect_equal(dtf_with_bins$binExpression, factor(expected_bins)) +}) + + + +``` + +```{r function-setCorrelation, filename = "setCorrelation"} + +#' Compute Covariation from Correlation and Standard Deviations +#' +#' This function computes the covariation between two variables (A and B) given their correlation and standard deviations. +#' +#' @param corr_AB The correlation coefficient between variables A and B. +#' @param sd_A The standard deviation of variable A. +#' @param sd_B The standard deviation of variable B. +#' +#' @return The covariation between variables A and B. +#' @export +#' @examples +#' corr <- 0.7 +#' sd_A <- 3 +#' sd_B <- 4 +#' compute_covariation(corr, sd_A, sd_B) +compute_covariation <- function(corr_AB, sd_A, sd_B) { + cov_AB <- corr_AB * sd_A * sd_B + return(cov_AB) +} + + +#' Get Standard Deviations for Variables in Correlation +#' +#' This function extracts the standard deviations for the variables involved in the correlation. +#' +#' @param list_var A list containing the variables and their attributes. +#' @param between_var A character vector containing the names of the variables involved in the correlation. +#' +#' @return A numeric vector containing the standard deviations for the variables in the correlation. +#' @export +#' @examples +#' list_var <- init_variable(name = "varA", mu = 0, sd = 5, level = 3) %>% +#' init_variable(name = "varB", mu = 0, sd = 25, level = 3) +#' between_var <- c("varA", "varB") +#' getStandardDeviationInCorrelation(list_var, between_var) +getStandardDeviationInCorrelation <- function(list_var, between_var){ + for (var in between_var) sd_List <- getGivenAttribute(list_var, "sd") + for (var in between_var) sd_ListFromInteraction <- getGivenAttribute(list_var$interactions, "sd") + sd_List <- c(sd_List, sd_ListFromInteraction) + return(unname(unlist(sd_List[between_var]))) +} + + + +#' Set Correlation between Variables +#' +#' Set the correlation between two or more variables in a simulation. +#' +#' @param list_var A list containing the variables used in the simulation, initialized using \code{\link{init_variable}}. +#' @param between_var Character vector specifying the names of the variables to set the correlation between. +#' @param corr Numeric value specifying the desired correlation between the variables. +#' +#' @return Updated \code{list_var} with the specified correlation set between the variables. +#' +#' @details The function checks if the variables specified in \code{between_var} are declared and initialized in the \code{list_var}. It also ensures that at least two variables with provided standard deviation are required to set a correlation in the simulation. +#' The specified correlation value must be within the range [-1, 1]. The function computes the corresponding covariance between the variables based on the specified correlation and standard deviations. +#' The correlation information is then added to the \code{list_var} in the form of a data frame containing the correlation value and the corresponding covariance value. +#' @export +#' @examples +#' list_var <- init_variable(name = "varA", mu = 0, sd = 5, level = 3) %>% +#' init_variable(name = "varB", mu = 0, sd = 25, level = 3) +#' list_var <- set_correlation(list_var, between_var = c("varA", "varB"), corr = 0.7) +set_correlation <- function(list_var, between_var, corr) { + + # Check if variables in between_var are declared and initialized + bool_checkBetweenVarValidity <- function(between_var, list_var) { + nb_varInCorrelation <- length(unique(between_var)) + stopifnot(nb_varInCorrelation > 1) + # -- check also for interaction + varInitialized <- c(getListVar(list_var), getListVar(list_var$interactions)) + existingVar_nb <- varInitialized %in% between_var %>% sum() + if (existingVar_nb != nb_varInCorrelation) { + return(FALSE) + } else { + return(TRUE) + } + } + + name_correlation <- paste(between_var, collapse = ".") + bool_valid_corr <- bool_checkBetweenVarValidity(between_var, list_var) + if (!bool_valid_corr) { + stop("At least one variable in between_var is not declared. Variable not initialized cannot be used in a correlation.") + } + + vec_standardDev <- getStandardDeviationInCorrelation(list_var, between_var) + if (length(vec_standardDev) < 2) { + stop("Exactly two variables with provided standard deviation are required to set a correlation in simulation.") + } + # Validate the specified correlation value to be within the range [-1, 1] + if (corr < -1 || corr > 1) { + stop("Invalid correlation value. Correlation must be in the range [-1, 1].") + } + + name_interaction <- paste(between_var, collapse = ":") + corr <- data.frame(cor = corr, covar = compute_covariation(corr, vec_standardDev[1], vec_standardDev[2] )) + list_var$correlations[[name_correlation]] <- corr + return(list_var) +} + + +``` + +```{r test-setcorrelation} + +test_that("compute_covariation returns the correct covariation", { + # Test case 1: Positive correlation + corr <- 0.7 + sd_A <- 3 + sd_B <- 4 + expected_cov <- corr * sd_A * sd_B + actual_cov <- compute_covariation(corr, sd_A, sd_B) + expect_equal(actual_cov, expected_cov) + + # Test case 2: Negative correlation + corr <- -0.5 + sd_A <- 2.5 + sd_B <- 3.5 + expected_cov <- corr * sd_A * sd_B + actual_cov <- compute_covariation(corr, sd_A, sd_B) + expect_equal(actual_cov, expected_cov) + + # Test case 3: Zero correlation + corr <- 0 + sd_A <- 1 + sd_B <- 2 + expected_cov <- corr * sd_A * sd_B + actual_cov <- compute_covariation(corr, sd_A, sd_B) + expect_equal(actual_cov, expected_cov) +}) + + +# Unit tests for getStandardDeviationInCorrelation +test_that("getStandardDeviationInCorrelation returns correct standard deviations", { + + # Initialize list_var + list_var <- init_variable(name = "varA", mu = 0, sd = 5, level = 3) %>% + init_variable(name = "varB", mu = 0, sd = 25, level = 3) + + # Test case 1: Two variables correlation + between_var_1 <- c("varA", "varB") + sd_expected_1 <- c(5, 25) + sd_result_1 <- getStandardDeviationInCorrelation(list_var, between_var_1) + expect_equal(sd_result_1, sd_expected_1) + +}) + + + +test_that("set_correlation sets the correlation between variables correctly", { + # Initialize variables in the list_var + list_var <- init_variable(name = "varA", mu = 0, sd = 5, level = 3) %>% + init_variable(name = "varB", mu = 0, sd = 25, level = 3) + + # Test setting correlation between varA and varB + list_var <- set_correlation(list_var, between_var = c("varA", "varB"), corr = 0.7) + + corr_result <- list_var$correlations$varA.varB$cor + covar_result <- list_var$correlations$varA.varB$covar + expect_equal(corr_result, 0.7) + expect_equal(covar_result, 87.5) + + # Test setting correlation between varA and varC (should raise an error) + expect_error(set_correlation(list_var, between_var = c("varA", "varC"), corr = 0.8), + "At least one variable in between_var is not declared. Variable not initialized cannot be used in a correlation.") + + # Test setting correlation with invalid correlation value + expect_error(set_correlation(list_var, between_var = c("varA", "varB"), corr = 1.5)) + + # Test setting correlation with less than 2 variables with provided standard deviation + expect_error(set_correlation(list_var, between_var = c("varA"), corr = 0.7)) +}) + + +``` + +```{r functionActualMainFixEff, filename = "actualMainFixEffects" } + +#' Calculate average values by group +#' +#' @param data The input data frame +#' @param column The name of the target variable +#' @param group_by The names of the grouping variables +#' @importFrom data.table setDT tstrsplit +#' @return A data frame with average values calculated by group +#' @export +averageByGroup <- function(data, column, group_by) { + group_values <- split(data[[column]], data[group_by]) + mean_values <- sapply(group_values, mean) + result <- data.frame(Group = names(mean_values), logQij_mean = mean_values) + data.table::setDT(result)[, {{ group_by }} := data.table::tstrsplit(Group, "[.]")] + result <- subset(as.data.frame(result), select = -Group) + return(result) +} + +#' Convert specified columns to factor +#' +#' @param data The input data frame +#' @param columns The column names to be converted to factors +#' @return The modified data frame with specified columns converted to factors +#' @export +convert2Factor <- function(data, columns) { + if (is.character(columns)) { + columns <- match(columns, colnames(data)) + } + + if (length(columns) > 1) data[, columns] <- lapply(data[, columns], as.factor ) + else data[, columns] <- as.factor(data[, columns]) + return(data) +} + +#' Subset Fixed Effect Inferred Terms +#' +#' This function subsets the tidy TMB object to extract the fixed effect inferred terms +#' along with their categorization into interaction and non-interaction terms. +#' +#' @param tidy_tmb The tidy TMB object containing the inferred terms. +#' +#' @return A list with two elements: +#' \describe{ +#' \item{fixed_term}{A list with two components - \code{nonInteraction} and \code{interaction}, +#' containing the names of the fixed effect inferred terms categorized as non-interaction and interaction terms, respectively.} +#' \item{data}{A data frame containing the subset of tidy_tmb that contains the fixed effect inferred terms.} +#' } +#' @export +#' @examples +#' input_var_list <- init_variable() +#' mock_data <- mock_rnaseq(input_var_list, 10, 2, 2) +#' getData2computeActualFixEffect(mock_data$groundTruth$effect) +#' data2fit = prepareData2fit(countMatrix = mock_data$counts, metadata = mock_data$metadata ) +#' #-- fit data +#' resFit <- fitModelParallel(formula = kij ~ myVariable , +#' data = data2fit, group_by = "geneID", +#' family = glmmTMB::nbinom2(link = "log"), n.cores = 1) +#' tidy_tmb <- tidy_tmb(resFit) +#' subsetFixEffectInferred(tidy_tmb) +subsetFixEffectInferred <- function(tidy_tmb){ + fixed_tidy <- tidy_tmb[tidy_tmb$effect == "fixed",] + l_term <- unique(fixed_tidy$term) + l_term <- l_term[!l_term %in% c("(Intercept)", NA)] + index_interaction <- grepl(x = l_term, ":") + l_term_nonInteraction <- l_term[!index_interaction] + l_term_interaction <- l_term[index_interaction] + l_term2ret <- list(nonInteraction = l_term_nonInteraction, interaction = l_term_interaction ) + return(list(fixed_term = l_term2ret, data = fixed_tidy)) +} + + +#' Get data for calculating actual values +#' +#' @param inference The inference data frame +#' @param groundTruth The ground truth data frame +#' @return A list containing required data for calculating actual values +#' @export +#' @examples +#' input_var_list <- init_variable() +#' mock_data <- mock_rnaseq(input_var_list, 10, 2, 2) +#' getData2computeActualFixEffect(mock_data$groundTruth$effect) +getData2computeActualFixEffect <- function(groundTruth){ + col_names <- colnames(groundTruth) + categorical_vars <- col_names[grepl(col_names, pattern = "label_")] + average_gt <- averageByGroup(groundTruth, "log_qij_scaled", c("geneID", categorical_vars)) + average_gt <- convert2Factor(data = average_gt, columns = categorical_vars ) + return(list(categorical_vars = categorical_vars, data = average_gt)) +} + + +#' Get the intercept dataframe +#' +#' @param fixeEff_dataActual The input list containing the categorical variables and the data +#' @return The intercept dataframe +#' @export +getActualIntercept <- function(fixeEff_dataActual) { + ## -- split list + data<- fixeEff_dataActual$data + categorical_vars <- fixeEff_dataActual$categorical_vars + + if (length(categorical_vars) == 1){ + l_labels <- list() + l_labels[[categorical_vars]] <- levels(data[, categorical_vars]) + + } else l_labels <- lapply(data[, categorical_vars], levels) + index_ref <- sapply(categorical_vars, function(var) data[[var]] == l_labels[[var]][1]) + index_ref <- rowSums(index_ref) == dim(index_ref)[2] + df_intercept <- data[index_ref, ] + df_intercept$term <- "(Intercept)" + colnames(df_intercept)[colnames(df_intercept) == "logQij_mean"] <- "actual" + df_intercept$description <- "(Intercept)" + + index2keep <- !colnames(df_intercept) %in% categorical_vars + df_intercept <- df_intercept[,index2keep] + + return(df_intercept) +} + + +#' Generate actual values for a given term +#' +#' @param term The term for which actual values are calculated +#' @param df_actualIntercept The intercept dataframe +#' @param dataActual The average ground truth dataframe +#' @param categorical_vars The names of the categorical variables +#' @return The data frame with actual values for the given term +#' @export +generateActualForMainFixEff <- function(term , df_actualIntercept , dataActual , categorical_vars){ + + computeActualValueForMainFixEff <- function(df_actualIntercept, df_term) { + df_term$actual <- df_term$logQij_mean - df_actualIntercept$actual + return(subset(df_term, select = -c(logQij_mean))) + } + + df_term <- subsetByTermLabel(dataActual, categorical_vars , term ) + df_term <- computeActualValueForMainFixEff(df_actualIntercept, df_term) + df_term$description <- gsub("\\d+$", "", term) + return(df_term) +} + + + +#' subset data By Term Label +#' +#' +#' Get a subset of the data based on a specific term label in the categorical variables. +#' +#' @param data The data frame to subset +#' @param categorical_vars The categorical variables to consider +#' @param term_label The term label to search for +#' @return A subset of the data frame containing rows where the categorical variables match the specified term label +#' @export +#' +#' @examples +#' # Create a data frame +#' my_data <- data.frame(color = c("red", "blue", "green", "red"), +#' size = c("small", "medium", "large", "medium"), +#' shape = c("circle", "square", "triangle", "circle")) +#' my_data[] <- lapply(my_data, as.factor) +#' +#' # Get the subset for the term "medium" in the "size" variable +#' subsetByTermLabel(my_data, "size", "medium") +#' # Output: A data frame with rows where "size" is "medium" +#' +#' # Get the subset for the term "red" in the "color" variable +#' subsetByTermLabel(my_data, "color", "red") +#' # Output: A data frame with rows where "color" is "red" +subsetByTermLabel <- function(data, categorical_vars, term_label ) { + if (length(categorical_vars) == 1) { + l_labels <- list() + l_labels[[categorical_vars]] <- levels(data[, categorical_vars]) + } else { + l_labels <- lapply(data[, categorical_vars], levels) + } + + term_variable <- findAttribute(term_label, l_labels) + if(is.null(term_variable)) stop("term_label not in 'data'") + + index_ref <- sapply(categorical_vars, function(var) { + if (var == term_variable) { + data[[var]] == term_label + } else { + data[[var]] == l_labels[[var]][1] + } + }) + + index_ref <- rowSums(index_ref) == dim(index_ref)[2] + df_term <- data[index_ref, ] + df_term$term <- term_label + return(df_term) +} + +#' Find Attribute +#' +#' Find the attribute containing the specified term in a given list. +#' +#' @param term The term to search for +#' @param list The list to search within +#' @return The attribute containing the term, or NULL if the term is not found in any attribute +#' @export +#' +#' @examples +#' # Create a list +#' my_list <- list(color = c("red", "blue", "green"), +#' size = c("small", "medium", "large"), +#' shape = c("circle", "square", "triangle")) +#' +#' # Find the attribute containing "medium" +#' findAttribute("medium", my_list) +findAttribute <- function(term, list) { + for (attr in names(list)) { + if (term %in% list[[attr]]) { + return(attr) + } + } + return(NULL) # If the term is not found in any attribute +} + +#' Get actual values for non-interaction terms +#' +#' @param l_term list of term to compute +#' @param fixeEff_dataActual A list containing required data for calculating actual values +#' @param df_actualIntercept The data frame containing the actual intercept values +#' @return A data frame with actual values for non-interaction terms +#' @export +getActualMainFixEff <- function( l_term , fixeEff_dataActual , df_actualIntercept ){ + ## -- split list + categorical_vars <- fixeEff_dataActual$categorical_vars + data_groundTruth <- fixeEff_dataActual$data + ## -- iteration over term + l_actual <- lapply(l_term, + function(term){ + generateActualForMainFixEff(term, df_actualIntercept, + data_groundTruth, categorical_vars)}) + df_actual <- do.call("rbind", l_actual) + index2keep <- !colnames(df_actual) %in% categorical_vars + df_actual <- df_actual[,index2keep] + return(df_actual) +} + + + + + +``` + +```{r test-actualMainFixEff} + +test_that("Test for subsetFixEffectInferred function", { + # Prepare the test data + input_var_list <- init_variable(name = "varA", mu = c(1,2,3), level = 3) %>% + init_variable(name = "varB", mu = c(2,-6), level = 2) %>% + add_interaction(between_var = c("varA", "varB"), mu = 1, sd = 3) + + mock_data <- mock_rnaseq(input_var_list, 10, 2, 2) + getData2computeActualFixEffect(mock_data$groundTruth$effect) + data2fit <- prepareData2fit(countMatrix = mock_data$counts, metadata = mock_data$metadata, normalization = F) + + # Fit data + resFit <- fitModelParallel(formula = kij ~ varA + varB + varA:varB, + data = data2fit, group_by = "geneID", + family = glmmTMB::nbinom2(link = "log"), n.cores = 1) + tidy_tmb <- tidy_tmb(resFit) + + # Test the subsetFixEffectInferred function + result <- subsetFixEffectInferred(tidy_tmb) + # Define expected output + expected_nonInteraction <- c("varA2", "varA3", "varB2") + expected_interaction <- c("varA2:varB2", "varA3:varB2") + + # Compare the output with the expected values + expect_equal(result$fixed_term$nonInteraction, expected_nonInteraction) + expect_equal(result$fixed_term$interaction, expected_interaction) +}) + + +# Tests for averageByGroup +test_that("averageByGroup returns correct average values", { + # Create a sample data frame + data <- data.frame( + Group1 = rep(c("A", "B", "C", "D"), each = 2), + Group2 = rep(c("X", "Y"), times = 4), + Value = 1:8 + ) + + # Calculate average values by group + result <- averageByGroup(data, column = "Value", group_by = c("Group1", "Group2")) + + # Check the output + expect_equal(nrow(result), 8) # Number of rows + expect_equal(colnames(result), c("logQij_mean","Group1", "Group2" )) # Column names + expect_equal(result$logQij_mean, c(1, 3,5, 7, 2, 4, 6, 8)) # Average values +}) + +# Tests for convert2Factor +test_that("convert2Factor converts specified columns to factors", { + # Create a sample data frame + data <- data.frame( + Category1 = c("A", "B", "A", "B"), + Category2 = c("X", "Y", "X", "Z"), + Value = 1:4, + stringsAsFactors = FALSE + ) + + # Convert columns to factors + result <- convert2Factor(data, columns = c("Category1", "Category2")) + + # Check the output + expect_is(result$Category1, "factor") # Category1 column converted to factor + expect_is(result$Category2, "factor") # Category2 column converted to factor +}) + +# Tests for findAttribute +test_that("findAttribute returns the correct attribute", { + # Create a sample list + my_list <- list( + color = c("red", "blue", "green"), + size = c("small", "medium", "large"), + shape = c("circle", "square", "triangle") + ) + + # Find attributes + attr1 <- findAttribute("medium", my_list) + attr2 <- findAttribute("rectangle", my_list) + + # Check the output + expect_equal(attr1, "size") # Attribute containing "medium" + expect_equal(attr2, NULL) # Attribute containing "rectangle" +}) + +# Tests for getActualIntercept +test_that("getActualIntercept returns the correct intercept dataframe", { + # Create a sample data frame + data <- data.frame( + Category1 = c("A", "B", "A", "B"), + Category2 = c("X", "Y", "X", "Z"), + logQij_mean = 1:4 + ) + data[, c("Category1", "Category2")] <- lapply(data[, c("Category1", "Category2")], as.factor ) + + l_fixEffDataActual= list(categorical_vars = c("Category1", "Category2"), data = data) + # Get the intercept dataframe + result <- getActualIntercept(l_fixEffDataActual) + + # Check the output + expect_equal(nrow(result), 2) # Number of rows + expect_equal(unique(result$term), "(Intercept)") # Term column + expect_equal(result$actual, c(1, 3)) # Actual column +}) + + + + + +# Test subsetByTermLabel with single categorical variable +test_that("subsetByTermLabel with single categorical variable", { + my_data <- list(color = c("red", "blue", "green", "red"), + size = c("small", "medium", "large", "medium"), + shape = c("circle", "square", "triangle", "circle")) + my_data <- expand.grid(my_data) + my_data[] <- lapply(my_data, as.factor) + + subset_data <- subsetByTermLabel(my_data, categorical_vars = "size", term_label = "medium") + expected_data <- my_data[my_data$size == "medium", ] + expected_data$term <- "medium" + + expect_equal(subset_data, expected_data) +}) + +# Test subsetByTermLabel with single term label in multiple categorical variables +test_that("subsetByTermLabel with single term label in multiple categorical variables", { + my_data <- list(color = c("red", "blue", "green", "red"), + size = c("small", "medium", "large", "medium"), + shape = c("circle", "square", "triangle", "circle")) + my_data <- expand.grid(my_data) + my_data[] <- lapply(my_data, as.factor) + + subset_data <- subsetByTermLabel(data = my_data, categorical_vars = c("color", "shape"), term_label = "circle") + expected_data <- my_data[my_data$shape == "circle" & my_data$color == "red" , ] + expected_data$term <- "circle" + + expect_equal(subset_data, expected_data) +}) + +# Test subsetByTermLabel with non-existent term label expect error +test_that("subsetByTermLabel with non-existent term label", { + my_data <- list(color = c("red", "blue", "green", "red"), + size = c("small", "medium", "large", "medium"), + shape = c("circle", "square", "triangle", "circle")) + my_data <- expand.grid(my_data) + my_data[] <- lapply(my_data, as.factor) + + expect_error(subsetByTermLabel(data = my_data, categorical_vars = "size", term_label = "extra-large")) +}) + + + +# Test getActualMainFixEff +test_that("getActualMainFixEff", { + input_var_list <- init_variable() + mock_data <- mock_rnaseq(input_var_list, 2, 2, 2) + data2fit <- prepareData2fit(mock_data$counts, mock_data$metadata) + inference <- fitModelParallel(kij ~ myVariable , + group_by = "geneID", data2fit, n.cores = 1) + tidy_inference <- tidy_tmb(inference) + tidy_fix <- subsetFixEffectInferred(tidy_inference) + fixEff_dataActual <- getData2computeActualFixEffect(mock_data$groundTruth$effects) + actual_intercept <- getActualIntercept(fixEff_dataActual) + ## -- main = non interaction + actual_mainFixEff <- getActualMainFixEff(tidy_fix$fixed_term$nonInteraction, + fixEff_dataActual, actual_intercept) + + expected_actual <- data.frame(geneID = c("gene1", "gene2"), + term = c("myVariable2", "myVariable2"), + actual = c(1, 1), + description = "myVariable") + rownames(actual_mainFixEff) <- NULL + rownames(actual_mainFixEff) <- NULL + expect_equal(actual_mainFixEff, expected_actual) +}) + + + +test_that("getData2computeActualFixEffect return correct output",{ + # Prepare the test data + input_var_list <- init_variable() + mock_data <- mock_rnaseq(input_var_list, 2, 2, 2) + data2fit <- prepareData2fit(mock_data$counts, mock_data$metadata) + inference <- fitModelParallel(kij ~ myVariable, group_by = "geneID", data2fit, n.cores = 1) + tidy_inference <- tidy_tmb(inference) + tidy_fix <- subsetFixEffectInferred(tidy_inference) + + # Call the function to test + fixEff_dataActual <- getData2computeActualFixEffect(mock_data$groundTruth$effects) + + # Define expected output + expected_data <- data.frame(logQij_mean = c(2,2,3,3), geneID = c("gene1", "gene2", "gene1", "gene2"), label_myVariable = factor(c("myVariable1", "myVariable1", "myVariable2", "myVariable2"))) + expected_categorical_vars <- "label_myVariable" + # Compare the output with the expected values + expect_equal(fixEff_dataActual$data, expected_data) + expect_equal(fixEff_dataActual$categorical_vars, expected_categorical_vars) +}) + + +test_that("generateActualForMainFixEff returns correct values for main fixed effect term", { + # Prepare the test data + input_var_list <- init_variable() + mock_data <- mock_rnaseq(input_var_list, 2, 2, 2) + data2fit <- prepareData2fit(mock_data$counts, mock_data$metadata) + fixEff_dataActual <- getData2computeActualFixEffect(mock_data$groundTruth$effects) + actual_intercept <- getActualIntercept(fixEff_dataActual) + df_term <- generateActualForMainFixEff("myVariable2", actual_intercept, fixEff_dataActual$data, fixEff_dataActual$categorical_vars) + + # Define expected output + expected <- data.frame( + geneID = c("gene1", "gene2"), + label_myVariable = factor(c("myVariable2", "myVariable2"), levels = c("myVariable1", "myVariable2")), + term = c("myVariable2", "myVariable2"), + actual = c(1, 1), + description = c("myVariable", "myVariable") + ) + rownames(df_term) <- NULL + rownames(expected) <- NULL + # Compare the output with the expected values + expect_equal(df_term, expected) +}) + + +``` + +```{r functionActualInteractionFixEff, filename = "actualInteractionFixEffects" } +#' Filter DataFrame +#' +#' Filter a DataFrame based on the specified filter list. +#' +#' @param df The DataFrame to be filtered +#' @param filter_list A list specifying the filters to be applied +#' @return The filtered DataFrame +#' @export +#' +#' @examples +#' # Create a DataFrame +#' df <- data.frame(ID = c(1, 2, 3, 4), +#' Name = c("John", "Jane", "Mike", "Sarah"), +#' Age = c(25, 30, 28, 32), +#' Gender = c("Male", "Female", "Male", "Female")) +#' +#' # Create a filter list +#' filter_list <- list(Name = c("John", "Mike"), Age = c(25, 28)) +#' +#' # Filter the DataFrame +#' filter_dataframe(df, filter_list) +filter_dataframe <- function(df, filter_list ) { + filtered_df <- df + + for (attr_name in attributes(filter_list)$names) { + attr_value <- filter_list[[attr_name]] + + filtered_df <- filtered_df[filtered_df[[attr_name]] %in% attr_value, ] + } + + return(filtered_df) +} + + +#' Calculate actual interaction values between two terms in a data frame. +#' +#' This function calculates the actual interaction values between two terms, \code{lbl_term_1} and \code{lbl_term_2}, +#' in the given data frame \code{data}. The interaction values are computed based on the mean log expression levels +#' of the conditions satisfying the specified term combinations, and also considering a reference condition. +#' +#' @param data A data frame containing the expression data and associated terms. +#' @param df_intercept A data frame containing the intercept values for computing interaction effects. +#' @param l_reference A data frame representing the reference condition for the interaction. +#' @param clmn_term_1 The name of the column in \code{data} representing the first term. +#' @param lbl_term_1 The label of the first term to compute interactions for. +#' @param clmn_term_2 The name of the column in \code{data} representing the second term. +#' @param lbl_term_2 The label of the second term to compute interactions for. +#' +#' @return A numeric vector containing the actual interaction values between the specified terms. +#' @export +#' @examples +#' average_gt <- data.frame(clmn_term_1 = c("A", "A", "B", "B"), +#' clmn_term_2 = c("X", "Y", "X", "Y"), +#' logQij_mean = c(1.5, 2.0, 0.5, 1.0)) +#' df_intercept <- data.frame(actual = 0.8) +#' # Définir les paramètres de la fonction +#' l_label <- list(clmn_term_1 = c("A", "B"), clmn_term_2 = c("X", "Y")) +#' clmn_term_1 <- "clmn_term_1" +#' lbl_term_1 <- "A" +#' clmn_term_2 <- "clmn_term_2" +#' lbl_term_2 <- "Y" +#' # Calculer la valeur d'interaction réelle +#' actual_interaction <- calculate_actual_interaction_values(average_gt, df_intercept, +#' l_label, clmn_term_1, lbl_term_1, +#' clmn_term_2, lbl_term_2) +calculate_actual_interaction_values <- function(data , df_intercept, l_reference , clmn_term_1, lbl_term_1, clmn_term_2, lbl_term_2) { + A <- data[data[[clmn_term_1]] == lbl_term_1 & data[[clmn_term_2]] == lbl_term_2, ] + B <- data[data[[clmn_term_1]] == lbl_term_1 & data[[clmn_term_2]] == l_reference[[clmn_term_2]][1], ] + C <- data[data[[clmn_term_1]] == l_reference[[clmn_term_1]][1] & data[[clmn_term_2]] == lbl_term_2, ] + D <- df_intercept + actual_interaction <- (A$logQij_mean - B$logQij_mean) - (C$logQij_mean - D$actual) + return(actual_interaction) +} + + +#' Prepare data for computing interaction values. +#' +#' This function prepares the data for computing interaction values between variables. +#' It filters the \code{dataActual} data frame by selecting only the rows where the categorical variables +#' specified in \code{categorical_vars} are at their reference levels. +#' +#' @param categorical_vars A character vector containing the names of categorical variables. +#' @param categorical_varsInInteraction A character vector containing the names of categorical variables involved in interactions. +#' @param dataActual A data frame containing the actual data with categorical variables and associated expression levels. +#' +#' @return A data frame containing the filtered data for computing interaction values. +#' @export +prepareData2computeInteraction <- function(categorical_vars, categorical_varsInInteraction, dataActual){ + l_RefInCategoricalVars <- lapply(dataActual[, categorical_vars], function(vector) levels(vector)[1]) + l_categoricalVars_NOT_InInteraction <- categorical_vars[! categorical_vars %in% categorical_varsInInteraction ] + l_filter <- l_RefInCategoricalVars[l_categoricalVars_NOT_InInteraction] + dataActual_2computeInteractionValues <- filter_dataframe(dataActual, l_filter) + return(dataActual_2computeInteractionValues) +} + + + +#' Generate actual values for the interaction fixed effect. +#' +#' This function calculates the actual values for the interaction fixed effect +#' based on the input labels in the interaction, categorical variables in the interaction, +#' data to compute interaction values, actual intercept, and the reference levels in +#' categorical variables. +#' +#' @param labelsInInteraction A vector containing the labels of the interaction terms. +#' @param l_categoricalVarsInInteraction A vector containing the names of categorical variables +#' involved in the interaction. +#' @param data2computeInteraction The data frame used to compute interaction values. +#' @param actual_intercept The actual intercept data frame. +#' @param l_RefInCategoricalVars A list containing the reference levels of categorical variables. +#' +#' @return A data frame with the actual values for the interaction fixed effect. +#' The data frame includes columns: term, actual, and description. +#' +#' @export +generateActualInteractionFixEff <- function(labelsInInteraction, l_categoricalVarsInInteraction, data2computeInteraction, actual_intercept, l_RefInCategoricalVars ){ + clmn_term_1 <- l_categoricalVarsInInteraction[1] + lbl_term_1 <- labelsInInteraction[1] + clmn_term_2 <- l_categoricalVarsInInteraction[2] + lbl_term_2 <- labelsInInteraction[2] + interactionValues <- calculate_actual_interaction_values(data2computeInteraction, + actual_intercept, l_RefInCategoricalVars, clmn_term_1, + lbl_term_1, clmn_term_2, lbl_term_2) + + ## categorical vars 2 keep + #index2keep <- grepl("label_", colnames(actual_intercept)) + df_actualForMyInteraction <- actual_intercept#[, index2keep] + df_actualForMyInteraction$term <- paste(labelsInInteraction, collapse = ":") + df_actualForMyInteraction$actual <- interactionValues + df_actualForMyInteraction$description <- paste(gsub("\\d+$", "", lbl_term_1) , + gsub("\\d+$", "", lbl_term_2), sep = ":") + + return(df_actualForMyInteraction) + +} + + + +#' Get the actual interaction values for a given interaction term in the data. +#' +#' This function takes an interaction term, the dataset, and the names of the categorical variables +#' as inputs. It calculates the actual interaction values based on the difference in log-transformed +#' mean expression levels for the specified interaction term. The function first prepares the data for +#' computing the interaction values and then generates the actual interaction values. +#' +#' @param labelsInInteraction A character vector containing the labels of the categorical levels +#' involved in the interaction. +#' @param data The dataset containing the gene expression data and categorical variables. +#' @param categorical_vars A character vector containing the names of the categorical variables in +#' the dataset. +#' @param actual_intercept +#' @return A data frame containing the actual interaction values. +#' @export +getActualInteractionFixEff <- function(labelsInInteraction, data, categorical_vars, actual_intercept){ + l_RefInCategoricalVars <- lapply(data[, categorical_vars], function(vector) levels(vector)[1]) + l_labelsInCategoricalVars <- lapply(data[, categorical_vars], levels) + l_categoricalVarsInInteraction <- lapply(labelsInInteraction, + function(label) findAttribute(label, l_labelsInCategoricalVars)) %>% unlist() + data2computeInteraction <- prepareData2computeInteraction(categorical_vars, l_categoricalVarsInInteraction, data ) + actualInteractionValues <- generateActualInteractionFixEff(labelsInInteraction,l_categoricalVarsInInteraction , + data2computeInteraction, actual_intercept, l_RefInCategoricalVars) + return(actualInteractionValues) +} + + +#' Compute actual interaction values for multiple interaction terms. +#' +#' This function calculates the actual interaction values for multiple interaction terms +#' using the provided data. +#' +#' @param l_interactionTerm A list of interaction terms in the form of "term1:term2". +#' @param categorical_vars A character vector containing the names of categorical variables in the data. +#' @param dataActual The data frame containing the actual gene expression values and metadata. +#' +#' @return A data frame containing the actual interaction values for each interaction term. +#' @export +#' @examples +#' N_GENES <- 4 +#' MIN_REPLICATES <- 3 +#' MAX_REPLICATES <- 3 +#' init_var <- init_variable(name = "varA", mu = 8, sd = 0.1, level = 3) %>% +#' init_variable(name = "varB", mu = c(5,-5), NA , level = 2) %>% +#' init_variable(name = "varC", mu = 1, 3, 3) %>% +#' add_interaction(between_var = c("varA", "varC"), mu = 5, 0.1) +#' mock_data <- mock_rnaseq(init_var, N_GENES, min_replicates = MIN_REPLICATES, max_replicates = MAX_REPLICATES ) +#' data2fit <- prepareData2fit(countMatrix = mock_data$counts, metadata = mock_data$metadata ) +#' results_fit <- fitModelParallel(formula = kij ~ varA + varB + varC + varA:varC, +#' data = data2fit, group_by = "geneID", +#' family = glmmTMB::nbinom2(link = "log"), n.cores = 1) +#' tidy_tmb <- tidy_tmb(results_fit) +#' fixEff_dataInference <- subsetFixEffectInferred(tidy_tmb) +#' fixEff_dataActual <- getData2computeActualFixEffect(mock_data$groundTruth$effects) +#' interactionTerm <- fixEff_dataInference$fixed_term$interaction[[1]] +#' categorical_vars <- fixEff_dataActual$categorical_vars +#' dataActual <- fixEff_dataActual$data +#' l_labelsInCategoricalVars <- lapply(dataActual[, categorical_vars], levels) +#' l_interaction <- strsplit(interactionTerm, split = ":")[[1]] +#' l_categoricalVarsInInteraction <- lapply(l_interaction, +#' function(label) findAttribute(label, l_labelsInCategoricalVars)) %>% unlist() +#' data_prepared <- prepareData2computeInteraction(categorical_vars, l_categoricalVarsInInteraction, dataActual) +#' actual_intercept <- getActualIntercept(fixEff_dataActual) +#' # Compute actual interaction values for multiple interactions +#' actualInteraction <- computeActualInteractionFixEff(interactionTerm, categorical_vars, dataActual, actual_intercept) +computeActualInteractionFixEff <- function(l_interactionTerm, categorical_vars, dataActual, actualIntercept){ + + l_interaction <- strsplit(l_interactionTerm, split = ":") + l_interactionActualValues <- lapply(l_interaction, function(labelsInInteraction) + getActualInteractionFixEff(labelsInInteraction, dataActual, categorical_vars, actualIntercept)) + actualInteraction_df <- do.call('rbind', l_interactionActualValues) + return(actualInteraction_df) +} +``` + +```{r test-actualInteractionFixEff } + +test_that("filter_dataframe retourne le dataframe filtré correctement", { + # Créer un exemple de dataframe + df <- data.frame( + col1 = c(1, 2, 3, 4, 5), + col2 = c("A", "B", "C", "D", "E"), + col3 = c("X", "Y", "Z", "X", "Y") + ) + + # Créer une liste de filtres + filter_list <- list( + col1 = c(2), + col2 = "B", + col3 = c("Y") + ) + + # Appliquer les filtres sur le dataframe + filtered_df <- filter_dataframe(df, filter_list) + + # Vérifier que les lignes correspondantes sont présentes dans le dataframe filtré + expect_equal(nrow(filtered_df), 1) + expect_true(all(filtered_df$col1 %in% c(2))) + expect_true(all(filtered_df$col2 == "B")) + expect_true(all(filtered_df$col3 %in% c("Y"))) +}) + +test_that("filter_dataframe retourne le dataframe d'origine si aucun filtre n'est spécifié", { + # Créer une liste de filtres vide + filter_list <- list() + + # Appliquer les filtres sur le dataframe + filtered_df <- filter_dataframe(df, filter_list) + + # Vérifier que le dataframe filtré est identique au dataframe d'origine + expect_identical(filtered_df, df) +}) + +test_that("calculate_actual_interaction_values retourne la valeur d'interaction réelle correctement", { + average_gt <- data.frame( + clmn_term_1 = c("A", "A", "B", "B"), + clmn_term_2 = c("X", "Y", "X", "Y"), + logQij_mean = c(1.5, 2.0, 0.5, 1.0) + ) + + df_intercept <- data.frame( + actual = 0.8 + ) + + # Définir les paramètres de la fonction + l_label <- list(clmn_term_1 = c("A", "B"), clmn_term_2 = c("X", "Y")) + clmn_term_1 <- "clmn_term_1" + lbl_term_1 <- "A" + clmn_term_2 <- "clmn_term_2" + lbl_term_2 <- "Y" + + # Calculer la valeur d'interaction réelle + actual_interaction <- calculate_actual_interaction_values(average_gt, df_intercept, l_label, clmn_term_1, lbl_term_1, clmn_term_2, lbl_term_2) + + # Vérifier que la valeur d'interaction réelle est correcte + expect_equal(actual_interaction, -0.7) +}) + + + +test_that("prepareData2computeInteraction filters data correctly", { + + data <- data.frame( + geneID = c("gene1", "gene2", "gene3", "gene4"), + label_varA = factor(c("A", "A", "B", "B")), + label_varB = factor(c("X", "X", "Y", "Y")), + label_varC = factor(c("P", "P", "Q", "Q")), + logQij_mean = c(1.2, 3.4, 5.6, 7.8) + ) + categorical_vars <- c("label_varA", "label_varB", "label_varC") + categorical_varsInInteraction <- c("label_varA", "label_varC") + + dataActual_2computeInteractionValues <- prepareData2computeInteraction(categorical_vars, categorical_varsInInteraction, data) + + expect_equal(nrow(dataActual_2computeInteractionValues), 2) + expect_true(all(dataActual_2computeInteractionValues$label_varA %in% c("A", "A"))) + expect_true(all(dataActual_2computeInteractionValues$label_varB %in% c("X", "X"))) + expect_true(all(dataActual_2computeInteractionValues$label_varC %in% c("P", "P"))) + expect_equal(dataActual_2computeInteractionValues$logQij_mean, c(1.2, 3.4 )) +}) + + + +## TEST +test_that("Generate actual interaction fixed effect correctly", { + + ########################################################################" + N_GENES <- 4 + MIN_REPLICATES <- 3 + MAX_REPLICATES <- 3 + + init_var <- init_variable(name = "varA", mu = 8, sd = 0.1, level = 3) %>% + init_variable(name = "varB", mu = c(5, -5), NA, level = 2) %>% + init_variable(name = "varC", mu = 1, 3, 3) %>% + add_interaction(between_var = c("varA", "varC"), mu = 5, 0.1) + + # -- simulation + mock_data <- mock_rnaseq(init_var, N_GENES, min_replicates = MIN_REPLICATES, max_replicates = MAX_REPLICATES) + + # -- fit data + data2fit <- prepareData2fit(countMatrix = mock_data$counts, metadata = mock_data$metadata) + results_fit <- fitModelParallel(formula = kij ~ varA + varB + varC + varA:varC, + data = data2fit, group_by = "geneID", + family = glmmTMB::nbinom2(link = "log"), n.cores = 1) + + # -- inputs + tidy_tmb <- tidy_tmb(results_fit) + fixEff_dataInference <- subsetFixEffectInferred(tidy_tmb) + fixEff_dataActual <- getData2computeActualFixEffect(mock_data$groundTruth$effects) + + interactionTerm <- fixEff_dataInference$fixed_term$interaction[[1]] + categorical_vars <- fixEff_dataActual$categorical_vars + dataActual <- fixEff_dataActual$data + l_labelsInCategoricalVars <- lapply(dataActual[, categorical_vars], levels) + l_interaction <- strsplit(interactionTerm, split = ":")[[1]] + l_categoricalVarsInInteraction <- lapply(l_interaction, + function(label) findAttribute(label, l_labelsInCategoricalVars)) %>% unlist() + + data_prepared <- prepareData2computeInteraction(categorical_vars, l_categoricalVarsInInteraction, dataActual) + actual_intercept <- getActualIntercept(fixEff_dataActual) + l_RefInCategoricalVars <- lapply(dataActual[, categorical_vars], function(vector) levels(vector)[1]) + ####################################################################### + + actualInteraction <- generateActualInteractionFixEff(l_interaction, l_categoricalVarsInInteraction, data_prepared, actual_intercept, l_RefInCategoricalVars) + + # Add your assertions here based on the expected values + # For example: + expect_true(nrow(actualInteraction) == 4) + expect_equal(actualInteraction$geneID, c("gene1", "gene2", "gene3", "gene4")) + expect_true(all(actualInteraction$term %in% 'varA2:varC2')) + #expect_true(all(actualInteraction$description %in% 'interaction')) + expect_true(is.numeric(actualInteraction$actual)) + + # Add more assertions as needed... +}) + + +# Test the function `generateActualInteractionFixEff` +test_that("Test generateActualInteractionFixEff function", { + # Generate example data + data <- data.frame( + geneID = c("gene1", "gene1", "gene2", "gene2"), + varA = factor(c("A1", "A2", "A1", "A2")), + varB = factor(c("B1", "B2", "B1", "B2")), + varC = factor(c("C1", "C2", "C1", "C2")), + logQij_mean = c(3.2, 5.1, 4.7, 6.3) + ) + + categorical_vars <- c("varA", "varB", "varC") + labelsInInteraction <- c("A1", "B1") + + actual_intercept <- data.frame(actual = c(23, 21 ), + geneID = c("gene1", "gene2"), + term = c("(Intercept)", "(Intercept)"), + description = c("(Intercept)", "(Intercept)")) + # Run the function + actualInteractionValues <- getActualInteractionFixEff(labelsInInteraction, data, categorical_vars, actual_intercept) + + l_RefInCategoricalVars <- lapply(data[, categorical_vars], function(vector) levels(vector)[1]) + l_labelsInCategoricalVars <- lapply(data[, categorical_vars], levels) + l_categoricalVarsInInteraction <- lapply(labelsInInteraction, + function(label) findAttribute(label, l_labelsInCategoricalVars)) %>% unlist() + data2computeInteraction <- prepareData2computeInteraction(categorical_vars, l_categoricalVarsInInteraction, data ) + actualInteractionValues <- generateActualInteractionFixEff(labelsInInteraction,l_categoricalVarsInInteraction , + data2computeInteraction, actual_intercept, l_RefInCategoricalVars) + + + + # Define the expected output based on the example data + expected_output <- data.frame( + term = "A1:B1", + geneID = c("gene1", "gene2"), + actual = c(19.8, 16.3), + description = c("interaction", "interaction") + ) + + # Add your assertions here to compare the actual output with the expected output + expect_equal(nrow(actualInteractionValues), nrow(expected_output)) + expect_equal(actualInteractionValues$geneID, expected_output$geneID) + expect_equal(actualInteractionValues$term, expected_output$term) + expect_equal(actualInteractionValues$actual, expected_output$actual) + #expect_equal(actualInteractionValues$description, expected_output$description) + +}) + + + +test_that("Test getActualInteractionFixEff", { + + # Exemple de données d'entrée + N_GENES <- 4 + MIN_REPLICATES <- 3 + MAX_REPLICATES <- 3 + + init_var <- init_variable(name = "varA", mu = 8, sd = 0.1, level = 3) %>% + init_variable(name = "varB", mu = c(5,-5), NA, level = 2) %>% + init_variable(name = "varC", mu = 1, 3, 3) %>% + add_interaction(between_var = c("varA", "varC"), mu = 5, 0.1) + + # Simulation + mock_data <- mock_rnaseq(init_var, N_GENES, min_replicates = MIN_REPLICATES, max_replicates = MAX_REPLICATES) + + # Données de fit + data2fit <- prepareData2fit(countMatrix = mock_data$counts, metadata = mock_data$metadata) + results_fit <- fitModelParallel(formula = kij ~ varA + varB + varC + varA:varC, + data = data2fit, group_by = "geneID", + family = glmmTMB::nbinom2(link = "log"), n.cores = 1) + + # Données d'entrée + tidy_tmb <- tidy_tmb(results_fit) + fixEff_dataInference <- subsetFixEffectInferred(tidy_tmb) + fixEff_dataActual <- getData2computeActualFixEffect(mock_data$groundTruth$effects) + interactionTerm <- fixEff_dataInference$fixed_term$interaction[[1]] + categorical_vars <- fixEff_dataActual$categorical_vars + dataActual <- fixEff_dataActual$data + l_labelsInCategoricalVars <- lapply(dataActual[, categorical_vars], levels) + l_interaction <- strsplit(interactionTerm, split = ":")[[1]] + l_categoricalVarsInInteraction <- lapply(l_interaction, + function(label) findAttribute(label, l_labelsInCategoricalVars)) %>% unlist() + + data_prepared <- prepareData2computeInteraction(categorical_vars, l_categoricalVarsInInteraction, dataActual) + actual_intercept <- getActualIntercept(fixEff_dataActual) + + # Appel de la fonction à tester + actualInteraction <- getActualInteractionFixEff(l_interaction, data_prepared, categorical_vars, actual_intercept) + + + expect_true(nrow(actualInteraction) == 4) + expect_equal(actualInteraction$geneID, c("gene1", "gene2", "gene3", "gene4")) + expect_true(all(actualInteraction$term %in% 'varA2:varC2')) + #expect_true(all(actualInteraction$description %in% 'interaction')) + expect_true(is.numeric(actualInteraction$actual)) +}) + + +``` + +```{r function-inferenceToExpected, filename = "inferenceToExpected" } + +#' Compare the results of inference with the ground truth data. +#' +#' This function takes the data frames containing the inference results and the ground truth data +#' and generates a table to compare the inferred values with the expected values. +#' +#' @param tidy_tmb A data frame containing the results of inference. +#' @param df_ground_truth A data frame containing the ground truth data used for simulation. +#' @param gene_id_column The name of the column in both data frames representing the gene IDs. +#' +#' @return A data frame +#' +#' @examples +#' +#' @export +compareInferenceToExpected <- function(tidy_tmb , df_ground_truth) { + + ## -- get data + fixEff_dataInference <- subsetFixEffectInferred(tidy_tmb) + fixEff_dataActual <- getData2computeActualFixEffect(df_ground_truth) + actual_intercept <- getActualIntercept(fixEff_dataActual) + + ## -- main = non interaction + l_mainEffectTerm <- fixEff_dataInference$fixed_term$nonInteraction + actual_mainFixEff <- getActualMainFixEff(l_mainEffectTerm, fixEff_dataActual, actual_intercept) + + ## -- interaction term + l_interactionTerm <- fixEff_dataInference$fixed_term$interaction + categorical_vars <- fixEff_dataActual$categorical_vars + data <- fixEff_dataActual$data + actualInteractionFixEff <- computeActualInteractionFixEff(l_interactionTerm, categorical_vars, data, actual_intercept) + + ## -- rbind Interaction & Main + actual_fixEff <- rbind(actual_mainFixEff , actualInteractionFixEff, actual_intercept ) + + ## -- join inference & actual + inference_fixEff <- fixEff_dataInference$data + res <- join_dtf(inference_fixEff, actual_fixEff , c("ID", "term"), c("geneID", "term")) + return(res) +} + +``` + +## Wald test + +```{r function-waldTest, filename = "waldTest" } + +#' Wald test for hypothesis testing +#' +#' This function performs a Wald test for hypothesis testing by comparing an estimation +#' to a reference value using the provided standard error. It allows testing for +#' one-tailed alternatives: "greater" - β > reference_value, "less" - β < −reference_value, +#' or two-tailed alternative: "greaterAbs" - |β| > reference_value. +#' If the p-value obtained is greater than 1, it is set to 1 to avoid invalid p-values. +#' +#' @param estimation The estimated coefficient value. +#' @param std_error The standard error of the estimation. +#' @param reference_value The reference value for comparison (default is 0). +#' @param alternative The type of alternative hypothesis to test (default is "greaterAbs"). +#' @return A list containing the test statistic and p-value. +#' @export +#' @examples +#' # Perform a Wald test with the default "greaterAbs" alternative +#' wald_test(estimation = 0.1, std_error = 0.02, reference_value = 0.2) +wald_test <- function(estimation, std_error, reference_value = 0, alternative = "greaterAbs") { + if (alternative == "greater") { + test_statistic <- (estimation - reference_value) / std_error + p_value <- 1 - pnorm(test_statistic, mean = 0, sd = 1, lower.tail = TRUE) + } else if (alternative == "less") { + test_statistic <- (estimation - reference_value) / std_error + p_value <- pnorm(test_statistic, mean = 0, sd = 1, lower.tail = TRUE) + } else if (alternative == "greaterAbs") { + test_statistic <- (abs(estimation) - reference_value) / std_error + p_value <- 2 * (1 - pnorm(test_statistic, mean = 0, sd = 1, lower.tail = TRUE)) + } else { + stop("Invalid alternative type. Use 'greater', 'less', or 'greaterAbs'.") + } + + # Set p-value to 1 if it exceeds 1 + p_value <- pmin(p_value, 1) + return(list(statistic = test_statistic, p.value = p_value)) +} + + + + +#' Perform statistical tests and return tidy results +#' +#' This function takes a list of glmmTMB objects and performs statistical tests based on the estimated coefficients and their standard errors. The results are returned in a tidy data frame format. +#' +#' @param list_tmb A list of glmmTMB objects representing the fitted models. +#' @param coeff_threshold The threshold value for coefficient testing (default is 0). +#' @param alternative_hypothesis The type of alternative hypothesis for the statistical test (default is "greaterAbs"). +#' Possible options are "greater" (for greater than threshold), "less" (for less than threshold), +#' and "greaterAbs" (for greater than absolute value of threshold). +#' @param correction_method a character string indicating the correction method to apply to p-values. Possible values are: +#' "holm", "hochberg", "hommel", #' "bonferroni", "BH", "BY", "fdr", and "none". +#' +#' @return A tidy data frame containing the results of statistical tests for the estimated coefficients. +#' +#' @importFrom stats p.adjust +#' @export +#' +#' @examples +#' data(iris) +#' model_list <- fitModelParallel(formula = Sepal.Length ~ Sepal.Width + Petal.Length, +#' data = iris, group_by = "Species", n.cores = 1) +#' results_df <- results(model_list, coeff_threshold = 0.1, alternative_hypothesis = "greater") +results <- function(list_tmb, coeff_threshold = 0, alternative_hypothesis = "greaterAbs", correction_method = "BH") { + tidy_tmb_df <- tidy_tmb(list_tmb) + if (coeff_threshold != 0 && alternative_hypothesis != "greaterAbs") { + waldRes <- wald_test(tidy_tmb_df$estimate, tidy_tmb_df$std.error, coeff_threshold, alternative_hypothesis) + tidy_tmb_df$statistic <- waldRes$statistic + tidy_tmb_df$p.value <- waldRes$p.value + } + tidy_tmb_df$p.adj <- stats::p.adjust(tidy_tmb_df$p.value, method = correction_method) + return(tidy_tmb_df) +} + + +``` + + + +```{r test-waldTest} + +# Test unitaires +test_that("wald_test performs correct tests", { + # Test with "greater" alternative + result_greater <- wald_test(estimation = 0.1, std_error = 0.02, reference_value = 0.05, alternative = "greater") + expect_equal(result_greater$p.value, 1 - pnorm((0.1 - 0.05) / 0.02, mean = 0, sd = 1, lower.tail = TRUE)) + + # Test with "less" alternative + result_less <- wald_test(estimation = 0.1, std_error = 0.02, reference_value = 0.05, alternative = "less") + expect_equal(result_less$p.value, pnorm((0.1 - 0.05) / 0.02, mean = 0, sd = 1, lower.tail = TRUE)) + + # Test with "greaterAbs" alternative + result_greaterAbs <- wald_test(estimation = 0.1, std_error = 0.02, reference_value = 0.05, alternative = "greaterAbs") + expect_equal(result_greaterAbs$p.value, (2 * (1 - pnorm((abs(0.1) - 0.05) / 0.02, mean = 0, sd = 1, lower.tail = TRUE)))) + + # Test with invalid alternative + expect_error(wald_test(estimation = 0.1, std_error = 0.02, reference_value = 0.05, alternative = "invalid")) +}) + + + +test_that("results function performs statistical tests correctly", { + # Charger les données iris pour les tests + data(iris) + # Fit models and perform statistical tests + model_list <- fitModelParallel(formula = Sepal.Length ~ Sepal.Width + Petal.Length, + data = iris, group_by = "Species", n.cores = 1) + results_df <- results(model_list, coeff_threshold = 0.1, alternative_hypothesis = "greater") + + # Vérifier que les colonnes 'statistic' et 'p.value' ont été ajoutées au dataframe + expect_true("statistic" %in% colnames(results_df)) + expect_true("p.value" %in% colnames(results_df)) + + # Vérifier que les tests statistiques ont été effectués correctement + # Ici, nous ne vérifierons pas les valeurs exactes des résultats car elles peuvent varier en fonction de la machine et des packages utilisés. + # Nous nous assurerons seulement que les résultats sont dans le format attendu. + expect_is(results_df$statistic, "numeric") + expect_is(results_df$p.value, "numeric") + expect_is(results_df$p.adj, "numeric") + + + # Vérifier que les p-values ne dépassent pas 1 + expect_true(all(results_df$p.value <= 1)) + + # Vérifier que les valeurs sont correctes pour les colonnes 'statistic' et 'p.value' + # (Cela dépend des données iris et des modèles ajustés) + # Remarque : Vous devrez peut-être ajuster ces tests en fonction des valeurs réelles des données iris et des modèles ajustés. + expect_true(all(!is.na(results_df$statistic))) + expect_true(all(!is.na(results_df$p.value))) + + # Vérifier que le seuil des coefficients et l'hypothèse alternative sont correctement appliqués + # Ici, nous nous attendons à ce que les p-values soient uniquement pour les coefficients dépassant le seuil + expect_true(all(ifelse(abs(results_df$estimate) > 0.1, results_df$p.value <= 1, results_df$p.value == 1))) + expect_true(all(ifelse(abs(results_df$estimate) > 0.1, results_df$p.adj <= 1, results_df$p.adj == 1))) + + }) + + + + +``` + + +## ROC plot + + +```{r function-rocPlot, filename = "ROCplot"} + + +#' Get Labels for Expected Differential Expression +#' +#' This function assigns labels to genes based on whether their actual effect estimates +#' indicate differential expression according to a given threshold and alternative hypothesis. +#' +#' @param comparison_df A data frame containing comparison results with actual effect estimates. +#' @param coeff_threshold The threshold value for determining differential expression. +#' @param alt_hypothesis The alternative hypothesis for comparison. Possible values are "greater", +#' "less", and "greaterAbs". +#' @return A modified data frame with an additional column indicating if the gene is differentially expressed. +#' +#' @examples +#' # Generate a sample comparison data frame +#' comparison_data <- data.frame( +#' geneID = c("gene1", "gene2", "gene3"), +#' actual = c(0.5, -0.3, 0.8) +#' ) +#' +#' # Get labels for expected differential expression +#' labeled_data <- getLabelExpected(comparison_data, coeff_threshold = 0.2, alt_hypothesis = "greater") +#' +#' @export +getLabelExpected <- function(comparison_df, coeff_threshold, alt_hypothesis) { + if (alt_hypothesis == "greater") { + idx_DE <- comparison_df$actual > coeff_threshold + comparison_df$isDE <- idx_DE + } else if (alt_hypothesis == "less") { + idx_DE <- comparison_df$actual < coeff_threshold + comparison_df$isDE <- idx_DE + } else if (alt_hypothesis == "greaterAbs") { + idx_DE <- abs(comparison_df$actual) > coeff_threshold + comparison_df$isDE <- idx_DE + } + return(comparison_df) +} + + +#' Generate ROC Curve Plot +#' +#' This function generates an ROC curve plot based on the comparison dataframe. +#' +#' @param comparison_df A dataframe containing comparison results. +#' @param ... additional params to pass ggplot2::aes +#' @return A ggplot object representing the ROC curve plot. +#' @importFrom plotROC geom_roc +#' @importFrom ggplot2 ggtitle theme_bw aes sym +#' +#' @examples +#' comparison_data <- data.frame( +#' geneID = c("gene1", "gene2", "gene3"), +#' isDE = c(TRUE, FALSE, TRUE), +#' p.adj = c(0.05, 0.2, 0.01) +#' ) +#' roc_plot(comparison_data) +#' +#' @export +roc_plot <- function(comparison_df, ...) { + + checkLabelValidityForROC <- function(labels) { + if (all(labels == TRUE)) + message("WARNING : No FALSE label in 'isDE' column, ROC curve cannot be computed") + if (all(labels == FALSE)) + message("WARNING : No TRUE label in 'isDE' column, ROC curve cannot be computed") + } + + checkLabelValidityForROC(comparison_df$isDE) + + args <- lapply(list(...), function(x) if (!is.null(x)) ggplot2::sym(x)) + + #comparison_df$isDE <- factor(comparison_df$isDE, levels= c(TRUE, FALSE)) + ggplot2::ggplot(comparison_df, ggplot2::aes(d = !isDE , m = p.adj, !!!args )) + + plotROC::geom_roc(n.cuts = 0, labels = FALSE) + + ggplot2::theme_bw() + + ggplot2::ggtitle("ROC curve") +} + + + +``` + +```{r test-rocPlot} + + +# Test cases for getLabelExpected function +test_that("getLabelExpected assigns labels correctly", { + + + # Sample comparison data frame + comparison_data <- data.frame( + geneID = c("gene1", "gene2", "gene3"), + actual = c(0.5, -0.3, 0.8) + ) + + # Test case 1: Alt hypothesis = "greater" + labeled_data_greater <- getLabelExpected(comparison_data, coeff_threshold = 0.2, alt_hypothesis = "greater") + expect_identical(labeled_data_greater$isDE, c(TRUE, FALSE, TRUE)) + + # Test case 2: Alt hypothesis = "less" + labeled_data_less <- getLabelExpected(comparison_data, coeff_threshold = -0.2, alt_hypothesis = "less") + expect_identical(labeled_data_less$isDE, c(FALSE, TRUE, FALSE)) + + # Test case 3: Alt hypothesis = "greaterAbs" + labeled_data_greaterAbs <- getLabelExpected(comparison_data, coeff_threshold = 0.6, alt_hypothesis = "greaterAbs") + expect_identical(labeled_data_greaterAbs$isDE, c(FALSE, FALSE, TRUE)) + +}) + + +test_that("ROC plot is generated correctly", { + comparison_data <- data.frame( + geneID = c("gene1", "gene2", "gene3"), + isDE = c(TRUE, FALSE, TRUE), + p.adj = c(0.05, 0.2, 0.01) + ) + + plot <- roc_plot(comparison_data) + + expect_true("gg" %in% class(plot)) +}) + + +``` + +## Counts plot + + +```{r function-countsPlot, filename = "countsPlot"} + +#' Generate a density plot of gene counts +#' +#' This function generates a density plot of gene counts from mock data. +#' +#' @param mock_obj The mock data object containing gene counts. +#' +#' @return A ggplot2 density plot. +#' +#' @importFrom ggplot2 aes geom_density theme_bw ggtitle scale_x_log10 element_blank +#' @export +#' +#' @examples +#' mock_data <- list(counts = matrix(c(1, 2, 3, 4, 5, 6, 7, 8, 9), ncol = 3)) +#' counts_plot(mock_data) +counts_plot <- function(mock_obj){ + + counts <- unname(unlist(mock_obj$counts)) + p <- ggplot2::ggplot() + + ggplot2::aes(x = "Genes", y = counts) + + ggplot2::geom_point(position = "jitter", alpha = 0.6, size = 0.4, col = "#D1F2EB") + + ggplot2::geom_violin(fill = "#F8F9F9", alpha = 0.4) + + ggplot2::stat_summary(fun = "mean", geom = "point", color = "#B63A0F", size = 5) + + ggplot2::theme_bw() + + ggplot2::ggtitle("Gene expression plot") + + ggplot2::theme(axis.title.x = ggplot2::element_blank()) + return(p) +} + + +``` + +```{r test-countsPlot} + + + +# Test cases +test_that("Counts plot is generated correctly", { + mock_data <- list( + counts = matrix(c(1, 2, 3, 4, 5, 6, 7, 8, 9), ncol = 3) + ) + + plot <- counts_plot(mock_data) + + expect_true("gg" %in% class(plot)) +}) + + + +``` + + +## Identity plot + + +```{r function-identityPlot, filename = "identityPlot"} + +#' Generate an identity plot +#' +#' This function generates an identity plot for comparing actual values with estimates. +#' +#' @param comparison_df A data frame containing comparison results with "actual" and "estimate" columns. +#' @param ... additional parameters to pass ggplot2::aes +#' @return A ggplot2 identity plot. +#' +#' @importFrom ggplot2 sym aes geom_point geom_abline facet_wrap theme_bw ggtitle scale_x_log10 scale_y_log10 +#' @export +#' @examples +#' comparison_data <- data.frame( +#' actual = c(1, 2, 3, 4, 5), +#' estimate = c(0.9, 2.2, 2.8, 4.1, 5.2), +#' description = rep("Category A", 5)) +#' identity_plot(comparison_data) + +identity_plot <- function(comparison_df, ...){ + + args <- lapply(list(...), function(x) if (!is.null(x)) ggplot2::sym(x)) + + + ggplot2::ggplot(comparison_df) + + ggplot2::geom_point(ggplot2::aes(x = actual, y = estimate, !!!args), alpha = 0.6) + + ggplot2::geom_abline(intercept = 0, slope = 1, lty = 3, col = 'red', linewidth = 1) + + ggplot2::facet_wrap(~description, scales = "free") + + ggplot2::theme_bw() + + ggplot2::ggtitle("Identity plot") + + ggplot2::scale_x_log10() + + ggplot2::scale_y_log10() + + +} + + +``` + +```{r test-identityPlot} + + +# Test cases +test_that("Identity plot is generated correctly", { + comparison_data <- data.frame( + actual = c(1, 2, 3, 4, 5), + estimate = c(0.9, 2.2, 2.8, 4.1, 5.2), + description = rep("Category A", 5) + ) + + plot <- identity_plot(comparison_data) + + expect_true("gg" %in% class(plot)) +}) + + + +``` + + +## Simulation report + +```{r function-simulationReport, filename = "simulationReport" } + +#' Export the Analysis Report to a File +#' +#' This function generates an analysis report by arranging and combining various plots +#' and tables, and then exports the report to a specified file. +#' +#' @param report_file Path to the file where the report will be exported. +#' @param table_settings A table containing settings and parameters used in the analysis. +#' @param metrics_plot A plot displaying various performance metrics. +#' @param roc_curve A plot displaying the Receiver Operating Characteristic (ROC) curve. +#' @param dispersion_plot A plot displaying the dispersion values. +#' @param id_plot A plot displaying unique identifiers. +#' @param counts_plot A plot displaying the gene counts. +#' +#' +#' @importFrom gridExtra arrangeGrob grid.arrange +#' @importFrom ggplot2 ggsave +#' +#' +#' @return report +#' @export +exportReportFile <- function(report_file, table_settings, roc_curve, dispersion_plot, id_plot, counts_plot){ + + middle_part <- gridExtra::arrangeGrob(counts_plot, dispersion_plot, heights = c(1, 1.5)) + left_part <- gridExtra::arrangeGrob(table_settings, roc_curve ,heights = c(1, 1.5)) + p2export <- gridExtra::grid.arrange(left_part, middle_part, id_plot ,ncol = 3, widths = c(1,1,2)) + + if (!is.null(report_file)) ggplot2::ggsave(report_file, p2export, height = 10, width = 15) + + return(p2export) +} + + +#' Generate a Formatted Table as a Grid Graphics Object +#' +#' This function generates a formatted table using the provided data frame and returns +#' it as a grid graphics object. +#' +#' @param df The data frame to be converted into a formatted table. +#' +#' @return A grid graphics object representing the formatted table. +#' @export +#' @importFrom ggplot2 unit +#' @importFrom gridExtra tableGrob ttheme_minimal +#' @examples +#' # Create a sample data frame +#' sample_data <- data.frame( +#' Name = c("Alice", "Bob", "Charlie"), +#' Age = c(25, 30, 28) +#' ) +#' +#' # Generate the formatted table +#' table_grob <- getGrobTable(sample_data) +getGrobTable <- function(df){ + theme_custom <- gridExtra::ttheme_minimal( + core=list(bg_params = list(fill = c("#F8F9F9", "#E5E8E8"), col=NA)), + colhead=list(fg_params=list(col="white", fontface=4L), bg_params = list(fill = "#5D6D7E", col=NA)), + base_size = 15) + grob_df <- gridExtra::tableGrob(df, rows=NULL, theme = theme_custom, widths = ggplot2::unit(x = c(0.4,0.3), "npc" ) ) + return(grob_df) +} + + +#' Generate a simulation report +#' +#' This function generates a simulation report containing various plots and evaluation metrics. +#' +#' @param mock_obj A list containing simulation data and ground truth. +#' @param list_tmb A list of model results. +#' @param dds_obj a DESeq2 object +#' @param coeff_threshold A threshold for comparing estimates. +#' @param alt_hypothesis The alternative hypothesis for comparisons ("greater", "less", "greaterAbs"). +#' @param report_file File name to save the generated report. If NULL, the report will not be exported. +#' @importFrom ggplot2 aes geom_point geom_abline facet_wrap theme_bw ggtitle +#' @return A list containing settings, plots, and evaluation results. +#' @export +simulationReport <- function(mock_obj, list_tmb = NULL, dds_obj = NULL , + coeff_threshold = 0, alt_hypothesis = "greaterAbs", report_file = NULL){ + + #-- init + TMB_comparison_df <- data.frame() + DESEQ_comparison_df <- data.frame() + DESEQ_dispersion_df <- data.frame() + TMB_dispersion_df <- data.frame() + + # -- build data from list_tmb + if (!is.null(list_tmb)){ + tidyRes <- results(list_tmb, coeff_threshold, alt_hypothesis) + TMB_comparison_df <- compareInferenceToExpected(tidyRes, mock_obj$groundTruth$effects) + TMB_comparison_df <- getLabelExpected(TMB_comparison_df, coeff_threshold, alt_hypothesis) + TMB_comparison_df$from <- "HTRfit" + tmb_disp_inferred <- extractTMBDispersion(list_tmb) + TMB_dispersion_df <- getDispersionComparison(tmb_disp_inferred, mock_data$groundTruth$gene_dispersion) + TMB_dispersion_df$from <- 'HTRfit' + } + + if (!is.null(dds_obj)){ + deseq2_wrapped <- wrapper_DESeq2(dds, coeff_threshold, alt_hypothesis) + DESEQ_comparison_df <- compareInferenceToExpected(deseq2_wrapped$fixEff, mock_obj$groundTruth$effects) + DESEQ_comparison_df <- getLabelExpected(DESEQ_comparison_df, coeff_threshold, alt_hypothesis) + DESEQ_comparison_df$from <- "DESeq2" + DESEQ_comparison_df$component <- NA + DESEQ_comparison_df$group <- NA + DESEQ_disp_inferred <- extractDESeqDispersion(deseq2_wrapped) + DESEQ_dispersion_df <- getDispersionComparison(DESEQ_disp_inferred , mock_data$groundTruth$gene_dispersion) + DESEQ_dispersion_df$from <- 'DESeq2' + } + + comparison_df <- rbind( DESEQ_comparison_df, TMB_comparison_df ) + + + color2use <- c("#D2B4DE", "#5D6D7E") + color2use <- color2use[c(!is.null(list_tmb), !is.null(dds_obj))] + + # -- plotting + roc_curve <- roc_plot(comparison_df, col = "from" ) + ggplot2::scale_color_manual(values = color2use) + id_plot <- identity_plot(comparison_df, col = "from") + ggplot2::scale_color_manual(values = color2use) + #metrics_plot <- metrics_plot(list_tmb) + evalDisp <- evaluateDispersion(TMB_dispersion_df, DESEQ_dispersion_df, color2use ) + dispersion_plot <- evalDisp$disp_plot + counts_plot <- counts_plot(mock_obj) + + # -- export report + df_settings <- mock_obj$settings + grobTableSettings <- getGrobTable(df_settings) + exportReportFile(report_file, grobTableSettings, roc_curve, dispersion_plot, id_plot, counts_plot) + + # -- return + ret <- list(settings = df_settings, metrics_plot = metrics_plot, + dispersionEvaluation = evalDisp, identity_plot = id_plot, counts_plot = counts_plot, data = comparison_df) + return(ret) +} + +``` + + + +```{r test-simulationReport} + + +# Test case 1: Testing with a sample data frame +test_that("Generating a formatted table works correctly", { + sample_data <- data.frame( + Name = c("Alice", "Bob", "Charlie"), + Age = c(25, 30, 28) + ) + + table_grob <- getGrobTable(sample_data) + + expect_s3_class(table_grob, "gtable") +}) + +# Test case 4: Testing with non-numeric values +test_that("Handling non-numeric values in the data frame", { + non_numeric_data <- data.frame( + Name = c("Alice", "Bob", "Charlie"), + Age = c(25, "N/A", 28) +) + + table_grob <- getGrobTable(non_numeric_data) + + expect_s3_class(table_grob, "gtable") +}) + +``` + +## DESeq2 + +```{r function-deseq2, filename = "wrapperDESeq2" } + +#' Wrapper Function for DESeq2 Analysis +#' +#' This function performs differential expression analysis using DESeq2 based on the provided +#' DESeqDataSet (dds) object. It calculates the dispersion values from the dds object and then +#' performs inference on the log-fold change (LFC) values using the specified parameters. +#' +#' @param dds A DESeqDataSet object containing the count data and experimental design. +#' @param lfcThreshold The threshold for minimum log-fold change (LFC) to consider differentially expressed. +#' @param altHypothesis The alternative hypothesis for the analysis, indicating the direction of change. +#' Options are "greater", "less", or "two.sided". +#' @param correction_method The method for p-value correction. Default is "BH" (Benjamini-Hochberg). +#' +#' @return A list containing the dispersion values and the results of the differential expression analysis. +#' The dispersion values are calculated from the dds object and named according to sample names. +#' The inference results include adjusted p-values and log2 fold changes for each gene. +#' +#' @examples +#' N_GENES = 100 +#' MAX_REPLICATES = 5 +#' MIN_REPLICATES = 5 +#' ## --init variable +#' input_var_list <- init_variable( name = "genotype", mu = 12, sd = 0.1, level = 3) %>% +#' init_variable(name = "environment", mu = c(0,1), NA , level = 2) +#' +#' mock_data <- mock_rnaseq(input_var_list, N_GENES, MIN_REPLICATES, max_replicates = MAX_REPLICATES) +#' dds <- DESeq2::DESeqDataSetFromMatrix(mock_data$counts , mock_data$metadata, ~ genotype + environment) +#' dds <- DESeq2::DESeq(dds, quiet = TRUE) +#' result <- wrapper_DESeq2(dds, lfcThreshold = 1, altHypothesis = "greater") +#' @export +wrapper_DESeq2 <- function(dds, lfcThreshold , altHypothesis, correction_method = "BH") { + dds_full <- S4Vectors::mcols(dds) %>% as.data.frame() + + ## -- dispersion + message("INFO: The dispersion values from DESeq2 were reparametrized to their reciprocals (1/dispersion).") + dispersion <- 1/dds_full$dispersion + names(dispersion) <- rownames(dds_full) + + ## -- coeff + inference_df <- get_inference(dds_full, lfcThreshold, altHypothesis, correction_method) + res <- list(dispersion = dispersion, fixEff = inference_df) + return(res) +} + + + +#' Calculate Inference for Differential Expression Analysis +#' +#' This function calculates inference for differential expression analysis based on the results of DESeq2. +#' +#' @param dds_full A data frame containing DESeq2 results, including estimate and standard error information. +#' @param lfcThreshold Log fold change threshold for determining differentially expressed genes. +#' @param altHypothesis Alternative hypothesis for testing, one of "greater", "less", or "two.sided". +#' @param correction_method Method for multiple hypothesis correction, e.g., "BH" (Benjamini-Hochberg). +#' +#' @return A data frame containing inference results, including statistics, p-values, and adjusted p-values. +#' +#' @examples +#' \dontrun{ +#' # Example usage of the function +#' inference_result <- get_inference(dds_full, lfcThreshold = 0.5, altHypothesis = "greater", correction_method = "BH") +#' } +#' @importFrom stats p.adjust +#' @export +get_inference <- function(dds_full, lfcThreshold, altHypothesis, correction_method){ + + ## -- build subdtf + stdErr_df <- getSE_df(dds_full) + estim_df <- getEstimate_df(dds_full) + ## -- join + df2ret <- join_dtf(estim_df, stdErr_df, k1 = c("ID", "term") , k2 = c("ID", "term")) + + ## -- convert to log10 + message("INFO: The log2-fold change estimates and standard errors from DESeq2 were converted to the natural logarithm scale.") + df2ret$estimate <- df2ret$estimate*log(2) + df2ret$std.error <- df2ret$std.error*log(2) + + ## -- some details reshaped + df2ret$term <- gsub("_vs_.*","", df2ret$term) + df2ret$term <- gsub(pattern = "_", df2ret$term, replacement = "") + df2ret$term <- removeDuplicatedWord(df2ret$term) + df2ret$term <- gsub(pattern = "[.]", df2ret$term, replacement = ":") + df2ret$effect <- "fixed" + idx_intercept <- df2ret$term == "Intercept" + df2ret$term[idx_intercept] <- "(Intercept)" + + ## -- statistical part + waldRes <- wald_test(df2ret$estimate, df2ret$std.error, lfcThreshold, altHypothesis) + df2ret$statistic <- waldRes$statistic + df2ret$p.value <- waldRes$p.value + df2ret$p.adj <- stats::p.adjust(df2ret$p.value, method = correction_method) + + return(df2ret) +} + + +#' Extract Standard Error Information from DESeq2 Results +#' +#' This function extracts the standard error (SE) information from DESeq2 results. +#' +#' @param dds_full A data frame containing DESeq2 results, including standard error columns. +#' +#' @return A data frame with melted standard error information, including gene IDs and terms. +#' +#' @examples +#' \dontrun{ +#' # Example usage of the function +#' se_info <- getSE_df(dds_full) +#' } +#' @importFrom reshape2 melt +#' @export +getSE_df <- function(dds_full){ + columnsInDds_full <- colnames(dds_full) + SE_columns <- columnsInDds_full [ grepl("SE" , columnsInDds_full) ] + SE_df <- dds_full[, SE_columns] + SE_df$ID <- rownames(SE_df) + SE_df_long <- reshape2::melt(SE_df, + measure.vars = SE_columns, + variable.name = "term", value.name = "std.error", drop = F) + SE_df_long$term <- gsub(pattern = "SE_", SE_df_long$term, replacement = "") + return(SE_df_long) + +} + + +#' Extract Inferred Estimate Information from DESeq2 Results +#' +#' This function extracts the inferred estimate values from DESeq2 results. +#' +#' @param dds_full A data frame containing DESeq2 results, including estimate columns. +#' +#' @return A data frame with melted inferred estimate information, including gene IDs and terms. +#' +#' @examples +#' \dontrun{ +#' # Example usage of the function +#' estimate_info <- getEstimate_df(dds_full) +#' } +#' @importFrom reshape2 melt +#' @export +getEstimate_df <- function(dds_full){ + columnsInDds_full <- colnames(dds_full) + SE_columns <- columnsInDds_full [ grepl("SE" , columnsInDds_full) ] + inferedVal_columns <- gsub("SE_", "" , x = SE_columns) + + estimate_df <- dds_full[, inferedVal_columns] + estimate_df$ID <- rownames(estimate_df) + estimate_df_long <- reshape2::melt(estimate_df, + measure.vars = inferedVal_columns, + variable.name = "term", value.name = "estimate", drop = F) + return(estimate_df_long) + +} + +``` + + +```{r test-wrapperDESeq2} + + +test_that("get_inference returns a data frame with correct columns", { + # Create a sample dds_full data frame + N_GENES = 100 + MAX_REPLICATES = 5 + MIN_REPLICATES = 5 + ## --init variable + input_var_list <- init_variable( name = "genotype", mu = 12, sd = 0.1, level = 3) %>% + init_variable(name = "environment", mu = c(0,1), NA , level = 2) + + mock_data <- mock_rnaseq(input_var_list, N_GENES, MIN_REPLICATES, max_replicates = MAX_REPLICATES) + dds <- DESeq2::DESeqDataSetFromMatrix(mock_data$counts , mock_data$metadata, ~ genotype + environment) + dds <- DESeq2::DESeq(dds, quiet = TRUE) + dds_full <- S4Vectors::mcols(dds) %>% as.data.frame() + + # Call the function + inference_results <- get_inference(dds_full, lfcThreshold = 0.5, altHypothesis = "greater", correction_method = "BH") + + # Check if the returned object is a data frame + expect_true(is.data.frame(inference_results)) + + # Check if the data frame contains the correct columns + expect_true("ID" %in% colnames(inference_results)) + expect_true("estimate" %in% colnames(inference_results)) + expect_true("std.error" %in% colnames(inference_results)) + expect_true("term" %in% colnames(inference_results)) + expect_true("effect" %in% colnames(inference_results)) + expect_true("statistic" %in% colnames(inference_results)) + expect_true("p.value" %in% colnames(inference_results)) + expect_true("p.adj" %in% colnames(inference_results)) +}) + + + + + + +test_that("getEstimate_df function works correctly", { + + # Create a sample dds_full data frame + N_GENES = 100 + MAX_REPLICATES = 5 + MIN_REPLICATES = 5 + ## --init variable + input_var_list <- init_variable( name = "genotype", mu = 12, sd = 0.1, level = 3) %>% + init_variable(name = "environment", mu = c(0,1), NA , level = 2) + + mock_data <- mock_rnaseq(input_var_list, N_GENES, MIN_REPLICATES, max_replicates = MAX_REPLICATES) + dds <- DESeq2::DESeqDataSetFromMatrix(mock_data$counts , mock_data$metadata, ~ genotype + environment) + dds <- DESeq2::DESeq(dds, quiet = TRUE) + dds_full <- S4Vectors::mcols(dds) %>% as.data.frame() + + # Call the function + estimate_df_long <- getEstimate_df(dds_full) + + # Check if the resulting data frame has the expected structure + expect_true("ID" %in% colnames(estimate_df_long)) + expect_true("term" %in% colnames(estimate_df_long)) + expect_true("estimate" %in% colnames(estimate_df_long)) +}) + + + +# Define a test context +test_that("getSE_df function works correctly", { + + # Create a sample dds_full data frame + N_GENES = 100 + MAX_REPLICATES = 5 + MIN_REPLICATES = 5 + ## --init variable + input_var_list <- init_variable( name = "genotype", mu = 12, sd = 0.1, level = 3) %>% + init_variable(name = "environment", mu = c(0,1), NA , level = 2) + + mock_data <- mock_rnaseq(input_var_list, N_GENES, MIN_REPLICATES, max_replicates = MAX_REPLICATES) + dds <- DESeq2::DESeqDataSetFromMatrix(mock_data$counts , mock_data$metadata, ~ genotype + environment) + dds <- DESeq2::DESeq(dds, quiet = TRUE) + dds_full <- S4Vectors::mcols(dds) %>% as.data.frame() + + # Call the function + SE_df_long <- getSE_df(dds_full) + + # Check if the resulting data frame has the expected structure + expect_true("ID" %in% colnames(SE_df_long)) + expect_true("term" %in% colnames(SE_df_long)) + expect_true("std.error" %in% colnames(SE_df_long)) +}) + + +# Define a test context +test_that("wrapperDESeq2 function works correctly", { + + # Create a sample dds_full data frame + N_GENES = 100 + MAX_REPLICATES = 5 + MIN_REPLICATES = 5 + ## --init variable + input_var_list <- init_variable( name = "genotype", mu = 12, sd = 0.1, level = 3) %>% + init_variable(name = "environment", mu = c(0,1), NA , level = 2) + + mock_data <- mock_rnaseq(input_var_list, N_GENES, MIN_REPLICATES, max_replicates = MAX_REPLICATES) + dds <- DESeq2::DESeqDataSetFromMatrix(mock_data$counts , mock_data$metadata, ~ genotype + environment) + dds <- DESeq2::DESeq(dds, quiet = TRUE) + deseq2_wrapped <- wrapper_DESeq2(dds, 0.2, "greaterAbs") + + expect_true(is.list(deseq2_wrapped)) + + # Check if the resulting data frame has the expected structure + expect_true("ID" %in% colnames(deseq2_wrapped$fixEff)) + expect_true("term" %in% colnames(deseq2_wrapped$fixEff)) + expect_true("std.error" %in% colnames(deseq2_wrapped$fixEff)) + expect_true("estimate" %in% colnames(deseq2_wrapped$fixEff)) + expect_true("statistic" %in% colnames(deseq2_wrapped$fixEff)) + expect_true("p.value" %in% colnames(deseq2_wrapped$fixEff)) + expect_true("p.adj" %in% colnames(deseq2_wrapped$fixEff)) + +}) + +``` + + + + + +```{r development-inflate, eval=FALSE} +fusen::fill_description(fields = list(Title = "HTRSIM"), overwrite = T) +usethis::use_mit_license("Arnaud DUVERMY") +usethis::use_pipe(export = TRUE) +devtools::document() +# Keep eval=FALSE to avoid infinite loop in case you hit the knit button +# Execute in the console directly +fusen::inflate(flat_file = "dev/flat_full_bis.Rmd", vignette_name = "Get started") +```