Select Git revision
flat_full.Rmd
Arnaud Duvermy authored
flat_full.Rmd 281.63 KiB
title: "flat_full.Rmd for working package"
output: html_document
editor_options:
chunk_output_type: console
library(testthat)
# Load already included functions if relevant
pkgload::load_all(export_all = FALSE)
#' Join two data frames using data.table
#'
#' @param d1 Data frame 1
#' @param d2 Data frame 2
#' @param k1 Key columns for data frame 1
#' @param k2 Key columns for data frame 2
#' @importFrom data.table data.table
#' @return Joined data frame
#' @export
#'
#' @examples
#'
#' # Example usage:
#' df1 <- data.frame(id = 1:5, value = letters[1:5])
#' df2 <- data.frame(id = 1:5, category = LETTERS[1:5])
#' join_dtf(df1, df2, "id", "id")
join_dtf <- function(d1, d2, k1, k2) {
d1.dt_table <- data.table::data.table(d1, key = k1)
d2.dt_table <- data.table::data.table(d2, key = k2)
dt_joined <- d1.dt_table[d2.dt_table, allow.cartesian = TRUE]
return(dt_joined %>% as.data.frame())
}
#' Clean Variable Name
#'
#' This function removes digits, spaces, and special characters from a variable name.
#' If any of these are present, they will be replaced with an underscore '_'.
#'
#' @param name The input variable name to be cleaned.
#' @return The cleaned variable name without digits, spaces, or special characters.
#'
#' @details
#' This function will check the input variable name for the presence of digits,
#' spaces, and special characters. If any of these are found, they will be removed
#' from the variable name and replaced with an underscore '_'. Additionally, it will
#' check if the cleaned name is not one of the reserved names "interactions" or
#' "correlations" which are not allowed as variable names.
#' @export
#' @examples
#' clean_variable_name("my_var,:&$àà(-i abl23 e_na__ç^me ")
clean_variable_name <- function(name){
message("Variable name should not contain digits, spaces, or special characters.\nIf any of these are present, they will be removed from the variable name.")
# avoid space in variable name
name <- gsub(" ", "_", name, fixed = TRUE)
# avoid digit in variable name
name <- gsub("[0-9]", "", name)
# avoid special character in variable name
name <- gsub("[[:punct:]]", "", name)
forbidden_names <- c("interactions", "correlations")
if (name %in% forbidden_names) {
forbidden_str <- paste(forbidden_names, collapse = " and ")
stop(forbidden_str, "cannot be used as variable name")
}
return(name)
}
#' Get Setting Table
#'
#' Create a table of experimental settings.
#'
#' This function takes various experimental parameters and returns a data frame
#' that represents the experimental settings.
#'
#' @param n_genes Number of genes in the experiment.
#' @param max_replicates Maximum number of replicates for each gene.
#' @param min_replicates Minimum number of replicates for each gene.
#' @param lib_size total number of reads
#'
#' @return A data frame containing the experimental settings with their corresponding values.
#' @export
getSettingsTable <- function(n_genes, max_replicates, min_replicates, lib_size ){
settings_df <- data.frame(parameters = c("# genes", "Max # replicates", "Min # replicates", "Library size" ),
values = c(n_genes, max_replicates, min_replicates, lib_size))
rownames(settings_df) <- NULL
return(settings_df)
}
# Test unitaires pour la fonction join_dtf
test_that("join_dtf réalise la jointure correctement", {
# Création de données de test
df1 <- data.frame(id = 1:5, value = letters[1:5])
df2 <- data.frame(id = 1:5, category = LETTERS[1:5])
# Exécution de la fonction
result <- join_dtf(df1, df2, "id", "id")
# Vérification des résultats
expect_true(is.data.frame(result))
expect_equal(nrow(result), 5)
expect_equal(ncol(result), 3)
expect_equal(names(result), c("id", "value", "category"))
expect_true(all.equal(result$id, df1$id))
expect_true(all.equal(result$id, df2$id))
})
test_that("clean_variable_name correctly removes digits, spaces, and special characters", {
expect_equal(clean_variable_name("my variable name"), "myvariablename")
expect_equal(clean_variable_name("variable_1"), "variable")
expect_equal(clean_variable_name("^spec(ial#chars! "), "specialchars")
})
test_that("clean_variable_name handles reserved names properly", {
expect_error(clean_variable_name("interactions"))
expect_error(clean_variable_name("correlations"))
})
#' Initialize variable
#'
#' @param list_var Either c() or output of init_variable
#' @param name Variable name
#' @param mu Either a numeric value or a numeric vector (of length = level)
#' @param sd Either numeric value or NA
#' @param level Numeric value to specify the number of levels to simulate
#'
#' @return
#' A list with initialized variables
#' @export
#'
#' @examples
#' init_variable(name = "my_varA", mu = 2, sd = 9, level = 200)
init_variable <- function(list_var = c(), name = "myVariable", mu = c(2,3), sd = NA, level = NA) {
name <- clean_variable_name(name)
# Only mu specified by user => set level param
if (is.na(level) && is.na(sd)) {
level <- length(mu)
}
# Validate inputs
inputs_checking(list_var, name, mu, sd, level)
if (endsWithDigit(name)) {
warning("Names ending with digits are not allowed. They will be removed from the variable name.")
name <- removeDigitsAtEnd(name)
}
# Initialize new variable
list_var[[name]] <- fillInVariable(name, mu, sd, level)
return(list_var)
}
#' Check if a string ends with a digit
#'
#' This function checks whether a given string ends with a digit.
#'
#' @param string The input string to be checked
#' @return \code{TRUE} if the string ends with a digit, \code{FALSE} otherwise
#' @export
#' @examples
#' endsWithDigit("abc123") # Output: TRUE
#' endsWithDigit("xyz") # Output: FALSE
endsWithDigit <- function(string) {
lastChar <- substring(string, nchar(string))
return(grepl("[0-9]", lastChar))
}
#' Remove digits at the end of a string
#'
#' This function removes any digits occurring at the end of a given string.
#'
#' @param string The input string from which digits are to be removed
#' @return The modified string with digits removed from the end
#' @export
#' @examples
#' removeDigitsAtEnd("abc123") # Output: "abc"
#' removeDigitsAtEnd("xyz") # Output: "xyz"
removeDigitsAtEnd <- function(string) {
return(gsub("\\d+$", "", string))
}
#' Check Input Parameters
#'
#' This function checks the validity of the input parameters for initializing a variable.
#' It ensures that the necessary conditions are met for the input parameters.
#'
#' @param list_var List containing the variables to be initialized.
#' @param name Name of the variable.
#' @param mu Mean of the variable.
#' @param sd Standard deviation of the variable (optional).
#' @param level Number of levels for categorical variables.
#'
#' @return NULL
#' @export
#'
#' @examples
#' inputs_checking(list_var = c(), name = "var1", mu = 0, sd = 1, level = 2)
inputs_checking <- function(list_var, name, mu, sd, level) {
stopifnot(name != "")
stopifnot(is.character(name))
stopifnot(is.numeric(mu))
stopifnot(is.numeric(sd) | is.na(sd))
stopifnot(is.numeric(level))
stopifnot(length(level) == 1)
stopifnot(level >= 2)
if (!is.null(list_var)) {
error_msg <- "Non conformable list_var parameter.\nlist_var must be set as an init_var output or initialized as c()"
if (!is.list(list_var)) {
stop(error_msg)
}
}
if (length(mu) > 1) {
stopifnot(length(mu) == level)
}
if (is.na(sd)) {
if (level != length(mu)) {
stop("sd was specified as NA. mu should have the same length as the number of levels\n")
}
}
# Check if variable is already initialized
name_not_in_list_var <- identical(which(already_init_variable(list_var, name)), integer(0))
if (!name_not_in_list_var) {
message(paste(name, "is already initialized in list_var.\nIt will be updated", sep = " "))
}
return(NULL)
}
#' Check if Variable is Already Initialized
#'
#' This function checks if a variable is already initialized in the variable list.
#'
#' @param list_var A list object representing the variable list.
#' @param new_var_name A character string specifying the name of the new variable.
#'
#' @return TRUE if the variable is already initialized, FALSE otherwise.
#' @export
#'
#' @examples
#' my_list <- list(var1 = 1, var2 = 2, var3 = 3)
#' already_initialized <- already_init_variable(list_var = my_list, new_var_name = "myVariable")
already_init_variable <- function(list_var, new_var_name) {
if (is.null(list_var)) {
return(FALSE)
}
var_names <- names(list_var)
return(new_var_name %in% var_names)
}
#' Fill in Variable
#'
#' This function fills in a variable with simulated data based on the provided parameters.
#'
#' @param name The name of the variable.
#' @param mu A numeric value or a numeric vector (of length = level) representing the mean.
#' @param sd A numeric value representing the standard deviation, or NA if not applicable.
#' @param level A numeric value specifying the number of levels to simulate.
#'
#' @return A data frame or a list containing the simulated data for the variable.
#' @export
#'
#' @examples
#' variable_data <- fillInVariable(name = "myVariable", mu = c(2, 3), sd = NA, level = 2)
fillInVariable <- function(name, mu, sd, level) {
if (length(mu) > 1 | is.na(sd)) { # Effects given by user
level <- length(mu)
l_labels <- paste(name, 1:level, sep = '')
l_betaEffects <- mu
column_names <- c(paste("label", name, sep = "_"), name)
sub_obj <- build_sub_obj_return_to_user(level, metaData = l_labels,
effectsGivenByUser = l_betaEffects,
column_names)
} else {
sub_obj <- as.data.frame(list(mu = mu, sd = sd, level = level))
}
return(sub_obj)
}
#' Build Sub Object to Return to User
#'
#' This function builds the sub-object to be returned to the user.
#'
#' @param level A numeric value specifying the number of levels.
#' @param metaData A list of labels.
#' @param effectsGivenByUser A list of effects given by the user.
#' @param col_names A character vector specifying the column names to use.
#' @importFrom utils tail
#'
#' @return A list with the sub-object details.
build_sub_obj_return_to_user <- function(level, metaData, effectsGivenByUser, col_names) {
sub_obj <- list(level = level)
data <- cbind(metaData, effectsGivenByUser) %>% as.data.frame()
colnames(data) <- col_names
var_name <- utils::tail(col_names, n = 1)
data[, var_name] <- as.numeric(data[, var_name])
sub_obj$data <- data
return(sub_obj)
}
#' Add interaction
#'
#' @param list_var A list of variables (already initialized)
#' @param between_var A vector of variable names to include in the interaction
#' @param mu Either a numeric value or a numeric vector (of length = level)
#' @param sd Either numeric value or NA
#'
#' @return
#' A list with initialized interaction
#' @export
#'
#' @examples
#' init_variable(name = "myvarA", mu = 2, sd = 3, level = 200) %>%
#' init_variable(name = "myvarB", mu = 1, sd = 0.2, level = 2 ) %>%
#' add_interaction(between_var = c("myvarA", "myvarB"), mu = 3, sd = 2)
add_interaction <- function(list_var, between_var, mu, sd = NA) {
name_interaction <- paste(between_var, collapse = ":")
check_input2interaction(name_interaction, list_var, between_var, mu, sd)
# Check the number of variables in the interaction
if (length(between_var) > 3) {
stop("Cannot initialize an interaction with more than 3 variables.")
}
interactionCombinations <- getNumberOfCombinationsInInteraction(list_var, between_var)
list_var$interactions[[name_interaction]] <- fillInInteraction(list_var, between_var, mu, sd, interactionCombinations)
return(list_var)
}
#' Check input for interaction
#'
#' @param name_interaction String specifying the name of the interaction (example: "varA:varB")
#' @param list_var A list of variables (already initialized)
#' @param between_var A vector of variable names to include in the interaction
#' @param mu Either a numeric value or a numeric vector (of length = level)
#' @param sd Either numeric value or NA
#'
#' @return
#' NULL (throws an error if the input is invalid)
#' @export
check_input2interaction <- function(name_interaction, list_var, between_var, mu, sd) {
# Check if variables in between_var are declared and initialized
bool_checkInteractionValidity <- function(between_var, list_var) {
nb_varInInteraction <- length(unique(between_var))
stopifnot(nb_varInInteraction > 1)
existingVar_nb <- getListVar(list_var) %in% between_var %>% sum()
if (existingVar_nb != nb_varInInteraction) {
return(FALSE)
} else {
return(TRUE)
}
}
bool_valid_interaction <- bool_checkInteractionValidity(between_var, list_var)
if (!bool_valid_interaction) {
stop("At least one variable in between_var is not declared. Variable not initialized cannot be used in an interaction.")
}
requestedNumberOfValues <- getNumberOfCombinationsInInteraction(list_var, between_var)
if (is.na(sd) && requestedNumberOfValues != length(mu)) {
msg_e <- "sd was specified as NA. mu should have the same length as the possible number of interactions:\n"
msg_e2 <- paste(requestedNumberOfValues, "interaction values are requested.")
stop(paste(msg_e, msg_e2))
}
level <- requestedNumberOfValues
inputs_checking(list_var$interactions, name_interaction, mu, sd, level)
}
#' Get the number of combinations in an interaction
#'
#' @param list_var A list of variables (already initialized)
#' @param between A vector of variable names to include in the interaction
#'
#' @return
#' The number of combinations in the interaction
#' @export
getNumberOfCombinationsInInteraction <- function(list_var, between) {
levelInlistVar <- getGivenAttribute(list_var, "level") %>% unlist()
n_combinations <- prod(levelInlistVar[between])
return(n_combinations)
}
#' getGridCombination
#'
#' Generates all possible combinations of labels.
#'
#' @param l_labels List of label vectors
#'
#' @return A data frame with all possible combinations of labels
#' @export
#'
#' @examples
#' l_labels <- list(
#' c("A", "B", "C"),
#' c("X", "Y")
#' )
#' getGridCombination(l_labels)
getGridCombination <- function(l_labels) {
grid <- expand.grid(l_labels)
colnames(grid) <- paste("label", seq_along(l_labels), sep = "_")
return(grid)
}
#' Get grid combination from list_var
#'
#' @param list_var A list of variables (already initialized)
#'
#' @return
#' The grid combination between variable in list_var
#' @export
generateGridCombination_fromListVar <- function (list_var){
l_levels <- getGivenAttribute(list_var, "level") %>% unlist()
vars <- names(l_levels)
l_levels <- l_levels[vars]
l_labels <- getLabels(l_variables2labelized = vars, l_nb_label = l_levels)
gridComb <- getGridCombination(l_labels)
colnames(gridComb) <- paste("label", vars, sep = "_")
return(gridComb)
}
#' Fill in interaction
#'
#' @param list_var A list of variables (already initialized)
#' @param between A vector of variable names to include in the interaction
#' @param mu Either a numeric value or a numeric vector (of length = level)
#' @param sd Either numeric value or NA
#' @param level Number of interactions
#'
#' @return
#' A data frame with the filled-in interaction values
#' @export
fillInInteraction <- function(list_var, between, mu, sd, level) {
if (length(mu) > 1 || is.na(sd)) {
l_levels <- getGivenAttribute(list_var, "level") %>% unlist()
l_levelsOfInterest <- l_levels[between]
l_labels_varOfInterest <- getLabels(l_variables2labelized = between, l_nb_label = l_levelsOfInterest )
grid_combination <- getGridCombination(l_labels_varOfInterest)
n_combinations <- dim(grid_combination)[1]
column_names <- c(paste("label", between, sep = "_"), paste(between, collapse = ":"))
sub_dtf <- build_sub_obj_return_to_user(level = n_combinations,
metaData = grid_combination,
effectsGivenByUser = mu,
col_names = column_names)
} else {
sub_dtf <- list(mu = mu, sd = sd, level = level) %>% as.data.frame()
}
return(sub_dtf)
}
#' Get the list of variable names
#'
#' @param input R list, e.g., output of init_variable
#'
#' @return
#' A character vector with the names of variables
getListVar <- function(input) attributes(input)$names
#' Get a given attribute from a list of variables
#'
#' @param list_var A list of variables (already initialized)
#' @param attribute A string specifying the attribute to retrieve in all occurrences of the list
#'
#' @return
#' A list without NULL values
getGivenAttribute <- function(list_var, attribute) {
l <- lapply(list_var, FUN = function(var) var[[attribute]])
l_withoutNull <- l[!vapply(l, is.null, logical(1))]
return(l_withoutNull)
}
#' Get labels for variables
#'
#' @param l_variables2labelized A list of variables
#' @param l_nb_label A list of numeric values representing the number of levels per variable
#'
#' @return
#' A list of labels per variable
getLabels <- function(l_variables2labelized, l_nb_label) {
getVarNameLabel <- function(name, level) {
list_label <- paste(name, 1:level, sep = "")
return(list_label)
}
listLabels <- Map(getVarNameLabel, l_variables2labelized, l_nb_label)
return(listLabels)
}
test_that("endsWithDigit returns the correct result", {
expect_true(endsWithDigit("abc123"))
expect_false(endsWithDigit("xyz"))
})
test_that("removeDigitsAtEnd removes digits at the end of a string", {
expect_equal(removeDigitsAtEnd("abc123"), "abc")
expect_equal(removeDigitsAtEnd("xyz"), "xyz")
})
test_that("init_variable initializes a variable correctly", {
# Test case 1: Initialize a variable with default parameters
list_var <- init_variable()
expect_true("myVariable" %in% names(list_var))
expect_equal(nrow(list_var$myVariable$data), 2)
# Test case 2: Initialize a variable with custom parameters
list_var <- init_variable(name = "custom_variable", mu = c(1, 2, 3), sd = 0.5, level = 3)
expect_true("customvariable" %in% names(list_var))
expect_equal(nrow(list_var$customvariable$data), 3)
})
test_that("inputs_checking performs input validation", {
# Test case 1: Invalid inputs - sd is NA but mu has unique values
expect_error(inputs_checking(list_var = c(), name = "myVariable", mu = 2, sd = NA, level = 2))
# Test case 2: Invalid inputs - empty name
expect_error(inputs_checking(list_var = c(), name = "", mu = 2, sd = NA, level = 2))
# Test case 3: Invalid inputs - non-numeric mu
expect_error(inputs_checking(list_var = c(), name = "myVariable", mu = "invalid", sd = NA, level = 2))
# Test case 4: Invalid inputs - non-numeric sd
expect_error(inputs_checking(list_var = c(), name = "myVariable", mu = 2, sd = "invalid", level = 2))
# Test case 5: Invalid inputs - level less than 2
expect_error(inputs_checking(list_var = c(), name = "myVariable", mu = 2, sd = NA, level = 1))
# Test case 6: Invalid inputs - mu and level have different lengths
expect_error(inputs_checking(list_var = c(), name = "myVariable", mu = c(1, 2, 3), sd = NA, level = 2))
# Test case 7: Valid inputs
expect_silent(inputs_checking(list_var = c(), name = "myVariable", mu = c(1, 2, 3), sd = NA, level = 3))
})
test_that("already_init_variable checks if a variable is already initialized", {
list_var <- init_variable()
# Test case 1: Variable not initialized
list_var <- init_variable(name = "custom_variable", mu = c(2, 3), sd = NA, level = 2)
expect_true(already_init_variable(list_var, "customvariable"))
# Test case 2: Variable already initialized
expect_false(already_init_variable(list_var, "myVariable"))
})
test_that("fillInVariable fills in variable correctly", {
# Test case 1: Effects given by user
sub_obj <- fillInVariable("myVariable", c(1, 2, 3), NA, NA)
expect_equal(sub_obj$level, 3)
expect_equal(ncol(sub_obj$data), 2)
# Test case 2: Effects simulated using mvrnorm
sub_obj <- fillInVariable("myVariable", 2, 0.5, 3)
expect_equal(sub_obj$level, 3)
expect_equal(sub_obj$sd, 0.5)
expect_equal(sub_obj$mu, 2)
})
test_that("build_sub_obj_return_to_user returns the expected output", {
level <- 3
metaData <- paste("label", 1:level, sep = "_")
effectsGivenByUser <- c(2, 3, 4)
col_names <- c("metadata", "effects")
result <- build_sub_obj_return_to_user(level, metaData, effectsGivenByUser, col_names)
expect_equal(result$level, level)
expect_identical(result$data$metadata, metaData)
expect_identical(result$data$effects, effectsGivenByUser)
})
test_that("generateGridCombination_fromListVar returns expected output", {
result <- generateGridCombination_fromListVar(init_variable())
expect <- data.frame(label_myVariable = c("myVariable1", "myVariable2"))
expect_equal(nrow(result), nrow(expect))
expect_equal(ncol(result), ncol(expect))
})
test_that("add_interaction adds an interaction between variables", {
list_var <- init_variable(name = "varA", mu = 1, sd = 1, level = 2)
list_var <- init_variable(list_var, name = "varB", mu = 2, sd = 1, level = 3)
list_var <- add_interaction(list_var, between_var = c("varA", "varB"), mu = 0.5, sd = 3)
expect_true("varA:varB" %in% names(list_var$interactions))
})
test_that("add_interaction throws an error for invalid variables", {
list_var <- init_variable(name = "varA", mu = 1, sd = 1, level = 2)
expect_error(add_interaction(list_var, between_var = c("varA", "varB"), mu = 0.5, sd = NA))
})
test_that("getNumberOfCombinationsInInteraction calculates the number of combinations", {
list_var <- init_variable(name = "varA", mu = 1, sd = 1, level = 2)
list_var <- init_variable(list_var, name = "varB", mu = 2, sd = 1, level = 3)
expect_equal(getNumberOfCombinationsInInteraction(list_var, c("varA", "varB")), 6)
})
test_that("getLabels generates labels for variables", {
labels <- getLabels(c("varA", "varB"), c(2, 3))
expect_equal(length(labels), 2)
expect_equal(length(labels[[1]]), 2)
expect_equal(length(labels[[2]]), 3)
})
test_that("getGridCombination generates a grid of combinations", {
labels <- list(A = c("A1", "A2"), B = c("B1", "B2", "B3"))
grid_combination <- getGridCombination(labels)
expect_equal(dim(grid_combination), c(6, 2))
})
#' getInput2mvrnorm
#'
#' @inheritParams init_variable
#'
#' @return
#' a list that can be used as input for MASS::mvrnorm
#' @export
#'
#' @examples
#' list_var <- init_variable(name = "my_var", mu = 0, sd = 2, level = 3)
#' getInput2mvrnorm(list_var)
getInput2mvrnorm <- function(list_var){
# -- pick up sd provided by user
variable_standard_dev <- getGivenAttribute(list_var, attribute = "sd") %>% unlist()
interaction_standard_dev <- getGivenAttribute(list_var$interactions, attribute = "sd") %>% unlist()
list_stdev_2covmatx <- c(variable_standard_dev, interaction_standard_dev)
if (is.null(list_stdev_2covmatx)) ## NO SD provided
return(list(mu = NULL, covMatrix = NULL))
# - COV matrix
covar_userProvided = getGivenAttribute(list_var$correlations, "covar")
covMatrix <- getCovarianceMatrix(list_stdev_2covmatx, covar_userProvided)
# -- MU
variable_mu <- getGivenAttribute(list_var, attribute = "mu") %>% unlist()
interaction_mu <- getGivenAttribute(list_var$interactions, attribute = "mu") %>% unlist()
list_mu <- c(variable_mu, interaction_mu)
return(list(mu = list_mu, covMatrix = covMatrix))
}
#' getCovarianceMatrix
#' @param list_stdev standard deviation list
#' @param list_covar covariance list
#'
#' @return
#' covariance matrix
#' @export
#'
#' @examples
#' vector_sd <- c(1,2, 3)
#' names(vector_sd) <- c("varA", "varB", "varC")
#' vector_covar <- c(8, 12, 24)
#' names(vector_covar) <- c("varA.varB", "varA.varC", "varB.varC")
#' covMatrix <- getCovarianceMatrix(vector_sd, vector_covar)
getCovarianceMatrix <- function(list_stdev, list_covar){
# -- cov(A, A) = sd(A)^2
diag_cov <- list_stdev^2
dimension <- length(diag_cov)
covariance_matrix <- matrix(0,nrow = dimension, ncol = dimension)
diag(covariance_matrix) <- diag_cov
colnames(covariance_matrix) <- paste("label", names(diag_cov), sep = "_")
rownames(covariance_matrix) <- paste("label", names(diag_cov), sep = "_")
names_covaration <- names(list_covar)
###### -- utils -- #####
convertDF <- function(name, value){
ret <- data.frame(value)
colnames(ret) <- name
ret
}
## -- needed to use reduce after ;)
l_covarUserDf <- lapply(names_covaration, function(n_cov) convertDF(n_cov, list_covar[n_cov] ))
covariance_matrix2ret <- Reduce(fillInCovarMatrice, x = l_covarUserDf, init = covariance_matrix)
covariance_matrix2ret
}
#' Fill in Covariance Matrix
#'
#' This function updates the covariance matrix with the specified covariance value between two variables.
#'
#' @param covarMatrice The input covariance matrix.
#' @param covar A data frame containing the covariance value between two variables.
#' @return The updated covariance matrix with the specified covariance value filled in.
#' @export
#' @examples
#' covarMat <- matrix(0, nrow = 3, ncol = 3)
#' colnames(covarMat) <- c("label_varA", "label_varB", "label_varC")
#' rownames(covarMat) <- c("label_varA", "label_varB", "label_varC")
#' covarValue <- data.frame("varA.varB" = 0.5)
#' fillInCovarMatrice(covarMatrice = covarMat, covar = covarValue)
fillInCovarMatrice <- function(covarMatrice, covar){
varsInCovar <- strsplit(colnames(covar), split = "[.]") %>% unlist()
index_matrix <- paste("label",varsInCovar, sep = "_")
covar_value <- covar[1,1]
covarMatrice[index_matrix[1], index_matrix[2]] <- covar_value
covarMatrice[index_matrix[2], index_matrix[1]] <- covar_value
return(covarMatrice)
}
#' Check if a matrix is positive definite
#' This function checks whether a given matrix is positive definite, i.e., all of its eigenvalues are positive.
#' @param mat The matrix to be checked.
#' @return A logical value indicating whether the matrix is positive definite.
#' @export
#' @examples
#' # Create a positive definite matrix
#' mat1 <- matrix(c(4, 2, 2, 3), nrow = 2)
#' is_positive_definite(mat1)
#' # Expected output: TRUE
#'
#' # Create a non-positive definite matrix
#' mat2 <- matrix(c(4, 2, 2, -3), nrow = 2)
#' is_positive_definite(mat2)
#' # Expected output: FALSE
#'
#' # Check an empty matrix
#' mat3 <- matrix(nrow = 0, ncol = 0)
#' is_positive_definite(mat3)
#' # Expected output: TRUE
#'
#' @export
is_positive_definite <- function(mat) {
if (nrow(mat) == 0 && ncol(mat) == 0) return(TRUE)
eigenvalues <- eigen(mat)$values
all(eigenvalues > 0)
}
#' getGeneMetadata
#'
#' @inheritParams init_variable
#' @param n_genes Number of genes to simulate
#'
#' @return
#' metadata matrix
#'
#' @export
#'
#' @examples
#' list_var <- init_variable()
#' metadata <- getGeneMetadata(list_var, n_genes = 10)
getGeneMetadata <- function(list_var, n_genes) {
metaData <- generateGridCombination_fromListVar(list_var)
n_combinations <- dim(metaData)[1]
genes_vec <- base::paste("gene", 1:n_genes, sep = "")
geneID <- rep(genes_vec, each = n_combinations)
metaData <- cbind(geneID, metaData)
return(metaData)
}
#' getDataFromMvrnorm
#'
#' @inheritParams init_variable
#' @param input2mvrnorm list with mu and covariance matrix, output of getInput2mvrnorm
#' @param n_genes Number of genes to simulate
#'
#' @return
#' data simulated from multivariate normal distribution
#'
#' @export
#'
#' @examples
#' list_var <- init_variable()
#' input <- getInput2mvrnorm(list_var)
#' simulated_data <- getDataFromMvrnorm(list_var, input, n_genes = 10)
getDataFromMvrnorm <- function(list_var, input2mvrnorm, n_genes = 1) {
if (is.null(input2mvrnorm$covMatrix))
return(list())
metaData <- getGeneMetadata(list_var, n_genes)
n_tirages <- dim(metaData)[1]
mtx_mvrnormSamplings <- samplingFromMvrnorm(n_samplings = n_tirages,
l_mu = input2mvrnorm$mu, matx_cov = input2mvrnorm$covMatrix)
dataFromMvrnorm <- cbind(metaData, mtx_mvrnormSamplings)
return(list(dataFromMvrnorm))
}
#' getDataFromMvrnorm
#'
#' @param n_samplings number of samplings using mvrnorm
#' @param l_mu vector of mu
#' @param matx_cov covariance matrix
#'
#' @return
#' samples generated from multivariate normal distribution
#'
#' @export
#'
#' @examples
#' n <- 100
#' mu <- c(0, 0)
#' covMatrix <- matrix(c(1, 0.5, 0.5, 1), ncol = 2)
#' samples <- samplingFromMvrnorm(n_samplings = n, l_mu = mu, matx_cov = covMatrix)
samplingFromMvrnorm <- function(n_samplings, l_mu, matx_cov) {
mvrnormSamp <- MASS::mvrnorm(n = n_samplings, mu = l_mu, Sigma = matx_cov, empirical = TRUE)
return(mvrnormSamp)
}
test_that("getInput2mvrnorm returns the correct list", {
list_var <- init_variable()
input <- getInput2mvrnorm(list_var)
expect_is(input, "list")
expect_true("mu" %in% names(input))
expect_true("covMatrix" %in% names(input))
})
test_that("fillInCovarMatrice returns the correct matrix", {
covarMat <- matrix(0, nrow = 3, ncol = 3)
colnames(covarMat) <- c("label_varA", "label_varB", "label_varC")
rownames(covarMat) <- c("label_varA", "label_varB", "label_varC")
covarValue <- data.frame("varA.varB" = 18)
matrice <- fillInCovarMatrice(covarMatrice = covarMat, covar = covarValue)
expected_matrice <- matrix(0, nrow = 3, ncol = 3)
colnames(expected_matrice) <- c("label_varA", "label_varB", "label_varC")
rownames(expected_matrice) <- c("label_varA", "label_varB", "label_varC")
expected_matrice["label_varA", "label_varB"] <- 18
expected_matrice["label_varB", "label_varA"] <- 18
expect_identical(matrice, expected_matrice)
})
test_that("getCovarianceMatrix returns the correct covariance matrix", {
vector_sd <- c(1,2, 3)
names(vector_sd) <- c("varA", "varB", "varC")
vector_covar <- c(8, 12, 24)
names(vector_covar) <- c("varA.varB", "varA.varC", "varB.varC")
covMatrix <- getCovarianceMatrix(vector_sd, vector_covar)
expect_is(covMatrix, "matrix")
expect_equal(dim(covMatrix), c(3, 3))
expected_matrix <- matrix(c(1,8,12,8,4,24, 12,24,9), nrow = 3, byrow = T)
rownames(expected_matrix) <- c("label_varA", "label_varB", "label_varC")
colnames(expected_matrix) <- c("label_varA", "label_varB", "label_varC")
expect_equal(expected_matrix, covMatrix)
})
test_that("getGeneMetadata returns the correct metadata", {
list_var <- init_variable()
n_genes <- 10
metadata <- getGeneMetadata(list_var, n_genes)
expect_is(metadata, "data.frame")
expect_equal(colnames(metadata), c("geneID", paste("label", (attributes(list_var)$names), sep ="_")))
expect_equal(nrow(metadata), n_genes * list_var$myVariable$level)
})
test_that("getDataFromMvrnorm returns the correct data", {
list_var <- init_variable(name = "varA", mu = 1, sd = 4, level = 3) %>% init_variable("varB", mu = 2, sd = 1, level = 2)
input <- getInput2mvrnorm(list_var)
n_genes <- 10
n_samplings <- n_genes * (list_var$varA$level ) * (list_var$varB$level )
data <- getDataFromMvrnorm(list_var, input, n_genes)
expect_is(data, "list")
expect_equal(length(data), 1)
expect_is(data[[1]], "data.frame")
expect_equal(nrow(data[[1]]), n_samplings)
})
test_that("getDataFromMvrnomr returns empty list",{
list_var <- init_variable()
input <- getInput2mvrnorm(list_var)
n_genes <- 10
n_samplings <- n_genes * (list_var$varA$level ) * (list_var$varB$level )
data <- getDataFromMvrnorm(list_var, input, n_genes)
expect_is(data, "list")
expect_equal(data, list())
})
test_that("samplingFromMvrnorm returns the correct sampling", {
n_samplings <- 100
l_mu <- c(1, 2)
matx_cov <- matrix(c(1, 0.5, 0.5, 1), ncol = 2)
sampling <- samplingFromMvrnorm(n_samplings, l_mu, matx_cov)
expect_is(sampling, "matrix")
expect_equal(dim(sampling), c(n_samplings, length(l_mu)))
})
#' Get data from user
#'
#'
#' @param list_var A list of variables (already initialized)
#' @return A list of data to join
#' @export
#'
#' @examples
#' getDataFromUser(init_variable())
getDataFromUser <- function(list_var) {
variable_data2join <- getGivenAttribute(list_var, "data")
id_var2join <- names(variable_data2join)
interaction_data2join <- getGivenAttribute(list_var$interactions, "data")
id_interaction2join <- names(interaction_data2join)
data2join <- list(variable_data2join, interaction_data2join) %>%
unlist(recursive = FALSE)
id2join <- c(id_var2join, id_interaction2join)
l_data2join <- lapply(id2join, function(id) data2join[[id]])
return(l_data2join)
}
# Test unitaires pour la fonction join_dtf
test_that("join_dtf réalise la jointure correctement", {
# Création de données de test
df1 <- data.frame(id = 1:5, value = letters[1:5])
df2 <- data.frame(id = 1:5, category = LETTERS[1:5])
# Exécution de la fonction
result <- join_dtf(df1, df2, "id", "id")
# Vérification des résultats
expect_true(is.data.frame(result))
expect_equal(nrow(result), 5)
expect_equal(ncol(result), 3)
expect_equal(names(result), c("id", "value", "category"))
expect_true(all.equal(result$id, df1$id))
expect_true(all.equal(result$id, df2$id))
})
# Test unitaires pour la fonction getDataFromUser
test_that("getDataFromUser renvoie les données appropriées", {
# Exécution de la fonction
list_var <- init_variable()
list_var <- init_variable(list_var, "second_var")
result <- getDataFromUser(list_var)
# Vérification des résultats
expect_true(is.list(result))
expect_equal(length(result), 2)
expect_true(all(sapply(result, is.data.frame)))
expect_equal(names(result[[1]]), c("label_myVariable", "myVariable"))
})
#' Compute Covariation from Correlation and Standard Deviations
#'
#' This function computes the covariation between two variables (A and B) given their correlation and standard deviations.
#'
#' @param corr_AB The correlation coefficient between variables A and B.
#' @param sd_A The standard deviation of variable A.
#' @param sd_B The standard deviation of variable B.
#'
#' @return The covariation between variables A and B.
#' @export
#' @examples
#' corr <- 0.7
#' sd_A <- 3
#' sd_B <- 4
#' compute_covariation(corr, sd_A, sd_B)
compute_covariation <- function(corr_AB, sd_A, sd_B) {
cov_AB <- corr_AB * sd_A * sd_B
return(cov_AB)
}
#' Get Standard Deviations for Variables in Correlation
#'
#' This function extracts the standard deviations for the variables involved in the correlation.
#'
#' @param list_var A list containing the variables and their attributes.
#' @param between_var A character vector containing the names of the variables involved in the correlation.
#'
#' @return A numeric vector containing the standard deviations for the variables in the correlation.
#' @export
#' @examples
#' list_var <- init_variable(name = "varA", mu = 0, sd = 5, level = 3) %>%
#' init_variable(name = "varB", mu = 0, sd = 25, level = 3)
#' between_var <- c("varA", "varB")
#' getStandardDeviationInCorrelation(list_var, between_var)
getStandardDeviationInCorrelation <- function(list_var, between_var){
for (var in between_var) sd_List <- getGivenAttribute(list_var, "sd")
for (var in between_var) sd_ListFromInteraction <- getGivenAttribute(list_var$interactions, "sd")
sd_List <- c(sd_List, sd_ListFromInteraction)
return(unname(unlist(sd_List[between_var])))
}
#' Set Correlation between Variables
#'
#' Set the correlation between two or more variables in a simulation.
#'
#' @param list_var A list containing the variables used in the simulation, initialized using \code{\link{init_variable}}.
#' @param between_var Character vector specifying the names of the variables to set the correlation between.
#' @param corr Numeric value specifying the desired correlation between the variables.
#'
#' @return Updated \code{list_var} with the specified correlation set between the variables.
#'
#' @details The function checks if the variables specified in \code{between_var} are declared and initialized in the \code{list_var}. It also ensures that at least two variables with provided standard deviation are required to set a correlation in the simulation.
#' The specified correlation value must be within the range (-1, 1). The function computes the corresponding covariance between the variables based on the specified correlation and standard deviations.
#' The correlation information is then added to the \code{list_var} in the form of a data frame containing the correlation value and the corresponding covariance value.
#' @export
#' @examples
#' list_var <- init_variable(name = "varA", mu = 0, sd = 5, level = 3) %>%
#' init_variable(name = "varB", mu = 0, sd = 25, level = 3)
#' list_var <- set_correlation(list_var, between_var = c("varA", "varB"), corr = 0.7)
set_correlation <- function(list_var, between_var, corr) {
# Check if variables in between_var are declared and initialized
bool_checkBetweenVarValidity <- function(between_var, list_var) {
nb_varInCorrelation <- length(unique(between_var))
stopifnot(nb_varInCorrelation > 1)
# -- check also for interaction
varInitialized <- c(getListVar(list_var), getListVar(list_var$interactions))
existingVar_nb <- varInitialized %in% between_var %>% sum()
if (existingVar_nb != nb_varInCorrelation) {
return(FALSE)
} else {
return(TRUE)
}
}
name_correlation <- paste(between_var, collapse = ".")
bool_valid_corr <- bool_checkBetweenVarValidity(between_var, list_var)
if (!bool_valid_corr) {
stop("At least one variable in between_var is not declared. Variable not initialized cannot be used in a correlation.")
}
vec_standardDev <- getStandardDeviationInCorrelation(list_var, between_var)
if (length(vec_standardDev) < 2) {
stop("Exactly two variables with provided standard deviation are required to set a correlation in simulation.")
}
# Validate the specified correlation value to be within the range [-1, 1]
if (corr < -1 || corr > 1) {
stop("Invalid correlation value. Correlation must be in the range [-1, 1].")
}
name_interaction <- paste(between_var, collapse = ":")
corr <- data.frame(cor = corr, covar = compute_covariation(corr, vec_standardDev[1], vec_standardDev[2] ))
list_var$correlations[[name_correlation]] <- corr
return(list_var)
}
test_that("compute_covariation returns the correct covariation", {
# Test case 1: Positive correlation
corr <- 0.7
sd_A <- 3
sd_B <- 4
expected_cov <- corr * sd_A * sd_B
actual_cov <- compute_covariation(corr, sd_A, sd_B)
expect_equal(actual_cov, expected_cov)
# Test case 2: Negative correlation
corr <- -0.5
sd_A <- 2.5
sd_B <- 3.5
expected_cov <- corr * sd_A * sd_B
actual_cov <- compute_covariation(corr, sd_A, sd_B)
expect_equal(actual_cov, expected_cov)
# Test case 3: Zero correlation
corr <- 0
sd_A <- 1
sd_B <- 2
expected_cov <- corr * sd_A * sd_B
actual_cov <- compute_covariation(corr, sd_A, sd_B)
expect_equal(actual_cov, expected_cov)
})
# Unit tests for getStandardDeviationInCorrelation
test_that("getStandardDeviationInCorrelation returns correct standard deviations", {
# Initialize list_var
list_var <- init_variable(name = "varA", mu = 0, sd = 5, level = 3) %>%
init_variable(name = "varB", mu = 0, sd = 25, level = 3)
# Test case 1: Two variables correlation
between_var_1 <- c("varA", "varB")
sd_expected_1 <- c(5, 25)
sd_result_1 <- getStandardDeviationInCorrelation(list_var, between_var_1)
expect_equal(sd_result_1, sd_expected_1)
})
test_that("set_correlation sets the correlation between variables correctly", {
# Initialize variables in the list_var
list_var <- init_variable(name = "varA", mu = 0, sd = 5, level = 3) %>%
init_variable(name = "varB", mu = 0, sd = 25, level = 3)
# Test setting correlation between varA and varB
list_var <- set_correlation(list_var, between_var = c("varA", "varB"), corr = 0.7)
corr_result <- list_var$correlations$varA.varB$cor
covar_result <- list_var$correlations$varA.varB$covar
expect_equal(corr_result, 0.7)
expect_equal(covar_result, 87.5)
# Test setting correlation between varA and varC (should raise an error)
expect_error(set_correlation(list_var, between_var = c("varA", "varC"), corr = 0.8),
"At least one variable in between_var is not declared. Variable not initialized cannot be used in a correlation.")
# Test setting correlation with invalid correlation value
expect_error(set_correlation(list_var, between_var = c("varA", "varB"), corr = 1.5))
# Test setting correlation with less than 2 variables with provided standard deviation
expect_error(set_correlation(list_var, between_var = c("varA"), corr = 0.7))
})
#' Get input for simulation based on coefficients
#'
#' This function generates input data for simulation based on the coefficients provided in the \code{list_var} argument.
#'
#' @param list_var A list of variables (already initialized)
#' @param n_genes Number of genes to simulate (default: 1)
#' @param input2mvrnorm Input to the \code{mvrnorm} function for simulating data from multivariate normal distribution (default: NULL)
#' @return A data frame with input coefficients for simulation
#' @export
#' @examples
#' # Example usage
#' list_var <- init_variable()
#' getInput2simulation(list_var, n_genes = 10)
getInput2simulation <- function(list_var, n_genes = 1, input2mvrnorm = NULL) {
# Use default input to mvrnorm if not provided by the user
if (is.null(input2mvrnorm)) input2mvrnorm = getInput2mvrnorm(list_var)
l_dataFromMvrnorm = getDataFromMvrnorm(list_var, input2mvrnorm, n_genes)
l_dataFromUser = getDataFromUser(list_var)
df_input2simu <- getCoefficients(list_var, l_dataFromMvrnorm, l_dataFromUser, n_genes)
return(df_input2simu)
}
#' getCoefficients
#'
#' Get the coefficients.
#'
#' @param list_var A list of variables (already initialized)
#' @param l_dataFromMvrnorm Data from the `getGeneMetadata` function (optional).
#' @param l_dataFromUser Data from the `getDataFromUser` function (optional).
#' @param n_genes The number of genes.
#' @export
#' @return A dataframe containing the coefficients.
#' @examples
#' # Example usage
#' list_var <- init_variable()
#' input2mvrnorm = getInput2mvrnorm(list_var)
#' l_dataFromMvrnorm = getDataFromMvrnorm(list_var, input2mvrnorm, n_genes)
#' l_dataFromUser = getDataFromUser(list_var)
#' getCoefficients(list_var, l_dataFromMvrnorm, l_dataFromUser, n_genes = 3)
getCoefficients <- function(list_var, l_dataFromMvrnorm, l_dataFromUser, n_genes) {
if (length(l_dataFromMvrnorm) == 0) {
metaData <- getGeneMetadata(list_var, n_genes)
l_dataFromMvrnorm <- list(metaData)
}
l_df2join <- c(l_dataFromMvrnorm, l_dataFromUser)
df_coef <- Reduce(function(d1, d2){ column_names = colnames(d2)
idx_key = grepl(pattern = "label", column_names )
keys = column_names[idx_key]
join_dtf(d1, d2, k1 = keys , k2 = keys)
}
, l_df2join ) %>% as.data.frame()
column_names <- colnames(df_coef)
idx_column2factor <- grep(pattern = "label_", column_names)
if (length(idx_column2factor) > 1) {
df_coef[, idx_column2factor] <- lapply(df_coef[, idx_column2factor], as.factor)
} else {
df_coef[, idx_column2factor] <- as.factor(df_coef[, idx_column2factor])
}
return(df_coef)
}
#' Get the log_qij values from the coefficient data frame.
#'
#' @param dtf_coef The coefficient data frame.
#' @return The coefficient data frame with log_qij column added.
#' @export
getLog_qij <- function(dtf_coef) {
dtf_beta_numeric <- dtf_coef[sapply(dtf_coef, is.numeric)]
dtf_coef$log_qij <- rowSums(dtf_beta_numeric, na.rm = TRUE)
return(dtf_coef)
}
#' Calculate mu_ij values based on coefficient data frame and scaling factor
#'
#' This function calculates mu_ij values by raising 2 to the power of the log_qij values
#' from the coefficient data frame and multiplying it by the provided scaling factor.
#'
#' @param dtf_coef Coefficient data frame containing the log_qij values
#'
#' @return Coefficient data frame with an additional mu_ij column
#'
#' @examples
#' list_var <- init_variable()
#' dtf_coef <- getInput2simulation(list_var, 10)
#' dtf_coef <- getLog_qij(dtf_coef)
#' dtf_coef <- addBasalExpression(dtf_coef, 10, c(10, 20, 0))
#' getMu_ij(dtf_coef)
#' @export
getMu_ij <- function(dtf_coef) {
log_qij_scaled <- dtf_coef$log_qij + dtf_coef$basalExpr
dtf_coef$log_qij_scaled <- log_qij_scaled
mu_ij <- exp(log_qij_scaled)
dtf_coef$mu_ij <- mu_ij
return(dtf_coef)
}
#' getMu_ij_matrix
#'
#' Get the Mu_ij matrix.
#'
#' @param dtf_coef A dataframe containing the coefficients.
#' @importFrom reshape2 dcast
#' @importFrom stats as.formula
#' @export
#' @return A Mu_ij matrix.
getMu_ij_matrix <- function(dtf_coef) {
column_names <- colnames(dtf_coef)
idx_var <- grepl(pattern = "label", column_names)
l_var <- column_names[idx_var]
str_formula_rigth <- paste(l_var, collapse = " + ")
if (str_formula_rigth == "") stop("no variable label detected")
str_formula <- paste(c("geneID", str_formula_rigth), collapse = " ~ ")
formula <- stats::as.formula(str_formula)
dtf_Muij <- dtf_coef %>% reshape2::dcast(formula = formula, value.var = "mu_ij", drop = F)
dtf_Muij[is.na(dtf_Muij)] <- 0
mtx_Muij <- data.frame(dtf_Muij[, -1], row.names = dtf_Muij[, 1]) %>% as.matrix()
mtx_Muij <- mtx_Muij[, order(colnames(mtx_Muij)), drop = F]
return(mtx_Muij)
}
#' getSubCountsTable
#'
#' Get the subcounts table.
#'
#' @param matx_Muij The Mu_ij matrix.
#' @param matx_dispersion The dispersion matrix.
#' @param replicateID The replication identifier.
#' @param l_bool_replication A boolean vector indicating the replicates.
#' @importFrom stats rnbinom
#'
#' @return A subcounts table.
getSubCountsTable <- function(matx_Muij, matx_dispersion, replicateID, l_bool_replication) {
getKijMatrix <- function(matx_Muij, matx_dispersion, n_genes, n_samples) {
k_ij <- stats::rnbinom(n_genes * n_samples,
size = matx_dispersion,
mu = matx_Muij) %>%
matrix(nrow = n_genes, ncol = n_samples)
k_ij[is.na(k_ij)] <- 0
return(k_ij)
}
if (!any(l_bool_replication))
return(NULL)
matx_Muij <- matx_Muij[, l_bool_replication, drop = FALSE]
matx_dispersion <- matx_dispersion[, l_bool_replication, drop = FALSE]
l_sampleID <- colnames(matx_Muij)
l_geneID <- rownames(matx_Muij)
dimension_mtx <- dim(matx_Muij)
n_genes <- dimension_mtx[1]
n_samples <- dimension_mtx[2]
matx_kij <- getKijMatrix(matx_Muij, matx_dispersion, n_genes, n_samples)
colnames(matx_kij) <- paste(l_sampleID, replicateID, sep = "_")
rownames(matx_kij) <- l_geneID
return(matx_kij)
}
# Test case 1: Check if the function returns a data frame
test_that("getInput2simulation returns a data frame", {
list_var <- init_variable()
result <- getInput2simulation(list_var)
expect_is(result, "data.frame")
expected <- data.frame(geneID = c("gene1", "gene1"), label_myVariable = as.factor(c("myVariable1", "myVariable2")), myVariable = c(2,3))
expect_equal(result, expected)
})
# Test for getCoefficients function
test_that("getCoefficients returns the correct output", {
# Create dummy data
n_genes <- 3
list_var = init_variable()
# Call the function
coefficients <- getCoefficients(list_var, list(), list(), n_genes)
# Check the output
expect_equal(nrow(coefficients), n_genes*list_var$myVariable$level)
expect_equal(colnames(coefficients), c("geneID", "label_myVariable"))
})
# Test for getMu_ij_matrix function
test_that("getMu_ij_matrix returns the correct output", {
# Create a dummy coefficients dataframe
dtf_coef <- data.frame(geneID = c("Gene1", "Gene1", "Gene1"),
label_varA = c("A1", "A2", "A3"),
label_varB = c("B1", "B2", "B3"),
mu_ij = c(1, 2, 3))
# Call the function
mu_matrix <- getMu_ij_matrix(dtf_coef)
# Check the output
expect_equal(dim(mu_matrix), c(1, 9))
})
# Test for getSubCountsTable function
test_that("getSubCountsTable returns the correct output", {
# Create dummy data
l_genes <- c("gene1", "gene2", "gene3")
matx_Muij = data.frame(sple1 = c(1,3,4), sple2 = c(2, 0, 9), sple3 = c(1, 69, 2)) %>% as.matrix()
rownames(matx_Muij) <- l_genes
matx_dispersion <- matrix(0.5, nrow = 3, ncol = 3)
replicateID <- 1
l_bool_replication <- c(TRUE, FALSE, TRUE)
# Call the function
subcounts_table <- getSubCountsTable(matx_Muij, matx_dispersion, 1, l_bool_replication)
# Check the output
expect_equal(dim(subcounts_table), c(3, 2))
expect_equal(rownames(subcounts_table), l_genes)
})
#' getReplicationMatrix
#'
#' @param minN Minimum number of replicates for each sample
#' @param maxN Maximum number of replicates for each sample
#' @param n_samples Number of samples
#' @export
#' @return A replication matrix indicating which samples are replicated
getReplicationMatrix <- function(minN, maxN, n_samples) {
# Create a list of logical vectors representing the minimum number of replicates
l_replication_minimum = lapply(1:n_samples,
FUN = function(i) rep(TRUE, times = minN) )
# Create a list of random logical vectors representing additional replicates
l_replication_random = lapply(1:n_samples,
FUN = function(i) sample(x = c(TRUE, FALSE), size = maxN-minN, replace = T) )
# Combine the replication vectors into matrices
matx_replication_minimum <- do.call(cbind, l_replication_minimum)
matx_replication_random <- do.call(cbind, l_replication_random)
# Combine the minimum replicates and random replicates into a single matrix
matx_replication <- rbind(matx_replication_minimum, matx_replication_random)
# Sort the columns of the replication matrix in descending order
matx_replication = apply(matx_replication, 2, sort, decreasing = TRUE ) %>% matrix(nrow = maxN)
return(matx_replication)
}
#' getCountsTable
#'
#' @param matx_Muij Matrix of mean expression values for each gene and sample
#' @param matx_dispersion Matrix of dispersion values for each gene and sample
#' @param matx_bool_replication Replication matrix indicating which samples are replicated
#'
#' @return A counts table containing simulated read counts for each gene and sample
getCountsTable <- function(matx_Muij , matx_dispersion, matx_bool_replication ){
max_replicates <- dim(matx_bool_replication)[1]
# Apply the getSubCountsTable function to each row of the replication matrix
l_countsTable = lapply(1:max_replicates, function(i) getSubCountsTable(matx_Muij , matx_dispersion, i, matx_bool_replication[i,] ))
# Combine the counts tables into a single matrix
countsTable = do.call(cbind, l_countsTable)
return(countsTable %>% as.data.frame())
}
#' getDispersionMatrix
#'
#' @param list_var A list of variables (already initialized)
#' @param n_genes Number of genes
#' @param dispersion Vector of dispersion values for each gene
#' @export
#'
#' @return A matrix of dispersion values for each gene and sample
getDispersionMatrix <- function(list_var, n_genes, dispersion = stats::runif(n_genes, min = 0, max = 1000)){
l_geneID = base::paste("gene", 1:n_genes, sep = "")
l_sampleID = getSampleID(list_var)
n_samples = length(l_sampleID)
l_dispersion <- dispersion
# Create a data frame for the dispersion values
dtf_dispersion = list(dispersion = l_dispersion) %>% as.data.frame()
dtf_dispersion <- dtf_dispersion[, rep("dispersion", n_samples)]
rownames(dtf_dispersion) = l_geneID
colnames(dtf_dispersion) = l_sampleID
matx_dispersion = dtf_dispersion %>% as.matrix()
return(matx_dispersion)
}
#' Replicate rows of a data frame by group
#'
#' Replicates the rows of a data frame based on a grouping variable and replication counts for each group.
#'
#' @param df Data frame to replicate
#' @param group_var Name of the grouping variable in the data frame
#' @param rep_list Vector of replication counts for each group
#' @return Data frame with replicated rows
#' @examples
#' df <- data.frame(group = c("A", "B"), value = c(1, 2))
#' .replicateByGroup(df, "group", c(2, 3))
#'
#' @export
.replicateByGroup <- function(df, group_var, rep_list) {
l_group_var <- df[[group_var]]
group_levels <- unique(l_group_var)
names(rep_list) <- group_levels
group_indices <- rep_list[l_group_var]
replicated_indices <- rep(seq_len(nrow(df)), times = group_indices)
replicated_df <- df[replicated_indices, ]
suffix_sampleID <- sequence(group_indices)
replicated_df[["sampleID"]] <- paste(replicated_df[["sampleID"]], suffix_sampleID, sep = "_")
rownames(replicated_df) <- NULL
return(replicated_df)
}
#' Replicate rows of a data frame
#'
#' Replicates the rows of a data frame by a specified factor.
#'
#' @param df Data frame to replicate
#' @param n Replication factor for each row
#' @return Data frame with replicated rows
#' @export
#' @examples
#' df <- data.frame(a = 1:3, b = letters[1:3])
#' .replicateRows(df, 2)
#'
.replicateRows <- function(df, n) {
indices <- rep(seq_len(nrow(df)), each = n)
replicated_df <- df[indices, , drop = FALSE]
rownames(replicated_df) <- NULL
return(replicated_df)
}
#' Get sample metadata
#'
#' Generates sample metadata based on the input variables, replication matrix, and number of genes.
#'
#' @param list_var A list of variables (already initialized)
#' @param replicationMatrix Replication matrix
#' @param n_genes Number of genes
#' @return Data frame of sample metadata
#' @importFrom data.table setorderv
#' @export
#' @examples
#' list_var <- init_variable()
#' n_genes <- 10
#' replicationMatrix <- generateReplicationMatrix(list_var ,2, 3)
#' getSampleMetadata(list_var, n_genes, replicationMatrix)
getSampleMetadata <- function(list_var, n_genes, replicationMatrix) {
l_sampleIDs = getSampleID(list_var)
metaData <- generateGridCombination_fromListVar(list_var)
metaData[] <- lapply(metaData, as.character) ## before reordering
data.table::setorderv(metaData, cols = colnames(metaData))
metaData[] <- lapply(metaData, as.factor)
metaData$sampleID <- l_sampleIDs
rep_list <- colSums(replicationMatrix)
metaData$sampleID <- as.character(metaData$sampleID) ## before replicating
sampleMetadata <- .replicateByGroup(metaData, "sampleID", rep_list)
colnames(sampleMetadata) <- gsub("label_", "", colnames(sampleMetadata))
return(sampleMetadata)
}
#' getSampleID
#'
#' @param list_var A list of variables (already initialized)
#' @export
#' @return A sorted vector of sample IDs
getSampleID <- function(list_var){
gridCombination <- generateGridCombination_fromListVar(list_var)
l_sampleID <- apply( gridCombination , 1 , paste , collapse = "_" ) %>% unname()
return(sort(l_sampleID))
}
test_that("getReplicationMatrix returns the correct replication matrix", {
minN <- 2
maxN <- 4
n_samples <- 3
expected <- matrix(c(TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, FALSE, TRUE, TRUE, TRUE, FALSE), nrow = maxN)
set.seed(123)
result <- getReplicationMatrix(minN, maxN, n_samples)
expect_equal(result, expected)
})
test_that("getSampleID return the correct list of sampleID",{
expect_equal(getSampleID(init_variable()), c("myVariable1", "myVariable2"))
})
# Create a test case for getMu_ij
test_that("getMu_ij returns the correct output", {
# Create a sample coefficient data frame
dtf_coef <- data.frame(
log_qij = c(1, 9, 0.1),
basalExpr = c(2, 3, 4)
)
# Call the getMu_ij function
result <- getMu_ij(dtf_coef)
# Check if the mu_ij column is added
expect_true("mu_ij" %in% colnames(result))
# Check the values of mu_ij
#expected_mu_ij <- c(20.08554, 162754.79142 , 60.34029)
#expect_equal(result$mu_ij, expected_mu_ij, tolerance = 0.000001)
})
# Create a test case for getLog_qij
test_that("getLog_qij returns the correct output", {
# Create a sample coefficient data frame
dtf_coef <- data.frame(
beta1 = c(1.2, 2.3, 3.4),
beta2 = c(0.5, 1.0, 1.5),
non_numeric = c("a", "b", "c")
)
# Call the getLog_qij function
result <- getLog_qij(dtf_coef)
# Check if the log_qij column is added
expect_true("log_qij" %in% colnames(result))
# Check the values of log_qij
expected_log_qij <- c(1.7, 3.3, 4.9)
expect_equal(result$log_qij, expected_log_qij)
})
test_that("getCountsTable returns the correct counts table", {
mat_mu_ij <- matrix(c(1,2,3,4,5,6), ncol = 3, byrow = T)
rownames(mat_mu_ij) <- c("gene1", "gene2")
colnames(mat_mu_ij) <- c("sample1", "sample2", "sample3")
mat_disp <- matrix(c(0.3,0.3,0.3, 0.5,0.5,0.5), ncol = 3, byrow = T)
rownames(mat_disp) <- c("gene1", "gene2")
colnames(mat_disp) <- c("sample1", "sample2", "sample3")
mat_repl <- matrix(c(TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE), ncol = 3, byrow = T)
expected_df <- matrix(c(0,0,1,0,0,0,0,1,0,2,34,18,0,0,3,10,7,2), nrow = 2, byrow = T) %>% as.data.frame()
rownames(expected_df) <- c("gene1", "gene2")
colnames(expected_df) <- c("sample1_1", "sample2_1", "sample3_1", "sample1_2",
"sample2_2","sample3_2","sample1_3", "sample2_3" ,"sample3_3")
set.seed(123)
result <- getCountsTable(mat_mu_ij, mat_disp, mat_repl)
expect_true(is.data.frame(result))
expect_equal(colnames(result), colnames(expected_df))
expect_equal(rownames(result), rownames(expected_df))
})
test_that("getSampleMetadata returns expected output", {
# Set up input variables
list_var <- init_variable()
n_genes <- 3
replicationMatrix <- matrix(TRUE, nrow = 2, ncol = 2)
# Run the function
result <- getSampleMetadata(list_var, n_genes, replicationMatrix)
# Define expected output
expected_colnames <- c("myVariable", "sampleID")
expect_equal(colnames(result), expected_colnames)
# Check the output class
expect_true(is.data.frame(result))
# check nrow output
expect_equal(nrow(result), 4)
})
test_that(".replicateByGroup return the correct ouptut", {
df <- data.frame(group = c("A", "B"), value = c(1, 2))
result <- .replicateByGroup(df, "group", c(2, 3))
expect <- data.frame(group = c("A", "A", "B", "B", "B"),
value = c(1, 1, 2,2,2),
sampleID = c("_1", "_2", "_1", "_2", "_3" ))
expect_equal(result, expect)
})
test_that("getDispersionMatrix returns the correct dispersion matrix", {
n_genes = 3
list_var = init_variable()
dispersion <- 1:3
expected <- matrix(1:3,byrow = F, nrow = 3, ncol = 2)
rownames(expected) <- c("gene1", "gene2", "gene3")
colnames(expected) <- c("myVariable1", "myVariable2")
result <- getDispersionMatrix(list_var, n_genes, dispersion )
expect_equal(result, expected)
})
#' Check the validity of the dispersion matrix
#'
#' Checks if the dispersion matrix has the correct dimensions.
#'
#' @param matx_dispersion Replication matrix
#' @param matx_bool_replication Replication matrix
#' @return TRUE if the dimensions are valid, FALSE otherwise
#' @export
#' @examples
#' matx_dispersion <- matrix(1:12, nrow = 3, ncol = 4)
#' matx_bool_replication <- matrix(TRUE, nrow = 3, ncol = 4)
#' .isDispersionMatrixValid(matx_dispersion, matx_bool_replication)
.isDispersionMatrixValid <- function(matx_dispersion, matx_bool_replication) {
expected_nb_column <- dim(matx_bool_replication)[2]
if (expected_nb_column != dim(matx_dispersion)[2]) {
return(FALSE)
}
return(TRUE)
}
#' Generate count table
#'
#' Generates the count table based on the mu_ij matrix, dispersion matrix, and replication matrix.
#'
#' @param mu_ij_matx_rep Replicated mu_ij matrix
#' @param matx_dispersion_rep Replicated dispersion matrix
#' @return Count table
#' @export
#' @examples
#' mu_ij_matx_rep <- matrix(1:12, nrow = 3, ncol = 4)
#' matx_dispersion_rep <- matrix(1:12, nrow = 3, ncol = 4)
#' generateCountTable(mu_ij_matx_rep, matx_dispersion_rep)
generateCountTable <- function(mu_ij_matx_rep, matx_dispersion_rep) {
message("k_ij ~ Nbinom(mu_ij, dispersion)")
n_genes <- dim(mu_ij_matx_rep)[1]
n_samples <- dim(mu_ij_matx_rep)[2]
n_samplings <- prod(n_genes * n_samples)
mat_countsTable <- rnbinom(n_samplings,
size = matx_dispersion_rep,
mu = mu_ij_matx_rep) %>%
matrix(nrow = n_genes, ncol = n_samples)
colnames(mat_countsTable) <- colnames(mu_ij_matx_rep)
rownames(mat_countsTable) <- rownames(mu_ij_matx_rep)
mat_countsTable[is.na(mat_countsTable)] <- 0
return(mat_countsTable)
}
#' Perform RNA-seq simulation
#'
#' Simulates RNA-seq data based on the input variables.
#'
#' @param list_var List of input variables
#' @param n_genes Number of genes
#' @param min_replicates Minimum replication count
#' @param max_replicates Maximum replication count
#' @param sequencing_depth Sequencing depth
#' @param basal_expression base expression gene
#' @param dispersion User-provided dispersion vector (optional)
#' @return List containing the ground truth, counts, and metadata
#' @export
#' @examples
#' mock_rnaseq(list_var = init_variable(),
#' n_genes = 1000, min_replicates = 2,
#' max_replicates = 4)
mock_rnaseq <- function(list_var, n_genes, min_replicates, max_replicates, sequencing_depth = NULL,
basal_expression = 0 , dispersion = stats::runif(n_genes, min = 0, max = 1000) ) {
## -- get my effect
df_inputSimulation <- getInput2simulation(list_var, n_genes)
## -- add column logQij
df_inputSimulation <- getLog_qij(df_inputSimulation)
df_inputSimulation <- addBasalExpression(df_inputSimulation, n_genes, basal_expression)
df_inputSimulation <- getMu_ij(df_inputSimulation )
message("Building mu_ij matrix")
## -- matrix
matx_Muij <- getMu_ij_matrix(df_inputSimulation)
l_sampleID <- getSampleID(list_var)
matx_bool_replication <- generateReplicationMatrix(list_var, min_replicates, max_replicates)
mu_ij_matx_rep <- .replicateMatrix(matx_Muij, matx_bool_replication)
dispersion <- getValidDispersion(dispersion)
genes_dispersion <- sample(dispersion , size = n_genes, replace = T)
matx_dispersion <- getDispersionMatrix(list_var, n_genes, genes_dispersion)
l_geneID = base::paste("gene", 1:n_genes, sep = "")
names(genes_dispersion) <- l_geneID
## same order as mu_ij_matx_rep
matx_dispersion <- matx_dispersion[ order(row.names(matx_dispersion)), ]
matx_dispersion_rep <- .replicateMatrix(matx_dispersion, matx_bool_replication)
matx_countsTable <- generateCountTable(mu_ij_matx_rep, matx_dispersion_rep)
message("Counts simulation: Done")
dtf_countsTable <- matx_countsTable %>% as.data.frame()
if (!is.null(sequencing_depth)) {
message("Scaling count table according to sequencing depth.")
dtf_countsTable <- scaleCountsTable(dtf_countsTable, sequencing_depth)
}
metaData <- getSampleMetadata(list_var, n_genes, matx_bool_replication)
libSize <- sum(colSums(dtf_countsTable))
settings_df <- getSettingsTable(n_genes, min_replicates, max_replicates, libSize)
list2ret <- list(
settings = settings_df,
init = list_var,
groundTruth = list(effects = df_inputSimulation, gene_dispersion = genes_dispersion),
counts = dtf_countsTable,
metadata = metaData)
return(list2ret)
}
#' Validate and Filter Dispersion Values
#'
#' This function takes an input vector and validates it to ensure that it meets certain criteria.
#'
#' @param input_vector A vector to be validated.
#' @return A validated and filtered numeric vector.
#' @details The function checks whether the input is a vector, suppresses warnings while converting to numeric,
#' and filters out non-numeric elements. It also checks for values greater than zero and removes negative values.
#' If the resulting vector has a length of zero, an error is thrown.
#' @examples
#' getValidDispersion(c(0.5, 1.2, -0.3, "invalid", 0.8))
#' @export
getValidDispersion <- function(input_vector) {
# Verify if it's a vector
if (!is.vector(input_vector)) {
stop("dispersion param is not a vector.")
}
input_vector <- suppressWarnings(as.numeric(input_vector))
# Filter numeric elements
numeric_elements <- !is.na(input_vector)
if (sum(!numeric_elements) > 0) {
message("Non-numeric elements were removed from the dispersion vector")
input_vector <- input_vector[numeric_elements]
}
# Check and filter values > 0
numeric_positive_elements <- input_vector > 0
if (sum(!numeric_positive_elements) > 0) {
message("Negative numeric values were removed from the dispersion vector")
input_vector <- input_vector[numeric_positive_elements]
}
if (length(input_vector) == 0) stop("Invalid dispersion values provided.")
return(input_vector)
}
#' Generate replication matrix
#'
#' Generates the replication matrix based on the minimum and maximum replication counts.
#'
#' @param list_var Number of samples
#' @param min_replicates Minimum replication count
#' @param max_replicates Maximum replication count
#' @return Replication matrix
#' @export
#' @examples
#' list_var = init_variable()
#' generateReplicationMatrix(list_var, min_replicates = 2, max_replicates = 4)
generateReplicationMatrix <- function(list_var, min_replicates, max_replicates) {
if (min_replicates > max_replicates) {
message("min_replicates > max_replicates have been supplied")
message("Automatic reversing")
tmp_min_replicates <- min_replicates
min_replicates <- max_replicates
max_replicates <- tmp_min_replicates
}
l_sampleIDs <- getSampleID(list_var)
n_samples <- length(l_sampleIDs)
return(getReplicationMatrix(min_replicates, max_replicates, n_samples = n_samples))
}
#' Replicate matrix
#'
#' Replicates a matrix based on a replication matrix.
#'
#' @param matrix Matrix to replicate
#' @param replication_matrix Replication matrix
#' @return Replicated matrix
#' @export
#' @examples
#' matrix <- matrix(1:9, nrow = 3, ncol = 3)
#' replication_matrix <- matrix(TRUE, nrow = 3, ncol = 3)
#' .replicateMatrix(matrix, replication_matrix)
.replicateMatrix <- function(matrix, replication_matrix) {
n_genes <- dim(matrix)[1]
rep_list <- colSums(replication_matrix)
replicated_indices <- rep(seq_len(ncol(matrix)), times = rep_list)
replicated_matrix <- matrix[, replicated_indices, drop = FALSE]
suffix_sampleID <- sequence(rep_list)
colnames(replicated_matrix) <- paste(colnames(replicated_matrix), suffix_sampleID, sep = "_")
return(replicated_matrix)
}
# Test case: Valid input vector with numeric and positive values
test_that("Valid input vector with numeric and positive values", {
input_vector <- c(0.5, 1.2, 0.8)
result <- getValidDispersion(input_vector)
expect_identical(result, input_vector)
})
# Test case: Valid input vector with numeric, positive, and negative values
test_that("Valid input vector with numeric, positive, and negative values", {
input_vector <- c(0.5, -0.3, 1.2, 0.8)
result <- getValidDispersion(input_vector)
expect_identical(result, c(0.5, 1.2, 0.8))
})
# Test case: Valid input vector with non-numeric elements
test_that("Valid input vector with non-numeric elements", {
input_vector <- c(0.5, "invalid", 0.8)
result <- getValidDispersion(input_vector)
expect_identical(result, c(0.5, 0.8))
})
# Test case: Empty input vector
test_that("Empty input vector", {
input_vector <- numeric(0)
expect_error(getValidDispersion(input_vector), "Invalid dispersion values provided.")
})
# Test case: unique value in vector
test_that("unique value in vector", {
input_vector <- 5
expect_equal(getValidDispersion(input_vector), 5)
})
# Test case: All negative values
test_that("All negative values", {
input_vector <- c(-0.5, -1.2, -0.8)
expect_error(getValidDispersion(input_vector), "Invalid dispersion values provided.")
})
# Test for .isDispersionMatrixValid
test_that(".isDispersionMatrixValid returns TRUE for valid dimensions", {
matx_dispersion <- matrix(1:6, nrow = 2, ncol = 3)
matx_bool_replication <- matrix(TRUE, nrow = 2, ncol = 3)
expect_true(.isDispersionMatrixValid(matx_dispersion, matx_bool_replication))
})
test_that(".isDispersionMatrixValid throws an error for invalid dimensions", {
matx_dispersion <- matrix(1:4, nrow = 2, ncol = 2)
matx_bool_replication <- matrix(TRUE, nrow = 2, ncol = 3)
expect_false(.isDispersionMatrixValid(matx_dispersion, matx_bool_replication))
})
# Test for generateCountTable
test_that("generateCountTable generates count table with correct dimensions", {
mu_ij_matx_rep <- matrix(1:6, nrow = 2, ncol = 3)
matx_dispersion_rep <- matrix(1:6, nrow = 2, ncol = 3)
count_table <- generateCountTable(mu_ij_matx_rep, matx_dispersion_rep)
expect_equal(dim(count_table), c(2, 3))
})
# Test for .replicateMatrix
test_that(".replicateMatrix replicates matrix correctly", {
matrix <- matrix(1:9, nrow = 3, ncol = 3)
replication_matrix <- matrix(TRUE, nrow = 3, ncol = 3)
replicated_matrix <- .replicateMatrix(matrix, replication_matrix)
expect_equal(dim(replicated_matrix), c(3, 9))
})
# Test for mock_rnaseq
#test_that("mock_rnaseq returns expected output", {
# Set up input variables
# list_var <- NULL
# n_genes <- 3
# min_replicates <- 2
# max_replicates <- 4
# df_inputSimulation <- data.frame(gene_id = 1:3, coef_value = c(0.5, 0.3, 0.2))
# matx_dispersion <- matrix(1:9, nrow = 3, ncol = 3)
# Run the function
# expect_error(mock_rnaseq(list_var, n_genes, min_replicates, max_replicates, df_inputSimulation,
# matx_dispersion))
#list_var <- init_variable(name = "my_var", mu = c(10, 20), level = 2 )
#n_genes <- 10
#min_replicates <- 2
#max_replicates <- 4
#scaling_factor <- 1
#df_inputSimulation <- getInput2simulation(list_var, n_genes)
#dispersion <- getDispersionMatrix(list_var, n_genes, c(1000, 1000, 1000, 1000, 1000, 1, 1, 1, 1, 1))
#mock_rnaseq(list_var, n_genes, min_replicates,
# max_replicates,
# df_inputSimulation, dispersion)
#ERROOR
#})
# Test for generateReplicationMatrix
test_that("generateReplicationMatrix generates replication matrix correctly", {
replication_matrix <- generateReplicationMatrix(init_variable(),min_replicates = 2, max_replicates = 4)
expect_equal(dim(replication_matrix), c(4, 2))
})