diff --git a/src/v4/HTRSIM/dev/flat_full_bis.Rmd b/src/v4/HTRSIM/dev/flat_full_bis.Rmd index 4c09449a09b51445d71bf1c81ec25e3c9a11a7b8..74e04aeb221c405bc62b4df6a68ce49022e033ef 100644 --- a/src/v4/HTRSIM/dev/flat_full_bis.Rmd +++ b/src/v4/HTRSIM/dev/flat_full_bis.Rmd @@ -24,22 +24,6 @@ pkgload::load_all(export_all = FALSE) ``` -# Initialize variable to simulate - -<!-- -Create a chunk for the core of the function - -- The chunk needs to be named `function` at least -- It contains the code of a documented function -- The chunk can also be named `function-my_median` to make it easily -findable in your Rmd -- Let the `@examples` part empty, and use the next `examples` chunk instead to present reproducible examples - -After inflating the template - -- This function code will automatically be added in a new file in the "R/" directory ---> - ```{r function-utils, filename = "utils"} #' Join two data frames using data.table #' @@ -344,13 +328,15 @@ fillInVariable <- function(name, mu, sd, level) { #' @param metaData A list of labels. #' @param effectsGivenByUser A list of effects given by the user. #' @param col_names A character vector specifying the column names to use. +#' @importFrom utils tail + #' #' @return A list with the sub-object details. build_sub_obj_return_to_user <- function(level, metaData, effectsGivenByUser, col_names) { sub_obj <- list(level = level) data <- cbind(metaData, effectsGivenByUser) %>% as.data.frame() colnames(data) <- col_names - var_name <- tail(col_names, n = 1) + var_name <- utils::tail(col_names, n = 1) data[, var_name] <- as.numeric(data[, var_name]) sub_obj$data <- data return(sub_obj) @@ -1037,8 +1023,182 @@ test_that("getDataFromUser renvoie les données appropriées", { }) ``` +```{r function-setCorrelation, filename = "setCorrelation"} + +#' Compute Covariation from Correlation and Standard Deviations +#' +#' This function computes the covariation between two variables (A and B) given their correlation and standard deviations. +#' +#' @param corr_AB The correlation coefficient between variables A and B. +#' @param sd_A The standard deviation of variable A. +#' @param sd_B The standard deviation of variable B. +#' +#' @return The covariation between variables A and B. +#' @export +#' @examples +#' corr <- 0.7 +#' sd_A <- 3 +#' sd_B <- 4 +#' compute_covariation(corr, sd_A, sd_B) +compute_covariation <- function(corr_AB, sd_A, sd_B) { + cov_AB <- corr_AB * sd_A * sd_B + return(cov_AB) +} + + +#' Get Standard Deviations for Variables in Correlation +#' +#' This function extracts the standard deviations for the variables involved in the correlation. +#' +#' @param list_var A list containing the variables and their attributes. +#' @param between_var A character vector containing the names of the variables involved in the correlation. +#' +#' @return A numeric vector containing the standard deviations for the variables in the correlation. +#' @export +#' @examples +#' list_var <- init_variable(name = "varA", mu = 0, sd = 5, level = 3) %>% +#' init_variable(name = "varB", mu = 0, sd = 25, level = 3) +#' between_var <- c("varA", "varB") +#' getStandardDeviationInCorrelation(list_var, between_var) +getStandardDeviationInCorrelation <- function(list_var, between_var){ + for (var in between_var) sd_List <- getGivenAttribute(list_var, "sd") + for (var in between_var) sd_ListFromInteraction <- getGivenAttribute(list_var$interactions, "sd") + sd_List <- c(sd_List, sd_ListFromInteraction) + return(unname(unlist(sd_List[between_var]))) +} + + + +#' Set Correlation between Variables +#' +#' Set the correlation between two or more variables in a simulation. +#' +#' @param list_var A list containing the variables used in the simulation, initialized using \code{\link{init_variable}}. +#' @param between_var Character vector specifying the names of the variables to set the correlation between. +#' @param corr Numeric value specifying the desired correlation between the variables. +#' +#' @return Updated \code{list_var} with the specified correlation set between the variables. +#' +#' @details The function checks if the variables specified in \code{between_var} are declared and initialized in the \code{list_var}. It also ensures that at least two variables with provided standard deviation are required to set a correlation in the simulation. +#' The specified correlation value must be within the range (-1, 1). The function computes the corresponding covariance between the variables based on the specified correlation and standard deviations. +#' The correlation information is then added to the \code{list_var} in the form of a data frame containing the correlation value and the corresponding covariance value. +#' @export +#' @examples +#' list_var <- init_variable(name = "varA", mu = 0, sd = 5, level = 3) %>% +#' init_variable(name = "varB", mu = 0, sd = 25, level = 3) +#' list_var <- set_correlation(list_var, between_var = c("varA", "varB"), corr = 0.7) +set_correlation <- function(list_var, between_var, corr) { + + # Check if variables in between_var are declared and initialized + bool_checkBetweenVarValidity <- function(between_var, list_var) { + nb_varInCorrelation <- length(unique(between_var)) + stopifnot(nb_varInCorrelation > 1) + # -- check also for interaction + varInitialized <- c(getListVar(list_var), getListVar(list_var$interactions)) + existingVar_nb <- varInitialized %in% between_var %>% sum() + if (existingVar_nb != nb_varInCorrelation) { + return(FALSE) + } else { + return(TRUE) + } + } + + name_correlation <- paste(between_var, collapse = ".") + bool_valid_corr <- bool_checkBetweenVarValidity(between_var, list_var) + if (!bool_valid_corr) { + stop("At least one variable in between_var is not declared. Variable not initialized cannot be used in a correlation.") + } + + vec_standardDev <- getStandardDeviationInCorrelation(list_var, between_var) + if (length(vec_standardDev) < 2) { + stop("Exactly two variables with provided standard deviation are required to set a correlation in simulation.") + } + # Validate the specified correlation value to be within the range [-1, 1] + if (corr < -1 || corr > 1) { + stop("Invalid correlation value. Correlation must be in the range [-1, 1].") + } + + name_interaction <- paste(between_var, collapse = ":") + corr <- data.frame(cor = corr, covar = compute_covariation(corr, vec_standardDev[1], vec_standardDev[2] )) + list_var$correlations[[name_correlation]] <- corr + return(list_var) +} + + +``` + +```{r test-setcorrelation} + +test_that("compute_covariation returns the correct covariation", { + # Test case 1: Positive correlation + corr <- 0.7 + sd_A <- 3 + sd_B <- 4 + expected_cov <- corr * sd_A * sd_B + actual_cov <- compute_covariation(corr, sd_A, sd_B) + expect_equal(actual_cov, expected_cov) + + # Test case 2: Negative correlation + corr <- -0.5 + sd_A <- 2.5 + sd_B <- 3.5 + expected_cov <- corr * sd_A * sd_B + actual_cov <- compute_covariation(corr, sd_A, sd_B) + expect_equal(actual_cov, expected_cov) + + # Test case 3: Zero correlation + corr <- 0 + sd_A <- 1 + sd_B <- 2 + expected_cov <- corr * sd_A * sd_B + actual_cov <- compute_covariation(corr, sd_A, sd_B) + expect_equal(actual_cov, expected_cov) +}) + + +# Unit tests for getStandardDeviationInCorrelation +test_that("getStandardDeviationInCorrelation returns correct standard deviations", { + + # Initialize list_var + list_var <- init_variable(name = "varA", mu = 0, sd = 5, level = 3) %>% + init_variable(name = "varB", mu = 0, sd = 25, level = 3) + + # Test case 1: Two variables correlation + between_var_1 <- c("varA", "varB") + sd_expected_1 <- c(5, 25) + sd_result_1 <- getStandardDeviationInCorrelation(list_var, between_var_1) + expect_equal(sd_result_1, sd_expected_1) + +}) + + + +test_that("set_correlation sets the correlation between variables correctly", { + # Initialize variables in the list_var + list_var <- init_variable(name = "varA", mu = 0, sd = 5, level = 3) %>% + init_variable(name = "varB", mu = 0, sd = 25, level = 3) + + # Test setting correlation between varA and varB + list_var <- set_correlation(list_var, between_var = c("varA", "varB"), corr = 0.7) + + corr_result <- list_var$correlations$varA.varB$cor + covar_result <- list_var$correlations$varA.varB$covar + expect_equal(corr_result, 0.7) + expect_equal(covar_result, 87.5) + + # Test setting correlation between varA and varC (should raise an error) + expect_error(set_correlation(list_var, between_var = c("varA", "varC"), corr = 0.8), + "At least one variable in between_var is not declared. Variable not initialized cannot be used in a correlation.") + + # Test setting correlation with invalid correlation value + expect_error(set_correlation(list_var, between_var = c("varA", "varB"), corr = 1.5)) + + # Test setting correlation with less than 2 variables with provided standard deviation + expect_error(set_correlation(list_var, between_var = c("varA"), corr = 0.7)) +}) + -## Perform simulation from data +``` ```{r function-simulation , filename = "simulation"} #' Get input for simulation based on coefficients @@ -1127,7 +1287,6 @@ getLog_qij <- function(dtf_coef) { #' from the coefficient data frame and multiplying it by the provided scaling factor. #' #' @param dtf_coef Coefficient data frame containing the log_qij values -#' @param scaling_factor Scaling factor to multiply the calculated mu_ij values #' #' @return Coefficient data frame with an additional mu_ij column #' @@ -1153,6 +1312,7 @@ getMu_ij <- function(dtf_coef) { #' #' @param dtf_coef A dataframe containing the coefficients. #' @importFrom reshape2 dcast +#' @importFrom stats as.formula #' @export #' @return A Mu_ij matrix. @@ -1163,8 +1323,8 @@ getMu_ij_matrix <- function(dtf_coef) { str_formula_rigth <- paste(l_var, collapse = " + ") if (str_formula_rigth == "") stop("no variable label detected") str_formula <- paste(c("geneID", str_formula_rigth), collapse = " ~ ") - formula <- as.formula(str_formula) - dtf_Muij <- dtf_coef %>% reshape2::dcast(., formula, value.var = "mu_ij", drop = F) + formula <- stats::as.formula(str_formula) + dtf_Muij <- dtf_coef %>% reshape2::dcast(formula = formula, value.var = "mu_ij", drop = F) dtf_Muij[is.na(dtf_Muij)] <- 0 mtx_Muij <- data.frame(dtf_Muij[, -1], row.names = dtf_Muij[, 1]) %>% as.matrix() mtx_Muij <- mtx_Muij[, order(colnames(mtx_Muij)), drop = F] @@ -1179,13 +1339,15 @@ getMu_ij_matrix <- function(dtf_coef) { #' @param matx_dispersion The dispersion matrix. #' @param replicateID The replication identifier. #' @param l_bool_replication A boolean vector indicating the replicates. +#' @importFrom stats rnbinom +#' #' @return A subcounts table. getSubCountsTable <- function(matx_Muij, matx_dispersion, replicateID, l_bool_replication) { getKijMatrix <- function(matx_Muij, matx_dispersion, n_genes, n_samples) { k_ij <- stats::rnbinom(n_genes * n_samples, size = matx_dispersion, mu = matx_Muij) %>% - matrix(., nrow = n_genes, ncol = n_samples) + matrix(nrow = n_genes, ncol = n_samples) k_ij[is.na(k_ij)] <- 0 return(k_ij) @@ -1271,7 +1433,6 @@ test_that("getSubCountsTable returns the correct output", { ``` -## Simulation ```{r function-simulation2 , filename = "simulation2"} @@ -1576,7 +1737,6 @@ test_that("getDispersionMatrix returns the correct dispersion matrix", { ``` -## Mock RNAseq ```{r function-mock , filename = "mock-rnaSeq" } @@ -1617,8 +1777,10 @@ generateCountTable <- function(mu_ij_matx_rep, matx_dispersion_rep) { n_genes <- dim(mu_ij_matx_rep)[1] n_samples <- dim(mu_ij_matx_rep)[2] n_samplings <- prod(n_genes * n_samples) - mat_countsTable <- rnbinom(n_samplings, size = matx_dispersion_rep, mu = mu_ij_matx_rep) %>% - matrix(., nrow = n_genes, ncol = n_samples) + mat_countsTable <- rnbinom(n_samplings, + size = matx_dispersion_rep, + mu = mu_ij_matx_rep) %>% + matrix(nrow = n_genes, ncol = n_samples) colnames(mat_countsTable) <- colnames(mu_ij_matx_rep) rownames(mat_countsTable) <- rownames(mu_ij_matx_rep) mat_countsTable[is.na(mat_countsTable)] <- 0 @@ -1636,7 +1798,7 @@ generateCountTable <- function(mu_ij_matx_rep, matx_dispersion_rep) { #' @param max_replicates Maximum replication count #' @param sequencing_depth Sequencing depth #' @param basal_expression base expression gene -#' @param matx_dispersion User-provided dispersion matrix (optional) +#' @param dispersion User-provided dispersion vector (optional) #' @return List containing the ground truth, counts, and metadata #' @export #' @examples @@ -1902,8 +2064,6 @@ test_that("generateReplicationMatrix generates replication matrix correctly", { ``` -## Prepare data - ```{r function-preparingData , filename = "prepare_data2fit"} #' Convert count matrix to long data frame @@ -2008,6 +2168,8 @@ prepareData2fit <- function(countMatrix, metadata, normalization = TRUE , respon #' computes the average log expression for each gene, and then scales each #' sample's counts by the exponential of the difference between its average log #' expression and the median of those averages. +#' +#' @importFrom median #' #' @examples #' counts <- matrix(c(100, 200, 300, 1000, 1500, 2500), ncol = 2) @@ -2025,7 +2187,7 @@ medianRatioNormalization <- function(countsMatrix) { average_log <- average_log[idx2keep] ratio_data <- sweep(log_data[idx2keep, ], 1, average_log, "-") - sample_medians <- apply(ratio_data, 2, median) + sample_medians <- apply(ratio_data, 2, stats::median) scaling_factors <- exp(sample_medians) countsMatrix_normalized <- sweep(countsMatrix, 2, scaling_factors, "/") @@ -2146,6 +2308,83 @@ isValidInput2fit <- function(data2fit, formula){ } +#' Drop Random Effects from a Formula +#' +#' This function allows you to remove random effects from a formula by specifying +#' which terms to drop. It checks for the presence of vertical bars ('|') in the +#' terms of the formula and drops the random effects accordingly. If all terms +#' are random effects, the function updates the formula to have only an intercept. +#' +#' @param form The formula from which random effects should be dropped. +#' +#' @return A modified formula with specified random effects dropped. +#' +#' @examples +#' # Create a formula with random effects +#' formula <- y ~ x1 + (1 | group) + (1 | subject) +#' # Drop the random effects related to 'group' +#' modified_formula <- drop_randfx(formula) +#' +#' @importFrom stats terms +#' @importFrom stats update +#' +#' @export +drop_randfx <- function(form) { + form.t <- stats::terms(form) + dr <- grepl("|", labels(form.t), fixed = TRUE) + if (mean(dr) == 1) { + form.u <- stats::update(form, . ~ 1) + } else { + if (mean(dr) == 0) { + form.u <- form + } else { + form.td <- stats::drop.terms(form.t, which(dr)) + form.u <- stats::update(form, form.td) + } + } + form.u +} + + + +#' Check if a Model Matrix is Full Rank +#' +#' This function checks whether a model matrix is full rank, which is essential for +#' certain statistical analyses. It computes the eigenvalues of the crossproduct +#' of the model matrix and determines if the first eigenvalue is positive and if +#' the ratio of the last eigenvalue to the first is within a defined tolerance. +#' +#' This function is inspired by a similar function found in the Limma package. +#' +#' @param metadata The metadata used to create the model matrix. +#' @param formula The formula used to specify the model matrix. +#' +#' @return \code{TRUE} if the model matrix is full rank, \code{FALSE} otherwise. +#' +#' @examples +#' metadata <- data.frame(x = rnorm(10), y = rnorm(10)) +#' formula <- y ~ x +#' is_fullrank(metadata, formula) +#' +#' +#' @importFrom stats model.matrix +#' @export +is_fullrank <- function(metadata, formula) { + ## drop random eff + formula <- drop_randfx(formula) + ## TEST + model_matrix <- stats::model.matrix(data = metadata, formula) + e <- eigen(crossprod(model_matrix), symmetric = TRUE, only.values = TRUE)$values + modelFullRank <- e[1] > 0 && abs(e[length(e)] / e[1]) > 1e-13 + + if (!modelFullRank) + stop("The model matrix is not full rank, so the model cannot be fit as specified. One or more variables or interaction terms in the design formula are linear combinations of the others and must be removed.") + + return(TRUE) +} + + + #' Fit a model using the fitModel function. @@ -2244,6 +2483,7 @@ launchFit <- function(group, group_by, formula, data, ...) { #' @param log_file File to write log (default : log.txt) #' @param ... Additional arguments to be passed to the glmmTMB::glmmTMB function #' @return List of fitted model objects or NULL for any errors +#' @importFrom stats setNames #' @export #' @examples #' .parallel_fit(group_by = "Species", "setosa", @@ -2254,11 +2494,11 @@ launchFit <- function(group, group_by, formula, data, ...) { clust <- parallel::makeCluster(n.cores, outfile = log_file) parallel::clusterExport(clust, c(".subsetData_andfit", ".fitModel"), envir=environment()) - results_fit <- parallel::parLapply(clust, X = setNames(groups, groups), fun = launchFit, + results_fit <- parallel::parLapply(clust, X = stats::setNames(groups, groups), fun = launchFit, group_by = group_by, formula = formula, data = data, ...) parallel::stopCluster(clust) - closeAllConnections() + #closeAllConnections() return(results_fit) } @@ -2278,7 +2518,11 @@ launchFit <- function(group, group_by, formula, data, ...) { #' fitModelParallel(formula = Sepal.Length ~ Sepal.Width + Petal.Length, #' data = iris, group_by = "Species", n.cores = 1) fitModelParallel <- function(formula, data, group_by, n.cores = NULL, log_file = "log.txt", ...) { + + ## SOme verification isValidInput2fit(data, formula) + is_fullrank(data, formula) + groups <- unique(data[[group_by]]) # Fit models in parallel and capture the results @@ -2340,6 +2584,68 @@ test_that(".fitModel returns a fitted model object", { }) + +# Test if random effects are dropped correctly +test_that("Drop random effects from formula", { + formula <- y ~ x1 + (1 | group) + (1 | subject) + modified_formula <- drop_randfx(formula) + expect_equal(deparse(modified_formula), "y ~ x1") +}) + +# Test if formula with no random effects remains unchanged +test_that("Keep formula with no random effects unchanged", { + formula <- y ~ x1 + x2 + modified_formula <- drop_randfx(formula) + expect_equal(deparse(modified_formula), "y ~ x1 + x2") +}) + +# Test if all random effects are dropped to intercept +test_that("Drop all random effects to intercept", { + formula <- ~ (1 | group) + (1 | subject) + modified_formula <- drop_randfx(formula) + expect_equal(deparse(modified_formula), ". ~ 1") +}) + + +# Test if a full-rank model matrix is identified correctly +test_that("Identify full-rank model matrix", { + metadata <- data.frame(x = rnorm(10), y = rnorm(10)) + formula <- y ~ x + expect_true(is_fullrank(metadata, formula)) +}) + +# Test if a rank-deficient model matrix is detected and throws an error +test_that("Detect rank-deficient model matrix and throw error", { + metadata <- data.frame(x = factor(rep(c("xA","xB"),each = 5)), + w = factor(rep(c("wA","wB"), each = 5)), + z = factor(rep(c("zA","zB"), each = 5)), + y = rnorm(10)) + formula <- y ~ x + w + z + y:w + expect_error(is_fullrank(metadata, formula), + regexp = "The model matrix is not full rank, so the model cannot be fit as specified.") +}) + +# Test if a rank-deficient model matrix is detected and throws an error +test_that("Detect rank-deficient model matrix and throw error (with random eff)", { + metadata <- data.frame(x = factor(rep(c("xA","xB"),each = 5)), + w = factor(rep(c("wA","wB"), each = 5)), + z = factor(rep(c("zA","zB"), each = 5)), + y = rnorm(10)) + formula <- y ~ x + w + z + y:w + (1 | w) + expect_error(is_fullrank(metadata, formula), + regexp = "The model matrix is not full rank, so the model cannot be fit as specified.") +}) + +# Test if a rank-deficient model matrix is detected and throws an error +test_that("Identify full-rank model matrix (with random eff)", { + metadata <- data.frame(x = factor(rep(c("xA","xB"),each = 5)), + w = factor(rep(c("wA","wB"), each = 5)), + z = factor(rep(c("zA","zB"), each = 5)), + y = rnorm(10)) + formula <- y ~ x + (1 | w) + expect_true(is_fullrank(metadata, formula)) +}) + #test_that(".fitMixteModel returns a fitted mixed-effects model object or NULL if there was an error", { # data(mtcars) # formula <- mpg ~ cyl + disp + (1|gear) @@ -2528,6 +2834,9 @@ test_that("fitModelParallel fits models in parallel for each group and returns a updateParallel <- function(formula, l_tmb, n.cores = NULL, log_file = "log.txt", ...) { isValidInput2fit(l_tmb[[1]]$frame, formula) + + is_fullrank(l_tmb[[1]]$frame, formula) + # Fit models update in parallel and capture the results results <- .parallel_update(formula, l_tmb, n.cores, log_file, ...) return(results) @@ -2560,7 +2869,7 @@ updateParallel <- function(formula, l_tmb, n.cores = NULL, log_file = "log.txt", parallel::clusterExport(clust, c("launchUpdate", "fitUpdate"), envir=environment()) updated_res <- parallel::parLapply(clust, X = l_tmb, fun = launchUpdate , formula = formula, ...) parallel::stopCluster(clust) - closeAllConnections() + #closeAllConnections() return(updated_res) } @@ -2920,7 +3229,11 @@ removeDuplicatedWord <- function(strings){ #' @return A data frame with the correlation values and corresponding interaction names. #' @export #' @examples -#' mat <- matrix(c(1, 0.7, 0.5, 0.7, 1, 0.3, 0.5, 0.3, 1), nrow = 3, dimnames = list(c("A", "B", "C"), c("A", "B", "C"))) +#' mat <- matrix(c(1, 0.7, 0.5, 0.7, +#' 1, 0.3, 0.5, 0.3, 1), +#' nrow = 3, +#' dimnames = list(c("A", "B", "C"), +#' c("A", "B", "C"))) #' correlation_matrix_2df(mat) correlation_matrix_2df <- function(corr_matrix){ vec_corr <- corr_matrix[lower.tri(corr_matrix)] @@ -2951,7 +3264,8 @@ correlation_matrix_2df <- function(corr_matrix){ #' @return A data frame containing the standard deviation and correlation components for the specified grouping factor. #' @export #' @examples -#' model <- glmmTMB::glmmTMB(Sepal.Length ~ Sepal.Width + Petal.Length + (1|Species), data = iris, family = gaussian) +#' model <- glmmTMB::glmmTMB(Sepal.Length ~ Sepal.Width + Petal.Length + (1|Species), +#' data = iris, family = gaussian) #' var_cor <- summary(model)$varcor$cond #' ran_pars_df <- wrapper_var_cor(var_cor, "Species") wrapper_var_cor <- function(var_cor, elt){ @@ -2988,6 +3302,7 @@ wrapper_var_cor <- function(var_cor, elt){ #' @param x A glmmTMB model object. #' @return A data frame containing the random parameters and their estimates. #' @export +#' @importFrom stats setNames #' @examples #' model <- glmmTMB::glmmTMB(Sepal.Length ~ Sepal.Width + Petal.Length + (1|Species), data = iris, #' family = gaussian) @@ -2995,7 +3310,7 @@ wrapper_var_cor <- function(var_cor, elt){ extract_ran_pars <- function(x){ ss <- summary(x) l_2parcour <- c("cond", "zi") - l_res = lapply(setNames(l_2parcour, l_2parcour), + l_res = lapply(stats::setNames(l_2parcour, l_2parcour), function(elt) { var_cor <- ss$varcor[[elt]] @@ -3240,15 +3555,16 @@ test_that("tidy_tmb returns expected output",{ #' @param l_tmb A list of glmmTMB models or a unique glmmTMB obj model #' @return A DataFrame with the summary statistics for all the glmmTMB models in the list. #' @export -#' +#' @importFrom stats setNames #' @examples #' data(mtcars) -#' models <- fitModelParallel(Sepal.Length ~ Sepal.Width + Petal.Length, group_by = "Species",n.cores = 1, data = iris) +#' models <- fitModelParallel(Sepal.Length ~ Sepal.Width + Petal.Length, +#' group_by = "Species",n.cores = 1, data = iris) #' result <- glance_tmb(models) glance_tmb <- function(l_tmb){ if (identical(class(l_tmb), "glmmTMB")) return(getGlance(l_tmb)) l_group <- attributes(l_tmb)$names - l_glance <- lapply(setNames(l_group, l_group), function(group) getGlance(l_tmb[[group]])) + l_glance <- lapply(stats::setNames(l_group, l_group), function(group) getGlance(l_tmb[[group]])) return(do.call("rbind", l_glance)) } @@ -3330,7 +3646,8 @@ test_that("getGlance returns the summary statistics for a single model", { #' #' @examples #' data(iris) -#' models <- fitModelParallel(Sepal.Length ~ Sepal.Width + Petal.Length, group_by = "Species",n.cores = 1, data = iris) +#' models <- fitModelParallel(Sepal.Length ~ Sepal.Width + Petal.Length, +#' group_by = "Species",n.cores = 1, data = iris) #' glance_df <- glance_tmb(models) #' glance_df$group_id <- rownames(glance_df) #' subset_glance(glance_df, c("AIC", "BIC")) @@ -3437,6 +3754,7 @@ test_that("metrics_plot returns a ggplot object", { #' #' @param TMB_dispersion_df A data frame containing dispersion values from TMB. #' @param DESEQ_dispersion_df A data frame containing dispersion values from DESeq2. +#' @param color2use vector of color use for points coloration #' #' @return A list containing a dispersion plot and a data frame with dispersion comparison. #' @importFrom ggplot2 scale_color_manual @@ -3444,7 +3762,7 @@ test_that("metrics_plot returns a ggplot object", { #' #' @examples #' \dontrun{ -#' disp_comparison <- evaluateDispersion(TMB_dispersion_df, DESEQ_dispersion_df) +#' disp_comparison <- evaluateDispersion(TMB_dispersion_df, DESEQ_dispersion_df, "red") #' plot_dispersion <- disp_comparison$disp_plot #' comparison_df <- disp_comparison$data #' } @@ -3667,7 +3985,6 @@ test_that("evaluateDispersion function works correctly", { -## Sequencing depth ```{r function-seqDepth, filename = "scalingSequencingDepth"} @@ -3724,8 +4041,6 @@ test_that("Valid scaling of counts table", { ``` -## Gene expression - ```{r function-geneExpressionScaling, filename = "scalingGeneExpression"} @@ -3886,191 +4201,17 @@ test_that("getBinExpression handles negative values correctly", { ``` -```{r function-setCorrelation, filename = "setCorrelation"} -#' Compute Covariation from Correlation and Standard Deviations -#' -#' This function computes the covariation between two variables (A and B) given their correlation and standard deviations. -#' -#' @param corr_AB The correlation coefficient between variables A and B. -#' @param sd_A The standard deviation of variable A. -#' @param sd_B The standard deviation of variable B. -#' -#' @return The covariation between variables A and B. -#' @export -#' @examples -#' corr <- 0.7 -#' sd_A <- 3 -#' sd_B <- 4 -#' compute_covariation(corr, sd_A, sd_B) -compute_covariation <- function(corr_AB, sd_A, sd_B) { - cov_AB <- corr_AB * sd_A * sd_B - return(cov_AB) -} - - -#' Get Standard Deviations for Variables in Correlation -#' -#' This function extracts the standard deviations for the variables involved in the correlation. -#' -#' @param list_var A list containing the variables and their attributes. -#' @param between_var A character vector containing the names of the variables involved in the correlation. -#' -#' @return A numeric vector containing the standard deviations for the variables in the correlation. -#' @export -#' @examples -#' list_var <- init_variable(name = "varA", mu = 0, sd = 5, level = 3) %>% -#' init_variable(name = "varB", mu = 0, sd = 25, level = 3) -#' between_var <- c("varA", "varB") -#' getStandardDeviationInCorrelation(list_var, between_var) -getStandardDeviationInCorrelation <- function(list_var, between_var){ - for (var in between_var) sd_List <- getGivenAttribute(list_var, "sd") - for (var in between_var) sd_ListFromInteraction <- getGivenAttribute(list_var$interactions, "sd") - sd_List <- c(sd_List, sd_ListFromInteraction) - return(unname(unlist(sd_List[between_var]))) -} - - - -#' Set Correlation between Variables -#' -#' Set the correlation between two or more variables in a simulation. -#' -#' @param list_var A list containing the variables used in the simulation, initialized using \code{\link{init_variable}}. -#' @param between_var Character vector specifying the names of the variables to set the correlation between. -#' @param corr Numeric value specifying the desired correlation between the variables. -#' -#' @return Updated \code{list_var} with the specified correlation set between the variables. -#' -#' @details The function checks if the variables specified in \code{between_var} are declared and initialized in the \code{list_var}. It also ensures that at least two variables with provided standard deviation are required to set a correlation in the simulation. -#' The specified correlation value must be within the range [-1, 1]. The function computes the corresponding covariance between the variables based on the specified correlation and standard deviations. -#' The correlation information is then added to the \code{list_var} in the form of a data frame containing the correlation value and the corresponding covariance value. -#' @export -#' @examples -#' list_var <- init_variable(name = "varA", mu = 0, sd = 5, level = 3) %>% -#' init_variable(name = "varB", mu = 0, sd = 25, level = 3) -#' list_var <- set_correlation(list_var, between_var = c("varA", "varB"), corr = 0.7) -set_correlation <- function(list_var, between_var, corr) { - - # Check if variables in between_var are declared and initialized - bool_checkBetweenVarValidity <- function(between_var, list_var) { - nb_varInCorrelation <- length(unique(between_var)) - stopifnot(nb_varInCorrelation > 1) - # -- check also for interaction - varInitialized <- c(getListVar(list_var), getListVar(list_var$interactions)) - existingVar_nb <- varInitialized %in% between_var %>% sum() - if (existingVar_nb != nb_varInCorrelation) { - return(FALSE) - } else { - return(TRUE) - } - } - - name_correlation <- paste(between_var, collapse = ".") - bool_valid_corr <- bool_checkBetweenVarValidity(between_var, list_var) - if (!bool_valid_corr) { - stop("At least one variable in between_var is not declared. Variable not initialized cannot be used in a correlation.") - } - - vec_standardDev <- getStandardDeviationInCorrelation(list_var, between_var) - if (length(vec_standardDev) < 2) { - stop("Exactly two variables with provided standard deviation are required to set a correlation in simulation.") - } - # Validate the specified correlation value to be within the range [-1, 1] - if (corr < -1 || corr > 1) { - stop("Invalid correlation value. Correlation must be in the range [-1, 1].") - } - - name_interaction <- paste(between_var, collapse = ":") - corr <- data.frame(cor = corr, covar = compute_covariation(corr, vec_standardDev[1], vec_standardDev[2] )) - list_var$correlations[[name_correlation]] <- corr - return(list_var) -} - - -``` - -```{r test-setcorrelation} - -test_that("compute_covariation returns the correct covariation", { - # Test case 1: Positive correlation - corr <- 0.7 - sd_A <- 3 - sd_B <- 4 - expected_cov <- corr * sd_A * sd_B - actual_cov <- compute_covariation(corr, sd_A, sd_B) - expect_equal(actual_cov, expected_cov) - - # Test case 2: Negative correlation - corr <- -0.5 - sd_A <- 2.5 - sd_B <- 3.5 - expected_cov <- corr * sd_A * sd_B - actual_cov <- compute_covariation(corr, sd_A, sd_B) - expect_equal(actual_cov, expected_cov) - - # Test case 3: Zero correlation - corr <- 0 - sd_A <- 1 - sd_B <- 2 - expected_cov <- corr * sd_A * sd_B - actual_cov <- compute_covariation(corr, sd_A, sd_B) - expect_equal(actual_cov, expected_cov) -}) - - -# Unit tests for getStandardDeviationInCorrelation -test_that("getStandardDeviationInCorrelation returns correct standard deviations", { - - # Initialize list_var - list_var <- init_variable(name = "varA", mu = 0, sd = 5, level = 3) %>% - init_variable(name = "varB", mu = 0, sd = 25, level = 3) - - # Test case 1: Two variables correlation - between_var_1 <- c("varA", "varB") - sd_expected_1 <- c(5, 25) - sd_result_1 <- getStandardDeviationInCorrelation(list_var, between_var_1) - expect_equal(sd_result_1, sd_expected_1) - -}) - - - -test_that("set_correlation sets the correlation between variables correctly", { - # Initialize variables in the list_var - list_var <- init_variable(name = "varA", mu = 0, sd = 5, level = 3) %>% - init_variable(name = "varB", mu = 0, sd = 25, level = 3) - - # Test setting correlation between varA and varB - list_var <- set_correlation(list_var, between_var = c("varA", "varB"), corr = 0.7) - - corr_result <- list_var$correlations$varA.varB$cor - covar_result <- list_var$correlations$varA.varB$covar - expect_equal(corr_result, 0.7) - expect_equal(covar_result, 87.5) - - # Test setting correlation between varA and varC (should raise an error) - expect_error(set_correlation(list_var, between_var = c("varA", "varC"), corr = 0.8), - "At least one variable in between_var is not declared. Variable not initialized cannot be used in a correlation.") - - # Test setting correlation with invalid correlation value - expect_error(set_correlation(list_var, between_var = c("varA", "varB"), corr = 1.5)) - - # Test setting correlation with less than 2 variables with provided standard deviation - expect_error(set_correlation(list_var, between_var = c("varA"), corr = 0.7)) -}) - - -``` - -```{r functionActualMainFixEff, filename = "actualMainFixEffects" } - -#' Calculate average values by group + +```{r functionActualMainFixEff, filename = "actualMainFixEffects" } + +#' Calculate average values by group #' #' @param data The input data frame #' @param column The name of the target variable #' @param group_by The names of the grouping variables #' @importFrom data.table setDT tstrsplit +#' @importFrom rlang := #' @return A data frame with average values calculated by group #' @export averageByGroup <- function(data, column, group_by) { @@ -4137,7 +4278,6 @@ subsetFixEffectInferred <- function(tidy_tmb){ #' Get data for calculating actual values #' -#' @param inference The inference data frame #' @param groundTruth The ground truth data frame #' @return A list containing required data for calculating actual values #' @export @@ -4576,7 +4716,6 @@ filter_dataframe <- function(df, filter_list ) { #' of the conditions satisfying the specified term combinations, and also considering a reference condition. #' #' @param data A data frame containing the expression data and associated terms. -#' @param df_intercept A data frame containing the intercept values for computing interaction effects. #' @param l_reference A data frame representing the reference condition for the interaction. #' @param clmn_term_1 The name of the column in \code{data} representing the first term. #' @param lbl_term_1 The label of the first term to compute interactions for. @@ -4587,25 +4726,28 @@ filter_dataframe <- function(df, filter_list ) { #' @export #' @examples #' average_gt <- data.frame(clmn_term_1 = c("A", "A", "B", "B"), -#' clmn_term_2 = c("X", "Y", "X", "Y"), -#' logQij_mean = c(1.5, 2.0, 0.5, 1.0)) -#' df_intercept <- data.frame(actual = 0.8) +#' clmn_term_2 = c("X", "Y", "Y", "X"), +#' logQij_mean = c(1.5, 8.0, 0.5, 4.0)) #' # Définir les paramètres de la fonction #' l_label <- list(clmn_term_1 = c("A", "B"), clmn_term_2 = c("X", "Y")) #' clmn_term_1 <- "clmn_term_1" -#' lbl_term_1 <- "A" +#' lbl_term_1 <- "B" #' clmn_term_2 <- "clmn_term_2" #' lbl_term_2 <- "Y" #' # Calculer la valeur d'interaction réelle -#' actual_interaction <- calculate_actual_interaction_values(average_gt, df_intercept, -#' l_label, clmn_term_1, lbl_term_1, -#' clmn_term_2, lbl_term_2) -calculate_actual_interaction_values <- function(data , df_intercept, l_reference , clmn_term_1, lbl_term_1, clmn_term_2, lbl_term_2) { - A <- data[data[[clmn_term_1]] == lbl_term_1 & data[[clmn_term_2]] == lbl_term_2, ] - B <- data[data[[clmn_term_1]] == lbl_term_1 & data[[clmn_term_2]] == l_reference[[clmn_term_2]][1], ] - C <- data[data[[clmn_term_1]] == l_reference[[clmn_term_1]][1] & data[[clmn_term_2]] == lbl_term_2, ] - D <- df_intercept - actual_interaction <- (A$logQij_mean - B$logQij_mean) - (C$logQij_mean - D$actual) +#' actual_interaction <- calculate_actual_interactionX2_values(average_gt, +#' l_label, clmn_term_1, lbl_term_1, +#' clmn_term_2, lbl_term_2) +calculate_actual_interactionX2_values <- function(data, l_reference , clmn_term_1, lbl_term_1, clmn_term_2, lbl_term_2) { + A <- data[data[[clmn_term_1]] == lbl_term_1 & + data[[clmn_term_2]] == lbl_term_2, ] + B <- data[data[[clmn_term_1]] == lbl_term_1 & + data[[clmn_term_2]] == l_reference[[clmn_term_2]][1], ] + C <- data[data[[clmn_term_1]] == l_reference[[clmn_term_1]][1] & + data[[clmn_term_2]] == lbl_term_2, ] + D <- data[data[[clmn_term_1]] == l_reference[[clmn_term_1]][1] & + data[[clmn_term_2]] == l_reference[[clmn_term_2]][1], ] + actual_interaction <- (A$logQij_mean - B$logQij_mean) - (C$logQij_mean - D$logQij_mean) return(actual_interaction) } @@ -4643,25 +4785,24 @@ prepareData2computeInteraction <- function(categorical_vars, categorical_varsInI #' @param l_categoricalVarsInInteraction A vector containing the names of categorical variables #' involved in the interaction. #' @param data2computeInteraction The data frame used to compute interaction values. -#' @param actual_intercept The actual intercept data frame. #' @param l_RefInCategoricalVars A list containing the reference levels of categorical variables. #' #' @return A data frame with the actual values for the interaction fixed effect. #' The data frame includes columns: term, actual, and description. #' #' @export -generateActualInteractionFixEff <- function(labelsInInteraction, l_categoricalVarsInInteraction, data2computeInteraction, actual_intercept, l_RefInCategoricalVars ){ +generateActualInteractionX2_FixEff <- function(labelsInInteraction, l_categoricalVarsInInteraction, + data2computeInteraction, l_RefInCategoricalVars ){ clmn_term_1 <- l_categoricalVarsInInteraction[1] lbl_term_1 <- labelsInInteraction[1] clmn_term_2 <- l_categoricalVarsInInteraction[2] lbl_term_2 <- labelsInInteraction[2] - interactionValues <- calculate_actual_interaction_values(data2computeInteraction, - actual_intercept, l_RefInCategoricalVars, clmn_term_1, - lbl_term_1, clmn_term_2, lbl_term_2) + interactionValues <- calculate_actual_interactionX2_values(data2computeInteraction, + l_RefInCategoricalVars, clmn_term_1, + lbl_term_1, clmn_term_2, lbl_term_2) + - ## categorical vars 2 keep - #index2keep <- grepl("label_", colnames(actual_intercept)) - df_actualForMyInteraction <- actual_intercept#[, index2keep] + df_actualForMyInteraction <- data.frame(geneID = unique(data2computeInteraction$geneID)) df_actualForMyInteraction$term <- paste(labelsInInteraction, collapse = ":") df_actualForMyInteraction$actual <- interactionValues df_actualForMyInteraction$description <- paste(gsub("\\d+$", "", lbl_term_1) , @@ -4672,6 +4813,104 @@ generateActualInteractionFixEff <- function(labelsInInteraction, l_categoricalVa } +#' Generate Actual Interaction Values for Three Fixed Effects +#' +#' This function generates actual interaction values for three fixed effects in a dataset. It takes the labels of the three fixed effects, the dataset, and the reference values for the categorical variables. The function computes the actual interaction values and returns a data frame containing the geneID, the term description, and the actual interaction values. +#' +#' @param labelsInInteraction A character vector of labels for the three fixed effects. +#' @param l_categoricalVarsInInteraction A list of categorical variable names corresponding to the three fixed effects. +#' @param data2computeInteraction The dataset on which to compute the interaction values. +#' @param l_RefInCategoricalVars A list of reference values for the categorical variables. +#' +#' @return A data frame with geneID, term description, and actual interaction values. +#' +#' @export +generateActualInteractionX3_FixEff <- function(labelsInInteraction, l_categoricalVarsInInteraction, + data2computeInteraction, l_RefInCategoricalVars) { + + clmn_term_1 <- l_categoricalVarsInInteraction[1] + lbl_term_1 <- labelsInInteraction[1] + clmn_term_2 <- l_categoricalVarsInInteraction[2] + lbl_term_2 <- labelsInInteraction[2] + clmn_term_3 <- l_categoricalVarsInInteraction[3] + lbl_term_3 <- labelsInInteraction[3] + interactionValues <- calculate_actual_interactionX3_values(data2computeInteraction, + l_RefInCategoricalVars, clmn_term_1, + lbl_term_1, clmn_term_2, lbl_term_2, lbl_term_3, clmn_term_3) + + + df_actualForMyInteraction <- data.frame(geneID = unique(data2computeInteraction$geneID)) + df_actualForMyInteraction$term <- paste(labelsInInteraction, collapse = ":") + df_actualForMyInteraction$actual <- interactionValues + df_actualForMyInteraction$description <- paste(gsub("\\d+$", "", lbl_term_1) , + gsub("\\d+$", "", lbl_term_2), + gsub("\\d+$", "", lbl_term_3), sep = ":") + + return(df_actualForMyInteraction) + +} + + +#' Calculate Actual Interaction Values for Three Fixed Effects +#' +#' This function calculates actual interaction values for three fixed effects in a dataset. It takes the data, reference values for categorical variables, and the specifications for the fixed effects. The function computes the interaction values and returns the result. +#' +#' @param data The dataset on which to calculate interaction values. +#' @param l_reference A list of reference values for categorical variables. +#' @param clmn_term_1 The name of the first categorical variable. +#' @param lbl_term_1 The label for the first categorical variable. +#' @param clmn_term_2 The name of the second categorical variable. +#' @param lbl_term_2 The label for the second categorical variable. +#' @param lbl_term_3 The label for the third categorical variable. +#' @param clmn_term_3 The name of the third categorical variable. +#' +#' @return The computed actual interaction values. +#' +#' @export +calculate_actual_interactionX3_values <- function(data, l_reference, clmn_term_1, lbl_term_1, + clmn_term_2, lbl_term_2, lbl_term_3, clmn_term_3) { + ## Label term 3 + A <- data[data[[clmn_term_1]] == lbl_term_1 & + data[[clmn_term_2]] == lbl_term_2 & + data[[clmn_term_3]] == lbl_term_3, ] + + B <- data[data[[clmn_term_1]] == l_reference[[clmn_term_1]][1] & + data[[clmn_term_2]] == lbl_term_2 & + data[[clmn_term_3]] == lbl_term_3 , ] + + C <- data[data[[clmn_term_1]] == lbl_term_1 & + data[[clmn_term_2]] == l_reference[[clmn_term_2]][1] & + data[[clmn_term_3]] == lbl_term_3, ] + + D <- data[data[[clmn_term_1]] == l_reference[[clmn_term_1]][1] & + data[[clmn_term_2]] == l_reference[[clmn_term_2]][1] & + data[[clmn_term_3]] == lbl_term_3, ] + + termA = (A$logQij_mean-B$logQij_mean) - (C$logQij_mean - D$logQij_mean) + + ## Label term 3 == reference ! + A <- data[data[[clmn_term_1]] == lbl_term_1 & + data[[clmn_term_2]] == lbl_term_2 & + data[[clmn_term_3]] == l_reference[[clmn_term_3]][1], ] + + B <- data[data[[clmn_term_1]] == l_reference[[clmn_term_1]][1] & + data[[clmn_term_2]] == lbl_term_2 & + data[[clmn_term_3]] == l_reference[[clmn_term_3]][1] , ] + + C <- data[data[[clmn_term_1]] == lbl_term_1 & + data[[clmn_term_2]] == l_reference[[clmn_term_2]][1] & + data[[clmn_term_3]] == l_reference[[clmn_term_3]][1], ] + + D <- data[data[[clmn_term_1]] == l_reference[[clmn_term_1]][1] & + data[[clmn_term_2]] == l_reference[[clmn_term_2]][1] & + data[[clmn_term_3]] == l_reference[[clmn_term_3]][1], ] + + termB = (A$logQij_mean-B$logQij_mean) - (C$logQij_mean - D$logQij_mean) + actual_interaction <- termA - termB + return(actual_interaction) +} + + #' Get the actual interaction values for a given interaction term in the data. #' @@ -4685,17 +4924,29 @@ generateActualInteractionFixEff <- function(labelsInInteraction, l_categoricalVa #' @param data The dataset containing the gene expression data and categorical variables. #' @param categorical_vars A character vector containing the names of the categorical variables in #' the dataset. -#' @param actual_intercept #' @return A data frame containing the actual interaction values. #' @export -getActualInteractionFixEff <- function(labelsInInteraction, data, categorical_vars, actual_intercept){ +getActualInteractionFixEff <- function(labelsInInteraction, data, categorical_vars ){ l_RefInCategoricalVars <- lapply(data[, categorical_vars], function(vector) levels(vector)[1]) l_labelsInCategoricalVars <- lapply(data[, categorical_vars], levels) l_categoricalVarsInInteraction <- lapply(labelsInInteraction, function(label) findAttribute(label, l_labelsInCategoricalVars)) %>% unlist() data2computeInteraction <- prepareData2computeInteraction(categorical_vars, l_categoricalVarsInInteraction, data ) - actualInteractionValues <- generateActualInteractionFixEff(labelsInInteraction,l_categoricalVarsInInteraction , - data2computeInteraction, actual_intercept, l_RefInCategoricalVars) + + ## Interaction x3 + if (length(labelsInInteraction) == 3){ + actualInteractionValues <- generateActualInteractionX3_FixEff(labelsInInteraction, + l_categoricalVarsInInteraction , + data2computeInteraction, + l_RefInCategoricalVars) + } + # Interaction x2 + if (length(labelsInInteraction) == 2){ + actualInteractionValues <- generateActualInteractionX2_FixEff(labelsInInteraction, + l_categoricalVarsInInteraction , + data2computeInteraction, + l_RefInCategoricalVars) + } return(actualInteractionValues) } @@ -4719,8 +4970,10 @@ getActualInteractionFixEff <- function(labelsInInteraction, data, categorical_va #' init_variable(name = "varB", mu = c(5,-5), NA , level = 2) %>% #' init_variable(name = "varC", mu = 1, 3, 3) %>% #' add_interaction(between_var = c("varA", "varC"), mu = 5, 0.1) -#' mock_data <- mock_rnaseq(init_var, N_GENES, min_replicates = MIN_REPLICATES, max_replicates = MAX_REPLICATES ) -#' data2fit <- prepareData2fit(countMatrix = mock_data$counts, metadata = mock_data$metadata ) +#' mock_data <- mock_rnaseq(init_var, N_GENES, +#' MIN_REPLICATES, MAX_REPLICATES ) +#' data2fit <- prepareData2fit(countMatrix = mock_data$counts, +#' metadata = mock_data$metadata ) #' results_fit <- fitModelParallel(formula = kij ~ varA + varB + varC + varA:varC, #' data = data2fit, group_by = "geneID", #' family = glmmTMB::nbinom2(link = "log"), n.cores = 1) @@ -4735,14 +4988,13 @@ getActualInteractionFixEff <- function(labelsInInteraction, data, categorical_va #' l_categoricalVarsInInteraction <- lapply(l_interaction, #' function(label) findAttribute(label, l_labelsInCategoricalVars)) %>% unlist() #' data_prepared <- prepareData2computeInteraction(categorical_vars, l_categoricalVarsInInteraction, dataActual) -#' actual_intercept <- getActualIntercept(fixEff_dataActual) #' # Compute actual interaction values for multiple interactions -#' actualInteraction <- computeActualInteractionFixEff(interactionTerm, categorical_vars, dataActual, actual_intercept) -computeActualInteractionFixEff <- function(l_interactionTerm, categorical_vars, dataActual, actualIntercept){ +#' actualInteraction <- computeActualInteractionFixEff(interactionTerm, categorical_vars, dataActual) +computeActualInteractionFixEff <- function(l_interactionTerm, categorical_vars, dataActual){ l_interaction <- strsplit(l_interactionTerm, split = ":") l_interactionActualValues <- lapply(l_interaction, function(labelsInInteraction) - getActualInteractionFixEff(labelsInInteraction, dataActual, categorical_vars, actualIntercept)) + getActualInteractionFixEff(labelsInInteraction, dataActual, categorical_vars)) actualInteraction_df <- do.call('rbind', l_interactionActualValues) return(actualInteraction_df) } @@ -4786,29 +5038,25 @@ test_that("filter_dataframe retourne le dataframe d'origine si aucun filtre n'es expect_identical(filtered_df, df) }) -test_that("calculate_actual_interaction_values retourne la valeur d'interaction réelle correctement", { +test_that("calculate_actual_interactionX2_values retourne la valeur d'interaction réelle correctement", { average_gt <- data.frame( clmn_term_1 = c("A", "A", "B", "B"), clmn_term_2 = c("X", "Y", "X", "Y"), - logQij_mean = c(1.5, 2.0, 0.5, 1.0) - ) - - df_intercept <- data.frame( - actual = 0.8 + logQij_mean = c(1.5, 2.0, 85, 1.0) ) # Définir les paramètres de la fonction l_label <- list(clmn_term_1 = c("A", "B"), clmn_term_2 = c("X", "Y")) clmn_term_1 <- "clmn_term_1" - lbl_term_1 <- "A" + lbl_term_1 <- "B" clmn_term_2 <- "clmn_term_2" lbl_term_2 <- "Y" # Calculer la valeur d'interaction réelle - actual_interaction <- calculate_actual_interaction_values(average_gt, df_intercept, l_label, clmn_term_1, lbl_term_1, clmn_term_2, lbl_term_2) + actual_interaction <- calculate_actual_interactionX2_values(average_gt, l_label, clmn_term_1, lbl_term_1, clmn_term_2, lbl_term_2) # Vérifier que la valeur d'interaction réelle est correcte - expect_equal(actual_interaction, -0.7) + expect_equal(actual_interaction, -84.5) }) @@ -4876,7 +5124,8 @@ test_that("Generate actual interaction fixed effect correctly", { l_RefInCategoricalVars <- lapply(dataActual[, categorical_vars], function(vector) levels(vector)[1]) ####################################################################### - actualInteraction <- generateActualInteractionFixEff(l_interaction, l_categoricalVarsInInteraction, data_prepared, actual_intercept, l_RefInCategoricalVars) + actualInteraction <- generateActualInteractionX2_FixEff(l_interaction, l_categoricalVarsInInteraction, + data_prepared, l_RefInCategoricalVars) # Add your assertions here based on the expected values # For example: @@ -4890,43 +5139,39 @@ test_that("Generate actual interaction fixed effect correctly", { }) -# Test the function `generateActualInteractionFixEff` -test_that("Test generateActualInteractionFixEff function", { +# Test the function `generateActualInteractionX2_FixEff` +test_that("Test generateActualInteractionX2_FixEff function", { # Generate example data data <- data.frame( - geneID = c("gene1", "gene1", "gene2", "gene2"), - varA = factor(c("A1", "A2", "A1", "A2")), - varB = factor(c("B1", "B2", "B1", "B2")), - varC = factor(c("C1", "C2", "C1", "C2")), - logQij_mean = c(3.2, 5.1, 4.7, 6.3) + geneID = rep(x = c("gene1", "gene2"), each = 8), + logQij_mean = 1:16 + ) + metadata = expand.grid(list(varA = factor(c("A1", "A2")), + varB = factor(c("B1", "B2")), + varC = factor(c("C1", "C2")))) + metadata = rbind(metadata, metadata) + + data <- cbind(metadata, data) categorical_vars <- c("varA", "varB", "varC") - labelsInInteraction <- c("A1", "B1") + labelsInInteraction <- c("A2", "C2") actual_intercept <- data.frame(actual = c(23, 21 ), geneID = c("gene1", "gene2"), term = c("(Intercept)", "(Intercept)"), description = c("(Intercept)", "(Intercept)")) # Run the function - actualInteractionValues <- getActualInteractionFixEff(labelsInInteraction, data, categorical_vars, actual_intercept) - - l_RefInCategoricalVars <- lapply(data[, categorical_vars], function(vector) levels(vector)[1]) - l_labelsInCategoricalVars <- lapply(data[, categorical_vars], levels) - l_categoricalVarsInInteraction <- lapply(labelsInInteraction, - function(label) findAttribute(label, l_labelsInCategoricalVars)) %>% unlist() - data2computeInteraction <- prepareData2computeInteraction(categorical_vars, l_categoricalVarsInInteraction, data ) - actualInteractionValues <- generateActualInteractionFixEff(labelsInInteraction,l_categoricalVarsInInteraction , - data2computeInteraction, actual_intercept, l_RefInCategoricalVars) + actualInteractionValues <- getActualInteractionFixEff(labelsInInteraction, data, categorical_vars ) # Define the expected output based on the example data expected_output <- data.frame( - term = "A1:B1", + term = "A2:C2", geneID = c("gene1", "gene2"), - actual = c(19.8, 16.3), - description = c("interaction", "interaction") + actual = c(0, 0), + description = c("A:C", "A:C") ) # Add your assertions here to compare the actual output with the expected output @@ -4940,6 +5185,85 @@ test_that("Test generateActualInteractionFixEff function", { +# Test for generateActualInteractionX3FixEff +test_that("generateActualInteractionX3FixEff returns correct data frame", { + + # Create reference values + reference <- list( + varA = c("A1", "A2"), + varB = c("B1", "B2"), + varC = c("C1", "C2") + ) + # Generate example data + set.seed(123) + data <- data.frame( + geneID = rep(x = c("gene1", "gene2"), each = 8), + logQij_mean = sample(x = -3:12, 16) + + ) + metadata = expand.grid(list(varA = factor(c("A1", "A2")), + varB = factor(c("B1", "B2")), + varC = factor(c("C1", "C2")))) + metadata = rbind(metadata, metadata) + + data <- cbind(metadata, data) + + # Call the function + result <- generateActualInteractionX3_FixEff( + labelsInInteraction = c("A2", "B2", "C2"), + l_categoricalVarsInInteraction = c("varA", "varB", "varC"), + data2computeInteraction = data, + l_RefInCategoricalVars = reference + ) + + # Check the result + expect_equal(nrow(result), 2) + expect_equal(ncol(result), 4) + expect_identical(result$term, c("A2:B2:C2","A2:B2:C2")) + expect_equal(result$actual, c(-3, 13)) + expect_identical(result$description, c("A:B:C", "A:B:C")) +}) + +# Test for calculate_actual_interactionX3_values +test_that("calculate_actual_interactionX3_values returns correct values", { + # Create reference values + reference <- list( + varA = c("A1", "A2"), + varB = c("B1", "B2"), + varC = c("C1", "C2") + ) + # Generate example data + set.seed(123) + data <- data.frame( + geneID = rep(x = c("gene1", "gene2"), each = 8), + logQij_mean = sample(x = -8:8, 16) + + ) + metadata = expand.grid(list(varA = factor(c("A1", "A2")), + varB = factor(c("B1", "B2")), + varC = factor(c("C1", "C2")))) + metadata = rbind(metadata, metadata) + + data <- cbind(metadata, data) + # Call the function + result <- calculate_actual_interactionX3_values( + data = data, + l_reference = reference, + clmn_term_1 = "varA", + lbl_term_1 = "A2", + clmn_term_2 = "varB", + lbl_term_2 = "B2", + lbl_term_3 = "C2", + clmn_term_3 = "varC" + ) + + # Check the result + expect_equal(result, c(-7, 11)) +}) + + + +## Test interaction X2 test_that("Test getActualInteractionFixEff", { # Exemple de données d'entrée @@ -4974,10 +5298,10 @@ test_that("Test getActualInteractionFixEff", { function(label) findAttribute(label, l_labelsInCategoricalVars)) %>% unlist() data_prepared <- prepareData2computeInteraction(categorical_vars, l_categoricalVarsInInteraction, dataActual) - actual_intercept <- getActualIntercept(fixEff_dataActual) + #actual_intercept <- getActualIntercept(fixEff_dataActual) # Appel de la fonction à tester - actualInteraction <- getActualInteractionFixEff(l_interaction, data_prepared, categorical_vars, actual_intercept) + actualInteraction <- getActualInteractionFixEff(l_interaction, data_prepared, categorical_vars) expect_true(nrow(actualInteraction) == 4) @@ -4988,28 +5312,80 @@ test_that("Test getActualInteractionFixEff", { }) -``` - -```{r function-inferenceToExpected, filename = "inferenceToExpected" } - -#' Compare the results of inference with the ground truth data. -#' -#' This function takes the data frames containing the inference results and the ground truth data -#' and generates a table to compare the inferred values with the expected values. -#' -#' @param tidy_tmb A data frame containing the results of inference. -#' @param df_ground_truth A data frame containing the ground truth data used for simulation. -#' @param gene_id_column The name of the column in both data frames representing the gene IDs. -#' -#' @return A data frame -#' -#' @examples -#' -#' @export -compareInferenceToExpected <- function(tidy_tmb , df_ground_truth) { +## Test interaction X3 +test_that("Test getActualInteractionFixEff", { - ## -- get data - fixEff_dataInference <- subsetFixEffectInferred(tidy_tmb) + # Exemple de données d'entrée + N_GENES <- 4 + MIN_REPLICATES <- 20 + MAX_REPLICATES <- 20 + + init_var <- init_variable( name = "varA", mu = 3,sd = 1, level = 2) %>% + init_variable( name = "varB", mu = 2, sd = 2, level = 2) %>% + init_variable( name = "varC", mu = 2, sd = 1, level = 2) %>% + add_interaction(between_var = c("varA", 'varC'), mu = 0.3, sd = 1) %>% + add_interaction(between_var = c("varB", 'varC'), mu = 2, sd = 1) %>% + add_interaction(between_var = c("varA", 'varB'), mu = -2, sd = 1) %>% + add_interaction(between_var = c("varA", 'varB', "varC"), mu = 1, sd = 1) + + + # Simulation + mock_data <- mock_rnaseq(init_var, N_GENES, + min_replicates = MIN_REPLICATES, + max_replicates = MAX_REPLICATES, dispersion = 100) + + # Données de fit + data2fit <- prepareData2fit(countMatrix = mock_data$counts, metadata = mock_data$metadata) + results_fit <- fitModelParallel(formula = kij ~ varA + varB + varC + varA:varB + varB:varC + varA:varC + varA:varB:varC, + data = data2fit, group_by = "geneID", + family = glmmTMB::nbinom2(link = "log"), n.cores = 1) + + # Données d'entrée + tidy_tmb <- tidy_tmb(results_fit) + fixEff_dataInference <- subsetFixEffectInferred(tidy_tmb) + fixEff_dataActual <- getData2computeActualFixEffect(mock_data$groundTruth$effects) + interactionTerm <- fixEff_dataInference$fixed_term$interaction[[4]] + categorical_vars <- fixEff_dataActual$categorical_vars + dataActual <- fixEff_dataActual$data + l_labelsInCategoricalVars <- lapply(dataActual[, categorical_vars], levels) + l_interaction <- strsplit(interactionTerm, split = ":")[[1]] + l_categoricalVarsInInteraction <- lapply(l_interaction, + function(label) findAttribute(label, l_labelsInCategoricalVars)) %>% unlist() + + data_prepared <- prepareData2computeInteraction(categorical_vars, l_categoricalVarsInInteraction, dataActual) + + actualInteraction <- getActualInteractionFixEff(l_interaction, data_prepared, categorical_vars) + + + expect_true(nrow(actualInteraction) == 4) + expect_equal(actualInteraction$geneID, c("gene1", "gene2", "gene3", "gene4")) + expect_true(all(actualInteraction$term %in% 'varA2:varB2:varC2')) + expect_true(all(actualInteraction$description %in% 'varA:varB:varC')) + expect_true(is.numeric(actualInteraction$actual)) +}) + + +``` + +```{r function-inferenceToExpected, filename = "inferenceToExpected" } + +#' Compare the results of inference with the ground truth data. +#' +#' This function takes the data frames containing the inference results and the ground truth data +#' and generates a table to compare the inferred values with the expected values. +#' +#' @param tidy_tmb A data frame containing the results of inference. +#' @param df_ground_truth A data frame containing the ground truth data used for simulation. +#' +#' @return A data frame +#' +#' @examples +#' +#' @export +inferenceToExpected_withFixedEff <- function(tidy_tmb , df_ground_truth) { + + ## -- get data + fixEff_dataInference <- subsetFixEffectInferred(tidy_tmb) fixEff_dataActual <- getData2computeActualFixEffect(df_ground_truth) actual_intercept <- getActualIntercept(fixEff_dataActual) @@ -5021,7 +5397,7 @@ compareInferenceToExpected <- function(tidy_tmb , df_ground_truth) { l_interactionTerm <- fixEff_dataInference$fixed_term$interaction categorical_vars <- fixEff_dataActual$categorical_vars data <- fixEff_dataActual$data - actualInteractionFixEff <- computeActualInteractionFixEff(l_interactionTerm, categorical_vars, data, actual_intercept) + actualInteractionFixEff <- computeActualInteractionFixEff(l_interactionTerm, categorical_vars, data) ## -- rbind Interaction & Main actual_fixEff <- rbind(actual_mainFixEff , actualInteractionFixEff, actual_intercept ) @@ -5034,7 +5410,6 @@ compareInferenceToExpected <- function(tidy_tmb , df_ground_truth) { ``` -## Wald test ```{r function-waldTest, filename = "waldTest" } @@ -5051,6 +5426,7 @@ compareInferenceToExpected <- function(tidy_tmb , df_ground_truth) { #' @param reference_value The reference value for comparison (default is 0). #' @param alternative The type of alternative hypothesis to test (default is "greaterAbs"). #' @return A list containing the test statistic and p-value. +#' @importFrom stats pnorm #' @export #' @examples #' # Perform a Wald test with the default "greaterAbs" alternative @@ -5058,7 +5434,7 @@ compareInferenceToExpected <- function(tidy_tmb , df_ground_truth) { wald_test <- function(estimation, std_error, reference_value = 0, alternative = "greaterAbs") { if (alternative == "greater") { test_statistic <- (estimation - reference_value) / std_error - p_value <- 1 - pnorm(test_statistic, mean = 0, sd = 1, lower.tail = TRUE) + p_value <- 1 - stats::pnorm(test_statistic, mean = 0, sd = 1, lower.tail = TRUE) } else if (alternative == "less") { test_statistic <- (estimation - reference_value) / std_error p_value <- pnorm(test_statistic, mean = 0, sd = 1, lower.tail = TRUE) @@ -5101,7 +5477,7 @@ wald_test <- function(estimation, std_error, reference_value = 0, alternative = #' results_df <- results(model_list, coeff_threshold = 0.1, alternative_hypothesis = "greater") results <- function(list_tmb, coeff_threshold = 0, alternative_hypothesis = "greaterAbs", correction_method = "BH") { tidy_tmb_df <- tidy_tmb(list_tmb) - if (coeff_threshold != 0 && alternative_hypothesis != "greaterAbs") { + if (coeff_threshold != 0 || alternative_hypothesis != "greaterAbs") { waldRes <- wald_test(tidy_tmb_df$estimate, tidy_tmb_df$std.error, coeff_threshold, alternative_hypothesis) tidy_tmb_df$statistic <- waldRes$statistic tidy_tmb_df$p.value <- waldRes$p.value @@ -5179,8 +5555,6 @@ test_that("results function performs statistical tests correctly", { ``` -## ROC plot - ```{r function-rocPlot, filename = "ROCplot"} @@ -5261,9 +5635,10 @@ roc_plot <- function(comparison_df, ...) { ggplot2::ggtitle("ROC curve") ## -- annotation AUC - df_AUC <- plotROC::calc_auc(p)[c("from", "AUC")] + df_AUC <- subset(plotROC::calc_auc(p) , select = -c(PANEL, group)) df_AUC$AUC <- round(df_AUC$AUC, digits = 3) - annotations <- do.call(paste, c(df_AUC, sep = " - AUC: ")) + if (nrow(df_AUC) == 1) annotations <- paste("AUC", df_AUC$AUC, sep = " : ") + else annotations <- do.call(paste, c(df_AUC, sep = " - AUC: ")) annotations <- paste(annotations, collapse = "\n") p <- p + ggplot2::annotate("text", x = .75, y = .25, label = annotations) return(p) @@ -5305,9 +5680,19 @@ test_that("ROC plot is generated correctly", { comparison_data <- data.frame( geneID = c("gene1", "gene2", "gene3"), isDE = c(TRUE, FALSE, TRUE), - p.adj = c(0.05, 0.2, 0.01) + p.adj = c(0.05, 0.2, 0.01), + from = "example" ) + plot <- roc_plot(comparison_data, col = "from") + + expect_true("gg" %in% class(plot)) + + comparison_data <- data.frame( + geneID = c("gene1", "gene2", "gene3"), + isDE = c(TRUE, FALSE, TRUE), + p.adj = c(0.05, 0.2, 0.01) ) + plot <- roc_plot(comparison_data) expect_true("gg" %in% class(plot)) @@ -5316,8 +5701,6 @@ test_that("ROC plot is generated correctly", { ``` -## Counts plot - ```{r function-countsPlot, filename = "countsPlot"} @@ -5340,7 +5723,7 @@ counts_plot <- function(mock_obj){ counts <- unname(unlist(mock_obj$counts)) p <- ggplot2::ggplot() + ggplot2::aes(x = "Genes", y = counts) + - ggplot2::geom_point(position = "jitter", alpha = 0.6, size = 0.4, col = "#D1F2EB") + + ggplot2::geom_point(position = "jitter", alpha = 0.6, size = 0.4, col = "#F0B27A") + ggplot2::geom_violin(fill = "#F8F9F9", alpha = 0.4) + ggplot2::stat_summary(fun = "mean", geom = "point", color = "#B63A0F", size = 5) + ggplot2::theme_bw() + @@ -5372,8 +5755,6 @@ test_that("Counts plot is generated correctly", { ``` -## Identity plot - ```{r function-identityPlot, filename = "identityPlot"} @@ -5435,8 +5816,6 @@ test_that("Identity plot is generated correctly", { ``` -## Simulation report - ```{r function-simulationReport, filename = "simulationReport" } #' Export the Analysis Report to a File @@ -5446,7 +5825,6 @@ test_that("Identity plot is generated correctly", { #' #' @param report_file Path to the file where the report will be exported. #' @param table_settings A table containing settings and parameters used in the analysis. -#' @param metrics_plot A plot displaying various performance metrics. #' @param roc_curve A plot displaying the Receiver Operating Characteristic (ROC) curve. #' @param dispersion_plot A plot displaying the dispersion values. #' @param id_plot A plot displaying unique identifiers. @@ -5526,7 +5904,8 @@ simulationReport <- function(mock_obj, list_tmb = NULL, dds_obj = NULL , # -- build data from list_tmb if (!is.null(list_tmb)){ tidyRes <- results(list_tmb, coeff_threshold, alt_hypothesis) - TMB_comparison_df <- compareInferenceToExpected(tidyRes, mock_obj$groundTruth$effects) + formula_used <- list_tmb[[1]]$modelInfo$allForm$formula + TMB_comparison_df <- compareInferenceToExpected(tidyRes, mock_obj$groundTruth$effects, formula_used) TMB_comparison_df <- getLabelExpected(TMB_comparison_df, coeff_threshold, alt_hypothesis) TMB_comparison_df$from <- "HTRfit" tmb_disp_inferred <- extractTMBDispersion(list_tmb) @@ -5536,7 +5915,7 @@ simulationReport <- function(mock_obj, list_tmb = NULL, dds_obj = NULL , if (!is.null(dds_obj)){ deseq2_wrapped <- wrapper_DESeq2(dds, coeff_threshold, alt_hypothesis) - DESEQ_comparison_df <- compareInferenceToExpected(deseq2_wrapped$fixEff, mock_obj$groundTruth$effects) + DESEQ_comparison_df <- inferenceToExpected_withFixedEff(deseq2_wrapped$fixEff, mock_obj$groundTruth$effects) DESEQ_comparison_df <- getLabelExpected(DESEQ_comparison_df, coeff_threshold, alt_hypothesis) DESEQ_comparison_df$from <- "DESeq2" DESEQ_comparison_df$component <- NA @@ -5549,7 +5928,7 @@ simulationReport <- function(mock_obj, list_tmb = NULL, dds_obj = NULL , comparison_df <- rbind( DESEQ_comparison_df, TMB_comparison_df ) - color2use <- c("#D2B4DE", "#5D6D7E") + color2use <- c("#D2B4DE", "#A2D9CE") color2use <- color2use[c(!is.null(list_tmb), !is.null(dds_obj))] # -- plotting @@ -5566,7 +5945,7 @@ simulationReport <- function(mock_obj, list_tmb = NULL, dds_obj = NULL , exportReportFile(report_file, grobTableSettings, roc_curve, dispersion_plot, id_plot, counts_plot) # -- return - ret <- list(settings = df_settings, metrics_plot = metrics_plot, + ret <- list(settings = df_settings, roc_plot = roc_curve, dispersionEvaluation = evalDisp, identity_plot = id_plot, counts_plot = counts_plot, data = comparison_df) return(ret) } @@ -5604,7 +5983,6 @@ test_that("Handling non-numeric values in the data frame", { ``` -## DESeq2 ```{r function-deseq2, filename = "wrapperDESeq2" } @@ -5632,8 +6010,9 @@ test_that("Handling non-numeric values in the data frame", { #' input_var_list <- init_variable( name = "genotype", mu = 12, sd = 0.1, level = 3) %>% #' init_variable(name = "environment", mu = c(0,1), NA , level = 2) #' -#' mock_data <- mock_rnaseq(input_var_list, N_GENES, MIN_REPLICATES, max_replicates = MAX_REPLICATES) -#' dds <- DESeq2::DESeqDataSetFromMatrix(mock_data$counts , mock_data$metadata, ~ genotype + environment) +#' mock_data <- mock_rnaseq(input_var_list, N_GENES, MIN_REPLICATES, MAX_REPLICATES) +#' dds <- DESeq2::DESeqDataSetFromMatrix(mock_data$counts , +#' mock_data$metadata, ~ genotype + environment) #' dds <- DESeq2::DESeq(dds, quiet = TRUE) #' result <- wrapper_DESeq2(dds, lfcThreshold = 1, altHypothesis = "greater") #' @export @@ -5667,7 +6046,9 @@ wrapper_DESeq2 <- function(dds, lfcThreshold , altHypothesis, correction_method #' @examples #' \dontrun{ #' # Example usage of the function -#' inference_result <- get_inference(dds_full, lfcThreshold = 0.5, altHypothesis = "greater", correction_method = "BH") +#' inference_result <- get_inference(dds_full, lfcThreshold = 0.5, +#' altHypothesis = "greater", +#' correction_method = "BH") #' } #' @importFrom stats p.adjust #' @export @@ -5886,8 +6267,6 @@ test_that("wrapperDESeq2 function works correctly", { ``` -## Anova - ```{r function-anova, filename = "anova"} @@ -5933,11 +6312,11 @@ handleAnovaError <- function(l_TMB, group, ...) { #' models in parallel for different groups specified in the list. It returns a list #' of ANOVA results for each group. #' -#' @param l_glmmTMB A list of \code{glmmTMB} models, with model names corresponding to the groups. +#' @param l_tmb A list of \code{glmmTMB} models, with model names corresponding to the groups. #' @param ... Additional arguments passed to \code{\link[stats]{anova}} function. #' #' @return A list of ANOVA results for each group. -#' +#' @importFrom stats setNames #' @examples #' # Perform ANOVA #' data(iris) @@ -5948,7 +6327,7 @@ handleAnovaError <- function(l_TMB, group, ...) { #' @export anovaParallel <- function(l_tmb, ...) { l_group <- attributes(l_tmb)$names - lapply(setNames(l_group, l_group), function(group) handleAnovaError(l_tmb, group, ...)) + lapply(stats::setNames(l_group, l_group), function(group) handleAnovaError(l_tmb, group, ...)) } @@ -5997,6 +6376,893 @@ test_that("anovaParallel returns valid ANOVA results", { ``` +```{r function-subsetGenes, filename = "subsetGenes"} + +#' Subset Genes in Genomic Data +#' +#' This function filters and adjusts genomic data within the Roxygeb project, based on a specified list of genes. +# It is designed to enhance precision and customization in transcriptomics analysis by retaining only the genes of interest. +# +#' @param l_genes A character vector specifying the genes to be retained in the dataset. +#' @param mockObj An object containing relevant genomic information to be filtered. +#' +#' @return A modified version of the 'mockObj' data object, with genes filtered according to 'l_genes'. +#' +#' @description The 'subsetGenes' function selects and retains genes from 'mockObj' that match the genes specified in 'l_genes'. +# It filters the 'groundTruth$effects' data to keep only the rows corresponding to the selected genes. +# Additionally, it updates 'gene_dispersion' and the count data, ensuring that only the selected genes are retained. +# The function also replaces the total number of genes in 'settings$values' with the length of 'l_genes'. +# The result is a more focused and tailored genomic dataset, facilitating precision in subsequent analyses. +#' +#' @examples +#' \dontrun{ +#' # Example list of genes to be retained +#' selected_genes <- c("GeneA", "GeneB", "GeneC") +#' +#' # Example data object 'mockObj' (simplified structure) +#' mockObj <- list( +#' # ... (mockObj structure) +#' ) +#' +#' # Using the subsetGenes function to filter 'mockObj' +#' filtered_mockObj <- subsetGenes(selected_genes, mockObj) +#' } +#' @export +subsetGenes <- function(l_genes, mockObj) { + # Selects the indices of genes in 'groundTruth$effects$geneID' that are present in 'l_genes'. + idx_gt_effects <- mockObj$groundTruth$effects$geneID %in% l_genes + + # Filters 'groundTruth$effects' to keep only the rows corresponding to the selected genes. + mockObj$groundTruth$effects <- mockObj$groundTruth$effects[idx_gt_effects, ] + + # Updates 'gene_dispersion' by retaining values corresponding to the selected genes. + mockObj$groundTruth$gene_dispersion <- mockObj$groundTruth$gene_dispersion[l_genes] + + # Filters the count data to keep only the rows corresponding to the selected genes. + mockObj$counts <- as.data.frame(mockObj$counts[l_genes, ]) + + # Replaces the total number of genes in 'settings$values' with the length of 'l_genes'. + mockObj$settings$values[1] <- length(l_genes) + + # Returns the modified 'mockObj'. + return(mockObj) +} + + +``` + + +```{r function-evaluationWithMixedEffect, filename = "evaluationWithMixedEffect"} + +#' Check if the formula contains a mixed effect structure. +#' +#' This function checks if the formula contains a mixed effect structure indicated by the presence of "|". +#' +#' @param formula A formula object. +#' +#' @return \code{TRUE} if the formula contains a mixed effect structure, \code{FALSE} otherwise. +#' +#' @examples +#' is_mixedEffect_inFormula(y ~ x + (1|group)) +#' +#' @export +is_mixedEffect_inFormula <- function(formula) { + return("|" %in% all.names(formula)) +} + +#' Check if the formula follows a specific type I mixed effect structure. +#' +#' This function checks if the formula follows a specific type I mixed effect structure, which consists of a fixed effect and a random effect indicated by the presence of "|". +#' +#' @param formula A formula object. +# +#' @return \code{TRUE} if the formula follows the specified type I mixed effect structure, \code{FALSE} otherwise. +# +#' @examples +#' is_formula_mixedTypeI(formula = y ~ x + (1|group)) +# +#' @export +is_formula_mixedTypeI <- function(formula) { + if (length(all.vars(formula)) != 3) return(FALSE) + if (sum(all.names(formula) == "+") > 1) return(FALSE) + if (sum(all.names(formula) == "/") > 0) return(FALSE) + return(TRUE) +} + + +#' Get the categorical variable associated with the fixed effect in a type I formula. +#' +#' This function extracts the categorical variable associated with the fixed effect in a type I formula from a tidy tibble. +# The categorical variable is constructed by taking the label of the second main fixed effect term (ignoring any numeric suffix) and prefixing it with "label_". +# +#' @param tidy_tmb A tidy tibble containing model terms. +# +#' @return The categorical variable associated with the fixed effect in the type I formula. +# +#' @examples +#' \dontrun{ +#' getCategoricalVar_inFixedEffect(tidy_tmb) +#' } +#' @export +getCategoricalVar_inFixedEffect <- function(tidy_tmb) { + main_fixEffs <- unique(subset(tidy_tmb, effect == "fixed")$term) + categorical_var_inFixEff <- paste("label", gsub("\\d+$", "", main_fixEffs[2]), sep = "_") + return(categorical_var_inFixEff) +} + + +#' Group log_qij values per genes and labels. +#' +#' This function groups log_qij values in a ground truth tibble per genes and labels using a specified categorical variable. +# +#' @param ground_truth A tibble containing ground truth data. +#' @param categorical_var The categorical variable to use for grouping. +# +#' @return A list of log_qij values grouped by genes and labels. +#' @importFrom stats as.formula +#' @importFrom reshape2 dcast +#' +# +#' @examples +#' ' \dontrun{ +#' group_logQij_per_genes_and_labels(ground_truth, categorical_var) +#' } +#' @export +group_logQij_per_genes_and_labels <- function(ground_truth, categorical_var) { + str_formula <- paste(c(categorical_var, "geneID"), collapse = " ~ ") + formula <- stats::as.formula(str_formula) + list_logqij <- ground_truth %>% + reshape2::dcast( + formula, + value.var = "log_qij_scaled", + fun.aggregate = list + ) + list_logqij[categorical_var] <- NULL + return(list_logqij) +} + +#' Calculate actual mixed effect values for each gene. +#' +#' This function calculates actual mixed effect values for each gene using the provided data, reference labels, and other labels in a categorical variable. +# +#' @param list_logqij A list of log_qij values grouped by genes and labels. +#' @param genes_iter_list A list of genes for which to calculate the actual mixed effect values. +#' @param categoricalVar_infos Information about the categorical variable, including reference labels and other labels. +# +#' @return A data frame containing the actual mixed effect values for each gene. +# +#' @examples +#' ' \dontrun{ +#' getActualMixed_typeI(list_logqij, genes_iter_list, categoricalVar_infos) +#' } +#' @export +getActualMixed_typeI <- function(list_logqij, genes_iter_list, categoricalVar_infos) { + labelRef_InCategoricalVar <- categoricalVar_infos$ref + labels_InCategoricalVar <- categoricalVar_infos$labels + labelOther_inCategoricalVar <- categoricalVar_infos$labelsOther + + data_per_gene <- lapply(genes_iter_list, function(g) { + data_gene <- data.frame(list_logqij[[g]]) + colnames(data_gene) <- labels_InCategoricalVar + return(data_gene) + }) + + l_actual_per_gene <- lapply(genes_iter_list, function(g) { + data_gene <- data_per_gene[[g]] + res <- calculate_actualMixed(data_gene, labelRef_InCategoricalVar, labelOther_inCategoricalVar) + res$geneID <- g + return(res) + }) + + actual_mixedEff <- do.call("rbind", l_actual_per_gene) + rownames(actual_mixedEff) <- NULL + return(actual_mixedEff) +} + + + +#' Compare the mixed-effects inference to expected values. +#' +#' This function compares the mixed-effects inference obtained from a mixed-effects model to expected values derived from a ground truth dataset. The function assumes a specific type I mixed-effect structure in the input model. +# +#' @param tidy_tmb tidy model results obtained from fitting a mixed-effects model. +#' @param ground_truth_eff A data frame containing ground truth effects. +# +#' @return A data frame with the comparison of estimated mixed effects to expected values. +#' @importFrom stats setNames +#' @examples +#' \dontrun{ +#' inferenceToExpected_withMixedEff(tidy_tmb(l_tmb), ground_truth_eff) +#' } +#' @export +inferenceToExpected_withMixedEff <- function(tidy_tmb, ground_truth_eff){ + + # -- CategoricalVar involve in fixEff + categorical_var <- getCategoricalVar_inFixedEffect(tidy_tmb) + labels_InCategoricalVar <- levels(ground_truth_eff[, categorical_var]) + labelRef_InCategoricalVar <- labels_InCategoricalVar[1] + labelOther_inCategoricalVar <- labels_InCategoricalVar[2:length(labels_InCategoricalVar)] + categoricalVar_infos <- list(ref = labelRef_InCategoricalVar, + labels = labels_InCategoricalVar, + labelsOther = labelOther_inCategoricalVar ) + + ## -- prepare data 2 get actual + l_logqij <- group_logQij_per_genes_and_labels(ground_truth_eff, categorical_var) + l_genes <- unique(ground_truth_eff$geneID) + genes_iter_list <- stats::setNames(l_genes,l_genes) + actual_mixedEff <- getActualMixed_typeI(l_logqij, genes_iter_list, categoricalVar_infos) + + res <- join_dtf(actual_mixedEff, tidy_tmb ,c("geneID", "term"), c("ID", "term")) + + ## -- reorder for convenience + actual <- res$actual + res <- res[, -1] + res$actual <- actual + return(res) +} + + +#' Calculate actual mixed effects. +#' +#' This function calculates actual mixed effects based on the given data for a specific type I mixed-effect structure. +# It calculates the expected values, standard deviations, and correlations between the fixed and random effects. +# The function is designed to work with specific input data for type I mixed-effect calculations. +# +#' @param data_gene Data for a specific gene. +#' @param labelRef_InCategoricalVar The reference label for the categorical variable. +#' @param labelOther_inCategoricalVar Labels for the categorical variable other than the reference label. +#' @importFrom stats sd cor +# +#' @return A data frame containing the calculated actual mixed effects. +# +#' @examples +#' \dontrun{ +# calculate_actualMixed(data_gene, labelRef_InCategoricalVar, labelOther_inCategoricalVar) +#' } +#' @export +calculate_actualMixed <- function(data_gene, labelRef_InCategoricalVar, labelOther_inCategoricalVar ){ + log_qij_scaled_intercept <- data_gene[labelRef_InCategoricalVar] + colnames(log_qij_scaled_intercept) <- '(Intercept)' + + if (length(labelOther_inCategoricalVar == 1 )) { + log_qij_scaled_other <- data_gene[labelOther_inCategoricalVar] + } else log_qij_scaled_other <- data_gene[,labelOther_inCategoricalVar] + log_qij_scaled_transf <- log_qij_scaled_other - log_qij_scaled_intercept[,"(Intercept)"] + + log_qij_scaled_transf <- cbind(log_qij_scaled_intercept, log_qij_scaled_transf) + ## -- fix eff + actual_fixedValues <- colMeans(log_qij_scaled_transf) + + ## -- stdev values + std_values <- sapply(log_qij_scaled_transf, function(x) stats::sd(x)) + names(std_values) <- paste("sd", names(std_values), sep = '_') + + ## -- correlation + corr_mat <- stats::cor(log_qij_scaled_transf) + indx <- which(upper.tri(corr_mat, diag = FALSE), arr.ind = TRUE) + corr2keep = corr_mat[indx] + name_corr <- paste(rownames(corr_mat)[indx[, "row"]], colnames(corr_mat)[indx[, "col"]], sep = ".") + names(corr2keep) <- paste("cor", name_corr, sep = "__") + + ## -- output + actual <- c(actual_fixedValues, std_values, corr2keep) + res <- as.data.frame(actual) + res$term <- rownames(res) + rownames(res) <- NULL + res$description <- sub("_.*", "", gsub("\\d+$", "" , res$term)) + return(res) + + +} + + +#' Compare inference results to expected values for a given model. +#' +#' This function compares the inference results from a model to the expected values based on a ground truth dataset with the simulated effects. The function handles models with mixed effects and fixed effects separately, ensuring that the comparison is appropriate for the specific model type. +#' +#' If a model includes mixed effects, the function checks for support for the specific mixed effect structure and provides an informative error message if the structure is not supported. +#' +#' @param tidy_tmb A fitted model object convert to tidy dataframe. +#' @param ground_truth_eff A ground truth dataset with the simulated effects. +#' @param formula_used formula used in model +#' +#' @return A data frame containing the comparison results, including the term names, inference values, and expected values. +#' +#' @examples +#' \dontrun{ +#' evalData <- compareInferenceToExpected(l_tmb, ground_truth_eff) +#' } +#' @export +compareInferenceToExpected <- function(tidy_tmb, ground_truth_eff, formula_used) { + ## -- parsing formula & check mixed effect + involvMixedEffect <- is_mixedEffect_inFormula(formula_used) + + msg_e_formula_type <- "This simulation evaluation supports certain types of formulas with mixed effects, but not all. + Please refer to the package documentation for information on supported formula structures. + You are welcome to implement additional functions to handle specific formula types with mixed effects that are not currently supported." + + ## -- if mixed effect + if (involvMixedEffect){ + message("Mixed effect detected in the formula structure.") + + if(!is_formula_mixedTypeI(formula_used)){ + stop(msg_e_formula_type) + } + evalData <- inferenceToExpected_withMixedEff(tidy_tmb, ground_truth_eff) + + ## -- only fixed effect + } else { + + message("Only fixed effects are detected in the formula structure.") + evalData <- inferenceToExpected_withFixedEff(tidy_tmb, ground_truth_eff) + } + + return(evalData) +} + + +``` + +```{r test-evaluationWithMixedEffect} + + + +test_that("Test is_mixedEffect_inFormula", { + formula1 <- y ~ a + (1 | B) + formula2 <- ~ a + (1 | B) + formula3 <- x ~ c + d + + expect_true(is_mixedEffect_inFormula(formula1)) + expect_true(is_mixedEffect_inFormula(formula2)) + expect_false(is_mixedEffect_inFormula(formula3)) +}) + +test_that("Test is_formula_mixedTypeI", { + formula1 <- y ~ x + (1 | group) + formula2 <- y ~ z + group1 + (1 | group1) + formula3 <- y ~ z + (1 | group1 + group2) + formula4 <- y ~ z + (1 | group1/z) + + expect_true(is_formula_mixedTypeI(formula1)) + expect_false(is_formula_mixedTypeI(formula2)) + expect_false(is_formula_mixedTypeI(formula3)) + expect_false(is_formula_mixedTypeI(formula4)) + +}) + + +test_that("getCategoricalVar_inFixedEffect returns the correct result", { + + ###### PREPARE DATA + N_GENES = 2 + MAX_REPLICATES = 4 + MIN_REPLICATES = 4 + + input_var_list <- init_variable( name = "genotype", mu = 2, sd = 0.5, level = 10) %>% + init_variable( name = "environment", mu = c(1, 3), sd = NA, level = 2) %>% + add_interaction(between_var = c("genotype", 'environment'), mu = 1, sd = 0.39) + + mock_data <- mock_rnaseq(input_var_list, N_GENES, + min_replicates = MIN_REPLICATES, + max_replicates = MAX_REPLICATES, + basal_expression = 3, dispersion = 100) + + data2fit = prepareData2fit(countMatrix = mock_data$counts, metadata = mock_data$metadata, normalization = F) + + l_tmb <- fitModelParallel(formula = kij ~ environment + (environment | genotype ), + data = data2fit, group_by = "geneID", + family = glmmTMB::nbinom2(link = "log"), n.cores = 1) + + + tidy_tmb <- tidy_tmb(l_tmb) + categorical_var <- getCategoricalVar_inFixedEffect(tidy_tmb) + expect_equal(categorical_var, "label_environment") +}) + +test_that("group_logQij_per_genes_and_labels returns the correct result", { + + ############ PREPARE DATA + N_GENES = 2 + MAX_REPLICATES = 4 + MIN_REPLICATES = 4 + input_var_list <- init_variable( name = "genotype", mu = 2, sd = 0.5, level = 10) %>% + init_variable( name = "environment", mu = c(1, 3), sd = NA, level = 2) %>% + add_interaction(between_var = c("genotype", 'environment'), mu = 1, sd = 0.39) + + mock_data <- mock_rnaseq(input_var_list, N_GENES, + min_replicates = MIN_REPLICATES, + max_replicates = MAX_REPLICATES, + basal_expression = 3, dispersion = 100) + + data2fit = prepareData2fit(countMatrix = mock_data$counts, metadata = mock_data$metadata, normalization = F) + + l_tmb <- fitModelParallel(formula = kij ~ environment + (environment | genotype ), + data = data2fit, group_by = "geneID", + family = glmmTMB::nbinom2(link = "log"), n.cores = 1) + + ground_truth_eff <- mock_data$groundTruth$effects + categorical_var <- "label_environment" + logqij_list <- group_logQij_per_genes_and_labels(ground_truth_eff, categorical_var) + + expect_is(logqij_list, "data.frame") + expect_equal(attributes(logqij_list)$names , c("gene1", "gene2")) + expect_equal(length(logqij_list$gene1), 2) + expect_equal(length(logqij_list$gene2), 2) + expect_equal(length(logqij_list$gene2[[1]]), 10) +}) + +test_that("getActualMixed_typeI returns the correct result", { + ############ PREPARE DATA + N_GENES = 2 + MAX_REPLICATES = 4 + MIN_REPLICATES = 4 + input_var_list <- init_variable( name = "genotype", mu = 2, sd = 0.5, level = 10) %>% + init_variable( name = "environment", mu = c(1, 3), sd = NA, level = 2) %>% + add_interaction(between_var = c("genotype", 'environment'), mu = 1, sd = 0.39) + + mock_data <- mock_rnaseq(input_var_list, N_GENES, + min_replicates = MIN_REPLICATES, + max_replicates = MAX_REPLICATES, + basal_expression = 3, dispersion = 100) + + data2fit = prepareData2fit(countMatrix = mock_data$counts, metadata = mock_data$metadata, normalization = F) + + l_tmb <- fitModelParallel(formula = kij ~ environment + (environment | genotype ), + data = data2fit, group_by = "geneID", + family = glmmTMB::nbinom2(link = "log"), n.cores = 1) + + ground_truth_eff <- mock_data$groundTruth$effects + categorical_var <- "label_environment" + logqij_list <- group_logQij_per_genes_and_labels(ground_truth_eff, categorical_var) + l_genes <- unique(ground_truth_eff$geneID) + genes_iter_list <- stats::setNames(l_genes, l_genes) + categoricalVar_infos= list(ref = "environment1", + labels = c("environment1", "environment2"), + labelsOther = "environment2") + + ## -- test + actual_mixedEff <- getActualMixed_typeI(logqij_list, + genes_iter_list, + categoricalVar_infos) + + ## -- verif + expect_is(actual_mixedEff, "data.frame") + expect_equal(colnames(actual_mixedEff), c("actual", "term", "description", "geneID")) + expect_equal(unique(actual_mixedEff$geneID), c("gene1", "gene2")) + expect_equal(unique(actual_mixedEff$term), c("(Intercept)", "environment2", + "sd_(Intercept)", "sd_environment2", "cor__(Intercept).environment2")) + +}) + + +# Test for InferenceToExpected_withMixedEff +test_that("inferenceToExpected_withMixedEff correctly compares inference to expected values", { + + ## -- PREPARE DATA + N_GENES = 2 + MAX_REPLICATES = 4 + MIN_REPLICATES = 4 + + input_var_list <- init_variable(name = "genotype", mu = 2, sd = 0.5, level = 10) %>% + init_variable(name = "environment", mu = c(1, 3), sd = NA, level = 2) %>% + add_interaction(between_var = c("genotype", 'environment'), mu = 1, sd = 0.39) + + mock_data <- mock_rnaseq(input_var_list, N_GENES, + min_replicates = MIN_REPLICATES, + max_replicates = MAX_REPLICATES, + basal_expression = 3, dispersion = 100) + + data2fit <- prepareData2fit(countMatrix = mock_data$counts, metadata = mock_data$metadata, normalization = FALSE) + + l_tmb <- fitModelParallel(formula = kij ~ environment + (environment | genotype), + data = data2fit, group_by = "geneID", + family = glmmTMB::nbinom2(link = "log"), n.cores = 1) + + ## -- call fonction to test + compared_df <- inferenceToExpected_withMixedEff(tidy_tmb(l_tmb), mock_data$groundTruth$effects) + + ## -- TEST VERIF + expect_equal(c("term", "description", "geneID", "effect", + "component", "group", "estimate", "std.error", + "statistic", "p.value", "actual" ) , colnames(compared_df)) + expect_equal(c("gene1", "gene2" ) , unique(compared_df$geneID)) + expect_equal(unique(compared_df$term), c("(Intercept)", "cor__(Intercept).environment2", "environment2", + "sd_(Intercept)", "sd_environment2")) + +}) + +# Test for calculate_actualMixed +test_that("calculate_actualMixed calculates actual mixed effects as expected", { + ## -- PREPARE DATA + N_GENES = 2 + MAX_REPLICATES = 4 + MIN_REPLICATES = 4 + + input_var_list <- init_variable(name = "genotype", mu = 2, sd = 0.5, level = 10) %>% + init_variable(name = "environment", mu = c(1, 3), sd = NA, level = 2) %>% + add_interaction(between_var = c("genotype", 'environment'), mu = 1, sd = 0.39) + + mock_data <- mock_rnaseq(input_var_list, N_GENES, + min_replicates = MIN_REPLICATES, + max_replicates = MAX_REPLICATES, + basal_expression = 3, dispersion = 100) + + data2fit <- prepareData2fit(countMatrix = mock_data$counts, metadata = mock_data$metadata, normalization = FALSE) + + + ground_truth_eff <- mock_data$groundTruth$effects + categorical_var <- "label_environment" + logqij_list <- group_logQij_per_genes_and_labels(ground_truth_eff, categorical_var) + l_genes <- unique(ground_truth_eff$geneID) + genes_iter_list <- stats::setNames(l_genes, l_genes) + categoricalVar_infos= list(ref = "environment1", + labels = c("environment1", "environment2"), + labelsOther = "environment2") + + ## -- call function & test + data_per_gene <- lapply(genes_iter_list, function(g) { + data_gene <- data.frame(logqij_list[[g]]) + colnames(data_gene) <- categoricalVar_infos$labels + return(data_gene) + }) + data_gene <- data_per_gene$gene1 + actual_mixed <- calculate_actualMixed(data_gene, + labelRef_InCategoricalVar = categoricalVar_infos$ref , + labelOther_inCategoricalVar = categoricalVar_infos$labelsOther) + expect_equal( colnames(actual_mixed), c("actual", "term", "description")) + expect_equal(actual_mixed$term, c("(Intercept)", "environment2", + "sd_(Intercept)", "sd_environment2", + "cor__(Intercept).environment2")) + expect_equal(actual_mixed$description, c("(Intercept)", "environment", + "sd", "sd", + "cor")) +}) + + + +``` + + +# Initialize variable to simulate + +The `init_variable()` function, which is a key tool for defining the variables in your experimental design. You can specify the variables' names and the size of the effects involved. By manually setting the effect of a variable, you make it a fixed effect, while random effect definitions can make it either fixed or mixed. + +```{r example-init_variable, warning=FALSE, message=FALSE} +## -- Manually init my first variable +input_var_list <- init_variable( name = "varA", mu = c(0.2, 4, -3), level = 3) + +# The 'init_variable' function allows for precise control over the variables in your experimental design. +# In this example, we manually initialize 'varA' with specific means (mu) and levels. + +## -- Randomly init my first variable +input_var_list <- init_variable( name = "varA", mu = 10, sd = 0.2, level = 5) + +# Alternatively, you can randomly initialize 'varA' by specifying a mean (mu) and standard deviation (sd). +# This introduces variability into 'varA', making it either a fixed or mixed effect in your design. + +## -- Randomly init several variables +input_var_list <- init_variable( name = "varA", mu = 10, sd = 0.2, level = 5) %>% + init_variable( name = "varB", mu = -3, sd = 0.34, level = 2) + +# You can also initialize multiple variables, such as 'varA' and 'varB', with random values. +# This flexibility allows you to create diverse experimental designs. + +``` + +Similary to `init_variable()`, `add_interaction()` allow to init an interaction between variable. + +```{r example-add_interaction, warning=FALSE, message=FALSE} +## --init variable +input_var_list <- init_variable( name = "varA", mu = 10, sd = 0.2, level = 5) %>% + init_variable( name = "varB", mu = 1, sd = 0.78, level = 2) %>% + init_variable( name = "varC", mu = -3, sd = 6, level = 3) %>% + add_interaction( between_var = c("varA", "varC"), mu = 9, sd = 0.2) + +# In this example, we initialize 'varA', 'varB', and 'varC', and create an interaction between 'varA' and 'varC' using 'add_interaction'. +# Interactions can be defined to represent complex relationships between variables. + +``` + +Using set_correlation you can constraint the correlation between a variable or an interaction that have been randomly declared. A manually define-variable cannot + +```{r example-setCorrelation, warning=FALSE, message=FALSE} +## -- Set correlation between 2 main variables +input_var_list <- init_variable( name = "varA", mu = 10, sd = 0.2, level = 5) %>% + init_variable( name = "varB", mu = 1, sd = 0.78, level = 2) %>% + init_variable( name = "varC", mu = -3, sd = 6, level = 3) %>% + set_correlation( between_var = c("varA", "varC"), corr = 0.32) + +# You can set correlations between variables using 'set_correlation'. Here, we establish a correlation between 'varA' and 'varC'. + +## -- Set correlation between 2 interactions +input_var_list <- init_variable( name = "varA", mu = 10, sd = 0.2, level = 5) %>% + init_variable( name = "varB", mu = 1, sd = 0.78, level = 2) %>% + init_variable( name = "varC", mu = -3, sd = 6, level = 3) %>% + add_interaction( between_var = c("varA", "varC"), mu = 9, sd = 0.2) %>% + add_interaction( between_var = c("varB", "varA"), mu = 9, sd = 0.2) %>% + set_correlation( between_var = c("varA:varC", "varB:varA"), corr = -0.5 ) + +# Similarly, you can set correlations between interactions, introducing complexity to your experimental design. + +## -- Set correlation between an interaction and a variable +input_var_list <- init_variable( name = "varA", mu = 1, sd = 0.2, level = 5) %>% + init_variable( name = "varB", mu = 1, sd = 0.78, level = 2) %>% + add_interaction( between_var = c("varA", "varB"), mu = 9, sd = 0.2) %>% + set_correlation( between_var = c("varA:varB", "varA"), corr = 0.8 ) + +## You can also establish correlations between interactions and variables, providing flexibility in your design. + +## -- output +input_var_list +``` + + +# Simulate RNAseq data + +In this section, you will learn how to simulate RNAseq data, how to generate data based on the input variables defined earlier. Using .... + +```{r example-mock_rnaseq, warning=FALSE, message=FALSE} +## -- Required parameters +N_GENES = 4 +MIN_REPLICATES = 2 +MAX_REPLICATES = 4 +######################## + + + +## -- simulate RNAseq data based on input_var_list +mock_data <- mock_rnaseq(input_var_list, N_GENES, + min_replicates = MIN_REPLICATES, + max_replicates = MAX_REPLICATES) + +## -- Scaling genes counts with sequencing depth +SEQ_DEPTH = c(100000, 5000000, 10000000) #possible number of reads/sample +mock_data <- mock_rnaseq(input_var_list, N_GENES, + min_replicates = MIN_REPLICATES, + max_replicates = MAX_REPLICATES, + sequencing_depth = SEQ_DEPTH) + +## -- Set gene dispersion : k ~ Nbinomial(mu, disp) +DISP = 0.1 # Same dispersion for each genes +DISP = 1000 # Same dispersion for each genes +DISP = runif(N_GENES, 0, 1000) ## Dispersion can vary between genes +mock_data <- mock_rnaseq(input_var_list, N_GENES, + min_replicates = MIN_REPLICATES, + max_replicates = MAX_REPLICATES, + dispersion = DISP ) + +## -- Set basal gene expression +BASAL_EXPR = -3 # Value can be negative to simulate low expressed gene +BASAL_EXPR = 2 # Same basal gene expression for the N_GENES +BASAL_EXPR = c(-3, -2, -1, 0, 1, 2, 3 ) ## Basal expression can vary between genes +mock_data <- mock_rnaseq(input_var_list, N_GENES, + min_replicates = MIN_REPLICATES, + max_replicates = MAX_REPLICATES, + basal_expression = BASAL_EXPR) + +## -- output list attributes +names(mock_data) +``` + + +# Fitting models + +## Prepare data for fitting + + We explore how to prepare the data for modeling and the importance of data normalization. + + +```{r example-prepareData, warning=FALSE, message=FALSE} +## -- get data from simulation or real data +count_matrix <- mock_data$counts +metaData <- mock_data$metadata +############################## + +## -- convert counts matrix and samples metadatas in a data frame for fitting +data2fit = prepareData2fit(countMatrix = count_matrix, + metadata = metaData, + normalization = F) + + +## -- data normalization +data2fit = prepareData2fit(countMatrix = count_matrix, + metadata = metaData, + normalization = T, + response_name = "kij") + +## -- output +head(data2fit) +``` + +## Generalized linear model fit + +We discuss the use of generalized linear models (GLMs) and how to incorporate mixed effects into your models. You'll also find examples of fitting models beyond RNAseq data. + +```{r example-fitModelParallel, warning=FALSE, message=FALSE} + +## -- fit data from your model +l_tmb <- fitModelParallel(formula = kij ~ varA, + data = data2fit, + group_by = "geneID", + family = glmmTMB::nbinom2(link = "log"), + log_file = "log.txt", + n.cores = 1) + + +## -- use mixed effect in your model +l_tmb <- fitModelParallel(formula = kij ~ varA + ( 1 | varB ), + data = data2fit, + group_by = "geneID", + family = glmmTMB::nbinom2(link = "log"), + log_file = "log.txt", + n.cores = 1) + +## -- not only RNAseq data +data("iris") +l_tmb <- fitModelParallel(formula = Sepal.Length ~ Sepal.Width + Petal.Length + Petal.Width , + data = iris, + group_by = "Species", + family = gaussian(), + log_file = "log.txt", + n.cores = 1) + + +## -- additional settings +l_tmb <- fitModelParallel(formula = kij ~ varA, + data = data2fit, + group_by = "geneID", + family = glmmTMB::nbinom2(link = "log"), + n.cores = 1, + log_file = "log.txt", + control = glmmTMB::glmmTMBControl(optCtrl=list(iter.max=1e5, + eval.max=1e5))) + +## -- output +l_tmb$gene1 + +``` + +## Update fit + +This section is all about updating your models. Learn how to modify the model family, control settings, and even update your model formula. Model updates are a crucial part of the modeling process, allowing you to refine and adapt your analyses. + +```{r example-update, warning=FALSE, message=FALSE} + +## -- update your fit modifying the model family +l_tmb <- updateParallel(formula = kij ~ varA, + l_tmb = l_tmb , + family = gaussian(), + log_file = "log.txt", + n.cores = 1) + +## -- update fit using additional model control settings +l_tmb <- updateParallel(formula = kij ~ varA , + l_tmb = l_tmb , + family = gaussian(), + log_file = "log.txt", + n.cores = 1, + control = glmmTMB::glmmTMBControl(optCtrl=list(iter.max=1e3, + eval.max=1e3))) + + +## -- update your model formula +l_tmb <- updateParallel(formula = kij ~ varA + varB + varA:varB , + l_tmb = l_tmb , + family = glmmTMB::nbinom2(link = "log"), + log_file = "log.txt", + n.cores = 1) + +## -- output +l_tmb$gene1 +``` + +## Plot fit metrics + +Visualizing fit metrics is essential for evaluating your models. Here, we show you how to generate various plots to assess the quality of your models. You can explore all metrics or focus on specific aspects like dispersion and log-likelihood. + +```{r example-plotMetrics, warning=FALSE, message=FALSE} + +## -- plot all metrics +metrics_plot(l_tmb = l_tmb) + +## -- Focus on metrics +metrics_plot(l_tmb = l_tmb, focus = c("dispersion", "logLik")) +``` + +## Anova to select the best model + +Selecting the best model is a fundamental step in data analysis. We explore how to use analysis of variance (ANOVA) to compare different models and identify the most suitable one for your data. Model selection is critical for drawing meaningful conclusions from your analyses. + +```{r example-anova, warning=FALSE, message=FALSE} + +## -- update your fit modifying the model family +l_anova <- anovaParallel(l_tmb = l_tmb) + +## -- additional settings +l_anova <- anovaParallel(l_tmb = l_tmb, type = "III" ) + +## -- output +l_anova$gene1 +``` + + +# Simulation evaluation report + +In this section, we delve into the evaluation of your simulation results. You'll find details on generating receiver operating characteristic (ROC) curves, identity plots, and dispersion evaluations. These evaluations provide valuable insights into the performance of your simulated data and models. + +```{r example-simulationReport, warning=FALSE, message=FALSE} + +## -- get simulation/fit evaluation +resSimu <- simulationReport(mock_data, + list_tmb = l_tmb, + coeff_threshold = 0.4, + alt_hypothesis = "greaterAbs") + +## -- roc curve +resSimu$roc_plot + +## -- identity plot +resSimu$identity_plot + +## -- dispersion +resSimu$dispersionEvaluation$disp_plot + +``` + +## Compare HTRfit with DESeq2 + +Comparing different analysis approaches is a common practice in data analysis. Here, we compare the results obtained with HTRfit to those of DESeq2. This comparison helps you understand the advantages and unique features of HTRfit in your analyses. + +```{r example-ddsComparison, warning=FALSE, message=FALSE} +## -- DESeq2 +library(DESeq2) +dds <- DESeq2::DESeqDataSetFromMatrix( + countData = count_matrix, + colData = metaData, + design = ~ varA + varB + varA:varB ) +dds <- DESeq2::DESeq(dds, quiet = TRUE) + + + +## -- get simulation/fit evaluation +resSimu <- simulationReport(mock_data, + list_tmb = l_tmb, + dds_obj = dds, + coeff_threshold = 0.4, + alt_hypothesis = "greaterAbs") + +## -- roc curve +resSimu$roc_plot + +## -- identity plot +resSimu$identity_plot + +## -- dispersion +resSimu$dispersionEvaluation$disp_plot + +``` + +## Focus evaluation on a subset of genes + +This section demonstrates how to evaluate low-expressed genes by filtering and analyzing a subset of your data. This focused evaluation can reveal insights that may be obscured when considering the entire dataset. + +```{r example-subsetGenes, warning=FALSE, message=FALSE} + +## Focus on low expressed genes +#low_expressed <- mock_data$groundTruth$effects[ mock_data$groundTruth$effects$basalExpr < 0, ] +#l_genes <- unique(low_expressed$geneID) +#mock_lowExpressed <- subsetGenes(l_genes, mock_data) + + +## -- get simulation/fit evaluation +#resSimu <- simulationReport(mock_lowExpressed, +# list_tmb = l_tmb, +# coeff_threshold = 0.4, +# alt_hypothesis = "greaterAbs") + +## -- roc curve +#resSimu$roc_plot +```