# WARNING - Generated by {fusen} from dev/flat_full.Rmd: do not edit by hand #' Get input for simulation based on coefficients #' #' This function generates input data for simulation based on the coefficients provided in the \code{list_var} argument. #' #' @param list_var A list of variables (already initialized) #' @param n_genes Number of genes to simulate (default: 1) #' @param normal_distr Specifies the distribution type for generating effects. Choose between 'univariate' (default) or 'multivariate' . #' - 'univariate': Effects are drawn independently from univariate normal distributions. #' - 'multivariate': Effects are drawn jointly from a multivariate normal distribution. (not recommended) #' @param input2mvrnorm Input to the \code{mvrnorm} function for simulating data from multivariate normal distribution (default: NULL) #' @return A data frame with input coefficients for simulation #' @export #' @examples #' # Example usage #' list_var <- init_variable() #' getInput2simulation(list_var, n_genes = 10) getInput2simulation <- function(list_var, n_genes = 1, normal_distr = "univariate", input2mvrnorm = NULL) { stopifnot( normal_distr %in% c("multivariate", "univariate") ) if (normal_distr == "multivariate"){ if (is.null(input2mvrnorm)) input2mvrnorm = getInput2mvrnorm(list_var) l_dataFrom_normdistr <- getDataFromMvrnorm(list_var, input2mvrnorm, n_genes) } if (normal_distr == "univariate"){ l_dataFrom_normdistr <- getDataFromRnorm(list_var, n_genes) } l_dataFromUser = getDataFromUser(list_var) df_input2simu <- getCoefficients(list_var, l_dataFrom_normdistr, l_dataFromUser, n_genes) return(df_input2simu) } #' Get the reference level for categorical variables in the data #' #' This function extracts the reference level for each categorical variable in the data. #' The reference level is the first level encountered for each categorical variable. #' #' @param data The data frame containing the categorical variables. #' @return A list containing the reference level for each categorical variable. #' @export getRefLevel <- function(data){ col_names <- colnames(data) categorical_vars <- col_names[grepl(col_names, pattern = "label_")] if (length(categorical_vars) == 1){ l_labels <- list() l_labels[[categorical_vars]] <- levels(data[, categorical_vars]) } else l_labels <- lapply(data[, categorical_vars], levels) l_labels_ref <- sapply(l_labels, function(vec) vec[1]) return(l_labels_ref) } #' Replace the effect by 0 in the data #' #' This function replaces the effect in interactions columns by 0, when needed. #' #' @param list_var The list of variables containing the effects to modify. #' @param l_labels_ref A list containing the reference level for each categorical variable. #' @param data The data frame containing the effects to modify. #' @return The modified data frame #' @export replaceUnexpectedInteractionValuesBy0 <- function(list_var, l_labels_ref , data){ varInteraction <- getListVar(list_var$interactions) df_interaction_with0 <- sapply(varInteraction, function(var){ categorical_var <- paste("label", unlist(strsplit(var, ":")), sep = "_") bool_matrix <- sapply(categorical_var, function(uniq_cat_var) data[uniq_cat_var] == l_labels_ref[uniq_cat_var]) idx_0 <- rowSums(bool_matrix) > 0 ## line without interactions effects return(replace(data[[var]], idx_0, 0)) }) col_names <- colnames(data) categorical_vars <- col_names[grepl(col_names, pattern = "label_")] data[, varInteraction] <- df_interaction_with0 return(data) } #' Prepare data using effects from a normal distribution #' #' Prepares the data by generating effects from a normal distribution for each gene. #' #' @param list_var A list of variables (already initialized) #' @param n_genes Number of genes to generate data for. #' @return A dataframe containing gene metadata and effects generated from a normal distribution. #' @export getDataFromRnorm <- function(list_var, n_genes){ ## -- check if all data have been provided by user if (is.null(getInput2mvrnorm(list_var)$covMatrix)) return(list()) metadata <- getGeneMetadata(list_var , n_genes) df_effects <- get_effects_from_rnorm(list_var, metadata) data <- cbind(metadata, df_effects) if(!is.null(getListVar(list_var$interactions))){ l_labels_ref <- getRefLevel(data) data <- replaceUnexpectedInteractionValuesBy0(list_var, l_labels_ref, data) } return(list(data)) } #' Generate effects from a normal distribution #' #' Generates effects from a normal distribution for each gene. #' #' @param list_var A list of variables (already initialized) #' @param metadata Gene metadata. #' @return A dataframe containing effects generated from a normal distribution. #' @export get_effects_from_rnorm <- function(list_var, metadata){ variable_standard_dev <- getGivenAttribute(list_var, attribute = "sd") %>% unlist() interaction_standard_dev <- getGivenAttribute(list_var$interactions, attribute = "sd") %>% unlist() list_stdev <- c(variable_standard_dev, interaction_standard_dev) # -- mu variable_mu <- getGivenAttribute(list_var, attribute = "mu") %>% unlist() interaction_mu <- getGivenAttribute(list_var$interactions, attribute = "mu") %>% unlist() list_mu <- c(variable_mu, interaction_mu) variable_2rnorm <- names(list_stdev) l_effects <- lapply(stats::setNames(variable_2rnorm, variable_2rnorm) , function(var){ col_labels <- paste("label", unlist(strsplit(var, ":")), sep = "_") cols2paste <- c("geneID", col_labels) list_combinations <- apply( metadata[ , cols2paste ] , 1 , paste , collapse = "-" ) list_effects <- unique(list_combinations) list_beta <- rnorm(length(list_effects), mean = list_mu[var], sd = list_stdev[var]) names(list_beta) <- list_effects unname(list_beta[list_combinations]) }) df_effects <- do.call("cbind", l_effects) return(df_effects) } #' getCoefficients #' #' Get the coefficients. #' #' @param list_var A list of variables (already initialized) #' @param l_dataFromMvrnorm Data from the `getGeneMetadata` function (optional). #' @param l_dataFromUser Data from the `getDataFromUser` function (optional). #' @param n_genes The number of genes. #' @export #' @return A dataframe containing the coefficients. #' @examples #' # Example usage #' list_var <- init_variable() #' input2mvrnorm = getInput2mvrnorm(list_var) #' l_dataFromMvrnorm = getDataFromMvrnorm(list_var, input2mvrnorm, n_genes=3) #' l_dataFromUser = getDataFromUser(list_var) #' getCoefficients(list_var, l_dataFromMvrnorm, l_dataFromUser, n_genes = 3) getCoefficients <- function(list_var, l_dataFromMvrnorm, l_dataFromUser, n_genes) { if (length(l_dataFromMvrnorm) == 0) { metaData <- getGeneMetadata(list_var, n_genes) l_dataFromMvrnorm <- list(metaData) } l_df2join <- c(l_dataFromMvrnorm, l_dataFromUser) df_coef <- Reduce(function(d1, d2){ column_names = colnames(d2) idx_key = grepl(pattern = "label", column_names ) keys = column_names[idx_key] join_dtf(d1, d2, k1 = keys , k2 = keys) } , l_df2join ) %>% as.data.frame() column_names <- colnames(df_coef) idx_column2factor <- grep(pattern = "label_", column_names) if (length(idx_column2factor) > 1) { df_coef[, idx_column2factor] <- lapply(df_coef[, idx_column2factor], as.factor) } else { df_coef[, idx_column2factor] <- as.factor(df_coef[, idx_column2factor]) } return(df_coef) } #' Get the log_qij values from the coefficient data frame. #' #' @param dtf_coef The coefficient data frame. #' @return The coefficient data frame with log_qij column added. #' @export #' @examples #' list_var <- init_variable() #' dtf_coef <- getInput2simulation(list_var, 10) #' dtf_coef <- getLog_qij(dtf_coef) getLog_qij <- function(dtf_coef) { dtf_beta_numeric <- dtf_coef[sapply(dtf_coef, is.numeric)] dtf_coef$log_qij <- rowSums(dtf_beta_numeric, na.rm = TRUE) return(dtf_coef) } #' Calculate mu_ij values based on coefficient data frame and scaling factor #' #' This function calculates mu_ij values by raising 2 to the power of the log_qij values #' from the coefficient data frame and multiplying it by the provided scaling factor. #' #' @param dtf_coef Coefficient data frame containing the log_qij values #' #' @return Coefficient data frame with an additional mu_ij column #' #' @examples #' list_var <- init_variable() #' dtf_coef <- getInput2simulation(list_var, 10) #' dtf_coef <- getLog_qij(dtf_coef) #' dtf_coef <- addBasalExpression(dtf_coef, 10, c(10, 20, 0)) #' getMu_ij(dtf_coef) #' @export getMu_ij <- function(dtf_coef) { log_qij_scaled <- dtf_coef$log_qij + dtf_coef$basalExpr dtf_coef$log_qij_scaled <- log_qij_scaled mu_ij <- exp(log_qij_scaled) dtf_coef$mu_ij <- mu_ij return(dtf_coef) } #' getMu_ij_matrix #' #' Get the Mu_ij matrix. #' #' @param dtf_coef A dataframe containing the coefficients. #' @importFrom reshape2 dcast #' @importFrom stats as.formula #' @export #' @return A Mu_ij matrix. #' @examples #' list_var <- init_variable() #' dtf_coef <- getInput2simulation(list_var, 10) #' dtf_coef <- getLog_qij(dtf_coef) #' dtf_coef <- addBasalExpression(dtf_coef, 10, c(10, 20, 0)) #' dtf_coef<- getMu_ij(dtf_coef) #' getMu_ij_matrix(dtf_coef) getMu_ij_matrix <- function(dtf_coef) { column_names <- colnames(dtf_coef) idx_var <- grepl(pattern = "label", column_names) l_var <- column_names[idx_var] str_formula_rigth <- paste(l_var, collapse = " + ") if (str_formula_rigth == "") stop("no variable label detected") str_formula <- paste(c("geneID", str_formula_rigth), collapse = " ~ ") formula <- stats::as.formula(str_formula) dtf_Muij <- dtf_coef %>% reshape2::dcast(formula = formula, value.var = "mu_ij", drop = F) dtf_Muij[is.na(dtf_Muij)] <- 0 mtx_Muij <- data.frame(dtf_Muij[, -1], row.names = dtf_Muij[, 1]) %>% as.matrix() mtx_Muij <- mtx_Muij[, order(colnames(mtx_Muij)), drop = F] return(mtx_Muij) } #' getSubCountsTable #' #' Get the subcounts table. #' #' @param matx_Muij The Mu_ij matrix. #' @param matx_dispersion The dispersion matrix. #' @param replicateID The replication identifier. #' @param l_bool_replication A boolean vector indicating the replicates. #' @importFrom stats rnbinom #' #' @return A subcounts table. getSubCountsTable <- function(matx_Muij, matx_dispersion, replicateID, l_bool_replication) { getKijMatrix <- function(matx_Muij, matx_dispersion, n_genes, n_samples) { k_ij <- stats::rnbinom(n_genes * n_samples, size = matx_dispersion, mu = matx_Muij) %>% matrix(nrow = n_genes, ncol = n_samples) k_ij[is.na(k_ij)] <- 0 return(k_ij) } if (!any(l_bool_replication)) return(NULL) matx_Muij <- matx_Muij[, l_bool_replication, drop = FALSE] matx_dispersion <- matx_dispersion[, l_bool_replication, drop = FALSE] l_sampleID <- colnames(matx_Muij) l_geneID <- rownames(matx_Muij) dimension_mtx <- dim(matx_Muij) n_genes <- dimension_mtx[1] n_samples <- dimension_mtx[2] matx_kij <- getKijMatrix(matx_Muij, matx_dispersion, n_genes, n_samples) colnames(matx_kij) <- paste(l_sampleID, replicateID, sep = "_") rownames(matx_kij) <- l_geneID return(matx_kij) } #' getReplicationMatrix #' #' @param minN Minimum number of replicates for each sample #' @param maxN Maximum number of replicates for each sample #' @param n_samples Number of samples #' @export #' @return A replication matrix indicating which samples are replicated getReplicationMatrix <- function(minN, maxN, n_samples) { # Create a list of logical vectors representing the minimum number of replicates l_replication_minimum = lapply(1:n_samples, FUN = function(i) rep(TRUE, times = minN) ) # Create a list of random logical vectors representing additional replicates l_replication_random = lapply(1:n_samples, FUN = function(i) sample(x = c(TRUE, FALSE), size = maxN-minN, replace = T) ) # Combine the replication vectors into matrices matx_replication_minimum <- do.call(cbind, l_replication_minimum) matx_replication_random <- do.call(cbind, l_replication_random) # Combine the minimum replicates and random replicates into a single matrix matx_replication <- rbind(matx_replication_minimum, matx_replication_random) # Sort the columns of the replication matrix in descending order matx_replication = apply(matx_replication, 2, sort, decreasing = TRUE ) %>% matrix(nrow = maxN) return(matx_replication) } #' getCountsTable #' #' @param matx_Muij Matrix of mean expression values for each gene and sample #' @param matx_dispersion Matrix of dispersion values for each gene and sample #' @param matx_bool_replication Replication matrix indicating which samples are replicated #' #' @return A counts table containing simulated read counts for each gene and sample getCountsTable <- function(matx_Muij , matx_dispersion, matx_bool_replication ){ max_replicates <- dim(matx_bool_replication)[1] # Apply the getSubCountsTable function to each row of the replication matrix l_countsTable = lapply(1:max_replicates, function(i) getSubCountsTable(matx_Muij , matx_dispersion, i, matx_bool_replication[i,] )) # Combine the counts tables into a single matrix countsTable = do.call(cbind, l_countsTable) return(countsTable %>% as.data.frame()) } #' getDispersionMatrix #' #' @param list_var A list of variables (already initialized) #' @param n_genes Number of genes #' @param dispersion Vector of dispersion values for each gene #' @export #' #' @return A matrix of dispersion values for each gene and sample getDispersionMatrix <- function(list_var, n_genes, dispersion = stats::runif(n_genes, min = 0, max = 1000)){ l_geneID = paste("gene", 1:n_genes, sep = "") l_sampleID = getSampleID(list_var) n_samples = length(l_sampleID) l_dispersion <- dispersion # Create a data frame for the dispersion values dtf_dispersion = list(dispersion = l_dispersion) %>% as.data.frame() dtf_dispersion <- dtf_dispersion[, rep("dispersion", n_samples)] rownames(dtf_dispersion) = l_geneID colnames(dtf_dispersion) = l_sampleID matx_dispersion = dtf_dispersion %>% as.matrix() return(matx_dispersion) } #' Replicate rows of a data frame by group #' #' Replicates the rows of a data frame based on a grouping variable and replication counts for each group. #' #' @param df Data frame to replicate #' @param group_var Name of the grouping variable in the data frame #' @param rep_list Vector of replication counts for each group #' @return Data frame with replicated rows #' @examples #' df <- data.frame(group = c("A", "B"), value = c(1, 2)) #' replicateByGroup(df, "group", c(2, 3)) #' #' @export replicateByGroup <- function(df, group_var, rep_list) { l_group_var <- df[[group_var]] group_levels <- unique(l_group_var) names(rep_list) <- group_levels group_indices <- rep_list[l_group_var] replicated_indices <- rep(seq_len(nrow(df)), times = group_indices) replicated_df <- df[replicated_indices, ] suffix_sampleID <- sequence(group_indices) replicated_df[["sampleID"]] <- paste(replicated_df[["sampleID"]], suffix_sampleID, sep = "_") rownames(replicated_df) <- NULL return(replicated_df) } #' Replicate rows of a data frame #' #' Replicates the rows of a data frame by a specified factor. #' #' @param df Data frame to replicate #' @param n Replication factor for each row #' @return Data frame with replicated rows #' @export #' @examples #' df <- data.frame(a = 1:3, b = letters[1:3]) #' replicateRows(df, 2) #' replicateRows <- function(df, n) { indices <- rep(seq_len(nrow(df)), each = n) replicated_df <- df[indices, , drop = FALSE] rownames(replicated_df) <- NULL return(replicated_df) } #' Get sample metadata #' #' Generates sample metadata based on the input variables, replication matrix, and number of genes. #' #' @param list_var A list of variables (already initialized) #' @param replicationMatrix Replication matrix #' @param n_genes Number of genes #' @return Data frame of sample metadata #' @importFrom data.table setorderv #' @export #' @examples #' list_var <- init_variable() #' replicationMatrix <- generateReplicationMatrix(list_var, 3, 3) #' getSampleMetadata(list_var, replicationMatrix) getSampleMetadata <- function(list_var, replicationMatrix) { l_sampleIDs = getSampleID(list_var) metaData <- generateGridCombination_fromListVar(list_var) metaData[] <- lapply(metaData, as.character) ## before reordering data.table::setorderv(metaData, cols = colnames(metaData)) metaData[] <- lapply(metaData, as.factor) metaData$sampleID <- l_sampleIDs rep_list <- colSums(replicationMatrix) metaData$sampleID <- as.character(metaData$sampleID) ## before replicating sampleMetadata <- replicateByGroup(metaData, "sampleID", rep_list) colnames(sampleMetadata) <- gsub("label_", "", colnames(sampleMetadata)) return(sampleMetadata) } #' getSampleID #' #' @param list_var A list of variables (already initialized) #' @export #' @return A sorted vector of sample IDs #' @examples #' getSampleID(init_variable()) getSampleID <- function(list_var){ gridCombination <- generateGridCombination_fromListVar(list_var) l_sampleID <- apply( gridCombination , 1 , paste , collapse = "_" ) %>% unname() return(sort(l_sampleID)) }