# WARNING - Generated by {fusen} from /dev/flat_full.Rmd: do not edit by hand #' Get input for simulation based on coefficients #' #' This function generates input data for simulation based on the coefficients provided in the \code{list_var} argument. #' #' @param list_var A list of variables (already initialized) #' @param n_genes Number of genes to simulate (default: 1) #' @param input2mvrnorm Input to the \code{mvrnorm} function for simulating data from multivariate normal distribution (default: NULL) #' @return A data frame with input coefficients for simulation #' @export #' @examples #' # Example usage #' list_var <- init_variable() #' getInput2simulation(list_var, n_genes = 10) getInput2simulation <- function(list_var, n_genes = 1, input2mvrnorm = NULL) { # Use default input to mvrnorm if not provided by the user if (is.null(input2mvrnorm)) input2mvrnorm = getInput2mvrnorm(list_var) l_dataFromMvrnorm = getDataFromMvrnorm(list_var, input2mvrnorm, n_genes) l_dataFromUser = getDataFromUser(list_var) df_input2simu <- getCoefficients(list_var, l_dataFromMvrnorm, l_dataFromUser, n_genes) return(df_input2simu) } #' getCoefficients #' #' Get the coefficients. #' #' @param list_var A list of variables (already initialized) #' @param l_dataFromMvrnorm Data from the `getGeneMetadata` function (optional). #' @param l_dataFromUser Data from the `getDataFromUser` function (optional). #' @param n_genes The number of genes. #' @export #' @return A dataframe containing the coefficients. #' @examples #' # Example usage #' list_var <- init_variable() #' input2mvrnorm = getInput2mvrnorm(list_var) #' l_dataFromMvrnorm = getDataFromMvrnorm(list_var, input2mvrnorm, n_genes) #' l_dataFromUser = getDataFromUser(list_var) #' getCoefficients(list_var, l_dataFromMvrnorm, l_dataFromUser, n_genes = 3) getCoefficients <- function(list_var, l_dataFromMvrnorm, l_dataFromUser, n_genes) { if (length(l_dataFromMvrnorm) == 0) { metaData <- getGeneMetadata(list_var, n_genes) l_dataFromMvrnorm <- list(metaData) } l_df2join <- c(l_dataFromMvrnorm, l_dataFromUser) df_coef <- Reduce(function(d1, d2){ column_names = colnames(d2) idx_key = grepl(pattern = "label", column_names ) keys = column_names[idx_key] join_dtf(d1, d2, k1 = keys , k2 = keys) } , l_df2join ) %>% as.data.frame() column_names <- colnames(df_coef) idx_column2factor <- grep(pattern = "label_", column_names) if (length(idx_column2factor) > 1) { df_coef[, idx_column2factor] <- lapply(df_coef[, idx_column2factor], as.factor) } else { df_coef[, idx_column2factor] <- as.factor(df_coef[, idx_column2factor]) } return(df_coef) } #' Get the log_qij values from the coefficient data frame. #' #' @param dtf_coef The coefficient data frame. #' @return The coefficient data frame with log_qij column added. #' @export #' @examples #' list_var <- init_variable() #' dtf_coef <- getInput2simulation(list_var, 10) #' dtf_coef <- getLog_qij(dtf_coef) getLog_qij <- function(dtf_coef) { dtf_beta_numeric <- dtf_coef[sapply(dtf_coef, is.numeric)] dtf_coef$log_qij <- rowSums(dtf_beta_numeric, na.rm = TRUE) return(dtf_coef) } #' Calculate mu_ij values based on coefficient data frame and scaling factor #' #' This function calculates mu_ij values by raising 2 to the power of the log_qij values #' from the coefficient data frame and multiplying it by the provided scaling factor. #' #' @param dtf_coef Coefficient data frame containing the log_qij values #' #' @return Coefficient data frame with an additional mu_ij column #' #' @examples #' list_var <- init_variable() #' dtf_coef <- getInput2simulation(list_var, 10) #' dtf_coef <- getLog_qij(dtf_coef) #' dtf_coef <- addBasalExpression(dtf_coef, 10, c(10, 20, 0)) #' getMu_ij(dtf_coef) #' @export getMu_ij <- function(dtf_coef) { log_qij_scaled <- dtf_coef$log_qij + dtf_coef$basalExpr dtf_coef$log_qij_scaled <- log_qij_scaled mu_ij <- exp(log_qij_scaled) dtf_coef$mu_ij <- mu_ij return(dtf_coef) } #' getMu_ij_matrix #' #' Get the Mu_ij matrix. #' #' @param dtf_coef A dataframe containing the coefficients. #' @importFrom reshape2 dcast #' @importFrom stats as.formula #' @export #' @return A Mu_ij matrix. #' @examples #' list_var <- init_variable() #' dtf_coef <- getInput2simulation(list_var, 10) #' dtf_coef <- getLog_qij(dtf_coef) #' dtf_coef <- addBasalExpression(dtf_coef, 10, c(10, 20, 0)) #' dtf_coef<- getMu_ij(dtf_coef) #' getMu_ij_matrix(dtf_coef) getMu_ij_matrix <- function(dtf_coef) { column_names <- colnames(dtf_coef) idx_var <- grepl(pattern = "label", column_names) l_var <- column_names[idx_var] str_formula_rigth <- paste(l_var, collapse = " + ") if (str_formula_rigth == "") stop("no variable label detected") str_formula <- paste(c("geneID", str_formula_rigth), collapse = " ~ ") formula <- stats::as.formula(str_formula) dtf_Muij <- dtf_coef %>% reshape2::dcast(formula = formula, value.var = "mu_ij", drop = F) dtf_Muij[is.na(dtf_Muij)] <- 0 mtx_Muij <- data.frame(dtf_Muij[, -1], row.names = dtf_Muij[, 1]) %>% as.matrix() mtx_Muij <- mtx_Muij[, order(colnames(mtx_Muij)), drop = F] return(mtx_Muij) } #' getSubCountsTable #' #' Get the subcounts table. #' #' @param matx_Muij The Mu_ij matrix. #' @param matx_dispersion The dispersion matrix. #' @param replicateID The replication identifier. #' @param l_bool_replication A boolean vector indicating the replicates. #' @importFrom stats rnbinom #' #' @return A subcounts table. getSubCountsTable <- function(matx_Muij, matx_dispersion, replicateID, l_bool_replication) { getKijMatrix <- function(matx_Muij, matx_dispersion, n_genes, n_samples) { k_ij <- stats::rnbinom(n_genes * n_samples, size = matx_dispersion, mu = matx_Muij) %>% matrix(nrow = n_genes, ncol = n_samples) k_ij[is.na(k_ij)] <- 0 return(k_ij) } if (!any(l_bool_replication)) return(NULL) matx_Muij <- matx_Muij[, l_bool_replication, drop = FALSE] matx_dispersion <- matx_dispersion[, l_bool_replication, drop = FALSE] l_sampleID <- colnames(matx_Muij) l_geneID <- rownames(matx_Muij) dimension_mtx <- dim(matx_Muij) n_genes <- dimension_mtx[1] n_samples <- dimension_mtx[2] matx_kij <- getKijMatrix(matx_Muij, matx_dispersion, n_genes, n_samples) colnames(matx_kij) <- paste(l_sampleID, replicateID, sep = "_") rownames(matx_kij) <- l_geneID return(matx_kij) } #' getReplicationMatrix #' #' @param minN Minimum number of replicates for each sample #' @param maxN Maximum number of replicates for each sample #' @param n_samples Number of samples #' @export #' @return A replication matrix indicating which samples are replicated getReplicationMatrix <- function(minN, maxN, n_samples) { # Create a list of logical vectors representing the minimum number of replicates l_replication_minimum = lapply(1:n_samples, FUN = function(i) rep(TRUE, times = minN) ) # Create a list of random logical vectors representing additional replicates l_replication_random = lapply(1:n_samples, FUN = function(i) sample(x = c(TRUE, FALSE), size = maxN-minN, replace = T) ) # Combine the replication vectors into matrices matx_replication_minimum <- do.call(cbind, l_replication_minimum) matx_replication_random <- do.call(cbind, l_replication_random) # Combine the minimum replicates and random replicates into a single matrix matx_replication <- rbind(matx_replication_minimum, matx_replication_random) # Sort the columns of the replication matrix in descending order matx_replication = apply(matx_replication, 2, sort, decreasing = TRUE ) %>% matrix(nrow = maxN) return(matx_replication) } #' getCountsTable #' #' @param matx_Muij Matrix of mean expression values for each gene and sample #' @param matx_dispersion Matrix of dispersion values for each gene and sample #' @param matx_bool_replication Replication matrix indicating which samples are replicated #' #' @return A counts table containing simulated read counts for each gene and sample getCountsTable <- function(matx_Muij , matx_dispersion, matx_bool_replication ){ max_replicates <- dim(matx_bool_replication)[1] # Apply the getSubCountsTable function to each row of the replication matrix l_countsTable = lapply(1:max_replicates, function(i) getSubCountsTable(matx_Muij , matx_dispersion, i, matx_bool_replication[i,] )) # Combine the counts tables into a single matrix countsTable = do.call(cbind, l_countsTable) return(countsTable %>% as.data.frame()) } #' getDispersionMatrix #' #' @param list_var A list of variables (already initialized) #' @param n_genes Number of genes #' @param dispersion Vector of dispersion values for each gene #' @export #' #' @return A matrix of dispersion values for each gene and sample getDispersionMatrix <- function(list_var, n_genes, dispersion = stats::runif(n_genes, min = 0, max = 1000)){ l_geneID = paste("gene", 1:n_genes, sep = "") l_sampleID = getSampleID(list_var) n_samples = length(l_sampleID) l_dispersion <- dispersion # Create a data frame for the dispersion values dtf_dispersion = list(dispersion = l_dispersion) %>% as.data.frame() dtf_dispersion <- dtf_dispersion[, rep("dispersion", n_samples)] rownames(dtf_dispersion) = l_geneID colnames(dtf_dispersion) = l_sampleID matx_dispersion = dtf_dispersion %>% as.matrix() return(matx_dispersion) } #' Replicate rows of a data frame by group #' #' Replicates the rows of a data frame based on a grouping variable and replication counts for each group. #' #' @param df Data frame to replicate #' @param group_var Name of the grouping variable in the data frame #' @param rep_list Vector of replication counts for each group #' @return Data frame with replicated rows #' @examples #' df <- data.frame(group = c("A", "B"), value = c(1, 2)) #' replicateByGroup(df, "group", c(2, 3)) #' #' @export replicateByGroup <- function(df, group_var, rep_list) { l_group_var <- df[[group_var]] group_levels <- unique(l_group_var) names(rep_list) <- group_levels group_indices <- rep_list[l_group_var] replicated_indices <- rep(seq_len(nrow(df)), times = group_indices) replicated_df <- df[replicated_indices, ] suffix_sampleID <- sequence(group_indices) replicated_df[["sampleID"]] <- paste(replicated_df[["sampleID"]], suffix_sampleID, sep = "_") rownames(replicated_df) <- NULL return(replicated_df) } #' Replicate rows of a data frame #' #' Replicates the rows of a data frame by a specified factor. #' #' @param df Data frame to replicate #' @param n Replication factor for each row #' @return Data frame with replicated rows #' @export #' @examples #' df <- data.frame(a = 1:3, b = letters[1:3]) #' replicateRows(df, 2) #' replicateRows <- function(df, n) { indices <- rep(seq_len(nrow(df)), each = n) replicated_df <- df[indices, , drop = FALSE] rownames(replicated_df) <- NULL return(replicated_df) } #' Get sample metadata #' #' Generates sample metadata based on the input variables, replication matrix, and number of genes. #' #' @param list_var A list of variables (already initialized) #' @param replicationMatrix Replication matrix #' @param n_genes Number of genes #' @return Data frame of sample metadata #' @importFrom data.table setorderv #' @export #' @examples #' list_var <- init_variable() #' n_genes <- 10 #' replicationMatrix <- generateReplicationMatrix(list_var ,2, 3) #' getSampleMetadata(list_var, n_genes, replicationMatrix) getSampleMetadata <- function(list_var, n_genes, replicationMatrix) { l_sampleIDs = getSampleID(list_var) metaData <- generateGridCombination_fromListVar(list_var) metaData[] <- lapply(metaData, as.character) ## before reordering data.table::setorderv(metaData, cols = colnames(metaData)) metaData[] <- lapply(metaData, as.factor) metaData$sampleID <- l_sampleIDs rep_list <- colSums(replicationMatrix) metaData$sampleID <- as.character(metaData$sampleID) ## before replicating sampleMetadata <- replicateByGroup(metaData, "sampleID", rep_list) colnames(sampleMetadata) <- gsub("label_", "", colnames(sampleMetadata)) return(sampleMetadata) } #' getSampleID #' #' @param list_var A list of variables (already initialized) #' @export #' @return A sorted vector of sample IDs #' @examples #' getSampleID(init_variable()) getSampleID <- function(list_var){ gridCombination <- generateGridCombination_fromListVar(list_var) l_sampleID <- apply( gridCombination , 1 , paste , collapse = "_" ) %>% unname() return(sort(l_sampleID)) }