Skip to content
Snippets Groups Projects
Select Git revision
  • 0542b1c42c74ba5d203edae6e8a7cc50d3238b5b
  • master default protected
  • v2.1.1
  • v2.1.0
4 results

simulation.R

Blame
  • simulation.R 12.53 KiB
    # WARNING - Generated by {fusen} from dev/flat_full.Rmd: do not edit by hand
    
    #' Get input for simulation based on coefficients
    #'
    #' This function generates input data for simulation based on the coefficients provided in the \code{list_var} argument.
    #'
    #' @param list_var A list of variables (already initialized)
    #' @param n_genes Number of genes to simulate (default: 1)
    #' @param input2mvrnorm Input to the \code{mvrnorm} function for simulating data from multivariate normal distribution (default: NULL)
    #' @return A data frame with input coefficients for simulation
    #' @export
    #' @examples
    #' # Example usage
    #' list_var <- init_variable()
    #' getInput2simulation(list_var, n_genes = 10)
    getInput2simulation <- function(list_var, n_genes = 1, input2mvrnorm = NULL) {
      
      # Use default input to mvrnorm if not provided by the user
      if (is.null(input2mvrnorm)) input2mvrnorm = getInput2mvrnorm(list_var)  
    
      l_dataFromMvrnorm = getDataFromMvrnorm(list_var, input2mvrnorm, n_genes)
      l_dataFromUser = getDataFromUser(list_var)
      df_input2simu <- getCoefficients(list_var, l_dataFromMvrnorm, l_dataFromUser, n_genes)
      return(df_input2simu)
    }
    
    #' getCoefficients
    #'
    #' Get the coefficients.
    #'
    #' @param list_var A list of variables (already initialized)
    #' @param l_dataFromMvrnorm Data from the `getGeneMetadata` function (optional).
    #' @param l_dataFromUser Data from the `getDataFromUser` function (optional).
    #' @param n_genes The number of genes.
    #' @export
    #' @return A dataframe containing the coefficients.
    #' @examples
    #' # Example usage
    #' list_var <- init_variable()
    #' input2mvrnorm = getInput2mvrnorm(list_var)
    #' l_dataFromMvrnorm = getDataFromMvrnorm(list_var, input2mvrnorm, n_genes)
    #' l_dataFromUser = getDataFromUser(list_var)
    #' getCoefficients(list_var, l_dataFromMvrnorm, l_dataFromUser, n_genes = 3)
    getCoefficients <- function(list_var, l_dataFromMvrnorm, l_dataFromUser, n_genes) {
      if (length(l_dataFromMvrnorm) == 0) {
        metaData <- getGeneMetadata(list_var, n_genes)
        l_dataFromMvrnorm <- list(metaData)
      }
      l_df2join <- c(l_dataFromMvrnorm, l_dataFromUser)
      
      
      df_coef <- Reduce(function(d1, d2){ column_names = colnames(d2)
                                          idx_key = grepl(pattern = "label", column_names )
                                          keys = column_names[idx_key]
                                          join_dtf(d1, d2, k1 = keys , k2 = keys)
                                        } 
                        , l_df2join ) %>% as.data.frame()
      column_names <- colnames(df_coef)
      idx_column2factor <- grep(pattern = "label_", column_names)
      
      if (length(idx_column2factor) > 1) {
        df_coef[, idx_column2factor] <- lapply(df_coef[, idx_column2factor], as.factor)
      } else {
        df_coef[, idx_column2factor] <- as.factor(df_coef[, idx_column2factor])
      }
      
      return(df_coef)
    }
    
    
    #' Get the log_qij values from the coefficient data frame.
    #'
    #' @param dtf_coef The coefficient data frame.
    #' @return The coefficient data frame with log_qij column added.
    #' @export
    #' @examples
    #' list_var <- init_variable()
    #' dtf_coef <- getInput2simulation(list_var, 10)
    #' dtf_coef <- getLog_qij(dtf_coef)
    getLog_qij <- function(dtf_coef) {
      dtf_beta_numeric <- dtf_coef[sapply(dtf_coef, is.numeric)]
      dtf_coef$log_qij <- rowSums(dtf_beta_numeric, na.rm = TRUE)
      return(dtf_coef)
    }
    
    
    #' Calculate mu_ij values based on coefficient data frame and scaling factor
    #'
    #' This function calculates mu_ij values by raising 2 to the power of the log_qij values
    #' from the coefficient data frame and multiplying it by the provided scaling factor.
    #'
    #' @param dtf_coef Coefficient data frame containing the log_qij values
    #'
    #' @return Coefficient data frame with an additional mu_ij column
    #'
    #' @examples
    #' list_var <- init_variable()
    #' dtf_coef <- getInput2simulation(list_var, 10)
    #' dtf_coef <- getLog_qij(dtf_coef)
    #' dtf_coef <- addBasalExpression(dtf_coef, 10, c(10, 20, 0))
    #' getMu_ij(dtf_coef)
    #' @export
    getMu_ij <- function(dtf_coef) {
      log_qij_scaled <- dtf_coef$log_qij + dtf_coef$basalExpr
      dtf_coef$log_qij_scaled <- log_qij_scaled
      mu_ij <- exp(log_qij_scaled)  
      dtf_coef$mu_ij <- mu_ij
      return(dtf_coef)
    }
    
    #' getMu_ij_matrix
    #'
    #' Get the Mu_ij matrix.
    #'
    #' @param dtf_coef A dataframe containing the coefficients.
    #' @importFrom reshape2 dcast
    #' @importFrom stats as.formula
    
    #' @export
    #' @return A Mu_ij matrix.
    #' @examples
    #' list_var <- init_variable()
    #' dtf_coef <- getInput2simulation(list_var, 10)
    #' dtf_coef <- getLog_qij(dtf_coef)
    #' dtf_coef <- addBasalExpression(dtf_coef, 10, c(10, 20, 0))
    #' dtf_coef<- getMu_ij(dtf_coef)
    #' getMu_ij_matrix(dtf_coef)
    getMu_ij_matrix <- function(dtf_coef) {
      column_names <- colnames(dtf_coef)
      idx_var <- grepl(pattern = "label", column_names)
      l_var <- column_names[idx_var]
      str_formula_rigth <- paste(l_var, collapse = " + ")
      if (str_formula_rigth == "") stop("no variable label detected")
      str_formula <- paste(c("geneID", str_formula_rigth), collapse = " ~ ")
      formula <- stats::as.formula(str_formula)
      dtf_Muij <- dtf_coef %>% reshape2::dcast(formula = formula, value.var = "mu_ij", drop = F)
      dtf_Muij[is.na(dtf_Muij)] <- 0
      mtx_Muij <- data.frame(dtf_Muij[, -1], row.names = dtf_Muij[, 1]) %>% as.matrix()
      mtx_Muij <- mtx_Muij[, order(colnames(mtx_Muij)), drop = F]
      return(mtx_Muij)
    }
    
    #' getSubCountsTable
    #'
    #' Get the subcounts table.
    #'
    #' @param matx_Muij The Mu_ij matrix.
    #' @param matx_dispersion The dispersion matrix.
    #' @param replicateID The replication identifier.
    #' @param l_bool_replication A boolean vector indicating the replicates.
    #' @importFrom stats rnbinom
    #' 
    #' @return A subcounts table.
    getSubCountsTable <- function(matx_Muij, matx_dispersion, replicateID, l_bool_replication) {
      getKijMatrix <- function(matx_Muij, matx_dispersion, n_genes, n_samples) {
        k_ij <- stats::rnbinom(n_genes * n_samples,
                               size = matx_dispersion,
                               mu = matx_Muij) %>%
                  matrix(nrow = n_genes, ncol = n_samples)
        
        k_ij[is.na(k_ij)] <- 0
        return(k_ij)
      }
      
      if (!any(l_bool_replication))
        return(NULL) 
      
      matx_Muij <- matx_Muij[, l_bool_replication, drop = FALSE]
      matx_dispersion <- matx_dispersion[, l_bool_replication, drop = FALSE] 
      l_sampleID <- colnames(matx_Muij)
      l_geneID <- rownames(matx_Muij)
      dimension_mtx <- dim(matx_Muij)
      n_genes <- dimension_mtx[1]
      n_samples <- dimension_mtx[2]
      matx_kij <- getKijMatrix(matx_Muij, matx_dispersion, n_genes, n_samples)
      colnames(matx_kij) <- paste(l_sampleID, replicateID, sep = "_")
      rownames(matx_kij) <- l_geneID
      return(matx_kij)
    }
    
    #' getReplicationMatrix
    #'
    #' @param minN Minimum number of replicates for each sample
    #' @param maxN Maximum number of replicates for each sample
    #' @param n_samples Number of samples
    #' @export
    #' @return A replication matrix indicating which samples are replicated
    getReplicationMatrix <- function(minN, maxN, n_samples) {
      
      # Create a list of logical vectors representing the minimum number of replicates
      l_replication_minimum = lapply(1:n_samples, 
                                     FUN = function(i) rep(TRUE, times = minN) )
      
      # Create a list of random logical vectors representing additional replicates
      l_replication_random = lapply(1:n_samples, 
                                    FUN = function(i) sample(x = c(TRUE, FALSE), size = maxN-minN, replace = T) )
      
      # Combine the replication vectors into matrices
      matx_replication_minimum <- do.call(cbind, l_replication_minimum)
      matx_replication_random <- do.call(cbind, l_replication_random)
      
      # Combine the minimum replicates and random replicates into a single matrix
      matx_replication <- rbind(matx_replication_minimum, matx_replication_random)
      
      # Sort the columns of the replication matrix in descending order
      matx_replication = apply(matx_replication, 2, sort, decreasing = TRUE ) %>% matrix(nrow = maxN)
      
      return(matx_replication)
    }
    
    #' getCountsTable
    #'
    #' @param matx_Muij Matrix of mean expression values for each gene and sample
    #' @param matx_dispersion Matrix of dispersion values for each gene and sample
    #' @param matx_bool_replication Replication matrix indicating which samples are replicated
    #'
    #' @return A counts table containing simulated read counts for each gene and sample
    getCountsTable <- function(matx_Muij ,  matx_dispersion, matx_bool_replication ){
      max_replicates <-  dim(matx_bool_replication)[1]
      
      # Apply the getSubCountsTable function to each row of the replication matrix
      l_countsTable = lapply(1:max_replicates, function(i) getSubCountsTable(matx_Muij , matx_dispersion, i, matx_bool_replication[i,]  ))
      
      # Combine the counts tables into a single matrix
      countsTable = do.call(cbind, l_countsTable)
      
      return(countsTable %>% as.data.frame())
    }
    
    #' getDispersionMatrix
    #'
    #' @param list_var A list of variables (already initialized)
    #' @param n_genes Number of genes
    #' @param dispersion Vector of dispersion values for each gene
    #' @export
    #'
    #' @return A matrix of dispersion values for each gene and sample
    getDispersionMatrix <- function(list_var, n_genes, dispersion = stats::runif(n_genes, min = 0, max = 1000)){
      l_geneID = paste("gene", 1:n_genes, sep = "")
      l_sampleID = getSampleID(list_var) 
      n_samples = length(l_sampleID) 
      l_dispersion <- dispersion
      
      # Create a data frame for the dispersion values
      dtf_dispersion = list(dispersion =  l_dispersion) %>% as.data.frame()
      dtf_dispersion <- dtf_dispersion[, rep("dispersion", n_samples)]
      rownames(dtf_dispersion) = l_geneID
      colnames(dtf_dispersion) = l_sampleID
      
      matx_dispersion = dtf_dispersion %>% as.matrix()
      
      return(matx_dispersion)
    }
    
    
    
    
    
    #' Replicate rows of a data frame by group
    #'
    #' Replicates the rows of a data frame based on a grouping variable and replication counts for each group.
    #'
    #' @param df Data frame to replicate
    #' @param group_var Name of the grouping variable in the data frame
    #' @param rep_list Vector of replication counts for each group
    #' @return Data frame with replicated rows
    #' @examples
    #' df <- data.frame(group = c("A", "B"), value = c(1, 2))
    #' replicateByGroup(df, "group", c(2, 3))
    #'
    #' @export
    replicateByGroup <- function(df, group_var, rep_list) {
      l_group_var <- df[[group_var]]
      group_levels <- unique(l_group_var)
      names(rep_list) <- group_levels
      group_indices <- rep_list[l_group_var]
      replicated_indices <- rep(seq_len(nrow(df)), times = group_indices)
      replicated_df <- df[replicated_indices, ]
      suffix_sampleID <- sequence(group_indices)
      replicated_df[["sampleID"]] <- paste(replicated_df[["sampleID"]], suffix_sampleID, sep = "_")
      rownames(replicated_df) <- NULL
      return(replicated_df)
    }
    
    
    
    #' Replicate rows of a data frame
    #'
    #' Replicates the rows of a data frame by a specified factor.
    #'
    #' @param df Data frame to replicate
    #' @param n Replication factor for each row
    #' @return Data frame with replicated rows
    #' @export
    #' @examples
    #' df <- data.frame(a = 1:3, b = letters[1:3])
    #' replicateRows(df, 2)
    #'
    replicateRows <- function(df, n) {
      indices <- rep(seq_len(nrow(df)), each = n)
      replicated_df <- df[indices, , drop = FALSE]
      rownames(replicated_df) <- NULL
      return(replicated_df)
    }
    
    #' Get sample metadata
    #'
    #' Generates sample metadata based on the input variables, replication matrix, and number of genes.
    #'
    #' @param list_var A list of variables (already initialized)
    #' @param replicationMatrix Replication matrix
    #' @param n_genes Number of genes
    #' @return Data frame of sample metadata
    #' @importFrom data.table setorderv
    #' @export
    #' @examples
    #' list_var <- init_variable()
    #' n_genes <- 10
    #' replicationMatrix <- generateReplicationMatrix(list_var ,2, 3)
    #' getSampleMetadata(list_var, n_genes,  replicationMatrix)
    getSampleMetadata <- function(list_var, n_genes, replicationMatrix) {
      l_sampleIDs = getSampleID(list_var)
      metaData <- generateGridCombination_fromListVar(list_var)
      metaData[] <- lapply(metaData, as.character) ## before reordering
      data.table::setorderv(metaData, cols = colnames(metaData))
      metaData[] <- lapply(metaData, as.factor)
      metaData$sampleID <- l_sampleIDs
      rep_list <- colSums(replicationMatrix)
      metaData$sampleID <- as.character(metaData$sampleID) ## before replicating
      sampleMetadata <- replicateByGroup(metaData, "sampleID", rep_list)
      colnames(sampleMetadata) <- gsub("label_", "", colnames(sampleMetadata))
      return(sampleMetadata)
    }
    
    
    #' getSampleID
    #'
    #' @param list_var A list of variables (already initialized)
    #' @export
    #' @return A sorted vector of sample IDs
    #' @examples
    #' getSampleID(init_variable())
    getSampleID <- function(list_var){
      gridCombination <- generateGridCombination_fromListVar(list_var)
      l_sampleID <- apply( gridCombination , 1 , paste , collapse = "_" ) %>% unname()
      return(sort(l_sampleID))
    }