Skip to content
Snippets Groups Projects
Select Git revision
  • c2ac944e7cc9ba1e3a0dcd2fc9aed5eaa71361b8
  • master default protected
  • v2.1.1
  • v2.1.0
4 results

simulation.R

Blame
  • aduvermy's avatar
    Arnaud Duvermy authored
    Former-commit-id: 2a636c88881bc0466afb7c71438b2b4dd44b062f
    Former-commit-id: b685e4b01275a4c06316c731a96215e27eda2d69
    Former-commit-id: a5f9dbc6bf9e17d6b9d50030b50de463744dae95
    ca65a361
    History
    simulation.R 17.23 KiB
    # WARNING - Generated by {fusen} from dev/flat_full.Rmd: do not edit by hand
    
    
    #' Get input for simulation based on coefficients
    #'
    #' This function generates input data for simulation based on the coefficients provided in the \code{list_var} argument.
    #'
    #' @param list_var A list of variables (already initialized)
    #' @param n_genes Number of genes to simulate (default: 1)
    #' @param normal_distr Specifies the distribution type for generating effects. Choose between 'univariate' (default) or 'multivariate' .
    #' - 'univariate': Effects are drawn independently from univariate normal distributions. 
    #' - 'multivariate': Effects are drawn jointly from a multivariate normal distribution. (not recommended)
    #' @param input2mvrnorm Input to the \code{mvrnorm} function for simulating data from multivariate normal distribution (default: NULL)
    #' @return A data frame with input coefficients for simulation
    #' @export
    #' @examples
    #' # Example usage
    #' list_var <- init_variable()
    #' getInput2simulation(list_var, n_genes = 10)
    getInput2simulation <- function(list_var, n_genes = 1, normal_distr = "univariate",  input2mvrnorm = NULL) {
      
      stopifnot( normal_distr %in% c("multivariate", "univariate") )
    
      if (normal_distr == "multivariate"){
          if (is.null(input2mvrnorm)) input2mvrnorm = getInput2mvrnorm(list_var)    
          l_dataFrom_normdistr <- getDataFromMvrnorm(list_var, input2mvrnorm, n_genes)
        } 
      if (normal_distr == "univariate"){
          l_dataFrom_normdistr <- getDataFromRnorm(list_var, n_genes)
        }
      
      l_dataFromUser = getDataFromUser(list_var)
      
      df_input2simu <- getCoefficients(list_var, l_dataFrom_normdistr, l_dataFromUser, n_genes)
      
      return(df_input2simu)
    }
    
    
    
    #' Get the reference level for categorical variables in the data
    #'
    #' This function extracts the reference level for each categorical variable in the data.
    #' The reference level is the first level encountered for each categorical variable.
    #'
    #' @param data The data frame containing the categorical variables.
    #' @return A list containing the reference level for each categorical variable.
    #' @export
    getRefLevel <- function(data){
      col_names <- colnames(data)
      categorical_vars <- col_names[grepl(col_names, pattern = "label_")]
      if (length(categorical_vars) == 1){
        l_labels <- list()
        l_labels[[categorical_vars]] <- levels(data[, categorical_vars])
        
      } else l_labels <- lapply(data[, categorical_vars], levels)
      l_labels_ref <- sapply(l_labels, function(vec) vec[1])
      return(l_labels_ref)
    }
    
    #' Replace the effect by 0 in the data
    #'
    #' This function replaces the effect in interactions columns by 0, when needed.
    #'
    #' @param list_var The list of variables containing the effects to modify.
    #' @param l_labels_ref A list containing the reference level for each categorical variable.
    #' @param data The data frame containing the effects to modify.
    #' @return The modified data frame 
    #' @export
    replaceUnexpectedInteractionValuesBy0 <- function(list_var, l_labels_ref , data){
      varInteraction <- getListVar(list_var$interactions)
      df_interaction_with0 <- sapply(varInteraction, function(var){
        categorical_var <- paste("label", unlist(strsplit(var, ":")), sep = "_")
        bool_matrix <- sapply(categorical_var, function(uniq_cat_var) data[uniq_cat_var] ==  l_labels_ref[uniq_cat_var])
        idx_0 <- rowSums(bool_matrix) > 0 ## line without interactions effects
        return(replace(data[[var]], idx_0, 0)) 
      })
      col_names <- colnames(data)
      categorical_vars <- col_names[grepl(col_names, pattern = "label_")]
      data[, varInteraction] <- df_interaction_with0
      return(data)
    }
    
    
    
    
    #' Prepare data using effects from a normal distribution
    #'
    #' Prepares the data by generating effects from a normal distribution for each gene.
    #'
    #' @param list_var A list of variables (already initialized)
    #' @param n_genes Number of genes to generate data for.
    #' @return A dataframe containing gene metadata and effects generated from a normal distribution.
    #' @export
    getDataFromRnorm <- function(list_var, n_genes){
        ## -- check if all data have been provided by user
        if (is.null(getInput2mvrnorm(list_var)$covMatrix))
            return(list())
        metadata <- getGeneMetadata(list_var , n_genes)
        df_effects <- get_effects_from_rnorm(list_var, metadata)
        data <- cbind(metadata, df_effects)  
        
        if(!is.null(getListVar(list_var$interactions))){
          l_labels_ref <- getRefLevel(data)
          data <- replaceUnexpectedInteractionValuesBy0(list_var, l_labels_ref, data)
        }
        
        return(list(data))
    }
    
    #' Generate effects from a normal distribution
    #'
    #' Generates effects from a normal distribution for each gene.
    #'
    #' @param list_var A list of variables (already initialized)
    #' @param metadata Gene metadata.
    #' @return A dataframe containing effects generated from a normal distribution.
    #' @export
    get_effects_from_rnorm <- function(list_var, metadata){
      
      variable_standard_dev <- getGivenAttribute(list_var, attribute = "sd") %>% unlist()
      interaction_standard_dev <- getGivenAttribute(list_var$interactions, attribute = "sd") %>% unlist()
      list_stdev <- c(variable_standard_dev, interaction_standard_dev)
      
      # -- mu
      variable_mu <- getGivenAttribute(list_var, attribute = "mu") %>% unlist()
      interaction_mu <- getGivenAttribute(list_var$interactions, attribute = "mu") %>% unlist()
      list_mu <- c(variable_mu, interaction_mu)
      
      variable_2rnorm <- names(list_stdev)
      l_effects <- lapply(stats::setNames(variable_2rnorm, variable_2rnorm) , function(var){
        col_labels <- paste("label", unlist(strsplit(var, ":")), sep = "_")
        cols2paste <- c("geneID", col_labels)
        list_combinations <- apply( metadata[ , cols2paste ] , 1 , paste , collapse = "-" )
        list_effects <- unique(list_combinations)
        list_beta <- rnorm(length(list_effects), mean = list_mu[var], sd = list_stdev[var])
        names(list_beta) <- list_effects
        unname(list_beta[list_combinations])
      })
           
      
      df_effects <- do.call("cbind", l_effects)
      return(df_effects)
    }
    
    
    
    
    #' getCoefficients
    #'
    #' Get the coefficients.
    #'
    #' @param list_var A list of variables (already initialized)
    #' @param l_dataFromMvrnorm Data from the `getGeneMetadata` function (optional).
    #' @param l_dataFromUser Data from the `getDataFromUser` function (optional).
    #' @param n_genes The number of genes.
    #' @export
    #' @return A dataframe containing the coefficients.
    #' @examples
    #' # Example usage
    #' list_var <- init_variable()
    #' input2mvrnorm = getInput2mvrnorm(list_var)
    #' l_dataFromMvrnorm = getDataFromMvrnorm(list_var, input2mvrnorm, n_genes=3)
    #' l_dataFromUser = getDataFromUser(list_var)
    #' getCoefficients(list_var, l_dataFromMvrnorm, l_dataFromUser, n_genes = 3)
    getCoefficients <- function(list_var, l_dataFromMvrnorm, l_dataFromUser, n_genes) {
      if (length(l_dataFromMvrnorm) == 0) {
        metaData <- getGeneMetadata(list_var, n_genes)
        l_dataFromMvrnorm <- list(metaData)
      }
      l_df2join <- c(l_dataFromMvrnorm, l_dataFromUser)
      
      
      df_coef <- Reduce(function(d1, d2){ column_names = colnames(d2)
                                          idx_key = grepl(pattern = "label", column_names )
                                          keys = column_names[idx_key]
                                          join_dtf(d1, d2, k1 = keys , k2 = keys)
                                        } 
                        , l_df2join ) %>% as.data.frame()
      column_names <- colnames(df_coef)
      idx_column2factor <- grep(pattern = "label_", column_names)
      
      if (length(idx_column2factor) > 1) {
        df_coef[, idx_column2factor] <- lapply(df_coef[, idx_column2factor], as.factor)
      } else {
        df_coef[, idx_column2factor] <- as.factor(df_coef[, idx_column2factor])
      }
      
      return(df_coef)
    }
    
    
    #' Get the log_qij values from the coefficient data frame.
    #'
    #' @param dtf_coef The coefficient data frame.
    #' @return The coefficient data frame with log_qij column added.
    #' @export
    #' @examples
    #' list_var <- init_variable()
    #' dtf_coef <- getInput2simulation(list_var, 10)
    #' dtf_coef <- getLog_qij(dtf_coef)
    getLog_qij <- function(dtf_coef) {
      dtf_beta_numeric <- dtf_coef[sapply(dtf_coef, is.numeric)]
      dtf_coef$log_qij <- rowSums(dtf_beta_numeric, na.rm = TRUE)
      return(dtf_coef)
    }
    
    
    #' Calculate mu_ij values based on coefficient data frame and scaling factor
    #'
    #' This function calculates mu_ij values by raising 2 to the power of the log_qij values
    #' from the coefficient data frame and multiplying it by the provided scaling factor.
    #'
    #' @param dtf_coef Coefficient data frame containing the log_qij values
    #'
    #' @return Coefficient data frame with an additional mu_ij column
    #'
    #' @examples
    #' list_var <- init_variable()
    #' dtf_coef <- getInput2simulation(list_var, 10)
    #' dtf_coef <- getLog_qij(dtf_coef)
    #' dtf_coef <- addBasalExpression(dtf_coef, 10, c(10, 20, 0))
    #' getMu_ij(dtf_coef)
    #' @export
    getMu_ij <- function(dtf_coef) {
      log_qij_scaled <- dtf_coef$log_qij + dtf_coef$basalExpr
      dtf_coef$log_qij_scaled <- log_qij_scaled
      mu_ij <- exp(log_qij_scaled)  
      dtf_coef$mu_ij <- mu_ij
      return(dtf_coef)
    }
    
    #' getMu_ij_matrix
    #'
    #' Get the Mu_ij matrix.
    #'
    #' @param dtf_coef A dataframe containing the coefficients.
    #' @importFrom reshape2 dcast
    #' @importFrom stats as.formula
    
    #' @export
    #' @return A Mu_ij matrix.
    #' @examples
    #' list_var <- init_variable()
    #' dtf_coef <- getInput2simulation(list_var, 10)
    #' dtf_coef <- getLog_qij(dtf_coef)
    #' dtf_coef <- addBasalExpression(dtf_coef, 10, c(10, 20, 0))
    #' dtf_coef<- getMu_ij(dtf_coef)
    #' getMu_ij_matrix(dtf_coef)
    getMu_ij_matrix <- function(dtf_coef) {
      column_names <- colnames(dtf_coef)
      idx_var <- grepl(pattern = "label", column_names)
      l_var <- column_names[idx_var]
      str_formula_rigth <- paste(l_var, collapse = " + ")
      if (str_formula_rigth == "") stop("no variable label detected")
      str_formula <- paste(c("geneID", str_formula_rigth), collapse = " ~ ")
      formula <- stats::as.formula(str_formula)
      dtf_Muij <- dtf_coef %>% reshape2::dcast(formula = formula, value.var = "mu_ij", drop = F)
      dtf_Muij[is.na(dtf_Muij)] <- 0
      mtx_Muij <- data.frame(dtf_Muij[, -1], row.names = dtf_Muij[, 1]) %>% as.matrix()
      mtx_Muij <- mtx_Muij[, order(colnames(mtx_Muij)), drop = F]
      return(mtx_Muij)
    }
    
    #' getSubCountsTable
    #'
    #' Get the subcounts table.
    #'
    #' @param matx_Muij The Mu_ij matrix.
    #' @param matx_dispersion The dispersion matrix.
    #' @param replicateID The replication identifier.
    #' @param l_bool_replication A boolean vector indicating the replicates.
    #' @importFrom stats rnbinom
    #' 
    #' @return A subcounts table.
    getSubCountsTable <- function(matx_Muij, matx_dispersion, replicateID, l_bool_replication) {
      getKijMatrix <- function(matx_Muij, matx_dispersion, n_genes, n_samples) {
        k_ij <- stats::rnbinom(n_genes * n_samples,
                               size = matx_dispersion,
                               mu = matx_Muij) %>%
                  matrix(nrow = n_genes, ncol = n_samples)
        
        k_ij[is.na(k_ij)] <- 0
        return(k_ij)
      }
      
      if (!any(l_bool_replication))
        return(NULL) 
      
      matx_Muij <- matx_Muij[, l_bool_replication, drop = FALSE]
      matx_dispersion <- matx_dispersion[, l_bool_replication, drop = FALSE] 
      l_sampleID <- colnames(matx_Muij)
      l_geneID <- rownames(matx_Muij)
      dimension_mtx <- dim(matx_Muij)
      n_genes <- dimension_mtx[1]
      n_samples <- dimension_mtx[2]
      matx_kij <- getKijMatrix(matx_Muij, matx_dispersion, n_genes, n_samples)
      colnames(matx_kij) <- paste(l_sampleID, replicateID, sep = "_")
      rownames(matx_kij) <- l_geneID
      return(matx_kij)
    }
    
    #' getReplicationMatrix
    #'
    #' @param minN Minimum number of replicates for each sample
    #' @param maxN Maximum number of replicates for each sample
    #' @param n_samples Number of samples
    #' @export
    #' @return A replication matrix indicating which samples are replicated
    getReplicationMatrix <- function(minN, maxN, n_samples) {
      
      # Create a list of logical vectors representing the minimum number of replicates
      l_replication_minimum = lapply(1:n_samples, 
                                     FUN = function(i) rep(TRUE, times = minN) )
      
      # Create a list of random logical vectors representing additional replicates
      l_replication_random = lapply(1:n_samples, 
                                    FUN = function(i) sample(x = c(TRUE, FALSE), size = maxN-minN, replace = T) )
      
      # Combine the replication vectors into matrices
      matx_replication_minimum <- do.call(cbind, l_replication_minimum)
      matx_replication_random <- do.call(cbind, l_replication_random)
      
      # Combine the minimum replicates and random replicates into a single matrix
      matx_replication <- rbind(matx_replication_minimum, matx_replication_random)
      
      # Sort the columns of the replication matrix in descending order
      matx_replication = apply(matx_replication, 2, sort, decreasing = TRUE ) %>% matrix(nrow = maxN)
      
      return(matx_replication)
    }
    
    #' getCountsTable
    #'
    #' @param matx_Muij Matrix of mean expression values for each gene and sample
    #' @param matx_dispersion Matrix of dispersion values for each gene and sample
    #' @param matx_bool_replication Replication matrix indicating which samples are replicated
    #'
    #' @return A counts table containing simulated read counts for each gene and sample
    getCountsTable <- function(matx_Muij ,  matx_dispersion, matx_bool_replication ){
      max_replicates <-  dim(matx_bool_replication)[1]
      
      # Apply the getSubCountsTable function to each row of the replication matrix
      l_countsTable = lapply(1:max_replicates, function(i) getSubCountsTable(matx_Muij , matx_dispersion, i, matx_bool_replication[i,]  ))
      
      # Combine the counts tables into a single matrix
      countsTable = do.call(cbind, l_countsTable)
      
      return(countsTable %>% as.data.frame())
    }
    
    #' getDispersionMatrix
    #'
    #' @param list_var A list of variables (already initialized)
    #' @param n_genes Number of genes
    #' @param dispersion Vector of dispersion values for each gene
    #' @export
    #'
    #' @return A matrix of dispersion values for each gene and sample
    getDispersionMatrix <- function(list_var, n_genes, dispersion = stats::runif(n_genes, min = 0, max = 1000)){
      l_geneID = paste("gene", 1:n_genes, sep = "")
      l_sampleID = getSampleID(list_var) 
      n_samples = length(l_sampleID) 
      l_dispersion <- dispersion
      
      # Create a data frame for the dispersion values
      dtf_dispersion = list(dispersion =  l_dispersion) %>% as.data.frame()
      dtf_dispersion <- dtf_dispersion[, rep("dispersion", n_samples)]
      rownames(dtf_dispersion) = l_geneID
      colnames(dtf_dispersion) = l_sampleID
      
      matx_dispersion = dtf_dispersion %>% as.matrix()
      
      return(matx_dispersion)
    }
    
    
    
    
    
    #' Replicate rows of a data frame by group
    #'
    #' Replicates the rows of a data frame based on a grouping variable and replication counts for each group.
    #'
    #' @param df Data frame to replicate
    #' @param group_var Name of the grouping variable in the data frame
    #' @param rep_list Vector of replication counts for each group
    #' @return Data frame with replicated rows
    #' @examples
    #' df <- data.frame(group = c("A", "B"), value = c(1, 2))
    #' replicateByGroup(df, "group", c(2, 3))
    #'
    #' @export
    replicateByGroup <- function(df, group_var, rep_list) {
      l_group_var <- df[[group_var]]
      group_levels <- unique(l_group_var)
      names(rep_list) <- group_levels
      group_indices <- rep_list[l_group_var]
      replicated_indices <- rep(seq_len(nrow(df)), times = group_indices)
      replicated_df <- df[replicated_indices, ]
      suffix_sampleID <- sequence(group_indices)
      replicated_df[["sampleID"]] <- paste(replicated_df[["sampleID"]], suffix_sampleID, sep = "_")
      rownames(replicated_df) <- NULL
      return(replicated_df)
    }
    
    
    
    #' Replicate rows of a data frame
    #'
    #' Replicates the rows of a data frame by a specified factor.
    #'
    #' @param df Data frame to replicate
    #' @param n Replication factor for each row
    #' @return Data frame with replicated rows
    #' @export
    #' @examples
    #' df <- data.frame(a = 1:3, b = letters[1:3])
    #' replicateRows(df, 2)
    #'
    replicateRows <- function(df, n) {
      indices <- rep(seq_len(nrow(df)), each = n)
      replicated_df <- df[indices, , drop = FALSE]
      rownames(replicated_df) <- NULL
      return(replicated_df)
    }
    
    #' Get sample metadata
    #'
    #' Generates sample metadata based on the input variables, replication matrix, and number of genes.
    #'
    #' @param list_var A list of variables (already initialized)
    #' @param replicationMatrix Replication matrix
    #' @param n_genes Number of genes
    #' @return Data frame of sample metadata
    #' @importFrom data.table setorderv
    #' @export
    #' @examples
    #' list_var <- init_variable()
    #' replicationMatrix <- generateReplicationMatrix(list_var, 3)
    #' getSampleMetadata(list_var,  replicationMatrix)
    getSampleMetadata <- function(list_var, replicationMatrix) {
      l_sampleIDs = getSampleID(list_var)
      metaData <- generateGridCombination_fromListVar(list_var)
      metaData[] <- lapply(metaData, as.character) ## before reordering
      data.table::setorderv(metaData, cols = colnames(metaData))
      metaData[] <- lapply(metaData, as.factor)
      metaData$sampleID <- l_sampleIDs
      rep_list <- colSums(replicationMatrix)
      metaData$sampleID <- as.character(metaData$sampleID) ## before replicating
      sampleMetadata <- replicateByGroup(metaData, "sampleID", rep_list)
      colnames(sampleMetadata) <- gsub("label_", "", colnames(sampleMetadata))
      return(sampleMetadata)
    }
    
    
    #' getSampleID
    #'
    #' @param list_var A list of variables (already initialized)
    #' @export
    #' @return A sorted vector of sample IDs
    #' @examples
    #' getSampleID(init_variable())
    getSampleID <- function(list_var){
      gridCombination <- generateGridCombination_fromListVar(list_var)
      l_sampleID <- apply( gridCombination , 1 , paste , collapse = "_" ) %>% unname()
      return(sort(l_sampleID))
    }