simulation.R

# WARNING - Generated by {fusen} from dev/flat_full.Rmd: do not edit by hand


#' Get input for simulation based on coefficients
#'
#' This function generates input data for simulation based on the coefficients provided in the \code{list_var} argument.
#'
#' @param list_var A list of variables (already initialized)
#' @param n_genes Number of genes to simulate (default: 1)
#' @param normal_distr Specifies the distribution type for generating effects. Choose between 'univariate' (default) or 'multivariate' .
#' - 'univariate': Effects are drawn independently from univariate normal distributions. 
#' - 'multivariate': Effects are drawn jointly from a multivariate normal distribution. (not recommended)
#' @param input2mvrnorm Input to the \code{mvrnorm} function for simulating data from multivariate normal distribution (default: NULL)
#' @return A data frame with input coefficients for simulation
#' @export
#' @examples
#' # Example usage
#' list_var <- init_variable()
#' getInput2simulation(list_var, n_genes = 10)
getInput2simulation <- function(list_var, n_genes = 1, normal_distr = "univariate",  input2mvrnorm = NULL) {
  
  stopifnot( normal_distr %in% c("multivariate", "univariate") )

  if (normal_distr == "multivariate"){
      if (is.null(input2mvrnorm)) input2mvrnorm = getInput2mvrnorm(list_var)    
      l_dataFrom_normdistr <- getDataFromMvrnorm(list_var, input2mvrnorm, n_genes)
    } 
  if (normal_distr == "univariate"){
      l_dataFrom_normdistr <- getDataFromRnorm(list_var, n_genes)
    }
  
  l_dataFromUser = getDataFromUser(list_var)
  
  df_input2simu <- getCoefficients(list_var, l_dataFrom_normdistr, l_dataFromUser, n_genes)
  
  return(df_input2simu)
}


#' Get the reference level for categorical variables in the data
#'
#' This function extracts the reference level for each categorical variable in the data.
#' The reference level is the first level encountered for each categorical variable.
#'
#' @param data The data frame containing the categorical variables.
#' @return A list containing the reference level for each categorical variable.
#' @export
getRefLevel <- function(data){
  col_names <- colnames(data)
  categorical_vars <- col_names[grepl(col_names, pattern = "label_")]
  if (length(categorical_vars) == 1){
    l_labels <- list()
    l_labels[[categorical_vars]] <- levels(data[, categorical_vars])
    
  } else l_labels <- lapply(data[, categorical_vars], levels)
  l_labels_ref <- sapply(l_labels, function(vec) vec[1])
  return(l_labels_ref)
}

#' Replace the effect by 0 in the data
#'
#' This function replaces the effect in interactions columns by 0, when needed.
#'
#' @param list_var The list of variables containing the effects to modify.
#' @param l_labels_ref A list containing the reference level for each categorical variable.
#' @param data The data frame containing the effects to modify.
#' @return The modified data frame 
#' @export
replaceUnexpectedInteractionValuesBy0 <- function(list_var, l_labels_ref , data){
  varInteraction <- getListVar(list_var$interactions)
  df_interaction_with0 <- sapply(varInteraction, function(var){
    categorical_var <- paste("label", unlist(strsplit(var, ":")), sep = "_")
    bool_matrix <- sapply(categorical_var, function(uniq_cat_var) data[uniq_cat_var] ==  l_labels_ref[uniq_cat_var])
    idx_0 <- rowSums(bool_matrix) > 0 ## line without interactions effects
    return(replace(data[[var]], idx_0, 0)) 
  })
  col_names <- colnames(data)
  categorical_vars <- col_names[grepl(col_names, pattern = "label_")]
  data[, varInteraction] <- df_interaction_with0
  return(data)
}


#' Prepare data using effects from a normal distribution
#'
#' Prepares the data by generating effects from a normal distribution for each gene.
#'
#' @param list_var A list of variables (already initialized)
#' @param n_genes Number of genes to generate data for.
#' @return A dataframe containing gene metadata and effects generated from a normal distribution.
#' @export
getDataFromRnorm <- function(list_var, n_genes){
    ## -- check if all data have been provided by user
    if (is.null(getInput2mvrnorm(list_var)$covMatrix))
        return(list())
    metadata <- getGeneMetadata(list_var , n_genes)
    df_effects <- get_effects_from_rnorm(list_var, metadata)
    data <- cbind(metadata, df_effects)  
    
    if(!is.null(getListVar(list_var$interactions))){
      l_labels_ref <- getRefLevel(data)
      data <- replaceUnexpectedInteractionValuesBy0(list_var, l_labels_ref, data)
    }
    
    return(list(data))
}

#' Generate effects from a normal distribution
#'
#' Generates effects from a normal distribution for each gene.
#'
#' @param list_var A list of variables (already initialized)
#' @param metadata Gene metadata.
#' @return A dataframe containing effects generated from a normal distribution.
#' @export
get_effects_from_rnorm <- function(list_var, metadata){
  
  variable_standard_dev <- getGivenAttribute(list_var, attribute = "sd") %>% unlist()
  interaction_standard_dev <- getGivenAttribute(list_var$interactions, attribute = "sd") %>% unlist()
  list_stdev <- c(variable_standard_dev, interaction_standard_dev)
  
  # -- mu
  variable_mu <- getGivenAttribute(list_var, attribute = "mu") %>% unlist()
  interaction_mu <- getGivenAttribute(list_var$interactions, attribute = "mu") %>% unlist()
  list_mu <- c(variable_mu, interaction_mu)
  
  variable_2rnorm <- names(list_stdev)
  l_effects <- lapply(stats::setNames(variable_2rnorm, variable_2rnorm) , function(var){
    col_labels <- paste("label", unlist(strsplit(var, ":")), sep = "_")
    cols2paste <- c("geneID", col_labels)
    list_combinations <- apply( metadata[ , cols2paste ] , 1 , paste , collapse = "-" )
    list_effects <- unique(list_combinations)
    list_beta <- rnorm(length(list_effects), mean = list_mu[var], sd = list_stdev[var])
    names(list_beta) <- list_effects
    unname(list_beta[list_combinations])
  })
       
  
  df_effects <- do.call("cbind", l_effects)
  return(df_effects)
}


#' getCoefficients
#'
#' Get the coefficients.
#'
#' @param list_var A list of variables (already initialized)
#' @param l_dataFromMvrnorm Data from the `getGeneMetadata` function (optional).
#' @param l_dataFromUser Data from the `getDataFromUser` function (optional).
#' @param n_genes The number of genes.
#' @export
#' @return A dataframe containing the coefficients.
#' @examples
#' # Example usage
#' list_var <- init_variable()
#' input2mvrnorm = getInput2mvrnorm(list_var)
#' l_dataFromMvrnorm = getDataFromMvrnorm(list_var, input2mvrnorm, n_genes=3)
#' l_dataFromUser = getDataFromUser(list_var)
#' getCoefficients(list_var, l_dataFromMvrnorm, l_dataFromUser, n_genes = 3)
getCoefficients <- function(list_var, l_dataFromMvrnorm, l_dataFromUser, n_genes) {
  if (length(l_dataFromMvrnorm) == 0) {
    metaData <- getGeneMetadata(list_var, n_genes)
    l_dataFromMvrnorm <- list(metaData)
  }
  l_df2join <- c(l_dataFromMvrnorm, l_dataFromUser)
  
  
  df_coef <- Reduce(function(d1, d2){ column_names = colnames(d2)
                                      idx_key = grepl(pattern = "label", column_names )
                                      keys = column_names[idx_key]
                                      join_dtf(d1, d2, k1 = keys , k2 = keys)
                                    } 
                    , l_df2join ) %>% as.data.frame()
  column_names <- colnames(df_coef)
  idx_column2factor <- grep(pattern = "label_", column_names)
  
  if (length(idx_column2factor) > 1) {
    df_coef[, idx_column2factor] <- lapply(df_coef[, idx_column2factor], as.factor)
  } else {
    df_coef[, idx_column2factor] <- as.factor(df_coef[, idx_column2factor])
  }
  
  return(df_coef)
}


#' Get the log_qij values from the coefficient data frame.
#'
#' @param dtf_coef The coefficient data frame.
#' @return The coefficient data frame with log_qij column added.
#' @export
#' @examples
#' list_var <- init_variable()
#' dtf_coef <- getInput2simulation(list_var, 10)
#' dtf_coef <- getLog_qij(dtf_coef)
getLog_qij <- function(dtf_coef) {
  dtf_beta_numeric <- dtf_coef[sapply(dtf_coef, is.numeric)]
  dtf_coef$log_qij <- rowSums(dtf_beta_numeric, na.rm = TRUE)
  return(dtf_coef)
}


#' Calculate mu_ij values based on coefficient data frame and scaling factor
#'
#' This function calculates mu_ij values by raising 2 to the power of the log_qij values
#' from the coefficient data frame and multiplying it by the provided scaling factor.
#'
#' @param dtf_coef Coefficient data frame containing the log_qij values
#'
#' @return Coefficient data frame with an additional mu_ij column
#'
#' @examples
#' list_var <- init_variable()
#' dtf_coef <- getInput2simulation(list_var, 10)
#' dtf_coef <- getLog_qij(dtf_coef)
#' dtf_coef <- addBasalExpression(dtf_coef, 10, c(10, 20, 0))
#' getMu_ij(dtf_coef)
#' @export
getMu_ij <- function(dtf_coef) {
  log_qij_scaled <- dtf_coef$log_qij + dtf_coef$basalExpr
  dtf_coef$log_qij_scaled <- log_qij_scaled
  mu_ij <- exp(log_qij_scaled)  
  dtf_coef$mu_ij <- mu_ij
  return(dtf_coef)
}

#' getMu_ij_matrix
#'
#' Get the Mu_ij matrix.
#'
#' @param dtf_coef A dataframe containing the coefficients.
#' @importFrom reshape2 dcast
#' @importFrom stats as.formula

#' @export
#' @return A Mu_ij matrix.
#' @examples
#' list_var <- init_variable()
#' dtf_coef <- getInput2simulation(list_var, 10)
#' dtf_coef <- getLog_qij(dtf_coef)
#' dtf_coef <- addBasalExpression(dtf_coef, 10, c(10, 20, 0))
#' dtf_coef<- getMu_ij(dtf_coef)
#' getMu_ij_matrix(dtf_coef)
getMu_ij_matrix <- function(dtf_coef) {
  column_names <- colnames(dtf_coef)
  idx_var <- grepl(pattern = "label", column_names)
  l_var <- column_names[idx_var]
  str_formula_rigth <- paste(l_var, collapse = " + ")
  if (str_formula_rigth == "") stop("no variable label detected")
  str_formula <- paste(c("geneID", str_formula_rigth), collapse = " ~ ")
  formula <- stats::as.formula(str_formula)
  dtf_Muij <- dtf_coef %>% reshape2::dcast(formula = formula, value.var = "mu_ij", drop = F)
  dtf_Muij[is.na(dtf_Muij)] <- 0
  mtx_Muij <- data.frame(dtf_Muij[, -1], row.names = dtf_Muij[, 1]) %>% as.matrix()
  mtx_Muij <- mtx_Muij[, order(colnames(mtx_Muij)), drop = F]
  return(mtx_Muij)
}

#' getSubCountsTable
#'
#' Get the subcounts table.
#'
#' @param matx_Muij The Mu_ij matrix.
#' @param matx_dispersion The dispersion matrix.
#' @param replicateID The replication identifier.
#' @param l_bool_replication A boolean vector indicating the replicates.
#' @importFrom stats rnbinom
#' 
#' @return A subcounts table.
getSubCountsTable <- function(matx_Muij, matx_dispersion, replicateID, l_bool_replication) {
  getKijMatrix <- function(matx_Muij, matx_dispersion, n_genes, n_samples) {
    k_ij <- stats::rnbinom(n_genes * n_samples,
                           size = matx_dispersion,
                           mu = matx_Muij) %>%
              matrix(nrow = n_genes, ncol = n_samples)
    
    k_ij[is.na(k_ij)] <- 0
    return(k_ij)
  }
  
  if (!any(l_bool_replication))
    return(NULL) 
  
  matx_Muij <- matx_Muij[, l_bool_replication, drop = FALSE]
  matx_dispersion <- matx_dispersion[, l_bool_replication, drop = FALSE] 
  l_sampleID <- colnames(matx_Muij)
  l_geneID <- rownames(matx_Muij)
  dimension_mtx <- dim(matx_Muij)
  n_genes <- dimension_mtx[1]
  n_samples <- dimension_mtx[2]
  matx_kij <- getKijMatrix(matx_Muij, matx_dispersion, n_genes, n_samples)
  colnames(matx_kij) <- paste(l_sampleID, replicateID, sep = "_")
  rownames(matx_kij) <- l_geneID
  return(matx_kij)
}

#' getReplicationMatrix
#'
#' @param minN Minimum number of replicates for each sample
#' @param maxN Maximum number of replicates for each sample
#' @param n_samples Number of samples
#' @export
#' @return A replication matrix indicating which samples are replicated
getReplicationMatrix <- function(minN, maxN, n_samples) {
  
  # Create a list of logical vectors representing the minimum number of replicates
  l_replication_minimum = lapply(1:n_samples, 
                                 FUN = function(i) rep(TRUE, times = minN) )
  
  # Create a list of random logical vectors representing additional replicates
  l_replication_random = lapply(1:n_samples, 
                                FUN = function(i) sample(x = c(TRUE, FALSE), size = maxN-minN, replace = T) )
  
  # Combine the replication vectors into matrices
  matx_replication_minimum <- do.call(cbind, l_replication_minimum)
  matx_replication_random <- do.call(cbind, l_replication_random)
  
  # Combine the minimum replicates and random replicates into a single matrix
  matx_replication <- rbind(matx_replication_minimum, matx_replication_random)
  
  # Sort the columns of the replication matrix in descending order
  matx_replication = apply(matx_replication, 2, sort, decreasing = TRUE ) %>% matrix(nrow = maxN)
  
  return(matx_replication)
}

#' getCountsTable
#'
#' @param matx_Muij Matrix of mean expression values for each gene and sample
#' @param matx_dispersion Matrix of dispersion values for each gene and sample
#' @param matx_bool_replication Replication matrix indicating which samples are replicated
#'
#' @return A counts table containing simulated read counts for each gene and sample
getCountsTable <- function(matx_Muij ,  matx_dispersion, matx_bool_replication ){
  max_replicates <-  dim(matx_bool_replication)[1]
  
  # Apply the getSubCountsTable function to each row of the replication matrix
  l_countsTable = lapply(1:max_replicates, function(i) getSubCountsTable(matx_Muij , matx_dispersion, i, matx_bool_replication[i,]  ))
  
  # Combine the counts tables into a single matrix
  countsTable = do.call(cbind, l_countsTable)
  
  return(countsTable %>% as.data.frame())
}

#' getDispersionMatrix
#'
#' @param list_var A list of variables (already initialized)
#' @param n_genes Number of genes
#' @param dispersion Vector of dispersion values for each gene
#' @export
#'
#' @return A matrix of dispersion values for each gene and sample
getDispersionMatrix <- function(list_var, n_genes, dispersion = stats::runif(n_genes, min = 0, max = 1000)){
  l_geneID = paste("gene", 1:n_genes, sep = "")
  l_sampleID = getSampleID(list_var) 
  n_samples = length(l_sampleID) 
  l_dispersion <- dispersion
  
  # Create a data frame for the dispersion values
  dtf_dispersion = list(dispersion =  l_dispersion) %>% as.data.frame()
  dtf_dispersion <- dtf_dispersion[, rep("dispersion", n_samples)]
  rownames(dtf_dispersion) = l_geneID
  colnames(dtf_dispersion) = l_sampleID
  
  matx_dispersion = dtf_dispersion %>% as.matrix()
  
  return(matx_dispersion)
}


#' Replicate rows of a data frame by group
#'
#' Replicates the rows of a data frame based on a grouping variable and replication counts for each group.
#'
#' @param df Data frame to replicate
#' @param group_var Name of the grouping variable in the data frame
#' @param rep_list Vector of replication counts for each group
#' @return Data frame with replicated rows
#' @examples
#' df <- data.frame(group = c("A", "B"), value = c(1, 2))
#' replicateByGroup(df, "group", c(2, 3))
#'
#' @export
replicateByGroup <- function(df, group_var, rep_list) {
  l_group_var <- df[[group_var]]
  group_levels <- unique(l_group_var)
  names(rep_list) <- group_levels
  group_indices <- rep_list[l_group_var]
  replicated_indices <- rep(seq_len(nrow(df)), times = group_indices)
  replicated_df <- df[replicated_indices, ]
  suffix_sampleID <- sequence(group_indices)
  replicated_df[["sampleID"]] <- paste(replicated_df[["sampleID"]], suffix_sampleID, sep = "_")
  rownames(replicated_df) <- NULL
  return(replicated_df)
}


#' Replicate rows of a data frame
#'
#' Replicates the rows of a data frame by a specified factor.
#'
#' @param df Data frame to replicate
#' @param n Replication factor for each row
#' @return Data frame with replicated rows
#' @export
#' @examples
#' df <- data.frame(a = 1:3, b = letters[1:3])
#' replicateRows(df, 2)
#'
replicateRows <- function(df, n) {
  indices <- rep(seq_len(nrow(df)), each = n)
  replicated_df <- df[indices, , drop = FALSE]
  rownames(replicated_df) <- NULL
  return(replicated_df)
}

#' Get sample metadata
#'
#' Generates sample metadata based on the input variables, replication matrix, and number of genes.
#'
#' @param list_var A list of variables (already initialized)
#' @param replicationMatrix Replication matrix
#' @param n_genes Number of genes
#' @return Data frame of sample metadata
#' @importFrom data.table setorderv
#' @export
#' @examples
#' list_var <- init_variable()
#' replicationMatrix <- generateReplicationMatrix(list_var, 3, 3)
#' getSampleMetadata(list_var,  replicationMatrix)
getSampleMetadata <- function(list_var, replicationMatrix) {
  l_sampleIDs = getSampleID(list_var)
  metaData <- generateGridCombination_fromListVar(list_var)
  metaData[] <- lapply(metaData, as.character) ## before reordering
  data.table::setorderv(metaData, cols = colnames(metaData))
  metaData[] <- lapply(metaData, as.factor)
  metaData$sampleID <- l_sampleIDs
  rep_list <- colSums(replicationMatrix)
  metaData$sampleID <- as.character(metaData$sampleID) ## before replicating
  sampleMetadata <- replicateByGroup(metaData, "sampleID", rep_list)
  colnames(sampleMetadata) <- gsub("label_", "", colnames(sampleMetadata))
  return(sampleMetadata)
}


#' getSampleID
#'
#' @param list_var A list of variables (already initialized)
#' @export
#' @return A sorted vector of sample IDs
#' @examples
#' getSampleID(init_variable())
getSampleID <- function(list_var){
  gridCombination <- generateGridCombination_fromListVar(list_var)
  l_sampleID <- apply( gridCombination , 1 , paste , collapse = "_" ) %>% unname()
  return(sort(l_sampleID))
}