Select Git revision
mock_rnaseq.R
Arnaud Duvermy authored
mock_rnaseq.R 9.26 KiB
# WARNING - Generated by {fusen} from dev/flat_full.Rmd: do not edit by hand
#' Check the validity of the dispersion matrix
#'
#' Checks if the dispersion matrix has the correct dimensions.
#'
#' @param matx_dispersion Replication matrix
#' @param matx_bool_replication Replication matrix
#' @return TRUE if the dimensions are valid, FALSE otherwise
#' @export
#' @examples
#' matx_dispersion <- matrix(1:12, nrow = 3, ncol = 4)
#' matx_bool_replication <- matrix(TRUE, nrow = 3, ncol = 4)
#' is_dispersionMatrixValid(matx_dispersion, matx_bool_replication)
is_dispersionMatrixValid <- function(matx_dispersion, matx_bool_replication) {
expected_nb_column <- dim(matx_bool_replication)[2]
if (expected_nb_column != dim(matx_dispersion)[2]) {
return(FALSE)
}
return(TRUE)
}
#' Generate count table
#'
#' Generates the count table based on the mu_ij matrix, dispersion matrix, and replication matrix.
#'
#' @param mu_ij_matx_rep Replicated mu_ij matrix
#' @param matx_dispersion_rep Replicated dispersion matrix
#' @return Count table
#' @export
#' @examples
#' mu_ij_matx_rep <- matrix(1:12, nrow = 3, ncol = 4)
#' matx_dispersion_rep <- matrix(1:12, nrow = 3, ncol = 4)
#' generateCountTable(mu_ij_matx_rep, matx_dispersion_rep)
generateCountTable <- function(mu_ij_matx_rep, matx_dispersion_rep) {
message("k_ij ~ Nbinom(mu_ij, dispersion)")
n_genes <- dim(mu_ij_matx_rep)[1]
n_samples <- dim(mu_ij_matx_rep)[2]
n_samplings <- prod(n_genes * n_samples)
mat_countsTable <- rnbinom(n_samplings,
size = matx_dispersion_rep,
mu = mu_ij_matx_rep) %>%
matrix(nrow = n_genes, ncol = n_samples)
colnames(mat_countsTable) <- colnames(mu_ij_matx_rep)
rownames(mat_countsTable) <- rownames(mu_ij_matx_rep)
mat_countsTable[is.na(mat_countsTable)] <- 0
return(mat_countsTable)
}
#' Perform RNA-seq simulation
#'
#' Simulates RNA-seq data based on the input variables.
#'
#' @param list_var List of input variables
#' @param n_genes Number of genes
#' @param min_replicates Minimum replication count
#' @param max_replicates Maximum replication count
#' @param sequencing_depth Sequencing depth
#' @param basal_expression base expression gene
#' @param dispersion User-provided dispersion vector (optional)
#' @return List containing the ground truth, counts, and metadata
#' @export
#' @examples
#' mock_rnaseq(list_var = init_variable(),
#' n_genes = 1000, min_replicates = 2,
#' max_replicates = 4)
mock_rnaseq <- function(list_var, n_genes, min_replicates, max_replicates, sequencing_depth = NULL,
basal_expression = 0 , dispersion = stats::runif(n_genes, min = 0, max = 1000) ) {
## -- get my effect
df_inputSimulation <- getInput2simulation(list_var, n_genes)
## -- add column logQij
df_inputSimulation <- getLog_qij(df_inputSimulation)
df_inputSimulation <- addBasalExpression(df_inputSimulation, n_genes, basal_expression)
df_inputSimulation <- getMu_ij(df_inputSimulation )
message("Building mu_ij matrix")
## -- matrix
matx_Muij <- getMu_ij_matrix(df_inputSimulation)
l_sampleID <- getSampleID(list_var)
matx_bool_replication <- generateReplicationMatrix(list_var, min_replicates, max_replicates)
mu_ij_matx_rep <- replicateMatrix(matx_Muij, matx_bool_replication)
dispersion <- getValidDispersion(dispersion)
genes_dispersion <- sample(dispersion , size = n_genes, replace = T)
matx_dispersion <- getDispersionMatrix(list_var, n_genes, genes_dispersion)
l_geneID = base::paste("gene", 1:n_genes, sep = "")
names(genes_dispersion) <- l_geneID
## same order as mu_ij_matx_rep
matx_dispersion <- matx_dispersion[ order(row.names(matx_dispersion)), ]
matx_dispersion_rep <- replicateMatrix(matx_dispersion, matx_bool_replication)
if (!is.null(sequencing_depth)) {
message("Scaling count table according to sequencing depth.")
message("Scaling counts by sequencing depth may exhibit some randomness due to certain parameter combinations, resulting in erratic behavior. This can be minimized by simulating more genes. We advise verifying the simulated sequencing depth to avoid drawing incorrect conclusions.")
mu_ij_dtf_rep <- scaleCountsTable(mu_ij_matx_rep, sequencing_depth)
mu_ij_matx_rep <- as.matrix(mu_ij_dtf_rep)
}
matx_countsTable <- generateCountTable(mu_ij_matx_rep, matx_dispersion_rep)
message("Counts simulation: Done")
dtf_countsTable <- matx_countsTable %>% as.data.frame()
#if (!is.null(sequencing_depth)) {
# message("Scaling count table according to sequencing depth.")
# dtf_countsTable <- scaleCountsTable(dtf_countsTable, sequencing_depth)
#}
checkFractionOfZero(dtf_countsTable)
metaData <- getSampleMetadata(list_var, n_genes, matx_bool_replication)
libSize <- sum(colSums(dtf_countsTable))
settings_df <- getSettingsTable(n_genes, min_replicates, max_replicates, libSize)
list2ret <- list( settings = settings_df, init = list_var,
groundTruth = list(effects = df_inputSimulation, gene_dispersion = genes_dispersion),
counts = dtf_countsTable,
metadata = metaData)
## -- clean garbage collector to save memory
invisible(gc(reset = TRUE, verbose = FALSE));
return(list2ret)
}
#' Check Fraction of Zero or One in Counts Table
#'
#' This function checks the percentage of counts in a given counts table that are either zero or one.
#' If more than 50% of the counts fall in this category, a warning is issued, suggesting a review of input parameters.
#'
#' @param counts_table A matrix or data frame representing counts.
#' @return NULL
#' @export
#' @examples
#' # Example usage:
#' counts_table <- matrix(c(0, 1, 2, 3, 4, 0, 0, 1, 1), ncol = 3)
#' checkFractionOfZero(counts_table)
checkFractionOfZero <- function(counts_table){
dim_matrix <- dim(counts_table)
n_counts <- dim_matrix[1]*dim_matrix[2]
n_zero_or_one <- sum(counts_table < 1)
fractionOfZero <- n_zero_or_one/n_counts*100
if( fractionOfZero > 50) {
message("50% of the counts simulated are bellow 1. Consider reviewing your input parameters.")
}
return(NULL)
}
#' Validate and Filter Dispersion Values
#'
#' This function takes an input vector and validates it to ensure that it meets certain criteria.
#'
#' @param input_vector A vector to be validated.
#' @return A validated and filtered numeric vector.
#' @details The function checks whether the input is a vector, suppresses warnings while converting to numeric,
#' and filters out non-numeric elements. It also checks for values greater than zero and removes negative values.
#' If the resulting vector has a length of zero, an error is thrown.
#' @examples
#' getValidDispersion(c(0.5, 1.2, -0.3, "invalid", 0.8))
#' @export
getValidDispersion <- function(input_vector) {
# Verify if it's a vector
if (!is.vector(input_vector)) {
stop("dispersion param is not a vector.")
}
input_vector <- suppressWarnings(as.numeric(input_vector))
# Filter numeric elements
numeric_elements <- !is.na(input_vector)
if (sum(!numeric_elements) > 0) {
message("Non-numeric elements were removed from the dispersion vector")
input_vector <- input_vector[numeric_elements]
}
# Check and filter values > 0
numeric_positive_elements <- input_vector > 0
if (sum(!numeric_positive_elements) > 0) {
message("Negative numeric values were removed from the dispersion vector")
input_vector <- input_vector[numeric_positive_elements]
}
if (length(input_vector) == 0) stop("Invalid dispersion values provided.")
return(input_vector)
}
#' Generate replication matrix
#'
#' Generates the replication matrix based on the minimum and maximum replication counts.
#'
#' @param list_var Number of samples
#' @param min_replicates Minimum replication count
#' @param max_replicates Maximum replication count
#' @return Replication matrix
#' @export
#' @examples
#' list_var = init_variable()
#' generateReplicationMatrix(list_var, min_replicates = 2, max_replicates = 4)
generateReplicationMatrix <- function(list_var, min_replicates, max_replicates) {
if (min_replicates > max_replicates) {
message("min_replicates > max_replicates have been supplied")
message("Automatic reversing")
tmp_min_replicates <- min_replicates
min_replicates <- max_replicates
max_replicates <- tmp_min_replicates
}
l_sampleIDs <- getSampleID(list_var)
n_samples <- length(l_sampleIDs)
return(getReplicationMatrix(min_replicates, max_replicates, n_samples = n_samples))
}
#' Replicate matrix
#'
#' Replicates a matrix based on a replication matrix.
#'
#' @param matrix Matrix to replicate
#' @param replication_matrix Replication matrix
#' @return Replicated matrix
#' @export
#' @examples
#' matrix <- matrix(1:9, nrow = 3, ncol = 3)
#' replication_matrix <- matrix(TRUE, nrow = 3, ncol = 3)
#' replicateMatrix(matrix, replication_matrix)
replicateMatrix <- function(matrix, replication_matrix) {
n_genes <- dim(matrix)[1]
rep_list <- colSums(replication_matrix)
replicated_indices <- rep(seq_len(ncol(matrix)), times = rep_list)
replicated_matrix <- matrix[, replicated_indices, drop = FALSE]
suffix_sampleID <- sequence(rep_list)
colnames(replicated_matrix) <- paste(colnames(replicated_matrix), suffix_sampleID, sep = "_")
return(replicated_matrix)
}