Newer
Older
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
# WARNING - Generated by {fusen} from /dev/flat_full.Rmd: do not edit by hand
#' Get input for simulation based on coefficients
#'
#' This function generates input data for simulation based on the coefficients provided in the \code{list_var} argument.
#'
#' @param list_var A list of variables (already initialized)
#' @param n_genes Number of genes to simulate (default: 1)
#' @param input2mvrnorm Input to the \code{mvrnorm} function for simulating data from multivariate normal distribution (default: NULL)
#' @return A data frame with input coefficients for simulation
#' @export
#' @examples
#' # Example usage
#' list_var <- init_variable()
#' getInput2simulation(list_var, n_genes = 10)
getInput2simulation <- function(list_var, n_genes = 1, input2mvrnorm = NULL) {
# Use default input to mvrnorm if not provided by the user
if (is.null(input2mvrnorm)) input2mvrnorm = getInput2mvrnorm(list_var)
l_dataFromMvrnorm = getDataFromMvrnorm(list_var, input2mvrnorm, n_genes)
l_dataFromUser = getDataFromUser(list_var)
df_input2simu <- getCoefficients(list_var, l_dataFromMvrnorm, l_dataFromUser, n_genes)
return(df_input2simu)
}
#' getCoefficients
#'
#' Get the coefficients.
#'
#' @param list_var A list of variables (already initialized)
#' @param l_dataFromMvrnorm Data from the `getGeneMetadata` function (optional).
#' @param l_dataFromUser Data from the `getDataFromUser` function (optional).
#' @param n_genes The number of genes.
#' @export
#' @return A dataframe containing the coefficients.
#' @examples
#' # Example usage
#' list_var <- init_variable()
#' input2mvrnorm = getInput2mvrnorm(list_var)
#' l_dataFromMvrnorm = getDataFromMvrnorm(list_var, input2mvrnorm, n_genes)
#' l_dataFromUser = getDataFromUser(list_var)
#' getCoefficients(list_var, l_dataFromMvrnorm, l_dataFromUser, n_genes = 3)
getCoefficients <- function(list_var, l_dataFromMvrnorm, l_dataFromUser, n_genes) {
if (length(l_dataFromMvrnorm) == 0) {
metaData <- getGeneMetadata(list_var, n_genes)
l_dataFromMvrnorm <- list(metaData)
}
l_df2join <- c(l_dataFromMvrnorm, l_dataFromUser)
df_coef <- Reduce(function(d1, d2){ column_names = colnames(d2)
idx_key = grepl(pattern = "label", column_names )
keys = column_names[idx_key]
join_dtf(d1, d2, k1 = keys , k2 = keys)
}
, l_df2join ) %>% as.data.frame()
column_names <- colnames(df_coef)
idx_column2factor <- grep(pattern = "label_", column_names)
if (length(idx_column2factor) > 1) {
df_coef[, idx_column2factor] <- lapply(df_coef[, idx_column2factor], as.factor)
} else {
df_coef[, idx_column2factor] <- as.factor(df_coef[, idx_column2factor])
}
return(df_coef)
}
#' Get the log_qij values from the coefficient data frame.
#'
#' @param dtf_coef The coefficient data frame.
#' @return The coefficient data frame with log_qij column added.
#' @export
#' @examples
#' list_var <- init_variable()
#' dtf_coef <- getInput2simulation(list_var, 10)
#' dtf_coef <- getLog_qij(dtf_coef)
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
getLog_qij <- function(dtf_coef) {
dtf_beta_numeric <- dtf_coef[sapply(dtf_coef, is.numeric)]
dtf_coef$log_qij <- rowSums(dtf_beta_numeric, na.rm = TRUE)
return(dtf_coef)
}
#' Calculate mu_ij values based on coefficient data frame and scaling factor
#'
#' This function calculates mu_ij values by raising 2 to the power of the log_qij values
#' from the coefficient data frame and multiplying it by the provided scaling factor.
#'
#' @param dtf_coef Coefficient data frame containing the log_qij values
#'
#' @return Coefficient data frame with an additional mu_ij column
#'
#' @examples
#' list_var <- init_variable()
#' dtf_coef <- getInput2simulation(list_var, 10)
#' dtf_coef <- getLog_qij(dtf_coef)
#' dtf_coef <- addBasalExpression(dtf_coef, 10, c(10, 20, 0))
#' getMu_ij(dtf_coef)
#' @export
getMu_ij <- function(dtf_coef) {
log_qij_scaled <- dtf_coef$log_qij + dtf_coef$basalExpr
dtf_coef$log_qij_scaled <- log_qij_scaled
mu_ij <- exp(log_qij_scaled)
dtf_coef$mu_ij <- mu_ij
return(dtf_coef)
}
#' getMu_ij_matrix
#'
#' Get the Mu_ij matrix.
#'
#' @param dtf_coef A dataframe containing the coefficients.
#' @importFrom reshape2 dcast
#' @importFrom stats as.formula
#' @export
#' @return A Mu_ij matrix.
#' @examples
#' list_var <- init_variable()
#' dtf_coef <- getInput2simulation(list_var, 10)
#' dtf_coef <- getLog_qij(dtf_coef)
#' dtf_coef <- addBasalExpression(dtf_coef, 10, c(10, 20, 0))
#' dtf_coef<- getMu_ij(dtf_coef)
#' getMu_ij_matrix(dtf_coef)
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
getMu_ij_matrix <- function(dtf_coef) {
column_names <- colnames(dtf_coef)
idx_var <- grepl(pattern = "label", column_names)
l_var <- column_names[idx_var]
str_formula_rigth <- paste(l_var, collapse = " + ")
if (str_formula_rigth == "") stop("no variable label detected")
str_formula <- paste(c("geneID", str_formula_rigth), collapse = " ~ ")
formula <- stats::as.formula(str_formula)
dtf_Muij <- dtf_coef %>% reshape2::dcast(formula = formula, value.var = "mu_ij", drop = F)
dtf_Muij[is.na(dtf_Muij)] <- 0
mtx_Muij <- data.frame(dtf_Muij[, -1], row.names = dtf_Muij[, 1]) %>% as.matrix()
mtx_Muij <- mtx_Muij[, order(colnames(mtx_Muij)), drop = F]
return(mtx_Muij)
}
#' getSubCountsTable
#'
#' Get the subcounts table.
#'
#' @param matx_Muij The Mu_ij matrix.
#' @param matx_dispersion The dispersion matrix.
#' @param replicateID The replication identifier.
#' @param l_bool_replication A boolean vector indicating the replicates.
#' @importFrom stats rnbinom
#'
#' @return A subcounts table.
getSubCountsTable <- function(matx_Muij, matx_dispersion, replicateID, l_bool_replication) {
getKijMatrix <- function(matx_Muij, matx_dispersion, n_genes, n_samples) {
k_ij <- stats::rnbinom(n_genes * n_samples,
size = matx_dispersion,
mu = matx_Muij) %>%
matrix(nrow = n_genes, ncol = n_samples)
k_ij[is.na(k_ij)] <- 0
return(k_ij)
}
if (!any(l_bool_replication))
return(NULL)
matx_Muij <- matx_Muij[, l_bool_replication, drop = FALSE]
matx_dispersion <- matx_dispersion[, l_bool_replication, drop = FALSE]
l_sampleID <- colnames(matx_Muij)
l_geneID <- rownames(matx_Muij)
dimension_mtx <- dim(matx_Muij)
n_genes <- dimension_mtx[1]
n_samples <- dimension_mtx[2]
matx_kij <- getKijMatrix(matx_Muij, matx_dispersion, n_genes, n_samples)
colnames(matx_kij) <- paste(l_sampleID, replicateID, sep = "_")
rownames(matx_kij) <- l_geneID
return(matx_kij)
}
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
#' getReplicationMatrix
#'
#' @param minN Minimum number of replicates for each sample
#' @param maxN Maximum number of replicates for each sample
#' @param n_samples Number of samples
#' @export
#' @return A replication matrix indicating which samples are replicated
getReplicationMatrix <- function(minN, maxN, n_samples) {
# Create a list of logical vectors representing the minimum number of replicates
l_replication_minimum = lapply(1:n_samples,
FUN = function(i) rep(TRUE, times = minN) )
# Create a list of random logical vectors representing additional replicates
l_replication_random = lapply(1:n_samples,
FUN = function(i) sample(x = c(TRUE, FALSE), size = maxN-minN, replace = T) )
# Combine the replication vectors into matrices
matx_replication_minimum <- do.call(cbind, l_replication_minimum)
matx_replication_random <- do.call(cbind, l_replication_random)
# Combine the minimum replicates and random replicates into a single matrix
matx_replication <- rbind(matx_replication_minimum, matx_replication_random)
# Sort the columns of the replication matrix in descending order
matx_replication = apply(matx_replication, 2, sort, decreasing = TRUE ) %>% matrix(nrow = maxN)
return(matx_replication)
}
#' getCountsTable
#'
#' @param matx_Muij Matrix of mean expression values for each gene and sample
#' @param matx_dispersion Matrix of dispersion values for each gene and sample
#' @param matx_bool_replication Replication matrix indicating which samples are replicated
#'
#' @return A counts table containing simulated read counts for each gene and sample
getCountsTable <- function(matx_Muij , matx_dispersion, matx_bool_replication ){
max_replicates <- dim(matx_bool_replication)[1]
# Apply the getSubCountsTable function to each row of the replication matrix
l_countsTable = lapply(1:max_replicates, function(i) getSubCountsTable(matx_Muij , matx_dispersion, i, matx_bool_replication[i,] ))
# Combine the counts tables into a single matrix
countsTable = do.call(cbind, l_countsTable)
return(countsTable %>% as.data.frame())
}
#' getDispersionMatrix
#'
#' @param list_var A list of variables (already initialized)
#' @param n_genes Number of genes
#' @param dispersion Vector of dispersion values for each gene
#' @export
#'
#' @return A matrix of dispersion values for each gene and sample
getDispersionMatrix <- function(list_var, n_genes, dispersion = stats::runif(n_genes, min = 0, max = 1000)){
l_geneID = paste("gene", 1:n_genes, sep = "")
l_sampleID = getSampleID(list_var)
n_samples = length(l_sampleID)
l_dispersion <- dispersion
# Create a data frame for the dispersion values
dtf_dispersion = list(dispersion = l_dispersion) %>% as.data.frame()
dtf_dispersion <- dtf_dispersion[, rep("dispersion", n_samples)]
rownames(dtf_dispersion) = l_geneID
colnames(dtf_dispersion) = l_sampleID
matx_dispersion = dtf_dispersion %>% as.matrix()
return(matx_dispersion)
}
#' Replicate rows of a data frame by group
#'
#' Replicates the rows of a data frame based on a grouping variable and replication counts for each group.
#'
#' @param df Data frame to replicate
#' @param group_var Name of the grouping variable in the data frame
#' @param rep_list Vector of replication counts for each group
#' @return Data frame with replicated rows
#' @examples
#' df <- data.frame(group = c("A", "B"), value = c(1, 2))
#' replicateByGroup(df, "group", c(2, 3))
#'
#' @export
replicateByGroup <- function(df, group_var, rep_list) {
l_group_var <- df[[group_var]]
group_levels <- unique(l_group_var)
names(rep_list) <- group_levels
group_indices <- rep_list[l_group_var]
replicated_indices <- rep(seq_len(nrow(df)), times = group_indices)
replicated_df <- df[replicated_indices, ]
suffix_sampleID <- sequence(group_indices)
replicated_df[["sampleID"]] <- paste(replicated_df[["sampleID"]], suffix_sampleID, sep = "_")
rownames(replicated_df) <- NULL
return(replicated_df)
}
#' Replicate rows of a data frame
#'
#' Replicates the rows of a data frame by a specified factor.
#'
#' @param df Data frame to replicate
#' @param n Replication factor for each row
#' @return Data frame with replicated rows
#' @export
#' @examples
#' df <- data.frame(a = 1:3, b = letters[1:3])
#' replicateRows(df, 2)
#'
replicateRows <- function(df, n) {
indices <- rep(seq_len(nrow(df)), each = n)
replicated_df <- df[indices, , drop = FALSE]
rownames(replicated_df) <- NULL
return(replicated_df)
}
#' Get sample metadata
#'
#' Generates sample metadata based on the input variables, replication matrix, and number of genes.
#'
#' @param list_var A list of variables (already initialized)
#' @param replicationMatrix Replication matrix
#' @param n_genes Number of genes
#' @return Data frame of sample metadata
#' @importFrom data.table setorderv
#' @export
#' @examples
#' list_var <- init_variable()
#' n_genes <- 10
#' replicationMatrix <- generateReplicationMatrix(list_var ,2, 3)
#' getSampleMetadata(list_var, n_genes, replicationMatrix)
getSampleMetadata <- function(list_var, n_genes, replicationMatrix) {
l_sampleIDs = getSampleID(list_var)
metaData <- generateGridCombination_fromListVar(list_var)
metaData[] <- lapply(metaData, as.character) ## before reordering
data.table::setorderv(metaData, cols = colnames(metaData))
metaData[] <- lapply(metaData, as.factor)
metaData$sampleID <- l_sampleIDs
rep_list <- colSums(replicationMatrix)
metaData$sampleID <- as.character(metaData$sampleID) ## before replicating
sampleMetadata <- replicateByGroup(metaData, "sampleID", rep_list)
colnames(sampleMetadata) <- gsub("label_", "", colnames(sampleMetadata))
return(sampleMetadata)
}
#' getSampleID
#'
#' @param list_var A list of variables (already initialized)
#' @export
#' @return A sorted vector of sample IDs
#' @examples
#' getSampleID(init_variable())
getSampleID <- function(list_var){
gridCombination <- generateGridCombination_fromListVar(list_var)
l_sampleID <- apply( gridCombination , 1 , paste , collapse = "_" ) %>% unname()
return(sort(l_sampleID))
}