From 89a843958043ac8bdbf82b5a2e2bb5fe81901846 Mon Sep 17 00:00:00 2001 From: aduvermy <arnaud.duvermy@ens-lyon.fr> Date: Thu, 26 Oct 2023 13:41:59 +0200 Subject: [PATCH] improve log file managment --- R/basal_expression_scaling.R | 82 +++++++++++++++++++ R/fitmodel.R | 14 ++-- R/update_fittedmodel.R | 14 +++- dev/flat_full.Rmd | 48 +++++------ man/addBasalExpression.Rd | 17 +++- man/fitModelParallel.Rd | 4 +- man/generate_basal_expression.Rd | 11 ++- man/getBinExpression.Rd | 13 ++- man/parallel_fit.Rd | 14 +++- man/parallel_update.Rd | 10 ++- man/updateParallel.Rd | 10 ++- .../testthat/test-basal_expression_scaling.R | 75 +++++++++++++++++ tests/testthat/test-fitmodel.R | 8 +- vignettes/htrfit.Rmd | 13 +-- 14 files changed, 273 insertions(+), 60 deletions(-) create mode 100644 R/basal_expression_scaling.R create mode 100644 tests/testthat/test-basal_expression_scaling.R diff --git a/R/basal_expression_scaling.R b/R/basal_expression_scaling.R new file mode 100644 index 0000000..c2ddc5d --- /dev/null +++ b/R/basal_expression_scaling.R @@ -0,0 +1,82 @@ +# WARNING - Generated by {fusen} from /dev/flat_full.Rmd: do not edit by hand + + + + + +#' Get bin expression for a data frame. +#' +#' This function divides the values of a specified column in a data frame into \code{n_bins} bins of equal width. +#' The bin labels are then added as a new column in the data frame. +#' +#' @param dtf_coef A data frame containing the values to be binned. +#' @param n_bins The number of bins to create. +#' +#' @return A data frame with an additional column named \code{binExpression}, containing the bin labels. +#' @export +#' @examples +#' dtf <- data.frame(mu_ij = c(10, 20, 30, 15, 25, 35, 40, 5, 12, 22)) +#' dtf_with_bins <- getBinExpression(dtf, n_bins = 3) +#' +getBinExpression <- function(dtf_coef, n_bins){ + col2bin <- "mu_ij" + bin_labels <- cut(dtf_coef[[col2bin]], n_bins, labels = paste("BinExpression", 1:n_bins, sep = "_")) + dtf_coef$binExpression <- bin_labels + return(dtf_coef) +} + + + + +#' Generate BE data. +#' +#' This function generates basal expression data for a given number of genes, in a vector of basal expression values. +#' +#' @param n_genes The number of genes to generate BE data for. +#' @param basal_expression a numeric vector from which sample BE for eacg genes +#' +#' @return A data frame containing gene IDs, BE values +#' +#' @examples +#' generate_basal_expression(n_genes = 100, 10) +#' +#' @export +generate_basal_expression <- function(n_genes, basal_expression) { + ## --avoid bug if one value in basal_expr + pool2sample <- c(basal_expression, basal_expression) + BE <- sample(x = pool2sample, size = n_genes, replace = T) + l_geneID <- base::paste("gene", 1:n_genes, sep = "") + ret <- list(geneID = l_geneID, basalExpr = BE) %>% as.data.frame() + return(ret) +} + + + +#' Compute basal expresion for gene expression based on the coefficients data frame. +#' +#' This function takes the coefficients data frame \code{dtf_coef} and computes +#' basal expression for gene expression. The scaling factors are generated +#' using the function \code{generate_basal_expression}. +#' +#' @param dtf_coef A data frame containing the coefficients for gene expression. +#' @param n_genes number of genes in simulation +#' @param basal_expression gene basal expression vector +#' +#' @return A modified data frame \code{dtf_coef} with an additional column containing +#' the scaling factors for gene expression. +#' @export +#' @examples +#' list_var <- init_variable() +#' N_GENES <- 5 +#' dtf_coef <- getInput2simulation(list_var, N_GENES) +#' dtf_coef <- getLog_qij(dtf_coef) +#' addBasalExpression(dtf_coef, N_GENES, 1) +addBasalExpression <- function(dtf_coef, n_genes, basal_expression){ + BE_df <- generate_basal_expression(n_genes, basal_expression ) + dtf_coef <- join_dtf(dtf_coef, BE_df, "geneID", "geneID") + return(dtf_coef) +} + + + + diff --git a/R/fitmodel.R b/R/fitmodel.R index 6118363..de0d84b 100644 --- a/R/fitmodel.R +++ b/R/fitmodel.R @@ -197,7 +197,7 @@ launchFit <- function(group, group_by, formula, data, ...) { #' @param data Data frame containing the data #' @param n.cores The number of CPU cores to use for parallel processing. #' If set to NULL (default), the number of available CPU cores will be automatically detected. -#' @param log_file File to write log (default : log.txt) +#' @param log_file File to write log (default : Rtmpdir/htrfit.log) #' @param ... Additional arguments to be passed to the glmmTMB::glmmTMB function #' @return List of fitted model objects or NULL for any errors #' @importFrom stats setNames @@ -205,8 +205,10 @@ launchFit <- function(group, group_by, formula, data, ...) { #' @examples #' parallel_fit(group_by = "Species", "setosa", #' formula = Sepal.Length ~ Sepal.Width + Petal.Length, -#' data = iris, n.cores = 1, log_file = "log.txt" ) -parallel_fit <- function(groups, group_by, formula, data, n.cores = NULL, log_file, ...) { +#' data = iris, n.cores = 1 ) +parallel_fit <- function(groups, group_by, formula, data, n.cores = NULL, + log_file = paste(tempdir(check = FALSE), "htrfit.log", sep = "/"), ...) { + if (is.null(n.cores)) n.cores <- parallel::detectCores() clust <- parallel::makeCluster(n.cores, outfile = log_file) @@ -227,19 +229,21 @@ parallel_fit <- function(groups, group_by, formula, data, n.cores = NULL, log_fi #' @param group_by Column name in data representing the grouping variable #' @param n.cores The number of CPU cores to use for parallel processing. #' If set to NULL (default), the number of available CPU cores will be automatically detected. -#' @param log_file File path to save the log messages (default : log.txt) +#' @param log_file File path to save the log messages (default : Rtmpdir/htrfit.log) #' @param ... Additional arguments to be passed to the glmmTMB::glmmTMB function #' @return List of fitted model objects or NULL for any errors #' @export #' @examples #' fitModelParallel(formula = Sepal.Length ~ Sepal.Width + Petal.Length, #' data = iris, group_by = "Species", n.cores = 1) -fitModelParallel <- function(formula, data, group_by, n.cores = NULL, log_file = "log.txt", ...) { +fitModelParallel <- function(formula, data, group_by, n.cores = NULL, log_file = paste(tempdir(check = FALSE), "htrfit.log", sep = "/"), ...) { ## SOme verification isValidInput2fit(data, formula) is_fullrank(data, formula) + ## -- print log location + message( paste("Log file location", log_file, sep =': ') ) groups <- unique(data[[group_by]]) # Fit models in parallel and capture the results diff --git a/R/update_fittedmodel.R b/R/update_fittedmodel.R index e408b2a..5184432 100644 --- a/R/update_fittedmodel.R +++ b/R/update_fittedmodel.R @@ -9,7 +9,7 @@ #' @param formula Formula for the GLMNB model. #' @param list_tmb List of glmmTMB objects. #' @param n.cores Number of cores to use for parallel processing. If NULL, the function will use all available cores. -#' @param log_file File path for the log output. +#' @param log_file File path for the log output (default: Rtmpdir/htrfit.log). #' @param ... Additional arguments to be passed to the glmmTMB::glmmTMB function. #' @export #' @return A list of updated GLMNB models. @@ -22,12 +22,16 @@ #' fitted_models <- fitModelParallel(formula, iris, group_by, n.cores = 1) #' new_formula <- Sepal.Length ~ Sepal.Width #' results <- updateParallel(new_formula, fitted_models, n.cores = 1) -updateParallel <- function(formula, list_tmb, n.cores = NULL, log_file = "log.txt", ...) { +updateParallel <- function(formula, list_tmb, n.cores = NULL, log_file = paste(tempdir(check = FALSE), "htrfit.log", sep = "/"), ...) { + isValidInput2fit(list_tmb[[1]]$frame, formula) is_fullrank(list_tmb[[1]]$frame, formula) + ## -- ## -- print log location + message( paste("Log file location", log_file, sep =': ') ) + # Fit models update in parallel and capture the results results <- parallel_update(formula, list_tmb, n.cores, log_file, ...) return(results) @@ -41,7 +45,7 @@ updateParallel <- function(formula, list_tmb, n.cores = NULL, log_file = "log.tx #' @param formula Formula for the GLMNB model. #' @param list_tmb List of glmmTMB objects. #' @param n.cores Number of cores to use for parallel processing. -#' @param log_file File path for the log output. +#' @param log_file File path for the log output (default : Rtmpdir/htrfit.log). #' @param ... Additional arguments to be passed to the glmmTMB::glmmTMB function. #' @export #' @return A list of updated GLMNB models. @@ -53,7 +57,9 @@ updateParallel <- function(formula, list_tmb, n.cores = NULL, log_file = "log.tx #' fitted_models <- fitModelParallel(formula, iris, group_by, n.cores = 1) #' new_formula <- Sepal.Length ~ Sepal.Width #' results <- parallel_update(new_formula, fitted_models, n.cores = 1) -parallel_update <- function(formula, list_tmb, n.cores = NULL, log_file = "log.txt", ...) { +parallel_update <- function(formula, list_tmb, n.cores = NULL, + log_file = paste(tempdir(check = FALSE), "htrfit.log", sep = "/"), ...) { + if (is.null(n.cores)) n.cores <- parallel::detectCores() clust <- parallel::makeCluster(n.cores, outfile = log_file) #l_geneID <- attributes(l_tmb)$names diff --git a/dev/flat_full.Rmd b/dev/flat_full.Rmd index e942523..c6694dc 100644 --- a/dev/flat_full.Rmd +++ b/dev/flat_full.Rmd @@ -2609,7 +2609,7 @@ launchFit <- function(group, group_by, formula, data, ...) { #' @param data Data frame containing the data #' @param n.cores The number of CPU cores to use for parallel processing. #' If set to NULL (default), the number of available CPU cores will be automatically detected. -#' @param log_file File to write log (default : log.txt) +#' @param log_file File to write log (default : Rtmpdir/htrfit.log) #' @param ... Additional arguments to be passed to the glmmTMB::glmmTMB function #' @return List of fitted model objects or NULL for any errors #' @importFrom stats setNames @@ -2617,8 +2617,10 @@ launchFit <- function(group, group_by, formula, data, ...) { #' @examples #' parallel_fit(group_by = "Species", "setosa", #' formula = Sepal.Length ~ Sepal.Width + Petal.Length, -#' data = iris, n.cores = 1, log_file = "log.txt" ) -parallel_fit <- function(groups, group_by, formula, data, n.cores = NULL, log_file, ...) { +#' data = iris, n.cores = 1 ) +parallel_fit <- function(groups, group_by, formula, data, n.cores = NULL, + log_file = paste(tempdir(check = FALSE), "htrfit.log", sep = "/"), ...) { + if (is.null(n.cores)) n.cores <- parallel::detectCores() clust <- parallel::makeCluster(n.cores, outfile = log_file) @@ -2639,19 +2641,21 @@ parallel_fit <- function(groups, group_by, formula, data, n.cores = NULL, log_fi #' @param group_by Column name in data representing the grouping variable #' @param n.cores The number of CPU cores to use for parallel processing. #' If set to NULL (default), the number of available CPU cores will be automatically detected. -#' @param log_file File path to save the log messages (default : log.txt) +#' @param log_file File path to save the log messages (default : Rtmpdir/htrfit.log) #' @param ... Additional arguments to be passed to the glmmTMB::glmmTMB function #' @return List of fitted model objects or NULL for any errors #' @export #' @examples #' fitModelParallel(formula = Sepal.Length ~ Sepal.Width + Petal.Length, #' data = iris, group_by = "Species", n.cores = 1) -fitModelParallel <- function(formula, data, group_by, n.cores = NULL, log_file = "log.txt", ...) { +fitModelParallel <- function(formula, data, group_by, n.cores = NULL, log_file = paste(tempdir(check = FALSE), "htrfit.log", sep = "/"), ...) { ## SOme verification isValidInput2fit(data, formula) is_fullrank(data, formula) + ## -- print log location + message( paste("Log file location", log_file, sep =': ') ) groups <- unique(data[[group_by]]) # Fit models in parallel and capture the results @@ -2862,14 +2866,14 @@ test_that("parallel_fit returns a list of fitted model objects or NULL for any e groups <- unique(iris$Species) group_by <- "Species" formula <- Sepal.Length ~ Sepal.Width + Petal.Length - fitted_models <- parallel_fit(groups, group_by, formula, iris, log_file = "log.txt", n.cores = 1) + fitted_models <- parallel_fit(groups, group_by, formula, iris, n.cores = 1) expect_s3_class(fitted_models$setosa, "glmmTMB") expect_length(fitted_models, length(groups)) # Test with invalid formula invalid_formula <- blabla ~ cyl + disp result <- suppressWarnings(parallel_fit(groups, group_by, invalid_formula, - iris, log_file = "log.txt", n.cores = 1)) + iris, n.cores = 1)) expect_equal(result, expected = list(setosa = NULL, versicolor = NULL, virginica = NULL)) @@ -2880,7 +2884,6 @@ test_that("parallel_fit returns a list of fitted model objects or NULL for any e data = iris, group_by = group_by, groups = "setosa", - log_file = "log.txt", n.cores = 1, family = glmmTMB::nbinom1(link = "log") )) expect_s3_class(fitted_models$setosa$call$family, "family") @@ -2890,7 +2893,6 @@ test_that("parallel_fit returns a list of fitted model objects or NULL for any e data = iris, group_by = group_by, groups = "setosa", - log_file = "log.txt", family = glmmTMB::nbinom1(link = "log"), n.cores = 1, control = glmmTMB::glmmTMBControl(optCtrl=list(iter.max=1e3, @@ -2910,7 +2912,7 @@ test_that("fitModelParallel fits models in parallel for each group and returns a expect_length(fitted_models, length(groups)) invalid_formula <- blabla ~ cyl + disp - expect_error(fitModelParallel(invalid_formula, iris, group_by ,log_file = "log.txt", n.cores = 1)) + expect_error(fitModelParallel(invalid_formula, iris, group_by , n.cores = 1)) # Additional parameters: #change family + formula @@ -2947,7 +2949,7 @@ test_that("fitModelParallel fits models in parallel for each group and returns a #' @param formula Formula for the GLMNB model. #' @param list_tmb List of glmmTMB objects. #' @param n.cores Number of cores to use for parallel processing. If NULL, the function will use all available cores. -#' @param log_file File path for the log output. +#' @param log_file File path for the log output (default: Rtmpdir/htrfit.log). #' @param ... Additional arguments to be passed to the glmmTMB::glmmTMB function. #' @export #' @return A list of updated GLMNB models. @@ -2960,12 +2962,16 @@ test_that("fitModelParallel fits models in parallel for each group and returns a #' fitted_models <- fitModelParallel(formula, iris, group_by, n.cores = 1) #' new_formula <- Sepal.Length ~ Sepal.Width #' results <- updateParallel(new_formula, fitted_models, n.cores = 1) -updateParallel <- function(formula, list_tmb, n.cores = NULL, log_file = "log.txt", ...) { +updateParallel <- function(formula, list_tmb, n.cores = NULL, log_file = paste(tempdir(check = FALSE), "htrfit.log", sep = "/"), ...) { + isValidInput2fit(list_tmb[[1]]$frame, formula) is_fullrank(list_tmb[[1]]$frame, formula) + ## -- ## -- print log location + message( paste("Log file location", log_file, sep =': ') ) + # Fit models update in parallel and capture the results results <- parallel_update(formula, list_tmb, n.cores, log_file, ...) return(results) @@ -2979,7 +2985,7 @@ updateParallel <- function(formula, list_tmb, n.cores = NULL, log_file = "log.tx #' @param formula Formula for the GLMNB model. #' @param list_tmb List of glmmTMB objects. #' @param n.cores Number of cores to use for parallel processing. -#' @param log_file File path for the log output. +#' @param log_file File path for the log output (default : Rtmpdir/htrfit.log). #' @param ... Additional arguments to be passed to the glmmTMB::glmmTMB function. #' @export #' @return A list of updated GLMNB models. @@ -2991,7 +2997,9 @@ updateParallel <- function(formula, list_tmb, n.cores = NULL, log_file = "log.tx #' fitted_models <- fitModelParallel(formula, iris, group_by, n.cores = 1) #' new_formula <- Sepal.Length ~ Sepal.Width #' results <- parallel_update(new_formula, fitted_models, n.cores = 1) -parallel_update <- function(formula, list_tmb, n.cores = NULL, log_file = "log.txt", ...) { +parallel_update <- function(formula, list_tmb, n.cores = NULL, + log_file = paste(tempdir(check = FALSE), "htrfit.log", sep = "/"), ...) { + if (is.null(n.cores)) n.cores <- parallel::detectCores() clust <- parallel::makeCluster(n.cores, outfile = log_file) #l_geneID <- attributes(l_tmb)$names @@ -7076,7 +7084,7 @@ mock_data <- mock_rnaseq(input_var_list, N_GENES, ## Set gene dispersion -The dispersion parameter (\alpha_i), characterizes the relationship between the variance of the observed count and its mean value. In simple terms, it quantifies how much we expect the observed count to deviate from the mean value. You can specify the dispersion for individual genes using the dispersion parameter. +The dispersion parameter ($\alpha_i$), characterizes the relationship between the variance of the observed count and its mean value. In simple terms, it quantifies how much we expect the observed count to deviate from the mean value. You can specify the dispersion for individual genes using the dispersion parameter. ```{r example-mock_rnaseq_disp, warning=FALSE, message=FALSE} @@ -7132,7 +7140,7 @@ names(mock_data) In this modeling framework, counts denoted as $K_{ij}$ for gene i and sample j are generated using a negative binomial distribution. The negative binomial distribution considers a fitted mean $\mu_{ij}$ and a gene-specific dispersion parameter $\alpha_i$. -The fitted mean $\mu_{ij}$ is determined by a parameter, qij, which is proportionally related to the sum of all effects specified using `init_variable()` or `add_interaction()`. If basal gene expressions are provided, the $\mu_{ij}$ values are scaled accordingly using the gene-specific basal expression value ($bexpr_i$). +The fitted mean $\mu_{ij}$ is determined by a parameter, $q_{ij}$, which is proportionally related to the sum of all effects specified using `init_variable()` or `add_interaction()`. If basal gene expressions are provided, the $\mu_{ij}$ values are scaled accordingly using the gene-specific basal expression value ($bexpr_i$). Furthermore, the coefficients $\beta_i$ represent the natural logarithm fold changes for gene i across each column of the model matrix X. The dispersion parameter $\alpha_i$ plays a crucial role in defining the relationship between the variance of observed counts and their mean value. In simpler terms, it quantifies how far we expect observed counts to deviate from the mean value. @@ -7176,7 +7184,6 @@ l_tmb <- fitModelParallel(formula = kij ~ varA, data = data2fit, group_by = "geneID", family = glmmTMB::nbinom2(link = "log"), - log_file = "log.txt", n.cores = 1) ``` @@ -7191,7 +7198,6 @@ l_tmb <- fitModelParallel(formula = kij ~ varA + ( 1 | varB ), data = data2fit, group_by = "geneID", family = glmmTMB::nbinom2(link = "log"), - log_file = "log.txt", n.cores = 1) ``` @@ -7205,7 +7211,6 @@ l_tmb <- fitModelParallel(formula = kij ~ varA, data = data2fit, group_by = "geneID", n.cores = 1, - log_file = "log.txt", family = glmmTMB::nbinom2(link = "log"), control = glmmTMB::glmmTMBControl(optCtrl=list(iter.max=1e5, eval.max=1e5))) @@ -7222,7 +7227,6 @@ l_tmb <- fitModelParallel(formula = Sepal.Length ~ Sepal.Width + Petal.Length + data = iris, group_by = "Species", family = gaussian(), - log_file = "log.txt", n.cores = 1) ``` @@ -7235,14 +7239,12 @@ The `updateParallel()` function updates and re-fits a model for each gene. It of l_tmb <- updateParallel(formula = kij ~ varA, list_tmb = l_tmb , family = gaussian(), - log_file = "log.txt", n.cores = 1) ## -- update fit using additional model control settings l_tmb <- updateParallel(formula = kij ~ varA , list_tmb = l_tmb , family = gaussian(), - log_file = "log.txt", n.cores = 1, control = glmmTMB::glmmTMBControl(optCtrl=list(iter.max=1e3, eval.max=1e3))) @@ -7252,7 +7254,6 @@ l_tmb <- updateParallel(formula = kij ~ varA , l_tmb <- updateParallel(formula = kij ~ varA + varB + varA:varB , list_tmb = l_tmb , family = glmmTMB::nbinom2(link = "log"), - log_file = "log.txt", n.cores = 1) ## -- output @@ -7411,7 +7412,6 @@ l_tmb <- fitModelParallel(formula = kij ~ varB + (varB | varA), data = data2fit, group_by = "geneID", family = glmmTMB::nbinom2(link = "log"), - log_file = "log.txt", n.cores = 1) ## -- output l_tmb$gene1 diff --git a/man/addBasalExpression.Rd b/man/addBasalExpression.Rd index 2cc6230..fec7fd9 100644 --- a/man/addBasalExpression.Rd +++ b/man/addBasalExpression.Rd @@ -1,9 +1,12 @@ % Generated by roxygen2: do not edit by hand -% Please edit documentation in R/basal_expression__scaling.R +% Please edit documentation in R/basal_expression__scaling.R, +% R/basal_expression_scaling.R \name{addBasalExpression} \alias{addBasalExpression} \title{Compute basal expresion for gene expression based on the coefficients data frame.} \usage{ +addBasalExpression(dtf_coef, n_genes, basal_expression) + addBasalExpression(dtf_coef, n_genes, basal_expression) } \arguments{ @@ -14,10 +17,17 @@ addBasalExpression(dtf_coef, n_genes, basal_expression) \item{basal_expression}{gene basal expression vector} } \value{ +A modified data frame \code{dtf_coef} with an additional column containing +the scaling factors for gene expression. + A modified data frame \code{dtf_coef} with an additional column containing the scaling factors for gene expression. } \description{ +This function takes the coefficients data frame \code{dtf_coef} and computes +basal expression for gene expression. The scaling factors are generated +using the function \code{generate_basal_expression}. + This function takes the coefficients data frame \code{dtf_coef} and computes basal expression for gene expression. The scaling factors are generated using the function \code{generate_basal_expression}. @@ -28,4 +38,9 @@ N_GENES <- 5 dtf_coef <- getInput2simulation(list_var, N_GENES) dtf_coef <- getLog_qij(dtf_coef) addBasalExpression(dtf_coef, N_GENES, 1) +list_var <- init_variable() +N_GENES <- 5 +dtf_coef <- getInput2simulation(list_var, N_GENES) +dtf_coef <- getLog_qij(dtf_coef) +addBasalExpression(dtf_coef, N_GENES, 1) } diff --git a/man/fitModelParallel.Rd b/man/fitModelParallel.Rd index a37d644..9212c78 100644 --- a/man/fitModelParallel.Rd +++ b/man/fitModelParallel.Rd @@ -10,7 +10,7 @@ fitModelParallel( data, group_by, n.cores = NULL, - log_file = "log.txt", + log_file = paste(tempdir(check = FALSE), "htrfit.log", sep = "/"), ... ) } @@ -24,7 +24,7 @@ fitModelParallel( \item{n.cores}{The number of CPU cores to use for parallel processing. If set to NULL (default), the number of available CPU cores will be automatically detected.} -\item{log_file}{File path to save the log messages (default : log.txt)} +\item{log_file}{File path to save the log messages (default : Rtmpdir/htrfit.log)} \item{...}{Additional arguments to be passed to the glmmTMB::glmmTMB function} } diff --git a/man/generate_basal_expression.Rd b/man/generate_basal_expression.Rd index 3e68b06..3c96870 100644 --- a/man/generate_basal_expression.Rd +++ b/man/generate_basal_expression.Rd @@ -1,9 +1,12 @@ % Generated by roxygen2: do not edit by hand -% Please edit documentation in R/basal_expression__scaling.R +% Please edit documentation in R/basal_expression__scaling.R, +% R/basal_expression_scaling.R \name{generate_basal_expression} \alias{generate_basal_expression} \title{Generate BE data.} \usage{ +generate_basal_expression(n_genes, basal_expression) + generate_basal_expression(n_genes, basal_expression) } \arguments{ @@ -12,12 +15,18 @@ generate_basal_expression(n_genes, basal_expression) \item{basal_expression}{a numeric vector from which sample BE for eacg genes} } \value{ +A data frame containing gene IDs, BE values + A data frame containing gene IDs, BE values } \description{ +This function generates basal expression data for a given number of genes, in a vector of basal expression values. + This function generates basal expression data for a given number of genes, in a vector of basal expression values. } \examples{ generate_basal_expression(n_genes = 100, 10) +generate_basal_expression(n_genes = 100, 10) + } diff --git a/man/getBinExpression.Rd b/man/getBinExpression.Rd index 7c4ecd1..f0c234e 100644 --- a/man/getBinExpression.Rd +++ b/man/getBinExpression.Rd @@ -1,9 +1,12 @@ % Generated by roxygen2: do not edit by hand -% Please edit documentation in R/basal_expression__scaling.R +% Please edit documentation in R/basal_expression__scaling.R, +% R/basal_expression_scaling.R \name{getBinExpression} \alias{getBinExpression} \title{Get bin expression for a data frame.} \usage{ +getBinExpression(dtf_coef, n_bins) + getBinExpression(dtf_coef, n_bins) } \arguments{ @@ -12,9 +15,14 @@ getBinExpression(dtf_coef, n_bins) \item{n_bins}{The number of bins to create.} } \value{ +A data frame with an additional column named \code{binExpression}, containing the bin labels. + A data frame with an additional column named \code{binExpression}, containing the bin labels. } \description{ +This function divides the values of a specified column in a data frame into \code{n_bins} bins of equal width. +The bin labels are then added as a new column in the data frame. + This function divides the values of a specified column in a data frame into \code{n_bins} bins of equal width. The bin labels are then added as a new column in the data frame. } @@ -22,4 +30,7 @@ The bin labels are then added as a new column in the data frame. dtf <- data.frame(mu_ij = c(10, 20, 30, 15, 25, 35, 40, 5, 12, 22)) dtf_with_bins <- getBinExpression(dtf, n_bins = 3) +dtf <- data.frame(mu_ij = c(10, 20, 30, 15, 25, 35, 40, 5, 12, 22)) +dtf_with_bins <- getBinExpression(dtf, n_bins = 3) + } diff --git a/man/parallel_fit.Rd b/man/parallel_fit.Rd index 0474bfb..77ebb71 100644 --- a/man/parallel_fit.Rd +++ b/man/parallel_fit.Rd @@ -5,7 +5,15 @@ \title{Fit models in parallel for each group using mclapply and handle logging. Uses parallel_fit to fit the models.} \usage{ -parallel_fit(groups, group_by, formula, data, n.cores = NULL, log_file, ...) +parallel_fit( + groups, + group_by, + formula, + data, + n.cores = NULL, + log_file = paste(tempdir(check = FALSE), "htrfit.log", sep = "/"), + ... +) } \arguments{ \item{groups}{Vector of unique group values} @@ -19,7 +27,7 @@ parallel_fit(groups, group_by, formula, data, n.cores = NULL, log_file, ...) \item{n.cores}{The number of CPU cores to use for parallel processing. If set to NULL (default), the number of available CPU cores will be automatically detected.} -\item{log_file}{File to write log (default : log.txt)} +\item{log_file}{File to write log (default : Rtmpdir/htrfit.log)} \item{...}{Additional arguments to be passed to the glmmTMB::glmmTMB function} } @@ -33,5 +41,5 @@ Uses parallel_fit to fit the models. \examples{ parallel_fit(group_by = "Species", "setosa", formula = Sepal.Length ~ Sepal.Width + Petal.Length, - data = iris, n.cores = 1, log_file = "log.txt" ) + data = iris, n.cores = 1 ) } diff --git a/man/parallel_update.Rd b/man/parallel_update.Rd index ca1c208..52d6d59 100644 --- a/man/parallel_update.Rd +++ b/man/parallel_update.Rd @@ -4,7 +4,13 @@ \alias{parallel_update} \title{Internal function to fit glmmTMB models in parallel.} \usage{ -parallel_update(formula, list_tmb, n.cores = NULL, log_file = "log.txt", ...) +parallel_update( + formula, + list_tmb, + n.cores = NULL, + log_file = paste(tempdir(check = FALSE), "htrfit.log", sep = "/"), + ... +) } \arguments{ \item{formula}{Formula for the GLMNB model.} @@ -13,7 +19,7 @@ parallel_update(formula, list_tmb, n.cores = NULL, log_file = "log.txt", ...) \item{n.cores}{Number of cores to use for parallel processing.} -\item{log_file}{File path for the log output.} +\item{log_file}{File path for the log output (default : Rtmpdir/htrfit.log).} \item{...}{Additional arguments to be passed to the glmmTMB::glmmTMB function.} } diff --git a/man/updateParallel.Rd b/man/updateParallel.Rd index 4c0b838..7f779e1 100644 --- a/man/updateParallel.Rd +++ b/man/updateParallel.Rd @@ -4,7 +4,13 @@ \alias{updateParallel} \title{Update glmmTMB models in parallel.} \usage{ -updateParallel(formula, list_tmb, n.cores = NULL, log_file = "log.txt", ...) +updateParallel( + formula, + list_tmb, + n.cores = NULL, + log_file = paste(tempdir(check = FALSE), "htrfit.log", sep = "/"), + ... +) } \arguments{ \item{formula}{Formula for the GLMNB model.} @@ -13,7 +19,7 @@ updateParallel(formula, list_tmb, n.cores = NULL, log_file = "log.txt", ...) \item{n.cores}{Number of cores to use for parallel processing. If NULL, the function will use all available cores.} -\item{log_file}{File path for the log output.} +\item{log_file}{File path for the log output (default: Rtmpdir/htrfit.log).} \item{...}{Additional arguments to be passed to the glmmTMB::glmmTMB function.} } diff --git a/tests/testthat/test-basal_expression_scaling.R b/tests/testthat/test-basal_expression_scaling.R new file mode 100644 index 0000000..86876f8 --- /dev/null +++ b/tests/testthat/test-basal_expression_scaling.R @@ -0,0 +1,75 @@ +# WARNING - Generated by {fusen} from /dev/flat_full.Rmd: do not edit by hand + + +test_that("generate_basal_expression returns correct number of genes", { + be_data <- generate_basal_expression(n_genes = 100, 1) + expect_equal(nrow(be_data), 100) +}) + + +test_that("generate_basal_expression returns BE values within specified vector", { + BE_vec <- c(1, 2, 33, 0.4) + be_data <- generate_basal_expression(n_genes = 100, BE_vec) + expect_true(all(be_data$basalExpr %in% BE_vec)) +}) + + +test_that("Test for addbasalExpre function",{ + + list_var <- init_variable() + N_GENES <- 5 + dtf_coef <- getInput2simulation(list_var, N_GENES) + dtf_coef <- getLog_qij(dtf_coef) + + # Test the function + dtf_coef_with_BE <- addBasalExpression(dtf_coef, N_GENES, 5) + + # Check if the output is a data frame + expect_true(is.data.frame(dtf_coef_with_BE)) + + # Check if the number of rows is equal to number of row in dtf_coef + expect_equal(nrow(dtf_coef_with_BE), nrow(dtf_coef)) + + # Check if the number of rows is equal to number of row in dtf_coef +1 + expect_equal(ncol(dtf_coef_with_BE), ncol(dtf_coef)+1) + + # Check if the data frame has a new column "BE" + expect_true("basalExpr" %in% colnames(dtf_coef_with_BE)) + + # Check if the values in the "BE" column are numeric + expect_true(all(is.numeric(dtf_coef_with_BE$basalExpr))) + +}) + + +# Test 1: Check if the function returns the correct number of bins +test_that("getBinExpression returns the correct number of bins", { + dtf <- data.frame(mu_ij = c(10, 20, 30, 15, 25, 35, 40, 5, 12, 22)) + n_bins <- 3 + dtf_with_bins <- getBinExpression(dtf, n_bins) + expect_equal(nrow(dtf_with_bins), nrow(dtf), label = "Number of rows should remain the same") + expect_equal(ncol(dtf_with_bins), ncol(dtf) + 1, label = "Number of columns should increase by 1") +}) + +# Test 2: Check if the function adds the binExpression column correctly +test_that("getBinExpression adds the binExpression column correctly", { + dtf <- data.frame(mu_ij = c(10, 20, 30, 15, 25, 35, 40, 5, 12, 22)) + n_bins <- 3 + dtf_with_bins <- getBinExpression(dtf, n_bins) + expected_bins <- c("BinExpression_1", "BinExpression_2", "BinExpression_3", "BinExpression_1", "BinExpression_2", + "BinExpression_3", "BinExpression_3", "BinExpression_1", "BinExpression_1", "BinExpression_2") + expect_equal(dtf_with_bins$binExpression, factor(expected_bins)) +}) + +# Test 3: Check if the function handles negative values correctly +test_that("getBinExpression handles negative values correctly", { + dtf <- data.frame(mu_ij = c(10, -20, 30, -15, 25, 35, -40, 5, 12, -22)) + n_bins <- 4 + dtf_with_bins <- getBinExpression(dtf, n_bins) + expected_bins <- c("BinExpression_3", "BinExpression_2", "BinExpression_4", "BinExpression_2", "BinExpression_4", + "BinExpression_4", "BinExpression_1", "BinExpression_3", "BinExpression_3", "BinExpression_1") + expect_equal(dtf_with_bins$binExpression, factor(expected_bins)) +}) + + + diff --git a/tests/testthat/test-fitmodel.R b/tests/testthat/test-fitmodel.R index 5307cf2..f1a3d80 100644 --- a/tests/testthat/test-fitmodel.R +++ b/tests/testthat/test-fitmodel.R @@ -197,14 +197,14 @@ test_that("parallel_fit returns a list of fitted model objects or NULL for any e groups <- unique(iris$Species) group_by <- "Species" formula <- Sepal.Length ~ Sepal.Width + Petal.Length - fitted_models <- parallel_fit(groups, group_by, formula, iris, log_file = "log.txt", n.cores = 1) + fitted_models <- parallel_fit(groups, group_by, formula, iris, n.cores = 1) expect_s3_class(fitted_models$setosa, "glmmTMB") expect_length(fitted_models, length(groups)) # Test with invalid formula invalid_formula <- blabla ~ cyl + disp result <- suppressWarnings(parallel_fit(groups, group_by, invalid_formula, - iris, log_file = "log.txt", n.cores = 1)) + iris, n.cores = 1)) expect_equal(result, expected = list(setosa = NULL, versicolor = NULL, virginica = NULL)) @@ -215,7 +215,6 @@ test_that("parallel_fit returns a list of fitted model objects or NULL for any e data = iris, group_by = group_by, groups = "setosa", - log_file = "log.txt", n.cores = 1, family = glmmTMB::nbinom1(link = "log") )) expect_s3_class(fitted_models$setosa$call$family, "family") @@ -225,7 +224,6 @@ test_that("parallel_fit returns a list of fitted model objects or NULL for any e data = iris, group_by = group_by, groups = "setosa", - log_file = "log.txt", family = glmmTMB::nbinom1(link = "log"), n.cores = 1, control = glmmTMB::glmmTMBControl(optCtrl=list(iter.max=1e3, @@ -245,7 +243,7 @@ test_that("fitModelParallel fits models in parallel for each group and returns a expect_length(fitted_models, length(groups)) invalid_formula <- blabla ~ cyl + disp - expect_error(fitModelParallel(invalid_formula, iris, group_by ,log_file = "log.txt", n.cores = 1)) + expect_error(fitModelParallel(invalid_formula, iris, group_by , n.cores = 1)) # Additional parameters: #change family + formula diff --git a/vignettes/htrfit.Rmd b/vignettes/htrfit.Rmd index b2a9128..cbd246a 100644 --- a/vignettes/htrfit.Rmd +++ b/vignettes/htrfit.Rmd @@ -15,6 +15,7 @@ knitr::opts_chunk$set( ``` ```{r setup} +devtools::load_all() library(HTRfit) ``` @@ -154,7 +155,7 @@ mock_data <- mock_rnaseq(input_var_list, N_GENES, ## Set gene dispersion -The dispersion parameter (\alpha_i), characterizes the relationship between the variance of the observed count and its mean value. In simple terms, it quantifies how much we expect the observed count to deviate from the mean value. You can specify the dispersion for individual genes using the dispersion parameter. +The dispersion parameter ($\alpha_i$), characterizes the relationship between the variance of the observed count and its mean value. In simple terms, it quantifies how much we expect the observed count to deviate from the mean value. You can specify the dispersion for individual genes using the dispersion parameter. ```{r example-mock_rnaseq_disp, warning = FALSE, message = FALSE} @@ -208,7 +209,7 @@ names(mock_data) In this modeling framework, counts denoted as $K_{ij}$ for gene i and sample j are generated using a negative binomial distribution. The negative binomial distribution considers a fitted mean $\mu_{ij}$ and a gene-specific dispersion parameter $\alpha_i$. -The fitted mean $\mu_{ij}$ is determined by a parameter, qij, which is proportionally related to the sum of all effects specified using `init_variable()` or `add_interaction()`. If basal gene expressions are provided, the $\mu_{ij}$ values are scaled accordingly using the gene-specific basal expression value ($bexpr_i$). +The fitted mean $\mu_{ij}$ is determined by a parameter, $q_{ij}$, which is proportionally related to the sum of all effects specified using `init_variable()` or `add_interaction()`. If basal gene expressions are provided, the $\mu_{ij}$ values are scaled accordingly using the gene-specific basal expression value ($bexpr_i$). Furthermore, the coefficients $\beta_i$ represent the natural logarithm fold changes for gene i across each column of the model matrix X. The dispersion parameter $\alpha_i$ plays a crucial role in defining the relationship between the variance of observed counts and their mean value. In simpler terms, it quantifies how far we expect observed counts to deviate from the mean value. @@ -255,7 +256,6 @@ l_tmb <- fitModelParallel(formula = kij ~ varA, data = data2fit, group_by = "geneID", family = glmmTMB::nbinom2(link = "log"), - log_file = "log.txt", n.cores = 1) ``` @@ -270,7 +270,6 @@ l_tmb <- fitModelParallel(formula = kij ~ varA + ( 1 | varB ), data = data2fit, group_by = "geneID", family = glmmTMB::nbinom2(link = "log"), - log_file = "log.txt", n.cores = 1) ``` @@ -285,7 +284,6 @@ l_tmb <- fitModelParallel(formula = kij ~ varA, data = data2fit, group_by = "geneID", n.cores = 1, - log_file = "log.txt", family = glmmTMB::nbinom2(link = "log"), control = glmmTMB::glmmTMBControl(optCtrl=list(iter.max=1e5, eval.max=1e5))) @@ -303,7 +301,6 @@ l_tmb <- fitModelParallel(formula = Sepal.Length ~ Sepal.Width + Petal.Length + data = iris, group_by = "Species", family = gaussian(), - log_file = "log.txt", n.cores = 1) ``` @@ -317,14 +314,12 @@ The `updateParallel()` function updates and re-fits a model for each gene. It of l_tmb <- updateParallel(formula = kij ~ varA, list_tmb = l_tmb , family = gaussian(), - log_file = "log.txt", n.cores = 1) ## -- update fit using additional model control settings l_tmb <- updateParallel(formula = kij ~ varA , list_tmb = l_tmb , family = gaussian(), - log_file = "log.txt", n.cores = 1, control = glmmTMB::glmmTMBControl(optCtrl=list(iter.max=1e3, eval.max=1e3))) @@ -334,7 +329,6 @@ l_tmb <- updateParallel(formula = kij ~ varA , l_tmb <- updateParallel(formula = kij ~ varA + varB + varA:varB , list_tmb = l_tmb , family = glmmTMB::nbinom2(link = "log"), - log_file = "log.txt", n.cores = 1) ## -- output @@ -500,7 +494,6 @@ l_tmb <- fitModelParallel(formula = kij ~ varB + (varB | varA), data = data2fit, group_by = "geneID", family = glmmTMB::nbinom2(link = "log"), - log_file = "log.txt", n.cores = 1) ## -- output l_tmb$gene1 -- GitLab