diff --git a/NAMESPACE b/NAMESPACE index e77736fa579b8a18407b5391ac607036007488da..4ec440dd0859d60a0f8a023969c04046f5634fc6 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -100,6 +100,8 @@ export(get_eval_data_from_dds) export(get_eval_data_from_ltmb) export(get_inference_dds) export(get_label_y_position) +export(get_mad_left_threshold) +export(get_mad_user_message) export(get_ml_metrics_obj) export(get_performances_metrics_obj) export(get_pr_curve) @@ -110,6 +112,7 @@ export(get_rsquare_2plot) export(glance_tmb) export(group_logQij_per_genes_and_labels) export(handleAnovaError) +export(identifyTopFit) export(inferenceToExpected_withFixedEff) export(inferenceToExpected_withMixedEff) export(init_variable) @@ -203,6 +206,7 @@ importFrom(stats,approxfun) importFrom(stats,as.formula) importFrom(stats,cor) importFrom(stats,drop.terms) +importFrom(stats,mad) importFrom(stats,median) importFrom(stats,model.matrix) importFrom(stats,p.adjust) diff --git a/R/filtering_fit.R b/R/filtering_fit.R new file mode 100644 index 0000000000000000000000000000000000000000..302c9121cbce6ad7d8100a9fce6fe1d0bce8e380 --- /dev/null +++ b/R/filtering_fit.R @@ -0,0 +1,117 @@ +# WARNING - Generated by {fusen} from dev/flat_full.Rmd: do not edit by hand + + +#' Identify top or low fitting observations based on specified diagnostic metric and filtering method. +#' +#' This function identifies top or low fitting observations based on a specified metric and filtering method. +#' +#' @param list_tmb List of glmmTMB objects. +#' @param metric The metric used for diagnostic (e.g., "AIC", "BIC", "logLik", "deviance", "dispersion"). +#' @param filter_method The filtering method to be used (e.g., "mad"). +#' Feel free to implement your own filetering method +#' @param keep Whether to keep "top" or "low" fitting observations. +#' @param sort Logical indicating whether to sort the results. +#' @param decreasing Logical indicating whether to sort in decreasing order. +#' @param mad_tolerance Tolerance for MAD-based filtering. +#' @return A character vector of row names corresponding to the top or low fitting observations. +#' @export +#' +#' @examples +#' input_var_list <- init_variable() +#' ## -- simulate RNAseq data +#' mock_data <- mock_rnaseq(input_var_list, +#' n_genes = 5, +#' min_replicates = 3, +#' max_replicates = 3, +#' basal_expression = 2, +#' sequencing_depth = 1e5) +#' ## -- prepare data & fit a model with mixed effect +#' data2fit = prepareData2fit(countMatrix = mock_data$counts, +#' metadata = mock_data$metadata) +#'l_tmb <- fitModelParallel(formula = kij ~ myVariable, data = data2fit, +#' group_by = "geneID", family = glmmTMB::nbinom2(link = "log"), +#' n.cores = 1) +#' # Identify top fitting observations based on AIC with MAD filtering +#' identifyTopFit(l_tmb, metric = "AIC", filter_method = "mad", keep = "top", sort = TRUE, decreasing = TRUE, mad_tolerance = 3) +#' +#' # Identify low fitting observations based on BIC without sorting +#' identifyTopFit(l_tmb, metric = "BIC", filter_method = "mad", keep = "low", sort = FALSE) +#' +#' # Identify top fitting observations based on log-likelihood with MAD filtering and custom tolerance +#' identifyTopFit(l_tmb, metric = "logLik", filter_method = "mad", keep = "top", mad_tolerance = 2) +identifyTopFit <- function(list_tmb, metric = "AIC", filter_method = "mad", keep = "top", + sort = F, decreasing = T, mad_tolerance = 3){ + + ## -- verif + stopifnot(is.list(list_tmb)) + stopifnot(metric %in% c("AIC", "BIC", "logLik", "deviance", "dispersion")) + + glance_df <- glance_tmb(list_tmb) + ## -- MAD method + if (filter_method == "mad"){ + lft_threshold <- get_mad_left_threshold(glance_df[[metric]], mad_tolerance) + get_mad_user_message(metric, glance_df[[metric]], lft_threshold, keep) + index2keep <- if (keep == "top") glance_df[[metric]] > lft_threshold else glance_df[[metric]] < lft_threshold + id2keep <- rownames(glance_df)[index2keep] + glance_df <- glance_df[ id2keep , ] + } ## -- feel free to implement other filtering methods .. + + ## -- sort + if (isTRUE(sort)){ + id2keep <- rownames(glance_df)[order(glance_df[[metric]], decreasing = decreasing)] + } + return(id2keep) +} + +#' Calculate the left threshold for MAD-based filtering. +#' +#' This function calculates the left threshold for MAD-based filtering. +#' +#' @param x Vector of values used for filtering. +#' @param tolerance Tolerance value for MAD-based filtering. +#' @return The left threshold value. +#' @importFrom stats median mad +#' @export +#' +#' @examples +#' # Calculate the left threshold for MAD-based filtering with tolerance 3 +#' get_mad_left_threshold(c(100, 200, 300), 3) +#' +#' # Calculate the left threshold for MAD-based filtering with tolerance 2 +#' get_mad_left_threshold(c(50, 75, 100), 2) +get_mad_left_threshold <- function(x, tolerance){ + med <- stats::median(x) + mad <- stats::mad(x, center = median(x), + constant = 1, na.rm = TRUE, + low = FALSE, high = FALSE) + med - (tolerance * mad) +} + +#' Generate user message for MAD filtering. +#' +#' This function generates a user message explaining the MAD filtering process. +#' +#' @param var The name of the metric used for filtering. +#' @param x Vector of values used for filtering. +#' @param left_threshold The left threshold for filtering. +#' @param keep Whether to keep "top" or "low" observations. +#' @return A message explaining the MAD filtering process. +#' @export +#' +#' @examples +#' # Generate user message for MAD filtering with top fitting observations +#' get_mad_user_message("AIC", c(100, 200, 300), 150, "top") +#' +#' # Generate user message for MAD filtering with low fitting observations +#' get_mad_user_message("BIC", c(50, 75, 100), 75, "low") +get_mad_user_message <- function(var, x , left_threshold, keep) { + message("Based on the specified metric (", var, ") and the MAD filtering method, the following selection criteria were applied:") + message("1. The MAD-based threshold for considering outliers was calculated.") + keep_str <- ifelse(keep == "top", "above", "bellow") + message(paste("2. Values", keep_str , "the threshold were identified, threshold:", left_threshold)) + message("3. Summary of selection:") + nb_keep <- ifelse(keep == "top", length(x[x > left_threshold]), length(x[x < left_threshold]) ) + sub_msg2 <- paste("values", keep_str, "the threshold") + message(paste("-", nb_keep, "out of", length(x), "observations had", sub_msg2, "for the", var, "metric.")) +} + diff --git a/dev/flat_full.Rmd b/dev/flat_full.Rmd index 32fbba8ecdc446281c21728a0a810d91076fddc0..be40c69583b9eaee0554cc5468c57c490ea3fcf5 100644 --- a/dev/flat_full.Rmd +++ b/dev/flat_full.Rmd @@ -3142,6 +3142,177 @@ test_that("fitModelParallel fits models in parallel for each group and returns a ``` +```{r function-filtering_fit, filename = "filtering_fit"} + +#' Identify top or low fitting observations based on specified diagnostic metric and filtering method. +#' +#' This function identifies top or low fitting observations based on a specified metric and filtering method. +#' +#' @param list_tmb List of glmmTMB objects. +#' @param metric The metric used for diagnostic (e.g., "AIC", "BIC", "logLik", "deviance", "dispersion"). +#' @param filter_method The filtering method to be used (e.g., "mad"). +#' Feel free to implement your own filetering method +#' @param keep Whether to keep "top" or "low" fitting observations. +#' @param sort Logical indicating whether to sort the results. +#' @param decreasing Logical indicating whether to sort in decreasing order. +#' @param mad_tolerance Tolerance for MAD-based filtering. +#' @return A character vector of row names corresponding to the top or low fitting observations. +#' @export +#' +#' @examples +#' input_var_list <- init_variable() +#' ## -- simulate RNAseq data +#' mock_data <- mock_rnaseq(input_var_list, +#' n_genes = 5, +#' min_replicates = 3, +#' max_replicates = 3, +#' basal_expression = 2, +#' sequencing_depth = 1e5) +#' ## -- prepare data & fit a model with mixed effect +#' data2fit = prepareData2fit(countMatrix = mock_data$counts, +#' metadata = mock_data$metadata) +#'l_tmb <- fitModelParallel(formula = kij ~ myVariable, data = data2fit, +#' group_by = "geneID", family = glmmTMB::nbinom2(link = "log"), +#' n.cores = 1) +#' # Identify top fitting observations based on AIC with MAD filtering +#' identifyTopFit(l_tmb, metric = "AIC", filter_method = "mad", keep = "top", +#' sort = TRUE, decreasing = TRUE, mad_tolerance = 3) +#' +#' # Identify low fitting observations based on BIC without sorting +#' identifyTopFit(l_tmb, metric = "BIC", filter_method = "mad", keep = "low", sort = FALSE) +#' +#' # Identify top fitting observations based on log-likelihood with MAD filtering and custom tolerance +#' identifyTopFit(l_tmb, metric = "logLik", filter_method = "mad", keep = "top", mad_tolerance = 2) +identifyTopFit <- function(list_tmb, metric = "AIC", filter_method = "mad", keep = "top", + sort = F, decreasing = T, mad_tolerance = 3){ + + ## -- verif + stopifnot(is.list(list_tmb)) + stopifnot(metric %in% c("AIC", "BIC", "logLik", "deviance", "dispersion")) + + glance_df <- glance_tmb(list_tmb) + ## -- MAD method + if (filter_method == "mad"){ + lft_threshold <- get_mad_left_threshold(glance_df[[metric]], mad_tolerance) + get_mad_user_message(metric, glance_df[[metric]], lft_threshold, keep) + index2keep <- if (keep == "top") glance_df[[metric]] > lft_threshold else glance_df[[metric]] < lft_threshold + id2keep <- rownames(glance_df)[index2keep] + glance_df <- glance_df[ id2keep , ] + } ## -- feel free to implement other filtering methods .. + + ## -- sort + if (isTRUE(sort)){ + id2keep <- rownames(glance_df)[order(glance_df[[metric]], decreasing = decreasing)] + } + return(id2keep) +} + +#' Calculate the left threshold for MAD-based filtering. +#' +#' This function calculates the left threshold for MAD-based filtering. +#' +#' @param x Vector of values used for filtering. +#' @param tolerance Tolerance value for MAD-based filtering. +#' @return The left threshold value. +#' @importFrom stats median mad +#' @export +#' +#' @examples +#' # Calculate the left threshold for MAD-based filtering with tolerance 3 +#' get_mad_left_threshold(c(100, 200, 300), 3) +#' +#' # Calculate the left threshold for MAD-based filtering with tolerance 2 +#' get_mad_left_threshold(c(50, 75, 100), 2) +get_mad_left_threshold <- function(x, tolerance){ + med <- stats::median(x) + mad <- stats::mad(x, center = median(x), + constant = 1, na.rm = TRUE, + low = FALSE, high = FALSE) + med - (tolerance * mad) +} + +#' Generate user message for MAD filtering. +#' +#' This function generates a user message explaining the MAD filtering process. +#' +#' @param var The name of the metric used for filtering. +#' @param x Vector of values used for filtering. +#' @param left_threshold The left threshold for filtering. +#' @param keep Whether to keep "top" or "low" observations. +#' @return A message explaining the MAD filtering process. +#' @export +#' +#' @examples +#' # Generate user message for MAD filtering with top fitting observations +#' get_mad_user_message("AIC", c(100, 200, 300), 150, "top") +#' +#' # Generate user message for MAD filtering with low fitting observations +#' get_mad_user_message("BIC", c(50, 75, 100), 75, "low") +get_mad_user_message <- function(var, x , left_threshold, keep) { + message("Based on the specified metric (", var, ") and the MAD filtering method, the following selection criteria were applied:") + message("1. The MAD-based threshold for considering outliers was calculated.") + keep_str <- ifelse(keep == "top", "above", "bellow") + message(paste("2. Values", keep_str , "the threshold were identified, threshold:", left_threshold)) + message("3. Summary of selection:") + nb_keep <- ifelse(keep == "top", length(x[x > left_threshold]), length(x[x < left_threshold]) ) + sub_msg2 <- paste("values", keep_str, "the threshold") + message(paste("-", nb_keep, "out of", length(x), "observations had", sub_msg2, "for the", var, "metric.")) +} + +``` + + +```{r test-update_fittedmodel} + +# Tests for identifyTopFit function +test_that("identifyTopFit correctly identifies top-fitting objects", { + input_var_list <- init_variable() + ## -- simulate RNAseq data + set.seed(101) + mock_data <- mock_rnaseq(input_var_list, + n_genes = 5, + min_replicates = 3, + max_replicates = 3, + basal_expression = 2, + sequencing_depth = 1e5) + ## -- prepare data & fit a model with mixed effect + data2fit = prepareData2fit(countMatrix = mock_data$counts, + metadata = mock_data$metadata, + normalization = T) + l_tmb <- fitModelParallel(formula = kij ~ myVariable, data = data2fit, + group_by = "geneID", family = glmmTMB::nbinom2(link = "log"), + n.cores = 1) + glance_tmb(l_tmb) + # Identify top fitting observations based on AIC with MAD filtering + top_genes <- identifyTopFit(l_tmb, metric = "AIC", filter_method = "mad", keep = "top", + sort = TRUE, decreasing = TRUE, mad_tolerance = 3) + expect_equal(top_genes, c("gene5", "gene4", "gene1", "gene2", "gene3")) + # Identify low fitting observations based on BIC without sorting + top_genes <-identifyTopFit(l_tmb, metric = "BIC", filter_method = "mad", keep = "low", sort = FALSE) + expect_equal(top_genes, character()) + + # Identify top fitting observations based on log-likelihood with MAD filtering and custom tolerance + top_genes <- identifyTopFit(l_tmb, metric = "logLik", filter_method = "mad", keep = "top", mad_tolerance = 2) + expect_equal(top_genes, c("gene1", "gene2", "gene3", "gene4")) +}) + +# Tests for get_mad_left_threshold function +test_that("get_mad_left_threshold correctly calculates MAD-based left threshold", { + left_threshold <- get_mad_left_threshold(c(100, 200, 300), 3) + expect_equal(left_threshold, 200 - (3 * 100)) +}) + + +# Tests for get_mad_message +test_that('get_mad_user_message return correct output', { + expect_message(get_mad_user_message("BIC", c(50, 75, 100), 75, "low")) + expect_message(get_mad_user_message("BIC", c(50, 75, 100), 75, "top")) +}) + + +``` + + ```{r function-update_fittedmodel, filename = "update_fittedmodel"} diff --git a/man/get_mad_left_threshold.Rd b/man/get_mad_left_threshold.Rd new file mode 100644 index 0000000000000000000000000000000000000000..ddcd334b9df4f2fc4d4b6d8f514d1fbfa1a50641 --- /dev/null +++ b/man/get_mad_left_threshold.Rd @@ -0,0 +1,26 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/filtering_fit.R +\name{get_mad_left_threshold} +\alias{get_mad_left_threshold} +\title{Calculate the left threshold for MAD-based filtering.} +\usage{ +get_mad_left_threshold(x, tolerance) +} +\arguments{ +\item{x}{Vector of values used for filtering.} + +\item{tolerance}{Tolerance value for MAD-based filtering.} +} +\value{ +The left threshold value. +} +\description{ +This function calculates the left threshold for MAD-based filtering. +} +\examples{ +# Calculate the left threshold for MAD-based filtering with tolerance 3 +get_mad_left_threshold(c(100, 200, 300), 3) + +# Calculate the left threshold for MAD-based filtering with tolerance 2 +get_mad_left_threshold(c(50, 75, 100), 2) +} diff --git a/man/get_mad_user_message.Rd b/man/get_mad_user_message.Rd new file mode 100644 index 0000000000000000000000000000000000000000..bf14154d4ab63f3468f66a78a0a6d700b35e43c2 --- /dev/null +++ b/man/get_mad_user_message.Rd @@ -0,0 +1,30 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/filtering_fit.R +\name{get_mad_user_message} +\alias{get_mad_user_message} +\title{Generate user message for MAD filtering.} +\usage{ +get_mad_user_message(var, x, left_threshold, keep) +} +\arguments{ +\item{var}{The name of the metric used for filtering.} + +\item{x}{Vector of values used for filtering.} + +\item{left_threshold}{The left threshold for filtering.} + +\item{keep}{Whether to keep "top" or "low" observations.} +} +\value{ +A message explaining the MAD filtering process. +} +\description{ +This function generates a user message explaining the MAD filtering process. +} +\examples{ +# Generate user message for MAD filtering with top fitting observations +get_mad_user_message("AIC", c(100, 200, 300), 150, "top") + +# Generate user message for MAD filtering with low fitting observations +get_mad_user_message("BIC", c(50, 75, 100), 75, "low") +} diff --git a/man/identifyTopFit.Rd b/man/identifyTopFit.Rd new file mode 100644 index 0000000000000000000000000000000000000000..300e0b587bd5587f2cf0f448b0187d5ce868f67e --- /dev/null +++ b/man/identifyTopFit.Rd @@ -0,0 +1,62 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/filtering_fit.R +\name{identifyTopFit} +\alias{identifyTopFit} +\title{Identify top or low fitting observations based on specified diagnostic metric and filtering method.} +\usage{ +identifyTopFit( + list_tmb, + metric = "AIC", + filter_method = "mad", + keep = "top", + sort = F, + decreasing = T, + mad_tolerance = 3 +) +} +\arguments{ +\item{list_tmb}{List of glmmTMB objects.} + +\item{metric}{The metric used for diagnostic (e.g., "AIC", "BIC", "logLik", "deviance", "dispersion").} + +\item{filter_method}{The filtering method to be used (e.g., "mad"). +Feel free to implement your own filetering method} + +\item{keep}{Whether to keep "top" or "low" fitting observations.} + +\item{sort}{Logical indicating whether to sort the results.} + +\item{decreasing}{Logical indicating whether to sort in decreasing order.} + +\item{mad_tolerance}{Tolerance for MAD-based filtering.} +} +\value{ +A character vector of row names corresponding to the top or low fitting observations. +} +\description{ +This function identifies top or low fitting observations based on a specified metric and filtering method. +} +\examples{ +input_var_list <- init_variable() +## -- simulate RNAseq data +mock_data <- mock_rnaseq(input_var_list, + n_genes = 5, + min_replicates = 3, + max_replicates = 3, + basal_expression = 2, + sequencing_depth = 1e5) +## -- prepare data & fit a model with mixed effect +data2fit = prepareData2fit(countMatrix = mock_data$counts, + metadata = mock_data$metadata) +l_tmb <- fitModelParallel(formula = kij ~ myVariable, data = data2fit, + group_by = "geneID", family = glmmTMB::nbinom2(link = "log"), + n.cores = 1) +# Identify top fitting observations based on AIC with MAD filtering +identifyTopFit(l_tmb, metric = "AIC", filter_method = "mad", keep = "top", sort = TRUE, decreasing = TRUE, mad_tolerance = 3) + +# Identify low fitting observations based on BIC without sorting +identifyTopFit(l_tmb, metric = "BIC", filter_method = "mad", keep = "low", sort = FALSE) + +# Identify top fitting observations based on log-likelihood with MAD filtering and custom tolerance +identifyTopFit(l_tmb, metric = "logLik", filter_method = "mad", keep = "top", mad_tolerance = 2) +} diff --git a/tests/testthat/test-filtering_fit.R b/tests/testthat/test-filtering_fit.R new file mode 100644 index 0000000000000000000000000000000000000000..021f0d85d500137406fa4b2073bc246523e1e5d5 --- /dev/null +++ b/tests/testthat/test-filtering_fit.R @@ -0,0 +1,49 @@ +# WARNING - Generated by {fusen} from dev/flat_full.Rmd: do not edit by hand + + +# Tests for identifyTopFit function +test_that("identifyTopFit correctly identifies top-fitting objects", { + input_var_list <- init_variable() + ## -- simulate RNAseq data + set.seed(101) + mock_data <- mock_rnaseq(input_var_list, + n_genes = 5, + min_replicates = 3, + max_replicates = 3, + basal_expression = 2, + sequencing_depth = 1e5) + ## -- prepare data & fit a model with mixed effect + data2fit = prepareData2fit(countMatrix = mock_data$counts, + metadata = mock_data$metadata, + normalization = T) + l_tmb <- fitModelParallel(formula = kij ~ myVariable, data = data2fit, + group_by = "geneID", family = glmmTMB::nbinom2(link = "log"), + n.cores = 1) + glance_tmb(l_tmb) + # Identify top fitting observations based on AIC with MAD filtering + top_genes <- identifyTopFit(l_tmb, metric = "AIC", filter_method = "mad", keep = "top", + sort = TRUE, decreasing = TRUE, mad_tolerance = 3) + expect_equal(top_genes, c("gene5", "gene4", "gene1", "gene2", "gene3")) + # Identify low fitting observations based on BIC without sorting + top_genes <-identifyTopFit(l_tmb, metric = "BIC", filter_method = "mad", keep = "low", sort = FALSE) + expect_equal(top_genes, character()) + + # Identify top fitting observations based on log-likelihood with MAD filtering and custom tolerance + top_genes <- identifyTopFit(l_tmb, metric = "logLik", filter_method = "mad", keep = "top", mad_tolerance = 2) + expect_equal(top_genes, c("gene1", "gene2", "gene3", "gene4")) +}) + +# Tests for get_mad_left_threshold function +test_that("get_mad_left_threshold correctly calculates MAD-based left threshold", { + left_threshold <- get_mad_left_threshold(c(100, 200, 300), 3) + expect_equal(left_threshold, 200 - (3 * 100)) +}) + + +# Tests for get_mad_message +test_that('get_mad_user_message return correct output', { + expect_message(get_mad_user_message("BIC", c(50, 75, 100), 75, "low")) + expect_message(get_mad_user_message("BIC", c(50, 75, 100), 75, "top")) +}) + + diff --git a/vignettes/03-rnaseq_analysis.Rmd b/vignettes/03-rnaseq_analysis.Rmd index 6c89661fba3ae3a72efdf85f6a4fb9e83973bf27..f9c542e2eb513dabb2a7ff1bb1cf01477cda98d2 100644 --- a/vignettes/03-rnaseq_analysis.Rmd +++ b/vignettes/03-rnaseq_analysis.Rmd @@ -207,6 +207,18 @@ diagnostic_plot(list_tmb = l_tmb) diagnostic_plot(list_tmb = l_tmb, focus = c("dispersion", "logLik")) ``` +## Select best fit based on diagnostic metrics + +The identifyTopFit function allows for selecting genes that best fit models, based on various metrics such as AIC, BIC, LogLik, deviance, dispersion . This function provides the flexibility to employ multiple filtering methods to identify genes most relevant for further analysis. We implemented a filtering method based on the Median Absolute Deviation (MAD). For example, using the MAD method with a tolerance threshold of 3, the function identifies genes whose metric values are greater ("top") or lower ("low") than `median(metric) - 3 * MAD(metric)`. The keep option allows choosing whether to retain genes with metric values above or below the MAD threshold. + +```{r example-identifytopfit, warning = FALSE, message = FALSE} +identifyTopFit(list_tmb = l_tmb, metric = "AIC", + filter_method = "mad", keep = "top", + sort = TRUE, decreasing = TRUE, + mad_tolerance = 3) +``` + + ## Anova to select the best model Utilizing the `anovaParallel()` function enables you to perform model selection by assessing the significance of the fixed effects. You can also include additional parameters like type. For more details, refer to [car::Anova](https://rdrr.io/cran/car/man/Anova.html).