# WARNING - Generated by {fusen} from dev/flat_full.Rmd: do not edit by hand
#' Check Validity of Truth Labels
#'
#' This function checks the validity of the truth labels used for HTRfit evaluation, which is designed for binary classification.
#'
#' @param eval_data Data frame containing evaluation parameters.
#' @param col_param Column name specifying the parameter for grouping.
#' @param col_truth Column name for binary ground truth values (default is "isDE").
#' @return Logical value indicating the validity of truth labels.
#' @export
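#' @examples
#' \dontrun{
#' ## Minimal sketch on a hypothetical evaluation data frame; in practice
#' ## 'eval_data' comes from get_eval_data().
#' eval_df <- data.frame(effect = "fixed",
#'                       description = rep(c("varA", "varB"), each = 4),
#'                       isDE = c(TRUE, FALSE, TRUE, FALSE, TRUE, TRUE, FALSE, FALSE))
#' is_truthLabels_valid(eval_df) ## TRUE: both labels present for each description
#' }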
is_truthLabels_valid <- function(eval_data, col_param = "description", col_truth = "isDE") {
  ## -- init
  isValid <- TRUE
  ## -- subset fixed effects
  eval_data_fixed <- subset(eval_data, effect == "fixed")
  table_summary <- table(eval_data_fixed[[col_param]], eval_data_fixed[[col_truth]])
  ## -- 2 columns needed (FALSE/TRUE truth labels)
  n_labels <- dim(table_summary)[2]
  if (n_labels != 2) {
    labels_str <- paste(colnames(table_summary), collapse = ", ")
    msg <- paste("Both FALSE/TRUE truth labels (isDE) are required for classification evaluation.\nFound:", labels_str)
    message(msg)
    isValid <- FALSE
  }
  ## -- a label with 0 occurrences for some description
  label_not_found <- which(table_summary == 0, arr.ind = TRUE)
  if (dim(label_not_found)[1] > 0) {
    description <- rownames(label_not_found)
    label_not_found <- colnames(table_summary)[label_not_found[, "col"]]
    msg <- "Both TRUE and FALSE labels are required for HTRfit evaluation.\n"
    msg2 <- paste("Label isDE ==", label_not_found, "not found for description ==", description, collapse = "\n")
    msg3 <- "Please review your threshold or alternative hypothesis, and consider checking the identity plot for further insights."
    msg <- paste(msg, msg2, ".\n", msg3, sep = "")
    message(msg)
    isValid <- FALSE
  }
  return(isValid)
}
#' Validate input parameters for evaluation
#'
#' This function validates the input parameters used for evaluation, ensuring that they meet the required criteria.
#'
#' @param mock_obj Mock object containing data for evaluation.
#' @param list_gene Character vector of gene names to evaluate. Default is NULL.
#' @param list_tmb List of glmmTMB objects to evaluate. Default is NULL.
#' @param dds DESeqDataSet object to evaluate. Default is NULL.
#' @param coeff_threshold Numeric value representing the threshold for coefficients.
#' @param alt_hypothesis Character value representing the alternative hypothesis. Must be one of 'greaterAbs', 'greater', or 'less'.
#' @param alpha_risk Numeric value representing the alpha risk for hypothesis testing.
#' @return TRUE if all checks pass; otherwise an error is thrown.
#' @export
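#' @examples
#' \dontrun{
#' ## Sketch with hypothetical inputs: 'mock_data' and 'l_tmb' are assumed to
#' ## come from upstream HTRfit calls (mock data generation and model fitting).
#' isValidEvalInput(mock_obj = mock_data, list_gene = NULL,
#'                  list_tmb = l_tmb, dds = NULL,
#'                  coeff_threshold = 0.69, alt_hypothesis = "greaterAbs",
#'                  alpha_risk = 0.05)
#' }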
isValidEvalInput <- function(mock_obj, list_gene, list_tmb, dds, coeff_threshold, alt_hypothesis, alpha_risk) {
  ## -- validate the mock object
  invisible(isValidMock_obj(mock_obj))
  ## -- check that there is at least one object to evaluate
  if (is.null(list_tmb) && is.null(dds))
    stop("Both 'list_tmb' and 'dds' are NULL. There is nothing to evaluate.")
  ## -- check the DESeqDataSet object type
  if (!is.null(dds)) {
    stopifnot("DESeqDataSet" %in% class(dds))
  }
  ## -- validate the list of glmmTMB objects
  if (!is.null(list_tmb)) {
    isValidList_tmb(list_tmb)
  }
  ## -- check the gene list
  if (!is.null(list_gene)) {
    stopifnot(is.character(list_gene))
  }
  stopifnot(length(coeff_threshold) == 1)
  stopifnot(is.numeric(coeff_threshold))
  stopifnot(length(alt_hypothesis) == 1)
  stopifnot(is.character(alt_hypothesis))
  stopifnot(alt_hypothesis %in% c("greaterAbs", "greater", "less"))
  stopifnot(length(alpha_risk) == 1)
  stopifnot(is.numeric(alpha_risk))
  return(TRUE)
}
#' Compute evaluation report for TMB/DESeq2 analysis
#'
#' This function computes an evaluation report for TMB/DESeq2 analysis using several graphical
#' summaries, such as the precision-recall (PR) curve and the receiver operating characteristic
#' (ROC) curve. It takes as input the TMB results (\code{list_tmb}), the DESeq2
#' results (\code{dds}), the mock object (\code{mock_obj}), the coefficient threshold (\code{coeff_threshold}) and
#' the alternative hypothesis (\code{alt_hypothesis}).
#'
#' @param mock_obj Mock object that represents the distribution of measurements corresponding
#' to mock samples.
#' @param list_gene A character vector specifying the gene IDs to retain for evaluation. If NULL (default), all genes are used.
#' @param list_tmb TMB results from analysis.
#' @param dds DESeq2 results from differential gene expression analysis.
#' @param coeff_threshold A non-negative value which specifies a ln(fold change) threshold. The threshold is used for the Wald test to determine whether the coefficient (β) is significant, depending on the \code{alt_hypothesis} parameter. Default is 0.69, i.e. ln(FC = 2).
#' @param alt_hypothesis Alternative hypothesis for the Wald test (default is "greaterAbs").
#' Possible choices:
#' "greater" - β > coeff_threshold,
#' "less" - β < −coeff_threshold,
#' or the two-tailed alternative "greaterAbs" - |β| > coeff_threshold.
#' @param alpha_risk Parameter that sets the alpha risk level used when testing the coefficient (β). Default: 0.05.
#' @param palette_color Optional parameter that sets the color palette for plots. Default: c(DESeq2 = "#500472", HTRfit = "#79cbb8").
#' @param palette_shape Optional parameter that sets the point shapes for plots. Default: c(DESeq2 = 17, HTRfit = 19).
#' @param skip_eval_intercept Whether to skip the intercept term when computing precision-recall and ROC metrics (default is TRUE).
#' @param ... Additional parameters passed to the \code{get_pr_curve} and \code{get_roc_curve} aesthetics.
#'
#' @return A list containing the following components:
#' \item{identity}{A list of identity plots for model parameters and dispersion.}
#' \item{precision_recall}{A PR curve object generated from TMB and DESeq2 results.}
#' \item{roc}{A ROC curve object generated from TMB and DESeq2 results.}
#' \item{counts}{A counts plot generated from mock object.}
#' \item{performances}{A summary of the performances obtained.}
#' @export
#' @examples
#' \dontrun{
#' report <- evaluation_report(list_tmb = l_res, dds = NULL,
#'                             mock_obj = mock_data,
#'                             coeff_threshold = 0.45,
#'                             alt_hypothesis = "greaterAbs")
#' }
#'
evaluation_report <- function(mock_obj, list_gene = NULL,
                              list_tmb = NULL, dds = NULL,
                              coeff_threshold = 0.69, alt_hypothesis = "greaterAbs", alpha_risk = 0.05,
                              palette_color = c(DESeq2 = "#500472", HTRfit = "#79cbb8"),
                              palette_shape = c(DESeq2 = 17, HTRfit = 19),
                              skip_eval_intercept = TRUE, ...) {
  ## -- check inputs
  invisible(isValidEvalInput(mock_obj, list_gene, list_tmb, dds,
                             coeff_threshold, alt_hypothesis, alpha_risk))
  ## -- subset genes
  if (!is.null(list_gene)) {
    mock_obj <- subsetGenes(list_gene, mock_obj)
    list_tmb <- list_tmb[list_gene]
  }
  ## -- eval data
  eval_data <- get_eval_data(list_tmb, dds, mock_obj, coeff_threshold, alt_hypothesis)
  ## -- identity plots
  params_identity_eval <- eval_identityTerm(eval_data$modelparams, palette_color, palette_shape)
  dispersion_identity_eval <- eval_identityTerm(eval_data$modeldispersion, palette_color, palette_shape)
  if (isTRUE(skip_eval_intercept)) {
    eval_data2metrics <- subset(eval_data$modelparams, term != "(Intercept)")
  } else {
    eval_data2metrics <- eval_data$modelparams
  }
  ## -- aggregate RMSE + R2
  rmse_modelparams <- compute_rmse(eval_data2metrics, grouping_by = "from")
  rsquare_modelparams <- compute_rsquare(eval_data2metrics, grouping_by = "from")
  rsquare_modelparams$from <- NULL
  aggregate_metrics <- cbind(rmse_modelparams, rsquare_modelparams)
  ## -- counts plot
  counts_violinplot <- counts_plot(mock_obj)
  ## -- check that truth labels allow classification metrics
  if (isFALSE(is_truthLabels_valid(eval_data2metrics))) {
    message("The required truth labels for HTRfit classification metrics evaluation are not met. Only the identity plot and counts plot will be returned.")
    ## -- clear memory
    invisible(gc(reset = TRUE, verbose = FALSE, full = TRUE))
    return(
      list(
        data = eval_data,
        identity = list(params = params_identity_eval$p,
                        dispersion = dispersion_identity_eval$p),
        counts = counts_violinplot,
        performances = list(byparams = rbind(dispersion_identity_eval$R2, params_identity_eval$R2),
                            aggregate = aggregate_metrics)
      ))
  }
  ## -- precision-recall curve
  pr_curve_obj <- get_pr_object(eval_data2metrics)
  pr_curve_obj <- get_pr_curve(pr_curve_obj, palette_color = palette_color, ...)
  ## -- ROC curve
  roc_curve_obj <- get_roc_object(eval_data2metrics)
  roc_curve_obj <- get_roc_curve(roc_curve_obj, palette_color = palette_color, ...)
  ## -- accuracy, precision, recall, sensitivity, specificity, ...
  metrics_obj <- get_ml_metrics_obj(eval_data2metrics, alpha_risk)
  ## -- merge all metrics in one object
  model_perf_obj <- get_performances_metrics_obj(r2_params = params_identity_eval$R2,
                                                 r2_agg = aggregate_metrics,
                                                 r2_dispersion = dispersion_identity_eval$R2,
                                                 pr_obj = pr_curve_obj,
                                                 roc_obj = roc_curve_obj,
                                                 ml_metrics_obj = metrics_obj)
  ## -- clear memory
  invisible(gc(reset = TRUE, verbose = FALSE, full = TRUE))
  return(
    list(
      data = eval_data,
      identity = list(params = params_identity_eval$p,
                      dispersion = dispersion_identity_eval$p),
      precision_recall = list(params = pr_curve_obj$byparams$pr_curve,
                              aggregate = pr_curve_obj$aggregate$pr_curve),
      roc = list(params = roc_curve_obj$byparams$roc_curve,
                 aggregate = roc_curve_obj$aggregate$roc_curve),
      counts = counts_violinplot,
      performances = model_perf_obj)
  )
}
#' Compute classification and regression performance metrics object
#'
#' This function computes a metrics object covering both classification and regression performance
#' from evaluation objects generated by the \code{evaluation_report} function. The metrics object
#' contains by-parameter and aggregate values for PR AUC, ROC AUC, R-squared, and
#' classification metrics such as accuracy, precision, recall, sensitivity, and specificity. The function
#' takes as input various evaluation objects, including R-squared values (\code{r2_params}),
#' dispersion values (\code{r2_dispersion}), a PR object (\code{pr_obj}), a ROC object
#' (\code{roc_obj}), and a machine learning performance metrics object (\code{ml_metrics_obj}).
#' It generates separate data frames for metric values by parameter and for the
#' aggregated metric values.
#'
#' @param r2_params R-squared and RMSE values from model parameters evaluation object.
#' @param r2_agg R-squared and RMSE values aggregated.
#' @param r2_dispersion R-squared and RMSE values from dispersion evaluation object.
#' @param pr_obj PR object generated from evaluation report.
#' @param roc_obj ROC object generated from evaluation report.
#' @param ml_metrics_obj Machine learning performance metrics object.
#'
#' @return A list containing separate data frames for by-parameter and aggregated metric values.
#' @export
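#' @examples
#' \dontrun{
#' ## Sketch with hypothetical upstream objects; these would normally be built
#' ## inside evaluation_report().
#' perf <- get_performances_metrics_obj(r2_params = params_identity_eval$R2,
#'                                      r2_agg = aggregate_metrics,
#'                                      r2_dispersion = dispersion_identity_eval$R2,
#'                                      pr_obj = pr_curve_obj,
#'                                      roc_obj = roc_curve_obj,
#'                                      ml_metrics_obj = metrics_obj)
#' perf$aggregate
#' }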
get_performances_metrics_obj <- function(r2_params, r2_agg, r2_dispersion,
                                         pr_obj, roc_obj, ml_metrics_obj) {
  ## -- by params
  auc_metrics_params <- join_dtf(pr_obj$byparams$pr_auc, roc_obj$byparams$roc_auc,
                                 k1 = c("from", "description"), k2 = c("from", "description"))
  metrics_params <- join_dtf(auc_metrics_params, ml_metrics_obj$byparams,
                             k1 = c("from", "description"), k2 = c("from", "description"))
  rsquare_metrics <- rbind(r2_params, r2_dispersion)
  metrics_params <- join_dtf(metrics_params, rsquare_metrics,
                             k1 = c("from", "description"), k2 = c("from", "description"))
  ## -- aggregate
  auc_metrics_agg <- join_dtf(pr_obj$aggregate$pr_auc, roc_obj$aggregate$roc_auc,
                              k1 = c("from"), k2 = c("from"))
  metrics_agg <- join_dtf(auc_metrics_agg, ml_metrics_obj$aggregate,
                          k1 = c("from"), k2 = c("from"))
  metrics_agg <- join_dtf(metrics_agg, r2_agg,
                          k1 = c("from"), k2 = c("from"))
  return(list(byparams = metrics_params, aggregate = metrics_agg))
}
#' Compute summary metrics on classification results
#'
#' This function computes several classification metrics like accuracy, precision, recall,
#' sensitivity and specificity on classification results. The input to the function is a data frame
#' (\code{dt}) containing the predicted classification result as \code{y_pred} and the actual
#' classification as \code{isDE}. The function returns a data frame with the computed metrics.
#'
#' @param dt Data frame containing the predicted and actual classification results.
#'
#' @return A data frame with the computed classification metrics of accuracy, precision, recall,
#' sensitivity and specificity.
#' @export
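#' @examples
#' \dontrun{
#' ## Sketch on a toy prediction table (hypothetical values)
#' toy <- data.frame(y_pred = c(TRUE, TRUE, FALSE, FALSE),
#'                   isDE = c(TRUE, FALSE, FALSE, TRUE))
#' compute_metrics_summary(toy)
#' }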
compute_metrics_summary <- function(dt) {
  data.frame(
    accuracy = accuracy(dt$y_pred, dt$isDE),
    precision = precision(dt$y_pred, y_true = dt$isDE, positive = "TRUE"),
    recall = recall(dt$y_pred, y_true = dt$isDE, positive = "TRUE"),
    sensitivity = sensitivity(dt$y_pred, y_true = dt$isDE, positive = "TRUE"),
    specificity = specificity(dt$y_pred, y_true = dt$isDE, positive = "TRUE")
  )
}
#' Get classification metrics for evaluation object
#'
#' This function extracts classification metrics. It takes as input the evaluation data
#' (\code{evaldata_params}) and an optional alpha risk level (\code{alpha_risk}).
#' It retrieves the adjusted p-values and computes the binary classification
#' result by thresholding at the alpha risk level. It then computes the classification metrics
#' with the \code{compute_metrics_summary} function, separately for each parameter value as well as
#' for the aggregated results.
#'
#' @param evaldata_params Model parameters element of the evaluation data.
#' @param alpha_risk Parameter that sets the alpha risk level threshold (default 0.05).
#' @param col_param Parameter that sets the column name for the parameter (default "description").
#'
#' @return A list containing separate data frames for classification metrics by parameter value
#' and for aggregated classification metrics.
#'
#' @importFrom data.table setDT .SD
#' @export
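#' @examples
#' \dontrun{
#' ## Sketch on a hypothetical evaluation data frame; in practice
#' ## 'evaldata_params' is the modelparams element returned by get_eval_data().
#' eval_df <- data.frame(from = "HTRfit", effect = "fixed",
#'                       description = rep(c("varA", "varB"), each = 4),
#'                       p.adj = c(0.01, 0.2, 0.03, 0.6, 0.04, 0.9, 0.02, 0.5),
#'                       isDE = rep(c(TRUE, FALSE), 4))
#' get_ml_metrics_obj(eval_df, alpha_risk = 0.05)
#' }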
get_ml_metrics_obj <- function(evaldata_params, alpha_risk = 0.05, col_param = "description") {
  ## -- subset fixed effects
  evaldata_params <- subset(evaldata_params, effect == "fixed")
  evaldata_params$y_pred <- evaldata_params$p.adj < alpha_risk
  ## -- by params
  dt_evaldata_params <- data.table::setDT(evaldata_params)
  byparam_metrics <- dt_evaldata_params[, compute_metrics_summary(.SD), by = c("from", col_param), .SDcols = c("y_pred", "isDE")]
  ## -- aggregate
  agg_metrics <- dt_evaldata_params[, compute_metrics_summary(.SD), by = c("from"), .SDcols = c("y_pred", "isDE")]
  return(list(byparams = as.data.frame(byparam_metrics), aggregate = as.data.frame(agg_metrics)))
}
#' Extracts evaluation data from a list of TMB models.
#'
#' This function takes a list of TMB models, performs tidy evaluation, extracts model parameters,
#' and compares them to the ground truth effects. Additionally, it evaluates and compares dispersion
#' inferred from TMB with the ground truth gene dispersion. The results are organized in two data frames,
#' one for model parameters and one for dispersion, both labeled as "HTRfit".
#'
#' @param list_tmb A list of TMB models.
#' @param mock_obj A mock object containing ground truth information.
#' @param coeff_threshold The coefficient threshold for the Wald test.
#' @param alt_hypothesis The alternative hypothesis for the Wald test.
#' @return A list containing data frames for model parameters and dispersion.
#' @export
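#' @examples
#' \dontrun{
#' ## Sketch with hypothetical inputs: 'l_tmb' is a list of fitted glmmTMB
#' ## models and 'mock_data' a mock object carrying the ground truth.
#' eval_tmb <- get_eval_data_from_ltmb(l_tmb, mock_data,
#'                                     coeff_threshold = 0.69,
#'                                     alt_hypothesis = "greaterAbs")
#' head(eval_tmb$modelparams)
#' }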
get_eval_data_from_ltmb <- function(list_tmb, mock_obj, coeff_threshold, alt_hypothesis) {
  ## -- reshape results to a data frame
  tidyRes <- tidy_results(list_tmb, coeff_threshold, alt_hypothesis)
  ## -- model params
  formula_used <- list_tmb[[1]]$modelInfo$allForm$formula
  params_df <- compareInferenceToExpected(tidyRes, mock_obj$groundTruth$effects, formula_used)
  params_df <- getLabelExpected(params_df, coeff_threshold, alt_hypothesis)
  params_df$from <- "HTRfit"
  ## -- dispersion
  dispersion_inferred <- extract_tmbDispersion(list_tmb)
  dispersion_df <- getDispersionComparison(dispersion_inferred, mock_obj$groundTruth$gene_dispersion)
  dispersion_df$from <- "HTRfit"
  return(list(modelparams = params_df, modeldispersion = dispersion_df))
}
#' Extracts evaluation data from a DESeqDataSet (dds) object.
#'
#' This function takes a DESeqDataSet object, performs tidy evaluation, extracts model parameters
#' (beta in the case of DESeqDataSet), and compares them to the ground truth effects. Additionally,
#' it evaluates and compares dispersion inferred from DESeqDataSet with the ground truth gene dispersion.
#' The results are organized in two data frames, one for model parameters and one for dispersion, both labeled as "DESeq2".
#'
#' @param dds A DESeqDataSet object.
#' @param mock_obj A mock object containing ground truth information.
#' @param coeff_threshold The coefficient threshold for the Wald test.
#' @param alt_hypothesis The alternative hypothesis for the Wald test.
#' @return A list containing data frames for model parameters and dispersion.
#' @export
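#' @examples
#' \dontrun{
#' ## Sketch with hypothetical inputs: 'dds' is a DESeqDataSet processed by
#' ## DESeq2, 'mock_data' a mock object carrying the ground truth.
#' eval_dds <- get_eval_data_from_dds(dds, mock_data,
#'                                    coeff_threshold = 0.69,
#'                                    alt_hypothesis = "greaterAbs")
#' head(eval_dds$modeldispersion)
#' }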
get_eval_data_from_dds <- function(dds, mock_obj, coeff_threshold, alt_hypothesis) {
  ## -- reshape results to a data frame
  tidy_dds <- wrap_dds(dds, coeff_threshold, alt_hypothesis)
  ## -- model params (beta in case of dds)
  params_df <- inferenceToExpected_withFixedEff(tidy_dds$fixEff, mock_obj$groundTruth$effects)
  params_df <- getLabelExpected(params_df, coeff_threshold, alt_hypothesis)
  params_df$component <- NA
  params_df$group <- NA
  params_df$from <- "DESeq2"
  ## -- dispersion
  dispersion_inferred <- extract_ddsDispersion(tidy_dds)
  dispersion_df <- getDispersionComparison(dispersion_inferred, mock_obj$groundTruth$gene_dispersion)
  dispersion_df$from <- "DESeq2"
  return(list(modelparams = params_df, modeldispersion = dispersion_df))
}
#' Combines evaluation data from TMB and DESeqDataSet (dds) objects.
#'
#' This function combines model parameters and dispersion data frames from TMB and DESeqDataSet (dds) evaluations.
#'
#' @param evaldata_tmb Evaluation data from TMB models.
#' @param evaldata_dds Evaluation data from DESeqDataSet (dds) object.
#' @return A list containing combined data frames for model parameters and dispersion.
#' @export
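#' @examples
#' \dontrun{
#' ## Sketch: combine hypothetical HTRfit and DESeq2 evaluation data, as
#' ## returned by get_eval_data_from_ltmb() and get_eval_data_from_dds()
#' eval_data <- rbind_evaldata_tmb_dds(eval_tmb, eval_dds)
#' }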
rbind_evaldata_tmb_dds <- function(evaldata_tmb, evaldata_dds) {
  ## -- rbind
  evaldata_dispersion <- rbind(evaldata_tmb$modeldispersion, evaldata_dds$modeldispersion)
  evaldata_params <- rbind(evaldata_tmb$modelparams, evaldata_dds$modelparams)
  ## -- res
  return(list(modelparams = evaldata_params, modeldispersion = evaldata_dispersion))
}
#' Combines model parameters and dispersion data frames.
#'
#' This function combines model parameters and dispersion data frames, ensuring proper alignment.
#'
#' @param eval_data Evaluation data containing model parameters and dispersion.
#' @return A combined data frame with model parameters and dispersion.
#' @export
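#' @examples
#' \dontrun{
#' ## Sketch: 'eval_data' is assumed to be the list returned by get_eval_data()
#' combined_df <- rbind_model_params_and_dispersion(eval_data)
#' }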
rbind_model_params_and_dispersion <- function(eval_data) {
  ## -- split
  disp_df <- eval_data$modeldispersion
  params_df <- eval_data$modelparams
  ## -- align dispersion columns with model parameter columns
  disp_df[setdiff(names(params_df), names(disp_df))] <- NA
  disp_df <- disp_df[names(params_df)]
  ## -- rbind
  res_df <- rbind(params_df, disp_df)
  return(res_df)
}
#' Gets evaluation data from both TMB and DESeqDataSet (dds) objects.
#'
#' This function retrieves evaluation data from TMB and DESeqDataSet (dds) objects, combining
#' the results into a list containing data frames for model parameters and dispersion.
#'
#' @param l_tmb A list of TMB models (default is NULL).
#' @param dds A DESeqDataSet object (default is NULL).
#' @param mock_obj A mock object containing ground truth information.
#' @param coefficient Threshold value for coefficient testing. This threshold corresponds to the natural logarithm of the fold change (ln(FC)).
#' @param alt_hypothesis The alternative hypothesis for the Wald test.
#' @return A list containing data frames for model parameters and dispersion.
#' @export
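#' @examples
#' \dontrun{
#' ## Sketch with hypothetical fitted objects ('l_tmb', 'dds') and mock data
#' eval_data <- get_eval_data(l_tmb = l_tmb, dds = dds, mock_obj = mock_data,
#'                            coefficient = 0.69, alt_hypothesis = "greaterAbs")
#' names(eval_data)  ## "modelparams" "modeldispersion"
#' }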
get_eval_data <- function(l_tmb = NULL, dds = NULL, mock_obj, coefficient, alt_hypothesis) {
  ## -- init
  eval_data_tmb <- NULL
  eval_data_dds <- NULL
  ## -- evaluation data
  eval_data_tmb <- if (!is.null(l_tmb)) get_eval_data_from_ltmb(l_tmb, mock_obj, coefficient, alt_hypothesis)
  eval_data_dds <- if (!is.null(dds)) get_eval_data_from_dds(dds, mock_obj, coefficient, alt_hypothesis)
  ## -- merge/rbind
  eval_data <- rbind_evaldata_tmb_dds(eval_data_tmb, eval_data_dds)
  return(eval_data)
}