Skip to content
Snippets Groups Projects
  • Arnaud Duvermy's avatar
    a0429440
    enhance report · a0429440
    Arnaud Duvermy authored
    Former-commit-id: 3315d7b7a669f610970b9c041ca5a2c96dddad44
    Former-commit-id: 12faaaf4fc44cca459c6d05fd8177e018394a65b
    Former-commit-id: 91f0afc716a5a87eb01a079ba77a18b85b712e54
    a0429440
    History
    enhance report
    Arnaud Duvermy authored
    Former-commit-id: 3315d7b7a669f610970b9c041ca5a2c96dddad44
    Former-commit-id: 12faaaf4fc44cca459c6d05fd8177e018394a65b
    Former-commit-id: 91f0afc716a5a87eb01a079ba77a18b85b712e54
simulation_report.R 21.33 KiB
# WARNING - Generated by {fusen} from dev/flat_full.Rmd: do not edit by hand



#' Check Validity of Truth Labels
#'
#' This function checks the validity of truth labels for HTRfit evaluation, specifically designed for binary classification.
#'
#' @param eval_data Data frame containing evaluation parameters.
#' @param col_param Column name specifying the parameter for grouping.
#' @param col_truth Column name for binary ground truth values (default is "isDE").
#' @return Logical value indicating the validity of truth labels.
#' @export
is_truthLabels_valid <- function(eval_data, col_param = "description", col_truth = "isDE" ) {
  
    ## --init
    isValid <- TRUE
  
    ## -- subset fixed effect
    eval_data_fixed <- subset(eval_data, effect == "fixed")
    
    table_summary <- table(eval_data_fixed[[col_param]], eval_data_fixed[[col_truth]])
    ## -- 2 lines needed (FALSE/TRUE)
    n_labels <- dim(table_summary)[2]
    if(n_labels != 2) {
      labels_str <- paste(colnames(table_summary), collapse = ", ")
      msg <- paste("Both FALSE/TRUE truth labels (isDE) are required for classification evaluation.\nFound : ", labels_str )
      message(msg)
      isValid <- FALSE
    }
    ## -- one label found 0 time !
    label_not_found <- which(table_summary == 0, arr.ind=TRUE)
    if(dim(label_not_found)[1] > 0){
      description <- rownames(label_not_found)
      label_not_found <- colnames(table_summary)[label_not_found[,"col"]]
      msg <- "Both TRUE and FALSE labels are required for HTRfit evaluation.\n"
      msg2 <- paste("Label isDE ==", label_not_found, "not found for description ==", description, collapse = '\n')
      msg3 <- "Please review your threshold or alternative hypothesis, and consider checking the identity plot for further insights."
      msg <- paste(msg, msg2, ".\n", msg3 , sep = "")
      message(msg)
      isValid <- FALSE
    }
    return(isValid)
}


#' Validate input parameters for evaluation
#'
#' This function validates the input parameters used for evaluation, ensuring that they meet the required criteria.
#'
#' @param mock_obj Mock object containing data for evaluation.
#' @param list_gene Character vector of gene names to evaluate. Default is NULL.
#' @param list_tmb List of glmmTMB objects to evaluate. Default is NULL.
#' @param dds DESeqDataSet object to evaluate. Default is NULL.
#' @param coeff_threshold Numeric value representing the threshold for coefficients.
#' @param alt_hypothesis Numeric value representing the alternative hypothesis. Should be one of 'greaterAbs', 'greater', or 'less'.
#' @param alpha_risk Numeric value representing the alpha risk for hypothesis testing.
#' @return TRUE or error message
#' @export
isValidEvalInput <- function(mock_obj, list_gene, list_tmb, dds, coeff_threshold, alt_hypothesis, alpha_risk) {
    # Validation de l'objet mock
    invisible(isValidMock_obj(mock_obj))

    # Vérification de la présence d'au moins un objet à évaluer
    if (is.null(list_tmb) && is.null(dds)) 
        stop("Both 'list_tmb' and 'dds' are NULL. There is nothing to evaluate.")

    # Vérification du type d'objet DESeqDataSet
    if (!is.null(dds)) {
        stopifnot("DESeqDataSet" %in% class(dds))
    }

    # Validation de la liste d'objets glmmTMB
    if (!is.null(list_tmb)) {
        isValidList_tmb(list_tmb)
    }

    # Vérification de la liste de gènes
    if (!is.null(list_gene)) {
        stopifnot(is.character(list_gene))
    }

      
    stopifnot(length(coeff_threshold) == 1)
    stopifnot(is.numeric(coeff_threshold))
    
    stopifnot(length(alt_hypothesis) == 1)
    stopifnot(is.character(alt_hypothesis))
    stopifnot(alt_hypothesis %in% c("greaterAbs", "greater", "less"))
    
    stopifnot(length(alpha_risk) == 1)
    stopifnot(is.numeric(alpha_risk))
    
    return(TRUE)
}



#' Compute evaluation report for TMB/DESeq2 analysis
#'
#' This function computes an evaluation report for TMB/DESeq2 analysis using several graphical
#' summaries like precision-recall (PR) curve, Receiver operating characteristic (ROC) curve
#' and others. It takes as input several parameters like TMB results (\code{l_tmb}), DESeq2
#' result (\code{dds}), mock object (\code{mock_obj}), coefficient threshold (\code{coeff_threshold}) and
#' alternative hypothesis (\code{alt_hypothesis}). 
#'
#' @param mock_obj Mock object that represents the distribution of measurements corresponding
#'   to mock samples.
#' @param list_gene A character vector specifying the genes id to be retained for evaluation. If NULL (default) all genes are used for evaluation
#' @param list_tmb TMB results from analysis.
#' @param dds DESeq2 results from differential gene expression analysis.
#' @param coeff_threshold  A non-negative value which specifies a ln(fold change) threshold. The Threshold  is used for the Wald test to determine whether the  coefficient (β) is significant or not, depending on \code{alt_hypothesis} parameter. Default is 0.69, ln(FC = 2).
#' @param alt_hypothesis Alternative hypothesis for the Wald test (default is "greaterAbs").
#' Possible choice: 
#' "greater" 
#' - β > coeff_threshold, 
#' "less" 
#' - β < −coeff_threshold,
#' or two-tailed alternative: 
#' "greaterAbs" 
#' - |β| > coeff_threshold
#' @param alpha_risk parameter that sets the threshold for alpha risk level while testing coefficient (β). Default: 0.05.
#' @param palette_color Optional parameter that sets the color palette for plots.Default : c(DESeq2 = "#500472", HTRfit ="#79cbb8").
#' @param palette_shape Optional parameter that sets the point shape for plots.Default : c(DESeq2 = 17, HTRfit = 19).
#' @param skip_eval_intercept indicate whether to calculate precision-recall and ROC metrics for the intercept (default skip_eval_intercept = TRUE).
#' @param ... Additional parameters to be passed to aesthetics \code{get_pr_curve} and \code{get_roc_curve}.
#'
#' @return A list containing the following components:
#' \item{identity}{A list containing model parameters and dispersion data.}
#' \item{precision_recall}{A PR curve object generated from TMB and DESeq2 results.}
#' \item{roc}{A ROC curve object generated from TMB and DESeq2 results.}
#' \item{counts}{A counts plot generated from mock object.}
#'  \item{performances}{A summary of the performances obtained.}
#' @export
#' @examples
#' \dontrun{
#' report <- evaluation_report(list_tmb = l_res, dds = NULL, 
#'                            mock_obj = mock_data, 
#'                            coeff_threshold = 0.45, 
#'                            alt_hypothesis = "greaterAbs")
#' }
#'
evaluation_report <- function(mock_obj, list_gene = NULL,
                              list_tmb = NULL, dds = NULL,
                              coeff_threshold = 0.69, alt_hypothesis = "greaterAbs", alpha_risk = 0.05, 
                              palette_color = c(DESeq2 = "#500472", HTRfit ="#79cbb8"), 
                              palette_shape = c(DESeq2 = 17, HTRfit = 19), 
                              skip_eval_intercept = TRUE, ...) {
  
  ## -- verif
 invisible(isValidEvalInput(mock_obj, list_gene, list_tmb, dds, 
                            coeff_threshold, alt_hypothesis, alpha_risk))
  
  ## -- subset genes
  if (!is.null(list_gene)) {
        mock_obj <- subsetGenes(list_gene, mock_obj)
        list_tmb <- list_tmb[list_gene]
  }
  
  ## -- eval data
  eval_data <- get_eval_data(list_tmb, dds, mock_obj, coeff_threshold, alt_hypothesis)
  ## -- identity plot
  params_identity_eval <- eval_identityTerm( eval_data$modelparams, palette_color, palette_shape )
  dispersion_identity_eval <- eval_identityTerm(eval_data$modeldispersion, palette_color, palette_shape)
  
  if (isTRUE(skip_eval_intercept)) {
    eval_data2metrics <- subset(eval_data$modelparams, term != "(Intercept)")
  } else {
    eval_data2metrics <- eval_data$modelparams
  }

  ## aggregate RMSE + R2
  rmse_modelparams <- compute_rmse(eval_data2metrics, grouping_by = "from")
  rsquare_modelparams <- compute_rsquare(eval_data2metrics, grouping_by = "from")
  rsquare_modelparams$from <- NULL
  aggregate_metrics <- cbind(rmse_modelparams, rsquare_modelparams)
  
  
  ## -- counts plot
  counts_violinplot <- counts_plot(mock_obj)
  
  ## -- check if eval data ok 
  if(isFALSE(is_truthLabels_valid(eval_data2metrics))){
    
    message("The required truth labels for HTRfit classification metrics evaluation are not met. Only the identity plot and counts plot will be returned.")
    ## -- clear memory
    invisible(gc(reset = T, verbose = F, full = T)) ; 
    
    return(
          list(
                data = eval_data, 
                identity = list(params = params_identity_eval$p,
                                dispersion = dispersion_identity_eval$p ), 
                counts = counts_violinplot,
                performances = list(byparams = rbind(dispersion_identity_eval$R2, params_identity_eval$R2),
                                    aggregate = aggregate_metrics )
                
          ))
      
      
  }

  
  
  ## -- pr curve
  pr_curve_obj <- get_pr_object(eval_data2metrics)
  pr_curve_obj <- get_pr_curve(pr_curve_obj, palette_color = palette_color,  ...)
  
  ## -- auc curve
  roc_curve_obj <- get_roc_object(eval_data2metrics)
  roc_curve_obj <- get_roc_curve(roc_curve_obj, palette_color = palette_color, ...)
  
  ## -- acc, recall, sensib, speci, ...
  metrics_obj <- get_ml_metrics_obj(eval_data2metrics, alpha_risk )
  ## -- merge all metrics in one obj
  model_perf_obj <- get_performances_metrics_obj( r2_params = params_identity_eval$R2,   
                                                  r2_agg = aggregate_metrics, 
                                                  dispersion_identity_eval$R2, 
                                                  pr_curve_obj,
                                                  roc_curve_obj,
                                                  metrics_obj )
  
  ## -- counts plot
  counts_violinplot <- counts_plot(mock_obj)
  
  ## -- clear memory
  invisible(gc(reset = T, verbose = F, full = T)) ;  
  
  return(
        list(
              data = eval_data, 
              identity = list( params = params_identity_eval$p,
                              dispersion = dispersion_identity_eval$p ) ,
              precision_recall = list( params = pr_curve_obj$byparams$pr_curve,
                                        aggregate = pr_curve_obj$aggregate$pr_curve ),
              roc = list( params = roc_curve_obj$byparams$roc_curve,
                                        aggregate = roc_curve_obj$aggregate$roc_curve ),
              counts = counts_violinplot,
              performances = model_perf_obj)
        )
}



#' Compute classification and regression performance metrics object
#' 
#' This function computes metrics object for both classification and regression performance
#' from evaluation objects generated by \code{evaluation_report} function. Metrics object
#' contains the by-parameter and aggregate metrics for PR AUC, ROC AUC, R-squared and other
#' classification metrics for precision, recall, sensitivity, and specificity. The function
#' takes as input various evaluation objects including R-squared values (\code{r2_params}),
#' dispersion values (\code{r2_dispersion}), PR object (\code{pr_obj}), ROC object
#' (\code{roc_obj}), and machine learning performance metrics object (\code{ml_metrics_obj}).
#' The function generates separate data frames for metric values by parameter value and for the
#' aggregated metric values.
#' 
#' @param r2_params R-squared and RMSE values from model parameters evaluation object.
#' @param r2_agg R-squared and RMSE values aggregated.
#' @param r2_dispersion R-squared and RMSE values from dispersion evaluation object.
#' @param pr_obj PR object generated from evaluation report.
#' @param roc_obj ROC object generated from evaluation report.
#' @param ml_metrics_obj Machine learning performance metrics object.
#' 
#' @return A list containing separate data frames for by-parameter and aggregated metric values.
#' @export 
get_performances_metrics_obj <- function(r2_params , r2_agg ,r2_dispersion,
                                         pr_obj, roc_obj, ml_metrics_obj ){
  ## -- by params  
  auc_mtrics_params <- join_dtf(pr_obj$byparams$pr_auc , roc_obj$byparams$roc_auc,
                         k1 = c("from", "description"), k2 = c("from", "description"))
  metrics_params <- join_dtf(auc_mtrics_params,  ml_metrics_obj$byparams, 
                        k1 = c("from", "description"), k2 = c("from", "description")) 
  rsquare_mtrics <- rbind(r2_params, r2_dispersion)
  metrics_params <- join_dtf(metrics_params, rsquare_mtrics, 
                        k1 = c("from", "description"), k2 = c("from", "description")) 
  ## -- aggregate
  auc_mtrics_agg <- join_dtf(pr_obj$aggregate$pr_auc , roc_obj$aggregate$roc_auc,
                      k1 = c("from"), k2 = c("from"))
  metrics_agg <- join_dtf(auc_mtrics_agg , ml_metrics_obj$aggregate,
                             k1 = c("from"), k2 = c("from"))
  metrics_agg <- join_dtf(metrics_agg, r2_agg,
                          k1 = c("from"), k2 = c("from"))
  return(list(byparams = metrics_params, aggregate = metrics_agg ))  
}


#' Compute summary metrics on classification results
#'
#' This function computes several classification metrics like accuracy, precision, recall,
#' sensitivity and specificity on classification results. The input to the function is a data frame
#' (\code{dt}) containing the predicted classification result as \code{y_pred} and the actual
#' classification as \code{isDE}. The function returns a data frame with the computed metrics.
#'
#' @param dt Data frame containing the predicted and actual classification results.
#'
#' @return A data frame with the computed classification metrics of accuracy, precision, recall,
#' sensitivity and specificity.
#' @export
compute_metrics_summary <- function(dt) {
  
  data.frame(
    accuracy = accuracy( dt$y_pred, dt$isDE ),
    precision = precision(dt$y_pred, y_true = dt$isDE, positive = "TRUE"),
    recall = recall(dt$y_pred, y_true = dt$isDE, positive = "TRUE"),
    sensitivity = sensitivity(dt$y_pred, y_true = dt$isDE, positive = "TRUE"),
    specificity = specificity(dt$y_pred , y_true = dt$isDE, positive = "TRUE")
  )
}

#' Get classification metrics for evaluation object
#'
#' This function extracts the classification metrics. It takes as input (\code{evaldata_params}) 
#' and an optional risk level for the alpha risk (\code{alpha_risk}).
#' It retrieves the p-values from the identity term and computes the binary classification
#' result by thresholding with the alpha risk level. It then computes the classification metrics
#' using \code{compute_metrics_summary} function separately for each parameter value as well as
#' for the aggregated results.
#'
#' @param evaldata_params Identity term of the evaluation object.
#' @param alpha_risk parameter that sets the threshold for alpha risk level (default 0.05).
#' @param col_param  parameter that sets the column name for the parameter (default "description").
#'
#' @return A list containing separate data frames for classification metrics by parameter value
#' and for aggregated classification metrics.
#'
#' @importFrom data.table setDT .SD
#' @export
get_ml_metrics_obj <- function(evaldata_params, alpha_risk = 0.05, col_param = "description"){
  
   ## -- subset fixed eff
  evaldata_params <- subset(evaldata_params, effect == "fixed")
  
  
  evaldata_params$y_pred <-  evaldata_params$p.adj <  alpha_risk
  
  ## by params
  dt_evaldata_params <- data.table::setDT(evaldata_params)
  byparam_metrics <- dt_evaldata_params[, compute_metrics_summary(.SD), by=c("from", col_param), .SDcols=c("y_pred", "isDE")]
  
  ## aggreg
  agg_metrics <- dt_evaldata_params[, compute_metrics_summary(.SD), by=c("from"), .SDcols=c("y_pred", "isDE")]
  
  return(list( byparams =  as.data.frame(byparam_metrics), aggregate = as.data.frame(agg_metrics)))
}





#' Extracts evaluation data from a list of TMB models.
#'
#' This function takes a list of TMB models, performs tidy evaluation, extracts model parameters,
#' and compares them to the ground truth effects. Additionally, it evaluates and compares dispersion
#' inferred from TMB with the ground truth gene dispersion. The results are organized in two data frames,
#' one for model parameters and one for dispersion, both labeled as "HTRfit".
#'
#' @param list_tmb A list of TMB models.
#' @param mock_obj A mock object containing ground truth information.
#' @param coeff_threshold The coefficient threshold for wald test
#' @param alt_hypothesis The alternative hypothesis for wald test
#' @return A list containing data frames for model parameters and dispersion.
#' @export
get_eval_data_from_ltmb <- function(list_tmb, mock_obj, coeff_threshold, alt_hypothesis ){

  ## -- reshape 2 dataframe
  tidyRes  <- tidy_results(list_tmb, coeff_threshold, alt_hypothesis)

  ## -- model params
  formula_used <- list_tmb[[1]]$modelInfo$allForm$formula
  params_df <- compareInferenceToExpected(tidyRes, mock_obj$groundTruth$effects, formula_used)
  params_df <- getLabelExpected(params_df, coeff_threshold, alt_hypothesis)
  params_df$from <- "HTRfit"

  ## -- dispersion
  dispersion_inferred <- extract_tmbDispersion(list_tmb)
  dispersion_df <- getDispersionComparison(dispersion_inferred, mock_obj$groundTruth$gene_dispersion)
  dispersion_df$from <- "HTRfit"

  return(list(modelparams = params_df, modeldispersion = dispersion_df ))
}


#' Extracts evaluation data from a DESeqDataSet (dds) object.
#'
#' This function takes a DESeqDataSet object, performs tidy evaluation, extracts model parameters
#' (beta in the case of DESeqDataSet), and compares them to the ground truth effects. Additionally,
#' it evaluates and compares dispersion inferred from DESeqDataSet with the ground truth gene dispersion.
#' The results are organized in two data frames, one for model parameters and one for dispersion, both #' labeled as "HTRfit".
#'
#' @param dds A DESeqDataSet object.
#' @param mock_obj A mock object containing ground truth information.
#' @param coeff_threshold The coefficient threshold wald test
#' @param alt_hypothesis The alternative hypothesis wald test
#' @return A list containing data frames for model parameters and dispersion.
#' @export
get_eval_data_from_dds <- function(dds, mock_obj, coeff_threshold, alt_hypothesis){

  ## -- reshape 2 dataframe
  tidy_dds <- wrap_dds(dds, coeff_threshold, alt_hypothesis)

  ## -- model params (beta in case of dds)
  params_df <- inferenceToExpected_withFixedEff(tidy_dds$fixEff, mock_obj$groundTruth$effects)
  params_df <- getLabelExpected(params_df, coeff_threshold, alt_hypothesis)
  params_df$component <- NA
  params_df$group <- NA
  params_df$from <- "DESeq2"

  ## -- dispersion
  dispersion_inferred <- extract_ddsDispersion(tidy_dds)
  dispersion_df <- getDispersionComparison(dispersion_inferred , mock_obj$groundTruth$gene_dispersion)
  dispersion_df$from <- "DESeq2"

  return(list(modelparams = params_df, modeldispersion = dispersion_df ))

}




#' Combines evaluation data from TMB and DESeqDataSet (dds) objects.
#'
#' This function combines model parameters and dispersion data frames from TMB and DESeqDataSet (dds) evaluations.
#'
#' @param evaldata_tmb Evaluation data from TMB models.
#' @param evaldata_dds Evaluation data from DESeqDataSet (dds) object.
#' @return A list containing combined data frames for model parameters and dispersion.
#' @export
rbind_evaldata_tmb_dds <- function(evaldata_tmb, evaldata_dds){
  ## -- rbind
  evaldata_dispersion <- rbind(evaldata_tmb$modeldispersion, evaldata_dds$modeldispersion)
  evaldata_params <- rbind(evaldata_tmb$modelparams, evaldata_dds$modelparams)

  ## -- res
  return(list(modelparams = evaldata_params, modeldispersion = evaldata_dispersion ))
}


#' Combines model parameters and dispersion data frames.
#'
#' This function combines model parameters and dispersion data frames, ensuring proper alignment.
#'
#' @param eval_data Evaluation data containing model parameters and dispersion.
#' @return A combined data frame with model parameters and dispersion.
#' @export
rbind_model_params_and_dispersion <- function(eval_data){
  ## -- split
  disp_df <- eval_data$modeldispersion
  params_df <- eval_data$modelparams
  ## -- merging model and dispersion param
  disp_df[setdiff(names(params_df), names(disp_df))] <- NA
  disp_df <- disp_df[names(params_df)]
  ## -- rbind
  res_df <- rbind(params_df, disp_df)
  return(res_df)
}



#' Gets evaluation data from both TMB and DESeqDataSet (dds) objects.
#'
#' This function retrieves evaluation data from TMB and DESeqDataSet (dds) objects, combining
#' the results into a list containing data frames for model parameters and dispersion.
#'
#' @param l_tmb A list of TMB models (default is NULL).
#' @param dds A DESeqDataSet object (default is NULL).
#' @param mock_obj A mock object containing ground truth information.
#' @param coefficient Threshold value for coefficient testing (default is 0). This threshold corresponds to the natural logarithm of the fold change (ln(FC)).
#' @param alt_hypothesis The alternative hypothesis for wald test
#' @return A list containing data frames for model parameters and dispersion.
#' @export
get_eval_data <- function(l_tmb = NULL, dds = NULL , mock_obj, coefficient, alt_hypothesis){
  ## -- init 
  eval_data_tmb <- NULL
  eval_data_dds <- NULL
  
  ## -- evaluation data
  eval_data_tmb <- if (!is.null(l_tmb)) get_eval_data_from_ltmb(l_tmb, mock_obj, coefficient, alt_hypothesis )
  eval_data_dds <- if (!is.null(dds)) get_eval_data_from_dds(dds, mock_obj, coefficient, alt_hypothesis )
  ## -- merge/rbind
  eval_data <- rbind_evaldata_tmb_dds(eval_data_tmb, eval_data_dds)
  
  return(eval_data)
}