diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000000000000000000000000000000000000..c84bd3f0060631fddd073be699a7bfbe4699406d --- /dev/null +++ b/.gitignore @@ -0,0 +1,6 @@ +inst/doc +.Rproj.user +.Rbuildignore +HTRfit.Rproj +.Rhistory +vignettes/log.txt diff --git a/NAMESPACE b/NAMESPACE index 8463ec664ed394337f3100fa56d0eebc5b3901c5..2d61f27fa5aeb1fb8aff4ca690584377bf9495a1 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -1,11 +1,9 @@ # Generated by roxygen2: do not edit by hand export("%>%") -export(AUC) export(Area_Under_Curve) export(ConfusionDF) export(ConfusionMatrix) -export(PRAUC) export(accuracy) export(addBasalExpression) export(add_interaction) @@ -18,6 +16,7 @@ export(build_missingColumn_with_na) export(calculate_actualMixed) export(calculate_actual_interactionX2_values) export(calculate_actual_interactionX3_values) +export(checkFractionOfZero) export(check_input2interaction) export(clean_variable_name) export(compareInferenceToExpected) @@ -121,6 +120,7 @@ export(is_formula_mixedTypeI) export(is_fullrank) export(is_mixedEffect_inFormula) export(is_positive_definite) +export(is_truthLabels_valid) export(is_validGroupBy) export(join_dtf) export(launchFit) @@ -204,10 +204,8 @@ importFrom(stats,approxfun) importFrom(stats,as.formula) importFrom(stats,cor) importFrom(stats,drop.terms) -importFrom(stats,integrate) importFrom(stats,median) importFrom(stats,model.matrix) -importFrom(stats,na.omit) importFrom(stats,p.adjust) importFrom(stats,pnorm) importFrom(stats,rnbinom) diff --git a/R/evaluation_withmixedeffect.R b/R/evaluation_withmixedeffect.R index 4b204476fd04f6c2c663b4fcfb808de8dfcddd3d..f19755b2646e9e720b1881006f0761741a929bc5 100644 --- a/R/evaluation_withmixedeffect.R +++ b/R/evaluation_withmixedeffect.R @@ -130,7 +130,8 @@ getActualMixed_typeI <- function(list_logqij, genes_iter_list, categoricalVar_in #' Compare the mixed-effects inference to expected values. #' -#' This function compares the mixed-effects inference obtained from a mixed-effects model to expected values derived from a ground truth dataset. The function assumes a specific type I mixed-effect structure in the input model. +#' This function compares the mixed-effects inference obtained from a mixed-effects model to expected values derived from a ground truth dataset. +#' The function assumes a specific type I mixed-effect structure in the input model. # #' @param tidy_tmb tidy model results obtained from fitting a mixed-effects model. #' @param ground_truth_eff A data frame containing ground truth effects. @@ -172,8 +173,8 @@ inferenceToExpected_withMixedEff <- function(tidy_tmb, ground_truth_eff){ #' Calculate actual mixed effects. #' #' This function calculates actual mixed effects based on the given data for a specific type I mixed-effect structure. -# It calculates the expected values, standard deviations, and correlations between the fixed and random effects. -# The function is designed to work with specific input data for type I mixed-effect calculations. +# It calculates the expected values, standard deviations, and correlations between the fixed and random effects. +# The function is designed to work with specific input data for type I mixed-effect calculations. # #' @param data_gene Data for a specific gene. #' @param labelRef_InCategoricalVar The reference label for the categorical variable. 
diff --git a/R/fake-section-title.R b/R/fake-section-title.R index 3b93160efebdb8c53a336b4ff105f8aba4c1b075..72d65fe5ac370bd3d2ef754583daf56a0f49251e 100644 --- a/R/fake-section-title.R +++ b/R/fake-section-title.R @@ -289,6 +289,14 @@ prediction <- function(predictions, labels, label.ordering=NULL) { stop(paste("Number of predictions in each run must be equal", "to the number of labels for each run.")) + ## replace infinite numbers by max values + ## avoid prob with infinite values + #for (i in 1:length(predictions)) { + # epsilon <- max(predictions[[i]][is.finite(predictions[[i]] )]) + # idx_inf_values <- !is.finite( predictions[[i]] ) + # predictions[[i]][idx_inf_values] <- epsilon + #} + ## only keep prediction/label pairs that are finite numbers for (i in 1:length(predictions)) { finite.bool <- is.finite( predictions[[i]] ) @@ -296,6 +304,9 @@ prediction <- function(predictions, labels, label.ordering=NULL) { labels[[i]] <- labels[[i]][ finite.bool ] } + + + ## abort if 'labels' format is inconsistent across ## different cross-validation runs label.format="" ## one of 'normal','factor','ordered' @@ -349,7 +360,8 @@ prediction <- function(predictions, labels, label.ordering=NULL) { } } - + #print(levels) + #print(labels) if (length(levels) != 2) { message <- paste("Number of classes is not equal to 2.\n", "HTRfit currently supports only evaluation of ", diff --git a/R/fitmodel.R b/R/fitmodel.R index 2f26e1ca4df7cb4b79097b5ef3ddefa733864bf1..4ae9cb79a03730683ed1869a0e36c896f59fb6b8 100644 --- a/R/fitmodel.R +++ b/R/fitmodel.R @@ -242,7 +242,7 @@ parallel_fit <- function(groups, group_by, formula, data, n.cores = NULL, l_data2parallel <- prepare_dataParallel(groups, group_by, data) clust <- parallel::makeCluster(n.cores, outfile = log_file , type= cl_type ) - parallel::clusterExport(clust, c("fitModel"), envir=environment()) + parallel::clusterExport(clust, c("fitModel")) results_fit <- parallel::parLapply(clust, X = l_data2parallel, fun = launchFit, group_by = group_by, formula = formula, ...) 
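For orientation, here is a minimal, self-contained sketch of the clusterExport()/parLapply() pattern that parallel_fit() relies on. toy_fit() and the toy data are illustrative stand-ins, not HTRfit code (the real call exports "fitModel" and dispatches through launchFit() on the list built by prepare_dataParallel()):

## stand-in for fitModel(): one small fit per group
toy_fit <- function(df) coef(lm(y ~ x, data = df))

## three groups of 20 observations, mimicking the per-group list of data frames
l_data <- split(data.frame(x = rnorm(60), y = rnorm(60)), rep(1:3, each = 20))

clust <- parallel::makeCluster(2)
## by default clusterExport() resolves the exported names in the global
## environment; the envir argument only changes where they are looked up
parallel::clusterExport(clust, c("toy_fit"))
res <- parallel::parLapply(clust, X = l_data, fun = function(df) toy_fit(df))
parallel::stopCluster(clust)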
diff --git a/R/mlmetrics.R b/R/mlmetrics.R index 29807816152aa3d26e2a2a853ac18ab77ed3f386..dcf8a0581d43df41c0379a17314b2a23d4114e1e 100644 --- a/R/mlmetrics.R +++ b/R/mlmetrics.R @@ -192,58 +192,29 @@ specificity <- function(y_true, y_pred, positive = NULL) { #' @param x the x-points of the curve #' @param y the y-points of the curve #' @param method can be "trapezoid" (default), "step" or "spline" -#' @param na.rm a logical value indicating whether NA values should be stripped before the computation proceeds #' @return Area Under the Curve (AUC) #' @examples #' x <- seq(0, pi, length.out = 200) #' plot(x = x, y = sin(x), type = "l") -#' Area_Under_Curve(x = x, y = sin(x), method = "trapezoid", na.rm = TRUE) -#' @importFrom stats integrate -#' @importFrom stats na.omit +#' Area_Under_Curve(x = x, y = sin(x), method = "trapezoid") #' @importFrom stats splinefun #' @export -Area_Under_Curve <- function(x, y, method = c("trapezoid", "step", "spline"), na.rm = FALSE) { - if (na.rm == TRUE) { - xy_cbind <- na.omit(cbind(x, y)) - x <- xy_cbind[, 1] - y <- xy_cbind[, 2] - } - if (length(x) != length(y)) { - stop("length x must equal length y") - } +Area_Under_Curve <- function(x, y, method = "trapezoid"){ idx <- order(x) x <- x[idx] y <- y[idx] - switch(match.arg(arg = method, choices = c("trapezoid", "step","spline")), - trapezoid = { - AUC <- sum((apply(cbind(y[-length(y)], y[-1]), 1, mean)) *(x[-1] - x[-length(x)]))}, - step = { - AUC <- sum(y[-length(y)] * (x[-1] - x[-length(x)]))}, - spline = { - AUC <- integrate(splinefun(x, y, method = "natural"), lower = min(x),upper = max(x))$value}) - return(AUC) + if (method == 'trapezoid'){ + auc <- sum((rowMeans(cbind(y[-length(y)], y[-1]))) * (x[-1] - x[-length(x)])) + }else if (method == 'step'){ + auc <- sum(y[-length(y)] * (x[-1] - x[-length(x)])) + }else if (method == 'spline'){ + auc <- integrate(splinefun(x, y, method = "natural"), lower = min(x), upper = max(x)) + auc <- auc$value + } + return(auc) } -#' @title Area Under the precision-recall Curve (PR AUC) -#' -#' @description -#' Compute the Area Under the precision-recall Curve (PR AUC) from prediction scores. -#' -#' @param y_pred Predicted probabilities vector, as returned by a classifier -#' @param y_true Ground truth (correct) 0-1 labels vector -#' @return Area Under the PR Curve (PR AUC) -#' @examples -#' data(cars) -#' logreg <- glm(formula = vs ~ hp + wt, -#' family = binomial(link = "logit"), data = mtcars) -#' PRAUC(y_pred = logreg$fitted.values, y_true = mtcars$vs) -#' @export -PRAUC <- function(y_pred, y_true) { - pred_obj <- prediction(y_pred, y_true) - perf_obj <- performance(pred_obj, measure = "prec", x.measure = "rec") - PRAUC <- Area_Under_Curve(perf_obj@x.values[[1]], perf_obj@y.values[[1]], method = "trapezoid", na.rm = TRUE) - return(PRAUC) -} + .performance.positive.predictive.value <- function(predictions, labels, cutoffs, fp, tp, fn, tn, @@ -285,25 +256,4 @@ PRAUC <- function(y_pred, y_true) { } -#' @title Area Under the Receiver Operating Characteristic Curve (ROC AUC) -#' -#' @description -#' Compute the Area Under the Receiver Operating Characteristic Curve (ROC AUC) from prediction scores. 
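A quick numerical sanity check of the simplified Area_Under_Curve() above (a sketch assuming HTRfit is attached): the integral of sin(x) over [0, pi] is exactly 2, so all three methods should return values very close to 2.

x <- seq(0, pi, length.out = 200)
y <- sin(x)
Area_Under_Curve(x, y, method = "trapezoid")  # ~2.000
Area_Under_Curve(x, y, method = "step")       # left-rectangle rule, also ~2.000 here
Area_Under_Curve(x, y, method = "spline")     # ~2.000, integrates a natural-spline fit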
-#'
-#' @param y_pred Predicted probabilities vector, as returned by a classifier
-#' @param y_true Ground truth (correct) 0-1 labels vector
-#' @return Area Under the ROC Curve (ROC AUC)
-#' @examples
-#' data(cars)
-#' logreg <- glm(formula = vs ~ hp + wt,
-#'   family = binomial(link = "logit"), data = mtcars)
-#' AUC(y_pred = logreg$fitted.values, y_true = mtcars$vs)
-#' @export
-AUC <- function(y_pred, y_true) {
-  rank <- rank(y_pred)
-  n_pos <- as.double(sum(y_true == 1))
-  n_neg <- as.double(sum(y_true == 0))
-  AUC <- (sum(rank[y_true == 1]) - n_pos * (n_pos + 1) / 2) / (n_pos * n_neg)
-  return(AUC)
-}
diff --git a/R/mock_rnaseq.R b/R/mock_rnaseq.R
index e7ad322d4e713bd86985cd0fcc8f2c309b6a1e2e..7fc440aedddce49d047fdee2e3a127f914e1aaad 100644
--- a/R/mock_rnaseq.R
+++ b/R/mock_rnaseq.R
@@ -104,6 +104,8 @@ mock_rnaseq <- function(list_var, n_genes, min_replicates, max_replicates, seque
     dtf_countsTable <- scaleCountsTable(dtf_countsTable, sequencing_depth)
   }
+  checkFractionOfZero(dtf_countsTable)
+
   metaData <- getSampleMetadata(list_var, n_genes, matx_bool_replication)
   libSize <- sum(colSums(dtf_countsTable))
   settings_df <- getSettingsTable(n_genes, min_replicates, max_replicates, libSize)
@@ -120,7 +122,35 @@ mock_rnaseq <- function(list_var, n_genes, min_replicates, max_replicates, seque
 }
-
+#' Check Fraction of Low Counts in a Counts Table
+#'
+#' This function checks the percentage of counts in a given counts table that are below one.
+#' If more than 50% of the counts fall in this category, a message is issued, suggesting a review of input parameters.
+#'
+#' @param counts_table A matrix or data frame representing counts.
+#' @return NULL
+#' @export
+#' @examples
+#' # Example usage:
+#' counts_table <- matrix(c(0, 1, 2, 3, 4, 0, 0, 1, 1), ncol = 3)
+#' checkFractionOfZero(counts_table)
+checkFractionOfZero <- function(counts_table){
+
+  dim_matrix <- dim(counts_table)
+
+  n_counts <- dim_matrix[1]*dim_matrix[2]
+
+  n_zero_or_one <- sum(counts_table < 1)
+
+  fractionOfZero <- n_zero_or_one/n_counts*100
+
+  if( fractionOfZero > 50) {
+    message("More than 50% of the simulated counts are below 1. Consider reviewing your input parameters.")
+  }
+
+  return(NULL)
+
+}
diff --git a/R/na-na.R b/R/na-na.R
new file mode 100644
index 0000000000000000000000000000000000000000..72d65fe5ac370bd3d2ef754583daf56a0f49251e
--- /dev/null
+++ b/R/na-na.R
@@ -0,0 +1,931 @@
+# WARNING - Generated by {fusen} from /dev/flat_full.Rmd: do not edit by hand
+
+
+#' @name prediction-class
+#' @aliases prediction-class
+#'
+#' @title Class \code{prediction}
+#'
+#' @description
+#' Object to encapsulate numerical predictions together with the
+#' corresponding true class labels, optionally collecting predictions and
+#' labels for several cross-validation or bootstrapping runs.
+#'
+#' @section Objects from the Class:
+#' Objects can be created by using the \code{prediction} function.
+#'
+#' @note
+#' Every \code{prediction} object contains information about the 2x2
+#' contingency table consisting of tp,tn,fp, and fn, along with the
+#' marginal sums n.pos,n.neg,n.pos.pred,n.neg.pred, because these form
+#' the basis for many derived performance measures.
+#'
+#' @slot predictions A list, in which each element is a vector of predictions
+#'   (the list has length > 1 for x-validation data).
+#' @slot labels Analogously, a list in which each element is a vector of true
+#'   class labels.
+#' @slot cutoffs A list in which each element is a vector of all necessary
+#'   cutoffs.
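A small usage sketch for checkFractionOfZero() as defined above, with hypothetical count matrices:

## 7 of the 9 counts are below 1 (~78% > 50%), so the advisory message is emitted
sparse_counts <- matrix(c(0, 0, 0, 0, 5, 0, 0, 2, 0), ncol = 3)
checkFractionOfZero(sparse_counts)

## all counts are >= 1, so the check stays silent and returns NULL
dense_counts <- matrix(c(3, 8, 1, 12, 5, 7, 2, 9, 4), ncol = 3)
checkFractionOfZero(dense_counts)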
Each cutoff vector consists of the predicted scores (duplicates +#' removed), in descending order. +#' @slot fp A list in which each element is a vector of the number (not the +#' rate!) of false positives induced by the cutoffs given in the corresponding +#' 'cutoffs' list entry. +#' @slot tp As fp, but for true positives. +#' @slot tn As fp, but for true negatives. +#' @slot fn As fp, but for false negatives. +#' @slot n.pos A list in which each element contains the number of positive +#' samples in the given x-validation run. +#' @slot n.neg As n.pos, but for negative samples. +#' @slot n.pos.pred A list in which each element is a vector of the number of +#' samples predicted as positive at the cutoffs given in the corresponding +#' 'cutoffs' entry. +#' @slot n.neg.pred As n.pos.pred, but for negatively predicted samples. +#' +#' +#' @author +#' Tobias Sing \email{tobias.sing@gmail.com}, Oliver Sander +#' \email{osander@gmail.com} +#' +#' @seealso +#' \code{\link{prediction}}, +#' \code{\link{performance}}, +#' \code{\link{performance-class}}, +#' \code{\link{plot.performance}} +#' +#' @export +setClass("prediction", + representation(predictions = "list", + labels = "list", + cutoffs = "list", + fp = "list", + tp = "list", + tn = "list", + fn = "list", + n.pos = "list", + n.neg = "list", + n.pos.pred = "list", + n.neg.pred = "list")) + +setMethod("show","prediction", + function(object){ + cat("A ", class(object), " instance\n", sep = "") + if(length(object@predictions) > 1L){ + cat(" with ", length(object@predictions)," cross ", + "validation runs ", sep = "") + if(length(unique(vapply(object@predictions,length,integer(1))))){ + cat("(equal lengths)", sep = "") + } else { + cat("(different lengths)", sep = "") + } + } else { + cat(" with ", length(object@predictions[[1L]]), + " data points", sep = "") + } + }) + +#' @name performance-class +#' @aliases performance-class +#' +#' @title Class \code{performance} +#' +#' @description +#' Object to capture the result of a performance evaluation, optionally +#' collecting evaluations from several cross-validation or bootstrapping runs. +#' +#' @section Objects from the Class: +#' Objects can be created by using the \code{performance} function. +#' +#' @details +#' A \code{performance} object can capture information from four +#' different evaluation scenarios: +#' \itemize{ +#' \item The behaviour of a cutoff-dependent performance measure across +#' the range of all cutoffs (e.g. \code{performance( predObj, 'acc' )} ). Here, +#' \code{x.values} contains the cutoffs, \code{y.values} the +#' corresponding values of the performance measure, and +#' \code{alpha.values} is empty.\cr +#' \item The trade-off between two performance measures across the +#' range of all cutoffs (e.g. \code{performance( predObj, +#' 'tpr', 'fpr' )} ). In this case, the cutoffs are stored in +#' \code{alpha.values}, while \code{x.values} and \code{y.values} +#' contain the corresponding values of the two performance measures.\cr +#' \item A performance measure that comes along with an obligatory +#' second axis (e.g. \code{performance( predObj, 'ecost' )} ). Here, the measure values are +#' stored in \code{y.values}, while the corresponding values of the +#' obligatory axis are stored in \code{x.values}, and \code{alpha.values} +#' is empty.\cr +#' \item A performance measure whose value is just a scalar +#' (e.g. \code{performance( predObj, 'auc' )} ). The value is then stored in +#' \code{y.values}, while \code{x.values} and \code{alpha.values} are +#' empty. 
+#' } +#' +#' @slot x.name Performance measure used for the x axis. +#' @slot y.name Performance measure used for the y axis. +#' @slot alpha.name Name of the unit that is used to create the parametrized +#' curve. Currently, curves can only be parametrized by cutoff, so +#' \code{alpha.name} is either \code{none} or \code{cutoff}. +#' @slot x.values A list in which each entry contains the x values of the curve +#' of this particular cross-validation run. \code{x.values[[i]]}, +#' \code{y.values[[i]]}, and \code{alpha.values[[i]]} correspond to each +#' other. +#' @slot y.values A list in which each entry contains the y values of the curve +#' of this particular cross-validation run. +#' @slot alpha.values A list in which each entry contains the cutoff values of +#' the curve of this particular cross-validation run. +#' +#' @references +#' A detailed list of references can be found on the ROCR homepage at +#' \url{http://rocr.bioinf.mpi-sb.mpg.de}. +#' +#' @author +#' Tobias Sing \email{tobias.sing@gmail.com}, Oliver Sander +#' \email{osander@gmail.com} +#' +#' @seealso +#' \code{\link{prediction}} +#' \code{\link{performance}}, +#' \code{\link{prediction-class}}, +#' \code{\link{plot.performance}} +#' +#' @export +setClass("performance", + representation(x.name = "character", + y.name = "character", + alpha.name = "character", + x.values = "list", + y.values = "list", + alpha.values = "list" )) + +setMethod("show","performance", + function(object){ + cat("A ", class(object), " instance\n", sep = "") + if(length(object@y.values[[1L]]) > 1L){ + cat(" '", object@x.name, "' vs. '", object@y.name, + "' (alpha: '",object@alpha.name,"')\n", sep = "") + } else { + cat(" '", object@y.name, "'\n", sep = "") + } + if(length(object@y.values) > 1L){ + cat(" for ", length(object@y.values)," cross ", + "validation runs ", sep = "") + } else { + if(length(object@y.values[[1L]]) > 1L){ + cat(" with ", length(object@y.values[[1L]])," data points", + sep = "") + } + } + }) + + + +#' @name prediction +#' +#' @title Function to create prediction objects +#' +#' @description +#' Every classifier evaluation using ROCR starts with creating a +#' \code{prediction} object. This function is used to transform the input data +#' (which can be in vector, matrix, data frame, or list form) into a +#' standardized format. +#' +#' @details +#' \code{predictions} and \code{labels} can simply be vectors of the same +#' length. However, in the case of cross-validation data, different +#' cross-validation runs can be provided as the *columns* of a matrix or +#' data frame, or as the entries of a list. In the case of a matrix or +#' data frame, all cross-validation runs must have the same length, whereas +#' in the case of a list, the lengths can vary across the cross-validation +#' runs. Internally, as described in section 'Value', all of these input +#' formats are converted to list representation. +#' +#' Since scoring classifiers give relative tendencies towards a negative +#' (low scores) or positive (high scores) class, it has to be declared +#' which class label denotes the negative, and which the positive class. +#' Ideally, labels should be supplied as ordered factor(s), the lower +#' level corresponding to the negative class, the upper level to the +#' positive class. If the labels are factors (unordered), numeric, +#' logical or characters, ordering of the labels is inferred from +#' R's built-in \code{<} relation (e.g. 0 < 1, -1 < 1, 'a' < 'b', +#' FALSE < TRUE). 
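A hypothetical illustration of the label.ordering override described above (assuming HTRfit is attached): with character labels, the default "<" ordering would make "DE" the negative class, so the ordering is stated explicitly.

scores <- c(0.9, 0.2, 0.7, 0.4)
labs   <- c("DE", "notDE", "DE", "notDE")
## negative class first, positive class second
pred_obj <- prediction(scores, labs, label.ordering = c("notDE", "DE"))
pred_obj@n.pos   # two positive ("DE") samples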
Use \code{label.ordering} to override this default +#' ordering. Please note that the ordering can be locale-dependent +#' e.g. for character labels '-1' and '1'. +#' +#' Currently, ROCR supports only binary classification (extensions toward +#' multiclass classification are scheduled for the next release, +#' however). If there are more than two distinct label symbols, execution +#' stops with an error message. If all predictions use the same two +#' symbols that are used for the labels, categorical predictions are +#' assumed. If there are more than two predicted values, but all numeric, +#' continuous predictions are assumed (i.e. a scoring +#' classifier). Otherwise, if more than two symbols occur in the +#' predictions, and not all of them are numeric, execution stops with an +#' error message. +#' +#' @param predictions A vector, matrix, list, or data frame containing the +#' predictions. +#' @param labels A vector, matrix, list, or data frame containing the true class +#' labels. Must have the same dimensions as \code{predictions}. +#' @param label.ordering The default ordering (cf.details) of the classes can +#' be changed by supplying a vector containing the negative and the positive +#' class label. +#' +#' @return An S4 object of class \code{prediction}. +#' +#' @author +#' Tobias Sing \email{tobias.sing@gmail.com}, Oliver Sander +#' \email{osander@gmail.com} +#' @export +prediction <- function(predictions, labels, label.ordering=NULL) { + + ## bring 'predictions' and 'labels' into list format, + ## each list entry representing one x-validation run + + ## convert predictions into canonical list format + if (is.data.frame(predictions)) { + names(predictions) <- c() + predictions <- as.list(predictions) + } else if (is.matrix(predictions)) { + predictions <- as.list(data.frame(predictions)) + names(predictions) <- c() + } else if (is.vector(predictions) && !is.list(predictions)) { + predictions <- list(predictions) + } else if (!is.list(predictions)) { + stop("Format of predictions is invalid. It couldn't be coerced to a list.", + call. = FALSE) + } + + ## convert labels into canonical list format + if (is.data.frame(labels)) { + names(labels) <- c() + labels <- as.list( labels) + } else if (is.matrix(labels)) { + labels <- as.list( data.frame( labels)) + names(labels) <- c() + } else if ((is.vector(labels) || + is.ordered(labels) || + is.factor(labels)) && + !is.list(labels)) { + labels <- list( labels) + } else if (!is.list(labels)) { + stop("Format of labels is invalid. It couldn't be coerced to a list.", + call. = FALSE) + } + + + if(any(vapply(predictions,anyNA, logical(1)))){ + warnings("'predictions' contains NA. These missing predictions will be removed from evaluation") + nonNA_pred <- !is.na(predictions) + predictions <- predictions[nonNA_pred] + labels <- labels[nonNA_pred] + } + + + ## Length consistency checks + if (length(predictions) != length(labels)) + stop(paste("Number of cross-validation runs must be equal", + "for predictions and labels.")) + if (! 
all(sapply(predictions, length) == sapply(labels, length))) + stop(paste("Number of predictions in each run must be equal", + "to the number of labels for each run.")) + + ## replace infinite numbers by max values + ## avoid prob with infinite values + #for (i in 1:length(predictions)) { + # epsilon <- max(predictions[[i]][is.finite(predictions[[i]] )]) + # idx_inf_values <- !is.finite( predictions[[i]] ) + # predictions[[i]][idx_inf_values] <- epsilon + #} + + ## only keep prediction/label pairs that are finite numbers + for (i in 1:length(predictions)) { + finite.bool <- is.finite( predictions[[i]] ) + predictions[[i]] <- predictions[[i]][ finite.bool ] + labels[[i]] <- labels[[i]][ finite.bool ] + } + + + + + ## abort if 'labels' format is inconsistent across + ## different cross-validation runs + label.format="" ## one of 'normal','factor','ordered' + if (all(sapply( labels, is.factor)) && + !any(sapply(labels, is.ordered))) { + label.format <- "factor" + } else if (all(sapply( labels, is.ordered))) { + label.format <- "ordered" + } else if (all(sapply( labels, is.character)) || + all(sapply( labels, is.numeric)) || + all(sapply( labels, is.logical))) { + label.format <- "normal" + } else { + stop(paste("Inconsistent label data type across different", + "cross-validation runs.")) + } + + ## abort if levels are not consistent across different + ## cross-validation runs + if (! all(sapply(labels, levels)==levels(labels[[1]])) ) { + stop(paste("Inconsistent factor levels across different", + "cross-validation runs.")) + } + + ## convert 'labels' into ordered factors, aborting if the number + ## of classes is not equal to 2. + levels <- c() + if ( label.format == "ordered" ) { + if (!is.null(label.ordering)) { + stop(paste("'labels' is already ordered. 
No additional", + "'label.ordering' must be supplied.")) + } else { + levels <- levels(labels[[1]]) + } + } else { + if ( is.null( label.ordering )) { + if ( label.format == "factor" ) levels <- sort(levels(labels[[1]])) + else levels <- sort( unique( unlist( labels))) + } else { + ## if (!setequal( levels, label.ordering)) { + if (!setequal( unique(unlist(labels)), label.ordering )) { + stop("Label ordering does not match class labels.") + } + levels <- label.ordering + } + for (i in 1:length(labels)) { + if (is.factor(labels)) + labels[[i]] <- ordered(as.character(labels[[i]]), + levels=levels) + else labels[[i]] <- ordered( labels[[i]], levels=levels) + } + + } + #print(levels) + #print(labels) + if (length(levels) != 2) { + message <- paste("Number of classes is not equal to 2.\n", + "HTRfit currently supports only evaluation of ", + "binary classification tasks.",sep="") + stop(message) + } + + ## determine whether predictions are continuous or categorical + ## (in the latter case stop + if (!is.numeric( unlist( predictions ))) { + stop("Currently, only continuous predictions are supported by HTRfit") + } + + ## compute cutoff/fp/tp data + + cutoffs <- list() + fp <- list() + tp <- list() + fn <- list() + tn <- list() + n.pos <- list() + n.neg <- list() + n.pos.pred <- list() + n.neg.pred <- list() + for (i in 1:length(predictions)) { + n.pos <- c( n.pos, sum( labels[[i]] == levels[2] )) + n.neg <- c( n.neg, sum( labels[[i]] == levels[1] )) + ans <- .compute.unnormalized.roc.curve( predictions[[i]], labels[[i]] ) + cutoffs <- c( cutoffs, list( ans$cutoffs )) + fp <- c( fp, list( ans$fp )) + tp <- c( tp, list( ans$tp )) + fn <- c( fn, list( n.pos[[i]] - tp[[i]] )) + tn <- c( tn, list( n.neg[[i]] - fp[[i]] )) + n.pos.pred <- c(n.pos.pred, list(tp[[i]] + fp[[i]]) ) + n.neg.pred <- c(n.neg.pred, list(tn[[i]] + fn[[i]]) ) + } + + + return( new("prediction", predictions=predictions, + labels=labels, + cutoffs=cutoffs, + fp=fp, + tp=tp, + fn=fn, + tn=tn, + n.pos=n.pos, + n.neg=n.neg, + n.pos.pred=n.pos.pred, + n.neg.pred=n.neg.pred)) +} + +## fast fp/tp computation based on cumulative summing +.compute.unnormalized.roc.curve <- function( predictions, labels ) { + ## determine the labels that are used for the pos. resp. neg. class : + pos.label <- levels(labels)[2] + neg.label <- levels(labels)[1] + + pred.order <- order(predictions, decreasing=TRUE) + predictions.sorted <- predictions[pred.order] + tp <- cumsum(labels[pred.order]==pos.label) + fp <- cumsum(labels[pred.order]==neg.label) + + ## remove fp & tp for duplicated predictions + ## as duplicated keeps the first occurrence, but we want the last, two + ## rev are used. + ## Highest cutoff (Infinity) corresponds to tp=0, fp=0 + dups <- rev(duplicated(rev(predictions.sorted))) + tp <- c(0, tp[!dups]) + fp <- c(0, fp[!dups]) + cutoffs <- c(Inf, predictions.sorted[!dups]) + + return(list( cutoffs=cutoffs, fp=fp, tp=tp )) +} + +#' @name performance +#' +#' @title Function to create performance objects +#' +#' @description +#' All kinds of predictor evaluations are performed using this function. +#' +#' @details +#' Here is the list of available performance measures. Let Y and +#' \eqn{\hat{Y}}{Yhat} be random variables representing the class and the prediction for +#' a randomly drawn sample, respectively. We denote by +#' \eqn{\oplus}{+} and \eqn{\ominus}{-} the positive and +#' negative class, respectively. 
Further, we use the following +#' abbreviations for empirical quantities: P (\# positive +#' samples), N (\# negative samples), TP (\# true positives), TN (\# true +#' negatives), FP (\# false positives), FN (\# false negatives). +#' \describe{ +#' \item{\code{acc}:}{accuracy. \eqn{P(\hat{Y}=Y)}{P(Yhat = Y)}. Estimated +#' as: \eqn{\frac{TP+TN}{P+N}}{(TP+TN)/(P+N)}.} +#' \item{\code{err}:}{Error rate. \eqn{P(\hat{Y}\ne Y)}{P(Yhat != +#' Y)}. Estimated as: \eqn{\frac{FP+FN}{P+N}}{(FP+FN)/(P+N)}.} +#' \item{\code{fpr}:}{False positive rate. \eqn{P(\hat{Y}=\oplus | Y = +#' \ominus)}{P(Yhat = + | Y = -)}. Estimated as: +#' \eqn{\frac{FP}{N}}{FP/N}.} +#' \item{\code{fall}:}{Fallout. Same as \code{fpr}.} +#' \item{\code{tpr}:}{True positive +#' rate. \eqn{P(\hat{Y}=\oplus|Y=\oplus)}{P(Yhat = + | Y = +)}. Estimated +#' as: \eqn{\frac{TP}{P}}{TP/P}.} +#' \item{\code{rec}:}{recall. Same as \code{tpr}.} +#' \item{\code{sens}:}{sensitivity. Same as \code{tpr}.} +#' \item{\code{fnr}:}{False negative +#' rate. \eqn{P(\hat{Y}=\ominus|Y=\oplus)}{P(Yhat = - | Y = +#' +)}. Estimated as: \eqn{\frac{FN}{P}}{FN/P}.} +#' \item{\code{miss}:}{Miss. Same as \code{fnr}.} +#' \item{\code{tnr}:}{True negative rate. \eqn{P(\hat{Y} = +#' \ominus|Y=\ominus)}{P(Yhat = - | Y = -)}.} +#' \item{\code{spec}:}{specificity. Same as \code{tnr}.} +#' \item{\code{ppv}:}{Positive predictive +#' value. \eqn{P(Y=\oplus|\hat{Y}=\oplus)}{P(Y = + | Yhat = +#' +)}. Estimated as: \eqn{\frac{TP}{TP+FP}}{TP/(TP+FP)}.} +#' \item{\code{prec}:}{precision. Same as \code{ppv}.} +#' \item{\code{npv}:}{Negative predictive +#' value. \eqn{P(Y=\ominus|\hat{Y}=\ominus)}{P(Y = - | Yhat = +#' -)}. Estimated as: \eqn{\frac{TN}{TN+FN}}{TN/(TN+FN)}.} +#' \item{\code{pcfall}:}{Prediction-conditioned +#' fallout. \eqn{P(Y=\ominus|\hat{Y}=\oplus)}{P(Y = - | Yhat = +#' +)}. Estimated as: \eqn{\frac{FP}{TP+FP}}{FP/(TP+FP)}.} +#' \item{\code{pcmiss}:}{Prediction-conditioned +#' miss. \eqn{P(Y=\oplus|\hat{Y}=\ominus)}{P(Y = + | Yhat = +#' -)}. Estimated as: \eqn{\frac{FN}{TN+FN}}{FN/(TN+FN)}.} +#' \item{\code{rpp}:}{Rate of positive predictions. \eqn{P( \hat{Y} = +#' \oplus)}{P(Yhat = +)}. Estimated as: (TP+FP)/(TP+FP+TN+FN).} +#' \item{\code{rnp}:}{Rate of negative predictions. \eqn{P( \hat{Y} = +#' \ominus)}{P(Yhat = -)}. Estimated as: (TN+FN)/(TP+FP+TN+FN).} +#' \item{\code{phi}:}{Phi correlation coefficient. \eqn{\frac{TP \cdot +#' TN - FP \cdot FN}{\sqrt{ (TP+FN) \cdot (TN+FP) \cdot (TP+FP) +#' \cdot (TN+FN)}}}{(TP*TN - +#' FP*FN)/(sqrt((TP+FN)*(TN+FP)*(TP+FP)*(TN+FN)))}. Yields a +#' number between -1 and 1, with 1 indicating a perfect +#' prediction, 0 indicating a random prediction. Values below 0 +#' indicate a worse than random prediction.} +#' \item{\code{mat}:}{Matthews correlation coefficient. Same as \code{phi}.} +#' \item{\code{mi}:}{Mutual information. \eqn{I(\hat{Y},Y) := H(Y) - +#' H(Y|\hat{Y})}{I(Yhat, Y) := H(Y) - H(Y | Yhat)}, where H is the +#' (conditional) entropy. Entropies are estimated naively (no bias +#' correction).} +#' \item{\code{chisq}:}{Chi square test statistic. \code{?chisq.test} +#' for details. Note that R might raise a warning if the sample size +#' is too small.} +#' \item{\code{odds}:}{Odds ratio. \eqn{\frac{TP \cdot TN}{FN \cdot +#' FP}}{(TP*TN)/(FN*FP)}. Note that odds ratio produces +#' Inf or NA values for all cutoffs corresponding to FN=0 or +#' FP=0. This can substantially decrease the plotted cutoff region.} +#' \item{\code{lift}:}{Lift +#' value. 
\eqn{\frac{P(\hat{Y}=\oplus|Y=\oplus)}{P(\hat{Y}=\oplus)}}{P(Yhat = + | +#' Y = +)/P(Yhat = +)}.} +#' \item{\code{f}:}{precision-recall F measure (van Rijsbergen, 1979). Weighted +#' harmonic mean of precision (P) and recall (R). \eqn{F = +#' \frac{1}{\alpha \frac{1}{P} + (1-\alpha)\frac{1}{R}}}{F = 1/ +#' (alpha*1/P + (1-alpha)*1/R)}. If +#' \eqn{\alpha=\frac{1}{2}}{alpha=1/2}, the mean is balanced. A +#' frequent equivalent formulation is +#' \eqn{F = \frac{(\beta^2+1) \cdot P \cdot R}{R + \beta^2 \cdot +#' P}}{F = (beta^2+1) * P * R / (R + beta^2 * P)}. In this formulation, the +#' mean is balanced if \eqn{\beta=1}{beta=1}. Currently, ROCR only accepts +#' the alpha version as input (e.g. \eqn{\alpha=0.5}{alpha=0.5}). If no +#' value for alpha is given, the mean will be balanced by default.} +#' \item{\code{rch}:}{ROC convex hull. A ROC (=\code{tpr} vs \code{fpr}) curve +#' with concavities (which represent suboptimal choices of cutoff) removed +#' (Fawcett 2001). Since the result is already a parametric performance +#' curve, it cannot be used in combination with other measures.} +#' \item{\code{auc}:}{Area under the ROC curve. This is equal to the value of the +#' Wilcoxon-Mann-Whitney test statistic and also the probability that the +#' classifier will score are randomly drawn positive sample higher than a +#' randomly drawn negative sample. Since the output of +#' \code{auc} is cutoff-independent, this +#' measure cannot be combined with other measures into a parametric +#' curve. The partial area under the ROC curve up to a given false +#' positive rate can be calculated by passing the optional parameter +#' \code{fpr.stop=0.5} (or any other value between 0 and 1) to +#' \code{performance}.} +#' \item{\code{aucpr}:}{Area under the precision/recall curve. Since the output +#' of \code{aucpr} is cutoff-independent, this measure cannot be combined +#' with other measures into a parametric curve.} +#' \item{\code{prbe}:}{precision-recall break-even point. The cutoff(s) where +#' precision and recall are equal. At this point, positive and negative +#' predictions are made at the same rate as their prevalence in the +#' data. Since the output of +#' \code{prbe} is just a cutoff-independent scalar, this +#' measure cannot be combined with other measures into a parametric curve.} +#' \item{\code{cal}:}{Calibration error. The calibration error is the +#' absolute difference between predicted confidence and actual reliability. This +#' error is estimated at all cutoffs by sliding a window across the +#' range of possible cutoffs. The default window size of 100 can be +#' adjusted by passing the optional parameter \code{window.size=200} +#' to \code{performance}. E.g., if for several +#' positive samples the output of the classifier is around 0.75, you might +#' expect from a well-calibrated classifier that the fraction of them +#' which is correctly predicted as positive is also around 0.75. In a +#' well-calibrated classifier, the probabilistic confidence estimates +#' are realistic. Only for use with +#' probabilistic output (i.e. scores between 0 and 1).} +#' \item{\code{mxe}:}{Mean cross-entropy. Only for use with +#' probabilistic output. \eqn{MXE :=-\frac{1}{P+N}( \sum_{y_i=\oplus} +#' ln(\hat{y}_i) + \sum_{y_i=\ominus} ln(1-\hat{y}_i))}{MXE := - 1/(P+N) \sum_{y_i=+} +#' ln(yhat_i) + \sum_{y_i=-} ln(1-yhat_i)}. 
Since the output of +#' \code{mxe} is just a cutoff-independent scalar, this +#' measure cannot be combined with other measures into a parametric curve.} +#' \item{\code{rmse}:}{Root-mean-squared error. Only for use with +#' numerical class labels. \eqn{RMSE:=\sqrt{\frac{1}{P+N}\sum_i (y_i +#' - \hat{y}_i)^2}}{RMSE := sqrt(1/(P+N) \sum_i (y_i - +#' yhat_i)^2)}. Since the output of +#' \code{rmse} is just a cutoff-independent scalar, this +#' measure cannot be combined with other measures into a parametric curve.} +#' \item{\code{sar}:}{Score combinining performance measures of different +#' characteristics, in the attempt of creating a more "robust" +#' measure (cf. Caruana R., ROCAI2004): +#' SAR = 1/3 * ( accuracy + Area under the ROC curve + Root +#' mean-squared error ).} +#' \item{\code{ecost}:}{Expected cost. For details on cost curves, +#' cf. Drummond&Holte 2000,2004. \code{ecost} has an obligatory x +#' axis, the so-called 'probability-cost function'; thus it cannot be +#' combined with other measures. While using \code{ecost} one is +#' interested in the lower envelope of a set of lines, it might be +#' instructive to plot the whole set of lines in addition to the lower +#' envelope. An example is given in \code{demo(ROCR)}.} +#' \item{\code{cost}:}{Cost of a classifier when +#' class-conditional misclassification costs are explicitly given. +#' Accepts the optional parameters \code{cost.fp} and +#' \code{cost.fn}, by which the costs for false positives and +#' negatives can be adjusted, respectively. By default, both are set +#' to 1.} +#' } +#' +#' @note +#' Here is how to call \code{performance()} to create some standard +#' evaluation plots: +#' \describe{ +#' \item{ROC curves:}{measure="tpr", x.measure="fpr".} +#' \item{precision/recall graphs:}{measure="prec", x.measure="rec".} +#' \item{sensitivity/specificity plots:}{measure="sens", x.measure="spec".} +#' \item{Lift charts:}{measure="lift", x.measure="rpp".} +#' } +#' +#' @param prediction.obj An object of class \code{prediction}. +#' @param measure Performance measure to use for the evaluation. A complete list +#' of the performance measures that are available for \code{measure} and +#' \code{x.measure} is given in the 'Details' section. +#' @param x.measure A second performance measure. If different from the default, +#' a two-dimensional curve, with \code{x.measure} taken to be the unit in +#' direction of the x axis, and \code{measure} to be the unit in direction of +#' the y axis, is created. This curve is parametrized with the cutoff. +#' @param ... Optional arguments (specific to individual performance measures). +#' +#' @return An S4 object of class \code{performance}. +#' +#' @author +#' Tobias Sing \email{tobias.sing@gmail.com}, Oliver Sander +#' \email{osander@gmail.com} +#' +#' @export +performance <- function(prediction.obj, + measure, + x.measure="cutoff", + ...) 
{ + + ## define the needed environments + envir.list <- .define.environments() + long.unit.names <- envir.list$long.unit.names + function.names <- envir.list$function.names + obligatory.x.axis <- envir.list$obligatory.x.axis + optional.arguments <- envir.list$optional.arguments + default.values <- envir.list$default.values + + ## abort in case of misuse + if (class(prediction.obj) != 'prediction' || + !exists(measure, where=long.unit.names, inherits=FALSE) || + !exists(x.measure, where=long.unit.names, inherits=FALSE)) { + stop(paste("Wrong argument types: First argument must be of type", + "'prediction'; second and optional third argument must", + "be available performance measures!")) + } + + ## abort, if attempt is made to use a measure that has an obligatory + ## x.axis as the x.measure (cannot be combined) + if (exists( x.measure, where=obligatory.x.axis, inherits=FALSE )) { + message <- paste("The performance measure", + x.measure, + "can only be used as 'measure', because it has", + "the following obligatory 'x.measure':\n", + get( x.measure, envir=obligatory.x.axis)) + stop(message) + } + + ## if measure is a performance measure with obligatory x.axis, then + ## enforce this axis: + if (exists( measure, where=obligatory.x.axis, inherits=FALSE )) { + x.measure <- get( measure, envir=obligatory.x.axis ) + } + + if (x.measure == "cutoff" || + exists( measure, where=obligatory.x.axis, inherits=FALSE )) { + + ## fetch from '...' any optional arguments for the performance + ## measure at hand that are given, otherwise fill up the default values + optional.args <- list(...) + argnames <- c() + if ( exists( measure, where=optional.arguments, inherits=FALSE )) { + argnames <- get( measure, envir=optional.arguments ) + default.arglist <- list() + for (i in 1:length(argnames)) { + default.arglist <- c(default.arglist, + get(paste(measure,":",argnames[i],sep=""), + envir=default.values, inherits=FALSE)) + } + names(default.arglist) <- argnames + + for (i in 1:length(argnames)) { + templist <- list(optional.args, + default.arglist[[i]]) + names(templist) <- c('arglist', argnames[i]) + + optional.args <- do.call('.farg', templist) + } + } + optional.args <- .select.args( optional.args, argnames ) + + ## determine function name + function.name <- get( measure, envir=function.names ) + + ## for each x-validation run, compute the requested performance measure + x.values <- list() + y.values <- list() + for (i in 1:length( prediction.obj@predictions )) { + argumentlist <- .sarg(optional.args, + predictions= prediction.obj@predictions[[i]], + labels= prediction.obj@labels[[i]], + cutoffs= prediction.obj@cutoffs[[i]], + fp= prediction.obj@fp[[i]], + tp= prediction.obj@tp[[i]], + fn= prediction.obj@fn[[i]], + tn= prediction.obj@tn[[i]], + n.pos= prediction.obj@n.pos[[i]], + n.neg= prediction.obj@n.neg[[i]], + n.pos.pred= prediction.obj@n.pos.pred[[i]], + n.neg.pred= prediction.obj@n.neg.pred[[i]]) + + ans <- do.call( function.name, argumentlist ) + + if (!is.null(ans[[1]])) x.values <- c( x.values, list( ans[[1]] )) + y.values <- c( y.values, list( ans[[2]] )) + } + + if (! 
(length(x.values)==0 || length(x.values)==length(y.values)) ) { + stop("Consistency error.") + } + + ## create a new performance object + return( new("performance", + x.name = get( x.measure, envir=long.unit.names ), + y.name = get( measure, envir=long.unit.names ), + alpha.name = "none", + x.values = x.values, + y.values = y.values, + alpha.values = list() )) + } else { + perf.obj.1 <- performance( prediction.obj, measure=x.measure, ... ) + perf.obj.2 <- performance( prediction.obj, measure=measure, ... ) + return( .combine.performance.objects( perf.obj.1, perf.obj.2 ) ) + } +} + +#' @importFrom stats approxfun +.combine.performance.objects <- function( p.obj.1, p.obj.2 ) { + ## some checks for misusage (in any way, this function is + ## only for internal use) + if ( p.obj.1@x.name != p.obj.2@x.name ) { + stop("Error: Objects need to have identical x axis.") + } + if ( p.obj.1@alpha.name != "none" || p.obj.2@alpha.name != "none") { + stop("Error: At least one of the two objects has already been merged.") + } + if (length(p.obj.1@x.values) != length(p.obj.2@x.values)) { + stop(paste("Only performance objects with identical number of", + "cross-validation runs can be combined.")) + } + + x.values <- list() + x.name <- p.obj.1@y.name + y.values <- list() + y.name <- p.obj.2@y.name + alpha.values <- list() + alpha.name <- p.obj.1@x.name + + for (i in 1:length( p.obj.1@x.values )) { + x.values.1 <- p.obj.1@x.values[[i]] + y.values.1 <- p.obj.1@y.values[[i]] + x.values.2 <- p.obj.2@x.values[[i]] + y.values.2 <- p.obj.2@y.values[[i]] + + ## cutoffs of combined object = merged cutoffs of simple objects + cutoffs <- sort( unique( c(x.values.1, x.values.2)), decreasing=TRUE ) + + ## calculate y.values at cutoffs using step function + y.values.int.1 <- stats::approxfun(x.values.1, y.values.1, + method="constant",f=1,rule=2)(cutoffs) + y.values.int.2 <- stats::approxfun(x.values.2, y.values.2, + method="constant",f=1,rule=2)(cutoffs) + + ## 'approxfun' ignores NA and NaN + objs <- list( y.values.int.1, y.values.int.2) + objs.x <- list( x.values.1, x.values.2 ) + na.cutoffs.1.bool <- is.na( y.values.1) & !is.nan( y.values.1 ) + nan.cutoffs.1.bool <- is.nan( y.values.1) + na.cutoffs.2.bool <- is.na( y.values.2) & !is.nan( y.values.2 ) + nan.cutoffs.2.bool <- is.nan( y.values.2) + bools <- list(na.cutoffs.1.bool, nan.cutoffs.1.bool, + na.cutoffs.2.bool, nan.cutoffs.2.bool) + values <- c(NA,NaN,NA,NaN) + + for (j in 1:4) { + for (k in which(bools[[j]])) { + interval.max <- objs.x[[ ceiling(j/2) ]][k] + interval.min <- -Inf + if (k < length(objs.x[[ ceiling(j/2) ]])) { + interval.min <- objs.x[[ ceiling(j/2) ]][k+1] + } + objs[[ ceiling(j/2) ]][cutoffs <= interval.max & + cutoffs > interval.min ] <- values[j] + } + } + + alpha.values <- c(alpha.values, list(cutoffs)) + x.values <- c(x.values, list(objs[[1]])) + y.values <- c(y.values, list(objs[[2]])) + } + + return( new("performance", + x.name=x.name, y.name=y.name, + alpha.name=alpha.name, x.values=x.values, + y.values=y.values, alpha.values=alpha.values)) +} + +.define.environments <- function() { + ## There are five environments: long.unit.names, function.names, + ## obligatory.x.axis, optional.arguments, default.values + + ## Define long names corresponding to the measure abbreviations. 
+ long.unit.names <- new.env() + assign("none","None", envir=long.unit.names) + assign("cutoff", "Cutoff", envir=long.unit.names) + assign("acc", "accuracy", envir=long.unit.names) + assign("err", "Error Rate", envir=long.unit.names) + assign("fpr", "False positive rate", envir=long.unit.names) + assign("tpr", "True positive rate", envir=long.unit.names) + assign("rec", "recall", envir=long.unit.names) + assign("sens", "sensitivity", envir=long.unit.names) + assign("fnr", "False negative rate", envir=long.unit.names) + assign("tnr", "True negative rate", envir=long.unit.names) + assign("spec", "specificity", envir=long.unit.names) + assign("ppv", "Positive predictive value", envir=long.unit.names) + assign("prec", "precision", envir=long.unit.names) + assign("npv", "Negative predictive value", envir=long.unit.names) + assign("fall", "Fallout", envir=long.unit.names) + assign("miss", "Miss", envir=long.unit.names) + assign("pcfall", "Prediction-conditioned fallout", envir=long.unit.names) + assign("pcmiss", "Prediction-conditioned miss", envir=long.unit.names) + assign("rpp", "Rate of positive predictions", envir=long.unit.names) + assign("rnp", "Rate of negative predictions", envir=long.unit.names) + assign("auc","Area under the ROC curve", envir=long.unit.names) + assign("aucpr","Area under the precision/recall curve", envir=long.unit.names) + assign("cal", "Calibration error", envir=long.unit.names) + assign("mwp", "Median window position", envir=long.unit.names) + assign("prbe","precision/recall break-even point", envir=long.unit.names) + assign("rch", "ROC convex hull", envir=long.unit.names) + assign("mxe", "Mean cross-entropy", envir=long.unit.names) + assign("rmse","Root-mean-square error", envir=long.unit.names) + assign("phi", "Phi correlation coefficient", envir=long.unit.names) + assign("mat","Matthews correlation coefficient", envir=long.unit.names) + assign("mi", "Mutual information", envir=long.unit.names) + assign("chisq", "Chi-square test statistic", envir=long.unit.names) + assign("odds","Odds ratio", envir=long.unit.names) + assign("lift", "Lift value", envir=long.unit.names) + assign("f","precision-recall F measure", envir=long.unit.names) + assign("sar", "SAR", envir=long.unit.names) + assign("ecost", "Expected cost", envir=long.unit.names) + assign("cost", "Explicit cost", envir=long.unit.names) + + ## Define function names corresponding to the measure abbreviations. 
+ function.names <- new.env() + assign("acc", ".performance.accuracy", envir=function.names) + assign("err", ".performance.error.rate", envir=function.names) + assign("fpr", ".performance.false.positive.rate", envir=function.names) + assign("tpr", ".performance.true.positive.rate", envir=function.names) + assign("rec", ".performance.true.positive.rate", envir=function.names) + assign("sens", ".performance.true.positive.rate", envir=function.names) + assign("fnr", ".performance.false.negative.rate", envir=function.names) + assign("tnr", ".performance.true.negative.rate", envir=function.names) + assign("spec", ".performance.true.negative.rate", envir=function.names) + assign("ppv", ".performance.positive.predictive.value", + envir=function.names) + assign("prec", ".performance.positive.predictive.value", + envir=function.names) + assign("npv", ".performance.negative.predictive.value", + envir=function.names) + assign("fall", ".performance.false.positive.rate", envir=function.names) + assign("miss", ".performance.false.negative.rate", envir=function.names) + assign("pcfall", ".performance.prediction.conditioned.fallout", + envir=function.names) + assign("pcmiss", ".performance.prediction.conditioned.miss", + envir=function.names) + assign("rpp", ".performance.rate.of.positive.predictions", + envir=function.names) + assign("rnp", ".performance.rate.of.negative.predictions", + envir=function.names) + assign("auc", ".performance.auc", envir=function.names) + assign("aucpr", ".performance.aucpr", envir=function.names) + assign("cal", ".performance.calibration.error", envir=function.names) + assign("prbe", ".performance.precision.recall.break.even.point", + envir=function.names) + assign("rch", ".performance.rocconvexhull", envir=function.names) + assign("mxe", ".performance.mean.cross.entropy", envir=function.names) + assign("rmse", ".performance.root.mean.squared.error", + envir=function.names) + assign("phi", ".performance.phi", envir=function.names) + assign("mat", ".performance.phi", envir=function.names) + assign("mi", ".performance.mutual.information", envir=function.names) + assign("chisq", ".performance.chisq", envir=function.names) + assign("odds", ".performance.odds.ratio", envir=function.names) + assign("lift", ".performance.lift", envir=function.names) + assign("f", ".performance.f", envir=function.names) + assign("sar", ".performance.sar", envir=function.names) + assign("ecost", ".performance.expected.cost", envir=function.names) + assign("cost", ".performance.cost", envir=function.names) + + ## If a measure comes along with an obligatory x axis (including "none"), + ## list it here. + obligatory.x.axis <- new.env() + assign("mxe", "none", envir=obligatory.x.axis) + assign("rmse", "none", envir=obligatory.x.axis) + assign("prbe", "none", envir=obligatory.x.axis) + assign("auc", "none", envir=obligatory.x.axis) + assign("aucpr", "none", envir=obligatory.x.axis) + assign("rch","none", envir=obligatory.x.axis) + ## ecost requires probability cost function as x axis, which is handled + ## implicitly, not as an explicit performance measure. + assign("ecost","none", envir=obligatory.x.axis) + + ## If a measure has optional arguments, list the names of the + ## arguments here. 
+ optional.arguments <- new.env() + assign("cal", "window.size", envir=optional.arguments) + assign("f", "alpha", envir=optional.arguments) + assign("cost", c("cost.fp", "cost.fn"), envir=optional.arguments) + assign("auc", "fpr.stop", envir=optional.arguments) + + ## If a measure has additional arguments, list the default values + ## for them here. Naming convention: e.g. "cal" has an optional + ## argument "window.size" the key to use here is "cal:window.size" + ## (colon as separator) + default.values <- new.env() + assign("cal:window.size", 100, envir=default.values) + assign("f:alpha", 0.5, envir=default.values) + assign("cost:cost.fp", 1, envir=default.values) + assign("cost:cost.fn", 1, envir=default.values) + assign("auc:fpr.stop", 1, envir=default.values) + + list(long.unit.names=long.unit.names, function.names=function.names, + obligatory.x.axis=obligatory.x.axis, + optional.arguments=optional.arguments, + default.values=default.values) +} + diff --git a/R/precision_recall.R b/R/precision_recall.R index 8326e841702f272875a8b2d184a02ee153db5f9d..c9ececc8d5e98ed705aafad1d3182b0be9638884 100644 --- a/R/precision_recall.R +++ b/R/precision_recall.R @@ -9,8 +9,10 @@ #' @return A dataframe with precision recall. #' @export compute_pr_curve <- function(dt){ + ## -- replace 0 by minimum machine + dt$p.adj[ dt$p.adj == 0 ] <- 1e-217 ## --see .SDcols for order - pred_obj <- prediction(dt$p.adj , dt$isDE) + pred_obj <- prediction( -log10(dt$p.adj) , dt$isDE) perf_obj = performance(pred_obj, measure = "prec", x.measure = "rec") data2curve <- data.frame(x.name = perf_obj@x.values[[1]], y.name = perf_obj@y.values[[1]]) names(data2curve) <- c(unname(perf_obj@x.name), unname(perf_obj@y.name)) @@ -29,11 +31,7 @@ compute_pr_curve <- function(dt){ #' @param dt A data table with columns for recall and precision. #' @return A numeric value representing the AUC. #' @export -compute_pr_auc <- function(dt) PRAUC(y_pred = -log10(dt$p.adj) , y_true = dt$isDE) - - - - +compute_pr_auc <- function(dt) Area_Under_Curve( dt$recall, dt$precision ) #' Gets precision-recall objects for a given parameter. 
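To make the reworked precision-recall scoring concrete, here is a toy end-to-end sketch (hypothetical p-values and labels; assumes HTRfit is attached so that prediction(), performance() and Area_Under_Curve() are available) of the steps compute_pr_curve() and compute_pr_auc() now perform:

p.adj <- c(1e-12, 0, 0.003, 0.2, 0.6, 0.04, 0.9, 0.5)
isDE  <- c(TRUE, TRUE, TRUE, FALSE, FALSE, TRUE, FALSE, FALSE)

p.adj[p.adj == 0] <- 1e-217   # floor exact zeros before taking -log10()
score <- -log10(p.adj)        # higher score = stronger evidence for DE

pred_obj <- prediction(score, isDE)
perf_obj <- performance(pred_obj, measure = "prec", x.measure = "rec")
recall    <- perf_obj@x.values[[1]]
precision <- perf_obj@y.values[[1]]

## the highest cutoff gives a 0/0 precision (NaN); dropping non-finite points is a
## precaution for this toy sketch, not something compute_pr_auc() itself does
ok <- is.finite(recall) & is.finite(precision)
Area_Under_Curve(recall[ok], precision[ok])   # trapezoid PR AUC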
@@ -50,19 +48,40 @@ compute_pr_auc <- function(dt) PRAUC(y_pred = -log10(dt$p.adj) , y_true = dt$isD #' @export get_pr_object <- function(evaldata_params, col_param = "description", col_truth = "isDE", col_score = "p.adj" ) { + ## -- subset fixed eff + evaldata_params <- subset(evaldata_params, effect == "fixed") + + ## -- by params -- random class AUC + prop_table <- table(evaldata_params[[col_param]], evaldata_params[[col_truth]]) + random_classifier_auc_params <- prop_table[,"TRUE"]/rowSums(prop_table) + random_classifier_auc_params <- as.data.frame(random_classifier_auc_params) + random_classifier_auc_params[col_param] <- rownames(random_classifier_auc_params) + + ## -- aggregate -- random class AUC + prop_table <- table(evaldata_params[[col_truth]]) + random_classifier_auc_agg <- unname(prop_table["TRUE"]/sum(prop_table)) + ## -- data.table conversion dt_evaldata_params <- data.table::setDT(evaldata_params) - + ## -- by params pr_curve_params <- dt_evaldata_params[, compute_pr_curve(.SD), by=c("from", col_param), .SDcols=c(col_truth, col_score)] - pr_auc_params <- dt_evaldata_params[, compute_pr_auc(.SD), by=c("from", col_param), .SDcols=c(col_score, col_truth)] + pr_auc_params <- pr_curve_params[, compute_pr_auc(.SD), by=c("from", col_param), .SDcols=c("recall", "precision")] names(pr_auc_params)[ names(pr_auc_params) == "V1" ] <- "pr_AUC" + pr_auc_params <- join_dtf(pr_auc_params, random_classifier_auc_params , + k1 = col_param, k2 = col_param) + names(pr_auc_params)[ names(pr_auc_params) == "random_classifier_auc_params" ] <- "pr_randm_AUC" + + + ## -- aggregate pr_curve_agg <- dt_evaldata_params[, compute_pr_curve(.SD), by = "from", .SDcols=c(col_truth, col_score)] - pr_auc_agg <- dt_evaldata_params[, compute_pr_auc(.SD), by = "from", .SDcols=c(col_score, col_truth)] + pr_auc_agg <- pr_curve_agg[, compute_pr_auc(.SD), by = "from", .SDcols=c("recall", "precision")] names(pr_auc_agg)[ names(pr_auc_agg) == "V1" ] <- "pr_AUC" - + pr_auc_agg$pr_randm_AUC <- random_classifier_auc_agg + + return(list(byparams = list(pr_curve = as.data.frame(pr_curve_params), pr_auc = as.data.frame(pr_auc_params)), aggregate = list(pr_curve = as.data.frame(pr_curve_agg), diff --git a/R/receiver_operating_characteristic.R b/R/receiver_operating_characteristic.R index 55fac1c7c44486f09d25fb890c3d18ec6b365016..27854c7b7297a7a0fdae61c850954b8b705493c1 100644 --- a/R/receiver_operating_characteristic.R +++ b/R/receiver_operating_characteristic.R @@ -35,6 +35,9 @@ getLabelExpected <- function(comparison_df, coeff_threshold, alt_hypothesis) { idx_DE <- abs(comparison_df$actual) > coeff_threshold comparison_df$isDE <- idx_DE } + ## isDE for random params == NA + idx_ran_pars <- comparison_df$effect == "ran_pars" + comparison_df$isDE[idx_ran_pars] <- NA return(comparison_df) } @@ -49,7 +52,9 @@ getLabelExpected <- function(comparison_df, coeff_threshold, alt_hypothesis) { #' @return A data frame with specificity, sensitivity, and threshold values. 
#' @export compute_roc_curve <- function(dt){ - pred_obj <- prediction( dt$p.adj, !dt$isDE) + ## -- replace 0 by minimum machine + dt$p.adj[ dt$p.adj == 0 ] <- 1e-217 + pred_obj <- prediction( -log10(dt$p.adj), dt$isDE) perf_obj <- performance(pred_obj,"tpr","fpr") data2curve <- data.frame(x.name = perf_obj@x.values[[1]], y.name = perf_obj@y.values[[1]]) names(data2curve) <- c(unname(perf_obj@x.name), unname(perf_obj@y.name)) @@ -61,10 +66,10 @@ compute_roc_curve <- function(dt){ #' #' This function calculates the area under the ROC curve (AUC) using specificity and sensitivity values. #' -#' @param dt A data table with columns for y_pred (-log10(padj)) and y_true (idDE). -#' @return A numeric value representing the AUC. +#' @param dt A data table with columns for True positive rate and False positive rate +#' @return A numeric value representing the AUC. #' @export -compute_roc_auc <- function(dt) AUC(y_pred = -log10(dt$p.adj) , y_true = dt$isDE) +compute_roc_auc <- function(dt) Area_Under_Curve(x = dt$`False positive rate`, y = dt$`True positive rate`) @@ -82,18 +87,23 @@ compute_roc_auc <- function(dt) AUC(y_pred = -log10(dt$p.adj) , y_true = dt$isDE #' @export get_roc_object <- function(evaldata_params, col_param = "description", col_truth = "isDE", col_score = "p.adj" ) { + ## -- subset fixed eff + evaldata_params <- subset(evaldata_params, effect == "fixed") + ## -- data.table conversion dt_evaldata_params <- data.table::setDT(evaldata_params) ## -- by params roc_curve_params <- dt_evaldata_params[, compute_roc_curve(.SD), by=c("from", col_param), .SDcols=c(col_truth, col_score)] - roc_auc_params <- dt_evaldata_params[, compute_roc_auc(.SD), by=c("from", col_param), .SDcols=c(col_score, col_truth)] + roc_auc_params <- roc_curve_params[, compute_roc_auc(.SD), by=c("from", col_param), .SDcols=c("False positive rate", "True positive rate")] names(roc_auc_params)[ names(roc_auc_params) == "V1" ] <- "roc_AUC" - + roc_auc_params$roc_randm_AUC <- 0.5 + ## -- aggregate roc_curve_agg <- dt_evaldata_params[, compute_roc_curve(.SD), by= "from", .SDcols=c(col_truth, col_score)] - roc_auc_agg <- dt_evaldata_params[, compute_roc_auc(.SD), by="from", .SDcols=c(col_score, col_truth)] + roc_auc_agg <- roc_curve_agg[, compute_roc_auc(.SD), by="from", .SDcols=c("False positive rate", "True positive rate")] names(roc_auc_agg)[ names(roc_auc_agg) == "V1" ] <- "roc_AUC" + roc_auc_agg$roc_randm_AUC <- 0.5 return(list(byparams = list(roc_curve = as.data.frame(roc_curve_params), roc_auc = as.data.frame(roc_auc_params)), diff --git a/R/rocr_functions.R b/R/rocr_functions.R index 8326e841702f272875a8b2d184a02ee153db5f9d..c9ececc8d5e98ed705aafad1d3182b0be9638884 100644 --- a/R/rocr_functions.R +++ b/R/rocr_functions.R @@ -9,8 +9,10 @@ #' @return A dataframe with precision recall. #' @export compute_pr_curve <- function(dt){ + ## -- replace 0 by minimum machine + dt$p.adj[ dt$p.adj == 0 ] <- 1e-217 ## --see .SDcols for order - pred_obj <- prediction(dt$p.adj , dt$isDE) + pred_obj <- prediction( -log10(dt$p.adj) , dt$isDE) perf_obj = performance(pred_obj, measure = "prec", x.measure = "rec") data2curve <- data.frame(x.name = perf_obj@x.values[[1]], y.name = perf_obj@y.values[[1]]) names(data2curve) <- c(unname(perf_obj@x.name), unname(perf_obj@y.name)) @@ -29,11 +31,7 @@ compute_pr_curve <- function(dt){ #' @param dt A data table with columns for recall and precision. #' @return A numeric value representing the AUC. 
#' @export -compute_pr_auc <- function(dt) PRAUC(y_pred = -log10(dt$p.adj) , y_true = dt$isDE) - - - - +compute_pr_auc <- function(dt) Area_Under_Curve( dt$recall, dt$precision ) #' Gets precision-recall objects for a given parameter. @@ -50,19 +48,40 @@ compute_pr_auc <- function(dt) PRAUC(y_pred = -log10(dt$p.adj) , y_true = dt$isD #' @export get_pr_object <- function(evaldata_params, col_param = "description", col_truth = "isDE", col_score = "p.adj" ) { + ## -- subset fixed eff + evaldata_params <- subset(evaldata_params, effect == "fixed") + + ## -- by params -- random class AUC + prop_table <- table(evaldata_params[[col_param]], evaldata_params[[col_truth]]) + random_classifier_auc_params <- prop_table[,"TRUE"]/rowSums(prop_table) + random_classifier_auc_params <- as.data.frame(random_classifier_auc_params) + random_classifier_auc_params[col_param] <- rownames(random_classifier_auc_params) + + ## -- aggregate -- random class AUC + prop_table <- table(evaldata_params[[col_truth]]) + random_classifier_auc_agg <- unname(prop_table["TRUE"]/sum(prop_table)) + ## -- data.table conversion dt_evaldata_params <- data.table::setDT(evaldata_params) - + ## -- by params pr_curve_params <- dt_evaldata_params[, compute_pr_curve(.SD), by=c("from", col_param), .SDcols=c(col_truth, col_score)] - pr_auc_params <- dt_evaldata_params[, compute_pr_auc(.SD), by=c("from", col_param), .SDcols=c(col_score, col_truth)] + pr_auc_params <- pr_curve_params[, compute_pr_auc(.SD), by=c("from", col_param), .SDcols=c("recall", "precision")] names(pr_auc_params)[ names(pr_auc_params) == "V1" ] <- "pr_AUC" + pr_auc_params <- join_dtf(pr_auc_params, random_classifier_auc_params , + k1 = col_param, k2 = col_param) + names(pr_auc_params)[ names(pr_auc_params) == "random_classifier_auc_params" ] <- "pr_randm_AUC" + + + ## -- aggregate pr_curve_agg <- dt_evaldata_params[, compute_pr_curve(.SD), by = "from", .SDcols=c(col_truth, col_score)] - pr_auc_agg <- dt_evaldata_params[, compute_pr_auc(.SD), by = "from", .SDcols=c(col_score, col_truth)] + pr_auc_agg <- pr_curve_agg[, compute_pr_auc(.SD), by = "from", .SDcols=c("recall", "precision")] names(pr_auc_agg)[ names(pr_auc_agg) == "V1" ] <- "pr_AUC" - + pr_auc_agg$pr_randm_AUC <- random_classifier_auc_agg + + return(list(byparams = list(pr_curve = as.data.frame(pr_curve_params), pr_auc = as.data.frame(pr_auc_params)), aggregate = list(pr_curve = as.data.frame(pr_curve_agg), diff --git a/R/simulation_report.R b/R/simulation_report.R index c985c70671bcd7f2914676be91cb9cd69966eef3..560653a60b3f2256cf4d8d022261c7449bf8526c 100644 --- a/R/simulation_report.R +++ b/R/simulation_report.R @@ -63,6 +63,49 @@ getGrobTable <- function(df){ +#' Check Validity of Truth Labels +#' +#' This function checks the validity of truth labels for HTRfit evaluation, specifically designed for binary classification. +#' +#' @param eval_data Data frame containing evaluation parameters. +#' @param col_param Column name specifying the parameter for grouping. +#' @param col_truth Column name for binary ground truth values (default is "isDE"). +#' @return Logical value indicating the validity of truth labels. 
+#' @export +is_truthLabels_valid <- function(eval_data, col_param = "description", col_truth = "isDE" ) { + + ## --init + isValid <- TRUE + + ## -- subset fixed effect + eval_data_fixed <- subset(eval_data, effect == "fixed") + + table_summary <- table(eval_data_fixed[[col_param]], eval_data_fixed[[col_truth]]) + ## -- 2 lines needed (FALSE/TRUE) + n_labels <- dim(table_summary)[2] + if(n_labels != 2) { + labels_str <- paste(colnames(table_summary), collapse = ", ") + msg <- paste("Both FALSE/TRUE truth labels (isDE) are required for classification evaluation.\nFound : ", labels_str ) + message(msg) + isValid <- FALSE + } + ## -- one label found 0 time ! + label_not_found <- which(table_summary == 0, arr.ind=TRUE) + if(dim(label_not_found)[1] > 0){ + description <- rownames(label_not_found) + label_not_found <- colnames(table_summary)[label_not_found[,"col"]] + msg <- "Both TRUE and FALSE labels are required for HTRfit evaluation.\n" + msg2 <- paste("Label isDE ==", label_not_found, "not found for description ==", description, collapse = '\n') + msg3 <- "Please review your threshold or alternative hypothesis, and consider checking the identity plot for further insights." + msg <- paste(msg, msg2, ".\n", msg3 , sep = "") + message(msg) + isValid <- FALSE + } + return(isValid) +} + + + #' Compute evaluation report for TMB/DESeq2 analysis #' #' This function computes an evaluation report for TMB/DESeq2 analysis using several graphical @@ -73,14 +116,15 @@ getGrobTable <- function(df){ #' functions like \code{get_eval_data}, \code{get_pr_object} and \code{get_roc_object} to #' perform computations and generate plots. #' -#' @param l_tmb TMB results from analysis. +#' @param list_tmb TMB results from analysis. #' @param dds DESeq2 results from differential gene expression analysis. #' @param mock_obj Mock object that represents the distribution of measurements corresponding #' to mock samples. -#' @param coefficient Fold-change threshold used to define differential expression. +#' @param coeff_threshold natural logarithm fold-change threshold used to define differential expression. #' @param alt_hypothesis Alternative hypothesis used for hypothesis testing. #' @param alpha_risk parameter that sets the threshold for alpha risk level (default 0.05). #' @param palette_color Optional parameter that sets the color palette for plots. +#' @param skip_eval_intercept indicate whether to calculate precision-recall and ROC metrics for the intercept (default TRUE). #' @param ... Additional parameters to be passed to \code{get_pr_curve} and \code{get_roc_curve}. #' #' @return A list containing the following components: @@ -89,29 +133,60 @@ getGrobTable <- function(df){ #' \item{roc}{A ROC curve object generated from TMB and DESeq2 results.} #' \item{counts}{A counts plot generated from mock object.} #' @export -evaluation_report <- function(l_tmb, dds, mock_obj, coefficient, alt_hypothesis, alpha_risk = 0.05, palette_color = NULL, ...) { +evaluation_report <- function(list_tmb, dds, mock_obj, coeff_threshold, alt_hypothesis, alpha_risk = 0.05, palette_color = NULL, skip_eval_intercept = TRUE, ...) 
{ ## -- eval data - eval_data <- get_eval_data(l_tmb, dds, mock_obj, coefficient, alt_hypothesis) + eval_data <- get_eval_data(list_tmb, dds, mock_obj, coeff_threshold, alt_hypothesis) ## -- identity plot #identity_data <- rbind_model_params_and_dispersion(eval_data) params_identity_eval <- eval_identityTerm(eval_data$modelparams) dispersion_identity_eval <- eval_identityTerm(eval_data$modeldispersion) + if (isTRUE(skip_eval_intercept)){ + eval_data2metrics <- subset(eval_data$modelparams, + term != "(Intercept)") + } + else{ + eval_data2metrics <- eval_data$modelparams + } + + ## -- counts plot + counts_violinplot <- counts_plot(mock_obj) + + ## -- check if eval data ok + if(isFALSE(is_truthLabels_valid(eval_data2metrics))){ + + message("The required truth labels for HTRfit classification metrics evaluation are not met. Only the identity plot and counts plot will be returned.") + ## -- clear memory + invisible(gc(reset = T, verbose = F, full = T)) ; + + return( + list( + data = eval_data, + identity = list( params = params_identity_eval$p, + dispersion = dispersion_identity_eval$p ), + counts = counts_violinplot + + )) + + + } + + + ## -- pr curve - pr_curve_obj <- get_pr_object(eval_data$modelparams) + pr_curve_obj <- get_pr_object(eval_data2metrics) pr_curve_obj <- get_pr_curve(pr_curve_obj, ...) ## -- auc curve - roc_curve_obj <- get_roc_object(eval_data$modelparams) + roc_curve_obj <- get_roc_object(eval_data2metrics) roc_curve_obj <- get_roc_curve(roc_curve_obj, ...) ## -- acc, recall, sensib, speci, ... - metrics_obj <- get_ml_metrics_obj(eval_data$modelparams, alpha_risk ) + metrics_obj <- get_ml_metrics_obj(eval_data2metrics, alpha_risk ) ## -- merge all metrics in one obj - model_perf_obj <- get_performances_metrics_obj( params_identity_eval$R2, - dispersion_identity_eval$R2, + model_perf_obj <- get_performances_metrics_obj( r2_params = params_identity_eval$R2, dispersion_identity_eval$R2, pr_curve_obj, roc_curve_obj, metrics_obj ) @@ -222,6 +297,10 @@ compute_metrics_summary <- function(dt) { #' @export get_ml_metrics_obj <- function(evaldata_params, alpha_risk = 0.05, col_param = "description"){ + ## -- subset fixed eff + evaldata_params <- subset(evaldata_params, effect == "fixed") + + evaldata_params$y_pred <- evaldata_params$p.adj < alpha_risk ## by params diff --git a/R/update_fittedmodel.R b/R/update_fittedmodel.R index dac1bca13219c1a889cee7378fe05cc148a1686d..24ba2e8a7e73dcd752d7a68b6b351f9e79d68386 100644 --- a/R/update_fittedmodel.R +++ b/R/update_fittedmodel.R @@ -71,7 +71,7 @@ parallel_update <- function(formula, list_tmb, n.cores = NULL, clust <- parallel::makeCluster(n.cores, type= cl_type, outfile = log_file) - parallel::clusterExport(clust, c("launchUpdate", "fitUpdate"), envir=environment()) + parallel::clusterExport(clust, c("launchUpdate", "fitUpdate")) updated_res <- parallel::parLapply(clust, X = list_tmb, fun = launchUpdate , formula = formula, ...) parallel::stopCluster(clust) ; invisible(gc(reset = T, verbose = F, full = T)); diff --git a/dev/flat_full.Rmd b/dev/flat_full.Rmd index 230e7f2d50e54bb92e228b1872cf79a69166bfe7..7ad9b3f575d9d654d741df69bc57c93f4ba20a13 100644 --- a/dev/flat_full.Rmd +++ b/dev/flat_full.Rmd @@ -5627,6 +5627,8 @@ getLabelExpected <- function(comparison_df, coeff_threshold, alt_hypothesis) { #' @return A data frame with specificity, sensitivity, and threshold values. 
#' @export compute_roc_curve <- function(dt){ + ## -- replace 0 by minimum machine + dt$p.adj[ dt$p.adj == 0 ] <- 1e-217 pred_obj <- prediction( -log10(dt$p.adj), dt$isDE) perf_obj <- performance(pred_obj,"tpr","fpr") data2curve <- data.frame(x.name = perf_obj@x.values[[1]], y.name = perf_obj@y.values[[1]]) @@ -5639,10 +5641,10 @@ compute_roc_curve <- function(dt){ #' #' This function calculates the area under the ROC curve (AUC) using specificity and sensitivity values. #' -#' @param dt A data table with columns for y_pred (-log10(padj)) and y_true (idDE). -#' @return A numeric value representing the AUC. +#' @param dt A data table with columns for True positive rate and False positive rate +#' @return A numeric value representing the AUC. #' @export -compute_roc_auc <- function(dt) AUC(y_pred = -log10(dt$p.adj) , y_true = dt$isDE) +compute_roc_auc <- function(dt) Area_Under_Curve(x = dt$`False positive rate`, y = dt$`True positive rate`) @@ -5668,13 +5670,13 @@ get_roc_object <- function(evaldata_params, col_param = "description", col_truth ## -- by params roc_curve_params <- dt_evaldata_params[, compute_roc_curve(.SD), by=c("from", col_param), .SDcols=c(col_truth, col_score)] - roc_auc_params <- dt_evaldata_params[, compute_roc_auc(.SD), by=c("from", col_param), .SDcols=c(col_score, col_truth)] + roc_auc_params <- roc_curve_params[, compute_roc_auc(.SD), by=c("from", col_param), .SDcols=c("False positive rate", "True positive rate")] names(roc_auc_params)[ names(roc_auc_params) == "V1" ] <- "roc_AUC" roc_auc_params$roc_randm_AUC <- 0.5 ## -- aggregate roc_curve_agg <- dt_evaldata_params[, compute_roc_curve(.SD), by= "from", .SDcols=c(col_truth, col_score)] - roc_auc_agg <- dt_evaldata_params[, compute_roc_auc(.SD), by="from", .SDcols=c(col_score, col_truth)] + roc_auc_agg <- roc_curve_agg[, compute_roc_auc(.SD), by="from", .SDcols=c("False positive rate", "True positive rate")] names(roc_auc_agg)[ names(roc_auc_agg) == "V1" ] <- "roc_AUC" roc_auc_agg$roc_randm_AUC <- 0.5 @@ -5818,16 +5820,18 @@ test_that("compute_roc_auc computes AUC from data frame", { p.adj = c(1, 0.75, 0.75, 0.5, 0.25, 0, 0.1) ) - # Expected output - expected_result <- 0.42 - + roc_curve_obj <- compute_roc_curve(dt) # Test - result <- compute_roc_auc(dt) + result <- compute_roc_auc(roc_curve_obj) + + # Expected output + expected_result <- 0.42 expect_equal(round(result,2 ), expected_result) }) test_that("get_roc_object returns ROC curves and AUCs", { # Test data + set.seed(101) evaldata_params <- data.frame( from = rep(c("HTRfit", "DESeq2"), each = 5), description = rep(c("param1", "param2"), each = 5), @@ -6297,62 +6301,29 @@ specificity <- function(y_true, y_pred, positive = NULL) { #' @param x the x-points of the curve #' @param y the y-points of the curve #' @param method can be "trapezoid" (default), "step" or "spline" -#' @param na.rm a logical value indicating whether NA values should be stripped before the computation proceeds #' @return Area Under the Curve (AUC) #' @examples #' x <- seq(0, pi, length.out = 200) #' plot(x = x, y = sin(x), type = "l") -#' Area_Under_Curve(x = x, y = sin(x), method = "trapezoid", na.rm = TRUE) -#' @importFrom stats integrate -#' @importFrom stats na.omit +#' Area_Under_Curve(x = x, y = sin(x), method = "trapezoid") #' @importFrom stats splinefun #' @export -Area_Under_Curve <- function(x, y, method = c("trapezoid", "step", "spline"), na.rm = FALSE) { - if (na.rm == TRUE) { - xy_cbind <- na.omit(cbind(x, y)) - x <- xy_cbind[, 1] - y <- xy_cbind[, 2] - } - if (length(x) != 
length(y)) { - stop("length x must equal length y") - } +Area_Under_Curve <- function(x, y, method = "trapezoid"){ idx <- order(x) x <- x[idx] y <- y[idx] - switch(match.arg(arg = method, choices = c("trapezoid", "step","spline")), - trapezoid = { - AUC <- sum((apply(cbind(y[-length(y)], y[-1]), 1, mean)) *(x[-1] - x[-length(x)]))}, - step = { - AUC <- sum(y[-length(y)] * (x[-1] - x[-length(x)]))}, - spline = { - AUC <- integrate(splinefun(x, y, method = "natural"), lower = min(x),upper = max(x))$value}) - return(AUC) + if (method == 'trapezoid'){ + auc <- sum((rowMeans(cbind(y[-length(y)], y[-1]))) * (x[-1] - x[-length(x)])) + }else if (method == 'step'){ + auc <- sum(y[-length(y)] * (x[-1] - x[-length(x)])) + }else if (method == 'spline'){ + auc <- integrate(splinefun(x, y, method = "natural"), lower = min(x), upper = max(x)) + auc <- auc$value + } + return(auc) } -#' @title Area Under the precision-recall Curve (PR AUC) -#' -#' @description -#' Compute the Area Under the precision-recall Curve (PR AUC) from prediction scores. -#' -#' @param y_pred Predicted probabilities vector, as returned by a classifier -#' @param y_true Ground truth (correct) 0-1 labels vector -#' @return Area Under the PR Curve (PR AUC) -#' @examples -#' data(cars) -#' logreg <- glm(formula = vs ~ hp + wt, -#' family = binomial(link = "logit"), data = mtcars) -#' PRAUC(y_pred = logreg$fitted.values, y_true = mtcars$vs) -#' @export -PRAUC <- function(y_pred, y_true) { - #pred_obj <- prediction(y_pred, y_true) - #perf_obj <- performance(pred_obj, measure = "prec", x.measure = "rec") - #PRAUC <- Area_Under_Curve(perf_obj@x.values[[1]], perf_obj@y.values[[1]], method = "trapezoid", na.rm = TRUE) - prauc <- pr.curve( - scores.class0 = -log10(dt$p.adj), - weights.class0 = dt$isDE - )[[2]] - return(PRAUC) -} + .performance.positive.predictive.value <- function(predictions, labels, cutoffs, fp, tp, fn, tn, @@ -6394,27 +6365,6 @@ PRAUC <- function(y_pred, y_true) { } -#' @title Area Under the Receiver Operating Characteristic Curve (ROC AUC) -#' -#' @description -#' Compute the Area Under the Receiver Operating Characteristic Curve (ROC AUC) from prediction scores. -#' -#' @param y_pred Predicted probabilities vector, as returned by a classifier -#' @param y_true Ground truth (correct) 0-1 labels vector -#' @return Area Under the ROC Curve (ROC AUC) -#' @examples -#' data(cars) -#' logreg <- glm(formula = vs ~ hp + wt, -#' family = binomial(link = "logit"), data = mtcars) -#' AUC(y_pred = logreg$fitted.values, y_true = mtcars$vs) -#' @export -AUC <- function(y_pred, y_true) { - rank <- rank(y_pred) - n_pos <- as.double(sum(y_true == 1)) - n_neg <- as.double(sum(y_true == 0)) - AUC <- (sum(rank[y_true == 1]) - n_pos * (n_pos + 1) / 2) / (n_pos * n_neg) - return(AUC) -} ``` @@ -6429,9 +6379,7 @@ AUC <- function(y_pred, y_true) { #' @return A dataframe with precision recall. #' @export compute_pr_curve <- function(dt){ - ## -- replace 0 by minimum machine - ## -- => avoid bug with baseline dt$p.adj[ dt$p.adj == 0 ] <- 1e-217 ## --see .SDcols for order pred_obj <- prediction( -log10(dt$p.adj) , dt$isDE) @@ -6453,17 +6401,7 @@ compute_pr_curve <- function(dt){ #' @param dt A data table with columns for recall and precision. #' @return A numeric value representing the AUC. 
#' @export -compute_pr_auc <- function(dt) { - ## -- replace 0 by minimum machine - ## -- => avoid bug with baseline - dt$p.adj[ dt$p.adj == 0 ] <- 1e-217 - prauc <- PRAUC(y_pred = -log10(dt$p.adj) , y_true = dt$isDE) - return(prauc) -} - - - - +compute_pr_auc <- function(dt) Area_Under_Curve( dt$recall, dt$precision ) #' Gets precision-recall objects for a given parameter. @@ -6482,7 +6420,7 @@ get_pr_object <- function(evaldata_params, col_param = "description", col_truth ## -- subset fixed eff evaldata_params <- subset(evaldata_params, effect == "fixed") - + ## -- by params -- random class AUC prop_table <- table(evaldata_params[[col_param]], evaldata_params[[col_truth]]) random_classifier_auc_params <- prop_table[,"TRUE"]/rowSums(prop_table) @@ -6498,7 +6436,7 @@ get_pr_object <- function(evaldata_params, col_param = "description", col_truth ## -- by params pr_curve_params <- dt_evaldata_params[, compute_pr_curve(.SD), by=c("from", col_param), .SDcols=c(col_truth, col_score)] - pr_auc_params <- dt_evaldata_params[, compute_pr_auc(.SD), by=c("from", col_param), .SDcols=c(col_score, col_truth)] + pr_auc_params <- pr_curve_params[, compute_pr_auc(.SD), by=c("from", col_param), .SDcols=c("recall", "precision")] names(pr_auc_params)[ names(pr_auc_params) == "V1" ] <- "pr_AUC" pr_auc_params <- join_dtf(pr_auc_params, random_classifier_auc_params , k1 = col_param, k2 = col_param) @@ -6509,7 +6447,7 @@ get_pr_object <- function(evaldata_params, col_param = "description", col_truth ## -- aggregate pr_curve_agg <- dt_evaldata_params[, compute_pr_curve(.SD), by = "from", .SDcols=c(col_truth, col_score)] - pr_auc_agg <- dt_evaldata_params[, compute_pr_auc(.SD), by = "from", .SDcols=c(col_score, col_truth)] + pr_auc_agg <- pr_curve_agg[, compute_pr_auc(.SD), by = "from", .SDcols=c("recall", "precision")] names(pr_auc_agg)[ names(pr_auc_agg) == "V1" ] <- "pr_AUC" pr_auc_agg$pr_randm_AUC <- random_classifier_auc_agg @@ -6620,8 +6558,9 @@ test_that("compute_pr_auc computes area under the precision-recall curve", { ) # Test the compute_pr_auc function - result <- compute_pr_auc(dt) - expect_equal(expected = 0.59, round(result, 2)) + pr_curve <- compute_pr_curve(dt) + result <- compute_pr_auc(pr_curve) + expect_equal(expected = 0.60, round(result, 2)) }) test_that("get_pr_object gets precision-recall objects", { @@ -8956,7 +8895,8 @@ getActualMixed_typeI <- function(list_logqij, genes_iter_list, categoricalVar_in #' Compare the mixed-effects inference to expected values. #' -#' This function compares the mixed-effects inference obtained from a mixed-effects model to expected values derived from a ground truth dataset. The function assumes a specific type I mixed-effect structure in the input model. +#' This function compares the mixed-effects inference obtained from a mixed-effects model to expected values derived from a ground truth dataset. +#' The function assumes a specific type I mixed-effect structure in the input model. # #' @param tidy_tmb tidy model results obtained from fitting a mixed-effects model. #' @param ground_truth_eff A data frame containing ground truth effects. @@ -8998,8 +8938,8 @@ inferenceToExpected_withMixedEff <- function(tidy_tmb, ground_truth_eff){ #' Calculate actual mixed effects. #' #' This function calculates actual mixed effects based on the given data for a specific type I mixed-effect structure. -# It calculates the expected values, standard deviations, and correlations between the fixed and random effects. 
-# The function is designed to work with specific input data for type I mixed-effect calculations. +# It calculates the expected values, standard deviations, and correlations between the fixed and random effects. +# The function is designed to work with specific input data for type I mixed-effect calculations. # #' @param data_gene Data for a specific gene. #' @param labelRef_InCategoricalVar The reference label for the categorical variable. @@ -9318,31 +9258,33 @@ test_that("calculate_actualMixed calculates actual mixed effects as expected", { # High-Throughput RNA-seq model fit -In the realm of RNAseq analysis, various key experimental parameters play a crucial role in influencing the statistical power to detect expression changes. Parameters such as sequencing depth, the number of replicates, and more have a significant impact. To navigate the selection of optimal values for these experimental parameters, we introduce a comprehensive statistical framework known as **HTRfit**, underpinned by computational simulation. **HTRfit** serves as a versatile tool, not only for simulation but also for conducting differential expression analysis. It facilitates this analysis by fitting Generalized Linear Models (GLMs) with multiple variables, which could encompass genotypes, environmental factors, and more. These GLMs are highly adaptable, allowing the incorporation of fixed effects, mixed effects, and interactions between variables. +In the realm of RNAseq analysis, various key experimental parameters play a crucial role in influencing the statistical power to detect expression changes. Parameters such as sequencing depth, the number of replicates, and others are expected to impact this statistical power. To help with the selection of optimal values for these experimental parameters, we introduce a comprehensive statistical framework known as **HTRfit**, underpinned by computational simulation. **HTRfit** serves as a versatile tool, not only for simulation but also for conducting differential expression analysis. It facilitates this analysis by fitting Generalized Linear Models (GLMs) with multiple variables, which could encompass genotypes, environmental factors, and more. These GLMs are highly adaptable, allowing the incorporation of fixed effects, mixed effects, and interactions between variables. + -# Initialize variable to simulate +# Initializing variables for simulation -The `init_variable()` function, which is a key tool for defining the variables in your experimental design. You can specify the variables' names and the size of the effects involved. By manually setting the effect of a variable, you make it a fixed effect, while random effect definitions can make it either fixed or mixed. +The `init_variable()` function is used for defining the variables in your experimental design. Sizes of effects for each variable and interaction term can be defined in two different ways: 1) The user can manually sets values of all levels of a variable, in which case the effects are necessarily considered fixed in the model; 2) The effects can be randomly picked in a normal distribution with mean and standard deviation defined by the user, in which case the user can decide whether the effects are considered fixed or random in the model. -## Manually init my first variable + +## Manually init a single variable The `init_variable()` function allows for precise control over the variables in your experimental design. 
-In this example, we manually initialize **varA** with specifics size effects (mu) and levels. +In this example, we manually initialize **varA** with specific size effects (mu) for three levels. ```{r example-init_variable_man, warning=FALSE, message=FALSE} -input_var_list <- init_variable( name = "varA", mu = c(0.2, 4, -3), level = 3) +list_var <- init_variable( name = "varA", mu = c(0.2, 4, -3), level = 3) ``` -## Randomly init my first variable +## Randomly init a single variable Alternatively, you can randomly initialize **varA** by specifying a mean (mu) and standard deviation (sd). -This introduces variability into **varA**, making it either a fixed or mixed effect in your design. +This introduces variability into **varA**, making it either a fixed or random effect in your design. ```{r example-init_variable_rand, warning=FALSE, message=FALSE} -input_var_list <- init_variable( name = "varA", mu = 10, sd = 0.2, level = 5) +list_var <- init_variable( name = "varA", mu = 10, sd = 0.2, level = 5) ``` @@ -9351,31 +9293,35 @@ input_var_list <- init_variable( name = "varA", mu = 10, sd = 0.2, level = 5) You can also initialize multiple variables, such as **varA** and **varB**, with random values. This flexibility allows you to create diverse experimental designs. + ```{r example-init_variable_mult, warning=FALSE, message=FALSE} -input_var_list <- init_variable( name = "varA", mu = 10, sd = 0.2, level = 5) %>% +list_var <- init_variable( name = "varA", mu = 10, sd = 0.2, level = 5) %>% init_variable( name = "varB", mu = -3, sd = 0.34, level = 2) ``` + ## Add interaction between variable -Similarly to `init_variable()`, `add_interaction()` allow to init an interaction between variable. +Similarly to `init_variable()`, `add_interaction()` allows to define an interaction between variable. In this example, we initialize **varA** and **varB**, and create an interaction between **varA**, and **varB** using `add_interaction()`. + ```{r example-add_interaction, warning=FALSE, message=FALSE} -input_var_list <- init_variable( name = "varA", mu = 3, sd = 0.2, level = 2) %>% +list_var <- init_variable( name = "varA", mu = 3, sd = 0.2, level = 2) %>% init_variable( name = "varB", mu = 2, sd = 0.43, level = 2) %>% add_interaction( between_var = c("varA", "varB"), mu = 0.44, sd = 0.2) ``` -## Initialized a complex design +## Complex design Interactions can involve a maximum of three variables, such as **varA**, **varB**, and **varC**. + ```{r example-add_interaction_complex, eval= FALSE, message=FALSE, warning=FALSE, include=TRUE} ## /!\ not evaluated in Vignette -input_var_list <- init_variable( name = "varA", mu = 5, sd = 0.2, level = 2) %>% +list_var <- init_variable( name = "varA", mu = 5, sd = 0.2, level = 2) %>% init_variable( name = "varB", mu = 1, sd = 0.78, level = 2) %>% init_variable( name = "varC", mu = c(2, 3), sd = NA, level = 2) %>% add_interaction( between_var = c("varA", "varC"), mu = 0.44, sd = 0.2) %>% @@ -9385,16 +9331,17 @@ input_var_list <- init_variable( name = "varA", mu = 5, sd = 0.2, level = 2) %>% ``` -## Initialization variable object +## Structure of list_var object ```{r example-str_obj_init, warning=FALSE, message=FALSE} -str(input_var_list) +str(list_var) ``` -# Simulate RNAseq data +# Simulation of RNAseq data + +In this section, you will explore how to generate RNAseq data based on the previously defined input variables. 
The `mock_rnaseq()` function enables you to manage parameters in your RNAseq design, including number of genes, minimum and maximum number of replicates within your experimental setup, sequencing depth, basal expression of each gene, and dispersion of gene expression used for simulating counts. -In this section, you will explore how to generate RNAseq data based on the previously defined input variables. The `mock_rnaseq()` function enables you to manage parameters in your RNAseq design, including the number of genes, the minimum and maximum number of replicates within your experimental setup. You can also adjust the sequencing depth, the basal gene expression, and the gene dispersion used for simulating counts. ## Minimal example @@ -9407,19 +9354,19 @@ MAX_REPLICATES = 10 ## -- simulate RNAseq data based on input_var_list, minimum input required ## -- number of replicate randomly defined between MIN_REP and MAX_REP -mock_data <- mock_rnaseq(input_var_list, N_GENES, +mock_data <- mock_rnaseq(list_var, N_GENES, min_replicates = MIN_REPLICATES, max_replicates = MAX_REPLICATES) ## -- simulate RNAseq data based on input_var_list, minimum input required ## -- Same number of replicates between conditions -mock_data <- mock_rnaseq(input_var_list, N_GENES, +mock_data <- mock_rnaseq(list_var, N_GENES, min_replicates = MAX_REPLICATES, max_replicates = MAX_REPLICATES) ``` -## Scaling genes counts with sequencing depth +## Scaling genes counts based on sequencing depth Sequencing depth is a critical parameter affecting the statistical power of an RNAseq analysis. With the `sequencing_depth` option in the `mock_rnaseq()` function, you have the ability to control this parameter. @@ -9432,17 +9379,19 @@ MAX_REPLICATES = 10 SEQ_DEPTH = c(100000, 5000000, 10000000)## -- Possible number of reads/sample SEQ_DEPTH = 10000000 ## -- all samples have same number of reads -mock_data <- mock_rnaseq(input_var_list, N_GENES, +mock_data <- mock_rnaseq(list_var, N_GENES, min_replicates = MIN_REPLICATES, max_replicates = MAX_REPLICATES, sequencing_depth = SEQ_DEPTH) ``` -## Set gene dispersion -The dispersion parameter ($\alpha_i$), characterizes the relationship between the variance of the observed count and its mean value. In simple terms, it quantifies how much we expect the observed count to deviate from the mean value. You can specify the dispersion for individual genes using the dispersion parameter. +## Set dispersion of gene expression + +The dispersion parameter ($\alpha_i$), characterizes the relationship between the variance of the observed read counts and its mean value. In simple terms, it quantifies how much we expect observed counts to deviate from the mean value. You can specify the dispersion for individual genes using the dispersion parameter. + -```{r example-mock_rnaseq_disp, warning=FALSE, message=FALSE} +```{r example-mock_rnaseq_disp, warning = FALSE, message = FALSE} ## -- Required parameters N_GENES = 6000 @@ -9453,20 +9402,18 @@ MAX_REPLICATES = 4 DISP = 0.1 ## -- Same dispersion for each genes DISP = 1000 ## -- Same dispersion for each genes DISP = runif(N_GENES, 0, 1000) ## -- Dispersion can vary between genes -mock_data <- mock_rnaseq(input_var_list, N_GENES, +mock_data <- mock_rnaseq(list_var, N_GENES, min_replicates = MIN_REPLICATES, max_replicates = MAX_REPLICATES, dispersion = DISP ) ``` - - ## Set basal gene expression -The basal gene expression parameter, accessible through the basal_expression option, allows you to control the fundamental baseline gene expression level. 
It lets you adjust the expected count when no other factors are influencing gene expression, making it a key factor for simulating RNAseq data that aligns with your experimental design. +The basal gene expression parameter, defined using the basal_expression option, allows you to control the fundamental baseline of expression level of genes. When a single value is specified, the basal expression of all genes is the same and the effects of variables defined in list_var are applied from this basal expression level. When several values are specified, the basal expression of each gene is picked randomly among these values (with replacement). The effects of variables are then applied from the basal expression of each gene. This represents the fact that expression levels can vary among genes even when not considering the effects of the defined variables (for example, some genes are constitutively more highly expressed than others because they have stronger basal promoters). -```{r example-mock_rnaseq_bexpr, warning=FALSE, message=FALSE} +```{r example-mock_rnaseq_bexpr, warning = FALSE, message = FALSE} ## -- Required parameters N_GENES = 6000 MIN_REPLICATES = 10 @@ -9476,7 +9423,7 @@ MAX_REPLICATES = 10 BASAL_EXPR = -3 ## -- Value can be negative to simulate low expressed gene BASAL_EXPR = 2 ## -- Same basal gene expression for the N_GENES BASAL_EXPR = c( -3, -1, 2, 8, 9, 10 ) ## -- Basal expression can vary between genes -mock_data <- mock_rnaseq(input_var_list, N_GENES, +mock_data <- mock_rnaseq(list_var, N_GENES, min_replicates = MIN_REPLICATES, max_replicates = MAX_REPLICATES, basal_expression = BASAL_EXPR) @@ -9484,6 +9431,8 @@ mock_data <- mock_rnaseq(input_var_list, N_GENES, ``` + + ## Mock RNAseq object ```{r example-str_obj_mock, warning=FALSE, message=FALSE} @@ -9514,14 +9463,14 @@ The `prepareData2fit()` function serves the purpose of converting the counts mat ```{r example-prepareData, warning=FALSE, message=FALSE} ## -- simulate small example to prevent excessively lengthy vignette construction -input_var_list <- init_variable( name = "varA", mu = 3, sd = 0.2, level = 2) %>% +list_var <- init_variable( name = "varA", mu = 3, sd = 0.2, level = 2) %>% init_variable( name = "varB", mu = 2, sd = 0.43, level = 2) %>% add_interaction( between_var = c("varA", "varB"), mu = 0.44, sd = 0.2) N_GENES = 30 MIN_REPLICATES = 4 MAX_REPLICATES = 4 BASAL_EXPR = rnorm(N_GENES , 0 , 1 ) -mock_data <- mock_rnaseq(input_var_list, N_GENES, +mock_data <- mock_rnaseq(list_var, N_GENES, min_replicates = MIN_REPLICATES, max_replicates = MAX_REPLICATES, basal_expression = BASAL_EXPR) @@ -9628,26 +9577,25 @@ l_tmb <- updateParallel(formula = kij ~ varA + varB + varA:varB , n.cores = 1) ``` -#### list tmb object +#### Struture of list tmb object ```{r example-str_obj_l_tmb, warning=FALSE, message=FALSE} -str(l_tmb$gene1) +str(l_tmb$gene1, max.level = 1) ``` - ## Plot fit metrics Visualizing fit metrics is essential for evaluating your models. Here, we show you how to generate various plots to assess the quality of your models. You can explore all metrics or focus on specific aspects like dispersion and log-likelihood. 
-```{r example-plotMetrics , warning=FALSE, message=FALSE, fig.align='center', fig.height=4, fig.width=6} +```{r example-plotMetrics , warning=FALSE, message=FALSE, fig.align='center', fig.height=4, fig.width=8} ## -- plot all metrics diagnostic_plot(list_tmb = l_tmb) ``` -```{r example-plotMetricsFocus, warning=FALSE, message=FALSE, fig.align='center', fig.height=3, fig.width=4} +```{r example-plotMetricsFocus, warning=FALSE, message=FALSE, fig.align='center', fig.height=3, fig.width=8} ## -- Focus on metrics diagnostic_plot(list_tmb = l_tmb, focus = c("dispersion", "logLik")) ``` @@ -9708,7 +9656,7 @@ The dispersion plot, generated by the `evaluation_report()` function, offers a v The Receiver Operating Characteristic (ROC) curve is a valuable tool for assessing the performance of classification models, particularly in the context of identifying differentially expressed genes. It provides a graphical representation of the model's ability to distinguish between genes that are differentially expressed and those that are not, by varying the `coeff_threshold` and the `alt_hypothesis` parameters. -```{r example-outputROC, warning = FALSE, message = FALSE, fig.align='center', fig.height=4, fig.width=5} +```{r example-outputROC, warning = FALSE, message = FALSE, fig.align='center', fig.height=4, fig.width=9} ## -- ROC curve resSimu$roc$params ``` @@ -9720,7 +9668,7 @@ resSimu$roc$params The precision-recall curve (PR curve) illustrates the relationship between precision and recall at various classification thresholds. It is particularly valuable in the context of imbalanced data, where one class is significantly more prevalent than the other. Unlike the ROC curve, which can be influenced by class distribution, the PR curve focuses on the model's ability to correctly identify examples of the minority class while maintaining high precision. -```{r example-outputPR, warning = FALSE, message = FALSE, fig.align='center', fig.height=4, fig.width=5} +```{r example-outputPR, warning = FALSE, message = FALSE, fig.align='center', fig.height=4, fig.width=9} ## -- precision-recall curve resSimu$precision_recall$params ``` @@ -9733,7 +9681,7 @@ The area under the ROC curve (AUC) provides a single metric that summarizes the In addition to evaluating model performance through the AUC for both ROC and PR curves, we provide access to key classification metrics, including Accuracy, Precision, Recall (or Sensitivity), and Specificity. These metrics offer a comprehensive view of the model's classification capabilities. These metrics are computed for each parameter (excluding the intercept when skip_eval_intercept = TRUE), providing detailed insights into individual parameter contributions. Furthermore, an aggregated assessment is performed, considering all parameters (except the intercept by default), offering a perspective on the model's overall classification effectiveness. 
-```{r example-outputPerf, warning = FALSE, message = FALSE, fig.align='center', fig.height=4, fig.width=5} +```{r example-outputPerf, warning = FALSE, message = FALSE, fig.align='center', fig.height=4, fig.width=9} ## -- precision-recall curve resSimu$performances ``` @@ -9763,12 +9711,15 @@ resSimu <- evaluation_report(list_tmb = l_tmb, ``` -```{r example-outputResSimu, warning = FALSE, message = FALSE, fig.align='center', fig.height=4, fig.width=5} +```{r example-outputResSimu_id, warning = FALSE, message = FALSE, fig.align='center', fig.height=4, fig.width=5} ## -- identity plot ###### 1) Model params resSimu$identity$params ###### Dispersion params resSimu$identity$dispersion +``` + +```{r example-outputResSimu_metric, warning = FALSE, message = FALSE, fig.align='center', fig.height=4, fig.width=9} ## -- precision-recall curve resSimu$precision_recall$params ## -- ROC curve @@ -9798,12 +9749,15 @@ resSimu <- evaluation_report(list_tmb = l_tmb, As we compare this evaluation to the previous one, we observe a reduction in performances for both **HTRfit** and **DESeq2** inferences. -```{r example-subsetGenes_rocPlot, warning=FALSE, message=FALSE, fig.align='center', fig.height=4, fig.width=5} +```{r example-subsetGenes_id, warning=FALSE, message=FALSE, fig.align='center', fig.height=4, fig.width=5} ## -- identity plot ###### 1) Model params resSimu$identity$params ###### Dispersion params resSimu$identity$dispersion +``` + +```{r example-subsetGenes_metrics, warning=FALSE, message=FALSE, fig.align='center', fig.height=4, fig.width=9} ## -- precision-recall curve resSimu$precision_recall$params ## -- ROC curve @@ -9812,6 +9766,35 @@ resSimu$roc$params resSimu$performances ``` + +## Modifying your alpha risk + +```{r example-alphaRisk_1, warning = FALSE, message = FALSE} +## -- get simulation/fit evaluation +resSimu <- evaluation_report(list_tmb = l_tmb, + dds = dds, + mock_obj = mock_data, + coeff_threshold = 0.4, + alpha_risk = 0.05, ## -- default + alt_hypothesis = "greaterAbs") +## -- Performances metrics +resSimu$performances +``` + + +```{r example-alphaRiskk_2, warning = FALSE, message = FALSE} +## -- get simulation/fit evaluation +resSimu <- evaluation_report(list_tmb = l_tmb, + dds = dds, + mock_obj = mock_data, + coeff_threshold = 0.4, + alpha_risk = 0.01, + alt_hypothesis = "greaterAbs") +## -- Performances metrics +resSimu$performances +``` + + ## Evaluate model inference involving mixed effects For certain experimental scenarios, such as those involving a high number of levels or longitudinal data, the utilization of mixed effects within your design formula can be beneficial. The **HTRfit** simulation framework also offers the capability to assess this type of design formula. 
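Below is a minimal, illustrative sketch of such a design formula. It reuses the `updateParallel()` call from the fixed-effect example above and assumes the same `list_tmb` argument name as `parallel_update()`; the random-effect term is written with glmmTMB-style syntax, e.g. `(1 | varB)` for a random intercept or `(varA | varB)` for a random slope and intercept. The exact call in your analysis may differ.

```r
## -- illustrative sketch only: varB handled as a random effect (glmmTMB-style syntax)
## -- argument names assumed from the fixed-effect examples in this vignette
l_tmb <- updateParallel(formula = kij ~ varA + (varA | varB),
                        list_tmb = l_tmb,
                        n.cores = 1)
```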
@@ -9844,12 +9827,15 @@ resSimu <- evaluation_report(list_tmb = l_tmb, alt_hypothesis = "greater") ``` -```{r example-outputResSimuMixed, warning = FALSE, message = FALSE, fig.align='center', fig.height=4, fig.width=5} +```{r example-outputResSimuMixed_id, warning = FALSE, message = FALSE, fig.align='center', fig.height=4, fig.width=5} ## -- identity plot ###### 1) Model params resSimu$identity$params ###### Dispersion params resSimu$identity$dispersion +``` + +```{r example-outputResSimuMixed_metric, warning = FALSE, message = FALSE, fig.align='center', fig.height=4, fig.width=9} ## -- precision-recall curve resSimu$precision_recall$params ## -- ROC curve @@ -9858,6 +9844,12 @@ resSimu$roc$params resSimu$performances ``` +## Strure of evaluation report object + + +```{r example-str_eval_report, warning=FALSE, message=FALSE} +str(resSimu, max.level = 1) +``` ## About mixed model evaluation @@ -9872,7 +9864,7 @@ fusen::fill_description(fields = list(Title = "HTRfit"), overwrite = T) usethis::use_mit_license("Arnaud DUVERMY") usethis::use_pipe(export = TRUE) devtools::document() -.# Keep eval=FALSE to avoid infinite loop in case you hit the knit button +# Keep eval=FALSE to avoid infinite loop in case you hit the knit button # Execute in the console directly fusen::inflate(flat_file = "dev/flat_full.Rmd", vignette_name = "HTRfit", open_vignette = T, overwrite = T) ``` diff --git a/man/AUC.Rd b/man/AUC.Rd deleted file mode 100644 index cf85027e2d7474b1f5f0de29a716bf3c4d282d4e..0000000000000000000000000000000000000000 --- a/man/AUC.Rd +++ /dev/null @@ -1,25 +0,0 @@ -% Generated by roxygen2: do not edit by hand -% Please edit documentation in R/mlmetrics.R -\name{AUC} -\alias{AUC} -\title{Area Under the Receiver Operating Characteristic Curve (ROC AUC)} -\usage{ -AUC(y_pred, y_true) -} -\arguments{ -\item{y_pred}{Predicted probabilities vector, as returned by a classifier} - -\item{y_true}{Ground truth (correct) 0-1 labels vector} -} -\value{ -Area Under the ROC Curve (ROC AUC) -} -\description{ -Compute the Area Under the Receiver Operating Characteristic Curve (ROC AUC) from prediction scores. -} -\examples{ -data(cars) -logreg <- glm(formula = vs ~ hp + wt, - family = binomial(link = "logit"), data = mtcars) -AUC(y_pred = logreg$fitted.values, y_true = mtcars$vs) -} diff --git a/man/Area_Under_Curve.Rd b/man/Area_Under_Curve.Rd index 84894e1905eb0317b11163afb2b345c70b51113d..dfb6d8652540651e7e1d3b5b0222cb082a54743c 100644 --- a/man/Area_Under_Curve.Rd +++ b/man/Area_Under_Curve.Rd @@ -4,12 +4,7 @@ \alias{Area_Under_Curve} \title{Calculate the Area Under the Curve} \usage{ -Area_Under_Curve( - x, - y, - method = c("trapezoid", "step", "spline"), - na.rm = FALSE -) +Area_Under_Curve(x, y, method = "trapezoid") } \arguments{ \item{x}{the x-points of the curve} @@ -17,8 +12,6 @@ Area_Under_Curve( \item{y}{the y-points of the curve} \item{method}{can be "trapezoid" (default), "step" or "spline"} - -\item{na.rm}{a logical value indicating whether NA values should be stripped before the computation proceeds} } \value{ Area Under the Curve (AUC) @@ -29,5 +22,5 @@ Calculate the area under the curve. 
\examples{ x <- seq(0, pi, length.out = 200) plot(x = x, y = sin(x), type = "l") -Area_Under_Curve(x = x, y = sin(x), method = "trapezoid", na.rm = TRUE) +Area_Under_Curve(x = x, y = sin(x), method = "trapezoid") } diff --git a/man/PRAUC.Rd b/man/PRAUC.Rd deleted file mode 100644 index ab39fb27ad60a84f50b7b2dc5ef7094976c1e59c..0000000000000000000000000000000000000000 --- a/man/PRAUC.Rd +++ /dev/null @@ -1,25 +0,0 @@ -% Generated by roxygen2: do not edit by hand -% Please edit documentation in R/mlmetrics.R -\name{PRAUC} -\alias{PRAUC} -\title{Area Under the precision-recall Curve (PR AUC)} -\usage{ -PRAUC(y_pred, y_true) -} -\arguments{ -\item{y_pred}{Predicted probabilities vector, as returned by a classifier} - -\item{y_true}{Ground truth (correct) 0-1 labels vector} -} -\value{ -Area Under the PR Curve (PR AUC) -} -\description{ -Compute the Area Under the precision-recall Curve (PR AUC) from prediction scores. -} -\examples{ -data(cars) -logreg <- glm(formula = vs ~ hp + wt, - family = binomial(link = "logit"), data = mtcars) -PRAUC(y_pred = logreg$fitted.values, y_true = mtcars$vs) -} diff --git a/man/checkFractionOfZero.Rd b/man/checkFractionOfZero.Rd new file mode 100644 index 0000000000000000000000000000000000000000..198bfe4a9d7fac5f39c76c18cf3740678e4e5aed --- /dev/null +++ b/man/checkFractionOfZero.Rd @@ -0,0 +1,20 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/mock_rnaseq.R +\name{checkFractionOfZero} +\alias{checkFractionOfZero} +\title{Check Fraction of Zero or One in Counts Table} +\usage{ +checkFractionOfZero(counts_table) +} +\arguments{ +\item{counts_table}{A matrix or data frame representing counts.} +} +\description{ +This function checks the percentage of counts in a given counts table that are either zero or one. +If more than 50\% of the counts fall in this category, a warning is issued, suggesting a review of input parameters. +} +\examples{ +# Example usage: +counts_table <- matrix(c(0, 1, 2, 3, 4, 0, 0, 1, 1), ncol = 3) +checkFractionOfZero(counts_table) +} diff --git a/man/compute_roc_auc.Rd b/man/compute_roc_auc.Rd index 429ba6e97f26a9fdb6f6cab1a2293b8fa88d7a31..08511585af5c2f637ea8af0c5ed4502d98c3a6a6 100644 --- a/man/compute_roc_auc.Rd +++ b/man/compute_roc_auc.Rd @@ -7,7 +7,7 @@ compute_roc_auc(dt) } \arguments{ -\item{dt}{A data table with columns for y_pred (-log10(padj)) and y_true (idDE).} +\item{dt}{A data table with columns for True positive rate and False positive rate} } \value{ A numeric value representing the AUC. diff --git a/man/evaluation_report.Rd b/man/evaluation_report.Rd index 664746b2d4a8a48813299187c36338eb99a860bf..b037b33c3fdfd03e291937fa5de9c7b5ed3f351e 100644 --- a/man/evaluation_report.Rd +++ b/man/evaluation_report.Rd @@ -5,25 +5,26 @@ \title{Compute evaluation report for TMB/DESeq2 analysis} \usage{ evaluation_report( - l_tmb, + list_tmb, dds, mock_obj, - coefficient, + coeff_threshold, alt_hypothesis, alpha_risk = 0.05, palette_color = NULL, + skip_eval_intercept = TRUE, ... 
) } \arguments{ -\item{l_tmb}{TMB results from analysis.} +\item{list_tmb}{TMB results from analysis.} \item{dds}{DESeq2 results from differential gene expression analysis.} \item{mock_obj}{Mock object that represents the distribution of measurements corresponding to mock samples.} -\item{coefficient}{Fold-change threshold used to define differential expression.} +\item{coeff_threshold}{natural logarithm fold-change threshold used to define differential expression.} \item{alt_hypothesis}{Alternative hypothesis used for hypothesis testing.} @@ -31,6 +32,8 @@ to mock samples.} \item{palette_color}{Optional parameter that sets the color palette for plots.} +\item{skip_eval_intercept}{indicate whether to calculate precision-recall and ROC metrics for the intercept (default TRUE).} + \item{...}{Additional parameters to be passed to \code{get_pr_curve} and \code{get_roc_curve}.} } \value{ diff --git a/man/inferenceToExpected_withMixedEff.Rd b/man/inferenceToExpected_withMixedEff.Rd index bcb6dff787af2e14d8408b95cad8bf417992be49..113810bda47a17a905ef87cfc1f0f507d35da78f 100644 --- a/man/inferenceToExpected_withMixedEff.Rd +++ b/man/inferenceToExpected_withMixedEff.Rd @@ -15,7 +15,8 @@ inferenceToExpected_withMixedEff(tidy_tmb, ground_truth_eff) A data frame with the comparison of estimated mixed effects to expected values. } \description{ -This function compares the mixed-effects inference obtained from a mixed-effects model to expected values derived from a ground truth dataset. The function assumes a specific type I mixed-effect structure in the input model. +This function compares the mixed-effects inference obtained from a mixed-effects model to expected values derived from a ground truth dataset. +The function assumes a specific type I mixed-effect structure in the input model. } \examples{ \dontrun{ diff --git a/man/is_truthLabels_valid.Rd b/man/is_truthLabels_valid.Rd new file mode 100644 index 0000000000000000000000000000000000000000..a812e18d1c9dac7a1bfe2e8298ceedffc70f6837 --- /dev/null +++ b/man/is_truthLabels_valid.Rd @@ -0,0 +1,21 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/simulation_report.R +\name{is_truthLabels_valid} +\alias{is_truthLabels_valid} +\title{Check Validity of Truth Labels} +\usage{ +is_truthLabels_valid(eval_data, col_param = "description", col_truth = "isDE") +} +\arguments{ +\item{eval_data}{Data frame containing evaluation parameters.} + +\item{col_param}{Column name specifying the parameter for grouping.} + +\item{col_truth}{Column name for binary ground truth values (default is "isDE").} +} +\value{ +Logical value indicating the validity of truth labels. +} +\description{ +This function checks the validity of truth labels for HTRfit evaluation, specifically designed for binary classification. +} diff --git a/man/performance-class.Rd b/man/performance-class.Rd index 3d3bc11f542186cdbd9bd6ffef695ab637853778..9e8e4d53da0679f529019eefb02826695c77afe2 100644 --- a/man/performance-class.Rd +++ b/man/performance-class.Rd @@ -1,14 +1,41 @@ % Generated by roxygen2: do not edit by hand -% Please edit documentation in R/fake-section-title.R +% Please edit documentation in R/fake-section-title.R, R/na-na.R \docType{class} \name{performance-class} \alias{performance-class} \title{Class \code{performance}} \description{ +Object to capture the result of a performance evaluation, optionally +collecting evaluations from several cross-validation or bootstrapping runs. 
+ Object to capture the result of a performance evaluation, optionally collecting evaluations from several cross-validation or bootstrapping runs. } \details{ +A \code{performance} object can capture information from four +different evaluation scenarios: +\itemize{ +\item The behaviour of a cutoff-dependent performance measure across +the range of all cutoffs (e.g. \code{performance( predObj, 'acc' )} ). Here, +\code{x.values} contains the cutoffs, \code{y.values} the +corresponding values of the performance measure, and +\code{alpha.values} is empty.\cr +\item The trade-off between two performance measures across the +range of all cutoffs (e.g. \code{performance( predObj, + 'tpr', 'fpr' )} ). In this case, the cutoffs are stored in +\code{alpha.values}, while \code{x.values} and \code{y.values} +contain the corresponding values of the two performance measures.\cr +\item A performance measure that comes along with an obligatory +second axis (e.g. \code{performance( predObj, 'ecost' )} ). Here, the measure values are +stored in \code{y.values}, while the corresponding values of the +obligatory axis are stored in \code{x.values}, and \code{alpha.values} +is empty.\cr +\item A performance measure whose value is just a scalar +(e.g. \code{performance( predObj, 'auc' )} ). The value is then stored in +\code{y.values}, while \code{x.values} and \code{alpha.values} are +empty. +} + A \code{performance} object can capture information from four different evaluation scenarios: \itemize{ @@ -52,20 +79,50 @@ other.} \item{\code{y.values}}{A list in which each entry contains the y values of the curve of this particular cross-validation run.} +\item{\code{alpha.values}}{A list in which each entry contains the cutoff values of +the curve of this particular cross-validation run.} + +\item{\code{x.name}}{Performance measure used for the x axis.} + +\item{\code{y.name}}{Performance measure used for the y axis.} + +\item{\code{alpha.name}}{Name of the unit that is used to create the parametrized +curve. Currently, curves can only be parametrized by cutoff, so +\code{alpha.name} is either \code{none} or \code{cutoff}.} + +\item{\code{x.values}}{A list in which each entry contains the x values of the curve +of this particular cross-validation run. \code{x.values[[i]]}, +\code{y.values[[i]]}, and \code{alpha.values[[i]]} correspond to each +other.} + +\item{\code{y.values}}{A list in which each entry contains the y values of the curve +of this particular cross-validation run.} + \item{\code{alpha.values}}{A list in which each entry contains the cutoff values of the curve of this particular cross-validation run.} }} \section{Objects from the Class}{ +Objects can be created by using the \code{performance} function. + + Objects can be created by using the \code{performance} function. } \references{ +A detailed list of references can be found on the ROCR homepage at +\url{http://rocr.bioinf.mpi-sb.mpg.de}. + A detailed list of references can be found on the ROCR homepage at \url{http://rocr.bioinf.mpi-sb.mpg.de}. 
} \seealso{ +\code{\link{prediction}} +\code{\link{performance}}, +\code{\link{prediction-class}}, +\code{\link{plot.performance}} + \code{\link{prediction}} \code{\link{performance}}, \code{\link{prediction-class}}, diff --git a/man/performance.Rd b/man/performance.Rd index 886aee2b7bccdf04404eb1f60278cd14d4fabf34..aebc6c484d4db008f3185db80769c84430f9631f 100644 --- a/man/performance.Rd +++ b/man/performance.Rd @@ -1,9 +1,11 @@ % Generated by roxygen2: do not edit by hand -% Please edit documentation in R/fake-section-title.R +% Please edit documentation in R/fake-section-title.R, R/na-na.R \name{performance} \alias{performance} \title{Function to create performance objects} \usage{ +performance(prediction.obj, measure, x.measure = "cutoff", ...) + performance(prediction.obj, measure, x.measure = "cutoff", ...) } \arguments{ @@ -21,9 +23,13 @@ the y axis, is created. This curve is parametrized with the cutoff.} \item{...}{Optional arguments (specific to individual performance measures).} } \value{ +An S4 object of class \code{performance}. + An S4 object of class \code{performance}. } \description{ +All kinds of predictor evaluations are performed using this function. + All kinds of predictor evaluations are performed using this function. } \details{ @@ -172,8 +178,163 @@ Accepts the optional parameters \code{cost.fp} and negatives can be adjusted, respectively. By default, both are set to 1.} } + +Here is the list of available performance measures. Let Y and +\eqn{\hat{Y}}{Yhat} be random variables representing the class and the prediction for +a randomly drawn sample, respectively. We denote by +\eqn{\oplus}{+} and \eqn{\ominus}{-} the positive and +negative class, respectively. Further, we use the following +abbreviations for empirical quantities: P (\# positive +samples), N (\# negative samples), TP (\# true positives), TN (\# true +negatives), FP (\# false positives), FN (\# false negatives). +\describe{ +\item{\code{acc}:}{accuracy. \eqn{P(\hat{Y}=Y)}{P(Yhat = Y)}. Estimated +as: \eqn{\frac{TP+TN}{P+N}}{(TP+TN)/(P+N)}.} +\item{\code{err}:}{Error rate. \eqn{P(\hat{Y}\ne Y)}{P(Yhat != + Y)}. Estimated as: \eqn{\frac{FP+FN}{P+N}}{(FP+FN)/(P+N)}.} +\item{\code{fpr}:}{False positive rate. \eqn{P(\hat{Y}=\oplus | Y = + \ominus)}{P(Yhat = + | Y = -)}. Estimated as: +\eqn{\frac{FP}{N}}{FP/N}.} +\item{\code{fall}:}{Fallout. Same as \code{fpr}.} +\item{\code{tpr}:}{True positive +rate. \eqn{P(\hat{Y}=\oplus|Y=\oplus)}{P(Yhat = + | Y = +)}. Estimated +as: \eqn{\frac{TP}{P}}{TP/P}.} +\item{\code{rec}:}{recall. Same as \code{tpr}.} +\item{\code{sens}:}{sensitivity. Same as \code{tpr}.} +\item{\code{fnr}:}{False negative +rate. \eqn{P(\hat{Y}=\ominus|Y=\oplus)}{P(Yhat = - | Y = + +)}. Estimated as: \eqn{\frac{FN}{P}}{FN/P}.} +\item{\code{miss}:}{Miss. Same as \code{fnr}.} +\item{\code{tnr}:}{True negative rate. \eqn{P(\hat{Y} = + \ominus|Y=\ominus)}{P(Yhat = - | Y = -)}.} +\item{\code{spec}:}{specificity. Same as \code{tnr}.} +\item{\code{ppv}:}{Positive predictive +value. \eqn{P(Y=\oplus|\hat{Y}=\oplus)}{P(Y = + | Yhat = + +)}. Estimated as: \eqn{\frac{TP}{TP+FP}}{TP/(TP+FP)}.} +\item{\code{prec}:}{precision. Same as \code{ppv}.} +\item{\code{npv}:}{Negative predictive +value. \eqn{P(Y=\ominus|\hat{Y}=\ominus)}{P(Y = - | Yhat = + -)}. Estimated as: \eqn{\frac{TN}{TN+FN}}{TN/(TN+FN)}.} +\item{\code{pcfall}:}{Prediction-conditioned +fallout. \eqn{P(Y=\ominus|\hat{Y}=\oplus)}{P(Y = - | Yhat = + +)}. 
Estimated as: \eqn{\frac{FP}{TP+FP}}{FP/(TP+FP)}.} +\item{\code{pcmiss}:}{Prediction-conditioned +miss. \eqn{P(Y=\oplus|\hat{Y}=\ominus)}{P(Y = + | Yhat = + -)}. Estimated as: \eqn{\frac{FN}{TN+FN}}{FN/(TN+FN)}.} +\item{\code{rpp}:}{Rate of positive predictions. \eqn{P( \hat{Y} = + \oplus)}{P(Yhat = +)}. Estimated as: (TP+FP)/(TP+FP+TN+FN).} +\item{\code{rnp}:}{Rate of negative predictions. \eqn{P( \hat{Y} = + \ominus)}{P(Yhat = -)}. Estimated as: (TN+FN)/(TP+FP+TN+FN).} +\item{\code{phi}:}{Phi correlation coefficient. \eqn{\frac{TP \cdot + TN - FP \cdot FN}{\sqrt{ (TP+FN) \cdot (TN+FP) \cdot (TP+FP) + \cdot (TN+FN)}}}{(TP*TN - + FP*FN)/(sqrt((TP+FN)*(TN+FP)*(TP+FP)*(TN+FN)))}. Yields a +number between -1 and 1, with 1 indicating a perfect +prediction, 0 indicating a random prediction. Values below 0 +indicate a worse than random prediction.} +\item{\code{mat}:}{Matthews correlation coefficient. Same as \code{phi}.} +\item{\code{mi}:}{Mutual information. \eqn{I(\hat{Y},Y) := H(Y) - + H(Y|\hat{Y})}{I(Yhat, Y) := H(Y) - H(Y | Yhat)}, where H is the +(conditional) entropy. Entropies are estimated naively (no bias +correction).} +\item{\code{chisq}:}{Chi square test statistic. \code{?chisq.test} +for details. Note that R might raise a warning if the sample size +is too small.} +\item{\code{odds}:}{Odds ratio. \eqn{\frac{TP \cdot TN}{FN \cdot + FP}}{(TP*TN)/(FN*FP)}. Note that odds ratio produces +Inf or NA values for all cutoffs corresponding to FN=0 or +FP=0. This can substantially decrease the plotted cutoff region.} +\item{\code{lift}:}{Lift +value. \eqn{\frac{P(\hat{Y}=\oplus|Y=\oplus)}{P(\hat{Y}=\oplus)}}{P(Yhat = + | + Y = +)/P(Yhat = +)}.} +\item{\code{f}:}{precision-recall F measure (van Rijsbergen, 1979). Weighted +harmonic mean of precision (P) and recall (R). \eqn{F = + \frac{1}{\alpha \frac{1}{P} + (1-\alpha)\frac{1}{R}}}{F = 1/ + (alpha*1/P + (1-alpha)*1/R)}. If +\eqn{\alpha=\frac{1}{2}}{alpha=1/2}, the mean is balanced. A +frequent equivalent formulation is +\eqn{F = \frac{(\beta^2+1) \cdot P \cdot R}{R + \beta^2 \cdot + P}}{F = (beta^2+1) * P * R / (R + beta^2 * P)}. In this formulation, the +mean is balanced if \eqn{\beta=1}{beta=1}. Currently, ROCR only accepts +the alpha version as input (e.g. \eqn{\alpha=0.5}{alpha=0.5}). If no +value for alpha is given, the mean will be balanced by default.} +\item{\code{rch}:}{ROC convex hull. A ROC (=\code{tpr} vs \code{fpr}) curve +with concavities (which represent suboptimal choices of cutoff) removed +(Fawcett 2001). Since the result is already a parametric performance +curve, it cannot be used in combination with other measures.} +\item{\code{auc}:}{Area under the ROC curve. This is equal to the value of the +Wilcoxon-Mann-Whitney test statistic and also the probability that the +classifier will score are randomly drawn positive sample higher than a +randomly drawn negative sample. Since the output of +\code{auc} is cutoff-independent, this +measure cannot be combined with other measures into a parametric +curve. The partial area under the ROC curve up to a given false +positive rate can be calculated by passing the optional parameter +\code{fpr.stop=0.5} (or any other value between 0 and 1) to +\code{performance}.} +\item{\code{aucpr}:}{Area under the precision/recall curve. Since the output +of \code{aucpr} is cutoff-independent, this measure cannot be combined +with other measures into a parametric curve.} +\item{\code{prbe}:}{precision-recall break-even point. The cutoff(s) where +precision and recall are equal. 
At this point, positive and negative +predictions are made at the same rate as their prevalence in the +data. Since the output of +\code{prbe} is just a cutoff-independent scalar, this +measure cannot be combined with other measures into a parametric curve.} +\item{\code{cal}:}{Calibration error. The calibration error is the +absolute difference between predicted confidence and actual reliability. This +error is estimated at all cutoffs by sliding a window across the +range of possible cutoffs. The default window size of 100 can be +adjusted by passing the optional parameter \code{window.size=200} +to \code{performance}. E.g., if for several +positive samples the output of the classifier is around 0.75, you might +expect from a well-calibrated classifier that the fraction of them +which is correctly predicted as positive is also around 0.75. In a +well-calibrated classifier, the probabilistic confidence estimates +are realistic. Only for use with +probabilistic output (i.e. scores between 0 and 1).} +\item{\code{mxe}:}{Mean cross-entropy. Only for use with +probabilistic output. \eqn{MXE :=-\frac{1}{P+N}( \sum_{y_i=\oplus} + ln(\hat{y}_i) + \sum_{y_i=\ominus} ln(1-\hat{y}_i))}{MXE := - 1/(P+N) \sum_{y_i=+} + ln(yhat_i) + \sum_{y_i=-} ln(1-yhat_i)}. Since the output of +\code{mxe} is just a cutoff-independent scalar, this +measure cannot be combined with other measures into a parametric curve.} +\item{\code{rmse}:}{Root-mean-squared error. Only for use with +numerical class labels. \eqn{RMSE:=\sqrt{\frac{1}{P+N}\sum_i (y_i + - \hat{y}_i)^2}}{RMSE := sqrt(1/(P+N) \sum_i (y_i - + yhat_i)^2)}. Since the output of +\code{rmse} is just a cutoff-independent scalar, this +measure cannot be combined with other measures into a parametric curve.} +\item{\code{sar}:}{Score combinining performance measures of different +characteristics, in the attempt of creating a more "robust" +measure (cf. Caruana R., ROCAI2004): +SAR = 1/3 * ( accuracy + Area under the ROC curve + Root +mean-squared error ).} +\item{\code{ecost}:}{Expected cost. For details on cost curves, +cf. Drummond&Holte 2000,2004. \code{ecost} has an obligatory x +axis, the so-called 'probability-cost function'; thus it cannot be +combined with other measures. While using \code{ecost} one is +interested in the lower envelope of a set of lines, it might be +instructive to plot the whole set of lines in addition to the lower +envelope. An example is given in \code{demo(ROCR)}.} +\item{\code{cost}:}{Cost of a classifier when +class-conditional misclassification costs are explicitly given. +Accepts the optional parameters \code{cost.fp} and +\code{cost.fn}, by which the costs for false positives and +negatives can be adjusted, respectively. 
By default, both are set +to 1.} +} } \note{ +Here is how to call \code{performance()} to create some standard +evaluation plots: +\describe{ +\item{ROC curves:}{measure="tpr", x.measure="fpr".} +\item{precision/recall graphs:}{measure="prec", x.measure="rec".} +\item{sensitivity/specificity plots:}{measure="sens", x.measure="spec".} +\item{Lift charts:}{measure="lift", x.measure="rpp".} +} + Here is how to call \code{performance()} to create some standard evaluation plots: \describe{ diff --git a/man/prediction-class.Rd b/man/prediction-class.Rd index 472b9af76bed7e16d3fa0ae6e855ed89da7c76e5..87a05dabbb5e3ec149315fdcda6b6f2d22bb43fb 100644 --- a/man/prediction-class.Rd +++ b/man/prediction-class.Rd @@ -1,10 +1,14 @@ % Generated by roxygen2: do not edit by hand -% Please edit documentation in R/fake-section-title.R +% Please edit documentation in R/fake-section-title.R, R/na-na.R \docType{class} \name{prediction-class} \alias{prediction-class} \title{Class \code{prediction}} \description{ +Object to encapsulate numerical predictions together with the +corresponding true class labels, optionally collecting predictions and +labels for several cross-validation or bootstrapping runs. + Object to encapsulate numerical predictions together with the corresponding true class labels, optionally collecting predictions and labels for several cross-validation or bootstrapping runs. @@ -41,10 +45,46 @@ samples in the given x-validation run.} samples predicted as positive at the cutoffs given in the corresponding 'cutoffs' entry.} +\item{\code{n.neg.pred}}{As n.pos.pred, but for negatively predicted samples.} + +\item{\code{predictions}}{A list, in which each element is a vector of predictions +(the list has length > 1 for x-validation data.} + +\item{\code{labels}}{Analogously, a list in which each element is a vector of true +class labels.} + +\item{\code{cutoffs}}{A list in which each element is a vector of all necessary +cutoffs. Each cutoff vector consists of the predicted scores (duplicates +removed), in descending order.} + +\item{\code{fp}}{A list in which each element is a vector of the number (not the +rate!) of false positives induced by the cutoffs given in the corresponding +'cutoffs' list entry.} + +\item{\code{tp}}{As fp, but for true positives.} + +\item{\code{tn}}{As fp, but for true negatives.} + +\item{\code{fn}}{As fp, but for false negatives.} + +\item{\code{n.pos}}{A list in which each element contains the number of positive +samples in the given x-validation run.} + +\item{\code{n.neg}}{As n.pos, but for negative samples.} + +\item{\code{n.pos.pred}}{A list in which each element is a vector of the number of +samples predicted as positive at the cutoffs given in the corresponding +'cutoffs' entry.} + \item{\code{n.neg.pred}}{As n.pos.pred, but for negatively predicted samples.} }} \note{ +Every \code{prediction} object contains information about the 2x2 +contingency table consisting of tp,tn,fp, and fn, along with the +marginal sums n.pos,n.neg,n.pos.pred,n.neg.pred, because these form +the basis for many derived performance measures. + Every \code{prediction} object contains information about the 2x2 contingency table consisting of tp,tn,fp, and fn, along with the marginal sums n.pos,n.neg,n.pos.pred,n.neg.pred, because these form @@ -52,10 +92,18 @@ the basis for many derived performance measures. } \section{Objects from the Class}{ +Objects can be created by using the \code{prediction} function. + + Objects can be created by using the \code{prediction} function. 
} \seealso{ +\code{\link{prediction}}, +\code{\link{performance}}, +\code{\link{performance-class}}, +\code{\link{plot.performance}} + \code{\link{prediction}}, \code{\link{performance}}, \code{\link{performance-class}}, diff --git a/man/prediction.Rd b/man/prediction.Rd index 1d20b7865e3971ae541a0d0062d7d3fdd3186dcb..9d9848629d46cbad846609a5b79319dd363eae78 100644 --- a/man/prediction.Rd +++ b/man/prediction.Rd @@ -1,9 +1,11 @@ % Generated by roxygen2: do not edit by hand -% Please edit documentation in R/fake-section-title.R +% Please edit documentation in R/fake-section-title.R, R/na-na.R \name{prediction} \alias{prediction} \title{Function to create prediction objects} \usage{ +prediction(predictions, labels, label.ordering = NULL) + prediction(predictions, labels, label.ordering = NULL) } \arguments{ @@ -18,9 +20,16 @@ be changed by supplying a vector containing the negative and the positive class label.} } \value{ +An S4 object of class \code{prediction}. + An S4 object of class \code{prediction}. } \description{ +Every classifier evaluation using ROCR starts with creating a +\code{prediction} object. This function is used to transform the input data +(which can be in vector, matrix, data frame, or list form) into a +standardized format. + Every classifier evaluation using ROCR starts with creating a \code{prediction} object. This function is used to transform the input data (which can be in vector, matrix, data frame, or list form) into a @@ -48,6 +57,38 @@ FALSE < TRUE). Use \code{label.ordering} to override this default ordering. Please note that the ordering can be locale-dependent e.g. for character labels '-1' and '1'. +Currently, ROCR supports only binary classification (extensions toward +multiclass classification are scheduled for the next release, +however). If there are more than two distinct label symbols, execution +stops with an error message. If all predictions use the same two +symbols that are used for the labels, categorical predictions are +assumed. If there are more than two predicted values, but all numeric, +continuous predictions are assumed (i.e. a scoring +classifier). Otherwise, if more than two symbols occur in the +predictions, and not all of them are numeric, execution stops with an +error message. + +\code{predictions} and \code{labels} can simply be vectors of the same +length. However, in the case of cross-validation data, different +cross-validation runs can be provided as the \emph{columns} of a matrix or +data frame, or as the entries of a list. In the case of a matrix or +data frame, all cross-validation runs must have the same length, whereas +in the case of a list, the lengths can vary across the cross-validation +runs. Internally, as described in section 'Value', all of these input +formats are converted to list representation. + +Since scoring classifiers give relative tendencies towards a negative +(low scores) or positive (high scores) class, it has to be declared +which class label denotes the negative, and which the positive class. +Ideally, labels should be supplied as ordered factor(s), the lower +level corresponding to the negative class, the upper level to the +positive class. If the labels are factors (unordered), numeric, +logical or characters, ordering of the labels is inferred from +R's built-in \code{<} relation (e.g. 0 < 1, -1 < 1, 'a' < 'b', +FALSE < TRUE). Use \code{label.ordering} to override this default +ordering. Please note that the ordering can be locale-dependent +e.g. for character labels '-1' and '1'. 
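A compact usage sketch may help tie the pieces of this documentation together. It assumes the bundled `prediction()`/`performance()` pair behaves like the ROCR functions they are adapted from; the signatures come from the usage sections above, and the data are invented:

```r
## Hedged sketch of the documented workflow: scores + labels -> prediction -> performance.
set.seed(1)
scores <- runif(100)                                   # invented classifier scores
truth  <- sample(c("neg", "pos"), 100, replace = TRUE) # invented true class labels

## label.ordering makes the negative/positive roles explicit (negative first)
pred <- prediction(scores, truth, label.ordering = c("neg", "pos"))

## ROC curve coordinates (tpr vs fpr) and scalar AUC, as described for performance()
perf_roc <- performance(pred, measure = "tpr", x.measure = "fpr")
perf_auc <- performance(pred, measure = "auc")
perf_auc@y.values[[1]]   # AUC value stored in the S4 'y.values' slot, as in ROCR
```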
+ Currently, ROCR supports only binary classification (extensions toward multiclass classification are scheduled for the next release, however). If there are more than two distinct label symbols, execution diff --git a/tests/testthat/test-mock_rnaseq.R b/tests/testthat/test-mock_rnaseq.R index f08ef4cdaf38ca0ddaa442f6220eb32f2d451b24..ac3bcf3ec4c76daa86769cccc92ae50fff875432 100644 --- a/tests/testthat/test-mock_rnaseq.R +++ b/tests/testthat/test-mock_rnaseq.R @@ -24,6 +24,18 @@ test_that("generateCountTable generates count table with correct dimensions", { +test_that("checkFractionofZero issues a warning for high fraction of zeros/ones", { + # Test case 1: Less than 50% zeros and ones + counts_table_1 <- matrix(c(0, 1, 2, 0, 0, 0, 0, 1, 1), ncol = 3) + expect_message(checkFractionOfZero(counts_table_1), "50% of the counts simulated are bellow 1. Consider reviewing your input parameters.") + + # Test case 2: More than 50% zeros and ones + counts_table_2 <- matrix(c(0, 1, 0, 0, 1, 10, 100, 1, 1), ncol = 3) + expect_null(checkFractionOfZero(counts_table_2)) + +}) + + # Test for replicateMatrix test_that("replicateMatrix replicates matrix correctly", { matrix <- matrix(1:9, nrow = 3, ncol = 3) diff --git a/tests/testthat/test-precision_recall.R b/tests/testthat/test-precision_recall.R index c75285b29c9ee994afa753b9b0537fceff9e5b47..da447006972340880ded1822e9c5a7ba06a0e31e 100644 --- a/tests/testthat/test-precision_recall.R +++ b/tests/testthat/test-precision_recall.R @@ -28,8 +28,9 @@ test_that("compute_pr_auc computes area under the precision-recall curve", { ) # Test the compute_pr_auc function - result <- compute_pr_auc(dt) - expect_equal(expected = 0.59, round(result, 2)) + pr_curve <- compute_pr_curve(dt) + result <- compute_pr_auc(pr_curve) + expect_equal(expected = 0.60, round(result, 2)) }) test_that("get_pr_object gets precision-recall objects", { @@ -37,9 +38,31 @@ test_that("get_pr_object gets precision-recall objects", { set.seed(42) dt_evaldata_params <- data.table::data.table( description = rep(c("param1", "param2"), each = 50), - isDE = sample(0:1, 100, replace = TRUE), + isDE = sample(c("FALSE","TRUE"), 100, replace = TRUE), from = c("A", "B"), - p.adj = runif(100) + p.adj = runif(100), + effect = "fixed" + ) + + # Test the get_pr_object function + result <- get_pr_object(dt_evaldata_params) + + expect_true("byparams" %in% names(result)) + expect_true("aggregate" %in% names(result)) + expect_true("data.frame" %in% class(result$byparams$pr_curve)) + expect_true("data.frame" %in% class(result$byparams$pr_auc)) + expect_true("data.frame" %in% class(result$aggregate$pr_curve)) + expect_true("data.frame" %in% class(result$aggregate$pr_auc)) + + ## -- not only fixed effect + # Mock data for testing + set.seed(42) + dt_evaldata_params <- data.table::data.table( + description = rep(c("param1", "param2"), each = 50), + isDE = sample(c(TRUE,FALSE), 100, replace = TRUE), + from = c("A", "B"), + p.adj = runif(100), + effect = sample(c("fixed","ran_pars"), 100, replace = TRUE) ) # Test the get_pr_object function @@ -51,6 +74,8 @@ test_that("get_pr_object gets precision-recall objects", { expect_true("data.frame" %in% class(result$byparams$pr_auc)) expect_true("data.frame" %in% class(result$aggregate$pr_curve)) expect_true("data.frame" %in% class(result$aggregate$pr_auc)) + + }) test_that("build_gg_pr_curve builds ggplot precision-recall curve", { diff --git a/tests/testthat/test-receiver_operating_characteristic.R b/tests/testthat/test-receiver_operating_characteristic.R index 
2d5a08447cd9609ef463a80819ee5c86c80b617b..4e4aeacfa33e687f78e384adbd728050ace21747 100644 --- a/tests/testthat/test-receiver_operating_characteristic.R +++ b/tests/testthat/test-receiver_operating_characteristic.R @@ -49,21 +49,24 @@ test_that("compute_roc_auc computes AUC from data frame", { p.adj = c(1, 0.75, 0.75, 0.5, 0.25, 0, 0.1) ) - # Expected output - expected_result <- 0.42 - + roc_curve_obj <- compute_roc_curve(dt) # Test - result <- compute_roc_auc(dt) + result <- compute_roc_auc(roc_curve_obj) + + # Expected output + expected_result <- 0.42 expect_equal(round(result,2 ), expected_result) }) test_that("get_roc_object returns ROC curves and AUCs", { # Test data + set.seed(101) evaldata_params <- data.frame( from = rep(c("HTRfit", "DESeq2"), each = 5), description = rep(c("param1", "param2"), each = 5), isDE = sample(0:1, 10, replace = TRUE), - p.adj = runif(10) + p.adj = runif(10), + effect = rep("fixed", 10) ) # Test @@ -71,7 +74,24 @@ test_that("get_roc_object returns ROC curves and AUCs", { expect_equal(names(result), c("byparams", "aggregate")) expect_equal(names(result$byparams), c("roc_curve", "roc_auc")) expect_equal(names(result$aggregate), c("roc_curve", "roc_auc")) + + ## -- not only fixed effect + set.seed(101) + evaldata_params <- data.frame( + from = rep(c("HTRfit", "DESeq2"), each = 5), + description = rep(c("param1", "param2"), each = 5), + isDE = sample(0:1, 10, replace = TRUE), + p.adj = runif(10), + effect = c(rep("fixed", 8), 'ran_pars', 'ran_pars') + ) + + result <- get_roc_object(evaldata_params) + expect_equal(names(result), c("byparams", "aggregate")) + expect_equal(names(result$byparams), c("roc_curve", "roc_auc")) + expect_equal(names(result$aggregate), c("roc_curve", "roc_auc")) + + }) test_that("build_gg_roc_curve builds ggplot ROC curve", { diff --git a/vignettes/htrfit.Rmd b/vignettes/htrfit.Rmd index 2c3b60bec2ff5435d06cb088e98026bda82e2631..d7d1d681b8e85cbf5d2fa51bd6647ae2da68a088 100644 --- a/vignettes/htrfit.Rmd +++ b/vignettes/htrfit.Rmd @@ -15,6 +15,7 @@ knitr::opts_chunk$set( ``` ```{r setup} +devtools::load_all() library(HTRfit) ``` @@ -31,3 +32,607 @@ If it is the first time you use {fusen}, after 'description', you can directly r --> +# High-Throughput RNA-seq model fit + +In the realm of RNAseq analysis, various key experimental parameters play a crucial role in influencing the statistical power to detect expression changes. Parameters such as sequencing depth, the number of replicates, and others are expected to impact this statistical power. To help with the selection of optimal values for these experimental parameters, we introduce a comprehensive statistical framework known as **HTRfit**, underpinned by computational simulation. **HTRfit** serves as a versatile tool, not only for simulation but also for conducting differential expression analysis. It facilitates this analysis by fitting Generalized Linear Models (GLMs) with multiple variables, which could encompass genotypes, environmental factors, and more. These GLMs are highly adaptable, allowing the incorporation of fixed effects, mixed effects, and interactions between variables. + + + + +# Initializing variables for simulation + +The `init_variable()` function is used for defining the variables in your experimental design. 
Sizes of effects for each variable and interaction term can be defined in two different ways: 1) The user can manually set the values of all levels of a variable, in which case the effects are necessarily considered fixed in the model; 2) The effects can be randomly drawn from a normal distribution with mean and standard deviation defined by the user, in which case the user can decide whether the effects are considered fixed or random in the model. + + + +## Manually init a single variable + +The `init_variable()` function allows for precise control over the variables in your experimental design. +In this example, we manually initialize **varA** with specific size effects (mu) for three levels. + + + +```{r example-init_variable_man, warning = FALSE, message = FALSE} +list_var <- init_variable( name = "varA", mu = c(0.2, 4, -3), level = 3) +``` + +## Randomly init a single variable + +Alternatively, you can randomly initialize **varA** by specifying a mean (mu) and standard deviation (sd). +This introduces variability into **varA**, making it either a fixed or random effect in your design. + + +```{r example-init_variable_rand, warning = FALSE, message = FALSE} +list_var <- init_variable( name = "varA", mu = 10, sd = 0.2, level = 5) +``` + +## Randomly init several variables + +You can also initialize multiple variables, such as **varA** and **varB**, with random values. +This flexibility allows you to create diverse experimental designs. + + + +```{r example-init_variable_mult, warning = FALSE, message = FALSE} +list_var <- init_variable( name = "varA", mu = 10, sd = 0.2, level = 5) %>% + init_variable( name = "varB", mu = -3, sd = 0.34, level = 2) +``` + +## Add interaction between variables + +Similarly to `init_variable()`, `add_interaction()` allows you to define an interaction between variables. + +In this example, we initialize **varA** and **varB**, and create an interaction between **varA** and **varB** using `add_interaction()`. + + + +```{r example-add_interaction, warning = FALSE, message = FALSE} +list_var <- init_variable( name = "varA", mu = 3, sd = 0.2, level = 2) %>% + init_variable( name = "varB", mu = 2, sd = 0.43, level = 2) %>% + add_interaction( between_var = c("varA", "varB"), mu = 0.44, sd = 0.2) +``` + +## Complex design + +Interactions can involve a maximum of three variables, such as **varA**, **varB**, and **varC**. + + + +```{r example-add_interaction_complex, eval = FALSE, message = FALSE, warning = FALSE, include = TRUE} +## /!\ not evaluated in Vignette +list_var <- init_variable( name = "varA", mu = 5, sd = 0.2, level = 2) %>% + init_variable( name = "varB", mu = 1, sd = 0.78, level = 2) %>% + init_variable( name = "varC", mu = c(2, 3), sd = NA, level = 2) %>% + add_interaction( between_var = c("varA", "varC"), mu = 0.44, sd = 0.2) %>% + add_interaction( between_var = c("varA", "varB"), mu = 0.43, sd = 0.37) %>% + add_interaction( between_var = c("varB", "varC"), mu = -0.33, sd = 0.12) %>% + add_interaction( between_var = c("varA", "varB" ,"varC"), mu = 0.87, sd = 0.18) +``` + +## Structure of list_var object + +```{r example-str_obj_init, warning = FALSE, message = FALSE} +str(list_var) +``` + +# Simulation of RNAseq data + +In this section, you will explore how to generate RNAseq data based on the previously defined input variables.
The `mock_rnaseq()` function enables you to manage parameters in your RNAseq design, including number of genes, minimum and maximum number of replicates within your experimental setup, sequencing depth, basal expression of each gene, and dispersion of gene expression used for simulating counts. + + + +## Minimal example + +```{r example-mock_rnaseq_min, warning = FALSE, message = FALSE} +## -- Required parameters +N_GENES = 6000 +MIN_REPLICATES = 2 +MAX_REPLICATES = 10 +######################## + +## -- simulate RNAseq data based on input_var_list, minimum input required +## -- number of replicate randomly defined between MIN_REP and MAX_REP +mock_data <- mock_rnaseq(list_var, N_GENES, + min_replicates = MIN_REPLICATES, + max_replicates = MAX_REPLICATES) + +## -- simulate RNAseq data based on input_var_list, minimum input required +## -- Same number of replicates between conditions +mock_data <- mock_rnaseq(list_var, N_GENES, + min_replicates = MAX_REPLICATES, + max_replicates = MAX_REPLICATES) +``` + +## Scaling genes counts based on sequencing depth + +Sequencing depth is a critical parameter affecting the statistical power of an RNAseq analysis. With the `sequencing_depth` option in the `mock_rnaseq()` function, you have the ability to control this parameter. + + +```{r example-mock_rnaseq_seqDepth, warning = FALSE, message = FALSE} +## -- Required parameters +N_GENES = 6000 +MIN_REPLICATES = 2 +MAX_REPLICATES = 10 +######################## + +SEQ_DEPTH = c(100000, 5000000, 10000000)## -- Possible number of reads/sample +SEQ_DEPTH = 10000000 ## -- all samples have same number of reads +mock_data <- mock_rnaseq(list_var, N_GENES, + min_replicates = MIN_REPLICATES, + max_replicates = MAX_REPLICATES, + sequencing_depth = SEQ_DEPTH) +``` + +## Set dispersion of gene expression + +The dispersion parameter ($\alpha_i$), characterizes the relationship between the variance of the observed read counts and its mean value. In simple terms, it quantifies how much we expect observed counts to deviate from the mean value. You can specify the dispersion for individual genes using the dispersion parameter. + + + +```{r example-mock_rnaseq_disp, warning = FALSE, message = FALSE} + +## -- Required parameters +N_GENES = 6000 +MIN_REPLICATES = 2 +MAX_REPLICATES = 4 +######################## + +DISP = 0.1 ## -- Same dispersion for each genes +DISP = 1000 ## -- Same dispersion for each genes +DISP = runif(N_GENES, 0, 1000) ## -- Dispersion can vary between genes +mock_data <- mock_rnaseq(list_var, N_GENES, + min_replicates = MIN_REPLICATES, + max_replicates = MAX_REPLICATES, + dispersion = DISP ) + +``` + +## Set basal gene expression + +The basal gene expression parameter, defined using the basal_expression option, allows you to control the fundamental baseline of expression level of genes. When a single value is specified, the basal expression of all genes is the same and the effects of variables defined in list_var are applied from this basal expression level. When several values are specified, the basal expression of each gene is picked randomly among these values (with replacement). The effects of variables are then applied from the basal expression of each gene. This represents the fact that expression levels can vary among genes even when not considering the effects of the defined variables (for example, some genes are constitutively more highly expressed than others because they have stronger basal promoters). 
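Low or negative `basal_expression` values (such as the -3 used in the chunk below) tend to produce count tables dominated by zeros. A quick sanity check is sketched here, assuming `checkFractionOfZero()` accepts a counts matrix as it does in the package tests, where it emits a message when more than half of the simulated counts fall below 1:

```r
## Hedged sketch: a message is expected when > 50% of simulated counts are below 1
## (checkFractionOfZero() is assumed to take a counts matrix, as in the tests).
checkFractionOfZero(mock_data$counts)
```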
+ + +```{r example-mock_rnaseq_bexpr, warning = FALSE, message = FALSE} +## -- Required parameters +N_GENES = 6000 +MIN_REPLICATES = 10 +MAX_REPLICATES = 10 +######################## + +BASAL_EXPR = -3 ## -- Value can be negative to simulate low expressed gene +BASAL_EXPR = 2 ## -- Same basal gene expression for the N_GENES +BASAL_EXPR = c( -3, -1, 2, 8, 9, 10 ) ## -- Basal expression can vary between genes +mock_data <- mock_rnaseq(list_var, N_GENES, + min_replicates = MIN_REPLICATES, + max_replicates = MAX_REPLICATES, + basal_expression = BASAL_EXPR) + +``` + +## Mock RNAseq object + +```{r example-str_obj_mock, warning = FALSE, message = FALSE} +str(mock_data) +``` + +# Theory behind HTRfit + +<div id="bg" align="center"> + <img src="./figs/htrfit_workflow.png" width="500" height="300"> +</div> + + +In this modeling framework, counts denoted as $K_{ij}$ for gene i and sample j are generated using a negative binomial distribution. The negative binomial distribution considers a fitted mean $\mu_{ij}$ and a gene-specific dispersion parameter $\alpha_i$. + +The fitted mean $\mu_{ij}$ is determined by a parameter, $q_{ij}$, which is proportionally related to the sum of all effects specified using `init_variable()` or `add_interaction()`. If basal gene expressions are provided, the $\mu_{ij}$ values are scaled accordingly using the gene-specific basal expression value ($bexpr_i$). + +Furthermore, the coefficients $\beta_i$ represent the natural logarithm fold changes for gene i across each column of the model matrix X. The dispersion parameter $\alpha_i$ plays a crucial role in defining the relationship between the variance of observed counts and their mean value. In simpler terms, it quantifies how far we expect observed counts to deviate from the mean value. + + + +# Fitting models + +## Prepare data for fitting + +The `prepareData2fit()` function serves the purpose of converting the counts matrix and sample metadata into a dataframe that is compatible with downstream **HTRfit** functions designed for model fitting. This function also includes an option to perform median ratio normalization on the data counts. + + + +```{r example-prepareData, warning = FALSE, message = FALSE} +## -- simulate small example to prevent excessively lengthy vignette construction +list_var <- init_variable( name = "varA", mu = 3, sd = 0.2, level = 2) %>% + init_variable( name = "varB", mu = 2, sd = 0.43, level = 2) %>% + add_interaction( between_var = c("varA", "varB"), mu = 0.44, sd = 0.2) +N_GENES = 30 +MIN_REPLICATES = 4 +MAX_REPLICATES = 4 +BASAL_EXPR = rnorm(N_GENES , 0 , 1 ) +mock_data <- mock_rnaseq(list_var, N_GENES, + min_replicates = MIN_REPLICATES, + max_replicates = MAX_REPLICATES, + basal_expression = BASAL_EXPR) +######################## + + +## -- data from simulation or real data +count_matrix <- mock_data$counts +metaData <- mock_data$metadata +############################## + +## -- convert counts matrix and samples metadatas in a data frame for fitting +data2fit = prepareData2fit(countMatrix = count_matrix, + metadata = metaData, + normalization = F) + + +## -- median ratio normalization +data2fit = prepareData2fit(countMatrix = count_matrix, + metadata = metaData, + normalization = T, + response_name = "kij") +``` + +## Fit model from your data + +The `fitModelParallel()` function enables independent model fitting for each gene. The number of threads used for this process can be controlled by the `n.cores` parameter. 
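Before running the fit below, it may help to write out the per-gene model it implies. With the formula `kij ~ varA` and `family = glmmTMB::nbinom2(link = "log")`, each gene $i$ is fitted with the standard negative binomial GLM (a sketch in the notation of the theory section above, not HTRfit internals):

$$ K_{ij} \sim \mathrm{NB}(\mu_{ij}, \alpha_i), \qquad \log(\mu_{ij}) = \beta_{0,i} + \beta_{A,i} \, x^{A}_{j}, $$

where $x^{A}_{j}$ encodes the varA level of sample $j$ and $\beta_{A,i}$ is the natural-log fold change estimated for gene $i$.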
+ + +```{r example-fitModelParallel, warning = FALSE, message = FALSE} +l_tmb <- fitModelParallel(formula = kij ~ varA, + data = data2fit, + group_by = "geneID", + family = glmmTMB::nbinom2(link = "log"), + n.cores = 1) +``` + +## Use mixed effect in your model + +**HTRfit** uses the **glmmTMB** functions for model fitting algorithms. This choice allows for the utilization of random effects within your formula design. For further details on how to specify your model, please refer to the [mixed model documentation](https://rdrr.io/cran/glmmTMB/man/glmmTMBControl.html). + + + +```{r example-fitModelParallel_mixed, warning = FALSE, message = FALSE} +l_tmb <- fitModelParallel(formula = kij ~ varA + ( 1 | varB ), + data = data2fit, + group_by = "geneID", + family = glmmTMB::nbinom2(link = "log"), + n.cores = 1) +``` + +## Additional settings + +The function provides precise control over model settings for fitting optimization, including options for specifying the [model family](https://www.rdocumentation.org/packages/stats/versions/3.6.2/topics/family) and [model control setting](https://rdrr.io/cran/glmmTMB/man/glmmTMBControl.html). By default, a Gaussian family model is fitted, but for RNA-seq data, it is highly recommended to specify `family = glmmTMB::nbinom2(link = "log")`. + + + +```{r example-fitModelParallel_addSet, warning = FALSE, message = FALSE} +l_tmb <- fitModelParallel(formula = kij ~ varA, + data = data2fit, + group_by = "geneID", + n.cores = 1, + family = glmmTMB::nbinom2(link = "log"), + control = glmmTMB::glmmTMBControl(optCtrl=list(iter.max=1e5, + eval.max=1e5))) +``` + +## Not only RNAseq data + +As the model family can be customized, HTRfit is not exclusively tailored for RNA-seq data. + + +```{r example-fitModelParallel_nonRNA, warning = FALSE, message = FALSE, eval = FALSE} +data("iris") +l_tmb <- fitModelParallel(formula = Sepal.Length ~ Sepal.Width + Petal.Length + Petal.Width , + data = iris, + group_by = "Species", + family = gaussian(), + n.cores = 1) +``` + +## Update fit + +The `updateParallel()` function updates and re-fits a model for each gene. It offers options similar to those in `fitModelParallel()`. + + +```{r example-update, warning = FALSE, message = FALSE} +## -- update your fit modifying the model family +l_tmb <- updateParallel(formula = kij ~ varA, + list_tmb = l_tmb , + family = gaussian(), + n.cores = 1) + +## -- update fit using additional model control settings +l_tmb <- updateParallel(formula = kij ~ varA , + list_tmb = l_tmb , + family = gaussian(), + n.cores = 1, + control = glmmTMB::glmmTMBControl(optCtrl=list(iter.max=1e3, + eval.max=1e3))) + + +## -- update your model formula and your family model +l_tmb <- updateParallel(formula = kij ~ varA + varB + varA:varB , + list_tmb = l_tmb , + family = glmmTMB::nbinom2(link = "log"), + n.cores = 1) +``` + +#### Struture of list tmb object + +```{r example-str_obj_l_tmb, warning = FALSE, message = FALSE} +str(l_tmb$gene1, max.level = 1) +``` + +## Plot fit metrics + +Visualizing fit metrics is essential for evaluating your models. Here, we show you how to generate various plots to assess the quality of your models. You can explore all metrics or focus on specific aspects like dispersion and log-likelihood. 
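Before plotting, note that each element of `l_tmb` is a plain glmmTMB fit (see `str(l_tmb$gene1)` above), so the usual extractors can pull the same quantities numerically. A minimal sketch, assuming every fit converged and no element is NULL:

```r
## Minimal sketch: per-gene log-likelihood and AIC straight from the glmmTMB fits.
fit_metrics <- data.frame(
  geneID = names(l_tmb),
  logLik = sapply(l_tmb, function(m) as.numeric(stats::logLik(m))),
  AIC    = sapply(l_tmb, function(m) stats::AIC(m))
)
head(fit_metrics)
```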
+ + +```{r example-plotMetrics, warning = FALSE, message = FALSE, fig.align = 'center', fig.height = 4, fig.width = 8} +## -- plot all metrics +diagnostic_plot(list_tmb = l_tmb) +``` + +```{r example-plotMetricsFocus, warning = FALSE, message = FALSE, fig.align = 'center', fig.height = 3, fig.width = 8} +## -- Focus on metrics +diagnostic_plot(list_tmb = l_tmb, focus = c("dispersion", "logLik")) +``` + +## Anova to select the best model + +Utilizing the `anovaParallel()` function enables you to perform model selection by assessing the significance of the fixed effects. You can also include additional parameters like `type`. For more details, refer to [car::Anova](https://rdrr.io/cran/car/man/Anova.html). + + +```{r example-anova, warning = FALSE, message = FALSE} +## -- run anova on each fitted model +l_anova <- anovaParallel(list_tmb = l_tmb) + +## -- additional settings +l_anova <- anovaParallel(list_tmb = l_tmb, type = "III" ) + +``` + +# Simulation evaluation report + +In this section, we delve into the evaluation of your simulation results. The `evaluation_report()` function provides valuable insights into the performance of your simulated data and models. + + +```{r example-evaluation_report, warning = FALSE, message = FALSE, results = 'hide', fig.keep = 'none'} +## -- get simulation/fit evaluation +resSimu <- evaluation_report(list_tmb = l_tmb, + dds = NULL, + mock_obj = mock_data, + coeff_threshold = 0.4, + alpha_risk = 0.05, + alt_hypothesis = "greaterAbs") +``` + +## Identity plot + +The `evaluation_report()` function produces an identity plot, offering a visual tool to juxtapose the simulated effects (actual effects) with the model-inferred effects. This graphical representation simplifies assessing how well the model aligns with the values of the simulated effects, enabling a visual analysis of the model's goodness of fit to the simulated data. For a more quantitative evaluation of inference quality, the R-squared (R2) metric can be employed. + + +#### Model parameters + +```{r example-outputIdentity_params, warning = FALSE, message = FALSE, fig.align = 'center', fig.height = 4, fig.width = 5} +## -- Model params +resSimu$identity$params +``` + +#### Dispersion parameter + +```{r example-outputIdentity_disp, warning = FALSE, message = FALSE, fig.align = 'center', fig.height = 4, fig.width = 5} +## -- Dispersion params +resSimu$identity$dispersion +``` + +The dispersion plot, generated by the `evaluation_report()` function, offers a visual comparison of the dispersion parameters used in the simulation ($\alpha_i$) with those estimated by the model. This graphical representation provides an intuitive way to assess the alignment between the simulated dispersion values and the model-inferred values, enabling a visual evaluation of how well the model captures the underlying data characteristics. + + + +## ROC curve + +The Receiver Operating Characteristic (ROC) curve is a valuable tool for assessing the performance of classification models, particularly in the context of identifying differentially expressed genes. It provides a graphical representation of the model's ability to distinguish between genes that are differentially expressed and those that are not, by varying the `coeff_threshold` and the `alt_hypothesis` parameters.
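The ground-truth labels behind this curve come from the simulated effects: with `coeff_threshold = 0.4` and `alt_hypothesis = "greaterAbs"` (the settings used above), a parameter is presumably counted as truly differentially expressed when its simulated effect exceeds the threshold in absolute value. A hypothetical illustration of that labelling rule (`actual_effect` is an invented stand-in for the ground-truth effect sizes):

```r
## Hypothetical illustration of "greaterAbs" labelling: |effect| > coeff_threshold.
actual_effect <- c(0.05, -0.60, 0.45, 0.10)  # invented ground-truth effects
isDE <- abs(actual_effect) > 0.4
isDE
#> [1] FALSE  TRUE  TRUE FALSE
```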
+ + + +```{r example-outputROC, warning = FALSE, message = FALSE, fig.align = 'center', fig.height = 4, fig.width = 9} +## -- ROC curve +resSimu$roc$params +``` + +## Precision-recall curve + +The precision-recall curve (PR curve) illustrates the relationship between precision and recall at various classification thresholds. It is particularly valuable in the context of imbalanced data, where one class is significantly more prevalent than the other. Unlike the ROC curve, which can be influenced by class distribution, the PR curve focuses on the model's ability to correctly identify examples of the minority class while maintaining high precision. + + + +```{r example-outputPR, warning = FALSE, message = FALSE, fig.align = 'center', fig.height = 4, fig.width = 9} +## -- precision-recall curve +resSimu$precision_recall$params +``` + +## AUC and classification performance metrics + +The area under the ROC curve (AUC) provides a single metric that summarizes the model's overall performance in distinguishing between differentially expressed and non-differentially expressed genes. A higher AUC indicates better model performance. In the case of the ROC curve, this AUC should be compared to the value of 0.5, representing a random classifier. On the other hand, for the PR curve, we compute the pr_AUC_random, serving as a baseline for comparison. + + +In addition to evaluating model performance through the AUC for both ROC and PR curves, we provide access to key classification metrics, including Accuracy, Precision, Recall (or Sensitivity), and Specificity. These metrics offer a comprehensive view of the model's classification capabilities. These metrics are computed for each parameter (excluding the intercept when skip_eval_intercept = TRUE), providing detailed insights into individual parameter contributions. Furthermore, an aggregated assessment is performed, considering all parameters (except the intercept by default), offering a perspective on the model's overall classification effectiveness. + + + +```{r example-outputPerf, warning = FALSE, message = FALSE, fig.align = 'center', fig.height = 4, fig.width = 9} +## -- precision-recall curve +resSimu$performances +``` + +## Compare HTRfit with DESeq2 + +**HTRfit** offers a wrapper for **DESeq2** outputs. This functionality allows users to seamlessly integrate the results obtained from **DESeq2** into the **HTRfit** analysis pipeline. By doing so, you can readily compare the performance of **HTRfit** with **DESeq2** on your RNAseq data. 
This comparative analysis aids in determining which tool performs better for your specific research goals and dataset + + + +```{r example-ddsComparison, warning = FALSE, message = FALSE, results = 'hide', fig.keep = 'none'} +## -- DESeq2 +library(DESeq2) +dds <- DESeq2::DESeqDataSetFromMatrix( + countData = count_matrix, + colData = metaData, + design = ~ varA + varB + varA:varB ) +dds <- DESeq2::DESeq(dds, quiet = TRUE) + + +## -- get simulation/fit evaluation +resSimu <- evaluation_report(list_tmb = l_tmb, + dds = dds, + mock_obj = mock_data, + coeff_threshold = 0.4, + alt_hypothesis = "greaterAbs") +``` + +```{r example-outputResSimu_id, warning = FALSE, message = FALSE, fig.align = 'center', fig.height = 4, fig.width = 5} +## -- identity plot +###### 1) Model params +resSimu$identity$params +###### Dispersion params +resSimu$identity$dispersion +``` + +```{r example-outputResSimu_metric, warning = FALSE, message = FALSE, fig.align = 'center', fig.height = 4, fig.width = 9} +## -- precision-recall curve +resSimu$precision_recall$params +## -- ROC curve +resSimu$roc$params +## -- Performances metrics +resSimu$performances +``` + +## Focus evaluation on a subset of genes + +In this section, we showcase the assessment of model performance on a subset of genes. Specifically, we focus on evaluating genes with low expression levels, identified by their basal expression ($bexpr_i$) initialized below 0 during the simulation. + + +```{r example-subsetGenes, warning = FALSE, message = FALSE, results = 'hide', fig.keep = 'none'} +## -- Focus on low expressed genes +low_expressed_df <- mock_data$groundTruth$effects[ mock_data$groundTruth$effects$basalExpr < 0, ] +l_genes <- unique(low_expressed_df$geneID) +mock_lowExpressed <- subsetGenes(l_genes, mock_data) + + +## -- get simulation/fit evaluation +resSimu <- evaluation_report(list_tmb = l_tmb, + dds = dds, + mock_obj = mock_lowExpressed, + coeff_threshold = 0.4, + alt_hypothesis = "greaterAbs") +``` + +As we compare this evaluation to the previous one, we observe a reduction in performances for both **HTRfit** and **DESeq2** inferences. 
+ + +```{r example-subsetGenes_id, warning = FALSE, message = FALSE, fig.align = 'center', fig.height = 4, fig.width = 5} +## -- identity plot +###### 1) Model params +resSimu$identity$params +###### Dispersion params +resSimu$identity$dispersion +``` + +```{r example-subsetGenes_metrics, warning = FALSE, message = FALSE, fig.align = 'center', fig.height = 4, fig.width = 9} +## -- precision-recall curve +resSimu$precision_recall$params +## -- ROC curve +resSimu$roc$params +## -- Performances metrics +resSimu$performances +``` + +## Modifying your alpha risk + +```{r example-alphaRisk_1, warning = FALSE, message = FALSE} +## -- get simulation/fit evaluation +resSimu <- evaluation_report(list_tmb = l_tmb, + dds = dds, + mock_obj = mock_data, + coeff_threshold = 0.4, + alpha_risk = 0.05, ## -- default + alt_hypothesis = "greaterAbs") +## -- Performances metrics +resSimu$performances +``` + +```{r example-alphaRiskk_2, warning = FALSE, message = FALSE} +## -- get simulation/fit evaluation +resSimu <- evaluation_report(list_tmb = l_tmb, + dds = dds, + mock_obj = mock_data, + coeff_threshold = 0.4, + alpha_risk = 0.01, + alt_hypothesis = "greaterAbs") +## -- Performances metrics +resSimu$performances +``` + +## Evaluate model inference involving mixed effects + +For certain experimental scenarios, such as those involving a high number of levels or longitudinal data, the utilization of mixed effects within your design formula can be beneficial. The **HTRfit** simulation framework also offers the capability to assess this type of design formula. + + +```{r example-evalMixed, warning = FALSE, message = FALSE, results = 'hide', fig.keep = 'none'} +## -- init a design with a high number of levels +input_var_list <- init_variable( name = "varA", mu = 0, sd = 0.29, level = 60) %>% + init_variable( name = "varB", mu = 0.27, sd = 0.6, level = 2) %>% + add_interaction( between_var = c("varA", "varB"), mu = 0.44, sd = 0.89) +## -- simulate RNAseq data +mock_data <- mock_rnaseq(input_var_list, + n_genes = 30, ## small number to prevent excessively lengthy vignette construction + min_replicates = 10, + max_replicates = 10, + basal_expression = 5 ) +## -- prepare data & fit a model with mixed effect +data2fit = prepareData2fit(countMatrix = mock_data$counts, + metadata = mock_data$metadata, + normalization = F) +l_tmb <- fitModelParallel(formula = kij ~ varB + (varB | varA), + data = data2fit, + group_by = "geneID", + family = glmmTMB::nbinom2(link = "log"), + n.cores = 1) +## -- evaluation +resSimu <- evaluation_report(list_tmb = l_tmb, + dds = NULL, + mock_obj = mock_data, + coeff_threshold = 0.27, + alt_hypothesis = "greater") +``` + +```{r example-outputResSimuMixed_id, warning = FALSE, message = FALSE, fig.align = 'center', fig.height = 4, fig.width = 5} +## -- identity plot +###### 1) Model params +resSimu$identity$params +###### Dispersion params +resSimu$identity$dispersion +``` + +```{r example-outputResSimuMixed_metric, warning = FALSE, message = FALSE, fig.align = 'center', fig.height = 4, fig.width = 9} +## -- precision-recall curve +resSimu$precision_recall$params +## -- ROC curve +resSimu$roc$params +## -- Performances metrics +resSimu$performances +``` + +## Strure of evaluation report object + +```{r example-str_eval_report, warning = FALSE, message = FALSE} +str(resSimu, max.level = 1) +``` + +## About mixed model evaluation + +**HTRfit** offers a versatile simulation framework capable of fitting various types of models involving mixed effects, thanks to its implementation of **glmmTMB**. 
By combining the functionalities of `init_variable()` and `add_interaction()`, **HTRfit** enables the simulation of even the most complex experimental designs. However, it's important to note that as of now, HTRfit supports the evaluation of only *Type I* mixed models. In this context, *Type I* models are defined as those that follow the structure: `~ varA + (1 | varB)` or `~ varA + (varA | varB)`. Models not conforming to this specific form cannot be evaluated using **HTRfit's** current implementation. Nonetheless, you are welcome to extend its capabilities by implementing support for additional model types. + + + +
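As a quick reference, the supported *Type I* structures are written out as formulas below; note that the model fitted in the mixed-effect evaluation example above, `kij ~ varB + (varB | varA)`, is an instance of the second form with the roles of the two variables exchanged. The third formula is a hypothetical example of a structure that presumably falls outside the current evaluation scope:

```r
## Supported Type I structures (as described above)
f1 <- kij ~ varA + (1 | varB)
f2 <- kij ~ varA + (varA | varB)

## Presumably not evaluable with the current implementation, e.g. two fixed effects
f3 <- kij ~ varA + varC + (1 | varB)
```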