From ac49fc50eacfda01f366a8b62ea334882ba6571f Mon Sep 17 00:00:00 2001 From: aduvermy <arnaud.duvermy@ens-lyon.fr> Date: Tue, 17 Jan 2023 18:45:53 +0100 Subject: [PATCH] manage GLM mixte + warnings + errors --- .../00_pkg_src/HTRfit/R/preprocessing.R | 33 +++ src/model.fit/HTRfit/DESCRIPTION | 5 +- src/model.fit/HTRfit/R/model_fitting.R | 198 +++++++++++++++--- src/model.fit/HTRfit/devtools_history.R | 1 + src/v3/HTRsim/R/countsGenerator.R | 24 ++- src/v3/HTRsim/R/evaluation.R | 95 +++++---- src/v3/HTRsim/R/workflow.R | 61 +++++- src/v3/HTRsim/devtools_history.R | 2 +- 8 files changed, 346 insertions(+), 73 deletions(-) create mode 100644 src/model.fit/HTRfit.Rcheck/00_pkg_src/HTRfit/R/preprocessing.R diff --git a/src/model.fit/HTRfit.Rcheck/00_pkg_src/HTRfit/R/preprocessing.R b/src/model.fit/HTRfit.Rcheck/00_pkg_src/HTRfit/R/preprocessing.R new file mode 100644 index 0000000..045ff07 --- /dev/null +++ b/src/model.fit/HTRfit.Rcheck/00_pkg_src/HTRfit/R/preprocessing.R @@ -0,0 +1,33 @@ +#' get df2fit for a given gene +#' @param k_ij +#' @param design_simulation +#' @param gene_name +#' @import dplyr +#' @return dataframe +#' @export +#' +#' @examples +getdf2fit <- function(k_ij , design_simulation, gene_name){ + df_gene_i = cbind(design_simulation, k_ij) %>% + dplyr::mutate(gene_id = gene_name) %>% + dplyr::mutate(gene_id = gene_name) + rownames(df_gene_i) <- NULL + return(df_gene_i) +} + +#' get df2fit for a given gene +#' @param countTable +#' @param experimental_design +#' @import purrr +#' @return dataframe +#' @export +#' +#' @examples +reshapeCounTable<- function(countTable, experimental_design ){ + + gene_id_list = rownames( countTable ) + list_df2fit = purrr::map( .x = gene_id_list, ~getdf2fit( countTable[ .x, ], + experimental_design, .x) ) + data2fit = do.call( rbind, list_df2fit ) + return(data2fit) +} \ No newline at end of file diff --git a/src/model.fit/HTRfit/DESCRIPTION b/src/model.fit/HTRfit/DESCRIPTION index 4682d71..a5a95ef 100644 --- a/src/model.fit/HTRfit/DESCRIPTION +++ b/src/model.fit/HTRfit/DESCRIPTION @@ -14,9 +14,12 @@ Depends: tidyverse Imports: broom, + broom.mixed, dplyr, furrr, + futile.logger, future, MASS, purrr, - stats + stats, + tibble diff --git a/src/model.fit/HTRfit/R/model_fitting.R b/src/model.fit/HTRfit/R/model_fitting.R index 1a6572a..a522b9c 100644 --- a/src/model.fit/HTRfit/R/model_fitting.R +++ b/src/model.fit/HTRfit/R/model_fitting.R @@ -6,6 +6,7 @@ #' @param threads #' @import future #' @import furrr +#' @import futile.logger #' @return fit list #' @export #' @@ -14,6 +15,10 @@ launch.glm <- function(data2fit, model2fit = k_ij ~ genotype + environment + genotype:environment, fit_by = "gene_id", threads = 4) { + stopifnot(fit_by %in% colnames(data2fit)) + # -- log + futile.logger::flog.info("Fit started per %s", fit_by) + data2fit <- data2fit %>% as.data.frame() iteration_list <- data2fit[, fit_by] %>% unique() %>% @@ -25,9 +30,14 @@ launch.glm <- function(data2fit, ~ fit.glm( data2fit[which(data2fit[, fit_by] == .x), ], .x, - model2fit + model2fit, + fit_by ) ) + + # -- log + futile.logger::flog.info("Fit ended\n") + return(fit_list) } @@ -37,29 +47,50 @@ launch.glm <- function(data2fit, #' @param id #' @import MASS #' @import dplyr +#' @import futile.logger #' @return fit #' @export #' #' @examples -fit.glm <- function(data, id, model2fit) { - tryCatch( - { - fit <- MASS::glm.nb(model2fit, data = data, link = log) - fit.dtf <- tidySummary(fit) - fit.dtf$inference <- fit.dtf$inference %>% dplyr::mutate(gene_id = id) - fit.dtf$fitQuality <- fit.dtf$fitQuality %>% dplyr::mutate(gene_id = id) - fit.dtf$dispersion <- list( gene_id = id, dispersion.estimate = fit$theta ) %>% as.data.frame() +fit.glm <- function(data, id, model2fit, fit_by) { + # -- function executed if all perform well + f <- function(data, id, model2fit, fit_by) { + fit <- MASS::glm.nb(model2fit, data = data, link = log) + fit.dtf <- tidySummary(fit, "glm") + fit.dtf$inference <- fit.dtf$inference %>% dplyr::mutate(!!fit_by := id) + fit.dtf$fitQuality <- fit.dtf$fitQuality %>% dplyr::mutate(!!fit_by := id) + fit.dtf$dispersion <- list(dispersion.estimate = fit$theta) %>% + as.data.frame() %>% + dplyr::mutate(!!fit_by := id) + futile.logger::flog.info("%s: ok", id) + return(fit.dtf) + } - return(fit.dtf) + tryCatch( + expr = { + withCallingHandlers( + f(data, id, model2fit, fit_by), + warning = function(w) { + futile.logger::flog.info("%s: %s ", id, w) + invokeRestart("muffleWarning") + } + ) }, - error = function(cnd) { - inference <- list(gene_id = id, estimate = NA, std.error = NA, term= NA) %>% as.data.frame() - fitQuality <- list(null.deviance= NA,df.null = NA, logLik = NA,AIC = NA ,BIC=NA,deviance = NA,df.residual=NA,nobs = NA,gene_id = id) %>% as.data.frame() - dispersion <- list( gene_id = id, dispersion.estimate = NA ) %>% as.data.frame() - fit.dtf <- list(inference = inference, fitQuality = fitQuality, dispersion) - - return(fit.dtf) + error = function(e) { + futile.logger::flog.error("%s: %s ", id, e) + + inference <- list(estimate = NA, std.error = NA, term = NA) %>% + as.data.frame() %>% + dplyr::mutate(!!fit_by := id) + fitQuality <- list(null.deviance = NA, df.null = NA, logLik = NA, AIC = NA, BIC = NA, deviance = NA, df.residual = NA, nobs = NA) %>% + as.data.frame() %>% + dplyr::mutate(!!fit_by := id) + dispersion <- list(dispersion.estimate = NA) %>% + as.data.frame() %>% + dplyr::mutate(!!fit_by := id) + fit.dtf <- list(inference = inference, fitQuality = fitQuality, dispersion = dispersion) + return(fit.dtf) } ) } @@ -75,13 +106,13 @@ fit.glm <- function(data, id, model2fit) { #' @examples tidySummary <- function(fit, modelType = "glm") { dtf <- broom.mixed::tidy(fit) - if (modelType == "glm"){ - dtf <- dtf %>% - dplyr::select(estimate, std.error, term) + if (modelType == "glm") { + dtf <- dtf %>% + dplyr::select(estimate, std.error, term) } - if (modelType == "glmm"){ - dtf <- dtf %>% - dplyr::select(effect, group, estimate, std.error, term) + if (modelType == "glm_mixte") { + dtf <- dtf %>% + dplyr::select(effect, group, estimate, std.error, term) } gl <- broom::glance(fit) return(list(inference = dtf, fitQuality = gl)) @@ -98,5 +129,124 @@ listFit2dtf <- function(list_fit) { inference.dtf <- do.call(rbind, tmp[1, ]) fitQuality.dtf <- do.call(rbind, tmp[2, ]) dispersion.dtf <- do.call(rbind, tmp[3, ]) - return(list(inference = inference.dtf, fitQuality = fitQuality.dtf, dispersion = dispersion.dtf )) + return(list(inference = inference.dtf, fitQuality = fitQuality.dtf, dispersion = dispersion.dtf)) +} + + +#' launch glm mixte on data +#' @param data +#' @param model2fit +#' @param fit_by +#' @param threads +#' @param package +#' @import future +#' @import furrr +#' @import futile.logger +#' @return fit list +#' @export +#' +#' @examples +launch.glm_mixte <- function(data2fit, + model2fit = k_ij ~ environment + (1 + environment | genotype), + fit_by = "gene_id", package = "glmmTMB", + threads = 4) { + # -- log + futile.logger::flog.info("Fit started") + futile.logger::flog.info("GLM mixte: %s", package) + + data2fit <- data2fit %>% as.data.frame() + iteration_list <- data2fit[, fit_by] %>% + unique() %>% + unlist() %>% + unname() ## get list gene + future::plan(multisession, workers = threads) + fit_list <- iteration_list %>% furrr::future_map( + .x = ., + ~ fit.glm_mixte( + data2fit[which(data2fit[, fit_by] == .x), ], + .x, + model2fit, + fit_by, package + ) + ) + # -- log + futile.logger::flog.info("Fit ended\n") + return(fit_list) +} + + + + +#' fit model on data +#' @param data +#' @param id +#' @param model2fit +#' @param fit_by +#' @param package +#' @import lme4 +#' @import dplyr +#' @import futile.logger +#' @import glmmTMB +#' @return fit +#' @export +#' +#' @examples +fit.glm_mixte <- function(data, id, model2fit, fit_by, package = "glmmTMB") { + # -- function executed if all perform well + f <- function(data, id, model2fit, fit_by, package) { + stopifnot(package %in% c("glmmTMB", "lme4")) + if (package == "glmmTMB") { + fit <- glmmTMB::glmmTMB(model2fit, data = data, family = nbinom1, verbose = F) + theta <- glmmTMB::sigma(fit) # dispersion + } + if (package == "lme4") { + fit <- lme4::glmer.nb(model2fit, data = data, verbose = FALSE) + theta <- lme4::getME(fit, "glmer.nb.theta") # dispersion + } + + fit.dtf <- tidySummary(fit, "glm_mixte") + fit.dtf$inference <- fit.dtf$inference %>% dplyr::mutate(!!fit_by := id) + fit.dtf$fitQuality <- fit.dtf$fitQuality %>% dplyr::mutate(!!fit_by := id) + # if warning while fitting => no AIC and BIC => set to NA + if (!("AIC" %in% colnames(fit.dtf$fitQuality))) fit.dtf$fitQuality$AIC = NA + if (!("BIC" %in% colnames(fit.dtf$fitQuality))) fit.dtf$fitQuality$BIC = NA + if (!("logLik" %in% colnames(fit.dtf$fitQuality))) fit.dtf$fitQuality$logLik = NA + + fit.dtf$dispersion <- list(dispersion.estimate = theta) %>% + as.data.frame() %>% + dplyr::mutate(!!fit_by := id) + + + # -- log all rigth + futile.logger::flog.info("%s: ok", id) + return(fit.dtf) + } + + + tryCatch( + expr = { + withCallingHandlers( + f(data, id, model2fit, fit_by, package), + warning = function(w) { + futile.logger::flog.info("%s: %s ", id, w) + invokeRestart("muffleWarning") + } + ) + }, + error = function(e) { + # -- log error + futile.logger::flog.error("%s: %s ", id, e) + inference <- list(effect = NA, group = NA,estimate = NA, std.error = NA, term = NA) %>% + as.data.frame() %>% + dplyr::mutate(!!fit_by := id) + fitQuality <- list( sigma = NA, logLik = NA, AIC = NA, BIC = NA, deviance = NA, df.residual = NA, nobs = NA) %>% + as.data.frame() %>% + dplyr::mutate(!!fit_by := id) + dispersion <- list(dispersion.estimate = NA) %>% + as.data.frame() %>% + dplyr::mutate(!!fit_by := id) + fit.dtf <- list(inference = inference, fitQuality = fitQuality, dispersion = dispersion) + return(fit.dtf) + } + ) } diff --git a/src/model.fit/HTRfit/devtools_history.R b/src/model.fit/HTRfit/devtools_history.R index 16f62e9..630791d 100644 --- a/src/model.fit/HTRfit/devtools_history.R +++ b/src/model.fit/HTRfit/devtools_history.R @@ -8,6 +8,7 @@ usethis::use_package("future") usethis::use_package("stats") usethis::use_package("broom.mixed") usethis::use_package("tibble") +usethis::use_package("futile.logger") diff --git a/src/v3/HTRsim/R/countsGenerator.R b/src/v3/HTRsim/R/countsGenerator.R index 89dd073..c8b8af3 100644 --- a/src/v3/HTRsim/R/countsGenerator.R +++ b/src/v3/HTRsim/R/countsGenerator.R @@ -44,6 +44,10 @@ getBetaforSimulation <- function(n_genes = 100, n_genotypes = 20, mvrnormFit_lis dplyr::group_by(gene_id) %>% dplyr::mutate(betaE = mean(betaE)) } + + # -- log + futile.logger::flog.info("Beta: ok") + return(res) } @@ -80,6 +84,10 @@ getLog_qij <- function(beta.dtf, model.matx) { log_qij.dtf <- log_qij.matx %>% data.frame() annotations <- beta.dtf[, c("gene_id", "genotype")] log_qij.dtf <- cbind(annotations, log_qij.dtf) + + # -- log + futile.logger::flog.info("log(q_ij): ok") + return(log_qij.dtf) } @@ -108,6 +116,9 @@ getMu_ij <- function(log_qij.dtf, size_factor) { column_to_rownames("gene_id") %>% as.matrix() + # -- log + futile.logger::flog.info("mu_ij: ok") + return(mu_ij.matx) } @@ -152,7 +163,11 @@ getGenesDispersions <- function(n_genes, sample_id_list, rownames(gene_dispersion.dtf) <- base::paste("gene", 1:(n_genes), sep = "") } gene_dispersion.mtx <- gene_dispersion.dtf %>% as.matrix() - gene_dispersion.mtx <- gene_dispersion.mtx[ order(row.names(gene_dispersion.mtx)), ] + gene_dispersion.mtx <- gene_dispersion.mtx[order(row.names(gene_dispersion.mtx)), ] + + # -- log + futile.logger::flog.info("dispersion: ok") + return(gene_dispersion.mtx) } @@ -204,9 +219,9 @@ getCountTable <- function(mu_ij.matx, dispersion.matx, n_genes, n_genotypes, n_environments = 2, sample_id_list, replication.matx) { + # -- log + futile.logger::flog.info("k_ij ~ NegativBinomial( mu_ij, dispersion_ij )") ### Iterate on each replicates - message("Get k_ij :") - message("k_ij ~ NegativBinomial( mu_ij, dispersion_ij )\n") max_n_replicates <- dim(replication.matx)[1] kij.simu.list <- purrr::map( .x = 1:max_n_replicates, @@ -219,6 +234,9 @@ getCountTable <- function(mu_ij.matx, dispersion.matx, ) tableCounts.simulated <- do.call(cbind, kij.simu.list) + # -- log + futile.logger::flog.info("k_ij : ok\n") + return(tableCounts.simulated) } diff --git a/src/v3/HTRsim/R/evaluation.R b/src/v3/HTRsim/R/evaluation.R index 60a48ad..1b5ea22 100644 --- a/src/v3/HTRsim/R/evaluation.R +++ b/src/v3/HTRsim/R/evaluation.R @@ -1,6 +1,6 @@ #' Get prediction results #' -#' @param inference.dtf +#' @param inference.dtf #' @param threshold #' @param alphaRisk #' @param postInferenceSelection @@ -67,44 +67,17 @@ getPrediction <- function(inference.dtf, threshold = 0, alphaRisk = 0.05, altH = #' @param actual.dtf #' @param threshold #' @param altH +#' @param toEval #' @import dplyr #' @import reshape2 #' @return a dataframe to compare actual and infered betas with annotations #' @export #' #' @examples -getExpectation <- function(actual.dtf, threshold = 0, altH = "greaterAbs") { - reshapeActualDtf <- function(actual.dtf) { - actual.dtf <- actual.dtf %>% - reshape2::melt( - id = c("gene_id", "genotype", "idx_mvrnom"), - value.name = "value", variable.name = "beta" - ) %>% - dplyr::group_by(gene_id, beta) %>% - dplyr::mutate( - actual.value = - dplyr::if_else(beta %in% c("(Intercept)", "betaE"), - mean(value), value - ) - ) %>% - ungroup() %>% - dplyr::select(-value) %>% - dplyr::mutate( - term = - dplyr::case_when( - beta == "(Intercept)" ~ "(Intercept)", - beta == "betaG" ~ paste("genotype", genotype, sep = ""), - beta == "betaE" ~ paste("environment", "E1", sep = ""), - beta == "betaGE" ~ paste("genotype", genotype, ":environmentE1", sep = "") - ) - ) - actual.dtf <- actual.dtf %>% dplyr::select(-genotype) - actual_undup.dtf <- actual.dtf[!duplicated(actual.dtf), ] - return(actual_undup.dtf) - } - - actual.reshaped <- reshapeActualDtf(actual.dtf) - +getExpectation <- function(actual.dtf, toEval = "glm" ,threshold = 0, altH = "greaterAbs") { + stopifnot(toEval %in% c("glm", "glm_mixte")) + if (toEval == "glm") { + actual.reshaped <- reshapeActualDtf_glm(actual.dtf) ####### ACTUAL LABEL ######### actual.reshaped.annot <- actual.reshaped %>% dplyr::mutate( @@ -113,7 +86,8 @@ getExpectation <- function(actual.dtf, threshold = 0, altH = "greaterAbs") { "nonDE", "DE" ) ) - + } + if (toEval == "glm_mixte") actual.reshaped.annot <- reshapeActualDtf_glm_mixte(actual.dtf) return(actual.reshaped.annot) } @@ -128,9 +102,7 @@ getExpectation <- function(actual.dtf, threshold = 0, altH = "greaterAbs") { #' @export #' #' @examples -getComparison <- function(actual.dtf, inference.dtf){ - - +getComparison <- function(actual.dtf, inference.dtf) { actual2join.dtf <- data.table::data.table(actual.dtf, key = c("gene_id", "term")) inference2join.dtf <- data.table::data.table(inference.dtf, key = c("gene_id", "term")) comparison.dtf <- actual2join.dtf[inference2join.dtf] @@ -145,7 +117,52 @@ getComparison <- function(actual.dtf, inference.dtf){ ) ) - return(comparison.dtf %>% as.data.frame() ) + return(comparison.dtf %>% as.data.frame()) +} -} \ No newline at end of file +# -- facilities functions +reshapeActualDtf_glm <- function(actual.dtf) { + actual.dtf <- actual.dtf %>% + reshape2::melt( + id = c("gene_id", "genotype", "idx_mvrnom"), + value.name = "value", variable.name = "beta" + ) %>% + dplyr::group_by(gene_id, beta) %>% + dplyr::mutate( + actual.value = + dplyr::if_else(beta %in% c("(Intercept)", "betaE"), + mean(value), value + ) + ) %>% + ungroup() %>% + dplyr::select(-value) %>% + dplyr::mutate( + term = + dplyr::case_when( + beta == "(Intercept)" ~ "(Intercept)", + beta == "betaG" ~ paste("genotype", genotype, sep = ""), + beta == "betaE" ~ paste("environment", "E1", sep = ""), + beta == "betaGE" ~ paste("genotype", genotype, ":environmentE1", sep = "") + ) + ) + actual.dtf <- actual.dtf %>% dplyr::select(-genotype) + actual_undup.dtf <- actual.dtf[!duplicated(actual.dtf), ] + return(actual_undup.dtf) +} + + +reshapeActualDtf_glm_mixte <- function(actual.dtf) { + actual.dtf <- actual.dtf %>% + dplyr::group_by(gene_id) %>% + dplyr::summarise( + tmp = mean(`(Intercept)` + betaG), + environmentE1 = mean(betaE + betaGE), + "sd__(Intercept)" = sd(`(Intercept)` + betaG), + sd__environmentE1 = sd(betaGE + betaE), + "cor__(Intercept).environmentE1" = cor((betaGE + betaE), (`(Intercept)` + betaG)) + ) %>% + dplyr::rename("(Intercept)" = tmp) %>% + reshape2::melt(id = "gene_id", value.name = "actual.value", variable.name = "term") + return(actual.dtf) +} diff --git a/src/v3/HTRsim/R/workflow.R b/src/v3/HTRsim/R/workflow.R index c22b215..c6c2629 100644 --- a/src/v3/HTRsim/R/workflow.R +++ b/src/v3/HTRsim/R/workflow.R @@ -9,7 +9,9 @@ summariseDesign <- function(count_table) { sample_id_list <- colnames(count_table) experimental_design <- sample_id_list %>% - stringr::str_split(pattern = "_", simplify = TRUE) %>% .[, c(1, 2)] %>% data.frame() + stringr::str_split(pattern = "_", simplify = TRUE) %>% + .[, c(1, 2)] %>% + data.frame() colnames(experimental_design) <- c("genotype", "environment") experimental_design <- experimental_design %>% dplyr::mutate(sample_id = sample_id_list) %>% @@ -29,6 +31,9 @@ summariseDesign <- function(count_table) { #' @param max_n_replicates max number of replicates #' @param uniformNumberOfReplicates boolean #' @param uniformDispersion boolean +#' @param n_clusters +#' @param fixBetaE +#' @param fixIntercept #' @return list #' @export #' @@ -40,7 +45,21 @@ rnaMock <- function(n_genes, sequencing_factor = 2, uniformNumberOfReplicates = T, uniformDispersion = T, - dds.extraction = loadEmbedded_ObservedValues()) { + dds.extraction = loadEmbedded_ObservedValues(), + n_clusters = 5, + fixBetaE = T, + fixIntercept = T) { + # -log + log.simulation(n_genes, + n_genotypes, + max_n_replicates, + sequencing_factor, + uniformNumberOfReplicates, + uniformDispersion, + fixBetaE, + fixIntercept, + n_clusters) + ## Fit mvnorm ## list_fit.mvnorm <- getListMvnormFit(dds.extraction$beta) @@ -48,10 +67,13 @@ rnaMock <- function(n_genes, beta.actual <- getBetaforSimulation( n_genes, n_genotypes, - list_fit.mvnorm + list_fit.mvnorm, + n_clusters = n_clusters, + fixIntercept = fixIntercept, + fixBetaE = fixBetaE ) - - + + ##### build input for simulation #### model.matx <- getModelMatrix() log_qij <- getLog_qij(beta.actual, model.matx) @@ -89,3 +111,32 @@ rnaMock <- function(n_genes, actualParameters = actualParam )) } + + + + + + +log.simulation <- function(n_genes, + n_genotypes, + sequencing_factor, + max_n_replicates, + uniformNumberOfReplicates, + uniformDispersion, + fixBetaE, + fixIntercept, + n_clusters) { + embelishment_top <- "\n* ----- SETUP ----- *\n" + string0 <- "\n# - Number of genes: %d" + string1 <- "\n# - Number of genotypes: %d" + string2 <- "\n# - Maximum number of replicates: %d" + string3 <- "\n# - Number of replicates uniform: %s" + string4 <- "\n# - Dispersion uniform: %s" + string5 <- "\n# - Intercept fixed: %s" + string6 <- "\n# - BetaE fixed: %s" + string7 <- "\n# - Number of clusters: %s\n" + string <- paste(string0, string1, string2, string3, string4, string5, string6, string7, sep = "") + embelishment_bottom <- "\n* ----------------------- *\n" + whole_string <- paste(embelishment_top, string, embelishment_bottom, sep = "") + futile.logger::flog.info(whole_string, n_genes, n_genotypes, max_n_replicates, uniformNumberOfReplicates, uniformDispersion, fixIntercept, fixBetaE, n_clusters) +} diff --git a/src/v3/HTRsim/devtools_history.R b/src/v3/HTRsim/devtools_history.R index 4788939..3a5e084 100644 --- a/src/v3/HTRsim/devtools_history.R +++ b/src/v3/HTRsim/devtools_history.R @@ -17,5 +17,5 @@ usethis::use_package("rstatix") usethis::use_package("ggplot2") usethis::use_package("ggVennDiagram") usethis::use_package("gridExtra") -usethis::use_package("plotROC") +usethis::use_package("glmmTMB") -- GitLab