From b044f30486a6ab862f280e6cfe9874cedd09c247 Mon Sep 17 00:00:00 2001 From: aduvermy <arnaud.duvermy@ens-lyon.fr> Date: Sat, 14 Jan 2023 22:08:22 +0100 Subject: [PATCH] DESEQ + GLM fonctionnal ! --- src/model.fit/HTRfit/R/model_fitting.R | 56 +++-- src/model.fit/HTRfit/R/preprocessing.R | 48 ++-- src/model.fit/HTRfit/R/wald_test.R | 48 ++-- src/model.fit/HTRfit/devtools_history.R | 3 +- src/v3/HTRsim/R/countsGenerator.R | 53 +++-- src/v3/HTRsim/R/dds_manipulations.R | 100 +++++++++ src/v3/HTRsim/R/embeddedfiles_manipulations.R | 103 +++++++++ src/v3/HTRsim/R/evaluation.R | 210 ++++++++++-------- src/v3/HTRsim/R/loadEmbeddedFiles.R | 50 ----- src/v3/HTRsim/R/manipulationsDDS_obj.R | 70 ------ .../HTRsim/R/manipulations_multinormDistrib.R | 25 --- .../HTRsim/R/multinormDistrib_manipulations.R | 47 ++++ src/v3/HTRsim/R/utils.R | 68 ++++++ src/v3/HTRsim/R/workflow.R | 11 +- 14 files changed, 556 insertions(+), 336 deletions(-) create mode 100644 src/v3/HTRsim/R/dds_manipulations.R create mode 100644 src/v3/HTRsim/R/embeddedfiles_manipulations.R delete mode 100644 src/v3/HTRsim/R/loadEmbeddedFiles.R delete mode 100644 src/v3/HTRsim/R/manipulationsDDS_obj.R delete mode 100644 src/v3/HTRsim/R/manipulations_multinormDistrib.R create mode 100644 src/v3/HTRsim/R/multinormDistrib_manipulations.R create mode 100644 src/v3/HTRsim/R/utils.R diff --git a/src/model.fit/HTRfit/R/model_fitting.R b/src/model.fit/HTRfit/R/model_fitting.R index 9c23dc0..1a6572a 100644 --- a/src/model.fit/HTRfit/R/model_fitting.R +++ b/src/model.fit/HTRfit/R/model_fitting.R @@ -14,16 +14,16 @@ launch.glm <- function(data2fit, model2fit = k_ij ~ genotype + environment + genotype:environment, fit_by = "gene_id", threads = 4) { - iteration_list <- data2fit[fit_by] %>% + data2fit <- data2fit %>% as.data.frame() + iteration_list <- data2fit[, fit_by] %>% unique() %>% - as.vector() %>% - unname() %>% - .[[1]] + unlist() %>% + unname() ## get list gene future::plan(multisession, workers = threads) fit_list <- iteration_list %>% furrr::future_map( .x = ., ~ fit.glm( - data2fit[data2fit[fit_by] == .x, ], + data2fit[which(data2fit[, fit_by] == .x), ], .x, model2fit ) @@ -42,28 +42,53 @@ launch.glm <- function(data2fit, #' #' @examples fit.glm <- function(data, id, model2fit) { - fit <- MASS::glm.nb(model2fit, data = data, link = log) - fit.dtf <- tidySummary(fit) - fit.dtf$inference <- fit.dtf$inference %>% dplyr::mutate(gene_id = id) - fit.dtf$fitQuality <- fit.dtf$fitQuality %>% dplyr::mutate(gene_id = id) - return(fit.dtf) + tryCatch( + { + fit <- MASS::glm.nb(model2fit, data = data, link = log) + fit.dtf <- tidySummary(fit) + fit.dtf$inference <- fit.dtf$inference %>% dplyr::mutate(gene_id = id) + fit.dtf$fitQuality <- fit.dtf$fitQuality %>% dplyr::mutate(gene_id = id) + fit.dtf$dispersion <- list( gene_id = id, dispersion.estimate = fit$theta ) %>% as.data.frame() + + return(fit.dtf) + }, + error = function(cnd) { + inference <- list(gene_id = id, estimate = NA, std.error = NA, term= NA) %>% as.data.frame() + fitQuality <- list(null.deviance= NA,df.null = NA, logLik = NA,AIC = NA ,BIC=NA,deviance = NA,df.residual=NA,nobs = NA,gene_id = id) %>% as.data.frame() + dispersion <- list( gene_id = id, dispersion.estimate = NA ) %>% as.data.frame() + fit.dtf <- list(inference = inference, fitQuality = fitQuality, dispersion) + + return(fit.dtf) + + } + ) } #' fit model on data #' @param fit -#' @import broom +#' @param modelType +#' @import broom.mixed +#' @import dplyr #' @return list of element #' @export #' #' @examples -tidySummary <- function(fit) { - dtf <- broom::tidy(fit) +tidySummary <- function(fit, modelType = "glm") { + dtf <- broom.mixed::tidy(fit) + if (modelType == "glm"){ + dtf <- dtf %>% + dplyr::select(estimate, std.error, term) + } + if (modelType == "glmm"){ + dtf <- dtf %>% + dplyr::select(effect, group, estimate, std.error, term) + } gl <- broom::glance(fit) return(list(inference = dtf, fitQuality = gl)) } #' convert list to dataframe -#' @param fit +#' @param list_fit #' @return list of dtf #' @export #' @@ -72,5 +97,6 @@ listFit2dtf <- function(list_fit) { tmp <- do.call(cbind, list_fit) inference.dtf <- do.call(rbind, tmp[1, ]) fitQuality.dtf <- do.call(rbind, tmp[2, ]) - return(list(inference = inference.dtf, fitQuality = fitQuality.dtf)) + dispersion.dtf <- do.call(rbind, tmp[3, ]) + return(list(inference = inference.dtf, fitQuality = fitQuality.dtf, dispersion = dispersion.dtf )) } diff --git a/src/model.fit/HTRfit/R/preprocessing.R b/src/model.fit/HTRfit/R/preprocessing.R index 045ff07..96fc399 100644 --- a/src/model.fit/HTRfit/R/preprocessing.R +++ b/src/model.fit/HTRfit/R/preprocessing.R @@ -1,33 +1,25 @@ -#' get df2fit for a given gene -#' @param k_ij -#' @param design_simulation -#' @param gene_name -#' @import dplyr -#' @return dataframe -#' @export -#' -#' @examples -getdf2fit <- function(k_ij , design_simulation, gene_name){ - df_gene_i = cbind(design_simulation, k_ij) %>% - dplyr::mutate(gene_id = gene_name) %>% - dplyr::mutate(gene_id = gene_name) - rownames(df_gene_i) <- NULL - return(df_gene_i) -} - -#' get df2fit for a given gene -#' @param countTable +#' get df2fit +#' @param countTable #' @param experimental_design -#' @import purrr -#' @return dataframe +#' @import data.table +#' @import reshape2 +#' @import tibble +#' @return dataframe #' @export #' #' @examples -reshapeCounTable<- function(countTable, experimental_design ){ +reshapeCounTable <- function(countTable, experimental_design) { + countTable = countTable %>% data.frame() + countTable.long <- countTable %>% + tibble::rownames_to_column("gene_id") %>% + reshape2::melt( + id.vars = "gene_id", + value.name = "k_ij", + variable.name = "sample_id" + ) + countTable2join <- data.table::data.table(countTable.long, key = "sample_id") + experimentalDesign2join <- data.table::data.table(experimental_design, key = "sample_id") + data2fit <- countTable2join[experimentalDesign2join] - gene_id_list = rownames( countTable ) - list_df2fit = purrr::map( .x = gene_id_list, ~getdf2fit( countTable[ .x, ], - experimental_design, .x) ) - data2fit = do.call( rbind, list_df2fit ) - return(data2fit) -} \ No newline at end of file + return(data2fit) +} diff --git a/src/model.fit/HTRfit/R/wald_test.R b/src/model.fit/HTRfit/R/wald_test.R index 2a173a5..f63a0fa 100644 --- a/src/model.fit/HTRfit/R/wald_test.R +++ b/src/model.fit/HTRfit/R/wald_test.R @@ -1,37 +1,45 @@ #' fit model on data -#' @param W -#' @param threshold -#' @param alternative +#' @param W +#' @param altHypothesis #' @import stats -#' @return pvalue +#' @return pvalue #' @export #' #' @examples -wald_test <- function( W, threshold = 0, alternative = "two.sided"){ - ## two.sided : P ( X > threshold ) + P ( X < threshold ) - pval = 2*(stats::pnorm( W, mean = 0, sd = 1, lower.tail = FALSE)) - - ## greater : P ( X > threshold ) - pval = stats::pnorm( W, mean = 0, sd = 1, lower.tail = FALSE) - - ## less : P ( X <= threshold ) - pval = stats::pnorm( W , mean = 0, sd = 1, lower.tail = TRUE) +wald_test <- function(w, altHypothesis = "greaterAbs") { + if (altHypothesis == c("greaterAbs")) { + ## greaterAbs + pval <- 2 * (stats::pnorm(w, mean = 0, sd = 1, lower.tail = FALSE)) + pval[pval > 1] <- 1 + } + if (altHypothesis %in% c("greater", "lessAbs")) { + pval <- stats::pnorm(w, mean = 0, sd = 1, lower.tail = FALSE) + pval[pval > 1] <- 1 + } return(pval) } #' fit model on data -#' @param estimate +#' @param estimate #' @param threshold #' @param stdError -#' @return W +#' @param altHypothesis +#' @return W #' @export #' #' @examples -getStatisticWaldTest<- function(estimate,stdError, threshold ){ +getStatisticWaldTest <- function(estimate, stdError, threshold, altHypothesis = "greaterAbs") { ## cf https://en.wikipedia.org/wiki/Wald_test - return( ( estimate - threshold ) / stdError ) + stopifnot(threshold >= 0) + if (altHypothesis == "greaterAbs") { + wald_stat <- (abs(estimate) - threshold) / stdError + } + if (altHypothesis == "lessAbs") { + wald_stat <- (threshold - abs(estimate)) / stdError + } + if (altHypothesis == "greater") { + wald_stat <- (estimate - threshold) / stdError + } + return(wald_stat) } - - - diff --git a/src/model.fit/HTRfit/devtools_history.R b/src/model.fit/HTRfit/devtools_history.R index 269a5fe..16f62e9 100644 --- a/src/model.fit/HTRfit/devtools_history.R +++ b/src/model.fit/HTRfit/devtools_history.R @@ -6,7 +6,8 @@ usethis::use_package("furrr") usethis::use_package("MASS") usethis::use_package("future") usethis::use_package("stats") -usethis::use_package("broom") +usethis::use_package("broom.mixed") +usethis::use_package("tibble") diff --git a/src/v3/HTRsim/R/countsGenerator.R b/src/v3/HTRsim/R/countsGenerator.R index e51f78c..89dd073 100644 --- a/src/v3/HTRsim/R/countsGenerator.R +++ b/src/v3/HTRsim/R/countsGenerator.R @@ -2,44 +2,47 @@ #' #' @param n_genes an integer #' @param n_genotype A int. -#' @param mvrnorm.fit an object fit mvronorm -#' @fixIntercept -#' @fixBetaE +#' @param mvrnormFit_list an object fit mvronorm +#' @param fixIntercept +#' @param fixBetaE +#' @param n_clusters #' @import MASS #' @import dplyr +#' @import purrr #' @return a dataframe #' @export #' #' @examples -getBetaforSimulation <- function(n_genes = 100, n_genotypes = 20, mvrnorm.fit, fixIntercept = TRUE, fixBetaE = TRUE) { - message("Get actuals beta for each genes ...") - ##### Sampling from mvnorm ######## - n_samplings <- n_genes * n_genotypes - beta.matrix <- MASS::mvrnorm( - n = n_samplings, - mu = mvrnorm.fit$mu, - Sigma = mvrnorm.fit$sigma - ) - +getBetaforSimulation <- function(n_genes = 100, n_genotypes = 20, mvrnormFit_list, fixIntercept = TRUE, fixBetaE = TRUE, n_clusters = 5) { + ## -- Sampling from mvnorm -- ## + list_idxFit <- sample(1:n_clusters, replace = TRUE, size = n_genes) + f <- purrr::map(.x = list_idxFit, ~ MASS::mvrnorm( + n = n_genotypes, + mu = mvrnormFit_list[[.x]]$mu, + Sigma = mvrnormFit_list[[.x]]$sigma + )) + beta.matrix <- do.call("rbind", f) + ## -- Build lovely dataframe & annotations -- ## genes_vec <- base::paste("gene", 1:n_genes, sep = "") genotype_vec <- base::paste("G", 0:(n_genotypes - 1), sep = "") genotype <- genotype_vec %>% rep(time = n_genes) gene_id <- rep(genes_vec, each = n_genotypes) + idx_mvrnom <- rep(list_idxFit, each = n_genotypes) ## -- saving the mvnorm distribution idx beta.dtf <- beta.matrix %>% data.frame() colnames(beta.dtf) <- c("(Intercept)", "betaG", "betaE", "betaGE") - res <- cbind(gene_id, genotype, beta.dtf) - ## fixing Genotype and GxE effects to 0 for G0 (reference) + res <- cbind(gene_id, genotype, beta.dtf, idx_mvrnom) + ## fixing Genotype and GxE effects to 0 for G0 (reference) ## -> hide in (Intercept) res[res$genotype == "G0", c("betaG", "betaGE")] <- 0 if (fixIntercept) { - res <- res %>% - dplyr::group_by(gene_id) %>% - dplyr::mutate("(Intercept)" = mean(`(Intercept)`)) + res <- res %>% + dplyr::group_by(gene_id) %>% + dplyr::mutate("(Intercept)" = mean(`(Intercept)`)) } if (fixBetaE) { - res <- res %>% - dplyr::group_by(gene_id) %>% - dplyr::mutate(betaE = mean(betaE)) + res <- res %>% + dplyr::group_by(gene_id) %>% + dplyr::mutate(betaE = mean(betaE)) } return(res) } @@ -125,7 +128,7 @@ getMu_ij <- function(log_qij.dtf, size_factor) { getGenesDispersions <- function(n_genes, sample_id_list, dispersion.vec, dispUniform_btweenCondition = T) { if (dispUniform_btweenCondition == T) { - message("Get dispersion for each genes ...\n") + # --Get dispersion for each genes gene_dispersion.dtf <- base::sample(dispersion.vec, replace = T, @@ -136,7 +139,7 @@ getGenesDispersions <- function(n_genes, sample_id_list, rownames(gene_dispersion.dtf) <- base::paste("gene", 1:(n_genes), sep = "") colnames(gene_dispersion.dtf) <- sample_id_list } else { - message("Get dispersion for each genes within each conditions ...\n") + # -- Get dispersion for each genes within each conditions replication_table <- sample_ids %>% stringr::str_replace(., pattern = "_[0-9]+", "") %>% @@ -148,7 +151,9 @@ getGenesDispersions <- function(n_genes, sample_id_list, colnames(gene_dispersion.dtf) <- sample_ids rownames(gene_dispersion.dtf) <- base::paste("gene", 1:(n_genes), sep = "") } - return(gene_dispersion.dtf %>% as.matrix()) + gene_dispersion.mtx <- gene_dispersion.dtf %>% as.matrix() + gene_dispersion.mtx <- gene_dispersion.mtx[ order(row.names(gene_dispersion.mtx)), ] + return(gene_dispersion.mtx) } diff --git a/src/v3/HTRsim/R/dds_manipulations.R b/src/v3/HTRsim/R/dds_manipulations.R new file mode 100644 index 0000000..4ac353f --- /dev/null +++ b/src/v3/HTRsim/R/dds_manipulations.R @@ -0,0 +1,100 @@ +#' Launch Deseq +#' +#' @param tabl_cnts table containing counts per genes & samples +#' @param bioDesign table describing bioDesgin of input +#' @import DESeq2 +#' @import dplyr +#' @return DESEQ2 object +#' @export +#' +#' @examples +fit_deseq <- function(tabl_cnts, bioDesign) { + model <- ~ genotype + environment + genotype:environment + tabl_cnts <- tabl_cnts %>% as.data.frame() + tabl_cnts <- tabl_cnts %>% + dplyr::mutate( + across(where(is.double), as.integer) + ) + dds <- DESeq2::DESeqDataSetFromMatrix( + countData = round(tabl_cnts), + colData = bioDesign, + design = model + ) + dds <- DESeq2::DESeq(dds, quiet = TRUE) + return(dds) +} + + +#' Get deseq results +#' +#' @param dds_obj dds object obtain on simulation +#' @import S4Vectors +#' @import dplyr +#' @import tibble +#' @return a dataframe of dispersion obtained with deseq2 +#' @export +#' +#' @examples +getDispersionFromDDS <- function(dds_obj) { + disp <- S4Vectors::mcols(dds_obj)$dispersion + gene_id <- rownames(S4Vectors::mcols(dds_obj)) + names(disp) <- gene_id + disp.dtf <- disp %>% + data.frame() %>% + dplyr::rename(., dispersion.estimate = .) %>% + tibble::rownames_to_column("gene_id") + return(disp.dtf) +} + + +#' Get deseq results +#' +#' @param dds_obj dds object obtain on simulation +#' @param threads +#' @import stringr +#' @import dplyr +#' @import future +#' @import DESeq2 +#' @import furrr +#' @return a dataframe of beta obtained with deseq2 +#' @export +#' +#' @examples +getCoefficientsFromDds <- function(dds_obj, threads = 4) { # nolint + listBeta <- DESeq2::resultsNames(dds_obj) + future::plan(multisession, workers = threads) + res <- listBeta %>% furrr::future_map( + .x = ., + ~ DESeq2::results(dds_obj, + contrast = list(.x), + # lfcThreshold = threshold, /!\ statistic & pvalue resestimate later + # altHypothesis = altH, + tidy = TRUE + ) %>% + dplyr::select(-baseMean) %>% + dplyr::mutate(term = .x) %>% + dplyr::rename( + estimate = log2FoldChange, + std.error = lfcSE, + statistic = stat, + p.value = pvalue, + gene_id = row + ), + .options = furrr_options(seed = TRUE) + ) + deseq_inference <- do.call("rbind", res) + deseq_inference.dtf <- deseq_inference %>% + dplyr::mutate( + term = term %>% + stringr::str_replace("Intercept", "(Intercept)") %>% + stringr::str_replace("_vs_G0", "") %>% + stringr::str_replace("_vs_E0", "") %>% + stringr::str_replace_all("_", "") %>% + stringr::str_replace("\\.", ":") + ) + + deseq_inference.dtf <- deseq_inference.dtf %>% + select(gene_id, estimate, std.error, term) + + return(deseq_inference.dtf) +} \ No newline at end of file diff --git a/src/v3/HTRsim/R/embeddedfiles_manipulations.R b/src/v3/HTRsim/R/embeddedfiles_manipulations.R new file mode 100644 index 0000000..3fc9891 --- /dev/null +++ b/src/v3/HTRsim/R/embeddedfiles_manipulations.R @@ -0,0 +1,103 @@ +#' Extract beta distribution from DESEQ2 object +#' +#' @param dds_obj a DESEQ2 object +#' @import S4Vectors +#' @import tidyr +#' @import stats +#' @return a list containing 1- mean and sd of BetaG 2- mean and sd of BetaE 3- mean and sd of BetaGE 5- mean and sd of gene dispersion +#' @export +#' +#' @examples +extraction_embeddedDds <- function(dds_obj) { + ## Beta + dds.mcols <- S4Vectors::mcols(dds_obj, use.names = TRUE) + Intercept <- dds.mcols$Intercept + betaG <- dds.mcols$genotype_RM11_vs_GSY147 + betaE <- dds.mcols$environment_treated_vs_untreated + betaGE <- dds.mcols$genotypeRM11.environmenttreated + beta.dtf <- cbind(Intercept, betaG, betaE, betaGE) %>% + as.data.frame() %>% + tidyr::drop_na() + colnames(beta.dtf) <- c("(Intercept)", "betaG", "betaE", "betaGE") + + ## Dispersion + gene_disp <- dds.mcols$dispersion %>% stats::na.omit() + + + return(list( + beta = beta.dtf, + gene_dispersion = gene_disp + )) +} + +#' Extract beta distribution from DESEQ2 object +#' +#' @import dplyr +#' @import utils +#' @return output of dds extraction +#' @export +#' +#' @examples +embedded_CounTable2observedValues <- function() { + tabl_cnts <- loadEmbedded_CounTable() + bioDesign <- loadEmbedded_design() + ## Launch DESEQ2 + dds <- fit_deseq(tabl_cnts, bioDesign = bioDesign) + ## Extract + dds.extraction <- extraction_embeddedDds(dds_obj = dds) + + return(dds.extraction) +} + + + +#' Load beta dtf embedded in package +#' +#' @return an object dds.Extraction +#' @export +#' +#' @examples +loadEmbedded_ObservedValues <- function() { + ## Import public beta observed + fn <- system.file("extdata/", "SRP217588_YM_observedParams.rds", package = "HTRsim") + dds.extraction <- readRDS(file = fn) + return(dds.extraction) +} + +#' Load public counts table +#' +#' @import dplyr +#' @import utils +#' @return dataframe +#' @export +#' +#' @examples +loadEmbedded_CounTable <- function() { + ## Import public counts table + fn <- system.file("extdata/", "SRP217588_YM_vkallisto.tsv", package = "HTRsim") + tabl_cnts <- utils::read.table(file = fn, header = TRUE) + rownames(tabl_cnts) <- tabl_cnts$gene_id + tabl_cnts <- tabl_cnts %>% dplyr::select(-gene_id) ## suppr colonne GeneID + tabl_cnts <- tabl_cnts[order(tabl_cnts %>% rownames()), ] + tabl_cnts <- tabl_cnts %>% dplyr::select(!matches("ru_rm_5")) + return(tabl_cnts) +} + +#' Load public design +#' +#' @import dplyr +#' @import utils +#' @return dataframe +#' @export +#' +#' @examples +loadEmbedded_design <- function() { + ## DESIGN + fn <- system.file("extdata/", "SRP217588_YM_bioDesign.csv", package = "HTRsim") + bioDesign <- utils::read.table(file = fn, header = TRUE, sep = ";") + ## defining reference + bioDesign$genotype <- factor(x = bioDesign$genotype, levels = c("GSY147", "RM11")) + bioDesign$environment <- factor(x = bioDesign$environment, levels = c("untreated", "treated")) + bioDesign <- bioDesign %>% dplyr::filter(!str_detect(sample, "ru_rm_5")) + return(bioDesign) +} diff --git a/src/v3/HTRsim/R/evaluation.R b/src/v3/HTRsim/R/evaluation.R index 0c974a8..60a48ad 100644 --- a/src/v3/HTRsim/R/evaluation.R +++ b/src/v3/HTRsim/R/evaluation.R @@ -1,130 +1,141 @@ -#' Get deseq results -#' -#' @param dds_simu.mcols dds object obtain on simulation -#' @param threads -#' @import stringr -#' @import dplyr -#' @import future -#' @import DESeq2 -#' @import furrr -#' @return a dataframe of beta obtained with deseq2 -#' @export -#' -#' @examples -deseq_fitExtraction <- function(dds_simu, threads = 4) { - listBeta <- DESeq2::resultsNames(dds_simu) - future::plan(multisession, workers = threads) - res <- listBeta %>% furrr::future_map( - .x = ., - ~ DESeq2::results(dds_simu, - contrast = list(.x), - tidy = TRUE - ) %>% - dplyr::select(-baseMean) %>% - dplyr::mutate(term = .x) %>% - dplyr::rename( - estimate = log2FoldChange, - std.error = lfcSE, - statistic = stat, - p.value = pvalue, - gene_id = row - ), - .options = furrr_options(seed = TRUE) - ) - deseq_inference <- do.call("rbind", res) - deseq_inference.dtf <- deseq_inference %>% mutate(term = term %>% - stringr::str_replace("Intercept", "(Intercept)") %>% - stringr::str_replace("_vs_G0", "") %>% - stringr::str_replace("_vs_E0", "") %>% - stringr::str_replace_all("_", "") %>% - stringr::str_replace("\\.", ":")) - - return(deseq_inference.dtf) -} - - - - -#' Get deseq results +#' Get prediction results #' -#' @param inference a obj output of [deseq, glm, glmm] -#' @param actual parameters used during simulation -#' @param threads nb of threads +#' @param inference.dtf #' @param threshold #' @param alphaRisk -#' @import reshape2 +#' @param postInferenceSelection +#' @param altH +#' @param pvalCorrection #' @import dplyr -#' @import data.table -#' @return a dataframe to compare actual and infered betas +#' @import HTRfit +#' @import rstatix +#' @return a dataframe to compare actual and infered betas with annotations #' @export #' #' @examples -getDfComparison <- function(inference, actual, threads = 4) { - actual.dtf <- actual %>% - reshape2::melt( - id = c("gene_id", "genotype"), - value.name = "value", variable.name = "beta" - ) %>% - dplyr::group_by(gene_id, beta) %>% +getPrediction <- function(inference.dtf, threshold = 0, alphaRisk = 0.05, altH = "greaterAbs", pvalCorrection = TRUE, postInferenceSelection = FALSE) { + ###### embedded functions ########### + labelling_gene <- function(altHy, coeff, threshold, proba, alphaRisk, postInferenceSelection = F) { + if (postInferenceSelection) { + if (altHy == "greaterAbs") label <- ifelse((abs(coeff) > threshold & proba < alphaRisk), "DE", "nonDE") + if (altHy == "greater") label <- ifelse((coeff > threshold & proba < alphaRisk), "DE", "nonDE") + if (altHy == "lowerAbs") label <- ifelse((abs(coeff) < threshold & proba < alphaRisk), "DE", "nonDE") + } else { + label <- ifelse(proba < alphaRisk, "DE", "nonDE") + } + return(label) + } + ############# WALD TEST ####### + inference.dtf <- inference.dtf %>% dplyr::mutate( - actual.value = - dplyr::if_else(beta %in% c("(Intercept)", "betaE"), - mean(value), value + statistic = + dplyr::case_when( + postInferenceSelection == TRUE ~ + HTRfit::getStatisticWaldTest( + estimate = estimate, stdError = std.error, + threshold = 0, altHypothesis = altH + ), + postInferenceSelection == FALSE ~ + HTRfit::getStatisticWaldTest( + estimate = estimate, stdError = std.error, + threshold = threshold, altHypothesis = altH + ) ) ) %>% - ungroup() %>% - dplyr::select(-value) %>% dplyr::mutate( - term = + p.value = HTRfit::wald_test(w = statistic, altHypothesis = altH) + ) %>% + rstatix::adjust_pvalue(p.col = "p.value", method = "fdr", output.col = "padj") + + + ############# PREDICTION ############# + inference.dtf <- inference.dtf %>% + dplyr::mutate( + prediction.label = dplyr::case_when( - beta == "(Intercept)" ~ "(Intercept)", - beta == "betaG" ~ paste("genotype", genotype, sep = ""), - beta == "betaE" ~ paste("environment", "E1", sep = ""), - beta == "betaGE" ~ paste("genotype", genotype, ":environmentE1", sep = "") + pvalCorrection == TRUE ~ + labelling_gene(altH, estimate, threshold, padj, alphaRisk, postInferenceSelection), + pvalCorrection == FALSE ~ + labelling_gene(altH, estimate, threshold, p.value, alphaRisk, postInferenceSelection), ) ) - - actual.dtf <- actual.dtf %>% dplyr::select(-genotype) - actual_undup.dtf <- actual.dtf[!duplicated(actual.dtf), ] - actual2join.dtf <- data.table::data.table(actual_undup.dtf, key = c("gene_id", "term")) - - inference.dtf <- deseq_fitExtraction(inference, threads = threads) - inference2join.dtf <- data.table::data.table(inference.dtf, key = c("gene_id", "term")) - - joined.dtf <- actual2join.dtf[inference2join.dtf] - return(joined.dtf) + return(inference.dtf) } - -#' Get deseq results +#' Get expectated results #' -#' @param comparison.Dtf a obj output of getDfComparison +#' @param actual.dtf #' @param threshold -#' @param alphaRisk +#' @param altH #' @import dplyr +#' @import reshape2 #' @return a dataframe to compare actual and infered betas with annotations #' @export #' #' @examples -getAnnotation <- function(comparison.dtf, threshold = 0, alphaRisk = 0.05) { - ## Post inference selection - comparison.dtf <- comparison.dtf %>% - dplyr::mutate( - prediction.label = - dplyr::if_else(( - abs(estimate) > threshold & padj < alphaRisk), - "DE", "nonDE" - ) - ) +getExpectation <- function(actual.dtf, threshold = 0, altH = "greaterAbs") { + reshapeActualDtf <- function(actual.dtf) { + actual.dtf <- actual.dtf %>% + reshape2::melt( + id = c("gene_id", "genotype", "idx_mvrnom"), + value.name = "value", variable.name = "beta" + ) %>% + dplyr::group_by(gene_id, beta) %>% + dplyr::mutate( + actual.value = + dplyr::if_else(beta %in% c("(Intercept)", "betaE"), + mean(value), value + ) + ) %>% + ungroup() %>% + dplyr::select(-value) %>% + dplyr::mutate( + term = + dplyr::case_when( + beta == "(Intercept)" ~ "(Intercept)", + beta == "betaG" ~ paste("genotype", genotype, sep = ""), + beta == "betaE" ~ paste("environment", "E1", sep = ""), + beta == "betaGE" ~ paste("genotype", genotype, ":environmentE1", sep = "") + ) + ) + actual.dtf <- actual.dtf %>% dplyr::select(-genotype) + actual_undup.dtf <- actual.dtf[!duplicated(actual.dtf), ] + return(actual_undup.dtf) + } - comparison.dtf <- comparison.dtf %>% + actual.reshaped <- reshapeActualDtf(actual.dtf) + + ####### ACTUAL LABEL ######### + actual.reshaped.annot <- actual.reshaped %>% dplyr::mutate( actual.label = dplyr::if_else(abs(actual.value) < threshold, "nonDE", "DE" ) ) + + return(actual.reshaped.annot) +} + +#' Get deseq results +#' +#' @param inference.dtf a obj output of [deseq, glm, glmm] +#' @param actual.dtf parameters used during simulation +#' @import dplyr +#' @import data.table +#' @return a dataframe to compare actual and infered betas +#' @export +#' +#' @examples +getComparison <- function(actual.dtf, inference.dtf){ + + + actual2join.dtf <- data.table::data.table(actual.dtf, key = c("gene_id", "term")) + inference2join.dtf <- data.table::data.table(inference.dtf, key = c("gene_id", "term")) + comparison.dtf <- actual2join.dtf[inference2join.dtf] + + ########## COMPARING ACTUAL & PREDICTION ###### comparison.dtf <- comparison.dtf %>% dplyr::mutate( annotation = dplyr::case_when( @@ -133,5 +144,8 @@ getAnnotation <- function(comparison.dtf, threshold = 0, alphaRisk = 0.05) { TRUE ~ "FALSE" ) ) - return(comparison.dtf) -} + + return(comparison.dtf %>% as.data.frame() ) + + +} \ No newline at end of file diff --git a/src/v3/HTRsim/R/loadEmbeddedFiles.R b/src/v3/HTRsim/R/loadEmbeddedFiles.R deleted file mode 100644 index da61905..0000000 --- a/src/v3/HTRsim/R/loadEmbeddedFiles.R +++ /dev/null @@ -1,50 +0,0 @@ -#' Load beta dtf embedded in package -#' -#' @return an object dds.Extraction -#' @export -#' -#' @examples -loadObservedValues <- function() { - ## Import public beta observed - fn <- system.file("extdata/", "SRP217588_YM_observedParams.rds", package = "HTRsim") - dds.extraction <- readRDS(file = fn) - return(dds.extraction) -} - -#' Load public counts table -#' -#' @import dplyr -#' @import utils -#' @return dataframe -#' @export -#' -#' @examples -loadPubliCounTable <- function() { - ## Import public counts table - fn <- system.file("extdata/", "SRP217588_YM_vkallisto.tsv", package = "HTRsim") - tabl_cnts <- utils::read.table(file = fn, header = TRUE) - rownames(tabl_cnts) <- tabl_cnts$gene_id - tabl_cnts <- tabl_cnts %>% dplyr::select(-gene_id) ## suppr colonne GeneID - tabl_cnts <- tabl_cnts[order(tabl_cnts %>% rownames()), ] - tabl_cnts <- tabl_cnts %>% dplyr::select(!matches("ru_rm_5")) - return(tabl_cnts) -} - -#' Load public design -#' -#' @import dplyr -#' @import utils -#' @return dataframe -#' @export -#' -#' @examples -loadPublicDesign <- function() { - ## DESIGN - fn <- system.file("extdata/", "SRP217588_YM_bioDesign.csv", package = "HTRsim") - bioDesign <- utils::read.table(file = fn, header = TRUE, sep = ";") - ## defining reference - bioDesign$genotype <- factor(x = bioDesign$genotype, levels = c("GSY147", "RM11")) - bioDesign$environment <- factor(x = bioDesign$environment, levels = c("untreated", "treated")) - bioDesign <- bioDesign %>% dplyr::filter(!str_detect(sample, "ru_rm_5")) - return(bioDesign) -} diff --git a/src/v3/HTRsim/R/manipulationsDDS_obj.R b/src/v3/HTRsim/R/manipulationsDDS_obj.R deleted file mode 100644 index c4a758c..0000000 --- a/src/v3/HTRsim/R/manipulationsDDS_obj.R +++ /dev/null @@ -1,70 +0,0 @@ -#' Launch Deseq -#' -#' @param tabl_cnts table containing counts per genes & samples -#' @param bioDesign table describing bioDesgin of input -#' @param model -#' @import DESeq2 -#' @import dplyr -#' @return DESEQ2 object -#' @export -#' -#' @examples -run.deseq <- function(tabl_cnts, bioDesign, model = ~ genotype + environment + genotype:environment) { - message("Run DESEQ2 ...\n") - tabl_cnts <- tabl_cnts %>% dplyr::mutate(across(where(is.double), as.integer)) - dds <- DESeq2::DESeqDataSetFromMatrix(countData = round(tabl_cnts), colData = bioDesign, design = model) - dds <- DESeq2::DESeq(dds, quiet = TRUE) - return(dds) -} - - -#' Extract beta distribution from DESEQ2 object -#' -#' @param dds_obj a DESEQ2 object -#' @import S4Vectors -#' @import tidyr -#' @import stats -#' @return a list containing 1- mean and sd of BetaG 2- mean and sd of BetaE 3- mean and sd of BetaGE 5- mean and sd of gene dispersion -#' @export -#' -#' @examples -ddsExtraction <- function(dds_obj) { - ## Beta - dds.mcols <- S4Vectors::mcols(dds_obj, use.names = TRUE) - Intercept <- dds.mcols$Intercept - betaG <- dds.mcols$genotype_RM11_vs_GSY147 - betaE <- dds.mcols$environment_treated_vs_untreated - betaGE <- dds.mcols$genotypeRM11.environmenttreated - beta.dtf <- cbind(Intercept, betaG, betaE, betaGE) %>% - as.data.frame() %>% - tidyr::drop_na() - colnames(beta.dtf) <- c("(Intercept)", "betaG", "betaE", "betaGE") - - ## Dispersion - gene_disp <- dds.mcols$dispersion %>% stats::na.omit() - - - return(list( - beta = beta.dtf, - gene_dispersion = gene_disp - )) -} - -#' Extract beta distribution from DESEQ2 object -#' -#' @import dplyr -#' @import utils -#' @return output of dds extraction -#' @export -#' -#' @examples -publicDDS_extraction <- function() { - tabl_cnts <- loadPubliCounTable() - bioDesign <- loadPublicDesign() - ## Launch DESEQ2 - dds <- run.deseq(tabl_cnts, bioDesign = bioDesign) - ## Extract - dds.extraction <- ddsExtraction(dds_obj = dds) - - return(dds.extraction) -} diff --git a/src/v3/HTRsim/R/manipulations_multinormDistrib.R b/src/v3/HTRsim/R/manipulations_multinormDistrib.R deleted file mode 100644 index 7292102..0000000 --- a/src/v3/HTRsim/R/manipulations_multinormDistrib.R +++ /dev/null @@ -1,25 +0,0 @@ -#' Fit mvnorm -#' @import Rfast -#' @return output of Rfast::mvnorm.mle -#' @export -#' -#' @examples -mvnormFitting <- function(beta.dtf) { - beta.matx = beta.dtf %>% as.matrix() - fit.mvnorm <- Rfast::mvnorm.mle(beta.matx) - return(fit.mvnorm) -} - -#' get fit mvnorm from public dataset -#' -#' @return output of Rfast::mvnorm.mle -#' @export -#' -#' @examples -getPublicMvnormFit <- function(){ - ##### Range of observed value ######### - dds.extraction = publicDDS_extraction() - beta_observed.dtf = dds.extraction$beta - fit.mvnorm = mvnormFitting(beta_observed.dtf) - return( fit.mvnorm ) -} \ No newline at end of file diff --git a/src/v3/HTRsim/R/multinormDistrib_manipulations.R b/src/v3/HTRsim/R/multinormDistrib_manipulations.R new file mode 100644 index 0000000..06d2e36 --- /dev/null +++ b/src/v3/HTRsim/R/multinormDistrib_manipulations.R @@ -0,0 +1,47 @@ +#' Fit mvnorm +#' @import Rfast +#' @return output of Rfast::mvnorm.mle +#' @export +#' +#' @examples +mvnormFitting <- function(beta.dtf) { + beta.matx <- beta.dtf %>% as.matrix() + fit.mvnorm <- Rfast::mvnorm.mle(beta.matx) + return(fit.mvnorm) +} + +#' get fit mvnorm from public dataset +#' +#' @return output of Rfast::mvnorm.mle +#' @export +#' +#' @examples +getPublicMvnormFit <- function() { + ##### Range of observed value ######### + dds.extraction <- publicDDS_extraction() + beta_observed.dtf <- dds.extraction$beta + list_fit.mvnorm <- getListMvnormFit(beta_observed.dtf) + return(list_fit.mvnorm) +} + + +#' get fit mvnorm list +#' +#' @param beta.dtf +#' @param n_genes +#' @import stats +#' @import purrr +#' @import dplyr +#' @return list of Rfast::mvnorm.mle +#' @export +#' +#' @examples +getListMvnormFit <- function(beta.dtf, n_clusters = 5) { + dtf2cluster <- beta.dtf %>% dplyr::select(c("(Intercept)", "betaG", "betaE","betaGE")) + cluster_kmean <- stats::kmeans(dtf2cluster, n_clusters)$cluster + list_fit.mvnorm <- purrr::map( + .x = 1:n_clusters, + ~ mvnormFitting(beta.dtf[cluster_kmean == .x, ]) + ) + return(list_fit.mvnorm) +} diff --git a/src/v3/HTRsim/R/utils.R b/src/v3/HTRsim/R/utils.R new file mode 100644 index 0000000..afff988 --- /dev/null +++ b/src/v3/HTRsim/R/utils.R @@ -0,0 +1,68 @@ + + +#' build Venn diagramm +#' +#' @param liste_genes +#' @import ggVennDiagram +#' @import ggplot2 +#' @return graph venn driagramm +#' @export +#' +#' @examples +buildVennDiag <- function(liste_gene) { + venn <- ggVennDiagram::Venn(liste_gene) + data <- ggVennDiagram::process_data(venn) + p <- ggplot() + + # 1. region count layer + geom_sf(aes(fill = id), data = venn_region(data)) + + # 2. set edge layer + geom_sf(color = "#000000", size = 0.3, data = venn_setedge(data), show.legend = FALSE) + + # 3. set label layer + geom_sf_text(aes(label = name), data = venn_setlabel(data)) + + # 4. region label layer + geom_sf_label( + aes( + label = scales::percent(count / sum(count)) + ), + data = venn_region(data) + ) + + theme_void() + + theme(legend.position = "none") + + return(p) +} + +#' build Venn diagramm +#' +#' @param comparisonDTF +#' @param title +#' @import dplyr +#' @import gridExtra +#' @return graph venn driagramm +#' @export +#' +#' @examples +getVennDiagramm <- function(comparisonDTF, title = "") { + comparisonDTF <- comparisonDTF %>% dplyr::mutate(label = paste(gene_id, term, sep = "_")) + coeff_DE <- comparisonDTF %>% + dplyr::filter(actual.label == "DE") %>% + .$label + coeff_nonDE <- comparisonDTF %>% + dplyr::filter(actual.label == "nonDE") %>% + .$label + pred_DE <- comparisonDTF %>% + dplyr::filter(prediction.label == "DE") %>% + .$label + pred_nonDE <- comparisonDTF %>% + dplyr::filter(prediction.label == "nonDE") %>% + .$label + gene_list_DE <- list(DE = coeff_DE, prediction = pred_DE) + p <- buildVennDiag(gene_list_DE) + p1 <- p + scale_fill_manual(values = c("#00688B", "#528B8B", "#DCDCDC")) + gene_list_nonDE <- list(nonDE = coeff_nonDE, prediction = pred_nonDE) + p <- buildVennDiag(gene_list_nonDE) + p2 <- p + scale_fill_manual(values = c("#458B00", "#9BCD9B", "#DCDCDC")) + + return(gridExtra::arrangeGrob(p1, p2, nrow = 2, top = title)) +} + diff --git a/src/v3/HTRsim/R/workflow.R b/src/v3/HTRsim/R/workflow.R index eeb1ac0..c22b215 100644 --- a/src/v3/HTRsim/R/workflow.R +++ b/src/v3/HTRsim/R/workflow.R @@ -40,17 +40,18 @@ rnaMock <- function(n_genes, sequencing_factor = 2, uniformNumberOfReplicates = T, uniformDispersion = T, - dds.extraction = loadObservedValues()) { + dds.extraction = loadEmbedded_ObservedValues()) { ## Fit mvnorm ## - fit.mvnorm <- mvnormFitting(dds.extraction$beta) + list_fit.mvnorm <- getListMvnormFit(dds.extraction$beta) ##### Ground truth ###### beta.actual <- getBetaforSimulation( n_genes, n_genotypes, - fit.mvnorm + list_fit.mvnorm ) - + + ##### build input for simulation #### model.matx <- getModelMatrix() log_qij <- getLog_qij(beta.actual, model.matx) @@ -81,7 +82,7 @@ rnaMock <- function(n_genes, design <- summariseDesign(countTable) actualParam <- list( dispersion = dispersion.matrix, - beta = beta.actual, mvnorm = fit.mvnorm + beta = beta.actual, listMvrnom = list_fit.mvnorm ) return(list( design = design, countTable = countTable, -- GitLab