From b044f30486a6ab862f280e6cfe9874cedd09c247 Mon Sep 17 00:00:00 2001
From: aduvermy <arnaud.duvermy@ens-lyon.fr>
Date: Sat, 14 Jan 2023 22:08:22 +0100
Subject: [PATCH] DESEQ + GLM fonctionnal !

---
 src/model.fit/HTRfit/R/model_fitting.R        |  56 +++--
 src/model.fit/HTRfit/R/preprocessing.R        |  48 ++--
 src/model.fit/HTRfit/R/wald_test.R            |  48 ++--
 src/model.fit/HTRfit/devtools_history.R       |   3 +-
 src/v3/HTRsim/R/countsGenerator.R             |  53 +++--
 src/v3/HTRsim/R/dds_manipulations.R           | 100 +++++++++
 src/v3/HTRsim/R/embeddedfiles_manipulations.R | 103 +++++++++
 src/v3/HTRsim/R/evaluation.R                  | 210 ++++++++++--------
 src/v3/HTRsim/R/loadEmbeddedFiles.R           |  50 -----
 src/v3/HTRsim/R/manipulationsDDS_obj.R        |  70 ------
 .../HTRsim/R/manipulations_multinormDistrib.R |  25 ---
 .../HTRsim/R/multinormDistrib_manipulations.R |  47 ++++
 src/v3/HTRsim/R/utils.R                       |  68 ++++++
 src/v3/HTRsim/R/workflow.R                    |  11 +-
 14 files changed, 556 insertions(+), 336 deletions(-)
 create mode 100644 src/v3/HTRsim/R/dds_manipulations.R
 create mode 100644 src/v3/HTRsim/R/embeddedfiles_manipulations.R
 delete mode 100644 src/v3/HTRsim/R/loadEmbeddedFiles.R
 delete mode 100644 src/v3/HTRsim/R/manipulationsDDS_obj.R
 delete mode 100644 src/v3/HTRsim/R/manipulations_multinormDistrib.R
 create mode 100644 src/v3/HTRsim/R/multinormDistrib_manipulations.R
 create mode 100644 src/v3/HTRsim/R/utils.R

diff --git a/src/model.fit/HTRfit/R/model_fitting.R b/src/model.fit/HTRfit/R/model_fitting.R
index 9c23dc0..1a6572a 100644
--- a/src/model.fit/HTRfit/R/model_fitting.R
+++ b/src/model.fit/HTRfit/R/model_fitting.R
@@ -14,16 +14,16 @@ launch.glm <- function(data2fit,
                        model2fit = k_ij ~ genotype + environment + genotype:environment,
                        fit_by = "gene_id",
                        threads = 4) {
-    iteration_list <- data2fit[fit_by] %>%
+    data2fit <- data2fit %>% as.data.frame()
+    iteration_list <- data2fit[, fit_by] %>%
         unique() %>%
-        as.vector() %>%
-        unname() %>%
-        .[[1]]
+        unlist() %>%
+        unname() ## get list gene
     future::plan(multisession, workers = threads)
     fit_list <- iteration_list %>% furrr::future_map(
         .x = .,
         ~ fit.glm(
-            data2fit[data2fit[fit_by] == .x, ],
+            data2fit[which(data2fit[, fit_by] == .x), ],
             .x,
             model2fit
         )
@@ -42,28 +42,53 @@ launch.glm <- function(data2fit,
 #'
 #' @examples
 fit.glm <- function(data, id, model2fit) {
-    fit <- MASS::glm.nb(model2fit, data = data, link = log)
-    fit.dtf <- tidySummary(fit)
-    fit.dtf$inference <- fit.dtf$inference %>% dplyr::mutate(gene_id = id)
-    fit.dtf$fitQuality <- fit.dtf$fitQuality %>% dplyr::mutate(gene_id = id)
-    return(fit.dtf)
+    tryCatch(
+        {
+            fit <- MASS::glm.nb(model2fit, data = data, link = log)
+            fit.dtf <- tidySummary(fit)
+            fit.dtf$inference <- fit.dtf$inference %>% dplyr::mutate(gene_id = id)
+            fit.dtf$fitQuality <- fit.dtf$fitQuality %>% dplyr::mutate(gene_id = id)
+            fit.dtf$dispersion <- list( gene_id = id, dispersion.estimate = fit$theta ) %>% as.data.frame()
+
+            return(fit.dtf)
+        },
+        error = function(cnd) {
+            inference <- list(gene_id = id, estimate = NA, std.error = NA, term=  NA) %>% as.data.frame()
+            fitQuality <- list(null.deviance= NA,df.null = NA, logLik = NA,AIC = NA ,BIC=NA,deviance = NA,df.residual=NA,nobs = NA,gene_id = id) %>% as.data.frame()
+            dispersion <- list( gene_id = id, dispersion.estimate = NA ) %>% as.data.frame()
+            fit.dtf <- list(inference = inference, fitQuality = fitQuality, dispersion) 
+            
+            return(fit.dtf)
+
+        }
+    )
 }
 
 #' fit model on data
 #' @param fit
-#' @import broom
+#' @param modelType
+#' @import broom.mixed
+#' @import dplyr
 #' @return list of element
 #' @export
 #'
 #' @examples
-tidySummary <- function(fit) {
-    dtf <- broom::tidy(fit)
+tidySummary <- function(fit, modelType = "glm") {
+    dtf <- broom.mixed::tidy(fit)
+    if (modelType == "glm"){
+    dtf <- dtf %>%
+        dplyr::select(estimate, std.error, term)
+    }
+    if (modelType == "glmm"){
+    dtf <- dtf %>%
+        dplyr::select(effect, group, estimate, std.error, term)
+    }
     gl <- broom::glance(fit)
     return(list(inference = dtf, fitQuality = gl))
 }
 
 #' convert list to dataframe
-#' @param fit
+#' @param list_fit
 #' @return list of dtf
 #' @export
 #'
@@ -72,5 +97,6 @@ listFit2dtf <- function(list_fit) {
     tmp <- do.call(cbind, list_fit)
     inference.dtf <- do.call(rbind, tmp[1, ])
     fitQuality.dtf <- do.call(rbind, tmp[2, ])
-    return(list(inference = inference.dtf, fitQuality = fitQuality.dtf))
+    dispersion.dtf <- do.call(rbind, tmp[3, ])
+    return(list(inference = inference.dtf, fitQuality = fitQuality.dtf, dispersion = dispersion.dtf ))
 }
diff --git a/src/model.fit/HTRfit/R/preprocessing.R b/src/model.fit/HTRfit/R/preprocessing.R
index 045ff07..96fc399 100644
--- a/src/model.fit/HTRfit/R/preprocessing.R
+++ b/src/model.fit/HTRfit/R/preprocessing.R
@@ -1,33 +1,25 @@
-#' get df2fit for a given gene
-#' @param k_ij 
-#' @param design_simulation
-#' @param gene_name
-#' @import dplyr
-#' @return dataframe 
-#' @export
-#'
-#' @examples
-getdf2fit <- function(k_ij , design_simulation, gene_name){
-  df_gene_i = cbind(design_simulation, k_ij) %>% 
-                        dplyr::mutate(gene_id = gene_name) %>% 
-                        dplyr::mutate(gene_id = gene_name)
-  rownames(df_gene_i) <- NULL
-  return(df_gene_i)
-}   
-
-#' get df2fit for a given gene
-#' @param countTable 
+#' get df2fit 
+#' @param countTable
 #' @param experimental_design
-#' @import purrr
-#' @return dataframe 
+#' @import data.table
+#' @import reshape2
+#' @import tibble
+#' @return dataframe
 #' @export
 #'
 #' @examples
-reshapeCounTable<- function(countTable, experimental_design ){
+reshapeCounTable <- function(countTable, experimental_design) {
+  countTable = countTable %>% data.frame()
+  countTable.long <- countTable %>%
+    tibble::rownames_to_column("gene_id") %>%
+    reshape2::melt(
+      id.vars = "gene_id",
+      value.name = "k_ij",
+      variable.name = "sample_id"
+    )
+  countTable2join <- data.table::data.table(countTable.long, key = "sample_id")
+  experimentalDesign2join <- data.table::data.table(experimental_design, key = "sample_id")
+  data2fit <- countTable2join[experimentalDesign2join]
 
-    gene_id_list = rownames( countTable )
-    list_df2fit = purrr::map( .x = gene_id_list, ~getdf2fit( countTable[ .x, ], 
-                                           experimental_design, .x) )
-    data2fit = do.call( rbind, list_df2fit )
-    return(data2fit)
-}
\ No newline at end of file
+  return(data2fit)
+}
diff --git a/src/model.fit/HTRfit/R/wald_test.R b/src/model.fit/HTRfit/R/wald_test.R
index 2a173a5..f63a0fa 100644
--- a/src/model.fit/HTRfit/R/wald_test.R
+++ b/src/model.fit/HTRfit/R/wald_test.R
@@ -1,37 +1,45 @@
 #' fit model on data
-#' @param W 
-#' @param threshold
-#' @param alternative
+#' @param W
+#' @param altHypothesis
 #' @import stats
-#' @return pvalue 
+#' @return pvalue
 #' @export
 #'
 #' @examples
-wald_test <- function( W, threshold = 0, alternative = "two.sided"){
-    ## two.sided : P ( X > threshold ) + P ( X < threshold )
-    pval = 2*(stats::pnorm( W, mean = 0, sd = 1, lower.tail = FALSE))
-
-    ## greater : P ( X > threshold )
-    pval = stats::pnorm( W, mean = 0, sd = 1, lower.tail = FALSE)
-    
-    ## less : P ( X <= threshold )
-    pval = stats::pnorm( W , mean = 0, sd = 1, lower.tail = TRUE)
+wald_test <- function(w, altHypothesis = "greaterAbs") {
+    if (altHypothesis == c("greaterAbs")) {
+        ## greaterAbs
+        pval <- 2 * (stats::pnorm(w, mean = 0, sd = 1, lower.tail = FALSE))
+        pval[pval > 1] <- 1
+    }
+    if (altHypothesis %in% c("greater", "lessAbs")) {
+        pval <- stats::pnorm(w, mean = 0, sd = 1, lower.tail = FALSE)
+        pval[pval > 1] <- 1
+    }
     return(pval)
 }
 
 
 #' fit model on data
-#' @param estimate 
+#' @param estimate
 #' @param threshold
 #' @param stdError
-#' @return W 
+#' @param altHypothesis
+#' @return W
 #' @export
 #'
 #' @examples
-getStatisticWaldTest<- function(estimate,stdError, threshold ){
+getStatisticWaldTest <- function(estimate, stdError, threshold, altHypothesis = "greaterAbs") {
     ## cf https://en.wikipedia.org/wiki/Wald_test
-    return( ( estimate - threshold ) / stdError )
+    stopifnot(threshold >= 0)
+    if (altHypothesis == "greaterAbs") {
+        wald_stat <- (abs(estimate) - threshold) / stdError
+    }
+    if (altHypothesis == "lessAbs") {
+        wald_stat <- (threshold - abs(estimate)) / stdError
+    }
+    if (altHypothesis == "greater") {
+        wald_stat <- (estimate - threshold) / stdError
+    }
+    return(wald_stat)
 }
-
-
-
diff --git a/src/model.fit/HTRfit/devtools_history.R b/src/model.fit/HTRfit/devtools_history.R
index 269a5fe..16f62e9 100644
--- a/src/model.fit/HTRfit/devtools_history.R
+++ b/src/model.fit/HTRfit/devtools_history.R
@@ -6,7 +6,8 @@ usethis::use_package("furrr")
 usethis::use_package("MASS")
 usethis::use_package("future")
 usethis::use_package("stats")
-usethis::use_package("broom")
+usethis::use_package("broom.mixed")
+usethis::use_package("tibble")
 
 
 
diff --git a/src/v3/HTRsim/R/countsGenerator.R b/src/v3/HTRsim/R/countsGenerator.R
index e51f78c..89dd073 100644
--- a/src/v3/HTRsim/R/countsGenerator.R
+++ b/src/v3/HTRsim/R/countsGenerator.R
@@ -2,44 +2,47 @@
 #'
 #' @param n_genes  an integer
 #' @param n_genotype A int.
-#' @param mvrnorm.fit an object fit mvronorm
-#' @fixIntercept
-#' @fixBetaE
+#' @param mvrnormFit_list an object fit mvronorm
+#' @param fixIntercept
+#' @param fixBetaE
+#' @param n_clusters
 #' @import MASS
 #' @import dplyr
+#' @import purrr
 #' @return a dataframe
 #' @export
 #'
 #' @examples
-getBetaforSimulation <- function(n_genes = 100, n_genotypes = 20, mvrnorm.fit, fixIntercept = TRUE, fixBetaE = TRUE) {
-  message("Get actuals beta for each genes ...")
-  ##### Sampling from mvnorm ########
-  n_samplings <- n_genes * n_genotypes
-  beta.matrix <- MASS::mvrnorm(
-    n = n_samplings,
-    mu = mvrnorm.fit$mu,
-    Sigma = mvrnorm.fit$sigma
-  )
-
+getBetaforSimulation <- function(n_genes = 100, n_genotypes = 20, mvrnormFit_list, fixIntercept = TRUE, fixBetaE = TRUE, n_clusters = 5) {
+  ## -- Sampling from mvnorm -- ##
+  list_idxFit <- sample(1:n_clusters, replace = TRUE, size = n_genes)
+  f <- purrr::map(.x = list_idxFit, ~ MASS::mvrnorm(
+    n = n_genotypes,
+    mu = mvrnormFit_list[[.x]]$mu,
+    Sigma = mvrnormFit_list[[.x]]$sigma
+  ))
+  beta.matrix <- do.call("rbind", f)
+  ## -- Build lovely dataframe & annotations -- ##
   genes_vec <- base::paste("gene", 1:n_genes, sep = "")
   genotype_vec <- base::paste("G", 0:(n_genotypes - 1), sep = "")
   genotype <- genotype_vec %>% rep(time = n_genes)
   gene_id <- rep(genes_vec, each = n_genotypes)
+  idx_mvrnom <- rep(list_idxFit, each = n_genotypes) ## -- saving the mvnorm distribution idx
   beta.dtf <- beta.matrix %>% data.frame()
   colnames(beta.dtf) <- c("(Intercept)", "betaG", "betaE", "betaGE")
-  res <- cbind(gene_id, genotype, beta.dtf)
-  ## fixing Genotype and GxE effects to 0 for G0 (reference) 
+  res <- cbind(gene_id, genotype, beta.dtf, idx_mvrnom)
+  ## fixing Genotype and GxE effects to 0 for G0 (reference)
   ## -> hide in (Intercept)
   res[res$genotype == "G0", c("betaG", "betaGE")] <- 0
   if (fixIntercept) {
-  res <- res %>% 
-    dplyr::group_by(gene_id) %>% 
-    dplyr::mutate("(Intercept)" = mean(`(Intercept)`)) 
+    res <- res %>%
+      dplyr::group_by(gene_id) %>%
+      dplyr::mutate("(Intercept)" = mean(`(Intercept)`))
   }
   if (fixBetaE) {
-  res <- res %>% 
-    dplyr::group_by(gene_id) %>% 
-    dplyr::mutate(betaE = mean(betaE)) 
+    res <- res %>%
+      dplyr::group_by(gene_id) %>%
+      dplyr::mutate(betaE = mean(betaE))
   }
   return(res)
 }
@@ -125,7 +128,7 @@ getMu_ij <- function(log_qij.dtf, size_factor) {
 getGenesDispersions <- function(n_genes, sample_id_list,
                                 dispersion.vec, dispUniform_btweenCondition = T) {
   if (dispUniform_btweenCondition == T) {
-    message("Get dispersion for each genes ...\n")
+    # --Get dispersion for each genes
 
     gene_dispersion.dtf <- base::sample(dispersion.vec,
       replace = T,
@@ -136,7 +139,7 @@ getGenesDispersions <- function(n_genes, sample_id_list,
     rownames(gene_dispersion.dtf) <- base::paste("gene", 1:(n_genes), sep = "")
     colnames(gene_dispersion.dtf) <- sample_id_list
   } else {
-    message("Get dispersion for each genes within each conditions ...\n")
+    # -- Get dispersion for each genes within each conditions
 
     replication_table <- sample_ids %>%
       stringr::str_replace(., pattern = "_[0-9]+", "") %>%
@@ -148,7 +151,9 @@ getGenesDispersions <- function(n_genes, sample_id_list,
     colnames(gene_dispersion.dtf) <- sample_ids
     rownames(gene_dispersion.dtf) <- base::paste("gene", 1:(n_genes), sep = "")
   }
-  return(gene_dispersion.dtf %>% as.matrix())
+  gene_dispersion.mtx <- gene_dispersion.dtf %>% as.matrix()
+  gene_dispersion.mtx <- gene_dispersion.mtx[ order(row.names(gene_dispersion.mtx)), ]
+  return(gene_dispersion.mtx)
 }
 
 
diff --git a/src/v3/HTRsim/R/dds_manipulations.R b/src/v3/HTRsim/R/dds_manipulations.R
new file mode 100644
index 0000000..4ac353f
--- /dev/null
+++ b/src/v3/HTRsim/R/dds_manipulations.R
@@ -0,0 +1,100 @@
+#' Launch Deseq
+#'
+#' @param tabl_cnts table containing counts per genes & samples
+#' @param bioDesign table describing bioDesgin of input
+#' @import DESeq2
+#' @import dplyr
+#' @return DESEQ2 object
+#' @export
+#'
+#' @examples
+fit_deseq <- function(tabl_cnts, bioDesign) {
+    model <- ~ genotype + environment + genotype:environment
+    tabl_cnts <- tabl_cnts %>% as.data.frame()
+    tabl_cnts <- tabl_cnts %>%
+        dplyr::mutate(
+            across(where(is.double), as.integer)
+        )
+    dds <- DESeq2::DESeqDataSetFromMatrix(
+        countData = round(tabl_cnts),
+        colData = bioDesign,
+        design = model
+    )
+    dds <- DESeq2::DESeq(dds, quiet = TRUE)
+    return(dds)
+}
+
+
+#' Get deseq results
+#'
+#' @param dds_obj  dds object obtain on simulation
+#' @import S4Vectors
+#' @import dplyr
+#' @import tibble
+#' @return a dataframe of dispersion obtained with deseq2
+#' @export
+#'
+#' @examples
+getDispersionFromDDS <- function(dds_obj) {
+    disp <- S4Vectors::mcols(dds_obj)$dispersion
+    gene_id <- rownames(S4Vectors::mcols(dds_obj))
+    names(disp) <- gene_id
+    disp.dtf <- disp %>%
+        data.frame() %>%
+        dplyr::rename(., dispersion.estimate = .) %>%
+        tibble::rownames_to_column("gene_id")
+    return(disp.dtf)
+}
+
+
+#' Get deseq results
+#'
+#' @param dds_obj  dds object obtain on simulation
+#' @param threads
+#' @import stringr
+#' @import dplyr
+#' @import future
+#' @import DESeq2
+#' @import furrr
+#' @return a dataframe of beta obtained with deseq2
+#' @export
+#'
+#' @examples
+getCoefficientsFromDds <- function(dds_obj, threads = 4) { # nolint
+    listBeta <- DESeq2::resultsNames(dds_obj)
+    future::plan(multisession, workers = threads)
+    res <- listBeta %>% furrr::future_map(
+        .x = .,
+        ~ DESeq2::results(dds_obj,
+            contrast = list(.x),
+            # lfcThreshold = threshold, /!\ statistic & pvalue resestimate later
+            # altHypothesis = altH,
+            tidy = TRUE
+        ) %>%
+            dplyr::select(-baseMean) %>%
+            dplyr::mutate(term = .x) %>%
+            dplyr::rename(
+                estimate = log2FoldChange,
+                std.error = lfcSE,
+                statistic = stat,
+                p.value = pvalue,
+                gene_id = row
+            ),
+        .options = furrr_options(seed = TRUE)
+    )
+    deseq_inference <- do.call("rbind", res)
+    deseq_inference.dtf <- deseq_inference %>%
+        dplyr::mutate(
+            term = term %>%
+                stringr::str_replace("Intercept", "(Intercept)") %>%
+                stringr::str_replace("_vs_G0", "") %>%
+                stringr::str_replace("_vs_E0", "") %>%
+                stringr::str_replace_all("_", "") %>%
+                stringr::str_replace("\\.", ":")
+        )
+
+    deseq_inference.dtf <- deseq_inference.dtf %>%
+        select(gene_id, estimate, std.error, term)
+
+    return(deseq_inference.dtf)
+}
\ No newline at end of file
diff --git a/src/v3/HTRsim/R/embeddedfiles_manipulations.R b/src/v3/HTRsim/R/embeddedfiles_manipulations.R
new file mode 100644
index 0000000..3fc9891
--- /dev/null
+++ b/src/v3/HTRsim/R/embeddedfiles_manipulations.R
@@ -0,0 +1,103 @@
+#' Extract beta distribution from DESEQ2 object
+#'
+#' @param dds_obj a DESEQ2 object
+#' @import S4Vectors
+#' @import tidyr
+#' @import stats
+#' @return a list containing 1- mean and sd of BetaG 2- mean and sd of BetaE 3- mean and sd of BetaGE 5- mean and sd of gene dispersion
+#' @export
+#'
+#' @examples
+extraction_embeddedDds <- function(dds_obj) {
+  ## Beta
+  dds.mcols <- S4Vectors::mcols(dds_obj, use.names = TRUE)
+  Intercept <- dds.mcols$Intercept
+  betaG <- dds.mcols$genotype_RM11_vs_GSY147
+  betaE <- dds.mcols$environment_treated_vs_untreated
+  betaGE <- dds.mcols$genotypeRM11.environmenttreated
+  beta.dtf <- cbind(Intercept, betaG, betaE, betaGE) %>%
+    as.data.frame() %>%
+    tidyr::drop_na()
+  colnames(beta.dtf) <- c("(Intercept)", "betaG", "betaE", "betaGE")
+
+  ## Dispersion
+  gene_disp <- dds.mcols$dispersion %>% stats::na.omit()
+
+
+  return(list(
+    beta = beta.dtf,
+    gene_dispersion = gene_disp
+  ))
+}
+
+#' Extract beta distribution from DESEQ2 object
+#'
+#' @import dplyr
+#' @import utils
+#' @return output of dds extraction
+#' @export
+#'
+#' @examples
+embedded_CounTable2observedValues <- function() {
+  tabl_cnts <- loadEmbedded_CounTable()
+  bioDesign <- loadEmbedded_design()
+  ## Launch DESEQ2
+  dds <- fit_deseq(tabl_cnts, bioDesign = bioDesign)
+  ## Extract
+  dds.extraction <- extraction_embeddedDds(dds_obj = dds)
+
+  return(dds.extraction)
+}
+
+
+
+#' Load beta dtf embedded in package
+#'
+#' @return an object dds.Extraction
+#' @export
+#'
+#' @examples
+loadEmbedded_ObservedValues <- function() {
+  ## Import public beta observed
+  fn <- system.file("extdata/", "SRP217588_YM_observedParams.rds", package = "HTRsim")
+  dds.extraction <- readRDS(file = fn)
+  return(dds.extraction)
+}
+
+#' Load public counts table
+#'
+#' @import dplyr
+#' @import utils
+#' @return dataframe
+#' @export
+#'
+#' @examples
+loadEmbedded_CounTable <- function() {
+  ## Import public counts table
+  fn <- system.file("extdata/", "SRP217588_YM_vkallisto.tsv", package = "HTRsim")
+  tabl_cnts <- utils::read.table(file = fn, header = TRUE)
+  rownames(tabl_cnts) <- tabl_cnts$gene_id
+  tabl_cnts <- tabl_cnts %>% dplyr::select(-gene_id) ## suppr colonne GeneID
+  tabl_cnts <- tabl_cnts[order(tabl_cnts %>% rownames()), ]
+  tabl_cnts <- tabl_cnts %>% dplyr::select(!matches("ru_rm_5"))
+  return(tabl_cnts)
+}
+
+#' Load public design
+#'
+#' @import dplyr
+#' @import utils
+#' @return dataframe
+#' @export
+#'
+#' @examples
+loadEmbedded_design <- function() {
+  ## DESIGN
+  fn <- system.file("extdata/", "SRP217588_YM_bioDesign.csv", package = "HTRsim")
+  bioDesign <- utils::read.table(file = fn, header = TRUE, sep = ";")
+  ## defining reference
+  bioDesign$genotype <- factor(x = bioDesign$genotype, levels = c("GSY147", "RM11"))
+  bioDesign$environment <- factor(x = bioDesign$environment, levels = c("untreated", "treated"))
+  bioDesign <- bioDesign %>% dplyr::filter(!str_detect(sample, "ru_rm_5"))
+  return(bioDesign)
+}
diff --git a/src/v3/HTRsim/R/evaluation.R b/src/v3/HTRsim/R/evaluation.R
index 0c974a8..60a48ad 100644
--- a/src/v3/HTRsim/R/evaluation.R
+++ b/src/v3/HTRsim/R/evaluation.R
@@ -1,130 +1,141 @@
-#' Get deseq results
-#'
-#' @param dds_simu.mcols  dds object obtain on simulation
-#' @param threads
-#' @import stringr
-#' @import dplyr
-#' @import future
-#' @import DESeq2
-#' @import furrr
-#' @return a dataframe of beta obtained with deseq2
-#' @export
-#'
-#' @examples
-deseq_fitExtraction <- function(dds_simu, threads = 4) {
-    listBeta <- DESeq2::resultsNames(dds_simu)
-    future::plan(multisession, workers = threads)
-    res <- listBeta %>% furrr::future_map(
-        .x = .,
-        ~ DESeq2::results(dds_simu,
-            contrast = list(.x),
-            tidy = TRUE
-        ) %>%
-            dplyr::select(-baseMean) %>%
-            dplyr::mutate(term = .x) %>%
-            dplyr::rename(
-                estimate = log2FoldChange,
-                std.error = lfcSE,
-                statistic = stat,
-                p.value = pvalue,
-                gene_id = row
-            ),
-        .options = furrr_options(seed = TRUE)
-    )
-    deseq_inference <- do.call("rbind", res)
-    deseq_inference.dtf <- deseq_inference %>% mutate(term = term %>%
-        stringr::str_replace("Intercept", "(Intercept)") %>%
-        stringr::str_replace("_vs_G0", "") %>%
-        stringr::str_replace("_vs_E0", "") %>%
-        stringr::str_replace_all("_", "") %>%
-        stringr::str_replace("\\.", ":"))
-
-    return(deseq_inference.dtf)
-}
-
-
-
-
-#' Get deseq results
+#' Get prediction results
 #'
-#' @param inference a obj output of [deseq, glm, glmm]
-#' @param actual parameters used during simulation
-#' @param threads nb of threads
+#' @param inference.dtf 
 #' @param threshold
 #' @param alphaRisk
-#' @import reshape2
+#' @param postInferenceSelection
+#' @param altH
+#' @param pvalCorrection
 #' @import dplyr
-#' @import data.table
-#' @return a dataframe to compare actual and infered betas
+#' @import HTRfit
+#' @import rstatix
+#' @return a dataframe to compare actual and infered betas with annotations
 #' @export
 #'
 #' @examples
-getDfComparison <- function(inference, actual, threads = 4) {
-    actual.dtf <- actual %>%
-        reshape2::melt(
-            id = c("gene_id", "genotype"),
-            value.name = "value", variable.name = "beta"
-        ) %>%
-        dplyr::group_by(gene_id, beta) %>%
+getPrediction <- function(inference.dtf, threshold = 0, alphaRisk = 0.05, altH = "greaterAbs", pvalCorrection = TRUE, postInferenceSelection = FALSE) {
+    ###### embedded functions ###########
+    labelling_gene <- function(altHy, coeff, threshold, proba, alphaRisk, postInferenceSelection = F) {
+        if (postInferenceSelection) {
+            if (altHy == "greaterAbs") label <- ifelse((abs(coeff) > threshold & proba < alphaRisk), "DE", "nonDE")
+            if (altHy == "greater") label <- ifelse((coeff > threshold & proba < alphaRisk), "DE", "nonDE")
+            if (altHy == "lowerAbs") label <- ifelse((abs(coeff) < threshold & proba < alphaRisk), "DE", "nonDE")
+        } else {
+            label <- ifelse(proba < alphaRisk, "DE", "nonDE")
+        }
+        return(label)
+    }
+    ############# WALD TEST #######
+    inference.dtf <- inference.dtf %>%
         dplyr::mutate(
-            actual.value =
-                dplyr::if_else(beta %in% c("(Intercept)", "betaE"),
-                    mean(value), value
+            statistic =
+                dplyr::case_when(
+                    postInferenceSelection == TRUE ~
+                        HTRfit::getStatisticWaldTest(
+                            estimate = estimate, stdError = std.error,
+                            threshold = 0, altHypothesis = altH
+                        ),
+                    postInferenceSelection == FALSE ~
+                        HTRfit::getStatisticWaldTest(
+                            estimate = estimate, stdError = std.error,
+                            threshold = threshold, altHypothesis = altH
+                        )
                 )
         ) %>%
-        ungroup() %>%
-        dplyr::select(-value) %>%
         dplyr::mutate(
-            term =
+            p.value = HTRfit::wald_test(w = statistic, altHypothesis = altH)
+        ) %>%
+        rstatix::adjust_pvalue(p.col = "p.value", method = "fdr", output.col = "padj")
+
+
+    ############# PREDICTION #############
+    inference.dtf <- inference.dtf %>%
+        dplyr::mutate(
+            prediction.label =
                 dplyr::case_when(
-                    beta == "(Intercept)" ~ "(Intercept)",
-                    beta == "betaG" ~ paste("genotype", genotype, sep = ""),
-                    beta == "betaE" ~ paste("environment", "E1", sep = ""),
-                    beta == "betaGE" ~ paste("genotype", genotype, ":environmentE1", sep = "")
+                    pvalCorrection == TRUE ~
+                        labelling_gene(altH, estimate, threshold, padj, alphaRisk, postInferenceSelection),
+                    pvalCorrection == FALSE ~
+                        labelling_gene(altH, estimate, threshold, p.value, alphaRisk, postInferenceSelection),
                 )
         )
-
-    actual.dtf <- actual.dtf %>% dplyr::select(-genotype)
-    actual_undup.dtf <- actual.dtf[!duplicated(actual.dtf), ]
-    actual2join.dtf <- data.table::data.table(actual_undup.dtf, key = c("gene_id", "term"))
-
-    inference.dtf <- deseq_fitExtraction(inference, threads = threads)
-    inference2join.dtf <- data.table::data.table(inference.dtf, key = c("gene_id", "term"))
-
-    joined.dtf <- actual2join.dtf[inference2join.dtf]
-    return(joined.dtf)
+    return(inference.dtf)
 }
 
-
-#' Get deseq results
+#' Get expectated results
 #'
-#' @param comparison.Dtf a obj output of getDfComparison
+#' @param actual.dtf
 #' @param threshold
-#' @param alphaRisk
+#' @param altH
 #' @import dplyr
+#' @import reshape2
 #' @return a dataframe to compare actual and infered betas with annotations
 #' @export
 #'
 #' @examples
-getAnnotation <- function(comparison.dtf, threshold = 0, alphaRisk = 0.05) {
-    ## Post inference selection
-    comparison.dtf <- comparison.dtf %>%
-        dplyr::mutate(
-            prediction.label =
-                dplyr::if_else((
-                    abs(estimate) > threshold & padj < alphaRisk),
-                "DE", "nonDE"
-                )
-        )
+getExpectation <- function(actual.dtf, threshold = 0, altH = "greaterAbs") {
+    reshapeActualDtf <- function(actual.dtf) {
+        actual.dtf <- actual.dtf %>%
+            reshape2::melt(
+                id = c("gene_id", "genotype", "idx_mvrnom"),
+                value.name = "value", variable.name = "beta"
+            ) %>%
+            dplyr::group_by(gene_id, beta) %>%
+            dplyr::mutate(
+                actual.value =
+                    dplyr::if_else(beta %in% c("(Intercept)", "betaE"),
+                        mean(value), value
+                    )
+            ) %>%
+            ungroup() %>%
+            dplyr::select(-value) %>%
+            dplyr::mutate(
+                term =
+                    dplyr::case_when(
+                        beta == "(Intercept)" ~ "(Intercept)",
+                        beta == "betaG" ~ paste("genotype", genotype, sep = ""),
+                        beta == "betaE" ~ paste("environment", "E1", sep = ""),
+                        beta == "betaGE" ~ paste("genotype", genotype, ":environmentE1", sep = "")
+                    )
+            )
+        actual.dtf <- actual.dtf %>% dplyr::select(-genotype)
+        actual_undup.dtf <- actual.dtf[!duplicated(actual.dtf), ]
+        return(actual_undup.dtf)
+    }
 
-    comparison.dtf <- comparison.dtf  %>%
+    actual.reshaped <- reshapeActualDtf(actual.dtf)
+
+    ####### ACTUAL LABEL #########
+    actual.reshaped.annot <- actual.reshaped %>%
         dplyr::mutate(
             actual.label =
                 dplyr::if_else(abs(actual.value) < threshold,
                     "nonDE", "DE"
                 )
         )
+    
+    return(actual.reshaped.annot)
+}
 
+
+#' Get deseq results
+#'
+#' @param inference.dtf a obj output of [deseq, glm, glmm]
+#' @param actual.dtf parameters used during simulation
+#' @import dplyr
+#' @import data.table
+#' @return a dataframe to compare actual and infered betas
+#' @export
+#'
+#' @examples
+getComparison <- function(actual.dtf, inference.dtf){
+
+
+    actual2join.dtf <- data.table::data.table(actual.dtf, key = c("gene_id", "term"))
+    inference2join.dtf <- data.table::data.table(inference.dtf, key = c("gene_id", "term"))
+    comparison.dtf <- actual2join.dtf[inference2join.dtf]
+
+    ########## COMPARING ACTUAL & PREDICTION ######
     comparison.dtf <- comparison.dtf %>% dplyr::mutate(
         annotation =
             dplyr::case_when(
@@ -133,5 +144,8 @@ getAnnotation <- function(comparison.dtf, threshold = 0, alphaRisk = 0.05) {
                 TRUE ~ "FALSE"
             )
     )
-    return(comparison.dtf)
-}
+
+    return(comparison.dtf %>% as.data.frame() )
+
+
+}
\ No newline at end of file
diff --git a/src/v3/HTRsim/R/loadEmbeddedFiles.R b/src/v3/HTRsim/R/loadEmbeddedFiles.R
deleted file mode 100644
index da61905..0000000
--- a/src/v3/HTRsim/R/loadEmbeddedFiles.R
+++ /dev/null
@@ -1,50 +0,0 @@
-#' Load beta dtf embedded in package
-#'
-#' @return an object dds.Extraction
-#' @export
-#'
-#' @examples
-loadObservedValues <- function() {
-  ## Import public beta observed
-  fn <- system.file("extdata/", "SRP217588_YM_observedParams.rds", package = "HTRsim")
-  dds.extraction <- readRDS(file = fn)
-  return(dds.extraction)
-}
-
-#' Load public counts table
-#'
-#' @import dplyr
-#' @import utils
-#' @return dataframe
-#' @export
-#'
-#' @examples
-loadPubliCounTable <- function() {
-  ## Import public counts table
-  fn <- system.file("extdata/", "SRP217588_YM_vkallisto.tsv", package = "HTRsim")
-  tabl_cnts <- utils::read.table(file = fn, header = TRUE)
-  rownames(tabl_cnts) <- tabl_cnts$gene_id
-  tabl_cnts <- tabl_cnts %>% dplyr::select(-gene_id) ## suppr colonne GeneID
-  tabl_cnts <- tabl_cnts[order(tabl_cnts %>% rownames()), ]
-  tabl_cnts <- tabl_cnts %>% dplyr::select(!matches("ru_rm_5"))
-  return(tabl_cnts)
-}
-
-#' Load public design
-#'
-#' @import dplyr
-#' @import utils
-#' @return dataframe
-#' @export
-#'
-#' @examples
-loadPublicDesign <- function() {
-  ## DESIGN
-  fn <- system.file("extdata/", "SRP217588_YM_bioDesign.csv", package = "HTRsim")
-  bioDesign <- utils::read.table(file = fn, header = TRUE, sep = ";")
-  ## defining reference
-  bioDesign$genotype <- factor(x = bioDesign$genotype, levels = c("GSY147", "RM11"))
-  bioDesign$environment <- factor(x = bioDesign$environment, levels = c("untreated", "treated"))
-  bioDesign <- bioDesign %>% dplyr::filter(!str_detect(sample, "ru_rm_5"))
-  return(bioDesign)
-}
diff --git a/src/v3/HTRsim/R/manipulationsDDS_obj.R b/src/v3/HTRsim/R/manipulationsDDS_obj.R
deleted file mode 100644
index c4a758c..0000000
--- a/src/v3/HTRsim/R/manipulationsDDS_obj.R
+++ /dev/null
@@ -1,70 +0,0 @@
-#' Launch Deseq
-#'
-#' @param tabl_cnts table containing counts per genes & samples
-#' @param bioDesign table describing bioDesgin of input
-#' @param model
-#' @import DESeq2
-#' @import dplyr
-#' @return DESEQ2 object
-#' @export
-#'
-#' @examples
-run.deseq <- function(tabl_cnts, bioDesign, model = ~ genotype + environment + genotype:environment) {
-  message("Run DESEQ2 ...\n")
-  tabl_cnts <- tabl_cnts %>% dplyr::mutate(across(where(is.double), as.integer))
-  dds <- DESeq2::DESeqDataSetFromMatrix(countData = round(tabl_cnts), colData = bioDesign, design = model)
-  dds <- DESeq2::DESeq(dds, quiet = TRUE)
-  return(dds)
-}
-
-
-#' Extract beta distribution from DESEQ2 object
-#'
-#' @param dds_obj a DESEQ2 object
-#' @import S4Vectors
-#' @import tidyr
-#' @import stats
-#' @return a list containing 1- mean and sd of BetaG 2- mean and sd of BetaE 3- mean and sd of BetaGE 5- mean and sd of gene dispersion
-#' @export
-#'
-#' @examples
-ddsExtraction <- function(dds_obj) {
-  ## Beta
-  dds.mcols <- S4Vectors::mcols(dds_obj, use.names = TRUE)
-  Intercept <- dds.mcols$Intercept
-  betaG <- dds.mcols$genotype_RM11_vs_GSY147
-  betaE <- dds.mcols$environment_treated_vs_untreated
-  betaGE <- dds.mcols$genotypeRM11.environmenttreated
-  beta.dtf <- cbind(Intercept, betaG, betaE, betaGE) %>%
-    as.data.frame() %>%
-    tidyr::drop_na()
-  colnames(beta.dtf) <- c("(Intercept)", "betaG", "betaE", "betaGE")
-
-  ## Dispersion
-  gene_disp <- dds.mcols$dispersion %>% stats::na.omit()
-
-
-  return(list(
-    beta = beta.dtf,
-    gene_dispersion = gene_disp
-  ))
-}
-
-#' Extract beta distribution from DESEQ2 object
-#'
-#' @import dplyr
-#' @import utils
-#' @return output of dds extraction
-#' @export
-#'
-#' @examples
-publicDDS_extraction <- function() {
-  tabl_cnts <- loadPubliCounTable()
-  bioDesign <- loadPublicDesign()
-  ## Launch DESEQ2
-  dds <- run.deseq(tabl_cnts, bioDesign = bioDesign)
-  ## Extract
-  dds.extraction <- ddsExtraction(dds_obj = dds)
-
-  return(dds.extraction)
-}
diff --git a/src/v3/HTRsim/R/manipulations_multinormDistrib.R b/src/v3/HTRsim/R/manipulations_multinormDistrib.R
deleted file mode 100644
index 7292102..0000000
--- a/src/v3/HTRsim/R/manipulations_multinormDistrib.R
+++ /dev/null
@@ -1,25 +0,0 @@
-#' Fit mvnorm
-#' @import Rfast
-#' @return output of Rfast::mvnorm.mle
-#' @export
-#'
-#' @examples
-mvnormFitting <- function(beta.dtf) {
-    beta.matx = beta.dtf %>% as.matrix()
-    fit.mvnorm <- Rfast::mvnorm.mle(beta.matx)
-    return(fit.mvnorm)
-}
-
-#' get fit mvnorm from public dataset
-#'
-#' @return output of Rfast::mvnorm.mle
-#' @export
-#'
-#' @examples
-getPublicMvnormFit <- function(){
-    ##### Range of observed value #########
-    dds.extraction = publicDDS_extraction()
-    beta_observed.dtf =  dds.extraction$beta
-    fit.mvnorm = mvnormFitting(beta_observed.dtf)
-    return( fit.mvnorm )
-}
\ No newline at end of file
diff --git a/src/v3/HTRsim/R/multinormDistrib_manipulations.R b/src/v3/HTRsim/R/multinormDistrib_manipulations.R
new file mode 100644
index 0000000..06d2e36
--- /dev/null
+++ b/src/v3/HTRsim/R/multinormDistrib_manipulations.R
@@ -0,0 +1,47 @@
+#' Fit mvnorm
+#' @import Rfast
+#' @return output of Rfast::mvnorm.mle
+#' @export
+#'
+#' @examples
+mvnormFitting <- function(beta.dtf) {
+    beta.matx <- beta.dtf %>% as.matrix()
+    fit.mvnorm <- Rfast::mvnorm.mle(beta.matx)
+    return(fit.mvnorm)
+}
+
+#' get fit mvnorm from public dataset
+#'
+#' @return output of Rfast::mvnorm.mle
+#' @export
+#'
+#' @examples
+getPublicMvnormFit <- function() {
+    ##### Range of observed value #########
+    dds.extraction <- publicDDS_extraction()
+    beta_observed.dtf <- dds.extraction$beta
+    list_fit.mvnorm <- getListMvnormFit(beta_observed.dtf)
+    return(list_fit.mvnorm)
+}
+
+
+#' get fit mvnorm list
+#'
+#' @param beta.dtf
+#' @param n_genes
+#' @import stats
+#' @import purrr
+#' @import dplyr
+#' @return list of Rfast::mvnorm.mle
+#' @export
+#'
+#' @examples
+getListMvnormFit <- function(beta.dtf, n_clusters = 5) {
+    dtf2cluster <- beta.dtf %>% dplyr::select(c("(Intercept)", "betaG", "betaE","betaGE"))
+    cluster_kmean <- stats::kmeans(dtf2cluster, n_clusters)$cluster
+    list_fit.mvnorm <- purrr::map(
+        .x = 1:n_clusters,
+        ~ mvnormFitting(beta.dtf[cluster_kmean == .x, ])
+    )
+    return(list_fit.mvnorm)
+}
diff --git a/src/v3/HTRsim/R/utils.R b/src/v3/HTRsim/R/utils.R
new file mode 100644
index 0000000..afff988
--- /dev/null
+++ b/src/v3/HTRsim/R/utils.R
@@ -0,0 +1,68 @@
+
+
+#' build Venn diagramm
+#'
+#' @param liste_genes
+#' @import ggVennDiagram
+#' @import ggplot2
+#' @return graph venn driagramm
+#' @export
+#'
+#' @examples
+buildVennDiag <- function(liste_gene) {
+    venn <- ggVennDiagram::Venn(liste_gene)
+    data <- ggVennDiagram::process_data(venn)
+    p <- ggplot() +
+        # 1. region count layer
+        geom_sf(aes(fill = id), data = venn_region(data)) +
+        # 2. set edge layer
+        geom_sf(color = "#000000", size = 0.3, data = venn_setedge(data), show.legend = FALSE) +
+        # 3. set label layer
+        geom_sf_text(aes(label = name), data = venn_setlabel(data)) +
+        # 4. region label layer
+        geom_sf_label(
+            aes(
+                label = scales::percent(count / sum(count))
+            ),
+            data = venn_region(data)
+        ) +
+        theme_void() +
+        theme(legend.position = "none")
+
+    return(p)
+}
+
+#' build Venn diagramm
+#'
+#' @param comparisonDTF
+#' @param title
+#' @import dplyr
+#' @import gridExtra
+#' @return graph venn driagramm
+#' @export
+#'
+#' @examples
+getVennDiagramm <- function(comparisonDTF, title = "") {
+    comparisonDTF <- comparisonDTF %>% dplyr::mutate(label = paste(gene_id, term, sep = "_"))
+    coeff_DE <- comparisonDTF %>%
+        dplyr::filter(actual.label == "DE") %>%
+        .$label
+    coeff_nonDE <- comparisonDTF %>%
+        dplyr::filter(actual.label == "nonDE") %>%
+        .$label
+    pred_DE <- comparisonDTF %>%
+        dplyr::filter(prediction.label == "DE") %>%
+        .$label
+    pred_nonDE <- comparisonDTF %>%
+        dplyr::filter(prediction.label == "nonDE") %>%
+        .$label
+    gene_list_DE <- list(DE = coeff_DE, prediction = pred_DE)
+    p <- buildVennDiag(gene_list_DE)
+    p1 <- p + scale_fill_manual(values = c("#00688B", "#528B8B", "#DCDCDC"))
+    gene_list_nonDE <- list(nonDE = coeff_nonDE, prediction = pred_nonDE)
+    p <- buildVennDiag(gene_list_nonDE)
+    p2 <- p + scale_fill_manual(values = c("#458B00", "#9BCD9B", "#DCDCDC"))
+
+    return(gridExtra::arrangeGrob(p1, p2, nrow = 2, top = title))
+}
+
diff --git a/src/v3/HTRsim/R/workflow.R b/src/v3/HTRsim/R/workflow.R
index eeb1ac0..c22b215 100644
--- a/src/v3/HTRsim/R/workflow.R
+++ b/src/v3/HTRsim/R/workflow.R
@@ -40,17 +40,18 @@ rnaMock <- function(n_genes,
                     sequencing_factor = 2,
                     uniformNumberOfReplicates = T,
                     uniformDispersion = T,
-                    dds.extraction = loadObservedValues()) {
+                    dds.extraction = loadEmbedded_ObservedValues()) {
   ## Fit mvnorm ##
-  fit.mvnorm <- mvnormFitting(dds.extraction$beta)
+  list_fit.mvnorm <- getListMvnormFit(dds.extraction$beta)
 
   ##### Ground truth ######
   beta.actual <- getBetaforSimulation(
     n_genes,
     n_genotypes,
-    fit.mvnorm
+    list_fit.mvnorm
   )
-
+  
+  
   ##### build input for simulation ####
   model.matx <- getModelMatrix()
   log_qij <- getLog_qij(beta.actual, model.matx)
@@ -81,7 +82,7 @@ rnaMock <- function(n_genes,
   design <- summariseDesign(countTable)
   actualParam <- list(
     dispersion = dispersion.matrix,
-    beta = beta.actual, mvnorm = fit.mvnorm
+    beta = beta.actual, listMvrnom = list_fit.mvnorm
   )
   return(list(
     design = design, countTable = countTable,
-- 
GitLab