From 701cb79ef8e0873fb84bb79e50287a0ec300536d Mon Sep 17 00:00:00 2001 From: aduvermy <arnaud.duvermy@ens-lyon.fr> Date: Fri, 19 May 2023 09:52:49 +0200 Subject: [PATCH] end of v3 --- reports/2022-12-09_report.Rmd | 799 ++++++++++++++++++ reports/css/modest.css | 219 +++++ reports/css/retro.css | 202 +++++ reports/css/splendor.css | 225 +++++ results/v3/2022-12-04_dev.Rmd | 158 ++++ results/v3/2023_02_10_benchmarking.Rmd | 237 ++++++ results/v3/2023_02_10_benchmarkingv2.Rmd | 174 ++++ results/v3/2023_02_10_benchmarkingv3.Rmd | 161 ++++ .../v3/2023_02_21_benchmarking_distrib.Rmd | 201 +++++ results/v3/2023_02_21_benchmarkingv3.Rmd | 193 +++++ .../2023_02_08_benchmarkingPackages.Rmd | 2 +- .../GLM_mixte/2023_02_08_randomIntercept.Rmd | 26 +- src/model.fit/HTRfit/R/model_fitting.R | 14 +- src/v3/HTRsim/R/countsGenerator.R | 2 +- .../HTRsim/R/multinormDistrib_manipulations.R | 4 +- src/v3/HTRsim/R/workflow.R | 7 +- 16 files changed, 2613 insertions(+), 11 deletions(-) create mode 100644 reports/2022-12-09_report.Rmd create mode 100644 reports/css/modest.css create mode 100644 reports/css/retro.css create mode 100644 reports/css/splendor.css create mode 100644 results/v3/2022-12-04_dev.Rmd create mode 100644 results/v3/2023_02_10_benchmarking.Rmd create mode 100644 results/v3/2023_02_10_benchmarkingv2.Rmd create mode 100644 results/v3/2023_02_10_benchmarkingv3.Rmd create mode 100644 results/v3/2023_02_21_benchmarking_distrib.Rmd create mode 100644 results/v3/2023_02_21_benchmarkingv3.Rmd diff --git a/reports/2022-12-09_report.Rmd b/reports/2022-12-09_report.Rmd new file mode 100644 index 0000000..ca9895e --- /dev/null +++ b/reports/2022-12-09_report.Rmd @@ -0,0 +1,799 @@ +--- +title: "High throughtput RNA-seq " +output: html_document +date: "2022-12-09" +css: + - css/air.css +--- + +```{r setup, message=FALSE, warning=FALSE, include=TRUE, results="hide", include=FALSE} +library(HTRfit) +library(HTRsim) +library(plotROC) +library(gridExtra) +library(ggVennDiagram) +``` + +## RNA-seq and GLM + +Generalized linear models (GLM) are a classic method for analyzing RNA-seq expression data. In contrast to exact tests, GLMs allow for more general comparisons. It provides the ability to analyse complex experiments involving multiple treatment conditions while still taking full account of biological variation. Biological variation between RNA samples is estimated separately from the technical variation + +### Negative binomial parameters + +```{r message=FALSE, warning=FALSE, message=FALSE, warning=FALSE, include=TRUE, echo = FALSE, fig.width = 4, fig.height = 3, fig.align='center'} +a <- rnbinom(10000, size = 10, mu = 100) +mu = rep("mu = 100", 10000 ) +size = rep("size = 0.1", 10000 ) +dtf = cbind(a, mu, size) %>% data.frame() + +a <- rnbinom(10000, size = 100, mu = 100) +mu = rep("mu = 100", 10000 ) +size = rep("size = 100", 10000 ) +tmp = cbind(a, mu, size) %>% data.frame() +dtf = rbind(dtf, tmp) + +a <- rnbinom(10000, size = 10, mu = 1000) +mu = rep("mu = 1000", 10000 ) +size = rep("size = 0.1", 10000 ) +tmp = cbind(a, mu, size) %>% data.frame() +dtf = rbind(dtf, tmp) + +a <- rnbinom(10000, size = 100, mu = 1000) +mu = rep("mu = 1000", 10000 ) +size = rep("size = 100", 10000 ) +tmp = cbind(a, mu, size) %>% data.frame() +dtf = rbind(dtf, tmp) + +dtf$size <- factor(dtf$size) +dtf$mu <- factor(dtf$mu) +dtf$a = as.numeric(dtf$a) +ggplot(dtf) + geom_density(aes(x = a, fill = size)) + + facet_wrap(~mu, scales = "free") + theme_minimal() + + scale_fill_brewer(palette="Paired") +``` + + +### False discovery rate + +```{r message=FALSE, warning=FALSE, include=TRUE, results="hide", include=FALSE} +dtf.comp.annot.uncorrected = getAnnotation(dtf.comp, threshold = thr, alphaRisk = 0.05, postInferenceSelection = F, pvalCorrection = F) +p1 = getVennDiagramm(dtf.comp.annot.uncorrected, title = "pvalues") +``` + +```{r message=FALSE, warning=FALSE, include=TRUE, results="hide", include=FALSE} +dtf.comp.annot.corrected = getAnnotation(dtf.comp, threshold = thr, alphaRisk = 0.05, postInferenceSelection = F, pvalCorrection = T) +p2 = getVennDiagramm(dtf.comp.annot.corrected, title = "padj") +``` + +```{r message=FALSE, warning=FALSE, include=TRUE, results="hide", include=FALSE} +dtf1 = dtf.comp.annot.corrected %>% mutate(pvalueCorrection = "pvalue adjusted") +dtf2 = dtf.comp.annot.uncorrected %>% mutate(pvalueCorrection = "pvalue") +dtf = rbind(dtf1, dtf2) + +dtf$annotation <- factor(dtf$annotation, levels = c("TRUE", "FALSE")) + +p3 = ggplot(dtf %>% filter(beta != "(Intercept)")) + + geom_point(aes(x = actual.value, y = estimate, col = annotation), alpha = 0.5) + geom_abline(intercept = 0, slope = 1) + + geom_vline(xintercept = c(-thr, thr), linetype = "dotted") + + facet_grid(beta~pvalueCorrection, scales = "free") + + scale_color_brewer(palette = "Set2") + +p4 = ggplot(dtf %>% filter(beta != "(Intercept)"), + aes(d = actual.label , m = padj, color = pvalueCorrection)) + + geom_roc(n.cuts = 0, labels = F) + +p1p2 = grid.arrange(p1,p2, ncol = 1, nrow = 2) +p1p2p3 = grid.arrange(p1p2, p4 , ncol = 2, nrow = 1) +``` + +```{r} + +p1p2p3p4 = grid.arrange(p3, p1p2p3, nrow = 2, ncol = 1) +``` + +```{r} +dtf.comp.annot$annotation <- factor(dtf.comp.annot$annotation, levels = c("TRUE", "FALSE")) +dtf.comp.annot$from <- factor(dtf.comp.annot$from, levels = c("p(|beta| > 0) > 0.95 & |beta| > T", "p(|beta| > T) > 0.95")) + +ggplot(dtf.comp.annot %>% filter(beta != "(Intercept)")) + + geom_point(aes(x = actual.value, y = estimate, col = annotation), alpha = 0.5) + geom_abline(intercept = 0, slope = 1) + + geom_vline(xintercept = c(-thr, thr), linetype = "dotted") + + facet_wrap(~beta, scales = "free") + + scale_color_brewer(palette = "Set2") + +p = ggplot(dtf.comp.annot %>% filter(beta != "(Intercept)"), + aes(d = actual.label , m = padj)) + + geom_roc(n.cuts = 0, labels = F) + facet_wrap(~from) +p +``` + + +#### Increasing performances increasing number of replicates + +```{r} +n_genes = 4 +n_genotypes = 10 +n_environments = 2 +n_rep_list = c(3, 5, 10, 25) +sequencing_factor = 2 +uniformNumberOfReplicates = T +uniformDispersion = T +dds.extraction = loadEmbedded_ObservedValues() +thr = 2 + +remove(df_final) +## Fit mvnorm ## +fit.mvnorm <- getListMvnormFit(dds.extraction$beta) +##### Ground truth ###### +beta.actual <- getBetaforSimulation( +n_genes, +n_genotypes, +fit.mvnorm, n_clusters = 5 +) + +##### build input for simulation #### +model.matx <- getModelMatrix() +log_qij <- getLog_qij(beta.actual, model.matx) +mu_ij <- getMu_ij(log_qij.dtf = log_qij, sequencing_factor) +sample_ids <- colnames(mu_ij) +gene_dispersion.vec <- dds.extraction$gene_dispersion +dispersion.matrix <- getGenesDispersions(n_genes, + sample_ids, + dispersion.vec = gene_dispersion.vec, + uniformDispersion + ) + +for (n_rep in n_rep_list){ + print(n_rep) + ##### Design replicates ###### + designReplication.matx <- getReplicationDesign( + 3, + n_genotypes, + n_environments, + uniformNumberOfReplicates + ) + + ##### build counts table #### + countTable <- getCountTable(mu_ij, dispersion.matrix, + n_genes, n_genotypes, + sample_id_list = sample_ids, + replication.matx = designReplication.matx + ) + design <- summariseDesign(countTable) + actualParam <- list( + dispersion = dispersion.matrix, + beta = beta.actual, mvnorm = fit.mvnorm + ) + mock = list(design = design, countTable = countTable, + actualParameters = actualParam) + + count_table = mock$countTable %>% as.data.frame() + bioDesign = mock$design + + ############### DESEQ ################## + dds_simu = HTRsim::fit_deseq(count_table, bioDesign) + deseqFitdtf = getCoefficientsFromDds(dds_simu) + prediction = getPrediction(deseqFitdtf, threshold = thr, alphaRisk = 0.05) + expectation = getExpectation(mock$actualParameters$beta, threshold = thr) + comparison = getComparison(actual.dtf = expectation, inference.dtf = prediction) + comparison.deseq = comparison %>% mutate(from = "Deseq2") %>% mutate(n_rep = n_rep) + + ############### GLM #################### + count_data = HTRfit::reshapeCounTable(count_table, bioDesign) + l = HTRfit::launch.glm(count_data) + fitDtf =listFit2dtf(l) + prediction = getPrediction(fitDtf$inference, threshold = thr, alphaRisk = 0.05) + comparison = getComparison(actual.dtf = expectation, inference.dtf = prediction) + comparison.glm = comparison %>% mutate(from = "MASS::glm") %>% mutate(n_rep = n_rep) + ###################### DF final ################### + tmp = rbind(comparison.glm, comparison.deseq) + if (exists('df_final')){ + df_final = rbind(df_final, tmp) + } + else{ + df_final = tmp + } +} +df_final$n_rep %>% unique() +``` +## ROC + +```{r} +df_final$n_rep <- factor(df_final$n_rep) +df_final[is.na(df_final$term),] +p = ggplot(df_final %>% filter(beta != "(Intercept)"), + aes(d = actual.label , m = padj, col = n_rep)) + + geom_roc(n.cuts = 0, labels = F) + facet_wrap(~from) +p = p + scale_color_manual(values = c(c("#D3D3D3","#BDBDBD", "#9E9E9E", "#7D7D7D"))) + +ggsave("../img/graph/replicates_eff.png", p, height = 4, width = 6) + #scale_color_manual(values = c("#D3D3D3","#BDBDBD", "#9E9E9E")) + #scale_color_manual(values = c("#D3D3D3","#BDBDBD", "#9E9E9E", "#7D7D7D", "#696969" )) + +``` + +#### Increasing performances increasing sequencing depth + +```{r} +n_genes = 6000 +n_genotypes = 3 +n_environments = 2 +max_n_replicates = 3 +sequencing_factor_list = c(0.01, 0.1, 1, 2) +uniformNumberOfReplicates = T +uniformDispersion = T +dds.extraction = loadObservedValues() +thr = 2 + +remove(df_final) +## Fit mvnorm ## +fit.mvnorm <- getListMvnormFit(dds.extraction$beta) +##### Ground truth ###### +beta.actual <- getBetaforSimulation( +n_genes, +n_genotypes, +fit.mvnorm, n_clusters = 5 +) + +##### build input for simulation #### +model.matx <- getModelMatrix() +log_qij <- getLog_qij(beta.actual, model.matx) + + +for (sequencing_factor in sequencing_factor_list){ + print(sequencing_factor) +mu_ij <- getMu_ij(log_qij.dtf = log_qij, sequencing_factor) +sample_ids <- colnames(mu_ij) +gene_dispersion.vec <- dds.extraction$gene_dispersion +dispersion.matrix <- getGenesDispersions(n_genes, + sample_ids, + dispersion.vec = gene_dispersion.vec, + uniformDispersion + ) + + + ##### Design replicates ###### + designReplication.matx <- getReplicationDesign( + max_n_replicates, + n_genotypes, + n_environments, + uniformNumberOfReplicates + ) + + ##### build counts table #### + countTable <- getCountTable(mu_ij, dispersion.matrix, + n_genes, n_genotypes, + sample_id_list = sample_ids, + replication.matx = designReplication.matx + ) + design <- summariseDesign(countTable) + actualParam <- list( + dispersion = dispersion.matrix, + beta = beta.actual, mvnorm = fit.mvnorm + ) + mock = list(design = design, countTable = countTable, + actualParameters = actualParam) + + + count_table = mock$countTable %>% as.data.frame() + count_table %>% dim() + reads_counts = count_table %>% colSums() %>% sum() %>% format(., scientific = TRUE, digits=1) + bioDesign = mock$design + + ############### DESEQ ################## + dds_simu = HTRsim::fit_deseq(count_table, bioDesign) + deseqFitdtf = getCoefficientsFromDds(dds_simu) + prediction = getPrediction(deseqFitdtf, threshold = thr, alphaRisk = 0.05) + expectation = getExpectation(mock$actualParameters$beta, threshold = thr) + comparison = getComparison(actual.dtf = expectation, inference.dtf = prediction) + comparison.deseq = comparison %>% mutate(from = "Deseq2") %>% mutate(sequencingDepth = sequencing_factor) + + ############### GLM #################### + count_data = HTRfit::reshapeCounTable(count_table, bioDesign) + l = HTRfit::launch.glm(count_data) + fitDtf =listFit2dtf(l) + prediction = getPrediction(fitDtf$inference, threshold = thr, alphaRisk = 0.05) + comparison = getComparison(actual.dtf = expectation, inference.dtf = prediction) + comparison.glm = comparison %>% mutate(from = "MASS::glm") %>% mutate(sequencingDepth = sequencing_factor) + ###################### DF final ################### + tmp = rbind(comparison.glm, comparison.deseq) %>% mutate(reads_sequenced = reads_counts) + if (exists('df_final')){ + df_final = rbind(df_final, tmp) + } + else{ + df_final = tmp + } +} +df_final$reads_sequenced %>% unique() +``` +## ROC + +```{r} +df_final$sequencingDepth <- factor(df_final$sequencingDepth) +df_final$reads_sequenced <- as.numeric(df_final$reads_sequenced) +df_final$reads_sequenced <- factor(df_final$reads_sequenced) + +p = ggplot(df_final %>% filter(beta != "(Intercept)"), + aes(d = actual.label , m = padj, col = reads_sequenced)) + + geom_roc(n.cuts = 0, labels = F) + facet_wrap(~from) +p +p =p + scale_color_manual(values = c("#D3D3D3","#BDBDBD", "#9E9E9E", "#7D7D7D")) +ggsave("../img/graph/replicates_eff.png", p, height = 4, width = 6) + + #scale_color_manual(values = c("#D3D3D3","#BDBDBD", "#9E9E9E", "#7D7D7D", "#696969" )) + +``` + +## MASS::GLM + +```{r} +s = HTRfit::reshapeCounTable(count_table, bioDesign) +fit <- MASS::glm.nb(k_ij ~ genotype + environment + genotype:environment, data = s %>% filter(gene_id == "gene1"), link = log) +broom.mixed::tidy(fit, component = c("disp")) +list(estimate = fit$theta, gene_id = "gene1") %>% as.data.frame() +a =summary(fit) +a %>% as.data.frame() +MASS::dis +fit$fitted.value +l = HTRfit::launch.glm(s) +l[[1]] +fitDtf =listFit2dtf(l) +fitDtf$deviance + +prediction = getPrediction(fitDtf$inference, threshold = 2, alphaRisk = 0.05, postInferenceSelection = T) +expectation = getExpectation(mock$actualParameters$beta, threshold = 2) +comparison = getComparison(actual.dtf = expectation, inference.dtf = prediction) + + +getVennDiagramm(comparison) +``` + +## increase number of genotypes + +```{r} +n_genes = 100 +n_genotypes_list = c(10) +n_environments = 2 +max_n_replicates = 15 +sequencing_factor = 2 +uniformNumberOfReplicates = T +uniformDispersion = T +dds.extraction = loadObservedValues() +thr = 2 + +#remove(df_final) +## Fit mvnorm ## +fit.mvnorm <- getListMvnormFit(dds.extraction$beta) + +for (x in 1:4){ + print(x) +for (n_genotypes in n_genotypes_list){ + ##### Ground truth ###### + beta.actual <- getBetaforSimulation( + n_genes, + n_genotypes, + fit.mvnorm, n_clusters = 5 + ) + + ##### build input for simulation #### + model.matx <- getModelMatrix() + log_qij <- getLog_qij(beta.actual, model.matx) + mu_ij <- getMu_ij(log_qij.dtf = log_qij, sequencing_factor) + sample_ids <- colnames(mu_ij) + gene_dispersion.vec <- dds.extraction$gene_dispersion + dispersion.matrix <- getGenesDispersions(n_genes, + sample_ids, + dispersion.vec = gene_dispersion.vec, + uniformDispersion + ) + + ##### Design replicates ###### + designReplication.matx <- getReplicationDesign( + max_n_replicates, + n_genotypes, + n_environments, + uniformNumberOfReplicates + ) + + ##### build counts table #### + countTable <- getCountTable(mu_ij, dispersion.matrix, + n_genes, n_genotypes, + sample_id_list = sample_ids, + replication.matx = designReplication.matx + ) + + design <- summariseDesign(countTable) + actualParam <- list( + dispersion = dispersion.matrix, + beta = beta.actual, mvnorm = fit.mvnorm + ) + mock = list(design = design, countTable = countTable, + actualParameters = actualParam) + + count_table = mock$countTable %>% as.data.frame() + bioDesign = mock$design + + ############### DESEQ ################## + start_time <- Sys.time() + dds_simu = HTRsim::fit_deseq(count_table, bioDesign) + deseqFitdtf = getCoefficientsFromDds(dds_simu) + end_time <- Sys.time() + time_process = difftime(end_time, start_time, units = "secs") %>% as.numeric() + prediction = getPrediction(deseqFitdtf, threshold = thr, alphaRisk = 0.05) + expectation = getExpectation(mock$actualParameters$beta, threshold = thr) + comparison = getComparison(actual.dtf = expectation, inference.dtf = prediction) + comparison.deseq = comparison %>% mutate(from = "Deseq2") %>% mutate(n_G = n_genotypes) %>% mutate(timeProcess = time_process) + + ############### GLM #################### + count_data = HTRfit::reshapeCounTable(count_table, bioDesign) + start_time <- Sys.time() + l = HTRfit::launch.glm(count_data) + fitDtf =listFit2dtf(l) + end_time <- Sys.time() + time_process = difftime(end_time, start_time, units = "secs") %>% as.numeric() + prediction = getPrediction(fitDtf$inference, threshold = thr, alphaRisk = 0.05) + comparison = getComparison(actual.dtf = expectation, inference.dtf = prediction) + comparison.glm = comparison %>% mutate(from = "MASS::glm") %>% mutate(n_G = n_genotypes) %>% mutate(timeProcess = time_process) + ###################### DF final ################### + tmp = rbind(comparison.glm, comparison.deseq) + #tmp = comparison.glm + if (exists('df_final')){ + df_final = rbind(df_final, tmp) + #df_final = comparison.glm + } + else{ + df_final = tmp + } +} +} + +df_final$n_G %>% unique() +write_tsv(df_final, file = "backup_genotypeEffect.tsv") + + +listBeta <- DESeq2::resultsNames(dds_simu) +library(furrr) +future::plan(multisession, workers = 2) +res <- listBeta %>% furrr::future_map( + .x = ., + ~ DESeq2::results(dds_simu, + contrast = list(.x), + lfcThreshold = 2, #/!\ statistic & pvalue resestimate later + # altHypothesis = altH, + tidy = TRUE + ) %>% + dplyr::select(-baseMean) %>% + dplyr::mutate(term = .x) %>% + dplyr::rename( + estimate = log2FoldChange, + std.error = lfcSE, + statistic = stat, + p.value = pvalue, + gene_id = row + ), + .options = furrr_options(seed = TRUE) + ) +deseq_inference <- do.call("rbind", res) + +plot(deseq_inference$p.value,prediction$p.value) +z = prediction %>% + rstatix::adjust_pvalue(p.col = "p.value", method = "BH", output.col = "padj2") %>% mutate() + +deseq_inference +plot(deseq_inference$padj ,z$padj2) +p.adjust.methods +``` + +## ROC + +```{r} + +p = ggplot(df_final %>% filter(beta != "(Intercept)"), + aes(d = actual.label , m = padj, col = from)) + + geom_roc(n.cuts = 0, labels = F) + facet_wrap(~n_G, nrow = 1) +p = p + scale_colour_manual(values = c("#F8CBAD", "#9DC3E6")) +p +ggsave("../img/plotROC.png",p, height = 6, width = 8) +library("Rmisc") +tgc <- summarySE(df_final, measurevar="timeProcess", groupvars=c("n_G","from")) +tgc$n_G <- factor(tgc$n_G) +tgc$n_G = as.numeric(as.character(tgc$n_G)) +p= ggplot(tgc, aes(x = n_G, y = timeProcess, colour = from)) + + geom_line(aes(x = n_G, y= timeProcess+sd)) + + geom_line(aes(x = n_G, y= timeProcess-sd)) + + geom_point() + + scale_y_log10() + scale_colour_manual(values = c("#F8CBAD", "#9DC3E6")) + +p +bakcup = rbind(bakcup, df_final) +bakcup$n_G <- as.numeric(bakcup$n_G) +bakcup$n_G %>% unique() +``` +## GLMM + +```{r} + +n_genes = 10 +n_genotypes = 200#c(5, 60, 600, 1000) +n_environments = 2 +max_n_replicates = 5 +sequencing_factor = 1 +uniformNumberOfReplicates = T +uniformDispersion = T +dds.extraction = loadEmbedded_ObservedValues() +thr = 2 +fit.mvnorm <- getListMvnormFit(dds.extraction$beta, n_clusters = 5) +beta.actual <- getBetaforSimulation( +n_genes, +n_genotypes, +fit.mvnorm, fixIntercept = FALSE, fixBetaE = TRUE, n_clusters = 5 +) + +##### build input for simulation #### +model.matx <- getModelMatrix() +log_qij <- getLog_qij(beta.actual, model.matx) +mu_ij <- getMu_ij(log_qij.dtf = log_qij, sequencing_factor) +sample_ids <- colnames(mu_ij) +gene_dispersion.vec <- dds.extraction$gene_dispersion +dispersion.matrix <- getGenesDispersions(n_genes, + sample_ids, + dispersion.vec = gene_dispersion.vec, + uniformDispersion + ) + +##### Design replicates ###### +designReplication.matx <- getReplicationDesign( + max_n_replicates, + n_genotypes, + n_environments, + uniformNumberOfReplicates + ) + +##### build counts table #### +countTable <- getCountTable(mu_ij, dispersion.matrix, +n_genes, n_genotypes, +sample_id_list = sample_ids, +replication.matx = designReplication.matx +) + +design <- summariseDesign(countTable) + +actualParam <- list( +dispersion = dispersion.matrix, +beta = beta.actual, mvnorm = fit.mvnorm +) +mock = list(design = design, countTable = countTable, + actualParameters = actualParam) + +count_table = mock$countTable %>% as.data.frame() +bioDesign = mock$design + +count_data = HTRfit::reshapeCounTable(count_table, bioDesign) +iteration_list <- count_data[, "gene_id"] %>% + unique() %>% + unlist() %>% + unname() ## get list gene + +library(future) +plan(multisession, workers = 4) +library(lme4) +library(glmmTMB) +fit.glmm <- function(data2fit, id){ + tryCatch( + { + fit <- lme4::glmer.nb(k_ij ~ environment + ( 1 + environment | genotype) , data= data2fit , verbose=F) + fit.dtf <- tidySummary(fit, "glmm") + fit.dtf$inference <- fit.dtf$inference %>% dplyr::mutate(gene_id = id) + fit.dtf$fitQuality <- fit.dtf$fitQuality %>% dplyr::mutate(gene_id = id) + return(fit.dtf) + + } + ,error = function(cnd) { + inference <- list(gene_id = id, estimate = NA, std.error = NA, term= NA) %>% as.data.frame() + fitQuality <- list(null.deviance= NA,df.null = NA, logLik = NA,AIC = NA ,BIC=NA,deviance = NA,df.residual=NA,nobs = NA,gene_id = id) %>% as.data.frame() + fit.dtf <- list(inference = inference, fitQuality = fitQuality) + return(fit.dtf) + + } + + ) } +a = lme4::glmer.nb(k_ij ~ environment + (1 + environment | genotype), data = count_data %>% filter(gene_id == "gene1"), verbose = F) +as = summary(a) + +res = launch.glm_mixte(count_data) +results = furrr::future_map(.x = iteration_list, .f = + ~fit.glmm(count_data[which(count_data[,"gene_id"] == .x),], .x) ) +glmm.res = listFit2dtf(res) +glmm.res$inference +beta.actual.glmm = beta.actual %>% dplyr::group_by(gene_id) %>% + dplyr::summarise(tmp=mean(`(Intercept)` + betaG ), + environmentE1 = mean(betaE + betaGE), + "sd__(Intercept)" = sd(`(Intercept)` + betaG ), + sd__environmentE1 = sd(betaGE + betaE ), + "cor__(Intercept).environmentE1"= cor((betaGE + betaE),(`(Intercept)`+ betaG))) %>% + dplyr::rename("(Intercept)" = tmp) %>% + reshape2::melt(id = "gene_id", value.name = "actual.value", variable.name = 'term') +gene_id2fitMvnorm = beta.actual %>% select(gene_id, idx_mvrnom) %>% unique() + +actual2join.dtf <- data.table::data.table( beta.actual.glmm, key = c("gene_id", "term")) +gene_id2fitMvnorm2join <- data.table::data.table( gene_id2fitMvnorm, key = c("gene_id")) +actual2join.dtf = actual2join.dtf[gene_id2fitMvnorm2join] +inference2join.dtf <- data.table::data.table(glmm.res$inference, key = c("gene_id", "term")) +comparison.dtf <- actual2join.dtf[inference2join.dtf] + + +comparison.dtf = comparison.dtf %>% mutate(actual.value = if_else(str_detect(term, "cor_"), actual.value, actual.value*log(2) )) +comparison.dtf$idx_mvrnom = factor(comparison.dtf$idx_mvrnom) +comparison.dtf$term = factor(comparison.dtf$term, levels = c("(Intercept)", "environmentE1","cor__(Intercept).environmentE1", "sd__(Intercept)", "sd__environmentE1")) +p = ggplot(comparison.dtf) + + geom_point(aes(x = actual.value, y = estimate, col = idx_mvrnom), alpha = 0.5, size = 2) + geom_abline(intercept = 0, slope = 1) + + facet_wrap(~term, scales = "free") +p +ggsave("../img/graph/poc_glmm2_1000.png", p,) + + +x = comparison.dtf %>% reshape2::dcast(., gene_id ~ term, value.var = "estimate") +gene_vec = rep(x$gene_id, 1000) +a = rnorm(n_genes* 1000, sd = x$`sd__(Intercept)`, mean = x$`(Intercept)` ) %>% data.frame() %>% mutate(Env = "(Intercept)") %>% mutate(gene_id = gene_vec) +b = rnorm(n_genes*1000, sd = x$sd__environmentE1, mean = x$`sd__(Intercept)` ) %>% data.frame() %>% mutate(Env = "E1") %>% mutate(gene_id = gene_vec) +dtf = rbind(a,b) %>% rename(value = ".") %>% mutate(from = "prediction") + +x2 = comparison.dtf %>% reshape2::dcast(., gene_id ~ term, value.var = "actual.value") +gene_vec = rep(x$gene_id, 1000) +a = rnorm(n_genes* 1000, sd = x2$`sd__(Intercept)`, mean = x2$`(Intercept)` ) %>% data.frame() %>% mutate(Env = "(Intercept)") %>% mutate(gene_id = gene_vec) +b = rnorm(n_genes*1000, sd = x2$sd__environmentE1, mean = x2$`sd__(Intercept)` ) %>% data.frame() %>% mutate(Env = "E1") %>% mutate(gene_id = gene_vec) +dtf2 = rbind(a,b) %>% rename(value = ".") %>% mutate(from = "actual") + +dtf = rbind(dtf, dtf2) +library("ggridges") + + +dtf$gene_id = factor(dtf$gene_id, level = x$gene_id[order(x$sd__environmentE1)]) +dtf$from = factor(dtf$from, level = c("prediction", 'actual')) +p = ggplot(dtf) + geom_density_ridges(aes(x = value , y = gene_id, fill = from), alpha = 0.6) + facet_wrap(~Env, scales = 'free_x', ncol = 2) + xlim(c(-15,15)) + scale_fill_manual(values = c("#FFE699", "#1F4E79")) +p +ggsave("../img/graph/poc_glmm1000.png", p, height = 6, width = 8) +``` + +```{r} + +############ GLMMM##### +fit_glmm <- function(data2fit, id){ + print(data2fit) + ########################### FIT LME4 ################################# + print("LME4") + m.nb <- glmer.nb(k_ij ~ 0 + environment + ( 1 + environment | genotype ) , data= data2fit , verbose=F) + res_lme4 = fit_extraction(m.nb) + res_lme4 = res_lme4 %>% mutate(gene_id = id) %>% mutate(from = "lme4") + + ##################### FIT GMTMB 1 ######################### + print("glmTMB 1") + m.nb <- glmmTMB::glmmTMB(k_ij ~ 0 + environment + ( 1 + environment | genotype ) , data=data2fit, family=nbinom1, verbose = F) + res_glmTMB1 = fit_extraction(m.nb) + res_glmTMB1 = res_glmTMB1 %>% mutate(gene_id = id) %>% mutate(from = "glmTMB nbinom1") + + ##################### FIT GMTMB 2 ######################### + print("glmTMB 2") + + m.nb <- glmmTMB::glmmTMB(k_ij ~ 0 + environment + ( 1 + environment | genotype ) , data=data2fit, family=nbinom2, verbose = F) + res_glmTMB2 = fit_extraction(m.nb) + res_glmTMB2 = res_glmTMB2 %>% mutate(gene_id = id) %>% mutate(from = "glmTMB nbinom2") + + res = rbind(res_lme4, res_glmTMB1, res_glmTMB2) + return(res) +} +``` + +## fixed or random effect + +#### Increasing performances increasing sequencing depth + +```{r} +n_genes = 1000 +n_genotypes = 5 +n_environments = 2 +max_n_replicates = 10 +uniformNumberOfReplicates = T +uniformDispersion = T +dds.extraction = loadObservedValues() +thr = 2 + +remove(df_final) +## Fit mvnorm ## +fit.mvnorm <- getListMvnormFit(dds.extraction$beta) +##### Ground truth ###### + + + +for (boolean in c(F,T)){ +beta.actual <- getBetaforSimulation( +n_genes, +n_genotypes, +fit.mvnorm, n_clusters = 5, fixIntercept = boolean +) + +##### build input for simulation #### +model.matx <- getModelMatrix() +log_qij <- getLog_qij(beta.actual, model.matx) +mu_ij <- getMu_ij(log_qij.dtf = log_qij, sequencing_factor) +sample_ids <- colnames(mu_ij) +gene_dispersion.vec <- dds.extraction$gene_dispersion +dispersion.matrix <- getGenesDispersions(n_genes, + sample_ids, + dispersion.vec = gene_dispersion.vec, + uniformDispersion + ) + + + ##### Design replicates ###### + designReplication.matx <- getReplicationDesign( + max_n_replicates, + n_genotypes, + n_environments, + uniformNumberOfReplicates + ) + + ##### build counts table #### + countTable <- getCountTable(mu_ij, dispersion.matrix, + n_genes, n_genotypes, + sample_id_list = sample_ids, + replication.matx = designReplication.matx + ) + design <- summariseDesign(countTable) + actualParam <- list( + dispersion = dispersion.matrix, + beta = beta.actual, mvnorm = fit.mvnorm + ) + mock = list(design = design, countTable = countTable, + actualParameters = actualParam) + + + count_table = mock$countTable %>% as.data.frame() + bioDesign = mock$design + + ############### DESEQ ################## + dds_simu = HTRsim::fit_deseq(count_table, bioDesign) + deseqFitdtf = getCoefficientsFromDds(dds_simu) + prediction = getPrediction(deseqFitdtf, threshold = thr, alphaRisk = 0.05) + expectation = getExpectation(mock$actualParameters$beta, threshold = thr) + comparison = getComparison(actual.dtf = expectation, inference.dtf = prediction) + comparison.deseq = comparison %>% mutate(from = "Deseq2") %>% mutate(Intercept_fixed = boolean) + + ############### GLM #################### + count_data = HTRfit::reshapeCounTable(count_table, bioDesign) + l = HTRfit::launch.glm(count_data) + fitDtf =listFit2dtf(l) + prediction = getPrediction(fitDtf$inference, threshold = thr, alphaRisk = 0.05) + comparison = getComparison(actual.dtf = expectation, inference.dtf = prediction) + comparison.glm = comparison %>% mutate(from = "MASS::glm") %>% mutate(Intercept_fixed = boolean) + ###################### DF final ################### + tmp = rbind(comparison.glm, comparison.deseq) + if (exists('df_final')){ + df_final = rbind(df_final, tmp) + } + else{ + df_final = tmp + } +} +``` +## ROC + +```{r} + + +p = ggplot(df_final %>% filter(beta != "(Intercept)"), + aes(d = actual.label , m = padj, col = Intercept_fixed)) + + geom_roc(n.cuts = 0, labels = F) + facet_wrap(~from) +p +p =p + scale_color_manual(values = c("#A9D18E", "#1F4E79")) +ggsave("../img/graph/ROCfixed_eff.png", p, height = 4, width = 6) + +###### +df_final[is.na(df_final$beta),] +p = ggplot(df_final %>% filter(beta != is.na(beta))) + + geom_point(aes(x = actual.value, y = estimate, col = Intercept_fixed), alpha = 0.4, size = 2) + geom_abline(intercept = 0, slope = 1) + + facet_grid(from~beta, scales = "free") +p = p + scale_color_manual(values = c("#A9D18E", "#1F4E79")) +ggsave("../img/graph/IDfixed_eff.png", p,) + +``` diff --git a/reports/css/modest.css b/reports/css/modest.css new file mode 100644 index 0000000..858fd74 --- /dev/null +++ b/reports/css/modest.css @@ -0,0 +1,219 @@ +@media print { + *, + *:before, + *:after { + background: transparent !important; + color: #000 !important; + box-shadow: none !important; + text-shadow: none !important; + } + + a, + a:visited { + text-decoration: underline; + } + + a[href]:after { + content: " (" attr(href) ")"; + } + + abbr[title]:after { + content: " (" attr(title) ")"; + } + + a[href^="#"]:after, + a[href^="javascript:"]:after { + content: ""; + } + + pre, + blockquote { + border: 1px solid #999; + page-break-inside: avoid; + } + + thead { + display: table-header-group; + } + + tr, + img { + page-break-inside: avoid; + } + + img { + max-width: 100% !important; + } + + p, + h2, + h3 { + orphans: 3; + widows: 3; + } + + h2, + h3 { + page-break-after: avoid; + } + } + + pre, + code { + font-family: Menlo, Monaco, "Courier New", monospace; + } + + pre { + padding: .5rem; + line-height: 1.25; + overflow-x: scroll; + } + + a, + a:visited { + color: #3498db; + } + + a:hover, + a:focus, + a:active { + color: #2980b9; + } + + .modest-no-decoration { + text-decoration: none; + } + + html { + font-size: 12px; + } + + @media screen and (min-width: 32rem) and (max-width: 48rem) { + html { + font-size: 15px; + } + } + + @media screen and (min-width: 48rem) { + html { + font-size: 16px; + } + } + + body { + line-height: 1.85; + } + + p, + .modest-p { + font-size: 1rem; + margin-bottom: 1.3rem; + } + + h1, + .modest-h1, + h2, + .modest-h2, + h3, + .modest-h3, + h4, + .modest-h4 { + margin: 1.414rem 0 .5rem; + font-weight: inherit; + line-height: 1.42; + } + + h1, + .modest-h1 { + margin-top: 0; + font-size: 3.998rem; + } + + h2, + .modest-h2 { + font-size: 2.827rem; + } + + h3, + .modest-h3 { + font-size: 1.999rem; + } + + h4, + .modest-h4 { + font-size: 1.414rem; + } + + h5, + .modest-h5 { + font-size: 1.121rem; + } + + h6, + .modest-h6 { + font-size: .88rem; + } + + small, + .modest-small { + font-size: .707em; + } + + /* https://github.com/mrmrs/fluidity */ + + img, + canvas, + iframe, + video, + svg, + select, + textarea { + max-width: 100%; + } + + @import url(http://fonts.googleapis.com/css?family=Open+Sans+Condensed:300,300italic,700); + + @import url(http://fonts.googleapis.com/css?family=Arimo:700,700italic); + + html { + font-size: 18px; + max-width: 100%; + } + + body { + color: #444; + font-family: 'Open Sans Condensed', sans-serif; + font-weight: 300; + margin: 0 auto; + max-width: 48rem; + line-height: 1.45; + padding: .25rem; + } + + h1, + h2, + h3, + h4, + h5, + h6 { + font-family: Arimo, Helvetica, sans-serif; + } + + h1, + h2, + h3 { + border-bottom: 2px solid #fafafa; + margin-bottom: 1.15rem; + padding-bottom: .5rem; + text-align: center; + } + + blockquote { + border-left: 8px solid #fafafa; + padding: 1rem; + } + + pre, + code { + background-color: #fafafa; + } \ No newline at end of file diff --git a/reports/css/retro.css b/reports/css/retro.css new file mode 100644 index 0000000..8a26db7 --- /dev/null +++ b/reports/css/retro.css @@ -0,0 +1,202 @@ + + +pre, +code { + font-family: Menlo, Monaco, "Courier New", monospace; +} + +pre { + padding: .5rem; + line-height: 1.25; + overflow-x: scroll; +} + +@media print { + *, + *:before, + *:after { + background: transparent !important; + color: #000 !important; + box-shadow: none !important; + text-shadow: none !important; + } + + a, + a:visited { + text-decoration: underline; + } + + a[href]:after { + content: " (" attr(href) ")"; + } + + abbr[title]:after { + content: " (" attr(title) ")"; + } + + a[href^="#"]:after, + a[href^="javascript:"]:after { + content: ""; + } + + pre, + blockquote { + border: 1px solid #999; + page-break-inside: avoid; + } + + thead { + display: table-header-group; + } + + tr, + img { + page-break-inside: avoid; + } + + img { + max-width: 100% !important; + } + + p, + h2, + h3 { + orphans: 3; + widows: 3; + } + + h2, + h3 { + page-break-after: avoid; + } +} + +a, +a:visited { + color: #01ff70; +} + +a:hover, +a:focus, +a:active { + color: #2ecc40; +} + +.retro-no-decoration { + text-decoration: none; +} + +html { + font-size: 12px; +} + +@media screen and (min-width: 32rem) and (max-width: 48rem) { + html { + font-size: 15px; + } +} + +@media screen and (min-width: 48rem) { + html { + font-size: 16px; + } +} + +body { + line-height: 1.85; +} + +p, +.retro-p { + font-size: 1rem; + margin-bottom: 1.3rem; +} + +h1, +.retro-h1, +h2, +.retro-h2, +h3, +.retro-h3, +h4, +.retro-h4 { + margin: 1.414rem 0 .5rem; + font-weight: inherit; + line-height: 1.42; +} + +h1, +.retro-h1 { + margin-top: 0; + font-size: 3.998rem; +} + +h2, +.retro-h2 { + font-size: 2.827rem; +} + +h3, +.retro-h3 { + font-size: 1.999rem; +} + +h4, +.retro-h4 { + font-size: 1.414rem; +} + +h5, +.retro-h5 { + font-size: 1.121rem; +} + +h6, +.retro-h6 { + font-size: .88rem; +} + +small, +.retro-small { + font-size: .707em; +} + +/* https://github.com/mrmrs/fluidity */ + +img, +canvas, +iframe, +video, +svg, +select, +textarea { + max-width: 100%; +} + +html, +body { + background-color: #222; + min-height: 100%; +} + +html { + font-size: 18px; +} + +body { + color: #fafafa; + font-family: "Courier New"; + line-height: 1.45; + margin: 6rem auto 1rem; + max-width: 48rem; + padding: .25rem; +} + +pre { + background-color: #333; +} + +blockquote { + border-left: 3px solid #01ff70; + padding-left: 1rem; +} \ No newline at end of file diff --git a/reports/css/splendor.css b/reports/css/splendor.css new file mode 100644 index 0000000..4121b51 --- /dev/null +++ b/reports/css/splendor.css @@ -0,0 +1,225 @@ +@media print { + *, + *:before, + *:after { + background: transparent !important; + color: #000 !important; + box-shadow: none !important; + text-shadow: none !important; + } + + a, + a:visited { + text-decoration: underline; + } + + a[href]:after { + content: " (" attr(href) ")"; + } + + abbr[title]:after { + content: " (" attr(title) ")"; + } + + a[href^="#"]:after, + a[href^="javascript:"]:after { + content: ""; + } + + pre, + blockquote { + border: 1px solid #999; + page-break-inside: avoid; + } + + thead { + display: table-header-group; + } + + tr, + img { + page-break-inside: avoid; + } + + img { + max-width: 100% !important; + } + + p, + h2, + h3 { + orphans: 3; + widows: 3; + } + + h2, + h3 { + page-break-after: avoid; + } + } + + html { + font-size: 12px; + } + + @media screen and (min-width: 32rem) and (max-width: 48rem) { + html { + font-size: 15px; + } + } + + @media screen and (min-width: 48rem) { + html { + font-size: 16px; + } + } + + body { + line-height: 1.85; + } + + p, + .splendor-p { + font-size: 1rem; + margin-bottom: 1.3rem; + } + + h1, + .splendor-h1, + h2, + .splendor-h2, + h3, + .splendor-h3, + h4, + .splendor-h4 { + margin: 1.414rem 0 .5rem; + font-weight: inherit; + line-height: 1.42; + } + + h1, + .splendor-h1 { + margin-top: 0; + font-size: 3.998rem; + } + + h2, + .splendor-h2 { + font-size: 2.827rem; + } + + h3, + .splendor-h3 { + font-size: 1.999rem; + } + + h4, + .splendor-h4 { + font-size: 1.414rem; + } + + h5, + .splendor-h5 { + font-size: 1.121rem; + } + + h6, + .splendor-h6 { + font-size: .88rem; + } + + small, + .splendor-small { + font-size: .707em; + } + + /* https://github.com/mrmrs/fluidity */ + + img, + canvas, + iframe, + video, + svg, + select, + textarea { + max-width: 100%; + } + + @import url(http://fonts.googleapis.com/css?family=Merriweather:300italic,300); + + html { + font-size: 18px; + max-width: 100%; + } + + body { + color: #444; + font-family: 'Merriweather', Georgia, serif; + margin: 0; + max-width: 100%; + } + + /* === A bit of a gross hack so we can have bleeding divs/blockquotes. */ + + p, + *:not(div):not(img):not(body):not(html):not(li):not(blockquote):not(p) { + margin: 1rem auto 1rem; + max-width: 36rem; + padding: .25rem; + } + + div { + width: 100%; + } + + div img { + width: 100%; + } + + blockquote p { + font-size: 1.5rem; + font-style: italic; + margin: 1rem auto 1rem; + max-width: 48rem; + } + + li { + margin-left: 2rem; + } + + /* Counteract the specificity of the gross *:not() chain. */ + + h1 { + padding: 4rem 0 !important; + } + + /* === End gross hack */ + + p { + color: #555; + height: auto; + line-height: 1.45; + } + + pre, + code { + font-family: Menlo, Monaco, "Courier New", monospace; + } + + pre { + background-color: #fafafa; + font-size: .8rem; + overflow-x: scroll; + padding: 1.125em; + } + + a, + a:visited { + color: #3498db; + } + + a:hover, + a:focus, + a:active { + color: #2980b9; + } \ No newline at end of file diff --git a/results/v3/2022-12-04_dev.Rmd b/results/v3/2022-12-04_dev.Rmd new file mode 100644 index 0000000..259adc0 --- /dev/null +++ b/results/v3/2022-12-04_dev.Rmd @@ -0,0 +1,158 @@ +--- +title: "HTRsim" +date: '2022-11-23' +output: + html_document: +--- + + +## Required + + +```{r setup, message=FALSE, warning=FALSE, results="hide"} +library(HTRsim) +library(HTRfit) +library(plotROC) +``` + +## Simulation + +### Parameters + +```{r} +# -- SETUP +n_G = 1000 +n_genoT = 3 +n_env = 2 +FixIntercept = T +max_n_rep = 10 +sequencing_fact = 1 +n_clus = 1 +thr = 2 +########### +``` + +### Random intercept + +```{r, message=FALSE, warning=FALSE, include=TRUE, results="hide"} +# -- Simulate counts +mock = rnaMock(n_genes = n_G, n_genotypes = n_genoT, n_environments = n_env, + max_n_replicates = max_n_rep, sequencing_factor = sequencing_fact, fixIntercept = FixIntercept, n_clusters = n_clus) +# -- count table & experimental design +count_table = mock$countTable %>% as.data.frame() +bioDesign = mock$design + +# -- ground truth +beta.actual = mock$actualParam$beta + +``` + + +### get time to fit + +```{r message=FALSE, warning=FALSE, include=TRUE, results="hide", eval=FALSE} +rm(dtf2evaluation) +rm (dispersion_comparison.glm) +# -- MASS::glm -- #x +dtf2fit = HTRfit::reshapeCounTable(count_table, + bioDesign) +l.glm = HTRfit::launch.glm(dtf2fit) +fitDtf.glm =listFit2dtf(l.glm) +fitDtf.glm$inference$from = 'MASS::glm' +expectation = getExpectation(beta.actual, + toEval = "glm" , threshold = thr) +prediction = getPrediction(fitDtf.glm$inference, + threshold = thr, alphaRisk = 0.05) +comp.glm = getComparison(actual.dtf = expectation, + inference.dtf = prediction) + +dispersion_estimated = fitDtf.glm$dispersion +# -- one dispersion per gene +dispersion_actual = mock$actualParameters$dispersion %>% + rowMeans() %>% + as.data.frame() %>% + dplyr::rename(., dispersion.actual = ".") %>% tibble::rownames_to_column("gene_id") + + +dispersion_actual = data.table::data.table(dispersion_actual, key = "gene_id") +dispersion_estimated = data.table::data.table(dispersion_estimated, key = "gene_id") +dtf_idxMvrnorm = expectation %>% dplyr::select(gene_id, idx_mvrnom) +dtf_idxMvrnorm = data.table::data.table(dtf_idxMvrnorm, key = "gene_id") +dtf_idxMvrnorm = dtf_idxMvrnorm[!duplicated(dtf_idxMvrnorm)] +dispersion_comparison.glm = dispersion_actual[dispersion_estimated] +dispersion_comparison.glm = dtf_idxMvrnorm[dispersion_comparison.glm] +dispersion_comparison.glm = dispersion_comparison.glm %>% mutate(from = "MASS::glm") +dispersion_comparison.glm$dispersion.estimate = 1/dispersion_comparison.glm$dispersion.estimate +# -- DESEQ2 -- # +dds_simu = HTRsim::fit_deseq(count_table, bioDesign) +deseqFitdtf = getCoefficientsFromDds(dds_simu) +prediction = getPrediction(deseqFitdtf, threshold = thr, + alphaRisk = 0.05, pvalCorrection = T) +comp.deseq = getComparison(actual.dtf = expectation, + inference.dtf = prediction ) +comp.deseq$from = 'DESEQ2' + +dispersion_estimated = getDispersionFromDDS(dds_simu) +dispersion_estimated = data.table::data.table(dispersion_estimated, key = "gene_id") +dispersion_comparison.deseq = dispersion_actual[dispersion_estimated] +dispersion_comparison.deseq = dtf_idxMvrnorm[dispersion_comparison.deseq] +dispersion_comparison.deseq = dispersion_comparison.deseq %>% mutate(from = "DESEQ2") + + +# ---- Saving results ----- # +tmp = rbind(comp.deseq, comp.glm) +dispersion_comparison = rbind(dispersion_comparison.deseq, dispersion_comparison.glm) +if (exists('dtf2evaluation')){ + dtf2evaluation = rbind(dtf2evaluation, tmp) +} +else{ + dtf2evaluation = tmp +} + + +``` + +```{r} + + +dtf2evaluation$idx_mvrnom = factor(dtf2evaluation$idx_mvrnom) +stderror = dtf2evaluation %>% filter(beta != "(Intercept)" ) %>% reshape2::dcast(., gene_id + beta + term + idx_mvrnom ~ from, value.var = 'std.error') +p1 = ggplot(stderror) + geom_point(aes(`MASS::glm`, DESEQ2, col = idx_mvrnom)) + + geom_abline(intercept = 0, slope = 1) + scale_x_log10() + scale_y_log10() + xlab("Std.error_MASS") + ylab("Std.error_DESEQ2") +p1 + + +dispersion_comparison$idx_mvrnom = factor(dispersion_comparison$idx_mvrnom) +p2 = ggplot(dispersion_comparison) + geom_point(aes(dispersion.actual, dispersion.estimate, col = idx_mvrnom, shape = from) ) + + geom_abline(intercept = 0, slope = 1) + scale_x_log10() + scale_y_log10() + +p3 = ggplot(dtf2evaluation %>% filter(beta != "(Intercept)")) + + geom_point(aes(x = actual.value, y = estimate, color = idx_mvrnom), alpha = 0.5) + + geom_abline(intercept = 0, slope = 1) + + facet_grid(beta~from, scales = "free") + +p3_bis = ggplot(dtf2evaluation %>% filter(beta != "(Intercept)")) + + geom_point(aes(x = actual.value, y = estimate, color = padj), alpha = 0.5) + + geom_abline(intercept = 0, slope = 1) + + facet_grid(beta~from, scales = "free") + +p4 = ggplot(dtf2evaluation %>% filter(beta != "(Intercept)"), + aes(d = actual.label , m = padj, col = from)) + + geom_roc(n.cuts = 0, labels = F) #+ facet_grid(~beta, scales = "free") + + + +beta.actual$idx_mvrnom <- factor(beta.actual$idx_mvrnom) +p5 = ggplot(beta.actual) + geom_point(aes(x = betaG, y= betaGE, col = idx_mvrnom)) + +library(gridExtra) +a = grid.arrange( p3, p3_bis, nrow = 2, ncol = 1) +b = grid.arrange(p1,p2, nrow = 2) +f = grid.arrange(a,b, ncol = 2) + +ggsave('test_export2.png',f, width = 14, height = 10) + + +dispersion_comparison %>% filter(idx_mvrnom == 3) %>% .$gene_id %>% unique() +dtf2evaluation %>% filter(idx_mvrnom == 3) %>% .$gene_id %>% unique() +``` diff --git a/results/v3/2023_02_10_benchmarking.Rmd b/results/v3/2023_02_10_benchmarking.Rmd new file mode 100644 index 0000000..2554d74 --- /dev/null +++ b/results/v3/2023_02_10_benchmarking.Rmd @@ -0,0 +1,237 @@ +--- +title: "Benchmarking packages" +date: "2023-02-08" +output: + html_document: + toc: true +css: css/air.css +--- + +## Required + + +```{r setup, message=FALSE, warning=FALSE, results="hide"} +library(HTRsim) +library(HTRfit) +library(plotROC) +library(Rmisc) +``` + + +## Simulation + +### Parameters + +```{r} +# -- SETUP +n_G = 100 +n_genoT_list = c(1000) +n_env = 2 +FixIntercept = F +max_n_rep = 8 +sequencing_fact = 1 +n_clus = 3 +thr = 2 +number_ofRepetition = 4 +########### +``` + +### get time to fit + +```{r message=FALSE, warning=FALSE, include=TRUE, results="hide", eval=FALSE} +# ---------------- # +# /!\ Very very long +# Not exectuted in the notebook +# ------------------------- # + +#rm(dtf2evaluation) +for (n_genoT in n_genoT_list){ + for (i in 1:number_ofRepetition){ + # -- traceback + print(paste("Number of genotype:", n_genoT, sep = " ")) + print(paste("Iteration:", i, sep = " ")) + + # -- Simulate counts -- # + mock_random = rnaMock(n_genes = n_G, + n_genotypes = n_genoT, + n_environments = n_env, + max_n_replicates = max_n_rep, + sequencing_factor = sequencing_fact, + fixIntercept = FixIntercept, + n_clusters = n_clus) + + # -- count table & experimental design + count_table_random = mock_random$countTable %>% as.data.frame() + bioDesign_random = mock_random$design + # -- ground truth -- # + beta.actual_random = mock_random$actualParam$beta + + # --- Glm mixte fitting --- # + # -- lme4::glm.nb + dtf2fit_random = HTRfit::reshapeCounTable(count_table_random, + bioDesign_random) + start_time <- Sys.time() + l_random.lme4 = HTRfit::launch.glm_mixte(dtf2fit_random, + package = "lme4") + end_time <- Sys.time() + + fitDtf_random.lme4 =listFit2dtf(l_random.lme4) + fitDtf_random.lme4$inference$timeProcess = difftime(end_time, + start_time, units = "secs") %>% as.numeric() + fitDtf_random.lme4$inference$from = 'lme4' + fitDtf_random.lme4$inference$n_genotypes = n_genoT + # -- glmmTMB::glmmTMB + start_time <- Sys.time() + l_random.tmb = HTRfit::launch.glm_mixte(dtf2fit_random, package = "glmmTMB") + end_time <- Sys.time() + + fitDtf_random.tmb =listFit2dtf(l_random.tmb) + fitDtf_random.tmb$inference$timeProcess = difftime(end_time, + start_time, units = "secs") %>% as.numeric() + fitDtf_random.tmb$inference$from = 'glmmTMB' + fitDtf_random.tmb$inference$n_genotypes = n_genoT + # ------------------------- # + + # -- join LME4 & glmmTMB res -- # + fitDtf.inference = rbind(fitDtf_random.tmb$inference, + fitDtf_random.lme4$inference) + + expectation = getExpectation(beta.actual_random, + toEval = "glm_mixte" , threshold = thr) + prediction = getPrediction(fitDtf.inference, threshold = thr, alphaRisk = 0.05) + + # -- Glm mixte : join actual & inference -- # + actual2join.dtf <- data.table::data.table(expectation, + key = c("gene_id", "term")) + inference2join.dtf <- data.table::data.table(prediction, + key = c("gene_id", "term")) + comp.glmMixte <- actual2join.dtf[inference2join.dtf] %>% select(-group, -effect) + + # -- MASS::glm -- #x + if (n_genoT < 300){ + start_time <- Sys.time() + l_random.glm = HTRfit::launch.glm(dtf2fit_random) + end_time <- Sys.time() + + fitDtf_random.glm =listFit2dtf(l_random.glm) + fitDtf_random.glm$inference$timeProcess = difftime(end_time, + start_time, units = "secs") %>% as.numeric() + fitDtf_random.glm$inference$from = 'MASS::glm' + fitDtf_random.glm$inference$n_genotypes = n_genoT + # -- convert to log base 10 + fitDtf.glm$inference$estimate = fitDtf.glm$inference$estimate/log(2) + fitDtf.glm$inference$std.error = fitDtf.glm$inference$std.error/log(2) + expectation = getExpectation(beta.actual_random, + toEval = "glm" , threshold = thr) + prediction = getPrediction(fitDtf_random.glm$inference, + threshold = thr, alphaRisk = 0.05) + comp.glm = getComparison(actual.dtf = expectation, + inference.dtf = prediction) + } + # -- DESEQ2 -- # + if (n_genoT <= 100){ + start_time <- Sys.time() + dds_simu = HTRsim::fit_deseq(count_table_random, bioDesign_random) + end_time <- Sys.time() + + deseqFitdtf = getCoefficientsFromDds(dds_simu) + prediction = getPrediction(deseqFitdtf, threshold = thr, + alphaRisk = 0.05, pvalCorrection = T) + comp.deseq = getComparison(actual.dtf = expectation, + inference.dtf = prediction ) + comp.deseq$timeProcess = difftime(end_time, + start_time, units = "secs") %>% as.numeric() + comp.deseq$from = 'DESEQ2' + comp.deseq$n_genotypes = n_genoT + } + + # ---- Saving results ----- # + list_column = c("gene_id", "term", "actual.value" ,"estimate", "std.error", "timeProcess", "from", "n_genotypes", "statistic" , "p.value", "padj" , "prediction.label") + + if (n_genoT <=100) { + comp.deseq = comp.deseq %>% select(list_column) + tmp = rbind(comp.deseq, comp.glm, comp.glmMixte) + } + if (n_genoT <=300) { + comp.glm = comp.glm %>% select(list_column) + tmp = rbind(comp.glm, comp.glmMixte) + } + else tmp = comp.glmMixte + + if (exists('dtf2evaluation')){ + dtf2evaluation = rbind(dtf2evaluation, tmp) + } + else{ + dtf2evaluation = tmp + } + } +} + + +#write_tsv(tgc, file = "2023_01_17-tgc_backup.tsv") +#write_tsv(dtf2evaluation, file = "2023_01_17-dfEval_random_backup.tsv") + +``` + +## Evaluation + +### Preparing dataframe + +```{r} +dtf2evaluation <- read_tsv('2023_01_17-dfEval_random_backup.tsv', show_col_types = FALSE) +dtf2evaluation$timeProcess = dtf2evaluation$timeProcess/60 +tgc <- Rmisc::summarySE(dtf2evaluation, + measurevar="timeProcess", + groupvars=c("n_genotypes","from")) +# -- backup +#write_tsv(tgc, file = "2023_01_17-tgc_backup.tsv") + +#tgc <- read_tsv('2023_01_17-tgc_backup.tsv', show_col_types = FALSE) +``` + +### Build graph for evaluation + +```{r message=FALSE, warning=FALSE, include=TRUE, results="hide"} +#tgc$n_genotypes <- factor(tgc$n_genotypes) +#tgc$n_genotypes = as.numeric(as.character(tgc$n_genotypes)) +tgc$from = factor(tgc$from, levels = c("DESEQ2","MASS::glm", "lme4", "glmmTMB" )) + +tgc = tgc %>% filter(from %in% c('DESEQ2', "MASS::glm", "glmmTMB")) +p= ggplot(tgc, aes(x = n_genotypes, y = timeProcess, colour = from)) + + geom_line(aes(x = n_genotypes, y= timeProcess+sd), + linetype = "dashed", alpha = 0.6) + + geom_line(aes(x = n_genotypes, y= timeProcess-sd), + linetype = "dashed", alpha = 0.6) + + geom_ribbon(aes(ymin = timeProcess-sd , + ymax= timeProcess+sd, fill = from), alpha=0.4) + + geom_line(aes(x = n_genotypes, y= timeProcess), + linetype = "solid", alpha = 0.6) + + #geom_point() + + scale_y_log10() + scale_x_log10() + + scale_color_manual(values = c("#16A085", "#2C3E50", '#FFC30F' )) + + scale_fill_manual(values = c("#16A085", "#2C3E50", '#FFC30F' )) + ylab('Time processing (min)') +p +ggsave("timeprocessing3.png", p, width = 8, height = 5) +df2ROC = dtf2evaluation %>% + filter(from %in% c("MASS::glm", "DESEQ2"))%>% + filter(term != "(Intercept)") %>% + dplyr::mutate( + actual.label = + dplyr::if_else(abs(actual.value) < 2, + "nonDE", "DE" + ) + ) +df2ROC$n_genotypes <- factor(df2ROC$n_genotypes) +# -- ROC curve +p1 = ggplot(df2ROC, + aes(d = actual.label , m = padj, col = from)) + + geom_roc(n.cuts = 0, labels = F) + facet_grid(~n_genotypes) + scale_color_manual(values = c("#yellow","#BDBDBD")) +p1 +``` + +### Visualization + +```{r, message=FALSE, warning=FALSE} +p +``` diff --git a/results/v3/2023_02_10_benchmarkingv2.Rmd b/results/v3/2023_02_10_benchmarkingv2.Rmd new file mode 100644 index 0000000..2a0a82a --- /dev/null +++ b/results/v3/2023_02_10_benchmarkingv2.Rmd @@ -0,0 +1,174 @@ +--- +title: "Benchmarking packages" +date: "2023-02-08" +output: + html_document: + toc: true +css: css/air.css +--- + +## Required + + +```{r setup, message=FALSE, warning=FALSE, results="hide"} +library(HTRsim) +library(HTRfit) +library(plotROC) +library(Rmisc) +``` + + +## Simulation + +### Parameters + +```{r} +# -- SETUP +n_G = 100 +n_genoT_list = c(100) +n_env = 2 +FixIntercept = T +max_n_rep = 15 +sequencing_fact = 2 +n_clus = 3 +thr = 2 +number_ofRepetition = 3 +########### +``` + +### get time to fit + +```{r message=FALSE, warning=FALSE, include=TRUE, results="hide", eval=FALSE} +# ---------------- # +# /!\ Very very long +# Not exectuted in the notebook +# ------------------------- # + +#rm(dtf2evaluation) +for (n_genoT in n_genoT_list){ + for (i in 1:number_ofRepetition){ + # -- traceback + print(paste("Number of genotype:", n_genoT, sep = " ")) + print(paste("Iteration:", i, sep = " ")) + + # -- Simulate counts -- # + mock_fixed = rnaMock(n_genes = n_G, + n_genotypes = n_genoT, + n_environments = n_env, + max_n_replicates = max_n_rep, + sequencing_factor = sequencing_fact, + fixIntercept = FixIntercept, + n_clusters = n_clus) + + # -- count table & experimental design + count_table_fixed = mock_fixed$countTable %>% as.data.frame() + bioDesign_fixed = mock_fixed$design + # -- ground truth -- # + beta.actual_fixed = mock_fixed$actualParam$beta + + # -- MASS::glm -- #x + dtf2fit_fixed = HTRfit::reshapeCounTable(count_table_fixed, + bioDesign_fixed) + start_time <- Sys.time() + l_fixed.glm = HTRfit::launch.glm(dtf2fit_fixed) + end_time <- Sys.time() + + fitDtf_fixed.glm =listFit2dtf(l_fixed.glm) + fitDtf_fixed.glm$inference$timeProcess = difftime(end_time, + start_time, units = "secs") %>% as.numeric() + fitDtf_fixed.glm$inference$from = 'MASS::glm' + fitDtf_fixed.glm$inference$n_genotypes = n_genoT + expectation = getExpectation(beta.actual_fixed, + toEval = "glm" , threshold = thr) + prediction = getPrediction(fitDtf_fixed.glm$inference, + threshold = thr, alphaRisk = 0.05) + comp.glm = getComparison(actual.dtf = expectation, + inference.dtf = prediction) + + # -- DESEQ2 -- # + start_time <- Sys.time() + dds_simu = HTRsim::fit_deseq(count_table_fixed, bioDesign_fixed) + end_time <- Sys.time() + + deseqFitdtf = getCoefficientsFromDds(dds_simu) + prediction = getPrediction(deseqFitdtf, threshold = thr, + alphaRisk = 0.05, pvalCorrection = T) + comp.deseq = getComparison(actual.dtf = expectation, + inference.dtf = prediction ) + comp.deseq$timeProcess = difftime(end_time, + start_time, units = "secs") %>% as.numeric() + comp.deseq$from = 'DESEQ2' + comp.deseq$n_genotypes = n_genoT + + + # ---- Saving results ----- # + list_column = c("gene_id", "term", "actual.value" ,"estimate", "std.error", "timeProcess", "from", "n_genotypes", "statistic" , "p.value", "padj" , "prediction.label") + + #comp.deseq = comp.deseq %>% select(list_column) + #print(colnames(comp.deseq)) + #print(colnames(comp.glm)) + tmp = rbind(comp.deseq, comp.glm) + + if (exists('dtf2evaluation')){ + dtf2evaluation = rbind(dtf2evaluation, tmp) + } + else{ + dtf2evaluation = tmp + } + } +} + + +#write_tsv(tgc, file = "2023_01_17-tgc_backup.tsv") +#write_tsv(dtf2evaluation, file = "2023_02_03-dfEval_fixed_backup.tsv") + +``` + +## Evaluation + +### Preparing dataframe + +```{r} + +tgc <- Rmisc::sum-marySE(dtf2evaluation, + measurevar="timeProcess", + groupvars=c("n_genotypes","from")) +``` + +### Build graph for evaluation + +```{r message=FALSE, warning=FALSE, include=TRUE, results="hide"} + +tgc$from = factor(tgc$from, levels = c("DESEQ2","MASS::glm")) +p= ggplot(tgc, aes(x = n_genotypes, y = timeProcess, colour = from)) + + geom_line(aes(x = n_genotypes, y= timeProcess+sd), + linetype = "dashed", alpha = 0.6) + + geom_line(aes(x = n_genotypes, y= timeProcess-sd), + linetype = "dashed", alpha = 0.6) + + geom_ribbon(aes(ymin = timeProcess-sd , + ymax= timeProcess+sd, fill = from), alpha=0.4) + + geom_line(aes(x = n_genotypes, y= timeProcess), + linetype = "solid", alpha = 0.6) + + #geom_point() + + scale_y_log10() + scale_x_log10() + + scale_color_manual(values = c("#16A085", "#2C3E50", '#EC7063', '#FFC30F' )) + + scale_fill_manual(values = c("#16A085", "#2C3E50", '#EC7063', '#FFC30F' )) + +p +df2ROC = dtf2evaluation %>% + filter(from %in% c("MASS::glm", "DESEQ2"))%>% + filter(term != "(Intercept)") %>% + dplyr::mutate( + actual.label = + dplyr::if_else(abs(actual.value) < 2, + "nonDE", "DE" + ) + ) +df2ROC$n_genotypes <- factor(df2ROC$n_genotypes) +# -- ROC curve +p1 = ggplot(df2ROC, aes(d = actual.label , m = padj, col = from)) + + geom_roc(n.cuts = 0, labels = F) + facet_grid(~n_genotypes) + + scale_color_manual(values = c("#16A085", "#2C3E50")) +p1 +ggsave(p1, filename = "benchmark_fig.png", height = 8, width = 10) +``` diff --git a/results/v3/2023_02_10_benchmarkingv3.Rmd b/results/v3/2023_02_10_benchmarkingv3.Rmd new file mode 100644 index 0000000..0307f78 --- /dev/null +++ b/results/v3/2023_02_10_benchmarkingv3.Rmd @@ -0,0 +1,161 @@ +--- +title: "Random intercept" +date: "2023-02-08" +output: + html_document: + toc: true +css: ../css/air.css +--- + +## Required + + +```{r setup, message=FALSE, warning=FALSE, results="hide"} +library(HTRsim) +library(HTRfit) +library(plotROC) +``` + + +## Simulation + +### Parameters + +```{r} +# -- SETUP +n_G = 4 +n_genoT = 300 +n_env = 2 +FixIntercept = F +max_n_rep = 7 +sequencing_fact = 2 +n_clus = 1 +thr = 1 +########### +``` + +### Random intercept + +```{r, message=FALSE, warning=FALSE, include=TRUE, results="hide"} +# -- Simulate counts +mock_random = rnaMock(n_genes = n_G, n_genotypes = n_genoT, n_environments = n_env, + max_n_replicates = max_n_rep, sequencing_factor = sequencing_fact, fixIntercept = FixIntercept, n_clusters = n_clus) + +# -- count table & experimental design +count_table_random = mock_random$countTable %>% as.data.frame() +bioDesign_random = mock_random$design + +# -- ground truth +beta.actual_random = mock_random$actualParam$beta + +``` + +## Estimation + +### Launch glmmTMB::glmmTMB on simulated data (random intercept) + +```{r, message=FALSE, warning=FALSE, include=TRUE, results="hide"} +# -- glmmTMB::glmmTMB +dtf2fit_random = HTRfit::reshapeCounTable(count_table_random, bioDesign_random) +l_random.tmb = HTRfit::launch.glm_mixte(dtf2fit_random, package = "glmmTMB") +fitDtf_random.tmb =listFit2dtf(l_random.tmb) +``` + +```{r} +# -- MASS::glm +l_random.glm = HTRfit::launch.glm(dtf2fit_random) +fitDtf_random.glm =listFit2dtf(l_random.glm) +prediction = getPrediction(fitDtf_random.glm$inference, + threshold = thr, alphaRisk = 0.05) + + +glm.dtf_lng = fitDtf_random.glm$inference %>% + dplyr::mutate(type = dplyr::case_when( + str_detect(term, "genotypeG\\d+\\:environment") ~ "betaGE", + str_detect(term, "genotypeG\\d+$") ~ "betaG", + str_detect(term, "environmentE\\d+$") ~ "betaE", + str_detect(term, "(Intercept)") ~ "(Intercept)")) %>% + reshape2::dcast(., gene_id ~ type, value.var = "estimate", fun.aggregate = list) %>% unnest(c(`(Intercept)`,betaE, betaG, betaGE)) + + +glm.dtf = getExpectation(glm.dtf_lng , toEval = "glm_mixte" , threshold = thr) %>% rename(estimate = "actual.value") %>% mutate(from = "prediction Mass::glm") + +``` + +### Join Actual & Inference + +```{r} +###### -- GLMTMB -- ####### +expectation_random = getExpectation(beta.actual_random,toEval = "glm_mixte" , threshold = thr) +prediction_glmMixte = getPrediction(fitDtf_random.tmb$inference, threshold = thr, alphaRisk = 0.05) %>% mutate(from = 'prediction glm mixte' ) +# - rbind glm & glm mixte +prediction.dtf = rbind(prediction_glmMixte %>% select(estimate, term, gene_id, from), glm.dtf %>% select(estimate, term, gene_id, from)) + +# -- join actual & inference +actual2join.dtf <- data.table::data.table(expectation_random, key = c("gene_id", "term")) +inference.dtf <- data.table::data.table(prediction.dtf, key = c("gene_id", "term")) + +comparison.tmb <- actual2join.dtf[inference.dtf] + +comparison.dtf = comparison.tmb +``` + +## Evaluation + +### Build graph for evaluation mixte model + +```{r message=FALSE, warning=FALSE, include=TRUE, results="hide"} + +# -- preparing dtf +comparison.dtf$term = factor(comparison.dtf$term, levels = c("(Intercept)", "environmentE1","cor__(Intercept).environmentE1", "sd__(Intercept)", "sd__environmentE1")) + + +# -- Identity plot +p = ggplot(comparison.dtf) + + geom_point(aes(x = actual.value, + y = estimate, col = from), alpha = 0.6, size = 3) + + geom_abline(intercept = 0, slope = 1) + + facet_wrap(~term, scales = "free") + + theme(strip.text.x = element_text(size = 7)) + + scale_color_manual(values = c('#581845', '#FFC30F')) +p +#ggsave("../img/graph/poc_glmm2_1000.png", p,) +``` + + + +### Visualization + +```{r, message=FALSE, warning=FALSE} + +library(ggridges) + +pred_mixte = prediction_glmMixte %>% select(estimate, term, gene_id, from) %>% reshape2::dcast(., gene_id + from ~ term, value.var = "estimate") +pred_mixte = pred_mixte %>% mutate(betaG_E0 = list(rnorm(1000, mean = `(Intercept)` , sd = `sd__(Intercept)` ))) %>% + mutate(betaG_E1 = list(rnorm(1000, mean = environmentE1 + `(Intercept)` , sd = `sd__(Intercept)` ))) %>% + mutate(betaGE_E1 = list(rnorm(1000, mean = environmentE1 + `(Intercept)` , sd = sd__environmentE1 ))) %>% + select(gene_id, betaG_E0, betaG_E1, betaGE_E1, from) %>% + unnest(c(betaG_E0, betaG_E1, betaGE_E1)) + + +pred_glm = glm.dtf_lng %>% mutate(betaG_E0 = `(Intercept)` + betaG) %>% + mutate(betaG_E1 = `(Intercept)` + betaE + betaG ) %>% + mutate(betaGE_E1 = `(Intercept)` + betaE + betaGE) %>% select(gene_id, betaG_E0, betaG_E1, betaGE_E1) %>% mutate(from = "prediction MASS::glm") + +actu = beta.actual_random %>% mutate(betaG_E0 = `(Intercept)` + betaG) %>% + mutate(betaG_E1 = `(Intercept)` + betaE + betaG ) %>% + mutate(betaGE_E1 = `(Intercept)` + betaE + betaGE) %>% select(gene_id, betaG_E0, betaG_E1, betaGE_E1) %>% mutate(from = "Actual") + + +dtf = rbind(actu, pred_glm, pred_mixte) %>% reshape2::melt( + id.vars = c("gene_id", 'from'), + value.name = "value", + variable.name = 'env') + +dtf %>% dplyr::filter(gene_id %in% c("gene1")) +p = ggplot(dtf %>% filter(gene_id %in% c("gene1"))) + + geom_density_ridges(aes(x = value , y = from, fill = from), alpha = 0.6) + facet_grid(env~gene_id, scales = 'free_x') + scale_fill_manual(values = c("#FFE699", "#EC7063", "#2C3E50")) +p +ggsave("../img/graph/poc_glmm1000.png", p, height = 6, width = 8) + +``` diff --git a/results/v3/2023_02_21_benchmarking_distrib.Rmd b/results/v3/2023_02_21_benchmarking_distrib.Rmd new file mode 100644 index 0000000..0996320 --- /dev/null +++ b/results/v3/2023_02_21_benchmarking_distrib.Rmd @@ -0,0 +1,201 @@ +--- +title: "Random intercept" +date: "2023-02-08" +output: + html_document: + toc: true +css: ../css/air.css +--- + +## Required + + +```{r setup, message=FALSE, warning=FALSE, results="hide"} +library(HTRsim) +library(HTRfit) +library(plotROC) +``` + + +## Simulation + +### Parameters + +```{r} +# -- SETUP +n_G = 5 +n_genoT = 200 +n_env = 2 +FixIntercept = F +max_n_rep = 15 +sequencing_fact = 2 +n_clus = 7 +thr = 1 +########### +``` + +### Random intercept + +```{r, message=FALSE, warning=FALSE, include=TRUE, results="hide"} +# -- Simulate counts +mock_random = rnaMock(n_genes = n_G, n_genotypes = n_genoT, n_environments = n_env, + max_n_replicates = max_n_rep, sequencing_factor = sequencing_fact, fixIntercept = FixIntercept, n_clusters = n_clus) + +# -- count table & experimental design +count_table_random = mock_random$countTable %>% as.data.frame() +bioDesign_random = mock_random$design + +# -- ground truth +beta.actual_random = mock_random$actualParam$beta + +beta.actual_random$idx_mvrnom %>% unique() +beta.actual_random %>% filter(gene_id == "gene10") +``` + +## Estimation + +### Launch glmmTMB::glmmTMB on simulated data (random intercept) + +```{r, message=FALSE, warning=FALSE, include=TRUE, results="hide"} +# -- glmmTMB::glmmTMB +dtf2fit_random = HTRfit::reshapeCounTable(count_table_random, bioDesign_random) +#dtf2fit_random2 = dtf2fit_random %>% filter(genotype != "G0") +l_random.tmb = HTRfit::launch.glm_mixte(dtf2fit_random2, package = "glmmTMB") +fitDtf_random.tmb =listFit2dtf(l_random.tmb) +``` + +```{r} +# -- MASS::glm +l_random.glm = HTRfit::launch.glm(dtf2fit_random) +fitDtf_random.glm =listFit2dtf(l_random.glm) +``` + + +```{r} +prediction_glm = fitDtf_random.glm$inference %>% + dplyr::mutate(type = dplyr::case_when( + str_detect(term, "genotypeG\\d+\\:environment") ~ "betaGE", + str_detect(term, "genotypeG\\d+$") ~ "betaG", + str_detect(term, "environmentE\\d+$") ~ "betaE", + str_detect(term, "(Intercept)") ~ "(Intercept)")) %>% + reshape2::dcast(., gene_id ~ type, value.var = "estimate", fun.aggregate = list) %>% unnest(c(`(Intercept)`,betaE, betaG, betaGE)) +glm.dtf = getExpectation(prediction_glm , toEval = "glm_mixte" , threshold = thr) %>% dplyr::rename(estimate = "actual.value") %>% mutate(from = "prediction Mass::glm") + + +prediction_glmMixte = getPrediction(fitDtf_random.tmb$inference, threshold = thr, alphaRisk = 0.05) %>% mutate(from = 'prediction glm mixte' ) + +``` + + +```{r, message=FALSE, warning=FALSE} + +library(ggridges) + +prediction_glmMixte = prediction_glmMixte %>% select(estimate, term, gene_id, from) %>% reshape2::dcast(., gene_id + from ~ term, value.var = "estimate") +nb_tirage = 10000 +prediction_glmMixte = prediction_glmMixte %>% + mutate(betaG_E0 = pmap(list(nb_tirage, `(Intercept)`, `sd__(Intercept)`), + function (n, mu, sd) rnorm(n, mu, sd))) %>% + mutate(betaG_E1 = pmap(list(nb_tirage, environmentE1 + `(Intercept)`, `sd__(Intercept)`), + function (n, mu, sd) rnorm(n, mu, sd))) %>% + mutate(betaGE_E1 = pmap(list(nb_tirage, environmentE1, `sd__environmentE1`), + function (n, mu, sd) rnorm(n, mu, sd))) %>% + select(gene_id, betaG_E0, betaG_E1, betaGE_E1, from) %>% + unnest(c(betaG_E0, betaG_E1, betaGE_E1)) %>% as.data.frame() + + + +prediction_glm = prediction_glm %>% mutate(betaG_E0 = `(Intercept)` + betaG ) %>% + mutate(betaG_E1 = `(Intercept)` + betaE + betaG ) %>% + mutate(betaGE_E1 = betaE + betaGE) %>% + select(gene_id, betaG_E0, betaG_E1, betaGE_E1) %>% mutate(from = "prediction MASS::glm") %>% as.data.frame() + +actu = beta.actual_random %>% mutate(betaG_E0 = `(Intercept)` + betaG ) %>% + mutate(betaG_E1 = `(Intercept)` + betaE + betaG ) %>% + mutate(betaGE_E1 = betaE + betaGE ) %>% select(gene_id, betaG_E0, betaG_E1, betaGE_E1) %>% + mutate(from = "Actual") %>% + as.data.frame() + + +dtf = rbind(actu, prediction_glm,prediction_glmMixte) %>% reshape2::melt( + id.vars = c("gene_id", 'from'), + value.name = "value", + variable.name = 'env') %>% as.data.frame() %>% separate(env, c("type", "Env"), sep = "_") + + +#dtf %>% dplyr::filter(gene_id == "gene1") +p = ggplot( dtf %>% dplyr::filter(gene_id %in% c("gene3")) )+ + geom_density_ridges(aes(x = value , y = from, fill = Env), quantile_lines=TRUE, + quantile_fun=function(x,...)mean(x), alpha = 0.6) + facet_wrap(~type, ncol = 2, scales = "free_x") + scale_fill_manual(values = c("#2E75B6", "#F4B183")) +p + +ggsave("../../img/distrib/geneE_whole.svg", p, height = 4, width = 6) + +``` + + +```{r} + +reshapeActualDtf_glm_mixte2 <- function(actual.dtf) { + actual.dtf <- actual.dtf %>% + dplyr::group_by(gene_id) %>% + dplyr::summarise( + tmp = mean(`(Intercept)` + betaG), + environmentE1 = mean(betaE + betaGE), + "sd__(Intercept)" = sd(`(Intercept)` + betaG), + sd__environmentE1 = sd(betaGE + betaE), + "cor__(Intercept).environmentE1" = cor((betaGE + betaE), (`(Intercept)` + betaG)) + ) %>% + dplyr::rename("(Intercept)" = tmp) %>% + reshape2::melt(id = "gene_id", value.name = "actual.value", variable.name = "term") + return(actual.dtf) +} + + + +``` + + +```{r} +beta.actual_random$idx_mvrnom = factor(beta.actual_random$idx_mvrnom) +ggplot(beta.actual_random) + geom_point(aes( x = betaG, y = betaGE, col = idx_mvrnom )) +ggplot(beta.actual_random ) + geom_point(aes( x = betaG, y = betaGE, col = gene_id )) + + +###### -- GLMTMB -- ####### +expectation_random = reshapeActualDtf_glm_mixte2(beta.actual_random) +prediction_glmMixte = getPrediction(fitDtf_random.tmb$inference, threshold = thr, alphaRisk = 0.05) %>% mutate(from = 'prediction glm mixte' ) + +# - rbind glm & glm mixte +prediction.dtf = rbind(prediction_glmMixte %>% select(estimate, term, gene_id, from), glm.dtf %>% select(estimate, term, gene_id, from)) + +# -- join actual & inference +actual2join.dtf <- data.table::data.table(expectation_random, key = c("gene_id", "term")) +inference.dtf <- data.table::data.table(prediction.dtf, key = c("gene_id", "term")) + +comparison.tmb <- actual2join.dtf[inference.dtf] + +comparison.dtf = comparison.tmb +``` + +## Evaluation + +### Build graph for evaluation mixte model + +```{r message=FALSE, warning=FALSE, include=TRUE, results="hide"} + +# -- preparing dtf +comparison.dtf$term = factor(comparison.dtf$term, levels = c("(Intercept)", "environmentE1","cor__(Intercept).environmentE1", "sd__(Intercept)", "sd__environmentE1")) + + +# -- Identity plot +p = ggplot(comparison.dtf %>% filter(from == "prediction glm mixte") %>% filter(!is.na(term))) + + geom_point(aes(x = actual.value, + y = estimate, col = gene_id), alpha = 0.6, size = 3) + + geom_abline(intercept = 0, slope = 1) + + facet_wrap(~term, scales = "free") + + theme(strip.text.x = element_text(size = 7)) #+ + scale_color_manual(values = c('#581845', '#FFC30F')) +p +#ggsave("../img/graph/poc_glmm2_1000.png", p,) +``` diff --git a/results/v3/2023_02_21_benchmarkingv3.Rmd b/results/v3/2023_02_21_benchmarkingv3.Rmd new file mode 100644 index 0000000..825acab --- /dev/null +++ b/results/v3/2023_02_21_benchmarkingv3.Rmd @@ -0,0 +1,193 @@ +--- +title: "Benchmarking packages" +date: "2023-02-08" +output: + html_document: + toc: true +css: css/air.css +--- + +## Required + + +```{r setup, message=FALSE, warning=FALSE, results="hide"} +library(HTRsim) +library(HTRfit) +library(plotROC) +library(Rmisc) +``` + + +## Simulation + +### Parameters + +```{r} +# -- SETUP +n_G = 100 +n_genoT_list = c(2, 3, 6) +n_env = 2 +FixIntercept = T +max_n_rep = 10 +sequencing_fact = 2 +n_clus = 1 +thr = 1.2 +number_ofRepetition = 2 +########### +``` + +### get time to fit + +```{r message=FALSE, warning=FALSE, include=TRUE, results="hide", eval=FALSE} +# ---------------- # +# /!\ Very very long +# Not exectuted in the notebook +# ------------------------- # + +#rm(dtf2evaluation) +for (n_genoT in n_genoT_list){ + for (i in 1:number_ofRepetition){ + # -- traceback + print(paste("Number of genotype:", n_genoT, sep = " ")) + print(paste("Iteration:", i, sep = " ")) + + # -- Simulate counts -- # + mock_random = rnaMock(n_genes = n_G, + n_genotypes = n_genoT, + n_environments = n_env, + max_n_replicates = max_n_rep, + sequencing_factor = sequencing_fact, + fixIntercept = FixIntercept, + n_clusters = n_clus) + + # -- count table & experimental design + count_table_random = mock_random$countTable %>% as.data.frame() + bioDesign_random = mock_random$design + # -- ground truth -- # + beta.actual_random = mock_random$actualParam$beta + + # -- MASS::glm -- #x + + dtf2fit_random = HTRfit::reshapeCounTable(count_table_random, + bioDesign_random) + start_time <- Sys.time() + l_random.glm = HTRfit::launch.glm(dtf2fit_random) + end_time <- Sys.time() + + fitDtf.glm =listFit2dtf(l_random.glm) + fitDtf.glm$inference$timeProcess = difftime(end_time, + start_time, units = "secs") %>% as.numeric() + fitDtf.glm$inference$from = 'MASS::glm' + fitDtf.glm$inference$n_genotypes = n_genoT + # -- convert to log base 10 + fitDtf.glm$inference$estimate = fitDtf.glm$inference$estimate/log(2) + fitDtf.glm$inference$std.error = fitDtf.glm$inference$std.error/log(2) + expectation = getExpectation(beta.actual_random, + toEval = "glm" , threshold = thr) + prediction = getPrediction(fitDtf.glm$inference, + threshold = thr, alphaRisk = 0.05) + comp.glm = getComparison(actual.dtf = expectation, + inference.dtf = prediction) + + # -- DESEQ2 -- # + if (n_genoT < 100 ){ + start_time <- Sys.time() + dds_simu = HTRsim::fit_deseq(count_table_random, bioDesign_random) + end_time <- Sys.time() + + deseqFitdtf = getCoefficientsFromDds(dds_simu) + prediction = getPrediction(deseqFitdtf, threshold = thr, + alphaRisk = 0.05, pvalCorrection = T) + comp.deseq = getComparison(actual.dtf = expectation, + inference.dtf = prediction ) + comp.deseq$timeProcess = difftime(end_time, + start_time, units = "secs") %>% as.numeric() + comp.deseq$from = 'DESEQ2' + comp.deseq$n_genotypes = n_genoT + + # ---- Saving results ----- # + #list_column = c("gene_id", "term", "actual.value" ,"estimate", "std.error", "timeProcess", "from", "n_genotypes", "statistic" , "p.value", "padj" , "prediction.label") + + tmp = rbind(comp.deseq, comp.glm) + } + else{ + tmp = comp.glm + } + + if (exists('dtf2evaluation')){ + dtf2evaluation = rbind(dtf2evaluation, tmp) + } + else{ + dtf2evaluation = tmp + } + } +} + + +#write_tsv(tgc, file = "2023_01_17-tgc_backup.tsv") +#write_tsv(dtf2evaluation, file = "2023_02_21-dfEval_random_backup2.tsv") +dtf2evaluation %>% group_by(from, n_genotypes) %>% tally() +``` + +## Evaluation + +### Preparing dataframe + +```{r} +#dtf2evaluation2 = dtf2evaluation +#dtf2evaluation <- read_tsv('2023_02_21-dfEval_random_backup2.tsv', show_col_types = FALSE) +#dtf2evaluation = rbind( dtf2evaluation2, dtf2evaluation ) +tgc <- Rmisc::summarySE(dtf2evaluation, + measurevar="timeProcess", + groupvars=c("n_genotypes","from")) +# -- backup +#write_tsv(tgc, file = "2023_01_17-tgc_backup.tsv") + +#tgc <- read_tsv('2023_01_17-tgc_backup.tsv', show_col_types = FALSE) +``` + +### Build graph for evaluation + +```{r message=FALSE, warning=FALSE, include=TRUE, results="hide"} +#tgc$n_genotypes <- factor(tgc$n_genotypes) +#tgc$n_genotypes = as.numeric(as.character(tgc$n_genotypes)) +tgc$from = factor(tgc$from, levels = c("DESEQ2","MASS::glm", "lme4", "glmmTMB" )) +p= ggplot(tgc, aes(x = n_genotypes, y = timeProcess, colour = from)) + + geom_line(aes(x = n_genotypes, y= timeProcess+sd), + linetype = "dashed", alpha = 0.6) + + geom_line(aes(x = n_genotypes, y= timeProcess-sd), + linetype = "dashed", alpha = 0.6) + + geom_ribbon(aes(ymin = timeProcess-sd , + ymax= timeProcess+sd, fill = from), alpha=0.4) + + geom_line(aes(x = n_genotypes, y= timeProcess), + linetype = "solid", alpha = 0.6) + + #geom_point() + + scale_y_log10() + scale_x_log10() + + scale_color_manual(values = c("#16A085", "#2C3E50", '#EC7063', '#FFC30F' )) + + scale_fill_manual(values = c("#16A085", "#2C3E50", '#EC7063', '#FFC30F' )) + +p +df2ROC = dtf2evaluation %>% + filter(from %in% c("MASS::glm", "DESEQ2"))%>% + filter(term != "(Intercept)") %>% + dplyr::mutate( + actual.label = + dplyr::if_else(abs(actual.value) < 2, + "nonDE", "DE" + ) + ) +df2ROC$n_genotypes <- factor(df2ROC$n_genotypes) +# -- ROC curve +p1 = ggplot(df2ROC %>% filter(n_genotypes != 5) , + aes(d = actual.label , m = padj, col = from)) + + geom_roc(n.cuts = 0, labels = F) + facet_grid(~n_genotypes) + + scale_color_manual(values = c("#16A085", "#2C3E50")) + theme(axis.text.x = element_text(size = 6)) +p1 +ggsave(filename = "ROCcurves_genotincr.png", p1 , height = 4, width = 8) +``` + +### Visualization + +```{r, message=FALSE, warning=FALSE} +p +``` diff --git a/results/v3/GLM_mixte/2023_02_08_benchmarkingPackages.Rmd b/results/v3/GLM_mixte/2023_02_08_benchmarkingPackages.Rmd index 75a7a66..4e86cb9 100644 --- a/results/v3/GLM_mixte/2023_02_08_benchmarkingPackages.Rmd +++ b/results/v3/GLM_mixte/2023_02_08_benchmarkingPackages.Rmd @@ -1,5 +1,5 @@ --- -title: "Random intercept" +title: "Benchmarking packages" date: "2023-02-08" output: html_document: diff --git a/results/v3/GLM_mixte/2023_02_08_randomIntercept.Rmd b/results/v3/GLM_mixte/2023_02_08_randomIntercept.Rmd index 9241c4d..2be7f63 100644 --- a/results/v3/GLM_mixte/2023_02_08_randomIntercept.Rmd +++ b/results/v3/GLM_mixte/2023_02_08_randomIntercept.Rmd @@ -24,10 +24,10 @@ library(plotROC) ```{r} # -- SETUP n_G = 10 -n_genoT = 200 +n_genoT = 300 n_env = 2 FixIntercept = F -max_n_rep = 5 +max_n_rep = 15 sequencing_fact = 1 n_clus = 3 thr = 2 @@ -121,7 +121,7 @@ p = ggplot(comparison.dtf) + facet_wrap(~term, scales = "free") + theme(strip.text.x = element_text(size = 7)) + scale_color_manual(values = c('#581845', '#FFC30F')) - +p #ggsave("../img/graph/poc_glmm2_1000.png", p,) ``` @@ -129,4 +129,24 @@ p = ggplot(comparison.dtf) + ```{r, message=FALSE, warning=FALSE} p + +library(ggridges) + +x = comparison.dtf %>% reshape2::dcast(., gene_id ~ term, value.var = "estimate") +gene_vec = rep(x$gene_id, 1000) +a = rnorm(n_G* 1000, sd = x$`sd__(Intercept)`, mean = x$`(Intercept)` ) %>% data.frame() %>% mutate(Env = "(Intercept)") %>% mutate(gene_id = gene_vec) +b = rnorm(n_G*1000, sd = x$sd__environmentE1, mean = x$`sd__(Intercept)` ) %>% data.frame() %>% mutate(Env = "E1") %>% mutate(gene_id = gene_vec) +dtf = rbind(a,b) %>% dplyr::rename(value = ".") %>% mutate(from = "prediction") + +x2 = comparison.dtf %>% reshape2::dcast(., gene_id ~ term, value.var = "actual.value") +gene_vec = rep(x$gene_id, 1000) +a = rnorm(n_G* 1000, sd = x2$`sd__(Intercept)`, mean = x2$`(Intercept)` ) %>% data.frame() %>% mutate(Env = "(Intercept)") %>% mutate(gene_id = gene_vec) +b = rnorm(n_G*1000, sd = x2$sd__environmentE1, mean = x2$`sd__(Intercept)` ) %>% data.frame() %>% mutate(Env = "E1") %>% mutate(gene_id = gene_vec) +dtf2 = rbind(a,b) %>% dplyr::rename(value = ".") %>% mutate(from = "actual") + +dtf = rbind(dtf, dtf2) +p = ggplot(dtf %>% filter(gene_id %in% c("gene1", "gene2", "gene3"))) + geom_density_ridges(aes(x = value , y = from, fill = from), alpha = 0.6) + facet_wrap(gene_id~Env, scales = 'free_x', ncol = 2) + xlim(c(-15,15)) + scale_fill_manual(values = c("#FFE699", "#1F4E79")) +p +ggsave("../img/graph/poc_glmm1000.png", p, height = 6, width = 8) + ``` diff --git a/src/model.fit/HTRfit/R/model_fitting.R b/src/model.fit/HTRfit/R/model_fitting.R index a522b9c..337afcd 100644 --- a/src/model.fit/HTRfit/R/model_fitting.R +++ b/src/model.fit/HTRfit/R/model_fitting.R @@ -55,9 +55,12 @@ launch.glm <- function(data2fit, fit.glm <- function(data, id, model2fit, fit_by) { # -- function executed if all perform well f <- function(data, id, model2fit, fit_by) { - fit <- MASS::glm.nb(model2fit, data = data, link = log) + fit <- MASS::glm.nb(model2fit, data = data, link = log, control = glm.control(maxit=1000)) fit.dtf <- tidySummary(fit, "glm") fit.dtf$inference <- fit.dtf$inference %>% dplyr::mutate(!!fit_by := id) + # -- convert estimation from natural logarithm to log base 2 + fit.dtf$inference$estimate <- fit.dtf$inference$estimate/log(2) + fit.dtf$inference$std.error <- fit.dtf$inference$std.error/log(2) fit.dtf$fitQuality <- fit.dtf$fitQuality %>% dplyr::mutate(!!fit_by := id) fit.dtf$dispersion <- list(dispersion.estimate = fit$theta) %>% as.data.frame() %>% @@ -151,7 +154,7 @@ launch.glm_mixte <- function(data2fit, fit_by = "gene_id", package = "glmmTMB", threads = 4) { # -- log - futile.logger::flog.info("Fit started") + futile.logger::flog.info("Fit started per %s", fit_by) futile.logger::flog.info("GLM mixte: %s", package) data2fit <- data2fit %>% as.data.frame() @@ -207,10 +210,15 @@ fit.glm_mixte <- function(data, id, model2fit, fit_by, package = "glmmTMB") { fit.dtf <- tidySummary(fit, "glm_mixte") fit.dtf$inference <- fit.dtf$inference %>% dplyr::mutate(!!fit_by := id) fit.dtf$fitQuality <- fit.dtf$fitQuality %>% dplyr::mutate(!!fit_by := id) - # if warning while fitting => no AIC and BIC => set to NA + # -- convert estimation from natural logarithm to log base 2 + fit.dtf$inference = fit.dtf$inference %>% dplyr::mutate( + estimate = if_else(str_detect(term, "cor_"), + estimate, estimate/log(2) )) + # avoid error if missing columns => set to NA if (!("AIC" %in% colnames(fit.dtf$fitQuality))) fit.dtf$fitQuality$AIC = NA if (!("BIC" %in% colnames(fit.dtf$fitQuality))) fit.dtf$fitQuality$BIC = NA if (!("logLik" %in% colnames(fit.dtf$fitQuality))) fit.dtf$fitQuality$logLik = NA + if (!("deviance" %in% colnames(fit.dtf$fitQuality))) fit.dtf$fitQuality$deviance = NA fit.dtf$dispersion <- list(dispersion.estimate = theta) %>% as.data.frame() %>% diff --git a/src/v3/HTRsim/R/countsGenerator.R b/src/v3/HTRsim/R/countsGenerator.R index c8b8af3..5736beb 100644 --- a/src/v3/HTRsim/R/countsGenerator.R +++ b/src/v3/HTRsim/R/countsGenerator.R @@ -33,7 +33,7 @@ getBetaforSimulation <- function(n_genes = 100, n_genotypes = 20, mvrnormFit_lis res <- cbind(gene_id, genotype, beta.dtf, idx_mvrnom) ## fixing Genotype and GxE effects to 0 for G0 (reference) ## -> hide in (Intercept) - res[res$genotype == "G0", c("betaG", "betaGE")] <- 0 + #res[res$genotype == "G0", c("betaG", "betaGE")] <- 0 if (fixIntercept) { res <- res %>% dplyr::group_by(gene_id) %>% diff --git a/src/v3/HTRsim/R/multinormDistrib_manipulations.R b/src/v3/HTRsim/R/multinormDistrib_manipulations.R index 06d2e36..956d1df 100644 --- a/src/v3/HTRsim/R/multinormDistrib_manipulations.R +++ b/src/v3/HTRsim/R/multinormDistrib_manipulations.R @@ -18,7 +18,7 @@ mvnormFitting <- function(beta.dtf) { #' @examples getPublicMvnormFit <- function() { ##### Range of observed value ######### - dds.extraction <- publicDDS_extraction() + dds.extraction <- embedded_CounTable2observedValues() beta_observed.dtf <- dds.extraction$beta list_fit.mvnorm <- getListMvnormFit(beta_observed.dtf) return(list_fit.mvnorm) @@ -37,7 +37,7 @@ getPublicMvnormFit <- function() { #' #' @examples getListMvnormFit <- function(beta.dtf, n_clusters = 5) { - dtf2cluster <- beta.dtf %>% dplyr::select(c("(Intercept)", "betaG", "betaE","betaGE")) + dtf2cluster <- beta.dtf %>% dplyr::select(c("betaG", "betaGE")) cluster_kmean <- stats::kmeans(dtf2cluster, n_clusters)$cluster list_fit.mvnorm <- purrr::map( .x = 1:n_clusters, diff --git a/src/v3/HTRsim/R/workflow.R b/src/v3/HTRsim/R/workflow.R index c6c2629..93d3227 100644 --- a/src/v3/HTRsim/R/workflow.R +++ b/src/v3/HTRsim/R/workflow.R @@ -49,6 +49,7 @@ rnaMock <- function(n_genes, n_clusters = 5, fixBetaE = T, fixIntercept = T) { + # -log log.simulation(n_genes, n_genotypes, @@ -61,7 +62,11 @@ rnaMock <- function(n_genes, n_clusters) ## Fit mvnorm ## - list_fit.mvnorm <- getListMvnormFit(dds.extraction$beta) + list_fit.mvnorm <- getListMvnormFit(dds.extraction$beta, n_clusters) + + ## -- add higher env effect + list_fit.mvnorm = map(1:n_clusters, function(.x) {list_fit.mvnorm[[.x]]$mu[3] = list_fit.mvnorm[[.x]]$mu[3] + runif(1, -20, 20); list_fit.mvnorm[[.x]]} ) + ##### Ground truth ###### beta.actual <- getBetaforSimulation( -- GitLab