From 871ffe49a79b7645000e21c5313c85468ad33f6c Mon Sep 17 00:00:00 2001
From: aduvermy <arnaud.duvermy@ens-lyon.fr>
Date: Tue, 17 Jan 2023 18:42:13 +0100
Subject: [PATCH] update results

---
 results/v3/GLM/2023_02_04_dispersion.Rmd      |   2 +-
 results/v3/GLM/2023_02_08_randomIntercept.Rmd | 141 ++++++++++++++++
 .../v3/GLM_mixte/2023_01_17-tgc_backup.tsv    |  11 ++
 .../v3/GLM_mixte/2023_02_06_dispersion.Rmd    |  78 +++++++++
 .../2023_02_08_benchmarkingPackages.Rmd       | 159 ++++++++++++++++++
 .../GLM_mixte/2023_02_08_randomIntercept.Rmd  | 132 +++++++++++++++
 6 files changed, 522 insertions(+), 1 deletion(-)
 create mode 100644 results/v3/GLM/2023_02_08_randomIntercept.Rmd
 create mode 100644 results/v3/GLM_mixte/2023_01_17-tgc_backup.tsv
 create mode 100644 results/v3/GLM_mixte/2023_02_06_dispersion.Rmd
 create mode 100644 results/v3/GLM_mixte/2023_02_08_benchmarkingPackages.Rmd
 create mode 100644 results/v3/GLM_mixte/2023_02_08_randomIntercept.Rmd

diff --git a/results/v3/GLM/2023_02_04_dispersion.Rmd b/results/v3/GLM/2023_02_04_dispersion.Rmd
index e4a355c..cecb263 100644
--- a/results/v3/GLM/2023_02_04_dispersion.Rmd
+++ b/results/v3/GLM/2023_02_04_dispersion.Rmd
@@ -21,7 +21,7 @@ library(HTRfit)
 
 ```{r, message=FALSE, warning=FALSE, include=TRUE, results="hide"}
 # -- SETUP
-n_G = 1000
+n_G = 300
 n_genoT = 3
 n_env = 2
 max_n_rep = 15
diff --git a/results/v3/GLM/2023_02_08_randomIntercept.Rmd b/results/v3/GLM/2023_02_08_randomIntercept.Rmd
new file mode 100644
index 0000000..9ef903c
--- /dev/null
+++ b/results/v3/GLM/2023_02_08_randomIntercept.Rmd
@@ -0,0 +1,141 @@
+---
+title: "Random intercept"
+date: "2023-02-08"
+output: 
+  html_document:
+    toc: true 
+css: ../css/air.css
+---
+
+## Required
+
+
+```{r setup, message=FALSE, warning=FALSE, results="hide"}
+library(HTRsim)
+library(HTRfit)
+library(plotROC)
+```
+
+
+## Simulation
+
+### Parameters
+
+```{r}
+# -- SETUP
+n_G = 2000
+n_genoT = 26
+n_env = 2
+max_n_rep = 3
+sequencing_fact = 2
+thr = 2
+###########
+```
+
+### Intercept fixed
+
+```{r, message=FALSE, warning=FALSE, include=TRUE, results="hide"}
+# -- Simulate counts
+mock_fixed = rnaMock(n_genes = n_G, n_genotypes = n_genoT, n_environments = n_env, 
+    max_n_replicates = max_n_rep, sequencing_factor = sequencing_fact, fixIntercept = T)
+
+# -- count table & experimental design
+count_table_fixed = mock_fixed$countTable %>% as.data.frame()
+bioDesign_fixed = mock_fixed$design
+
+# -- ground truth
+beta.actual_fixed = mock_fixed$actualParam$beta
+```
+
+### Random intercept 
+
+```{r, message=FALSE, warning=FALSE, include=TRUE, results="hide"}
+# -- Simulate counts
+mock_random = rnaMock(n_genes = n_G, n_genotypes = n_genoT, n_environments = n_env, 
+    max_n_replicates = max_n_rep, sequencing_factor = sequencing_fact, fixIntercept = F)
+
+# -- count table & experimental design
+count_table_random = mock_random$countTable %>% as.data.frame()
+bioDesign_random = mock_random$design
+
+# -- ground truth
+beta.actual_random = mock_random$actualParam$beta
+
+```
+
+## Estimation
+
+### Launch MASS::glm.nb on simulated data (fixed intercept)
+
+```{r, message=FALSE, warning=FALSE, include=TRUE, results="hide"}
+# -- MASS:glm.nb
+dtf2fit_fixed = HTRfit::reshapeCounTable(count_table_fixed, bioDesign_fixed)
+l_fixed = HTRfit::launch.glm(dtf2fit_fixed)
+fitDtf_fixed =listFit2dtf(l_fixed)
+```
+
+### Launch MASS::glm.nb on simulated data (random intercept)
+
+```{r, message=FALSE, warning=FALSE, include=TRUE, results="hide"}
+# -- MASS:glm.nb
+dtf2fit_random = HTRfit::reshapeCounTable(count_table_random, bioDesign_random)
+l_random = HTRfit::launch.glm(dtf2fit_random)
+fitDtf_random =listFit2dtf(l_random)
+```
+
+### Join Actual & Inference
+
+```{r}
+
+###### -- Fixed Intercept -- ########
+expectation_fixed = getExpectation(beta.actual_fixed, threshold = thr)
+prediction_fixed = getPrediction(fitDtf_fixed$inference, threshold = thr, alphaRisk = 0.05)
+# -- join actual & inference
+comparison_fixed = getComparison(actual.dtf = expectation_fixed, inference.dtf = prediction_fixed)
+#####################################
+
+###### -- Random Intercept -- #######
+expectation_random = getExpectation(beta.actual_random, threshold = thr)
+prediction_random = getPrediction(fitDtf_random$inference, threshold = thr, alphaRisk = 0.05)
+# -- join actual & inference
+comparison_random = getComparison(actual.dtf = expectation_random, inference.dtf = prediction_random)
+#####################################
+
+```
+
+### Join random & fixed dataframe
+
+```{r}
+# -- Annotations
+comparison_random  = comparison_random %>% dplyr::mutate(from = "Random intercept")
+comparison_fixed = comparison_fixed %>% dplyr::mutate(from = "Fixed intercept")
+# -- join
+df2evaluation = rbind(comparison_fixed, comparison_random)
+```
+
+
+## Evaluation
+
+### Build graph for evaluation 
+
+```{r message=FALSE, warning=FALSE, include=TRUE, results="hide"}
+# -- ROC curve
+p1 = ggplot(df2evaluation %>% filter(beta != "(Intercept)"), 
+           aes(d = actual.label , m = padj, col = from)) + 
+  geom_roc(n.cuts = 0, labels = F)   #+ 
+  scale_color_manual(values = c("#yellow","#BDBDBD"))
+#ggsave("../img/graph/replicates_eff.png", p, height = 4, width = 6)
+
+# Identity plot
+p2 = ggplot(df2evaluation %>% filter(beta != "(Intercept)")) + 
+  geom_point(aes(x = actual.value, y = estimate, color = padj), alpha = 0.5) +
+  geom_abline(intercept = 0, slope = 1) + 
+  facet_grid(beta~from, scales = "free")
+```
+
+### Visualization
+
+```{r, message=FALSE, warning=FALSE}
+p1
+p2
+```
diff --git a/results/v3/GLM_mixte/2023_01_17-tgc_backup.tsv b/results/v3/GLM_mixte/2023_01_17-tgc_backup.tsv
new file mode 100644
index 0000000..3866054
--- /dev/null
+++ b/results/v3/GLM_mixte/2023_01_17-tgc_backup.tsv
@@ -0,0 +1,11 @@
+n_genotypes	from	N	timeProcess	sd	se	ci
+20	glmmTMB	200	15.130752444267273	2.6080884089635843	0.1844196999912184	0.3636676342866279
+20	lme4	192	20.31276837239663	2.8278579904444543	0.20408307150164248	0.40254610129193336
+50	glmmTMB	200	20.799276530742645	5.514627455256488	0.38994304693293763	0.7689507432848744
+50	lme4	200	40.54833024740219	19.021427312622208	1.345018024060217	2.6523170946821053
+100	glmmTMB	200	25.420783281326294	4.2043931998868596	0.29729549424146057	0.5862537954460072
+100	lme4	200	37.98638904094696	4.275002040382448	0.3022882932340756	0.5960993780936279
+200	glmmTMB	200	39.15629369020462	5.9148858659293335	0.4182455905743096	0.8247621294374486
+200	lme4	200	68.16497600078583	12.042067627674905	0.8515027679035926	1.67912645561597
+400	glmmTMB	200	66.25662302970886	7.7012016308806155	0.5445571896480582	1.0738431138458309
+400	lme4	200	136.99138724803925	35.908038766273386	2.539081771074134	5.0069589148599665
diff --git a/results/v3/GLM_mixte/2023_02_06_dispersion.Rmd b/results/v3/GLM_mixte/2023_02_06_dispersion.Rmd
new file mode 100644
index 0000000..909b205
--- /dev/null
+++ b/results/v3/GLM_mixte/2023_02_06_dispersion.Rmd
@@ -0,0 +1,78 @@
+---
+title: "Dispersion"
+date: "2023-02-01"
+output: 
+  html_document:
+    toc: true 
+css: ../css/air.css
+---
+
+## Required
+
+
+```{r setup, message=FALSE, warning=FALSE, results="hide"}
+library(HTRsim)
+library(HTRfit)
+```
+
+
+## Simulation
+
+
+```{r, message=FALSE, warning=FALSE, include=TRUE, results="hide"}
+# -- SETUP
+n_G = 10
+n_genoT = 200
+n_env = 2
+max_n_rep = 5
+sequencing_fact = 1
+thr = 2
+###########
+
+# -- Simulate counts
+mock = rnaMock(n_genes = n_G, n_genotypes = n_genoT, n_environments = n_env, 
+    max_n_replicates = max_n_rep, sequencing_factor = sequencing_fact)
+
+# -- count table & experimental design
+count_table = mock$countTable %>% as.data.frame()
+bioDesign = mock$design
+
+# -- dispersion value per gene & per condition
+mock$actualParameters$dispersion[1:3, 1:3]
+
+```
+
+## Estimation
+
+### Launch MASS::glm.nb on simulated data
+
+```{r, message=FALSE, warning=FALSE, include=TRUE, results="hide"}
+# -- MASS:glm.nb
+dtf2fit = HTRfit::reshapeCounTable(count_table, bioDesign)
+l = HTRfit::launch.glm_mixte(dtf2fit)
+fitDtf =listFit2dtf(l)
+dispersion_estimated = fitDtf$dispersion
+```
+
+### Join Actual & Inference
+
+```{r}
+# -- one dispersion per gene
+dispersion_actual = mock$actualParameters$dispersion %>% 
+          rowMeans() %>% 
+          as.data.frame() %>% 
+          dplyr::rename(., dispersion.actual = ".") %>% tibble::rownames_to_column("gene_id")
+
+dispersion_actual = data.table::data.table(dispersion_actual, key = "gene_id")
+dispersion_estimated = data.table::data.table(dispersion_estimated, key = "gene_id")
+dispersion_comparison = dispersion_actual[dispersion_estimated]
+
+```
+
+### Identity plot 
+
+```{r}
+ggplot(dispersion_comparison) + 
+  geom_point(aes(x = dispersion.actual, y = 1/dispersion.estimate), alpha = 0.5) +
+  geom_abline(intercept = 0, slope = 1) + scale_y_log10() + scale_x_log10()
+```
diff --git a/results/v3/GLM_mixte/2023_02_08_benchmarkingPackages.Rmd b/results/v3/GLM_mixte/2023_02_08_benchmarkingPackages.Rmd
new file mode 100644
index 0000000..75a7a66
--- /dev/null
+++ b/results/v3/GLM_mixte/2023_02_08_benchmarkingPackages.Rmd
@@ -0,0 +1,159 @@
+---
+title: "Random intercept"
+date: "2023-02-08"
+output: 
+  html_document:
+    toc: true 
+css: ../css/air.css
+---
+
+## Required
+
+
+```{r setup, message=FALSE, warning=FALSE, results="hide"}
+library(HTRsim)
+library(HTRfit)
+#library(plotROC)
+library(Rmisc)
+```
+
+
+## Simulation
+
+### Parameters
+
+```{r}
+# -- SETUP
+n_G = 10
+n_genoT_list = c(20, 50, 100, 200, 400)
+n_env = 2
+FixIntercept = F
+max_n_rep = 5
+sequencing_fact = 1
+n_clus = 3
+thr = 2
+number_ofRepetition = 4
+###########
+```
+
+### get time to fit 
+
+```{r message=FALSE, warning=FALSE, include=TRUE, results="hide", eval=FALSE}
+#       ----------------        #
+#      /!\ Very very long
+# Not exectuted in the notebook
+#   -------------------------   #
+
+#rm(dtf2evaluation)
+for (n_genoT in n_genoT_list){
+  for (i in 1:number_ofRepetition){
+    # -- traceback
+    print(paste("Number of genotype:", n_genoT, sep = " "))
+    print(paste("Iteration:", i, sep = " "))
+    
+    #     -- Simulate counts --     #
+    mock_random = rnaMock(n_genes = n_G, 
+                          n_genotypes = n_genoT, 
+                          n_environments = n_env, 
+                          max_n_replicates = max_n_rep, 
+                          sequencing_factor = sequencing_fact, 
+                          fixIntercept = FixIntercept, 
+                          n_clusters = n_clus)
+    
+    # -- count table & experimental design
+    count_table_random = mock_random$countTable %>% as.data.frame()
+    bioDesign_random = mock_random$design
+    #      -- ground truth --       #
+    beta.actual_random = mock_random$actualParam$beta
+    
+    #     --- Model fitting ---     #
+    # -- lme4::glm.nb
+    start_time <- Sys.time()
+    dtf2fit_random = HTRfit::reshapeCounTable(count_table_random,
+                                              bioDesign_random)
+    l_random.lme4 = HTRfit::launch.glm_mixte(dtf2fit_random, 
+                                             package = "lme4")
+    fitDtf_random.lme4 =listFit2dtf(l_random.lme4)
+    end_time <- Sys.time()
+    fitDtf_random.lme4$inference$timeProcess = difftime(end_time, 
+                            start_time, units = "secs") %>% as.numeric()
+    fitDtf_random.lme4$inference$from = 'lme4' 
+    fitDtf_random.lme4$inference$n_genotypes = n_genoT
+    # -- glmmTMB::glmmTMB
+    start_time <- Sys.time()
+    l_random.tmb = HTRfit::launch.glm_mixte(dtf2fit_random, package = "glmmTMB")
+    fitDtf_random.tmb =listFit2dtf(l_random.tmb)
+    end_time <- Sys.time()
+    fitDtf_random.tmb$inference$timeProcess = difftime(end_time, 
+                            start_time, units = "secs") %>% as.numeric()
+    fitDtf_random.tmb$inference$from = 'glmmTMB'
+    fitDtf_random.tmb$inference$n_genotypes = n_genoT
+    #       -------------------------        #
+    
+    #     -- join LME4 & glmmTMB res --      #
+    fitDtf.inference = rbind(fitDtf_random.tmb$inference, 
+                             fitDtf_random.lme4$inference)
+    
+    expectation = getExpectation(beta.actual_random,
+                                        toEval = "glm_mixte" , threshold = thr)
+    prediction = getPrediction(fitDtf.inference, threshold = thr, alphaRisk = 0.05)
+    
+      #      -- join actual & inference --       #
+    actual2join.dtf <- data.table::data.table(expectation, 
+                                            key = c("gene_id", "term"))
+    inference2join.dtf <- data.table::data.table(prediction, 
+                                               key = c("gene_id", "term"))
+    tmp <- actual2join.dtf[inference2join.dtf]
+    
+    #       ---- Saving results -----          #
+    if (exists('dtf2evaluation')){
+        dtf2evaluation = rbind(dtf2evaluation, tmp)
+    }
+    else{
+        dtf2evaluation = tmp
+    }
+  }
+}
+
+
+```
+
+## Evaluation
+
+### Preparing dataframe
+
+```{r}
+
+#tgc <- Rmisc::summarySE(dtf2evaluation, 
+#                 measurevar="timeProcess", groupvars=c("n_genotypes","from"))
+# -- backup
+#write_tsv(tgc, file = "2023_01_17-tgc_backup.tsv")
+
+tgc <- read_tsv('2023_01_17-tgc_backup.tsv', show_col_types = FALSE)
+```
+
+### Build graph for evaluation 
+
+```{r message=FALSE, warning=FALSE, include=TRUE, results="hide"}
+#tgc$n_genotypes <- factor(tgc$n_genotypes)
+#tgc$n_genotypes = as.numeric(as.character(tgc$n_genotypes))
+p= ggplot(tgc, aes(x = n_genotypes, y = timeProcess, colour = from)) + 
+  geom_line(aes(x = n_genotypes, y= timeProcess+sd), 
+            linetype = "dashed") +
+  geom_line(aes(x = n_genotypes, y= timeProcess-sd), 
+            linetype = "dashed") +
+  geom_ribbon(aes(ymin = timeProcess-sd , 
+                  ymax= timeProcess+sd, fill = from), alpha=0.2) + 
+  geom_line(aes(x = n_genotypes, y= timeProcess), 
+            linetype = "solid") +
+  geom_point() +  
+  scale_y_log10() +
+  scale_color_manual(values = c('#581845', '#FFC30F')) + 
+  scale_fill_manual(values = c('#581845', '#FFC30F'))
+```
+
+### Visualization
+
+```{r, message=FALSE, warning=FALSE}
+p
+```
diff --git a/results/v3/GLM_mixte/2023_02_08_randomIntercept.Rmd b/results/v3/GLM_mixte/2023_02_08_randomIntercept.Rmd
new file mode 100644
index 0000000..9241c4d
--- /dev/null
+++ b/results/v3/GLM_mixte/2023_02_08_randomIntercept.Rmd
@@ -0,0 +1,132 @@
+---
+title: "Random intercept"
+date: "2023-02-08"
+output: 
+  html_document:
+    toc: true 
+css: ../css/air.css
+---
+
+## Required
+
+
+```{r setup, message=FALSE, warning=FALSE, results="hide"}
+library(HTRsim)
+library(HTRfit)
+library(plotROC)
+```
+
+
+## Simulation
+
+### Parameters
+
+```{r}
+# -- SETUP
+n_G = 10
+n_genoT = 200
+n_env = 2
+FixIntercept = F
+max_n_rep = 5
+sequencing_fact = 1
+n_clus = 3
+thr = 2
+###########
+```
+
+### Random intercept 
+
+```{r, message=FALSE, warning=FALSE, include=TRUE, results="hide"}
+# -- Simulate counts
+mock_random = rnaMock(n_genes = n_G, n_genotypes = n_genoT, n_environments = n_env, 
+    max_n_replicates = max_n_rep, sequencing_factor = sequencing_fact, fixIntercept = FixIntercept, n_clusters = n_clus)
+
+# -- count table & experimental design
+count_table_random = mock_random$countTable %>% as.data.frame()
+bioDesign_random = mock_random$design
+
+# -- ground truth
+beta.actual_random = mock_random$actualParam$beta
+
+```
+
+## Estimation
+
+### Launch glmmTMB::glmmTMB on simulated data (random intercept)
+
+```{r, message=FALSE, warning=FALSE, include=TRUE, results="hide"}
+# -- glmmTMB::glmmTMB
+dtf2fit_random = HTRfit::reshapeCounTable(count_table_random, bioDesign_random)
+l_random.tmb = HTRfit::launch.glm_mixte(dtf2fit_random, package = "glmmTMB")
+fitDtf_random.tmb =listFit2dtf(l_random.tmb)
+```
+
+### Launch lme4::glmer.nb on simulated data (random intercept)
+
+```{r, message=FALSE, warning=FALSE, include=TRUE, results="hide"}
+# -- lme4::glm.nb
+l_random.lme4 = HTRfit::launch.glm_mixte(dtf2fit_random, package = "lme4")
+fitDtf_random.lme4 =listFit2dtf(l_random.lme4)
+```
+
+### Join Actual & Inference
+
+```{r}
+###### -- LME4 -- #######
+expectation_random = getExpectation(beta.actual_random,toEval = "glm_mixte" , threshold = thr)
+prediction_random = getPrediction(fitDtf_random.lme4$inference, threshold = thr, alphaRisk = 0.05)
+# -- join actual & inference
+actual2join.dtf <- data.table::data.table(expectation_random, key = c("gene_id", "term"))
+inference2join.dtf <- data.table::data.table(prediction_random, key = c("gene_id", "term"))
+comparison.lme4 <- actual2join.dtf[inference2join.dtf]
+
+
+###### -- GLMTMB -- #######
+expectation_random = getExpectation(beta.actual_random,toEval = "glm_mixte" , threshold = thr)
+prediction_random = getPrediction(fitDtf_random.tmb$inference, threshold = thr, alphaRisk = 0.05)
+# -- join actual & inference
+actual2join.dtf <- data.table::data.table(expectation_random, key = c("gene_id", "term"))
+inference2join.dtf <- data.table::data.table(prediction_random, key = c("gene_id", "term"))
+comparison.tmb <- actual2join.dtf[inference2join.dtf]
+
+```
+
+### Join lme4 & glmmTMB
+
+```{r}
+###### -- Annotations
+comparison.tmb = comparison.tmb %>% dplyr::mutate(from = "glmmTMB")
+comparison.lme4 = comparison.lme4 %>% dplyr::mutate(from = "lme4")
+# -- join
+comparison.dtf = rbind(comparison.lme4, comparison.tmb)
+```
+## Evaluation
+
+### Build graph for evaluation 
+
+```{r message=FALSE, warning=FALSE, include=TRUE, results="hide"}
+
+# -- preparing dtf
+comparison.dtf$term = factor(comparison.dtf$term, levels = c("(Intercept)", "environmentE1","cor__(Intercept).environmentE1", "sd__(Intercept)", "sd__environmentE1"))
+comparison.dtf = comparison.dtf %>% 
+  mutate(actual.value = 
+           if_else(str_detect(term, "cor_"), 
+                   actual.value, actual.value*log(2) ))
+
+# -- Identity plot
+p = ggplot(comparison.dtf)  + 
+  geom_point(aes(x = actual.value, 
+                 y = estimate, col = from), alpha = 0.6, size = 3) +
+  geom_abline(intercept = 0, slope = 1) + 
+  facet_wrap(~term, scales = "free")  + 
+  theme(strip.text.x = element_text(size = 7)) +
+  scale_color_manual(values = c('#581845', '#FFC30F'))
+
+#ggsave("../img/graph/poc_glmm2_1000.png", p,) 
+```
+
+### Visualization
+
+```{r, message=FALSE, warning=FALSE}
+p
+```
-- 
GitLab