From 4e82e7867670fe02157bbf24f03c0bf58f8d3686 Mon Sep 17 00:00:00 2001
From: aduvermy <arnaud.duvermy@ens-lyon.fr>
Date: Thu, 11 Jan 2024 16:35:41 +0100
Subject: [PATCH] update vignette

---
 dev/flat_full.Rmd    | 34 ++++++++++++++++++++--------------
 vignettes/htrfit.Rmd | 10 +++++-----
 2 files changed, 25 insertions(+), 19 deletions(-)

diff --git a/dev/flat_full.Rmd b/dev/flat_full.Rmd
index 7ad9b3f..7d6d48d 100644
--- a/dev/flat_full.Rmd
+++ b/dev/flat_full.Rmd
@@ -8798,6 +8798,7 @@ is_formula_mixedTypeI <- function(formula) {
   if (length(all.vars(formula)) != 3) return(FALSE)
   if (sum(all.names(formula) == "+") > 1) return(FALSE)
   if (sum(all.names(formula) == "/") > 0) return(FALSE)
+  if (length(all_var_in_formula) == 4 && all_var_in_formula[2] != all_var_in_formula[3]) return(FALSE)
   return(TRUE)
 }
 
@@ -9055,11 +9056,14 @@ test_that("Test is_formula_mixedTypeI", {
   formula2 <- y ~ z + group1 + (1 | group1)
   formula3 <- y ~ z + (1 | group1 + group2)
   formula4 <- y ~ z + (1 | group1/z)
-
+  formula5 <- y ~ z + ( group | z ) ## z is the fixed effect, so it is expected on the left of `|` inside the parentheses
+  
   expect_true(is_formula_mixedTypeI(formula1))
   expect_false(is_formula_mixedTypeI(formula2))
   expect_false(is_formula_mixedTypeI(formula3))
   expect_false(is_formula_mixedTypeI(formula4))
+  expect_false(is_formula_mixedTypeI(formula5))
+
 
 })
 
@@ -9388,7 +9392,7 @@ mock_data <- mock_rnaseq(list_var, N_GENES,
 
 ## Set dispersion of gene expression
 
-The dispersion parameter ($\alpha_i$), characterizes the relationship between the variance of the observed read counts and its mean value. In simple terms, it quantifies how much we expect observed counts to deviate from the mean value. You can specify the dispersion for individual genes using the dispersion parameter.
+The dispersion parameter ($dispersion_i$) characterizes the relationship between the variance of the observed read counts and their mean value. In simple terms, it quantifies how much we expect observed counts to deviate from the mean value. You can specify the dispersion for individual genes using the dispersion parameter.
 
 
 ```{r example-mock_rnaseq_disp, warning = FALSE, message = FALSE}
@@ -9436,7 +9440,7 @@ mock_data <- mock_rnaseq(list_var, N_GENES,
 ## Mock RNAseq object
 
 ```{r example-str_obj_mock, warning=FALSE, message=FALSE}
-str(mock_data)
+str(mock_data, max.level = 1)
 ```
 
 
@@ -9447,11 +9451,11 @@ str(mock_data)
 </div> 
 
 
-In this modeling framework, counts denoted as $K_{ij}$ for gene i and sample j are generated using a negative binomial distribution. The negative binomial distribution considers a fitted mean $\mu_{ij}$ and a gene-specific dispersion parameter $\alpha_i$.
+In this modeling framework, counts denoted as $K_{ij}$ for gene $i$ and sample $j$ are generated using a negative binomial distribution. The negative binomial distribution is parameterized by a fitted mean $\mu_{ij}$ and a gene-specific dispersion parameter $dispersion_i$.
 
 The fitted mean $\mu_{ij}$ is determined by a parameter, $q_{ij}$, which is proportionally related to the sum of all effects specified using `init_variable()` or `add_interaction()`. If basal gene expressions are provided, the $\mu_{ij}$ values are scaled accordingly using the gene-specific basal expression value ($bexpr_i$).
 
-Furthermore, the coefficients $\beta_i$ represent the natural logarithm fold changes for gene i across each column of the model matrix X. The dispersion parameter $\alpha_i$ plays a crucial role in defining the relationship between the variance of observed counts and their mean value. In simpler terms, it quantifies how far we expect observed counts to deviate from the mean value.
+Furthermore, the coefficients $\beta_i$ represent the natural-logarithm fold changes for gene $i$ across each column of the model matrix $X$. The dispersion parameter $dispersion_i$ plays a crucial role in defining the relationship between the variance of observed counts and their mean value. In simpler terms, it quantifies, for each gene, how far we expect observed counts to deviate from the mean value.
 
 
 # Fitting models
@@ -9485,7 +9489,8 @@ metaData <- mock_data$metadata
 ## -- convert counts matrix and samples metadatas in a data frame for fitting
 data2fit = prepareData2fit(countMatrix = count_matrix, 
                            metadata =  metaData, 
-                           normalization = F)
+                           normalization = FALSE,
+                           response_name = "kij")
 
 
 ## -- median ratio normalization
@@ -9656,7 +9661,7 @@ The dispersion plot, generated by the `evaluation_report()` function, offers a v
 The Receiver Operating Characteristic (ROC) curve is a valuable tool for assessing the performance of classification models, particularly in the context of identifying differentially expressed genes. It provides a graphical representation of the model's ability to distinguish between genes that are differentially expressed and those that are not, by varying the `coeff_threshold` and the `alt_hypothesis` parameters.
 
 
-```{r example-outputROC, warning = FALSE, message = FALSE, fig.align='center', fig.height=4, fig.width=9}
+```{r example-outputROC, warning = FALSE, message = FALSE, fig.align='center', fig.height=4, fig.width=7}
 ## -- ROC curve
 resSimu$roc$params
 ```
@@ -9668,7 +9673,7 @@ resSimu$roc$params
 The precision-recall curve (PR curve) illustrates the relationship between precision and recall at various classification thresholds. It is particularly valuable in the context of imbalanced data, where one class is significantly more prevalent than the other. Unlike the ROC curve, which can be influenced by class distribution, the PR curve focuses on the model's ability to correctly identify examples of the minority class while maintaining high precision.
 
 
-```{r example-outputPR, warning = FALSE, message = FALSE, fig.align='center', fig.height=4, fig.width=9}
+```{r example-outputPR, warning = FALSE, message = FALSE, fig.align='center', fig.height=4, fig.width=7}
 ## -- precision-recall curve
 resSimu$precision_recall$params
 ```
@@ -9681,7 +9686,7 @@ The area under the ROC curve (AUC) provides a single metric that summarizes the
 In addition to evaluating model performance through the AUC for both ROC and PR curves, we provide access to key classification metrics, including Accuracy, Precision, Recall (or Sensitivity), and Specificity. These metrics offer a comprehensive view of the model's classification capabilities. These metrics are computed for each parameter (excluding the intercept when skip_eval_intercept = TRUE), providing detailed insights into individual parameter contributions. Furthermore, an aggregated assessment is performed, considering all parameters (except the intercept by default), offering a perspective on the model's overall classification effectiveness.
 
 
-```{r example-outputPerf, warning = FALSE, message = FALSE, fig.align='center', fig.height=4, fig.width=9}
+```{r example-outputPerf, warning = FALSE, message = FALSE, fig.align='center', fig.height=4, fig.width=7}
 ## -- precision-recall curve
 resSimu$performances
 ```
@@ -9719,7 +9724,7 @@ resSimu$identity$params
 resSimu$identity$dispersion
 ```
 
-```{r example-outputResSimu_metric, warning = FALSE, message = FALSE, fig.align='center', fig.height=4, fig.width=9}
+```{r example-outputResSimu_metric, warning = FALSE, message = FALSE, fig.align='center', fig.height=4, fig.width=7}
 ## -- precision-recall curve
 resSimu$precision_recall$params
 ## -- ROC curve
@@ -9757,7 +9762,7 @@ resSimu$identity$params
 resSimu$identity$dispersion
 ```
 
-```{r example-subsetGenes_metrics, warning=FALSE, message=FALSE, fig.align='center', fig.height=4, fig.width=9}
+```{r example-subsetGenes_metrics, warning=FALSE, message=FALSE, fig.align='center', fig.height=4, fig.width=7}
 ## -- precision-recall curve
 resSimu$precision_recall$params
 ## -- ROC curve
@@ -9813,7 +9818,8 @@ mock_data <- mock_rnaseq(input_var_list,
 ## -- prepare data & fit a model with mixed effect
 data2fit = prepareData2fit(countMatrix = mock_data$counts, 
                            metadata =  mock_data$metadata, 
-                           normalization = F)
+                           normalization = FALSE,
+                           response_name = "kij")
 l_tmb <- fitModelParallel(formula = kij ~ varB + (varB | varA),
                           data = data2fit, 
                           group_by = "geneID",
@@ -9835,7 +9841,7 @@ resSimu$identity$params
 resSimu$identity$dispersion
 ```
 
-```{r example-outputResSimuMixed_metric, warning = FALSE, message = FALSE, fig.align='center', fig.height=4, fig.width=9}
+```{r example-outputResSimuMixed_metric, warning = FALSE, message = FALSE, fig.align='center', fig.height=4, fig.width=7}
 ## -- precision-recall curve
 resSimu$precision_recall$params
 ## -- ROC curve
@@ -9844,7 +9850,7 @@ resSimu$roc$params
 resSimu$performances
 ```
 
-## Strure of evaluation report object
+## Structure of evaluation report object
 
 
 ```{r example-str_eval_report, warning=FALSE, message=FALSE}
diff --git a/vignettes/htrfit.Rmd b/vignettes/htrfit.Rmd
index d7d1d68..ca71e15 100644
--- a/vignettes/htrfit.Rmd
+++ b/vignettes/htrfit.Rmd
@@ -165,7 +165,7 @@ mock_data <- mock_rnaseq(list_var, N_GENES,
 
 ## Set dispersion of gene expression
 
-The dispersion parameter ($\alpha_i$), characterizes the relationship between the variance of the observed read counts and its mean value. In simple terms, it quantifies how much we expect observed counts to deviate from the mean value. You can specify the dispersion for individual genes using the dispersion parameter.
+The dispersion parameter ($dispersion_i$) characterizes the relationship between the variance of the observed read counts and their mean value. In simple terms, it quantifies how much we expect observed counts to deviate from the mean value. You can specify the dispersion for individual genes using the dispersion parameter.
 
 
 
@@ -212,7 +212,7 @@ mock_data <- mock_rnaseq(list_var, N_GENES,
 ## Mock RNAseq object
 
 ```{r example-str_obj_mock, warning = FALSE, message = FALSE}
-str(mock_data)
+str(mock_data, max.level = 1)
 ```
 
 # Theory behind HTRfit 
@@ -222,11 +222,11 @@ str(mock_data)
 </div> 
 
 
-In this modeling framework, counts denoted as $K_{ij}$ for gene i and sample j are generated using a negative binomial distribution. The negative binomial distribution considers a fitted mean $\mu_{ij}$ and a gene-specific dispersion parameter $\alpha_i$.
+In this modeling framework, counts denoted as $K_{ij}$ for gene $i$ and sample $j$ are generated using a negative binomial distribution. The negative binomial distribution is parameterized by a fitted mean $\mu_{ij}$ and a gene-specific dispersion parameter $dispersion_i$.
 
 The fitted mean $\mu_{ij}$ is determined by a parameter, $q_{ij}$, which is proportionally related to the sum of all effects specified using `init_variable()` or `add_interaction()`. If basal gene expressions are provided, the $\mu_{ij}$ values are scaled accordingly using the gene-specific basal expression value ($bexpr_i$).
 
-Furthermore, the coefficients $\beta_i$ represent the natural logarithm fold changes for gene i across each column of the model matrix X. The dispersion parameter $\alpha_i$ plays a crucial role in defining the relationship between the variance of observed counts and their mean value. In simpler terms, it quantifies how far we expect observed counts to deviate from the mean value.
+Furthermore, the coefficients $\beta_i$ represent the natural-logarithm fold changes for gene $i$ across each column of the model matrix $X$. The dispersion parameter $dispersion_i$ plays a crucial role in defining the relationship between the variance of observed counts and their mean value. In simpler terms, it quantifies how far we expect observed counts to deviate from the mean value.
 
 
 
@@ -623,7 +623,7 @@ resSimu$roc$params
 resSimu$performances
 ```
 
-## Strure of evaluation report object
+## Structure of evaluation report object
 
 ```{r example-str_eval_report, warning = FALSE, message = FALSE}
 str(resSimu, max.level = 1)
-- 
GitLab