From 56f398bdcf1509aefe9979427faecc31325ba555 Mon Sep 17 00:00:00 2001
From: aduvermy <arnaud.duvermy@ens-lyon.fr>
Date: Tue, 26 Mar 2024 10:44:24 +0100
Subject: [PATCH] add fix_reference_effect parameter and utils

Former-commit-id: 9d73431e71bc99d4ad7e6ad1ce8d0877db320447
Former-commit-id: 52edbac846370096cf86ff60c723a1937956e284
Former-commit-id: 9502ca085592283dd8aa5d2b3ed2267fb97561bb
---
 NAMESPACE                        |   2 +
 R/mock_rnaseq.R                  |   7 +-
 R/simulation.R                   |  71 ++++++++++++++--
 dev/flat_full.Rmd                | 135 +++++++++++++++++++++++++++++--
 man/getDataFromRnorm.Rd          |   6 +-
 man/getInput2simulation.Rd       |   5 ++
 man/getRefLevel.Rd               |  18 +++++
 man/mock_rnaseq.Rd               |   7 +-
 man/replaceReferenceEffectBy0.Rd |  21 +++++
 tests/testthat/test-simulation.R |  57 +++++++++++++
 10 files changed, 313 insertions(+), 16 deletions(-)
 create mode 100644 man/getRefLevel.Rd
 create mode 100644 man/replaceReferenceEffectBy0.Rd

diff --git a/NAMESPACE b/NAMESPACE
index fb7c5ff..bd18a90 100644
--- a/NAMESPACE
+++ b/NAMESPACE
@@ -90,6 +90,7 @@ export(getLog_qij)
 export(getMu_ij)
 export(getMu_ij_matrix)
 export(getNumberOfCombinationsInInteraction)
+export(getRefLevel)
 export(getReplicationMatrix)
 export(getSE_df)
 export(getSampleID)
@@ -158,6 +159,7 @@ export(removeDigitsAtEnd)
 export(removeDuplicatedWord)
 export(renameColumns)
 export(reorderColumns)
+export(replaceReferenceEffectBy0)
 export(replicateByGroup)
 export(replicateMatrix)
 export(replicateRows)
diff --git a/R/mock_rnaseq.R b/R/mock_rnaseq.R
index ce43744..e880f9b 100644
--- a/R/mock_rnaseq.R
+++ b/R/mock_rnaseq.R
@@ -105,6 +105,9 @@ warning_too_low_mu_ij_row <- function(mu_ij_matrix, threshold = 1 ){
 #' @param normal_distr Specifies the distribution type for generating effects. Choose between 'univariate' or 'multivariate' (default).
 #' - 'univariate': Effects are drawn independently from univariate normal distributions. 
 #' - 'multivariate': Effects are drawn jointly from a multivariate normal distribution. 
+#' @param fix_reference_effect A logical value indicating whether the effect of the reference label should be fixed to zero. If set to TRUE, the effect of the 
+#' reference label is constrained to zero, ensuring that it does not contribute to the model. If set to FALSE, the effect from the reference label is picked 
+#' randomly from the distribution specified by the user. This option works only when `normal_distr` is set to 'univariate'. Default is FALSE.
 #' @return List containing the ground truth, counts, and metadata
 #' @export
 #' @examples
@@ -113,10 +116,10 @@ warning_too_low_mu_ij_row <- function(mu_ij_matrix, threshold = 1 ){
 #'               max_replicates = 4)
 mock_rnaseq <- function(list_var, n_genes, min_replicates, max_replicates, sequencing_depth = NULL,  
                         basal_expression = 0 , dispersion = stats::runif(n_genes, min = 0, max = 1000), 
-                        normal_distr = "multivariate") {
+                        normal_distr = "multivariate", fix_reference_effect = FALSE) {
   
   ## -- get my effect
-  df_inputSimulation <- getInput2simulation(list_var, n_genes, normal_distr)
+  df_inputSimulation <- getInput2simulation(list_var, n_genes, normal_distr, fix_reference_effect )
   ## -- add column logQij
   df_inputSimulation <- getLog_qij(df_inputSimulation)
   df_inputSimulation <- addBasalExpression(df_inputSimulation, n_genes, basal_expression)
diff --git a/R/simulation.R b/R/simulation.R
index d507012..0fb1d3e 100644
--- a/R/simulation.R
+++ b/R/simulation.R
@@ -10,6 +10,9 @@
 #' @param normal_distr Specifies the distribution type for generating effects. Choose between 'univariate' or 'multivariate' (default).
 #' - 'univariate': Effects are drawn independently from univariate normal distributions. 
 #' - 'multivariate': Effects are drawn jointly from a multivariate normal distribution. 
+#' @param fix_reference_effect A logical value indicating whether the effect of the reference label should be fixed to zero. If set to TRUE, the effect of the 
+#' reference label is constrained to zero, ensuring that it does not contribute to the model. If set to FALSE, the effect from the reference label is picked 
+#' randomly from the distribution specified by the user. This option works only when `normal_distr` is set to 'univariate'. Default is FALSE.
 #' @param input2mvrnorm Input to the \code{mvrnorm} function for simulating data from multivariate normal distribution (default: NULL)
 #' @return A data frame with input coefficients for simulation
 #' @export
@@ -17,7 +20,7 @@
 #' # Example usage
 #' list_var <- init_variable()
 #' getInput2simulation(list_var, n_genes = 10)
-getInput2simulation <- function(list_var, n_genes = 1, normal_distr = "multivariate", input2mvrnorm = NULL) {
+getInput2simulation <- function(list_var, n_genes = 1, normal_distr = "multivariate", fix_reference_effect = FALSE,  input2mvrnorm = NULL) {
   
   stopifnot( normal_distr %in% c("multivariate", "univariate") )
 
@@ -26,7 +29,7 @@ getInput2simulation <- function(list_var, n_genes = 1, normal_distr = "multivari
       l_dataFrom_normdistr <- getDataFromMvrnorm(list_var, input2mvrnorm, n_genes)
     } 
   if (normal_distr == "univariate"){
-      l_dataFrom_normdistr <- getDataFromRnorm(list_var, n_genes)
+      l_dataFrom_normdistr <- getDataFromRnorm(list_var, n_genes, fix_reference_effect)
     }
   
   l_dataFromUser = getDataFromUser(list_var)
@@ -38,24 +41,82 @@ getInput2simulation <- function(list_var, n_genes = 1, normal_distr = "multivari
 
 
 
+#' Get the reference level for categorical variables in the data
+#'
+#' This function extracts the reference level for each categorical variable in the data.
+#' The reference level is the first level encountered for each categorical variable.
+#'
+#' @param data The data frame containing the categorical variables.
+#' @return A list containing the reference level for each categorical variable.
+#' @export
+getRefLevel <- function(data){
+  col_names <- colnames(data)
+  categorical_vars <- col_names[grepl(col_names, pattern = "label_")]
+  if (length(categorical_vars) == 1){
+    l_labels <- list()
+    l_labels[[categorical_vars]] <- levels(data[, categorical_vars])
+    
+  } else l_labels <- lapply(data[, categorical_vars], levels)
+  l_labels_ref <- sapply(l_labels, function(vec) vec[1])
+  return(l_labels_ref)
+}
+
+#' Replace the reference effect by 0 in the data
+#'
+#' This function replaces the effect corresponding to the reference level with 0 in the data.
+#'
+#' @param list_var The list of variables containing the effects to modify.
+#' @param l_labels_ref A list containing the reference level for each categorical variable.
+#' @param data The data frame containing the effects to modify.
+#' @return The modified data frame with reference effects replaced by 0.
+#' @export
+replaceReferenceEffectBy0 <- function(list_var, l_labels_ref , data){
+  varInitialized <- c(getListVar(list_var), getListVar(list_var$interactions))
+  varInitialized <- varInitialized[varInitialized != 'interactions']
+  df_effects_with0 <- sapply(varInitialized, function(var){
+    categorical_var <- paste("label", unlist(strsplit(var, ":")), sep = "_")
+    if (length(categorical_var) == 1) 
+      idx_0 <- data[[categorical_var]] == l_labels_ref[[categorical_var]]
+    else {
+      bool_matrix <- sapply(categorical_var, function(uniq_cat_var) data[uniq_cat_var] ==  l_labels_ref[uniq_cat_var])
+      idx_0 <- rowSums(bool_matrix) == length(categorical_var)  
+    }
+    return(replace(data[[var]], idx_0, 0))  
+  } 
+  )
+  col_names <- colnames(data)
+  categorical_vars <- col_names[grepl(col_names, pattern = "label_")]
+  data <- cbind(data[c("geneID", categorical_vars)], df_effects_with0) 
+  return(data)
+} 
+
+
+
 #' Prepare data using effects from a normal distribution
 #'
 #' Prepares the data by generating effects from a normal distribution for each gene.
 #'
 #' @param list_var A list of variables (already initialized)
 #' @param n_genes Number of genes to generate data for.
+#' @param fix_reference_effect A logical value indicating whether the effect of the reference label should be fixed to zero. If set to TRUE, the effect of the 
+#' reference label is constrained to zero, ensuring that it does not contribute to the model. If set to FALSE, the effect from the reference label is picked 
+#' randomly from the distribution specified by the user. This option works only when `normal_distr` is set to 'univariate'. Default is FALSE.
 #' @return A dataframe containing gene metadata and effects generated from a normal distribution.
 #' @export
-getDataFromRnorm <- function(list_var, n_genes){
+getDataFromRnorm <- function(list_var, n_genes, fix_reference_effect = FALSE){
     ## -- check if all data have been provided by user
     if (is.null(getInput2mvrnorm(list_var)$covMatrix))
         return(list())
-
     metadata <- getGeneMetadata(list_var , n_genes)
     df_effects <- get_effects_from_rnorm(list_var, metadata)
     data <- cbind(metadata, df_effects)  
+    
+    if (isTRUE(fix_reference_effect)){
+      l_labels_ref <- getRefLevel(data)
+      data <- replaceReferenceEffectBy0(input_var_list, l_labels_ref, data)
+    }
+    
     return(list(data))
-   
 }
 
 #' Generate effects from a normal distribution
diff --git a/dev/flat_full.Rmd b/dev/flat_full.Rmd
index 70b35a8..f9e1c8e 100644
--- a/dev/flat_full.Rmd
+++ b/dev/flat_full.Rmd
@@ -1527,6 +1527,9 @@ test_that("set_correlation sets the correlation between variables correctly", {
 #' @param normal_distr Specifies the distribution type for generating effects. Choose between 'univariate' or 'multivariate' (default).
 #' - 'univariate': Effects are drawn independently from univariate normal distributions. 
 #' - 'multivariate': Effects are drawn jointly from a multivariate normal distribution. 
+#' @param fix_reference_effect A logical value indicating whether the effect of the reference label should be fixed to zero. If set to TRUE, the effect of the 
+#' reference label is constrained to zero, ensuring that it does not contribute to the model. If set to FALSE, the effect from the reference label is picked 
+#' randomly from the distribution specified by the user. This option works only when `normal_distr` is set to 'univariate'. Default is FALSE.
 #' @param input2mvrnorm Input to the \code{mvrnorm} function for simulating data from multivariate normal distribution (default: NULL)
 #' @return A data frame with input coefficients for simulation
 #' @export
@@ -1534,7 +1537,7 @@ test_that("set_correlation sets the correlation between variables correctly", {
 #' # Example usage
 #' list_var <- init_variable()
 #' getInput2simulation(list_var, n_genes = 10)
-getInput2simulation <- function(list_var, n_genes = 1, normal_distr = "multivariate", input2mvrnorm = NULL) {
+getInput2simulation <- function(list_var, n_genes = 1, normal_distr = "multivariate", fix_reference_effect = FALSE,  input2mvrnorm = NULL) {
   
   stopifnot( normal_distr %in% c("multivariate", "univariate") )
 
@@ -1543,7 +1546,7 @@ getInput2simulation <- function(list_var, n_genes = 1, normal_distr = "multivari
       l_dataFrom_normdistr <- getDataFromMvrnorm(list_var, input2mvrnorm, n_genes)
     } 
   if (normal_distr == "univariate"){
-      l_dataFrom_normdistr <- getDataFromRnorm(list_var, n_genes)
+      l_dataFrom_normdistr <- getDataFromRnorm(list_var, n_genes, fix_reference_effect)
     }
   
   l_dataFromUser = getDataFromUser(list_var)
@@ -1555,24 +1558,82 @@ getInput2simulation <- function(list_var, n_genes = 1, normal_distr = "multivari
 
 
 
+#' Get the reference level for categorical variables in the data
+#'
+#' This function extracts the reference level for each categorical variable in the data.
+#' The reference level is the first level encountered for each categorical variable.
+#'
+#' @param data The data frame containing the categorical variables.
+#' @return A list containing the reference level for each categorical variable.
+#' @export
+getRefLevel <- function(data){
+  col_names <- colnames(data)
+  categorical_vars <- col_names[grepl(col_names, pattern = "label_")]
+  if (length(categorical_vars) == 1){
+    l_labels <- list()
+    l_labels[[categorical_vars]] <- levels(data[, categorical_vars])
+    
+  } else l_labels <- lapply(data[, categorical_vars], levels)
+  l_labels_ref <- sapply(l_labels, function(vec) vec[1])
+  return(l_labels_ref)
+}
+
+#' Replace the reference effect by 0 in the data
+#'
+#' This function replaces the effect corresponding to the reference level with 0 in the data.
+#'
+#' @param list_var The list of variables containing the effects to modify.
+#' @param l_labels_ref A list containing the reference level for each categorical variable.
+#' @param data The data frame containing the effects to modify.
+#' @return The modified data frame with reference effects replaced by 0.
+#' @export
+replaceReferenceEffectBy0 <- function(list_var, l_labels_ref , data){
+  varInitialized <- c(getListVar(list_var), getListVar(list_var$interactions))
+  varInitialized <- varInitialized[varInitialized != 'interactions']
+  df_effects_with0 <- sapply(varInitialized, function(var){
+    categorical_var <- paste("label", unlist(strsplit(var, ":")), sep = "_")
+    if (length(categorical_var) == 1) 
+      idx_0 <- data[[categorical_var]] == l_labels_ref[[categorical_var]]
+    else {
+      bool_matrix <- sapply(categorical_var, function(uniq_cat_var) data[uniq_cat_var] ==  l_labels_ref[uniq_cat_var])
+      idx_0 <- rowSums(bool_matrix) == length(categorical_var)  
+    }
+    return(replace(data[[var]], idx_0, 0))  
+  } 
+  )
+  col_names <- colnames(data)
+  categorical_vars <- col_names[grepl(col_names, pattern = "label_")]
+  data <- cbind(data[c("geneID", categorical_vars)], df_effects_with0) 
+  return(data)
+} 
+
+
+
 #' Prepare data using effects from a normal distribution
 #'
 #' Prepares the data by generating effects from a normal distribution for each gene.
 #'
 #' @param list_var A list of variables (already initialized)
 #' @param n_genes Number of genes to generate data for.
+#' @param fix_reference_effect A logical value indicating whether the effect of the reference label should be fixed to zero. If set to TRUE, the effect of the 
+#' reference label is constrained to zero, ensuring that it does not contribute to the model. If set to FALSE, the effect from the reference label is picked 
+#' randomly from the distribution specified by the user. This option works only when `normal_distr` is set to 'univariate'. Default is FALSE.
 #' @return A dataframe containing gene metadata and effects generated from a normal distribution.
 #' @export
-getDataFromRnorm <- function(list_var, n_genes){
+getDataFromRnorm <- function(list_var, n_genes, fix_reference_effect = FALSE){
     ## -- check if all data have been provided by user
     if (is.null(getInput2mvrnorm(list_var)$covMatrix))
         return(list())
-
     metadata <- getGeneMetadata(list_var , n_genes)
     df_effects <- get_effects_from_rnorm(list_var, metadata)
     data <- cbind(metadata, df_effects)  
+    
+    if (isTRUE(fix_reference_effect)){
+      l_labels_ref <- getRefLevel(data)
+      data <- replaceReferenceEffectBy0(input_var_list, l_labels_ref, data)
+    }
+    
     return(list(data))
-   
 }
 
 #' Generate effects from a normal distribution
@@ -2186,6 +2247,63 @@ test_that("All negative values", {
 
 
 
+
+# Test for getRefLevel function
+test_that("getRefLevel returns correct reference levels", {
+  # Create a sample data frame
+  data <- data.frame(
+    label_genotype = factor(c("A", "B", "A", "B")),
+    label_environment = factor(c("X", "Y", "X", "Y"))
+  )
+  
+  # Expected reference levels
+  expected_ref_levels <- list(
+    label_genotype = "A",
+    label_environment = "X"
+  ) %>% unlist()
+  
+  # Test getRefLevel function
+  ref_levels <- getRefLevel(data)
+  
+  # Check if reference levels match the expected ones
+  expect_identical(ref_levels, expected_ref_levels)
+})
+
+# Test for replaceReferenceEffectBy0 function
+test_that("replaceReferenceEffectBy0 replaces reference effects correctly", {
+  
+  input_var_list <- init_variable( name = "genotype", mu = 0, sd = 2.18, level = 2) %>%
+                      init_variable( name = "env", mu = 0, sd = 0.57, level = 4 ) %>%
+                      init_variable(name = "T", mu = 0, sd = 0.39, level = 3) %>% 
+                      add_interaction(between_var = c("genotype", "env"), mu = 0 , sd = 1.018) %>%
+                      add_interaction(between_var = c("genotype", "env", "T"), mu = 0 , sd = 1.018)
+  
+  
+  N_GENES <- 10
+  l_dataFrom_normdistr <- getDataFromRnorm(input_var_list, N_GENES)
+  metadata <- getGeneMetadata(input_var_list , N_GENES)
+  df_effects <- get_effects_from_rnorm(input_var_list, metadata)
+  data <- cbind(metadata, df_effects) 
+  
+  l_labels_ref <- getRefLevel(data)
+  data <- replaceReferenceEffectBy0(input_var_list, l_labels_ref, data)
+  
+  # Check if modified data matches the expected data
+  expect_identical(colnames(data), c("geneID","label_genotype","label_env", "label_T", "genotype", "env" , "T", "genotype:env"  ,"genotype:env:T"))
+  expected_data <- data.frame(geneID = "gene1", label_genotype = "genotype1", label_env = "env1", label_T = "T1", genotype = 0, 
+                              env = 0, T = 0, "genotype:env" = 0, "genotype:env:T" = 0 )
+  colnames(expected_data) <- c("geneID", "label_genotype", "label_env", "label_T", "genotype", 
+                              "env", "T" , "genotype:env", "genotype:env:T" )
+  expected_data$label_genotype <- factor(expected_data$label_genotype , levels = c("genotype1", "genotype2"))
+  expected_data$label_env <- factor(expected_data$label_env , levels = c("env1", "env2", "env3", "env4"))
+  expected_data$label_T <- factor(expected_data$label_T , levels = c("T1", "T2", "T3"))
+
+  expect_identical(data[1,], expected_data )
+
+})
+
+
+
 ```
 
 
@@ -2295,6 +2413,9 @@ warning_too_low_mu_ij_row <- function(mu_ij_matrix, threshold = 1 ){
 #' @param normal_distr Specifies the distribution type for generating effects. Choose between 'univariate' or 'multivariate' (default).
 #' - 'univariate': Effects are drawn independently from univariate normal distributions. 
 #' - 'multivariate': Effects are drawn jointly from a multivariate normal distribution. 
+#' @param fix_reference_effect A logical value indicating whether the effect of the reference label should be fixed to zero. If set to TRUE, the effect of the 
+#' reference label is constrained to zero, ensuring that it does not contribute to the model. If set to FALSE, the effect from the reference label is picked 
+#' randomly from the distribution specified by the user. This option works only when `normal_distr` is set to 'univariate'. Default is FALSE.
 #' @return List containing the ground truth, counts, and metadata
 #' @export
 #' @examples
@@ -2303,10 +2424,10 @@ warning_too_low_mu_ij_row <- function(mu_ij_matrix, threshold = 1 ){
 #'               max_replicates = 4)
 mock_rnaseq <- function(list_var, n_genes, min_replicates, max_replicates, sequencing_depth = NULL,  
                         basal_expression = 0 , dispersion = stats::runif(n_genes, min = 0, max = 1000), 
-                        normal_distr = "multivariate") {
+                        normal_distr = "multivariate", fix_reference_effect = FALSE) {
   
   ## -- get my effect
-  df_inputSimulation <- getInput2simulation(list_var, n_genes, normal_distr)
+  df_inputSimulation <- getInput2simulation(list_var, n_genes, normal_distr, fix_reference_effect )
   ## -- add column logQij
   df_inputSimulation <- getLog_qij(df_inputSimulation)
   df_inputSimulation <- addBasalExpression(df_inputSimulation, n_genes, basal_expression)
diff --git a/man/getDataFromRnorm.Rd b/man/getDataFromRnorm.Rd
index a9354eb..afcdf4a 100644
--- a/man/getDataFromRnorm.Rd
+++ b/man/getDataFromRnorm.Rd
@@ -4,12 +4,16 @@
 \alias{getDataFromRnorm}
 \title{Prepare data using effects from a normal distribution}
 \usage{
-getDataFromRnorm(list_var, n_genes)
+getDataFromRnorm(list_var, n_genes, fix_reference_effect = FALSE)
 }
 \arguments{
 \item{list_var}{A list of variables (already initialized)}
 
 \item{n_genes}{Number of genes to generate data for.}
+
+\item{fix_reference_effect}{A logical value indicating whether the effect of the reference label should be fixed to zero. If set to TRUE, the effect of the
+reference label is constrained to zero, ensuring that it does not contribute to the model. If set to FALSE, the effect from the reference label is picked
+randomly from the distribution specified by the user. This option works only when \code{normal_distr} is set to 'univariate'. Default is FALSE.}
 }
 \value{
 A dataframe containing gene metadata and effects generated from a normal distribution.
diff --git a/man/getInput2simulation.Rd b/man/getInput2simulation.Rd
index f50ce20..a4cca47 100644
--- a/man/getInput2simulation.Rd
+++ b/man/getInput2simulation.Rd
@@ -8,6 +8,7 @@ getInput2simulation(
   list_var,
   n_genes = 1,
   normal_distr = "multivariate",
+  fix_reference_effect = FALSE,
   input2mvrnorm = NULL
 )
 }
@@ -22,6 +23,10 @@ getInput2simulation(
 \item 'multivariate': Effects are drawn jointly from a multivariate normal distribution.
 }}
 
+\item{fix_reference_effect}{A logical value indicating whether the effect of the reference label should be fixed to zero. If set to TRUE, the effect of the
+reference label is constrained to zero, ensuring that it does not contribute to the model. If set to FALSE, the effect from the reference label is picked
+randomly from the distribution specified by the user. This option works only when \code{normal_distr} is set to 'univariate'. Default is FALSE.}
+
 \item{input2mvrnorm}{Input to the \code{mvrnorm} function for simulating data from multivariate normal distribution (default: NULL)}
 }
 \value{
diff --git a/man/getRefLevel.Rd b/man/getRefLevel.Rd
new file mode 100644
index 0000000..6a29450
--- /dev/null
+++ b/man/getRefLevel.Rd
@@ -0,0 +1,18 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/simulation.R
+\name{getRefLevel}
+\alias{getRefLevel}
+\title{Get the reference level for categorical variables in the data}
+\usage{
+getRefLevel(data)
+}
+\arguments{
+\item{data}{The data frame containing the categorical variables.}
+}
+\value{
+A list containing the reference level for each categorical variable.
+}
+\description{
+This function extracts the reference level for each categorical variable in the data.
+The reference level is the first level encountered for each categorical variable.
+}
diff --git a/man/mock_rnaseq.Rd b/man/mock_rnaseq.Rd
index c68b463..0d86249 100644
--- a/man/mock_rnaseq.Rd
+++ b/man/mock_rnaseq.Rd
@@ -12,7 +12,8 @@ mock_rnaseq(
   sequencing_depth = NULL,
   basal_expression = 0,
   dispersion = stats::runif(n_genes, min = 0, max = 1000),
-  normal_distr = "multivariate"
+  normal_distr = "multivariate",
+  fix_reference_effect = FALSE
 )
 }
 \arguments{
@@ -35,6 +36,10 @@ mock_rnaseq(
 \item 'univariate': Effects are drawn independently from univariate normal distributions.
 \item 'multivariate': Effects are drawn jointly from a multivariate normal distribution.
 }}
+
+\item{fix_reference_effect}{A logical value indicating whether the effect of the reference label should be fixed to zero. If set to TRUE, the effect of the
+reference label is constrained to zero, ensuring that it does not contribute to the model. If set to FALSE, the effect from the reference label is picked
+randomly from the distribution specified by the user. This option works only when \code{normal_distr} is set to 'univariate'. Default is FALSE.}
 }
 \value{
 List containing the ground truth, counts, and metadata
diff --git a/man/replaceReferenceEffectBy0.Rd b/man/replaceReferenceEffectBy0.Rd
new file mode 100644
index 0000000..2a8380e
--- /dev/null
+++ b/man/replaceReferenceEffectBy0.Rd
@@ -0,0 +1,21 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/simulation.R
+\name{replaceReferenceEffectBy0}
+\alias{replaceReferenceEffectBy0}
+\title{Replace the reference effect by 0 in the data}
+\usage{
+replaceReferenceEffectBy0(list_var, l_labels_ref, data)
+}
+\arguments{
+\item{list_var}{The list of variables containing the effects to modify.}
+
+\item{l_labels_ref}{A list containing the reference level for each categorical variable.}
+
+\item{data}{The data frame containing the effects to modify.}
+}
+\value{
+The modified data frame with reference effects replaced by 0.
+}
+\description{
+This function replaces the effect corresponding to the reference level with 0 in the data.
+}
diff --git a/tests/testthat/test-simulation.R b/tests/testthat/test-simulation.R
index d43b959..28e804d 100644
--- a/tests/testthat/test-simulation.R
+++ b/tests/testthat/test-simulation.R
@@ -248,3 +248,60 @@ test_that("All negative values", {
 
 
 
+
+# Test for getRefLevel function
+test_that("getRefLevel returns correct reference levels", {
+  # Create a sample data frame
+  data <- data.frame(
+    label_genotype = factor(c("A", "B", "A", "B")),
+    label_environment = factor(c("X", "Y", "X", "Y"))
+  )
+  
+  # Expected reference levels
+  expected_ref_levels <- list(
+    label_genotype = "A",
+    label_environment = "X"
+  ) %>% unlist()
+  
+  # Test getRefLevel function
+  ref_levels <- getRefLevel(data)
+  
+  # Check if reference levels match the expected ones
+  expect_identical(ref_levels, expected_ref_levels)
+})
+
+# Test for replaceReferenceEffectBy0 function
+test_that("replaceReferenceEffectBy0 replaces reference effects correctly", {
+  
+  input_var_list <- init_variable( name = "genotype", mu = 0, sd = 2.18, level = 2) %>%
+                      init_variable( name = "env", mu = 0, sd = 0.57, level = 4 ) %>%
+                      init_variable(name = "T", mu = 0, sd = 0.39, level = 3) %>% 
+                      add_interaction(between_var = c("genotype", "env"), mu = 0 , sd = 1.018) %>%
+                      add_interaction(between_var = c("genotype", "env", "T"), mu = 0 , sd = 1.018)
+  
+  
+  N_GENES <- 10
+  l_dataFrom_normdistr <- getDataFromRnorm(input_var_list, N_GENES)
+  metadata <- getGeneMetadata(input_var_list , N_GENES)
+  df_effects <- get_effects_from_rnorm(input_var_list, metadata)
+  data <- cbind(metadata, df_effects) 
+  
+  l_labels_ref <- getRefLevel(data)
+  data <- replaceReferenceEffectBy0(input_var_list, l_labels_ref, data)
+  
+  # Check if modified data matches the expected data
+  expect_identical(colnames(data), c("geneID","label_genotype","label_env", "label_T", "genotype", "env" , "T", "genotype:env"  ,"genotype:env:T"))
+  expected_data <- data.frame(geneID = "gene1", label_genotype = "genotype1", label_env = "env1", label_T = "T1", genotype = 0, 
+                              env = 0, T = 0, "genotype:env" = 0, "genotype:env:T" = 0 )
+  colnames(expected_data) <- c("geneID", "label_genotype", "label_env", "label_T", "genotype", 
+                              "env", "T" , "genotype:env", "genotype:env:T" )
+  expected_data$label_genotype <- factor(expected_data$label_genotype , levels = c("genotype1", "genotype2"))
+  expected_data$label_env <- factor(expected_data$label_env , levels = c("env1", "env2", "env3", "env4"))
+  expected_data$label_T <- factor(expected_data$label_T , levels = c("T1", "T2", "T3"))
+
+  expect_identical(data[1,], expected_data )
+
+})
+
+
+
-- 
GitLab