From 58cd580eb83cc5004822214c714916be5abe8793 Mon Sep 17 00:00:00 2001
From: aduvermy <arnaud.duvermy@ens-lyon.fr>
Date: Wed, 29 Jun 2022 09:44:41 +0200
Subject: [PATCH] update git

---
 results/report_22-04-2022.Rmd                 | 294 ------------------
 results/template.css                          |  29 --
 src/htrsim/R/input_estimation.R               |  22 +-
 src/htrsim/R/launch_deseq.R                   |  12 +-
 src/htrsim/inst/extdata/public_bioDesign.csv  |  13 -
 src/htrsim/man/estim.mu.Rd                    |   4 +-
 src/htrsim/man/handle_except.Rd               |   8 +-
 src/htrsim/man/reshape_input2setup.Rd         |   4 +-
 src/htrsim/man/setup_countGener.Rd            |   5 +-
 .../comment-utiliser-mon-package.Rmd          | 105 +++++--
 10 files changed, 115 insertions(+), 381 deletions(-)
 delete mode 100644 results/report_22-04-2022.Rmd
 delete mode 100644 results/template.css
 delete mode 100644 src/htrsim/inst/extdata/public_bioDesign.csv

diff --git a/results/report_22-04-2022.Rmd b/results/report_22-04-2022.Rmd
deleted file mode 100644
index 6f5ce4d..0000000
--- a/results/report_22-04-2022.Rmd
+++ /dev/null
@@ -1,294 +0,0 @@
----
-title: "Report_2022-04-21"
-output: html_document
-date: '2022-04-21'
-css: template.css
----
-
-
-## Introduction
-
-
-In living world, phenotypes are understanding as a mixture between a genotype effect, an environment effect and an interaction between G&E. 
-$$Phenotype = Genotype + Environment + Genotype.Environment$$
-The quantification of each strengths (G,E; G&E) can be estimate by a coefficient $\beta$. 
-Then, our expression becomes: 
-$$Phenotype = \beta_{G} * Genotype + \beta_{E}*Environment +  \beta_{G*E} * Genotype.Environment + \epsilon$$
-Notice that $\beta$ is specific of each component. Furthermore, we introduced above $\epsilon$. It's the residual of the model. $\epsilon$ can be seen as the difference between observed values and values predicted by the model.
-
-Genes expression can be also considered as a phenotype. <br> 
-According to this, the quantification of $\beta_{G}$, $\beta_{E}$ and  $\beta_{G*E}$ for a given gene in a given condition may open the possibility to assess differences between the strengths in presence in different conditions.
-
-That's the purpose of Htrsim !
-
-## Htrsim
-
-<u> Model </u>
-
-In this aim, Htrsim is based on a model. <br> 
-Because of is easy of use this model is managed by DESEQ2.
-Then, $K_{ij}$ for gene i, sample j are modeled using a Negative Binomial distribution with fitted mean $\mu_{ij}$ and a gene-specific dispersion parameter $\alpha_i$. 
-
-$$
-K_{ij} \sim {\sf NB}(\mu_{ij} ; \sigma_i)
-$$
-$$
-\mu_{ij} = s_jq_{ij}
-$$
-$$
-log_2(q_{ij}) = x_j*\beta_i
-$$
-The fitted mean is composed of a sample-specific size factor $s_j$ and a parameter qij proportional to the expected true concentration of fragments for sample j. 
-The coefficients $\beta_i$ give the log2 fold changes for gene i for each column of the model matrix X. The sample-specific size factors can be replaced by gene-specific normalization factors for each sample using normalizationFactors.
-
-According to the DESEQ2 GLM and our purpose, we can write: 
-$$
-log_2(\mu_{ij]}) = \beta_{G}*G + \beta_{E}*E + \beta_{G*E}*G.E + \beta_{0} + \epsilon_{ij}
-$$
-
-According to this generalized linear model, we wish to estimate $\beta_{G}$, $\beta_{E}$ and $\beta_{G*E}$ for a given gene i, in a given condition j. Achieve this, would allow us to quantify each strengths (G, E, G&E) for a given gene i, in a given condition j.
-
-
-<u> Required </u>
-
-```{r message=FALSE, warning=FALSE, include=TRUE, results="hide"}
-library(htrsim)
-library(tidyverse)
-library(reshape2)
-```
-
-<u> Worklow </u>
-
-Using public libraries (from BioProject PRJNA675209b - chinese paper), and an usual RNA-seq pipeline, we build actual RNA-seq counts per genes for 3 genotypes and 2 environments.<br>
-<br>
-Using htrsim (in particular DESEQ2) and this count table, we are able to estimate $\beta_{G}$, $\beta_{E}$ and $\beta_{G*E}$.
-
-
-a. Input 
-
-
-```{r message=FALSE, warning=FALSE, include=TRUE, results="hide"}
-## Import & reshape table counts
-fn = system.file("extdata/", "public_tablCnts.tsv", package = "htrsim")
-tabl_cnts <- read.table(file = fn, header = TRUE)
-rownames(tabl_cnts) <- tabl_cnts$gene_id
-tabl_cnts <- tabl_cnts %>% select(-gene_id)##suppr colonne GeneID
-tabl_cnts <- tabl_cnts %>% select(-gene_name) ##suppr colonne GeneName
-
-## import design of bioProject
-fn = system.file("extdata/", "public_bioDesign.csv", package = "htrsim")
-bioDesign <- read.table(file = fn, header = T, sep = ';')
-
-```
-
-b. Launch DESEQ2
-
-```{r message=FALSE, warning=FALSE, include=TRUE, results="hide"}
-dds = run.deseq(tabl_cnts = tabl_cnts, bioDesign = bioDesign)
-```
-
-
-DESEQ returns a dds object which contains many, many things ... <br>
-In particular it contains the $\beta$ coefficients. <br>
-<br>
-You can access to beta coefficients using:
-
-```{r}
-dds.mcols = S4Vectors::mcols(dds,use.names=TRUE)
-```
-
-c. $\mu_{ij}$ 
-
-Following our model, we can estimate $log_2(\mu_{ij]})$ from $\beta$ coefficients inferred by DESEQ2,
-
-$$
-log_2(\mu_{ij]}) = \beta_{G}*G + \beta_{E}*E + \beta_{G*E}*G.E + \beta_{0} + \epsilon_{ij}
-$$
-
-Then, $\mu_{ij]}$ can be estimate
-
-$$
-\mu_{ij} = s_j * 2^{log_2\mu_{ij]}}
-$$
-```{r message=FALSE, warning=FALSE, include=TRUE, results="hide"}
-## Model matrix per samples
-mm <- model.matrix(~genotype + env + genotype:env, bioDesign)
-
-## Input estimation
-estim_mu = estim.mu(dds, mm)
-mu.input = estim_mu$mu
-```
-
-d. $K_{ij}$
-
-As defined by our model, counts $K_{ij}$ for gene i, sample j are modeled using a Negative Binomial distribution with fitted mean $\mu_{ij}$ and a gene-specific dispersion parameter $\alpha_i$. 
-
-$$
-K_{ij} \sim {\sf NB}(\mu_{ij} ; \sigma_i)
-$$
-The gene-specific dispersion parameter $\alpha_i$ is also stored in the dds object.<br>
-You can access to $\alpha_i$  using:
-```{r message=FALSE, warning=FALSE, include=TRUE, results="hide"}
-alpha.input = estim.alpha(dds)
-```
-
-Knowing $\alpha_i$ and $\mu_{ij]}$ for each gene and each condition given by the BioProject PRJNA675209b - chinese paper.
-We are now able to simulate $K_{ij}$ for each gene and each condition given by the BioProject PRJNA675209b.
-
-```{r message=FALSE, warning=FALSE, include=TRUE, results="hide"}
-# Setup simulation
-input = reshape_input2setup(mu.dtf = mu.input, alpha.dtf = alpha.input, average_rep = FALSE)
-
-#input$gene_id
-setup.simulation <- setup_countGener(bioSample_id = input$bioSample_id,
-                                     n_rep = 1,
-                                     alpha = input$alpha,
-                                     gene_id = input$gene_id,
-                                     mu = input$mu)
-
-#setup.simulation %>% dim()
-# Simulate counts
-htrs <- generate_counts(setup.simulation)
-
-```
-
-
-```{r message=FALSE, warning=FALSE, include=TRUE}
-k_ij.simu = htrs %>% select(-gene_id) %>% flatten() %>% unlist()
-k_ij.actual = tabl_cnts %>% flatten() %>% unlist()
-
-df = cbind(k_ij.actual, k_ij.simu) %>% reshape2::melt(., value.name = "k_ij", variable.name = "origin")
-df$origin = df$Var2
-df = df %>% select(-Var2)
-max_k_ij.simu = df %>% filter(origin == "k_ij.simu") %>% select(k_ij) %>% max()
-max_k_ij.actual = df %>% filter(origin == "k_ij.actual") %>% select(k_ij) %>% max()
-
-ggplot(df, aes(x = k_ij, fill= origin )) +  geom_density(bins = 100, alpha = 0.5) + 
-  geom_vline(xintercept = max_k_ij.actual, col = "#F8766D" ) +  
-  geom_vline(xintercept = (max_k_ij.simu), col= "#00BFC4" )   +
-  scale_x_log10()
-```
-
-Comment: $K_{ij}$ simulated are abnormally huge !
-Comment: $K_{ij}$ simulated are slightly different from the actual K_{ij} !
-
-
-## Why so much differences
-
-b. $\epsilon$
-
-In our model, we define as follow:
-$$
-\epsilon_{ij} \sim {\sf N}(0 ; deviance_i)
-$$
-
-Let's see the distribution of $deviance_{i}$.
-
-
-```{r warning=FALSE}
-#estim_mu$beta.matrix
-
-deviance_i = estim_mu$deviance.sqrt[!is.na(estim_mu$deviance.sqrt)]^2
-#epsilon_ij <- mm[,1] %>% map(., ~rnorm(deviance_i.sqrt, mean = 0, sd = deviance_i.sqrt ))  %>% data.frame() %>% flatten() %>% unlist()
-
-
-# Histogram logarithmic y axis
-ggplot(data.frame(deviance_i), aes(deviance_i)) +               
-  geom_histogram(bins = 100) #+ scale_x_log10()
-
-
-```
-
-
-The deviance is also inferred by DESEQ while computing its model.
-deviance is mostly inferred between 100 and 200.  
-
-$$
-log_2(\mu_{ij]}) = \beta_{G}*G + \beta_{E}*E + \beta_{G*E}*G.E + \beta_{0} + \epsilon_{ij}
-$$
-
-```{r warning=FALSE}
-#estim_mu$beta.matrix
-
-deviance_i.sqrt = estim_mu$deviance.sqrt[!is.na(estim_mu$deviance.sqrt)]
-epsilon_ij <- mm[,1] %>% map(., ~rnorm(deviance_i.sqrt, mean = 0, sd = 0 ))  %>% data.frame() %>% flatten() %>% unlist()
-#epsilon_ij <- mm[,1] %>% map(., ~rnorm(deviance_i.sqrt, mean = 0, sd = 0 ))  %>% data.frame() %>% flatten() %>% unlist()
-
-# Histogram logarithmic y axis
-ggplot(data.frame(epsilon_ij), aes(epsilon_ij)) +               
-  geom_histogram(bins = 100) #+ scale_x_log10()
-
-
-```
-
-
-Comment: Some  $\epsilon_{ij}$ are huge !
-Recall: $\epsilon$ can be seen as the difference between observed values and values predicted by the model.
-
-A large panel of $\epsilon$ mean that the model doesn't fit well with the observed data.
-
-It means that even if $\beta$ coefficients are well estimate. $\log_2(\mu_{ij]})$ will vary around them with a large panel of values (+/- 40)
-
-
-
-
-```{r warning=FALSE}
-beta.dtf = estim_mu$beta.matrix %>% data.frame()
-beta.dtf.long = beta.dtf %>% reshape2::melt(., value.name = "beta", variable.name = "origin")
-
-ggplot(beta.dtf.long, aes(x = beta )) +  geom_density(bins = 100, alpha = 0.5, fill = 'grey') + facet_grid(~origin, scales = "free_x")
-
-```
-```{r warning=FALSE}
-beta.dtf = estim_mu$beta.matrix %>% data.frame()
-beta.dtf.long = beta.dtf %>% reshape2::melt(., value.name = "beta", variable.name = "origin")
-
-
-
-B0 = estim_mu$dds.mcols$SE_Intercept
-B1 = estim_mu$dds.mcols$SE_genotype_msn2D_vs_wt
-B2 <- estim_mu$dds.mcols$SE_genotype_msn4D_vs_wt
-B3 <- estim_mu$dds.mcols$SE_env_kcl_vs_control
-B4 <- estim_mu$dds.mcols$SE_genotypemsn2D.envkcl
-B5 <- estim_mu$dds.mcols$SE_genotypemsn4D.envkcl
-
-
-SE_B.dtf <- cbind(B0, B1, B2, B3, B4, B5) %>% data.frame()
-SE_B.dtf.long = SE_B.dtf %>% reshape2::melt(., value.name = "SE_beta", variable.name = "origin")
-
-ggplot(SE_B.dtf.long, aes(x = SE_beta, fill= origin )) +  geom_density(bins = 100, alpha = 0.5) + facet_grid(~origin)
-```
-
-```{r warning=FALSE}
-bind_dtf<- cbind(SE_B.dtf.long, beta.dtf.long %>% select(-origin))
-ggplot(bind_dtf, aes(x = beta, y= SE_beta, fill= origin )) +  geom_point(alpha = 0.1) + facet_grid(~origin)
-
-
-#new <- bind_dtf %>% mutate(annot = ifelse(origin == "B4 | B5" && SE_beta > 6 , TRUE, FALSE ))
-#new <- bind_dtf %>% tail
-#new %>% filter(beta == "B4")
-```
-
-
-```{r warning=FALSE}
-dim(htrs)
-
-new <- bind_dtf %>% mutate(annot = ifelse(((origin == "B4") | (origin == "B5")) & (SE_beta > 6) , TRUE, FALSE ))
-### WARNING 
-new %>% dcast(., annot ~ origin)
-
-
-SE_threshold = 6
-SE_B.dtf.annot = SE_B.dtf %>%  mutate(annot = ifelse((B4 > SE_threshold) | (B5 > SE_threshold) , TRUE, FALSE ))
-SE_B.dtf.annot %>% group_by(annot) %>% tally()
-SE_B.dtf.annot.long = SE_B.dtf.annot %>% reshape2::melt(., value.name = "SE_beta", variable.name = "origin")
-
-
-bind_dtf.annot<- cbind(SE_B.dtf.annot.long, beta.dtf.long %>% select(-origin))
-bind_dtf.annot = bind_dtf.annot %>% filter(!is.na(annot))
-ggplot(bind_dtf.annot, aes(x = beta, y= SE_beta, col = annot )) +  geom_point(alpha = 0.1) + facet_grid(~origin)
-
-
-```
-
-
diff --git a/results/template.css b/results/template.css
deleted file mode 100644
index 0649808..0000000
--- a/results/template.css
+++ /dev/null
@@ -1,29 +0,0 @@
-
-title {
-    margin-top: 200px;
-
-    text-align: center;
-}
-
-
-
-
-h1, .h1 {
-    margin-top: 84px;
-
-    text-align: center;
-}
-
-
-h3, .h3, h4 {
-    text-align: center;
-}
-
-.caption {
-  font-size: 0.9em;
-  font-style: italic;
-  color: grey;
-  margin-right: 10%;
-  margin-left: 10%;
-  text-align: center;
-}
diff --git a/src/htrsim/R/input_estimation.R b/src/htrsim/R/input_estimation.R
index b601c5e..cfbe88b 100644
--- a/src/htrsim/R/input_estimation.R
+++ b/src/htrsim/R/input_estimation.R
@@ -57,7 +57,7 @@ estim.alpha <- function(dds, export = FALSE){
 #' @import S4Vectors
 #' @examples
 #' @importFrom rlang .data
-estim.mu <- function(dds, mm, export = FALSE){
+estim.mu <- function(dds, mm, epsilon = TRUE,  export = FALSE){
 
 
   gene_id <- NULL
@@ -90,14 +90,22 @@ estim.mu <- function(dds, mm, export = FALSE){
   beta.matrix = cbind(B0, B1,B2,B3,B4,B5) %>% as.matrix()
   #p_ij = B0_i*mm1_j + B1_i*mm2_j + B3_i*mm3_j + B4_i*mm4_j + B5_i*mm5_j
   p_ij = beta.matrix %*% t(mm)
-  #epsilon_ij ~ N(0, deviance)
-  #epsilon_ij = mm[,1] %>% map(., ~rnorm(deviance_i.sqrt, mean = 0, sd = 0 ))  %>% data.frame() %>% as.matrix()
-  epsilon_ij = mm[,1] %>% map(., ~rnorm(deviance_i.sqrt, mean = 0, sd = deviance_i.sqrt ))  %>% data.frame() %>% as.matrix()
-  ## log_qij = p_ij + epsilon_ij
-  log_qij <- p_ij + epsilon_ij
+
+
+  if (epsilon == TRUE){
+    message("Epsilon : TRUE")
+    #epsilon_ij ~ N(0, deviance)
+    epsilon_ij = mm[,1] %>% map(., ~rnorm(deviance_i.sqrt, mean = 0, sd = deviance_i.sqrt ))  %>% data.frame() %>% as.matrix()
+    ## log_qij = p_ij + epsilon_ij
+    log_qij <- p_ij + epsilon_ij
+  }
+  else {
+    message("Epsilon : FALSE")
+    log_qij <- p_ij
+  }
+
   ## s_j
   s_j = dds$sizeFactor
-
   mu_ij = s_j * 2^log_qij
 
 
diff --git a/src/htrsim/R/launch_deseq.R b/src/htrsim/R/launch_deseq.R
index d500f78..1f5f266 100644
--- a/src/htrsim/R/launch_deseq.R
+++ b/src/htrsim/R/launch_deseq.R
@@ -9,13 +9,11 @@
 #'
 #' @examples
 run.deseq <- function(tabl_cnts, bioDesign ){
-  ########### LAUNCH DESEQ #############
-  ## Design model - specify reference
-  bioDesign$genotype <- factor(x = bioDesign$genotype,levels = c('wt','msn2D', 'msn4D'))
-  bioDesign$env <- factor(x = bioDesign$env,levels = c('control', 'kcl'))
 
-  ## DESEQ standard analysis
-  dds = DESeq2::DESeqDataSetFromMatrix( countData = round(tabl_cnts), colData = bioDesign , design = ~ genotype + env + genotype:env)
+  dds = DESeq2::DESeqDataSetFromMatrix( countData = round(tabl_cnts), colData = bioDesign , design = ~ genotype + env + genotype:env )
+
+
   dds <- DESeq2::DESeq(dds)
   return(dds)
-}
+
+  }
diff --git a/src/htrsim/inst/extdata/public_bioDesign.csv b/src/htrsim/inst/extdata/public_bioDesign.csv
deleted file mode 100644
index 9c0ad93..0000000
--- a/src/htrsim/inst/extdata/public_bioDesign.csv
+++ /dev/null
@@ -1,13 +0,0 @@
-sample;env;genotype
-WT_control_rep1;control;wt
-WT_control_rep2;control;wt
-WT_KCl_rep1;kcl;wt
-WT_KCl_rep2;kcl;wt
-Msn4D_control_rep1;control;msn4D
-Msn4D_control_rep2;control;msn4D
-Msn4D_KCl_rep1;kcl;msn4D
-Msn4D_KCl_rep2;kcl;msn4D
-Msn2D_control_rep1;control;msn2D
-Msn2D_control_rep2;control;msn2D
-Msn2D_KCl_rep1;kcl;msn2D
-Msn2D_KCl_rep2;kcl;msn2D
diff --git a/src/htrsim/man/estim.mu.Rd b/src/htrsim/man/estim.mu.Rd
index 55ffd21..0a203bd 100644
--- a/src/htrsim/man/estim.mu.Rd
+++ b/src/htrsim/man/estim.mu.Rd
@@ -4,11 +4,13 @@
 \alias{estim.mu}
 \title{Estimate mu_ij}
 \usage{
-estim.mu(dds, export = FALSE)
+estim.mu(dds, mm, epsilon = TRUE, export = FALSE)
 }
 \arguments{
 \item{dds}{DESEQ2 object}
 
+\item{mm}{a model matrix}
+
 \item{export}{Boolean}
 }
 \value{
diff --git a/src/htrsim/man/handle_except.Rd b/src/htrsim/man/handle_except.Rd
index 563ca47..083d5be 100644
--- a/src/htrsim/man/handle_except.Rd
+++ b/src/htrsim/man/handle_except.Rd
@@ -4,18 +4,16 @@
 \alias{handle_except}
 \title{Handle exception}
 \usage{
-handle_except(bioSample, n_rep, gene_id, alpha, n_genes)
+handle_except(bioSample, n_rep, gene_id, alpha)
 }
 \arguments{
+\item{bioSample}{vector of id for each bioSample}
+
 \item{n_rep}{number of replicates}
 
 \item{gene_id}{vector of id for each gene}
 
 \item{alpha}{vector of alpha_i}
-
-\item{n_genes}{number of genes}
-
-\item{bioSample_id}{vector of id for each bioSample}
 }
 \value{
 
diff --git a/src/htrsim/man/reshape_input2setup.Rd b/src/htrsim/man/reshape_input2setup.Rd
index eb85d5a..27551e3 100644
--- a/src/htrsim/man/reshape_input2setup.Rd
+++ b/src/htrsim/man/reshape_input2setup.Rd
@@ -4,12 +4,14 @@
 \alias{reshape_input2setup}
 \title{Reshape input before building setup}
 \usage{
-reshape_input2setup(mu.dtf, alpha.dtf)
+reshape_input2setup(mu.dtf, alpha.dtf, average_rep = FALSE)
 }
 \arguments{
 \item{mu.dtf}{dataframe of mu_ij}
 
 \item{alpha.dtf}{dataframe of alpha_i}
+
+\item{average_rep}{bool}
 }
 \value{
 
diff --git a/src/htrsim/man/setup_countGener.Rd b/src/htrsim/man/setup_countGener.Rd
index c9b1556..4d6b277 100644
--- a/src/htrsim/man/setup_countGener.Rd
+++ b/src/htrsim/man/setup_countGener.Rd
@@ -5,11 +5,10 @@
 \title{Build setup for counts generator}
 \usage{
 setup_countGener(
-  bioSample_id = "my_first_lib",
+  bioSample_id = NULL,
   n_rep = NULL,
   gene_id = NULL,
   alpha = NULL,
-  n_genes = NULL,
   mu = NULL
 )
 }
@@ -22,8 +21,6 @@ setup_countGener(
 
 \item{alpha}{vector of alpha_i}
 
-\item{n_genes}{number of genes}
-
 \item{mu}{dataframe of mu_ij}
 }
 \value{
diff --git a/src/htrsim/vignettes/comment-utiliser-mon-package.Rmd b/src/htrsim/vignettes/comment-utiliser-mon-package.Rmd
index 5f5daa5..ba6fe98 100644
--- a/src/htrsim/vignettes/comment-utiliser-mon-package.Rmd
+++ b/src/htrsim/vignettes/comment-utiliser-mon-package.Rmd
@@ -7,28 +7,47 @@ vignette: >
   %\VignetteEncoding{UTF-8}
 ---
 
+<style>
+body {
+text-align: justify}
+</style>
+
 # A. Introduction
 
+
+A differential expression analysis uses a generalized linear model of the form:
+
+$$
+K_{ij} \sim {\sf NB}(\mu_{ij} ; \sigma_i)
+$$
 $$
-Phenotype = Genotype + Environment + Genotype.Environment
+\mu_{ij} = s_jq_{ij}
 $$
-From this expression, $\beta_{G}$, $\beta_{E}$, $\beta_{G*E}$ can be seen as coefficients which allow quantifying the participation of each factors (Genotype, Environment and interaction Genotype/Environment).
-In mathematical term, it leads to a linear expression such: 
 $$
-P = \beta_{G}*G + \beta_{E}*E + \beta_{G*E}*G.E + \beta_{0}
+log_2(q_{ij}) = x_j*\beta_i
+$$
+where counts $K_{ij}$ for gene i, sample j are modeled using a Negative Binomial distribution with fitted mean $\mu_{ij}$ and a gene-specific dispersion parameter $\alpha_i$. 
+The fitted mean is composed of a sample-specific size factor $s_j$ and a parameter qij proportional to the expected true concentration of fragments for sample j. 
+The coefficients $\beta_i$ give the log2 fold changes for gene i for each column of the model matrix X. The sample-specific size factors can be replaced by gene-specific normalization factors for each sample using normalizationFactors.
+
+Basically, genes expression is understanding as a shake between a genotype effect, an environment effect and an interaction between G&E. The part of each effect can be modelized by a coefficient $\beta$. 
+Considering genes expression as a phenotype we can write:
+$$Phenotype = \beta_{G} * Genotype + \beta_{E}*Environment +  \beta_{G*E} * Genotype.Environment$$
+
+From a generalized  linear model we can try to quantify each effect : $\beta_{G}$, $\beta_{E}$ and $\beta_{G*E}$.
+Quantifying such coefficients will allow to evaluate the participation of each factors (Genotype, Environment and interaction Genotype/Environment) for each gene. Then, differencies between genes or conditions could be assessed.
+
+According to the DESEQ2 GLM, we can write: 
+$$
+log_2(\mu_{ij]}) = \beta_{G}*G + \beta_{E}*E + \beta_{G*E}*G.E + \beta_{0}
 $$
 
-In order to estimate these coefficients, a Generalized Linear Model (GLM) can be used.
 
 # B. HTRSIM getting started
 
   <u>a. Required</u>
 
-```{r, setupWD, include=FALSE}
-#knitr::opts_knit$set(root.dir = '~/counts_simulation/')
-```
-
-```{r setup}
+```{r setup, results='hide', message=FALSE, warning=FALSE}
 library(htrsim)
 library(tidyverse)
 ```
@@ -58,27 +77,73 @@ tabl_cnts <- read.table(file = fn, header = TRUE)
 rownames(tabl_cnts) <- tabl_cnts$gene_id
 tabl_cnts <- tabl_cnts %>% select(-gene_id)##suppr colonne GeneID
 tabl_cnts <- tabl_cnts %>% select(-gene_name) ##suppr colonne GeneName
+```
+
+```{r}
+fn = system.file("extdata/", "public_bioDesign.csv", package = "htrsim")
+
+bioDesign <- read.table(file = fn, header = TRUE, sep = ';')
 ```
 
  <u> e. Launch HTRSIM</u>
    
 ```{r message=FALSE, warning=FALSE}
-fn = system.file("extdata/", "public_bioDesign.csv", package = "htrsim")
 
-## import design of bioProject
-bioDesign <- read.table(file = fn, header = T, sep = ';')
-#bioDesign
-#source(file = "htrsim/main.R")
-tabl_cnts %>% dim()
-bioDesign %>% dim()
-#simul_cnts = htrsim(tabl_cnts, bioDesign = bioDesign, 2)
+#dds = run.deseq(tabl_cnts = tabl_cnts, bioDesign = bioDesign)
+
+#htrsim.results = htrsim(countData = tabl_cnts, bioDesign= bioDesign, N_replicates=1)
+
+
+
+```
+<u> f. Evaluate simulation </u>
+
+```{r}
+build_design2Deseq = function(htrs.tablCnts){
+  #samples = samples %>% data.frame()
+  sample = colnames(htrs.tablCnts %>% select(-gene_id))
+  #design.deseq <- list(samples = colnames(htrs.tablCnts))
+  genotype <- map(sample, ~str_split(., pattern = "_")[[1]][1]) %>% unlist()
+  env <-  map(sample, ~str_split(., pattern = "_")[[1]][2]) %>% unlist()
+  new_design = cbind(sample,genotype, env) %>% data.frame()
+  
+  return (new_design)
+}
+
+designSimu <- build_design2Deseq(htrsim.results$countDataSim)
+
+row.names(htrsim.results$countDataSim) = htrsim.results$countDataSim$gene_id
+htrsim.results$countDataSim = htrsim.results$countDataSim %>% select(-gene_id)
+
+htrsim.results$countDataSim %>% dim()
+designSimu %>% dim()
+#bioDesign$
+designSimu$env
+colnames(htrsim.results$countDataSim)
+model.matrix(~genotype + env + genotype:env, designSimu)
+simu_deseq = run.deseq(tabl_cnts = htrsim.results$countDataSim, designSimu)
+
+designSimu$genotype <- factor(x = bioDesign$genotype,levels = c('wt','msn2D', 'msn4D'))
+designSimu$env <- factor(x = bioDesign$env,levels = c('control', 'kcl'))
+
+  ## DESEQ standard analysis
+
+htrs.tablCnts <- htrsim.results$countDataSim %>% replace(is.na(.), 0)
+
+max(htrs.tablCnts)
+dds = DESeq2::DESeqDataSetFromMatrix( countData = round(htrs.tablCnts/10), colData = designSimu , design = ~ genotype + env + genotype:env)
+dds <- DESeq2::DESeq(dds)
 
-htrsim.results = htrsim(countData = tabl_cnts, bioDesign= bioDesign, N_replicates=2)
 
 ```
 
+
  <u> f. Visualize results</u>
    
 ```{r message=FALSE, warning=FALSE}
-htrsim.results$input$d
+
+devtools::load_all()
+
+
+
 ```
-- 
GitLab