From bd2f35dd0960c70ca735e153cd2fa5df53848bda Mon Sep 17 00:00:00 2001
From: aduvermy <arnaud.duvermy@ens-lyon.fr>
Date: Fri, 25 Mar 2022 15:53:44 +0000
Subject: [PATCH] update tuto

---
 src/tutorial_htrsim.Rmd | 495 ++++++++++++++++++++++++++++++++--------
 1 file changed, 404 insertions(+), 91 deletions(-)

diff --git a/src/tutorial_htrsim.Rmd b/src/tutorial_htrsim.Rmd
index 8ddc0c5..36d2092 100644
--- a/src/tutorial_htrsim.Rmd
+++ b/src/tutorial_htrsim.Rmd
@@ -3,10 +3,29 @@ title: "HTRSIM Getting started"
 output: html_document
 ---
 
+# A. Introduction
+
+$$
+Phenotype = Genotype + Environment + Genotype.Environment
+$$
+From this expression, $\beta_{G}$, $\beta_{E}$, $\beta_{G*E}$ can be seen as coefficients which allow quantifying the participation of each factors (Genotype, Environment and interaction Genotype/Environment).
+Then, 
+$$
+P = \beta_{G}*G + \beta_{E}*E + \beta_{G*E}*G.E + \beta_{0}
+$$
+In order to estimate these coefficients, a Generalized Linear Model (GLM) can be used.
+
+
+```{r echo=FALSE, out.width='100%'}
+knitr::include_graphics('../img/schema_loop.jpg')
+```
+
+
+
 This is a tutorial for *htrsim* utilization
 
 
-```{r required}
+```{r required, message=FALSE, echo = T, results = "hide"}
 library(data.table)
 library(tidyverse)
 library(DESeq2)
@@ -15,7 +34,9 @@ library(DESeq2)
 
 ```{r setworkdir}
 # on berthollet
-setwd("/home2/aduvermy/counts_simulation/src/")
+#setwd("/home2/aduvermy/counts_simulation/src/")
+# on VM
+setwd("/home/rstudio/mydatalocal/counts_simulation/src/")
 ```
 
 
@@ -35,7 +56,7 @@ As a first step we will simulate a single library with 100 replicates.</br>
 
   <u>a. Setup the simulation</u> 
   
-```{r simple design, echo=TRUE}
+```{r simple design, echo=TRUE, message=FALSE, warning=FALSE}
 N_samples = 1 ## number of samples
 N_replicates = 3 ## Nb of replicates per samples
 N_genes = 3 # number of genes in your favorite organism
@@ -46,7 +67,9 @@ samples
 
 ### SETUP SIMULATION 
 source(file = "htrsim/setup_cntsGenerator.R")
-setup.simulation <- setup_countGener(sample_names= samples$name , n_rep= samples$n_rep , n_genes = N_genes)
+setup.simulation <- setup_countGener(sample_names = samples$name,
+                                     n_rep= samples$n_rep,
+                                     n_genes = N_genes)
 setup.simulation
 ```
 
@@ -54,7 +77,8 @@ setup.simulation
 
 Now you can generate a lovely counts table from parameters *mu* and *alpha* stored in ```setup.simulation```.</br>
 
-```{r generate counts, echo=TRUE}
+```{r generate counts, echo=TRUE, message=FALSE, warning=FALSE}
+## GENERATE COUNTS
 source(file = "htrsim/counts_generator.R")
 htrs <- generate_counts(setup.simulation)
 htrs %>% dim()
@@ -86,7 +110,7 @@ Then, $\mu_{ij}$ and $\alpha_{i}$ define a negative binomial distribution from w
 </br>
 Please redo the simulation with a large set of replicates for your library.  </br>
 
-```{r understand mu & alpha, echo=TRUE}
+```{r understand mu & alpha, echo=TRUE, message=FALSE, warning=FALSE}
 N_samples = 1 ## number of samples
 N_replicates = 2000 ## Nb of replicates per samples
 N_genes = 3 # number of genes in your favorite organism
@@ -96,7 +120,10 @@ samples <- list(name = "my_sample", n_rep = N_replicates)
 
 ### SETUP SIMULATION 
 source(file = "htrsim/setup_cntsGenerator.R")
-setup.simulation <- setup_countGener(sample_names= samples$name , n_rep= samples$n_rep , n_genes = N_genes)
+setup.simulation <- setup_countGener(sample_names= samples$name, 
+                                     n_rep= samples$n_rep, 
+                                     n_genes = N_genes)
+### GENERATE COUNTS
 source(file = "htrsim/counts_generator.R")
 htrs <- generate_counts(setup.simulation)
 ```
@@ -106,16 +133,19 @@ Next, plot your distribution of simulated counts per gene.</br>
 
 <span style="color:red">*WARNING: Code bellow is informative only with one library, a small set of genes, but a lot of replicates !*</span></br>
 
-```{r plot hist, echo = TRUE}
-
+```{r plot hist, message=FALSE, warning=FALSE}
+## Reshape and build dataframe easy to plot
 data2plot <- htrs %>% reshape2::melt( ., id=c("name_gene"), variable.name = "Run")
-data2plot <- data2plot %>% dplyr::group_by(name_gene) %>% dplyr::mutate(mean_obs = mean(value)) %>% dplyr::ungroup()
+data2plot <- data2plot %>% 
+                dplyr::group_by(name_gene) %>% 
+                dplyr::mutate(mean_obs = mean(value)) %>% 
+                dplyr::ungroup()
 
 figure = data2plot %>% ggplot(., aes(x=value)) +
-  geom_histogram(fill= "grey", binwidth = 30) + 
-  facet_wrap(~name_gene, scale = "free_y") +
-  geom_vline(aes(xintercept = mean_obs), col="#0072B2") +
-  geom_text(aes(x = mean_obs, y = 0, label = paste0("Mean obs\n", mean_obs), vjust = -2, hjust=-0.2), col="#0072B2")
+            geom_histogram(fill= "grey", binwidth = 30) + 
+            facet_wrap(~name_gene, scale = "free_y") +
+            geom_vline(aes(xintercept = mean_obs), col="#0072B2") +
+            geom_text(aes(x = mean_obs, y = 0, label = paste0("Mean obs\n", mean_obs), vjust = -2, hjust=-0.2), col="#0072B2")
 figure
 setup.simulation
 ``` 
@@ -131,11 +161,10 @@ And alpha is the dispersion of the distribution.
   <u>d. Setup mu and alpha before counts generation</u>
    
 *htrsim* gives you the possibility to setup your own parameters for negative binomial. </br>
-You can control the distribution from which each count $\c_{ij}$ are sampled.
+You can control the distribution from which each count $c_{ij}$ are sampled.
 
 
 ```{r setup params}
-
 N_samples = 2 ## number of samples
 N_replicates = 2 ## Nb of replicates per samples
 N_genes = 3 # number of genes in your favorite organism
@@ -145,47 +174,58 @@ samples <-  list(name = paste0('sample', 1:N_samples), n_rep = N_replicates)
 
 
 ### Defining alpha params for each gene
-alphaGene.set = runif(0.2,120, n = N_genes) ## randomly defined between 2 and 100
+alphaGene.set = stats::runif(0.2,120, n = N_genes) ## randomly defined between 2 and 120
 nameGene.set = paste0('gene', 1:N_genes)
 genes <-  list(name = nameGene.set , alpha = alphaGene.set) %>% as.data.frame()
-#genes
 
-### Defining mu params for each gene of each sample
+
 ## from an input dataframe
 mu.dtf.colnames = samples$name
 mu.dtf.geneID = genes$name
 mu.dtf.init = data.frame(matrix(0, nrow = length(mu.dtf.geneID), ncol = length(mu.dtf.colnames)))
-colnames(mu.dtf.init) = mu.dtf.colnames#
-mu.dtf <- mu.dtf.init %>% mutate_all(., ~ runif(100, 2000, n = N_genes)) ## fill mu.dtf from uniform law
-mu.dtf <- mu.dtf %>% mutate(gene_id = mu.dtf.geneID) %>% select(gene_id ,everything()) #optionnal
-## from a statistical law
-mu.foo <- function(x) return(runif(100,1000, n = x ))
-
+colnames(mu.dtf.init) = mu.dtf.colnames
+mu.dtf <- mu.dtf.init %>% 
+                dplyr::mutate_all(., ~ runif(100, 2000, n = N_genes)) ## fill mu.dtf from uniform law
+mu.dtf <- mu.dtf %>% #optionnal
+                dplyr::mutate(gene_id = mu.dtf.geneID) %>% 
+                dplyr::select(gene_id ,everything()) 
 
 ## Setup simulation
 source(file = "htrsim/setup_cntsGenerator.R")
-## with mu dataframe
-setup.simulation1 <- setup_countGener(sample_names= samples$name , n_rep= samples$n_rep , gene_dispersion = genes$alpha, gene_names = genes$name, mu = mu.dtf)
+## with mu define from a dataframe
+setup.simulation1 <- setup_countGener(sample_names= samples$name, 
+                                      n_rep= samples$n_rep, 
+                                      gene_dispersion = genes$alpha, 
+                                      gene_names = genes$name, 
+                                      mu = mu.dtf)
 setup.simulation1
-## with mu function
-setup.simulation2 <- setup_countGener(sample_names= samples$name , n_rep= samples$n_rep , gene_dispersion = genes$alpha, gene_names = genes$name, mu = mu.foo)
+
+### Defining mu params for each gene of each sample
+## from a statistical law
+mu.foo <- function(x) return(runif(100,1000, n = x ))
+## with mu define from function
+setup.simulation2 <- setup_countGener(sample_names= samples$name, 
+                                      n_rep= samples$n_rep, 
+                                      gene_dispersion = genes$alpha, 
+                                      gene_names = genes$name, 
+                                      mu = mu.foo)
 setup.simulation2
 ```
 
 
 Above we randomly defined $\mu_{ij}$ and $\alpha_{i}$.</br>
 
-Defining mu and alpha value per gene for each sample is  useful when you can infer them from experimental data.</br>
+Defining mu and alpha value per gene for each sample is useful when you can infer them from experimental data.</br>
 [See rna-seq_public_library_investigation](https://gitbio.ens-lyon.fr/aduvermy/rna-seq_public_library_investigations)
 
 
 Use filter to understand the ```setup.simulation``` object.</br>
 
 ```{r filter setup, echo=TRUE}
-setup.simulation1 %>% filter(name_gene == "gene1")
+setup.simulation1 %>% dplyr::filter(name_gene == "gene1")
 ```
 
-You should notice that $$\mu$$ and $$\alpha$$ are equal for equivalent gene between replicates.</br>
+You should notice that $\mu$ and $\alpha$ are equal for equivalent gene between replicates.</br>
 But not $\mu$ ! </br>
 As defined in : <br> 
 $$
@@ -194,7 +234,7 @@ $$
 
 ## B. Performing complex design
 
-Now you understood how mu and alpha affect the distribution of counts per genes.
+Now you understood how $\mu$ and $\alpha$ affect the distribution of counts per gene.<br>
 Also, you mastered using *htrsim* to build a library from simple RNA-seq design.
 
 
@@ -203,7 +243,7 @@ Also, you mastered using *htrsim* to build a library from simple RNA-seq design.
 *Htrsim* was developed to build complex RNA-seq design. 
 By complex RNA-seq design I mean several libraries with several replicates and not often the same number of replicates per samples.
 
-```{r complex design}
+```{r complex design, message=FALSE, warning=FALSE}
 N_samples = 10
 max_N_replicates = 5 ## maximum number of replicates per sample -> heterogeneous design
 N_genes = 6000
@@ -224,17 +264,24 @@ genes <-  list(name = nameGene.set , alpha = alphaGene.set) %>% as.data.frame()
 mu.dtf.colnames = samples$name
 mu.dtf.geneID = genes$name
 mu.dtf.init = data.frame(matrix(0, nrow = length(mu.dtf.geneID), ncol = length(mu.dtf.colnames)))
-colnames(mu.dtf.init) = mu.dtf.colnames#
-mu.dtf <- mu.dtf.init %>% mutate_all(., ~ runif(100, 2000, n = N_genes)) ## fill mu.dtf from uniform law
-mu.dtf <- mu.dtf %>% mutate(gene_id = mu.dtf.geneID) %>% select(gene_id ,everything())
+colnames(mu.dtf.init) = mu.dtf.colnames
+mu.dtf <- mu.dtf.init %>% 
+                      mutate_all(., ~ runif(100, 2000, n = N_genes)) ## fill mu.dtf from uniform law
+mu.dtf <- mu.dtf %>% 
+                  mutate(gene_id = mu.dtf.geneID) %>%  # reshape for convenience
+                  dplyr::select(gene_id ,everything())
 
 ## Setup simulation
 source(file = "htrsim/setup_cntsGenerator.R")
 ## with mu dataframe
-setup.simulation <- setup_countGener(sample_names= samples$name , n_rep= samples$n_rep , gene_dispersion = genes$alpha, gene_names = genes$name, mu = mu.dtf)
+setup.simulation <- setup_countGener(sample_names= samples$name, 
+                                     n_rep= samples$n_rep, 
+                                     gene_dispersion = genes$alpha, 
+                                     gene_names = genes$name, 
+                                     mu = mu.dtf)
 source(file= "htrsim/counts_generator.R" )
 htrs <- generate_counts(setup.simulation)
-htrs %>% filter(name_gene == "gene1")
+htrs %>% dplyr::filter(name_gene == "gene1")
 ```
 
 About columns names:</br>
@@ -248,21 +295,23 @@ About columns names:</br>
 
 Once you convert name_gene columns to rownames ```htrs``` can be used as input of DESEQ2.
 
-```{r input deseq 1}
+```{r input deseq 1, message=FALSE, warning=FALSE}
+## Reshape for DESEQ
 rownames(htrs) <- htrs$name_gene
-htrs.deseq = htrs %>% select(-name_gene) # remove name gene column
-htrs.deseq %>% filter(rownames(.)=='gene1')
+htrs.deseq = htrs %>% dplyr::select(-name_gene) # remove name gene column
 ```
 
-But DESEQ2 also needs to understand your design. For this follow this (link to deseq tuto)
-Otherwise, we developed a function to convert ```setup.simulation``` into a conventional deseq2 design input.
+But DESEQ2 also needs to understand your design. [DESeq2 tutorial](http://bioconductor.org/packages/devel/bioc/vignettes/DESeq2/inst/doc/DESeq2.html)<br>
+For this, we developed a function to convert ```setup.simulation``` into a conventional deseq2 design input.
 
 
-```{r input deseq 2}
+```{r input deseq 2, message=FALSE, warning=FALSE}
 ## add information for each sample
 set.seed(101)
 samples$env <- c(rep("kcl", 5), rep("control", 5)) ## 5 x 2 environments
 samples$genotype <- sample(c("msn2d", "msn4D", "WT"),10, replace = TRUE)
+
+## SETUP DESEQ
 source(file = "htrsim/setup_deseq.R")
 setup.deseq <- setup_deseq(samples, htrs.deseq)
 setup.deseq %>% head()
@@ -270,17 +319,19 @@ setup.deseq %>% head()
 
   <u>c. Run DESeq2</u>
 
-```{r}
+```{r message=FALSE, warning=FALSE}
 #source(file = "htrsim/launch_deseq.R")
 htrs.deseq %>% dim()# htrs %>% select(-name_gene)
 setup.deseq %>% dim()
 #model.matrix(~genotype + env + genotype:env, setup.deseq) # check if a column is fully equal to 0 if yes -> lead to an error
-dds = DESeq2::DESeqDataSetFromMatrix(countData = htrs.deseq , colData = setup.deseq , design = ~ genotype + env + genotype:env)
+dds = DESeq2::DESeqDataSetFromMatrix(countData = htrs.deseq, 
+                                     colData = setup.deseq, 
+                                     design = ~ genotype + env + genotype:env)
 ```
 
    <u>d. alpha</u>
 
-```{r}
+```{r message=FALSE, warning=FALSE}
 dds <- DESeq2::estimateSizeFactors(dds)
 dds  <- DESeq2::estimateDispersions(dds)
 #dispersion_estimate <- dip
@@ -299,16 +350,18 @@ Whereas we produced alpha per gene from a uniform distribution.<br>
 
 <u>d. mu </u>
 
-```{r}
+```{r message=FALSE, warning=FALSE}
 
 mu_estimate <- dds@assays@data$mu
 mu_inference.vec = as.vector(mu_estimate)
 mu.inference = data.frame(mu = mu_inference.vec, from = "inference")
 
-mu_input.vec = as.vector(mu.dtf %>% select(-gene_id)) %>% flatten() %>% unlist()
+mu_input.vec = as.vector(mu.dtf %>% dplyr::select(-gene_id)) %>% 
+                                            purrr::flatten() %>% 
+                                            BiocGenerics::unlist()
 mu.input = data.frame(mu = mu_input.vec, from = "input")
 
-mu.dtf <- rbind(mu.input, mu.inference)
+mu.dtf <- BiocGenerics::rbind(mu.input, mu.inference)
 ggplot(mu.dtf, aes(x=mu)) + geom_density()  + facet_grid(~from)
 ```
 
@@ -319,32 +372,49 @@ Whereas we produced mu per gene from a uniform distribution.<br>
 
 ## C. BioProject PRJNA675209b as input 
 
-[link github](https://gitbio.ens-lyon.fr/aduvermy/rna-seq_public_library_investigations)<br>
+PRJNA675209b is a RNA-seq project which proposes 3 genotypes studied into 2 environmental conditions.<br>
+Using the conventional RNA-Seq procedure described at [see](https://gitbio.ens-lyon.fr/aduvermy/rna-seq_public_library_investigations), and a Generalized Linear Model it is possible to deduce $\mu$ and $\alpha$ from real counts per gene.
+Using this RNA-seq conventional procedure we defined $\mu$ and $\alpha$ from RNA-seq real table counts (PRJNA675209b).
+<br>
 
-We will mimic an existing design.
+By using  as input parameters of ```htrsim```, $\mu_{ij}$ and $\alpha_{i}$ deduced from PRJNA675209b table counts, we will simulate table counts ($c_{ij}$). 
+By using a Generalized Linear Model, we will infer $\mu_{ij}$ and $\alpha_{i}$ from  $c_{ij}$ obtained in simulation.<br>
+Finally, we will compare $\mu_{ij}$, $\alpha_{i}$ deduced from real counts, and $\mu_{ij}$, $\alpha_{i}$ obtained from simulate counts.
 
+**By playing on pragmatical parameters (number of replicates, library size, ...) we aim to understand on which way we may maximize the fit between $\mu_{ij}$ and $\alpha_{i}$ obtained from real and simulate counts**
 
 1. Input for simulation
 
 ```{r message=FALSE, warning=FALSE}
-## import mu(ij) params for each gene & each sample
-mu_params = readr::read_tsv(file="../../rna-seq_public_library_investigations/results/2022-03-03/estimate_mu.tsv", show_col_type=F)
-mu_params <- mu_params %>% select(., contains("rep1")) 
-
 ## import alpha(i) for each genes
-alpha_params = readr::read_tsv(file="../../rna-seq_public_library_investigations/results/2022-03-03/estimate_dispersion.tsv", show_col_type=F)
+alpha_params = readr::read_tsv(file="../../rna-seq_public_library_investigations/results/2022-03-03/estimate_dispersion.tsv")#, show_col_type=F)
 
 
+## import mu(ij) params for each gene & each sample
+mu_params = readr::read_tsv(file="../../rna-seq_public_library_investigations/results/2022-03-03/estimate_mu.tsv")#, show_col_type=F)
 ## Defining sample names
 sample_names <- mu_params %>% 
-                #select(-gene_id) %>% 
+                select(-gene_id) %>% 
                 colnames() %>% 
-                map(., ~str_split(.,"_")[[1]][1:2] %>% 
-                        paste(., collapse='_') )  %>% 
-                unlist() %>% unique()
+                purrr::map(., ~stringr::str_split(.,"_")[[1]][1:2] %>% 
+                                  BiocGenerics::paste(., collapse='_') )  %>% 
+                BiocGenerics::unlist() %>% BiocGenerics::unique() 
 
+## Mu is same for replicate 
+### case 1: choose 1  replicate 
+#mu_params <- mu_params %>% dplyr::select(., contains("rep1")) 
 ## rename mu_params colnames to ensure corresponding with sample_names 
-colnames(mu_params) <- sample_names
+#colnames(mu_params) <- sample_names
+### case 2: average replicates
+average_rep <- function(x) {
+    varname <- x
+    mu_params %>% 
+      select(.,contains(x)) %>% 
+      mutate(!!varname := rowMeans(.)) %>% 
+      select(varname)
+}
+mu_params <- sample_names %>% map(.x = ., .f = ~average_rep(.x))  %>% data.frame() 
+mu_params$gene_id <- alpha_params$gene_id
 
 ## Defining number of genes:
 N_genes = length(alpha_params$gene_id)
@@ -358,75 +428,318 @@ samples <- list(name = sample_names,  n_rep = N_replicates) ## Nb of replicates
 ### Defining alpha params for each gene
 alphaGene.set = alpha_params$dispersion
 nameGene.set = alpha_params$gene_id
-genes <-  list(name = nameGene.set , alpha = alphaGene.set) %>% as.data.frame()
+genes <-  list(name = nameGene.set, 
+               alpha = alphaGene.set) %>%
+                                  as.data.frame()
 ```
 
 2. setup & process simulation
 
-```{r}
+```{r message=FALSE, warning=FALSE}
 source(file = "htrsim/setup_cntsGenerator.R")
-## with mu dataframe
-setup.simulation <- setup_countGener(sample_names= samples$name , n_rep= samples$n_rep , gene_dispersion = genes$alpha, gene_names = genes$name, mu = mu_params)
-setup.simulation %>% head
+## Setup simulation from PRJNA675209b table counts
+setup.simulation <- setup_countGener(sample_names = samples$name,
+                                     n_rep= samples$n_rep,
+                                     gene_dispersion = genes$alpha, 
+                                     gene_names = genes$name,
+                                     mu = mu_params)
+## Generate counts
 source(file= "htrsim/counts_generator.R" )
 htrs <- generate_counts(setup.simulation)
 ```
 
 3. Setup & Run DESeq2
 
-```{r}
+```{r message=FALSE, warning=FALSE}
 ## reshape htrs for convenience
 rownames(htrs) <- htrs$name_gene
-htrs.deseq = htrs %>% select(-name_gene) # remove name gene column
+htrs.deseq = htrs %>% 
+              dplyr::select(-name_gene) # remove name gene column
 
 
 ## add information for each sample
-samples$env <- sample_names %>% map(., ~str_split(.,"_")[[1]][2]) %>% unlist() ## 5 x 2 environments
-samples$genotype <- sample_names %>% map(., ~str_split(.,"_")[[1]][1]) %>% unlist()
+samples$env <- sample_names %>% 
+                    purrr::map(., ~stringr::str_split(.,"_")[[1]][2]) %>% 
+                    BiocGenerics::unlist() ## 5 x 2 environments
+samples$genotype <- sample_names %>% 
+                        purrr::map(., ~stringr::str_split(.,"_")[[1]][1]) %>% 
+                        BiocGenerics::unlist()
+
 ## build input dataframe for deseq
-setup.deseq = samples %>% as.data.frame() %>% uncount(n_rep, .id = "idx") %>% unite(name, name, idx , sep = "_")  
+setup.deseq = samples %>% 
+                  as.data.frame() %>% 
+                  tidyr::uncount(n_rep, .id = "idx") %>% 
+                  tidyr::unite(name, name, idx , sep = "_")  
 rownames(setup.deseq) <- c() ## drop rownames
 
 
-
 #check homogeneity between countData column number & colData row number
-htrs.deseq %>% dim()# htrs %>% select(-name_gene)
+htrs.deseq %>% dim()
 setup.deseq %>% dim()
-#model.matrix(~genotype + env + genotype:env, setup.deseq) # check if a column is fully equal to 0 if yes -> lead to an error
-dds = DESeq2::DESeqDataSetFromMatrix(countData = htrs.deseq , colData = setup.deseq , design = ~ genotype + env + genotype:env)
+
+# check if a column is fully equal to 0 if yes -> lead to an error
+#model.matrix(~genotype + env + genotype:env, setup.deseq) 
+dds = DESeq2::DESeqDataSetFromMatrix(countData = htrs.deseq,
+                                     colData = setup.deseq,
+                                     design = ~ genotype + env + genotype:env)
 ```
 
 4. alpha
 
-```{r}
+```{r message=FALSE, warning=FALSE}
+
+## Dispersion inference
 dds <- DESeq2::estimateSizeFactors(dds)
 dds  <- DESeq2::estimateDispersions(dds)
-#dispersion_estimate <- dip
 dispersion_estimate <- DESeq2::dispersions(dds) 
 
-alpha.inference = data.frame(alpha = dispersion_estimate, from = "inference")
-alpha.input = data.frame(alpha = alphaGene.set, from = "input") 
+## Build dataframes
+alpha.inference = data.frame(inference = dispersion_estimate)
 
-alpha.dtf <- rbind(alpha.input, alpha.inference)
-ggplot(alpha.dtf, aes(x=alpha), na.rm = TRUE) + geom_density() + scale_x_log10() + facet_grid(~from)
+alpha.input = data.frame(input = alpha_params$dispersion,  
+                        name_gene = alpha_params$gene_id) 
+
+## Same dimension
+#alpha.input %>% dim
+#alpha.inference %>% dim
+
+## Merge & plot
+alpha.dtf <- BiocGenerics::cbind(alpha.input, alpha.inference)
+
+
+## Plot 2 by 2
+ggplot(alpha.dtf, aes(x=input, y = inference), na.rm = TRUE) + geom_point(alpha=0.4) 
+
+# Reshape and plot distribution
+alpha.dtf.reshape = alpha.dtf %>% 
+                      reshape2::melt( ., id=c("name_gene"), 
+                                      variable.name = "from", 
+                                      value.name = "alpha")
+                                    
+
+ggplot(alpha.dtf.reshape, aes(x=alpha), na.rm = TRUE) + geom_density() + scale_x_log10() + facet_grid(~from)
 ```
 
 
 5. mu
 
 ```{r}
-
+## Mu inference
 mu_estimate <- dds@assays@data$mu
-mu_inference.vec = as.vector(mu_estimate) %>% as.numeric()
-mu.inference = data.frame(mu = mu_inference.vec, from = "inference")
+colnames(mu_estimate) <- htrs.deseq %>% colnames()
+mu_estimate = mu_estimate %>% as.data.frame()
+## Average replicates
+average_rep <- function(x) {
+    varname <- x
+    mu_estimate %>% 
+      select(.,contains(x)) %>% 
+      mutate(!!varname := rowMeans(.)) %>% 
+      select(varname)
+}
+
+mu_estimate <- sample_names %>% map(.x = ., .f = ~average_rep(.x))  %>% data.frame() 
+mu_inference.vec = mu_estimate %>% 
+                      purrr::flatten() %>% 
+                      BiocGenerics::unlist() %>% 
+                      as.numeric()
+mu.inference = data.frame(inference = mu_inference.vec, 
+                          env = rep(samples$env, each = N_genes),
+                          genotype = rep(samples$genotype, each = N_genes))
+
+## Mu used as input 
+mu_input.vec = mu_params %>% 
+                    select(-gene_id) %>%
+                    purrr::flatten() %>% 
+                    BiocGenerics::unlist() %>% 
+                    as.numeric()
+mu.input = data.frame(input = mu_input.vec, name_gene = mu_params$gene_id )
+
+#Same dimension
+#mu.inference %>% dim()
+#mu.input %>% dim()
+
+## Merged input & inference
+mu.dtf <- BiocGenerics::cbind(mu.input, mu.inference)
+
+## Plot 2 by 2
+ggplot(mu.dtf, aes(x=input, y = inference, col=genotype, shape=env), na.rm = TRUE) + geom_point(alpha=0.4) 
+
+# Reshape and plot distribution
+mu.dtf.reshape = mu.dtf %>% 
+                      reshape2::melt( ., id=c("name_gene", "env", "genotype"), 
+                                      variable.name = "from", 
+                                      value.name = "mu")
+                                    
+ggplot(mu.dtf.reshape, aes(x=mu), na.rm = TRUE) + geom_density() +
+  scale_x_log10() + facet_grid(~from)
+```
 
-mu_input.vec = as.vector(mu.dtf) %>% flatten() %>% unlist() %>% as.numeric()
-mu.input = data.frame(mu = mu_input.vec, from = "input")
+6. Number of replicates effect
 
-mu.dtf <- rbind(mu.input, mu.inference)
+Bellow, $\mu_{ij}$ and $\alpha_{i}$ deduced from PRJNA675209b table counts, will be reuse as input of simulation.<br>
+By increasing the number of replicates per sample we hope to improve fitting  $\mu_{ij}$ and $\alpha_{i}$ obtained from real and simulate counts.
+
+<br>
+Please, redo the simulation and the inference of $\mu_{ij}$ and $\alpha_{i}$ by increasing the number of replicates per sample.
+
+```{r message=FALSE, warning=FALSE}
+## See *B-1. Input for simulation* to define :
+# - mu_params
+# - alpha_params
+# - samples (see *B-3. Setup & Run DESeq2* to add information for each sample)
+
+
+
+# number of replicate per simulation
+replicate_number2simul = seq(2, 16, by=4)
+
+#init output
+alpha.inference = data.frame(inference = numeric(), 
+                             additional_params = factor())
+mu.inference = data.frame(inference = numeric(), 
+                          env = factor(),
+                          genotype = factor(),
+                          additional_params = factor(),
+                          name_gene = character())
+
+## Mu used as input 
+mu_input.vec = mu_params %>% 
+                    select(-gene_id) %>%
+                    purrr::flatten() %>% 
+                    BiocGenerics::unlist() %>% 
+                    as.numeric()
+mu.input = data.frame(input = mu_input.vec, 
+                      name_gene = mu_params$gene_id,
+                      env = rep(samples$env, each = N_genes),
+                     genotype = rep(samples$genotype, each = N_genes))
+
+
+alpha.input = data.frame(input = alpha_params$dispersion,  
+                        name_gene = alpha_params$gene_id) 
+
+
+
+
+for (N in replicate_number2simul){
+  
+    samples$n_rep = N ## modify nb of replicate per simulation
+    
+    ## Setup simulation
+    source(file = "htrsim/setup_cntsGenerator.R")
+    setup.simulation <- setup_countGener(sample_names= samples$name, 
+                                         n_rep= samples$n_rep, 
+                                         gene_dispersion = genes$alpha, 
+                                         gene_names = genes$name, 
+                                         mu = mu_params)
+    ## Generate counts from setup
+    source(file = "htrsim/counts_generator.R" )
+    htrs <- generate_counts(setup.simulation)
+    
+    ## reshape htrs for convenience
+    rownames(htrs) <- htrs$name_gene
+    htrs.deseq = htrs %>% dplyr::select(-name_gene) # remove name gene column
+    
+    
+    ## build input dataframe for deseq
+    setup.deseq = samples %>% 
+                  as.data.frame() %>% 
+                  tidyr::uncount(n_rep, .id = "idx") %>% 
+                  tidyr::unite(name, name, idx , sep = "_")  
+    rownames(setup.deseq) <- c() ## drop rownames
+    
+    ## run DESEQ
+    dds = DESeq2::DESeqDataSetFromMatrix(countData = htrs.deseq , 
+                                         colData = setup.deseq , 
+                                         design = ~ genotype + env + genotype:env)
+
+    # Build results
+    dds <- DESeq2::estimateSizeFactors(dds)
+    dds  <- DESeq2::estimateDispersions(dds)
+    dispersion_estimate <- DESeq2::dispersions(dds) 
+
+    
+    ################# Alpha ################
+    tmp = data.frame(inference = dispersion_estimate, 
+                     additional_params=N)
+    
+    alpha.inference = BiocGenerics::rbind(alpha.inference, tmp)
+    
+    ################## Mu  ################
+    mu_estimate <- dds@assays@data$mu  %>% data.frame() 
+    colnames(mu_estimate) <- htrs.deseq %>% colnames()
+    average_rep <- function(x) {
+          varname <- x
+          mu_estimate %>% 
+            select(.,contains(x)) %>% 
+            mutate(!!varname := rowMeans(.)) %>% 
+            select(varname)
+    }
+
+    mu_estimate <- sample_names %>% map(.x = ., .f = ~average_rep(.x))  %>% data.frame() 
+    mu_inference.vec = mu_estimate %>% 
+                      purrr::flatten() %>% 
+                      BiocGenerics::unlist() %>% 
+                      as.numeric()
+    tmp = data.frame(inference = mu_inference.vec, 
+                     env = rep(samples$env, each = N_genes),
+                     genotype = rep(samples$genotype, each = N_genes),
+                     additional_params=N,
+                     name_gene = rep(mu_params$gene_id, length(sample_names)))
+    mu.inference = BiocGenerics::rbind(mu.inference, tmp)
+    
+}
 
-ggplot(mu.dtf, aes(x=mu), na.rm = TRUE) + geom_density() +
-  scale_x_log10() + facet_grid(~from)
 ```
 
 
+Now, you can compare  $\alpha_{i}$ from real and simulate counts.<br>
+
+
+```{r message=FALSE, warning=FALSE}
+## bind alpha used as input and alpha inferred
+alpha.dtf <- BiocGenerics::cbind(alpha.input, alpha.inference)
+
+
+## plot 2 by 2
+alpha.inference$additional_params = alpha.inference$additional_params %>% as.factor()
+ggplot(alpha.dtf, na.rm = TRUE) + geom_point(aes(x=input, y=inference), alpha=0.02) + scale_y_log10() + scale_x_log10()  + facet_wrap(~additional_params, scales = 'free_y')
+
+
+
+# Reshape and plot distribution
+alpha.dtf.reshape = alpha.dtf %>% 
+                      reshape2::melt( ., id=c("name_gene", "additional_params"), 
+                                      variable.name = "from", 
+                                      value.name = "alpha")
+
+ggplot(alpha.dtf.reshape, aes(x=alpha), na.rm = TRUE) + geom_density() + scale_x_log10() + facet_grid(from~additional_params)
+
+
+```
+
+The number of replicates does not seem to have a significant effect on the inference of $\alpha_{i}$.<br>
+<br>
+Now, compare  $\mu_{ij}$ from real and simulate counts.<br>
+
+```{r message=FALSE, warning=FALSE}
+## Join input & inference
+dt1 <- data.table(mu.inference %>% group_by(additional_params), key =  c("name_gene", "env", "genotype")) 
+dt2 <- data.table(mu.input, key = c("name_gene", "env", "genotype"))
+mu.dtf = dt1[dt2,  nomatch=0, roll=1] 
+
+
+## Plot 2 by 2
+ggplot(mu.dtf, aes(x=input, y = inference), na.rm = TRUE) + geom_point(alpha=0.4) 
+
+## Reshape and plot distribution
+mu.dtf.reshape = mu.dtf %>% 
+                      reshape2::melt( ., id=c("name_gene", "env", "genotype", "additional_params"), 
+                                      variable.name = "from", 
+                                      value.name = "mu")
+                                    
+ggplot(mu.dtf.reshape, aes(x=mu), na.rm = TRUE) + geom_density() +
+  scale_x_log10() + facet_grid(from~additional_params)
+
+```
+
+**Increasing the number of replicates seems to improve the fit between the distribution of  $\mu_{ij}$, used as input, and the distribution obtained by inference.**
-- 
GitLab