From a3dbde3b81f75cb973ce25fdc02a4991ec32349b Mon Sep 17 00:00:00 2001
From: aduvermy <arnaud.duvermy@ens-lyon.fr>
Date: Fri, 22 Apr 2022 15:43:58 +0200
Subject: [PATCH] update package

---
 results/report_22-04-2022.Rmd         | 294 ++++++++++++++++++++++++++
 results/template.css                  |  29 +++
 src/htrsim/tests/testthat.R           |   4 +
 src/htrsim/tests/testthat/test-name.R |  29 +++
 4 files changed, 356 insertions(+)
 create mode 100644 results/report_22-04-2022.Rmd
 create mode 100644 results/template.css
 create mode 100644 src/htrsim/tests/testthat.R
 create mode 100644 src/htrsim/tests/testthat/test-name.R

diff --git a/results/report_22-04-2022.Rmd b/results/report_22-04-2022.Rmd
new file mode 100644
index 0000000..6f5ce4d
--- /dev/null
+++ b/results/report_22-04-2022.Rmd
@@ -0,0 +1,294 @@
+---
+title: "Report_2022-04-21"
+output: html_document
+date: '2022-04-21'
+css: template.css
+---
+
+
+## Introduction
+
+
+In living world, phenotypes are understanding as a mixture between a genotype effect, an environment effect and an interaction between G&E. 
+$$Phenotype = Genotype + Environment + Genotype.Environment$$
+The quantification of each strengths (G,E; G&E) can be estimate by a coefficient $\beta$. 
+Then, our expression becomes: 
+$$Phenotype = \beta_{G} * Genotype + \beta_{E}*Environment +  \beta_{G*E} * Genotype.Environment + \epsilon$$
+Notice that $\beta$ is specific of each component. Furthermore, we introduced above $\epsilon$. It's the residual of the model. $\epsilon$ can be seen as the difference between observed values and values predicted by the model.
+
+Genes expression can be also considered as a phenotype. <br> 
+According to this, the quantification of $\beta_{G}$, $\beta_{E}$ and  $\beta_{G*E}$ for a given gene in a given condition may open the possibility to assess differences between the strengths in presence in different conditions.
+
+That's the purpose of Htrsim !
+
+## Htrsim
+
+<u> Model </u>
+
+In this aim, Htrsim is based on a model. <br> 
+Because of is easy of use this model is managed by DESEQ2.
+Then, $K_{ij}$ for gene i, sample j are modeled using a Negative Binomial distribution with fitted mean $\mu_{ij}$ and a gene-specific dispersion parameter $\alpha_i$. 
+
+$$
+K_{ij} \sim {\sf NB}(\mu_{ij} ; \sigma_i)
+$$
+$$
+\mu_{ij} = s_jq_{ij}
+$$
+$$
+log_2(q_{ij}) = x_j*\beta_i
+$$
+The fitted mean is composed of a sample-specific size factor $s_j$ and a parameter qij proportional to the expected true concentration of fragments for sample j. 
+The coefficients $\beta_i$ give the log2 fold changes for gene i for each column of the model matrix X. The sample-specific size factors can be replaced by gene-specific normalization factors for each sample using normalizationFactors.
+
+According to the DESEQ2 GLM and our purpose, we can write: 
+$$
+log_2(\mu_{ij]}) = \beta_{G}*G + \beta_{E}*E + \beta_{G*E}*G.E + \beta_{0} + \epsilon_{ij}
+$$
+
+According to this generalized linear model, we wish to estimate $\beta_{G}$, $\beta_{E}$ and $\beta_{G*E}$ for a given gene i, in a given condition j. Achieve this, would allow us to quantify each strengths (G, E, G&E) for a given gene i, in a given condition j.
+
+
+<u> Required </u>
+
+```{r message=FALSE, warning=FALSE, include=TRUE, results="hide"}
+library(htrsim)
+library(tidyverse)
+library(reshape2)
+```
+
+<u> Worklow </u>
+
+Using public libraries (from BioProject PRJNA675209b - chinese paper), and an usual RNA-seq pipeline, we build actual RNA-seq counts per genes for 3 genotypes and 2 environments.<br>
+<br>
+Using htrsim (in particular DESEQ2) and this count table, we are able to estimate $\beta_{G}$, $\beta_{E}$ and $\beta_{G*E}$.
+
+
+a. Input 
+
+
+```{r message=FALSE, warning=FALSE, include=TRUE, results="hide"}
+## Import & reshape table counts
+fn = system.file("extdata/", "public_tablCnts.tsv", package = "htrsim")
+tabl_cnts <- read.table(file = fn, header = TRUE)
+rownames(tabl_cnts) <- tabl_cnts$gene_id
+tabl_cnts <- tabl_cnts %>% select(-gene_id)##suppr colonne GeneID
+tabl_cnts <- tabl_cnts %>% select(-gene_name) ##suppr colonne GeneName
+
+## import design of bioProject
+fn = system.file("extdata/", "public_bioDesign.csv", package = "htrsim")
+bioDesign <- read.table(file = fn, header = T, sep = ';')
+
+```
+
+b. Launch DESEQ2
+
+```{r message=FALSE, warning=FALSE, include=TRUE, results="hide"}
+dds = run.deseq(tabl_cnts = tabl_cnts, bioDesign = bioDesign)
+```
+
+
+DESEQ returns a dds object which contains many, many things ... <br>
+In particular it contains the $\beta$ coefficients. <br>
+<br>
+You can access to beta coefficients using:
+
+```{r}
+dds.mcols = S4Vectors::mcols(dds,use.names=TRUE)
+```
+
+c. $\mu_{ij}$ 
+
+Following our model, we can estimate $log_2(\mu_{ij]})$ from $\beta$ coefficients inferred by DESEQ2,
+
+$$
+log_2(\mu_{ij]}) = \beta_{G}*G + \beta_{E}*E + \beta_{G*E}*G.E + \beta_{0} + \epsilon_{ij}
+$$
+
+Then, $\mu_{ij]}$ can be estimate
+
+$$
+\mu_{ij} = s_j * 2^{log_2\mu_{ij]}}
+$$
+```{r message=FALSE, warning=FALSE, include=TRUE, results="hide"}
+## Model matrix per samples
+mm <- model.matrix(~genotype + env + genotype:env, bioDesign)
+
+## Input estimation
+estim_mu = estim.mu(dds, mm)
+mu.input = estim_mu$mu
+```
+
+d. $K_{ij}$
+
+As defined by our model, counts $K_{ij}$ for gene i, sample j are modeled using a Negative Binomial distribution with fitted mean $\mu_{ij}$ and a gene-specific dispersion parameter $\alpha_i$. 
+
+$$
+K_{ij} \sim {\sf NB}(\mu_{ij} ; \sigma_i)
+$$
+The gene-specific dispersion parameter $\alpha_i$ is also stored in the dds object.<br>
+You can access to $\alpha_i$  using:
+```{r message=FALSE, warning=FALSE, include=TRUE, results="hide"}
+alpha.input = estim.alpha(dds)
+```
+
+Knowing $\alpha_i$ and $\mu_{ij]}$ for each gene and each condition given by the BioProject PRJNA675209b - chinese paper.
+We are now able to simulate $K_{ij}$ for each gene and each condition given by the BioProject PRJNA675209b.
+
+```{r message=FALSE, warning=FALSE, include=TRUE, results="hide"}
+# Setup simulation
+input = reshape_input2setup(mu.dtf = mu.input, alpha.dtf = alpha.input, average_rep = FALSE)
+
+#input$gene_id
+setup.simulation <- setup_countGener(bioSample_id = input$bioSample_id,
+                                     n_rep = 1,
+                                     alpha = input$alpha,
+                                     gene_id = input$gene_id,
+                                     mu = input$mu)
+
+#setup.simulation %>% dim()
+# Simulate counts
+htrs <- generate_counts(setup.simulation)
+
+```
+
+
+```{r message=FALSE, warning=FALSE, include=TRUE}
+k_ij.simu = htrs %>% select(-gene_id) %>% flatten() %>% unlist()
+k_ij.actual = tabl_cnts %>% flatten() %>% unlist()
+
+df = cbind(k_ij.actual, k_ij.simu) %>% reshape2::melt(., value.name = "k_ij", variable.name = "origin")
+df$origin = df$Var2
+df = df %>% select(-Var2)
+max_k_ij.simu = df %>% filter(origin == "k_ij.simu") %>% select(k_ij) %>% max()
+max_k_ij.actual = df %>% filter(origin == "k_ij.actual") %>% select(k_ij) %>% max()
+
+ggplot(df, aes(x = k_ij, fill= origin )) +  geom_density(bins = 100, alpha = 0.5) + 
+  geom_vline(xintercept = max_k_ij.actual, col = "#F8766D" ) +  
+  geom_vline(xintercept = (max_k_ij.simu), col= "#00BFC4" )   +
+  scale_x_log10()
+```
+
+Comment: $K_{ij}$ simulated are abnormally huge !
+Comment: $K_{ij}$ simulated are slightly different from the actual K_{ij} !
+
+
+## Why so much differences
+
+b. $\epsilon$
+
+In our model, we define as follow:
+$$
+\epsilon_{ij} \sim {\sf N}(0 ; deviance_i)
+$$
+
+Let's see the distribution of $deviance_{i}$.
+
+
+```{r warning=FALSE}
+#estim_mu$beta.matrix
+
+deviance_i = estim_mu$deviance.sqrt[!is.na(estim_mu$deviance.sqrt)]^2
+#epsilon_ij <- mm[,1] %>% map(., ~rnorm(deviance_i.sqrt, mean = 0, sd = deviance_i.sqrt ))  %>% data.frame() %>% flatten() %>% unlist()
+
+
+# Histogram logarithmic y axis
+ggplot(data.frame(deviance_i), aes(deviance_i)) +               
+  geom_histogram(bins = 100) #+ scale_x_log10()
+
+
+```
+
+
+The deviance is also inferred by DESEQ while computing its model.
+deviance is mostly inferred between 100 and 200.  
+
+$$
+log_2(\mu_{ij]}) = \beta_{G}*G + \beta_{E}*E + \beta_{G*E}*G.E + \beta_{0} + \epsilon_{ij}
+$$
+
+```{r warning=FALSE}
+#estim_mu$beta.matrix
+
+deviance_i.sqrt = estim_mu$deviance.sqrt[!is.na(estim_mu$deviance.sqrt)]
+epsilon_ij <- mm[,1] %>% map(., ~rnorm(deviance_i.sqrt, mean = 0, sd = 0 ))  %>% data.frame() %>% flatten() %>% unlist()
+#epsilon_ij <- mm[,1] %>% map(., ~rnorm(deviance_i.sqrt, mean = 0, sd = 0 ))  %>% data.frame() %>% flatten() %>% unlist()
+
+# Histogram logarithmic y axis
+ggplot(data.frame(epsilon_ij), aes(epsilon_ij)) +               
+  geom_histogram(bins = 100) #+ scale_x_log10()
+
+
+```
+
+
+Comment: Some  $\epsilon_{ij}$ are huge !
+Recall: $\epsilon$ can be seen as the difference between observed values and values predicted by the model.
+
+A large panel of $\epsilon$ mean that the model doesn't fit well with the observed data.
+
+It means that even if $\beta$ coefficients are well estimate. $\log_2(\mu_{ij]})$ will vary around them with a large panel of values (+/- 40)
+
+
+
+
+```{r warning=FALSE}
+beta.dtf = estim_mu$beta.matrix %>% data.frame()
+beta.dtf.long = beta.dtf %>% reshape2::melt(., value.name = "beta", variable.name = "origin")
+
+ggplot(beta.dtf.long, aes(x = beta )) +  geom_density(bins = 100, alpha = 0.5, fill = 'grey') + facet_grid(~origin, scales = "free_x")
+
+```
+```{r warning=FALSE}
+beta.dtf = estim_mu$beta.matrix %>% data.frame()
+beta.dtf.long = beta.dtf %>% reshape2::melt(., value.name = "beta", variable.name = "origin")
+
+
+
+B0 = estim_mu$dds.mcols$SE_Intercept
+B1 = estim_mu$dds.mcols$SE_genotype_msn2D_vs_wt
+B2 <- estim_mu$dds.mcols$SE_genotype_msn4D_vs_wt
+B3 <- estim_mu$dds.mcols$SE_env_kcl_vs_control
+B4 <- estim_mu$dds.mcols$SE_genotypemsn2D.envkcl
+B5 <- estim_mu$dds.mcols$SE_genotypemsn4D.envkcl
+
+
+SE_B.dtf <- cbind(B0, B1, B2, B3, B4, B5) %>% data.frame()
+SE_B.dtf.long = SE_B.dtf %>% reshape2::melt(., value.name = "SE_beta", variable.name = "origin")
+
+ggplot(SE_B.dtf.long, aes(x = SE_beta, fill= origin )) +  geom_density(bins = 100, alpha = 0.5) + facet_grid(~origin)
+```
+
+```{r warning=FALSE}
+bind_dtf<- cbind(SE_B.dtf.long, beta.dtf.long %>% select(-origin))
+ggplot(bind_dtf, aes(x = beta, y= SE_beta, fill= origin )) +  geom_point(alpha = 0.1) + facet_grid(~origin)
+
+
+#new <- bind_dtf %>% mutate(annot = ifelse(origin == "B4 | B5" && SE_beta > 6 , TRUE, FALSE ))
+#new <- bind_dtf %>% tail
+#new %>% filter(beta == "B4")
+```
+
+
+```{r warning=FALSE}
+dim(htrs)
+
+new <- bind_dtf %>% mutate(annot = ifelse(((origin == "B4") | (origin == "B5")) & (SE_beta > 6) , TRUE, FALSE ))
+### WARNING 
+new %>% dcast(., annot ~ origin)
+
+
+SE_threshold = 6
+SE_B.dtf.annot = SE_B.dtf %>%  mutate(annot = ifelse((B4 > SE_threshold) | (B5 > SE_threshold) , TRUE, FALSE ))
+SE_B.dtf.annot %>% group_by(annot) %>% tally()
+SE_B.dtf.annot.long = SE_B.dtf.annot %>% reshape2::melt(., value.name = "SE_beta", variable.name = "origin")
+
+
+bind_dtf.annot<- cbind(SE_B.dtf.annot.long, beta.dtf.long %>% select(-origin))
+bind_dtf.annot = bind_dtf.annot %>% filter(!is.na(annot))
+ggplot(bind_dtf.annot, aes(x = beta, y= SE_beta, col = annot )) +  geom_point(alpha = 0.1) + facet_grid(~origin)
+
+
+```
+
+
diff --git a/results/template.css b/results/template.css
new file mode 100644
index 0000000..0649808
--- /dev/null
+++ b/results/template.css
@@ -0,0 +1,29 @@
+
+title {
+    margin-top: 200px;
+
+    text-align: center;
+}
+
+
+
+
+h1, .h1 {
+    margin-top: 84px;
+
+    text-align: center;
+}
+
+
+h3, .h3, h4 {
+    text-align: center;
+}
+
+.caption {
+  font-size: 0.9em;
+  font-style: italic;
+  color: grey;
+  margin-right: 10%;
+  margin-left: 10%;
+  text-align: center;
+}
diff --git a/src/htrsim/tests/testthat.R b/src/htrsim/tests/testthat.R
new file mode 100644
index 0000000..123067a
--- /dev/null
+++ b/src/htrsim/tests/testthat.R
@@ -0,0 +1,4 @@
+library(testthat)
+library(htrsim)
+
+test_check("htrsim")
diff --git a/src/htrsim/tests/testthat/test-name.R b/src/htrsim/tests/testthat/test-name.R
new file mode 100644
index 0000000..a2c9cee
--- /dev/null
+++ b/src/htrsim/tests/testthat/test-name.R
@@ -0,0 +1,29 @@
+
+
+
+
+## manually created data expected
+dat1 <- list(names= c("name", "n_replicates", "gene_id", "mu", "alpha"), row.names = c(1,2,3), class = "data.frame" )
+dat2 <- list(names= c("name", "n_replicates", "gene_id", "mu", "alpha"), row.names = 1:200 , class = "data.frame" )
+dat3 <- list(names= c("name", "n_replicates", "gene_id", "mu", "alpha"), row.names = 1 , class = "data.frame" )
+dat4 <- dat1
+dat5 <- dat1
+dat6 <- dat3
+dat7 <- dat3
+dat8 <- 0.4
+dat9 <- dat1
+
+test_that("Setup counts generator", {
+  expect_equal(attributes(setup_countGener()), dat1 )
+  expect_equal(attributes(setup_countGener(gene_id = 1:200)), dat2 )
+  expect_equal(attributes(setup_countGener(gene_id = 0)), dat3 )
+  expect_equal(attributes(setup_countGener(n_rep = 0)), dat4 )
+  expect_equal(attributes(setup_countGener(bioSample_id = "lib1")), dat5 )
+  expect_equal(attributes(setup_countGener(bioSample_id = "lib1", gene_id = 0)), dat6 )
+  expect_equal(attributes(setup_countGener(bioSample_id = "lib1", gene_id = 0, alpha = 0.4)), dat7 )
+  expect_equal(setup_countGener(bioSample_id = "lib1", gene_id = 0, alpha = 0.4)
+                                  %>% select(alpha)
+                                      %>% as.numeric(), expected = dat8)
+  expect_equal(attributes(setup_countGener(bioSample_id = "lib1", alpha = c(0.4, 0.2, 0.3))), dat9)
+})
+
-- 
GitLab