From 9ca06184fbd6f1e215b074dd160bc3a07c11a7c5 Mon Sep 17 00:00:00 2001 From: aduvermy <arnaud.duvermy@ens-lyon.fr> Date: Wed, 29 Jun 2022 09:48:07 +0200 Subject: [PATCH] update reports in results folder --- results/2022-04-22_batch_effect.Rmd | 432 +++++++++++++++++++++++++ results/2022-04-22_htrsim.Rmd | 428 +++++++++++++++++++++++++ results/2022-04-22_lowCnts_effect.Rmd | 436 ++++++++++++++++++++++++++ results/2022-04-24_epsilon_effect.Rmd | 136 ++++++++ results/2022-04-28_beta_SE_filter.Rmd | 293 +++++++++++++++++ results/2022-05-22_alpha_effect.Rmd | 329 +++++++++++++++++++ results/2022-06-24_subsampling.Rmd | 303 ++++++++++++++++++ results/2022-06-28_kallistoEffect.Rmd | 133 ++++++++ results/2022_06_08_investigation.Rmd | 30 ++ results/SVA.R | 72 +++++ results/SVA.Rmd | 180 +++++++++++ results/css/footer.css | 29 ++ results/css/template.css | 84 +++++ results/lab-meeting_preparation.Rmd | 61 ++++ 14 files changed, 2946 insertions(+) create mode 100644 results/2022-04-22_batch_effect.Rmd create mode 100644 results/2022-04-22_htrsim.Rmd create mode 100644 results/2022-04-22_lowCnts_effect.Rmd create mode 100644 results/2022-04-24_epsilon_effect.Rmd create mode 100644 results/2022-04-28_beta_SE_filter.Rmd create mode 100644 results/2022-05-22_alpha_effect.Rmd create mode 100644 results/2022-06-24_subsampling.Rmd create mode 100644 results/2022-06-28_kallistoEffect.Rmd create mode 100644 results/2022_06_08_investigation.Rmd create mode 100644 results/SVA.R create mode 100644 results/SVA.Rmd create mode 100644 results/css/footer.css create mode 100644 results/css/template.css create mode 100644 results/lab-meeting_preparation.Rmd diff --git a/results/2022-04-22_batch_effect.Rmd b/results/2022-04-22_batch_effect.Rmd new file mode 100644 index 0000000..c596b2c --- /dev/null +++ b/results/2022-04-22_batch_effect.Rmd @@ -0,0 +1,432 @@ +--- +title: "HTRSIM" +date: '2022-04-21' +output: + html_document: + + +css: + - css/template.css + - css/footer.css + +--- + + +## Introduction + + +In living world, phenotypes are understanding as a mixture between a genotype effect, an environment effect and an interaction between G&E. +$$Phenotype = Genotype + Environment + Genotype.Environment$$ +The quantification of each strengths (G,E; G&E) can be estimate by a coefficient $\beta$. +Then, our expression becomes: +$$Phenotype = \beta_{G} * Genotype + \beta_{E}*Environment + \beta_{G*E} * Genotype.Environment + \epsilon$$ +Notice that $\beta$ is specific of each component. Furthermore, we introduced above $\epsilon$. It's the residual of the model. $\epsilon$ can be seen as the difference between observed values and values predicted by the model. + +Genes expression can be also considered as a phenotype. <br> +According to this, the quantification of $\beta_{G}$, $\beta_{E}$ and $\beta_{G*E}$ for a given gene in a given condition may open the possibility to assess differences between the strengths in presence in different conditions. + +That's the purpose of Htrsim ! + +## Htrsim + +##### Model + +In this aim, Htrsim is based on a model. <br> +Because of is easy of use this model is managed by DESEQ2. +Then, $K_{ij}$ for gene i, sample j are modeled using a Negative Binomial distribution with fitted mean $\mu_{ij}$ and a gene-specific dispersion parameter $\alpha_i$. + +$$ +K_{ij} \sim {\sf NB}(\mu_{ij} ; \sigma_i) +$$ +$$ +\mu_{ij} = s_jq_{ij} +$$ +$$ +log_2(q_{ij}) = x_j*\beta_i +$$ +The fitted mean is composed of a sample-specific size factor $s_j$ and a parameter qij proportional to the expected true concentration of fragments for sample j. +The coefficients $\beta_i$ give the log2 fold changes for gene i for each column of the model matrix X. The sample-specific size factors can be replaced by gene-specific normalization factors for each sample using normalizationFactors. + + + +According to the DESEQ2 GLM and our purpose, we can write: +$$ +log_2(q_{ij}) = \beta_{G}*G + \beta_{E}*E + \beta_{G*E}*G.E + \beta_{0} + \epsilon_{ij} +$$ + +According to this generalized linear model, we wish to estimate $\beta_{G}$, $\beta_{E}$ and $\beta_{G*E}$ for a given gene i, in a given condition j. Achieve this, would allow us to quantify each strengths (G, E, G&E) for a given gene i, in a given condition j. + + +##### Required + +```{r message=FALSE, warning=FALSE, include=TRUE, results="hide"} +library(htrsim) +library(tidyverse) +library(reshape2) +``` + +##### Worklow + +Using public libraries (from BioProject PRJNA675209b - chinese paper), and an usual RNA-seq pipeline, we build actual RNA-seq counts per genes for 3 genotypes and 2 environments.<br> +<br> +Using htrsim (in particular DESEQ2) and this count table, we are able to estimate $\beta_{G}$, $\beta_{E}$ and $\beta_{G*E}$. + + +a. Input + + +```{r message=FALSE, warning=FALSE, include=TRUE, results="hide"} +## Import & reshape table counts +fn = system.file("extdata/", "public_tablCnts.tsv", package = "htrsim") +tabl_cnts <- read.table(file = fn, header = TRUE) +rownames(tabl_cnts) <- tabl_cnts$gene_id +tabl_cnts <- tabl_cnts %>% select(-gene_id)##suppr colonne GeneID +tabl_cnts <- tabl_cnts %>% select(-gene_name) ##suppr colonne GeneName + +## import design of bioProject +fn = system.file("extdata/", "public_bioDesign.csv", package = "htrsim") +bioDesign <- read.table(file = fn, header = T, sep = ';') +bioDesign$batch = 1:12 +``` + +b. Launch DESEQ2 + +```{r message=FALSE, warning=FALSE, include=TRUE, results="hide"} +dds = run.deseq(tabl_cnts = tabl_cnts, bioDesign = bioDesign, batch = TRUE) +``` + + +DESEQ returns a dds object which contains many, many things ... <br> +In particular it contains the $\beta$ coefficients. <br> +<br> +You can access to beta coefficients using: + +```{r} +dds.mcols = S4Vectors::mcols(dds,use.names=TRUE) +``` + +c. $\mu_{ij}$ + +Following our model, we can estimate $log_2(\mu_{ij]})$ from $\beta$ coefficients inferred by DESEQ2, + +$$ +log_2(q_{ij}) = \beta_{G}*G + \beta_{E}*E + \beta_{G*E}*G.E + \beta_{0} + \epsilon_{ij} +$$ + +Then, $\mu_{ij]}$ can be estimate + +$$ +\mu_{ij} = s_j * 2^{log_2(q_{ij})} +$$ +```{r message=FALSE, warning=FALSE, include=TRUE, results="hide"} +## Model matrix per samples +mm <- model.matrix(~genotype + env + genotype:env, bioDesign) + +## Input estimation +estim_mu = estim.mu(dds, mm) +mu.input = estim_mu$mu +``` + +d. $K_{ij}$ + +As defined by our model, counts $K_{ij}$ for gene i, sample j are modeled using a Negative Binomial distribution with fitted mean $\mu_{ij}$ and a gene-specific dispersion parameter $\alpha_i$. + +$$ +K_{ij} \sim {\sf NB}(\mu_{ij} ; \alpha_i) +$$ +The gene-specific dispersion parameter $\alpha_i$ is also stored in the dds object.<br> +You can access to $\alpha_i$ using: +```{r message=FALSE, warning=FALSE, include=TRUE, results="hide"} +alpha.input = estim.alpha(dds) +``` + +Knowing $\alpha_i$ and $\mu_{ij]}$ for each gene and each condition given by the BioProject PRJNA675209b - chinese paper. +We are now able to simulate $K_{ij}$ for each gene and each condition given by the BioProject PRJNA675209b. + +```{r message=FALSE, warning=FALSE, include=TRUE, results="hide"} +# Setup simulation +input = reshape_input2setup(mu.dtf = mu.input, alpha.dtf = alpha.input, average_rep = FALSE) + +#input$gene_id +setup.simulation <- setup_countGener(bioSample_id = input$bioSample_id, + n_rep = 1, + alpha = input$alpha, + gene_id = input$gene_id, + mu = input$mu) + +#setup.simulation %>% dim() +# Simulate counts +htrs <- generate_counts(setup.simulation) + +``` + + +```{r, error=TRUE} +sample = htrs %>% select(where(is.numeric)) %>% colnames() +genotype = htrs %>% + select(where(is.numeric)) %>% + colnames() %>% + map(., ~str_split(.,pattern = '_', simplify = T)[1]) %>% unlist() + +env = htrs %>% + select(where(is.numeric)) %>% + colnames() %>% + map(., ~str_split(.,pattern = '_', simplify = T)[2]) %>% unlist() + +designSimu = cbind(sample, env, genotype) %>% data.frame() + +## RESHAPE HTRS +htrs.reshape = htrs +rownames(htrs.reshape) = htrs.reshape$gene_id +htrs.reshape = htrs.reshape %>% select(-gene_id) +########### LAUNCH DESEQ ############# +## Design model - specify reference +designSimu$genotype <- factor(x = designSimu$genotype,levels = c('WT','Msn2D', 'Msn4D')) +designSimu$env <- factor(x = designSimu$env,levels = c('control', 'KCl')) + + +k_ij.simulation = htrs.reshape + +## DESEQ standard analysis +dds_simu = DESeq2::DESeqDataSetFromMatrix( countData = k_ij.simulation , + colData = designSimu , + design = ~ genotype + env + genotype:env) + +max(k_ij.simulation) +.Machine$integer.max + +``` + +##### Evaluation + +```{r message=FALSE, warning=FALSE, include=TRUE} +k_ij.simu = htrs %>% select(-gene_id) %>% flatten() %>% unlist() +k_ij.actual = tabl_cnts %>% flatten() %>% unlist() + +df = cbind(k_ij.actual, k_ij.simu) %>% reshape2::melt(., value.name = "k_ij", variable.name = "origin") +df$origin = df$Var2 +df = df %>% select(-Var2) +max_k_ij.simu = df %>% filter(origin == "k_ij.simu") %>% select(k_ij) %>% max() +max_k_ij.actual = df %>% filter(origin == "k_ij.actual") %>% select(k_ij) %>% max() + +ggplot(df, aes(x = k_ij, fill= origin )) + geom_density(bins = 100, alpha = 0.5) + + geom_vline(xintercept = max_k_ij.actual, col = "#F8766D" ) + + geom_vline(xintercept = (max_k_ij.simu), col= "#00BFC4" ) + + scale_x_log10() +``` + +Comment: $K_{ij}$ simulated are abnormally huge ! +Comment: $K_{ij}$ simulated are slightly different from the actual K_{ij} ! + + +## Why so much differences + +b. $\epsilon$ + +In our model, we define as follow: +$$ +\epsilon_{ij} \sim {\sf N}(0 ; deviance_i) +$$ + +Let's see the distribution of $deviance_{i}$. + + +```{r warning=FALSE} +#estim_mu$beta.matrix + +deviance_i = estim_mu$deviance.sqrt[!is.na(estim_mu$deviance.sqrt)]^2 +#epsilon_ij <- mm[,1] %>% map(., ~rnorm(deviance_i.sqrt, mean = 0, sd = deviance_i.sqrt )) %>% data.frame() %>% flatten() %>% unlist() + + +# Histogram logarithmic y axis +ggplot(data.frame(deviance_i), aes(deviance_i)) + + geom_histogram(bins = 100) #+ scale_x_log10() + + +``` + + +The deviance is also inferred by DESEQ while computing its model. +deviance is mostly inferred between 100 and 200. + + + +$$ +log_2(q_{ij}) = \beta_{G}*G + \beta_{E}*E + \beta_{G*E}*G.E + \beta_{0} + \epsilon_{ij} +$$ + +```{r warning=FALSE} +#estim_mu$beta.matrix + +deviance_i.sqrt = estim_mu$deviance.sqrt[!is.na(estim_mu$deviance.sqrt)] +epsilon_ij <- mm[,1] %>% map(., ~rnorm(deviance_i.sqrt, mean = 0, sd = deviance_i.sqrt )) %>% data.frame() %>% flatten() %>% unlist() +#epsilon_ij <- mm[,1] %>% map(., ~rnorm(deviance_i.sqrt, mean = 0, sd = 0 )) %>% data.frame() %>% flatten() %>% unlist() + +# Histogram logarithmic y axis +ggplot(data.frame(epsilon_ij), aes(epsilon_ij)) + + geom_histogram(bins = 100) #+ scale_x_log10() + + +``` + + +Comment: Some $\epsilon_{ij}$ are huge ! +Recall: $\epsilon$ can be seen as the difference between observed values and values predicted by the model. + +A large panel of $\epsilon$ mean that the model doesn't fit well with the observed data. + +It means that even if $\beta$ coefficients are well estimate. $log_2(q_{ij})$ will vary around them with a large panel of values (+/- 40) + + + + +```{r warning=FALSE} + +beta.dtf = estim_mu$beta.matrix %>% data.frame() +beta.dtf$B6 = dds.mcols$batch +beta.dtf.long = beta.dtf %>% reshape2::melt(., value.name = "beta", variable.name = "origin") + +ggplot(beta.dtf.long, aes(x = beta )) + geom_density(bins = 100, alpha = 0.5, fill = 'grey') + facet_grid(~origin, scales = "free_x") + +``` + +```{r warning=FALSE} +beta.dtf = estim_mu$beta.matrix %>% data.frame() +beta.dtf$B6 = dds.mcols$batch +beta.dtf.long = beta.dtf %>% reshape2::melt(., value.name = "beta", variable.name = "origin") + + + +## Standard Error +B0 = estim_mu$dds.mcols$SE_Intercept +B1 = estim_mu$dds.mcols$SE_genotype_msn2D_vs_wt +B2 <- estim_mu$dds.mcols$SE_genotype_msn4D_vs_wt +B3 <- estim_mu$dds.mcols$SE_env_kcl_vs_control +B4 <- estim_mu$dds.mcols$SE_genotypemsn2D.envkcl +B5 <- estim_mu$dds.mcols$SE_genotypemsn4D.envkcl +B6 <- estim_mu$dds.mcols$SE_batch + +SE_B.dtf <- cbind(B0, B1, B2, B3, B4, B5, B6) %>% data.frame() +SE_B.dtf.long = SE_B.dtf %>% reshape2::melt(., value.name = "SE_beta", variable.name = "origin") + +ggplot(SE_B.dtf.long, aes(x = SE_beta, fill= origin )) + geom_density(bins = 100, alpha = 0.5) + facet_grid(~origin) +``` + +```{r warning=FALSE} +bind_dtf<- cbind(SE_B.dtf.long, beta.dtf.long %>% select(-origin)) +ggplot(bind_dtf, aes(x = beta, y= SE_beta, fill= origin )) + geom_point(alpha = 0.1) + facet_grid(~origin) + + +#new <- bind_dtf %>% mutate(annot = ifelse(origin == "B4 | B5" && SE_beta > 6 , TRUE, FALSE )) +#new <- bind_dtf %>% tail +#new %>% filter(beta == "B4") +``` + + +```{r warning=FALSE} +#dim(htrs) + +#new <- bind_dtf %>% mutate(annot = ifelse(((origin == "B4") | (origin == "B5")) & (SE_beta > 6) , TRUE, FALSE )) +### WARNING +#new %>% dcast(., annot ~ origin) + + +SE_threshold = 6 +SE_B.dtf.annot = SE_B.dtf %>% mutate(annot = ifelse((B4 > SE_threshold) | (B5 > SE_threshold) , TRUE, FALSE )) +SE_B.dtf.annot %>% group_by(annot) %>% tally() +SE_B.dtf.annot.long = SE_B.dtf.annot %>% reshape2::melt(., value.name = "SE_beta", variable.name = "origin") + + +bind_dtf.annot<- cbind(SE_B.dtf.annot.long, beta.dtf.long %>% select(-origin)) +bind_dtf.annot = bind_dtf.annot %>% filter(!is.na(annot)) +ggplot(bind_dtf.annot, aes(x = beta, y= SE_beta, col = annot )) + geom_point(alpha = 0.1, na.rm = T) + facet_grid(~origin) + + +``` + + +## Beta0 vs SE & deviance + + + +```{r} + +B0 = beta.dtf$B0 + + +L = SE_B.dtf %>% colnames() %>% length() + + +B0_vector = replicate(L, B0) %>% as.data.frame() %>% flatten() %>% unlist() +deviance.sqrt_vec = replicate(L , estim_mu$deviance.sqrt )%>% as.data.frame() %>% flatten() %>% unlist() +SE_B.dtf.annot.long$B0 = B0_vector +SE_B.dtf.annot.long$deviance.sqrt = estim_mu$deviance.sqrt + +ggplot(SE_B.dtf.annot.long, aes(x = B0, y = SE_beta, col= annot )) + geom_point(alpha = 0.1, na.rm = T) + facet_grid(~origin) + + +SE_B.dtf.annot.long_B0= SE_B.dtf.annot.long %>% filter(origin == "B0") %>% filter(!is.na(annot)) + +ggplot( SE_B.dtf.annot.long_B0, aes(x = B0, y = 2^deviance.sqrt, col = annot)) + geom_point(alpha = 0.1, na.rm = TRUE) + scale_y_log10() + + +``` + + + + +```{undefined eval=FALSE, include=FALSE} +hist(dds.mcols$dispFit) +hist(log_qij, ) +hist(log10(alpha.input$alpha)) +max(dds.mcols$dispersion, na.rm = T) +hist(log(dds.mcols$dispersion)) +max(dds.mcols$deviance, na.rm = T) +hist(dds.mcols$deviance) +max(dds.mcols$dispFit, na.rm = T) +DESeq2::design(dds) + +fitted.common.scale = t(t(dds@assays@data$mu)/dds$sizeFactor) + +hist(t(t(dds@assays@data$mu)/dds$sizeFactor)) +hist(residual_deseq) +max(residual_deseq, na.rm = T) +residual_deseq = (DESeq2::counts(dds, normalized=TRUE) - fitted.common.scale ) + + +w =DESeq2::nbinomWaldTest(dds) +w@dispersionFunction() +#S4Vectors::assays(dds)[["mu"]] + + +vst = DESeq2::varianceStabilizingTransformation(dds) +vst@assays@data$ +``` + + + +https://support.bioconductor.org/p/123305/ +https://support.bioconductor.org/p/60567/ +http://bioconductor.org/packages/release/bioc/vignettes/RUVSeq/inst/doc/RUVSeq.pdf +https://bioinformatics-core-shared-training.github.io/RNAseq-R/slides/LinearModels.pdf + + +```{r eval=FALSE, include=FALSE} +#install.packages("RUVSeq") +#BiocManager::install("RUVSeq") +library(RUVSeq) + +mm +y <- DGEList(counts=counts(tabl_cnts), group=x) +y <- calcNormFactors(y, method="upperquartile") +y <- estimateGLMCommonDisp(y, design) +y <- estimateGLMTagwiseDisp(y, design) +fit <- glmFit(y, design) +res <- residuals(fit, type="deviance") + + +``` + + + + diff --git a/results/2022-04-22_htrsim.Rmd b/results/2022-04-22_htrsim.Rmd new file mode 100644 index 0000000..62575c9 --- /dev/null +++ b/results/2022-04-22_htrsim.Rmd @@ -0,0 +1,428 @@ +--- +title: "HTRSIM" +date: '2022-04-21' +output: + html_document: + + +css: + - css/template.css + - css/footer.css + +--- + + +## Introduction + + +In living world, phenotypes are understanding as a mixture between a genotype effect, an environment effect and an interaction between G&E. +$$Phenotype = Genotype + Environment + Genotype.Environment$$ +The quantification of each strengths (G,E; G&E) can be estimate by a coefficient $\beta$. +Then, our expression becomes: +$$Phenotype = \beta_{G} * Genotype + \beta_{E}*Environment + \beta_{G*E} * Genotype.Environment + \epsilon$$ +Notice that $\beta$ is specific of each component. Furthermore, we introduced above $\epsilon$. It's the residual of the model. $\epsilon$ can be seen as the difference between observed values and values predicted by the model. + +Genes expression can be also considered as a phenotype. <br> +According to this, the quantification of $\beta_{G}$, $\beta_{E}$ and $\beta_{G*E}$ for a given gene in a given condition may open the possibility to assess differences between the strengths in presence in different conditions. + +That's the purpose of Htrsim ! + +## Htrsim + +##### Model + +In this aim, Htrsim is based on a model. <br> +Because of is easy of use this model is managed by DESEQ2. +Then, $K_{ij}$ for gene i, sample j are modeled using a Negative Binomial distribution with fitted mean $\mu_{ij}$ and a gene-specific dispersion parameter $\alpha_i$. + +$$ +K_{ij} \sim {\sf NB}(\mu_{ij} ; \sigma_i) +$$ +$$ +\mu_{ij} = s_jq_{ij} +$$ +$$ +log_2(q_{ij}) = x_j*\beta_i +$$ +The fitted mean is composed of a sample-specific size factor $s_j$ and a parameter qij proportional to the expected true concentration of fragments for sample j. +The coefficients $\beta_i$ give the log2 fold changes for gene i for each column of the model matrix X. The sample-specific size factors can be replaced by gene-specific normalization factors for each sample using normalizationFactors. + + + +According to the DESEQ2 GLM and our purpose, we can write: +$$ +log_2(q_{ij}) = \beta_{G}*G + \beta_{E}*E + \beta_{G*E}*G.E + \beta_{0} + \epsilon_{ij} +$$ + +According to this generalized linear model, we wish to estimate $\beta_{G}$, $\beta_{E}$ and $\beta_{G*E}$ for a given gene i, in a given condition j. Achieve this, would allow us to quantify each strengths (G, E, G&E) for a given gene i, in a given condition j. + + +##### Required + +```{r message=FALSE, warning=FALSE, include=TRUE, results="hide"} +library(htrsim) +library(tidyverse) +library(reshape2) +``` + +##### Worklow + +Using public libraries (from BioProject PRJNA675209b - chinese paper), and an usual RNA-seq pipeline, we build actual RNA-seq counts per genes for 3 genotypes and 2 environments.<br> +<br> +Using htrsim (in particular DESEQ2) and this count table, we are able to estimate $\beta_{G}$, $\beta_{E}$ and $\beta_{G*E}$. + + +a. Input + + +```{r message=FALSE, warning=FALSE, include=TRUE, results="hide"} +## Import & reshape table counts +fn = system.file("extdata/", "public_tablCnts.tsv", package = "htrsim") +tabl_cnts <- read.table(file = fn, header = TRUE) +rownames(tabl_cnts) <- tabl_cnts$gene_id +tabl_cnts <- tabl_cnts %>% select(-gene_id)##suppr colonne GeneID +tabl_cnts <- tabl_cnts %>% select(-gene_name) ##suppr colonne GeneName + +## import design of bioProject +fn = system.file("extdata/", "public_bioDesign.csv", package = "htrsim") +bioDesign <- read.table(file = fn, header = T, sep = ';') + +``` + +b. Launch DESEQ2 + +```{r message=FALSE, warning=FALSE, include=TRUE, results="hide"} +dds = run.deseq(tabl_cnts = tabl_cnts, bioDesign = bioDesign) +``` + + +DESEQ returns a dds object which contains many, many things ... <br> +In particular it contains the $\beta$ coefficients. <br> +<br> +You can access to beta coefficients using: + +```{r} +dds.mcols = S4Vectors::mcols(dds,use.names=TRUE) +``` + +c. $\mu_{ij}$ + +Following our model, we can estimate $log_2(\mu_{ij]})$ from $\beta$ coefficients inferred by DESEQ2, + +$$ +log_2(q_{ij}) = \beta_{G}*G + \beta_{E}*E + \beta_{G*E}*G.E + \beta_{0} + \epsilon_{ij} +$$ + +Then, $\mu_{ij]}$ can be estimate + +$$ +\mu_{ij} = s_j * 2^{log_2(q_{ij})} +$$ +```{r message=FALSE, warning=FALSE, include=TRUE, results="hide"} +## Model matrix per samples +mm <- model.matrix(~genotype + env + genotype:env, bioDesign) + +## Input estimation +estim_mu = estim.mu(dds, mm) +mu.input = estim_mu$mu +``` + +d. $K_{ij}$ + +As defined by our model, counts $K_{ij}$ for gene i, sample j are modeled using a Negative Binomial distribution with fitted mean $\mu_{ij}$ and a gene-specific dispersion parameter $\alpha_i$. + +$$ +K_{ij} \sim {\sf NB}(\mu_{ij} ; \alpha_i) +$$ +The gene-specific dispersion parameter $\alpha_i$ is also stored in the dds object.<br> +You can access to $\alpha_i$ using: +```{r message=FALSE, warning=FALSE, include=TRUE, results="hide"} +alpha.input = estim.alpha(dds) +``` + +Knowing $\alpha_i$ and $\mu_{ij]}$ for each gene and each condition given by the BioProject PRJNA675209b - chinese paper. +We are now able to simulate $K_{ij}$ for each gene and each condition given by the BioProject PRJNA675209b. + +```{r message=FALSE, warning=FALSE, include=TRUE, results="hide"} +# Setup simulation +input = reshape_input2setup(mu.dtf = mu.input, alpha.dtf = alpha.input, average_rep = FALSE) + +#input$gene_id +setup.simulation <- setup_countGener(bioSample_id = input$bioSample_id, + n_rep = 1, + alpha = input$alpha, + gene_id = input$gene_id, + mu = input$mu) + +#setup.simulation %>% dim() +# Simulate counts +htrs <- generate_counts(setup.simulation) + +``` + + +```{r, error=TRUE} +sample = htrs %>% select(where(is.numeric)) %>% colnames() +genotype = htrs %>% + select(where(is.numeric)) %>% + colnames() %>% + map(., ~str_split(.,pattern = '_', simplify = T)[1]) %>% unlist() + +env = htrs %>% + select(where(is.numeric)) %>% + colnames() %>% + map(., ~str_split(.,pattern = '_', simplify = T)[2]) %>% unlist() + +designSimu = cbind(sample, env, genotype) %>% data.frame() + +## RESHAPE HTRS +htrs.reshape = htrs +rownames(htrs.reshape) = htrs.reshape$gene_id +htrs.reshape = htrs.reshape %>% select(-gene_id) +########### LAUNCH DESEQ ############# +## Design model - specify reference +designSimu$genotype <- factor(x = designSimu$genotype,levels = c('WT','Msn2D', 'Msn4D')) +designSimu$env <- factor(x = designSimu$env,levels = c('control', 'KCl')) + + +k_ij.simulation = htrs.reshape + +## DESEQ standard analysis +dds_simu = DESeq2::DESeqDataSetFromMatrix( countData = k_ij.simulation , + colData = designSimu , + design = ~ genotype + env + genotype:env) + +max(k_ij.simulation) +.Machine$integer.max + +``` + +##### Evaluation + +```{r message=FALSE, warning=FALSE, include=TRUE} +k_ij.simu = htrs %>% select(-gene_id) %>% flatten() %>% unlist() +k_ij.actual = tabl_cnts %>% flatten() %>% unlist() + +df = cbind(k_ij.actual, k_ij.simu) %>% reshape2::melt(., value.name = "k_ij", variable.name = "origin") +df$origin = df$Var2 +df = df %>% select(-Var2) +max_k_ij.simu = df %>% filter(origin == "k_ij.simu") %>% select(k_ij) %>% max() +max_k_ij.actual = df %>% filter(origin == "k_ij.actual") %>% select(k_ij) %>% max() + +ggplot(df, aes(x = k_ij, fill= origin )) + geom_density(bins = 100, alpha = 0.5) + + geom_vline(xintercept = max_k_ij.actual, col = "#F8766D" ) + + geom_vline(xintercept = (max_k_ij.simu), col= "#00BFC4" ) + + scale_x_log10() +``` + +Comment: $K_{ij}$ simulated are abnormally huge ! +Comment: $K_{ij}$ simulated are slightly different from the actual K_{ij} ! + + +## Why so much differences + +b. $\epsilon$ + +In our model, we define as follow: +$$ +\epsilon_{ij} \sim {\sf N}(0 ; deviance_i) +$$ + +Let's see the distribution of $deviance_{i}$. + + +```{r warning=FALSE} +#estim_mu$beta.matrix + +deviance_i = estim_mu$deviance.sqrt[!is.na(estim_mu$deviance.sqrt)]^2 +#epsilon_ij <- mm[,1] %>% map(., ~rnorm(deviance_i.sqrt, mean = 0, sd = deviance_i.sqrt )) %>% data.frame() %>% flatten() %>% unlist() + + +# Histogram logarithmic y axis +ggplot(data.frame(deviance_i), aes(deviance_i)) + + geom_histogram(bins = 100) #+ scale_x_log10() + + +``` + + +The deviance is also inferred by DESEQ while computing its model. +deviance is mostly inferred between 100 and 200. + + + +$$ +log_2(q_{ij}) = \beta_{G}*G + \beta_{E}*E + \beta_{G*E}*G.E + \beta_{0} + \epsilon_{ij} +$$ + +```{r warning=FALSE} +#estim_mu$beta.matrix + +deviance_i.sqrt = estim_mu$deviance.sqrt[!is.na(estim_mu$deviance.sqrt)] +epsilon_ij <- mm[,1] %>% map(., ~rnorm(deviance_i.sqrt, mean = 0, sd = deviance_i.sqrt )) %>% data.frame() %>% flatten() %>% unlist() +#epsilon_ij <- mm[,1] %>% map(., ~rnorm(deviance_i.sqrt, mean = 0, sd = 0 )) %>% data.frame() %>% flatten() %>% unlist() + +# Histogram logarithmic y axis +ggplot(data.frame(epsilon_ij), aes(epsilon_ij)) + + geom_histogram(bins = 100) #+ scale_x_log10() + + +``` + + +Comment: Some $\epsilon_{ij}$ are huge ! +Recall: $\epsilon$ can be seen as the difference between observed values and values predicted by the model. + +A large panel of $\epsilon$ mean that the model doesn't fit well with the observed data. + +It means that even if $\beta$ coefficients are well estimate. $log_2(q_{ij})$ will vary around them with a large panel of values (+/- 40) + + + + +```{r warning=FALSE} +beta.dtf = estim_mu$beta.matrix %>% data.frame() +beta.dtf.long = beta.dtf %>% reshape2::melt(., value.name = "beta", variable.name = "origin") + +ggplot(beta.dtf.long, aes(x = beta )) + geom_density(bins = 100, alpha = 0.5, fill = 'grey') + facet_grid(~origin, scales = "free_x") + +``` +```{r warning=FALSE} +beta.dtf = estim_mu$beta.matrix %>% data.frame() +beta.dtf.long = beta.dtf %>% reshape2::melt(., value.name = "beta", variable.name = "origin") + + + +## Standard Error +B0 = estim_mu$dds.mcols$SE_Intercept +B1 = estim_mu$dds.mcols$SE_genotype_msn2D_vs_wt +B2 <- estim_mu$dds.mcols$SE_genotype_msn4D_vs_wt +B3 <- estim_mu$dds.mcols$SE_env_kcl_vs_control +B4 <- estim_mu$dds.mcols$SE_genotypemsn2D.envkcl +B5 <- estim_mu$dds.mcols$SE_genotypemsn4D.envkcl + + +SE_B.dtf <- cbind(B0, B1, B2, B3, B4, B5) %>% data.frame() +SE_B.dtf.long = SE_B.dtf %>% reshape2::melt(., value.name = "SE_beta", variable.name = "origin") + +ggplot(SE_B.dtf.long, aes(x = SE_beta, fill= origin )) + geom_density(bins = 100, alpha = 0.5) + facet_grid(~origin) +``` + +```{r warning=FALSE} +bind_dtf<- cbind(SE_B.dtf.long, beta.dtf.long %>% select(-origin)) +ggplot(bind_dtf, aes(x = beta, y= SE_beta, fill= origin )) + geom_point(alpha = 0.1) + facet_grid(~origin) + + +#new <- bind_dtf %>% mutate(annot = ifelse(origin == "B4 | B5" && SE_beta > 6 , TRUE, FALSE )) +#new <- bind_dtf %>% tail +#new %>% filter(beta == "B4") +``` + + +```{r warning=FALSE} +#dim(htrs) + +#new <- bind_dtf %>% mutate(annot = ifelse(((origin == "B4") | (origin == "B5")) & (SE_beta > 6) , TRUE, FALSE )) +### WARNING +#new %>% dcast(., annot ~ origin) + + +SE_threshold = 6 +SE_B.dtf.annot = SE_B.dtf %>% mutate(annot = ifelse((B4 > SE_threshold) | (B5 > SE_threshold) , TRUE, FALSE )) +SE_B.dtf.annot %>% group_by(annot) %>% tally() +SE_B.dtf.annot.long = SE_B.dtf.annot %>% reshape2::melt(., value.name = "SE_beta", variable.name = "origin") + + +bind_dtf.annot<- cbind(SE_B.dtf.annot.long, beta.dtf.long %>% select(-origin)) +bind_dtf.annot = bind_dtf.annot %>% filter(!is.na(annot)) +ggplot(bind_dtf.annot, aes(x = beta, y= SE_beta, col = annot )) + geom_point(alpha = 0.1, na.rm = T) + facet_grid(~origin) + + +``` + + +## Beta0 vs SE & deviance + + + +```{r} + +B0 = beta.dtf$B0 + + +L = SE_B.dtf %>% colnames() %>% length() + + +B0_vector = replicate(L, B0) %>% as.data.frame() %>% flatten() %>% unlist() +deviance.sqrt_vec = replicate(L , estim_mu$deviance.sqrt )%>% as.data.frame() %>% flatten() %>% unlist() +SE_B.dtf.annot.long$B0 = B0_vector +SE_B.dtf.annot.long$deviance.sqrt = estim_mu$deviance.sqrt + +ggplot(SE_B.dtf.annot.long, aes(x = B0, y = SE_beta, col= annot )) + geom_point(alpha = 0.1, na.rm = T) + facet_grid(~origin) + + +SE_B.dtf.annot.long_B0= SE_B.dtf.annot.long %>% filter(origin == "B0") %>% filter(!is.na(annot)) + +ggplot( SE_B.dtf.annot.long_B0, aes(x = B0, y = 2^deviance.sqrt, col = annot)) + geom_point(alpha = 0.1, na.rm = TRUE) + scale_y_log10() + + +``` + + + + +```{undefined eval=FALSE, include=FALSE} +hist(dds.mcols$dispFit) +hist(log_qij, ) +hist(log10(alpha.input$alpha)) +max(dds.mcols$dispersion, na.rm = T) +hist(log(dds.mcols$dispersion)) +max(dds.mcols$deviance, na.rm = T) +hist(dds.mcols$deviance) +max(dds.mcols$dispFit, na.rm = T) +DESeq2::design(dds) + +fitted.common.scale = t(t(dds@assays@data$mu)/dds$sizeFactor) + +hist(t(t(dds@assays@data$mu)/dds$sizeFactor)) +hist(residual_deseq) +max(residual_deseq, na.rm = T) +residual_deseq = (DESeq2::counts(dds, normalized=TRUE) - fitted.common.scale ) + + +w =DESeq2::nbinomWaldTest(dds) +w@dispersionFunction() +#S4Vectors::assays(dds)[["mu"]] + + +vst = DESeq2::varianceStabilizingTransformation(dds) +vst@assays@data$ +``` + + + +https://support.bioconductor.org/p/123305/ +https://support.bioconductor.org/p/60567/ +http://bioconductor.org/packages/release/bioc/vignettes/RUVSeq/inst/doc/RUVSeq.pdf +https://bioinformatics-core-shared-training.github.io/RNAseq-R/slides/LinearModels.pdf + + +```{r eval=FALSE, include=FALSE} +#install.packages("RUVSeq") +#BiocManager::install("RUVSeq") +library(RUVSeq) + +mm +y <- DGEList(counts=counts(tabl_cnts), group=x) +y <- calcNormFactors(y, method="upperquartile") +y <- estimateGLMCommonDisp(y, design) +y <- estimateGLMTagwiseDisp(y, design) +fit <- glmFit(y, design) +res <- residuals(fit, type="deviance") + + +``` + + + + diff --git a/results/2022-04-22_lowCnts_effect.Rmd b/results/2022-04-22_lowCnts_effect.Rmd new file mode 100644 index 0000000..323a2ab --- /dev/null +++ b/results/2022-04-22_lowCnts_effect.Rmd @@ -0,0 +1,436 @@ +--- +title: "HTRSIM" +date: '2022-04-21' +output: + html_document: + + +css: + - css/template.css + - css/footer.css + +--- + + +## Introduction + + +In living world, phenotypes are understanding as a mixture between a genotype effect, an environment effect and an interaction between G&E. +$$Phenotype = Genotype + Environment + Genotype.Environment$$ +The quantification of each strengths (G,E; G&E) can be estimate by a coefficient $\beta$. +Then, our expression becomes: +$$Phenotype = \beta_{G} * Genotype + \beta_{E}*Environment + \beta_{G*E} * Genotype.Environment + \epsilon$$ +Notice that $\beta$ is specific of each component. Furthermore, we introduced above $\epsilon$. It's the residual of the model. $\epsilon$ can be seen as the difference between observed values and values predicted by the model. + +Genes expression can be also considered as a phenotype. <br> +According to this, the quantification of $\beta_{G}$, $\beta_{E}$ and $\beta_{G*E}$ for a given gene in a given condition may open the possibility to assess differences between the strengths in presence in different conditions. + +That's the purpose of Htrsim ! + +## Htrsim + +##### Model + +In this aim, Htrsim is based on a model. <br> +Because of is easy of use this model is managed by DESEQ2. +Then, $K_{ij}$ for gene i, sample j are modeled using a Negative Binomial distribution with fitted mean $\mu_{ij}$ and a gene-specific dispersion parameter $\alpha_i$. + +$$ +K_{ij} \sim {\sf NB}(\mu_{ij} ; \sigma_i) +$$ +$$ +\mu_{ij} = s_jq_{ij} +$$ +$$ +log_2(q_{ij}) = x_j*\beta_i +$$ +The fitted mean is composed of a sample-specific size factor $s_j$ and a parameter qij proportional to the expected true concentration of fragments for sample j. +The coefficients $\beta_i$ give the log2 fold changes for gene i for each column of the model matrix X. The sample-specific size factors can be replaced by gene-specific normalization factors for each sample using normalizationFactors. + + + +According to the DESEQ2 GLM and our purpose, we can write: +$$ +log_2(q_{ij}) = \beta_{G}*G + \beta_{E}*E + \beta_{G*E}*G.E + \beta_{0} + \epsilon_{ij} +$$ + +According to this generalized linear model, we wish to estimate $\beta_{G}$, $\beta_{E}$ and $\beta_{G*E}$ for a given gene i, in a given condition j. Achieve this, would allow us to quantify each strengths (G, E, G&E) for a given gene i, in a given condition j. + + +##### Required + +```{r message=FALSE, warning=FALSE, include=TRUE, results="hide"} +library(htrsim) +library(tidyverse) +library(reshape2) +``` + +##### Worklow + +Using public libraries (from BioProject PRJNA675209b - chinese paper), and an usual RNA-seq pipeline, we build actual RNA-seq counts per genes for 3 genotypes and 2 environments.<br> +<br> +Using htrsim (in particular DESEQ2) and this count table, we are able to estimate $\beta_{G}$, $\beta_{E}$ and $\beta_{G*E}$. + + +a. Input + + +```{r message=FALSE, warning=FALSE, include=TRUE, results="hide"} +## Import & reshape table counts +fn = system.file("extdata/", "public_tablCnts.tsv", package = "htrsim") +tabl_cnts <- read.table(file = fn, header = TRUE) +rownames(tabl_cnts) <- tabl_cnts$gene_id +tabl_cnts <- tabl_cnts %>% select(-gene_id)##suppr colonne GeneID +tabl_cnts <- tabl_cnts %>% select(-gene_name) ##suppr colonne GeneName + + +keep <- rowSums( tabl_cnts ) >= 5000 +#keep +#tabl_cnts <- +tabl_cnts <- tabl_cnts[keep,] + +## import design of bioProject +fn = system.file("extdata/", "public_bioDesign.csv", package = "htrsim") +bioDesign <- read.table(file = fn, header = T, sep = ';') +#bioDesign$batch = 1:12 +``` + +b. Launch DESEQ2 + +```{r message=FALSE, warning=FALSE, include=TRUE, results="hide"} +dds = run.deseq(tabl_cnts = tabl_cnts, bioDesign = bioDesign) +``` + + +DESEQ returns a dds object which contains many, many things ... <br> +In particular it contains the $\beta$ coefficients. <br> +<br> +You can access to beta coefficients using: + +```{r} +dds.mcols = S4Vectors::mcols(dds,use.names=TRUE) +``` + +c. $\mu_{ij}$ + +Following our model, we can estimate $log_2(\mu_{ij]})$ from $\beta$ coefficients inferred by DESEQ2, + +$$ +log_2(q_{ij}) = \beta_{G}*G + \beta_{E}*E + \beta_{G*E}*G.E + \beta_{0} + \epsilon_{ij} +$$ + +Then, $\mu_{ij]}$ can be estimate + +$$ +\mu_{ij} = s_j * 2^{log_2(q_{ij})} +$$ +```{r message=FALSE, warning=FALSE, include=TRUE, results="hide"} +## Model matrix per samples +mm <- model.matrix(~genotype + env + genotype:env, bioDesign) + +## Input estimation +estim_mu = estim.mu(dds, mm) +mu.input = estim_mu$mu +``` + +d. $K_{ij}$ + +As defined by our model, counts $K_{ij}$ for gene i, sample j are modeled using a Negative Binomial distribution with fitted mean $\mu_{ij}$ and a gene-specific dispersion parameter $\alpha_i$. + +$$ +K_{ij} \sim {\sf NB}(\mu_{ij} ; \alpha_i) +$$ +The gene-specific dispersion parameter $\alpha_i$ is also stored in the dds object.<br> +You can access to $\alpha_i$ using: +```{r message=FALSE, warning=FALSE, include=TRUE, results="hide"} +alpha.input = estim.alpha(dds) +``` + +Knowing $\alpha_i$ and $\mu_{ij]}$ for each gene and each condition given by the BioProject PRJNA675209b - chinese paper. +We are now able to simulate $K_{ij}$ for each gene and each condition given by the BioProject PRJNA675209b. + +```{r message=FALSE, warning=FALSE, include=TRUE, results="hide"} +# Setup simulation +input = reshape_input2setup(mu.dtf = mu.input, alpha.dtf = alpha.input, average_rep = FALSE) + +#input$gene_id +setup.simulation <- setup_countGener(bioSample_id = input$bioSample_id, + n_rep = 1, + alpha = input$alpha, + gene_id = input$gene_id, + mu = input$mu) + +#setup.simulation %>% dim() +# Simulate counts +htrs <- generate_counts(setup.simulation) + +``` + + +```{r, error=TRUE} +sample = htrs %>% select(where(is.numeric)) %>% colnames() +genotype = htrs %>% + select(where(is.numeric)) %>% + colnames() %>% + map(., ~str_split(.,pattern = '_', simplify = T)[1]) %>% unlist() + +env = htrs %>% + select(where(is.numeric)) %>% + colnames() %>% + map(., ~str_split(.,pattern = '_', simplify = T)[2]) %>% unlist() + +designSimu = cbind(sample, env, genotype) %>% data.frame() + +## RESHAPE HTRS +htrs.reshape = htrs +rownames(htrs.reshape) = htrs.reshape$gene_id +htrs.reshape = htrs.reshape %>% select(-gene_id) +########### LAUNCH DESEQ ############# +## Design model - specify reference +designSimu$genotype <- factor(x = designSimu$genotype,levels = c('WT','Msn2D', 'Msn4D')) +designSimu$env <- factor(x = designSimu$env,levels = c('control', 'KCl')) + + +k_ij.simulation = htrs.reshape + +## DESEQ standard analysis +dds_simu = DESeq2::DESeqDataSetFromMatrix( countData = k_ij.simulation , + colData = designSimu , + design = ~ genotype + env + genotype:env) + +max(k_ij.simulation) +.Machine$integer.max + +``` + +##### Evaluation + +```{r message=FALSE, warning=FALSE, include=TRUE} +k_ij.simu = htrs %>% select(-gene_id) %>% flatten() %>% unlist() +k_ij.actual = tabl_cnts %>% flatten() %>% unlist() + +df = cbind(k_ij.actual, k_ij.simu) %>% reshape2::melt(., value.name = "k_ij", variable.name = "origin") +df$origin = df$Var2 +df = df %>% select(-Var2) +max_k_ij.simu = df %>% filter(origin == "k_ij.simu") %>% select(k_ij) %>% max() +max_k_ij.actual = df %>% filter(origin == "k_ij.actual") %>% select(k_ij) %>% max() + +ggplot(df, aes(x = k_ij, fill= origin )) + geom_density(bins = 100, alpha = 0.5) + + geom_vline(xintercept = max_k_ij.actual, col = "#F8766D" ) + + geom_vline(xintercept = (max_k_ij.simu), col= "#00BFC4" ) + + scale_x_log10() +``` + +Comment: $K_{ij}$ simulated are abnormally huge ! +Comment: $K_{ij}$ simulated are slightly different from the actual K_{ij} ! + + +## Why so much differences + +b. $\epsilon$ + +In our model, we define as follow: +$$ +\epsilon_{ij} \sim {\sf N}(0 ; deviance_i) +$$ + +Let's see the distribution of $deviance_{i}$. + + +```{r warning=FALSE} +#estim_mu$beta.matrix + +deviance_i = estim_mu$deviance.sqrt[!is.na(estim_mu$deviance.sqrt)]^2 +#epsilon_ij <- mm[,1] %>% map(., ~rnorm(deviance_i.sqrt, mean = 0, sd = deviance_i.sqrt )) %>% data.frame() %>% flatten() %>% unlist() + + +# Histogram logarithmic y axis +ggplot(data.frame(deviance_i), aes(deviance_i)) + + geom_histogram(bins = 100) #+ scale_x_log10() + + +``` + + +The deviance is also inferred by DESEQ while computing its model. +deviance is mostly inferred between 100 and 200. + + + +$$ +log_2(q_{ij}) = \beta_{G}*G + \beta_{E}*E + \beta_{G*E}*G.E + \beta_{0} + \epsilon_{ij} +$$ + +```{r warning=FALSE} +#estim_mu$beta.matrix + +deviance_i.sqrt = estim_mu$deviance.sqrt[!is.na(estim_mu$deviance.sqrt)] +epsilon_ij <- mm[,1] %>% map(., ~rnorm(deviance_i.sqrt, mean = 0, sd = deviance_i.sqrt )) %>% data.frame() %>% flatten() %>% unlist() +#epsilon_ij <- mm[,1] %>% map(., ~rnorm(deviance_i.sqrt, mean = 0, sd = 0 )) %>% data.frame() %>% flatten() %>% unlist() + +# Histogram logarithmic y axis +ggplot(data.frame(epsilon_ij), aes(epsilon_ij)) + + geom_histogram(bins = 100) #+ scale_x_log10() + + +``` + + +Comment: Some $\epsilon_{ij}$ are huge ! +Recall: $\epsilon$ can be seen as the difference between observed values and values predicted by the model. + +A large panel of $\epsilon$ mean that the model doesn't fit well with the observed data. + +It means that even if $\beta$ coefficients are well estimate. $log_2(q_{ij})$ will vary around them with a large panel of values (+/- 40) + + + + +```{r warning=FALSE} + + +beta.dtf = estim_mu$beta.matrix %>% data.frame() +beta.dtf.long = beta.dtf %>% reshape2::melt(., value.name = "beta", variable.name = "origin") + +ggplot(beta.dtf.long, aes(x = beta )) + geom_density(bins = 100, alpha = 0.5, fill = 'grey') + facet_grid(~origin, scales = "free_x") + +``` +```{r warning=FALSE} +beta.dtf = estim_mu$beta.matrix %>% data.frame() +beta.dtf.long = beta.dtf %>% reshape2::melt(., value.name = "beta", variable.name = "origin") + + + +## Standard Error +B0 = estim_mu$dds.mcols$SE_Intercept +B1 = estim_mu$dds.mcols$SE_genotype_msn2D_vs_wt +B2 <- estim_mu$dds.mcols$SE_genotype_msn4D_vs_wt +B3 <- estim_mu$dds.mcols$SE_env_kcl_vs_control +B4 <- estim_mu$dds.mcols$SE_genotypemsn2D.envkcl +B5 <- estim_mu$dds.mcols$SE_genotypemsn4D.envkcl + + +SE_B.dtf <- cbind(B0, B1, B2, B3, B4, B5) %>% data.frame() +SE_B.dtf.long = SE_B.dtf %>% reshape2::melt(., value.name = "SE_beta", variable.name = "origin") + +ggplot(SE_B.dtf.long, aes(x = SE_beta, fill= origin )) + geom_density(bins = 100, alpha = 0.5) + facet_grid(~origin) +``` + +```{r warning=FALSE} +bind_dtf<- cbind(SE_B.dtf.long, beta.dtf.long %>% select(-origin)) +ggplot(bind_dtf, aes(x = beta, y= SE_beta, fill= origin )) + geom_point(alpha = 0.1) + facet_grid(~origin) + + +#new <- bind_dtf %>% mutate(annot = ifelse(origin == "B4 | B5" && SE_beta > 6 , TRUE, FALSE )) +#new <- bind_dtf %>% tail +#new %>% filter(beta == "B4") +``` + + +```{r warning=FALSE} +#dim(htrs) + +#new <- bind_dtf %>% mutate(annot = ifelse(((origin == "B4") | (origin == "B5")) & (SE_beta > 6) , TRUE, FALSE )) +### WARNING +#new %>% dcast(., annot ~ origin) + + +SE_threshold = 6 +SE_B.dtf.annot = SE_B.dtf %>% mutate(annot = ifelse((B4 > SE_threshold) | (B5 > SE_threshold) , TRUE, FALSE )) +SE_B.dtf.annot %>% group_by(annot) %>% tally() +SE_B.dtf.annot.long = SE_B.dtf.annot %>% reshape2::melt(., value.name = "SE_beta", variable.name = "origin") + + +bind_dtf.annot<- cbind(SE_B.dtf.annot.long, beta.dtf.long %>% select(-origin)) +bind_dtf.annot = bind_dtf.annot %>% filter(!is.na(annot)) +ggplot(bind_dtf.annot, aes(x = beta, y= SE_beta, col = annot )) + geom_point(alpha = 0.1, na.rm = T) + facet_grid(~origin) + + +``` + + +## Beta0 vs SE & deviance + + + +```{r} + +B0 = beta.dtf$B0 + + +L = SE_B.dtf %>% colnames() %>% length() + + +B0_vector = replicate(L, B0) %>% as.data.frame() %>% flatten() %>% unlist() +deviance.sqrt_vec = replicate(L , estim_mu$deviance.sqrt )%>% as.data.frame() %>% flatten() %>% unlist() +SE_B.dtf.annot.long$B0 = B0_vector +SE_B.dtf.annot.long$deviance.sqrt = estim_mu$deviance.sqrt + +ggplot(SE_B.dtf.annot.long, aes(x = B0, y = SE_beta, col= annot )) + geom_point(alpha = 0.1, na.rm = T) + facet_grid(~origin) + + +SE_B.dtf.annot.long_B0= SE_B.dtf.annot.long %>% filter(origin == "B0") %>% filter(!is.na(annot)) + +ggplot( SE_B.dtf.annot.long_B0, aes(x = B0, y = 2^deviance.sqrt, col = annot)) + geom_point(alpha = 0.1, na.rm = TRUE) + scale_y_log10() + + +``` + + + + +```{undefined eval=FALSE, include=FALSE} +hist(dds.mcols$dispFit) +hist(log_qij, ) +hist(log10(alpha.input$alpha)) +max(dds.mcols$dispersion, na.rm = T) +hist(log(dds.mcols$dispersion)) +max(dds.mcols$deviance, na.rm = T) +hist(dds.mcols$deviance) +max(dds.mcols$dispFit, na.rm = T) +DESeq2::design(dds) + +fitted.common.scale = t(t(dds@assays@data$mu)/dds$sizeFactor) + +hist(t(t(dds@assays@data$mu)/dds$sizeFactor)) +hist(residual_deseq) +max(residual_deseq, na.rm = T) +residual_deseq = (DESeq2::counts(dds, normalized=TRUE) - fitted.common.scale ) + + +w =DESeq2::nbinomWaldTest(dds) +w@dispersionFunction() +#S4Vectors::assays(dds)[["mu"]] + + +vst = DESeq2::varianceStabilizingTransformation(dds) +vst@assays@data$ +``` + + + +https://support.bioconductor.org/p/123305/ +https://support.bioconductor.org/p/60567/ +http://bioconductor.org/packages/release/bioc/vignettes/RUVSeq/inst/doc/RUVSeq.pdf +https://bioinformatics-core-shared-training.github.io/RNAseq-R/slides/LinearModels.pdf + + +```{r eval=FALSE, include=FALSE} +#install.packages("RUVSeq") +#BiocManager::install("RUVSeq") +library(RUVSeq) + +mm +y <- DGEList(counts=counts(tabl_cnts), group=x) +y <- calcNormFactors(y, method="upperquartile") +y <- estimateGLMCommonDisp(y, design) +y <- estimateGLMTagwiseDisp(y, design) +fit <- glmFit(y, design) +res <- residuals(fit, type="deviance") + + +``` + + + + diff --git a/results/2022-04-24_epsilon_effect.Rmd b/results/2022-04-24_epsilon_effect.Rmd new file mode 100644 index 0000000..e7b0f27 --- /dev/null +++ b/results/2022-04-24_epsilon_effect.Rmd @@ -0,0 +1,136 @@ +--- +title: "Epsilon effect" +date: '2022-04-21' +output: + html_document: + + +css: + - css/template.css + +--- + + + +##### Required + +```{r message=FALSE, warning=FALSE, include=TRUE, results="hide"} +library(htrsim) +library(tidyverse) +library(reshape2) +``` + + + +## Worklow + +Using public libraries (from BioProject PRJNA675209b - chinese paper), and an usual RNA-seq pipeline, we build actual RNA-seq counts per genes for 3 genotypes and 2 environments.<br> +<br> +Using htrsim (in particular DESEQ2) and this count table, we are able to estimate $\beta_{G}$, $\beta_{E}$ and $\beta_{G*E}$. + + +##### a. Input + + +```{r message=FALSE, warning=FALSE, include=TRUE, results="hide"} +## Import & reshape table counts +fn = system.file("extdata/", "public_tablCnts.tsv", package = "htrsim") +tabl_cnts <- read.table(file = fn, header = TRUE) +rownames(tabl_cnts) <- tabl_cnts$gene_id +tabl_cnts <- tabl_cnts %>% select(-gene_id)##suppr colonne GeneID +tabl_cnts <- tabl_cnts %>% select(-gene_name) ##suppr colonne GeneName + +## import design of bioProject +fn = system.file("extdata/", "public_bioDesign.csv", package = "htrsim") +bioDesign <- read.table(file = fn, header = T, sep = ';') +``` + +##### b. Launch DESEQ2 + +```{r message=FALSE, warning=FALSE, include=TRUE, results="hide"} +dds = run.deseq(tabl_cnts = tabl_cnts, bioDesign = bioDesign) +``` + +DESEQ returns a dds object which contains many, many things ... <br> +In particular it contains the $\beta$ coefficients. <br> +<br> +You can access to beta coefficients using: + +```{r} +dds.mcols = S4Vectors::mcols(dds,use.names=TRUE) +``` + +##### c. $\mu_{ij}$ + +Following our model, we can estimate $log_2(\mu_{ij]})$ from $\beta$ coefficients inferred by DESEQ2, + +$$ +log_2(\mu_{ij]}) = \beta_{G}*G + \beta_{E}*E + \beta_{G*E}*G.E + \beta_{0} + \epsilon_{ij} +$$ + +Then, $\mu_{ij]}$ can be estimate + +$$ +\mu_{ij} = s_j * 2^{log_2\mu_{ij]}} +$$ + +```{r message=FALSE, warning=FALSE, include=TRUE, results="hide"} +## Model matrix per samples +mm <- model.matrix(~genotype + env + genotype:env, bioDesign) + +## Input estimation +estim_mu = estim.mu(dds, mm, epsilon = FALSE) +mu.input = estim_mu$mu +``` + +##### d. $K_{ij}$ + +As defined by our model, counts $K_{ij}$ for gene i, sample j are modeled using a Negative Binomial distribution with fitted mean $\mu_{ij}$ and a gene-specific dispersion parameter $\alpha_i$. + +$$ +K_{ij} \sim {\sf NB}(\mu_{ij} ; \sigma_i) +$$ +The gene-specific dispersion parameter $\alpha_i$ is also stored in the dds object.<br> +You can access to $\alpha_i$ using: + +```{r message=FALSE, warning=FALSE, include=TRUE, results="hide"} +alpha.input = estim.alpha(dds) +``` + +Knowing $\alpha_i$ and $\mu_{ij]}$ for each gene and each condition given by the BioProject PRJNA675209b - chinese paper. +We are now able to simulate $K_{ij}$ for each gene and each condition given by the BioProject PRJNA675209b. + +```{r message=FALSE, warning=FALSE, include=TRUE, results="hide"} +# Setup simulation +input = reshape_input2setup(mu.dtf = mu.input, alpha.dtf = alpha.input, average_rep = FALSE) + +#input$gene_id +setup.simulation <- setup_countGener(bioSample_id = input$bioSample_id, + n_rep = 1, + alpha = input$alpha, + gene_id = input$gene_id, + mu = input$mu) + +#setup.simulation %>% dim() +# Simulate counts +htrs <- generate_counts(setup.simulation) + +``` + +##### Evaluation + +```{r message=FALSE, warning=FALSE, include=TRUE} +k_ij.simu = htrs %>% select(-gene_id) %>% flatten() %>% unlist() +k_ij.actual = tabl_cnts %>% flatten() %>% unlist() + +df = cbind(k_ij.actual, k_ij.simu) %>% reshape2::melt(., value.name = "k_ij", variable.name = "origin") +df$origin = df$Var2 +df = df %>% select(-Var2) +max_k_ij.simu = df %>% filter(origin == "k_ij.simu") %>% select(k_ij) %>% max() +max_k_ij.actual = df %>% filter(origin == "k_ij.actual") %>% select(k_ij) %>% max() + +ggplot(df, aes(x = k_ij, fill= origin )) + geom_density(bins = 100, alpha = 0.5) + + geom_vline(xintercept = max_k_ij.actual, col = "#F8766D" ) + + geom_vline(xintercept = (max_k_ij.simu), col= "#00BFC4" ) + + scale_x_log10() +``` diff --git a/results/2022-04-28_beta_SE_filter.Rmd b/results/2022-04-28_beta_SE_filter.Rmd new file mode 100644 index 0000000..ef206af --- /dev/null +++ b/results/2022-04-28_beta_SE_filter.Rmd @@ -0,0 +1,293 @@ +--- +title: "Apply filter on beta SE" +date: '2022-04-21' +output: + html_document: + + +css: + - css/template.css + +--- + + + +##### Required + +```{r message=FALSE, warning=FALSE, include=TRUE, results="hide"} +library(htrsim) +library(tidyverse) +library(reshape2) +``` + + +##### Worklow + +Using public libraries (from BioProject PRJNA675209b - chinese paper), and an usual RNA-seq pipeline, we build actual RNA-seq counts per genes for 3 genotypes and 2 environments.<br> +<br> +Using htrsim (in particular DESEQ2) and this count table, we are able to estimate $\beta_{G}$, $\beta_{E}$ and $\beta_{G*E}$. + + +##### a. Input + + +```{r message=FALSE, warning=FALSE, include=TRUE, results="hide"} +## Import & reshape table counts +fn = system.file("extdata/", "public_tablCnts.tsv", package = "htrsim") +tabl_cnts <- read.table(file = fn, header = TRUE) +rownames(tabl_cnts) <- tabl_cnts$gene_id +tabl_cnts <- tabl_cnts %>% select(-gene_id)##suppr colonne GeneID +tabl_cnts <- tabl_cnts %>% select(-gene_name) ##suppr colonne GeneName + +## import design of bioProject +fn = system.file("extdata/", "public_bioDesign.csv", package = "htrsim") +bioDesign <- read.table(file = fn, header = T, sep = ';') +``` + + +##### b. Launch DESEQ2 + +```{r message=FALSE, warning=FALSE, include=TRUE, results="hide"} +dds = run.deseq(tabl_cnts = tabl_cnts, bioDesign = bioDesign) +``` + +DESEQ returns a dds object which contains many, many things ... <br> +In particular it contains the $\beta$ coefficients. <br> +<br> +You can access to beta coefficients using: + +```{r} +dds.mcols = S4Vectors::mcols(dds,use.names=TRUE) +``` + + + +## Removing genes with SE_B4 ou B5 > threshold + + +##### Some manipulations + +```{r warning=FALSE} +## BETA +B0 <- dds.mcols$Intercept +B1 <- dds.mcols$genotype_msn2D_vs_wt +B2 <- dds.mcols$genotype_msn4D_vs_wt +B3 <- dds.mcols$env_kcl_vs_control +B4 <- dds.mcols$genotypemsn2D.envkcl +B5 <- dds.mcols$genotypemsn4D.envkcl + +beta.matrix = cbind(B0, B1,B2,B3,B4,B5) %>% as.matrix() +## BETA +beta.dtf = beta.matrix %>% data.frame() +beta.dtf.long = beta.dtf %>% reshape2::melt(., value.name = "beta", variable.name = "origin") + + + +## Standard Error +B0 = dds.mcols$SE_Intercept +B1 = dds.mcols$SE_genotype_msn2D_vs_wt +B2 <- dds.mcols$SE_genotype_msn4D_vs_wt +B3 <- dds.mcols$SE_env_kcl_vs_control +B4 <- dds.mcols$SE_genotypemsn2D.envkcl +B5 <- dds.mcols$SE_genotypemsn4D.envkcl + +SE_B.dtf <- cbind(B0, B1, B2, B3, B4, B5) %>% data.frame() +SE_B.dtf.long = SE_B.dtf %>% reshape2::melt(., value.name = "SE_beta", variable.name = "origin") + +ggplot(SE_B.dtf.long, aes(x = SE_beta, fill= origin )) + geom_density(bins = 100, alpha = 0.5) + facet_grid(~origin) +``` + +##### Filter + +```{r warning=FALSE} +SE_threshold = 2 + +SE_B.dtf.annot = SE_B.dtf %>% mutate(toKickOut = ifelse((B4 > SE_threshold) | (B5 > SE_threshold) , TRUE, FALSE )) +SE_B.dtf.annot$gene_id = rownames(tabl_cnts) +dtf_forFiltration = SE_B.dtf.annot %>% select(c(gene_id, toKickOut)) +SE_B.dtf.annot.long = SE_B.dtf.annot %>% reshape2::melt(., value.name = "SE_beta", variable.name = "origin") + + +bind_dtf <- cbind(SE_B.dtf.annot.long, beta.dtf.long %>% select(-origin)) +bind_df.filter = bind_dtf %>% filter(toKickOut == FALSE) +ggplot(bind_df.filter, aes(x = beta, y= SE_beta, fill= origin )) + geom_point(alpha = 0.1) + facet_grid(~origin) + +``` + + +##### mu for genes with SE_B4/5 > threshold + +```{r warning=FALSE} + +beta_filtered.short = bind_df.filter %>% dcast(., gene_id ~ origin, value.var = "beta") + +### Estimate mu +mm <- model.matrix(~genotype + env + genotype:env, bioDesign) +beta.matrix = beta_filtered.short %>% select(-gene_id) %>% as.matrix() + + + +deviance_i =dds.mcols$deviance[!SE_B.dtf.annot$toKickOut & !is.na(SE_B.dtf.annot$toKickOut)] +deviance_i.sqrt = sqrt(deviance_i) +epsilon_ij = mm[,1] %>% map(., ~rnorm(deviance_i.sqrt, mean = 0, sd = deviance_i.sqrt )) %>% data.frame() %>% as.matrix() + + +#p_ij = B0_i*mm1_j + B1_i*mm2_j + B3_i*mm3_j + B4_i*mm4_j + B5_i*mm5_j +p_ij = beta.matrix %*% t(mm) +#message("EPSILON : TRUE") +log_qij <- p_ij + epsilon_ij + +## s_j +s_j = dds$sizeFactor +mu_ij = s_j * 2^log_qij +rownames(mu_ij) = beta_filtered.short$gene_id +## drop NA in dispersion estimate (link to unexpressed gene) +### and convert to lovely dataframe +mu_gene_filtered = mu_ij %>% + stats::na.omit() %>% + data.frame() +colnames(mu_gene_filtered) <- rownames(dds@colData) +mu_gene_filtered <- mu_gene_filtered %>% + tibble::rownames_to_column(var = "gene_id") +``` + + +##### alpha for genes with SE_B4/5 > threshold + +```{r} +alpha.input = estim.alpha(dds) +alpha.input.filtered = alpha.input %>% filter(gene_id %in% beta_filtered.short$gene_id) +``` + +###### Setup simulation + +```{r} +input = reshape_input2setup(mu.dtf = mu_gene_filtered, alpha.dtf = alpha.input.filtered, average_rep = FALSE) + +#input$gene_id +setup.simulation <- setup_countGener(bioSample_id = input$bioSample_id, + n_rep = 1, + alpha = input$alpha, + gene_id = input$gene_id, + mu = input$mu) + +#setup.simulation %>% dim() +# Simulate counts +htrs <- generate_counts(setup.simulation) + +#### +``` + + +##### Evaluation + + +```{r} + +#dtf_merged = merge(htrs, dtf_forFiltration, "gene_id") +#htrs.filterSE = dtf_merged %>% filter(toKickOut == FALSE) %>% select(-toKickOut) + +k_ij.simu = htrs %>% select(-gene_id) %>% flatten() %>% unlist() +k_ij.actual = tabl_cnts %>% flatten() %>% unlist() + +df = cbind(k_ij.actual, k_ij.simu) %>% reshape2::melt(., value.name = "k_ij", variable.name = "origin") +df$origin = df$Var2 +df = df %>% select(-Var2) +max_k_ij.simu = df %>% filter(origin == "k_ij.simu") %>% select(k_ij) %>% max() +max_k_ij.actual = df %>% filter(origin == "k_ij.actual") %>% select(k_ij) %>% max() + +ggplot(df, aes(x = k_ij, fill= origin )) + geom_density( alpha = 0.5) + + geom_vline(xintercept = max_k_ij.actual, col = "#F8766D" ) + + geom_vline(xintercept = (max_k_ij.simu), col= "#00BFC4" ) + + geom_vline(xintercept = 0, col= "#00BFC4" ) + + scale_x_log10() +``` + + + +## Without epsilon + + +##### mu for genes with SE_B4/5 > threshold + +```{r warning=FALSE} + +beta_filtered.short = bind_df.filter %>% dcast(., gene_id ~ origin, value.var = "beta") + +### Estimate mu +mm <- model.matrix(~genotype + env + genotype:env, bioDesign) +beta.matrix = beta_filtered.short %>% select(-gene_id) %>% as.matrix() + +#deviance_i.sqrt = sqrt(dds.mcols$deviance) +#epsilon_ij = mm[,1] %>% map(., ~rnorm(deviance_i.sqrt, mean = 0, sd = deviance_i.sqrt )) %>% data.frame() %>% as.matrix() + + +#p_ij = B0_i*mm1_j + B1_i*mm2_j + B3_i*mm3_j + B4_i*mm4_j + B5_i*mm5_j +p_ij = beta.matrix %*% t(mm) +#message("EPSILON : FALSE") +log_qij <- p_ij #+ epsilon_ij + +## s_j +s_j = dds$sizeFactor +mu_ij = s_j * 2^log_qij +rownames(mu_ij) = beta_filtered.short$gene_id +## drop NA in dispersion estimate (link to unexpressed gene) +### and convert to lovely dataframe +mu_gene_filtered = mu_ij %>% + stats::na.omit() %>% + data.frame() +colnames(mu_gene_filtered) <- rownames(dds@colData) +mu_gene_filtered <- mu_gene_filtered %>% + tibble::rownames_to_column(var = "gene_id") +``` + + +##### alpha for genes with SE_B4/5 > threshold +```{r} +alpha.input = estim.alpha(dds) +alpha.input.filtered = alpha.input %>% filter(gene_id %in% beta_filtered.short$gene_id) +``` + +###### Setup simulation + +```{r} +input = reshape_input2setup(mu.dtf = mu_gene_filtered, alpha.dtf = alpha.input.filtered, average_rep = FALSE) + +#input$gene_id +setup.simulation <- setup_countGener(bioSample_id = input$bioSample_id, + n_rep = 1, + alpha = input$alpha, + gene_id = input$gene_id, + mu = input$mu) + +#setup.simulation %>% dim() +# Simulate counts +htrs <- generate_counts(setup.simulation) + +#### +``` + + +##### Evaluation + + +```{r} + +#dtf_merged = merge(htrs, dtf_forFiltration, "gene_id") +#htrs.filterSE = dtf_merged %>% filter(toKickOut == FALSE) %>% select(-toKickOut) + +k_ij.simu = htrs %>% select(-gene_id) %>% flatten() %>% unlist() +k_ij.actual = tabl_cnts %>% flatten() %>% unlist() + +df = cbind(k_ij.actual, k_ij.simu) %>% reshape2::melt(., value.name = "k_ij", variable.name = "origin") +df$origin = df$Var2 +df = df %>% select(-Var2) +max_k_ij.simu = df %>% filter(origin == "k_ij.simu") %>% select(k_ij) %>% max() +max_k_ij.actual = df %>% filter(origin == "k_ij.actual") %>% select(k_ij) %>% max() + +ggplot(df, aes(x = k_ij, fill= origin )) + geom_density( alpha = 0.5) + + geom_vline(xintercept = max_k_ij.actual, col = "#F8766D" ) + + geom_vline(xintercept = (max_k_ij.simu), col= "#00BFC4" ) + + geom_vline(xintercept = 0, col= "#00BFC4" ) + + scale_x_log10() +``` diff --git a/results/2022-05-22_alpha_effect.Rmd b/results/2022-05-22_alpha_effect.Rmd new file mode 100644 index 0000000..2e78ece --- /dev/null +++ b/results/2022-05-22_alpha_effect.Rmd @@ -0,0 +1,329 @@ +--- +title: "Alpha effect" +date: '2022-04-21' +output: + html_document: + + +css: + - css/template.css + +--- + + +## Required + +```{r message=FALSE, warning=FALSE, include=TRUE, results="hide"} +library(htrsim) +library(tidyverse) +library(reshape2) +``` + + + +## Worklow + +Using public libraries (from BioProject PRJNA675209b - chinese paper), and an usual RNA-seq pipeline, we build actual RNA-seq counts per genes for 3 genotypes and 2 environments.<br> +<br> +Using htrsim (in particular DESEQ2) and this count table, we are able to estimate $\beta_{G}$, $\beta_{E}$ and $\beta_{G*E}$. + + +##### a. Input + + +```{r message=FALSE, warning=FALSE, include=TRUE, results="hide"} +## Import & reshape table counts +fn = system.file("extdata/", "public_tablCnts.tsv", package = "htrsim") +tabl_cnts <- read.table(file = fn, header = TRUE) +rownames(tabl_cnts) <- tabl_cnts$gene_id +tabl_cnts <- tabl_cnts %>% select(-gene_id)##suppr colonne GeneID +tabl_cnts <- tabl_cnts %>% select(-gene_name) ##suppr colonne GeneName + +## import design of bioProject +fn = system.file("extdata/", "public_bioDesign.csv", package = "htrsim") +bioDesign <- read.table(file = fn, header = T, sep = ';') +``` + +##### Launch DESEQ + +```{r message=FALSE, warning=FALSE, include=TRUE, results="hide"} +dds = run.deseq(tabl_cnts = tabl_cnts, bioDesign = bioDesign) +``` + +DESEQ returns a dds object which contains many, many things ... <br> +In particular it contains the $\beta$ coefficients. <br> +<br> +You can access to beta coefficients using: + +```{r} +dds.mcols = S4Vectors::mcols(dds,use.names=TRUE) +``` + +##### mu estimation + +```{r message=FALSE, warning=FALSE, include=TRUE, results="hide"} +## Model matrix per samples +mm <- model.matrix(~genotype + env + genotype:env, bioDesign) + +## Input estimation +estim_mu = estim.mu(dds, mm) +mu.input = estim_mu$mu +``` + +d. $\alpha_{i}$ + +```{r message=FALSE, warning=FALSE, include=TRUE, results="hide"} +alpha.input = estim.alpha(dds) +``` + + +```{r} +hist(alpha.input$alpha) +maximum_alpha = max(alpha.input$alpha, na.rm = T) +maximum_alpha +min(alpha.input$alpha, na.rm = T) + +``` + +variance = n(1-alpha)/alpha^2. + +```{r} +n = 1000 +alpha = 2.5 +variance = n*(1 - alpha)/alpha^2 +variance +hist(rnbinom(n=1000, mu = 400, size = alpha)) +``` + +```{r} +n = 1000 +alpha = 0.01 +variance = n*(1 - alpha)/alpha^2 +variance +hist(rnbinom(n=1000, mu = 400, size = alpha)) +``` + + +```{r} +## filter on alphz +#alpha.input.filtered = alpha.input %>% filter(gene_id %in% beta_filtered.short$gene_id) +#alpha.input.filtered = alpha.input %>% filter(alpha > 10) + + +### FIX ALPHA +N_genes = length(alpha.input$gene_id) +alpha.input$alpha = runif(n= N_genes, min = 0.1, max = maximum_alpha) +``` + +Knowing $\alpha_i$ and $\mu_{ij]}$ for each gene and each condition given by the BioProject PRJNA675209b - chinese paper. +We are now able to simulate $K_{ij}$ for each gene and each condition given by the BioProject PRJNA675209b. + +```{r message=FALSE, warning=FALSE, include=TRUE, results="hide"} +## filter mu based on alpha filter +#mu.input.filtered = mu.input %>% filter(gene_id %in% alpha.input.filtered$gene_id ) + +# Setup simulation +input = reshape_input2setup(mu.dtf = mu.input, alpha.dtf = alpha.input, average_rep = FALSE) + +#input$gene_id +setup.simulation <- setup_countGener(bioSample_id = input$bioSample_id, + n_rep = 1, + alpha = input$alpha, + gene_id = input$gene_id, + mu = input$mu) + +#setup.simulation %>% dim() +# Simulate counts +htrs <- generate_counts(setup.simulation) + +``` + +##### Evaluation + +```{r message=FALSE, warning=FALSE, include=TRUE} +k_ij.simu = htrs %>% select(-gene_id) %>% flatten() %>% unlist() +k_ij.actual = tabl_cnts %>% filter(rownames(.) %in% htrs$gene_id ) %>% flatten() %>% unlist() + +df = cbind(k_ij.actual, k_ij.simu) %>% reshape2::melt(., value.name = "k_ij", variable.name = "origin") +df$origin = df$Var2 +df = df %>% select(-Var2) +max_k_ij.simu = df %>% filter(origin == "k_ij.simu") %>% select(k_ij) %>% max() +max_k_ij.actual = df %>% filter(origin == "k_ij.actual") %>% select(k_ij) %>% max() + +ggplot(df, aes(x = k_ij, fill= origin )) + geom_density(bins = 100, alpha = 0.5) + + geom_vline(xintercept = max_k_ij.actual, col = "#F8766D" ) + + geom_vline(xintercept = (max_k_ij.simu), col= "#00BFC4" ) + + scale_x_log10() +``` + + + +## Without Epsilon + +##### mu estimation + +```{r message=FALSE, warning=FALSE, include=TRUE, results="hide"} +## Model matrix per samples +mm <- model.matrix(~genotype + env + genotype:env, bioDesign) + +## Input estimation +estim_mu = estim.mu(dds, mm, epsilon = FALSE) +mu.input = estim_mu$mu +``` + +d. $\alpha_{i}$ + +```{r message=FALSE, warning=FALSE, include=TRUE, results="hide"} +alpha.input = estim.alpha(dds) +``` + +```{r} +## filter on alphz +#alpha.input.filtered = alpha.input %>% filter(gene_id %in% beta_filtered.short$gene_id) +#alpha.input.filtered = alpha.input %>% filter(alpha > 10) + + +### FIX ALPHA +N_genes = length(alpha.input$gene_id) +alpha.input$alpha = runif(n= N_genes, min = 0.1, max = maximum_alpha) +#alpha.input$alpha = rnbinom(n = N_genes, mu = mean(alpha.input$alpha), size = 20) +#hist(alpha.input$alpha) +``` + +Knowing $\alpha_i$ and $\mu_{ij]}$ for each gene and each condition given by the BioProject PRJNA675209b - chinese paper. +We are now able to simulate $K_{ij}$ for each gene and each condition given by the BioProject PRJNA675209b. + +```{r message=FALSE, warning=FALSE, include=TRUE, results="hide"} +## filter mu based on alpha filter +#mu.input.filtered = mu.input %>% filter(gene_id %in% alpha.input.filtered$gene_id ) + +# Setup simulation +input = reshape_input2setup(mu.dtf = mu.input, alpha.dtf = alpha.input, average_rep = FALSE) +#input$gene_id %>% length() +#input$gene_id +setup.simulation <- setup_countGener(bioSample_id = input$bioSample_id, + n_rep = 1, + alpha = input$alpha, + gene_id = input$gene_id, + mu = input$mu) + +#setup.simulation %>% dim() +# Simulate counts +htrs <- generate_counts(setup.simulation) + +``` + +##### Evaluation + +```{r message=FALSE, warning=FALSE, include=TRUE} +k_ij.simu = htrs %>% select(-gene_id) %>% flatten() %>% unlist() +k_ij.actual = tabl_cnts %>% filter(rownames(.) %in% htrs$gene_id ) %>% flatten() %>% unlist() + +df = cbind(k_ij.actual, k_ij.simu) %>% reshape2::melt(., value.name = "k_ij", variable.name = "origin") +df$origin = df$Var2 +df = df %>% select(-Var2) +max_k_ij.simu = df %>% filter(origin == "k_ij.simu") %>% select(k_ij) %>% max() +max_k_ij.actual = df %>% filter(origin == "k_ij.actual") %>% select(k_ij) %>% max() + +ggplot(df, aes(x = k_ij, fill= origin )) + geom_density(bins = 100, alpha = 0.5) + + geom_vline(xintercept = max_k_ij.actual, col = "#F8766D" ) + + geom_vline(xintercept = (max_k_ij.simu), col= "#00BFC4" ) + + scale_x_log10() +``` + + +```{r} +#colnames(htrs) +##colnames(tabl_cnts) +#tabl_cnts.geneSimulate <- tabl_cnts %>% filter(rownames(.) %in% htrs$gene_id ) +#actual = tabl_cnts.geneSimulate$WT_control_rep1 +#simu = htrs$WT_control_rep1_1 +ggplot() + geom_point(aes(x=k_ij.actual, y = k_ij.simu), alpha=0.5) + geom_abline(intercept = 0, slope = 1) +#abline(a=0, b=1) +``` + + +## Re-estimate beta + +##### Design + +```{r} +sample = htrs %>% select(where(is.numeric)) %>% colnames() +genotype = htrs %>% + select(where(is.numeric)) %>% + colnames() %>% + map(., ~str_split(.,pattern = '_', simplify = T)[1]) %>% unlist() + +env = htrs %>% + select(where(is.numeric)) %>% + colnames() %>% + map(., ~str_split(.,pattern = '_', simplify = T)[2]) %>% unlist() + +designSimu = cbind(sample, env, genotype) %>% data.frame() +``` + + +##### Launch DESEQ + +```{r message=FALSE, warning=FALSE, include=TRUE, results="hide"} +## RESHAPE HTRS +htrs.reshape = htrs +rownames(htrs.reshape) = htrs.reshape$gene_id +htrs.reshape = htrs.reshape %>% select(-gene_id) +########### LAUNCH DESEQ ############# +## Design model - specify reference +designSimu$genotype <- factor(x = designSimu$genotype,levels = c('WT','Msn2D', 'Msn4D')) +designSimu$env <- factor(x = designSimu$env,levels = c('control', 'KCl')) + + +k_ij.simulate = htrs.reshape + +## DESEQ standard analysis +dds_simu = DESeq2::DESeqDataSetFromMatrix( countData = k_ij.simulate , colData = designSimu , design = ~ genotype + env + genotype:env) + +max(htrs.reshape) +.Machine$integer.max + +dds_simu <- DESeq2::DESeq(dds_simu) +``` + +DESEQ returns a dds object which contains many, many things ... <br> +In particular it contains the $\beta$ coefficients. <br> +<br> +You can access to beta coefficients using: + +```{r} +dds_simu.mcols = S4Vectors::mcols(dds_simu,use.names=TRUE) +#rownames(tabl_cnts) +``` + + +## Evaluation of beta inference + + +```{r} +## BETA INPUT +beta_input = estim_mu$beta.matrix %>% as.data.frame() +idx_nonNA = which(!is.na(beta_input$B0)) +beta_input = beta_input[idx_nonNA,] +beta_input$gene_id = input$gene_id +## BETA SIMU +B0 <- dds_simu.mcols$Intercept +B1 <- dds_simu.mcols$genotype_Msn2D_vs_WT +B2 <- dds_simu.mcols$genotype_Msn4D_vs_WT +B3 <- dds_simu.mcols$env_KCl_vs_control +B4 <- dds_simu.mcols$genotypeMsn2D.envKCl +B5 <- dds_simu.mcols$genotypeMsn4D.envKCl + + +beta.dtf = cbind(B0, B1,B2,B3,B4,B5) %>% as.data.frame() +beta.dtf$gene_id = input$gene_id +beta.dtf$origin = "inference" +beta_input$origin = "input" +dtf.merged = rbind(beta.dtf, beta_input) + +dtf.merged.long.tmp = dtf.merged %>% reshape2::melt(., value.name = "value", variable.name= "beta")#, variable.name = "origin")() +dtf.merged.long = dtf.merged.long.tmp %>% reshape2::dcast(., gene_id + beta ~ origin) + +ggplot(dtf.merged.long) + geom_point(aes(x=input, y = inference),alpha =0.1)+ geom_abline(intercept = 0, slope = 1) + facet_grid(~beta) +``` diff --git a/results/2022-06-24_subsampling.Rmd b/results/2022-06-24_subsampling.Rmd new file mode 100644 index 0000000..6a7b9e5 --- /dev/null +++ b/results/2022-06-24_subsampling.Rmd @@ -0,0 +1,303 @@ +--- +title: "Simulated counts subsampling" +output: html_document +date: '2022-06-08' +output: + html_document: + +css: + - css/template.css + +--- + +##### Required + +```{r message=FALSE, warning=FALSE, include=TRUE, results="hide"} +library(htrsim) +library(tidyverse) +library(reshape2) +``` + +## Worklow + +##### a. Input + + +```{r message=FALSE, warning=FALSE, include=TRUE, results="hide"} +## Import & reshape table counts +fn = system.file("extdata/", "public_tablCnts.tsv", package = "htrsim") +tabl_cnts <- read.table(file = fn, header = TRUE) +rownames(tabl_cnts) <- tabl_cnts$gene_id +tabl_cnts <- tabl_cnts %>% select(-gene_id)##suppr colonne GeneID +tabl_cnts <- tabl_cnts %>% select(-gene_name) ##suppr colonne GeneName + +## import design of bioProject +fn = system.file("extdata/", "public_bioDesign.csv", package = "htrsim") +bioDesign <- read.table(file = fn, header = T, sep = ';') +########### LAUNCH DESEQ ############# +## Design model - specify reference +bioDesign$genotype <- factor(x = bioDesign$genotype,levels = c('wt','msn2D', 'msn4D')) +bioDesign$env <- factor(x = bioDesign$env,levels = c('control', 'kcl')) +``` + +##### b. Launch DESEQ2 + +```{r message=FALSE, warning=FALSE, include=TRUE, results="hide"} +dds = run.deseq(tabl_cnts = tabl_cnts, bioDesign = bioDesign) +``` + +DESEQ returns a dds object which contains many, many things ... <br> +In particular it contains the $\beta$ coefficients. <br> +<br> +You can access to beta coefficients using: + +```{r} +dds.mcols = S4Vectors::mcols(dds,use.names=TRUE) +``` + +##### c. $\mu_{ij}$ + +Following our model, we can estimate $log_2(\mu_{ij]})$ from $\beta$ coefficients inferred by DESEQ2, + +$$ +log_2(\mu_{ij]}) = \beta_{G}*G + \beta_{E}*E + \beta_{G*E}*G.E + \beta_{0} + \epsilon_{ij} +$$ + +Then, $\mu_{ij]}$ can be estimate + +$$ +\mu_{ij} = s_j * 2^{log_2\mu_{ij]}} +$$ + +```{r message=FALSE, warning=FALSE, include=TRUE, results="hide"} +## Model matrix per samples +mm <- model.matrix(~genotype + env + genotype:env, bioDesign) + +## Input estimation +estim_mu = estim.mu(dds, mm, epsilon = TRUE) +mu.input = estim_mu$mu +``` + +##### d. $K_{ij}$ + +As defined by our model, counts $K_{ij}$ for gene i, sample j are modeled using a Negative Binomial distribution with fitted mean $\mu_{ij}$ and a gene-specific dispersion parameter $\alpha_i$. + +$$ +K_{ij} \sim {\sf NB}(\mu_{ij} ; \sigma_i) +$$ +The gene-specific dispersion parameter $\alpha_i$ is also stored in the dds object.<br> +You can access to $\alpha_i$ using: + +```{r message=FALSE, warning=FALSE, include=TRUE, results="hide"} +alpha.input = estim.alpha(dds) +``` + +Knowing $\alpha_i$ and $\mu_{ij]}$ for each gene and each condition given by the BioProject PRJNA675209b - chinese paper. +We are now able to simulate $K_{ij}$ for each gene and each condition given by the BioProject PRJNA675209b. + +```{r message=FALSE, warning=FALSE, include=TRUE, results="hide"} +# Setup simulation +input = reshape_input2setup(mu.dtf = mu.input, alpha.dtf = alpha.input, average_rep = FALSE) + +#input$gene_id +setup.simulation <- setup_countGener(bioSample_id = input$bioSample_id, + n_rep = 1, + alpha = input$alpha, + gene_id = input$gene_id, + mu = input$mu) + +#setup.simulation %>% dim() +# Simulate counts +htrs <- generate_counts(setup.simulation) + +``` + + +##### Evaluation after simulation + +```{r message=FALSE, warning=FALSE, include=TRUE} +k_ij.simu = htrs %>% select(-gene_id) %>% flatten() %>% unlist() +k_ij.actual = tabl_cnts %>% flatten() %>% unlist() + +df = cbind(k_ij.actual, k_ij.simu) %>% reshape2::melt(., value.name = "k_ij", variable.name = "origin") +df$origin = df$Var2 +df = df %>% select(-Var2) +max_k_ij.simu = df %>% filter(origin == "k_ij.simu") %>% select(k_ij) %>% max() +max_k_ij.actual = df %>% filter(origin == "k_ij.actual") %>% select(k_ij) %>% max() + +ggplot(df, aes(x = k_ij, fill= origin )) + geom_density(bins = 100, alpha = 0.5) + + geom_vline(xintercept = max_k_ij.actual, col = "#F8766D" ) + + geom_vline(xintercept = (max_k_ij.simu), col= "#00BFC4" ) + + scale_x_log10() +``` + + + +```{r} +#max(tabl_cnts) + +htrs.filter = htrs %>% column_to_rownames("gene_id") %>% filter_all(all_vars(is.numeric(.) & . < max(tabl_cnts))) +max(htrs.filter) +``` + + +##### Evaluation after simulation + +```{r message=FALSE, warning=FALSE, include=TRUE} +k_ij.simu = htrs.filter %>% flatten() %>% unlist() +k_ij.actual = tabl_cnts %>% filter(rownames(.) %in% rownames(htrs.filter)) %>% flatten() %>% unlist() + +df = cbind(k_ij.actual, k_ij.simu) %>% reshape2::melt(., value.name = "k_ij", variable.name = "origin") +df$origin = df$Var2 +df = df %>% select(-Var2) +max_k_ij.simu = df %>% filter(origin == "k_ij.simu") %>% select(k_ij) %>% max() +max_k_ij.actual = df %>% filter(origin == "k_ij.actual") %>% select(k_ij) %>% max() + +ggplot(df, aes(x = k_ij, fill= origin )) + geom_density(bins = 100, alpha = 0.5) + + geom_vline(xintercept = max_k_ij.actual, col = "#F8766D" ) + + geom_vline(xintercept = (max_k_ij.simu), col= "#00BFC4" ) + + scale_x_log10() +``` + + +```{r message=FALSE, warning=FALSE, include=TRUE} +k_ij.simu = htrs.filter %>% flatten() %>% unlist() +k_ij.actual = tabl_cnts %>% filter(rownames(.) %in% rownames(htrs.filter)) %>% flatten() %>% unlist() + +df = cbind(k_ij.actual, k_ij.simu) %>% data.frame() + +ggplot(df, aes(x = k_ij.actual, y = k_ij.simu )) + geom_point() + geom_abline(slope=1, intercept=0) +``` + + +##### Subsampling + +```{r message=FALSE, warning=FALSE, include=TRUE} + +# +sub_sampling <- function(kij.sample, N_reads){ + + kij.sample = log2(kij.sample ) %>% replace(!is.finite(.), 0) + + ecdf_maison = cumsum(kij.sample)/sum(kij.sample) + decumulativ_vector = c(0, ecdf_maison[-length(ecdf_maison)]) ## build decumulative vector + ## start with 0 and without the last element + ## reads_vec_cnsts = ceiling((ecdf_maison - decumulativ_vector) * 10*10^6) + reads_vec_cnts = round((ecdf_maison - decumulativ_vector) * N_reads) + return(reads_vec_cnts) +} + +htrs.sub = htrs %>% select_if(., is.numeric) %>% map_df(., ~sub_sampling(., 10^6)) +rownames(htrs.sub) = htrs$gene_id + +#( rowSums(htrs.sub) > 100 ) %>% table() +``` + +```{r} +k_ij.simu = htrs.sub %>% flatten() %>% unlist() +k_ij.actual = tabl_cnts %>% filter(rownames(.) %in% rownames(htrs.filter)) %>% flatten() %>% unlist() + +df = cbind(k_ij.actual, k_ij.simu) %>% reshape2::melt(., value.name = "k_ij", variable.name = "origin") +df$origin = df$Var2 +df = df %>% select(-Var2) +max_k_ij.simu = df %>% filter(origin == "k_ij.simu") %>% select(k_ij) %>% max() +max_k_ij.actual = df %>% filter(origin == "k_ij.actual") %>% select(k_ij) %>% max() + +ggplot(df, aes(x = k_ij, fill= origin )) + geom_density( alpha = 0.5) + + geom_vline(xintercept = max_k_ij.actual, col = "#F8766D" ) + + geom_vline(xintercept = (max_k_ij.simu), col= "#00BFC4" ) + + scale_x_log10() +``` + + +```{r message=FALSE, warning=FALSE, include=TRUE} +k_ij.simu = htrs.sub %>% flatten() %>% unlist() +k_ij.actual = tabl_cnts %>% filter(rownames(.) %in% rownames(htrs.filter)) %>% flatten() %>% unlist() + +df = cbind(k_ij.actual, k_ij.simu) %>% data.frame() + +ggplot(df, aes(x = k_ij.actual, y = k_ij.simu )) + geom_point() + geom_abline(slope=1, intercept=0) +``` + + + +## DESEQ2 on simulation + +```{r } +sample = htrs.sub %>% select(where(is.numeric)) %>% colnames() +genotype = htrs %>% + select(where(is.numeric)) %>% + colnames() %>% + map(., ~str_split(.,pattern = '_', simplify = T)[1]) %>% unlist() + +env = htrs %>% + select(where(is.numeric)) %>% + colnames() %>% + map(., ~str_split(.,pattern = '_', simplify = T)[2]) %>% unlist() + +designSimu = cbind(sample, env, genotype) %>% data.frame() + + +########### LAUNCH DESEQ ############# +## Design model - specify reference +designSimu$genotype <- factor(x = designSimu$genotype,levels = c('WT','Msn2D', 'Msn4D')) +designSimu$env <- factor(x = designSimu$env,levels = c('control', 'KCl')) + + +k_ij.simulation = htrs.sub + +## DESEQ standard analysis +dds_simu = DESeq2::DESeqDataSetFromMatrix( countData = k_ij.simulation , + colData = designSimu , + design = ~ genotype + env + genotype:env) + + + +dds_simu = run.deseq(tabl_cnts = k_ij.simulation, bioDesign = designSimu) + +``` + + +```{r} +dds_simu.mcols = S4Vectors::mcols(dds_simu,use.names=TRUE) + #rownames(tabl_cnts) +``` + + +## Evaluation of beta inference + + +```{r} +## BETA INPUT +beta_input = estim_mu$beta.matrix %>% as.data.frame() +idx_nonNA = which(!is.na(beta_input$B0)) +beta_input = beta_input[idx_nonNA,] +beta_input$gene_id = input$gene_id +## BETA SIMU +B0 <- dds_simu.mcols$Intercept +B1 <- dds_simu.mcols$genotype_Msn2D_vs_WT +B2 <- dds_simu.mcols$genotype_Msn4D_vs_WT +B3 <- dds_simu.mcols$env_KCl_vs_control +B4 <- dds_simu.mcols$genotypeMsn2D.envKCl +B5 <- dds_simu.mcols$genotypeMsn4D.envKCl + +plot(B0) +plot(beta_input$B0) + +beta.dtf = cbind(B0, B1,B2,B3,B4,B5) %>% as.data.frame() +beta.dtf$gene_id = input$gene_id +beta.dtf$origin = "inference" +beta_input$origin = "input" +dtf.merged = rbind(beta.dtf, beta_input) + +dtf.merged.long.tmp = dtf.merged %>% reshape2::melt(., value.name = "value", variable.name= "beta")#, variable.name = "origin")() +dtf.merged.long = dtf.merged.long.tmp %>% reshape2::dcast(., gene_id + beta ~ origin) + +ggplot(dtf.merged.long) + geom_point(aes(x=input, y = inference),alpha =0.1)+ geom_abline(intercept = 0, slope = 1) + facet_grid(~beta) +``` + + +```{r} + +plot(x = tabl_cnts$WT_control_rep1[idx_nonNA], y = htrs.sub$WT_control_rep1_1) + abline(1,0) +``` diff --git a/results/2022-06-28_kallistoEffect.Rmd b/results/2022-06-28_kallistoEffect.Rmd new file mode 100644 index 0000000..30a89c3 --- /dev/null +++ b/results/2022-06-28_kallistoEffect.Rmd @@ -0,0 +1,133 @@ +--- +title: "Kallisto Effect" +output: html_document +date: '2022-06-28' + +output: html_document +date: '2022-06-08' +output: + html_document: + +css: + - css/template.css + +--- + +##### Required + +```{r message=FALSE, warning=FALSE, include=TRUE, results="hide"} +library(htrsim) +library(tidyverse) +library(reshape2) +``` + + +## Worklow + +##### a. Input + + +```{r message=FALSE, warning=FALSE, include=TRUE, results="hide"} +## Import & reshape table counts +fn = system.file("extdata/", "PRJNA675209v2.tsv", package = "htrsim") +tabl_cnts <- read.table(file = fn, header = TRUE) +rownames(tabl_cnts) <- tabl_cnts$gene_id + +tabl_cnts <- tabl_cnts %>% select(-gene_id)##suppr colonne GeneID +#tabl_cnts <- tabl_cnts %>% select(-gene_name) ##suppr colonne GeneName + +## import design of bioProject +fn = system.file("extdata/", "public_bioDesign.csv", package = "htrsim") +bioDesign <- read.table(file = fn, header = T, sep = ';') +``` + +##### b. Launch DESEQ2 + +```{r message=FALSE, warning=FALSE, include=TRUE, results="hide"} +dds = run.deseq(tabl_cnts = tabl_cnts, bioDesign = bioDesign) +``` + +DESEQ returns a dds object which contains many, many things ... <br> +In particular it contains the $\beta$ coefficients. <br> +<br> +You can access to beta coefficients using: + +```{r} +dds.mcols = S4Vectors::mcols(dds,use.names=TRUE) +``` + +##### c. $\mu_{ij}$ + +Following our model, we can estimate $log_2(\mu_{ij]})$ from $\beta$ coefficients inferred by DESEQ2, + +$$ +log_2(\mu_{ij]}) = \beta_{G}*G + \beta_{E}*E + \beta_{G*E}*G.E + \beta_{0} + \epsilon_{ij} +$$ + +Then, $\mu_{ij]}$ can be estimate + +$$ +\mu_{ij} = s_j * 2^{log_2\mu_{ij]}} +$$ + +```{r message=FALSE, warning=FALSE, include=TRUE, results="hide"} +## Model matrix per samples +mm <- model.matrix(~genotype + env + genotype:env, bioDesign) + +## Input estimation +estim_mu = estim.mu(dds, mm, epsilon = TRUE) +mu.input = estim_mu$mu +``` + +##### d. $K_{ij}$ + +As defined by our model, counts $K_{ij}$ for gene i, sample j are modeled using a Negative Binomial distribution with fitted mean $\mu_{ij}$ and a gene-specific dispersion parameter $\alpha_i$. + +$$ +K_{ij} \sim {\sf NB}(\mu_{ij} ; \sigma_i) +$$ +The gene-specific dispersion parameter $\alpha_i$ is also stored in the dds object.<br> +You can access to $\alpha_i$ using: + +```{r message=FALSE, warning=FALSE, include=TRUE, results="hide"} +alpha.input = estim.alpha(dds) +``` + +Knowing $\alpha_i$ and $\mu_{ij]}$ for each gene and each condition given by the BioProject PRJNA675209b - chinese paper. +We are now able to simulate $K_{ij}$ for each gene and each condition given by the BioProject PRJNA675209b. + +```{r message=FALSE, warning=FALSE, include=TRUE, results="hide"} +# Setup simulation +input = reshape_input2setup(mu.dtf = mu.input, alpha.dtf = alpha.input, average_rep = FALSE) + +#input$gene_id +setup.simulation <- setup_countGener(bioSample_id = input$bioSample_id, + n_rep = 1, + alpha = input$alpha, + gene_id = input$gene_id, + mu = input$mu) + +#setup.simulation %>% dim() +# Simulate counts +htrs <- generate_counts(setup.simulation) + +``` + + +##### Evaluation after simulation + +```{r message=FALSE, warning=FALSE, include=TRUE} +k_ij.simu = htrs %>% select(-gene_id) %>% flatten() %>% unlist() +k_ij.actual = tabl_cnts %>% flatten() %>% unlist() + +df = cbind(k_ij.actual, k_ij.simu) %>% reshape2::melt(., value.name = "k_ij", variable.name = "origin") +df$origin = df$Var2 +df = df %>% select(-Var2) +max_k_ij.simu = df %>% filter(origin == "k_ij.simu") %>% select(k_ij) %>% max() +max_k_ij.actual = df %>% filter(origin == "k_ij.actual") %>% select(k_ij) %>% max() + +ggplot(df, aes(x = k_ij, fill= origin )) + geom_density(bins = 100, alpha = 0.5) + + geom_vline(xintercept = max_k_ij.actual, col = "#F8766D" ) + + geom_vline(xintercept = (max_k_ij.simu), col= "#00BFC4" ) + + scale_x_log10() +``` diff --git a/results/2022_06_08_investigation.Rmd b/results/2022_06_08_investigation.Rmd new file mode 100644 index 0000000..95e071e --- /dev/null +++ b/results/2022_06_08_investigation.Rmd @@ -0,0 +1,30 @@ +--- +title: "2022_06_08_investigation" +output: html_document +date: '2022-06-08' +output: + html_document: + +css: + - css/template.css + +--- + + + +##### Required + +```{r setup message=FALSE, warning=FALSE, include=TRUE, results="hide"} +library(htrsim) +library(tidyverse) +library(reshape2) +library(DESeq2) +``` + + +```{r} +BiocManager::install('tximport') +BiocManager::install('rhdf5') +library(rhdf5) +tximport("../data/abundance.h5", type = "kallisto", txOut = TRUE) +``` diff --git a/results/SVA.R b/results/SVA.R new file mode 100644 index 0000000..6004178 --- /dev/null +++ b/results/SVA.R @@ -0,0 +1,72 @@ +library(htrsim) +library(tidyverse) +library(reshape2) +library(sva) + +## Import & reshape table counts +fn = system.file("extdata/", "public_tablCnts.tsv", package = "htrsim") +tabl_cnts <- read.table(file = fn, header = TRUE) +rownames(tabl_cnts) <- tabl_cnts$gene_id +tabl_cnts <- tabl_cnts %>% select(-gene_id)##suppr colonne GeneID +tabl_cnts <- tabl_cnts %>% select(-gene_name) ##suppr colonne GeneName + +## import design of bioProject +fn = system.file("extdata/", "public_bioDesign.csv", package = "htrsim") +bioDesign <- read.table(file = fn, header = T, sep = ';') + + +dds = run.deseq(tabl_cnts = tabl_cnts, bioDesign = bioDesign) + + +########## SVA +dat <- DESeq2::counts(dds, normalized = TRUE) +idx <- rowMeans(dat) > 1 +dat <- dat[idx, ] +mod <- model.matrix(~genotype + env + genotype:env, bioDesign) +#mod <- model.matrix(~ dex, colData(dds)) +mod0 <- model.matrix(~ 1, bioDesign) +svseq <- svaseq(dat, mod, mod0, n.sv = 2) + + +plot(svseq$sv) + + +par(mfrow = c(2, 1), mar = c(3,5,3,1)) +for (i in 1:2) { + stripchart(svseq$sv[, i] ~ dds$genotype, vertical = TRUE, main = paste0("SV", i)) + abline(h = 0) +} + + +## PCA +data2PCA <- dat %>% data.frame() %>% + mutate(gene_id = rownames(.)) %>% remove_rownames() %>% + reshape2::melt(.,id = c('gene_id'), variable.name = "sample_id", value.name = "count_norm") %>% + dcast(., sample_id ~ gene_id) + +rownames(data2PCA) = data2PCA$sample_id +data2PCA = data2PCA %>% select(-sample_id) + +## PCA processed +pca.obj <- prcomp( data2PCA , scale. = FALSE) +summary(pca.obj) +plot(pca.obj) +## Annotation of the pca object +sample_id = data2PCA %>% rownames() +environment = data2PCA %>% rownames() %>% str_split(pattern = "_", simplify = TRUE)%>% .[,2] +genotype = data2PCA %>% rownames() %>% str_split(pattern = "_", simplify = TRUE)%>% .[,1] + +dtp <- data.frame( 'sample_id' = sample_id , + 'genotype' = genotype, + 'environment' = environment, + pca.obj$x[,1:2]) + +## Plot +P <- ggplot(data = dtp) + + geom_point(aes(x = PC1, y = PC2, + col = genotype, ### Possible choice: period, compartments, day, lumen_mucus, ... + shape = environment, ### Possible choice: period, compartments, day, lumen_mucus, ... + text = paste("ID:", sample_id))) + + theme_minimal() +ggplotly(P, tooltip = c("text")) +P diff --git a/results/SVA.Rmd b/results/SVA.Rmd new file mode 100644 index 0000000..21775c1 --- /dev/null +++ b/results/SVA.Rmd @@ -0,0 +1,180 @@ +--- +title: "HTRSIM" +date: '2022-04-21' +output: + html_document: + + +css: + - css/template.css + - css/footer.css + +--- + + +```{r setup, include=FALSE} +library(htrsim) +library(tidyverse) +library(reshape2) +library(sva) +library(kableExtra) +library(gridExtra) +``` + +## SVA + +```{r} + +## Import & reshape table counts +fn = system.file("extdata/", "public_tablCnts.tsv", package = "htrsim") +tabl_cnts <- read.table(file = fn, header = TRUE) +rownames(tabl_cnts) <- tabl_cnts$gene_id +tabl_cnts <- tabl_cnts %>% select(-gene_id)##suppr colonne GeneID +tabl_cnts <- tabl_cnts %>% select(-gene_name) ##suppr colonne GeneName + +## import design of bioProject +fn = system.file("extdata/", "public_bioDesign.csv", package = "htrsim") +bioDesign <- read.table(file = fn, header = T, sep = ';') + + +dds = run.deseq(tabl_cnts = tabl_cnts, bioDesign = bioDesign) +dds = S4Vectors::mcols(dds,use.names=TRUE) +deviance_i = dds$deviance + +testing_tmp2 = deviance_i %>% data.frame() +testing_tmp2$n = "0" + +``` + + + + +## Take into account SVA + + +```{r} + +########## SVA +dds = run.deseq(tabl_cnts = tabl_cnts, bioDesign = bioDesign) + +dat <- DESeq2::counts(dds, normalized = TRUE) +idx <- rowMeans(dat) > 1 +dat <- dat[idx, ] +mod <- model.matrix(~genotype + env + genotype:env, bioDesign) +#mod <- model.matrix(~ dex, colData(dds)) +mod0 <- model.matrix(~ 1, bioDesign) + +## 1 SVA #### +svseq <- svaseq(dat, mod, mod0, n.sv = 1) +ddsva <- dds +ddsva$SV1 <- svseq$sv[,1] +DESeq2::design(ddsva) <- ~ SV1 + genotype + env + genotype:env +ddsva <- DESeq2::DESeq(ddsva) + +ddsva.mcols = S4Vectors::mcols(ddsva,use.names=TRUE) +deviance_i = ddsva.mcols$deviance + +testing_tmp = deviance_i %>% data.frame() +testing_tmp$n = "1" +testing <- rbind(testing_tmp, testing_tmp2) + +### 2 SVA #### +svseq <- svaseq(dat, mod, mod0, n.sv = 2) +ddsva <- dds +ddsva$SV1 <- svseq$sv[,1] +ddsva$SV2 <- svseq$sv[,2] +DESeq2::design(ddsva) <- ~ SV1 + SV2 + genotype + env + genotype:env +ddsva <- DESeq2::DESeq(ddsva) + +ddsva.mcols = S4Vectors::mcols(ddsva,use.names=TRUE) +deviance_i = ddsva.mcols$deviance + +testing_tmp2 = deviance_i %>% data.frame() +testing_tmp2$n = "2" +testing <- rbind(testing, testing_tmp2) +## 18 SVA ### +svseq <- svaseq(dat, mod, mod0, n.sv = 4) +ddsva <- dds +ddsva$SV1 <- svseq$sv[,1] +ddsva$SV2 <- svseq$sv[,2] +ddsva$SV3 <- svseq$sv[,3] +ddsva$SV4 <- svseq$sv[,4] + +DESeq2::design(ddsva) <- ~ SV1 + SV2 + SV3 + SV4 + genotype + env + genotype:env +ddsva <- DESeq2::DESeq(ddsva) + +ddsva.mcols = S4Vectors::mcols(ddsva,use.names=TRUE) +deviance_i = ddsva.mcols$deviance + +testing_tmp = deviance_i %>% data.frame() +testing_tmp$n = "4" +testing <- rbind(testing, testing_tmp) +testing<- testing %>% rename(. , 'deviance_i' = .) +#DESeq2::results(ddsva) + + + +``` + +```{r} +# Histogram logarithmic y axis +ggplot(data.frame(testing), aes(deviance_i)) + + geom_histogram(bins = 100) + facet_grid(~n, scales = "free") #+ scale_x_log10() +``` + +## PCA + +```{r echo = FALSE,echo = FALSE, message=FALSE, warning=FALSE, include=TRUE, fig.align='center', fig.width = 6, fig.cap="Table: PCA results"} +data2PCA <- dat %>% data.frame() %>% + mutate(gene_id = rownames(.)) %>% remove_rownames() %>% + reshape2::melt(.,id = c('gene_id'), variable.name = "sample_id", value.name = "count_norm") %>% + dcast(., sample_id ~ gene_id) + +rownames(data2PCA) = data2PCA$sample_id +data2PCA = data2PCA %>% select(-sample_id) + +## PCA processed +pca.obj <- prcomp( data2PCA , scale. = FALSE) + +res = summary(pca.obj) + +res$importance[,1:3] %>% + kbl(., caption = "Table: Variance explained per Principal Component", position = "bottom", align = 'c') %>% + kable_styling(full_width = F) + +``` + + +```{r fig.align='center', fig.width = 10} +## Annotation of the pca object +sample_id = data2PCA %>% rownames() +environment = data2PCA %>% rownames() %>% str_split(pattern = "_", simplify = TRUE)%>% .[,2] +genotype = data2PCA %>% rownames() %>% str_split(pattern = "_", simplify = TRUE)%>% .[,1] + +dtp <- data.frame( 'sample_id' = sample_id , + 'genotype' = genotype, + 'environment' = environment, + pca.obj$x[,1:3]) + +## Plot +P1 <- ggplot(data = dtp) + + geom_point(aes(x = PC1, y = PC2, + col = genotype, ### Possible choice: period, compartments, day, lumen_mucus, ... + shape = environment, ### Possible choice: period, compartments, day, lumen_mucus, ... + text = paste("ID:", sample_id)), size = 5) + + theme_minimal() +#ggplotly(P, tooltip = c("text")) +#P + +P2 <- ggplot(data = dtp) + + geom_point(aes(x = PC2, y = PC3, + col = genotype, ### Possible choice: period, compartments, day, lumen_mucus, ... + shape = environment, ### Possible choice: period, compartments, day, lumen_mucus, ... + text = paste("ID:", sample_id)), size = 5) + + theme_minimal() +#ggplotly(P, tooltip = c("text")) +#P + +grid.arrange(P1, P2, ncol = 2) + +``` diff --git a/results/css/footer.css b/results/css/footer.css new file mode 100644 index 0000000..d4bc7f0 --- /dev/null +++ b/results/css/footer.css @@ -0,0 +1,29 @@ +/*------------FOOTER----------*/ + +/* Divider line above footer */ +.footer hr{ + width: 100%; +} + +.footer { + font-size: 16px; + color: #808080; + text-align: center; + width: 90%; + margin: 3rem auto; + font-weight: 300; +} + +.footer.logo { + width: 25px; + margin: 0px !important; +} + +.rstudio4edu-footer { + font-size: 12px; + text-transform: uppercase; +} + +.tocify-extend-page { + height: 0px !important; /* Gets rid of extra space after footer*/ +} diff --git a/results/css/template.css b/results/css/template.css new file mode 100644 index 0000000..d92e7e6 --- /dev/null +++ b/results/css/template.css @@ -0,0 +1,84 @@ + /*------------- Whole Document---------------- */ + +body { + font-family: 'Muli'; + font-size: 19px; + + +} + +title { + font-size: 40px; + margin-top: 200px; + text-align: center; + +} + + + + +h1, .h1 { + font-size: 40px; + margin-top: 84px; + text-align: center; + color: #0A3B95; + padding: 10px; + display: grid; + /*background: #0A3B95 ;*/ + background: #E7EEF1; +} + + +h2, .h2{ + font-size: 30px; + /*color: #fff;*/ + color: #0A3B95; + padding: 4px; + /*background: #0A3B95 ;*/ + background: #E7EEF1; + + text-align: center; + margin-bottom: 0.75em; + margin-top: 30px; + +} + + +h3, .h3 { + font-size: 30px; + text-align: center; + + +} + +h4, .h4 { + font-style: italic; + font-size: 20px; + text-align: center; + + +} + +h5, .h5{ + /*color:#8A7B60 ;*/ + text-indent: 5em; + font-weight: bold; + font-size: 20px; + margin-top: 20px; + margin-bottom: 0.75em; + text-decoration: underline; + +} + +/* FIG */ + +.caption { + font-size: 0.9em; + font-style: italic; + color: grey; + margin-right: 10%; + margin-left: 10%; + text-align: center; +} + + diff --git a/results/lab-meeting_preparation.Rmd b/results/lab-meeting_preparation.Rmd new file mode 100644 index 0000000..8d8498e --- /dev/null +++ b/results/lab-meeting_preparation.Rmd @@ -0,0 +1,61 @@ +--- +title: "lab-meeting" +output: + html_document +--- + + +```{r} +library(tidyverse) + + +x=runif(100,0,100) +epsilon.simu = rnorm(100,0,16) +y = 150 + 4*x + epsilon.simu +mysimu1 = cbind(x, y) %>% data.frame() +mysimu1$epsilon = "eps ~ N(mean = 0, sd = 16)" + +ggplot(mysimu1) + geom_point(aes(x=x, y=y),col= "#00BFC4") + geom_abline(intercept = 150, slope = 4, col= "#F8766D") + ylim(100, 500) + xlim(0, 100) + +ggplot(mysimu1) + geom_abline(intercept = 150, slope = 4, col= "#F8766D") + ylim(100, 500) + xlim(0, 100) + +epsilon.simu= epsilon.simu %>% data.frame() %>% mutate(., epsilon = .) %>% select(-.) +ggplot(epsilon.simu) + geom_density(aes(x=epsilon)) + +``` + +```{r message=FALSE, warning=FALSE, include=TRUE, results="hide"} + +x=runif(100,0,100) +epsilon.simu = rnorm(1000,0,160) +y = 150 + 4*x + epsilon.simu +mysimu2 = cbind(x, y) %>% data.frame() +mysimu2$epsilon = "eps ~ N(mean = 0, sd = 160)" +ggplot(mysimu2) + geom_point(aes(x=x, y=y), col= "#00BFC4") + geom_abline(intercept = 150, slope = 4, col= "#F8766D") + ylim(100, 500) + xlim(0, 100) +epsilon.simu= epsilon.simu %>% data.frame() %>% mutate(., epsilon = .) %>% select(-.) +ggplot(epsilon.simu) + geom_density(aes(x=epsilon)) + +``` + +```{r message=FALSE, warning=FALSE, include=TRUE, results="hide"} +dtf.merged <- rbind(mysimu1, mysimu2) + +ggplot(dtf.merged) + geom_point(aes(x=x, y=y), col= "#00BFC4") + geom_abline(intercept = 150, slope = 4, col= "#F8766D") + ylim(100, 500) + xlim(0, 100) + facet_grid(~epsilon) + +``` + +```{r} +dtf.merged <- rbind(mysimu1, mysimu2) + +ggplot(dtf.merged) + geom_point(aes(x=x, y=y), col= "#00BFC4") + ylim(100, 500) + xlim(0, 100) + facet_grid(~epsilon) +``` + +```{r} +x=runif(10,40,70) +epsilon.simu = rnorm(10,0,100) +y = 150 + 4*x + epsilon.simu +mysimu1 = cbind(x, y) %>% data.frame() +mysimu1$epsilon = "eps ~ N(mean = 0, sd = 16)" + +ggplot(mysimu1) + geom_point(aes(x=x, y=y),col= "#00BFC4") + geom_abline(intercept = 150, slope = 4, col= "#F8766D") +``` -- GitLab