From 58cd580eb83cc5004822214c714916be5abe8793 Mon Sep 17 00:00:00 2001 From: aduvermy <arnaud.duvermy@ens-lyon.fr> Date: Wed, 29 Jun 2022 09:44:41 +0200 Subject: [PATCH] update git --- results/report_22-04-2022.Rmd | 294 ------------------ results/template.css | 29 -- src/htrsim/R/input_estimation.R | 22 +- src/htrsim/R/launch_deseq.R | 12 +- src/htrsim/inst/extdata/public_bioDesign.csv | 13 - src/htrsim/man/estim.mu.Rd | 4 +- src/htrsim/man/handle_except.Rd | 8 +- src/htrsim/man/reshape_input2setup.Rd | 4 +- src/htrsim/man/setup_countGener.Rd | 5 +- .../comment-utiliser-mon-package.Rmd | 105 +++++-- 10 files changed, 115 insertions(+), 381 deletions(-) delete mode 100644 results/report_22-04-2022.Rmd delete mode 100644 results/template.css delete mode 100644 src/htrsim/inst/extdata/public_bioDesign.csv diff --git a/results/report_22-04-2022.Rmd b/results/report_22-04-2022.Rmd deleted file mode 100644 index 6f5ce4d..0000000 --- a/results/report_22-04-2022.Rmd +++ /dev/null @@ -1,294 +0,0 @@ ---- -title: "Report_2022-04-21" -output: html_document -date: '2022-04-21' -css: template.css ---- - - -## Introduction - - -In living world, phenotypes are understanding as a mixture between a genotype effect, an environment effect and an interaction between G&E. -$$Phenotype = Genotype + Environment + Genotype.Environment$$ -The quantification of each strengths (G,E; G&E) can be estimate by a coefficient $\beta$. -Then, our expression becomes: -$$Phenotype = \beta_{G} * Genotype + \beta_{E}*Environment + \beta_{G*E} * Genotype.Environment + \epsilon$$ -Notice that $\beta$ is specific of each component. Furthermore, we introduced above $\epsilon$. It's the residual of the model. $\epsilon$ can be seen as the difference between observed values and values predicted by the model. - -Genes expression can be also considered as a phenotype. <br> -According to this, the quantification of $\beta_{G}$, $\beta_{E}$ and $\beta_{G*E}$ for a given gene in a given condition may open the possibility to assess differences between the strengths in presence in different conditions. - -That's the purpose of Htrsim ! - -## Htrsim - -<u> Model </u> - -In this aim, Htrsim is based on a model. <br> -Because of is easy of use this model is managed by DESEQ2. -Then, $K_{ij}$ for gene i, sample j are modeled using a Negative Binomial distribution with fitted mean $\mu_{ij}$ and a gene-specific dispersion parameter $\alpha_i$. - -$$ -K_{ij} \sim {\sf NB}(\mu_{ij} ; \sigma_i) -$$ -$$ -\mu_{ij} = s_jq_{ij} -$$ -$$ -log_2(q_{ij}) = x_j*\beta_i -$$ -The fitted mean is composed of a sample-specific size factor $s_j$ and a parameter qij proportional to the expected true concentration of fragments for sample j. -The coefficients $\beta_i$ give the log2 fold changes for gene i for each column of the model matrix X. The sample-specific size factors can be replaced by gene-specific normalization factors for each sample using normalizationFactors. - -According to the DESEQ2 GLM and our purpose, we can write: -$$ -log_2(\mu_{ij]}) = \beta_{G}*G + \beta_{E}*E + \beta_{G*E}*G.E + \beta_{0} + \epsilon_{ij} -$$ - -According to this generalized linear model, we wish to estimate $\beta_{G}$, $\beta_{E}$ and $\beta_{G*E}$ for a given gene i, in a given condition j. Achieve this, would allow us to quantify each strengths (G, E, G&E) for a given gene i, in a given condition j. - - -<u> Required </u> - -```{r message=FALSE, warning=FALSE, include=TRUE, results="hide"} -library(htrsim) -library(tidyverse) -library(reshape2) -``` - -<u> Worklow </u> - -Using public libraries (from BioProject PRJNA675209b - chinese paper), and an usual RNA-seq pipeline, we build actual RNA-seq counts per genes for 3 genotypes and 2 environments.<br> -<br> -Using htrsim (in particular DESEQ2) and this count table, we are able to estimate $\beta_{G}$, $\beta_{E}$ and $\beta_{G*E}$. - - -a. Input - - -```{r message=FALSE, warning=FALSE, include=TRUE, results="hide"} -## Import & reshape table counts -fn = system.file("extdata/", "public_tablCnts.tsv", package = "htrsim") -tabl_cnts <- read.table(file = fn, header = TRUE) -rownames(tabl_cnts) <- tabl_cnts$gene_id -tabl_cnts <- tabl_cnts %>% select(-gene_id)##suppr colonne GeneID -tabl_cnts <- tabl_cnts %>% select(-gene_name) ##suppr colonne GeneName - -## import design of bioProject -fn = system.file("extdata/", "public_bioDesign.csv", package = "htrsim") -bioDesign <- read.table(file = fn, header = T, sep = ';') - -``` - -b. Launch DESEQ2 - -```{r message=FALSE, warning=FALSE, include=TRUE, results="hide"} -dds = run.deseq(tabl_cnts = tabl_cnts, bioDesign = bioDesign) -``` - - -DESEQ returns a dds object which contains many, many things ... <br> -In particular it contains the $\beta$ coefficients. <br> -<br> -You can access to beta coefficients using: - -```{r} -dds.mcols = S4Vectors::mcols(dds,use.names=TRUE) -``` - -c. $\mu_{ij}$ - -Following our model, we can estimate $log_2(\mu_{ij]})$ from $\beta$ coefficients inferred by DESEQ2, - -$$ -log_2(\mu_{ij]}) = \beta_{G}*G + \beta_{E}*E + \beta_{G*E}*G.E + \beta_{0} + \epsilon_{ij} -$$ - -Then, $\mu_{ij]}$ can be estimate - -$$ -\mu_{ij} = s_j * 2^{log_2\mu_{ij]}} -$$ -```{r message=FALSE, warning=FALSE, include=TRUE, results="hide"} -## Model matrix per samples -mm <- model.matrix(~genotype + env + genotype:env, bioDesign) - -## Input estimation -estim_mu = estim.mu(dds, mm) -mu.input = estim_mu$mu -``` - -d. $K_{ij}$ - -As defined by our model, counts $K_{ij}$ for gene i, sample j are modeled using a Negative Binomial distribution with fitted mean $\mu_{ij}$ and a gene-specific dispersion parameter $\alpha_i$. - -$$ -K_{ij} \sim {\sf NB}(\mu_{ij} ; \sigma_i) -$$ -The gene-specific dispersion parameter $\alpha_i$ is also stored in the dds object.<br> -You can access to $\alpha_i$ using: -```{r message=FALSE, warning=FALSE, include=TRUE, results="hide"} -alpha.input = estim.alpha(dds) -``` - -Knowing $\alpha_i$ and $\mu_{ij]}$ for each gene and each condition given by the BioProject PRJNA675209b - chinese paper. -We are now able to simulate $K_{ij}$ for each gene and each condition given by the BioProject PRJNA675209b. - -```{r message=FALSE, warning=FALSE, include=TRUE, results="hide"} -# Setup simulation -input = reshape_input2setup(mu.dtf = mu.input, alpha.dtf = alpha.input, average_rep = FALSE) - -#input$gene_id -setup.simulation <- setup_countGener(bioSample_id = input$bioSample_id, - n_rep = 1, - alpha = input$alpha, - gene_id = input$gene_id, - mu = input$mu) - -#setup.simulation %>% dim() -# Simulate counts -htrs <- generate_counts(setup.simulation) - -``` - - -```{r message=FALSE, warning=FALSE, include=TRUE} -k_ij.simu = htrs %>% select(-gene_id) %>% flatten() %>% unlist() -k_ij.actual = tabl_cnts %>% flatten() %>% unlist() - -df = cbind(k_ij.actual, k_ij.simu) %>% reshape2::melt(., value.name = "k_ij", variable.name = "origin") -df$origin = df$Var2 -df = df %>% select(-Var2) -max_k_ij.simu = df %>% filter(origin == "k_ij.simu") %>% select(k_ij) %>% max() -max_k_ij.actual = df %>% filter(origin == "k_ij.actual") %>% select(k_ij) %>% max() - -ggplot(df, aes(x = k_ij, fill= origin )) + geom_density(bins = 100, alpha = 0.5) + - geom_vline(xintercept = max_k_ij.actual, col = "#F8766D" ) + - geom_vline(xintercept = (max_k_ij.simu), col= "#00BFC4" ) + - scale_x_log10() -``` - -Comment: $K_{ij}$ simulated are abnormally huge ! -Comment: $K_{ij}$ simulated are slightly different from the actual K_{ij} ! - - -## Why so much differences - -b. $\epsilon$ - -In our model, we define as follow: -$$ -\epsilon_{ij} \sim {\sf N}(0 ; deviance_i) -$$ - -Let's see the distribution of $deviance_{i}$. - - -```{r warning=FALSE} -#estim_mu$beta.matrix - -deviance_i = estim_mu$deviance.sqrt[!is.na(estim_mu$deviance.sqrt)]^2 -#epsilon_ij <- mm[,1] %>% map(., ~rnorm(deviance_i.sqrt, mean = 0, sd = deviance_i.sqrt )) %>% data.frame() %>% flatten() %>% unlist() - - -# Histogram logarithmic y axis -ggplot(data.frame(deviance_i), aes(deviance_i)) + - geom_histogram(bins = 100) #+ scale_x_log10() - - -``` - - -The deviance is also inferred by DESEQ while computing its model. -deviance is mostly inferred between 100 and 200. - -$$ -log_2(\mu_{ij]}) = \beta_{G}*G + \beta_{E}*E + \beta_{G*E}*G.E + \beta_{0} + \epsilon_{ij} -$$ - -```{r warning=FALSE} -#estim_mu$beta.matrix - -deviance_i.sqrt = estim_mu$deviance.sqrt[!is.na(estim_mu$deviance.sqrt)] -epsilon_ij <- mm[,1] %>% map(., ~rnorm(deviance_i.sqrt, mean = 0, sd = 0 )) %>% data.frame() %>% flatten() %>% unlist() -#epsilon_ij <- mm[,1] %>% map(., ~rnorm(deviance_i.sqrt, mean = 0, sd = 0 )) %>% data.frame() %>% flatten() %>% unlist() - -# Histogram logarithmic y axis -ggplot(data.frame(epsilon_ij), aes(epsilon_ij)) + - geom_histogram(bins = 100) #+ scale_x_log10() - - -``` - - -Comment: Some $\epsilon_{ij}$ are huge ! -Recall: $\epsilon$ can be seen as the difference between observed values and values predicted by the model. - -A large panel of $\epsilon$ mean that the model doesn't fit well with the observed data. - -It means that even if $\beta$ coefficients are well estimate. $\log_2(\mu_{ij]})$ will vary around them with a large panel of values (+/- 40) - - - - -```{r warning=FALSE} -beta.dtf = estim_mu$beta.matrix %>% data.frame() -beta.dtf.long = beta.dtf %>% reshape2::melt(., value.name = "beta", variable.name = "origin") - -ggplot(beta.dtf.long, aes(x = beta )) + geom_density(bins = 100, alpha = 0.5, fill = 'grey') + facet_grid(~origin, scales = "free_x") - -``` -```{r warning=FALSE} -beta.dtf = estim_mu$beta.matrix %>% data.frame() -beta.dtf.long = beta.dtf %>% reshape2::melt(., value.name = "beta", variable.name = "origin") - - - -B0 = estim_mu$dds.mcols$SE_Intercept -B1 = estim_mu$dds.mcols$SE_genotype_msn2D_vs_wt -B2 <- estim_mu$dds.mcols$SE_genotype_msn4D_vs_wt -B3 <- estim_mu$dds.mcols$SE_env_kcl_vs_control -B4 <- estim_mu$dds.mcols$SE_genotypemsn2D.envkcl -B5 <- estim_mu$dds.mcols$SE_genotypemsn4D.envkcl - - -SE_B.dtf <- cbind(B0, B1, B2, B3, B4, B5) %>% data.frame() -SE_B.dtf.long = SE_B.dtf %>% reshape2::melt(., value.name = "SE_beta", variable.name = "origin") - -ggplot(SE_B.dtf.long, aes(x = SE_beta, fill= origin )) + geom_density(bins = 100, alpha = 0.5) + facet_grid(~origin) -``` - -```{r warning=FALSE} -bind_dtf<- cbind(SE_B.dtf.long, beta.dtf.long %>% select(-origin)) -ggplot(bind_dtf, aes(x = beta, y= SE_beta, fill= origin )) + geom_point(alpha = 0.1) + facet_grid(~origin) - - -#new <- bind_dtf %>% mutate(annot = ifelse(origin == "B4 | B5" && SE_beta > 6 , TRUE, FALSE )) -#new <- bind_dtf %>% tail -#new %>% filter(beta == "B4") -``` - - -```{r warning=FALSE} -dim(htrs) - -new <- bind_dtf %>% mutate(annot = ifelse(((origin == "B4") | (origin == "B5")) & (SE_beta > 6) , TRUE, FALSE )) -### WARNING -new %>% dcast(., annot ~ origin) - - -SE_threshold = 6 -SE_B.dtf.annot = SE_B.dtf %>% mutate(annot = ifelse((B4 > SE_threshold) | (B5 > SE_threshold) , TRUE, FALSE )) -SE_B.dtf.annot %>% group_by(annot) %>% tally() -SE_B.dtf.annot.long = SE_B.dtf.annot %>% reshape2::melt(., value.name = "SE_beta", variable.name = "origin") - - -bind_dtf.annot<- cbind(SE_B.dtf.annot.long, beta.dtf.long %>% select(-origin)) -bind_dtf.annot = bind_dtf.annot %>% filter(!is.na(annot)) -ggplot(bind_dtf.annot, aes(x = beta, y= SE_beta, col = annot )) + geom_point(alpha = 0.1) + facet_grid(~origin) - - -``` - - diff --git a/results/template.css b/results/template.css deleted file mode 100644 index 0649808..0000000 --- a/results/template.css +++ /dev/null @@ -1,29 +0,0 @@ - -title { - margin-top: 200px; - - text-align: center; -} - - - - -h1, .h1 { - margin-top: 84px; - - text-align: center; -} - - -h3, .h3, h4 { - text-align: center; -} - -.caption { - font-size: 0.9em; - font-style: italic; - color: grey; - margin-right: 10%; - margin-left: 10%; - text-align: center; -} diff --git a/src/htrsim/R/input_estimation.R b/src/htrsim/R/input_estimation.R index b601c5e..cfbe88b 100644 --- a/src/htrsim/R/input_estimation.R +++ b/src/htrsim/R/input_estimation.R @@ -57,7 +57,7 @@ estim.alpha <- function(dds, export = FALSE){ #' @import S4Vectors #' @examples #' @importFrom rlang .data -estim.mu <- function(dds, mm, export = FALSE){ +estim.mu <- function(dds, mm, epsilon = TRUE, export = FALSE){ gene_id <- NULL @@ -90,14 +90,22 @@ estim.mu <- function(dds, mm, export = FALSE){ beta.matrix = cbind(B0, B1,B2,B3,B4,B5) %>% as.matrix() #p_ij = B0_i*mm1_j + B1_i*mm2_j + B3_i*mm3_j + B4_i*mm4_j + B5_i*mm5_j p_ij = beta.matrix %*% t(mm) - #epsilon_ij ~ N(0, deviance) - #epsilon_ij = mm[,1] %>% map(., ~rnorm(deviance_i.sqrt, mean = 0, sd = 0 )) %>% data.frame() %>% as.matrix() - epsilon_ij = mm[,1] %>% map(., ~rnorm(deviance_i.sqrt, mean = 0, sd = deviance_i.sqrt )) %>% data.frame() %>% as.matrix() - ## log_qij = p_ij + epsilon_ij - log_qij <- p_ij + epsilon_ij + + + if (epsilon == TRUE){ + message("Epsilon : TRUE") + #epsilon_ij ~ N(0, deviance) + epsilon_ij = mm[,1] %>% map(., ~rnorm(deviance_i.sqrt, mean = 0, sd = deviance_i.sqrt )) %>% data.frame() %>% as.matrix() + ## log_qij = p_ij + epsilon_ij + log_qij <- p_ij + epsilon_ij + } + else { + message("Epsilon : FALSE") + log_qij <- p_ij + } + ## s_j s_j = dds$sizeFactor - mu_ij = s_j * 2^log_qij diff --git a/src/htrsim/R/launch_deseq.R b/src/htrsim/R/launch_deseq.R index d500f78..1f5f266 100644 --- a/src/htrsim/R/launch_deseq.R +++ b/src/htrsim/R/launch_deseq.R @@ -9,13 +9,11 @@ #' #' @examples run.deseq <- function(tabl_cnts, bioDesign ){ - ########### LAUNCH DESEQ ############# - ## Design model - specify reference - bioDesign$genotype <- factor(x = bioDesign$genotype,levels = c('wt','msn2D', 'msn4D')) - bioDesign$env <- factor(x = bioDesign$env,levels = c('control', 'kcl')) - ## DESEQ standard analysis - dds = DESeq2::DESeqDataSetFromMatrix( countData = round(tabl_cnts), colData = bioDesign , design = ~ genotype + env + genotype:env) + dds = DESeq2::DESeqDataSetFromMatrix( countData = round(tabl_cnts), colData = bioDesign , design = ~ genotype + env + genotype:env ) + + dds <- DESeq2::DESeq(dds) return(dds) -} + + } diff --git a/src/htrsim/inst/extdata/public_bioDesign.csv b/src/htrsim/inst/extdata/public_bioDesign.csv deleted file mode 100644 index 9c0ad93..0000000 --- a/src/htrsim/inst/extdata/public_bioDesign.csv +++ /dev/null @@ -1,13 +0,0 @@ -sample;env;genotype -WT_control_rep1;control;wt -WT_control_rep2;control;wt -WT_KCl_rep1;kcl;wt -WT_KCl_rep2;kcl;wt -Msn4D_control_rep1;control;msn4D -Msn4D_control_rep2;control;msn4D -Msn4D_KCl_rep1;kcl;msn4D -Msn4D_KCl_rep2;kcl;msn4D -Msn2D_control_rep1;control;msn2D -Msn2D_control_rep2;control;msn2D -Msn2D_KCl_rep1;kcl;msn2D -Msn2D_KCl_rep2;kcl;msn2D diff --git a/src/htrsim/man/estim.mu.Rd b/src/htrsim/man/estim.mu.Rd index 55ffd21..0a203bd 100644 --- a/src/htrsim/man/estim.mu.Rd +++ b/src/htrsim/man/estim.mu.Rd @@ -4,11 +4,13 @@ \alias{estim.mu} \title{Estimate mu_ij} \usage{ -estim.mu(dds, export = FALSE) +estim.mu(dds, mm, epsilon = TRUE, export = FALSE) } \arguments{ \item{dds}{DESEQ2 object} +\item{mm}{a model matrix} + \item{export}{Boolean} } \value{ diff --git a/src/htrsim/man/handle_except.Rd b/src/htrsim/man/handle_except.Rd index 563ca47..083d5be 100644 --- a/src/htrsim/man/handle_except.Rd +++ b/src/htrsim/man/handle_except.Rd @@ -4,18 +4,16 @@ \alias{handle_except} \title{Handle exception} \usage{ -handle_except(bioSample, n_rep, gene_id, alpha, n_genes) +handle_except(bioSample, n_rep, gene_id, alpha) } \arguments{ +\item{bioSample}{vector of id for each bioSample} + \item{n_rep}{number of replicates} \item{gene_id}{vector of id for each gene} \item{alpha}{vector of alpha_i} - -\item{n_genes}{number of genes} - -\item{bioSample_id}{vector of id for each bioSample} } \value{ diff --git a/src/htrsim/man/reshape_input2setup.Rd b/src/htrsim/man/reshape_input2setup.Rd index eb85d5a..27551e3 100644 --- a/src/htrsim/man/reshape_input2setup.Rd +++ b/src/htrsim/man/reshape_input2setup.Rd @@ -4,12 +4,14 @@ \alias{reshape_input2setup} \title{Reshape input before building setup} \usage{ -reshape_input2setup(mu.dtf, alpha.dtf) +reshape_input2setup(mu.dtf, alpha.dtf, average_rep = FALSE) } \arguments{ \item{mu.dtf}{dataframe of mu_ij} \item{alpha.dtf}{dataframe of alpha_i} + +\item{average_rep}{bool} } \value{ diff --git a/src/htrsim/man/setup_countGener.Rd b/src/htrsim/man/setup_countGener.Rd index c9b1556..4d6b277 100644 --- a/src/htrsim/man/setup_countGener.Rd +++ b/src/htrsim/man/setup_countGener.Rd @@ -5,11 +5,10 @@ \title{Build setup for counts generator} \usage{ setup_countGener( - bioSample_id = "my_first_lib", + bioSample_id = NULL, n_rep = NULL, gene_id = NULL, alpha = NULL, - n_genes = NULL, mu = NULL ) } @@ -22,8 +21,6 @@ setup_countGener( \item{alpha}{vector of alpha_i} -\item{n_genes}{number of genes} - \item{mu}{dataframe of mu_ij} } \value{ diff --git a/src/htrsim/vignettes/comment-utiliser-mon-package.Rmd b/src/htrsim/vignettes/comment-utiliser-mon-package.Rmd index 5f5daa5..ba6fe98 100644 --- a/src/htrsim/vignettes/comment-utiliser-mon-package.Rmd +++ b/src/htrsim/vignettes/comment-utiliser-mon-package.Rmd @@ -7,28 +7,47 @@ vignette: > %\VignetteEncoding{UTF-8} --- +<style> +body { +text-align: justify} +</style> + # A. Introduction + +A differential expression analysis uses a generalized linear model of the form: + +$$ +K_{ij} \sim {\sf NB}(\mu_{ij} ; \sigma_i) +$$ $$ -Phenotype = Genotype + Environment + Genotype.Environment +\mu_{ij} = s_jq_{ij} $$ -From this expression, $\beta_{G}$, $\beta_{E}$, $\beta_{G*E}$ can be seen as coefficients which allow quantifying the participation of each factors (Genotype, Environment and interaction Genotype/Environment). -In mathematical term, it leads to a linear expression such: $$ -P = \beta_{G}*G + \beta_{E}*E + \beta_{G*E}*G.E + \beta_{0} +log_2(q_{ij}) = x_j*\beta_i +$$ +where counts $K_{ij}$ for gene i, sample j are modeled using a Negative Binomial distribution with fitted mean $\mu_{ij}$ and a gene-specific dispersion parameter $\alpha_i$. +The fitted mean is composed of a sample-specific size factor $s_j$ and a parameter qij proportional to the expected true concentration of fragments for sample j. +The coefficients $\beta_i$ give the log2 fold changes for gene i for each column of the model matrix X. The sample-specific size factors can be replaced by gene-specific normalization factors for each sample using normalizationFactors. + +Basically, genes expression is understanding as a shake between a genotype effect, an environment effect and an interaction between G&E. The part of each effect can be modelized by a coefficient $\beta$. +Considering genes expression as a phenotype we can write: +$$Phenotype = \beta_{G} * Genotype + \beta_{E}*Environment + \beta_{G*E} * Genotype.Environment$$ + +From a generalized linear model we can try to quantify each effect : $\beta_{G}$, $\beta_{E}$ and $\beta_{G*E}$. +Quantifying such coefficients will allow to evaluate the participation of each factors (Genotype, Environment and interaction Genotype/Environment) for each gene. Then, differencies between genes or conditions could be assessed. + +According to the DESEQ2 GLM, we can write: +$$ +log_2(\mu_{ij]}) = \beta_{G}*G + \beta_{E}*E + \beta_{G*E}*G.E + \beta_{0} $$ -In order to estimate these coefficients, a Generalized Linear Model (GLM) can be used. # B. HTRSIM getting started <u>a. Required</u> -```{r, setupWD, include=FALSE} -#knitr::opts_knit$set(root.dir = '~/counts_simulation/') -``` - -```{r setup} +```{r setup, results='hide', message=FALSE, warning=FALSE} library(htrsim) library(tidyverse) ``` @@ -58,27 +77,73 @@ tabl_cnts <- read.table(file = fn, header = TRUE) rownames(tabl_cnts) <- tabl_cnts$gene_id tabl_cnts <- tabl_cnts %>% select(-gene_id)##suppr colonne GeneID tabl_cnts <- tabl_cnts %>% select(-gene_name) ##suppr colonne GeneName +``` + +```{r} +fn = system.file("extdata/", "public_bioDesign.csv", package = "htrsim") + +bioDesign <- read.table(file = fn, header = TRUE, sep = ';') ``` <u> e. Launch HTRSIM</u> ```{r message=FALSE, warning=FALSE} -fn = system.file("extdata/", "public_bioDesign.csv", package = "htrsim") -## import design of bioProject -bioDesign <- read.table(file = fn, header = T, sep = ';') -#bioDesign -#source(file = "htrsim/main.R") -tabl_cnts %>% dim() -bioDesign %>% dim() -#simul_cnts = htrsim(tabl_cnts, bioDesign = bioDesign, 2) +#dds = run.deseq(tabl_cnts = tabl_cnts, bioDesign = bioDesign) + +#htrsim.results = htrsim(countData = tabl_cnts, bioDesign= bioDesign, N_replicates=1) + + + +``` +<u> f. Evaluate simulation </u> + +```{r} +build_design2Deseq = function(htrs.tablCnts){ + #samples = samples %>% data.frame() + sample = colnames(htrs.tablCnts %>% select(-gene_id)) + #design.deseq <- list(samples = colnames(htrs.tablCnts)) + genotype <- map(sample, ~str_split(., pattern = "_")[[1]][1]) %>% unlist() + env <- map(sample, ~str_split(., pattern = "_")[[1]][2]) %>% unlist() + new_design = cbind(sample,genotype, env) %>% data.frame() + + return (new_design) +} + +designSimu <- build_design2Deseq(htrsim.results$countDataSim) + +row.names(htrsim.results$countDataSim) = htrsim.results$countDataSim$gene_id +htrsim.results$countDataSim = htrsim.results$countDataSim %>% select(-gene_id) + +htrsim.results$countDataSim %>% dim() +designSimu %>% dim() +#bioDesign$ +designSimu$env +colnames(htrsim.results$countDataSim) +model.matrix(~genotype + env + genotype:env, designSimu) +simu_deseq = run.deseq(tabl_cnts = htrsim.results$countDataSim, designSimu) + +designSimu$genotype <- factor(x = bioDesign$genotype,levels = c('wt','msn2D', 'msn4D')) +designSimu$env <- factor(x = bioDesign$env,levels = c('control', 'kcl')) + + ## DESEQ standard analysis + +htrs.tablCnts <- htrsim.results$countDataSim %>% replace(is.na(.), 0) + +max(htrs.tablCnts) +dds = DESeq2::DESeqDataSetFromMatrix( countData = round(htrs.tablCnts/10), colData = designSimu , design = ~ genotype + env + genotype:env) +dds <- DESeq2::DESeq(dds) -htrsim.results = htrsim(countData = tabl_cnts, bioDesign= bioDesign, N_replicates=2) ``` + <u> f. Visualize results</u> ```{r message=FALSE, warning=FALSE} -htrsim.results$input$d + +devtools::load_all() + + + ``` -- GitLab