From bd2f35dd0960c70ca735e153cd2fa5df53848bda Mon Sep 17 00:00:00 2001 From: aduvermy <arnaud.duvermy@ens-lyon.fr> Date: Fri, 25 Mar 2022 15:53:44 +0000 Subject: [PATCH] update tuto --- src/tutorial_htrsim.Rmd | 495 ++++++++++++++++++++++++++++++++-------- 1 file changed, 404 insertions(+), 91 deletions(-) diff --git a/src/tutorial_htrsim.Rmd b/src/tutorial_htrsim.Rmd index 8ddc0c5..36d2092 100644 --- a/src/tutorial_htrsim.Rmd +++ b/src/tutorial_htrsim.Rmd @@ -3,10 +3,29 @@ title: "HTRSIM Getting started" output: html_document --- +# A. Introduction + +$$ +Phenotype = Genotype + Environment + Genotype.Environment +$$ +From this expression, $\beta_{G}$, $\beta_{E}$, $\beta_{G*E}$ can be seen as coefficients which allow quantifying the participation of each factors (Genotype, Environment and interaction Genotype/Environment). +Then, +$$ +P = \beta_{G}*G + \beta_{E}*E + \beta_{G*E}*G.E + \beta_{0} +$$ +In order to estimate these coefficients, a Generalized Linear Model (GLM) can be used. + + +```{r echo=FALSE, out.width='100%'} +knitr::include_graphics('../img/schema_loop.jpg') +``` + + + This is a tutorial for *htrsim* utilization -```{r required} +```{r required, message=FALSE, echo = T, results = "hide"} library(data.table) library(tidyverse) library(DESeq2) @@ -15,7 +34,9 @@ library(DESeq2) ```{r setworkdir} # on berthollet -setwd("/home2/aduvermy/counts_simulation/src/") +#setwd("/home2/aduvermy/counts_simulation/src/") +# on VM +setwd("/home/rstudio/mydatalocal/counts_simulation/src/") ``` @@ -35,7 +56,7 @@ As a first step we will simulate a single library with 100 replicates.</br> <u>a. Setup the simulation</u> -```{r simple design, echo=TRUE} +```{r simple design, echo=TRUE, message=FALSE, warning=FALSE} N_samples = 1 ## number of samples N_replicates = 3 ## Nb of replicates per samples N_genes = 3 # number of genes in your favorite organism @@ -46,7 +67,9 @@ samples ### SETUP SIMULATION source(file = "htrsim/setup_cntsGenerator.R") -setup.simulation <- setup_countGener(sample_names= samples$name , n_rep= samples$n_rep , n_genes = N_genes) +setup.simulation <- setup_countGener(sample_names = samples$name, + n_rep= samples$n_rep, + n_genes = N_genes) setup.simulation ``` @@ -54,7 +77,8 @@ setup.simulation Now you can generate a lovely counts table from parameters *mu* and *alpha* stored in ```setup.simulation```.</br> -```{r generate counts, echo=TRUE} +```{r generate counts, echo=TRUE, message=FALSE, warning=FALSE} +## GENERATE COUNTS source(file = "htrsim/counts_generator.R") htrs <- generate_counts(setup.simulation) htrs %>% dim() @@ -86,7 +110,7 @@ Then, $\mu_{ij}$ and $\alpha_{i}$ define a negative binomial distribution from w </br> Please redo the simulation with a large set of replicates for your library. </br> -```{r understand mu & alpha, echo=TRUE} +```{r understand mu & alpha, echo=TRUE, message=FALSE, warning=FALSE} N_samples = 1 ## number of samples N_replicates = 2000 ## Nb of replicates per samples N_genes = 3 # number of genes in your favorite organism @@ -96,7 +120,10 @@ samples <- list(name = "my_sample", n_rep = N_replicates) ### SETUP SIMULATION source(file = "htrsim/setup_cntsGenerator.R") -setup.simulation <- setup_countGener(sample_names= samples$name , n_rep= samples$n_rep , n_genes = N_genes) +setup.simulation <- setup_countGener(sample_names= samples$name, + n_rep= samples$n_rep, + n_genes = N_genes) +### GENERATE COUNTS source(file = "htrsim/counts_generator.R") htrs <- generate_counts(setup.simulation) ``` @@ -106,16 +133,19 @@ Next, plot your distribution of simulated counts per gene.</br> <span style="color:red">*WARNING: Code bellow is informative only with one library, a small set of genes, but a lot of replicates !*</span></br> -```{r plot hist, echo = TRUE} - +```{r plot hist, message=FALSE, warning=FALSE} +## Reshape and build dataframe easy to plot data2plot <- htrs %>% reshape2::melt( ., id=c("name_gene"), variable.name = "Run") -data2plot <- data2plot %>% dplyr::group_by(name_gene) %>% dplyr::mutate(mean_obs = mean(value)) %>% dplyr::ungroup() +data2plot <- data2plot %>% + dplyr::group_by(name_gene) %>% + dplyr::mutate(mean_obs = mean(value)) %>% + dplyr::ungroup() figure = data2plot %>% ggplot(., aes(x=value)) + - geom_histogram(fill= "grey", binwidth = 30) + - facet_wrap(~name_gene, scale = "free_y") + - geom_vline(aes(xintercept = mean_obs), col="#0072B2") + - geom_text(aes(x = mean_obs, y = 0, label = paste0("Mean obs\n", mean_obs), vjust = -2, hjust=-0.2), col="#0072B2") + geom_histogram(fill= "grey", binwidth = 30) + + facet_wrap(~name_gene, scale = "free_y") + + geom_vline(aes(xintercept = mean_obs), col="#0072B2") + + geom_text(aes(x = mean_obs, y = 0, label = paste0("Mean obs\n", mean_obs), vjust = -2, hjust=-0.2), col="#0072B2") figure setup.simulation ``` @@ -131,11 +161,10 @@ And alpha is the dispersion of the distribution. <u>d. Setup mu and alpha before counts generation</u> *htrsim* gives you the possibility to setup your own parameters for negative binomial. </br> -You can control the distribution from which each count $\c_{ij}$ are sampled. +You can control the distribution from which each count $c_{ij}$ are sampled. ```{r setup params} - N_samples = 2 ## number of samples N_replicates = 2 ## Nb of replicates per samples N_genes = 3 # number of genes in your favorite organism @@ -145,47 +174,58 @@ samples <- list(name = paste0('sample', 1:N_samples), n_rep = N_replicates) ### Defining alpha params for each gene -alphaGene.set = runif(0.2,120, n = N_genes) ## randomly defined between 2 and 100 +alphaGene.set = stats::runif(0.2,120, n = N_genes) ## randomly defined between 2 and 120 nameGene.set = paste0('gene', 1:N_genes) genes <- list(name = nameGene.set , alpha = alphaGene.set) %>% as.data.frame() -#genes -### Defining mu params for each gene of each sample + ## from an input dataframe mu.dtf.colnames = samples$name mu.dtf.geneID = genes$name mu.dtf.init = data.frame(matrix(0, nrow = length(mu.dtf.geneID), ncol = length(mu.dtf.colnames))) -colnames(mu.dtf.init) = mu.dtf.colnames# -mu.dtf <- mu.dtf.init %>% mutate_all(., ~ runif(100, 2000, n = N_genes)) ## fill mu.dtf from uniform law -mu.dtf <- mu.dtf %>% mutate(gene_id = mu.dtf.geneID) %>% select(gene_id ,everything()) #optionnal -## from a statistical law -mu.foo <- function(x) return(runif(100,1000, n = x )) - +colnames(mu.dtf.init) = mu.dtf.colnames +mu.dtf <- mu.dtf.init %>% + dplyr::mutate_all(., ~ runif(100, 2000, n = N_genes)) ## fill mu.dtf from uniform law +mu.dtf <- mu.dtf %>% #optionnal + dplyr::mutate(gene_id = mu.dtf.geneID) %>% + dplyr::select(gene_id ,everything()) ## Setup simulation source(file = "htrsim/setup_cntsGenerator.R") -## with mu dataframe -setup.simulation1 <- setup_countGener(sample_names= samples$name , n_rep= samples$n_rep , gene_dispersion = genes$alpha, gene_names = genes$name, mu = mu.dtf) +## with mu define from a dataframe +setup.simulation1 <- setup_countGener(sample_names= samples$name, + n_rep= samples$n_rep, + gene_dispersion = genes$alpha, + gene_names = genes$name, + mu = mu.dtf) setup.simulation1 -## with mu function -setup.simulation2 <- setup_countGener(sample_names= samples$name , n_rep= samples$n_rep , gene_dispersion = genes$alpha, gene_names = genes$name, mu = mu.foo) + +### Defining mu params for each gene of each sample +## from a statistical law +mu.foo <- function(x) return(runif(100,1000, n = x )) +## with mu define from function +setup.simulation2 <- setup_countGener(sample_names= samples$name, + n_rep= samples$n_rep, + gene_dispersion = genes$alpha, + gene_names = genes$name, + mu = mu.foo) setup.simulation2 ``` Above we randomly defined $\mu_{ij}$ and $\alpha_{i}$.</br> -Defining mu and alpha value per gene for each sample is useful when you can infer them from experimental data.</br> +Defining mu and alpha value per gene for each sample is useful when you can infer them from experimental data.</br> [See rna-seq_public_library_investigation](https://gitbio.ens-lyon.fr/aduvermy/rna-seq_public_library_investigations) Use filter to understand the ```setup.simulation``` object.</br> ```{r filter setup, echo=TRUE} -setup.simulation1 %>% filter(name_gene == "gene1") +setup.simulation1 %>% dplyr::filter(name_gene == "gene1") ``` -You should notice that $$\mu$$ and $$\alpha$$ are equal for equivalent gene between replicates.</br> +You should notice that $\mu$ and $\alpha$ are equal for equivalent gene between replicates.</br> But not $\mu$ ! </br> As defined in : <br> $$ @@ -194,7 +234,7 @@ $$ ## B. Performing complex design -Now you understood how mu and alpha affect the distribution of counts per genes. +Now you understood how $\mu$ and $\alpha$ affect the distribution of counts per gene.<br> Also, you mastered using *htrsim* to build a library from simple RNA-seq design. @@ -203,7 +243,7 @@ Also, you mastered using *htrsim* to build a library from simple RNA-seq design. *Htrsim* was developed to build complex RNA-seq design. By complex RNA-seq design I mean several libraries with several replicates and not often the same number of replicates per samples. -```{r complex design} +```{r complex design, message=FALSE, warning=FALSE} N_samples = 10 max_N_replicates = 5 ## maximum number of replicates per sample -> heterogeneous design N_genes = 6000 @@ -224,17 +264,24 @@ genes <- list(name = nameGene.set , alpha = alphaGene.set) %>% as.data.frame() mu.dtf.colnames = samples$name mu.dtf.geneID = genes$name mu.dtf.init = data.frame(matrix(0, nrow = length(mu.dtf.geneID), ncol = length(mu.dtf.colnames))) -colnames(mu.dtf.init) = mu.dtf.colnames# -mu.dtf <- mu.dtf.init %>% mutate_all(., ~ runif(100, 2000, n = N_genes)) ## fill mu.dtf from uniform law -mu.dtf <- mu.dtf %>% mutate(gene_id = mu.dtf.geneID) %>% select(gene_id ,everything()) +colnames(mu.dtf.init) = mu.dtf.colnames +mu.dtf <- mu.dtf.init %>% + mutate_all(., ~ runif(100, 2000, n = N_genes)) ## fill mu.dtf from uniform law +mu.dtf <- mu.dtf %>% + mutate(gene_id = mu.dtf.geneID) %>% # reshape for convenience + dplyr::select(gene_id ,everything()) ## Setup simulation source(file = "htrsim/setup_cntsGenerator.R") ## with mu dataframe -setup.simulation <- setup_countGener(sample_names= samples$name , n_rep= samples$n_rep , gene_dispersion = genes$alpha, gene_names = genes$name, mu = mu.dtf) +setup.simulation <- setup_countGener(sample_names= samples$name, + n_rep= samples$n_rep, + gene_dispersion = genes$alpha, + gene_names = genes$name, + mu = mu.dtf) source(file= "htrsim/counts_generator.R" ) htrs <- generate_counts(setup.simulation) -htrs %>% filter(name_gene == "gene1") +htrs %>% dplyr::filter(name_gene == "gene1") ``` About columns names:</br> @@ -248,21 +295,23 @@ About columns names:</br> Once you convert name_gene columns to rownames ```htrs``` can be used as input of DESEQ2. -```{r input deseq 1} +```{r input deseq 1, message=FALSE, warning=FALSE} +## Reshape for DESEQ rownames(htrs) <- htrs$name_gene -htrs.deseq = htrs %>% select(-name_gene) # remove name gene column -htrs.deseq %>% filter(rownames(.)=='gene1') +htrs.deseq = htrs %>% dplyr::select(-name_gene) # remove name gene column ``` -But DESEQ2 also needs to understand your design. For this follow this (link to deseq tuto) -Otherwise, we developed a function to convert ```setup.simulation``` into a conventional deseq2 design input. +But DESEQ2 also needs to understand your design. [DESeq2 tutorial](http://bioconductor.org/packages/devel/bioc/vignettes/DESeq2/inst/doc/DESeq2.html)<br> +For this, we developed a function to convert ```setup.simulation``` into a conventional deseq2 design input. -```{r input deseq 2} +```{r input deseq 2, message=FALSE, warning=FALSE} ## add information for each sample set.seed(101) samples$env <- c(rep("kcl", 5), rep("control", 5)) ## 5 x 2 environments samples$genotype <- sample(c("msn2d", "msn4D", "WT"),10, replace = TRUE) + +## SETUP DESEQ source(file = "htrsim/setup_deseq.R") setup.deseq <- setup_deseq(samples, htrs.deseq) setup.deseq %>% head() @@ -270,17 +319,19 @@ setup.deseq %>% head() <u>c. Run DESeq2</u> -```{r} +```{r message=FALSE, warning=FALSE} #source(file = "htrsim/launch_deseq.R") htrs.deseq %>% dim()# htrs %>% select(-name_gene) setup.deseq %>% dim() #model.matrix(~genotype + env + genotype:env, setup.deseq) # check if a column is fully equal to 0 if yes -> lead to an error -dds = DESeq2::DESeqDataSetFromMatrix(countData = htrs.deseq , colData = setup.deseq , design = ~ genotype + env + genotype:env) +dds = DESeq2::DESeqDataSetFromMatrix(countData = htrs.deseq, + colData = setup.deseq, + design = ~ genotype + env + genotype:env) ``` <u>d. alpha</u> -```{r} +```{r message=FALSE, warning=FALSE} dds <- DESeq2::estimateSizeFactors(dds) dds <- DESeq2::estimateDispersions(dds) #dispersion_estimate <- dip @@ -299,16 +350,18 @@ Whereas we produced alpha per gene from a uniform distribution.<br> <u>d. mu </u> -```{r} +```{r message=FALSE, warning=FALSE} mu_estimate <- dds@assays@data$mu mu_inference.vec = as.vector(mu_estimate) mu.inference = data.frame(mu = mu_inference.vec, from = "inference") -mu_input.vec = as.vector(mu.dtf %>% select(-gene_id)) %>% flatten() %>% unlist() +mu_input.vec = as.vector(mu.dtf %>% dplyr::select(-gene_id)) %>% + purrr::flatten() %>% + BiocGenerics::unlist() mu.input = data.frame(mu = mu_input.vec, from = "input") -mu.dtf <- rbind(mu.input, mu.inference) +mu.dtf <- BiocGenerics::rbind(mu.input, mu.inference) ggplot(mu.dtf, aes(x=mu)) + geom_density() + facet_grid(~from) ``` @@ -319,32 +372,49 @@ Whereas we produced mu per gene from a uniform distribution.<br> ## C. BioProject PRJNA675209b as input -[link github](https://gitbio.ens-lyon.fr/aduvermy/rna-seq_public_library_investigations)<br> +PRJNA675209b is a RNA-seq project which proposes 3 genotypes studied into 2 environmental conditions.<br> +Using the conventional RNA-Seq procedure described at [see](https://gitbio.ens-lyon.fr/aduvermy/rna-seq_public_library_investigations), and a Generalized Linear Model it is possible to deduce $\mu$ and $\alpha$ from real counts per gene. +Using this RNA-seq conventional procedure we defined $\mu$ and $\alpha$ from RNA-seq real table counts (PRJNA675209b). +<br> -We will mimic an existing design. +By using as input parameters of ```htrsim```, $\mu_{ij}$ and $\alpha_{i}$ deduced from PRJNA675209b table counts, we will simulate table counts ($c_{ij}$). +By using a Generalized Linear Model, we will infer $\mu_{ij}$ and $\alpha_{i}$ from $c_{ij}$ obtained in simulation.<br> +Finally, we will compare $\mu_{ij}$, $\alpha_{i}$ deduced from real counts, and $\mu_{ij}$, $\alpha_{i}$ obtained from simulate counts. +**By playing on pragmatical parameters (number of replicates, library size, ...) we aim to understand on which way we may maximize the fit between $\mu_{ij}$ and $\alpha_{i}$ obtained from real and simulate counts** 1. Input for simulation ```{r message=FALSE, warning=FALSE} -## import mu(ij) params for each gene & each sample -mu_params = readr::read_tsv(file="../../rna-seq_public_library_investigations/results/2022-03-03/estimate_mu.tsv", show_col_type=F) -mu_params <- mu_params %>% select(., contains("rep1")) - ## import alpha(i) for each genes -alpha_params = readr::read_tsv(file="../../rna-seq_public_library_investigations/results/2022-03-03/estimate_dispersion.tsv", show_col_type=F) +alpha_params = readr::read_tsv(file="../../rna-seq_public_library_investigations/results/2022-03-03/estimate_dispersion.tsv")#, show_col_type=F) +## import mu(ij) params for each gene & each sample +mu_params = readr::read_tsv(file="../../rna-seq_public_library_investigations/results/2022-03-03/estimate_mu.tsv")#, show_col_type=F) ## Defining sample names sample_names <- mu_params %>% - #select(-gene_id) %>% + select(-gene_id) %>% colnames() %>% - map(., ~str_split(.,"_")[[1]][1:2] %>% - paste(., collapse='_') ) %>% - unlist() %>% unique() + purrr::map(., ~stringr::str_split(.,"_")[[1]][1:2] %>% + BiocGenerics::paste(., collapse='_') ) %>% + BiocGenerics::unlist() %>% BiocGenerics::unique() +## Mu is same for replicate +### case 1: choose 1 replicate +#mu_params <- mu_params %>% dplyr::select(., contains("rep1")) ## rename mu_params colnames to ensure corresponding with sample_names -colnames(mu_params) <- sample_names +#colnames(mu_params) <- sample_names +### case 2: average replicates +average_rep <- function(x) { + varname <- x + mu_params %>% + select(.,contains(x)) %>% + mutate(!!varname := rowMeans(.)) %>% + select(varname) +} +mu_params <- sample_names %>% map(.x = ., .f = ~average_rep(.x)) %>% data.frame() +mu_params$gene_id <- alpha_params$gene_id ## Defining number of genes: N_genes = length(alpha_params$gene_id) @@ -358,75 +428,318 @@ samples <- list(name = sample_names, n_rep = N_replicates) ## Nb of replicates ### Defining alpha params for each gene alphaGene.set = alpha_params$dispersion nameGene.set = alpha_params$gene_id -genes <- list(name = nameGene.set , alpha = alphaGene.set) %>% as.data.frame() +genes <- list(name = nameGene.set, + alpha = alphaGene.set) %>% + as.data.frame() ``` 2. setup & process simulation -```{r} +```{r message=FALSE, warning=FALSE} source(file = "htrsim/setup_cntsGenerator.R") -## with mu dataframe -setup.simulation <- setup_countGener(sample_names= samples$name , n_rep= samples$n_rep , gene_dispersion = genes$alpha, gene_names = genes$name, mu = mu_params) -setup.simulation %>% head +## Setup simulation from PRJNA675209b table counts +setup.simulation <- setup_countGener(sample_names = samples$name, + n_rep= samples$n_rep, + gene_dispersion = genes$alpha, + gene_names = genes$name, + mu = mu_params) +## Generate counts source(file= "htrsim/counts_generator.R" ) htrs <- generate_counts(setup.simulation) ``` 3. Setup & Run DESeq2 -```{r} +```{r message=FALSE, warning=FALSE} ## reshape htrs for convenience rownames(htrs) <- htrs$name_gene -htrs.deseq = htrs %>% select(-name_gene) # remove name gene column +htrs.deseq = htrs %>% + dplyr::select(-name_gene) # remove name gene column ## add information for each sample -samples$env <- sample_names %>% map(., ~str_split(.,"_")[[1]][2]) %>% unlist() ## 5 x 2 environments -samples$genotype <- sample_names %>% map(., ~str_split(.,"_")[[1]][1]) %>% unlist() +samples$env <- sample_names %>% + purrr::map(., ~stringr::str_split(.,"_")[[1]][2]) %>% + BiocGenerics::unlist() ## 5 x 2 environments +samples$genotype <- sample_names %>% + purrr::map(., ~stringr::str_split(.,"_")[[1]][1]) %>% + BiocGenerics::unlist() + ## build input dataframe for deseq -setup.deseq = samples %>% as.data.frame() %>% uncount(n_rep, .id = "idx") %>% unite(name, name, idx , sep = "_") +setup.deseq = samples %>% + as.data.frame() %>% + tidyr::uncount(n_rep, .id = "idx") %>% + tidyr::unite(name, name, idx , sep = "_") rownames(setup.deseq) <- c() ## drop rownames - #check homogeneity between countData column number & colData row number -htrs.deseq %>% dim()# htrs %>% select(-name_gene) +htrs.deseq %>% dim() setup.deseq %>% dim() -#model.matrix(~genotype + env + genotype:env, setup.deseq) # check if a column is fully equal to 0 if yes -> lead to an error -dds = DESeq2::DESeqDataSetFromMatrix(countData = htrs.deseq , colData = setup.deseq , design = ~ genotype + env + genotype:env) + +# check if a column is fully equal to 0 if yes -> lead to an error +#model.matrix(~genotype + env + genotype:env, setup.deseq) +dds = DESeq2::DESeqDataSetFromMatrix(countData = htrs.deseq, + colData = setup.deseq, + design = ~ genotype + env + genotype:env) ``` 4. alpha -```{r} +```{r message=FALSE, warning=FALSE} + +## Dispersion inference dds <- DESeq2::estimateSizeFactors(dds) dds <- DESeq2::estimateDispersions(dds) -#dispersion_estimate <- dip dispersion_estimate <- DESeq2::dispersions(dds) -alpha.inference = data.frame(alpha = dispersion_estimate, from = "inference") -alpha.input = data.frame(alpha = alphaGene.set, from = "input") +## Build dataframes +alpha.inference = data.frame(inference = dispersion_estimate) -alpha.dtf <- rbind(alpha.input, alpha.inference) -ggplot(alpha.dtf, aes(x=alpha), na.rm = TRUE) + geom_density() + scale_x_log10() + facet_grid(~from) +alpha.input = data.frame(input = alpha_params$dispersion, + name_gene = alpha_params$gene_id) + +## Same dimension +#alpha.input %>% dim +#alpha.inference %>% dim + +## Merge & plot +alpha.dtf <- BiocGenerics::cbind(alpha.input, alpha.inference) + + +## Plot 2 by 2 +ggplot(alpha.dtf, aes(x=input, y = inference), na.rm = TRUE) + geom_point(alpha=0.4) + +# Reshape and plot distribution +alpha.dtf.reshape = alpha.dtf %>% + reshape2::melt( ., id=c("name_gene"), + variable.name = "from", + value.name = "alpha") + + +ggplot(alpha.dtf.reshape, aes(x=alpha), na.rm = TRUE) + geom_density() + scale_x_log10() + facet_grid(~from) ``` 5. mu ```{r} - +## Mu inference mu_estimate <- dds@assays@data$mu -mu_inference.vec = as.vector(mu_estimate) %>% as.numeric() -mu.inference = data.frame(mu = mu_inference.vec, from = "inference") +colnames(mu_estimate) <- htrs.deseq %>% colnames() +mu_estimate = mu_estimate %>% as.data.frame() +## Average replicates +average_rep <- function(x) { + varname <- x + mu_estimate %>% + select(.,contains(x)) %>% + mutate(!!varname := rowMeans(.)) %>% + select(varname) +} + +mu_estimate <- sample_names %>% map(.x = ., .f = ~average_rep(.x)) %>% data.frame() +mu_inference.vec = mu_estimate %>% + purrr::flatten() %>% + BiocGenerics::unlist() %>% + as.numeric() +mu.inference = data.frame(inference = mu_inference.vec, + env = rep(samples$env, each = N_genes), + genotype = rep(samples$genotype, each = N_genes)) + +## Mu used as input +mu_input.vec = mu_params %>% + select(-gene_id) %>% + purrr::flatten() %>% + BiocGenerics::unlist() %>% + as.numeric() +mu.input = data.frame(input = mu_input.vec, name_gene = mu_params$gene_id ) + +#Same dimension +#mu.inference %>% dim() +#mu.input %>% dim() + +## Merged input & inference +mu.dtf <- BiocGenerics::cbind(mu.input, mu.inference) + +## Plot 2 by 2 +ggplot(mu.dtf, aes(x=input, y = inference, col=genotype, shape=env), na.rm = TRUE) + geom_point(alpha=0.4) + +# Reshape and plot distribution +mu.dtf.reshape = mu.dtf %>% + reshape2::melt( ., id=c("name_gene", "env", "genotype"), + variable.name = "from", + value.name = "mu") + +ggplot(mu.dtf.reshape, aes(x=mu), na.rm = TRUE) + geom_density() + + scale_x_log10() + facet_grid(~from) +``` -mu_input.vec = as.vector(mu.dtf) %>% flatten() %>% unlist() %>% as.numeric() -mu.input = data.frame(mu = mu_input.vec, from = "input") +6. Number of replicates effect -mu.dtf <- rbind(mu.input, mu.inference) +Bellow, $\mu_{ij}$ and $\alpha_{i}$ deduced from PRJNA675209b table counts, will be reuse as input of simulation.<br> +By increasing the number of replicates per sample we hope to improve fitting $\mu_{ij}$ and $\alpha_{i}$ obtained from real and simulate counts. + +<br> +Please, redo the simulation and the inference of $\mu_{ij}$ and $\alpha_{i}$ by increasing the number of replicates per sample. + +```{r message=FALSE, warning=FALSE} +## See *B-1. Input for simulation* to define : +# - mu_params +# - alpha_params +# - samples (see *B-3. Setup & Run DESeq2* to add information for each sample) + + + +# number of replicate per simulation +replicate_number2simul = seq(2, 16, by=4) + +#init output +alpha.inference = data.frame(inference = numeric(), + additional_params = factor()) +mu.inference = data.frame(inference = numeric(), + env = factor(), + genotype = factor(), + additional_params = factor(), + name_gene = character()) + +## Mu used as input +mu_input.vec = mu_params %>% + select(-gene_id) %>% + purrr::flatten() %>% + BiocGenerics::unlist() %>% + as.numeric() +mu.input = data.frame(input = mu_input.vec, + name_gene = mu_params$gene_id, + env = rep(samples$env, each = N_genes), + genotype = rep(samples$genotype, each = N_genes)) + + +alpha.input = data.frame(input = alpha_params$dispersion, + name_gene = alpha_params$gene_id) + + + + +for (N in replicate_number2simul){ + + samples$n_rep = N ## modify nb of replicate per simulation + + ## Setup simulation + source(file = "htrsim/setup_cntsGenerator.R") + setup.simulation <- setup_countGener(sample_names= samples$name, + n_rep= samples$n_rep, + gene_dispersion = genes$alpha, + gene_names = genes$name, + mu = mu_params) + ## Generate counts from setup + source(file = "htrsim/counts_generator.R" ) + htrs <- generate_counts(setup.simulation) + + ## reshape htrs for convenience + rownames(htrs) <- htrs$name_gene + htrs.deseq = htrs %>% dplyr::select(-name_gene) # remove name gene column + + + ## build input dataframe for deseq + setup.deseq = samples %>% + as.data.frame() %>% + tidyr::uncount(n_rep, .id = "idx") %>% + tidyr::unite(name, name, idx , sep = "_") + rownames(setup.deseq) <- c() ## drop rownames + + ## run DESEQ + dds = DESeq2::DESeqDataSetFromMatrix(countData = htrs.deseq , + colData = setup.deseq , + design = ~ genotype + env + genotype:env) + + # Build results + dds <- DESeq2::estimateSizeFactors(dds) + dds <- DESeq2::estimateDispersions(dds) + dispersion_estimate <- DESeq2::dispersions(dds) + + + ################# Alpha ################ + tmp = data.frame(inference = dispersion_estimate, + additional_params=N) + + alpha.inference = BiocGenerics::rbind(alpha.inference, tmp) + + ################## Mu ################ + mu_estimate <- dds@assays@data$mu %>% data.frame() + colnames(mu_estimate) <- htrs.deseq %>% colnames() + average_rep <- function(x) { + varname <- x + mu_estimate %>% + select(.,contains(x)) %>% + mutate(!!varname := rowMeans(.)) %>% + select(varname) + } + + mu_estimate <- sample_names %>% map(.x = ., .f = ~average_rep(.x)) %>% data.frame() + mu_inference.vec = mu_estimate %>% + purrr::flatten() %>% + BiocGenerics::unlist() %>% + as.numeric() + tmp = data.frame(inference = mu_inference.vec, + env = rep(samples$env, each = N_genes), + genotype = rep(samples$genotype, each = N_genes), + additional_params=N, + name_gene = rep(mu_params$gene_id, length(sample_names))) + mu.inference = BiocGenerics::rbind(mu.inference, tmp) + +} -ggplot(mu.dtf, aes(x=mu), na.rm = TRUE) + geom_density() + - scale_x_log10() + facet_grid(~from) ``` +Now, you can compare $\alpha_{i}$ from real and simulate counts.<br> + + +```{r message=FALSE, warning=FALSE} +## bind alpha used as input and alpha inferred +alpha.dtf <- BiocGenerics::cbind(alpha.input, alpha.inference) + + +## plot 2 by 2 +alpha.inference$additional_params = alpha.inference$additional_params %>% as.factor() +ggplot(alpha.dtf, na.rm = TRUE) + geom_point(aes(x=input, y=inference), alpha=0.02) + scale_y_log10() + scale_x_log10() + facet_wrap(~additional_params, scales = 'free_y') + + + +# Reshape and plot distribution +alpha.dtf.reshape = alpha.dtf %>% + reshape2::melt( ., id=c("name_gene", "additional_params"), + variable.name = "from", + value.name = "alpha") + +ggplot(alpha.dtf.reshape, aes(x=alpha), na.rm = TRUE) + geom_density() + scale_x_log10() + facet_grid(from~additional_params) + + +``` + +The number of replicates does not seem to have a significant effect on the inference of $\alpha_{i}$.<br> +<br> +Now, compare $\mu_{ij}$ from real and simulate counts.<br> + +```{r message=FALSE, warning=FALSE} +## Join input & inference +dt1 <- data.table(mu.inference %>% group_by(additional_params), key = c("name_gene", "env", "genotype")) +dt2 <- data.table(mu.input, key = c("name_gene", "env", "genotype")) +mu.dtf = dt1[dt2, nomatch=0, roll=1] + + +## Plot 2 by 2 +ggplot(mu.dtf, aes(x=input, y = inference), na.rm = TRUE) + geom_point(alpha=0.4) + +## Reshape and plot distribution +mu.dtf.reshape = mu.dtf %>% + reshape2::melt( ., id=c("name_gene", "env", "genotype", "additional_params"), + variable.name = "from", + value.name = "mu") + +ggplot(mu.dtf.reshape, aes(x=mu), na.rm = TRUE) + geom_density() + + scale_x_log10() + facet_grid(from~additional_params) + +``` + +**Increasing the number of replicates seems to improve the fit between the distribution of $\mu_{ij}$, used as input, and the distribution obtained by inference.** -- GitLab