Skip to content
Snippets Groups Projects
Commit 96e96a1a authored by Arnaud Duvermy's avatar Arnaud Duvermy
Browse files

update package

parent a3dbde3b
Branches
Tags
No related merge requests found
......@@ -27,9 +27,14 @@ rn_sim <- function(mu, alpha, n_replicates, ...){
#' @import plyr
#' @import BiocGenerics
#' @import purrr
#' @import readr
#'
#' @examples
generate_counts <- function(setup_dtf, export = FALSE){
full_name <- NULL
name <- NULL
variable <- NULL
message("reading and processing counts per genes ...")
......@@ -46,7 +51,7 @@ generate_counts <- function(setup_dtf, export = FALSE){
tidyr::drop_na(counts) %>%
reshape2::dcast(., gene_id ~ full_name, value.var= "counts")
if (export == TRUE) write.tsv(cnt.dtf, 'results/2022-03-03/estimate_dispersion.tsv')
if (export == TRUE) readr::write_tsv(cnt.dtf, 'results/2022-03-03/estimate_dispersion.tsv')
return(cnt.dtf)
}
......
......@@ -12,21 +12,26 @@
#' @examples
htrsim <- function(countData, bioDesign, N_replicates){
# launch standard DESEQ2 analysis
dds = run.deseq(tabl_cnts, bioDesign = bioDesign)
dds = run.deseq(tabl_cnts = countData, bioDesign = bioDesign)
## Model matrix per samples
mm <- model.matrix(~genotype + env + genotype:env, bioDesign)
## Input estimation
mu.input = estim.mu(dds)
res = estim.mu(dds, mm)
mu.input = res$mu
alpha.input = estim.alpha(dds)
# Setup simulation
input = reshape_input2setup(mu.dtf = mu.input, alpha.dtf = alpha.input)
input = reshape_input2setup(mu.dtf = mu.input, alpha.dtf = alpha.input, average_rep = FALSE)
setup.simulation <- setup_countGener(bioSample_id = input$bioSample_id,
n_rep = N_replicates,
n_rep = 1,
alpha = input$alpha,
gene_id = input$gene_id,
mu = input$mu)
# Simulate counts
htrs <- generate_counts(setup.simulation)
return(list(countDataSim = htrs, input = input, dds = dds))
......
......@@ -11,15 +11,20 @@
#' @import dplyr
#' @import tibble
#' @import BiocGenerics
#' @import S4Vectors
#' @import readr
#' @examples
#' @importFrom rlang .data
estim.alpha <- function(dds, export = FALSE){
gene_id <- NULL
expressed_gene_dispersion <- NULL
################### Estimate alpha per gene ########################
#N.B: alpha = dispersion per gene
#dds <- DESeq2::estimateDispersions(dds, quiet = F)
#dispersion estimation
dispersion_estimate <- DESeq2::dispersions(dds)
dds.mcols = S4Vectors::mcols(dds,use.names=TRUE)
dispersion_estimate = dds.mcols$dispersion
#dispersion_estimate <- DESeq2::dispersions(dds)
## Shape and export
names(dispersion_estimate) <- names(dds@rowRanges)
......@@ -31,12 +36,103 @@ estim.alpha <- function(dds, export = FALSE){
tibble::rownames_to_column() %>%
dplyr::rename("alpha" = .data$., gene_id = "rowname")
if (export == TRUE) write_tsv(disp_gene_express, 'results/2022-03-03/estimate_dispersion.tsv')
if (export == TRUE) readr::write_tsv(expressed_gene_dispersion, 'results/2022-03-03/estimate_dispersion.tsv')
return(expressed_gene_dispersion)
}
#' Estimate mu_ij
#'
#' @param dds DESEQ2 object
#' @param export Boolean
#' @param mm a model matrix
#'
#' @return mu_ij only for gene expressed c_ij != 0
#' @export
#' @import stats
#' @import dplyr
#' @import tibble
#' @import BiocGenerics
#' @import S4Vectors
#' @examples
#' @importFrom rlang .data
estim.mu <- function(dds, mm, export = FALSE){
gene_id <- NULL
nb_sples = BiocGenerics::rownames(dds@colData) %>% length()
nb_genes = BiocGenerics::rownames(dds@assays@data$counts) %>% length()
mm_epsi = rep(1, nb_sples)
names(mm_epsi) = 1 : nb_sples
dds.mcols = S4Vectors::mcols(dds,use.names=TRUE)
## BETA
B0 <- dds.mcols$Intercept
B1 <- dds.mcols$genotype_msn2D_vs_wt
B2 <- dds.mcols$genotype_msn4D_vs_wt
B3 <- dds.mcols$env_kcl_vs_control
B4 <- dds.mcols$genotypemsn2D.envkcl
B5 <- dds.mcols$genotypemsn4D.envkcl
#print(max(B0, na.rm=TRUE))
#print(max(B1, na.rm=TRUE))
#print(max(B2, na.rm=TRUE))
#print(max(B3, na.rm=TRUE))
#print(max(B4, na.rm=TRUE))
#print(max(B5, na.rm = TRUE))
## deviance = sigma2 -> estimate epsilon
deviance_i.sqrt = sqrt(dds.mcols$deviance)
beta.matrix = cbind(B0, B1,B2,B3,B4,B5) %>% as.matrix()
#p_ij = B0_i*mm1_j + B1_i*mm2_j + B3_i*mm3_j + B4_i*mm4_j + B5_i*mm5_j
p_ij = beta.matrix %*% t(mm)
#epsilon_ij ~ N(0, deviance)
#epsilon_ij = mm[,1] %>% map(., ~rnorm(deviance_i.sqrt, mean = 0, sd = 0 )) %>% data.frame() %>% as.matrix()
epsilon_ij = mm[,1] %>% map(., ~rnorm(deviance_i.sqrt, mean = 0, sd = deviance_i.sqrt )) %>% data.frame() %>% as.matrix()
## log_qij = p_ij + epsilon_ij
log_qij <- p_ij + epsilon_ij
## s_j
s_j = dds$sizeFactor
mu_ij = s_j * 2^log_qij
################# Estimate mu #########################
mu_estimate <- dds@assays@data$mu
#dds@assays@data$mu %>% dim()
#mu_estimate %>% dim()
rownames(mu_ij) = BiocGenerics::rownames(dds@assays@data$counts)
## drop NA in dispersion estimate (link to unexpressed gene)
### and convert to lovely dataframe
mu_gene_express = mu_ij %>%
stats::na.omit() %>%
data.frame()
colnames(mu_gene_express) <- rownames(dds@colData)
mu_gene_express <- mu_gene_express %>%
tibble::rownames_to_column(var = "gene_id")
if (export == TRUE) readr::write_tsv(mu_gene_express, 'results/2022-03-03/estimate_mu.tsv')
res = list(mu = mu_gene_express, beta.matrix = beta.matrix, deviance.sqrt = deviance_i.sqrt, dds.mcols = dds.mcols)
return(res)
}
#.xMm.foo <- function(b, m) return(b * m)
#.epsilon.foo <- function(x) return(rnorm(mean = 0 ,sd = x, n = 1 ))
#.epsilon_i <- function(dev_i) return(dev_i %>% map(., ~.epsilon.foo(.))%>% unlist())
#.getMu_i <- function(s, qi) return(2^(qi))
#' Estimate mu_ij
......@@ -52,7 +148,7 @@ estim.alpha <- function(dds, export = FALSE){
#' @import BiocGenerics
#' @examples
#' @importFrom rlang .data
estim.mu <- function(dds, export = FALSE){
estim.mu_beta <- function(dds, export = FALSE){
gene_id <- NULL
################# Estimate mu #########################
mu_estimate <- dds@assays@data$mu
......
......@@ -10,9 +10,12 @@
#' @examples
run.deseq <- function(tabl_cnts, bioDesign ){
########### LAUNCH DESEQ #############
## Design model
dds = DESeq2::DESeqDataSetFromMatrix( countData = round(tabl_cnts), colData = bioDesign , design = ~ mutant + env + mutant:env)
## Design model - specify reference
bioDesign$genotype <- factor(x = bioDesign$genotype,levels = c('wt','msn2D', 'msn4D'))
bioDesign$env <- factor(x = bioDesign$env,levels = c('control', 'kcl'))
## DESEQ standard analysis
dds = DESeq2::DESeqDataSetFromMatrix( countData = round(tabl_cnts), colData = bioDesign , design = ~ genotype + env + genotype:env)
dds <- DESeq2::DESeq(dds)
return(dds)
}
......@@ -2,6 +2,7 @@
#'
#' @param mu.dtf dataframe of mu_ij
#' @param alpha.dtf dataframe of alpha_i
#' @param average_rep bool
#'
#' @return
#' @import purrr
......@@ -11,103 +12,122 @@
#' @export
#'
#' @examples
reshape_input2setup <- function(mu.dtf, alpha.dtf){
## Defining sample names
bioSample_id <- mu.dtf %>%
reshape_input2setup <- function(mu.dtf, alpha.dtf, average_rep = FALSE){
gene_id <- NULL
if(average_rep == TRUE){
## Defining sample names
bioSample_id <- mu.dtf %>%
dplyr::select(-gene_id) %>%
BiocGenerics::colnames() %>%
purrr::map(., ~stringr::str_split(.,"_")[[1]][1:2] %>%
BiocGenerics::paste(., collapse='_') ) %>%
BiocGenerics::unlist() %>% BiocGenerics::unique()
############### Mu is equal for biosample replicate #############
### case 1: choose 1 replicate
#mu_params <- mu_params %>% dplyr::select(., contains("rep1"))
## rename mu_params colnames to ensure corresponding with sample_names
#colnames(mu_params) <- sample_names
### case 2: average replicates
average_rep <- function(x, dtf) {
varname <- x
dtf %>%
dplyr::select(.,contains(x)) %>%
dplyr::mutate(!!varname := rowMeans(.)) %>%
dplyr::select(varname)
}
mu_ij <- bioSample_id %>% purrr::map(.x = ., .f = ~average_rep(.x, mu.dtf)) %>% data.frame()
mu_ij$gene_id <- alpha.dtf$gene_id
}
else {
bioSample_id <- mu.dtf %>%
dplyr::select(-gene_id) %>%
BiocGenerics::colnames() %>%
purrr::map(., ~stringr::str_split(.,"_")[[1]][1:2] %>%
BiocGenerics::paste(., collapse='_') ) %>%
BiocGenerics::unlist() %>% BiocGenerics::unique()
############### Mu is same for biosample replicate #############
### case 1: choose 1 replicate
#mu_params <- mu_params %>% dplyr::select(., contains("rep1"))
## rename mu_params colnames to ensure corresponding with sample_names
#colnames(mu_params) <- sample_names
### case 2: average replicates
average_rep <- function(x, dtf) {
varname <- x
dtf %>%
dplyr::select(.,contains(x)) %>%
dplyr::mutate(!!varname := rowMeans(.)) %>%
dplyr::select(varname)
BiocGenerics::colnames()
mu_ij = mu.dtf
}
mu_avg.dtf <- bioSample_id %>% purrr::map(.x = ., .f = ~average_rep(.x, mu.dtf)) %>% data.frame()
mu_avg.dtf$gene_id <- alpha.dtf$gene_id
return(list(alpha = alpha.dtf , mu = mu_avg.dtf, bioSample_id = bioSample_id, gene_id = alpha.dtf$gene_id))
return(list(alpha = alpha.dtf , mu = mu_ij, bioSample_id = bioSample_id, gene_id = alpha.dtf$gene_id))
}
#' Handle exception
#'
#' @param bioSample_id vector of id for each bioSample
#' @param bioSample vector of id for each bioSample
#' @param n_rep number of replicates
#' @param gene_id vector of id for each gene
#' @param alpha vector of alpha_i
#' @param n_genes number of genes
#'
#' @return
#' @export
#'
#' @examples
handle_except <- function(bioSample, n_rep , gene_id , alpha, n_genes){
handle_except <- function(bioSample, n_rep , gene_id , alpha){
if(is.numeric(n_rep) && length(n_rep) == 1){
message("Homogeneous number of replicates between samples: ", n_rep, " replicates per samples\n")
n_rep = rep(n_rep, length(bioSample))
if(is.null(bioSample)){
message("BioSample ID: NULL" )
bioSample = "my_first_lib"
message("Assuming only one library will be setup. A library named 'my_first_lib'")
}
else message("BioSample ID: ", "OK")
if(!is.null(n_rep) && length(bioSample) != length(n_rep)) stop("ERROR: unconsistent length between samples_names and n_rep\n")
if(is.null(n_rep)){
message("N_rep: ", "NULL")
n_rep = 1
message("Number of replicates unspecified\nAssuming n_rep = 1")
}
else message("N_rep: ", "OK")
if(is.numeric(n_rep) && length(n_rep) == 1){
message(n_rep, " replicates per samples")
n_rep = rep(n_rep, length(bioSample))
}
if(is.null(n_genes)) {
if(!is.null(gene_id) || !is.null(alpha)){
ifelse(length(alpha) == length(alpha),
(n_genes = length(gene_id)), ## if
stop("ERROR: unconsistent value between n_genes, length(gene_names) and length(gene_disp)\n")) ## else
}
if(is.null(gene_id)){
message("Gene_id: ", "NULL")
}
else message("Gene_id: ", "OK")
if(!is.null(n_genes)) {
if(is.null(gene_id) && is.null(alpha)) {
### Precised alpha params for each genes
alpha = runif(0.2,120, n = n_genes) ## randomly defined between 2 and 100
id = paste0('gene', 1:n_genes)
alpha <- list(gene_id = id, alpha = alpha)
}
if(is.null(alpha)){
message("Alpha: ", "NULL")
}
else message("Alpha: ", "OK")
if(!is.null(n_rep) && length(bioSample) != length(n_rep)) stop("ERROR: unconsistent length between samples_names and n_rep")
if(is.null(n_genes) && is.null(gene_disp) && is.null(gene_id)) {
message("Number of genes unspecified\nAssuming n_genes = 3\nAssuming gene dispersion (alpha) follow a uniform law between 2 and 100\n")
if(!is.null(gene_id) && !is.null(alpha)) {
if (is.data.frame(alpha)) if (length(gene_id) != dim(alpha)[1]) stop("ERROR: unconsistent length between gene_id and alpha")
if (is.vector(alpha)) if (length(gene_id) != length(alpha)) stop("ERROR: unconsistent length between gene_id and alpha")
else n_genes = length(alpha)
}
if( is.null(gene_id) && is.null(alpha) ){
message("Assuming n_genes = 3")
n_genes = 3
alpha = runif(2,100, n = n_genes)
gene_id = paste0('gene', 1:n_genes)
}
if(is.null(gene_id) && is.null(alpha)) {
message("n_genes = ", n_genes, "\nAssuming gene dispersion (alpha) follow a uniform law between 2 and 100\n")
alpha = runif(2,100, n = n_genes)
if(is.null(gene_id) && !is.null(alpha)) {
message("Built gene_id")
n_genes = length(alpha)
gene_id = paste0('gene', 1:n_genes)
}
if(length(bioSample) == 1 && bioSample == "my_first_lib") message("No sample name is provided.\nAssuming only one library will be setup\n")
if(is.null(n_rep)){
message("Number of replicates not provided.\nAssuming 10 replicates per sample will be setup")
n_rep = 10
if(!is.null(gene_id) && is.null(alpha)) {
message("Alpha randomly defined from uniform law between 2 and 100")
n_genes = length(gene_id)
alpha = runif(0.2,120, n = n_genes) ## randomly defined between 2 and 100
}
if ( !exists("n_genes")) n_genes = length(gene_id)
if(is.null(gene_id)) gene_id = paste0('gene', 1:n_genes)
my_list = list(bioSample = bioSample, rep = n_rep, n_g = n_genes, alpha = alpha)
my_list = list(bioSample = bioSample, rep = n_rep, n_g = n_genes, alpha = alpha, gene_id = gene_id)
return(my_list)
}
......@@ -120,7 +140,6 @@ handle_except <- function(bioSample, n_rep , gene_id , alpha, n_genes){
#' @param n_rep number of replicates
#' @param gene_id vector of id for each gene
#' @param alpha vector of alpha_i
#' @param n_genes number of genes
#' @param mu dataframe of mu_ij
#' @import purrr
#' @import data.table
......@@ -128,10 +147,11 @@ handle_except <- function(bioSample, n_rep , gene_id , alpha, n_genes){
#' @export
#'
#' @examples
setup_countGener <- function(bioSample_id = "my_first_lib", n_rep = NULL , gene_id = NULL , alpha = NULL, n_genes = NULL, mu = NULL ){
setup_countGener <- function(bioSample_id = NULL, n_rep = NULL , gene_id = NULL , alpha = NULL, mu = NULL ){
message("\nSetup counts generator ...")
######### HANDLE EXCEPTION #######
setup = handle_except(bioSample_id, n_rep , gene_id , alpha, n_genes)
setup = handle_except(bioSample_id, n_rep , gene_id , alpha)
######## HANDLE TYPE MU ##########
if(is.null(mu)) mu = .mu_generator # default function to generate mu
......@@ -141,9 +161,10 @@ setup_countGener <- function(bioSample_id = "my_first_lib", n_rep = NULL , gene_
nBinom_params <- purrr::map2(.x= setup$bioSample, .y = setup$rep,
~(list(name=.x, #sample_name
n_replicates = .y, # random int between 1 & max_N_replicates
gene_id = setup$alpha$gene_id, # gene_id
gene_id = setup$gene_id, # gene_id
mu = mu.set , #mu(ij)
alpha = setup$alpha$alpha))) %>% # alpha(i)
alpha = setup$alpha))) %>% # alpha(i)
data.table::rbindlist(.) %>% as.data.frame() ## convert to lovely dtf
}
......
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Please register or to comment