From 96e96a1a1b6a0eb65a2cb27fb9e4cfeaf91ed0e7 Mon Sep 17 00:00:00 2001 From: aduvermy <arnaud.duvermy@ens-lyon.fr> Date: Fri, 22 Apr 2022 15:46:17 +0200 Subject: [PATCH] update package --- src/htrsim/R/generate_counts.R | 7 +- src/htrsim/R/htrsim_workflow.R | 15 ++- src/htrsim/R/input_estimation.R | 102 +++++++++++++++++++- src/htrsim/R/launch_deseq.R | 7 +- src/htrsim/R/setup_cntsGenerator.R | 149 ++++++++++++++++------------- 5 files changed, 205 insertions(+), 75 deletions(-) diff --git a/src/htrsim/R/generate_counts.R b/src/htrsim/R/generate_counts.R index beeaa85..54ddafb 100644 --- a/src/htrsim/R/generate_counts.R +++ b/src/htrsim/R/generate_counts.R @@ -27,9 +27,14 @@ rn_sim <- function(mu, alpha, n_replicates, ...){ #' @import plyr #' @import BiocGenerics #' @import purrr +#' @import readr #' #' @examples generate_counts <- function(setup_dtf, export = FALSE){ + full_name <- NULL + name <- NULL + variable <- NULL + message("reading and processing counts per genes ...") @@ -46,7 +51,7 @@ generate_counts <- function(setup_dtf, export = FALSE){ tidyr::drop_na(counts) %>% reshape2::dcast(., gene_id ~ full_name, value.var= "counts") - if (export == TRUE) write.tsv(cnt.dtf, 'results/2022-03-03/estimate_dispersion.tsv') + if (export == TRUE) readr::write_tsv(cnt.dtf, 'results/2022-03-03/estimate_dispersion.tsv') return(cnt.dtf) } diff --git a/src/htrsim/R/htrsim_workflow.R b/src/htrsim/R/htrsim_workflow.R index 3569b1f..4d618e4 100644 --- a/src/htrsim/R/htrsim_workflow.R +++ b/src/htrsim/R/htrsim_workflow.R @@ -12,21 +12,26 @@ #' @examples htrsim <- function(countData, bioDesign, N_replicates){ + # launch standard DESEQ2 analysis - dds = run.deseq(tabl_cnts, bioDesign = bioDesign) + dds = run.deseq(tabl_cnts = countData, bioDesign = bioDesign) + + ## Model matrix per samples + mm <- model.matrix(~genotype + env + genotype:env, bioDesign) ## Input estimation - mu.input = estim.mu(dds) + res = estim.mu(dds, mm) + mu.input = res$mu alpha.input = estim.alpha(dds) # Setup simulation - input = reshape_input2setup(mu.dtf = mu.input, alpha.dtf = alpha.input) + input = reshape_input2setup(mu.dtf = mu.input, alpha.dtf = alpha.input, average_rep = FALSE) + setup.simulation <- setup_countGener(bioSample_id = input$bioSample_id, - n_rep = N_replicates, + n_rep = 1, alpha = input$alpha, gene_id = input$gene_id, mu = input$mu) - # Simulate counts htrs <- generate_counts(setup.simulation) return(list(countDataSim = htrs, input = input, dds = dds)) diff --git a/src/htrsim/R/input_estimation.R b/src/htrsim/R/input_estimation.R index 7699137..b601c5e 100644 --- a/src/htrsim/R/input_estimation.R +++ b/src/htrsim/R/input_estimation.R @@ -11,15 +11,20 @@ #' @import dplyr #' @import tibble #' @import BiocGenerics +#' @import S4Vectors +#' @import readr #' @examples #' @importFrom rlang .data estim.alpha <- function(dds, export = FALSE){ gene_id <- NULL + expressed_gene_dispersion <- NULL ################### Estimate alpha per gene ######################## #N.B: alpha = dispersion per gene #dds <- DESeq2::estimateDispersions(dds, quiet = F) #dispersion estimation - dispersion_estimate <- DESeq2::dispersions(dds) + dds.mcols = S4Vectors::mcols(dds,use.names=TRUE) + dispersion_estimate = dds.mcols$dispersion + #dispersion_estimate <- DESeq2::dispersions(dds) ## Shape and export names(dispersion_estimate) <- names(dds@rowRanges) @@ -31,12 +36,103 @@ estim.alpha <- function(dds, export = FALSE){ tibble::rownames_to_column() %>% dplyr::rename("alpha" = .data$., gene_id = "rowname") - if (export == TRUE) write_tsv(disp_gene_express, 'results/2022-03-03/estimate_dispersion.tsv') + if (export == TRUE) readr::write_tsv(expressed_gene_dispersion, 'results/2022-03-03/estimate_dispersion.tsv') return(expressed_gene_dispersion) } +#' Estimate mu_ij +#' +#' @param dds DESEQ2 object +#' @param export Boolean +#' @param mm a model matrix +#' +#' @return mu_ij only for gene expressed c_ij != 0 +#' @export +#' @import stats +#' @import dplyr +#' @import tibble +#' @import BiocGenerics +#' @import S4Vectors +#' @examples +#' @importFrom rlang .data +estim.mu <- function(dds, mm, export = FALSE){ + + + gene_id <- NULL + nb_sples = BiocGenerics::rownames(dds@colData) %>% length() + nb_genes = BiocGenerics::rownames(dds@assays@data$counts) %>% length() + mm_epsi = rep(1, nb_sples) + names(mm_epsi) = 1 : nb_sples + + + dds.mcols = S4Vectors::mcols(dds,use.names=TRUE) + ## BETA + B0 <- dds.mcols$Intercept + B1 <- dds.mcols$genotype_msn2D_vs_wt + B2 <- dds.mcols$genotype_msn4D_vs_wt + B3 <- dds.mcols$env_kcl_vs_control + B4 <- dds.mcols$genotypemsn2D.envkcl + B5 <- dds.mcols$genotypemsn4D.envkcl + + #print(max(B0, na.rm=TRUE)) + #print(max(B1, na.rm=TRUE)) + #print(max(B2, na.rm=TRUE)) + #print(max(B3, na.rm=TRUE)) + #print(max(B4, na.rm=TRUE)) + #print(max(B5, na.rm = TRUE)) + + ## deviance = sigma2 -> estimate epsilon + deviance_i.sqrt = sqrt(dds.mcols$deviance) + + + beta.matrix = cbind(B0, B1,B2,B3,B4,B5) %>% as.matrix() + #p_ij = B0_i*mm1_j + B1_i*mm2_j + B3_i*mm3_j + B4_i*mm4_j + B5_i*mm5_j + p_ij = beta.matrix %*% t(mm) + #epsilon_ij ~ N(0, deviance) + #epsilon_ij = mm[,1] %>% map(., ~rnorm(deviance_i.sqrt, mean = 0, sd = 0 )) %>% data.frame() %>% as.matrix() + epsilon_ij = mm[,1] %>% map(., ~rnorm(deviance_i.sqrt, mean = 0, sd = deviance_i.sqrt )) %>% data.frame() %>% as.matrix() + ## log_qij = p_ij + epsilon_ij + log_qij <- p_ij + epsilon_ij + ## s_j + s_j = dds$sizeFactor + + mu_ij = s_j * 2^log_qij + + + ################# Estimate mu ######################### + mu_estimate <- dds@assays@data$mu + #dds@assays@data$mu %>% dim() + #mu_estimate %>% dim() + rownames(mu_ij) = BiocGenerics::rownames(dds@assays@data$counts) + ## drop NA in dispersion estimate (link to unexpressed gene) + ### and convert to lovely dataframe + mu_gene_express = mu_ij %>% + stats::na.omit() %>% + data.frame() + colnames(mu_gene_express) <- rownames(dds@colData) + mu_gene_express <- mu_gene_express %>% + tibble::rownames_to_column(var = "gene_id") + + + + + if (export == TRUE) readr::write_tsv(mu_gene_express, 'results/2022-03-03/estimate_mu.tsv') + + + res = list(mu = mu_gene_express, beta.matrix = beta.matrix, deviance.sqrt = deviance_i.sqrt, dds.mcols = dds.mcols) + return(res) + + +} + + +#.xMm.foo <- function(b, m) return(b * m) +#.epsilon.foo <- function(x) return(rnorm(mean = 0 ,sd = x, n = 1 )) +#.epsilon_i <- function(dev_i) return(dev_i %>% map(., ~.epsilon.foo(.))%>% unlist()) +#.getMu_i <- function(s, qi) return(2^(qi)) + #' Estimate mu_ij @@ -52,7 +148,7 @@ estim.alpha <- function(dds, export = FALSE){ #' @import BiocGenerics #' @examples #' @importFrom rlang .data -estim.mu <- function(dds, export = FALSE){ +estim.mu_beta <- function(dds, export = FALSE){ gene_id <- NULL ################# Estimate mu ######################### mu_estimate <- dds@assays@data$mu diff --git a/src/htrsim/R/launch_deseq.R b/src/htrsim/R/launch_deseq.R index 6650ac1..d500f78 100644 --- a/src/htrsim/R/launch_deseq.R +++ b/src/htrsim/R/launch_deseq.R @@ -10,9 +10,12 @@ #' @examples run.deseq <- function(tabl_cnts, bioDesign ){ ########### LAUNCH DESEQ ############# - ## Design model - dds = DESeq2::DESeqDataSetFromMatrix( countData = round(tabl_cnts), colData = bioDesign , design = ~ mutant + env + mutant:env) + ## Design model - specify reference + bioDesign$genotype <- factor(x = bioDesign$genotype,levels = c('wt','msn2D', 'msn4D')) + bioDesign$env <- factor(x = bioDesign$env,levels = c('control', 'kcl')) + ## DESEQ standard analysis + dds = DESeq2::DESeqDataSetFromMatrix( countData = round(tabl_cnts), colData = bioDesign , design = ~ genotype + env + genotype:env) dds <- DESeq2::DESeq(dds) return(dds) } diff --git a/src/htrsim/R/setup_cntsGenerator.R b/src/htrsim/R/setup_cntsGenerator.R index 0558c19..7e18062 100644 --- a/src/htrsim/R/setup_cntsGenerator.R +++ b/src/htrsim/R/setup_cntsGenerator.R @@ -2,6 +2,7 @@ #' #' @param mu.dtf dataframe of mu_ij #' @param alpha.dtf dataframe of alpha_i +#' @param average_rep bool #' #' @return #' @import purrr @@ -11,103 +12,122 @@ #' @export #' #' @examples -reshape_input2setup <- function(mu.dtf, alpha.dtf){ - ## Defining sample names - bioSample_id <- mu.dtf %>% +reshape_input2setup <- function(mu.dtf, alpha.dtf, average_rep = FALSE){ + gene_id <- NULL + + if(average_rep == TRUE){ + ## Defining sample names + bioSample_id <- mu.dtf %>% + dplyr::select(-gene_id) %>% + BiocGenerics::colnames() %>% + purrr::map(., ~stringr::str_split(.,"_")[[1]][1:2] %>% + BiocGenerics::paste(., collapse='_') ) %>% + BiocGenerics::unlist() %>% BiocGenerics::unique() + ############### Mu is equal for biosample replicate ############# + ### case 1: choose 1 replicate + #mu_params <- mu_params %>% dplyr::select(., contains("rep1")) + ## rename mu_params colnames to ensure corresponding with sample_names + #colnames(mu_params) <- sample_names + ### case 2: average replicates + average_rep <- function(x, dtf) { + varname <- x + dtf %>% + dplyr::select(.,contains(x)) %>% + dplyr::mutate(!!varname := rowMeans(.)) %>% + dplyr::select(varname) + } + mu_ij <- bioSample_id %>% purrr::map(.x = ., .f = ~average_rep(.x, mu.dtf)) %>% data.frame() + mu_ij$gene_id <- alpha.dtf$gene_id + } + + else { + + bioSample_id <- mu.dtf %>% dplyr::select(-gene_id) %>% - BiocGenerics::colnames() %>% - purrr::map(., ~stringr::str_split(.,"_")[[1]][1:2] %>% - BiocGenerics::paste(., collapse='_') ) %>% - BiocGenerics::unlist() %>% BiocGenerics::unique() - - - ############### Mu is same for biosample replicate ############# - ### case 1: choose 1 replicate - #mu_params <- mu_params %>% dplyr::select(., contains("rep1")) - ## rename mu_params colnames to ensure corresponding with sample_names - #colnames(mu_params) <- sample_names - ### case 2: average replicates - average_rep <- function(x, dtf) { - varname <- x - dtf %>% - dplyr::select(.,contains(x)) %>% - dplyr::mutate(!!varname := rowMeans(.)) %>% - dplyr::select(varname) + BiocGenerics::colnames() + + mu_ij = mu.dtf + } - mu_avg.dtf <- bioSample_id %>% purrr::map(.x = ., .f = ~average_rep(.x, mu.dtf)) %>% data.frame() - mu_avg.dtf$gene_id <- alpha.dtf$gene_id - return(list(alpha = alpha.dtf , mu = mu_avg.dtf, bioSample_id = bioSample_id, gene_id = alpha.dtf$gene_id)) + return(list(alpha = alpha.dtf , mu = mu_ij, bioSample_id = bioSample_id, gene_id = alpha.dtf$gene_id)) } #' Handle exception #' -#' @param bioSample_id vector of id for each bioSample +#' @param bioSample vector of id for each bioSample #' @param n_rep number of replicates #' @param gene_id vector of id for each gene #' @param alpha vector of alpha_i -#' @param n_genes number of genes #' #' @return #' @export #' #' @examples -handle_except <- function(bioSample, n_rep , gene_id , alpha, n_genes){ +handle_except <- function(bioSample, n_rep , gene_id , alpha){ - if(is.numeric(n_rep) && length(n_rep) == 1){ - message("Homogeneous number of replicates between samples: ", n_rep, " replicates per samples\n") - n_rep = rep(n_rep, length(bioSample)) + + + if(is.null(bioSample)){ + message("BioSample ID: NULL" ) + bioSample = "my_first_lib" + message("Assuming only one library will be setup. A library named 'my_first_lib'") } + else message("BioSample ID: ", "OK") - if(!is.null(n_rep) && length(bioSample) != length(n_rep)) stop("ERROR: unconsistent length between samples_names and n_rep\n") + if(is.null(n_rep)){ + message("N_rep: ", "NULL") + n_rep = 1 + message("Number of replicates unspecified\nAssuming n_rep = 1") + } + else message("N_rep: ", "OK") + if(is.numeric(n_rep) && length(n_rep) == 1){ + message(n_rep, " replicates per samples") + n_rep = rep(n_rep, length(bioSample)) + } - if(is.null(n_genes)) { - if(!is.null(gene_id) || !is.null(alpha)){ - ifelse(length(alpha) == length(alpha), - (n_genes = length(gene_id)), ## if - stop("ERROR: unconsistent value between n_genes, length(gene_names) and length(gene_disp)\n")) ## else - } + if(is.null(gene_id)){ + message("Gene_id: ", "NULL") } + else message("Gene_id: ", "OK") - if(!is.null(n_genes)) { - if(is.null(gene_id) && is.null(alpha)) { - ### Precised alpha params for each genes - alpha = runif(0.2,120, n = n_genes) ## randomly defined between 2 and 100 - id = paste0('gene', 1:n_genes) - alpha <- list(gene_id = id, alpha = alpha) - } + if(is.null(alpha)){ + message("Alpha: ", "NULL") } + else message("Alpha: ", "OK") + if(!is.null(n_rep) && length(bioSample) != length(n_rep)) stop("ERROR: unconsistent length between samples_names and n_rep") - if(is.null(n_genes) && is.null(gene_disp) && is.null(gene_id)) { - message("Number of genes unspecified\nAssuming n_genes = 3\nAssuming gene dispersion (alpha) follow a uniform law between 2 and 100\n") + if(!is.null(gene_id) && !is.null(alpha)) { + if (is.data.frame(alpha)) if (length(gene_id) != dim(alpha)[1]) stop("ERROR: unconsistent length between gene_id and alpha") + if (is.vector(alpha)) if (length(gene_id) != length(alpha)) stop("ERROR: unconsistent length between gene_id and alpha") + else n_genes = length(alpha) + } + if( is.null(gene_id) && is.null(alpha) ){ + message("Assuming n_genes = 3") n_genes = 3 - alpha = runif(2,100, n = n_genes) gene_id = paste0('gene', 1:n_genes) } - if(is.null(gene_id) && is.null(alpha)) { - message("n_genes = ", n_genes, "\nAssuming gene dispersion (alpha) follow a uniform law between 2 and 100\n") - alpha = runif(2,100, n = n_genes) + if(is.null(gene_id) && !is.null(alpha)) { + message("Built gene_id") + n_genes = length(alpha) gene_id = paste0('gene', 1:n_genes) } - if(length(bioSample) == 1 && bioSample == "my_first_lib") message("No sample name is provided.\nAssuming only one library will be setup\n") - - if(is.null(n_rep)){ - message("Number of replicates not provided.\nAssuming 10 replicates per sample will be setup") - n_rep = 10 + if(!is.null(gene_id) && is.null(alpha)) { + message("Alpha randomly defined from uniform law between 2 and 100") + n_genes = length(gene_id) + alpha = runif(0.2,120, n = n_genes) ## randomly defined between 2 and 100 } + if ( !exists("n_genes")) n_genes = length(gene_id) - if(is.null(gene_id)) gene_id = paste0('gene', 1:n_genes) - - - my_list = list(bioSample = bioSample, rep = n_rep, n_g = n_genes, alpha = alpha) + my_list = list(bioSample = bioSample, rep = n_rep, n_g = n_genes, alpha = alpha, gene_id = gene_id) return(my_list) } @@ -120,7 +140,6 @@ handle_except <- function(bioSample, n_rep , gene_id , alpha, n_genes){ #' @param n_rep number of replicates #' @param gene_id vector of id for each gene #' @param alpha vector of alpha_i -#' @param n_genes number of genes #' @param mu dataframe of mu_ij #' @import purrr #' @import data.table @@ -128,10 +147,11 @@ handle_except <- function(bioSample, n_rep , gene_id , alpha, n_genes){ #' @export #' #' @examples -setup_countGener <- function(bioSample_id = "my_first_lib", n_rep = NULL , gene_id = NULL , alpha = NULL, n_genes = NULL, mu = NULL ){ +setup_countGener <- function(bioSample_id = NULL, n_rep = NULL , gene_id = NULL , alpha = NULL, mu = NULL ){ + message("\nSetup counts generator ...") ######### HANDLE EXCEPTION ####### - setup = handle_except(bioSample_id, n_rep , gene_id , alpha, n_genes) + setup = handle_except(bioSample_id, n_rep , gene_id , alpha) ######## HANDLE TYPE MU ########## if(is.null(mu)) mu = .mu_generator # default function to generate mu @@ -141,9 +161,10 @@ setup_countGener <- function(bioSample_id = "my_first_lib", n_rep = NULL , gene_ nBinom_params <- purrr::map2(.x= setup$bioSample, .y = setup$rep, ~(list(name=.x, #sample_name n_replicates = .y, # random int between 1 & max_N_replicates - gene_id = setup$alpha$gene_id, # gene_id + gene_id = setup$gene_id, # gene_id mu = mu.set , #mu(ij) - alpha = setup$alpha$alpha))) %>% # alpha(i) + alpha = setup$alpha))) %>% # alpha(i) + data.table::rbindlist(.) %>% as.data.frame() ## convert to lovely dtf } -- GitLab