update package

96e96a1a · Arnaud Duvermy · a3dbde3b · 96e96a1a · 96e96a1a · 96e96a1a
Commit 96e96a1a authored 3 years ago by Arnaud Duvermy
--- a/src/htrsim/R/generate_counts.R
+++ b/src/htrsim/R/generate_counts.R
@@ -27,9 +27,14 @@ rn_sim <- function(mu, alpha, n_replicates, ...){
 #' @import plyr
 #' @import BiocGenerics
 #' @import purrr
+#' @import readr
 #'
 #' @examples
 generate_counts <- function(setup_dtf, export = FALSE){
+  full_name <- NULL
+  name <- NULL
+  variable <- NULL
+

  message("reading and processing counts per genes ...")

@@ -46,7 +51,7 @@ generate_counts <- function(setup_dtf, export = FALSE){
              tidyr::drop_na(counts) %>%
              reshape2::dcast(., gene_id ~ full_name, value.var= "counts")

-  if (export == TRUE) write.tsv(cnt.dtf, 'results/2022-03-03/estimate_dispersion.tsv')
+  if (export == TRUE) readr::write_tsv(cnt.dtf, 'results/2022-03-03/estimate_dispersion.tsv')

  return(cnt.dtf)
 }

--- a/src/htrsim/R/htrsim_workflow.R
+++ b/src/htrsim/R/htrsim_workflow.R
@@ -12,21 +12,26 @@
 #' @examples
 htrsim <- function(countData, bioDesign, N_replicates){

+
    # launch standard DESEQ2 analysis
-    dds = run.deseq(tabl_cnts, bioDesign = bioDesign)
+    dds = run.deseq(tabl_cnts = countData, bioDesign = bioDesign)
+
+    ## Model matrix per samples
+    mm <- model.matrix(~genotype + env + genotype:env, bioDesign)

    ## Input estimation
-    mu.input = estim.mu(dds)
+    res = estim.mu(dds, mm)
+    mu.input = res$mu
    alpha.input = estim.alpha(dds)

    # Setup simulation
-    input = reshape_input2setup(mu.dtf = mu.input, alpha.dtf = alpha.input)
+    input = reshape_input2setup(mu.dtf = mu.input, alpha.dtf = alpha.input, average_rep = FALSE)
+
    setup.simulation <- setup_countGener(bioSample_id = input$bioSample_id,
-                                         n_rep = N_replicates,
+                                         n_rep = 1,
                                         alpha = input$alpha,
                                         gene_id = input$gene_id,
                                         mu = input$mu)
-
    # Simulate counts
    htrs <- generate_counts(setup.simulation)
    return(list(countDataSim = htrs, input = input, dds = dds))

--- a/src/htrsim/R/input_estimation.R
+++ b/src/htrsim/R/input_estimation.R
@@ -11,15 +11,20 @@
 #' @import dplyr
 #' @import tibble
 #' @import BiocGenerics
+#' @import S4Vectors
+#' @import readr
 #' @examples
 #' @importFrom rlang .data
 estim.alpha <- function(dds, export = FALSE){
  gene_id <- NULL
+  expressed_gene_dispersion <- NULL
  ###################      Estimate alpha per gene        ########################
  #N.B: alpha = dispersion per gene
  #dds  <- DESeq2::estimateDispersions(dds, quiet = F)
  #dispersion estimation
-  dispersion_estimate <- DESeq2::dispersions(dds)
+  dds.mcols = S4Vectors::mcols(dds,use.names=TRUE)
+  dispersion_estimate = dds.mcols$dispersion
+  #dispersion_estimate <- DESeq2::dispersions(dds)

  ## Shape and export
  names(dispersion_estimate) <-  names(dds@rowRanges)
@@ -31,12 +36,103 @@ estim.alpha <- function(dds, export = FALSE){
                                    tibble::rownames_to_column() %>%
                                    dplyr::rename("alpha" = .data$., gene_id = "rowname")

-  if (export == TRUE)   write_tsv(disp_gene_express, 'results/2022-03-03/estimate_dispersion.tsv')
+  if (export == TRUE)   readr::write_tsv(expressed_gene_dispersion, 'results/2022-03-03/estimate_dispersion.tsv')

  return(expressed_gene_dispersion)

 }

+#' Estimate mu_ij
+#'
+#' @param dds DESEQ2 object
+#' @param export Boolean
+#' @param mm a model matrix
+#'
+#' @return mu_ij only for gene expressed c_ij != 0
+#' @export
+#' @import stats
+#' @import dplyr
+#' @import tibble
+#' @import BiocGenerics
+#' @import S4Vectors
+#' @examples
+#' @importFrom rlang .data
+estim.mu <- function(dds, mm, export = FALSE){
+
+
+  gene_id <- NULL
+  nb_sples = BiocGenerics::rownames(dds@colData) %>% length()
+  nb_genes =  BiocGenerics::rownames(dds@assays@data$counts) %>% length()
+  mm_epsi = rep(1, nb_sples)
+  names(mm_epsi) = 1 : nb_sples
+
+
+  dds.mcols = S4Vectors::mcols(dds,use.names=TRUE)
+  ## BETA
+  B0 <- dds.mcols$Intercept
+  B1 <- dds.mcols$genotype_msn2D_vs_wt
+  B2 <- dds.mcols$genotype_msn4D_vs_wt
+  B3 <- dds.mcols$env_kcl_vs_control
+  B4 <- dds.mcols$genotypemsn2D.envkcl
+  B5 <- dds.mcols$genotypemsn4D.envkcl
+
+  #print(max(B0, na.rm=TRUE))
+  #print(max(B1, na.rm=TRUE))
+  #print(max(B2, na.rm=TRUE))
+  #print(max(B3, na.rm=TRUE))
+  #print(max(B4, na.rm=TRUE))
+  #print(max(B5, na.rm = TRUE))
+
+  ## deviance = sigma2 -> estimate epsilon
+  deviance_i.sqrt = sqrt(dds.mcols$deviance)
+
+
+  beta.matrix = cbind(B0, B1,B2,B3,B4,B5) %>% as.matrix()
+  #p_ij = B0_i*mm1_j + B1_i*mm2_j + B3_i*mm3_j + B4_i*mm4_j + B5_i*mm5_j
+  p_ij = beta.matrix %*% t(mm)
+  #epsilon_ij ~ N(0, deviance)
+  #epsilon_ij = mm[,1] %>% map(., ~rnorm(deviance_i.sqrt, mean = 0, sd = 0 ))  %>% data.frame() %>% as.matrix()
+  epsilon_ij = mm[,1] %>% map(., ~rnorm(deviance_i.sqrt, mean = 0, sd = deviance_i.sqrt ))  %>% data.frame() %>% as.matrix()
+  ## log_qij = p_ij + epsilon_ij
+  log_qij <- p_ij + epsilon_ij
+  ## s_j
+  s_j = dds$sizeFactor
+
+  mu_ij = s_j * 2^log_qij
+
+
+  #################     Estimate mu        #########################
+  mu_estimate <- dds@assays@data$mu
+  #dds@assays@data$mu %>% dim()
+  #mu_estimate %>% dim()
+  rownames(mu_ij) = BiocGenerics::rownames(dds@assays@data$counts)
+  ## drop NA in dispersion estimate (link to unexpressed gene)
+  ### and convert to lovely dataframe
+  mu_gene_express = mu_ij %>%
+    stats::na.omit() %>%
+    data.frame()
+  colnames(mu_gene_express) <- rownames(dds@colData)
+  mu_gene_express <- mu_gene_express %>%
+    tibble::rownames_to_column(var = "gene_id")
+
+
+
+
+  if (export == TRUE)  readr::write_tsv(mu_gene_express, 'results/2022-03-03/estimate_mu.tsv')
+
+
+  res = list(mu = mu_gene_express, beta.matrix = beta.matrix, deviance.sqrt = deviance_i.sqrt, dds.mcols = dds.mcols)
+  return(res)
+
+
+}
+
+
+#.xMm.foo <- function(b, m) return(b * m)
+#.epsilon.foo <- function(x) return(rnorm(mean = 0 ,sd = x, n = 1 ))
+#.epsilon_i <- function(dev_i) return(dev_i %>% map(., ~.epsilon.foo(.))%>% unlist())
+#.getMu_i <- function(s, qi) return(2^(qi))
+


 #' Estimate mu_ij
@@ -52,7 +148,7 @@ estim.alpha <- function(dds, export = FALSE){
 #' @import BiocGenerics
 #' @examples
 #' @importFrom rlang .data
-estim.mu <- function(dds, export = FALSE){
+estim.mu_beta <- function(dds, export = FALSE){
  gene_id <- NULL
  #################     Estimate mu        #########################
  mu_estimate <- dds@assays@data$mu

--- a/src/htrsim/R/launch_deseq.R
+++ b/src/htrsim/R/launch_deseq.R
@@ -10,9 +10,12 @@
 #' @examples
 run.deseq <- function(tabl_cnts, bioDesign ){
  ########### LAUNCH DESEQ #############
-  ## Design model
-  dds = DESeq2::DESeqDataSetFromMatrix( countData = round(tabl_cnts), colData = bioDesign , design = ~ mutant + env + mutant:env)
+  ## Design model - specify reference
+  bioDesign$genotype <- factor(x = bioDesign$genotype,levels = c('wt','msn2D', 'msn4D'))
+  bioDesign$env <- factor(x = bioDesign$env,levels = c('control', 'kcl'))
+
  ## DESEQ standard analysis
+  dds = DESeq2::DESeqDataSetFromMatrix( countData = round(tabl_cnts), colData = bioDesign , design = ~ genotype + env + genotype:env)
  dds <- DESeq2::DESeq(dds)
  return(dds)
 }
--- a/src/htrsim/R/setup_cntsGenerator.R
+++ b/src/htrsim/R/setup_cntsGenerator.R
@@ -2,6 +2,7 @@
 #'
 #' @param mu.dtf dataframe of mu_ij
 #' @param alpha.dtf dataframe of alpha_i
+#' @param average_rep bool
 #'
 #' @return
 #' @import purrr
@@ -11,103 +12,122 @@
 #' @export
 #'
 #' @examples
-reshape_input2setup <- function(mu.dtf, alpha.dtf){
-  ## Defining sample names
-  bioSample_id <- mu.dtf %>%
+reshape_input2setup <- function(mu.dtf, alpha.dtf,  average_rep = FALSE){
+  gene_id <- NULL
+
+  if(average_rep == TRUE){
+      ## Defining sample names
+      bioSample_id <- mu.dtf %>%
+        dplyr::select(-gene_id) %>%
+        BiocGenerics::colnames() %>%
+        purrr::map(., ~stringr::str_split(.,"_")[[1]][1:2] %>%
+                     BiocGenerics::paste(., collapse='_') )  %>%
+        BiocGenerics::unlist() %>% BiocGenerics::unique()
+      ############### Mu is equal for biosample replicate #############
+      ### case 1: choose 1  replicate
+      #mu_params <- mu_params %>% dplyr::select(., contains("rep1"))
+      ## rename mu_params colnames to ensure corresponding with sample_names
+      #colnames(mu_params) <- sample_names
+      ### case 2: average replicates
+      average_rep <- function(x, dtf) {
+        varname <- x
+        dtf %>%
+          dplyr::select(.,contains(x)) %>%
+          dplyr::mutate(!!varname := rowMeans(.)) %>%
+          dplyr::select(varname)
+      }
+      mu_ij <- bioSample_id %>% purrr::map(.x = ., .f = ~average_rep(.x, mu.dtf))  %>% data.frame()
+      mu_ij$gene_id <- alpha.dtf$gene_id
+  }
+
+  else {
+
+  bioSample_id <-  mu.dtf %>%
    dplyr::select(-gene_id) %>%
-    BiocGenerics::colnames() %>%
-    purrr::map(., ~stringr::str_split(.,"_")[[1]][1:2] %>%
-                 BiocGenerics::paste(., collapse='_') )  %>%
-    BiocGenerics::unlist() %>% BiocGenerics::unique()
-
-
-  ############### Mu is same for biosample replicate #############
-  ### case 1: choose 1  replicate
-  #mu_params <- mu_params %>% dplyr::select(., contains("rep1"))
-  ## rename mu_params colnames to ensure corresponding with sample_names
-  #colnames(mu_params) <- sample_names
-  ### case 2: average replicates
-  average_rep <- function(x, dtf) {
-    varname <- x
-    dtf %>%
-      dplyr::select(.,contains(x)) %>%
-      dplyr::mutate(!!varname := rowMeans(.)) %>%
-      dplyr::select(varname)
+    BiocGenerics::colnames()
+
+  mu_ij = mu.dtf
+
  }
-  mu_avg.dtf <- bioSample_id %>% purrr::map(.x = ., .f = ~average_rep(.x, mu.dtf))  %>% data.frame()
-  mu_avg.dtf$gene_id <- alpha.dtf$gene_id

-  return(list(alpha = alpha.dtf , mu = mu_avg.dtf, bioSample_id = bioSample_id, gene_id = alpha.dtf$gene_id))
+  return(list(alpha = alpha.dtf , mu = mu_ij, bioSample_id = bioSample_id, gene_id = alpha.dtf$gene_id))
 }



 #' Handle exception
 #'
-#' @param bioSample_id vector of id for each bioSample
+#' @param bioSample vector of id for each bioSample
 #' @param n_rep number of replicates
 #' @param gene_id vector of id for each gene
 #' @param alpha vector of alpha_i
-#' @param n_genes number of genes
 #'
 #' @return
 #' @export
 #'
 #' @examples
-handle_except <- function(bioSample, n_rep , gene_id , alpha, n_genes){
+handle_except <- function(bioSample, n_rep , gene_id , alpha){

-  if(is.numeric(n_rep) && length(n_rep) == 1){
-    message("Homogeneous number of replicates between samples: ", n_rep, " replicates per samples\n")
-    n_rep = rep(n_rep, length(bioSample))
+
+
+  if(is.null(bioSample)){
+    message("BioSample ID: NULL" )
+    bioSample = "my_first_lib"
+    message("Assuming only one library will be setup. A library named 'my_first_lib'")
  }
+  else message("BioSample ID: ", "OK")

-  if(!is.null(n_rep) && length(bioSample) != length(n_rep)) stop("ERROR: unconsistent length between samples_names and n_rep\n")
+  if(is.null(n_rep)){
+    message("N_rep: ", "NULL")
+    n_rep = 1
+    message("Number of replicates unspecified\nAssuming n_rep = 1")
+  }
+  else message("N_rep: ", "OK")

+  if(is.numeric(n_rep) && length(n_rep) == 1){
+    message(n_rep, " replicates per samples")
+    n_rep = rep(n_rep, length(bioSample))
+  }

-  if(is.null(n_genes)) {
-    if(!is.null(gene_id) || !is.null(alpha)){
-      ifelse(length(alpha) == length(alpha),
-             (n_genes = length(gene_id)), ## if
-             stop("ERROR: unconsistent value between n_genes, length(gene_names) and length(gene_disp)\n")) ## else
-    }
+  if(is.null(gene_id)){
+    message("Gene_id: ", "NULL")
  }
+  else message("Gene_id: ", "OK")

-  if(!is.null(n_genes)) {
-    if(is.null(gene_id) && is.null(alpha)) {
-      ### Precised alpha params for each genes
-      alpha = runif(0.2,120, n = n_genes) ## randomly defined between 2 and 100
-      id = paste0('gene', 1:n_genes)
-      alpha <-  list(gene_id = id, alpha = alpha)
-    }
+  if(is.null(alpha)){
+    message("Alpha: ", "NULL")
  }
+  else message("Alpha: ", "OK")


+  if(!is.null(n_rep) && length(bioSample) != length(n_rep)) stop("ERROR: unconsistent length between samples_names and n_rep")

-  if(is.null(n_genes) && is.null(gene_disp) && is.null(gene_id)) {
-    message("Number of genes unspecified\nAssuming n_genes = 3\nAssuming gene dispersion (alpha) follow a uniform law between 2 and 100\n")
+  if(!is.null(gene_id)  && !is.null(alpha)) {
+      if (is.data.frame(alpha)) if (length(gene_id) != dim(alpha)[1]) stop("ERROR: unconsistent length between gene_id and alpha")
+      if (is.vector(alpha)) if (length(gene_id) != length(alpha)) stop("ERROR: unconsistent length between gene_id and alpha")
+      else n_genes = length(alpha)
+  }
+  if( is.null(gene_id) && is.null(alpha) ){
+    message("Assuming n_genes = 3")
    n_genes = 3
-    alpha = runif(2,100, n = n_genes)
    gene_id = paste0('gene', 1:n_genes)
  }

-  if(is.null(gene_id) && is.null(alpha)) {
-    message("n_genes = ", n_genes, "\nAssuming gene dispersion (alpha) follow a uniform law between 2 and 100\n")
-    alpha = runif(2,100, n = n_genes)
+  if(is.null(gene_id)  && !is.null(alpha)) {
+    message("Built gene_id")
+    n_genes = length(alpha)
    gene_id = paste0('gene', 1:n_genes)
  }

-  if(length(bioSample) == 1 && bioSample == "my_first_lib") message("No sample name is provided.\nAssuming only one library will be setup\n")
-
-  if(is.null(n_rep)){
-    message("Number of replicates not provided.\nAssuming 10 replicates per sample will be setup")
-    n_rep = 10
+  if(!is.null(gene_id)  && is.null(alpha)) {
+    message("Alpha randomly defined from uniform law between 2 and 100")
+    n_genes = length(gene_id)
+    alpha = runif(0.2,120, n = n_genes) ## randomly defined between 2 and 100
  }

+  if ( !exists("n_genes")) n_genes = length(gene_id)

-  if(is.null(gene_id)) gene_id = paste0('gene', 1:n_genes)
-
-
-  my_list = list(bioSample = bioSample, rep = n_rep, n_g = n_genes,   alpha = alpha)
+  my_list = list(bioSample = bioSample, rep = n_rep, n_g = n_genes,   alpha = alpha, gene_id = gene_id)
  return(my_list)
 }

@@ -120,7 +140,6 @@ handle_except <- function(bioSample, n_rep , gene_id , alpha, n_genes){
 #' @param n_rep number of replicates
 #' @param gene_id vector of id for each gene
 #' @param alpha vector of alpha_i
-#' @param n_genes number of genes
 #' @param mu dataframe of mu_ij
 #' @import purrr
 #' @import data.table
@@ -128,10 +147,11 @@ handle_except <- function(bioSample, n_rep , gene_id , alpha, n_genes){
 #' @export
 #'
 #' @examples
-setup_countGener <- function(bioSample_id = "my_first_lib", n_rep = NULL , gene_id = NULL , alpha = NULL, n_genes = NULL, mu = NULL ){
+setup_countGener <- function(bioSample_id = NULL, n_rep = NULL , gene_id = NULL , alpha = NULL, mu = NULL ){

+  message("\nSetup counts generator ...")
  ######### HANDLE EXCEPTION #######
-  setup = handle_except(bioSample_id, n_rep , gene_id , alpha, n_genes)
+  setup = handle_except(bioSample_id, n_rep , gene_id , alpha)
  ######## HANDLE TYPE MU ##########
  if(is.null(mu)) mu = .mu_generator # default function to generate mu

@@ -141,9 +161,10 @@ setup_countGener <- function(bioSample_id = "my_first_lib", n_rep = NULL , gene_
    nBinom_params <- purrr::map2(.x= setup$bioSample, .y = setup$rep,
                                 ~(list(name=.x, #sample_name
                                        n_replicates = .y, # random int between 1 & max_N_replicates
-                                        gene_id = setup$alpha$gene_id,  # gene_id
+                                        gene_id = setup$gene_id,  # gene_id
                                        mu = mu.set ,  #mu(ij)
-                                        alpha = setup$alpha$alpha))) %>%  # alpha(i)
+                                        alpha = setup$alpha))) %>%  # alpha(i)
+
      data.table::rbindlist(.) %>% as.data.frame() ## convert to lovely dtf
  }