From 96e96a1a1b6a0eb65a2cb27fb9e4cfeaf91ed0e7 Mon Sep 17 00:00:00 2001
From: aduvermy <arnaud.duvermy@ens-lyon.fr>
Date: Fri, 22 Apr 2022 15:46:17 +0200
Subject: [PATCH] update package

---
 src/htrsim/R/generate_counts.R     |   7 +-
 src/htrsim/R/htrsim_workflow.R     |  15 ++-
 src/htrsim/R/input_estimation.R    | 102 +++++++++++++++++++-
 src/htrsim/R/launch_deseq.R        |   7 +-
 src/htrsim/R/setup_cntsGenerator.R | 149 ++++++++++++++++-------------
 5 files changed, 205 insertions(+), 75 deletions(-)

diff --git a/src/htrsim/R/generate_counts.R b/src/htrsim/R/generate_counts.R
index beeaa85..54ddafb 100644
--- a/src/htrsim/R/generate_counts.R
+++ b/src/htrsim/R/generate_counts.R
@@ -27,9 +27,14 @@ rn_sim <- function(mu, alpha, n_replicates, ...){
 #' @import plyr
 #' @import BiocGenerics
 #' @import purrr
+#' @import readr
 #'
 #' @examples
 generate_counts <- function(setup_dtf, export = FALSE){
+  full_name <- NULL
+  name <- NULL
+  variable <- NULL
+
 
   message("reading and processing counts per genes ...")
 
@@ -46,7 +51,7 @@ generate_counts <- function(setup_dtf, export = FALSE){
               tidyr::drop_na(counts) %>%
               reshape2::dcast(., gene_id ~ full_name, value.var= "counts")
 
-  if (export == TRUE) write.tsv(cnt.dtf, 'results/2022-03-03/estimate_dispersion.tsv')
+  if (export == TRUE) readr::write_tsv(cnt.dtf, 'results/2022-03-03/estimate_dispersion.tsv')
 
   return(cnt.dtf)
 }
diff --git a/src/htrsim/R/htrsim_workflow.R b/src/htrsim/R/htrsim_workflow.R
index 3569b1f..4d618e4 100644
--- a/src/htrsim/R/htrsim_workflow.R
+++ b/src/htrsim/R/htrsim_workflow.R
@@ -12,21 +12,26 @@
 #' @examples
 htrsim <- function(countData, bioDesign, N_replicates){
 
+
     # launch standard DESEQ2 analysis
-    dds = run.deseq(tabl_cnts, bioDesign = bioDesign)
+    dds = run.deseq(tabl_cnts = countData, bioDesign = bioDesign)
+
+    ## Model matrix per samples
+    mm <- model.matrix(~genotype + env + genotype:env, bioDesign)
 
     ## Input estimation
-    mu.input = estim.mu(dds)
+    res = estim.mu(dds, mm)
+    mu.input = res$mu
     alpha.input = estim.alpha(dds)
 
     # Setup simulation
-    input = reshape_input2setup(mu.dtf = mu.input, alpha.dtf = alpha.input)
+    input = reshape_input2setup(mu.dtf = mu.input, alpha.dtf = alpha.input, average_rep = FALSE)
+
     setup.simulation <- setup_countGener(bioSample_id = input$bioSample_id,
-                                         n_rep = N_replicates,
+                                         n_rep = 1,
                                          alpha = input$alpha,
                                          gene_id = input$gene_id,
                                          mu = input$mu)
-
     # Simulate counts
     htrs <- generate_counts(setup.simulation)
     return(list(countDataSim = htrs, input = input, dds = dds))
diff --git a/src/htrsim/R/input_estimation.R b/src/htrsim/R/input_estimation.R
index 7699137..b601c5e 100644
--- a/src/htrsim/R/input_estimation.R
+++ b/src/htrsim/R/input_estimation.R
@@ -11,15 +11,20 @@
 #' @import dplyr
 #' @import tibble
 #' @import BiocGenerics
+#' @import S4Vectors
+#' @import readr
 #' @examples
 #' @importFrom rlang .data
 estim.alpha <- function(dds, export = FALSE){
   gene_id <- NULL
+  expressed_gene_dispersion <- NULL
   ###################      Estimate alpha per gene        ########################
   #N.B: alpha = dispersion per gene
   #dds  <- DESeq2::estimateDispersions(dds, quiet = F)
   #dispersion estimation
-  dispersion_estimate <- DESeq2::dispersions(dds)
+  dds.mcols = S4Vectors::mcols(dds,use.names=TRUE)
+  dispersion_estimate = dds.mcols$dispersion
+  #dispersion_estimate <- DESeq2::dispersions(dds)
 
   ## Shape and export
   names(dispersion_estimate) <-  names(dds@rowRanges)
@@ -31,12 +36,103 @@ estim.alpha <- function(dds, export = FALSE){
                                     tibble::rownames_to_column() %>%
                                     dplyr::rename("alpha" = .data$., gene_id = "rowname")
 
-  if (export == TRUE)   write_tsv(disp_gene_express, 'results/2022-03-03/estimate_dispersion.tsv')
+  if (export == TRUE)   readr::write_tsv(expressed_gene_dispersion, 'results/2022-03-03/estimate_dispersion.tsv')
 
   return(expressed_gene_dispersion)
 
 }
 
+#' Estimate mu_ij
+#'
+#' @param dds DESEQ2 object
+#' @param export Boolean
+#' @param mm a model matrix
+#'
+#' @return mu_ij only for gene expressed c_ij != 0
+#' @export
+#' @import stats
+#' @import dplyr
+#' @import tibble
+#' @import BiocGenerics
+#' @import S4Vectors
+#' @examples
+#' @importFrom rlang .data
+estim.mu <- function(dds, mm, export = FALSE){
+
+
+  gene_id <- NULL
+  nb_sples = BiocGenerics::rownames(dds@colData) %>% length()
+  nb_genes =  BiocGenerics::rownames(dds@assays@data$counts) %>% length()
+  mm_epsi = rep(1, nb_sples)
+  names(mm_epsi) = 1 : nb_sples
+
+
+  dds.mcols = S4Vectors::mcols(dds,use.names=TRUE)
+  ## BETA
+  B0 <- dds.mcols$Intercept
+  B1 <- dds.mcols$genotype_msn2D_vs_wt
+  B2 <- dds.mcols$genotype_msn4D_vs_wt
+  B3 <- dds.mcols$env_kcl_vs_control
+  B4 <- dds.mcols$genotypemsn2D.envkcl
+  B5 <- dds.mcols$genotypemsn4D.envkcl
+
+  #print(max(B0, na.rm=TRUE))
+  #print(max(B1, na.rm=TRUE))
+  #print(max(B2, na.rm=TRUE))
+  #print(max(B3, na.rm=TRUE))
+  #print(max(B4, na.rm=TRUE))
+  #print(max(B5, na.rm = TRUE))
+
+  ## deviance = sigma2 -> estimate epsilon
+  deviance_i.sqrt = sqrt(dds.mcols$deviance)
+
+
+  beta.matrix = cbind(B0, B1,B2,B3,B4,B5) %>% as.matrix()
+  #p_ij = B0_i*mm1_j + B1_i*mm2_j + B3_i*mm3_j + B4_i*mm4_j + B5_i*mm5_j
+  p_ij = beta.matrix %*% t(mm)
+  #epsilon_ij ~ N(0, deviance)
+  #epsilon_ij = mm[,1] %>% map(., ~rnorm(deviance_i.sqrt, mean = 0, sd = 0 ))  %>% data.frame() %>% as.matrix()
+  epsilon_ij = mm[,1] %>% map(., ~rnorm(deviance_i.sqrt, mean = 0, sd = deviance_i.sqrt ))  %>% data.frame() %>% as.matrix()
+  ## log_qij = p_ij + epsilon_ij
+  log_qij <- p_ij + epsilon_ij
+  ## s_j
+  s_j = dds$sizeFactor
+
+  mu_ij = s_j * 2^log_qij
+
+
+  #################     Estimate mu        #########################
+  mu_estimate <- dds@assays@data$mu
+  #dds@assays@data$mu %>% dim()
+  #mu_estimate %>% dim()
+  rownames(mu_ij) = BiocGenerics::rownames(dds@assays@data$counts)
+  ## drop NA in dispersion estimate (link to unexpressed gene)
+  ### and convert to lovely dataframe
+  mu_gene_express = mu_ij %>%
+    stats::na.omit() %>%
+    data.frame()
+  colnames(mu_gene_express) <- rownames(dds@colData)
+  mu_gene_express <- mu_gene_express %>%
+    tibble::rownames_to_column(var = "gene_id")
+
+
+
+
+  if (export == TRUE)  readr::write_tsv(mu_gene_express, 'results/2022-03-03/estimate_mu.tsv')
+
+
+  res = list(mu = mu_gene_express, beta.matrix = beta.matrix, deviance.sqrt = deviance_i.sqrt, dds.mcols = dds.mcols)
+  return(res)
+
+
+}
+
+
+#.xMm.foo <- function(b, m) return(b * m)
+#.epsilon.foo <- function(x) return(rnorm(mean = 0 ,sd = x, n = 1 ))
+#.epsilon_i <- function(dev_i) return(dev_i %>% map(., ~.epsilon.foo(.))%>% unlist())
+#.getMu_i <- function(s, qi) return(2^(qi))
+
 
 
 #' Estimate mu_ij
@@ -52,7 +148,7 @@ estim.alpha <- function(dds, export = FALSE){
 #' @import BiocGenerics
 #' @examples
 #' @importFrom rlang .data
-estim.mu <- function(dds, export = FALSE){
+estim.mu_beta <- function(dds, export = FALSE){
   gene_id <- NULL
   #################     Estimate mu        #########################
   mu_estimate <- dds@assays@data$mu
diff --git a/src/htrsim/R/launch_deseq.R b/src/htrsim/R/launch_deseq.R
index 6650ac1..d500f78 100644
--- a/src/htrsim/R/launch_deseq.R
+++ b/src/htrsim/R/launch_deseq.R
@@ -10,9 +10,12 @@
 #' @examples
 run.deseq <- function(tabl_cnts, bioDesign ){
   ########### LAUNCH DESEQ #############
-  ## Design model
-  dds = DESeq2::DESeqDataSetFromMatrix( countData = round(tabl_cnts), colData = bioDesign , design = ~ mutant + env + mutant:env)
+  ## Design model - specify reference
+  bioDesign$genotype <- factor(x = bioDesign$genotype,levels = c('wt','msn2D', 'msn4D'))
+  bioDesign$env <- factor(x = bioDesign$env,levels = c('control', 'kcl'))
+
   ## DESEQ standard analysis
+  dds = DESeq2::DESeqDataSetFromMatrix( countData = round(tabl_cnts), colData = bioDesign , design = ~ genotype + env + genotype:env)
   dds <- DESeq2::DESeq(dds)
   return(dds)
 }
diff --git a/src/htrsim/R/setup_cntsGenerator.R b/src/htrsim/R/setup_cntsGenerator.R
index 0558c19..7e18062 100644
--- a/src/htrsim/R/setup_cntsGenerator.R
+++ b/src/htrsim/R/setup_cntsGenerator.R
@@ -2,6 +2,7 @@
 #'
 #' @param mu.dtf dataframe of mu_ij
 #' @param alpha.dtf dataframe of alpha_i
+#' @param average_rep bool
 #'
 #' @return
 #' @import purrr
@@ -11,103 +12,122 @@
 #' @export
 #'
 #' @examples
-reshape_input2setup <- function(mu.dtf, alpha.dtf){
-  ## Defining sample names
-  bioSample_id <- mu.dtf %>%
+reshape_input2setup <- function(mu.dtf, alpha.dtf,  average_rep = FALSE){
+  gene_id <- NULL
+
+  if(average_rep == TRUE){
+      ## Defining sample names
+      bioSample_id <- mu.dtf %>%
+        dplyr::select(-gene_id) %>%
+        BiocGenerics::colnames() %>%
+        purrr::map(., ~stringr::str_split(.,"_")[[1]][1:2] %>%
+                     BiocGenerics::paste(., collapse='_') )  %>%
+        BiocGenerics::unlist() %>% BiocGenerics::unique()
+      ############### Mu is equal for biosample replicate #############
+      ### case 1: choose 1  replicate
+      #mu_params <- mu_params %>% dplyr::select(., contains("rep1"))
+      ## rename mu_params colnames to ensure corresponding with sample_names
+      #colnames(mu_params) <- sample_names
+      ### case 2: average replicates
+      average_rep <- function(x, dtf) {
+        varname <- x
+        dtf %>%
+          dplyr::select(.,contains(x)) %>%
+          dplyr::mutate(!!varname := rowMeans(.)) %>%
+          dplyr::select(varname)
+      }
+      mu_ij <- bioSample_id %>% purrr::map(.x = ., .f = ~average_rep(.x, mu.dtf))  %>% data.frame()
+      mu_ij$gene_id <- alpha.dtf$gene_id
+  }
+
+  else {
+
+  bioSample_id <-  mu.dtf %>%
     dplyr::select(-gene_id) %>%
-    BiocGenerics::colnames() %>%
-    purrr::map(., ~stringr::str_split(.,"_")[[1]][1:2] %>%
-                 BiocGenerics::paste(., collapse='_') )  %>%
-    BiocGenerics::unlist() %>% BiocGenerics::unique()
-
-
-  ############### Mu is same for biosample replicate #############
-  ### case 1: choose 1  replicate
-  #mu_params <- mu_params %>% dplyr::select(., contains("rep1"))
-  ## rename mu_params colnames to ensure corresponding with sample_names
-  #colnames(mu_params) <- sample_names
-  ### case 2: average replicates
-  average_rep <- function(x, dtf) {
-    varname <- x
-    dtf %>%
-      dplyr::select(.,contains(x)) %>%
-      dplyr::mutate(!!varname := rowMeans(.)) %>%
-      dplyr::select(varname)
+    BiocGenerics::colnames()
+
+  mu_ij = mu.dtf
+
   }
-  mu_avg.dtf <- bioSample_id %>% purrr::map(.x = ., .f = ~average_rep(.x, mu.dtf))  %>% data.frame()
-  mu_avg.dtf$gene_id <- alpha.dtf$gene_id
 
-  return(list(alpha = alpha.dtf , mu = mu_avg.dtf, bioSample_id = bioSample_id, gene_id = alpha.dtf$gene_id))
+  return(list(alpha = alpha.dtf , mu = mu_ij, bioSample_id = bioSample_id, gene_id = alpha.dtf$gene_id))
 }
 
 
 
 #' Handle exception
 #'
-#' @param bioSample_id vector of id for each bioSample
+#' @param bioSample vector of id for each bioSample
 #' @param n_rep number of replicates
 #' @param gene_id vector of id for each gene
 #' @param alpha vector of alpha_i
-#' @param n_genes number of genes
 #'
 #' @return
 #' @export
 #'
 #' @examples
-handle_except <- function(bioSample, n_rep , gene_id , alpha, n_genes){
+handle_except <- function(bioSample, n_rep , gene_id , alpha){
 
-  if(is.numeric(n_rep) && length(n_rep) == 1){
-    message("Homogeneous number of replicates between samples: ", n_rep, " replicates per samples\n")
-    n_rep = rep(n_rep, length(bioSample))
+
+
+  if(is.null(bioSample)){
+    message("BioSample ID: NULL" )
+    bioSample = "my_first_lib"
+    message("Assuming only one library will be setup. A library named 'my_first_lib'")
   }
+  else message("BioSample ID: ", "OK")
 
-  if(!is.null(n_rep) && length(bioSample) != length(n_rep)) stop("ERROR: unconsistent length between samples_names and n_rep\n")
+  if(is.null(n_rep)){
+    message("N_rep: ", "NULL")
+    n_rep = 1
+    message("Number of replicates unspecified\nAssuming n_rep = 1")
+  }
+  else message("N_rep: ", "OK")
 
+  if(is.numeric(n_rep) && length(n_rep) == 1){
+    message(n_rep, " replicates per samples")
+    n_rep = rep(n_rep, length(bioSample))
+  }
 
-  if(is.null(n_genes)) {
-    if(!is.null(gene_id) || !is.null(alpha)){
-      ifelse(length(alpha) == length(alpha),
-             (n_genes = length(gene_id)), ## if
-             stop("ERROR: unconsistent value between n_genes, length(gene_names) and length(gene_disp)\n")) ## else
-    }
+  if(is.null(gene_id)){
+    message("Gene_id: ", "NULL")
   }
+  else message("Gene_id: ", "OK")
 
-  if(!is.null(n_genes)) {
-    if(is.null(gene_id) && is.null(alpha)) {
-      ### Precised alpha params for each genes
-      alpha = runif(0.2,120, n = n_genes) ## randomly defined between 2 and 100
-      id = paste0('gene', 1:n_genes)
-      alpha <-  list(gene_id = id, alpha = alpha)
-    }
+  if(is.null(alpha)){
+    message("Alpha: ", "NULL")
   }
+  else message("Alpha: ", "OK")
 
 
+  if(!is.null(n_rep) && length(bioSample) != length(n_rep)) stop("ERROR: unconsistent length between samples_names and n_rep")
 
-  if(is.null(n_genes) && is.null(gene_disp) && is.null(gene_id)) {
-    message("Number of genes unspecified\nAssuming n_genes = 3\nAssuming gene dispersion (alpha) follow a uniform law between 2 and 100\n")
+  if(!is.null(gene_id)  && !is.null(alpha)) {
+      if (is.data.frame(alpha)) if (length(gene_id) != dim(alpha)[1]) stop("ERROR: unconsistent length between gene_id and alpha")
+      if (is.vector(alpha)) if (length(gene_id) != length(alpha)) stop("ERROR: unconsistent length between gene_id and alpha")
+      else n_genes = length(alpha)
+  }
+  if( is.null(gene_id) && is.null(alpha) ){
+    message("Assuming n_genes = 3")
     n_genes = 3
-    alpha = runif(2,100, n = n_genes)
     gene_id = paste0('gene', 1:n_genes)
   }
 
-  if(is.null(gene_id) && is.null(alpha)) {
-    message("n_genes = ", n_genes, "\nAssuming gene dispersion (alpha) follow a uniform law between 2 and 100\n")
-    alpha = runif(2,100, n = n_genes)
+  if(is.null(gene_id)  && !is.null(alpha)) {
+    message("Built gene_id")
+    n_genes = length(alpha)
     gene_id = paste0('gene', 1:n_genes)
   }
 
-  if(length(bioSample) == 1 && bioSample == "my_first_lib") message("No sample name is provided.\nAssuming only one library will be setup\n")
-
-  if(is.null(n_rep)){
-    message("Number of replicates not provided.\nAssuming 10 replicates per sample will be setup")
-    n_rep = 10
+  if(!is.null(gene_id)  && is.null(alpha)) {
+    message("Alpha randomly defined from uniform law between 2 and 100")
+    n_genes = length(gene_id)
+    alpha = runif(0.2,120, n = n_genes) ## randomly defined between 2 and 100
   }
 
+  if ( !exists("n_genes")) n_genes = length(gene_id)
 
-  if(is.null(gene_id)) gene_id = paste0('gene', 1:n_genes)
-
-
-  my_list = list(bioSample = bioSample, rep = n_rep, n_g = n_genes,   alpha = alpha)
+  my_list = list(bioSample = bioSample, rep = n_rep, n_g = n_genes,   alpha = alpha, gene_id = gene_id)
   return(my_list)
 }
 
@@ -120,7 +140,6 @@ handle_except <- function(bioSample, n_rep , gene_id , alpha, n_genes){
 #' @param n_rep number of replicates
 #' @param gene_id vector of id for each gene
 #' @param alpha vector of alpha_i
-#' @param n_genes number of genes
 #' @param mu dataframe of mu_ij
 #' @import purrr
 #' @import data.table
@@ -128,10 +147,11 @@ handle_except <- function(bioSample, n_rep , gene_id , alpha, n_genes){
 #' @export
 #'
 #' @examples
-setup_countGener <- function(bioSample_id = "my_first_lib", n_rep = NULL , gene_id = NULL , alpha = NULL, n_genes = NULL, mu = NULL ){
+setup_countGener <- function(bioSample_id = NULL, n_rep = NULL , gene_id = NULL , alpha = NULL, mu = NULL ){
 
+  message("\nSetup counts generator ...")
   ######### HANDLE EXCEPTION #######
-  setup = handle_except(bioSample_id, n_rep , gene_id , alpha, n_genes)
+  setup = handle_except(bioSample_id, n_rep , gene_id , alpha)
   ######## HANDLE TYPE MU ##########
   if(is.null(mu)) mu = .mu_generator # default function to generate mu
 
@@ -141,9 +161,10 @@ setup_countGener <- function(bioSample_id = "my_first_lib", n_rep = NULL , gene_
     nBinom_params <- purrr::map2(.x= setup$bioSample, .y = setup$rep,
                                  ~(list(name=.x, #sample_name
                                         n_replicates = .y, # random int between 1 & max_N_replicates
-                                        gene_id = setup$alpha$gene_id,  # gene_id
+                                        gene_id = setup$gene_id,  # gene_id
                                         mu = mu.set ,  #mu(ij)
-                                        alpha = setup$alpha$alpha))) %>%  # alpha(i)
+                                        alpha = setup$alpha))) %>%  # alpha(i)
+
       data.table::rbindlist(.) %>% as.data.frame() ## convert to lovely dtf
   }
 
-- 
GitLab