From 5d9064045e62517b696986756b29031b97ff1796 Mon Sep 17 00:00:00 2001
From: aduvermy <arnaud.duvermy@ens-lyon.fr>
Date: Mon, 28 Mar 2022 15:28:34 +0000
Subject: [PATCH] update src

---
 src/htrsim/counts_generator.R    |  6 +--
 src/htrsim/setup_cntsGenerator.R | 85 +++++++++++++++++++++-----------
 2 files changed, 60 insertions(+), 31 deletions(-)

diff --git a/src/htrsim/counts_generator.R b/src/htrsim/counts_generator.R
index d51a7ca..e3a4911 100644
--- a/src/htrsim/counts_generator.R
+++ b/src/htrsim/counts_generator.R
@@ -14,11 +14,11 @@ generate_counts <- function(setup_dtf){
   message("reshaping to dataframe ...")
   cnt.dtf <- cnt.list %>%
                   plyr::ldply(., rbind) %>% 
-                  BiocGenerics::cbind(setup_dtf %>% select(c("name_gene", "name"))) %>%
-                  reshape2::melt(.,id=c('name','name_gene'),value.name = "counts") %>% 
+                  BiocGenerics::cbind(setup_dtf %>% select(c("gene_id", "name"))) %>%
+                  reshape2::melt(.,id=c('name','gene_id'),value.name = "counts") %>% 
                   tidyr::unite(full_name, name, variable) %>%
                   tidyr::drop_na(counts) %>%
-                  reshape2::dcast(., name_gene ~ full_name, value.var= "counts")
+                  reshape2::dcast(., gene_id ~ full_name, value.var= "counts")
   return(cnt.dtf)
 }
 
diff --git a/src/htrsim/setup_cntsGenerator.R b/src/htrsim/setup_cntsGenerator.R
index f8d9c0f..1099575 100644
--- a/src/htrsim/setup_cntsGenerator.R
+++ b/src/htrsim/setup_cntsGenerator.R
@@ -1,49 +1,79 @@
 ########## BUILD INPUT DATAFRAME ###########
 
+reshape_input2setup <- function(mu.dtf, alpha.dtf){
+  ## Defining sample names
+  bioSample_id <- mu.dtf %>% 
+      select(-gene_id) %>% 
+      colnames() %>% 
+      purrr::map(., ~stringr::str_split(.,"_")[[1]][1:2] %>% 
+                 BiocGenerics::paste(., collapse='_') )  %>% 
+      BiocGenerics::unlist() %>% BiocGenerics::unique() 
+  
+  
+  ############### Mu is same for biosample replicate ############# 
+  ### case 1: choose 1  replicate 
+  #mu_params <- mu_params %>% dplyr::select(., contains("rep1")) 
+  ## rename mu_params colnames to ensure corresponding with sample_names 
+  #colnames(mu_params) <- sample_names
+  ### case 2: average replicates
+  average_rep <- function(x, dtf) {
+    varname <- x
+    dtf %>% 
+      select(.,contains(x)) %>% 
+      mutate(!!varname := rowMeans(.)) %>% 
+      select(varname)
+  }
+  mu_avg.dtf <- bioSample_id %>% map(.x = ., .f = ~average_rep(.x, mu.dtf))  %>% data.frame() 
+  mu_avg.dtf$gene_id <- alpha.dtf$gene_id
+
+  return(list(alpha = alpha.dtf , mu = mu_avg.dtf, bioSample_id = bioSample_id, gene_id = alpha.dtf$gene_id))
+} 
 
-handle_except <- function(sample_names, n_rep , gene_names , gene_disp, n_genes){
+
+
+handle_except <- function(bioSample, n_rep , gene_id , alpha, n_genes){
     
     if(is.numeric(n_rep) && length(n_rep) == 1){
       message("Homogeneous number of replicates between samples: ", n_rep, " replicates per samples\n")
-      n_rep = rep(n_rep, length(sample_names))
+      n_rep = rep(n_rep, length(bioSample))
     }
   
-    if(!is.null(n_rep) && length(sample_names) != length(n_rep)) stop("ERROR: unconsistent length between samples_names and n_rep\n")
+    if(!is.null(n_rep) && length(bioSample) != length(n_rep)) stop("ERROR: unconsistent length between samples_names and n_rep\n")
   
     
     if(is.null(n_genes)) {
-       if(!is.null(gene_names) || !is.null(gene_disp)){
-         ifelse(length(gene_disp) == length(gene_names), 
-                  (n_genes = length(gene_names)), ## if 
+       if(!is.null(gene_id) || !is.null(alpha)){
+         ifelse(length(alpha) == length(alpha), 
+                  (n_genes = length(gene_id)), ## if 
                   stop("ERROR: unconsistent value between n_genes, length(gene_names) and length(gene_disp)\n")) ## else
        } 
     }
   
     if(!is.null(n_genes)) {
-      if(is.null(gene_names) && is.null(gene_disp)) {
+      if(is.null(gene_id) && is.null(alpha)) {
         ### Precised alpha params for each genes
-        gene_disp = runif(0.2,120, n = n_genes) ## randomly defined between 2 and 100
-        gene_names = paste0('gene', 1:n_genes)
-        genes <-  list(name = gene_names, alpha = gene_disp) 
+        alpha = runif(0.2,120, n = n_genes) ## randomly defined between 2 and 100
+        id = paste0('gene', 1:n_genes)
+        alpha <-  list(gene_id = id, alpha = alpha) 
       }
     }
     
     
   
-    if(is.null(n_genes) && is.null(gene_disp) && is.null(gene_names)) {
+    if(is.null(n_genes) && is.null(gene_disp) && is.null(gene_id)) {
       message("Number of genes unspecified\nAssuming n_genes = 3\nAssuming gene dispersion (alpha) follow a uniform law between 2 and 100\n")
       n_genes = 3
-      gene_disp = runif(2,100, n = n_genes)
-      gene_names = paste0('gene', 1:n_genes)
+      alpha = runif(2,100, n = n_genes)
+      gene_id = paste0('gene', 1:n_genes)
     }
     
-    if(is.null(gene_names) && is.null(disp_gene)) {
+    if(is.null(gene_id) && is.null(alpha)) {
       message("n_genes = ", n_genes, "\nAssuming gene dispersion (alpha) follow a uniform law between 2 and 100\n")
-      gene_disp = runif(2,100, n = n_genes)
-      gene_names = paste0('gene', 1:n_genes)
+      alpha = runif(2,100, n = n_genes)
+      gene_id = paste0('gene', 1:n_genes)
     }
   
-    if(length(sample_names) == 1 && sample_names == "my_first_lib") message("No sample name is provided.\nAssuming only one library will be setup\n")
+    if(length(bioSample) == 1 && bioSample == "my_first_lib") message("No sample name is provided.\nAssuming only one library will be setup\n")
   
     if(is.null(n_rep)){
       message("Number of replicates not provided.\nAssuming 10 replicates per sample will be setup")
@@ -51,33 +81,32 @@ handle_except <- function(sample_names, n_rep , gene_names , gene_disp, n_genes)
     }
 
   
-    if(is.null(gene_names)) gene_names = paste0('gene', 1:n_genes)
+    if(is.null(gene_id)) gene_id = paste0('gene', 1:n_genes)
     
   
-  my_list = list(samples = sample_names, rep = n_rep, n_g = n_genes,  genes = gene_names, alpha = gene_disp)
+  my_list = list(bioSample = bioSample, rep = n_rep, n_g = n_genes,   alpha = alpha)
   return(my_list)
 }
 
 
 
 
-setup_countGener <- function(sample_names = "my_first_lib", n_rep = NULL , gene_names = NULL , gene_dispersion = NULL, n_genes = NULL, mu = NULL ){
+setup_countGener <- function(bioSample_id = "my_first_lib", n_rep = NULL , gene_id = NULL , alpha = NULL, n_genes = NULL, mu = NULL ){
   
   ######### HANDLE EXCEPTION #######
-  setup = handle_except(sample_names, n_rep , gene_names , gene_dispersion, n_genes)
-  
+  setup = handle_except(bioSample_id, n_rep , gene_id , alpha, n_genes)
   ######## HANDLE TYPE MU ##########
   if(is.null(mu)) mu = .mu_generator # default function to generate mu
   
   if(is.function(mu)) {  #mu = function
     mu.set = mu(setup$n_g)
     ######## BUILD AN INPUT DTF FOR count_generator ############
-    nBinom_params <- purrr::map2(.x= setup$samples, .y = setup$rep, 
+    nBinom_params <- purrr::map2(.x= setup$bioSample, .y = setup$rep, 
                                   ~(list(name=.x, #sample_name
                                          n_replicates = .y, # random int between 1 & max_N_replicates 
-                                         name_gene = setup$genes,  # gene_name
+                                         gene_id = setup$alpha$gene_id,  # gene_id
                                          mu = mu.set ,  #mu(ij)
-                                         alpha = setup$alpha))) %>%  # alpha(i)
+                                         alpha = setup$alpha$alpha))) %>%  # alpha(i)
                     data.table::rbindlist(.) %>% as.data.frame() ## convert to lovely dtf
   }
   
@@ -85,12 +114,12 @@ setup_countGener <- function(sample_names = "my_first_lib", n_rep = NULL , gene_
   if(is.data.frame(mu)) {  # mu = data.frame
     mu.dtf = mu
     ######## BUILD AN INPUT DTF FOR count_generator ############
-    nBinom_params <- purrr::map2(.x= setup$samples, .y = setup$rep, 
+    nBinom_params <- purrr::map2(.x= setup$bioSample, .y = setup$rep, 
                                  ~(list(name=.x, #sample_name
                                         n_replicates = .y, # number replicates
-                                        name_gene = setup$genes,  # gene_name
+                                        gene_id = setup$alpha$gene_id,  # gene_name
                                         mu = mu.dtf %>% dplyr::select(all_of(.x)) %>% unlist() ,  #mu(ij)
-                                        alpha = setup$alpha))) %>%  # alpha(i)
+                                        alpha = setup$alpha$alpha))) %>%  # alpha(i)
                       data.table::rbindlist(.) %>% as.data.frame() ## convert to lovely dtf
   }
   
-- 
GitLab