From 5d9064045e62517b696986756b29031b97ff1796 Mon Sep 17 00:00:00 2001 From: aduvermy <arnaud.duvermy@ens-lyon.fr> Date: Mon, 28 Mar 2022 15:28:34 +0000 Subject: [PATCH] update src --- src/htrsim/counts_generator.R | 6 +-- src/htrsim/setup_cntsGenerator.R | 85 +++++++++++++++++++++----------- 2 files changed, 60 insertions(+), 31 deletions(-) diff --git a/src/htrsim/counts_generator.R b/src/htrsim/counts_generator.R index d51a7ca..e3a4911 100644 --- a/src/htrsim/counts_generator.R +++ b/src/htrsim/counts_generator.R @@ -14,11 +14,11 @@ generate_counts <- function(setup_dtf){ message("reshaping to dataframe ...") cnt.dtf <- cnt.list %>% plyr::ldply(., rbind) %>% - BiocGenerics::cbind(setup_dtf %>% select(c("name_gene", "name"))) %>% - reshape2::melt(.,id=c('name','name_gene'),value.name = "counts") %>% + BiocGenerics::cbind(setup_dtf %>% select(c("gene_id", "name"))) %>% + reshape2::melt(.,id=c('name','gene_id'),value.name = "counts") %>% tidyr::unite(full_name, name, variable) %>% tidyr::drop_na(counts) %>% - reshape2::dcast(., name_gene ~ full_name, value.var= "counts") + reshape2::dcast(., gene_id ~ full_name, value.var= "counts") return(cnt.dtf) } diff --git a/src/htrsim/setup_cntsGenerator.R b/src/htrsim/setup_cntsGenerator.R index f8d9c0f..1099575 100644 --- a/src/htrsim/setup_cntsGenerator.R +++ b/src/htrsim/setup_cntsGenerator.R @@ -1,49 +1,79 @@ ########## BUILD INPUT DATAFRAME ########### +reshape_input2setup <- function(mu.dtf, alpha.dtf){ + ## Defining sample names + bioSample_id <- mu.dtf %>% + select(-gene_id) %>% + colnames() %>% + purrr::map(., ~stringr::str_split(.,"_")[[1]][1:2] %>% + BiocGenerics::paste(., collapse='_') ) %>% + BiocGenerics::unlist() %>% BiocGenerics::unique() + + + ############### Mu is same for biosample replicate ############# + ### case 1: choose 1 replicate + #mu_params <- mu_params %>% dplyr::select(., contains("rep1")) + ## rename mu_params colnames to ensure corresponding with sample_names + #colnames(mu_params) <- sample_names + ### case 2: average replicates + average_rep <- function(x, dtf) { + varname <- x + dtf %>% + select(.,contains(x)) %>% + mutate(!!varname := rowMeans(.)) %>% + select(varname) + } + mu_avg.dtf <- bioSample_id %>% map(.x = ., .f = ~average_rep(.x, mu.dtf)) %>% data.frame() + mu_avg.dtf$gene_id <- alpha.dtf$gene_id + + return(list(alpha = alpha.dtf , mu = mu_avg.dtf, bioSample_id = bioSample_id, gene_id = alpha.dtf$gene_id)) +} -handle_except <- function(sample_names, n_rep , gene_names , gene_disp, n_genes){ + + +handle_except <- function(bioSample, n_rep , gene_id , alpha, n_genes){ if(is.numeric(n_rep) && length(n_rep) == 1){ message("Homogeneous number of replicates between samples: ", n_rep, " replicates per samples\n") - n_rep = rep(n_rep, length(sample_names)) + n_rep = rep(n_rep, length(bioSample)) } - if(!is.null(n_rep) && length(sample_names) != length(n_rep)) stop("ERROR: unconsistent length between samples_names and n_rep\n") + if(!is.null(n_rep) && length(bioSample) != length(n_rep)) stop("ERROR: unconsistent length between samples_names and n_rep\n") if(is.null(n_genes)) { - if(!is.null(gene_names) || !is.null(gene_disp)){ - ifelse(length(gene_disp) == length(gene_names), - (n_genes = length(gene_names)), ## if + if(!is.null(gene_id) || !is.null(alpha)){ + ifelse(length(alpha) == length(alpha), + (n_genes = length(gene_id)), ## if stop("ERROR: unconsistent value between n_genes, length(gene_names) and length(gene_disp)\n")) ## else } } if(!is.null(n_genes)) { - if(is.null(gene_names) && is.null(gene_disp)) { + if(is.null(gene_id) && is.null(alpha)) { ### Precised alpha params for each genes - gene_disp = runif(0.2,120, n = n_genes) ## randomly defined between 2 and 100 - gene_names = paste0('gene', 1:n_genes) - genes <- list(name = gene_names, alpha = gene_disp) + alpha = runif(0.2,120, n = n_genes) ## randomly defined between 2 and 100 + id = paste0('gene', 1:n_genes) + alpha <- list(gene_id = id, alpha = alpha) } } - if(is.null(n_genes) && is.null(gene_disp) && is.null(gene_names)) { + if(is.null(n_genes) && is.null(gene_disp) && is.null(gene_id)) { message("Number of genes unspecified\nAssuming n_genes = 3\nAssuming gene dispersion (alpha) follow a uniform law between 2 and 100\n") n_genes = 3 - gene_disp = runif(2,100, n = n_genes) - gene_names = paste0('gene', 1:n_genes) + alpha = runif(2,100, n = n_genes) + gene_id = paste0('gene', 1:n_genes) } - if(is.null(gene_names) && is.null(disp_gene)) { + if(is.null(gene_id) && is.null(alpha)) { message("n_genes = ", n_genes, "\nAssuming gene dispersion (alpha) follow a uniform law between 2 and 100\n") - gene_disp = runif(2,100, n = n_genes) - gene_names = paste0('gene', 1:n_genes) + alpha = runif(2,100, n = n_genes) + gene_id = paste0('gene', 1:n_genes) } - if(length(sample_names) == 1 && sample_names == "my_first_lib") message("No sample name is provided.\nAssuming only one library will be setup\n") + if(length(bioSample) == 1 && bioSample == "my_first_lib") message("No sample name is provided.\nAssuming only one library will be setup\n") if(is.null(n_rep)){ message("Number of replicates not provided.\nAssuming 10 replicates per sample will be setup") @@ -51,33 +81,32 @@ handle_except <- function(sample_names, n_rep , gene_names , gene_disp, n_genes) } - if(is.null(gene_names)) gene_names = paste0('gene', 1:n_genes) + if(is.null(gene_id)) gene_id = paste0('gene', 1:n_genes) - my_list = list(samples = sample_names, rep = n_rep, n_g = n_genes, genes = gene_names, alpha = gene_disp) + my_list = list(bioSample = bioSample, rep = n_rep, n_g = n_genes, alpha = alpha) return(my_list) } -setup_countGener <- function(sample_names = "my_first_lib", n_rep = NULL , gene_names = NULL , gene_dispersion = NULL, n_genes = NULL, mu = NULL ){ +setup_countGener <- function(bioSample_id = "my_first_lib", n_rep = NULL , gene_id = NULL , alpha = NULL, n_genes = NULL, mu = NULL ){ ######### HANDLE EXCEPTION ####### - setup = handle_except(sample_names, n_rep , gene_names , gene_dispersion, n_genes) - + setup = handle_except(bioSample_id, n_rep , gene_id , alpha, n_genes) ######## HANDLE TYPE MU ########## if(is.null(mu)) mu = .mu_generator # default function to generate mu if(is.function(mu)) { #mu = function mu.set = mu(setup$n_g) ######## BUILD AN INPUT DTF FOR count_generator ############ - nBinom_params <- purrr::map2(.x= setup$samples, .y = setup$rep, + nBinom_params <- purrr::map2(.x= setup$bioSample, .y = setup$rep, ~(list(name=.x, #sample_name n_replicates = .y, # random int between 1 & max_N_replicates - name_gene = setup$genes, # gene_name + gene_id = setup$alpha$gene_id, # gene_id mu = mu.set , #mu(ij) - alpha = setup$alpha))) %>% # alpha(i) + alpha = setup$alpha$alpha))) %>% # alpha(i) data.table::rbindlist(.) %>% as.data.frame() ## convert to lovely dtf } @@ -85,12 +114,12 @@ setup_countGener <- function(sample_names = "my_first_lib", n_rep = NULL , gene_ if(is.data.frame(mu)) { # mu = data.frame mu.dtf = mu ######## BUILD AN INPUT DTF FOR count_generator ############ - nBinom_params <- purrr::map2(.x= setup$samples, .y = setup$rep, + nBinom_params <- purrr::map2(.x= setup$bioSample, .y = setup$rep, ~(list(name=.x, #sample_name n_replicates = .y, # number replicates - name_gene = setup$genes, # gene_name + gene_id = setup$alpha$gene_id, # gene_name mu = mu.dtf %>% dplyr::select(all_of(.x)) %>% unlist() , #mu(ij) - alpha = setup$alpha))) %>% # alpha(i) + alpha = setup$alpha$alpha))) %>% # alpha(i) data.table::rbindlist(.) %>% as.data.frame() ## convert to lovely dtf } -- GitLab