diff --git a/src/tutorial_htrsim.R b/src/tutorial_htrsim.R new file mode 100644 index 0000000000000000000000000000000000000000..5a2f88883f912c86eb167a9a9f63cab6bdce1b6b --- /dev/null +++ b/src/tutorial_htrsim.R @@ -0,0 +1,152 @@ +################################### Getting started ######################################################## + + + +############################################################################### + ####################### Experimental design ####################### +N_samples = 4 ## number of samples +N_replicates = 3 ## Nb of replicates per samples -> homogeneous design +N_genes = 10 # number of genes in your favorite organism + ########################################################################### + + +######################### Define NB params ###################### +### GENERATE SAMPLE INFO (name + nb of replicates per sample) +samples <- list(name = paste0('sample', 1:N_samples), n_rep = rep(N_replicates,N_samples)) +samples + +### GENERATE Negative binomial params (dispersion(i) & mu(ij)) +## N.B: dispersion != between gene +## but dispersion == between sample (for same gene) +set_alpha_per_gene = runif(2,100, n = N_genes) +set_gene_name = paste0('gene', 1:N_genes) +## N.B: mu != between gene and between sample +set_gene_name + +### Fill INPUT Dataframe +genes_NB_params <- samples$name %>% + map(~(list(name=., #sample_name + n_replicates = sample(1:max_N_replicates, 1), #random int between 1 & max_N_replicates + name_gene = set_gene_name, # gene_name + mu = runif(100,1000, n = N_genes), #mu(ij) + alpha = set_alpha_per_gene))) %>% # alpha(i) + rbindlist(.) %>% data.frame() ## convert to dtf + +## Use filter to understand our dtf +## Notice that alpha is equal for equivalent gene between sample +genes_NB_params %>% filter(name_gene == "gene2") +## But not mu ! + +genes_NB_params + +######################### BUILD COUNTS TABLES ###################### + +### Simulate N values from a Negative binomial distribution +rn_sim <- function(mu, alpha, n_replicates, ...){ + simul<- rnbinom(mu=mu, size=alpha, n = n_replicates) + return(simul) +} + + +### Simulate counts and convert simulation to deseq input +htrSim <- function(dtf){ + dtf %>% + pmap(rn_sim) %>% + plyr::ldply(., rbind) %>% + cbind(genes_NB_params %>% select(c("name_gene", "name")))%>% + reshape2::melt(.,id=c('name','name_gene'),value.name = "counts") %>% + unite(full_name, name, variable) %>% + drop_na(counts)%>% + reshape2::dcast(., name_gene ~ full_name, value.var= "counts") +} + +htrs<- htrSim(genes_NB_params) +htrs + +### check if counts tables create is consistent with the design previously defined +dim(htrs[, -1]) ## dimension without column name_gene +genes_NB_params %>% filter(name_gene == "gene1") %>% select(n_replicates)%>% sum() + + + + + +################################### COMPLEX LIBRARIES SIMULATOR ######################################################## + +####################### Experimental design ####################### +N_samples = 200 +max_N_replicates = 5 ## maximum number of replicates per sample -> heterogeneous design +N_genes = 6000 +############################################################################## + + + +######################### Define NB params ###################### +### GENERATE SAMPLE INFO (name + nb of replicates per sample) +samples <- list(name = paste0('sample', 1:N_samples), n_rep = sample(1:max_N_replicates, N_samples, replace=TRUE)) +samples + + +### GENERATE Negative binomial params (dispersion(i) & mu(ij)) +## N.B: dispersion != between gene +## but dispersion == between sample (for same gene) +set_alpha_per_gene = runif(2,100, n = N_genes) +set_gene_name = paste0('gene', 1:N_genes) +## N.B: mu != between gene and between sample +genes_NB_params <- samples$name %>% + map(~(list(name=., #sample_name + n_replicates = sample(1:max_N_replicates, 1), #random int between 1 & max_N_replicates + name_gene = set_gene_name, # gene_name + mu = runif(100,1000, n = N_genes), #mu(ij) + alpha = set_alpha_per_gene))) %>% # alpha(i) + rbindlist(.) %>% data.frame() ## convert to dtf + +genes_NB_params +## Use filter to understand our dtf +## Notice that alpha is equal for equivalent gene between sample +genes_NB_params %>% filter(name_gene == "gene251") +## But not mu ! + + +######################### BUILD COUNTS TABLES ###################### + +# a bit long :/ + +ptm <- proc.time() +htrs<- htrSim(genes_NB_params) +proc.time() - ptm + + +### check if the dtf create is consistent with the design previsouly defined +dim(htrs[, -1]) ## dimension without column name_gene +genes_NB_params %>% filter(name_gene == "gene1") %>% select(n_replicates)%>% sum() + + + +htrs %>% head() + + + + + + +#list(name_sample = samples_name, name_gene = paste0('gene', 1:N_genes), mu = runif(100,1000, n = N_genes)) + +## samples and genes dtf will serve as input for our simulation +## using samples and genes dtf, library_generator function build a list of HTRsim obj +lib_sim <- library_generator(samples_info = samples, genes) +lib_sim + +## each element of the list is an HTRsim obj +## each HTRsim obj is corresponding to a sample (-> remember : N_samples = 5) +length(lib_sim) +samples + +##For each element of the list you can used HTRsim attributes +lib_sim[[4]]@genes_NB_params +lib_sim[[3]]@n_replicates + +## Finally you can convertlist of HTRsim to an usual data.frame +## which can be used as input by DESEQ +my_lib_for_deseq <- getDESEQ_input(lib_sim) +head(my_lib_for_deseq)