Skip to content
Snippets Groups Projects
Commit d540e6fa authored by Arnaud Duvermy's avatar Arnaud Duvermy
Browse files

update htrsim to devtools package

parent 6328af1f
No related branches found
No related tags found
No related merge requests found
Showing
with 709 additions and 0 deletions
Package: htrsim
Title: Hightoughtput RNA-seq simulation
Version: 0.1
Authors@R: person('Duvermy', 'Arnaud', email = 'arnaud.duvermy@ens-lyon.Fr', role = c('aut', 'cre'))
Description: blabla.
License: GPL-3
Encoding: UTF-8
Roxygen: list(markdown = TRUE)
RoxygenNote: 7.1.2
Suggests:
rmarkdown,
knitr
VignetteBuilder: knitr
Imports:
stats,
DESeq2,
magrittr,
dplyr,
BiocGenerics,
tibble,
rlang,
stringr,
purrr,
data.table,
plyr,
reshape2,
tidyr
# Generated by roxygen2: do not edit by hand
export(estim.alpha)
export(estim.mu)
export(generate_counts)
export(handle_except)
export(htrsim)
export(reshape_input2setup)
export(rn_sim)
export(run.deseq)
export(setup_countGener)
import(BiocGenerics)
import(DESeq2)
import(data.table)
import(dplyr)
import(plyr)
import(purrr)
import(reshape2)
import(stats)
import(stringr)
import(tibble)
import(tidyr)
importFrom(rlang,.data)
#' Sampling counts from Negative Binomial distribution
#'
#' @param mu mu_ij value
#' @param alpha alpha_i value
#' @param n_replicates number of replicates
#' @param ... everything else
#'
#' @return vector of length n_replicates
#' @export
#'
#' @examples
rn_sim <- function(mu, alpha, n_replicates, ...){
simul<- rnbinom(mu=mu, size=alpha, n = n_replicates)
return(simul)
}
#' Simulate counts and convert to lovely deseq input
#'
#' @param setup_dtf Output from setup_cntsGenerator.R
#' @param export Boolean
#'
#' @return dataframe with counts c_ij
#' @export
#' @import tidyr
#' @import reshape2
#' @import plyr
#' @import BiocGenerics
#' @import purrr
#'
#' @examples
generate_counts <- function(setup_dtf, export = FALSE){
message("reading and processing counts per genes ...")
cnt.list = setup_dtf %>%
purrr::pmap(rn_sim)
message("reshaping to dataframe ...")
cnt.dtf <- cnt.list %>%
plyr::ldply(., rbind) %>%
BiocGenerics::cbind(setup_dtf %>% select(c("gene_id", "name"))) %>%
reshape2::melt(.,id=c('name','gene_id'),value.name = "counts") %>%
tidyr::unite(full_name, name, variable) %>%
tidyr::drop_na(counts) %>%
reshape2::dcast(., gene_id ~ full_name, value.var= "counts")
if (export == TRUE) write.tsv(cnt.dtf, 'results/2022-03-03/estimate_dispersion.tsv')
return(cnt.dtf)
}
#' HTRSIM workflow
#'
#' @param countData dataframe with actual count per gene
#' @param bioDesign dataframe defining bioDesign
#' @param N_replicates Number of replicate
#'
#' @return dataframe with simulated count per gene
#' @export
#'
#' @examples
htrsim <- function(countData, bioDesign, N_replicates){
# launch standard DESEQ2 analysis
dds = run.deseq(tabl_cnts, bioDesign = bioDesign)
## Input estimation
mu.input = estim.mu(dds)
alpha.input = estim.alpha(dds)
# Setup simulation
input = reshape_input2setup(mu.dtf = mu.input, alpha.dtf = alpha.input)
setup.simulation <- setup_countGener(bioSample_id = input$bioSample_id,
n_rep = N_replicates,
alpha = input$alpha,
gene_id = input$gene_id,
mu = input$mu)
# Simulate counts
htrs <- generate_counts(setup.simulation)
return(list(countDataSim = htrs, input = input, dds = dds))
}
#' Estimate alpha_i
#'
#' @param dds DESEQ2 object
#' @param export Boolean
#'
#' @return alpha_i per gene only for gene expressed c_ij != 0
#' @export
#' @import DESeq2
#' @import stats
#' @import dplyr
#' @import tibble
#' @import BiocGenerics
#' @examples
#' @importFrom rlang .data
estim.alpha <- function(dds, export = FALSE){
gene_id <- NULL
################### Estimate alpha per gene ########################
#N.B: alpha = dispersion per gene
#dds <- DESeq2::estimateDispersions(dds, quiet = F)
#dispersion estimation
dispersion_estimate <- DESeq2::dispersions(dds)
## Shape and export
names(dispersion_estimate) <- names(dds@rowRanges)
## drop NA in dispersion estimate (link to unexpressed gene)
### and convert to lovely dataframe
expressed_gene_dispersion <- dispersion_estimate[!is.na(dispersion_estimate)] %>%
data.frame() %>%
tibble::rownames_to_column() %>%
dplyr::rename("alpha" = .data$., gene_id = "rowname")
if (export == TRUE) write_tsv(disp_gene_express, 'results/2022-03-03/estimate_dispersion.tsv')
return(expressed_gene_dispersion)
}
#' Estimate mu_ij
#'
#' @param dds DESEQ2 object
#' @param export Boolean
#'
#' @return mu_ij only for gene expressed c_ij != 0
#' @export
#' @import stats
#' @import dplyr
#' @import tibble
#' @import BiocGenerics
#' @examples
#' @importFrom rlang .data
estim.mu <- function(dds, export = FALSE){
gene_id <- NULL
################# Estimate mu #########################
mu_estimate <- dds@assays@data$mu
#dds@assays@data$mu %>% dim()
#mu_estimate %>% dim()
rownames(mu_estimate) = BiocGenerics::rownames(dds@assays@data$counts)
## drop NA in dispersion estimate (link to unexpressed gene)
### and convert to lovely dataframe
mu_gene_express = mu_estimate %>%
stats::na.omit() %>%
data.frame()
colnames(mu_gene_express) <- rownames(dds@colData)
mu_gene_express <- mu_gene_express %>%
tibble::rownames_to_column(var = "gene_id")
if (export == TRUE) write_tsv(mu_gene_express, 'results/2022-03-03/estimate_mu.tsv')
return(mu_gene_express)
}
#' Title
#'
#' @param tabl_cnts table containing counts per genes & samples
#' @param bioDesign table describing bioDesgin of input
#' @import DESeq2
#' @return DESEQ2 object
#' @export
#'
#' @examples
run.deseq <- function(tabl_cnts, bioDesign ){
########### LAUNCH DESEQ #############
## Design model
dds = DESeq2::DESeqDataSetFromMatrix( countData = round(tabl_cnts), colData = bioDesign , design = ~ mutant + env + mutant:env)
## DESEQ standard analysis
dds <- DESeq2::DESeq(dds)
return(dds)
}
#' Reshape input before building setup
#'
#' @param mu.dtf dataframe of mu_ij
#' @param alpha.dtf dataframe of alpha_i
#'
#' @return
#' @import purrr
#' @import stringr
#' @import dplyr
#' @import BiocGenerics
#' @export
#'
#' @examples
reshape_input2setup <- function(mu.dtf, alpha.dtf){
## Defining sample names
bioSample_id <- mu.dtf %>%
dplyr::select(-gene_id) %>%
BiocGenerics::colnames() %>%
purrr::map(., ~stringr::str_split(.,"_")[[1]][1:2] %>%
BiocGenerics::paste(., collapse='_') ) %>%
BiocGenerics::unlist() %>% BiocGenerics::unique()
############### Mu is same for biosample replicate #############
### case 1: choose 1 replicate
#mu_params <- mu_params %>% dplyr::select(., contains("rep1"))
## rename mu_params colnames to ensure corresponding with sample_names
#colnames(mu_params) <- sample_names
### case 2: average replicates
average_rep <- function(x, dtf) {
varname <- x
dtf %>%
dplyr::select(.,contains(x)) %>%
dplyr::mutate(!!varname := rowMeans(.)) %>%
dplyr::select(varname)
}
mu_avg.dtf <- bioSample_id %>% purrr::map(.x = ., .f = ~average_rep(.x, mu.dtf)) %>% data.frame()
mu_avg.dtf$gene_id <- alpha.dtf$gene_id
return(list(alpha = alpha.dtf , mu = mu_avg.dtf, bioSample_id = bioSample_id, gene_id = alpha.dtf$gene_id))
}
#' Handle exception
#'
#' @param bioSample_id vector of id for each bioSample
#' @param n_rep number of replicates
#' @param gene_id vector of id for each gene
#' @param alpha vector of alpha_i
#' @param n_genes number of genes
#'
#' @return
#' @export
#'
#' @examples
handle_except <- function(bioSample, n_rep , gene_id , alpha, n_genes){
if(is.numeric(n_rep) && length(n_rep) == 1){
message("Homogeneous number of replicates between samples: ", n_rep, " replicates per samples\n")
n_rep = rep(n_rep, length(bioSample))
}
if(!is.null(n_rep) && length(bioSample) != length(n_rep)) stop("ERROR: unconsistent length between samples_names and n_rep\n")
if(is.null(n_genes)) {
if(!is.null(gene_id) || !is.null(alpha)){
ifelse(length(alpha) == length(alpha),
(n_genes = length(gene_id)), ## if
stop("ERROR: unconsistent value between n_genes, length(gene_names) and length(gene_disp)\n")) ## else
}
}
if(!is.null(n_genes)) {
if(is.null(gene_id) && is.null(alpha)) {
### Precised alpha params for each genes
alpha = runif(0.2,120, n = n_genes) ## randomly defined between 2 and 100
id = paste0('gene', 1:n_genes)
alpha <- list(gene_id = id, alpha = alpha)
}
}
if(is.null(n_genes) && is.null(gene_disp) && is.null(gene_id)) {
message("Number of genes unspecified\nAssuming n_genes = 3\nAssuming gene dispersion (alpha) follow a uniform law between 2 and 100\n")
n_genes = 3
alpha = runif(2,100, n = n_genes)
gene_id = paste0('gene', 1:n_genes)
}
if(is.null(gene_id) && is.null(alpha)) {
message("n_genes = ", n_genes, "\nAssuming gene dispersion (alpha) follow a uniform law between 2 and 100\n")
alpha = runif(2,100, n = n_genes)
gene_id = paste0('gene', 1:n_genes)
}
if(length(bioSample) == 1 && bioSample == "my_first_lib") message("No sample name is provided.\nAssuming only one library will be setup\n")
if(is.null(n_rep)){
message("Number of replicates not provided.\nAssuming 10 replicates per sample will be setup")
n_rep = 10
}
if(is.null(gene_id)) gene_id = paste0('gene', 1:n_genes)
my_list = list(bioSample = bioSample, rep = n_rep, n_g = n_genes, alpha = alpha)
return(my_list)
}
#' Build setup for counts generator
#'
#' @param bioSample_id vector of id for each bioSample
#' @param n_rep number of replicates
#' @param gene_id vector of id for each gene
#' @param alpha vector of alpha_i
#' @param n_genes number of genes
#' @param mu dataframe of mu_ij
#' @import purrr
#' @import data.table
#' @return
#' @export
#'
#' @examples
setup_countGener <- function(bioSample_id = "my_first_lib", n_rep = NULL , gene_id = NULL , alpha = NULL, n_genes = NULL, mu = NULL ){
######### HANDLE EXCEPTION #######
setup = handle_except(bioSample_id, n_rep , gene_id , alpha, n_genes)
######## HANDLE TYPE MU ##########
if(is.null(mu)) mu = .mu_generator # default function to generate mu
if(is.function(mu)) { #mu = function
mu.set = mu(setup$n_g)
######## BUILD AN INPUT DTF FOR count_generator ############
nBinom_params <- purrr::map2(.x= setup$bioSample, .y = setup$rep,
~(list(name=.x, #sample_name
n_replicates = .y, # random int between 1 & max_N_replicates
gene_id = setup$alpha$gene_id, # gene_id
mu = mu.set , #mu(ij)
alpha = setup$alpha$alpha))) %>% # alpha(i)
data.table::rbindlist(.) %>% as.data.frame() ## convert to lovely dtf
}
if(is.data.frame(mu)) { # mu = data.frame
mu.dtf = mu
######## BUILD AN INPUT DTF FOR count_generator ############
nBinom_params <- purrr::map2(.x= setup$bioSample, .y = setup$rep,
~(list(name=.x, #sample_name
n_replicates = .y, # number replicates
gene_id = setup$alpha$gene_id, # gene_name
mu = mu.dtf %>% dplyr::select(all_of(.x)) %>% unlist() , #mu(ij)
alpha = setup$alpha$alpha))) %>% # alpha(i)
data.table::rbindlist(.) %>% as.data.frame() ## convert to lovely dtf
}
return(nBinom_params)
}
.mu_generator <- function(x) return(runif(100,1000, n = x ))
usethis::use_build_ignore("devtools_history.R")
usethis::use_package('DESeq2')
usethis::use_package('magrittr')
usethis::use_package('stats')
usethis::use_package('dplyr')
usethis::use_package("BiocGenerics")
usethis::use_package("tibble")
usethis::use_package("stringr")
usethis::use_package("purrr")
usethis::use_package("data.table")
usethis::use_package("plyr")
usethis::use_package("reshape2")
usethis::use_package("tidyr")
devtools::load_all()
sample;env;mutant;mutant_env
Msn2D_KCl_rep1;kcl;msn2D;msn2D_kcl
Msn2D_KCl_rep2;kcl;msn2D;msn2D_kcl
Msn4D_KCl_rep1;kcl;msn4D;msn4D_kcl
Msn4D_KCl_rep2;kcl;msn4D;msn4D_kcl
Msn2D_control_rep1;control;msn2D;msn2D_control
Msn2D_control_rep1;control;msn2D;msn2D_control
Msn4D_control_rep1;control;msn4D;msn4D_control
Msn4D_control_rep2;control;msn4D;msn4D_control
WT_control_rep1;control;wt;wt_control
WT_control_rep2;control;wt;wt_control
WT_KCl_rep1;kcl;wt;wt_kcl
WT_KCl_rep2;kcl;wt;wt_kcl
% Generated by roxygen2: do not edit by hand
% Please edit documentation in R/input_estimation.R
\name{estim.alpha}
\alias{estim.alpha}
\title{Estimate alpha_i}
\usage{
estim.alpha(dds, export = FALSE)
}
\arguments{
\item{dds}{DESEQ2 object}
\item{export}{Boolean}
}
\value{
alpha_i per gene only for gene expressed c_ij != 0
}
\description{
Estimate alpha_i
}
% Generated by roxygen2: do not edit by hand
% Please edit documentation in R/input_estimation.R
\name{estim.mu}
\alias{estim.mu}
\title{Estimate mu_ij}
\usage{
estim.mu(dds, export = FALSE)
}
\arguments{
\item{dds}{DESEQ2 object}
\item{export}{Boolean}
}
\value{
mu_ij only for gene expressed c_ij != 0
}
\description{
Estimate mu_ij
}
% Generated by roxygen2: do not edit by hand
% Please edit documentation in R/generate_counts.R
\name{generate_counts}
\alias{generate_counts}
\title{Simulate counts and convert to lovely deseq input}
\usage{
generate_counts(setup_dtf, export = FALSE)
}
\arguments{
\item{setup_dtf}{Output from setup_cntsGenerator.R}
\item{export}{Boolean}
}
\value{
dataframe with counts c_ij
}
\description{
Simulate counts and convert to lovely deseq input
}
% Generated by roxygen2: do not edit by hand
% Please edit documentation in R/setup_cntsGenerator.R
\name{handle_except}
\alias{handle_except}
\title{Handle exception}
\usage{
handle_except(bioSample, n_rep, gene_id, alpha, n_genes)
}
\arguments{
\item{n_rep}{number of replicates}
\item{gene_id}{vector of id for each gene}
\item{alpha}{vector of alpha_i}
\item{n_genes}{number of genes}
\item{bioSample_id}{vector of id for each bioSample}
}
\value{
}
\description{
Handle exception
}
% Generated by roxygen2: do not edit by hand
% Please edit documentation in R/htrsim_workflow.R
\name{htrsim}
\alias{htrsim}
\title{HTRSIM workflow}
\usage{
htrsim(countData, bioDesign, N_replicates)
}
\arguments{
\item{countData}{dataframe with actual count per gene}
\item{bioDesign}{dataframe defining bioDesign}
\item{N_replicates}{Number of replicate}
}
\value{
dataframe with simulated count per gene
}
\description{
HTRSIM workflow
}
% Generated by roxygen2: do not edit by hand
% Please edit documentation in R/setup_cntsGenerator.R
\name{reshape_input2setup}
\alias{reshape_input2setup}
\title{Reshape input before building setup}
\usage{
reshape_input2setup(mu.dtf, alpha.dtf)
}
\arguments{
\item{mu.dtf}{dataframe of mu_ij}
\item{alpha.dtf}{dataframe of alpha_i}
}
\value{
}
\description{
Reshape input before building setup
}
% Generated by roxygen2: do not edit by hand
% Please edit documentation in R/generate_counts.R
\name{rn_sim}
\alias{rn_sim}
\title{Sampling counts from Negative Binomial distribution}
\usage{
rn_sim(mu, alpha, n_replicates, ...)
}
\arguments{
\item{mu}{mu_ij value}
\item{alpha}{alpha_i value}
\item{n_replicates}{number of replicates}
\item{...}{everything else}
}
\value{
vector of length n_replicates
}
\description{
Sampling counts from Negative Binomial distribution
}
% Generated by roxygen2: do not edit by hand
% Please edit documentation in R/launch_deseq.R
\name{run.deseq}
\alias{run.deseq}
\title{Title}
\usage{
run.deseq(tabl_cnts, bioDesign)
}
\arguments{
\item{tabl_cnts}{table containing counts per genes & samples}
\item{bioDesign}{table describing bioDesgin of input}
}
\value{
DESEQ2 object
}
\description{
Title
}
% Generated by roxygen2: do not edit by hand
% Please edit documentation in R/setup_cntsGenerator.R
\name{setup_countGener}
\alias{setup_countGener}
\title{Build setup for counts generator}
\usage{
setup_countGener(
bioSample_id = "my_first_lib",
n_rep = NULL,
gene_id = NULL,
alpha = NULL,
n_genes = NULL,
mu = NULL
)
}
\arguments{
\item{bioSample_id}{vector of id for each bioSample}
\item{n_rep}{number of replicates}
\item{gene_id}{vector of id for each gene}
\item{alpha}{vector of alpha_i}
\item{n_genes}{number of genes}
\item{mu}{dataframe of mu_ij}
}
\value{
}
\description{
Build setup for counts generator
}
---
title: "HTRSIM tutorial"
output: rmarkdown::html_vignette
vignette: >
%\VignetteIndexEntry{HTRSIM}
%\VignetteEngine{knitr::rmarkdown}
%\VignetteEncoding{UTF-8}
---
# A. Introduction
$$
Phenotype = Genotype + Environment + Genotype.Environment
$$
From this expression, $\beta_{G}$, $\beta_{E}$, $\beta_{G*E}$ can be seen as coefficients which allow quantifying the participation of each factors (Genotype, Environment and interaction Genotype/Environment).
In mathematical term, it leads to a linear expression such:
$$
P = \beta_{G}*G + \beta_{E}*E + \beta_{G*E}*G.E + \beta_{0}
$$
In order to estimate these coefficients, a Generalized Linear Model (GLM) can be used.
# B. HTRSIM getting started
<u>a. Required</u>
```{r, setupWD, include=FALSE}
#knitr::opts_knit$set(root.dir = '~/counts_simulation/')
```
```{r setup}
library(htrsim)
library(tidyverse)
```
<u>b. Workflow</u>
```{r echo=FALSE, out.width='50%'}
#knitr::include_graphics('img/schema_loop.jpg')
```
<u>c. RNA-seq pipeline</u>
You can used your favorite pipeline to obtain table counts from real data.
If you don't have any idea of how to obtain such table counts rendez-vous [at](https://gitbio.ens-lyon.fr/aduvermy/rna-seq_public_library_investigations)
<u> d. BioProject PRJNA675209b as input</u>
To easily test *HTRSIM* we produced an usual table counts from BioProject PRJNA675209b.
Take the time to clean up your table counts before using it as input of htrsim.
```{r}
fn = system.file("extdata/", "public_tablCnts.tsv", package = "htrsim")
tabl_cnts <- read.table(file = fn, header = TRUE)
rownames(tabl_cnts) <- tabl_cnts$gene_id
tabl_cnts <- tabl_cnts %>% select(-gene_id)##suppr colonne GeneID
tabl_cnts <- tabl_cnts %>% select(-gene_name) ##suppr colonne GeneName
```
<u> e. Launch HTRSIM</u>
```{r message=FALSE, warning=FALSE}
fn = system.file("extdata/", "public_bioDesign.csv", package = "htrsim")
## import design of bioProject
bioDesign <- read.table(file = fn, header = T, sep = ';')
#bioDesign
#source(file = "htrsim/main.R")
tabl_cnts %>% dim()
bioDesign %>% dim()
#simul_cnts = htrsim(tabl_cnts, bioDesign = bioDesign, 2)
htrsim.results = htrsim(countData = tabl_cnts, bioDesign= bioDesign, N_replicates=2)
```
<u> f. Visualize results</u>
```{r message=FALSE, warning=FALSE}
htrsim.results$input$d
```
File moved
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment