diff --git a/results/v2/2022-07-25_dev.Rmd b/results/v2/2022-07-25_dev.Rmd index c83afc108e815117927c036ebc1e0576fb38fdbb..36cc31ea1425557ca456e1bc7168e3fdb6e45b6c 100644 --- a/results/v2/2022-07-25_dev.Rmd +++ b/results/v2/2022-07-25_dev.Rmd @@ -107,7 +107,6 @@ kij.simulated = getK_ij(mu_ij, gene_dispersion) #kij.simulated = kij.simulated %>% data.frame() %>% filter_all(all_vars(. < 1000000)) - ``` @@ -148,15 +147,15 @@ ggsave(filename = "figures/DESEQ_dispersion.png", plot = p, width = 15, height = ```{r} #### params ##### -n_G = 100 +n_G = 10 n_E = 2 -n_genes = 100 +n_genes = 2 ################## ## Get beta for simulation beta.input = getBetaforSimulation(n_genes = n_genes, n_genotypes = n_G, n_environments = n_E, beta.dtf = dds.extraction$beta) -design2simulate = buildDesign2simulate(n_genotype = n_G, n_environment = n_E, n_replicate = 15) +design2simulate = buildDesign2simulate(n_genotype = n_G, n_environment = n_E, n_replicate = 3) design2simulate$model_matrix %>% dim() log_qij = getLog_qij(model_matrix = design2simulate$model_matrix, beta.matrix.input = beta.input) #log_qij %>% as.numeric() %>% .[log_qij %>% which.max()] @@ -425,9 +424,502 @@ environment = design2simulate$design2simulate$environment df_gene_i = list(y = y , genotype = genotype,environment = environment) %>% data.frame() rownames(df_gene_i) <- NULL #print(i) - fit = MASS::glm.nb(y ~ genotype + environment + genotype:environment, data = df_gene_i, link = log) +fit = MASS::glm.nb(y ~ genotype + environment + genotype:environment, data = df_gene_i, link = log) +library(aod) +wald.test.res = wald.test(Sigma = vcov(fit), b = coef(fit), Terms = 1, H0 = 0.40)# %>% .[3] +wald.test.pvalue = wald.test.res$result$chi2 %>% data.frame() %>% .[3,] + + + + + + +test_wald(fit, 0.3) +coef(fit) %>% abs %>% which.min() +coef(fit) %>% abs %>% min() +coef(fit) %>% length() + pnorm(summary(m)$coefficients[2,3], lower.tail = F) +res = coef(summary(fit))[,c(1,4)] %>% + data.frame() %>% + dplyr::rename(., pval = "Pr...z.." , Inference = "Estimate") %>% + dplyr::mutate(beta = stringr::str_remove_all(rownames(.), "[//(//)]")) %>% + dplyr::mutate(beta = stringr::str_replace(beta, ":", ".")) + + +test_wald <- function(model.res, term, threshold){ + wald.test.res = wald.test(Sigma = vcov(model.res), b = coef(model.res), Terms = term, H0 = threshold) + wald.test.pvalue = wald.test.res$result$chi2 %>% data.frame() %>% .[3,] + return(wald.test.pvalue) +} + + +glmglrt::p_value.glm(fit) + +list_pval = 1:length(coef(fit)) %>% map(.x = ., ~ test_wald(fit,.x , 0)) %>% unlist() +list_pval.adj = p.adjust(list_pval, method = 'fdr') + +glmglrt::p_value_contrast(fit, contrast = x , alternative ="two.sided", method = "Wald", H0 = 0) + +coef(fit %>% summary) +res %>% mutate(p.adj = list_pval.adj ) + + + +## news +coef(fit %>% summary)[2,] + +x = c(rep(0, length(coef(fit))-1),1) + + +test_wald2 <- function(model.res, term, threshold, initvec){ + constrast_vec = replace(initvec, term , 1) + wald.test.pvalue = glmglrt::p_value_contrast(fit, contrast = constrast_vec , alternative ="two.sided", method = "Wald", H0 = threshold) + return(wald.test.pvalue %>% as.numeric()) +} + + +kij.simulated %>% dim() +design2simulate$model_matrix %>% dim() + +vecofzero = rep(0, length(coef(fit))) +test_wald + +list_pval = 1:length(coef(fit)) %>% furrr::future_map(.x = ., ~ test_wald2(model.res = fit, term= .x , threshold = 0.5, initvec = vecofzero) , print("hd") ) %>% unlist() + + +which.min(coef(fit)) +library(aod) + + +coef(fit) + + + +WaldTest = function(L,thetahat,Vn,h=0) # H0: L theta = h + # Note Vn is the asymptotic covariance matrix, so it's the +# Consistent estimator divided by n. For true Wald tests +# based on numerical MLEs, just use the inverse of the Hessian. +{ +WaldTest = numeric(3) +names(WaldTest) = c("W","df","p-value") +r = dim(L)[1] +W = t(L%*%thetahat-h) %*% solve(L%*%Vn%*%t(L)) %*% (L%*%thetahat-h) +W = as.numeric(W) +pval = 1-pchisq(W,r) +WaldTest[1] = W; WaldTest[2] = r; WaldTest[3] = pval +WaldTest +} # End function WaldTest + +WaldTest(coef(fit),thetahat,vcov(fit)) +p <- 1 - pchisq(0.3, df = 1) + + +which.min(coef(fit)) +coef(fit)[41] + + +wald.test.res = wald.test(b = abs(coef(fit)), Sigma = vcov(fit), Terms = 41, H0 = 1) +stat.chi2 = wald.test.res$result$chi2[1] %>% as.numeric() +pchisq(stat.chi2, df = 1, lower.tail = T) + + + +initvec = rep(0, length(coef(fit))) +x = replace(initvec, 41 , 1) + + + + +``` + + + + +## mvronrom + +```{r} +getBetaforSimulation2 <- function(n_genes = 100, n_genotypes = 20, n_environments = 2, beta.dtf, theta = 10 ){ + + x = beta.dtf %>% as.matrix() + fit.mvrnorm <- Rfast::mvnorm.mle(x) + print(fit.mvrnorm$mu) + x <- NULL + print('base') + print(diag(fit.mvrnorm$sigma)) + print(MASS::mvrnorm(n = n_genes, + mu = fit.mvrnorm$mu, + Sigma = fit.mvrnorm$sigma ) %>% data.frame() %>% summarise_if(is.numeric, sd)) + print("modif") + diag(fit.mvrnorm$sigma) <- diag(fit.mvrnorm$sigma) + 10 + beta.matrix.tmp <- MASS::mvrnorm(n = n_genes, + mu = fit.mvrnorm$mu, + Sigma = fit.mvrnorm$sigma ) + + print(fit.mvrnorm$sigma) + print(diag(fit.mvrnorm$sigma)) + print(beta.matrix.tmp %>% cov()) + replicate_beta <- function(beta_vec, n, theta){ + beta_vec.rep = rep(beta_vec, n) + beta_vec.rep + rnorm(length(beta_vec.rep), mean = 0, sd = abs(beta_vec/theta)) + } + + beta0 = beta.matrix.tmp[,1] + beta.matrix.tmp = purrr::map2(.x = c(2,3,4), .y = c(n_genotypes-1, + n_environments-1, + (n_genotypes-1)*(n_environments-1)), + ~ replicate_beta(beta.matrix.tmp[,.x], .y, theta) %>% matrix(ncol = .y)) %>% + do.call(cbind, .) + + + + beta.matrix = cbind(beta0, beta.matrix.tmp) + betaG.colnames = base::paste("genotype", "G", 1:(n_genotypes-1), sep = "") + betaE.colnames = base::paste("environment", "E", 1:(n_environments-1), sep = "") + betaGE.colnames = as.vector(outer(betaG.colnames, betaE.colnames, paste, sep=":")) + matrix.colnames = c('Intercept', betaG.colnames, betaE.colnames, betaGE.colnames) + + colnames(beta.matrix) = matrix.colnames + rownames(beta.matrix) = base::paste("gene", 1:(n_genes), sep = "") + + return(beta.matrix) +} + +beta.input2 = getBetaforSimulation2(n_genes = n_genes, n_genotypes = n_G, n_environments = n_E, beta.dtf = dds.extraction$beta) +``` + + +```{r} + +# set up the custom data simulation function +n_subj = 100 # number of subjects +n_ingroup = 25 # number of ingroup stimuli +n_outgroup = 25 # number of outgroup stimuli +beta_0 = 800 # grand mean +beta_1 = 50 # effect of category +omega_0 = 80 # by-item random intercept sd +tau_0 = 100 # by-subject random intercept sd +tau_1 = 40 # by-subject random slope sd +rho = 0.2 # correlation between intercept and slope +sigma = 200 + + items <- data.frame( + item_id = seq_len(n_ingroup + n_outgroup), + category = rep(c("ingroup", "outgroup"), c(n_ingroup, n_outgroup)), + X_i = rep(c(-0.5, 0.5), c(n_ingroup, n_outgroup)), + O_0i = rnorm(n = n_ingroup + n_outgroup, mean = 0, sd = omega_0)) + items + # variance-covariance matrix + cov_mx <- matrix( + c(tau_0^2, rho * tau_0 * tau_1, + rho * tau_0 * tau_1, tau_1^2 ), + nrow = 2, byrow = TRUE) + + subjects <- data.frame(subj_id = seq_len(n_subj), + + MASS::mvrnorm(n = n_subj, + mu = c(T_0s = 0, T_1s = 0), + Sigma = cov_mx)) + + crossing(subjects, items) %>% + mutate(e_si = rnorm(nrow(.), mean = 0, sd = sigma), + RT = beta_0 + T_0s + O_0i + (beta_1 + T_1s) * X_i + e_si) %>% + select(subj_id, item_id, category, X_i, RT) + + + + +my_sim_data() +``` + + +```{r} +get_beta_gene_i <- function(fit.mvnorm, n_G, n_E){ + beta.matrix.tmp <- MASS::mvrnorm(n = (n_E-1)*(n_G-1), + mu = fit.mvnorm$mu, + Sigma = fit.mvnorm$sigma ) + + beta0 = beta.matrix.tmp[1,1] %>% unname() + betaG = beta.matrix.tmp[1:(n_G-1),2] + betaE = beta.matrix.tmp[1:(n_E-1),3] + betaGE = beta.matrix.tmp[,4] + + ### name + betaG.colnames = base::paste("genotype", "G", 1:(n_G-1), sep = "") + betaE.colnames = base::paste("environment", "E", 1:(n_E-1), sep = "") + betaGE.colnames = as.vector(outer(betaG.colnames, betaE.colnames, paste, sep=":")) + matrix.colnames = c('Intercept', betaG.colnames, betaE.colnames, betaGE.colnames) + + beta_gene_i <- c( beta0, betaG, betaE, betaGE) + names(beta_gene_i) = matrix.colnames + + return(beta_gene_i) +} + + +n_g = 1 +n_group_gene = 3 +n_E = 2 +n_G = 400 + + +dds.extraction$beta +kmean.res = kmeans(dds.extraction$beta[,c(2,3)], n_group_gene) + + +beta.cluster1= dds.extraction$beta[kmean.res$cluster == 1,] +beta.cluster2= dds.extraction$beta[kmean.res$cluster == 2,] +beta.cluster3= dds.extraction$beta[kmean.res$cluster == 3,] + + +x = beta.cluster2 %>% as.matrix() +fit.mvrnorm <- Rfast::mvnorm.mle(x) +fit.mvrnorm$sigma[c(2,4),c(2,4)] = fit.mvrnorm$sigma[c(2,4),c(2,4)]*3 +#diag(fit.mvrnorm$sigma) <- diag(fit.mvrnorm$sigma) + +#### BETAG.E ##### +fit.mvrnorm$mu[4] = 0 +fit.mvrnorm$sigma[,4] = 0 +fit.mvrnorm$sigma[4,] = 0 +################# +#fit.mvrnorm$mu[0] = 30 + +#### BETA0 ##### +fit.mvrnorm$mu[1] = 0 +fit.mvrnorm$sigma[,1] = 0 +fit.mvrnorm$sigma[1,] = 0 +################# + +#### BETAE ##### +fit.mvrnorm$mu[3] = 0 +fit.mvrnorm$sigma[,3] = 0 +fit.mvrnorm$sigma[3,] = 0 + +#fit.mvrnorm$mu[2] = 8 +#fit.mvrnorm$sigma[2,2]= 6 +x <- NULL + + +n_gene = n_g +a = purrr::map(.x = 1:n_gene, ~ get_beta_gene_i(fit.mvrnorm, n_G, n_E)) +beta.matrix = do.call(rbind, a) +rownames(beta.matrix) = base::paste("gene", 1:(n_g), sep = "") + + + +x = beta.matrix %>% data.frame() %>% #%>% select(!starts_with("environment")) %>% + rownames_to_column('gene_id') %>% reshape2::melt(., id = "gene_id") %>% dplyr::mutate(type = dplyr::case_when( + str_detect(variable, "genotypeG\\d+\\.environment") ~ "GxE", + str_detect(variable, "genotypeG\\d+$") ~ "G", + str_detect(variable, "environmentE\\d+$") ~ "E", + str_detect(variable, "Intercept$") ~ "Intercept")) %>% reshape2::dcast(., gene_id ~ type, value.var = "value", fun.aggregate = list) + + +g = x$G +names(g) <- x$gene_id +df_tmp2 = data.frame(g) %>% mutate(type = 'betaG') %>% reshape2::melt( id = "type", value.name = "betaG") +g = x$GxE + +names(g) <- x$gene_id +df_tmp3 = data.frame(g) %>% mutate(type = 'GxE') %>% reshape2::melt( id = "type", value.name = "betaGxE") + + +df = cbind(df_tmp2, df_tmp3) %>% dplyr::select(c(betaGxE, betaG)) %>% mutate(from = 'Simulated') %>% mutate(cluster = 3) +df_simu = df +df_simu = rbind(df_simu, df) + + +df_simu$cluster <- factor(df_simu$cluster) +ggplot(df_simu) + geom_point(aes(x = betaG, y = betaGxE, col = cluster), alpha = 0.2) + +``` + +```{r} + +### Visualisation +dtf = dds.extraction$beta %>% mutate(cluster = kmean.res$cluster ) %>% + reshape2::melt(. ,id=c("cluster"), value.name = "value", variable.name= "type") +dtf$cluster <- factor(dtf$cluster) + + +p = ggplot(dtf) + geom_density(aes(x = value, fill = cluster ), alpha = 0.4) + facet_grid(~ type, scales = "free") + +p +ggsave("figures/densityBeta_kmeans.png", p) + + +betas = dds.extraction$beta %>% mutate(cluster = kmean.res$cluster) %>% mutate(from = "Actual") +betas$cluster = factor(betas$cluster) +df2 <- rbind(betas %>% dplyr::select(betaG, betaGE, from, cluster), df_simu %>% rename(betaGE = "betaGxE")) +p = ggplot(df2) + geom_point(aes(x= betaG, betaGE, col = cluster), alpha = 0.1) + facet_grid(~from) + +p +ggsave("figures/scatterplot_clustering_covIncreased.png", p, width = 10) + +``` + +```{r} + +beta.input = beta.matrix +n_genes = n_g +design2simulate = buildDesign2simulate(n_genotype = n_G, n_environment = n_E, n_replicate = 15) +#rowSums(beta.input) %>% which.max() +#rowSums(beta.input) %>% max() +log_qij = getLog_qij(model_matrix = design2simulate$model_matrix, beta.matrix.input = beta.input) +log_qij %>% as.numeric() %>% .[log_qij %>% which.max()] + + +gene_dispersion = getGenesDispersionsForSimulation(n_genes = n_genes, n_genotypes = n_G, n_environments = n_E, dispersion.vec = dds.extraction$gene_dispersion, model_matrix = design2simulate$model_matrix, dispUniform_btweenCondition = T) + + +#2* 2^log_qij +mu_ij = getMu_ij(log_qij, 1) +#max(mu_ij) +kij.simulated = getK_ij(mu_ij, gene_dispersion) + ``` + +## Mixte model + + +```{r} +library(lme4) +kij.simulated %>% dim() +df_2glmmm <- function(y , design2simulate, i){ + genotype = design2simulate$design2simulate$genotype + environment = design2simulate$design2simulate$environment + #message("Fitting model ...") + + df_gene_i = list(y = y , genotype = genotype,environment = environment) %>% data.frame() %>% mutate(inter = paste(environment, genotype, sep = '_')) + df_gene_i$inter <- factor( df_gene_i$inter ) + rownames(df_gene_i) <- NULL + df_gene_i = df_gene_i %>% mutate(gene_id = paste("gene", i, sep = "")) + return(df_gene_i) +} + + +library(broom.mixed) +overdisp_fun <- function(model) { + rdf <- df.residual(model) + rp <- residuals(model,type="pearson") + Pearson.chisq <- sum(rp^2) + prat <- Pearson.chisq/rdf + pval <- pchisq(Pearson.chisq, df=rdf, lower.tail=FALSE) + c(chisq=Pearson.chisq,ratio=prat,rdf=rdf,p=pval) +} + +tidy_quasi <- function(model, phi=overdisp_fun(model)["ratio"], conf.level=0.95) { + tt <- (tidy(model, effects="ran_vals") %>% + mutate(std.error=std.error*sqrt(phi), + statistic=estimate/std.error, + p.value=2*pnorm(abs(statistic), lower.tail=FALSE)) + ) + return(tt) +} + +``` + +```{r} + +beta.matrix.long = beta.matrix %>% data.frame() %>% rownames_to_column('gene_id') %>% reshape2::melt(.,id= "gene_id", value.name = "value", variable.name= "beta") +beta.matrix.long = beta.matrix.long %>% filter(gene_id == "gene1") %>% dplyr::mutate(type = dplyr::case_when( + str_detect(beta, "genotypeG\\d+\\.environment") ~ "GxE", + str_detect(beta, "genotypeG\\d+$") ~ "G", + str_detect(beta, "environmentE\\d+$") ~ "E", + str_detect(beta, "Intercept$") ~ "Intercept")) + +beta.long.actual = beta.matrix.long %>% filter(type == "G") %>% mutate(from = "actual") %>% dplyr::select(-gene_id) + +``` + + + +```{r} +library(lme4) +f = map(1:1, ~df_2glmmm(kij.simulated[1,], design2simulate, .x) ) +n = do.call(rbind, f) +n2 = n %>% mutate(group = paste(gene_id, genotype, sep = "_")) +data_glmm = n +m.nb <- glmer.nb(y ~ environment + ( 1 | genotype ) , data=data_glmm, verbose=TRUE) + +summary(m.nb) +tidy(m.nb) +fit.mvrnorm$sigma +fit.mvrnorm$mu +beta.input[1,] +tidy_quasi(m.nb) +mean(res.glmm$value) +sd(res.glmm$value) +sd((ranef(m.nb)$genotype[,1])) + +plot(density(coef(m.nb)$genotype[,1])) + +res.glmm = tidy_quasi(m.nb) %>% dplyr::select(estimate, level) %>% + dplyr::mutate(beta = str_replace(level, "G", "genotypeG")) %>% rename(c("type" = level, 'value' = estimate)) + +B0 = tidy(m.nb, effects = 'fixed') %>% .$estimate %>% .[1] + +res.glmm$value = res.glmm$value + B0 +coef(m.nb) + +## TIPS +tidy_quasi(m.nb) %>% rbind(tidy(m.nb, effects = 'fixed') %>% mutate(group ="NA") %>% mutate(level = 'NA')) +tidy(m.nb) + +``` + + +```{r} +res.glmm = res.glmm %>% mutate(from = 'infered') +beta.long.actual$value = beta.long.actual$value + beta.matrix %>% data.frame() %>% .$Intercept +#beta.long.actual$value = beta.long.actual$value*log(2) +df_distrib = rbind(beta.long.actual, res.glmm) + + +B0 = tidy(m.nb, effects = 'fixed') %>% .$estimate %>% .[1] +stadd = tidy(m.nb) %>% .$estimate %>% .[2] +rnorm_df = list(beta = df_distrib$beta %>% unique(), value = rnorm(n = 100000, mean = B0, sd = stadd ), type = "G", from = paste('rnorm(', B0 %>% signif(2),",", stadd %>% signif(2), ')', sep = '')) %>% data.frame() + +df_distrib = rbind(df_distrib, rnorm_df) +ggplot(df_distrib) + geom_density(aes(x = value, col = from ), alpha = 0.5) + + +df_distrib.short = df_distrib %>% filter(from %in% c("actual", 'infered')) %>% reshape2::dcast(., beta ~ from, value.var = "value") +ggplot(df_distrib.short) + geom_point(aes(x = actual, y = infered )) + geom_abline(slope=1, intercept=0)# + + +fit.mvrnorm + +``` + +```{r} + +beta0.input = beta.input[,1] +######## beta G +mean_betaG = beta.input[,2:n_G] %>% mean() +mean.input = beta0.input + mean_betaG +sd_betaG = beta.input[,2:n_G] %>% sd() +######## beta E +mean_betaE = beta.input[,(n_G+1):(n_G+n_E-1)] %>% mean() + + +rnorm_df_input = list(beta = df_distrib$beta %>% unique(), value = rnorm(n = 100000, mean = mean.input, sd = sd_betaG ), type = "G", from = "Actual") %>% data.frame() + +df_distrib = rbind(rnorm_df_input, rnorm_df) +ggplot(df_distrib) + geom_density(aes(x = value, col = from ), alpha = 0.5) + +``` + + +### GLMTMBB + +```{r} +library(glmmTMB) + +glmmTMB::glmmTMB(y ~ environment + ( genotype | genotype ) , data=data_glmm, + family=nbinom2, verbose = T) + +``` diff --git a/results/v2/2022-09-01_simulationBetaDemo.Rmd b/results/v2/2022-09-01_simulationBetaDemo.Rmd index 3cc3cd9433e7b3f5bb2a7f745975ca36af9f5453..bd812d29076dac146f9dd3338b982f61e93f8a59 100644 --- a/results/v2/2022-09-01_simulationBetaDemo.Rmd +++ b/results/v2/2022-09-01_simulationBetaDemo.Rmd @@ -169,7 +169,7 @@ a = 1:n_genes %>% furrr::future_map(.x = ., ~run.glm(kij.simulated[.x,], .x, design2simulate$design2simulate) ) -c = do.call(rbind, a) +c = do.call(rbind, a) %>% mutate(p.val.adj = p.adjust(p.val, method= "fdr")) beta.input.long = beta.matrix %>% data.frame() %>% tibble::rownames_to_column(., var = "gene_id") %>% dplyr::mutate(origin = "Actual") %>% @@ -178,11 +178,13 @@ beta.input.long = beta.matrix %>% data.frame() %>% dplyr::select(-origin) -df_comparison = merge(c, beta.input.long) %>% mutate(Ngenotype = n_genotypes) +df_comparison = merge(c, beta.input.long) %>% mutate(Ngenotype = n_genotypes) %>% mutate(pred = ifelse(p.val.adj < 0.05, "DE", "nonDE")) + +df_comparison %>% dplyr::filter(type %in% c("G", "GxE")) %>% group_by(type, pred) %>% tally() ggplot(df_comparison %>% dplyr::filter(type %in% c("G", "GxE")) ) + - geom_histogram(aes(x=Inference, y = ..density..), fill = 'white' ) + - geom_density(aes(x = Actual)) + + geom_histogram(aes(x=Inference, y = ..density.., col = pred, fill = pred), ) + + geom_density(aes(x = Actual*log(2))) + facet_grid(~type, scales = "free") df_comparison$type ggplot(df_comparison %>% dplyr::filter(type %in% c("Intercept")) ) + @@ -209,92 +211,123 @@ replicate_beta <- function(beta_vec, n, theta){ beta_vec.rep + rnorm(length(beta_vec.rep), mean = beta_vec/theta , sd = 1) } + ## Params n_genes = 1 n_environments = 2 theta = 10 -gen_vector = c(100, 300 ,500, 700, 1000) -for (n_genotypes in gen_vector ){ - - beta.matrix.tmp = purrr::map2(.x = c(2,3,4), .y = c(n_genotypes-1, - n_environments-1, - (n_genotypes-1)*(n_environments-1)), - ~ replicate_beta(beta.matrix.template[.x], .y, theta) %>% matrix(ncol = .y)) %>% - do.call(cbind, .) - - beta.matrix = cbind(beta.matrix.template[1], beta.matrix.tmp) - betaG.colnames = base::paste("genotype", "G", 1:(n_genotypes-1), sep = "") - betaE.colnames = base::paste("environment", "E", 1:(n_environments-1), sep = "") - betaGE.colnames = as.vector(outer(betaG.colnames, betaE.colnames, paste, sep=":")) - matrix.colnames = c('Intercept', betaG.colnames, betaE.colnames, betaGE.colnames) - - colnames(beta.matrix) = matrix.colnames - rownames(beta.matrix) = base::paste("gene", 1:(n_genes), sep = "") - beta.input = beta.matrix - #beta.matrix %>% dim() - - ########## kij ################ - n_G = n_genotypes - n_E = 2 - - design2simulate = buildDesign2simulate(n_genotype = n_G, n_environment = n_E, n_replicate = 25) - design2simulate$model_matrix %>% dim() - log_qij = getLog_qij(model_matrix = design2simulate$model_matrix, beta.matrix.input = beta.matrix) - #log_qij %>% as.numeric() %>% .[log_qij %>% which.max()] +#gen_vector = c(100, 300 ,500, 700, 1000) +gen_vector = 150 +n_genotypes = 150 +thr = c(0, 0.07, 0.48, 1) +#thr = c(0.04, 1) + +beta.matrix.tmp = purrr::map2(.x = c(2,3,4), .y = c(n_genotypes-1, + n_environments-1, + (n_genotypes-1)*(n_environments-1)), + ~ replicate_beta(beta.matrix.template[.x], .y, theta) %>% matrix(ncol = .y)) %>% + do.call(cbind, .) + +beta.matrix = cbind(beta.matrix.template[1], beta.matrix.tmp) +betaG.colnames = base::paste("genotype", "G", 1:(n_genotypes-1), sep = "") +betaE.colnames = base::paste("environment", "E", 1:(n_environments-1), sep = "") +betaGE.colnames = as.vector(outer(betaG.colnames, betaE.colnames, paste, sep=":")) +matrix.colnames = c('Intercept', betaG.colnames, betaE.colnames, betaGE.colnames) + +colnames(beta.matrix) = matrix.colnames +beta.matrix %>% dim() +rownames(beta.matrix) = base::paste("gene", 1:(n_genes), sep = "") +beta.input = beta.matrix +#beta.matrix %>% dim() + +########## kij ################ +n_G = n_genotypes +n_E = 2 + +design2simulate = buildDesign2simulate(n_genotype = n_G, n_environment = n_E, n_replicate = 10) +design2simulate$model_matrix %>% dim() +log_qij = getLog_qij(model_matrix = design2simulate$model_matrix, beta.matrix.input = beta.matrix) +#log_qij %>% as.numeric() %>% .[log_qij %>% which.max()] + + +gene_dispersion = getGenesDispersionsForSimulation(n_genes = n_genes, + n_genotypes = n_G, + n_environments = n_E, + dispersion.vec = dds.extraction$gene_dispersion, + model_matrix = design2simulate$model_matrix, + dispUniform_btweenCondition = T) + + +mu_ij = getMu_ij(log_qij, 1) + +kij.simulated = getK_ij(mu_ij, gene_dispersion) + +for (t in thr){ + for (n_genotypes in gen_vector ){ + + + ############# GLM fitting ################### + plan(multisession, workers = 1) + a = 1:n_genes %>% furrr::future_map(.x = ., ~run.glm(kij.simulated[.x,], + .x, + design2simulate$design2simulate, threshold = t) ) - gene_dispersion = getGenesDispersionsForSimulation(n_genes = n_genes, - n_genotypes = n_G, - n_environments = n_E, - dispersion.vec = dds.extraction$gene_dispersion, - model_matrix = design2simulate$model_matrix, - dispUniform_btweenCondition = T) + #kij.simulated %>% dim() + ############ save res ##### + c = do.call(rbind, a) + beta.input.long = beta.input %>% data.frame() %>% + tibble::rownames_to_column(., var = "gene_id") %>% + dplyr::mutate(origin = "Actual") %>% + reshape2::melt(., value.name = "value", variable.name= "beta") %>% + dplyr::rename(Actual = "value") %>% + dplyr::select(-origin) + df_tmp = merge(c, beta.input.long) %>% mutate(Ngenotype = n_genotypes) %>% mutate(p.val.adj = p.adjust(p.val, method= "fdr")) %>% mutate(threshold = t) - mu_ij = getMu_ij(log_qij, 1) - - kij.simulated = getK_ij(mu_ij, gene_dispersion) - - ############# GLM fitting ################### - plan(multisession, workers = 1) - a = 1:n_genes %>% furrr::future_map(.x = ., ~run.glm(kij.simulated[.x,], - .x, - design2simulate$design2simulate) ) - - - ############ save res ##### - c = do.call(rbind, a) - beta.input.long = beta.input %>% data.frame() %>% - tibble::rownames_to_column(., var = "gene_id") %>% - dplyr::mutate(origin = "Actual") %>% - reshape2::melt(., value.name = "value", variable.name= "beta") %>% - dplyr::rename(Actual = "value") %>% - dplyr::select(-origin) - - df_tmp = merge(c, beta.input.long) %>% mutate(Ngenotype = n_genotypes) - - if (exists('df_comparison') && is.data.frame(get('df_comparison'))){ - df_comparison = rbind(df_comparison, df_tmp) - - } - else{ - df_comparison = df_tmp - } - + if (exists('df_comparison') && is.data.frame(get('df_comparison'))){ + df_comparison = rbind(df_comparison, df_tmp) + + } + else{ + df_comparison = df_tmp + } + + } } +df_comparison$threshold %>% table() df_comparison$type <- factor(df_comparison$type, levels = c("Intercept", "G", "E", "GxE")) -df_comparison %>% + +df_comparison = df_comparison %>% mutate(pred = ifelse(p.val.adj < 0.05, "DE", "nonDE")) + + + +ggplot(df_comparison) + geom_bar(aes(x= p.val)) + xlim(c(0,0.1)) + + +df_comparison %>% filter(type %in% c("G", "GxE")) %>% group_by(threshold, type, pred, Ngenotype ) %>% tally() + +df_comparison %>% filter(type %in% c("G", "GxE")) %>% group_by(type, threshold, pred) %>% tally() + + +df_comparison$Ngenotype %>% table() -p = ggplot(df_comparison %>% filter(type %in% c("G", "GxE")) ) + - geom_histogram(aes(x=Inference, y = ..density..), fill = 'white' ) + - geom_density(aes(x = Actual * log(2))) + - facet_grid(type~Ngenotype, scales = "free") +#df_comparison = df_comparison %>% filter(gene_id == "gene1") +p = ggplot(df_comparison %>% dplyr::filter(type %in% c("G", "GxE"))) + + geom_histogram(aes(x = Inference, y=..density..) , fill = 'black' ) + + geom_histogram(data = df_comparison %>% dplyr::filter(type %in% c("G", "GxE")) %>% dplyr::filter(pred == "DE"), aes(x=Inference, y = ..density.. , fill = pred )) + + facet_grid(type~threshold , scales = "free") + + geom_vline(aes(xintercept= -threshold), linetype="dotted", + color = "blue") + + geom_vline(aes(xintercept= threshold), linetype="dotted", + color = "blue") + + geom_density(aes(x = Actual * log(2) )) p -ggsave(filename = "figures/GLM_genotype_distrib.png", plot = p, width = 20, height = 12) +ggsave(filename = "../../results/v2/figures/GLM_distrib.png", plot = p, width = 20, height = 12) ``` diff --git a/results/v2/2022-09-07_Postinference_filtering.Rmd b/results/v2/2022-09-07_Postinference_filtering.Rmd index eff9b29bba0090d682c763c5d1b2bc3256dc3db3..ffbeb9299cb3d21339a32280a75860e2865ee234 100644 --- a/results/v2/2022-09-07_Postinference_filtering.Rmd +++ b/results/v2/2022-09-07_Postinference_filtering.Rmd @@ -132,6 +132,10 @@ dds_simu = run.deseq(tabl_cnts = kij.simulated , bioDesign = design2simulate$des ```{r} +n_G = 40 +n_E = 2 +n_genes = 10 + rm(df2return) threshold_list = c(0, 0.04, 0.07, 0.2, 0.48, 1) diff --git a/results/v2/2022-09-22_NgenotypesEffectBIS.Rmd b/results/v2/2022-09-22_NgenotypesEffectBIS.Rmd new file mode 100644 index 0000000000000000000000000000000000000000..120c215aa3575a4b948148365693f05a51b62a1f --- /dev/null +++ b/results/v2/2022-09-22_NgenotypesEffectBIS.Rmd @@ -0,0 +1,187 @@ +--- +title: "HTRSIM" +date: '2022-07-26' +output: + html_document: + +css: + - css/template.css + +--- + + +```{r setup, message=FALSE, warning=FALSE, include=TRUE, results="hide"} +library(HTRSIM) +library(furrr) +#library(tidyverse) +#library(reshape2) +#library(kableExtra) +#library(gridExtra) +#library(MatrixGenerics) +``` + + + +## Simu + +```{r} +n_genes = 80 +n_genotypes_list = c( 100, 200, 400 ) +threshold = 0.5 +n_E = 2 + + +rm(df_roc) + + +for (n_G in n_genotypes_list){ + + beta.input = getBetaforSimulation(n_genes = n_genes, n_genotypes = n_G, n_environments = n_E, beta.dtf = dds.extraction$beta) + design2simulate = buildDesign2simulate(n_genotype = n_G, n_environment = n_E, n_replicate = 3) + log_qij = getLog_qij(model_matrix = design2simulate$model_matrix, beta.matrix.input = beta.input) + gene_dispersion = getGenesDispersionsForSimulation(n_genes = n_genes, + n_genotypes = n_G, n_environments = n_E, + dispersion.vec = dds.extraction$gene_dispersion, + model_matrix = design2simulate$model_matrix, dispUniform_btweenCondition = T) + mu_ij = getMu_ij(log_qij, 1) + kij.simulated = getK_ij(mu_ij, gene_dispersion) + + + + ######################### DESEQ + beta.actual.matrix = beta.input + ## 2 modifY + #dds_simu = run.deseq(tabl_cnts = kij.simulated , bioDesign = design2simulate$design2simulate ) + + ############# + # Param + threshold = 0.5 + ############ + + #listBeta = DESeq2::resultsNames(dds_simu) + #plan(multisession, workers = 4) + #res = listBeta %>% furrr::future_map(.x = ., ~DESeq2::results(dds_simu, contrast=list(.x), lfcThreshold = threshold) %>% data.frame() %>% .$padj) + #padj.matrix = do.call("cbind", res) + + + + #dds_simu.mcols = S4Vectors::mcols(dds_simu,use.names=TRUE) + + #dds.simu.mcols.colnamesReshaped = colnames(dds_simu.mcols) %>% + # stringr::str_replace(., "_vs_G0", "") %>% + # stringr::str_replace(., "_vs_E0", "") %>% + # stringr::str_replace_all(., "_", "") %>% + # stringr::str_replace(., "\\.", ":") + + #columnOfInterest = design2simulate$model_matrix %>% base::colnames() %>% stringr::str_replace_all(., "[//(//)]", "") + #dds_simu.mcols[,columnOfInterest] + + ## Get only column of interest + #idx_cols = base::match(columnOfInterest, dds.simu.mcols.colnamesReshaped) + #beta.infered = dds_simu.mcols[,idx_cols] + + ## homogeneize column names & rownames + #idx_cols = base::match(columnOfInterest, beta.actual.matrix %>% colnames()) + #beta.actual.matrix = beta.actual.matrix[,idx_cols] + #colnames(beta.infered) = base::colnames(beta.actual.matrix) + #colnames(padj.matrix) = base::colnames(beta.actual.matrix) + #rownames(padj.matrix) = base::rownames(beta.actual.matrix) + #beta.actual.matrix %>% dim() + #padj.matrix %>% dim() + #beta.infer.long = beta.infered %>% data.frame() %>% + # tibble::rownames_to_column(., var = "gene_id") %>% + # dplyr::mutate(origin = "Inference") %>% + # reshape2::melt(., value.name = "value", variable.name= "beta") + #beta.actual.matrix.long = beta.actual.matrix %>% data.frame() %>% + # tibble::rownames_to_column(., var = "gene_id") %>% + # dplyr::mutate(origin = "Actual") %>% + # reshape2::melt(., value.name = "value", variable.name= "beta") + #padj.matrix.long = padj.matrix %>% data.frame() %>% + # tibble::rownames_to_column(., var = "gene_id") %>% + # dplyr::mutate(origin = "padj") %>% + # reshape2::melt(., value.name = "value", variable.name= "beta") + + #beta.merged.long = rbind(beta.infer.long, beta.actual.matrix.long, padj.matrix.long) + #beta.merged.long$beta %>% unique() + + #beta.merged.long.reshape = beta.merged.long %>% dplyr::mutate(type = dplyr::case_when( + # str_detect(beta, "genotypeG\\d+\\.environment") ~ "GxE", + # str_detect(beta, "genotypeG\\d+$") ~ "G", + # str_detect(beta, "environmentE\\d+$") ~ "E", + # str_detect(beta, "Intercept$") ~ "Intercept") + #) + + + #beta.merged.long.reshape2 = beta.merged.long.reshape %>% reshape2::dcast(., gene_id + beta + type ~ origin) + #beta.merged.long.reshape2$type = factor(beta.merged.long.reshape2$type, levels = c("Intercept", "G", "E", "GxE")) + #beta.merged.long.reshape2$threshold = threshold + + + + ## 2 modifY + #df_roc_deseq = beta.merged.long.reshape2 %>% mutate(from = "DESEQ2") %>% mutate(n_genotype = n_G) + + ############## GLM + a = 1:n_genes %>% furrr::future_map(.x = ., ~run.glm(kij.simulated[.x,], ## 2 modify !!! + .x, + design2simulate$design2simulate, threshold = threshold) ) + + ############ save res ##### + c = do.call(rbind, a) + beta.input.long = beta.input %>% data.frame() %>% + tibble::rownames_to_column(., var = "gene_id") %>% + dplyr::mutate(origin = "Actual") %>% + reshape2::melt(., value.name = "value", variable.name= "beta") %>% + dplyr::rename(Actual = "value") %>% + dplyr::select(-origin) + + df_tmp = merge(c, beta.input.long) %>% mutate(padj = p.adjust(pval, method= "fdr")) %>% mutate(threshold = threshold) + + + + + + ## 2 modify + df_roc_glm = df_tmp %>% dplyr::select(c(-dispersion, -pval)) %>% mutate(from = 'MASS::glm.nb') %>% mutate(n_genotype = n_G) + + #### merge + + #df_roc_deseq = df_roc_deseq %>% mutate(label = ifelse(abs(Actual) > threshold, "DE", "nonDE" )) %>% + # mutate( prediction = ifelse(padj < 0.05, "DE", "nonDE" )) + df_roc_glm = df_roc_glm %>% mutate(label = ifelse(abs(Actual) > threshold, "DE", "nonDE" )) %>% + mutate( prediction = ifelse(padj < 0.05, "DE", "nonDE" )) + + #df_tmp2 = rbind(df_roc_deseq, df_roc_glm) + df_tmp2 = df_roc_glm + if (exists('df_roc') && is.data.frame(get('df_roc'))){ + df_roc = rbind(df_roc, df_tmp2) + } + else{ + df_roc = df_tmp2 + } + + + +} + +df_roc %>% group_by(from, n_genotype) %>% tally() +``` + +```{r} +df_roc %>% filter(n_genotype == 100) + + +df_roc$n_genotype %>% table() +library(plotROC) +p = ggplot(df_roc %>% filter(type %in% c("G","GxE")) , aes(d = label , m = padj, color = from)) + + geom_roc(n.cuts = 0, labels = F) + facet_grid(~n_genotype) + +p + style_roc() +ggsave(filename = "figures/ROCcurve18.png", plot = p + style_roc(), width = 20) + +p = ggplot(calc_auc(p), aes(x = threshold, y = AUC, group=1)) + geom_point() + geom_line() + facet_grid(~type) +ggsave(filename = "figures/AUC.png", plot = p, width = 15, height = 10) +### boxplot FP + +``` + diff --git a/results/v2/2022-09-22_kmean.Rmd b/results/v2/2022-09-22_kmean.Rmd new file mode 100644 index 0000000000000000000000000000000000000000..ab2e4c3343fd723530910501d5deb472e00f1667 --- /dev/null +++ b/results/v2/2022-09-22_kmean.Rmd @@ -0,0 +1,327 @@ +--- +title: "HTRSIM" +date: '2022-09-22' +output: + html_document: + +css: + - css/template.css + +--- + + +```{r setup, message=FALSE, warning=FALSE, include=TRUE, results="hide"} +library(HTRSIM) +library(furrr) +#library(tidyverse) +#library(reshape2) +#library(kableExtra) +#library(gridExtra) +#library(MatrixGenerics) +``` + + +## Public data + +```{r} +## Import & reshape table counts +fn = system.file("extdata/", "SRP217588_YM_vkallisto.tsv", package = "HTRSIM") +tabl_cnts <- read.table(file = fn, header = TRUE) +rownames(tabl_cnts) <- tabl_cnts$gene_id +tabl_cnts <- tabl_cnts %>% dplyr::select(-gene_id)##suppr colonne GeneID +tabl_cnts = tabl_cnts[order(tabl_cnts %>% rownames()),] +## DESIGN +fn = system.file("extdata/", "SRP217588_YM_bioDesign.csv", package = "HTRSIM") +bioDesign <- read.table(file = fn, header = T, sep = ';') + +## defining reference +bioDesign$genotype <- factor(x = bioDesign$genotype,levels = c("GSY147", "RM11")) +bioDesign$environment <- factor(x = bioDesign$environment, levels = c( "untreated", "treated")) + + +bioDesign = bioDesign %>% dplyr::filter(!str_detect(sample, "ru_rm_5") ) +tabl_cnts = tabl_cnts %>% dplyr::select(., !matches( "ru_rm_5")) +#tabl_cntsA = tabl_cnts + +``` + +```{r } +## Launch DESEQ2 +dds = run.deseq(tabl_cnts, bioDesign = bioDesign) + +## Extract + +dds.extraction = HTRSIM::extractDistributionFromDDS(dds_obj = dds) +#dds.extraction$gene_dispersion + +``` + +## params visualization + + +```{r} +beta_obs.dtf.long = dds.extraction$beta %>% reshape2::melt(. , na.rm = T, variable.name = "parameter") + +alpha_obs.dtf.long = dds.extraction$gene_dispersion %>% base::data.frame() %>% + dplyr::rename(., value = ".") %>% + dplyr::mutate(parameter = "dispersion") + +dtf.params_obs = rbind(beta_obs.dtf.long, alpha_obs.dtf.long) + + + +p = ggplot(dtf.params_obs, aes(x= value)) + + geom_histogram(aes(y=..density..), colour="black", fill="white") + facet_grid(~parameter)+ + theme(strip.text.x = element_text(size = 13), + axis.title = element_text(size = 5), + axis.text = element_text(size = 5)) + +p +``` + + + +## gene segemntation + + +```{r} +dds.extraction$beta +kmean.res = kmeans(dds.extraction$beta[,c(2,3)], 3) + +pca.obj= prcomp(dds.extraction$beta[,c(2,3)]) + + +res = summary(pca.obj) +library(kableExtra) +res$importance[,1:2] %>% + kbl(., caption = "Table: Variance explained per Principal Component", position = "bottom", align = 'c') %>% + kable_styling(full_width = F) + + +dtp <- data.frame( 'cluster' = kmean.res$cluster , + pca.obj$x[,1:2]) # the first two components are selected (NB: you can also select 3 for 3D plotting or 3+) + + +dtp$cluster <- factor(dtp$cluster) +## Plot +P1 <- ggplot(data = dtp) + + geom_point(aes(x = PC1, y = PC2, + col = cluster), + size =3) + + theme_minimal() +P1 + +ggsave("../results/figures/ACP_kmeans.png", P1) + +dtf = dds.extraction$beta %>% mutate(cluster = kmean.res$cluster ) %>% + reshape2::melt(. ,id=c("cluster"), value.name = "value", variable.name= "type") +dtf$cluster <- factor(dtf$cluster) + + +p = ggplot(dtf) + geom_density(aes(x = value, fill = cluster ), alpha = 0.4) + facet_grid(~ type, scales = "free") +p +ggsave("figures/densityBeta_kmeans.png", p) + + +betas = dds.extraction$beta %>% mutate(cluster = kmean.res$cluster) +betas$cluster = factor(betas$cluster) +p = ggplot(betas) + geom_point(aes(x= betaG, betaGE, col = cluster)) +p +ggsave("figures/scattrplot_clustering.png", p) + +``` + + +```{r} +dds.extraction.bis = dds.extraction$beta %>% mutate(cluster = kmean.res$cluster) + + +beta.cluster1= dds.extraction$beta[kmean.res$cluster == 1,] +beta.cluster2= dds.extraction$beta[kmean.res$cluster == 2,] +beta.cluster3= dds.extraction$beta[kmean.res$cluster == 3,] + + +``` + + + + + + +## all-in + +```{r} + +beta.input1 = getBetaforSimulation(n_genes = n_genes, n_genotypes = n_G, n_environments = n_E, beta.dtf = beta.cluster1) +beta.input2 = getBetaforSimulation(n_genes = n_genes, n_genotypes = n_G, n_environments = n_E, beta.dtf = beta.cluster2) +rownames(beta.input2) = base::paste("gene", 101:(200), sep = "") +beta.input3 = getBetaforSimulation(n_genes = n_genes, n_genotypes = n_G, n_environments = n_E, beta.dtf = beta.cluster3) +rownames(beta.input3) = base::paste("gene", 201:(300), sep = "") + +beta.input = rbind(beta.input1, beta.input2, beta.input3) + +design2simulate = buildDesign2simulate(n_genotype = n_G, n_environment = n_E, n_replicate = 15) +log_qij = getLog_qij(model_matrix = design2simulate$model_matrix, beta.matrix.input = beta.input) +gene_dispersion = getGenesDispersionsForSimulation(n_genes = n_genes, n_genotypes = n_G, n_environments = n_E, dispersion.vec = dds.extraction$gene_dispersion, model_matrix = design2simulate$model_matrix, dispUniform_btweenCondition = T) +mu_ij = getMu_ij(log_qij, 1) +kij.simulated = getK_ij(mu_ij, gene_dispersion) + +kij.simulated %>% dim() +``` + +## Deseq + +```{r} + +beta.actual.matrix = beta.input +## 2 modifY +dds_simu = run.deseq(tabl_cnts = kij.simulated , bioDesign = design2simulate$design2simulate ) + +############# +# Param +threshold = 0.5 +############ +listBeta = DESeq2::resultsNames(dds_simu) +plan(multisession, workers = 4) +res = listBeta %>% furrr::future_map(.x = ., ~DESeq2::results(dds_simu, contrast=list(.x), lfcThreshold = threshold) %>% data.frame() %>% .$padj) +padj.matrix = do.call("cbind", res) + + + +dds_simu.mcols = S4Vectors::mcols(dds_simu,use.names=TRUE) + +dds.simu.mcols.colnamesReshaped = colnames(dds_simu.mcols) %>% + stringr::str_replace(., "_vs_G0", "") %>% + stringr::str_replace(., "_vs_E0", "") %>% + stringr::str_replace_all(., "_", "") %>% + stringr::str_replace(., "\\.", ":") + +columnOfInterest = design2simulate$model_matrix %>% base::colnames() %>% stringr::str_replace_all(., "[//(//)]", "") +#dds_simu.mcols[,columnOfInterest] + +## Get only column of interest +idx_cols = base::match(columnOfInterest, dds.simu.mcols.colnamesReshaped) +beta.infered = dds_simu.mcols[,idx_cols] + +## homogeneize column names & rownames +idx_cols = base::match(columnOfInterest, beta.actual.matrix %>% colnames()) +beta.actual.matrix = beta.actual.matrix[,idx_cols] +colnames(beta.infered) = base::colnames(beta.actual.matrix) +colnames(padj.matrix) = base::colnames(beta.actual.matrix) +rownames(padj.matrix) = base::rownames(beta.actual.matrix) +beta.actual.matrix %>% dim() +padj.matrix %>% dim() +beta.infer.long = beta.infered %>% data.frame() %>% + tibble::rownames_to_column(., var = "gene_id") %>% + dplyr::mutate(origin = "Inference") %>% + reshape2::melt(., value.name = "value", variable.name= "beta") +beta.actual.matrix.long = beta.actual.matrix %>% data.frame() %>% + tibble::rownames_to_column(., var = "gene_id") %>% + dplyr::mutate(origin = "Actual") %>% + reshape2::melt(., value.name = "value", variable.name= "beta") +padj.matrix.long = padj.matrix %>% data.frame() %>% + tibble::rownames_to_column(., var = "gene_id") %>% + dplyr::mutate(origin = "padj") %>% + reshape2::melt(., value.name = "value", variable.name= "beta") + +beta.merged.long = rbind(beta.infer.long, beta.actual.matrix.long, padj.matrix.long) +#beta.merged.long$beta %>% unique() + +beta.merged.long.reshape = beta.merged.long %>% dplyr::mutate(type = dplyr::case_when( + str_detect(beta, "genotypeG\\d+\\.environment") ~ "GxE", + str_detect(beta, "genotypeG\\d+$") ~ "G", + str_detect(beta, "environmentE\\d+$") ~ "E", + str_detect(beta, "Intercept$") ~ "Intercept") +) + + +beta.merged.long.reshape2 = beta.merged.long.reshape %>% reshape2::dcast(., gene_id + beta + type ~ origin) +beta.merged.long.reshape2$type = factor(beta.merged.long.reshape2$type, levels = c("Intercept", "G", "E", "GxE")) +beta.merged.long.reshape2$threshold = threshold + + + +## 2 modifY +df_roc_deseq = beta.merged.long.reshape2 %>% mutate(from = "DESEQ2") + + +df_roc_deseq %>% group_by(gene_id, type) %>% tally() +df_roc_deseq$gene_id %>% unique() %>% length +``` + +## GLM + + +```{r} +kij.simulated %>% dim() +############# GLM fitting ################### +plan(multisession, workers = 4) +a = 1:300 %>% furrr::future_map(.x = ., ~run.glm(kij.simulated[.x,], ## 2 modify !!! + .x, + design2simulate$design2simulate, threshold = threshold) ) + +#kij.simulated %>% dim() +############ save res ##### + +c = do.call(rbind, a) +beta.input.long = beta.input %>% data.frame() %>% + tibble::rownames_to_column(., var = "gene_id") %>% + dplyr::mutate(origin = "Actual") %>% + reshape2::melt(., value.name = "value", variable.name= "beta") %>% + dplyr::rename(Actual = "value") %>% + dplyr::select(-origin) + +df_tmp = merge(c, beta.input.long) %>% mutate(padj = p.adjust(pval, method= "fdr")) %>% mutate(threshold = threshold) + + + + + +## 2 modify +df_roc_glm = df_tmp %>% dplyr::select(c(-dispersion, -pval)) %>% mutate(from = 'MASS::glm.nb') + + +df_roc_glm %>% group_by(gene_id, type) %>% tally() +df_roc_glm$gene_id %>% unique() %>% length() +``` + + +## merge df + +```{r} + +df_roc_deseq = df_roc_deseq %>% mutate(label = ifelse(abs(Actual) > threshold, "DE", "nonDE" )) %>% + mutate( prediction = ifelse(padj < 0.05, "DE", "nonDE" )) +df_roc_glm = df_roc_glm %>% mutate(label = ifelse(abs(Actual) > threshold, "DE", "nonDE" )) %>% + mutate( prediction = ifelse(padj < 0.05, "DE", "nonDE" )) + +df_roc = rbind(df_roc_deseq, df_roc_glm) + + +cluster1 = base::paste("gene", 1:(100), sep = "") +cluster2 = base::paste("gene", 101:(200), sep = "") +cluster3 = base::paste("gene", 201:(300), sep = "") + +df_roc = df_roc %>% mutate(cluster = ifelse(gene_id %in% cluster1, 1, NA)) %>% mutate(cluster = ifelse(gene_id %in% cluster2, 2, cluster)) %>% mutate(cluster = ifelse(gene_id %in% cluster3, 3, cluster)) +df_roc$cluster = factor(df_roc$cluster) +``` + + +```{r} + +df_roc %>% dplyr::filter(cluster == 2 ) %>% group_by(type, label, from ) %>% tally() +#%>% filter(cluster == 2 ) + +library(plotROC) +p = ggplot(df_roc %>% filter(type != "Intercept") , aes(d = label , m = padj, color = from)) + + geom_roc(n.cuts = 0, labels = F) + facet_grid(type~cluster) + +p + style_roc() +ggsave(filename = "figures/ROCcurve3.png", plot = p + style_roc(), width = 15) + +p = ggplot(calc_auc(p), aes(x = threshold, y = AUC, group=1)) + geom_point() + geom_line() + facet_grid(~type) +ggsave(filename = "figures/AUC.png", plot = p, width = 15, height = 10) +### boxplot FP + +``` + diff --git a/results/v2/2022-11-23_dev2.Rmd b/results/v2/2022-11-23_dev2.Rmd new file mode 100644 index 0000000000000000000000000000000000000000..eafef0c4056680f2ede0a67298ce49af7435a399 --- /dev/null +++ b/results/v2/2022-11-23_dev2.Rmd @@ -0,0 +1,418 @@ +--- +title: "HTRSIM" +date: '2022-11-23' +output: + html_document: + +css: + - css/template.css + +--- + + +```{r setup, message=FALSE, warning=FALSE, include=TRUE, results="hide"} +library(HTRSIM) +library(furrr) +#library(tidyverse) +#library(reshape2) +#library(kableExtra) +#library(gridExtra) +#library(MatrixGenerics) +``` + + +## Simulation without ru_rm_5 - all genes + + +```{r} +## Import & reshape table counts +fn = system.file("extdata/", "SRP217588_YM_vkallisto.tsv", package = "HTRSIM") +tabl_cnts <- read.table(file = fn, header = TRUE) +rownames(tabl_cnts) <- tabl_cnts$gene_id +tabl_cnts <- tabl_cnts %>% dplyr::select(-gene_id)##suppr colonne GeneID +tabl_cnts = tabl_cnts[order(tabl_cnts %>% rownames()),] +## DESIGN +fn = system.file("extdata/", "SRP217588_YM_bioDesign.csv", package = "HTRSIM") +bioDesign <- read.table(file = fn, header = T, sep = ';') + +## defining reference +bioDesign$genotype <- factor(x = bioDesign$genotype,levels = c("GSY147", "RM11")) +bioDesign$environment <- factor(x = bioDesign$environment, levels = c( "untreated", "treated")) + + +bioDesign = bioDesign %>% dplyr::filter(!str_detect(sample, "ru_rm_5") ) +tabl_cnts = tabl_cnts %>% dplyr::select(., !matches( "ru_rm_5")) +#tabl_cntsA = tabl_cnts + +``` + + + + +```{r } +## Launch DESEQ2 +dds = run.deseq(tabl_cnts, bioDesign = bioDesign) + +## Extract + +dds.extraction = HTRSIM::extractDistributionFromDDS(dds_obj = dds) +#dds.extraction$gene_dispersion + +``` + + +```{r message=FALSE, warning=FALSE, include=TRUE, results="hide"} +## Data viz +p<- ddsExtraction.viz(dds.extraction = dds.extraction) +p +ggsave(filename = "figures/inputParams_distrib.png", plot = p, width = 20, height = 12) + +``` + +## Gene segmentation + +```{r} + +n_group_gene = 3 +dds.extraction$beta +kmean.res = kmeans(dds.extraction$beta[,c(2,3)], n_group_gene) + + +beta.cluster1= dds.extraction$beta[kmean.res$cluster == 1,] +beta.cluster2= dds.extraction$beta[kmean.res$cluster == 2,] +beta.cluster3= dds.extraction$beta[kmean.res$cluster == 3,] + + +x = beta.cluster1 %>% as.matrix() +fit.mvrnorm <- Rfast::mvnorm.mle(x) +#### BETAG.E ##### +fit.mvrnorm$sigma[,2] = fit.mvrnorm$sigma[,2] +fit.mvrnorm$sigma[4,] = fit.mvrnorm$sigma[4,]*2 +``` + + + +## BETA + + +```{r} + +getGenesDispersions <- function( n_genes, sample_ids, dispersion.vec ,dispUniform_btweenCondition = T ){ + + if (dispUniform_btweenCondition == T ) { + gene_dispersion.dtf = base::sample( dispersion.vec , replace = T, size = n_genes) %>% base::data.frame() + n_rep = length(sample_ids) + gene_dispersion.dtf = gene_dispersion.dtf[,base::rep(base::seq_len(base::ncol(gene_dispersion.dtf)), n_rep)] + rownames(gene_dispersion.dtf) = base::paste("gene", 1:(n_genes), sep = "") + colnames(gene_dispersion.dtf) = sample_ids + + } + + else { + + replication_table = sample_ids %>% stringr::str_replace(., pattern = "_[0-9]+","" ) %>% table() + gene_dispersion.dtf = replication_table %>% purrr::map(., ~sample( dispersion.vec, replace = T, size = n_genes) ) %>% data.frame() + gene_dispersion.dtf = gene_dispersion.dtf[,rep(seq_len(ncol(gene_dispersion.dtf)), replication_table %>% as.numeric())] + colnames(gene_dispersion.dtf) = sample_ids + rownames(gene_dispersion.dtf) = base::paste("gene", 1:(n_genes), sep = "") + + } + + return(gene_dispersion.dtf %>% as.matrix) + +} + + +get_kij <- function(mu_ij.matx, dispersion.matx, n_genes, sample_id_list, idx_replicat){ + n_sples = length(sample_id_list) + alpha_gene = 1/dispersion.matx + k_ij = stats::rnbinom(length(mu_ij.matx), size = alpha_gene , mu = mu_ij.matx) %>% matrix(. , nrow = n_genes, ncol = n_sples ) + k_ij[is.na(k_ij)] = 0 + colnames(k_ij) = base::paste(sample_id_list, idx_replicat, sep = '_') + rownames(k_ij) = rownames(mu_ij.matrix) + return(k_ij) +} + +######### Utils ########################### +uniform_replication <- function(maxN, n_samples) return(rep(T, time = maxN) %>% rep(., each = n_samples ) %>% matrix(ncol = n_samples)) + +random_replication <- function(maxN, n_samples){ + replicating <- function(maxN) return(sample(x = c(T,F), size = maxN, replace = T)) + res = purrr::map(1:n_samples, ~replicating(maxN-1)) + rep_table = do.call(cbind, res) + rep_table = rbind(rep(T, times = n_samples), rep_table) + return(rep_table) +} +################################# + +``` + +```{r} +######### Settings ############# +n_genes = 4 +n_genotypes = 100 +n_E = 2 +max_n_replicates = 15 +################################# + + +##### Sampling from mvnorm ######## +beta.matrix <- MASS::mvrnorm(n = n_genes*(n_genotypes), + mu = fit.mvrnorm$mu, + Sigma = fit.mvrnorm$sigma ) +################################### + +#### Some reshaping ############## +genes_vec = base::paste("gene", 1:n_genes, sep = "") +genotype_vec = base::paste("G", 0:(n_genotypes-1), sep = "") +environment_vec = base::paste("E", 0:(n_E-1), sep = "") +######################################## +m = c(1,1,0,0,1,1,1,1) +design.matrix = matrix(data = m , ncol = 2, byrow = F) +colnames(design.matrix) = environment_vec +rownames(design.matrix) = beta.matrix %>% colnames() +################################## + +#### Computing log_qij & mu_ij ########### +log_qij = beta.matrix %*% design.matrix +mu_ij = getMu_ij(log_qij, 2) + + +label_genotype = genotype_vec %>% rep(time = n_genes) +label_gene = rep(genes_vec, each = n_genotypes) + + +##### Preparing data for simulation ########" +mu_ij.matrix = mu_ij %>% + data.frame() %>% + dplyr::mutate(genotype = label_genotype) %>% + dplyr::mutate(gene_id = label_gene ) %>% + #dplyr::mutate(replicate_idx = 1 ) %>% + reshape2::melt(., id.vars = c("gene_id", 'genotype'), + value.name = "mu_ij", variable.name= "environment") %>% + reshape2::dcast(., gene_id ~ genotype + environment , value.var = "mu_ij") %>% + column_to_rownames("gene_id") %>% as.matrix() + + +######################################### + +#replication.matrix = random_replication(maxN = max_n_replicates, n_samples = n_genotypes*n_E) +replication.matrix = uniform_replication(maxN = max_n_replicates, n_samples = n_genotypes*n_E) + +########### SIMU +sample_ids = colnames(mu_ij.matrix) +dispersion.matrix = getGenesDispersions(n_genes, sample_ids, dispersion.vec = dds.extraction$gene_dispersion, dispUniform_btweenCondition = T ) +kij.list = purrr::map(.x = 1:max_n_replicates, .f = ~get_kij(mu_ij.matrix[ ,replication.matrix[.x, ]], + dispersion.matrix[ ,replication.matrix[.x, ]], + n_genes = n_genes, + sample_ids[replication.matrix[.x, ]], + .x + ) ) +kij.simulated = do.call(cbind, kij.list) + +kij.simulated %>% dim() + + +``` +## FIT + + + +```{r} +## benchmarking lme4 / glmmTMB +library(lme4) +library(glmmTMB) +library(broom.mixed) +library(mice) + +df_2glmmm <- function(y , design_simulation, gene_name){ + df_gene_i = cbind(design_simulation, y) + df_gene_i = df_gene_i %>% mutate(gene_id = gene_name) + return(df_gene_i) +} + + +fit_extraction <- function(fit){ + fit.res = broom.mixed::tidy(fit) %>% arrange(term)%>% .$estimate %>% as.numeric() + B0 = fit.res[3] + BE = fit.res[4] + sd_BGE_E0 = fit.res[5] + sd_BGE_E1 = fit.res[7] + sd_BG_E0 = fit.res[6] + sd_BG_E1 = fit.res[8] + correlation_genotype = fit.res[2] + correlation_interaction = fit.res[1] + #################################################################### + res = list(mean_E0 = B0, mean_E1 = BE, sd_BGE_E0 = sd_BGE_E0, sd_BGE_E1 = sd_BGE_E1, sd_BG_E0 = sd_BG_E0, sd_BG_E1 = sd_BG_E1, correlation_genotype = correlation_genotype, correlation_interaction = correlation_interaction) %>% data.frame() + return(res) +} + +fit_glmm <- function(data_glmm, i){ + gene_i = paste('gene', i, sep = "") + data_glmm.filter = data_glmm %>% filter(gene_id == gene_i) + ########################### FIT LME4 ################################# + print("LME4") + m.nb <- glmer.nb(y ~ 0 + environment + ( 1 + environment | genotype/environment ) , data= data_glmm.filter , verbose=F) + res_lme4 = fit_extraction(m.nb) + res_lme4 = res_lme4 %>% mutate(gene_id = gene_i) %>% mutate(from = "lme4") + + ##################### FIT GMTMB 1 ######################### + print("glmTMB 1") + m.nb <- glmmTMB::glmmTMB(y ~ 0 + environment + ( 1 + environment | genotype/environment ) , data=data_glmm.filter, family=nbinom1, verbose = F) + res_glmTMB1 = fit_extraction(m.nb) + res_glmTMB1 = res_glmTMB1 %>% mutate(gene_id = gene_i) %>% mutate(from = "glmTMB nbinom1") + + ##################### FIT GMTMB 2 ######################### + print("glmTMB 2") + + m.nb <- glmmTMB::glmmTMB(y ~ 0 + environment + ( 1 + environment | genotype/environment ) , data=data_glmm.filter, family=nbinom2, verbose = F) + res_glmTMB2 = fit_extraction(m.nb) + res_glmTMB2 = res_glmTMB2 %>% mutate(gene_id = gene_i) %>% mutate(from = "glmTMB nbinom2") + + res = rbind(res_lme4, res_glmTMB1, res_glmTMB2) + return(res) +} + +``` + +```{r} +####### ESTIMATION ############* +experimental_design = colnames(kij.simulated) %>% + str_split(., pattern = '_', simplify = T) %>% + .[,c(1,2)] %>% data.frame() +colnames(experimental_design) = c("genotype", "environment") +gene_id_list = rownames(kij.simulated) +f = map(.x = gene_id_list, ~df_2glmmm(kij.simulated[.x,], experimental_design, .x) ) +data_glmm = do.call(rbind, f) +plan(multisession, workers = 4) +results = furrr::future_map(.x = 1:n_genes, ~fit_glmm(data_glmm, .x) ) +res = do.call(rbind, results) +################################################################# + +ground_truth = beta.matrix %>% data.frame() %>% + mutate(gene_id = label_gene) %>% + mutate(genotype = label_genotype) %>% + mutate(E0 = beta0 + betaG ) %>% + mutate(E1 = beta0 + betaE + betaG + betaGE ) + +gt_glmm = ground_truth %>% group_by(gene_id) %>% + summarize( mean_E0 = mean(E0), + mean_E1 = mean(E1), + sd_BG_E0 = sd(betaG), ## ok + sd_BG_E1 = sd(betaGE), + sd_BGE_E0 = sd(betaE), + sd_BGE_E1 = sd(betaE)) %>% + mutate(correlation = NA) + + +######### JOIN ##### +res.long = res %>% reshape2::melt( id = c("gene_id", "from"), value.name = "Inference") #%>% mutate(type = 'Inference') +input.long = gt_glmm %>% reshape2::melt( id = "gene_id", value.name = "Actual") +df_merged = merge(res.long, input.long, by = c("gene_id", "variable")) #%>% mutate(n_genotype = n_G) +df_benchmark = df_merged +``` + +```{r} + +p = ggplot(df_benchmark) + geom_point(aes(x = Actual , y = Inference, color = from ), alpha = 0.5, size = 4) + + facet_wrap(~variable, scales = "free", ncol = 2 ) + geom_abline(slope=1, intercept=0) + + scale_color_manual(values = c("#D55E00", "#E69F00", "#0072B2")) +p +ggsave(filename = 'figures/benchmark_identity8.png',p, width = 12) + + + +df_benchmark %>% filter(variable == 'correlation_interaction') %>% .$Inference %>% mean() +df_benchmark %>% filter(variable == 'mean_E1') %>% .$Inference %>% mean() + +6.5-22+2.34 + + +fit.mvrnorm$sigma[2,3] = 0.1 +fit.mvrnorm$sigma[3,2] = 0.1 + + +mean(res$correlation) +mean(res$sd_E0*res$sd_E1*res$correlation) +``` + +## GLM classique + +```{r} + +run.glm2 <- function(data_glm, i, threshold = 0) { + rownames(data_glm) <- NULL + tryCatch({ + fit = MASS::glm.nb(y ~ genotype + environment + genotype:environment, data = data_glm, link = log) + return(reshapeGlmRes(fit, i, threshold)) + }, + + error = function(cnd){ + print('noop') + return(reshapeGlmRes(NULL, i, threshold , error_bool = T)) +} +) +} + +run.glm3 <- function(data_glm, i, threshold = 0) { + rownames(data_glmm) <- NULL + #data_glm = data_glm %>% select(-gene_id) + #print(data_glm) + fit = MASS::glm.nb(y ~ genotype + environment + genotype:environment, data = data_glm, link = log) + reshapeGlmRes(fit, i, threshold) +} + +############# GLM fitting ################### +data_glm = data_glmm +rownames(data_glm) <- NULL +plan(multisession, workers = 4) +a = gene_id_list %>% furrr::future_map(.x = ., ~run.glm3(data_glm %>% filter(gene_id == .x), + .x, + threshold = 0.5) ) + + + +.############ save res ##### +c = do.call(rbind, a) + +c %>% filter(beta == 'genotypeG5.environmentE1') +``` + +```{r} +ground_truth = beta.matrix %>% data.frame() %>% + mutate(gene_id = label_gene) %>% + mutate(genotype = label_genotype) %>% + reshape2::melt(., id = c("gene_id", 'genotype'), value.name = "Actual", variable.name = "beta") + + +part1 = ground_truth %>% filter(beta %in% c("beta0", 'betaE')) %>% + group_by(gene_id, beta) %>% + summarize( Actual = mean(Actual)) %>% + mutate(beta = ifelse(beta == "beta0", "Intercept", 'betaE')) + +part2 = ground_truth %>% filter(beta == "betaG") %>% + mutate(beta = base::paste("genotype", genotype, sep = "")) %>% select(-genotype) + +part3 = ground_truth %>% filter(beta == "betaGE") %>% + mutate(beta = base::paste("genotype", genotype, ".environmentE1", sep = "")) %>% select(-genotype) + + +input = rbind(part1,part2, part3) %>% ungroup() +input %>% dim() +df_merged = merge(c, input, by = c("gene_id", "beta")) #%>% mutate(n_genotype = n_G) + +``` + +```{r} + + + +p = ggplot(df_merged) + geom_point(aes(x = Actual * log(2) , y = Inference ), alpha = 0.5, size = 4) + + facet_grid(~type, scales = "free" ) + geom_abline(slope=1, intercept=0) + + scale_color_manual(values = c("#D55E00", "#E69F00", "#0072B2")) +p + + +``` + + diff --git a/results/v2/glmm_benchmark.Rmd b/results/v2/glmm_benchmark.Rmd new file mode 100644 index 0000000000000000000000000000000000000000..11978f9a920216806de8d1a2247ef014cd6d302d --- /dev/null +++ b/results/v2/glmm_benchmark.Rmd @@ -0,0 +1,421 @@ +--- +title: "HTRSIM" +date: '2022-07-26' +output: + html_document: + +css: + - css/template.css + +--- + + +```{r setup, message=FALSE, warning=FALSE, include=TRUE, results="hide"} +library(HTRSIM) +library(furrr) +#library(tidyverse) +#library(reshape2) +#library(kableExtra) +#library(gridExtra) +#library(MatrixGenerics) +``` + + +## Simulation without ru_rm_5 - all genes + + +```{r} +## Import & reshape table counts +fn = system.file("extdata/", "SRP217588_YM_vkallisto.tsv", package = "HTRSIM") +tabl_cnts <- read.table(file = fn, header = TRUE) +rownames(tabl_cnts) <- tabl_cnts$gene_id +tabl_cnts <- tabl_cnts %>% dplyr::select(-gene_id)##suppr colonne GeneID +tabl_cnts = tabl_cnts[order(tabl_cnts %>% rownames()),] +## DESIGN +fn = system.file("extdata/", "SRP217588_YM_bioDesign.csv", package = "HTRSIM") +bioDesign <- read.table(file = fn, header = T, sep = ';') + +## defining reference +bioDesign$genotype <- factor(x = bioDesign$genotype,levels = c("GSY147", "RM11")) +bioDesign$environment <- factor(x = bioDesign$environment, levels = c( "untreated", "treated")) + + +bioDesign = bioDesign %>% dplyr::filter(!str_detect(sample, "ru_rm_5") ) +tabl_cnts = tabl_cnts %>% dplyr::select(., !matches( "ru_rm_5")) +#tabl_cntsA = tabl_cnts + +``` + + + + +```{r } +## Launch DESEQ2 +dds = run.deseq(tabl_cnts, bioDesign = bioDesign) + +## Extract + +dds.extraction = HTRSIM::extractDistributionFromDDS(dds_obj = dds) +#dds.extraction$gene_dispersion + +``` + + +```{r message=FALSE, warning=FALSE, include=TRUE, results="hide"} +## Data viz +p<- ddsExtraction.viz(dds.extraction = dds.extraction) +p +ggsave(filename = "figures/inputParams_distrib.png", plot = p, width = 20, height = 12) + +``` + +## Gene segmentation + +```{r} +get_beta_gene_i <- function(fit.mvnorm, n_G, n_E){ + beta.matrix.tmp <- MASS::mvrnorm(n = (n_E-1)*(n_G-1), + mu = fit.mvnorm$mu, + Sigma = fit.mvnorm$sigma ) + + beta0 = beta.matrix.tmp[1,1] %>% unname() + betaG = beta.matrix.tmp[1:(n_G-1),2] + betaE = beta.matrix.tmp[1:(n_E-1),3] + betaGE = beta.matrix.tmp[,4] + + ### name + betaG.colnames = base::paste("genotype", "G", 1:(n_G-1), sep = "") + betaE.colnames = base::paste("environment", "E", 1:(n_E-1), sep = "") + betaGE.colnames = as.vector(outer(betaG.colnames, betaE.colnames, paste, sep=":")) + matrix.colnames = c('Intercept', betaG.colnames, betaE.colnames, betaGE.colnames) + + beta_gene_i <- c( beta0, betaG, betaE, betaGE) + names(beta_gene_i) = matrix.colnames + + return(beta_gene_i) +} + + +n_g = 1 +n_group_gene = 3 +n_E = 2 +n_G = 400 + + +dds.extraction$beta +kmean.res = kmeans(dds.extraction$beta[,c(2,3)], n_group_gene) + + +beta.cluster1= dds.extraction$beta[kmean.res$cluster == 1,] +beta.cluster2= dds.extraction$beta[kmean.res$cluster == 2,] +beta.cluster3= dds.extraction$beta[kmean.res$cluster == 3,] + + +x = beta.cluster3 %>% as.matrix() +fit.mvrnorm <- Rfast::mvnorm.mle(x) +fit.mvrnorm$sigma[c(2,4),c(2,4)] = fit.mvrnorm$sigma[c(2,4),c(2,4)]*3 +#diag(fit.mvrnorm$sigma) <- diag(fit.mvrnorm$sigma) + +#### BETAG.E ##### +fit.mvrnorm$mu[4] = 0 +fit.mvrnorm$sigma[,4] = 0 +fit.mvrnorm$sigma[4,] = 0 +################# +#fit.mvrnorm$mu[0] = 30 + +#### BETA0 ##### +fit.mvrnorm$mu[1] = 0 +fit.mvrnorm$sigma[,1] = 0 +fit.mvrnorm$sigma[1,] = 0 +################# + +#### BETAE ##### +fit.mvrnorm$mu[3] = 0 +fit.mvrnorm$sigma[,3] = 0 +fit.mvrnorm$sigma[3,] = 0 + +#fit.mvrnorm$mu[2] = 8 +#fit.mvrnorm$sigma[2,2]= 6 +x <- NULL + + +n_gene = n_g +a = purrr::map(.x = 1:n_gene, ~ get_beta_gene_i(fit.mvrnorm, n_G, n_E)) +beta.matrix = do.call(rbind, a) +rownames(beta.matrix) = base::paste("gene", 1:(n_g), sep = "") + + + +x = beta.matrix %>% data.frame() %>% #%>% select(!starts_with("environment")) %>% + rownames_to_column('gene_id') %>% reshape2::melt(., id = "gene_id") %>% dplyr::mutate(type = dplyr::case_when( + str_detect(variable, "genotypeG\\d+\\.environment") ~ "GxE", + str_detect(variable, "genotypeG\\d+$") ~ "G", + str_detect(variable, "environmentE\\d+$") ~ "E", + str_detect(variable, "Intercept$") ~ "Intercept")) %>% reshape2::dcast(., gene_id ~ type, value.var = "value", fun.aggregate = list) + + +g = x$G +names(g) <- x$gene_id +df_tmp2 = data.frame(g) %>% mutate(type = 'betaG') %>% reshape2::melt( id = "type", value.name = "betaG") +g = x$GxE + +names(g) <- x$gene_id +df_tmp3 = data.frame(g) %>% mutate(type = 'GxE') %>% reshape2::melt( id = "type", value.name = "betaGxE") + + +df = cbind(df_tmp2, df_tmp3) %>% dplyr::select(c(betaGxE, betaG)) %>% mutate(from = 'Simulated') %>% mutate(cluster = 3) +#df_simu = df +df_simu = rbind(df_simu, df) + + +df_simu$cluster <- factor(df_simu$cluster) +ggplot(df_simu) + geom_point(aes(x = betaG, y = betaGxE, col = cluster), alpha = 0.2) + +``` + +```{r} + +### Visualisation +dtf = dds.extraction$beta %>% mutate(cluster = kmean.res$cluster ) %>% + reshape2::melt(. ,id=c("cluster"), value.name = "value", variable.name= "type") +dtf$cluster <- factor(dtf$cluster) + + +p = ggplot(dtf) + geom_density(aes(x = value, fill = cluster ), alpha = 0.4) + facet_grid(~ type, scales = "free") + +p +ggsave("figures/densityBeta_kmeans.png", p) + + +betas = dds.extraction$beta %>% mutate(cluster = kmean.res$cluster) %>% mutate(from = "Actual") +betas$cluster = factor(betas$cluster) +df2 <- rbind(betas %>% dplyr::select(betaG, betaGE, from, cluster), df_simu %>% rename(betaGE = "betaGxE")) +p = ggplot(df2) + geom_point(aes(x= betaG, betaGE, col = cluster), alpha = 0.1) + facet_grid(~from) + +p +ggsave("figures/scatterplot.png", p, width = 10) + +``` + +## Build simulated counts + +```{r} +df_2glmmm <- function(y , design2simulate, i){ + genotype = design2simulate$design2simulate$genotype + environment = design2simulate$design2simulate$environment + #message("Fitting model ...") + + df_gene_i = list(y = y , genotype = genotype,environment = environment) %>% data.frame() %>% mutate(inter = paste(environment, genotype, sep = '_')) + df_gene_i$inter <- factor( df_gene_i$inter ) + rownames(df_gene_i) <- NULL + df_gene_i = df_gene_i %>% mutate(gene_id = paste("gene", i, sep = "")) + return(df_gene_i) +} + +## benchmarking lme4 / glmmTMB +library(lme4) +library(glmmTMB) +library(broom.mixed) +library(mice) + + +list_nb_G = c( 10, 100, 400, 700, 1000) +n_attempt = 1:6 +remove(res_benchmark) + +n_G = 10 +for (n_G in list_nb_G){ + ################### Simulation counts ############### + n_gene = 4 + a = purrr::map(.x = 1:n_gene, ~ get_beta_gene_i(fit.mvrnorm, n_G, n_E)) + beta.matrix = do.call(rbind, a) + beta.input = beta.matrix + n_genes = n_g + design2simulate = buildDesign2simulate(n_genotype = n_G, n_environment = n_E, n_replicate = 10) + + log_qij = getLog_qij(model_matrix = design2simulate$model_matrix, beta.matrix.input = beta.input) + gene_dispersion = getGenesDispersionsForSimulation(n_genes = n_genes, n_genotypes = n_G, n_environments = n_E, dispersion.vec = dds.extraction$gene_dispersion, model_matrix = design2simulate$model_matrix, dispUniform_btweenCondition = T) + mu_ij = getMu_ij(log_qij, 1) + kij.simulated = getK_ij(mu_ij, gene_dispersion) + ####################################################### + f = map(.x = 1:n_genes, ~df_2glmmm(kij.simulated[.x,], design2simulate, .x) ) + data_glmm = do.call(rbind, f) + + + ## benchmark lme4 + print('lme4') + start_time <- Sys.time() + m.nb <- glmer.nb(y ~ environment + ( 1 | genotype ) , data=data_glmm, verbose=F) + end_time <- Sys.time() + time_process = difftime(end_time, start_time, units = "secs") %>% as.numeric() + res_lme4 = list(package = "lme4", n_genotype = n_G, duration = time_process, attempt = i) %>% data.frame() + + + print('glmm') + start_time <- Sys.time() + m.nb <- glmmTMB::glmmTMB(y ~ environment + ( 1 | genotype ) , data=data_glmm, family=nbinom1, verbose = F) + end_time <- Sys.time() + time_process = difftime(end_time, start_time, units = "secs") %>% as.numeric() + res_glmmTMB = list(package = "glmmTMB nbinom1", n_genotype = n_G, duration = time_process, attempt = i) %>% data.frame() + + ## bencbmark glmTMB + print('glmm') + start_time <- Sys.time() + m.nb <- glmmTMB::glmmTMB(y ~ environment + ( 1 | genotype ) , data=data_glmm, family=nbinom2, verbose = F) + end_time <- Sys.time() + time_process = difftime(end_time, start_time, units = "secs") %>% as.numeric() + res_glmmTMB2 = list(package = "glmmTMB nbinom2", n_genotype = n_G, duration = time_process, attempt = i) %>% data.frame() + + tmp = rbind( res_lme4, res_glmmTMB, res_glmmTMB2 ) + + if (exists("res_benchmark")) res_benchmark = rbind( res_benchmark, tmp ) + else res_benchmark = tmp + +} + + +``` + +```{r} + +p= ggplot(res_benchmark) + geom_violin(aes(x = duration, y = package, fill = package )) + scale_fill_manual(values = c("#D55E00", "#E69F00", "#0072B2")) + +ggsave(filename = 'figures/benchmark_violin_glmmm.png',p, height = 6) + + +res_benchmark2 = res_benchmark +res_benchmark2$n_genotype = factor(res_benchmark2$n_genotype) +p = ggplot(res_benchmark2, aes(x = n_genotype, y = duration, color = package )) + geom_boxplot() + geom_point(position = position_jitterdodge()) + scale_y_log10() + ylab('duration (sec)') + scale_color_manual(values = c("#D55E00", "#E69F00", "#0072B2")) + +p + +ggsave(filename = 'figures/benchmark_glmm.png',p, height = 6) + +``` + + +```{r} + +fit_extraction <- function(fit){ + fit.res = broom.mixed::tidy(fit) %>% arrange(term)%>% .$estimate %>% as.numeric() + B0 = fit.res[2] + BE = fit.res[3] + sd_BGE = fit.res[5] + correlation = fit.res[1] + sd_BG = fit.res[4] + #################################################################### + res = list(B0 = B0, BE = BE, sd_BG = sd_BG, sd_BGE = sd_BGE, correlation = correlation) %>% data.frame() + return(res) +} + +fit_glmm <- function(data_glmm, i){ + gene_i = paste('gene', i, sep = "") + data_glmm.filter = data_glmm %>% filter(gene_id == gene_i) + ########################### FIT LME4 ################################# + print("LME4") + m.nb <- glmer.nb(y ~ 0 + environment + ( 1 + environment | genotype ) , data= data_glmm.filter , verbose=F) + res_lme4 = fit_extraction(m.nb) + res_lme4 = res_lme4 %>% mutate(gene_id = gene_i) %>% mutate(from = "lme4") + + ##################### FIT GMTMB 1 ######################### + print("glmTMB 1") + m.nb <- glmmTMB::glmmTMB(y ~ 0 + environment + ( 1 + environment | genotype ) , data=data_glmm.filter, family=nbinom1, verbose = F) + res_glmTMB1 = fit_extraction(m.nb) + res_glmTMB1 = res_glmTMB1 %>% mutate(gene_id = gene_i) %>% mutate(from = "glmTMB nbinom1") + + ##################### FIT GMTMB 2 ######################### + print("glmTMB 2") + + m.nb <- glmmTMB::glmmTMB(y ~ 0 + environment + ( 1 + environment | genotype ) , data=data_glmm.filter, family=nbinom2, verbose = F) + res_glmTMB2 = fit_extraction(m.nb) + res_glmTMB2 = res_glmTMB2 %>% mutate(gene_id = gene_i) %>% mutate(from = "glmTMB nbinom2") + + res = rbind(res_lme4, res_glmTMB1, res_glmTMB2) + return(res) +} + + +######################## +## SETUP +######################## +n_gene = 4 +list_nb_G = c(200) +remove(df_benchmark) +######################### +for (n_G in list_nb_G){ + ################### Simulation counts ############### + a = purrr::map(.x = 1:n_gene, ~ get_beta_gene_i(fit.mvrnorm, n_G, n_E)) + beta.matrix = do.call(rbind, a) + beta.input = beta.matrix + design2simulate = buildDesign2simulate(n_genotype = n_G, n_environment = n_E, n_replicate = 8) + log_qij = getLog_qij(model_matrix = design2simulate$model_matrix, beta.matrix.input = beta.input) + gene_dispersion = getGenesDispersionsForSimulation(n_genes = n_gene, n_genotypes = n_G, n_environments = n_E, dispersion.vec = dds.extraction$gene_dispersion, model_matrix = design2simulate$model_matrix, dispUniform_btweenCondition = T) + mu_ij = getMu_ij(log_qij, 1) + kij.simulated = getK_ij(mu_ij, gene_dispersion) + + + ######## INPUT ####### + beta0.input = beta.input[,1] + mean_betaG = beta.input[,2:n_G] %>% rowMeans() + mean_betaE = beta.input[,(n_G+1):(n_G+n_E-1)] + mean_betaGE = beta.input[,(n_G+n_E):dim(beta.input)[2]] %>% rowMeans() + beta0 = beta0.input + mean_betaG + #sd(beta0) + betaE = beta0.input + mean_betaE + mean_betaGE + mean_betaG + #sd_betaG = sd(beta0) + #sd_betaGE = sd(betaE) + sd_betaG = apply(beta.input[,2:n_G], 1, sd) + sd_betaGE = apply(beta.input[,(n_G+n_E):dim(beta.input)[2]], 1, sd) + + sd_betaG = apply(beta.input, 1, sd) + sd_betaGE = apply(beta.input[,2:dim(beta.input)[2]], 1, sd) + input = list(B0 = beta0, BE = betaE, sd_BG = sd_betaG, sd_BGE = sd_betaGE, gene_id = paste('gene', 1:n_gene, sep = ''), correlation = NA) %>% data.frame() + ####################################################### + + + + ####### ESTIMATION ############* + f = map(.x = 1:n_gene, ~df_2glmmm(kij.simulated[.x,], design2simulate, .x) ) + data_glmm = do.call(rbind, f) + plan(multisession, workers = 4) + results = furrr::future_map(.x = 1:n_gene, ~fit_glmm(data_glmm, .x) ) + res = do.call(rbind, results) + ################################################################# + + ########## JOIN ##### + res.long = res %>% reshape2::melt( id = c("gene_id", "from"), value.name = "Inference") #%>% mutate(type = 'Inference') + input.long = input %>% reshape2::melt( id = "gene_id", value.name = "Actual") + df_merged = merge(res.long, input.long, by = c("gene_id", "variable")) %>% mutate(n_genotype = n_G) + + if (exists("df_benchmark")) df_benchmark = rbind( df_benchmark, df_merged ) + else df_benchmark = df_merged + +} + +``` + +```{r} + +p = ggplot(df_benchmark %>% filter(variable != 'correlation')) + geom_point(aes(x = Actual * log(2), y = Inference, color = from ), alpha = 0.5, size = 4) + + facet_wrap(n_genotype~variable, ncol = 4, scales = "free_x") + geom_abline(slope=1, intercept=0) + + scale_color_manual(values = c("#D55E00", "#E69F00", "#0072B2")) +p +ggsave(filename = 'figures/benchmark_identity8.png',p, width = 12) + + +df_benchmark %>% filter(variable == 'sd_BGE') %>% .$Inference %>% mean() +df_benchmark %>% filter(variable == 'sd_BG') %>% .$Inference %>% mean() + +df_benchmark %>% filter(variable == 'correlation') %>% .$Inference %>% mean() + +df_benchmark %>% filter(variable == 'correlation') %>% .$Inference %>% mean()*(df_benchmark %>% filter(variable == 'sd_BG') %>% .$Inference %>% mean() * df_benchmark %>% filter(variable == 'sd_BGE') %>% .$Inference %>% mean()) + +fit.mvrnorm$sigma[4,2] = 0.0001 +fit.mvrnorm$sigma[2,4] = 0.0001 + +fit.mvrnorm$mu[4] = -2 +``` + + + + + diff --git a/results/v2/rocCuves.Rmd b/results/v2/rocCuves.Rmd new file mode 100644 index 0000000000000000000000000000000000000000..af79e1ee367e0269343ef34fd52716e03cd1c565 --- /dev/null +++ b/results/v2/rocCuves.Rmd @@ -0,0 +1,294 @@ +--- +title: "HTRSIM" +date: '2022-09-22' +output: + html_document: + +css: + - css/template.css + +--- + + +```{r setup, message=FALSE, warning=FALSE, include=TRUE, results="hide"} +library(HTRSIM) +library(furrr) +#install.packages("tidyverse") +#library(reshape2) +#library(kableExtra) +#library(gridExtra) +#library(MatrixGenerics) +``` + + +## Public data + +```{r} +## Import & reshape table counts +fn = system.file("extdata/", "SRP217588_YM_vkallisto.tsv", package = "HTRSIM") +tabl_cnts <- read.table(file = fn, header = TRUE) +rownames(tabl_cnts) <- tabl_cnts$gene_id +tabl_cnts <- tabl_cnts %>% dplyr::select(-gene_id)##suppr colonne GeneID +tabl_cnts = tabl_cnts[order(tabl_cnts %>% rownames()),] +## DESIGN +fn = system.file("extdata/", "SRP217588_YM_bioDesign.csv", package = "HTRSIM") +bioDesign <- read.table(file = fn, header = T, sep = ';') + +## defining reference +bioDesign$genotype <- factor(x = bioDesign$genotype,levels = c("GSY147", "RM11")) +bioDesign$environment <- factor(x = bioDesign$environment, levels = c( "untreated", "treated")) + + +bioDesign = bioDesign %>% dplyr::filter(!str_detect(sample, "ru_rm_5") ) +tabl_cnts = tabl_cnts %>% dplyr::select(., !matches( "ru_rm_5")) +#tabl_cntsA = tabl_cnts + +``` + +```{r } +## Launch DESEQ2 +dds = run.deseq(tabl_cnts, bioDesign = bioDesign) + +## Extract + +dds.extraction = HTRSIM::extractDistributionFromDDS(dds_obj = dds) +#dds.extraction$gene_dispersion + +``` + +## params visualization + + +```{r} +beta_obs.dtf.long = dds.extraction$beta %>% reshape2::melt(. , na.rm = T, variable.name = "parameter") + +alpha_obs.dtf.long = dds.extraction$gene_dispersion %>% base::data.frame() %>% + dplyr::rename(., value = ".") %>% + dplyr::mutate(parameter = "dispersion") + +dtf.params_obs = rbind(beta_obs.dtf.long, alpha_obs.dtf.long) + + + +p = ggplot(dtf.params_obs, aes(x= value)) + + geom_histogram(aes(y=..density..), colour="black", fill="white") + facet_grid(~parameter)+ + theme(strip.text.x = element_text(size = 13), + axis.title = element_text(size = 5), + axis.text = element_text(size = 5)) + +p +``` + + +## Simu + +```{r} +#### params ##### +n_G = 30 +n_E = 2 +n_genes = 100 + + +################## +beta.input = getBetaforSimulation(n_genes = n_genes, n_genotypes = n_G, n_environments = n_E, beta.dtf = dds.extraction$beta) + + +design2simulate = buildDesign2simulate(n_genotype = n_G, n_environment = n_E, n_replicate = 15) +#rowSums(beta.input) %>% which.max() +#rowSums(beta.input) %>% max() +log_qij = getLog_qij(model_matrix = design2simulate$model_matrix, beta.matrix.input = beta.input) +log_qij %>% as.numeric() %>% .[log_qij %>% which.max()] + + +gene_dispersion = getGenesDispersionsForSimulation(n_genes = n_genes, n_genotypes = n_G, n_environments = n_E, dispersion.vec = dds.extraction$gene_dispersion, model_matrix = design2simulate$model_matrix, dispUniform_btweenCondition = T) + + +#2* 2^log_qij +mu_ij = getMu_ij(log_qij, 1) +#max(mu_ij) +kij.simulated = getK_ij(mu_ij, gene_dispersion) +#max(kij.simulated) + +#kij.simulated = kij.simulated %>% data.frame() %>% filter_all(all_vars(. < 1000000)) + + +``` + + +## DESEQ2 + +```{r} + +beta.actual.matrix = beta.input +dds_simu = run.deseq(tabl_cnts = kij.simulated , bioDesign = design2simulate$design2simulate ) + +############# +# Param +threshold = 0.07 +############ +listBeta = DESeq2::resultsNames(dds_simu) +plan(multisession, workers = 4) +res = listBeta %>% furrr::future_map(.x = ., ~DESeq2::results(dds_simu, contrast=list(.x), lfcThreshold = threshold) %>% data.frame() %>% .$padj) +padj.matrix = do.call("cbind", res) + + + + dds_simu.mcols = S4Vectors::mcols(dds_simu,use.names=TRUE) + + dds.simu.mcols.colnamesReshaped = colnames(dds_simu.mcols) %>% + stringr::str_replace(., "_vs_G0", "") %>% + stringr::str_replace(., "_vs_E0", "") %>% + stringr::str_replace_all(., "_", "") %>% + stringr::str_replace(., "\\.", ":") + + columnOfInterest = design2simulate$model_matrix %>% base::colnames() %>% stringr::str_replace_all(., "[//(//)]", "") + #dds_simu.mcols[,columnOfInterest] + + ## Get only column of interest + idx_cols = base::match(columnOfInterest, dds.simu.mcols.colnamesReshaped) + beta.infered = dds_simu.mcols[,idx_cols] + + ## homogeneize column names & rownames + idx_cols = base::match(columnOfInterest, beta.actual.matrix %>% colnames()) + beta.actual.matrix = beta.actual.matrix[,idx_cols] + colnames(beta.infered) = base::colnames(beta.actual.matrix) + colnames(padj.matrix) = base::colnames(beta.actual.matrix) + rownames(padj.matrix) = base::rownames(beta.actual.matrix) + beta.actual.matrix %>% dim() + padj.matrix %>% dim() + beta.infer.long = beta.infered %>% data.frame() %>% + tibble::rownames_to_column(., var = "gene_id") %>% + dplyr::mutate(origin = "Inference") %>% + reshape2::melt(., value.name = "value", variable.name= "beta") + beta.actual.matrix.long = beta.actual.matrix %>% data.frame() %>% + tibble::rownames_to_column(., var = "gene_id") %>% + dplyr::mutate(origin = "Actual") %>% + reshape2::melt(., value.name = "value", variable.name= "beta") + padj.matrix.long = padj.matrix %>% data.frame() %>% + tibble::rownames_to_column(., var = "gene_id") %>% + dplyr::mutate(origin = "padj") %>% + reshape2::melt(., value.name = "value", variable.name= "beta") + + beta.merged.long = rbind(beta.infer.long, beta.actual.matrix.long, padj.matrix.long) + #beta.merged.long$beta %>% unique() + + beta.merged.long.reshape = beta.merged.long %>% dplyr::mutate(type = dplyr::case_when( + str_detect(beta, "genotypeG\\d+\\.environment") ~ "GxE", + str_detect(beta, "genotypeG\\d+$") ~ "G", + str_detect(beta, "environmentE\\d+$") ~ "E", + str_detect(beta, "Intercept$") ~ "Intercept") + ) + + + beta.merged.long.reshape2 = beta.merged.long.reshape %>% reshape2::dcast(., gene_id + beta + type ~ origin) + beta.merged.long.reshape2$type = factor(beta.merged.long.reshape2$type, levels = c("Intercept", "G", "E", "GxE")) + beta.merged.long.reshape2$threshold = threshold +``` + +```{r} + +df_comparison = beta.merged.long.reshape2 + + + +df_comparison = df_comparison %>% mutate(label = ifelse(abs(Actual) > threshold, "DE", "nonDE" )) %>% + mutate( prediction = ifelse(padj < 0.05, "DE", "nonDE" )) + +df_comparison$label <- factor(df_comparison$label) +df_comparison$threshold <- factor(df_comparison$threshold) + + + +``` + + +```{r} +## ROC curve dtf +df_comparison$threshold = factor(df_comparison$threshold) +df_roc_deseq = df_comparison + +``` + +## GLM + + +```{r} +design2simulate$design2simulate %>% dim() +kij.simulated %>% dim() + +############# GLM fitting ################### +plan(multisession, workers = 4) +a = 1:n_genes %>% furrr::future_map(.x = ., ~run.glm(kij.simulated[.x,], + .x, + design2simulate$design2simulate, threshold = threshold) ) + +#kij.simulated %>% dim() +############ save res ##### +c = do.call(rbind, a) +beta.input.long = beta.input %>% data.frame() %>% + tibble::rownames_to_column(., var = "gene_id") %>% + dplyr::mutate(origin = "Actual") %>% + reshape2::melt(., value.name = "value", variable.name= "beta") %>% + dplyr::rename(Actual = "value") %>% + dplyr::select(-origin) + +df_tmp = merge(c, beta.input.long) %>% mutate(padj = p.adjust(p.val, method= "fdr")) %>% mutate(threshold = threshold) + + +df_tmp = df_tmp %>% mutate(label = ifelse(abs(Actual) > threshold, "DE", "nonDE" )) %>% + mutate( prediction = ifelse(padj < 0.05, "DE", "nonDE" )) + +df_tmp$label <- factor(df_tmp$label) +df_tmp$threshold <- factor(df_tmp$threshold) + +``` + + +## merge DESEQ GLM + +```{r} +df_roc_glm = df_tmp %>% dplyr::select(c(-dispersion, -pval, -p.val)) +df_roc_glm = df_roc_glm %>% mutate(from = 'MASS::glm.nb') + +df_roc_deseq = df_roc_deseq %>% mutate(from = "DESEQ2") + + +df_roc = rbind(df_roc_deseq, df_roc_glm) + +``` + + + + + +```{r} + +library(plotROC) +p = ggplot(df_roc %>% filter(type != "Intercept") , aes(d = label , m = padj, color = from)) + + geom_roc(n.cuts = 0, labels = F) + facet_grid(~type) + +p + style_roc() +ggsave(filename = "figures/ROCcurve2.png", plot = p + style_roc(), width = 15, height = 8) + +p = ggplot(calc_auc(p), aes(x = threshold, y = AUC, group=1)) + geom_point() + geom_line() + facet_grid(~type) +ggsave(filename = "figures/AUC.png", plot = p, width = 15, height = 10) +### boxplot FP + +df_comparison2 = df_comparison %>% + filter(type != "Intercept") %>% + group_by(label, prediction, type, threshold) %>% + tally() %>% + ungroup() %>% + group_by(label, type, threshold) %>% + mutate(tt = sum(n)) %>% + mutate(proportion = n/tt) %>% + mutate(predict = ifelse(label == prediction, "true", "false")) + + +df_comparison2$threshold = factor(df_comparison2$threshold) +p = ggplot(df_comparison2) + geom_bar(aes(x = threshold , y = proportion, fill = predict),stat="identity") + facet_grid(type~label) +p +p = ggsave(filename = "figures/boxplotFP.png", plot = p, width = 15, height = 8) + +``` + diff --git a/results/v2/roccurvesBISBIS.R b/results/v2/roccurvesBISBIS.R new file mode 100644 index 0000000000000000000000000000000000000000..3d7053e5a205ec5ef2688a05625d352919141eda --- /dev/null +++ b/results/v2/roccurvesBISBIS.R @@ -0,0 +1,186 @@ +## Simu + + +n_genes = 80 +n_rep_list = c( 2, 3, 5, 15, 25 ) +threshold = 0.5 +n_E = 2 +n_G = 100 + + +rm(df_roc) + + + + +beta.input = getBetaforSimulation(n_genes = n_genes, n_genotypes = n_G, n_environments = n_E, beta.dtf = dds.extraction$beta) + +for (n_rep in n_rep_list){ +design2simulate = buildDesign2simulate(n_genotype = n_G, n_environment = n_E, n_replicate = n_rep) +log_qij = getLog_qij(model_matrix = design2simulate$model_matrix, beta.matrix.input = beta.input) +gene_dispersion = getGenesDispersionsForSimulation(n_genes = n_genes, + n_genotypes = n_G, n_environments = n_E, + dispersion.vec = dds.extraction$gene_dispersion, + model_matrix = design2simulate$model_matrix, dispUniform_btweenCondition = T) +mu_ij = getMu_ij(log_qij, 1) +kij.simulated = getK_ij(mu_ij, gene_dispersion) + + + +######################### DESEQ +beta.actual.matrix = beta.input +## 2 modifY +#dds_simu = run.deseq(tabl_cnts = kij.simulated , bioDesign = design2simulate$design2simulate ) + +############# +# Param +threshold = 0.5 + +############## GLM +a = 1:n_genes %>% furrr::future_map(.x = ., ~run.glm(kij.simulated[.x,], ## 2 modify !!! + .x, + design2simulate$design2simulate, threshold = threshold) ) + +############ save res ##### +c = do.call(rbind, a) +beta.input.long = beta.input %>% data.frame() %>% + tibble::rownames_to_column(., var = "gene_id") %>% + dplyr::mutate(origin = "Actual") %>% + reshape2::melt(., value.name = "value", variable.name= "beta") %>% + dplyr::rename(Actual = "value") %>% + dplyr::select(-origin) + +df_tmp = merge(c, beta.input.long) %>% mutate(padj = p.adjust(pval, method= "fdr")) %>% mutate(threshold = threshold) + + + + + +## 2 modify +df_roc_glm = df_tmp %>% dplyr::select(c(-dispersion, -pval)) %>% mutate(from = 'MASS::glm.nb') %>% mutate(n_rep = n_rep) + +#### merge + +#df_roc_deseq = df_roc_deseq %>% mutate(label = ifelse(abs(Actual) > threshold, "DE", "nonDE" )) %>% +# mutate( prediction = ifelse(padj < 0.05, "DE", "nonDE" )) +df_roc_glm = df_roc_glm %>% mutate(label = ifelse(abs(Actual) > threshold, "DE", "nonDE" )) %>% + mutate( prediction = ifelse(padj < 0.05, "DE", "nonDE" )) + +#df_tmp2 = rbind(df_roc_deseq, df_roc_glm) +df_tmp2 = df_roc_glm +if (exists('df_roc') && is.data.frame(get('df_roc'))){ + df_roc = rbind(df_roc, df_tmp2) +} +else{ + df_roc = df_tmp2 +} + + + +} + +df_roc %>% filter(type %in% c("G","GxE"))%>% group_by(from, n_rep) %>% tally() + +df_roc$n_rep <- factor(df_roc$n_rep) +df_roc_REP = df_roc +library(plotROC) +p = ggplot(df_roc %>% filter(type %in% c("G","GxE")) , aes(d = label , m = padj, color = n_rep)) + + geom_roc(n.cuts = 0, labels = F) + + +p2 = p + style_roc() + scale_color_manual(values = c("#D3D3D3","#BDBDBD", "#9E9E9E", "#7D7D7D", "#696969" )) +ggsave(filename = "../../../results/v2/figures/ROCcurve9.png", plot = p2, height = 6, width = 8) + +###################### + +n_genes = 80 +sizeFac_vector = c(0.1, 1, 5, 10, 20 ) +threshold = 0.5 +n_E = 2 +n_G = 20 + + +rm(df_roc) + + + + +beta.input = getBetaforSimulation(n_genes = n_genes, n_genotypes = n_G, n_environments = n_E, beta.dtf = dds.extraction$beta) + +for (sj in sizeFac_vector ){ + print(sj) + + design2simulate = buildDesign2simulate(n_genotype = n_G, n_environment = n_E, n_replicate = 5) + log_qij = getLog_qij(model_matrix = design2simulate$model_matrix, beta.matrix.input = beta.input) + gene_dispersion = getGenesDispersionsForSimulation(n_genes = n_genes, + n_genotypes = n_G, n_environments = n_E, + dispersion.vec = dds.extraction$gene_dispersion, + model_matrix = design2simulate$model_matrix, dispUniform_btweenCondition = T) + mu_ij = getMu_ij(log_qij, sj) + kij.simulated = getK_ij(mu_ij, gene_dispersion) + + + ######################### DESEQ + beta.actual.matrix = beta.input + ## 2 modifY + #dds_simu = run.deseq(tabl_cnts = kij.simulated , bioDesign = design2simulate$design2simulate ) + + ############# + # Param + threshold = 0.5 + + ############## GLM + a = 1:n_genes %>% furrr::future_map(.x = ., ~run.glm(kij.simulated[.x,], ## 2 modify !!! + .x, + design2simulate$design2simulate, threshold = threshold) ) + + ############ save res ##### + c = do.call(rbind, a) + beta.input.long = beta.input %>% data.frame() %>% + tibble::rownames_to_column(., var = "gene_id") %>% + dplyr::mutate(origin = "Actual") %>% + reshape2::melt(., value.name = "value", variable.name= "beta") %>% + dplyr::rename(Actual = "value") %>% + dplyr::select(-origin) + + df_tmp = merge(c, beta.input.long) %>% mutate(padj = p.adjust(pval, method= "fdr")) %>% mutate(threshold = threshold) + + + + + + ## 2 modify + mean_readsPerSample = kij.simulated %>% colSums() %>% mean() %>% round() %>% format(., scientific = FALSE, big.mark = ',' ) + df_roc_glm = df_tmp %>% dplyr::select(c(-dispersion, -pval)) %>% mutate(from = 'MASS::glm.nb') %>% mutate(Depth_seq = mean_readsPerSample) + + #### merge + + #df_roc_deseq = df_roc_deseq %>% mutate(label = ifelse(abs(Actual) > threshold, "DE", "nonDE" )) %>% + # mutate( prediction = ifelse(padj < 0.05, "DE", "nonDE" )) + df_roc_glm = df_roc_glm %>% mutate(label = ifelse(abs(Actual) > threshold, "DE", "nonDE" )) %>% + mutate( prediction = ifelse(padj < 0.05, "DE", "nonDE" )) + + #df_tmp2 = rbind(df_roc_deseq, df_roc_glm) + df_tmp2 = df_roc_glm + if (exists('df_roc') && is.data.frame(get('df_roc'))){ + df_roc = rbind(df_roc, df_tmp2) + } + else{ + df_roc = df_tmp2 + } + + + +} + +df_roc$Depth_seq %>% unique() +df_roc$Depth_seq <- factor(df_roc$Depth_seq, levels = c("24,546", "250,278" , "1,223,084", "2,470,148","4,995,472")) +#df_roc_REP = df_roc +library(plotROC) +p = ggplot(df_roc %>% filter(type %in% c("G","GxE")) , aes(d = label , m = padj, color = Depth_seq)) + + geom_roc(n.cuts = 0, labels = F) +p +p + style_roc() + scale_color_manual(values = c("#D3D3D3","#BDBDBD", "#9E9E9E", "#7D7D7D", "#696969" )) +p +p2 = p + style_roc() + scale_color_manual(values = c("#D3D3D3","#BDBDBD", "#9E9E9E", "#7D7D7D", "#696969" )) +ggsave(filename = "../../../results/v2/figures/ROCcurve10.png", plot = p2, height = 6, width = 8) diff --git a/src/v2/HTRSIM/DESCRIPTION b/src/v2/HTRSIM/DESCRIPTION index 3191056902d98e3a426ac306c70517fbf9a629c5..5c93a758b217026790e10370a079ad22b0dae2f9 100644 --- a/src/v2/HTRSIM/DESCRIPTION +++ b/src/v2/HTRSIM/DESCRIPTION @@ -18,6 +18,7 @@ Depends: Imports: DESeq2, furrr, + glmglrt, MASS, S4Vectors, stats, diff --git a/src/v2/HTRSIM/R/countsGenerator.R b/src/v2/HTRSIM/R/countsGenerator.R index fb17cfc189b64ae15b684fe89d7154c2750cfd40..3af9bf322fa2b1ed25bc362bb51e1482909eba32 100644 --- a/src/v2/HTRSIM/R/countsGenerator.R +++ b/src/v2/HTRSIM/R/countsGenerator.R @@ -1,7 +1,3 @@ - - - - #' Get beta_ij #' #' @param n_genes an integer diff --git a/src/v2/HTRSIM/R/evaluation.R b/src/v2/HTRSIM/R/evaluation.R index 8e9609433abf4577fd8581463f7bc4dc046cd616..6f246c2c003a967ccffb1e6f297b0b174abb305a 100644 --- a/src/v2/HTRSIM/R/evaluation.R +++ b/src/v2/HTRSIM/R/evaluation.R @@ -24,9 +24,7 @@ getDfComparison <- function(dds_simu , model_matrix, beta.actual.matrix, threads padj.matrix = do.call("cbind", res) - dds_simu.mcols = S4Vectors::mcols(dds_simu,use.names=TRUE) - dds.simu.mcols.colnamesReshaped = colnames(dds_simu.mcols) %>% stringr::str_replace(., "_vs_G0", "") %>% stringr::str_replace(., "_vs_E0", "") %>% diff --git a/src/v2/HTRSIM/R/modelFitting.R b/src/v2/HTRSIM/R/modelFitting.R index a374328d48df144fdcd256350482639e00272f09..6918c11c074e9d339e4be3ab5602878c20de3fef 100644 --- a/src/v2/HTRSIM/R/modelFitting.R +++ b/src/v2/HTRSIM/R/modelFitting.R @@ -21,7 +21,7 @@ test_wald <- function(model.res, term, threshold, initvec){ #' Launch MASS:GLM.NB #' #' @param gene_count a row of kij simulated table -#' @param i index of the gene +#' @param gene_name name of the gene #' @import dplyr #' @import stringr #' @import stats @@ -31,10 +31,9 @@ test_wald <- function(model.res, term, threshold, initvec){ #' @export #' #' @examples -reshapeGlmRes <- function(fit, i, threshold , error_bool = F){ +reshapeGlmRes <- function(fit, gene_name, threshold , error_bool = F){ if (error_bool == T){ ## error while fiting model - res = list(Inference = NA, pval = NA, - beta = NA, gene_id = paste("gene", i, sep = ""), type = NA, deviance = NA, p.val = NA) %>% + res = list(Inference = NA, pval = NA, beta = NA, gene_id = paste("gene", i, sep = ""), type = NA, dispersion = NA) %>% data.frame() } else{ ## success to fit model @@ -57,7 +56,7 @@ reshapeGlmRes <- function(fit, i, threshold , error_bool = F){ rownames(res) <- NULL res = res %>% - dplyr::mutate(gene_id = paste("gene", i, sep = "") ) %>% + dplyr::mutate(gene_id = gene_name ) %>% dplyr::mutate(type = dplyr::case_when( stringr::str_detect(beta, "genotypeG\\d+\\.environment") ~ "GxE", stringr::str_detect(beta, "genotypeG\\d+$") ~ "G", diff --git a/src/v2/HTRSIM/devtools_history.R b/src/v2/HTRSIM/devtools_history.R index 4b905d399a8bd38d8957e4af0e11492f568b6c9a..9ad37d1dbbceb46a1b30e98d8129245ca7c390d4 100644 --- a/src/v2/HTRSIM/devtools_history.R +++ b/src/v2/HTRSIM/devtools_history.R @@ -5,4 +5,5 @@ usethis::use_package("stringr") usethis::use_package("MASS") usethis::use_package("DESeq2") usethis::use_package("furrr") +usethis::use_package("glmglrt")