diff --git a/DESCRIPTION b/DESCRIPTION index 2f0e5a8b8d89a99d1f3fffa811332cf31b853fcf..af151bd0364da75aa907e30dfef4faf15edb99e0 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -1,7 +1,7 @@ Package: kmerclust Title: Kmerclust A Small Package To Test If The Kmer Count Correspond To A XY Or XO Genome -Version: 0.0.7 +Version: 0.0.8 Authors@R: person("Laurent", "Modolo", , "laurent.modolo@ens-lyon.fr", role = c("aut", "cre"), comment = c(ORCID = "0000-0002-7606-4110")) @@ -31,7 +31,7 @@ Suggests: VignetteBuilder: knitr Config/fusen/version: 0.5.2 -Date/Publication: 2023/10/06 +Date/Publication: 2023/10/20 Encoding: UTF-8 Roxygen: list(markdown = TRUE) RoxygenNote: 7.2.3 diff --git a/NAMESPACE b/NAMESPACE index 1cdb5c6be59b68a3182cb39c90bc932fee53387e..3771780b04c53a8fef6d6e362e7d5f73939759b0 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -9,8 +9,10 @@ export(compare_models_constaint) export(compute_tpm) export(e_bipoiss) export(e_bipoiss_clust) +export(e_bipoiss_clust_batch) export(em_bipoiss) export(em_bipoiss_clust) +export(get_cluster_poiss) export(loglik_bipoiss) export(loglik_bipoiss_clust) export(m_bipoiss) diff --git a/R/bic_bipoiss_clust.R b/R/bic_bipoiss_clust.R index 5f3685c0dd6273c48676080b8ed65215d61b3c1b..2b07c72423d1bec2deab6a5d01fe9d0b97f05d8e 100644 --- a/R/bic_bipoiss_clust.R +++ b/R/bic_bipoiss_clust.R @@ -11,10 +11,10 @@ bic_bipoiss_clust <- function(x, params, loglik) { k <- 3 if (length(params) >= 2) { - k <- k + 3 + k <- k + 5 } if (length(params) >= 3) { - k <- k + 3 + k <- k + 4 } return(k * log(nrow(x)) - 2 * loglik) } diff --git a/R/e_bipoiss_clust.R b/R/e_bipoiss_clust.R index 98fff50495f9857f8741e6acc8b9e99de7e19c28..726aac69ecc4d488543737ebed921e2d52f97fd9 100644 --- a/R/e_bipoiss_clust.R +++ b/R/e_bipoiss_clust.R @@ -12,13 +12,18 @@ e_bipoiss_clust <- function(x, params) { s <- matrix(0, nrow(x), length(params)) h <- s for (i in 1:length(params)) { - h[, i] <- extraDistr::dbvpois(x, a = params[[i]]$l1, b = params[[i]]$l2, c = params[[i]]$l3) - s[id, i] <- params[[i]]$l3 * - extraDistr::dbvpois(x[id, ] - 1, a = params[[i]]$l1, b = params[[i]]$l2, c = params[[i]]$l3) / - h[id, i] - h[, i] <- params[[i]]$kappa * h[, i] + suppressWarnings({ + h[, i] <- extraDistr::dbvpois(x, a = params[[i]]$l1, b = params[[i]]$l2, c = params[[i]]$l3) + s[id, i] <- params[[i]]$l3 * + extraDistr::dbvpois(x[id, ] - 1, a = params[[i]]$l1, b = params[[i]]$l2, c = params[[i]]$l3) / + h[id, i] + h[, i] <- params[[i]]$kappa * h[, i] + }) } s[is.na(s)] <- 0 - h <- h / rowSums(h) + h[is.na(h)] <- 0 + denum <- rowSums(h) + h[denum > 0, ] <- h[denum > 0, ] / denum[denum > 0] + h[denum == 0, ] <- 1 / ncol(h) return(list(s = s, h = h)) } diff --git a/R/e_bipoiss_clust_batch.R b/R/e_bipoiss_clust_batch.R new file mode 100644 index 0000000000000000000000000000000000000000..b4334578d6262719afcd2b2c22980f758e743a4c --- /dev/null +++ b/R/e_bipoiss_clust_batch.R @@ -0,0 +1,38 @@ +# WARNING - Generated by {fusen} from dev/flat_full_poisson.Rmd: do not edit by hand + +#' run the expetation step for each patch iteratively +#' +#' @param x the data matrix (2 columns) +#' @param params the list of 3 lambda +#' @param hidden the list of hidden variable +#' @param nbatch the number of batch +#' @return a list of hidden variable +#' @export +e_bipoiss_clust_batch <- function(x, params, hidden = NULL, nbatch = 10) { + if (is.null(hidden)) { + hidden <- list() + hidden$current <- 1 + hidden$batch <- rep(1:nbatch, round(nrow(x) / nbatch) + 1)[1:nrow(x)] + hidden$s <- matrix(0, nrow(x), length(params)) + hidden$h <- hidden$s + for (i in 1:length(params)) { + hidden$s[, i] <- rpois(nrow(x), params[[1]]$l3) + hidden$h[, i] <- runif(nrow(x)) + } + hidden$h <- hidden$h / rowSums(hidden$h) + } + res <- e_bipoiss_clust(x[hidden$batch == hidden$current, ], params) + hidden$s[hidden$batch == hidden$current, ] = res$s + hidden$h[hidden$batch == hidden$current, ] = res$h + hidden$current <- hidden$current + 1 + if (hidden$current > nbatch) { + hidden$current <- 1 + } + if(any(is.na(hidden$h))) { + stop("error in e_bipoiss_clust_batch h") + } + if(any(is.na(hidden$s))) { + stop("error in e_bipoiss_clust_batch s") + } + return(hidden) +} diff --git a/R/em_bipoiss_clust.R b/R/em_bipoiss_clust.R index 645b48cca377baf426d90b8ccea688e6fc9fff35..f7919361f807a54e62e4892d16026f325daed0f2 100644 --- a/R/em_bipoiss_clust.R +++ b/R/em_bipoiss_clust.R @@ -4,7 +4,8 @@ #' #' @param x the data matrix (2 columns) #' @param params the list of 3 lambda (initial value) -#' @param frac (default: 1.) fraction of the data to use at each step +#' @param gamma a vector of kappa prior +#' @param nbatch (default: 10) the number of batch #' @param threshold (default: c(0.1, 5)) maximum difference between kappa and lambda #' @param max_iter (default: 100) maximum number of iteration #' @return a list of 3 lambda @@ -21,41 +22,31 @@ em_bipoiss_clust <- function(x, params = list( A = list(kappa = 0.4, l1 = mean(x), l2 = mean(x), l3 = mean(x)), X = list(kappa = 0.3, l1 = 1, l2 = mean(x), l3 = mean(x)), Y = list(kappa = 0.3, l1 = mean(x), l2 = 1, l3 = mean(x)) - ), frac = 1., threshold = c(0.1, 1), max_iter = 100) { - size <- round(nrow(x) * frac) + ), + gamma = c(round(nrow(x) * .8), round(nrow(x) * .15), round(nrow(x) * .05)), + nbatch = 10, threshold = c(0.1, 1), max_iter = 100) { old_params <- params old_params[[1]]$kappa <- Inf + hidden <- NULL iter <- 0 while (param_diff_poiss(params, old_params, threshold = threshold) & iter < max_iter) { old_params <- params - if (size < nrow(x)) { - id <- sample(1:nrow(x), size = size) - } else { - id <- 1:nrow(x) - } - hidden <- e_bipoiss_clust(x[id, ], params) - params <- m_bipoiss_clust(x[id, ], s = hidden$s, h = hidden$h) + hidden <- e_bipoiss_clust_batch(x, params = params, hidden = hidden, nbatch = nbatch) + params <- m_bipoiss_clust(x, s = hidden$s, h = round(hidden$h), gamma = gamma) iter <- iter + 1 } res <- params if (length(res) == 1) { names(res) <- "A" - print(paste(res$A$kappa, res$A$l1, res$A$l2, res$A$l3)) } if (length(res) == 2) { names(res) <- c("A", "X") - print(paste(res$A$kappa, res$A$l1, res$A$l2, res$A$l3)) - print(paste(res$X$kappa, res$X$l1, res$X$l2, res$X$l3)) } if (length(res) == 3) { names(res) <- c("A", "X", "Y") - print(paste(res$A$kappa, res$A$l1, res$A$l2, res$A$l3)) - print(paste(res$X$kappa, res$X$l1, res$X$l2, res$X$l3)) - print(paste(res$Y$kappa, res$Y$l1, res$Y$l2, res$Y$l3)) } - res$loglik <- loglik_bipoiss_clust(x, params) - res$BIC <- bic_bipoiss_clust(x, params, loglik = res$loglik) - print(res$loglik) - print(res$BIC) + res$loglik <- loglik_bipoiss_clust(x, params = params, gamma = gamma) + res$BIC <- bic_bipoiss_clust(x, params = params, loglik = res$loglik) + res <- get_cluster_poiss(x, res = res, params = params, hidden = hidden) return(res) } diff --git a/R/get_cluster_poiss.R b/R/get_cluster_poiss.R new file mode 100644 index 0000000000000000000000000000000000000000..d10b63df6a5f32a6e945282d7c6f62899cfd0f33 --- /dev/null +++ b/R/get_cluster_poiss.R @@ -0,0 +1,19 @@ +# WARNING - Generated by {fusen} from dev/flat_full_poisson.Rmd: do not edit by hand + +#' add the cluster assigment and proba to the results +#' +#' @param x the data matrix (2 columns) +#' @param res the list of the results +#' @param params the list of 3 lambda +#' @param hidden the list of hidden variable +#' @return a bool +#' @export +get_cluster_poiss <- function(x, res, params, hidden) { + chr_name <- c("A", "X", "Y") + hidden <- e_bipoiss_clust(x, params) + res$h <- hidden$h + res$chromosome <- chr_name[ + apply(res$h, 1, function(x){which(x == max(x))[1]}) + ] + return(res) +} diff --git a/R/loglik_bipoiss_clust.R b/R/loglik_bipoiss_clust.R index 6d769c0f2d97e327c7da2ad372d4cf6f8b1efba3..514f4b4aa42c148397f823422fbf12eae4ea2c3c 100644 --- a/R/loglik_bipoiss_clust.R +++ b/R/loglik_bipoiss_clust.R @@ -4,21 +4,22 @@ #' #' @param x the data matrix (2 columns) #' @param params the list of 3 lambda +#' @param gamma a vector of kappa prior #' @return logliklihood #' @importFrom extraDistr dbvpois +#' @importFrom Boom ddirichlet #' @export -loglik_bipoiss_clust <- function(x, params) { +loglik_bipoiss_clust <- function(x, params, gamma) { loglik <- matrix(0, nrow(x), length(params)) + kappa <- c() for (i in 1:length(params)) { loglik[, i] <- params[[i]]$kappa * extraDistr::dbvpois(x = x, a = params[[i]]$l1, b = params[[i]]$l2, c = params[[i]]$l3) + kappa <- c(kappa, params[[i]]$kappa) } loglik <- sum(log(rowSums(loglik))) - if (is.infinite(loglik)) { - print("Inf loglik") - } - if (is.na(loglik)) { - print("NA loglik") + if (length(kappa) > 1) { + loglik <- loglik + Boom::ddirichlet(probabilities = kappa, nu = gamma, logscale = T) } return(loglik) } diff --git a/R/m_bipoiss_clust.R b/R/m_bipoiss_clust.R index 5024bdcf7c6f06d5f38523977e71385c922fe22c..50ab9348a540cfd1f52c2e47dc9c12163e7e1bec 100644 --- a/R/m_bipoiss_clust.R +++ b/R/m_bipoiss_clust.R @@ -4,15 +4,25 @@ #' #' @param x the data matrix (2 columns) #' @param s of size 3 time nrow(x) +#' @param h of size 3 time nrow(x) +#' @param gamma a vector of kappa prior +#' @param m_chr the maximization function for each type of chromosome #' @return a list of 3 lambda #' @importFrom extraDistr dbvpois #' @export -m_bipoiss_clust <- function(x, s, h, +m_bipoiss_clust <- function(x, s, h, gamma, m_chr = list(m_bipoiss_clust_a_chr, m_bipoiss_clust_x_chr, m_bipoiss_clust_y_chr)) { params <- list() + gamma_sum <- sum(gamma) + kappa <- c() for (i in 1:ncol(h)) { params[[i]] <- m_chr[[i]](x, s[, i], h[, i]) - params[[i]]$kappa <- sum(h[, i]) / nrow(h) + params[[i]]$kappa <- (sum(h[, i]) + gamma[i] - 1) / (nrow(h) + gamma_sum - ncol(h)) + kappa <- c(kappa, params[[i]]$kappa) + } + params[[1]]$kappa <- 1 + if (length(kappa) > 1) { + params[[1]]$kappa <- 1 - sum(kappa[2:length(kappa)]) } return(params) } @@ -53,11 +63,15 @@ m_bipoiss_clust_a_chr <- function(x, s, h) { sum((x[, 1] - s) * h) / denum + sum((x[, 2] - s) * h) / denum ) / 2 - return(list( + res <- list( l1 = lambda_global, l2 = lambda_global, l3 = sum(s * h) / denum - )) + ) + for (i in which(is.na(res))) { + res[[i]] <- 1 + } + return(res) } #' maximisation step for bi-Poisson EM @@ -73,11 +87,15 @@ m_bipoiss_clust_a_chr <- function(x, s, h) { #' em_bipoiss(m_bipoiss = m_bipoiss_x_chr) m_bipoiss_clust_x_chr <- function(x, s, h) { denum <- sum(h) - return(list( + res <- list( l1 = 1, l2 = sum((x[, 2] - s) * h) / denum, l3 = sum(s * h) / denum - )) + ) + for (i in which(is.na(res))) { + res[[i]] <- 1 + } + return(res) } @@ -94,9 +112,13 @@ m_bipoiss_clust_x_chr <- function(x, s, h) { #' em_bipoiss(m_bipoiss = m_bipoiss_y_chr) m_bipoiss_clust_y_chr <- function(x, s, h) { denum <- sum(h) - return(list( + res <- list( l1 = sum((x[, 1] - s) * h) / denum, l2 = 1, l3 = sum(s * h) / denum - )) + ) + for (i in which(is.na(res))) { + res[[i]] <- 1 + } + return(res) } diff --git a/R/param_diff_poiss.R b/R/param_diff_poiss.R index 4dfdb6f74943a5e4b4276867083adb4952a95a28..c8f748ff6467d1a76fda173394b471bbf4debb61 100644 --- a/R/param_diff_poiss.R +++ b/R/param_diff_poiss.R @@ -35,6 +35,10 @@ param_diff_poiss <- function( } else { j <- 2 } + if (is.na(params[[i]][[param]])) { + warning(paste("param_diff_poiss: parameter value: ", i, ":", params, "\n")) + return(T) + } if (abs(params[[i]][[param]] - old_params[[i]][[param]]) > threshold[j]) return(T) } diff --git a/R/poiss_boostrap_em_sub.R b/R/poiss_boostrap_em_sub.R new file mode 100644 index 0000000000000000000000000000000000000000..d59aed85fbd47de1dcb47ac6d89f42c423f50fb2 --- /dev/null +++ b/R/poiss_boostrap_em_sub.R @@ -0,0 +1,109 @@ +# WARNING - Generated by {fusen} from dev/flat_full_poisson.Rmd: do not edit by hand + +#' Function to compile statistics bootstrap samples of the data +#' +#' @param x a matrix of male female kmers counts +#' @param params a list of init parameters +#' @param gamma a vector of kappa prior +#' @param threshold theshold to stop the EM algorithm +#' @param nboot number of boostrap sample +#' @param bootsize size of the boostrap sample (if < 0 we take a percentage of x) +#' @param core number of cpus to use for the computation +#' @param max_iter (default: 100) maximum number of iteration +#' @param max_error (default: 5) maximum number of try +#' @return tibble of the statistics +#' @importFrom dplyr sample_n sample_frac +#' @importFrom parallel mclapply +#' @examples +#' rbind( +#' extraDistr::rbvpois(1000, 10, 10, 20), # A +#' extraDistr::rbvpois(500, 0, 20, 20), # X +#' extraDistr::rbvpois(100, 20, 0, 5) # Y +#' ) %>% pois_boostrap_EM(nboot = 10) +#' @noRd +poiss_boostrap_EM_sub <- function(iter, x, bootsize, params, gamma, max_iter, max_error) { + res <- list() + res$loglik <- c(NA) + iter <- 0 + while (is.na(res$loglik) | iter < max_error) { + res <- x %>% + as_tibble() %>% + sample_frac(bootsize, replace = T) %>% + as.matrix() %>% + em_bipoiss_clust(params = params, gamma = gamma, max_iter = max_iter) + iter <- iter + 1 + } + return(tibble(loglik = res$loglik, BIC = res$BIC)) +} + +#' Function to compile statistics bootstrap samples of the data +#' +#' @param x a matrix of male female kmers counts +#' @param params a list of init parameters +#' @param gamma a vector of kappa prior +#' @param threshold theshold to stop the EM algorithm +#' @param nboot number of boostrap sample +#' @param bootsize size of the boostrap sample (if < 0 we take a percentage of x) +#' @param core number of cpus to use for the computation +#' @param max_iter (default: 100) maximum number of iteration +#' @param max_error (default: 5) maximum number of try +#' @return tibble of the statistics +#' @importFrom dplyr sample_n sample_frac +#' @importFrom parallel mclapply +#' @examples +#' rbind( +#' extraDistr::rbvpois(1000, 10, 10, 20), # A +#' extraDistr::rbvpois(500, 0, 20, 20), # X +#' extraDistr::rbvpois(100, 20, 0, 5) # Y +#' ) %>% pois_boostrap_EM(nboot = 10) +#' @noRd +poiss_boostrap_EM <- function(x, sex = "XY", threshold = 1, nboot = 100, bootsize = 1, core = 1, max_iter = 100, max_error = 5) { + params = list( + A = list(kappa = 0.4, l1 = mean(x), l2 = mean(x), l3 = mean(x)) + ) + gamma <- c(round(nrow(x) * bootsize)) + if (sex %in% c("XO", "XY")) { + params$X <- list(kappa = 0.3, l1 = 1, l2 = mean(x), l3 = mean(x)) + gamma <- c(round(nrow(x) * bootsize) * .8, round(nrow(x) * bootsize) * .2) + } + if (sex == "XY") { + params$Y <- list(kappa = 0.3, l1 = mean(x), l2 = 1, l3 = mean(x)) + gamma <- c(round(nrow(x) * bootsize) * .8, round(nrow(x) * bootsize) * .15, round(nrow(x) * bootsize) * .05) + } + if (core == 1) { + res <- lapply(as.list(1:nboot), FUN = poiss_boostrap_EM_sub, x = x, bootsize = bootsize, params = params, gamma = gamma, max_iter = max_iter, max_error = max_error) + } else { + res <- parallel::mclapply(as.list(1:nboot), FUN = poiss_boostrap_EM_sub, x = x, bootsize = bootsize, params = params, gamma = gamma, max_iter = max_iter, max_error = max_error, mc.cores = core) + } + return(res) +} + +#' Function to compile statistics differences between model XY and XO on boostrap +#' samples of the data +#' +#' @param x a matrix of male female kmers counts +#' @param threshold theshold to stop the EM algorithm +#' @param nboot number of boostrap sample +#' @param bootsize size of the boostrap sample (if < 0 we take a percentage of x) +#' @param core number of cpus to use for the computation +#' @param max_iter (default: 100) maximum number of iteration +#' @param max_error (default: 5) maximum number of try +#' @return tibble with the statistics +#' @importFrom tidyr pivot_longer +#' @export +#' @examples +#' res <- rbind( +#' extraDistr::rbvpois(1000, 10, 10, 20), # A +#' extraDistr::rbvpois(1000, 0, 20, 20), # X +#' extraDistr::rbvpois(1000, 20, 0, 5) # Y +#' ) %>% +#' poiss_compare_models(nboot = 3, core = 1) +poiss_compare_models <- function(x, threshold = 1, nboot = 100, bootsize = 1, core = 1, max_iter = 100, max_error = 5) { + tibble( + model_XY = poiss_boostrap_EM(x, sex = "XY", threshold = threshold, nboot = nboot, bootsize = bootsize, core = core, max_iter = max_iter, max_error = max_error), + model_XO = poiss_boostrap_EM(x, sex = "XO", threshold = threshold, nboot = nboot, bootsize = bootsize, core = core, max_iter = max_iter, max_error = max_error), + model_OO = poiss_boostrap_EM(x, sex = "OO", threshold = threshold, nboot = nboot, bootsize = bootsize, core = core, max_iter = max_iter, max_error = max_error) + ) %>% + pivot_longer(cols = c(model_XY, model_XO, model_OO)) %>% + unnest(c(value)) +} diff --git a/dev/0-dev_history.Rmd b/dev/0-dev_history.Rmd index 5e59e626aa0a3d10f4f4e343f2f1229021162cf1..61822d62c99280fcd6e1eda97d8a39bb3bb186ef 100755 --- a/dev/0-dev_history.Rmd +++ b/dev/0-dev_history.Rmd @@ -21,7 +21,7 @@ fusen::fill_description( `Authors@R` = c( person("Laurent", "Modolo", email = "laurent.modolo@ens-lyon.fr", role = c("aut", "cre"), comment = c(ORCID = "0000-0002-7606-4110")) ), - Version = "0.0.7", + Version = "0.0.8", "Date/Publication" = format(Sys.time(), "%Y/%m/%d") ), overwrite = T diff --git a/dev/config_not_registered.csv b/dev/config_not_registered.csv index aea35fb5a6e73e28e627469350b0f1b657166199..a18a7b8865b2b5c72ff486bd7732b172d331642b 100644 --- a/dev/config_not_registered.csv +++ b/dev/config_not_registered.csv @@ -1,4 +1,7 @@ "type","path","origin" +"R","R/e_bipoiss_clust_batch.R","Possibly deprecated file. Please check its link with detected flat source: dev/flat_full_poisson.Rmd" +"R","R/get_cluster_poiss.R","Possibly deprecated file. Please check its link with detected flat source: dev/flat_full_poisson.Rmd" "R","R/kmerclust-package.R","No existing source path found." +"R","R/poiss_boostrap_em_sub.R","Possibly deprecated file. Please check its link with detected flat source: dev/flat_full_poisson.Rmd" "R","R/sim_kmer.R","Possibly deprecated file. Please check its link with detected flat source: dev/flat_full.Rmd" "R","R/utils-pipe.R","No existing source path found." diff --git a/dev/flat_full_bayesian.Rmd b/dev/flat_full_bayesian.Rmd index 9b5fcbc75c8f346d4d9ad57abf7408c7b6dd9b0c..99671d6ae506b3832cfc6807bde903f174fc80e5 100644 --- a/dev/flat_full_bayesian.Rmd +++ b/dev/flat_full_bayesian.Rmd @@ -1,5 +1,5 @@ --- -title: "flat_full_bayesian.Rmd for working package" +title: "Getting started bayesian version" output: html_document editor_options: chunk_output_type: inline @@ -722,7 +722,7 @@ test_model_XY <- function(data) { ) ) } -model_XO <- test_model_XO(data) +model_XO <- test_model_XY(data) plot_model(model_XO) ``` diff --git a/dev/flat_full_poisson.Rmd b/dev/flat_full_poisson.Rmd index 28370e538e7e64f68fe3c32a8ffa3a22a3b80361..fdfc0ba69e58f9ac5047c2fd8c597227876b2dac 100644 --- a/dev/flat_full_poisson.Rmd +++ b/dev/flat_full_poisson.Rmd @@ -1,5 +1,5 @@ --- -title: "flat_full_poisson.Rmd for working package" +title: "Getting started Poisson version" output: html_document editor_options: chunk_output_type: inline diff --git a/man/e_bipoiss_clust_batch.Rd b/man/e_bipoiss_clust_batch.Rd new file mode 100644 index 0000000000000000000000000000000000000000..01af96769066933d03cba23bf201e8ab05152a7d --- /dev/null +++ b/man/e_bipoiss_clust_batch.Rd @@ -0,0 +1,23 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/e_bipoiss_clust_batch.R +\name{e_bipoiss_clust_batch} +\alias{e_bipoiss_clust_batch} +\title{run the expetation step for each patch iteratively} +\usage{ +e_bipoiss_clust_batch(x, params, hidden = NULL, nbatch = 10) +} +\arguments{ +\item{x}{the data matrix (2 columns)} + +\item{params}{the list of 3 lambda} + +\item{hidden}{the list of hidden variable} + +\item{nbatch}{the number of batch} +} +\value{ +a list of hidden variable +} +\description{ +run the expetation step for each patch iteratively +} diff --git a/man/em_bipoiss_clust.Rd b/man/em_bipoiss_clust.Rd index ad78a3ed7752e2cb5b7fc3914d8e4b3077a58cd8..fd74ddffd12fabb504f0fb6813c996cab4bc200a 100644 --- a/man/em_bipoiss_clust.Rd +++ b/man/em_bipoiss_clust.Rd @@ -9,7 +9,8 @@ em_bipoiss_clust( params = list(A = list(kappa = 0.4, l1 = mean(x), l2 = mean(x), l3 = mean(x)), X = list(kappa = 0.3, l1 = 1, l2 = mean(x), l3 = mean(x)), Y = list(kappa = 0.3, l1 = mean(x), l2 = 1, l3 = mean(x))), - frac = 1, + gamma = c(round(nrow(x) * 0.8), round(nrow(x) * 0.15), round(nrow(x) * 0.05)), + nbatch = 10, threshold = c(0.1, 1), max_iter = 100 ) @@ -19,7 +20,9 @@ em_bipoiss_clust( \item{params}{the list of 3 lambda (initial value)} -\item{frac}{(default: 1.) fraction of the data to use at each step} +\item{gamma}{a vector of kappa prior} + +\item{nbatch}{(default: 10) the number of batch} \item{threshold}{(default: c(0.1, 5)) maximum difference between kappa and lambda} diff --git a/man/get_cluster_poiss.Rd b/man/get_cluster_poiss.Rd new file mode 100644 index 0000000000000000000000000000000000000000..78625d28d8406aa581251455ebc2fc8ed3b8bc9d --- /dev/null +++ b/man/get_cluster_poiss.Rd @@ -0,0 +1,23 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/get_cluster_poiss.R +\name{get_cluster_poiss} +\alias{get_cluster_poiss} +\title{add the cluster assigment and proba to the results} +\usage{ +get_cluster_poiss(x, res, params, hidden) +} +\arguments{ +\item{x}{the data matrix (2 columns)} + +\item{res}{the list of the results} + +\item{params}{the list of 3 lambda} + +\item{hidden}{the list of hidden variable} +} +\value{ +a bool +} +\description{ +add the cluster assigment and proba to the results +} diff --git a/man/loglik_bipoiss_clust.Rd b/man/loglik_bipoiss_clust.Rd index f375720fecbca224fd1e80eb69d310070b427a04..6bec48de8f0c3d1eafad01b6c58e6f062b68eaa0 100644 --- a/man/loglik_bipoiss_clust.Rd +++ b/man/loglik_bipoiss_clust.Rd @@ -4,12 +4,14 @@ \alias{loglik_bipoiss_clust} \title{logliklihood for bi-Poisson EM} \usage{ -loglik_bipoiss_clust(x, params) +loglik_bipoiss_clust(x, params, gamma) } \arguments{ \item{x}{the data matrix (2 columns)} \item{params}{the list of 3 lambda} + +\item{gamma}{a vector of kappa prior} } \value{ logliklihood diff --git a/man/m_bipoiss_clust.Rd b/man/m_bipoiss_clust.Rd index db9ef9b86cdd9c2a56ea3eccf24968a042907035..a7294c03ecd8ce2539d7f37cf8228c78f33225b6 100644 --- a/man/m_bipoiss_clust.Rd +++ b/man/m_bipoiss_clust.Rd @@ -8,6 +8,7 @@ m_bipoiss_clust( x, s, h, + gamma, m_chr = list(m_bipoiss_clust_a_chr, m_bipoiss_clust_x_chr, m_bipoiss_clust_y_chr) ) } @@ -15,6 +16,12 @@ m_bipoiss_clust( \item{x}{the data matrix (2 columns)} \item{s}{of size 3 time nrow(x)} + +\item{h}{of size 3 time nrow(x)} + +\item{gamma}{a vector of kappa prior} + +\item{m_chr}{the maximization function for each type of chromosome} } \value{ a list of 3 lambda diff --git a/man/poiss_compare_models.Rd b/man/poiss_compare_models.Rd index 4213ec43145ab623c83dd030d5da663a8e566041..852b3fd9242c6ae00b6af80543f3105e448706c8 100644 --- a/man/poiss_compare_models.Rd +++ b/man/poiss_compare_models.Rd @@ -1,5 +1,5 @@ % Generated by roxygen2: do not edit by hand -% Please edit documentation in R/poiss_boostrap_em.R +% Please edit documentation in R/poiss_boostrap_em.R, R/poiss_boostrap_em_sub.R \name{poiss_compare_models} \alias{poiss_compare_models} \title{Function to compile statistics differences between model XY and XO on boostrap @@ -10,9 +10,19 @@ poiss_compare_models( threshold = 1, nboot = 100, bootsize = 1, - frac = 1, core = 1, - max_iter = 100 + max_iter = 100, + max_error = 5 +) + +poiss_compare_models( + x, + threshold = 1, + nboot = 100, + bootsize = 1, + core = 1, + max_iter = 100, + max_error = 5 ) } \arguments{ @@ -24,20 +34,33 @@ poiss_compare_models( \item{bootsize}{size of the boostrap sample (if < 0 we take a percentage of x)} -\item{frac}{(default: 1.) fraction of the data to use at each step} - \item{core}{number of cpus to use for the computation} \item{max_iter}{(default: 100) maximum number of iteration} + +\item{max_error}{(default: 5) maximum number of try} + +\item{frac}{(default: 1.) fraction of the data to use at each step} } \value{ +tibble with the statistics + tibble with the statistics } \description{ +Function to compile statistics differences between model XY and XO on boostrap +samples of the data + Function to compile statistics differences between model XY and XO on boostrap samples of the data } \examples{ +res <- rbind( + extraDistr::rbvpois(1000, 10, 10, 20), # A + extraDistr::rbvpois(1000, 0, 20, 20), # X + extraDistr::rbvpois(1000, 20, 0, 5) # Y + ) \%>\% + poiss_compare_models(nboot = 3, core = 1) res <- rbind( extraDistr::rbvpois(1000, 10, 10, 20), # A extraDistr::rbvpois(1000, 0, 20, 20), # X diff --git a/vignettes/getting-started-bayesian-version.Rmd b/vignettes/getting-started-bayesian-version.Rmd index d3327193148284e9812eccd2b78a9e07d8ddb24e..e7489829a295a570d2a0d0d4714fb90109915091 100644 --- a/vignettes/getting-started-bayesian-version.Rmd +++ b/vignettes/getting-started-bayesian-version.Rmd @@ -103,6 +103,29 @@ test_model_XY <- function(data) { } model_XY <- test_model_XY(data) plot_model(model_XY) + +data <- sim_kmer(1e4, 100, "XY", count = T) +test_model_XY <- function(data) { + data %>% + dplyr::select(count_m, count_f) %>% + as.matrix() %>% + # compute_tpm() %>% + EM_constraint( + frac = 1, + kappa_weight = c(.5, .5, .5), + mu_weight = c(.5, .5, .5), + sigma_weight = c(.1, .1, .1), + kappa_prior = c(1/3, 1/3, 1/3), + mu_prior = list(c(1, 1) * mean(.), c(.6, 1) * mean(.), c(.5, 0) * mean(.)), + sigma_prior = list( + matrix(c(1, .95, .95, 1) * mean(.) * 10, ncol = 2), + matrix(c(1.05, 1.5, 1.5, 4.05) * mean(.) * 2, ncol = 2), + matrix(c(2, .05, .05, .05) * mean(.) * 2, ncol = 2) + ) + ) +} +model_XO <- test_model_XY(data) +plot_model(model_XO) ``` ## XO diff --git a/vignettes/getting-started-poisson-version.Rmd b/vignettes/getting-started-poisson-version.Rmd index 983536d440f37b149a1f84ee0455f04adfa90520..bc6461f46bf0edbd6855dcd2d0a77e434fa84125 100644 --- a/vignettes/getting-started-poisson-version.Rmd +++ b/vignettes/getting-started-poisson-version.Rmd @@ -69,13 +69,18 @@ pkgload::load_all() ## Params diff +## Get clusters + +## E batch + ## EM # Clustering on simulated data ## XY -We simulate 3 clusters from bi-Poisson distribution +We simulate 3 clusters from bi-Poisson distribution and we use the `em_bipoiss_clust()` function and plot the results + ```{r simulating_data_XY} n <- 1e3 @@ -85,29 +90,13 @@ data <- rbind( extraDistr::rbvpois(n * .1, 200, 0, 50) # Y ) -data %>% - as_tibble() %>% - dplyr::rename(male = 'V1', female = 'V2') %>% - sample_n(1000) %>% - ggplot() + - geom_point(aes(x = male, y = female)) + - geom_abline(intercept = 0, slope = 1, color = "blue") + - geom_abline(intercept = 0, slope = 2, color = "red") + - geom_abline(intercept = 0, slope = 0, color = "green") + - coord_equal() + - theme_bw() -``` - -We use the `em_bipoiss_clust()` function and plot the results +res <- data %>% em_bipoiss_clust() - -```{r fit_simulation} -res <- data %>% em_bipoiss_clust(frac = .1) rbind( - extraDistr::rbvpois(1000*res[[1]]$kappa, res[[1]]$l1, res[[1]]$l2, res[[1]]$l3), # X - extraDistr::rbvpois(1000*res[[2]]$kappa, res[[2]]$l1, res[[2]]$l2, res[[2]]$l3), # A - extraDistr::rbvpois(1000*res[[3]]$kappa, res[[3]]$l1, res[[3]]$l2, res[[3]]$l3) # Y - ) %>% + extraDistr::rbvpois(1000*res$A$kappa, res$A$l1, res$A$l2, res$A$l3), # X + extraDistr::rbvpois(1000*res$X$kappa, res$X$l1, res$X$l2, res$X$l3), # A + extraDistr::rbvpois(1000*res$Y$kappa, res$Y$l1, res$Y$l2, res$Y$l3) # Y +) %>% as_tibble() %>% dplyr::rename(male = 'V1', female = 'V2') %>% ggplot() + @@ -116,6 +105,7 @@ rbind( geom_abline(intercept = 0, slope = 2, color = "red") + geom_abline(intercept = 0, slope = 0, color = "green") + coord_equal() + + ylim(c(-10,500)) + theme_bw() ``` @@ -150,7 +140,9 @@ res <- data %>% em_bipoiss_clust(params = list( l2 = mean(data), l3 = mean(data) ) - ), frac = .1) + ), + gamma = c(round(nrow(data) * .8), round(nrow(data) * .2)) + ) rbind( extraDistr::rbvpois(1000*res[[1]]$kappa, res[[1]]$l1, res[[1]]$l2, res[[1]]$l3), # X extraDistr::rbvpois(1000*res[[2]]$kappa, res[[2]]$l1, res[[2]]$l2, res[[2]]$l3) # A @@ -182,7 +174,8 @@ res <- data %>% em_bipoiss_clust(params = list( l2 = mean(data), l3 = mean(data) ) - ), frac = .1) + ), + gamma = c(nrow(data))) extraDistr::rbvpois(1000*res[[1]]$kappa, res[[1]]$l1, res[[1]]$l2, res[[1]]$l3) %>% # X as_tibble() %>% dplyr::rename(male = 'V1', female = 'V2') %>% @@ -208,7 +201,7 @@ data <- rbind( extraDistr::rbvpois(n, 0, 200, 200), # X extraDistr::rbvpois(n, 200, 0, 5) # Y ) -res <- data %>% poiss_compare_models(nboot = 5, frac = 1e-1, core = 1, max_iter = 30) +res <- data %>% poiss_compare_models(nboot = 5, core = 1, max_iter = 30) res %>% pivot_longer(cols = -c("name"), names_to = "metric") %>% ggplot(aes(x = name, y = value)) + @@ -227,7 +220,7 @@ data <- rbind( extraDistr::rbvpois(n, 100, 100, 200), # A extraDistr::rbvpois(n, 0, 200, 200) # X ) -res <- data %>% poiss_compare_models(nboot = 5, frac = 1e-2, core = 1, max_iter = 30) +res <- data %>% poiss_compare_models(nboot = 5, core = 1, max_iter = 30) res %>% pivot_longer(cols = -c("name"), names_to = "metric") %>% ggplot(aes(x = name, y = value)) + @@ -243,9 +236,9 @@ We simulate 1 clusters from bi-Poisson distribution ```{r model_compare_OO} data <- rbind( - extraDistr::rbvpois(1000, 100, 100, 200) # A + extraDistr::rbvpois(10000, 100, 100, 200) # A ) -res <- data %>% poiss_compare_models(nboot = 5, frac = 1e-2, core = 1, max_iter = 30) +res <- data %>% poiss_compare_models(nboot = 10, core = 1, max_iter = 30) res %>% pivot_longer(cols = -c("name"), names_to = "metric") %>% ggplot(aes(x = name, y = value)) + diff --git a/vignettes/getting-started.Rmd b/vignettes/getting-started.Rmd index 7ff18bcb13f80345222ef8462cc6f796a6b60716..0b299d3d1403cee8ee57d5a6aae8e27f59507049 100644 --- a/vignettes/getting-started.Rmd +++ b/vignettes/getting-started.Rmd @@ -64,6 +64,17 @@ data %>% coord_fixed() ``` +## OO data + +```{r sim_OO} +data <- sim_kmer(1e4, 1000, "OO") +data %>% + ggplot(aes(x = count_m, y = count_f, color = sex)) + + geom_point() + + coord_fixed() + +``` + # Clustering EN functions @@ -85,6 +96,32 @@ data %>% dplyr::select(count_m, count_f) %>% as.matrix() %>% plot_proba(model_XY$proba, sex = "XY") +data <- sim_kmer(1e6, 1000, "XY", count = T) +model_XY <- data %>% + dplyr::select(count_m, count_f) %>% + mutate( + count_m = log1p(count_m), + count_f = log1p(count_f), + ) %>% + as.matrix() %>% + EM_clust(sex = "XO") +data %>% + dplyr::select(count_m, count_f) %>% + as.matrix() %>% + plot_proba(model_XY$proba, sex = "XO") +data <- sim_kmer(1e6, 1000, "XY", count = T) +model_XY <- data %>% + dplyr::select(count_m, count_f) %>% + mutate( + count_m = log1p(count_m), + count_f = log1p(count_f), + ) %>% + as.matrix() %>% + EM_clust(sex = "OO") +data %>% + dplyr::select(count_m, count_f) %>% + as.matrix() %>% + plot_proba(model_XY$proba, sex = "OO") ``` # clustering XO @@ -104,7 +141,7 @@ data %>% # clustering OO ```{r clustering_OO} -data <- sim_kmer(1e4, 1000, "XO") +data <- sim_kmer(1e4, 1000, "OO") model_XO <- data %>% dplyr::select(count_m, count_f) %>% as.matrix() %>% @@ -152,26 +189,39 @@ pchisq(-2 * (model_XY$loglik - model_XO$loglik), 5) ## For XY ```{r BIC_XY} -data <- sim_kmer(1e7, 1000, "XY") -res <- compare_models(data, nboot = 10, bootsize = 0.01, core = 1) +data <- sim_kmer(1e6, 1000, "XY") +res <- compare_models(data, nboot = 10, bootsize = 1, core = 1) res %>% ggplot(aes(x = name, y = BIC)) + geom_violin() res %>% - ggplot(aes(x = name, y = WSS_f / BSS)) + + ggplot(aes(x = name, y = loglik)) + geom_violin() ``` ## For XO ```{r BIC_XO} -data <- sim_kmer(1e7, 1000, "XO") -res <- compare_models(data, nboot = 10, bootsize = 0.01, core = 1) +data <- sim_kmer(1e6, 1000, "XO") +res <- compare_models(data, nboot = 10, bootsize = 1, core = 1) +res %>% + ggplot(aes(x = name, y = BIC)) + + geom_violin() +res %>% + ggplot(aes(x = name, y = loglik)) + + geom_violin() +``` + +## For OO + +```{r BIC_OO} +data <- sim_kmer(1e6, 1000, "OO") +res <- compare_models(data, nboot = 10, bootsize = 1, core = 1) res %>% ggplot(aes(x = name, y = BIC)) + geom_violin() res %>% - ggplot(aes(x = name, y = WSS_f / BSS)) + + ggplot(aes(x = name, y = loglik)) + geom_violin() ```