From cd0c90c476abf60335fcbcc2c9e86e05ff11bc4b Mon Sep 17 00:00:00 2001 From: Laurent Modolo <laurent.modolo@ens-lyon.fr> Date: Fri, 23 Jun 2023 13:38:44 +0200 Subject: [PATCH] add clustering on the log scale --- dev/flat_full.Rmd | 249 +++++++++++++++++++++++++++------------------- 1 file changed, 146 insertions(+), 103 deletions(-) diff --git a/dev/flat_full.Rmd b/dev/flat_full.Rmd index 14d833b..61d773a 100644 --- a/dev/flat_full.Rmd +++ b/dev/flat_full.Rmd @@ -2,12 +2,13 @@ title: "flat_full.Rmd for working package" output: html_document editor_options: - chunk_output_type: console + chunk_output_type: inline --- ```{r load} library(kmerclust) library(tidyverse) +library(parallel) ``` ```{r development-load} @@ -59,7 +60,7 @@ sim_kmer <- function(n_kmer, mean_coef, sex = "XY", count = F) { bind_rows( tibble( sex = "A", - count = mvtnorm::rmvnorm(n_kmer * 0.85, mean = c(2, 2)*mean_coef, sigma = matrix(c(1, .95, .95, 1) * mean_coef^1.5 * 4, ncol = 2), method = "svd") + count = mvtnorm::rmvnorm(n_kmer * 0.85, mean = c(1, 1)*mean_coef, sigma = matrix(c(1, .95, .95, 1) * mean_coef^1.5 * 4, ncol = 2), method = "svd") %>% as_tibble(.name_repair = "universal") ) %>% unnest(c(count)) ) @@ -123,18 +124,18 @@ expand_theta <- function(theta, cluster_coef, sex) { theta_ref <- list( "a" = list( "pi" = theta$pi[1], - "mu" = cluster_coef$a * theta$mu, + "mu" = cluster_coef$a + theta$mu, "sigma" = theta$sigma$a ), "f" = list( "pi" = theta$pi[2], - "mu" = cluster_coef$f * theta$mu, + "mu" = cluster_coef$f + theta$mu, "sigma" = theta$sigma$f )) if (sex == "XY") { theta_ref[["m"]] <- list( "pi" = theta$pi[3], - "mu" = cluster_coef$m * theta$mu, + "mu" = (cluster_coef$m + theta$mu) * c(1, 0), "sigma" = theta$sigma$m ) } @@ -243,8 +244,14 @@ E_N_clust <- function(proba) { M_mean <- function(x, proba, N_clust, cluster_coef, sex) { mu <- 0 for (cluster in 1:ncol(proba)) { - mu <- mu + - mean(colSums(x * cluster_coef[[cluster]] * proba[, cluster]) / N_clust[cluster]) + if (cluster == 3) { + mu <- mu + + mean(colSums((x - cluster_coef[[cluster]]) * c(1, 0) * proba[, cluster]) / N_clust[cluster]) + } else { + + mu <- mu + + mean(colSums((x - cluster_coef[[cluster]]) * proba[, cluster]) / N_clust[cluster]) + } } if (sex == "XY") { return(mu / 3) @@ -266,7 +273,11 @@ M_mean <- function(x, proba, N_clust, cluster_coef, sex) { M_cov <- function(x, proba, mu, N_clust, cluster_coef, sex) { cov_clust <- list() for (cluster in 1:ncol(proba)) { - cov_clust[[cluster]] <- t(proba[, cluster] * (x - mu * cluster_coef[[cluster]])) %*% (x - mu * cluster_coef[[cluster]]) / N_clust[cluster] + if (cluster == 3) { + cov_clust[[cluster]] <- t(proba[, cluster] * (x - (mu + cluster_coef[[cluster]]) * c(1, 0))) %*% (x - (mu + cluster_coef[[cluster]]) * c(1, 0)) / N_clust[cluster] + } else { + cov_clust[[cluster]] <- t(proba[, cluster] * (x - mu + cluster_coef[[cluster]])) %*% (x - mu + cluster_coef[[cluster]]) / N_clust[cluster] + } } sigma <- list() sigma$a <- cov_clust[[1]] @@ -320,10 +331,6 @@ plot_proba <- function(x, proba, sex = "XY") { theme_bw() } } -sample %>% - dplyr::select(count_m, count_f) %>% - as.matrix() %>% - plot_proba(model_XY$proba, sex = "XY") ``` ```{r fun-init_param} @@ -334,19 +341,19 @@ sample %>% #' @return a list of parameters init_param <- function(x, sex) { cluster_coef <- list( - "a" = c(1, 1), - "f" = c(1, 0.5) + "a" = c(0, 0), + "f" = c(0, log(2)) ) theta <- list( "pi" = c(.85, .1, .05), - "mu" = mean(colMeans(x)) * .5 + "mu" = mean(colMeans(x)) ) theta$sigma <- list( "a" = matrix(c(1, 1, 1, 1) * theta$mu, ncol = 2), "f" = matrix(c(1, 1, 1, 1) * theta$mu, ncol = 2) ) if (sex == "XY") { - cluster_coef$m <- c(1, 0) + cluster_coef$m <- c(0, 0) theta$sigma$m <- matrix(c(1, 1, 1, 1) * theta$mu, ncol = 2) } return(list(cluster_coef = cluster_coef, theta = theta)) @@ -392,13 +399,13 @@ EM_clust <- function(x, threshold = 1, sex = "XY") { new_loglik <- 0 param <- init_param(x, sex) while (abs(new_loglik - old_loglik) > threshold) { - old_loglik <- loglik(x, param$theta, param$cluster_coef, sex) - proba <- E_proba(x, param$theta, param$cluster_coef, sex) - param$theta$pi <- E_N_clust(proba) - param$theta$mu <- M_mean(x, proba, param$theta$pi, params$cluster_coef, sex) - param$theta$sigma <- M_cov(x, proba, param$theta$mu, param$theta$pi, param$cluster_coef, sex) + old_loglik <- loglik(x = x, theta = param$theta, cluster_coef = param$cluster_coef, sex = sex) + proba <- E_proba(x = x, theta = param$theta, cluster_coef = param$cluster_coef, sex = sex) + param$theta$pi <- E_N_clust(proba = proba) + param$theta$mu <- M_mean(x = x, proba = proba, N_clust = param$theta$pi, cluster_coef = param$cluster_coef, sex = sex) + param$theta$sigma <- M_cov(x = x, proba = proba, mu = param$theta$mu, N_clust = param$theta$pi, cluster_coef = param$cluster_coef, sex = sex) param$theta$pi <- param$theta$pi / nrow(x) - new_loglik <- loglik(x, param$theta, param$cluster_coef, sex) + new_loglik <- loglik(x = x, theta = param$theta, cluster_coef = param$cluster_coef, sex = sex) if (is.infinite(new_loglik)) { break } @@ -505,13 +512,21 @@ BSS_WSS <- function(x, cluster) { # clustering XY ```{r clustering_XY} -data <- sim_kmer(1e6, 1000, "XY", count =T) +data <- sim_kmer(1e5, 1000, "XY", count =T) model_XY <- data %>% dplyr::select(count_m, count_f) %>% + mutate( + count_m = log1p(count_m), + count_f = log1p(count_f), + ) %>% as.matrix() %>% EM_clust() data %>% dplyr::select(count_m, count_f) %>% + mutate( + count_m = log1p(count_m), + count_f = log1p(count_f), + ) %>% as.matrix() %>% plot_proba(model_XY$proba) ``` @@ -519,13 +534,23 @@ data %>% # clustering XO ```{r clustering_XO} -data <- sim_kmer(1e4, 1000, "XO") +data <- sim_kmer(1e5, 1000, "XO") model_XO <- data %>% dplyr::select(count_m, count_f) %>% + mutate( + count_m = log1p(count_m), + count_f = log1p(count_f), + ) %>% + drop_na() %>% as.matrix() %>% EM_clust(sex = "X0") data %>% dplyr::select(count_m, count_f) %>% + mutate( + count_m = log1p(count_m), + count_f = log1p(count_f), + ) %>% + drop_na() %>% as.matrix() %>% plot_proba(model_XO$proba, sex = "X0") ``` @@ -656,7 +681,7 @@ annotate_counts <- function(annotation, count, name) { ## M belari data ```{r dev} -load("../../results/12/mbelari/sample.Rdata", v = T) +load("../../../results/12/mbelari/sample.Rdata") ``` ```{r dev} @@ -679,91 +704,42 @@ sample %>% ``` ```{r dev} -sample %>% - select(count_m, count_f) %>% - sample_frac(0.01) %>% - mutate( - count_m = expm1(count_m), - count_f = expm1(count_f), - ) %>% - ggplot(aes(x = count_m, y = count_f)) + - geom_point() + - theme_bw() +res <- compare_models(sample, nboot = 10, bootsize = 1, core = 8) +res %>% + ggplot(aes(x = name, y = BIC)) + + geom_violin() +res %>% + ggplot(aes(x = name, y = WSS_f / BSS)) + + geom_violin() ``` +## M Longespiculosa data + ```{r dev} -sample %>% - select(count_m, count_f) %>% - sample_frac(0.01) %>% - ggplot(aes(x = count_m, y = count_f)) + - geom_point() + - theme_bw() +load("../../../results/12/mlongespiculosa/sample.Rdata") ``` ```{r dev} -library(flexmix) -count <- sample %>% +model_XY <- sample %>% dplyr::select(count_m, count_f) %>% - mutate( - count_m = round(expm1(count_m)), - count_f = round(expm1(count_f)), - ) %>% - filter(count_m > 0) %>% - mutate( - ratio = count_f / count_m - ) %>% - sample_n(10000) - -count %>% -ggplot(aes(x = ratio)) + - geom_histogram() + - scale_x_log10() - -m_xy <- flexmix( - ratio ~ ratio, - k = 3, - data = count, - ) -m_xo <- flexmix( - count_m ~ count_f, - k = 2, - data = count, - model = FLXglm(family = "poisson") - ) -summary(m_xy) -parameters(m_xy, component = 1) -parameters(m_xy, component = 2) -parameters(m_xy, component = 3) -plot(m_xy) -rm_xy <- refit(m_xy) -summary(rm_xy) -count %>% - mutate( - proba1 = m_xy@posterior$scaled[, 1], - proba2 = m_xy@posterior$scaled[, 2], - proba3 = m_xy@posterior$scaled[, 3] - ) %>% - pivot_longer(cols = c(proba1, proba2, proba3)) %>% - ggplot(aes(x = count_m, y = count_f, color = value)) + - geom_point() + - facet_wrap(~name) + - coord_equal() + - theme_bw() -``` - - - -```{r dev, eval=F} -data <- readr::read_tsv("../../results/12/mlongespiculosa/mlongespiculosa.csv", show_col_types = FALSE) -format(object.size(data), units = "Mb") -annotation <- parse_annotation("../../data/sample.csv") -count <- annotate_counts(annotation, data, "mlongespiculosa") -save(count, file = "results/12/mlongespiculosa/counts.Rdata") -load("results/12/mlongespiculosa/counts.Rdata") -``` - -```{r dev, eval=F} -res <- compare_models(count %>% dplyr::ungroup(), nboot = 100, bootsize = 0.1, core = 24) + as.matrix() %>% + EM_clust(sex = "XY") +model_XO <- sample %>% + dplyr::select(count_m, count_f) %>% + as.matrix() %>% + EM_clust(sex = "X0") +sample %>% + dplyr::select(count_m, count_f) %>% + as.matrix() %>% + plot_proba(model_XY$proba, sex = "XY") +sample %>% + dplyr::select(count_m, count_f) %>% + as.matrix() %>% + plot_proba(model_XO$proba, sex = "X0") +``` + +```{r dev} +res <- compare_models(sample, nboot = 10, bootsize = 1, core = 8) res %>% ggplot(aes(x = name, y = BIC)) + geom_violin() @@ -772,4 +748,71 @@ res %>% geom_violin() ``` +## M Spiculigera data + +```{r dev} +load("../../../results/12/mspiculigera/sample.Rdata") +``` +```{r dev} +model_XY <- sample %>% + dplyr::select(count_m, count_f) %>% + as.matrix() %>% + EM_clust(sex = "XY") +model_XO <- sample %>% + dplyr::select(count_m, count_f) %>% + as.matrix() %>% + EM_clust(sex = "X0") +sample %>% + dplyr::select(count_m, count_f) %>% + as.matrix() %>% + plot_proba(model_XY$proba, sex = "XY") +sample %>% + dplyr::select(count_m, count_f) %>% + as.matrix() %>% + plot_proba(model_XO$proba, sex = "X0") +``` + +```{r dev} +res <- compare_models(sample, nboot = 10, bootsize = 1, core = 8) +res %>% + ggplot(aes(x = name, y = BIC)) + + geom_violin() +res %>% + ggplot(aes(x = name, y = WSS_f / BSS)) + + geom_violin() +res +``` + +# tmvnorm test + +```{r dev} +library(tmvtnorm) + +mean <- c(1, 1) +sigma <- matrix(c(10, 0, + 0, 1), 2, 2) +# Linear Constraints +# +# a1 <= x1 + x2 <= b2 +# a2 <= x1 - x2 <= b2 +# +# [ a1 ] <= [ 1 1 ] [ x1 ] <= [b1] +# [ a2 ] [ 1 -1 ] [ x2 ] [b2] +a <- c(0, 0) +b <- c( 30, 30) +D <- matrix(c(1, 1, + -1, 1), 2, 2) + +X <- rtmvnorm(n=10000, mean, sigma, lower=a, upper=b, D=D, algorithm="gibbsR") +plot(X, main="Gibbs sampling for multivariate normal + with linear constraints according to Geweke (1991)") + + abline(a=0, b=1, col="blue", lwd = 2) + abline(a=0, b=0, col="blue", lwd = 2) +# mark linear constraints as lines +for (i in 1:nrow(D)) { + abline(a=a[i]/D[i, 2], b=-D[i,1]/D[i, 2], col="red") + abline(a=b[i]/D[i, 2], b=-D[i,1]/D[i, 2], col="red") +} +``` -- GitLab