From e0a3211559ae83362a8873aed7b7cc2073803d99 Mon Sep 17 00:00:00 2001 From: Laurent Modolo <laurent.modolo@ens-lyon.fr> Date: Fri, 26 May 2023 15:56:12 +0200 Subject: [PATCH] clustering.Rmd: working XY clustering --- src/clustering.Rmd | 104 +++++++++++++++++++++++++-------------------- 1 file changed, 57 insertions(+), 47 deletions(-) diff --git a/src/clustering.Rmd b/src/clustering.Rmd index 60c8032..4a77beb 100644 --- a/src/clustering.Rmd +++ b/src/clustering.Rmd @@ -49,7 +49,7 @@ test_slope(1.05, 2.05, 0.95) Simulate k-mer counts data ```{r} -n_kmer = 1e4 +n_kmer = 1e2 mean_coef = 1000 data <- tibble( sex = "F", @@ -89,31 +89,22 @@ plot(data_clust, what = "uncertainty") ``` ```{r} -theta2 <- list( - "pi" = c(.1, .05, .85), - "mu" = list(c(1000, 2000), c(1000, 0), c(1000, 1000)), - "sigma" = list( - "f" = matrix(c(1.05, 2, 2, 4.05) * mean_coef, ncol = 2), - "m" = matrix(c(.9, .05, .05, .05) * mean_coef, ncol = 2), - "a" = matrix(c(1, .95, .95, 1) * mean_coef, ncol = 2) - ) -) -expand_theta <- function(theta) { +expand_theta <- function(theta, cluster_coef) { list( "f" = list( "pi" = theta$pi[1], - "mu" = theta$mu[[1]], + "mu" = cluster_coef$f * theta$mu, "sigma" = theta$sigma$f ), "m" = list( "pi" = theta$pi[2], - "mu" = theta$mu[[2]], + "mu" = cluster_coef$m * theta$mu, "sigma" = theta$sigma$m ), "a" = list( "pi" = theta$pi[3], - "mu" = theta$mu[[3]], + "mu" = cluster_coef$a * theta$mu, "sigma" = theta$sigma$a ) ) @@ -122,12 +113,7 @@ expand_theta <- function(theta) { params_diff <- function(old_theta, theta, threshold) { result <- max( max(abs(old_theta$pi - theta$pi)), - max(abs(old_theta$mu[[1]] - theta$mu[[1]])), - max(abs(old_theta$mu[[2]] - theta$mu[[2]])), - max(abs(old_theta$mu[[3]] - theta$mu[[3]])), - max(abs(old_theta$sigma$f - theta$sigma$f)), - max(abs(old_theta$sigma$m - theta$sigma$m)), - max(abs(old_theta$sigma$a - theta$sigma$a)) + max(abs(old_theta$mu - theta$mu)) ) if (is.finite(result)) { return(results > threshold) @@ -135,37 +121,35 @@ params_diff <- function(old_theta, theta, threshold) { return(T) } -proba_total <- function(x, theta) { +proba_total <- function(x, theta, cluster_coef) { proba <- 0 - for (params in expand_theta(theta)) { - print(params) - proba <- proba + params$pi + - mvtnorm::dmvnorm(x, mean = params$mu, sigma = params$sigma, log = T) + for (params in expand_theta(theta, cluster_coef)) { + proba <- proba + params$pi * + mvtnorm::dmvnorm(x, mean = params$mu, sigma = params$sigma) } - print(proba) return(proba) } -proba_point <- function(x, theta) { +proba_point <- function(x, theta, cluster_coef) { proba <- c() - for (params in expand_theta(theta)) { - proba <- cbind(proba, params$pi + - mvtnorm::dmvnorm(x, mean = params$mu, sigma = params$sigma, log = T) + for (params in expand_theta(theta, cluster_coef)) { + proba <- cbind(proba, params$pi * + mvtnorm::dmvnorm(x, mean = params$mu, sigma = params$sigma) ) } return(proba) } -loglik <- function(x, theta) { - -sum(proba_total(x, theta)) +loglik <- function(x, theta, cluster_coef) { + -log(sum(proba_total(x, theta, cluster_coef))) } # EM function -E_proba <- function(x, theta) { - proba <- proba_point(x, theta) - proba_norm <- rowSums(exp(proba)) +E_proba <- function(x, theta, cluster_coef) { + proba <- proba_point(x, theta, cluster_coef) + proba_norm <- rowSums(proba) for (cluster in 1:ncol(proba)) { - proba[, cluster] <- exp(proba[, cluster]) / proba_norm + proba[, cluster] <- proba[, cluster] / proba_norm proba[proba_norm == 0, cluster] <- 1 / ncol(proba) } return(proba) @@ -177,17 +161,29 @@ E_N_clust <- function(proba) { # Function for mean update M_mean <- function(x, proba, N_clust) { - mu <- list() + mu <- 0 for (cluster in 1:ncol(proba)) { - mu[[cluster]] <- colSums(x * proba[, cluster]) / N_clust[cluster] + if (cluster == 1) { + mu <- mu + 1/3 * + mean(colSums(x * c(1, 0.5) * proba[, cluster]) / N_clust[cluster]) + } + if (cluster == 2) { + mu <- mu + 1/3 * + (colSums(x * c(1, 0) * proba[, cluster]) / N_clust[cluster])[1] + } + if (cluster == 2) { + mu <- mu + 1/3 * + mean(colSums(x * c(0.5, 0.5) * proba[, cluster]) / N_clust[cluster]) + } } return(mu) } -M_cov <- function(x, proba, mu, N_clust) { +M_cov <- function(x, proba, mu, N_clust, cluster_coef) { cov_clust <- list() for (cluster in 1:ncol(proba)) { - cov_clust[[cluster]] <- t(proba[, cluster] * (x - mu[[cluster]])) %*% (x - mu[[cluster]]) / N_clust[cluster] + print(cluster_coef[[cluster]]) + cov_clust[[cluster]] <- t(proba[, cluster] * (x - mu * cluster_coef[[cluster]])) %*% (x - mu * cluster_coef[[cluster]]) / N_clust[cluster] } sigma <- list() sigma$f <- cov_clust[[1]] @@ -209,19 +205,33 @@ plot_proba <- function(x, proba) { scale_color_identity() } -EM_clust <- function(x, theta, threshold = 0.1) { +EM_clust <- function(x, threshold = 0.1) { old_loglik <- -Inf new_loglik <- 0 + cluster_coef <- list( + "f" = c(1, 2), + "m" = c(1, 0), + "a" = c(2, 2) + ) + theta <- list( + "pi" = c(.1, .05, .85), + "mu" = mean(colMeans(x)) * .5 + ) + theta$sigma <- list( + "f" = matrix(c(1, 1, 1, 1) * theta$mu, ncol = 2), + "m" = matrix(c(1, 1, 1, 1) * theta$mu, ncol = 2), + "a" = matrix(c(1, 1, 1, 1) * theta$mu, ncol = 2) + ) while (abs(new_loglik - old_loglik) > threshold) { - old_loglik <- loglik(x, theta) - proba <- E_proba(x, theta) + old_loglik <- loglik(x, theta, cluster_coef) + proba <- E_proba(x, theta, cluster_coef) theta$pi <- E_N_clust(proba) theta$mu <- M_mean(x, proba, theta$pi) - theta$sigma <- M_cov(x, proba, theta$mu, theta$pi) + theta$sigma <- M_cov(x, proba, theta$mu, theta$pi, cluster_coef) theta$pi <- theta$pi / nrow(x) - new_loglik <- loglik(x, theta) - # Sys.sleep(.5) + new_loglik <- loglik(x, theta, cluster_coef) } + print(theta) return(proba) } @@ -229,7 +239,7 @@ EM_clust <- function(x, theta, threshold = 0.1) { proba <- data %>% dplyr::select(count_m, count_f) %>% as.matrix() %>% - EM_clust(theta2) + EM_clust() data %>% dplyr::select(count_m, count_f) %>% as.matrix() %>% -- GitLab