From e0a3211559ae83362a8873aed7b7cc2073803d99 Mon Sep 17 00:00:00 2001
From: Laurent Modolo <laurent.modolo@ens-lyon.fr>
Date: Fri, 26 May 2023 15:56:12 +0200
Subject: [PATCH] clustering.Rmd: working XY clustering

---
 src/clustering.Rmd | 104 +++++++++++++++++++++++++--------------------
 1 file changed, 57 insertions(+), 47 deletions(-)

diff --git a/src/clustering.Rmd b/src/clustering.Rmd
index 60c8032..4a77beb 100644
--- a/src/clustering.Rmd
+++ b/src/clustering.Rmd
@@ -49,7 +49,7 @@ test_slope(1.05, 2.05, 0.95)
 Simulate k-mer counts data
 
 ```{r}
-n_kmer = 1e4
+n_kmer = 1e2
 mean_coef = 1000
 data <- tibble(
     sex = "F",
@@ -89,31 +89,22 @@ plot(data_clust, what = "uncertainty")
 ```
 
 ```{r}
-theta2 <- list(
-    "pi" = c(.1, .05, .85),
-    "mu" = list(c(1000, 2000), c(1000, 0), c(1000, 1000)),
-    "sigma" = list(
-        "f" = matrix(c(1.05, 2, 2, 4.05) * mean_coef, ncol = 2),
-        "m" = matrix(c(.9, .05, .05, .05) * mean_coef, ncol = 2),
-        "a" = matrix(c(1, .95, .95, 1) * mean_coef, ncol = 2)
-    )
-)
 
-expand_theta <- function(theta) {
+expand_theta <- function(theta, cluster_coef) {
     list(
     "f" = list(
         "pi" = theta$pi[1],
-        "mu" = theta$mu[[1]],
+        "mu" = cluster_coef$f * theta$mu,
         "sigma" = theta$sigma$f
     ),
     "m" = list(
         "pi" = theta$pi[2],
-        "mu" = theta$mu[[2]],
+        "mu" = cluster_coef$m * theta$mu,
         "sigma" = theta$sigma$m
     ),
     "a" = list(
         "pi" = theta$pi[3],
-        "mu" = theta$mu[[3]],
+        "mu" = cluster_coef$a * theta$mu,
         "sigma" = theta$sigma$a
     )
 )
@@ -122,12 +113,7 @@ expand_theta <- function(theta) {
 params_diff <- function(old_theta, theta, threshold) {
     result <- max(
         max(abs(old_theta$pi - theta$pi)),
-        max(abs(old_theta$mu[[1]] - theta$mu[[1]])),
-        max(abs(old_theta$mu[[2]] - theta$mu[[2]])),
-        max(abs(old_theta$mu[[3]] - theta$mu[[3]])),
-        max(abs(old_theta$sigma$f - theta$sigma$f)),
-        max(abs(old_theta$sigma$m - theta$sigma$m)),
-        max(abs(old_theta$sigma$a - theta$sigma$a))
+        max(abs(old_theta$mu - theta$mu))
     )
     if (is.finite(result)) {
         return(results > threshold)
@@ -135,37 +121,35 @@ params_diff <- function(old_theta, theta, threshold) {
     return(T)
 }
 
-proba_total <- function(x, theta) {
+proba_total <- function(x, theta, cluster_coef) {
     proba <- 0
-    for (params in expand_theta(theta)) {
-        print(params)
-        proba <- proba + params$pi + 
-            mvtnorm::dmvnorm(x, mean = params$mu, sigma = params$sigma, log = T)
+    for (params in expand_theta(theta, cluster_coef)) {
+        proba <- proba + params$pi * 
+            mvtnorm::dmvnorm(x, mean = params$mu, sigma = params$sigma)
     }
-    print(proba)
     return(proba)
 }
 
-proba_point <- function(x, theta) {
+proba_point <- function(x, theta, cluster_coef) {
     proba <- c()
-    for (params in expand_theta(theta)) {
-        proba <- cbind(proba, params$pi + 
-            mvtnorm::dmvnorm(x, mean = params$mu, sigma = params$sigma, log = T)
+    for (params in expand_theta(theta, cluster_coef)) {
+        proba <- cbind(proba, params$pi * 
+            mvtnorm::dmvnorm(x, mean = params$mu, sigma = params$sigma)
         )
     }
     return(proba)
 }
 
-loglik <- function(x, theta) {
-    -sum(proba_total(x, theta))
+loglik <- function(x, theta, cluster_coef) {
+    -log(sum(proba_total(x, theta, cluster_coef)))
 }
 
 # EM function
-E_proba <- function(x, theta) {
-    proba <- proba_point(x, theta)
-    proba_norm <- rowSums(exp(proba))
+E_proba <- function(x, theta, cluster_coef) {
+    proba <- proba_point(x, theta, cluster_coef)
+    proba_norm <- rowSums(proba)
     for (cluster in 1:ncol(proba)) {
-        proba[, cluster] <- exp(proba[, cluster]) / proba_norm
+        proba[, cluster] <- proba[, cluster] / proba_norm
         proba[proba_norm == 0, cluster] <- 1 / ncol(proba)
     }
     return(proba)
@@ -177,17 +161,29 @@ E_N_clust <- function(proba) {
 
 # Function for mean update
 M_mean <- function(x, proba, N_clust) {
-    mu <- list()
+    mu <- 0
     for (cluster in 1:ncol(proba)) {
-        mu[[cluster]] <- colSums(x * proba[, cluster]) / N_clust[cluster]
+        if (cluster == 1) {
+            mu <- mu + 1/3 *
+                mean(colSums(x * c(1, 0.5) * proba[, cluster]) / N_clust[cluster])
+        }
+        if (cluster == 2) {
+            mu <- mu + 1/3 *
+                (colSums(x * c(1, 0) * proba[, cluster]) / N_clust[cluster])[1]
+        }
+        if (cluster == 2) {
+            mu <- mu + 1/3 *
+                mean(colSums(x * c(0.5, 0.5) * proba[, cluster]) / N_clust[cluster])
+        }
     }
     return(mu)
 }
 
-M_cov <- function(x, proba, mu, N_clust) {
+M_cov <- function(x, proba, mu, N_clust, cluster_coef) {
     cov_clust <- list() 
     for (cluster in 1:ncol(proba)) {
-        cov_clust[[cluster]] <- t(proba[, cluster] * (x - mu[[cluster]])) %*% (x - mu[[cluster]]) / N_clust[cluster]
+        print(cluster_coef[[cluster]])
+        cov_clust[[cluster]] <- t(proba[, cluster] * (x - mu * cluster_coef[[cluster]])) %*% (x - mu * cluster_coef[[cluster]]) / N_clust[cluster]
     }
     sigma <- list()
     sigma$f <- cov_clust[[1]]
@@ -209,19 +205,33 @@ plot_proba <- function(x, proba) {
         scale_color_identity()
 }
 
-EM_clust <- function(x, theta, threshold = 0.1) {
+EM_clust <- function(x, threshold = 0.1) {
     old_loglik <- -Inf
     new_loglik <- 0
+    cluster_coef <- list(
+        "f" = c(1, 2),
+        "m" = c(1, 0),
+        "a" = c(2, 2)
+    )
+    theta <- list(
+        "pi" = c(.1, .05, .85),
+        "mu" = mean(colMeans(x)) * .5
+    )
+    theta$sigma <- list(
+            "f" = matrix(c(1, 1, 1, 1) * theta$mu, ncol = 2),
+            "m" = matrix(c(1, 1, 1, 1) * theta$mu, ncol = 2),
+            "a" = matrix(c(1, 1, 1, 1) * theta$mu, ncol = 2)
+    )
     while (abs(new_loglik - old_loglik) > threshold) {
-        old_loglik <- loglik(x, theta)
-        proba <- E_proba(x, theta)
+        old_loglik <- loglik(x, theta, cluster_coef)
+        proba <- E_proba(x, theta, cluster_coef)
         theta$pi <- E_N_clust(proba)
         theta$mu <- M_mean(x, proba, theta$pi)
-        theta$sigma <- M_cov(x, proba, theta$mu, theta$pi)
+        theta$sigma <- M_cov(x, proba, theta$mu, theta$pi, cluster_coef)
         theta$pi <- theta$pi / nrow(x)
-        new_loglik <- loglik(x, theta)
-        # Sys.sleep(.5)
+        new_loglik <- loglik(x, theta, cluster_coef)
     }
+    print(theta)
     return(proba)
 }
 
@@ -229,7 +239,7 @@ EM_clust <- function(x, theta, threshold = 0.1) {
 proba <- data %>%
     dplyr::select(count_m, count_f) %>%
     as.matrix() %>% 
-    EM_clust(theta2)
+    EM_clust()
 data %>%
     dplyr::select(count_m, count_f) %>%
     as.matrix() %>% 
-- 
GitLab