From 2e90d56b5263783449839e3c829cf899e1b74264 Mon Sep 17 00:00:00 2001
From: Laurent Modolo <laurent.modolo@ens-lyon.fr>
Date: Fri, 12 May 2023 11:57:07 +0200
Subject: [PATCH] clustering.Rmd: working clustering without constraints

---
 src/clustering.Rmd | 385 +++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 385 insertions(+)
 create mode 100644 src/clustering.Rmd

diff --git a/src/clustering.Rmd b/src/clustering.Rmd
new file mode 100644
index 0000000..555a4d9
--- /dev/null
+++ b/src/clustering.Rmd
@@ -0,0 +1,385 @@
+---
+title: "kmer clustering"
+author: "Laurent Modolo"
+date: "`r Sys.Date()`"
+output: html_document
+---
+
+```{r setup, include=FALSE}
+knitr::opts_chunk$set(echo = TRUE)
+```
+
+
+```{r}
+library(mclust)
+library(tidyverse)
+library(mvtnorm)
+```
+
+## Simulation
+
+If we denote by $X_f$ the female k-mer count and by $X_m$ the male k-mer count, we expect the following relations for an XY species:
+
+- For the autosomal chromosomes: $X_m = X_f$
+- For the X chromosome: $X_f = 2 X_m$
+- For the Y chromosome: $X_f = 0^+ \times X_m$
+
+On the $\log$ scale this becomes (see the short sketch below):
+
+- For the autosomal chromosomes: $\log(X_m) = \log(X_f)$
+- For the X chromosome: $\log(X_f) = \log(2) + \log(X_m)$
+- For the Y chromosome: $\log(X_f) = \log(0^+) + \log(X_m)$
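+
+As a quick illustration, the per-class $\log_2$ ratio of female to male counts separates the three classes (a minimal sketch using made-up counts, not real data):
+
+```{r}
+# hypothetical (count_m, count_f) pairs for an autosomal, an X-linked and a Y-linked k-mer
+tibble(
+    class = c("autosome", "X", "Y"),
+    count_m = c(1000, 1000, 1000),
+    count_f = c(1000, 2000, 1)
+) %>%
+    mutate(log2_ratio = log2(count_f / count_m))
+# expected log2 ratios: ~0 for autosomes, ~1 for the X, strongly negative for the Y
+```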
+
+
+Test the slope obtained for a given sigma matrix:
+
+```{r}
+test_slope <- function(x, y, rho) {
+    # simulate a bivariate normal with standard deviations x and y and correlation rho
+    test <- mvtnorm::rmvnorm(
+            1e4,
+            mean = c(0, 0),
+            sigma = matrix(c(x^2, rho * x * y, rho * x * y, y^2), ncol = 2),
+            method = "svd"
+        ) %>%
+        as_tibble()
+    # plot the points with their regression line; the title shows the fitted slope
+    ggplot(data = test, aes(x = V1, y = V2)) +
+        geom_point() +
+        geom_smooth(method = "lm") +
+        labs(title = lm(test$V2 ~ test$V1)$coef[2]) +
+        coord_fixed()
+}
+test_slope(1.05, 2.05, 0.95)
+```
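+
+For a bivariate normal, the regression slope of the second coordinate on the first is $\mathrm{cov}/\mathrm{var} = \rho \, \sigma_2 / \sigma_1$, so the slope printed in the plot title should be close to this value (a quick check for the parameters used above):
+
+```{r}
+# theoretical slope rho * sigma_2 / sigma_1 for test_slope(1.05, 2.05, 0.95)
+0.95 * 2.05 / 1.05
+```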
+
+Simulate k-mer count data:
+
+```{r}
+n_kmer <- 1e4
+mean_coef <- 1000
+# simulate three clusters of k-mer counts: "F" (X-linked, count_f = 2 * count_m),
+# "M" (Y-linked, count_f close to 0) and "A" (autosomal, count_f = count_m)
+data <- tibble(
+    sex = "F",
+    count = mvtnorm::rmvnorm(n_kmer * .1, mean = c(1, 2)*mean_coef, sigma = matrix(c(1.05, 2, 2, 4.05) * mean_coef^1.5, ncol = 2), checkSymmetry = F, method = "svd") %>%
+        as_tibble()
+) %>%
+    unnest(count) %>% 
+    bind_rows(
+        tibble(
+            sex = "M",
+            count = mvtnorm::rmvnorm(n_kmer * .05, mean = c(1, 0)*mean_coef, sigma = matrix(c(.9, .05, .05, .05) * mean_coef^1.5, ncol = 2), method = "svd")
+            %>% as_tibble()
+        ) %>%
+        unnest(count)
+    ) %>% bind_rows(
+        tibble(
+            sex = "A",
+            count = mvtnorm::rmvnorm(n_kmer * 0.85, mean = c(2, 2)*mean_coef, sigma = matrix(c(1, .95, .95, 1) * mean_coef^1.5 * 4, ncol = 2), method = "svd")
+            %>% as_tibble()
+        ) %>% unnest(count)
+    ) %>% 
+    rename(count_m = V1,
+           count_f = V2)
+data %>% 
+    ggplot(aes(x = count_m, y = count_f, color = sex)) +
+    geom_point() +
+    coord_fixed()
+```
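+
+Since the relations above are written on the $\log$ scale, it can also help to look at the simulated counts on log-log axes (a minimal sketch; simulated counts below zero are clipped and a pseudo-count of 1 is added before taking logs):
+
+```{r}
+data %>%
+    mutate(count_m = pmax(count_m, 0) + 1,
+           count_f = pmax(count_f, 0) + 1) %>%
+    ggplot(aes(x = count_m, y = count_f, color = sex)) +
+    geom_point() +
+    scale_x_log10() +
+    scale_y_log10() +
+    coord_fixed()
+```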
+
+
+
+
+
+```{r}
+# fit a 3-component Gaussian mixture with mclust as a reference
+data_clust <- data %>% dplyr::select(-sex) %>% mclust::Mclust(G = 3)
+```
+```{r}
+summary(data_clust)
+plot(data_clust, what = "classification")
+plot(data_clust, what = "uncertainty")
+```
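+
+Since the simulation keeps the true labels, a quick sanity check (not part of the original analysis) is to compare the `Mclust` classification with the simulated `sex` column:
+
+```{r}
+table(true = data$sex, mclust = data_clust$classification)
+mclust::adjustedRandIndex(data$sex, data_clust$classification)
+```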
+
+```{r}
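+# initial parameter guesses: mixing proportions pi, cluster means (count_m, count_f)
+# and covariance matrices for the three clusters f (X-linked), m (Y-linked) and a (autosomal)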
+theta <- list(
+    "pi" = c(.1, .05, .85),
+    "mu" = list(c(1000, 2000), c(1000, 0), c(1000, 1000)),
+    "sigma" = list(
+        "f" = matrix(c(1.05, 2, 2, 4.05) * mean_coef, ncol = 2),
+        "m" = matrix(c(.9, .05, .05, .05) * mean_coef, ncol = 2),
+        "a" = matrix(c(1, .95, .95, 1) * mean_coef, ncol = 2)
+    )
+)
+
+expand_theta <- function(theta) {
+    list(
+    "f" = list(
+        "pi" = theta$pi[1],
+        "mu" = theta$mu[[1]],
+        "sigma" = theta$sigma$f
+    ),
+    "m" = list(
+        "pi" = theta$pi[2],
+        "mu" = theta$mu[[2]],
+        "sigma" = theta$sigma$m
+    ),
+    "a" = list(
+        "pi" = theta$pi[3],
+        "mu" = theta$mu[[3]],
+        "sigma" = theta$sigma$a
+    )
+)
+}
+
+# stop criterion: TRUE while the largest change in any parameter is above the threshold
+params_diff <- function(old_theta, theta, threshold) {
+    result <- max(
+        max(abs(old_theta$pi - theta$pi)),
+        max(abs(old_theta$mu[[1]] - theta$mu[[1]])),
+        max(abs(old_theta$mu[[2]] - theta$mu[[2]])),
+        max(abs(old_theta$mu[[3]] - theta$mu[[3]])),
+        max(abs(old_theta$sigma$f - theta$sigma$f)),
+        max(abs(old_theta$sigma$m - theta$sigma$m)),
+        max(abs(old_theta$sigma$a - theta$sigma$a))
+    )
+    if (is.finite(result)) {
+        return(result > threshold)
+    }
+    return(TRUE)
+}
+
+proba_total <- function(x, theta) {
+    proba <- 0
+    for (params in expand_theta(theta)) {
+        proba <- proba + params$pi * 
+            mvtnorm::dmvnorm(x, mean = params$mu, sigma = params$sigma)
+    }
+    return(proba)
+}
+
+proba_point <- function(x, theta) {
+    proba <- c()
+    for (params in expand_theta(theta)) {
+        proba <- cbind(proba, params$pi * 
+            mvtnorm::dmvnorm(x, mean = params$mu, sigma = params$sigma)
+        )
+    }
+    return(proba)
+}
+
+# negative log-likelihood of the data under the current mixture parameters
+loglik <- function(x, theta) {
+    -sum(log(proba_total(x, theta)))
+}
+
+# EM function
+E_proba <- function(x, theta) {
+    proba <- proba_point(x, theta)
+    proba_norm <- rowSums(proba)
+    for (cluster in 1:ncol(proba)) {
+        proba[, cluster] <- proba[, cluster] / proba_norm
+        proba[proba_norm == 0, cluster] <- 1 / ncol(proba)
+    }
+    
+    return(proba)
+}
+
+# effective number of points per cluster (sum of responsibilities)
+E_N_clust <- function(proba) {
+    colSums(proba)
+}
+
+# Function for mean update
+M_mean <- function(x, proba, N_clust) {
+    mu <- list()
+    for (cluster in 1:ncol(proba)) {
+        mu[[cluster]] <- colSums(x * proba[, cluster]) / N_clust[cluster]
+    }
+    return(mu)
+}
+
+# Function for covariance update: responsibility-weighted covariance of each cluster
+M_cov <- function(x, proba, mu, N_clust) {
+    cov_clust <- list()
+    for (cluster in 1:ncol(proba)) {
+        centered <- sweep(x, 2, mu[[cluster]])
+        cov_clust[[cluster]] <- t(proba[, cluster] * centered) %*% centered / N_clust[cluster]
+    }
+    sigma <- list()
+    sigma$f <- cov_clust[[1]]
+    sigma$m <- cov_clust[[2]]
+    sigma$a <- cov_clust[[3]]
+    return(sigma)
+}
+
+# color each point by its cluster membership probabilities (f = red, m = green, a = blue)
+plot_proba <- function(x, proba) {
+    as_tibble(x) %>% 
+        mutate(
+            proba_f = proba[, 1],
+            proba_m = proba[, 2],
+            proba_a = proba[, 3],
+            color = rgb(proba_f * 255, proba_m * 255, proba_a * 255, maxColorValue = 255)
+        ) %>% 
+        ggplot(aes(x = count_m, y = count_f, color = color)) +
+        geom_point()
+}
+
+# EM loop: alternate E and M steps until the parameters stop moving
+EM_clust <- function(x, theta, threshold = 0.1) {
+    old_theta <- theta
+    old_theta$mu <- list(c(-Inf, -Inf), c(-Inf, -Inf), c(-Inf, -Inf))
+    while (params_diff(old_theta, theta, threshold)) {
+        old_theta <- theta
+        proba <- E_proba(x, theta)
+        theta$pi <- E_N_clust(proba)
+        theta$mu <- M_mean(x, proba, theta$pi)
+        theta$sigma <- M_cov(x, proba, theta$mu, theta$pi)
+        theta$pi <- theta$pi / nrow(x)
+        print(head(proba))
+        print(plot_proba(x, E_proba(x, theta)))
+        print(loglik(x, theta))
+        Sys.sleep(.5)
+    }
+    return(proba)
+}
+
+
+proba_em <- data %>%
+    dplyr::select(count_m, count_f) %>%
+    as.matrix() %>%
+    EM_clust(theta)
+proba_em %>% head()
+```
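+
+Because the simulated labels are known, the responsibilities returned by `EM_clust()` can be turned into hard assignments and compared with the true classes (a quick check, assuming the clusters keep the initial ordering f, m, a):
+
+```{r}
+# most probable cluster per k-mer, assuming the column order f, m, a is preserved
+em_cluster <- c("f", "m", "a")[apply(proba_em, 1, which.max)]
+table(true = data$sex, EM = em_cluster)
+```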
+
+
+
+```{r}
+## Example code for clustering on a three-component mixture model using the EM-algorithm.
+
+### First we load some libraries and define some useful functions
+
+library(mvtnorm)
+library(MASS)
+
+# Create a 'true' data set (an easy one)
+.create.data <- function(n)
+{
+  l <- list()
+  l[[1]] <- list(component=1,
+    mixing.weight=0.5,
+                 means=c(0,0),
+                 cov=matrix(c(1,0,0,1), ncol=2, byrow=T))
+  l[[2]] <- list(mixing.weight=0.3,
+                 component=2,
+                 means=c(5,5),
+                 cov=matrix(c(1, 0.5, 0.5, 1), ncol=2, byrow=T))
+  l[[3]] <- list(mixing.weight=0.2,
+                 component=3,
+                 means=c(10,10),
+                 cov=matrix(c(1,0.75,0.75,1), ncol=2, byrow=T))
+
+  do.call("rbind",sapply(l, function(e) {
+    dat <- mvtnorm::rmvnorm(e$mixing.weight * n, e$means, e$cov)
+    cbind(component=e$component,
+          x1=dat[,1],
+          x2=dat[,2])
+  }))
+}
+
+# Function for covariance update
+.cov <- function(n, r, dat, m, N.k)
+{
+  # center each column of the data on the current mean, then weight by the responsibilities
+  centered <- sweep(dat[, 2:3], 2, m)
+  (t(r * centered) %*% centered) / N.k
+}
+
+# Generate starting values for means/covs/mixing weights
+.init <- function()
+{
+  l <- list()
+  l[[1]] <- list(mixing.weight=0.1,
+                 means=c(-2, -2),
+                 cov=matrix(c(1,0,0,1), ncol=2, byrow=T))
+  l[[2]] <- list(mixing.weight=0.1,
+                 means=c(10, 0),
+                 cov=matrix(c(1,0,0,1), ncol=2, byrow=T))
+  l[[3]] <- list(mixing.weight=0.8,
+                 means=c(0, 10),
+                 cov=matrix(c(1,0,0,1), ncol=2, byrow=T))
+
+  l
+}
+
+# Plot the 2D contours of the estimated Gaussian components
+.contour <- function(means, cov, l)
+{
+  X <- mvtnorm::rmvnorm(1000, means, cov)
+  z <- MASS::kde2d(X[,1], X[,2], n=50)
+  contour(z, drawlabels=FALSE, add=TRUE, lty=l, lwd=1.5)
+}
+
+# Do a scatter plot
+.scatter <- function(dat, clusters)
+{
+  plot(dat[,2], dat[,3],
+       xlab="X", ylab="Y", main="Three component Gaussian mixture model",
+       col=c("blue", "red", "orange", "black")[clusters],
+       pch=(1:4)[clusters])
+  col    <- c("blue", "red", "orange")
+  pch    <- 1:3
+  legend <- paste("Cluster", 1:3)
+  # before the first E step every point is plotted as the dummy cluster 4 ("no clusters")
+  if (length(clusters) == 1 && clusters == 4) {
+    col <- "black"
+    pch <- 4
+    legend <- "No clusters"
+  }
+  legend("topleft", col=col, pch=pch, legend=legend)
+}
+
+n   <- 10000
+# create data with n samples
+dat <- .create.data(n)
+# repeat restarts the whole EM from the same initial values once the inner loop finishes;
+# there is no break, so interrupt manually to stop the demo
+repeat
+{
+  # set initial parameters
+  l    <- .init()
+  # plot initial data
+  .scatter(dat, 4)
+  invisible(lapply(1:3, function(e) .contour(l[[e]]$means, l[[e]]$cov, 1)))
+  # Usually we would use a convergence criterion, e.g. compare the difference of likelihoods,
+  # but a fixed number of iterations will suffice for this hands-on
+  for (i in seq(50))
+  {
+
+    ### E step
+
+    # Compute the sum of all responsibilities (for normalization)
+    r <- sapply(l, function(r)
+    {
+      r$mixing.weight *  mvtnorm::dmvnorm(dat[,2:3], r$means, r$cov)
+    })
+    r <- apply(r, 1, sum)
+    # Compute the responsibilities for each sample
+    rs <- sapply(l, function(e)
+    {
+      e$mixing.weight * mvtnorm::dmvnorm(dat[,2:3], e$means, e$cov) / r
+    })
+    # Compute number of points per cluster
+    N.k <- apply(rs, 2, sum)
+
+
+    ### M step
+
+    # Compute the new means
+    m <- lapply(1:3, function(e)
+    {
+      apply(rs[,e] * dat[,2:3], 2, sum) / N.k[e]
+    })
+    # Compute the new covariances
+    covs <- lapply(1:3, function(e)
+    {
+      .cov(n, rs[,e], dat, m[[e]], N.k[e])
+    })
+    # Compute the new mixing weights
+    mi <- N.k / n
+    # Update the old parameters
+    l <- lapply(1:3, function(e)
+    {
+      list(mixing.weight = mi[e], means=m[[e]], cov=covs[[e]])
+    })
+    # Plot a 2D density (contour) to show the estimated means and covariances
+    if (i %% 5 == 0)
+    {
+      Sys.sleep(1.5)
+      .scatter(dat, apply(rs, 1, which.max))
+      invisible(lapply(1:3, function(e) .contour(l[[e]]$means, l[[e]]$cov, e + 1)))
+    }
+  }
+}
+```
\ No newline at end of file
-- 
GitLab