From 2e90d56b5263783449839e3c829cf899e1b74264 Mon Sep 17 00:00:00 2001
From: Laurent Modolo <laurent.modolo@ens-lyon.fr>
Date: Fri, 12 May 2023 11:57:07 +0200
Subject: [PATCH] clustering.Rmd: working clustering without constraints

---
 src/clustering.Rmd | 385 +++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 385 insertions(+)
 create mode 100644 src/clustering.Rmd

diff --git a/src/clustering.Rmd b/src/clustering.Rmd
new file mode 100644
index 0000000..555a4d9
--- /dev/null
+++ b/src/clustering.Rmd
@@ -0,0 +1,385 @@
+---
+title: "kmer clustering"
+author: "Laurent Modolo"
+date: "`r Sys.Date()`"
+output: html_document
+---
+
+```{r setup, include=FALSE}
+knitr::opts_chunk$set(echo = TRUE)
+```
+
+```{r}
+library(mclust)
+library(tidyverse)
+library(mvtnorm)
+```
+
+## Simulation
+
+With $X_f$ the female k-mer count and $X_m$ the male k-mer count, we expect the following relations for an XY species:
+
+- For the autosomal chromosomes: $X_m = X_f$
+- For the X chromosome: $X_f = 2 X_m$
+- For the Y chromosome: $X_f = 0^+ \times X_m$
+
+Which become, on the $\log$ scale:
+
+- For the autosomal chromosomes: $\log(X_m) = \log(X_f)$
+- For the X chromosome: $\log(X_f) = \log(2) + \log(X_m)$
+- For the Y chromosome: $\log(X_f) = \log(0^+) + \log(X_m)$
+
+Test the slope obtained for a given sigma matrix:
+
+```{r}
+test_slope <- function(x, y, rho) {
+  test <- mvtnorm::rmvnorm(
+      1e4,
+      mean = c(0, 0),
+      sigma = matrix(c(x^2, rho * x * y, rho * x * y, y^2), ncol = 2),
+      checkSymmetry = F,
+      method = "svd"
+    ) %>%
+    as_tibble()
+  ggplot(data = test, aes(x = V1, y = V2)) +
+    geom_point() +
+    geom_smooth(method = lm) +
+    labs(title = lm(test$V2 ~ test$V1)$coef[2]) +
+    coord_fixed()
+}
+test_slope(1.05, 2.05, 0.95)
+```
+
+Simulate k-mer count data for the three expected clusters (X-specific, Y-specific and autosomal k-mers):
+
+```{r}
+n_kmer <- 1e4
+mean_coef <- 1000
+data <- tibble(
+    sex = "F",
+    count = mvtnorm::rmvnorm(n_kmer * .1, mean = c(1, 2) * mean_coef, sigma = matrix(c(1.05, 2, 2, 4.05) * mean_coef^1.5, ncol = 2), checkSymmetry = F, method = "svd") %>%
+      as_tibble()
+  ) %>%
+  unnest(count) %>%
+  bind_rows(
+    tibble(
+      sex = "M",
+      count = mvtnorm::rmvnorm(n_kmer * .05, mean = c(1, 0) * mean_coef, sigma = matrix(c(.9, .05, .05, .05) * mean_coef^1.5, ncol = 2), method = "svd") %>%
+        as_tibble()
+    ) %>%
+      unnest(count)
+  ) %>%
+  bind_rows(
+    tibble(
+      sex = "A",
+      count = mvtnorm::rmvnorm(n_kmer * 0.85, mean = c(2, 2) * mean_coef, sigma = matrix(c(1, .95, .95, 1) * mean_coef^1.5 * 4, ncol = 2), method = "svd") %>%
+        as_tibble()
+    ) %>%
+      unnest(count)
+  ) %>%
+  rename(count_m = V1,
+         count_f = V2)
+data %>%
+  ggplot(aes(x = count_m, y = count_f, color = sex)) +
+  geom_point() +
+  coord_fixed()
+```
+
+Cluster the simulated counts with `mclust` (3 components):
+
+```{r}
+data_clust <- data %>%
+  select(-c("sex")) %>%
+  mclust::Mclust(G = 3)
+```
+
+```{r}
+summary(data_clust)
+plot(data_clust, what = "classification")
+plot(data_clust, what = "uncertainty")
+```
+
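+As a quick sanity check (not part of the original analysis), we can compare the `Mclust` partition with the simulated `sex` labels, for instance with a confusion table and the adjusted Rand index:
+
+```{r}
+# cluster numbers are arbitrary, so we only look at how well the partition
+# agrees with the simulated labels
+table(truth = data$sex, cluster = data_clust$classification)
+mclust::adjustedRandIndex(data$sex, data_clust$classification)
+```
+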
+We now fit the same three-component Gaussian mixture with a hand-written EM algorithm, starting from a rough initial guess `theta`:
+
+```{r}
+theta <- list(
+  "pi" = c(.1, .05, .85),
+  "mu" = list(c(1000, 2000), c(1000, 0), c(1000, 1000)),
+  "sigma" = list(
+    "f" = matrix(c(1.05, 2, 2, 4.05) * mean_coef, ncol = 2),
+    "m" = matrix(c(.9, .05, .05, .05) * mean_coef, ncol = 2),
+    "a" = matrix(c(1, .95, .95, 1) * mean_coef, ncol = 2)
+  )
+)
+
+expand_theta <- function(theta) {
+  list(
+    "f" = list(
+      "pi" = theta$pi[1],
+      "mu" = theta$mu[[1]],
+      "sigma" = theta$sigma$f
+    ),
+    "m" = list(
+      "pi" = theta$pi[2],
+      "mu" = theta$mu[[2]],
+      "sigma" = theta$sigma$m
+    ),
+    "a" = list(
+      "pi" = theta$pi[3],
+      "mu" = theta$mu[[3]],
+      "sigma" = theta$sigma$a
+    )
+  )
+}
+
+# TRUE while the largest parameter change is above threshold (or not yet finite)
+params_diff <- function(old_theta, theta, threshold) {
+  result <- max(
+    max(abs(old_theta$pi - theta$pi)),
+    max(abs(old_theta$mu[[1]] - theta$mu[[1]])),
+    max(abs(old_theta$mu[[2]] - theta$mu[[2]])),
+    max(abs(old_theta$mu[[3]] - theta$mu[[3]])),
+    max(abs(old_theta$sigma$f - theta$sigma$f)),
+    max(abs(old_theta$sigma$m - theta$sigma$m)),
+    max(abs(old_theta$sigma$a - theta$sigma$a))
+  )
+  if (is.finite(result)) {
+    return(result > threshold)
+  }
+  return(TRUE)
+}
+
+# mixture density of each point
+proba_total <- function(x, theta) {
+  proba <- 0
+  for (params in expand_theta(theta)) {
+    proba <- proba + params$pi *
+      mvtnorm::dmvnorm(x, mean = params$mu, sigma = params$sigma)
+  }
+  return(proba)
+}
+
+# unnormalized component probabilities (one column per cluster)
+proba_point <- function(x, theta) {
+  proba <- c()
+  for (params in expand_theta(theta)) {
+    proba <- cbind(proba, params$pi *
+      mvtnorm::dmvnorm(x, mean = params$mu, sigma = params$sigma)
+    )
+  }
+  return(proba)
+}
+
+# negative log-likelihood of the data
+loglik <- function(x, theta) {
+  -sum(log(proba_total(x, theta)))
+}
+
+# EM functions
+# E step: normalized responsibilities
+E_proba <- function(x, theta) {
+  proba <- proba_point(x, theta)
+  proba_norm <- rowSums(proba)
+  for (cluster in 1:ncol(proba)) {
+    proba[, cluster] <- proba[, cluster] / proba_norm
+    proba[proba_norm == 0, cluster] <- 1 / ncol(proba)
+  }
+  return(proba)
+}
+
+# expected number of points per cluster
+E_N_clust <- function(proba) {
+  colSums(proba)
+}
+
+# Function for mean update
+M_mean <- function(x, proba, N_clust) {
+  mu <- list()
+  for (cluster in 1:ncol(proba)) {
+    mu[[cluster]] <- colSums(x * proba[, cluster]) / N_clust[cluster]
+  }
+  return(mu)
+}
+
+# Function for covariance update
+M_cov <- function(x, proba, mu, N_clust) {
+  cov_clust <- list()
+  for (cluster in 1:ncol(proba)) {
+    # center each row of x on the cluster mean before the weighted cross-product
+    x_centered <- sweep(x, 2, mu[[cluster]])
+    cov_clust[[cluster]] <- t(proba[, cluster] * x_centered) %*% x_centered / N_clust[cluster]
+  }
+  sigma <- list()
+  sigma$f <- cov_clust[[1]]
+  sigma$m <- cov_clust[[2]]
+  sigma$a <- cov_clust[[3]]
+  return(sigma)
+}
+
+# color each point by its responsibility vector (f = red, m = green, a = blue)
+plot_proba <- function(x, proba) {
+  as_tibble(x) %>%
+    mutate(
+      proba_f = proba[, 1],
+      proba_m = proba[, 2],
+      proba_a = proba[, 3],
+      color = rgb(proba_f * 255, proba_m * 255, proba_a * 255, maxColorValue = 255)
+    ) %>%
+    ggplot(aes(x = count_m, y = count_f, color = color)) +
+    geom_point()
+}
+
+EM_clust <- function(x, theta, threshold = 0.1) {
+  old_theta <- theta
+  old_theta$mu <- list(c(-Inf, -Inf), c(-Inf, -Inf), c(-Inf, -Inf))
+  while (params_diff(old_theta, theta, threshold)) {
+    # keep the current parameters to test convergence after the update
+    old_theta <- theta
+    proba <- E_proba(x, theta)
+    theta$pi <- E_N_clust(proba)
+    theta$mu <- M_mean(x, proba, theta$pi)
+    theta$sigma <- M_cov(x, proba, theta$mu, theta$pi)
+    theta$pi <- theta$pi / nrow(x)
+    print(head(proba_point(x, theta)))
+    print(plot_proba(x, proba))
+    print(loglik(x, theta))
+    Sys.sleep(.5)
+  }
+  return(proba)
+}
+
+
+data %>%
+  dplyr::select(count_m, count_f) %>%
+  as.matrix() %>%
+  EM_clust(theta)
+```
+
+
+```{r}
+
+```
+
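+For reference, the steps implemented in `E_proba()`, `E_N_clust()`, `M_mean()` and `M_cov()` above are the standard EM updates for a Gaussian mixture, written with responsibilities $r_{ik}$:
+
+$$r_{ik} = \frac{\pi_k \, \mathcal{N}(x_i \mid \mu_k, \Sigma_k)}{\sum_{j} \pi_j \, \mathcal{N}(x_i \mid \mu_j, \Sigma_j)}
+\qquad
+N_k = \sum_{i=1}^{n} r_{ik}$$
+
+$$\pi_k = \frac{N_k}{n}
+\qquad
+\mu_k = \frac{1}{N_k} \sum_{i=1}^{n} r_{ik} \, x_i
+\qquad
+\Sigma_k = \frac{1}{N_k} \sum_{i=1}^{n} r_{ik} \, (x_i - \mu_k)(x_i - \mu_k)^\top$$
+
+The same algorithm, on a simpler simulated data set, is illustrated below.
+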
+```{r}
+## Example code for clustering on a three-component mixture model using the EM-algorithm.
+
+### First we load some libraries and define some useful functions
+
+library(mvtnorm)
+library(MASS)
+
+# Create a 'true' data set (an easy one)
+.create.data <- function(n)
+{
+  l <- list()
+  l[[1]] <- list(component=1,
+                 mixing.weight=0.5,
+                 means=c(0,0),
+                 cov=matrix(c(1,0,0,1), ncol=2, byrow=T))
+  l[[2]] <- list(mixing.weight=0.3,
+                 component=2,
+                 means=c(5,5),
+                 cov=matrix(c(1, 0.5, 0.5, 1), ncol=2, byrow=T))
+  l[[3]] <- list(mixing.weight=0.2,
+                 component=3,
+                 means=c(10,10),
+                 cov=matrix(c(1,0.75,0.75,1), ncol=2, byrow=T))
+
+  do.call("rbind", sapply(l, function(e) {
+    dat <- mvtnorm::rmvnorm(e$mixing.weight * n, e$means, e$cov)
+    cbind(component=e$component,
+          x1=dat[,1],
+          x2=dat[,2])
+  }))
+}
+
+# Function for covariance update
+.cov <- function(n, r, dat, m, N.k)
+{
+  # center the points on the component mean before the weighted cross-product
+  centered <- sweep(dat[, 2:3], 2, m)
+  (t(r * centered) %*% centered) / N.k
+}
+
+# Generate starting values for means/covs/mixing weights
+.init <- function()
+{
+  l <- list()
+  l[[1]] <- list(mixing.weight=0.1,
+                 means=c(-2, -2),
+                 cov=matrix(c(1,0,0,1), ncol=2, byrow=T))
+  l[[2]] <- list(mixing.weight=0.1,
+                 means=c(10, 0),
+                 cov=matrix(c(1,0,0,1), ncol=2, byrow=T))
+  l[[3]] <- list(mixing.weight=0.8,
+                 means=c(0, 10),
+                 cov=matrix(c(1,0,0,1), ncol=2, byrow=T))
+
+  l
+}
+
+# Plot the 2D contours of the estimated Gaussian components
+.contour <- function(means, cov, l)
+{
+  X <- mvtnorm::rmvnorm(1000, means, cov)
+  z <- MASS::kde2d(X[,1], X[,2], n=50)
+  contour(z, drawlabels=FALSE, add=TRUE, lty=l, lwd=1.5)
+}
+
+# Do a scatter plot
+.scatter <- function(dat, clusters)
+{
+  plot(dat[,2], dat[,3],
+       xlab="X", ylab="Y", main="Three component Gaussian mixture model",
+       col=c("blue", "red", "orange", "black")[clusters],
+       pch=(1:4)[clusters])
+  col <- c("blue", "red", "orange")
+  pch <- 1:3
+  legend <- paste("Cluster", 1:3)
+  if (length(clusters) == 1 && clusters == 4) {
+    col <- "black"
+    pch <- 4
+    legend <- "No clusters"
+  }
+  legend("topleft", col=col, pch=pch, legend=legend)
+}
+
+n <- 10000
+# create data with n samples
+dat <- .create.data(n)
+# repeat indefinitely (interactive demo): interrupt to stop
+repeat
+{
+  # set initial parameters
+  l <- .init()
+  # plot initial data
+  .scatter(dat, 4)
+  invisible(lapply(1:3, function(e) .contour(l[[e]]$means, l[[e]]$cov, 1)))
+  # Usually we would use a convergence criterion, e.g. compare the difference of likelihoods,
+  # but this will suffice for the hands-on
+  for (i in seq(50))
+  {
+
+    ### E step
+
+    # Compute the sum of all responsibilities (for normalization)
+    r <- sapply(l, function(r)
+    {
+      r$mixing.weight * mvtnorm::dmvnorm(dat[,2:3], r$means, r$cov)
+    })
+    r <- apply(r, 1, sum)
+    # Compute the responsibilities for each sample
+    rs <- sapply(l, function(e)
+    {
+      e$mixing.weight * mvtnorm::dmvnorm(dat[,2:3], e$means, e$cov) / r
+    })
+    # Compute number of points per cluster
+    N.k <- apply(rs, 2, sum)
+
+
+    ### M step
+
+    # Compute the new means
+    m <- lapply(1:3, function(e)
+    {
+      apply(rs[,e] * dat[,2:3], 2, sum) / N.k[e]
+    })
+    # Compute the new covariances
+    c <- lapply(1:3, function(e)
+    {
+      .cov(n, rs[,e], dat, m[[e]], N.k[e])
+    })
+    # Compute the new mixing weights
+    mi <- N.k / n
+    # Update the old parameters
+    l <- lapply(1:3, function(e)
+    {
+      list(mixing.weight = mi[e], means=m[[e]], cov=c[[e]])
+    })
+    # Plot a 2D density (contour) to show the estimated means and covariances
+    if (i %% 5 == 0)
+    {
+      Sys.sleep(1.5)
+      .scatter(dat, apply(rs, 1, which.max))
+      invisible(lapply(1:3, function(e) .contour(l[[e]]$means, l[[e]]$cov, e + 1)))
+    }
+  }
+}
+```
\ No newline at end of file
--
GitLab