Verified Commit 2e90d56b authored by Laurent Modolo
clustering.Rmd: working clustering without constraints

parent c8c5ecbe
---
title: "k-mer clustering"
author: "Laurent Modolo"
date: "`r Sys.Date()`"
output: html_document
---
```{r setup, include=FALSE}
knitr::opts_chunk$set(echo = TRUE)
```
```{r}
library(mclust)
library(tidyverse)
library(mvtnorm)
```
## Simulation
With $X_f$ the female k-mer count and $X_m$ the male k-mer count, we expect the following relations for an XY species:

- For the autosomal chromosomes: $X_f = X_m$
- For the X chromosomes: $X_f = 2 X_m$
- For the Y chromosomes: $X_f = 0^+ \times X_m$

which become, on the $\log$ scale:

- For the autosomal chromosomes: $\log(X_f) = \log(X_m)$
- For the X chromosomes: $\log(X_f) = \log(2) + \log(X_m)$
- For the Y chromosomes: $\log(X_f) = \log(0^+) + \log(X_m)$
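On a log-log plot the three classes therefore fall on parallel lines of slope 1, with intercepts $0$, $\log(2)$ and $\log(0^+)$ separating the clusters. A minimal sketch of these expected lines (the $0^+$ intercept is stood in by an arbitrary small constant, here $10^{-3}$):
```{r}
# Expected log-scale relations; 1e-3 stands in for the 0^+ Y intercept (assumption)
log_xm <- seq(0, 10, length.out = 100)
tibble(
  log_xm = log_xm,
  A = log_xm,            # autosomes: log(X_f) = log(X_m)
  X = log(2) + log_xm,   # X chromosome: log(X_f) = log(2) + log(X_m)
  Y = log(1e-3) + log_xm # Y chromosome: log(X_f) = log(0^+) + log(X_m)
) %>%
  pivot_longer(-log_xm, names_to = "chromosome", values_to = "log_xf") %>%
  ggplot(aes(x = log_xm, y = log_xf, color = chromosome)) +
  geom_line()
```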
Test the slope for a given covariance matrix $\Sigma$:
```{r}
test_slope <- function(x, y, rho) {
  # simulate a bivariate Gaussian with standard deviations x, y and
  # correlation rho, then check the fitted slope of the V2 ~ V1 relation
  test <- mvtnorm::rmvnorm(
    1e4,
    mean = c(0, 0),
    sigma = matrix(c(x^2, rho * x * y, rho * x * y, y^2), ncol = 2),
    checkSymmetry = FALSE,
    method = "svd"
  ) %>%
    as_tibble()
  ggplot(data = test, aes(x = V1, y = V2)) +
    geom_point() +
    geom_smooth(method = "lm") +
    labs(title = lm(test$V2 ~ test$V1)$coef[2]) +
    coord_fixed()
}
test_slope(1.05, 2.05, 0.95)
```
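For a bivariate Gaussian, the population regression slope of $V_2$ on $V_1$ is $\rho \, \sigma_2 / \sigma_1$, so the slope reported in the plot title should be close to this value. A quick check for the parameters used above:
```{r}
# theoretical slope rho * sigma_2 / sigma_1 for test_slope(1.05, 2.05, 0.95)
0.95 * 2.05 / 1.05
```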
Simulate k-mer count data:
```{r}
n_kmer <- 1e4
mean_coef <- 1000
data <- tibble(
    sex = "F",
    count = mvtnorm::rmvnorm(
      n_kmer * .1,
      mean = c(1, 2) * mean_coef,
      sigma = matrix(c(1.05, 2, 2, 4.05) * mean_coef^1.5, ncol = 2),
      checkSymmetry = FALSE,
      method = "svd"
    ) %>%
      as_tibble()
  ) %>%
  unnest(count) %>%
  bind_rows(
    tibble(
      sex = "M",
      count = mvtnorm::rmvnorm(
        n_kmer * .05,
        mean = c(1, 0) * mean_coef,
        sigma = matrix(c(.9, .05, .05, .05) * mean_coef^1.5, ncol = 2),
        method = "svd"
      ) %>%
        as_tibble()
    ) %>%
      unnest(count)
  ) %>%
  bind_rows(
    tibble(
      sex = "A",
      count = mvtnorm::rmvnorm(
        n_kmer * 0.85,
        mean = c(2, 2) * mean_coef,
        sigma = matrix(c(1, .95, .95, 1) * mean_coef^1.5 * 4, ncol = 2),
        method = "svd"
      ) %>%
        as_tibble()
    ) %>%
      unnest(count)
  ) %>%
  rename(count_m = V1,
         count_f = V2)
data %>%
  ggplot(aes(x = count_m, y = count_f, color = sex)) +
  geom_point() +
  coord_fixed()
```
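Since the expected relations are linear on the $\log$ scale, the simulated counts can also be checked on log-log axes (a sketch; non-positive counts drawn from the Gaussian tails are dropped before taking logs):
```{r}
# log-log view of the simulated counts; rows with non-positive counts are dropped
data %>%
  filter(count_m > 0, count_f > 0) %>%
  ggplot(aes(x = log(count_m), y = log(count_f), color = sex)) +
  geom_point() +
  coord_fixed()
```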
```{r}
data_clust <- data %>%
  select(-sex) %>%
  mclust::Mclust(G = 3)
```
```{r}
summary(data_clust)
plot(data_clust, what = "classification")
plot(data_clust, what = "uncertainty")
```
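The fitted parameters can be compared with the simulation settings (mixing proportions $0.1$, $0.05$, $0.85$ and means around $(1, 2)$, $(1, 0)$ and $(2, 2) \times 1000$):
```{r}
# estimated mixing proportions and component means
data_clust$parameters$pro
data_clust$parameters$mean
```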
```{r}
theta <- list(
  "pi" = c(.1, .05, .85),
  "mu" = list(c(1000, 2000), c(1000, 0), c(1000, 1000)),
  "sigma" = list(
    "f" = matrix(c(1.05, 2, 2, 4.05) * mean_coef, ncol = 2),
    "m" = matrix(c(.9, .05, .05, .05) * mean_coef, ncol = 2),
    "a" = matrix(c(1, .95, .95, 1) * mean_coef, ncol = 2)
  )
)
expand_theta <- function(theta) {
  list(
    "f" = list(
      "pi" = theta$pi[1],
      "mu" = theta$mu[[1]],
      "sigma" = theta$sigma$f
    ),
    "m" = list(
      "pi" = theta$pi[2],
      "mu" = theta$mu[[2]],
      "sigma" = theta$sigma$m
    ),
    "a" = list(
      "pi" = theta$pi[3],
      "mu" = theta$mu[[3]],
      "sigma" = theta$sigma$a
    )
  )
}
params_diff <- function(old_theta, theta, threshold) {
  # largest absolute change in any parameter between two EM iterations
  result <- max(
    max(abs(old_theta$pi - theta$pi)),
    max(abs(old_theta$mu[[1]] - theta$mu[[1]])),
    max(abs(old_theta$mu[[2]] - theta$mu[[2]])),
    max(abs(old_theta$mu[[3]] - theta$mu[[3]])),
    max(abs(old_theta$sigma$f - theta$sigma$f)),
    max(abs(old_theta$sigma$m - theta$sigma$m)),
    max(abs(old_theta$sigma$a - theta$sigma$a))
  )
  if (is.finite(result)) {
    return(result > threshold)
  }
  return(TRUE)
}
proba_total <- function(x, theta) {
  # mixture density of each point: sum over clusters of pi_k * N(x | mu_k, sigma_k)
  proba <- 0
  for (params in expand_theta(theta)) {
    proba <- proba + params$pi *
      mvtnorm::dmvnorm(x, mean = params$mu, sigma = params$sigma)
  }
  return(proba)
}
proba_point <- function(x, theta) {
  # per-cluster weighted densities, one column per cluster
  proba <- c()
  for (params in expand_theta(theta)) {
    proba <- cbind(
      proba,
      params$pi * mvtnorm::dmvnorm(x, mean = params$mu, sigma = params$sigma)
    )
  }
  return(proba)
}
loglik <- function(x, theta) {
  # observed-data log-likelihood: sum over points of the log mixture density
  sum(log(proba_total(x, theta)))
}
# EM functions
E_proba <- function(x, theta) {
  # E step: responsibilities
  # r_ik = pi_k N(x_i | mu_k, sigma_k) / sum_j pi_j N(x_i | mu_j, sigma_j)
  proba <- proba_point(x, theta)
  proba_norm <- rowSums(proba)
  for (cluster in 1:ncol(proba)) {
    proba[, cluster] <- proba[, cluster] / proba_norm
    # points whose total density underflows to 0 get a flat assignment
    proba[proba_norm == 0, cluster] <- 1 / ncol(proba)
  }
  return(proba)
}
E_N_clust <- function(proba) {
  # effective number of points per cluster: N_k = sum_i r_ik
  colSums(proba)
}
# M step: mean update, mu_k = sum_i r_ik x_i / N_k
M_mean <- function(x, proba, N_clust) {
  mu <- list()
  for (cluster in 1:ncol(proba)) {
    mu[[cluster]] <- colSums(x * proba[, cluster]) / N_clust[cluster]
  }
  return(mu)
}
M_cov <- function(x, proba, mu, N_clust) {
  # M step: covariance update, sigma_k = sum_i r_ik (x_i - mu_k)(x_i - mu_k)^T / N_k
  # sweep() centers each column on its cluster mean (a plain `x - mu` would
  # recycle mu along the rows)
  cov_clust <- list()
  for (cluster in 1:ncol(proba)) {
    centered <- sweep(x, 2, mu[[cluster]])
    cov_clust[[cluster]] <- t(proba[, cluster] * centered) %*% centered /
      N_clust[cluster]
  }
  sigma <- list()
  sigma$f <- cov_clust[[1]]
  sigma$m <- cov_clust[[2]]
  sigma$a <- cov_clust[[3]]
  return(sigma)
}
plot_proba <- function(x, proba) {
  # color each point by its responsibilities: red = F, green = M, blue = A
  as_tibble(x) %>%
    mutate(
      proba_f = proba[, 1],
      proba_m = proba[, 2],
      proba_a = proba[, 3],
      color = rgb(proba_f * 255, proba_m * 255, proba_a * 255, maxColorValue = 255)
    ) %>%
    ggplot(aes(x = count_m, y = count_f, color = color)) +
    geom_point() +
    scale_color_identity()
}
EM_clust <- function(x, theta, threshold = 0.1) {
  old_theta <- theta
  old_theta$mu <- list(c(-Inf, -Inf), c(-Inf, -Inf), c(-Inf, -Inf))
  while (params_diff(old_theta, theta, threshold)) {
    # keep the current parameters to test for convergence after the update
    old_theta <- theta
    proba <- E_proba(x, theta)
    theta$pi <- E_N_clust(proba)
    theta$mu <- M_mean(x, proba, theta$pi)
    theta$sigma <- M_cov(x, proba, theta$mu, theta$pi)
    theta$pi <- theta$pi / nrow(x)
    # debug output: weighted densities of the first points
    print(head(proba_point(x, theta)))
    # plot the responsibilities (in [0, 1]) rather than raw weighted densities
    print(plot_proba(x, E_proba(x, theta)))
    print(loglik(x, theta))
    Sys.sleep(.5)
  }
  return(proba)
}
data %>%
dplyr::select(count_m, count_f) %>%
as.matrix() %>%
EM_clust(theta)
```
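To see how well the EM soft assignments recover the simulated labels, the returned responsibilities can be hardened with `which.max` and cross-tabulated against the known `sex` column (a sketch that re-runs `EM_clust()` to capture its output; the cluster order follows `theta`):
```{r}
proba <- data %>%
  dplyr::select(count_m, count_f) %>%
  as.matrix() %>%
  EM_clust(theta)
# hard assignment: cluster with the highest responsibility per k-mer
table(truth = data$sex,
      cluster = c("F", "M", "A")[apply(proba, 1, which.max)])
```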
```{r}
## Example code for clustering on a three-component mixture model using the EM-algorithm.
### First we load some libraries and define some useful functions
library(mvtnorm)
library(MASS)
# Create a 'true' data set (an easy one)
.create.data <- function(n) {
  l <- list()
  l[[1]] <- list(component = 1,
                 mixing.weight = 0.5,
                 means = c(0, 0),
                 cov = matrix(c(1, 0, 0, 1), ncol = 2, byrow = TRUE))
  l[[2]] <- list(component = 2,
                 mixing.weight = 0.3,
                 means = c(5, 5),
                 cov = matrix(c(1, 0.5, 0.5, 1), ncol = 2, byrow = TRUE))
  l[[3]] <- list(component = 3,
                 mixing.weight = 0.2,
                 means = c(10, 10),
                 cov = matrix(c(1, 0.75, 0.75, 1), ncol = 2, byrow = TRUE))
  do.call("rbind", sapply(l, function(e) {
    dat <- mvtnorm::rmvnorm(e$mixing.weight * n, e$means, e$cov)
    cbind(component = e$component,
          x1 = dat[, 1],
          x2 = dat[, 2])
  }))
}
# Function for covariance update; sweep() centers each column on the cluster
# mean (a plain `dat[, 2:3] - m` would recycle `m` along the rows)
.cov <- function(r, dat, m, N.k) {
  centered <- sweep(dat[, 2:3], 2, m)
  (t(r * centered) %*% centered) / N.k
}
# Generate starting values for means/covs/mixing weights
.init <- function() {
  l <- list()
  l[[1]] <- list(mixing.weight = 0.1,
                 means = c(-2, -2),
                 cov = matrix(c(1, 0, 0, 1), ncol = 2, byrow = TRUE))
  l[[2]] <- list(mixing.weight = 0.1,
                 means = c(10, 0),
                 cov = matrix(c(1, 0, 0, 1), ncol = 2, byrow = TRUE))
  l[[3]] <- list(mixing.weight = 0.8,
                 means = c(0, 10),
                 cov = matrix(c(1, 0, 0, 1), ncol = 2, byrow = TRUE))
  l
}
# Plot the 2D contours of the estimated Gaussian components
.contour <- function(means, cov, l) {
  X <- mvtnorm::rmvnorm(1000, means, cov)
  z <- MASS::kde2d(X[, 1], X[, 2], n = 50)
  contour(z, drawlabels = FALSE, add = TRUE, lty = l, lwd = 1.5)
}
# Do a scatter plot; `clusters` is either a vector of per-point assignments
# or the scalar 4, meaning "no clusters yet"
.scatter <- function(dat, clusters) {
  plot(dat[, 2], dat[, 3],
       xlab = "X", ylab = "Y", main = "Three component Gaussian mixture model",
       col = c("blue", "red", "orange", "black")[clusters],
       pch = (1:4)[clusters])
  col <- c("blue", "red", "orange")
  pch <- 1:3
  legend <- paste("Cluster", 1:3)
  if (length(clusters) == 1 && clusters == 4) {
    col <- "black"
    pch <- 4
    legend <- "No clusters"
  }
  legend("topleft", col = col, pch = pch, legend = legend)
}
n <- 10000
# create data with n samples
dat <- .create.data(n)
# `repeat` restarts the whole EM demo indefinitely; interrupt to stop it
repeat {
  # set initial parameters
  l <- .init()
  # plot initial data, without cluster assignments
  .scatter(dat, 4)
  invisible(lapply(1:3, function(e) .contour(l[[e]]$means, l[[e]]$cov, 1)))
  # Usually we would use a convergence criterion, e.g. the difference of
  # successive log-likelihoods, but a fixed number of iterations suffices
  # for this hands-on
  for (i in seq(50)) {
    ### E step
    # Compute the total mixture density of each sample (for normalization)
    r <- sapply(l, function(e) {
      e$mixing.weight * mvtnorm::dmvnorm(dat[, 2:3], e$means, e$cov)
    })
    r <- apply(r, 1, sum)
    # Compute the responsibilities of each sample
    rs <- sapply(l, function(e) {
      e$mixing.weight * mvtnorm::dmvnorm(dat[, 2:3], e$means, e$cov) / r
    })
    # Compute the effective number of points per cluster
    N.k <- apply(rs, 2, sum)
    ### M step
    # Compute the new means
    m <- lapply(1:3, function(e) {
      apply(rs[, e] * dat[, 2:3], 2, sum) / N.k[e]
    })
    # Compute the new covariances
    covs <- lapply(1:3, function(e) {
      .cov(rs[, e], dat, m[[e]], N.k[e])
    })
    # Compute the new mixing weights
    mi <- N.k / n
    # Update the old parameters
    l <- lapply(1:3, function(e) {
      list(mixing.weight = mi[e], means = m[[e]], cov = covs[[e]])
    })
    # Every 5 iterations, plot 2D density contours to show the estimated
    # means and covariances
    if (i %% 5 == 0) {
      Sys.sleep(1.5)
      .scatter(dat, apply(rs, 1, which.max))
      invisible(lapply(1:3, function(e) .contour(l[[e]]$means, l[[e]]$cov, e + 1)))
    }
  }
}
```
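The demo loops a fixed 50 iterations; a hedged sketch of the likelihood-based stopping rule the comment alludes to (the names `loglik_mix`, `previous_loglik` and `eps` are illustrative, not from the original code):
```{r}
# Sketch of a convergence check on the observed-data log-likelihood.
# `l` and `dat` are assumed to be the parameter list and data from the chunk above;
# `eps` is an arbitrary tolerance.
loglik_mix <- function(dat, l) {
  dens <- sapply(l, function(e) {
    e$mixing.weight * mvtnorm::dmvnorm(dat[, 2:3], e$means, e$cov)
  })
  sum(log(rowSums(dens)))
}
# inside the iteration loop, one would stop when
# abs(loglik_mix(dat, l) - previous_loglik) < eps
```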