Skip to content
Snippets Groups Projects
Verified Commit e0a32115 authored by Laurent Modolo's avatar Laurent Modolo
Browse files

clustering.Rmd: working XY clustering

parent 7bd66572
No related branches found
No related tags found
No related merge requests found
......@@ -49,7 +49,7 @@ test_slope(1.05, 2.05, 0.95)
Simulate k-mer counts data
```{r}
n_kmer = 1e4
n_kmer = 1e2
mean_coef = 1000
data <- tibble(
sex = "F",
......@@ -89,31 +89,22 @@ plot(data_clust, what = "uncertainty")
```
```{r}
theta2 <- list(
"pi" = c(.1, .05, .85),
"mu" = list(c(1000, 2000), c(1000, 0), c(1000, 1000)),
"sigma" = list(
"f" = matrix(c(1.05, 2, 2, 4.05) * mean_coef, ncol = 2),
"m" = matrix(c(.9, .05, .05, .05) * mean_coef, ncol = 2),
"a" = matrix(c(1, .95, .95, 1) * mean_coef, ncol = 2)
)
)
expand_theta <- function(theta) {
expand_theta <- function(theta, cluster_coef) {
list(
"f" = list(
"pi" = theta$pi[1],
"mu" = theta$mu[[1]],
"mu" = cluster_coef$f * theta$mu,
"sigma" = theta$sigma$f
),
"m" = list(
"pi" = theta$pi[2],
"mu" = theta$mu[[2]],
"mu" = cluster_coef$m * theta$mu,
"sigma" = theta$sigma$m
),
"a" = list(
"pi" = theta$pi[3],
"mu" = theta$mu[[3]],
"mu" = cluster_coef$a * theta$mu,
"sigma" = theta$sigma$a
)
)
......@@ -122,12 +113,7 @@ expand_theta <- function(theta) {
params_diff <- function(old_theta, theta, threshold) {
result <- max(
max(abs(old_theta$pi - theta$pi)),
max(abs(old_theta$mu[[1]] - theta$mu[[1]])),
max(abs(old_theta$mu[[2]] - theta$mu[[2]])),
max(abs(old_theta$mu[[3]] - theta$mu[[3]])),
max(abs(old_theta$sigma$f - theta$sigma$f)),
max(abs(old_theta$sigma$m - theta$sigma$m)),
max(abs(old_theta$sigma$a - theta$sigma$a))
max(abs(old_theta$mu - theta$mu))
)
if (is.finite(result)) {
return(results > threshold)
......@@ -135,37 +121,35 @@ params_diff <- function(old_theta, theta, threshold) {
return(T)
}
proba_total <- function(x, theta) {
proba_total <- function(x, theta, cluster_coef) {
proba <- 0
for (params in expand_theta(theta)) {
print(params)
proba <- proba + params$pi +
mvtnorm::dmvnorm(x, mean = params$mu, sigma = params$sigma, log = T)
for (params in expand_theta(theta, cluster_coef)) {
proba <- proba + params$pi *
mvtnorm::dmvnorm(x, mean = params$mu, sigma = params$sigma)
}
print(proba)
return(proba)
}
proba_point <- function(x, theta) {
proba_point <- function(x, theta, cluster_coef) {
proba <- c()
for (params in expand_theta(theta)) {
proba <- cbind(proba, params$pi +
mvtnorm::dmvnorm(x, mean = params$mu, sigma = params$sigma, log = T)
for (params in expand_theta(theta, cluster_coef)) {
proba <- cbind(proba, params$pi *
mvtnorm::dmvnorm(x, mean = params$mu, sigma = params$sigma)
)
}
return(proba)
}
loglik <- function(x, theta) {
-sum(proba_total(x, theta))
loglik <- function(x, theta, cluster_coef) {
-log(sum(proba_total(x, theta, cluster_coef)))
}
# EM function
E_proba <- function(x, theta) {
proba <- proba_point(x, theta)
proba_norm <- rowSums(exp(proba))
E_proba <- function(x, theta, cluster_coef) {
proba <- proba_point(x, theta, cluster_coef)
proba_norm <- rowSums(proba)
for (cluster in 1:ncol(proba)) {
proba[, cluster] <- exp(proba[, cluster]) / proba_norm
proba[, cluster] <- proba[, cluster] / proba_norm
proba[proba_norm == 0, cluster] <- 1 / ncol(proba)
}
return(proba)
......@@ -177,17 +161,29 @@ E_N_clust <- function(proba) {
# Function for mean update
M_mean <- function(x, proba, N_clust) {
mu <- list()
mu <- 0
for (cluster in 1:ncol(proba)) {
mu[[cluster]] <- colSums(x * proba[, cluster]) / N_clust[cluster]
if (cluster == 1) {
mu <- mu + 1/3 *
mean(colSums(x * c(1, 0.5) * proba[, cluster]) / N_clust[cluster])
}
if (cluster == 2) {
mu <- mu + 1/3 *
(colSums(x * c(1, 0) * proba[, cluster]) / N_clust[cluster])[1]
}
if (cluster == 2) {
mu <- mu + 1/3 *
mean(colSums(x * c(0.5, 0.5) * proba[, cluster]) / N_clust[cluster])
}
}
return(mu)
}
M_cov <- function(x, proba, mu, N_clust) {
M_cov <- function(x, proba, mu, N_clust, cluster_coef) {
cov_clust <- list()
for (cluster in 1:ncol(proba)) {
cov_clust[[cluster]] <- t(proba[, cluster] * (x - mu[[cluster]])) %*% (x - mu[[cluster]]) / N_clust[cluster]
print(cluster_coef[[cluster]])
cov_clust[[cluster]] <- t(proba[, cluster] * (x - mu * cluster_coef[[cluster]])) %*% (x - mu * cluster_coef[[cluster]]) / N_clust[cluster]
}
sigma <- list()
sigma$f <- cov_clust[[1]]
......@@ -209,19 +205,33 @@ plot_proba <- function(x, proba) {
scale_color_identity()
}
EM_clust <- function(x, theta, threshold = 0.1) {
EM_clust <- function(x, threshold = 0.1) {
old_loglik <- -Inf
new_loglik <- 0
cluster_coef <- list(
"f" = c(1, 2),
"m" = c(1, 0),
"a" = c(2, 2)
)
theta <- list(
"pi" = c(.1, .05, .85),
"mu" = mean(colMeans(x)) * .5
)
theta$sigma <- list(
"f" = matrix(c(1, 1, 1, 1) * theta$mu, ncol = 2),
"m" = matrix(c(1, 1, 1, 1) * theta$mu, ncol = 2),
"a" = matrix(c(1, 1, 1, 1) * theta$mu, ncol = 2)
)
while (abs(new_loglik - old_loglik) > threshold) {
old_loglik <- loglik(x, theta)
proba <- E_proba(x, theta)
old_loglik <- loglik(x, theta, cluster_coef)
proba <- E_proba(x, theta, cluster_coef)
theta$pi <- E_N_clust(proba)
theta$mu <- M_mean(x, proba, theta$pi)
theta$sigma <- M_cov(x, proba, theta$mu, theta$pi)
theta$sigma <- M_cov(x, proba, theta$mu, theta$pi, cluster_coef)
theta$pi <- theta$pi / nrow(x)
new_loglik <- loglik(x, theta)
# Sys.sleep(.5)
new_loglik <- loglik(x, theta, cluster_coef)
}
print(theta)
return(proba)
}
......@@ -229,7 +239,7 @@ EM_clust <- function(x, theta, threshold = 0.1) {
proba <- data %>%
dplyr::select(count_m, count_f) %>%
as.matrix() %>%
EM_clust(theta2)
EM_clust()
data %>%
dplyr::select(count_m, count_f) %>%
as.matrix() %>%
......
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Please register or to comment