Skip to content
Snippets Groups Projects
Verified Commit cd0c90c4 authored by Laurent Modolo's avatar Laurent Modolo
Browse files

add clustering on the log scale

parent a556c25d
No related branches found
No related tags found
No related merge requests found
......@@ -2,12 +2,13 @@
title: "flat_full.Rmd for working package"
output: html_document
editor_options:
chunk_output_type: console
chunk_output_type: inline
---
```{r load}
library(kmerclust)
library(tidyverse)
library(parallel)
```
```{r development-load}
......@@ -59,7 +60,7 @@ sim_kmer <- function(n_kmer, mean_coef, sex = "XY", count = F) {
bind_rows(
tibble(
sex = "A",
count = mvtnorm::rmvnorm(n_kmer * 0.85, mean = c(2, 2)*mean_coef, sigma = matrix(c(1, .95, .95, 1) * mean_coef^1.5 * 4, ncol = 2), method = "svd")
count = mvtnorm::rmvnorm(n_kmer * 0.85, mean = c(1, 1)*mean_coef, sigma = matrix(c(1, .95, .95, 1) * mean_coef^1.5 * 4, ncol = 2), method = "svd")
%>% as_tibble(.name_repair = "universal")
) %>% unnest(c(count))
)
......@@ -123,18 +124,18 @@ expand_theta <- function(theta, cluster_coef, sex) {
theta_ref <- list(
"a" = list(
"pi" = theta$pi[1],
"mu" = cluster_coef$a * theta$mu,
"mu" = cluster_coef$a + theta$mu,
"sigma" = theta$sigma$a
),
"f" = list(
"pi" = theta$pi[2],
"mu" = cluster_coef$f * theta$mu,
"mu" = cluster_coef$f + theta$mu,
"sigma" = theta$sigma$f
))
if (sex == "XY") {
theta_ref[["m"]] <- list(
"pi" = theta$pi[3],
"mu" = cluster_coef$m * theta$mu,
"mu" = (cluster_coef$m + theta$mu) * c(1, 0),
"sigma" = theta$sigma$m
)
}
......@@ -243,8 +244,14 @@ E_N_clust <- function(proba) {
M_mean <- function(x, proba, N_clust, cluster_coef, sex) {
mu <- 0
for (cluster in 1:ncol(proba)) {
mu <- mu +
mean(colSums(x * cluster_coef[[cluster]] * proba[, cluster]) / N_clust[cluster])
if (cluster == 3) {
mu <- mu +
mean(colSums((x - cluster_coef[[cluster]]) * c(1, 0) * proba[, cluster]) / N_clust[cluster])
} else {
mu <- mu +
mean(colSums((x - cluster_coef[[cluster]]) * proba[, cluster]) / N_clust[cluster])
}
}
if (sex == "XY") {
return(mu / 3)
......@@ -266,7 +273,11 @@ M_mean <- function(x, proba, N_clust, cluster_coef, sex) {
M_cov <- function(x, proba, mu, N_clust, cluster_coef, sex) {
cov_clust <- list()
for (cluster in 1:ncol(proba)) {
cov_clust[[cluster]] <- t(proba[, cluster] * (x - mu * cluster_coef[[cluster]])) %*% (x - mu * cluster_coef[[cluster]]) / N_clust[cluster]
if (cluster == 3) {
cov_clust[[cluster]] <- t(proba[, cluster] * (x - (mu + cluster_coef[[cluster]]) * c(1, 0))) %*% (x - (mu + cluster_coef[[cluster]]) * c(1, 0)) / N_clust[cluster]
} else {
cov_clust[[cluster]] <- t(proba[, cluster] * (x - mu + cluster_coef[[cluster]])) %*% (x - mu + cluster_coef[[cluster]]) / N_clust[cluster]
}
}
sigma <- list()
sigma$a <- cov_clust[[1]]
......@@ -320,10 +331,6 @@ plot_proba <- function(x, proba, sex = "XY") {
theme_bw()
}
}
sample %>%
dplyr::select(count_m, count_f) %>%
as.matrix() %>%
plot_proba(model_XY$proba, sex = "XY")
```
```{r fun-init_param}
......@@ -334,19 +341,19 @@ sample %>%
#' @return a list of parameters
init_param <- function(x, sex) {
cluster_coef <- list(
"a" = c(1, 1),
"f" = c(1, 0.5)
"a" = c(0, 0),
"f" = c(0, log(2))
)
theta <- list(
"pi" = c(.85, .1, .05),
"mu" = mean(colMeans(x)) * .5
"mu" = mean(colMeans(x))
)
theta$sigma <- list(
"a" = matrix(c(1, 1, 1, 1) * theta$mu, ncol = 2),
"f" = matrix(c(1, 1, 1, 1) * theta$mu, ncol = 2)
)
if (sex == "XY") {
cluster_coef$m <- c(1, 0)
cluster_coef$m <- c(0, 0)
theta$sigma$m <- matrix(c(1, 1, 1, 1) * theta$mu, ncol = 2)
}
return(list(cluster_coef = cluster_coef, theta = theta))
......@@ -392,13 +399,13 @@ EM_clust <- function(x, threshold = 1, sex = "XY") {
new_loglik <- 0
param <- init_param(x, sex)
while (abs(new_loglik - old_loglik) > threshold) {
old_loglik <- loglik(x, param$theta, param$cluster_coef, sex)
proba <- E_proba(x, param$theta, param$cluster_coef, sex)
param$theta$pi <- E_N_clust(proba)
param$theta$mu <- M_mean(x, proba, param$theta$pi, params$cluster_coef, sex)
param$theta$sigma <- M_cov(x, proba, param$theta$mu, param$theta$pi, param$cluster_coef, sex)
old_loglik <- loglik(x = x, theta = param$theta, cluster_coef = param$cluster_coef, sex = sex)
proba <- E_proba(x = x, theta = param$theta, cluster_coef = param$cluster_coef, sex = sex)
param$theta$pi <- E_N_clust(proba = proba)
param$theta$mu <- M_mean(x = x, proba = proba, N_clust = param$theta$pi, cluster_coef = param$cluster_coef, sex = sex)
param$theta$sigma <- M_cov(x = x, proba = proba, mu = param$theta$mu, N_clust = param$theta$pi, cluster_coef = param$cluster_coef, sex = sex)
param$theta$pi <- param$theta$pi / nrow(x)
new_loglik <- loglik(x, param$theta, param$cluster_coef, sex)
new_loglik <- loglik(x = x, theta = param$theta, cluster_coef = param$cluster_coef, sex = sex)
if (is.infinite(new_loglik)) {
break
}
......@@ -505,13 +512,21 @@ BSS_WSS <- function(x, cluster) {
# clustering XY
```{r clustering_XY}
data <- sim_kmer(1e6, 1000, "XY", count =T)
data <- sim_kmer(1e5, 1000, "XY", count =T)
model_XY <- data %>%
dplyr::select(count_m, count_f) %>%
mutate(
count_m = log1p(count_m),
count_f = log1p(count_f),
) %>%
as.matrix() %>%
EM_clust()
data %>%
dplyr::select(count_m, count_f) %>%
mutate(
count_m = log1p(count_m),
count_f = log1p(count_f),
) %>%
as.matrix() %>%
plot_proba(model_XY$proba)
```
......@@ -519,13 +534,23 @@ data %>%
# clustering XO
```{r clustering_XO}
data <- sim_kmer(1e4, 1000, "XO")
data <- sim_kmer(1e5, 1000, "XO")
model_XO <- data %>%
dplyr::select(count_m, count_f) %>%
mutate(
count_m = log1p(count_m),
count_f = log1p(count_f),
) %>%
drop_na() %>%
as.matrix() %>%
EM_clust(sex = "X0")
data %>%
dplyr::select(count_m, count_f) %>%
mutate(
count_m = log1p(count_m),
count_f = log1p(count_f),
) %>%
drop_na() %>%
as.matrix() %>%
plot_proba(model_XO$proba, sex = "X0")
```
......@@ -656,7 +681,7 @@ annotate_counts <- function(annotation, count, name) {
## M belari data
```{r dev}
load("../../results/12/mbelari/sample.Rdata", v = T)
load("../../../results/12/mbelari/sample.Rdata")
```
```{r dev}
......@@ -679,91 +704,42 @@ sample %>%
```
```{r dev}
sample %>%
select(count_m, count_f) %>%
sample_frac(0.01) %>%
mutate(
count_m = expm1(count_m),
count_f = expm1(count_f),
) %>%
ggplot(aes(x = count_m, y = count_f)) +
geom_point() +
theme_bw()
res <- compare_models(sample, nboot = 10, bootsize = 1, core = 8)
res %>%
ggplot(aes(x = name, y = BIC)) +
geom_violin()
res %>%
ggplot(aes(x = name, y = WSS_f / BSS)) +
geom_violin()
```
## M Longespiculosa data
```{r dev}
sample %>%
select(count_m, count_f) %>%
sample_frac(0.01) %>%
ggplot(aes(x = count_m, y = count_f)) +
geom_point() +
theme_bw()
load("../../../results/12/mlongespiculosa/sample.Rdata")
```
```{r dev}
library(flexmix)
count <- sample %>%
model_XY <- sample %>%
dplyr::select(count_m, count_f) %>%
mutate(
count_m = round(expm1(count_m)),
count_f = round(expm1(count_f)),
) %>%
filter(count_m > 0) %>%
mutate(
ratio = count_f / count_m
) %>%
sample_n(10000)
count %>%
ggplot(aes(x = ratio)) +
geom_histogram() +
scale_x_log10()
m_xy <- flexmix(
ratio ~ ratio,
k = 3,
data = count,
)
m_xo <- flexmix(
count_m ~ count_f,
k = 2,
data = count,
model = FLXglm(family = "poisson")
)
summary(m_xy)
parameters(m_xy, component = 1)
parameters(m_xy, component = 2)
parameters(m_xy, component = 3)
plot(m_xy)
rm_xy <- refit(m_xy)
summary(rm_xy)
count %>%
mutate(
proba1 = m_xy@posterior$scaled[, 1],
proba2 = m_xy@posterior$scaled[, 2],
proba3 = m_xy@posterior$scaled[, 3]
) %>%
pivot_longer(cols = c(proba1, proba2, proba3)) %>%
ggplot(aes(x = count_m, y = count_f, color = value)) +
geom_point() +
facet_wrap(~name) +
coord_equal() +
theme_bw()
```
```{r dev, eval=F}
data <- readr::read_tsv("../../results/12/mlongespiculosa/mlongespiculosa.csv", show_col_types = FALSE)
format(object.size(data), units = "Mb")
annotation <- parse_annotation("../../data/sample.csv")
count <- annotate_counts(annotation, data, "mlongespiculosa")
save(count, file = "results/12/mlongespiculosa/counts.Rdata")
load("results/12/mlongespiculosa/counts.Rdata")
```
```{r dev, eval=F}
res <- compare_models(count %>% dplyr::ungroup(), nboot = 100, bootsize = 0.1, core = 24)
as.matrix() %>%
EM_clust(sex = "XY")
model_XO <- sample %>%
dplyr::select(count_m, count_f) %>%
as.matrix() %>%
EM_clust(sex = "X0")
sample %>%
dplyr::select(count_m, count_f) %>%
as.matrix() %>%
plot_proba(model_XY$proba, sex = "XY")
sample %>%
dplyr::select(count_m, count_f) %>%
as.matrix() %>%
plot_proba(model_XO$proba, sex = "X0")
```
```{r dev}
res <- compare_models(sample, nboot = 10, bootsize = 1, core = 8)
res %>%
ggplot(aes(x = name, y = BIC)) +
geom_violin()
......@@ -772,4 +748,71 @@ res %>%
geom_violin()
```
## M Spiculigera data
```{r dev}
load("../../../results/12/mspiculigera/sample.Rdata")
```
```{r dev}
model_XY <- sample %>%
dplyr::select(count_m, count_f) %>%
as.matrix() %>%
EM_clust(sex = "XY")
model_XO <- sample %>%
dplyr::select(count_m, count_f) %>%
as.matrix() %>%
EM_clust(sex = "X0")
sample %>%
dplyr::select(count_m, count_f) %>%
as.matrix() %>%
plot_proba(model_XY$proba, sex = "XY")
sample %>%
dplyr::select(count_m, count_f) %>%
as.matrix() %>%
plot_proba(model_XO$proba, sex = "X0")
```
```{r dev}
res <- compare_models(sample, nboot = 10, bootsize = 1, core = 8)
res %>%
ggplot(aes(x = name, y = BIC)) +
geom_violin()
res %>%
ggplot(aes(x = name, y = WSS_f / BSS)) +
geom_violin()
res
```
# tmvnorm test
```{r dev}
library(tmvtnorm)
mean <- c(1, 1)
sigma <- matrix(c(10, 0,
0, 1), 2, 2)
# Linear Constraints
#
# a1 <= x1 + x2 <= b2
# a2 <= x1 - x2 <= b2
#
# [ a1 ] <= [ 1 1 ] [ x1 ] <= [b1]
# [ a2 ] [ 1 -1 ] [ x2 ] [b2]
a <- c(0, 0)
b <- c( 30, 30)
D <- matrix(c(1, 1,
-1, 1), 2, 2)
X <- rtmvnorm(n=10000, mean, sigma, lower=a, upper=b, D=D, algorithm="gibbsR")
plot(X, main="Gibbs sampling for multivariate normal
with linear constraints according to Geweke (1991)")
abline(a=0, b=1, col="blue", lwd = 2)
abline(a=0, b=0, col="blue", lwd = 2)
# mark linear constraints as lines
for (i in 1:nrow(D)) {
abline(a=a[i]/D[i, 2], b=-D[i,1]/D[i, 2], col="red")
abline(a=b[i]/D[i, 2], b=-D[i,1]/D[i, 2], col="red")
}
```
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Please register or to comment