Skip to content
Snippets Groups Projects
Verified Commit cd0c90c4 authored by Laurent Modolo's avatar Laurent Modolo
Browse files

add clustering on the log scale

parent a556c25d
No related branches found
No related tags found
No related merge requests found
...@@ -2,12 +2,13 @@ ...@@ -2,12 +2,13 @@
title: "flat_full.Rmd for working package" title: "flat_full.Rmd for working package"
output: html_document output: html_document
editor_options: editor_options:
chunk_output_type: console chunk_output_type: inline
--- ---
```{r load} ```{r load}
library(kmerclust) library(kmerclust)
library(tidyverse) library(tidyverse)
library(parallel)
``` ```
```{r development-load} ```{r development-load}
...@@ -59,7 +60,7 @@ sim_kmer <- function(n_kmer, mean_coef, sex = "XY", count = F) { ...@@ -59,7 +60,7 @@ sim_kmer <- function(n_kmer, mean_coef, sex = "XY", count = F) {
bind_rows( bind_rows(
tibble( tibble(
sex = "A", sex = "A",
count = mvtnorm::rmvnorm(n_kmer * 0.85, mean = c(2, 2)*mean_coef, sigma = matrix(c(1, .95, .95, 1) * mean_coef^1.5 * 4, ncol = 2), method = "svd") count = mvtnorm::rmvnorm(n_kmer * 0.85, mean = c(1, 1)*mean_coef, sigma = matrix(c(1, .95, .95, 1) * mean_coef^1.5 * 4, ncol = 2), method = "svd")
%>% as_tibble(.name_repair = "universal") %>% as_tibble(.name_repair = "universal")
) %>% unnest(c(count)) ) %>% unnest(c(count))
) )
...@@ -123,18 +124,18 @@ expand_theta <- function(theta, cluster_coef, sex) { ...@@ -123,18 +124,18 @@ expand_theta <- function(theta, cluster_coef, sex) {
theta_ref <- list( theta_ref <- list(
"a" = list( "a" = list(
"pi" = theta$pi[1], "pi" = theta$pi[1],
"mu" = cluster_coef$a * theta$mu, "mu" = cluster_coef$a + theta$mu,
"sigma" = theta$sigma$a "sigma" = theta$sigma$a
), ),
"f" = list( "f" = list(
"pi" = theta$pi[2], "pi" = theta$pi[2],
"mu" = cluster_coef$f * theta$mu, "mu" = cluster_coef$f + theta$mu,
"sigma" = theta$sigma$f "sigma" = theta$sigma$f
)) ))
if (sex == "XY") { if (sex == "XY") {
theta_ref[["m"]] <- list( theta_ref[["m"]] <- list(
"pi" = theta$pi[3], "pi" = theta$pi[3],
"mu" = cluster_coef$m * theta$mu, "mu" = (cluster_coef$m + theta$mu) * c(1, 0),
"sigma" = theta$sigma$m "sigma" = theta$sigma$m
) )
} }
...@@ -243,8 +244,14 @@ E_N_clust <- function(proba) { ...@@ -243,8 +244,14 @@ E_N_clust <- function(proba) {
M_mean <- function(x, proba, N_clust, cluster_coef, sex) { M_mean <- function(x, proba, N_clust, cluster_coef, sex) {
mu <- 0 mu <- 0
for (cluster in 1:ncol(proba)) { for (cluster in 1:ncol(proba)) {
mu <- mu + if (cluster == 3) {
mean(colSums(x * cluster_coef[[cluster]] * proba[, cluster]) / N_clust[cluster]) mu <- mu +
mean(colSums((x - cluster_coef[[cluster]]) * c(1, 0) * proba[, cluster]) / N_clust[cluster])
} else {
mu <- mu +
mean(colSums((x - cluster_coef[[cluster]]) * proba[, cluster]) / N_clust[cluster])
}
} }
if (sex == "XY") { if (sex == "XY") {
return(mu / 3) return(mu / 3)
...@@ -266,7 +273,11 @@ M_mean <- function(x, proba, N_clust, cluster_coef, sex) { ...@@ -266,7 +273,11 @@ M_mean <- function(x, proba, N_clust, cluster_coef, sex) {
M_cov <- function(x, proba, mu, N_clust, cluster_coef, sex) { M_cov <- function(x, proba, mu, N_clust, cluster_coef, sex) {
cov_clust <- list() cov_clust <- list()
for (cluster in 1:ncol(proba)) { for (cluster in 1:ncol(proba)) {
cov_clust[[cluster]] <- t(proba[, cluster] * (x - mu * cluster_coef[[cluster]])) %*% (x - mu * cluster_coef[[cluster]]) / N_clust[cluster] if (cluster == 3) {
cov_clust[[cluster]] <- t(proba[, cluster] * (x - (mu + cluster_coef[[cluster]]) * c(1, 0))) %*% (x - (mu + cluster_coef[[cluster]]) * c(1, 0)) / N_clust[cluster]
} else {
cov_clust[[cluster]] <- t(proba[, cluster] * (x - mu + cluster_coef[[cluster]])) %*% (x - mu + cluster_coef[[cluster]]) / N_clust[cluster]
}
} }
sigma <- list() sigma <- list()
sigma$a <- cov_clust[[1]] sigma$a <- cov_clust[[1]]
...@@ -320,10 +331,6 @@ plot_proba <- function(x, proba, sex = "XY") { ...@@ -320,10 +331,6 @@ plot_proba <- function(x, proba, sex = "XY") {
theme_bw() theme_bw()
} }
} }
sample %>%
dplyr::select(count_m, count_f) %>%
as.matrix() %>%
plot_proba(model_XY$proba, sex = "XY")
``` ```
```{r fun-init_param} ```{r fun-init_param}
...@@ -334,19 +341,19 @@ sample %>% ...@@ -334,19 +341,19 @@ sample %>%
#' @return a list of parameters #' @return a list of parameters
init_param <- function(x, sex) { init_param <- function(x, sex) {
cluster_coef <- list( cluster_coef <- list(
"a" = c(1, 1), "a" = c(0, 0),
"f" = c(1, 0.5) "f" = c(0, log(2))
) )
theta <- list( theta <- list(
"pi" = c(.85, .1, .05), "pi" = c(.85, .1, .05),
"mu" = mean(colMeans(x)) * .5 "mu" = mean(colMeans(x))
) )
theta$sigma <- list( theta$sigma <- list(
"a" = matrix(c(1, 1, 1, 1) * theta$mu, ncol = 2), "a" = matrix(c(1, 1, 1, 1) * theta$mu, ncol = 2),
"f" = matrix(c(1, 1, 1, 1) * theta$mu, ncol = 2) "f" = matrix(c(1, 1, 1, 1) * theta$mu, ncol = 2)
) )
if (sex == "XY") { if (sex == "XY") {
cluster_coef$m <- c(1, 0) cluster_coef$m <- c(0, 0)
theta$sigma$m <- matrix(c(1, 1, 1, 1) * theta$mu, ncol = 2) theta$sigma$m <- matrix(c(1, 1, 1, 1) * theta$mu, ncol = 2)
} }
return(list(cluster_coef = cluster_coef, theta = theta)) return(list(cluster_coef = cluster_coef, theta = theta))
...@@ -392,13 +399,13 @@ EM_clust <- function(x, threshold = 1, sex = "XY") { ...@@ -392,13 +399,13 @@ EM_clust <- function(x, threshold = 1, sex = "XY") {
new_loglik <- 0 new_loglik <- 0
param <- init_param(x, sex) param <- init_param(x, sex)
while (abs(new_loglik - old_loglik) > threshold) { while (abs(new_loglik - old_loglik) > threshold) {
old_loglik <- loglik(x, param$theta, param$cluster_coef, sex) old_loglik <- loglik(x = x, theta = param$theta, cluster_coef = param$cluster_coef, sex = sex)
proba <- E_proba(x, param$theta, param$cluster_coef, sex) proba <- E_proba(x = x, theta = param$theta, cluster_coef = param$cluster_coef, sex = sex)
param$theta$pi <- E_N_clust(proba) param$theta$pi <- E_N_clust(proba = proba)
param$theta$mu <- M_mean(x, proba, param$theta$pi, params$cluster_coef, sex) param$theta$mu <- M_mean(x = x, proba = proba, N_clust = param$theta$pi, cluster_coef = param$cluster_coef, sex = sex)
param$theta$sigma <- M_cov(x, proba, param$theta$mu, param$theta$pi, param$cluster_coef, sex) param$theta$sigma <- M_cov(x = x, proba = proba, mu = param$theta$mu, N_clust = param$theta$pi, cluster_coef = param$cluster_coef, sex = sex)
param$theta$pi <- param$theta$pi / nrow(x) param$theta$pi <- param$theta$pi / nrow(x)
new_loglik <- loglik(x, param$theta, param$cluster_coef, sex) new_loglik <- loglik(x = x, theta = param$theta, cluster_coef = param$cluster_coef, sex = sex)
if (is.infinite(new_loglik)) { if (is.infinite(new_loglik)) {
break break
} }
...@@ -505,13 +512,21 @@ BSS_WSS <- function(x, cluster) { ...@@ -505,13 +512,21 @@ BSS_WSS <- function(x, cluster) {
# clustering XY # clustering XY
```{r clustering_XY} ```{r clustering_XY}
data <- sim_kmer(1e6, 1000, "XY", count =T) data <- sim_kmer(1e5, 1000, "XY", count =T)
model_XY <- data %>% model_XY <- data %>%
dplyr::select(count_m, count_f) %>% dplyr::select(count_m, count_f) %>%
mutate(
count_m = log1p(count_m),
count_f = log1p(count_f),
) %>%
as.matrix() %>% as.matrix() %>%
EM_clust() EM_clust()
data %>% data %>%
dplyr::select(count_m, count_f) %>% dplyr::select(count_m, count_f) %>%
mutate(
count_m = log1p(count_m),
count_f = log1p(count_f),
) %>%
as.matrix() %>% as.matrix() %>%
plot_proba(model_XY$proba) plot_proba(model_XY$proba)
``` ```
...@@ -519,13 +534,23 @@ data %>% ...@@ -519,13 +534,23 @@ data %>%
# clustering XO # clustering XO
```{r clustering_XO} ```{r clustering_XO}
data <- sim_kmer(1e4, 1000, "XO") data <- sim_kmer(1e5, 1000, "XO")
model_XO <- data %>% model_XO <- data %>%
dplyr::select(count_m, count_f) %>% dplyr::select(count_m, count_f) %>%
mutate(
count_m = log1p(count_m),
count_f = log1p(count_f),
) %>%
drop_na() %>%
as.matrix() %>% as.matrix() %>%
EM_clust(sex = "X0") EM_clust(sex = "X0")
data %>% data %>%
dplyr::select(count_m, count_f) %>% dplyr::select(count_m, count_f) %>%
mutate(
count_m = log1p(count_m),
count_f = log1p(count_f),
) %>%
drop_na() %>%
as.matrix() %>% as.matrix() %>%
plot_proba(model_XO$proba, sex = "X0") plot_proba(model_XO$proba, sex = "X0")
``` ```
...@@ -656,7 +681,7 @@ annotate_counts <- function(annotation, count, name) { ...@@ -656,7 +681,7 @@ annotate_counts <- function(annotation, count, name) {
## M belari data ## M belari data
```{r dev} ```{r dev}
load("../../results/12/mbelari/sample.Rdata", v = T) load("../../../results/12/mbelari/sample.Rdata")
``` ```
```{r dev} ```{r dev}
...@@ -679,91 +704,42 @@ sample %>% ...@@ -679,91 +704,42 @@ sample %>%
``` ```
```{r dev} ```{r dev}
sample %>% res <- compare_models(sample, nboot = 10, bootsize = 1, core = 8)
select(count_m, count_f) %>% res %>%
sample_frac(0.01) %>% ggplot(aes(x = name, y = BIC)) +
mutate( geom_violin()
count_m = expm1(count_m), res %>%
count_f = expm1(count_f), ggplot(aes(x = name, y = WSS_f / BSS)) +
) %>% geom_violin()
ggplot(aes(x = count_m, y = count_f)) +
geom_point() +
theme_bw()
``` ```
## M Longespiculosa data
```{r dev} ```{r dev}
sample %>% load("../../../results/12/mlongespiculosa/sample.Rdata")
select(count_m, count_f) %>%
sample_frac(0.01) %>%
ggplot(aes(x = count_m, y = count_f)) +
geom_point() +
theme_bw()
``` ```
```{r dev} ```{r dev}
library(flexmix) model_XY <- sample %>%
count <- sample %>%
dplyr::select(count_m, count_f) %>% dplyr::select(count_m, count_f) %>%
mutate( as.matrix() %>%
count_m = round(expm1(count_m)), EM_clust(sex = "XY")
count_f = round(expm1(count_f)), model_XO <- sample %>%
) %>% dplyr::select(count_m, count_f) %>%
filter(count_m > 0) %>% as.matrix() %>%
mutate( EM_clust(sex = "X0")
ratio = count_f / count_m sample %>%
) %>% dplyr::select(count_m, count_f) %>%
sample_n(10000) as.matrix() %>%
plot_proba(model_XY$proba, sex = "XY")
count %>% sample %>%
ggplot(aes(x = ratio)) + dplyr::select(count_m, count_f) %>%
geom_histogram() + as.matrix() %>%
scale_x_log10() plot_proba(model_XO$proba, sex = "X0")
```
m_xy <- flexmix(
ratio ~ ratio, ```{r dev}
k = 3, res <- compare_models(sample, nboot = 10, bootsize = 1, core = 8)
data = count,
)
m_xo <- flexmix(
count_m ~ count_f,
k = 2,
data = count,
model = FLXglm(family = "poisson")
)
summary(m_xy)
parameters(m_xy, component = 1)
parameters(m_xy, component = 2)
parameters(m_xy, component = 3)
plot(m_xy)
rm_xy <- refit(m_xy)
summary(rm_xy)
count %>%
mutate(
proba1 = m_xy@posterior$scaled[, 1],
proba2 = m_xy@posterior$scaled[, 2],
proba3 = m_xy@posterior$scaled[, 3]
) %>%
pivot_longer(cols = c(proba1, proba2, proba3)) %>%
ggplot(aes(x = count_m, y = count_f, color = value)) +
geom_point() +
facet_wrap(~name) +
coord_equal() +
theme_bw()
```
```{r dev, eval=F}
data <- readr::read_tsv("../../results/12/mlongespiculosa/mlongespiculosa.csv", show_col_types = FALSE)
format(object.size(data), units = "Mb")
annotation <- parse_annotation("../../data/sample.csv")
count <- annotate_counts(annotation, data, "mlongespiculosa")
save(count, file = "results/12/mlongespiculosa/counts.Rdata")
load("results/12/mlongespiculosa/counts.Rdata")
```
```{r dev, eval=F}
res <- compare_models(count %>% dplyr::ungroup(), nboot = 100, bootsize = 0.1, core = 24)
res %>% res %>%
ggplot(aes(x = name, y = BIC)) + ggplot(aes(x = name, y = BIC)) +
geom_violin() geom_violin()
...@@ -772,4 +748,71 @@ res %>% ...@@ -772,4 +748,71 @@ res %>%
geom_violin() geom_violin()
``` ```
## M Spiculigera data
```{r dev}
load("../../../results/12/mspiculigera/sample.Rdata")
```
```{r dev}
model_XY <- sample %>%
dplyr::select(count_m, count_f) %>%
as.matrix() %>%
EM_clust(sex = "XY")
model_XO <- sample %>%
dplyr::select(count_m, count_f) %>%
as.matrix() %>%
EM_clust(sex = "X0")
sample %>%
dplyr::select(count_m, count_f) %>%
as.matrix() %>%
plot_proba(model_XY$proba, sex = "XY")
sample %>%
dplyr::select(count_m, count_f) %>%
as.matrix() %>%
plot_proba(model_XO$proba, sex = "X0")
```
```{r dev}
res <- compare_models(sample, nboot = 10, bootsize = 1, core = 8)
res %>%
ggplot(aes(x = name, y = BIC)) +
geom_violin()
res %>%
ggplot(aes(x = name, y = WSS_f / BSS)) +
geom_violin()
res
```
# tmvnorm test
```{r dev}
library(tmvtnorm)
mean <- c(1, 1)
sigma <- matrix(c(10, 0,
0, 1), 2, 2)
# Linear Constraints
#
# a1 <= x1 + x2 <= b2
# a2 <= x1 - x2 <= b2
#
# [ a1 ] <= [ 1 1 ] [ x1 ] <= [b1]
# [ a2 ] [ 1 -1 ] [ x2 ] [b2]
a <- c(0, 0)
b <- c( 30, 30)
D <- matrix(c(1, 1,
-1, 1), 2, 2)
X <- rtmvnorm(n=10000, mean, sigma, lower=a, upper=b, D=D, algorithm="gibbsR")
plot(X, main="Gibbs sampling for multivariate normal
with linear constraints according to Geweke (1991)")
abline(a=0, b=1, col="blue", lwd = 2)
abline(a=0, b=0, col="blue", lwd = 2)
# mark linear constraints as lines
for (i in 1:nrow(D)) {
abline(a=a[i]/D[i, 2], b=-D[i,1]/D[i, 2], col="red")
abline(a=b[i]/D[i, 2], b=-D[i,1]/D[i, 2], col="red")
}
```
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Please register or to comment