Skip to content
Snippets Groups Projects
Verified Commit 13335ade authored by Laurent Modolo's avatar Laurent Modolo
Browse files

add LTR and BIC

parent d5e24c1b
No related branches found
No related tags found
No related merge requests found
......@@ -49,8 +49,7 @@ test_slope(1.05, 2.05, 0.95)
Simulate k-mer counts data
```{r}
n_kmer = 1e2
mean_coef = 1000
sim_kmer <- function(n_kmer, mean_coef, sex = "XY") {
data <- tibble(
sex = "F",
count = mvtnorm::rmvnorm(n_kmer * .1, mean = c(1, 2)*mean_coef, sigma = matrix(c(1.05, 2, 2, 4.05) * mean_coef^1.5, ncol = 2), checkSymmetry = F, method = "svd") %>%
......@@ -58,21 +57,28 @@ data <- tibble(
) %>%
unnest(count) %>%
bind_rows(
tibble(
sex = "M",
count = mvtnorm::rmvnorm(n_kmer * .05, mean = c(1, 0)*mean_coef, sigma = matrix(c(.9, .05, .05, .05) * mean_coef^1.5, ncol = 2), method = "svd")
%>% as_tibble()
) %>%
unnest(count)
) %>% bind_rows(
tibble(
sex = "A",
count = mvtnorm::rmvnorm(n_kmer * 0.85, mean = c(2, 2)*mean_coef, sigma = matrix(c(1, .95, .95, 1) * mean_coef^1.5 * 4, ncol = 2), method = "svd")
%>% as_tibble()
) %>% unnest(count)
)
if (sex == "XY") {
data <- data %>%
bind_rows(
tibble(
sex = "M",
count = mvtnorm::rmvnorm(n_kmer * .05, mean = c(1, 0)*mean_coef, sigma = matrix(c(.9, .05, .05, .05) * mean_coef^1.5, ncol = 2), method = "svd")
%>% as_tibble()
) %>%
unnest(count)
)
}
data %>%
rename(count_m = V1,
count_f = V2)
}
data <- sim_kmer(1e4, 1000, "XY")
data %>%
ggplot(aes(x = count_m, y = count_f, color = sex)) +
geom_point() +
......@@ -123,15 +129,6 @@ params_diff <- function(old_theta, theta, threshold) {
return(T)
}
proba_total <- function(x, theta, cluster_coef, sex) {
proba <- 0
for (params in expand_theta(theta, cluster_coef, sex)) {
proba <- proba + params$pi *
mvtnorm::dmvnorm(x, mean = params$mu, sigma = params$sigma)
}
return(proba)
}
proba_point <- function(x, theta, cluster_coef, sex) {
proba <- c()
for (params in expand_theta(theta, cluster_coef, sex)) {
......@@ -143,7 +140,7 @@ proba_point <- function(x, theta, cluster_coef, sex) {
}
loglik <- function(x, theta, cluster_coef, sex) {
-log(sum(proba_total(x, theta, cluster_coef, sex)))
sum(log(rowSums(proba_point(x, theta, cluster_coef, sex))))
}
# EM function
......@@ -243,8 +240,16 @@ init_param <- function(x, sex) {
return(list(cluster_coef = cluster_coef, theta = theta))
}
compute_bic <- function(x, loglik, sex = "XY") {
k <- 1 + 4 * 2
if (sex == "YX") {
k <- k + 4
}
return(k * log(nrow(x)) - 2 * loglik)
}
EM_clust <- function(x, threshold = 0.1, sex = "XY") {
EM_clust <- function(x, threshold = 1, sex = "XY") {
old_loglik <- -Inf
new_loglik <- 0
param <- init_param(x, sex)
......@@ -256,53 +261,80 @@ EM_clust <- function(x, threshold = 0.1, sex = "XY") {
param$theta$sigma <- M_cov(x, proba, param$theta$mu, param$theta$pi, param$cluster_coef, sex)
param$theta$pi <- param$theta$pi / nrow(x)
new_loglik <- loglik(x, param$theta, param$cluster_coef, sex)
if(is.infinite(new_loglik)) {
break
}
return(proba)
}
return(list(proba = proba, theta = param$theta, loglik = new_loglik, BIC = compute_bic(x, new_loglik, sex)))
}
```
# clustering XY
proba <- data %>%
```{r}
model_XY <- data %>%
dplyr::select(count_m, count_f) %>%
as.matrix() %>%
EM_clust()
data %>%
dplyr::select(count_m, count_f) %>%
as.matrix() %>%
plot_proba(proba)
plot_proba(model_XY$proba)
```
# clustering XO
proba <- data %>%
```{r}
model_XO <- data %>%
dplyr::select(count_m, count_f) %>%
as.matrix() %>%
EM_clust(sex = "X0")
data %>%
dplyr::select(count_m, count_f) %>%
as.matrix() %>%
plot_proba(proba, sex = "X0")
plot_proba(model_XO$proba, sex = "X0")
```
# LRT
## For XY
```{r}
theta4 <- list(
"pi" = c(.1, .05, .85),
"mu" = list(c(1000, 2000, 1000, 2000), c(1000, 0, 1000, 0), c(1000, 1000, 1000, 1000)),
"sigma" = list(
"f" = diag(1000, nrow=4, ncol=4),
"m" = diag(1000, nrow=4, ncol=4),
"a" = diag(1000, nrow=4, ncol=4)
)
)
proba4 <- data %>%
data <- sim_kmer(1e3, 1000, "XY")
model_XY <- data %>%
dplyr::select(count_m, count_f) %>%
as.matrix() %>%
EM_clust()
model_XO <- data %>%
dplyr::select(count_m, count_f) %>%
dplyr::mutate(
count_m2 = count_m,
count_f2 = count_f) %>%
as.matrix() %>%
EM_clust(theta4)
EM_clust(sex = "XO")
model_XY$BIC
model_XO$BIC
-2 * (model_XY$loglik - model_XO$loglik)
-2 * (model_XO$loglik - model_XY$loglik)
pchisq(-2 * (model_XY$loglik - model_XO$loglik), 4)
pchisq(-2 * (model_XO$loglik - model_XY$loglik), 4)
```
data %>%
## For XO
```{r}
data <- sim_kmer(1e3, 1000, "XO")
model_XY <- data %>%
dplyr::select(count_m, count_f) %>%
as.matrix() %>%
EM_clust()
model_XO <- data %>%
dplyr::select(count_m, count_f) %>%
as.matrix() %>%
plot_proba(proba4)
EM_clust(sex = "XO")
model_XY$BIC
model_XO$BIC
- 2 * (model_XY$loglik - model_XO$loglik)
- 2 * (model_XO$loglik - model_XY$loglik)
pchisq(-2 * (model_XY$loglik - model_XO$loglik), 4)
pchisq(-2 * (model_XO$loglik - model_XY$loglik), 4)
```
## With real data
......
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Please register or to comment