Skip to content
Snippets Groups Projects
Verified Commit 13335ade authored by Laurent Modolo's avatar Laurent Modolo
Browse files

add LTR and BIC

parent d5e24c1b
Branches
No related tags found
No related merge requests found
...@@ -49,30 +49,36 @@ test_slope(1.05, 2.05, 0.95) ...@@ -49,30 +49,36 @@ test_slope(1.05, 2.05, 0.95)
Simulate k-mer counts data Simulate k-mer counts data
```{r} ```{r}
n_kmer = 1e2 sim_kmer <- function(n_kmer, mean_coef, sex = "XY") {
mean_coef = 1000 data <- tibble(
data <- tibble( sex = "F",
sex = "F", count = mvtnorm::rmvnorm(n_kmer * .1, mean = c(1, 2)*mean_coef, sigma = matrix(c(1.05, 2, 2, 4.05) * mean_coef^1.5, ncol = 2), checkSymmetry = F, method = "svd") %>%
count = mvtnorm::rmvnorm(n_kmer * .1, mean = c(1, 2)*mean_coef, sigma = matrix(c(1.05, 2, 2, 4.05) * mean_coef^1.5, ncol = 2), checkSymmetry = F, method = "svd") %>% as_tibble()
as_tibble() ) %>%
) %>% unnest(count) %>%
unnest(count) %>% bind_rows(
bind_rows( tibble(
tibble( sex = "A",
sex = "M", count = mvtnorm::rmvnorm(n_kmer * 0.85, mean = c(2, 2)*mean_coef, sigma = matrix(c(1, .95, .95, 1) * mean_coef^1.5 * 4, ncol = 2), method = "svd")
count = mvtnorm::rmvnorm(n_kmer * .05, mean = c(1, 0)*mean_coef, sigma = matrix(c(.9, .05, .05, .05) * mean_coef^1.5, ncol = 2), method = "svd") %>% as_tibble()
%>% as_tibble() ) %>% unnest(count)
) %>% )
unnest(count) if (sex == "XY") {
) %>% bind_rows( data <- data %>%
tibble( bind_rows(
sex = "A", tibble(
count = mvtnorm::rmvnorm(n_kmer * 0.85, mean = c(2, 2)*mean_coef, sigma = matrix(c(1, .95, .95, 1) * mean_coef^1.5 * 4, ncol = 2), method = "svd") sex = "M",
%>% as_tibble() count = mvtnorm::rmvnorm(n_kmer * .05, mean = c(1, 0)*mean_coef, sigma = matrix(c(.9, .05, .05, .05) * mean_coef^1.5, ncol = 2), method = "svd")
) %>% unnest(count) %>% as_tibble()
) %>% ) %>%
rename(count_m = V1, unnest(count)
count_f = V2) )
}
data %>%
rename(count_m = V1,
count_f = V2)
}
data <- sim_kmer(1e4, 1000, "XY")
data %>% data %>%
ggplot(aes(x = count_m, y = count_f, color = sex)) + ggplot(aes(x = count_m, y = count_f, color = sex)) +
geom_point() + geom_point() +
...@@ -123,15 +129,6 @@ params_diff <- function(old_theta, theta, threshold) { ...@@ -123,15 +129,6 @@ params_diff <- function(old_theta, theta, threshold) {
return(T) return(T)
} }
proba_total <- function(x, theta, cluster_coef, sex) {
proba <- 0
for (params in expand_theta(theta, cluster_coef, sex)) {
proba <- proba + params$pi *
mvtnorm::dmvnorm(x, mean = params$mu, sigma = params$sigma)
}
return(proba)
}
proba_point <- function(x, theta, cluster_coef, sex) { proba_point <- function(x, theta, cluster_coef, sex) {
proba <- c() proba <- c()
for (params in expand_theta(theta, cluster_coef, sex)) { for (params in expand_theta(theta, cluster_coef, sex)) {
...@@ -143,7 +140,7 @@ proba_point <- function(x, theta, cluster_coef, sex) { ...@@ -143,7 +140,7 @@ proba_point <- function(x, theta, cluster_coef, sex) {
} }
loglik <- function(x, theta, cluster_coef, sex) { loglik <- function(x, theta, cluster_coef, sex) {
-log(sum(proba_total(x, theta, cluster_coef, sex))) sum(log(rowSums(proba_point(x, theta, cluster_coef, sex))))
} }
# EM function # EM function
...@@ -243,8 +240,16 @@ init_param <- function(x, sex) { ...@@ -243,8 +240,16 @@ init_param <- function(x, sex) {
return(list(cluster_coef = cluster_coef, theta = theta)) return(list(cluster_coef = cluster_coef, theta = theta))
} }
compute_bic <- function(x, loglik, sex = "XY") {
k <- 1 + 4 * 2
if (sex == "YX") {
k <- k + 4
}
return(k * log(nrow(x)) - 2 * loglik)
}
EM_clust <- function(x, threshold = 0.1, sex = "XY") { EM_clust <- function(x, threshold = 1, sex = "XY") {
old_loglik <- -Inf old_loglik <- -Inf
new_loglik <- 0 new_loglik <- 0
param <- init_param(x, sex) param <- init_param(x, sex)
...@@ -256,53 +261,80 @@ EM_clust <- function(x, threshold = 0.1, sex = "XY") { ...@@ -256,53 +261,80 @@ EM_clust <- function(x, threshold = 0.1, sex = "XY") {
param$theta$sigma <- M_cov(x, proba, param$theta$mu, param$theta$pi, param$cluster_coef, sex) param$theta$sigma <- M_cov(x, proba, param$theta$mu, param$theta$pi, param$cluster_coef, sex)
param$theta$pi <- param$theta$pi / nrow(x) param$theta$pi <- param$theta$pi / nrow(x)
new_loglik <- loglik(x, param$theta, param$cluster_coef, sex) new_loglik <- loglik(x, param$theta, param$cluster_coef, sex)
if(is.infinite(new_loglik)) {
break
}
} }
return(proba) return(list(proba = proba, theta = param$theta, loglik = new_loglik, BIC = compute_bic(x, new_loglik, sex)))
} }
```
# clustering XY
proba <- data %>% ```{r}
model_XY <- data %>%
dplyr::select(count_m, count_f) %>% dplyr::select(count_m, count_f) %>%
as.matrix() %>% as.matrix() %>%
EM_clust() EM_clust()
data %>% data %>%
dplyr::select(count_m, count_f) %>% dplyr::select(count_m, count_f) %>%
as.matrix() %>% as.matrix() %>%
plot_proba(proba) plot_proba(model_XY$proba)
```
proba <- data %>% # clustering XO
```{r}
model_XO <- data %>%
dplyr::select(count_m, count_f) %>% dplyr::select(count_m, count_f) %>%
as.matrix() %>% as.matrix() %>%
EM_clust(sex = "X0") EM_clust(sex = "X0")
data %>% data %>%
dplyr::select(count_m, count_f) %>% dplyr::select(count_m, count_f) %>%
as.matrix() %>% as.matrix() %>%
plot_proba(proba, sex = "X0") plot_proba(model_XO$proba, sex = "X0")
``` ```
# LRT
## For XY
```{r} ```{r}
theta4 <- list( data <- sim_kmer(1e3, 1000, "XY")
"pi" = c(.1, .05, .85), model_XY <- data %>%
"mu" = list(c(1000, 2000, 1000, 2000), c(1000, 0, 1000, 0), c(1000, 1000, 1000, 1000)),
"sigma" = list(
"f" = diag(1000, nrow=4, ncol=4),
"m" = diag(1000, nrow=4, ncol=4),
"a" = diag(1000, nrow=4, ncol=4)
)
)
proba4 <- data %>%
dplyr::select(count_m, count_f) %>% dplyr::select(count_m, count_f) %>%
dplyr::mutate(
count_m2 = count_m,
count_f2 = count_f) %>%
as.matrix() %>% as.matrix() %>%
EM_clust(theta4) EM_clust()
model_XO <- data %>%
dplyr::select(count_m, count_f) %>%
as.matrix() %>%
EM_clust(sex = "XO")
model_XY$BIC
model_XO$BIC
-2 * (model_XY$loglik - model_XO$loglik)
-2 * (model_XO$loglik - model_XY$loglik)
pchisq(-2 * (model_XY$loglik - model_XO$loglik), 4)
pchisq(-2 * (model_XO$loglik - model_XY$loglik), 4)
```
data %>% ## For XO
```{r}
data <- sim_kmer(1e3, 1000, "XO")
model_XY <- data %>%
dplyr::select(count_m, count_f) %>%
as.matrix() %>%
EM_clust()
model_XO <- data %>%
dplyr::select(count_m, count_f) %>% dplyr::select(count_m, count_f) %>%
as.matrix() %>% as.matrix() %>%
plot_proba(proba4) EM_clust(sex = "XO")
model_XY$BIC
model_XO$BIC
- 2 * (model_XY$loglik - model_XO$loglik)
- 2 * (model_XO$loglik - model_XY$loglik)
pchisq(-2 * (model_XY$loglik - model_XO$loglik), 4)
pchisq(-2 * (model_XO$loglik - model_XY$loglik), 4)
``` ```
## With real data ## With real data
......
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Please register or to comment