diff --git a/src/clustering.Rmd b/src/clustering.Rmd index 6150909f30f6f68a370039554502f49a5d76065c..72a1e80a75e525d7462740bf6a49dfad76a6f982 100644 --- a/src/clustering.Rmd +++ b/src/clustering.Rmd @@ -49,30 +49,36 @@ test_slope(1.05, 2.05, 0.95) Simulate k-mer counts data ```{r} -n_kmer = 1e2 -mean_coef = 1000 -data <- tibble( - sex = "F", - count = mvtnorm::rmvnorm(n_kmer * .1, mean = c(1, 2)*mean_coef, sigma = matrix(c(1.05, 2, 2, 4.05) * mean_coef^1.5, ncol = 2), checkSymmetry = F, method = "svd") %>% - as_tibble() -) %>% - unnest(count) %>% - bind_rows( - tibble( - sex = "M", - count = mvtnorm::rmvnorm(n_kmer * .05, mean = c(1, 0)*mean_coef, sigma = matrix(c(.9, .05, .05, .05) * mean_coef^1.5, ncol = 2), method = "svd") - %>% as_tibble() - ) %>% - unnest(count) - ) %>% bind_rows( - tibble( - sex = "A", - count = mvtnorm::rmvnorm(n_kmer * 0.85, mean = c(2, 2)*mean_coef, sigma = matrix(c(1, .95, .95, 1) * mean_coef^1.5 * 4, ncol = 2), method = "svd") - %>% as_tibble() - ) %>% unnest(count) - ) %>% - rename(count_m = V1, - count_f = V2) +sim_kmer <- function(n_kmer, mean_coef, sex = "XY") { + data <- tibble( + sex = "F", + count = mvtnorm::rmvnorm(n_kmer * .1, mean = c(1, 2)*mean_coef, sigma = matrix(c(1.05, 2, 2, 4.05) * mean_coef^1.5, ncol = 2), checkSymmetry = F, method = "svd") %>% + as_tibble() + ) %>% + unnest(count) %>% + bind_rows( + tibble( + sex = "A", + count = mvtnorm::rmvnorm(n_kmer * 0.85, mean = c(2, 2)*mean_coef, sigma = matrix(c(1, .95, .95, 1) * mean_coef^1.5 * 4, ncol = 2), method = "svd") + %>% as_tibble() + ) %>% unnest(count) + ) + if (sex == "XY") { + data <- data %>% + bind_rows( + tibble( + sex = "M", + count = mvtnorm::rmvnorm(n_kmer * .05, mean = c(1, 0)*mean_coef, sigma = matrix(c(.9, .05, .05, .05) * mean_coef^1.5, ncol = 2), method = "svd") + %>% as_tibble() + ) %>% + unnest(count) + ) + } + data %>% + rename(count_m = V1, + count_f = V2) +} +data <- sim_kmer(1e4, 1000, "XY") data %>% ggplot(aes(x = count_m, y = count_f, color = sex)) + geom_point() + @@ -123,15 +129,6 @@ params_diff <- function(old_theta, theta, threshold) { return(T) } -proba_total <- function(x, theta, cluster_coef, sex) { - proba <- 0 - for (params in expand_theta(theta, cluster_coef, sex)) { - proba <- proba + params$pi * - mvtnorm::dmvnorm(x, mean = params$mu, sigma = params$sigma) - } - return(proba) -} - proba_point <- function(x, theta, cluster_coef, sex) { proba <- c() for (params in expand_theta(theta, cluster_coef, sex)) { @@ -143,7 +140,7 @@ proba_point <- function(x, theta, cluster_coef, sex) { } loglik <- function(x, theta, cluster_coef, sex) { - -log(sum(proba_total(x, theta, cluster_coef, sex))) + sum(log(rowSums(proba_point(x, theta, cluster_coef, sex)))) } # EM function @@ -243,8 +240,16 @@ init_param <- function(x, sex) { return(list(cluster_coef = cluster_coef, theta = theta)) } +compute_bic <- function(x, loglik, sex = "XY") { + k <- 1 + 4 * 2 + if (sex == "YX") { + k <- k + 4 + } + return(k * log(nrow(x)) - 2 * loglik) +} + -EM_clust <- function(x, threshold = 0.1, sex = "XY") { +EM_clust <- function(x, threshold = 1, sex = "XY") { old_loglik <- -Inf new_loglik <- 0 param <- init_param(x, sex) @@ -256,53 +261,80 @@ EM_clust <- function(x, threshold = 0.1, sex = "XY") { param$theta$sigma <- M_cov(x, proba, param$theta$mu, param$theta$pi, param$cluster_coef, sex) param$theta$pi <- param$theta$pi / nrow(x) new_loglik <- loglik(x, param$theta, param$cluster_coef, sex) + if(is.infinite(new_loglik)) { + break + } } - return(proba) + return(list(proba = proba, theta = param$theta, loglik = new_loglik, BIC = compute_bic(x, new_loglik, sex))) } +``` +# clustering XY -proba <- data %>% +```{r} +model_XY <- data %>% dplyr::select(count_m, count_f) %>% as.matrix() %>% EM_clust() data %>% dplyr::select(count_m, count_f) %>% as.matrix() %>% - plot_proba(proba) + plot_proba(model_XY$proba) +``` -proba <- data %>% +# clustering XO + +```{r} +model_XO <- data %>% dplyr::select(count_m, count_f) %>% as.matrix() %>% EM_clust(sex = "X0") data %>% dplyr::select(count_m, count_f) %>% as.matrix() %>% - plot_proba(proba, sex = "X0") + plot_proba(model_XO$proba, sex = "X0") ``` +# LRT + +## For XY ```{r} -theta4 <- list( - "pi" = c(.1, .05, .85), - "mu" = list(c(1000, 2000, 1000, 2000), c(1000, 0, 1000, 0), c(1000, 1000, 1000, 1000)), - "sigma" = list( - "f" = diag(1000, nrow=4, ncol=4), - "m" = diag(1000, nrow=4, ncol=4), - "a" = diag(1000, nrow=4, ncol=4) - ) -) -proba4 <- data %>% +data <- sim_kmer(1e3, 1000, "XY") +model_XY <- data %>% dplyr::select(count_m, count_f) %>% - dplyr::mutate( - count_m2 = count_m, - count_f2 = count_f) %>% as.matrix() %>% - EM_clust(theta4) + EM_clust() +model_XO <- data %>% + dplyr::select(count_m, count_f) %>% + as.matrix() %>% + EM_clust(sex = "XO") +model_XY$BIC +model_XO$BIC +-2 * (model_XY$loglik - model_XO$loglik) +-2 * (model_XO$loglik - model_XY$loglik) +pchisq(-2 * (model_XY$loglik - model_XO$loglik), 4) +pchisq(-2 * (model_XO$loglik - model_XY$loglik), 4) +``` -data %>% +## For XO + +```{r} +data <- sim_kmer(1e3, 1000, "XO") +model_XY <- data %>% + dplyr::select(count_m, count_f) %>% + as.matrix() %>% + EM_clust() +model_XO <- data %>% dplyr::select(count_m, count_f) %>% as.matrix() %>% - plot_proba(proba4) + EM_clust(sex = "XO") +model_XY$BIC +model_XO$BIC +- 2 * (model_XY$loglik - model_XO$loglik) +- 2 * (model_XO$loglik - model_XY$loglik) +pchisq(-2 * (model_XY$loglik - model_XO$loglik), 4) +pchisq(-2 * (model_XO$loglik - model_XY$loglik), 4) ``` ## With real data