add clustering on the log scale

cd0c90c4 · Laurent Modolo · a556c25d · cd0c90c4
Verified Commit cd0c90c4 authored 1 year ago by Laurent Modolo
--- a/dev/flat_full.Rmd
+++ b/dev/flat_full.Rmd
@@ -2,12 +2,13 @@
 title: "flat_full.Rmd for working package"
 output: html_document
 editor_options: 
-  chunk_output_type: console
+  chunk_output_type: inline
 ---
 ```{r load}
 library(kmerclust)
 library(tidyverse)
+library(parallel)
 ```
 ```{r development-load}
@@ -59,7 +60,7 @@ sim_kmer <- function(n_kmer, mean_coef, sex = "XY", count = F) {
        bind_rows(
            tibble(
                sex = "A",
-                count = mvtnorm::rmvnorm(n_kmer * 0.85, mean = c(2, 2)*mean_coef, sigma = matrix(c(1, .95, .95, 1) * mean_coef^1.5 * 4, ncol = 2), method = "svd")
+                count = mvtnorm::rmvnorm(n_kmer * 0.85, mean = c(1, 1)*mean_coef, sigma = matrix(c(1, .95, .95, 1) * mean_coef^1.5 * 4, ncol = 2), method = "svd")
                %>% as_tibble(.name_repair = "universal")
            ) %>% unnest(c(count))
        )
@@ -123,18 +124,18 @@ expand_theta <- function(theta, cluster_coef, sex) {
    theta_ref <- list(
    "a" = list(
        "pi" = theta$pi[1],
-        "mu" = cluster_coef$a * theta$mu,
+        "mu" = cluster_coef$a + theta$mu,
        "sigma" = theta$sigma$a
    ),
    "f" = list(
        "pi" = theta$pi[2],
-        "mu" = cluster_coef$f * theta$mu,
+        "mu" = cluster_coef$f + theta$mu,
        "sigma" = theta$sigma$f
    ))
    if (sex == "XY") {
        theta_ref[["m"]] <- list(
            "pi" = theta$pi[3],
-            "mu" = cluster_coef$m * theta$mu,
+            "mu" = (cluster_coef$m + theta$mu) * c(1, 0),
            "sigma" = theta$sigma$m
        )
    }
@@ -243,8 +244,14 @@ E_N_clust <- function(proba) {
 M_mean <- function(x, proba, N_clust, cluster_coef, sex) {
    mu <- 0
    for (cluster in 1:ncol(proba)) {
-        mu <- mu + 
+        if (cluster == 3) {
-          mean(colSums(x * cluster_coef[[cluster]] * proba[, cluster]) / N_clust[cluster])
+          mu <- mu + 
+            mean(colSums((x - cluster_coef[[cluster]]) * c(1, 0) * proba[, cluster]) / N_clust[cluster])
+        } else {
+          mu <- mu + 
+            mean(colSums((x - cluster_coef[[cluster]]) * proba[, cluster]) / N_clust[cluster])
+        }
    }
    if (sex == "XY") {
        return(mu / 3)
@@ -266,7 +273,11 @@ M_mean <- function(x, proba, N_clust, cluster_coef, sex) {
 M_cov <- function(x, proba, mu, N_clust, cluster_coef, sex) {
    cov_clust <- list() 
    for (cluster in 1:ncol(proba)) {
-        cov_clust[[cluster]] <- t(proba[, cluster] * (x - mu * cluster_coef[[cluster]])) %*% (x - mu * cluster_coef[[cluster]]) / N_clust[cluster]
+      if (cluster == 3) {
+        cov_clust[[cluster]] <- t(proba[, cluster] * (x - (mu + cluster_coef[[cluster]]) * c(1, 0))) %*% (x - (mu + cluster_coef[[cluster]]) * c(1, 0)) / N_clust[cluster]
+      } else {
+        cov_clust[[cluster]] <- t(proba[, cluster] * (x - mu + cluster_coef[[cluster]])) %*% (x - mu + cluster_coef[[cluster]]) / N_clust[cluster]
+      }
    }
    sigma <- list()
    sigma$a <- cov_clust[[1]]
@@ -320,10 +331,6 @@ plot_proba <- function(x, proba, sex = "XY") {
            theme_bw()
    }
 }
-sample %>%
-    dplyr::select(count_m, count_f) %>%
-    as.matrix() %>% 
-    plot_proba(model_XY$proba, sex = "XY")
 ```
 ```{r fun-init_param}
@@ -334,19 +341,19 @@ sample %>%
 #' @return a list of parameters
 init_param <- function(x, sex) {
    cluster_coef <- list(
-        "a" = c(1, 1),
+        "a" = c(0, 0),
-        "f" = c(1, 0.5)
+        "f" = c(0, log(2))
    )
    theta <- list(
        "pi" = c(.85, .1, .05),
-        "mu" = mean(colMeans(x)) * .5
+        "mu" = mean(colMeans(x))
    )
    theta$sigma <- list(
            "a" = matrix(c(1, 1, 1, 1) * theta$mu, ncol = 2),
            "f" = matrix(c(1, 1, 1, 1) * theta$mu, ncol = 2)
    )
    if (sex == "XY") {
-        cluster_coef$m <- c(1, 0)
+        cluster_coef$m <- c(0, 0)
        theta$sigma$m <- matrix(c(1, 1, 1, 1) * theta$mu, ncol = 2)
    }
    return(list(cluster_coef = cluster_coef, theta = theta))
@@ -392,13 +399,13 @@ EM_clust <- function(x, threshold = 1, sex = "XY") {
    new_loglik <- 0
    param <- init_param(x, sex)
    while (abs(new_loglik - old_loglik) > threshold) {
-        old_loglik <- loglik(x, param$theta, param$cluster_coef, sex)
+        old_loglik <- loglik(x = x, theta = param$theta, cluster_coef = param$cluster_coef, sex = sex)
-        proba <- E_proba(x, param$theta, param$cluster_coef, sex)
+        proba <- E_proba(x = x, theta = param$theta, cluster_coef = param$cluster_coef, sex = sex)
-        param$theta$pi <- E_N_clust(proba)
+        param$theta$pi <- E_N_clust(proba = proba)
-        param$theta$mu <- M_mean(x, proba, param$theta$pi, params$cluster_coef, sex)
+        param$theta$mu <- M_mean(x = x, proba = proba, N_clust = param$theta$pi, cluster_coef = param$cluster_coef, sex = sex)
-        param$theta$sigma <- M_cov(x, proba, param$theta$mu, param$theta$pi, param$cluster_coef, sex)
+        param$theta$sigma <- M_cov(x = x, proba = proba, mu = param$theta$mu, N_clust = param$theta$pi, cluster_coef = param$cluster_coef, sex = sex)
        param$theta$pi <- param$theta$pi / nrow(x)
-        new_loglik <- loglik(x, param$theta, param$cluster_coef, sex)
+        new_loglik <- loglik(x = x, theta = param$theta, cluster_coef = param$cluster_coef, sex = sex)
        if (is.infinite(new_loglik)) {
            break
        }
@@ -505,13 +512,21 @@ BSS_WSS <- function(x, cluster) {
 # clustering XY
 ```{r clustering_XY}
-data <- sim_kmer(1e6, 1000, "XY", count =T)
+data <- sim_kmer(1e5, 1000, "XY", count =T)
 model_XY <- data %>%
    dplyr::select(count_m, count_f) %>%
+    mutate(
+      count_m = log1p(count_m),
+      count_f = log1p(count_f),
+    ) %>% 
    as.matrix() %>% 
    EM_clust()
 data %>%
    dplyr::select(count_m, count_f) %>%
+    mutate(
+      count_m = log1p(count_m),
+      count_f = log1p(count_f),
+    ) %>% 
    as.matrix() %>% 
    plot_proba(model_XY$proba)
 ```
@@ -519,13 +534,23 @@ data %>%
 # clustering XO
 ```{r clustering_XO}
-data <- sim_kmer(1e4, 1000, "XO")
+data <- sim_kmer(1e5, 1000, "XO")
 model_XO <- data %>%
    dplyr::select(count_m, count_f) %>%
+    mutate(
+      count_m = log1p(count_m),
+      count_f = log1p(count_f),
+    ) %>% 
+    drop_na() %>% 
    as.matrix() %>% 
    EM_clust(sex = "X0")
 data %>%
    dplyr::select(count_m, count_f) %>%
+    mutate(
+      count_m = log1p(count_m),
+      count_f = log1p(count_f),
+    ) %>% 
+    drop_na() %>% 
    as.matrix() %>% 
    plot_proba(model_XO$proba, sex = "X0")
 ```
@@ -656,7 +681,7 @@ annotate_counts <- function(annotation, count, name) {
 ## M belari data
 ```{r dev}
-load("../../results/12/mbelari/sample.Rdata", v = T)
+load("../../../results/12/mbelari/sample.Rdata")
 ```
 ```{r dev}
@@ -679,91 +704,42 @@ sample %>%
 ```
 ```{r dev}
-sample %>% 
+res <- compare_models(sample, nboot = 10, bootsize = 1, core = 8)
-  select(count_m, count_f) %>% 
+res %>%
-  sample_frac(0.01) %>% 
+    ggplot(aes(x = name, y = BIC)) +
-  mutate(
+    geom_violin()
-    count_m = expm1(count_m),
+res %>%
-    count_f = expm1(count_f),
+    ggplot(aes(x = name, y = WSS_f / BSS)) +
-  ) %>% 
+    geom_violin()
-  ggplot(aes(x = count_m, y = count_f)) +
-  geom_point() +
-  theme_bw()
 ```
+## M Longespiculosa data
 ```{r dev}
-sample %>% 
+load("../../../results/12/mlongespiculosa/sample.Rdata")
-  select(count_m, count_f) %>% 
-  sample_frac(0.01) %>% 
-  ggplot(aes(x = count_m, y = count_f)) +
-  geom_point() +
-  theme_bw()
 ```
 ```{r dev}
-library(flexmix)
+model_XY <- sample %>%
-count <- sample %>%
    dplyr::select(count_m, count_f) %>%
-    mutate(
+    as.matrix() %>% 
-      count_m = round(expm1(count_m)),
+    EM_clust(sex = "XY")
-      count_f = round(expm1(count_f)),
+model_XO <- sample %>%
-    ) %>% 
+    dplyr::select(count_m, count_f) %>%
-    filter(count_m > 0) %>% 
+    as.matrix() %>% 
-    mutate(
+    EM_clust(sex = "X0")
-      ratio = count_f / count_m
+sample %>%
-    ) %>% 
+    dplyr::select(count_m, count_f) %>%
-    sample_n(10000)
+    as.matrix() %>% 
+    plot_proba(model_XY$proba, sex = "XY")
-count %>% 
+sample %>%
-ggplot(aes(x = ratio)) +
+    dplyr::select(count_m, count_f) %>%
-  geom_histogram() +
+    as.matrix() %>% 
-  scale_x_log10()
+    plot_proba(model_XO$proba, sex = "X0")
+```
-m_xy <- flexmix(
-  ratio ~ ratio,
+```{r dev}
-  k = 3,
+res <- compare_models(sample, nboot = 10, bootsize = 1, core = 8)
-  data = count,
-  )
-m_xo <- flexmix(
-  count_m ~ count_f,
-  k = 2,
-  data = count,
-  model = FLXglm(family = "poisson")
-  )
-summary(m_xy)
-parameters(m_xy, component = 1)
-parameters(m_xy, component = 2)
-parameters(m_xy, component = 3)
-plot(m_xy)
-rm_xy <- refit(m_xy)
-summary(rm_xy)
-count %>% 
-  mutate(
-    proba1 = m_xy@posterior$scaled[, 1],
-    proba2 = m_xy@posterior$scaled[, 2],
-    proba3 = m_xy@posterior$scaled[, 3]
-  ) %>%
-  pivot_longer(cols = c(proba1, proba2, proba3)) %>%
-  ggplot(aes(x = count_m, y = count_f, color = value)) +
-  geom_point() +
-  facet_wrap(~name) +
-  coord_equal() +
-  theme_bw()
-```
-```{r dev, eval=F}
-data <- readr::read_tsv("../../results/12/mlongespiculosa/mlongespiculosa.csv", show_col_types = FALSE)
-format(object.size(data), units = "Mb")
-annotation <- parse_annotation("../../data/sample.csv")
-count <- annotate_counts(annotation, data, "mlongespiculosa")
-save(count, file = "results/12/mlongespiculosa/counts.Rdata")
-load("results/12/mlongespiculosa/counts.Rdata")
-```
-```{r dev, eval=F}
-res <- compare_models(count %>% dplyr::ungroup(), nboot = 100, bootsize = 0.1, core = 24)
 res %>%
    ggplot(aes(x = name, y = BIC)) +
    geom_violin()
@@ -772,4 +748,71 @@ res %>%
    geom_violin()
 ```
+## M Spiculigera data
+```{r dev}
+load("../../../results/12/mspiculigera/sample.Rdata")
+```
+```{r dev}
+model_XY <- sample %>%
+    dplyr::select(count_m, count_f) %>%
+    as.matrix() %>% 
+    EM_clust(sex = "XY")
+model_XO <- sample %>%
+    dplyr::select(count_m, count_f) %>%
+    as.matrix() %>% 
+    EM_clust(sex = "X0")
+sample %>%
+    dplyr::select(count_m, count_f) %>%
+    as.matrix() %>% 
+    plot_proba(model_XY$proba, sex = "XY")
+sample %>%
+    dplyr::select(count_m, count_f) %>%
+    as.matrix() %>% 
+    plot_proba(model_XO$proba, sex = "X0")
+```
+```{r dev}
+res <- compare_models(sample, nboot = 10, bootsize = 1, core = 8)
+res %>%
+    ggplot(aes(x = name, y = BIC)) +
+    geom_violin()
+res %>%
+    ggplot(aes(x = name, y = WSS_f / BSS)) +
+    geom_violin()
+res
+```
+# tmvnorm test
+```{r dev}
+library(tmvtnorm)
+mean  <- c(1, 1)
+sigma <- matrix(c(10, 0,
+                   0, 1), 2, 2)
+# Linear Constraints
+#
+# a1 <= x1 + x2 <= b2
+# a2 <= x1 - x2 <= b2
+#
+# [ a1 ] <= [ 1   1 ] [ x1 ] <= [b1]
+# [ a2 ]    [ 1  -1 ] [ x2 ]    [b2]
+a     <- c(0, 0)
+b     <- c( 30,  30)
+D     <- matrix(c(1, 1,
+                  -1, 1), 2, 2)                   
+X <- rtmvnorm(n=10000, mean, sigma, lower=a, upper=b, D=D, algorithm="gibbsR")
+plot(X, main="Gibbs sampling for multivariate normal 
+              with linear constraints according to Geweke (1991)")
+  abline(a=0, b=1, col="blue", lwd = 2)
+  abline(a=0, b=0, col="blue", lwd = 2)
+# mark linear constraints as lines
+for (i in 1:nrow(D)) {
+  abline(a=a[i]/D[i, 2], b=-D[i,1]/D[i, 2], col="red")
+  abline(a=b[i]/D[i, 2], b=-D[i,1]/D[i, 2], col="red")
+}
+```