From cd0c90c476abf60335fcbcc2c9e86e05ff11bc4b Mon Sep 17 00:00:00 2001
From: Laurent Modolo <laurent.modolo@ens-lyon.fr>
Date: Fri, 23 Jun 2023 13:38:44 +0200
Subject: [PATCH] add clustering on the log scale

---
 dev/flat_full.Rmd | 249 +++++++++++++++++++++++++++-------------------
 1 file changed, 146 insertions(+), 103 deletions(-)

diff --git a/dev/flat_full.Rmd b/dev/flat_full.Rmd
index 14d833b..61d773a 100644
--- a/dev/flat_full.Rmd
+++ b/dev/flat_full.Rmd
@@ -2,12 +2,13 @@
 title: "flat_full.Rmd for working package"
 output: html_document
 editor_options: 
-  chunk_output_type: console
+  chunk_output_type: inline
 ---
 
 ```{r load}
 library(kmerclust)
 library(tidyverse)
+library(parallel)
 ```
 
 ```{r development-load}
@@ -59,7 +60,7 @@ sim_kmer <- function(n_kmer, mean_coef, sex = "XY", count = F) {
         bind_rows(
             tibble(
                 sex = "A",
-                count = mvtnorm::rmvnorm(n_kmer * 0.85, mean = c(2, 2)*mean_coef, sigma = matrix(c(1, .95, .95, 1) * mean_coef^1.5 * 4, ncol = 2), method = "svd")
+                count = mvtnorm::rmvnorm(n_kmer * 0.85, mean = c(1, 1)*mean_coef, sigma = matrix(c(1, .95, .95, 1) * mean_coef^1.5 * 4, ncol = 2), method = "svd")
                 %>% as_tibble(.name_repair = "universal")
             ) %>% unnest(c(count))
         )
@@ -123,18 +124,18 @@ expand_theta <- function(theta, cluster_coef, sex) {
     theta_ref <- list(
     "a" = list(
         "pi" = theta$pi[1],
-        "mu" = cluster_coef$a * theta$mu,
+        "mu" = cluster_coef$a + theta$mu,
         "sigma" = theta$sigma$a
     ),
     "f" = list(
         "pi" = theta$pi[2],
-        "mu" = cluster_coef$f * theta$mu,
+        "mu" = cluster_coef$f + theta$mu,
         "sigma" = theta$sigma$f
     ))
     if (sex == "XY") {
         theta_ref[["m"]] <- list(
             "pi" = theta$pi[3],
-            "mu" = cluster_coef$m * theta$mu,
+            "mu" = (cluster_coef$m + theta$mu) * c(1, 0),
             "sigma" = theta$sigma$m
         )
     }
@@ -243,8 +244,14 @@ E_N_clust <- function(proba) {
 M_mean <- function(x, proba, N_clust, cluster_coef, sex) {
     mu <- 0
     for (cluster in 1:ncol(proba)) {
-        mu <- mu + 
-          mean(colSums(x * cluster_coef[[cluster]] * proba[, cluster]) / N_clust[cluster])
+        if (cluster == 3) {
+          mu <- mu + 
+            mean(colSums((x - cluster_coef[[cluster]]) * c(1, 0) * proba[, cluster]) / N_clust[cluster])
+        } else {
+          
+          mu <- mu + 
+            mean(colSums((x - cluster_coef[[cluster]]) * proba[, cluster]) / N_clust[cluster])
+        }
     }
     if (sex == "XY") {
         return(mu / 3)
@@ -266,7 +273,11 @@ M_mean <- function(x, proba, N_clust, cluster_coef, sex) {
 M_cov <- function(x, proba, mu, N_clust, cluster_coef, sex) {
     cov_clust <- list() 
     for (cluster in 1:ncol(proba)) {
-        cov_clust[[cluster]] <- t(proba[, cluster] * (x - mu * cluster_coef[[cluster]])) %*% (x - mu * cluster_coef[[cluster]]) / N_clust[cluster]
+      if (cluster == 3) {
+        cov_clust[[cluster]] <- t(proba[, cluster] * (x - (mu + cluster_coef[[cluster]]) * c(1, 0))) %*% (x - (mu + cluster_coef[[cluster]]) * c(1, 0)) / N_clust[cluster]
+      } else {
+        cov_clust[[cluster]] <- t(proba[, cluster] * (x - mu + cluster_coef[[cluster]])) %*% (x - mu + cluster_coef[[cluster]]) / N_clust[cluster]
+      }
     }
     sigma <- list()
     sigma$a <- cov_clust[[1]]
@@ -320,10 +331,6 @@ plot_proba <- function(x, proba, sex = "XY") {
             theme_bw()
     }
 }
-sample %>%
-    dplyr::select(count_m, count_f) %>%
-    as.matrix() %>% 
-    plot_proba(model_XY$proba, sex = "XY")
 ```
 
 ```{r fun-init_param}
@@ -334,19 +341,19 @@ sample %>%
 #' @return a list of parameters
 init_param <- function(x, sex) {
     cluster_coef <- list(
-        "a" = c(1, 1),
-        "f" = c(1, 0.5)
+        "a" = c(0, 0),
+        "f" = c(0, log(2))
     )
     theta <- list(
         "pi" = c(.85, .1, .05),
-        "mu" = mean(colMeans(x)) * .5
+        "mu" = mean(colMeans(x))
     )
     theta$sigma <- list(
             "a" = matrix(c(1, 1, 1, 1) * theta$mu, ncol = 2),
             "f" = matrix(c(1, 1, 1, 1) * theta$mu, ncol = 2)
     )
     if (sex == "XY") {
-        cluster_coef$m <- c(1, 0)
+        cluster_coef$m <- c(0, 0)
         theta$sigma$m <- matrix(c(1, 1, 1, 1) * theta$mu, ncol = 2)
     }
     return(list(cluster_coef = cluster_coef, theta = theta))
@@ -392,13 +399,13 @@ EM_clust <- function(x, threshold = 1, sex = "XY") {
     new_loglik <- 0
     param <- init_param(x, sex)
     while (abs(new_loglik - old_loglik) > threshold) {
-        old_loglik <- loglik(x, param$theta, param$cluster_coef, sex)
-        proba <- E_proba(x, param$theta, param$cluster_coef, sex)
-        param$theta$pi <- E_N_clust(proba)
-        param$theta$mu <- M_mean(x, proba, param$theta$pi, params$cluster_coef, sex)
-        param$theta$sigma <- M_cov(x, proba, param$theta$mu, param$theta$pi, param$cluster_coef, sex)
+        old_loglik <- loglik(x = x, theta = param$theta, cluster_coef = param$cluster_coef, sex = sex)
+        proba <- E_proba(x = x, theta = param$theta, cluster_coef = param$cluster_coef, sex = sex)
+        param$theta$pi <- E_N_clust(proba = proba)
+        param$theta$mu <- M_mean(x = x, proba = proba, N_clust = param$theta$pi, cluster_coef = param$cluster_coef, sex = sex)
+        param$theta$sigma <- M_cov(x = x, proba = proba, mu = param$theta$mu, N_clust = param$theta$pi, cluster_coef = param$cluster_coef, sex = sex)
         param$theta$pi <- param$theta$pi / nrow(x)
-        new_loglik <- loglik(x, param$theta, param$cluster_coef, sex)
+        new_loglik <- loglik(x = x, theta = param$theta, cluster_coef = param$cluster_coef, sex = sex)
         if (is.infinite(new_loglik)) {
             break
         }
@@ -505,13 +512,21 @@ BSS_WSS <- function(x, cluster) {
 # clustering XY
 
 ```{r clustering_XY}
-data <- sim_kmer(1e6, 1000, "XY", count =T)
+data <- sim_kmer(1e5, 1000, "XY", count =T)
 model_XY <- data %>%
     dplyr::select(count_m, count_f) %>%
+    mutate(
+      count_m = log1p(count_m),
+      count_f = log1p(count_f),
+    ) %>% 
     as.matrix() %>% 
     EM_clust()
 data %>%
     dplyr::select(count_m, count_f) %>%
+    mutate(
+      count_m = log1p(count_m),
+      count_f = log1p(count_f),
+    ) %>% 
     as.matrix() %>% 
     plot_proba(model_XY$proba)
 ```
@@ -519,13 +534,23 @@ data %>%
 # clustering XO
 
 ```{r clustering_XO}
-data <- sim_kmer(1e4, 1000, "XO")
+data <- sim_kmer(1e5, 1000, "XO")
 model_XO <- data %>%
     dplyr::select(count_m, count_f) %>%
+    mutate(
+      count_m = log1p(count_m),
+      count_f = log1p(count_f),
+    ) %>% 
+    drop_na() %>% 
     as.matrix() %>% 
     EM_clust(sex = "X0")
 data %>%
     dplyr::select(count_m, count_f) %>%
+    mutate(
+      count_m = log1p(count_m),
+      count_f = log1p(count_f),
+    ) %>% 
+    drop_na() %>% 
     as.matrix() %>% 
     plot_proba(model_XO$proba, sex = "X0")
 ```
@@ -656,7 +681,7 @@ annotate_counts <- function(annotation, count, name) {
 ## M belari data
 
 ```{r dev}
-load("../../results/12/mbelari/sample.Rdata", v = T)
+load("../../../results/12/mbelari/sample.Rdata")
 ```
 
 ```{r dev}
@@ -679,91 +704,42 @@ sample %>%
 ```
 
 ```{r dev}
-sample %>% 
-  select(count_m, count_f) %>% 
-  sample_frac(0.01) %>% 
-  mutate(
-    count_m = expm1(count_m),
-    count_f = expm1(count_f),
-  ) %>% 
-  ggplot(aes(x = count_m, y = count_f)) +
-  geom_point() +
-  theme_bw()
+res <- compare_models(sample, nboot = 10, bootsize = 1, core = 8)
+res %>%
+    ggplot(aes(x = name, y = BIC)) +
+    geom_violin()
+res %>%
+    ggplot(aes(x = name, y = WSS_f / BSS)) +
+    geom_violin()
 ```
 
+## M Longespiculosa data
+
 ```{r dev}
-sample %>% 
-  select(count_m, count_f) %>% 
-  sample_frac(0.01) %>% 
-  ggplot(aes(x = count_m, y = count_f)) +
-  geom_point() +
-  theme_bw()
+load("../../../results/12/mlongespiculosa/sample.Rdata")
 ```
 
 ```{r dev}
-library(flexmix)
-count <- sample %>%
+model_XY <- sample %>%
     dplyr::select(count_m, count_f) %>%
-    mutate(
-      count_m = round(expm1(count_m)),
-      count_f = round(expm1(count_f)),
-    ) %>% 
-    filter(count_m > 0) %>% 
-    mutate(
-      ratio = count_f / count_m
-    ) %>% 
-    sample_n(10000)
-
-count %>% 
-ggplot(aes(x = ratio)) +
-  geom_histogram() +
-  scale_x_log10()
-  
-m_xy <- flexmix(
-  ratio ~ ratio,
-  k = 3,
-  data = count,
-  )
-m_xo <- flexmix(
-  count_m ~ count_f,
-  k = 2,
-  data = count,
-  model = FLXglm(family = "poisson")
-  )
-summary(m_xy)
-parameters(m_xy, component = 1)
-parameters(m_xy, component = 2)
-parameters(m_xy, component = 3)
-plot(m_xy)
-rm_xy <- refit(m_xy)
-summary(rm_xy)
-count %>% 
-  mutate(
-    proba1 = m_xy@posterior$scaled[, 1],
-    proba2 = m_xy@posterior$scaled[, 2],
-    proba3 = m_xy@posterior$scaled[, 3]
-  ) %>%
-  pivot_longer(cols = c(proba1, proba2, proba3)) %>%
-  ggplot(aes(x = count_m, y = count_f, color = value)) +
-  geom_point() +
-  facet_wrap(~name) +
-  coord_equal() +
-  theme_bw()
-```
-
-
-
-```{r dev, eval=F}
-data <- readr::read_tsv("../../results/12/mlongespiculosa/mlongespiculosa.csv", show_col_types = FALSE)
-format(object.size(data), units = "Mb")
-annotation <- parse_annotation("../../data/sample.csv")
-count <- annotate_counts(annotation, data, "mlongespiculosa")
-save(count, file = "results/12/mlongespiculosa/counts.Rdata")
-load("results/12/mlongespiculosa/counts.Rdata")
-```
-
-```{r dev, eval=F}
-res <- compare_models(count %>% dplyr::ungroup(), nboot = 100, bootsize = 0.1, core = 24)
+    as.matrix() %>% 
+    EM_clust(sex = "XY")
+model_XO <- sample %>%
+    dplyr::select(count_m, count_f) %>%
+    as.matrix() %>% 
+    EM_clust(sex = "X0")
+sample %>%
+    dplyr::select(count_m, count_f) %>%
+    as.matrix() %>% 
+    plot_proba(model_XY$proba, sex = "XY")
+sample %>%
+    dplyr::select(count_m, count_f) %>%
+    as.matrix() %>% 
+    plot_proba(model_XO$proba, sex = "X0")
+```
+
+```{r dev}
+res <- compare_models(sample, nboot = 10, bootsize = 1, core = 8)
 res %>%
     ggplot(aes(x = name, y = BIC)) +
     geom_violin()
@@ -772,4 +748,71 @@ res %>%
     geom_violin()
 ```
 
+## M Spiculigera data
+
+```{r dev}
+load("../../../results/12/mspiculigera/sample.Rdata")
+```
 
+```{r dev}
+model_XY <- sample %>%
+    dplyr::select(count_m, count_f) %>%
+    as.matrix() %>% 
+    EM_clust(sex = "XY")
+model_XO <- sample %>%
+    dplyr::select(count_m, count_f) %>%
+    as.matrix() %>% 
+    EM_clust(sex = "X0")
+sample %>%
+    dplyr::select(count_m, count_f) %>%
+    as.matrix() %>% 
+    plot_proba(model_XY$proba, sex = "XY")
+sample %>%
+    dplyr::select(count_m, count_f) %>%
+    as.matrix() %>% 
+    plot_proba(model_XO$proba, sex = "X0")
+```
+
+```{r dev}
+res <- compare_models(sample, nboot = 10, bootsize = 1, core = 8)
+res %>%
+    ggplot(aes(x = name, y = BIC)) +
+    geom_violin()
+res %>%
+    ggplot(aes(x = name, y = WSS_f / BSS)) +
+    geom_violin()
+res
+```
+
+# tmvnorm test
+
+```{r dev}
+library(tmvtnorm)
+
+mean  <- c(1, 1)
+sigma <- matrix(c(10, 0,
+                   0, 1), 2, 2)
+# Linear Constraints
+#
+# a1 <= x1 + x2 <= b2
+# a2 <= x1 - x2 <= b2
+#
+# [ a1 ] <= [ 1   1 ] [ x1 ] <= [b1]
+# [ a2 ]    [ 1  -1 ] [ x2 ]    [b2]
+a     <- c(0, 0)
+b     <- c( 30,  30)
+D     <- matrix(c(1, 1,
+                  -1, 1), 2, 2)                   
+
+X <- rtmvnorm(n=10000, mean, sigma, lower=a, upper=b, D=D, algorithm="gibbsR")
+plot(X, main="Gibbs sampling for multivariate normal 
+              with linear constraints according to Geweke (1991)")
+
+  abline(a=0, b=1, col="blue", lwd = 2)
+  abline(a=0, b=0, col="blue", lwd = 2)
+# mark linear constraints as lines
+for (i in 1:nrow(D)) {
+  abline(a=a[i]/D[i, 2], b=-D[i,1]/D[i, 2], col="red")
+  abline(a=b[i]/D[i, 2], b=-D[i,1]/D[i, 2], col="red")
+}
+```
-- 
GitLab