diff --git a/src/clustering.Rmd b/src/clustering.Rmd
index 467c2f85b06f33f793fbc99b1362204c5f9145fa..a991f52ac5cea0f589f64bea913d7542e7081ef2 100644
--- a/src/clustering.Rmd
+++ b/src/clustering.Rmd
@@ -11,9 +11,9 @@ knitr::opts_chunk$set(echo = TRUE)
 
 
 ```{r}
-library(mclust)
 library(tidyverse)
 library(mvtnorm)
+library(conflicted)
 ```
 
 ## Simulation
@@ -34,6 +34,15 @@ Which becomes on the $log$ scale:
 Test slope for a given sigma matrice
 
 ```{r}
+#' Function to dislay the slope of a 2d normal distribution with covariance matrix
+#' `matrix(c(x^2, rho*x*y, rho*x*y, y^2)`
+#'
+#' @param x A number.
+#' @param y A number.
+#' @param rho A number.
+#' @return A ggplot plot
+#' @examples
+#'  test_slope(1.05, 2.05, 0.95)
 test_slope <- function(x, y, rho) {
     test <- mvtnorm::rmvnorm(1e4, mean = c(0, 0), sigma = matrix(c(x^2, rho*x*y, rho*x*y, y^2), ncol = 2), checkSymmetry = F, method = "svd") %>% 
         as_tibble()
@@ -49,6 +58,16 @@ test_slope(1.05, 2.05, 0.95)
 Simulate k-mer counts data
 
 ```{r}
+#' function to simulate kmer count under a gaussian model, with the correct
+#' sex bias for XY or XO species
+#'
+#' @param n_kmer the total number of kmers
+#' @param mean_coef the base mean of the kmers counts
+#' @param sex = c("XY", "XO") the sex of the specie
+#' @return a tibble with a c("sex, count_m, count_f) columns
+#' @examples
+#' sim_kmer(1e6, 1000, "XY")
+#' sim_kmer(1e6, 1000, "XO")
 sim_kmer <- function(n_kmer, mean_coef, sex = "XY") {
     data <- tibble(
         sex = "F",
@@ -78,6 +97,10 @@ sim_kmer <- function(n_kmer, mean_coef, sex = "XY") {
         rename(count_m = V1,
                count_f = V2)
 }
+```
+
+## XY data
+```{r}
 data <- sim_kmer(1e4, 1000, "XY")
 data %>% 
     ggplot(aes(x = count_m, y = count_f, color = sex)) +
@@ -85,16 +108,24 @@ data %>%
     coord_fixed()
 ```
 
+## XO data
 ```{r}
-data_clust = data %>% select(-c("sex")) %>% mclust::Mclust(G = 3)
-```
-```{r}
-summary(data_clust)
-plot(data_clust, what = "classification")
-plot(data_clust, what = "uncertainty")
+data <- sim_kmer(1e4, 1000, "XO")
+data %>% 
+    ggplot(aes(x = count_m, y = count_f, color = sex)) +
+    geom_point() +
+    coord_fixed()
 ```
 
+# Clustering
+
 ```{r}
+#' Function to transform GMM parameters theta format into a format easily iterable
+#'
+#' @param theta a list of paramaters
+#' @param cluster_coef a list of cluster coeficient
+#' @param sex = c("XY", "XO") the sex of the specie
+#' @return a list of parameters
 expand_theta <- function(theta, cluster_coef, sex) {
     theta_ref <- list(
     "a" = list(
@@ -117,6 +148,12 @@ expand_theta <- function(theta, cluster_coef, sex) {
     return(theta_ref)
 }
 
+#' Compute the threshold to stop the EM algorithm
+#'
+#' @param old_theta list of paramters
+#' @param theta  list of paramters.
+#' @param threshold threshold
+#' @return bool
 params_diff <- function(old_theta, theta, threshold) {
     result <- max(
         max(abs(old_theta$pi - theta$pi)),
@@ -128,6 +165,13 @@ params_diff <- function(old_theta, theta, threshold) {
     return(T)
 }
 
+#' Compute the probability of a kmers to belong to a cluster
+#'
+#' @param x a matrix of male female kmer counts
+#' @param theta list of parameters
+#' @param cluster_coef, the base mean coefficients for the cluster
+#' @param sex = c("XY", "XO") the sex of the specie
+#' @return matrix of cluster probability
 proba_point <- function(x, theta, cluster_coef, sex) {
     proba <- c()
     for (params in expand_theta(theta, cluster_coef, sex)) {
@@ -138,11 +182,27 @@ proba_point <- function(x, theta, cluster_coef, sex) {
     return(proba)
 }
 
+#' compute the log-likelihood of the model
+#'
+#' @param x a matrix of male female kmer counts
+#' @param theta list of parameters
+#' @param cluster_coef, the base mean coefficients for the cluster
+#' @param sex = c("XY", "XO") the sex of the specie
+#' @return the loglikelihood
+#' @examples
+#' add(1, 1)
+#' add(10, 1)
 loglik <- function(x, theta, cluster_coef, sex) {
     sum(log(rowSums(proba_point(x, theta, cluster_coef, sex))))
 }
 
-# EM function
+#' Compute the normalized probability of a kmers to belong to a cluster
+#'
+#' @param x a matrix of male female kmer counts
+#' @param theta list of parameters
+#' @param cluster_coef, the base mean coefficients for the cluster
+#' @param sex = c("XY", "XO") the sex of the specie
+#' @return matrix of cluster probability
 E_proba <- function(x, theta, cluster_coef, sex) {
     proba <- proba_point(x, theta, cluster_coef, sex)
     proba_norm <- rowSums(proba)
@@ -153,11 +213,23 @@ E_proba <- function(x, theta, cluster_coef, sex) {
     return(proba)
 }
 
+#' Sum the proba of each points for each clusters
+#'
+#' @param proba a matrix of autosomal female and male chromosome probability
+#' @return a vector
 E_N_clust <- function(proba) {
     colSums(proba)
 }
 
-# Function for mean update
+#' Maximize the base mean
+#'
+#' @param x a matrix of male female kmer counts
+#' @param proba a matrix of autosomal female and male chromosome probability
+#' @param mu, the base mean parameter
+#' @param N_clust the size of the clusters
+#' @param sex = c("XY", "XO") the sex of the specie
+#' @return a mean mu
+#' @examples
 M_mean <- function(x, proba, N_clust, sex) {
     mu <- 0
     for (cluster in 1:ncol(proba)) {
@@ -180,6 +252,15 @@ M_mean <- function(x, proba, N_clust, sex) {
     return(mu / 2)
 }
 
+#' Maximize covariance matric parameters
+#'
+#' @param x a matrix of male female kmer counts
+#' @param proba a matrix of autosomal female and male chromosome probability
+#' @param mu, the base mean parameter
+#' @param N_clust the size of the clusters
+#' @param cluster_coef, the base mean coefficients for the cluster
+#' @param sex = c("XY", "XO") the sex of the specie
+#' @return a list of covarance matrices.
 M_cov <- function(x, proba, mu, N_clust, cluster_coef, sex) {
     cov_clust <- list() 
     for (cluster in 1:ncol(proba)) {
@@ -194,6 +275,15 @@ M_cov <- function(x, proba, mu, N_clust, cluster_coef, sex) {
     return(sigma)
 }
 
+#' Plot the kmer counts and colorize by cluster proba
+#'
+#' @param x a matrix of male female kmer counts
+#' @param proba a matrix of autosomal female and male chromosome probability
+#' @param sex = c("XY", "XO") the sex of the specie
+#' @return A number.
+#' @examples
+#' add(1, 1)
+#' add(10, 1)
 plot_proba <- function(x, proba, sex = "XY") {
     if (sex == "XY") {
         as_tibble(x) %>% 
@@ -219,6 +309,11 @@ plot_proba <- function(x, proba, sex = "XY") {
     }
 }
 
+#' initialize EM parameters
+#'
+#' @param x a matrix of male female kmer counts
+#' @param sex = c("XY", "XO") the sex of the specie
+#' @return a list of parameters
 init_param <- function(x, sex) {
     cluster_coef <- list(
         "a" = c(2, 2),
@@ -239,6 +334,12 @@ init_param <- function(x, sex) {
     return(list(cluster_coef = cluster_coef, theta = theta))
 }
 
+#' compute the BIC of the GMM 
+#'
+#' @param x A number.
+#' @param loglik A number.
+#' @param sex = c("XY", "XO") the sex of the specie
+#' @return the BIC
 compute_bic <- function(x, loglik, sex = "XY") {
     k <- 1 + 4 * 2
     if (sex == "YX") {
@@ -247,7 +348,22 @@ compute_bic <- function(x, loglik, sex = "XY") {
     return(k * log(nrow(x)) - 2 * loglik)
 }
 
-
+#' Fix a XY or XO GMM on kmer counts
+#'
+#' @param x a matrix of male female counts
+#' @param threshold the differences in loglik of the model to stop the EM
+#' @param sex = c("XY", "XO") the sex of the specie
+#' @return a list with model informations
+#' @examples
+#' data <- sim_kmer(1e2, 1000, "XY")
+#' model_XY <- data %>%
+#'     dplyr::select(count_m, count_f) %>%
+#'     as.matrix() %>% 
+#'     EM_clust()
+#' model_XO <- data %>%
+#'     dplyr::select(count_m, count_f) %>%
+#'     as.matrix() %>% 
+#'     EM_clust(sex = "XO")
 EM_clust <- function(x, threshold = 1, sex = "XY") {
     old_loglik <- -Inf
     new_loglik <- 0
@@ -260,36 +376,105 @@ EM_clust <- function(x, threshold = 1, sex = "XY") {
         param$theta$sigma <- M_cov(x, proba, param$theta$mu, param$theta$pi, param$cluster_coef, sex)
         param$theta$pi <- param$theta$pi / nrow(x)
         new_loglik <- loglik(x, param$theta, param$cluster_coef, sex)
-        if(is.infinite(new_loglik)) {
+        if (is.infinite(new_loglik)) {
             break
         }
     }
-    return(list(proba = proba, theta = param$theta, loglik = new_loglik, BIC = compute_bic(x, new_loglik, sex)))
+   cluster <- proba %/% apply(proba, 1, max)
+   if (sex == "XY") {
+       cluster <- as.vector(cluster  %*% c(1, 2, 3))
+   } else {
+       cluster <- as.vector(cluster  %*% c(1, 2))
+   }
+   return(list(
+       proba = proba,
+       cluster = cluster,
+       theta = param$theta,
+       loglik = new_loglik,
+       BIC = compute_bic(x, new_loglik, sex),
+       XSS = BSS_WSS(x, cluster)
+  ))
 }
 
-boostrap_BIC <- function(x, sex = "XY", threshold = 1, nboot = 100, bootsize = 1000, core = 6) {
+#' Function to compile statistics bootstrap samples of the data
+#'
+#' @param x a matrix of male female kmers counts
+#' @param sex = c("XY", "XO") the sex of the specie
+#' @param threshold theshold to stop the EM algorithm
+#' @param nboot number of boostrap sample
+#' @param bootsize size of the boostrap sample (if < 0 we take a percentage of x)
+#' @param core number of cpus to use for the computation
+#' @return tibble of the statistics
+#' @examples
+#' data <- sim_kmer(1e7, 1000, "XY")
+#' res <- boostrap_EM(data, nboot = 10, bootsize = 0.01)
+boostrap_EM <- function(x, sex = "XY", threshold = 1, nboot = 100, bootsize = 1000, core = 6) {
     parallel::mclapply(as.list(1:nboot), function(iter, x, bootsize, sex) {
-        res <- x %>% 
-            dplyr::select(count_m, count_f) %>% 
-            sample_n(bootsize, replace = T) %>% 
-            as.matrix() %>% 
-            EM_clust(sex = sex)
-        res$BIC
-    }, x = x, bootsize = bootsize, sex = sex, mc.cores = 6) %>% 
-        unlist()
+        if (bootsize > 1) {
+            res <- x %>% 
+                dplyr::select(count_m, count_f) %>% 
+                sample_n(bootsize, replace = T) %>% 
+                as.matrix() %>% 
+                EM_clust(sex = sex)
+        } else {
+            res <- x %>% 
+                dplyr::select(count_m, count_f) %>% 
+                sample_frac(bootsize, replace = T) %>% 
+                as.matrix() %>% 
+                EM_clust(sex = sex)
+        }
+        if (sex == "XY") {
+            tibble(loglik = res$loglik, BIC = res$BIC, TSS = res$XSS$TSS, BSS = res$XSS$BSS, WSS_a = res$XSS$WSS[1], WSS_f = res$XSS$WSS[2], WSS_m = res$XSS$WSS[3])
+        } else {
+            
+            tibble(loglik = res$loglik, BIC = res$BIC, TSS = res$XSS$TSS, BSS = res$XSS$BSS, WSS_a = res$XSS$WSS[1], WSS_f = res$XSS$WSS[2], WSS_m = NA)
+        }
+    }, x = x, bootsize = bootsize, sex = sex, mc.cores = 6)
 }
 
-compare_BIC <- function(x, threshold = 1, nboot = 100, bootsize = 1000, core = 6) {
+#' Function to compile statistics differences between model XY and XO on boostrap
+#' samples of the data
+#'
+#' @param x a matrix of male female kmers counts
+#' @param threshold theshold to stop the EM algorithm
+#' @param nboot number of boostrap sample
+#' @param bootsize size of the boostrap sample (if < 0 we take a percentage of x)
+#' @param core number of cpus to use for the computation
+#' @param sex = c("XY", "XO") the sex of the specie
+#' @return tibble with the statistics
+#' @examples
+#' data <- sim_kmer(1e7, 1000, "XY")
+#' res <- compare_models(data, nboot = 10, bootsize = 0.01)
+compare_models <- function(x, threshold = 1, nboot = 100, bootsize = 1000, core = 6) {
     tibble(
-        BIC_XY = boostrap_BIC(x, sex = "XY", threshold = threshold, nboot = nboot, bootsize = bootsize, core = core),
-        BIC_XO = boostrap_BIC(x, sex = "X0", threshold = threshold, nboot = nboot, bootsize = bootsize, core = core)
-    )
+        model_XY = boostrap_EM(x, sex = "XY", threshold = threshold, nboot = nboot, bootsize = bootsize, core = core),
+        model_XO = boostrap_EM(x, sex = "X0", threshold = threshold, nboot = nboot, bootsize = bootsize, core = core)
+    ) %>% 
+    pivot_longer(cols = c(model_XO, model_XY)) %>% 
+    unnest(value)
+}
+
+#' Compute the BSS, WSS et TSS
+#'
+#' @param x matrix of data for the male and female counts
+#' @param cluster a vector of cluster annotation (by row)
+#' @param sex = c("XY", "XO") the sex of the specie
+#' @return a list of metrics
+#' @examples
+#' add(1, 1)
+#' add(10, 1)
+BSS_WSS <- function(x, cluster) {
+   TSS <- sum((x - colMeans(x))^2)
+   WSS <- sapply(split(as_tibble(x), cluster), function(x) sum((x - colMeans(x))^2))
+   BSS <- TSS - sum(WSS)
+   return(list(TSS = TSS, WSS = WSS, BSS = BSS))
 }
 ```
 
 # clustering XY
 
 ```{r}
+data <- sim_kmer(1e4, 1000, "XY")
 model_XY <- data %>%
     dplyr::select(count_m, count_f) %>%
     as.matrix() %>% 
@@ -303,6 +488,7 @@ data %>%
 # clustering XO
 
 ```{r}
+data <- sim_kmer(1e4, 1000, "XO")
 model_XO <- data %>%
     dplyr::select(count_m, count_f) %>%
     as.matrix() %>% 
@@ -327,8 +513,7 @@ model_XO <- data %>%
     dplyr::select(count_m, count_f) %>%
     as.matrix() %>% 
     EM_clust(sex = "XO")
-
-data <- sim_kmer(1e6, 1000, "XY")
+pchisq(-2 * (model_XY$loglik - model_XO$loglik), 5)
 ```
 
 ## For XO
@@ -343,26 +528,38 @@ model_XO <- data %>%
     dplyr::select(count_m, count_f) %>%
     as.matrix() %>% 
     EM_clust(sex = "XO")
-pchisq(-2 * (model_XY$loglik - model_XO$loglik), 4)
+pchisq(-2 * (model_XY$loglik - model_XO$loglik), 5)
 ```
 
-## Get Y k-mer
+# BIC analysis
+
+## For XY
 
 ```{r}
-res <- compare_BIC(data)
+data <- sim_kmer(1e7, 1000, "XY")
+res <- compare_models(data, nboot = 10, bootsize = 0.01)
+res %>%
+    ggplot(aes(x = name, y = BIC)) +
+    geom_violin()
 res %>%
-    pivot_longer(cols = 1:2) %>% 
-    ggplot(aes(x = name, y = value)) +
-        geom_violin()
+    ggplot(aes(x = name, y = WSS_f / BSS)) +
+    geom_violin()
+```
 
-data %>%
-    mutate(y_proba = model_XY$proba[,3]) %>% 
-    ggplot(aes(x = count_m, count_f, color = y_proba)) +
-    geom_point() +
-    theme_bw()
+## For XO
+
+```{r}
+data <- sim_kmer(1e7, 1000, "XO")
+res <- compare_models(data, nboot = 10, bootsize = 0.01)
+res %>%
+    ggplot(aes(x = name, y = BIC)) +
+    geom_violin()
+res %>%
+    ggplot(aes(x = name, y = WSS_f / BSS)) +
+    geom_violin()
 ```
 
-## With real data
+# With real data
 
 ```{r}
 data <- read_tsv("results/12/mbelari/mbelari.csv", show_col_types = FALSE)