From 13c5b8244e23f7e527b384b58e02502d6e1967ca Mon Sep 17 00:00:00 2001
From: Laurent Modolo <laurent.modolo@ens-lyon.fr>
Date: Wed, 7 Jun 2023 14:02:17 +0200
Subject: [PATCH] create new repository for clustering

---
 src/clustering.Rmd | 630 ---------------------------------------------
 1 file changed, 630 deletions(-)
 delete mode 100644 src/clustering.Rmd

diff --git a/src/clustering.Rmd b/src/clustering.Rmd
deleted file mode 100644
index a991f52..0000000
--- a/src/clustering.Rmd
+++ /dev/null
@@ -1,630 +0,0 @@
----
-title: "kmer clustering"
-author: "Laurent Modolo"
-date: "`r Sys.Date()`"
-output: html_document
----
-
-```{r setup, include=FALSE}
-knitr::opts_chunk$set(echo = TRUE)
-```
-
-
-```{r}
-library(tidyverse)
-library(mvtnorm)
-library(conflicted)
-```
-
-## Simulation
-
-If we expect to have with $X_f$ the female k-mer count and $X_m$ the male k-mer count, the following relation for a XY species:
-
-- For the autosomal chromosomes: $X_m = X_f$
-- For the X chromosomes: $X_m = 2 X_m$
-- For the Y chromosomes: $X_m = 0^+ X_f$
-
-Which becomes on the $log$ scale:
-
-- For the autosomal chromosomes: $\log(X_m) = \log(X_f)$
-- For the X chromosomes: $\log(X_m) = \log(2) + \log(X_m)$
-- For the Y chromosomes: $\log(X_m) = log(0^+) + log(X_f)$
-
-
-Test slope for a given sigma matrice
-
-```{r}
-#' Function to dislay the slope of a 2d normal distribution with covariance matrix
-#' `matrix(c(x^2, rho*x*y, rho*x*y, y^2)`
-#'
-#' @param x A number.
-#' @param y A number.
-#' @param rho A number.
-#' @return A ggplot plot
-#' @examples
-#'  test_slope(1.05, 2.05, 0.95)
-test_slope <- function(x, y, rho) {
-    test <- mvtnorm::rmvnorm(1e4, mean = c(0, 0), sigma = matrix(c(x^2, rho*x*y, rho*x*y, y^2), ncol = 2), checkSymmetry = F, method = "svd") %>% 
-        as_tibble()
-    ggplot(data = test, aes(x = V1, y = V2)) +
-        geom_point() +
-        geom_smooth(method = lm) +
-        labs(title = lm(test$V2 ~ test$V1)$coef[2]) +
-        coord_fixed()
-}
-test_slope(1.05, 2.05, 0.95)
-```
-
-Simulate k-mer counts data
-
-```{r}
-#' function to simulate kmer count under a gaussian model, with the correct
-#' sex bias for XY or XO species
-#'
-#' @param n_kmer the total number of kmers
-#' @param mean_coef the base mean of the kmers counts
-#' @param sex = c("XY", "XO") the sex of the specie
-#' @return a tibble with a c("sex, count_m, count_f) columns
-#' @examples
-#' sim_kmer(1e6, 1000, "XY")
-#' sim_kmer(1e6, 1000, "XO")
-sim_kmer <- function(n_kmer, mean_coef, sex = "XY") {
-    data <- tibble(
-        sex = "F",
-        count = mvtnorm::rmvnorm(n_kmer * .1, mean = c(1, 2)*mean_coef, sigma = matrix(c(1.05, 2, 2, 4.05) * mean_coef^1.5, ncol = 2), checkSymmetry = F, method = "svd") %>%
-            as_tibble()
-    ) %>%
-        unnest(count) %>% 
-        bind_rows(
-            tibble(
-                sex = "A",
-                count = mvtnorm::rmvnorm(n_kmer * 0.85, mean = c(2, 2)*mean_coef, sigma = matrix(c(1, .95, .95, 1) * mean_coef^1.5 * 4, ncol = 2), method = "svd")
-                %>% as_tibble()
-            ) %>% unnest(count)
-        )
-    if (sex == "XY") {
-        data <- data %>% 
-            bind_rows(
-                tibble(
-                    sex = "M",
-                    count = mvtnorm::rmvnorm(n_kmer * .05, mean = c(1, 0)*mean_coef, sigma = matrix(c(.9, .05, .05, .05) * mean_coef^1.5, ncol = 2), method = "svd")
-                    %>% as_tibble()
-                ) %>%
-                unnest(count)
-            )
-    }
-    data %>% 
-        rename(count_m = V1,
-               count_f = V2)
-}
-```
-
-## XY data
-```{r}
-data <- sim_kmer(1e4, 1000, "XY")
-data %>% 
-    ggplot(aes(x = count_m, y = count_f, color = sex)) +
-    geom_point() +
-    coord_fixed()
-```
-
-## XO data
-```{r}
-data <- sim_kmer(1e4, 1000, "XO")
-data %>% 
-    ggplot(aes(x = count_m, y = count_f, color = sex)) +
-    geom_point() +
-    coord_fixed()
-```
-
-# Clustering
-
-```{r}
-#' Function to transform GMM parameters theta format into a format easily iterable
-#'
-#' @param theta a list of paramaters
-#' @param cluster_coef a list of cluster coeficient
-#' @param sex = c("XY", "XO") the sex of the specie
-#' @return a list of parameters
-expand_theta <- function(theta, cluster_coef, sex) {
-    theta_ref <- list(
-    "a" = list(
-        "pi" = theta$pi[1],
-        "mu" = cluster_coef$a * theta$mu,
-        "sigma" = theta$sigma$a
-    ),
-    "f" = list(
-        "pi" = theta$pi[2],
-        "mu" = cluster_coef$f * theta$mu,
-        "sigma" = theta$sigma$f
-    ))
-    if (sex == "XY") {
-        theta_ref[["m"]] <- list(
-            "pi" = theta$pi[3],
-            "mu" = cluster_coef$m * theta$mu,
-            "sigma" = theta$sigma$m
-        )
-    }
-    return(theta_ref)
-}
-
-#' Compute the threshold to stop the EM algorithm
-#'
-#' @param old_theta list of paramters
-#' @param theta  list of paramters.
-#' @param threshold threshold
-#' @return bool
-params_diff <- function(old_theta, theta, threshold) {
-    result <- max(
-        max(abs(old_theta$pi - theta$pi)),
-        max(abs(old_theta$mu - theta$mu))
-    )
-    if (is.finite(result)) {
-        return(results > threshold)
-    }
-    return(T)
-}
-
-#' Compute the probability of a kmers to belong to a cluster
-#'
-#' @param x a matrix of male female kmer counts
-#' @param theta list of parameters
-#' @param cluster_coef, the base mean coefficients for the cluster
-#' @param sex = c("XY", "XO") the sex of the specie
-#' @return matrix of cluster probability
-proba_point <- function(x, theta, cluster_coef, sex) {
-    proba <- c()
-    for (params in expand_theta(theta, cluster_coef, sex)) {
-        proba <- cbind(proba, params$pi * 
-            mvtnorm::dmvnorm(x, mean = params$mu, sigma = params$sigma)
-        )
-    }
-    return(proba)
-}
-
-#' compute the log-likelihood of the model
-#'
-#' @param x a matrix of male female kmer counts
-#' @param theta list of parameters
-#' @param cluster_coef, the base mean coefficients for the cluster
-#' @param sex = c("XY", "XO") the sex of the specie
-#' @return the loglikelihood
-#' @examples
-#' add(1, 1)
-#' add(10, 1)
-loglik <- function(x, theta, cluster_coef, sex) {
-    sum(log(rowSums(proba_point(x, theta, cluster_coef, sex))))
-}
-
-#' Compute the normalized probability of a kmers to belong to a cluster
-#'
-#' @param x a matrix of male female kmer counts
-#' @param theta list of parameters
-#' @param cluster_coef, the base mean coefficients for the cluster
-#' @param sex = c("XY", "XO") the sex of the specie
-#' @return matrix of cluster probability
-E_proba <- function(x, theta, cluster_coef, sex) {
-    proba <- proba_point(x, theta, cluster_coef, sex)
-    proba_norm <- rowSums(proba)
-    for (cluster in 1:ncol(proba)) {
-        proba[, cluster] <- proba[, cluster] / proba_norm
-        proba[proba_norm == 0, cluster] <- 1 / ncol(proba)
-    }
-    return(proba)
-}
-
-#' Sum the proba of each points for each clusters
-#'
-#' @param proba a matrix of autosomal female and male chromosome probability
-#' @return a vector
-E_N_clust <- function(proba) {
-    colSums(proba)
-}
-
-#' Maximize the base mean
-#'
-#' @param x a matrix of male female kmer counts
-#' @param proba a matrix of autosomal female and male chromosome probability
-#' @param mu, the base mean parameter
-#' @param N_clust the size of the clusters
-#' @param sex = c("XY", "XO") the sex of the specie
-#' @return a mean mu
-#' @examples
-M_mean <- function(x, proba, N_clust, sex) {
-    mu <- 0
-    for (cluster in 1:ncol(proba)) {
-        if (cluster == 1) {
-            mu <- mu + 
-                mean(colSums(x * c(0.5, 0.5) * proba[, cluster]) / N_clust[cluster])
-        }
-        if (cluster == 2) {
-            mu <- mu +
-                mean(colSums(x * c(1, 0.5) * proba[, cluster]) / N_clust[cluster])
-        }
-        if (cluster == 3) {
-            mu <- mu +
-                (colSums(x * c(1, 0) * proba[, cluster]) / N_clust[cluster])[1]
-        }
-    }
-    if (sex == "XY") {
-        return(mu / 3)
-    }
-    return(mu / 2)
-}
-
-#' Maximize covariance matric parameters
-#'
-#' @param x a matrix of male female kmer counts
-#' @param proba a matrix of autosomal female and male chromosome probability
-#' @param mu, the base mean parameter
-#' @param N_clust the size of the clusters
-#' @param cluster_coef, the base mean coefficients for the cluster
-#' @param sex = c("XY", "XO") the sex of the specie
-#' @return a list of covarance matrices.
-M_cov <- function(x, proba, mu, N_clust, cluster_coef, sex) {
-    cov_clust <- list() 
-    for (cluster in 1:ncol(proba)) {
-        cov_clust[[cluster]] <- t(proba[, cluster] * (x - mu * cluster_coef[[cluster]])) %*% (x - mu * cluster_coef[[cluster]]) / N_clust[cluster]
-    }
-    sigma <- list()
-    sigma$a <- cov_clust[[1]]
-    sigma$f <- cov_clust[[2]]
-    if (sex == "XY") {
-        sigma$m <- cov_clust[[3]]
-    }
-    return(sigma)
-}
-
-#' Plot the kmer counts and colorize by cluster proba
-#'
-#' @param x a matrix of male female kmer counts
-#' @param proba a matrix of autosomal female and male chromosome probability
-#' @param sex = c("XY", "XO") the sex of the specie
-#' @return A number.
-#' @examples
-#' add(1, 1)
-#' add(10, 1)
-plot_proba <- function(x, proba, sex = "XY") {
-    if (sex == "XY") {
-        as_tibble(x) %>% 
-            mutate(
-                proba_a = proba[, 1],
-                proba_f = proba[, 2],
-                proba_m = proba[, 3],
-                clust_proba = rgb(proba_f, proba_m, proba_a, maxColorValue = 1)
-            ) %>% 
-            ggplot(aes(x = count_m, y = count_f, color = clust_proba)) +
-            geom_point() +
-            scale_color_identity()
-    } else {
-        as_tibble(x) %>% 
-            mutate(
-                proba_a = proba[, 1],
-                proba_f = proba[, 2],
-                clust_proba = rgb(proba_f, 0,  proba_a, maxColorValue = 1)
-            ) %>% 
-            ggplot(aes(x = count_m, y = count_f, color = clust_proba)) +
-            geom_point() +
-            scale_color_identity()
-    }
-}
-
-#' initialize EM parameters
-#'
-#' @param x a matrix of male female kmer counts
-#' @param sex = c("XY", "XO") the sex of the specie
-#' @return a list of parameters
-init_param <- function(x, sex) {
-    cluster_coef <- list(
-        "a" = c(2, 2),
-        "f" = c(1, 2)
-    )
-    theta <- list(
-        "pi" = c(.85, .1, .05),
-        "mu" = mean(colMeans(x)) * .5
-    )
-    theta$sigma <- list(
-            "a" = matrix(c(1, 1, 1, 1) * theta$mu, ncol = 2),
-            "f" = matrix(c(1, 1, 1, 1) * theta$mu, ncol = 2)
-    )
-    if (sex == "XY") {
-        cluster_coef$m <- c(1, 0)
-        theta$sigma$m <- matrix(c(1, 1, 1, 1) * theta$mu, ncol = 2)
-    }
-    return(list(cluster_coef = cluster_coef, theta = theta))
-}
-
-#' compute the BIC of the GMM 
-#'
-#' @param x A number.
-#' @param loglik A number.
-#' @param sex = c("XY", "XO") the sex of the specie
-#' @return the BIC
-compute_bic <- function(x, loglik, sex = "XY") {
-    k <- 1 + 4 * 2
-    if (sex == "YX") {
-        k <- k + 4
-    }
-    return(k * log(nrow(x)) - 2 * loglik)
-}
-
-#' Fix a XY or XO GMM on kmer counts
-#'
-#' @param x a matrix of male female counts
-#' @param threshold the differences in loglik of the model to stop the EM
-#' @param sex = c("XY", "XO") the sex of the specie
-#' @return a list with model informations
-#' @examples
-#' data <- sim_kmer(1e2, 1000, "XY")
-#' model_XY <- data %>%
-#'     dplyr::select(count_m, count_f) %>%
-#'     as.matrix() %>% 
-#'     EM_clust()
-#' model_XO <- data %>%
-#'     dplyr::select(count_m, count_f) %>%
-#'     as.matrix() %>% 
-#'     EM_clust(sex = "XO")
-EM_clust <- function(x, threshold = 1, sex = "XY") {
-    old_loglik <- -Inf
-    new_loglik <- 0
-    param <- init_param(x, sex)
-    while (abs(new_loglik - old_loglik) > threshold) {
-        old_loglik <- loglik(x, param$theta, param$cluster_coef, sex)
-        proba <- E_proba(x, param$theta, param$cluster_coef, sex)
-        param$theta$pi <- E_N_clust(proba)
-        param$theta$mu <- M_mean(x, proba, param$theta$pi, sex)
-        param$theta$sigma <- M_cov(x, proba, param$theta$mu, param$theta$pi, param$cluster_coef, sex)
-        param$theta$pi <- param$theta$pi / nrow(x)
-        new_loglik <- loglik(x, param$theta, param$cluster_coef, sex)
-        if (is.infinite(new_loglik)) {
-            break
-        }
-    }
-   cluster <- proba %/% apply(proba, 1, max)
-   if (sex == "XY") {
-       cluster <- as.vector(cluster  %*% c(1, 2, 3))
-   } else {
-       cluster <- as.vector(cluster  %*% c(1, 2))
-   }
-   return(list(
-       proba = proba,
-       cluster = cluster,
-       theta = param$theta,
-       loglik = new_loglik,
-       BIC = compute_bic(x, new_loglik, sex),
-       XSS = BSS_WSS(x, cluster)
-  ))
-}
-
-#' Function to compile statistics bootstrap samples of the data
-#'
-#' @param x a matrix of male female kmers counts
-#' @param sex = c("XY", "XO") the sex of the specie
-#' @param threshold theshold to stop the EM algorithm
-#' @param nboot number of boostrap sample
-#' @param bootsize size of the boostrap sample (if < 0 we take a percentage of x)
-#' @param core number of cpus to use for the computation
-#' @return tibble of the statistics
-#' @examples
-#' data <- sim_kmer(1e7, 1000, "XY")
-#' res <- boostrap_EM(data, nboot = 10, bootsize = 0.01)
-boostrap_EM <- function(x, sex = "XY", threshold = 1, nboot = 100, bootsize = 1000, core = 6) {
-    parallel::mclapply(as.list(1:nboot), function(iter, x, bootsize, sex) {
-        if (bootsize > 1) {
-            res <- x %>% 
-                dplyr::select(count_m, count_f) %>% 
-                sample_n(bootsize, replace = T) %>% 
-                as.matrix() %>% 
-                EM_clust(sex = sex)
-        } else {
-            res <- x %>% 
-                dplyr::select(count_m, count_f) %>% 
-                sample_frac(bootsize, replace = T) %>% 
-                as.matrix() %>% 
-                EM_clust(sex = sex)
-        }
-        if (sex == "XY") {
-            tibble(loglik = res$loglik, BIC = res$BIC, TSS = res$XSS$TSS, BSS = res$XSS$BSS, WSS_a = res$XSS$WSS[1], WSS_f = res$XSS$WSS[2], WSS_m = res$XSS$WSS[3])
-        } else {
-            
-            tibble(loglik = res$loglik, BIC = res$BIC, TSS = res$XSS$TSS, BSS = res$XSS$BSS, WSS_a = res$XSS$WSS[1], WSS_f = res$XSS$WSS[2], WSS_m = NA)
-        }
-    }, x = x, bootsize = bootsize, sex = sex, mc.cores = 6)
-}
-
-#' Function to compile statistics differences between model XY and XO on boostrap
-#' samples of the data
-#'
-#' @param x a matrix of male female kmers counts
-#' @param threshold theshold to stop the EM algorithm
-#' @param nboot number of boostrap sample
-#' @param bootsize size of the boostrap sample (if < 0 we take a percentage of x)
-#' @param core number of cpus to use for the computation
-#' @param sex = c("XY", "XO") the sex of the specie
-#' @return tibble with the statistics
-#' @examples
-#' data <- sim_kmer(1e7, 1000, "XY")
-#' res <- compare_models(data, nboot = 10, bootsize = 0.01)
-compare_models <- function(x, threshold = 1, nboot = 100, bootsize = 1000, core = 6) {
-    tibble(
-        model_XY = boostrap_EM(x, sex = "XY", threshold = threshold, nboot = nboot, bootsize = bootsize, core = core),
-        model_XO = boostrap_EM(x, sex = "X0", threshold = threshold, nboot = nboot, bootsize = bootsize, core = core)
-    ) %>% 
-    pivot_longer(cols = c(model_XO, model_XY)) %>% 
-    unnest(value)
-}
-
-#' Compute the BSS, WSS et TSS
-#'
-#' @param x matrix of data for the male and female counts
-#' @param cluster a vector of cluster annotation (by row)
-#' @param sex = c("XY", "XO") the sex of the specie
-#' @return a list of metrics
-#' @examples
-#' add(1, 1)
-#' add(10, 1)
-BSS_WSS <- function(x, cluster) {
-   TSS <- sum((x - colMeans(x))^2)
-   WSS <- sapply(split(as_tibble(x), cluster), function(x) sum((x - colMeans(x))^2))
-   BSS <- TSS - sum(WSS)
-   return(list(TSS = TSS, WSS = WSS, BSS = BSS))
-}
-```
-
-# clustering XY
-
-```{r}
-data <- sim_kmer(1e4, 1000, "XY")
-model_XY <- data %>%
-    dplyr::select(count_m, count_f) %>%
-    as.matrix() %>% 
-    EM_clust()
-data %>%
-    dplyr::select(count_m, count_f) %>%
-    as.matrix() %>% 
-    plot_proba(model_XY$proba)
-```
-    
-# clustering XO
-
-```{r}
-data <- sim_kmer(1e4, 1000, "XO")
-model_XO <- data %>%
-    dplyr::select(count_m, count_f) %>%
-    as.matrix() %>% 
-    EM_clust(sex = "X0")
-data %>%
-    dplyr::select(count_m, count_f) %>%
-    as.matrix() %>% 
-    plot_proba(model_XO$proba, sex = "X0")
-```
-
-# LRT
-
-## For XY
-
-```{r}
-data <- sim_kmer(1e2, 1000, "XY")
-model_XY <- data %>%
-    dplyr::select(count_m, count_f) %>%
-    as.matrix() %>% 
-    EM_clust()
-model_XO <- data %>%
-    dplyr::select(count_m, count_f) %>%
-    as.matrix() %>% 
-    EM_clust(sex = "XO")
-pchisq(-2 * (model_XY$loglik - model_XO$loglik), 5)
-```
-
-## For XO
-
-```{r}
-data <- sim_kmer(1e2, 1000, "XO")
-model_XY <- data %>%
-    dplyr::select(count_m, count_f) %>%
-    as.matrix() %>% 
-    EM_clust()
-model_XO <- data %>%
-    dplyr::select(count_m, count_f) %>%
-    as.matrix() %>% 
-    EM_clust(sex = "XO")
-pchisq(-2 * (model_XY$loglik - model_XO$loglik), 5)
-```
-
-# BIC analysis
-
-## For XY
-
-```{r}
-data <- sim_kmer(1e7, 1000, "XY")
-res <- compare_models(data, nboot = 10, bootsize = 0.01)
-res %>%
-    ggplot(aes(x = name, y = BIC)) +
-    geom_violin()
-res %>%
-    ggplot(aes(x = name, y = WSS_f / BSS)) +
-    geom_violin()
-```
-
-## For XO
-
-```{r}
-data <- sim_kmer(1e7, 1000, "XO")
-res <- compare_models(data, nboot = 10, bootsize = 0.01)
-res %>%
-    ggplot(aes(x = name, y = BIC)) +
-    geom_violin()
-res %>%
-    ggplot(aes(x = name, y = WSS_f / BSS)) +
-    geom_violin()
-```
-
-# With real data
-
-```{r}
-data <- read_tsv("results/12/mbelari/mbelari.csv", show_col_types = FALSE)
-format(object.size(data), units = "Mb")
-```
-```{r}
-annotation <- read_csv("data/sample.csv", show_col_types = FALSE) %>% 
-  pivot_longer(!c(sex, specie), names_to = "read", values_to = "file") %>% 
-  mutate(
-    file = gsub("/scratch/Bio/lmodolo/kmer_diff/data/.*/", "", file, perl = T),
-    file = gsub("\\.fasta\\.gz", "", file, perl = T)
-  ) %>% 
-  mutate(
-    file = paste0(file, ".csv")
-  ) %>% 
-  select(!c(read)) %>% 
-  group_by(specie, sex) %>% 
-  nest(.key = "files")
-```
-
-```{r}
-count <- annotation %>% 
-  group_by(specie) %>% 
-  nest(.key = "sex") %>% 
-  mutate(count = lapply(sex, function(files, data){
-    files_f <- files %>% filter(sex == "female") %>% unnest(files) %>% pull(file) %>% as.vector()
-    files_m <- files %>% filter(sex == "male") %>% unnest(files) %>% pull(file) %>% as.vector()
-    data %>% 
-      select(kmer) %>% 
-      mutate(
-         female = data %>% select(any_of(files_f)) %>% rowMeans(),
-         male = data %>% select(any_of(files_m)) %>% rowMeans()
-      )
-  }, data = data)) %>%
-  unnest(sex) %>%
-  unnest(count)
-save(count, file = "results/12/mbelari/counts.Rdata")
-```
-
-
-## M belari data
-
-```{r}
-load("results/12/mbelari/counts.Rdata")
-```
-
-```{r}
-s_count <- count %>%
-    ungroup() %>% 
-    sample_frac(0.01) %>% 
-    dplyr::select(male, female) %>% 
-    mutate(
-        count_m = log1p(male),
-        count_f = log1p(female)
-    )
-model_XY <-  s_count %>%
-    as.matrix() %>% 
-    EM_clust()
-model_XO <- s_count %>%
-    as.matrix() %>% 
-    EM_clust(sex = "XO")
-model_XO$BIC
-model_XY$BIC
-
-s_count %>% 
-    as.matrix() %>% 
-    plot_proba(model_XO$proba, sex = "XO")
-```
\ No newline at end of file
-- 
GitLab