diff --git a/src/clustering.Rmd b/src/clustering.Rmd index 467c2f85b06f33f793fbc99b1362204c5f9145fa..a991f52ac5cea0f589f64bea913d7542e7081ef2 100644 --- a/src/clustering.Rmd +++ b/src/clustering.Rmd @@ -11,9 +11,9 @@ knitr::opts_chunk$set(echo = TRUE) ```{r} -library(mclust) library(tidyverse) library(mvtnorm) +library(conflicted) ``` ## Simulation @@ -34,6 +34,15 @@ Which becomes on the $log$ scale: Test slope for a given sigma matrice ```{r} +#' Function to dislay the slope of a 2d normal distribution with covariance matrix +#' `matrix(c(x^2, rho*x*y, rho*x*y, y^2)` +#' +#' @param x A number. +#' @param y A number. +#' @param rho A number. +#' @return A ggplot plot +#' @examples +#' test_slope(1.05, 2.05, 0.95) test_slope <- function(x, y, rho) { test <- mvtnorm::rmvnorm(1e4, mean = c(0, 0), sigma = matrix(c(x^2, rho*x*y, rho*x*y, y^2), ncol = 2), checkSymmetry = F, method = "svd") %>% as_tibble() @@ -49,6 +58,16 @@ test_slope(1.05, 2.05, 0.95) Simulate k-mer counts data ```{r} +#' function to simulate kmer count under a gaussian model, with the correct +#' sex bias for XY or XO species +#' +#' @param n_kmer the total number of kmers +#' @param mean_coef the base mean of the kmers counts +#' @param sex = c("XY", "XO") the sex of the specie +#' @return a tibble with a c("sex, count_m, count_f) columns +#' @examples +#' sim_kmer(1e6, 1000, "XY") +#' sim_kmer(1e6, 1000, "XO") sim_kmer <- function(n_kmer, mean_coef, sex = "XY") { data <- tibble( sex = "F", @@ -78,6 +97,10 @@ sim_kmer <- function(n_kmer, mean_coef, sex = "XY") { rename(count_m = V1, count_f = V2) } +``` + +## XY data +```{r} data <- sim_kmer(1e4, 1000, "XY") data %>% ggplot(aes(x = count_m, y = count_f, color = sex)) + @@ -85,16 +108,24 @@ data %>% coord_fixed() ``` +## XO data ```{r} -data_clust = data %>% select(-c("sex")) %>% mclust::Mclust(G = 3) -``` -```{r} -summary(data_clust) -plot(data_clust, what = "classification") -plot(data_clust, what = "uncertainty") +data <- sim_kmer(1e4, 1000, "XO") +data %>% + ggplot(aes(x = count_m, y = count_f, color = sex)) + + geom_point() + + coord_fixed() ``` +# Clustering + ```{r} +#' Function to transform GMM parameters theta format into a format easily iterable +#' +#' @param theta a list of paramaters +#' @param cluster_coef a list of cluster coeficient +#' @param sex = c("XY", "XO") the sex of the specie +#' @return a list of parameters expand_theta <- function(theta, cluster_coef, sex) { theta_ref <- list( "a" = list( @@ -117,6 +148,12 @@ expand_theta <- function(theta, cluster_coef, sex) { return(theta_ref) } +#' Compute the threshold to stop the EM algorithm +#' +#' @param old_theta list of paramters +#' @param theta list of paramters. +#' @param threshold threshold +#' @return bool params_diff <- function(old_theta, theta, threshold) { result <- max( max(abs(old_theta$pi - theta$pi)), @@ -128,6 +165,13 @@ params_diff <- function(old_theta, theta, threshold) { return(T) } +#' Compute the probability of a kmers to belong to a cluster +#' +#' @param x a matrix of male female kmer counts +#' @param theta list of parameters +#' @param cluster_coef, the base mean coefficients for the cluster +#' @param sex = c("XY", "XO") the sex of the specie +#' @return matrix of cluster probability proba_point <- function(x, theta, cluster_coef, sex) { proba <- c() for (params in expand_theta(theta, cluster_coef, sex)) { @@ -138,11 +182,27 @@ proba_point <- function(x, theta, cluster_coef, sex) { return(proba) } +#' compute the log-likelihood of the model +#' +#' @param x a matrix of male female kmer counts +#' @param theta list of parameters +#' @param cluster_coef, the base mean coefficients for the cluster +#' @param sex = c("XY", "XO") the sex of the specie +#' @return the loglikelihood +#' @examples +#' add(1, 1) +#' add(10, 1) loglik <- function(x, theta, cluster_coef, sex) { sum(log(rowSums(proba_point(x, theta, cluster_coef, sex)))) } -# EM function +#' Compute the normalized probability of a kmers to belong to a cluster +#' +#' @param x a matrix of male female kmer counts +#' @param theta list of parameters +#' @param cluster_coef, the base mean coefficients for the cluster +#' @param sex = c("XY", "XO") the sex of the specie +#' @return matrix of cluster probability E_proba <- function(x, theta, cluster_coef, sex) { proba <- proba_point(x, theta, cluster_coef, sex) proba_norm <- rowSums(proba) @@ -153,11 +213,23 @@ E_proba <- function(x, theta, cluster_coef, sex) { return(proba) } +#' Sum the proba of each points for each clusters +#' +#' @param proba a matrix of autosomal female and male chromosome probability +#' @return a vector E_N_clust <- function(proba) { colSums(proba) } -# Function for mean update +#' Maximize the base mean +#' +#' @param x a matrix of male female kmer counts +#' @param proba a matrix of autosomal female and male chromosome probability +#' @param mu, the base mean parameter +#' @param N_clust the size of the clusters +#' @param sex = c("XY", "XO") the sex of the specie +#' @return a mean mu +#' @examples M_mean <- function(x, proba, N_clust, sex) { mu <- 0 for (cluster in 1:ncol(proba)) { @@ -180,6 +252,15 @@ M_mean <- function(x, proba, N_clust, sex) { return(mu / 2) } +#' Maximize covariance matric parameters +#' +#' @param x a matrix of male female kmer counts +#' @param proba a matrix of autosomal female and male chromosome probability +#' @param mu, the base mean parameter +#' @param N_clust the size of the clusters +#' @param cluster_coef, the base mean coefficients for the cluster +#' @param sex = c("XY", "XO") the sex of the specie +#' @return a list of covarance matrices. M_cov <- function(x, proba, mu, N_clust, cluster_coef, sex) { cov_clust <- list() for (cluster in 1:ncol(proba)) { @@ -194,6 +275,15 @@ M_cov <- function(x, proba, mu, N_clust, cluster_coef, sex) { return(sigma) } +#' Plot the kmer counts and colorize by cluster proba +#' +#' @param x a matrix of male female kmer counts +#' @param proba a matrix of autosomal female and male chromosome probability +#' @param sex = c("XY", "XO") the sex of the specie +#' @return A number. +#' @examples +#' add(1, 1) +#' add(10, 1) plot_proba <- function(x, proba, sex = "XY") { if (sex == "XY") { as_tibble(x) %>% @@ -219,6 +309,11 @@ plot_proba <- function(x, proba, sex = "XY") { } } +#' initialize EM parameters +#' +#' @param x a matrix of male female kmer counts +#' @param sex = c("XY", "XO") the sex of the specie +#' @return a list of parameters init_param <- function(x, sex) { cluster_coef <- list( "a" = c(2, 2), @@ -239,6 +334,12 @@ init_param <- function(x, sex) { return(list(cluster_coef = cluster_coef, theta = theta)) } +#' compute the BIC of the GMM +#' +#' @param x A number. +#' @param loglik A number. +#' @param sex = c("XY", "XO") the sex of the specie +#' @return the BIC compute_bic <- function(x, loglik, sex = "XY") { k <- 1 + 4 * 2 if (sex == "YX") { @@ -247,7 +348,22 @@ compute_bic <- function(x, loglik, sex = "XY") { return(k * log(nrow(x)) - 2 * loglik) } - +#' Fix a XY or XO GMM on kmer counts +#' +#' @param x a matrix of male female counts +#' @param threshold the differences in loglik of the model to stop the EM +#' @param sex = c("XY", "XO") the sex of the specie +#' @return a list with model informations +#' @examples +#' data <- sim_kmer(1e2, 1000, "XY") +#' model_XY <- data %>% +#' dplyr::select(count_m, count_f) %>% +#' as.matrix() %>% +#' EM_clust() +#' model_XO <- data %>% +#' dplyr::select(count_m, count_f) %>% +#' as.matrix() %>% +#' EM_clust(sex = "XO") EM_clust <- function(x, threshold = 1, sex = "XY") { old_loglik <- -Inf new_loglik <- 0 @@ -260,36 +376,105 @@ EM_clust <- function(x, threshold = 1, sex = "XY") { param$theta$sigma <- M_cov(x, proba, param$theta$mu, param$theta$pi, param$cluster_coef, sex) param$theta$pi <- param$theta$pi / nrow(x) new_loglik <- loglik(x, param$theta, param$cluster_coef, sex) - if(is.infinite(new_loglik)) { + if (is.infinite(new_loglik)) { break } } - return(list(proba = proba, theta = param$theta, loglik = new_loglik, BIC = compute_bic(x, new_loglik, sex))) + cluster <- proba %/% apply(proba, 1, max) + if (sex == "XY") { + cluster <- as.vector(cluster %*% c(1, 2, 3)) + } else { + cluster <- as.vector(cluster %*% c(1, 2)) + } + return(list( + proba = proba, + cluster = cluster, + theta = param$theta, + loglik = new_loglik, + BIC = compute_bic(x, new_loglik, sex), + XSS = BSS_WSS(x, cluster) + )) } -boostrap_BIC <- function(x, sex = "XY", threshold = 1, nboot = 100, bootsize = 1000, core = 6) { +#' Function to compile statistics bootstrap samples of the data +#' +#' @param x a matrix of male female kmers counts +#' @param sex = c("XY", "XO") the sex of the specie +#' @param threshold theshold to stop the EM algorithm +#' @param nboot number of boostrap sample +#' @param bootsize size of the boostrap sample (if < 0 we take a percentage of x) +#' @param core number of cpus to use for the computation +#' @return tibble of the statistics +#' @examples +#' data <- sim_kmer(1e7, 1000, "XY") +#' res <- boostrap_EM(data, nboot = 10, bootsize = 0.01) +boostrap_EM <- function(x, sex = "XY", threshold = 1, nboot = 100, bootsize = 1000, core = 6) { parallel::mclapply(as.list(1:nboot), function(iter, x, bootsize, sex) { - res <- x %>% - dplyr::select(count_m, count_f) %>% - sample_n(bootsize, replace = T) %>% - as.matrix() %>% - EM_clust(sex = sex) - res$BIC - }, x = x, bootsize = bootsize, sex = sex, mc.cores = 6) %>% - unlist() + if (bootsize > 1) { + res <- x %>% + dplyr::select(count_m, count_f) %>% + sample_n(bootsize, replace = T) %>% + as.matrix() %>% + EM_clust(sex = sex) + } else { + res <- x %>% + dplyr::select(count_m, count_f) %>% + sample_frac(bootsize, replace = T) %>% + as.matrix() %>% + EM_clust(sex = sex) + } + if (sex == "XY") { + tibble(loglik = res$loglik, BIC = res$BIC, TSS = res$XSS$TSS, BSS = res$XSS$BSS, WSS_a = res$XSS$WSS[1], WSS_f = res$XSS$WSS[2], WSS_m = res$XSS$WSS[3]) + } else { + + tibble(loglik = res$loglik, BIC = res$BIC, TSS = res$XSS$TSS, BSS = res$XSS$BSS, WSS_a = res$XSS$WSS[1], WSS_f = res$XSS$WSS[2], WSS_m = NA) + } + }, x = x, bootsize = bootsize, sex = sex, mc.cores = 6) } -compare_BIC <- function(x, threshold = 1, nboot = 100, bootsize = 1000, core = 6) { +#' Function to compile statistics differences between model XY and XO on boostrap +#' samples of the data +#' +#' @param x a matrix of male female kmers counts +#' @param threshold theshold to stop the EM algorithm +#' @param nboot number of boostrap sample +#' @param bootsize size of the boostrap sample (if < 0 we take a percentage of x) +#' @param core number of cpus to use for the computation +#' @param sex = c("XY", "XO") the sex of the specie +#' @return tibble with the statistics +#' @examples +#' data <- sim_kmer(1e7, 1000, "XY") +#' res <- compare_models(data, nboot = 10, bootsize = 0.01) +compare_models <- function(x, threshold = 1, nboot = 100, bootsize = 1000, core = 6) { tibble( - BIC_XY = boostrap_BIC(x, sex = "XY", threshold = threshold, nboot = nboot, bootsize = bootsize, core = core), - BIC_XO = boostrap_BIC(x, sex = "X0", threshold = threshold, nboot = nboot, bootsize = bootsize, core = core) - ) + model_XY = boostrap_EM(x, sex = "XY", threshold = threshold, nboot = nboot, bootsize = bootsize, core = core), + model_XO = boostrap_EM(x, sex = "X0", threshold = threshold, nboot = nboot, bootsize = bootsize, core = core) + ) %>% + pivot_longer(cols = c(model_XO, model_XY)) %>% + unnest(value) +} + +#' Compute the BSS, WSS et TSS +#' +#' @param x matrix of data for the male and female counts +#' @param cluster a vector of cluster annotation (by row) +#' @param sex = c("XY", "XO") the sex of the specie +#' @return a list of metrics +#' @examples +#' add(1, 1) +#' add(10, 1) +BSS_WSS <- function(x, cluster) { + TSS <- sum((x - colMeans(x))^2) + WSS <- sapply(split(as_tibble(x), cluster), function(x) sum((x - colMeans(x))^2)) + BSS <- TSS - sum(WSS) + return(list(TSS = TSS, WSS = WSS, BSS = BSS)) } ``` # clustering XY ```{r} +data <- sim_kmer(1e4, 1000, "XY") model_XY <- data %>% dplyr::select(count_m, count_f) %>% as.matrix() %>% @@ -303,6 +488,7 @@ data %>% # clustering XO ```{r} +data <- sim_kmer(1e4, 1000, "XO") model_XO <- data %>% dplyr::select(count_m, count_f) %>% as.matrix() %>% @@ -327,8 +513,7 @@ model_XO <- data %>% dplyr::select(count_m, count_f) %>% as.matrix() %>% EM_clust(sex = "XO") - -data <- sim_kmer(1e6, 1000, "XY") +pchisq(-2 * (model_XY$loglik - model_XO$loglik), 5) ``` ## For XO @@ -343,26 +528,38 @@ model_XO <- data %>% dplyr::select(count_m, count_f) %>% as.matrix() %>% EM_clust(sex = "XO") -pchisq(-2 * (model_XY$loglik - model_XO$loglik), 4) +pchisq(-2 * (model_XY$loglik - model_XO$loglik), 5) ``` -## Get Y k-mer +# BIC analysis + +## For XY ```{r} -res <- compare_BIC(data) +data <- sim_kmer(1e7, 1000, "XY") +res <- compare_models(data, nboot = 10, bootsize = 0.01) +res %>% + ggplot(aes(x = name, y = BIC)) + + geom_violin() res %>% - pivot_longer(cols = 1:2) %>% - ggplot(aes(x = name, y = value)) + - geom_violin() + ggplot(aes(x = name, y = WSS_f / BSS)) + + geom_violin() +``` -data %>% - mutate(y_proba = model_XY$proba[,3]) %>% - ggplot(aes(x = count_m, count_f, color = y_proba)) + - geom_point() + - theme_bw() +## For XO + +```{r} +data <- sim_kmer(1e7, 1000, "XO") +res <- compare_models(data, nboot = 10, bootsize = 0.01) +res %>% + ggplot(aes(x = name, y = BIC)) + + geom_violin() +res %>% + ggplot(aes(x = name, y = WSS_f / BSS)) + + geom_violin() ``` -## With real data +# With real data ```{r} data <- read_tsv("results/12/mbelari/mbelari.csv", show_col_types = FALSE)