From 13c5b8244e23f7e527b384b58e02502d6e1967ca Mon Sep 17 00:00:00 2001 From: Laurent Modolo <laurent.modolo@ens-lyon.fr> Date: Wed, 7 Jun 2023 14:02:17 +0200 Subject: [PATCH] create new repository for clustering --- src/clustering.Rmd | 630 --------------------------------------------- 1 file changed, 630 deletions(-) delete mode 100644 src/clustering.Rmd diff --git a/src/clustering.Rmd b/src/clustering.Rmd deleted file mode 100644 index a991f52..0000000 --- a/src/clustering.Rmd +++ /dev/null @@ -1,630 +0,0 @@ ---- -title: "kmer clustering" -author: "Laurent Modolo" -date: "`r Sys.Date()`" -output: html_document ---- - -```{r setup, include=FALSE} -knitr::opts_chunk$set(echo = TRUE) -``` - - -```{r} -library(tidyverse) -library(mvtnorm) -library(conflicted) -``` - -## Simulation - -If we expect to have with $X_f$ the female k-mer count and $X_m$ the male k-mer count, the following relation for a XY species: - -- For the autosomal chromosomes: $X_m = X_f$ -- For the X chromosomes: $X_m = 2 X_m$ -- For the Y chromosomes: $X_m = 0^+ X_f$ - -Which becomes on the $log$ scale: - -- For the autosomal chromosomes: $\log(X_m) = \log(X_f)$ -- For the X chromosomes: $\log(X_m) = \log(2) + \log(X_m)$ -- For the Y chromosomes: $\log(X_m) = log(0^+) + log(X_f)$ - - -Test slope for a given sigma matrice - -```{r} -#' Function to dislay the slope of a 2d normal distribution with covariance matrix -#' `matrix(c(x^2, rho*x*y, rho*x*y, y^2)` -#' -#' @param x A number. -#' @param y A number. -#' @param rho A number. -#' @return A ggplot plot -#' @examples -#' test_slope(1.05, 2.05, 0.95) -test_slope <- function(x, y, rho) { - test <- mvtnorm::rmvnorm(1e4, mean = c(0, 0), sigma = matrix(c(x^2, rho*x*y, rho*x*y, y^2), ncol = 2), checkSymmetry = F, method = "svd") %>% - as_tibble() - ggplot(data = test, aes(x = V1, y = V2)) + - geom_point() + - geom_smooth(method = lm) + - labs(title = lm(test$V2 ~ test$V1)$coef[2]) + - coord_fixed() -} -test_slope(1.05, 2.05, 0.95) -``` - -Simulate k-mer counts data - -```{r} -#' function to simulate kmer count under a gaussian model, with the correct -#' sex bias for XY or XO species -#' -#' @param n_kmer the total number of kmers -#' @param mean_coef the base mean of the kmers counts -#' @param sex = c("XY", "XO") the sex of the specie -#' @return a tibble with a c("sex, count_m, count_f) columns -#' @examples -#' sim_kmer(1e6, 1000, "XY") -#' sim_kmer(1e6, 1000, "XO") -sim_kmer <- function(n_kmer, mean_coef, sex = "XY") { - data <- tibble( - sex = "F", - count = mvtnorm::rmvnorm(n_kmer * .1, mean = c(1, 2)*mean_coef, sigma = matrix(c(1.05, 2, 2, 4.05) * mean_coef^1.5, ncol = 2), checkSymmetry = F, method = "svd") %>% - as_tibble() - ) %>% - unnest(count) %>% - bind_rows( - tibble( - sex = "A", - count = mvtnorm::rmvnorm(n_kmer * 0.85, mean = c(2, 2)*mean_coef, sigma = matrix(c(1, .95, .95, 1) * mean_coef^1.5 * 4, ncol = 2), method = "svd") - %>% as_tibble() - ) %>% unnest(count) - ) - if (sex == "XY") { - data <- data %>% - bind_rows( - tibble( - sex = "M", - count = mvtnorm::rmvnorm(n_kmer * .05, mean = c(1, 0)*mean_coef, sigma = matrix(c(.9, .05, .05, .05) * mean_coef^1.5, ncol = 2), method = "svd") - %>% as_tibble() - ) %>% - unnest(count) - ) - } - data %>% - rename(count_m = V1, - count_f = V2) -} -``` - -## XY data -```{r} -data <- sim_kmer(1e4, 1000, "XY") -data %>% - ggplot(aes(x = count_m, y = count_f, color = sex)) + - geom_point() + - coord_fixed() -``` - -## XO data -```{r} -data <- sim_kmer(1e4, 1000, "XO") -data %>% - ggplot(aes(x = count_m, y = count_f, color = sex)) + - geom_point() + - coord_fixed() -``` - -# Clustering - -```{r} -#' Function to transform GMM parameters theta format into a format easily iterable -#' -#' @param theta a list of paramaters -#' @param cluster_coef a list of cluster coeficient -#' @param sex = c("XY", "XO") the sex of the specie -#' @return a list of parameters -expand_theta <- function(theta, cluster_coef, sex) { - theta_ref <- list( - "a" = list( - "pi" = theta$pi[1], - "mu" = cluster_coef$a * theta$mu, - "sigma" = theta$sigma$a - ), - "f" = list( - "pi" = theta$pi[2], - "mu" = cluster_coef$f * theta$mu, - "sigma" = theta$sigma$f - )) - if (sex == "XY") { - theta_ref[["m"]] <- list( - "pi" = theta$pi[3], - "mu" = cluster_coef$m * theta$mu, - "sigma" = theta$sigma$m - ) - } - return(theta_ref) -} - -#' Compute the threshold to stop the EM algorithm -#' -#' @param old_theta list of paramters -#' @param theta list of paramters. -#' @param threshold threshold -#' @return bool -params_diff <- function(old_theta, theta, threshold) { - result <- max( - max(abs(old_theta$pi - theta$pi)), - max(abs(old_theta$mu - theta$mu)) - ) - if (is.finite(result)) { - return(results > threshold) - } - return(T) -} - -#' Compute the probability of a kmers to belong to a cluster -#' -#' @param x a matrix of male female kmer counts -#' @param theta list of parameters -#' @param cluster_coef, the base mean coefficients for the cluster -#' @param sex = c("XY", "XO") the sex of the specie -#' @return matrix of cluster probability -proba_point <- function(x, theta, cluster_coef, sex) { - proba <- c() - for (params in expand_theta(theta, cluster_coef, sex)) { - proba <- cbind(proba, params$pi * - mvtnorm::dmvnorm(x, mean = params$mu, sigma = params$sigma) - ) - } - return(proba) -} - -#' compute the log-likelihood of the model -#' -#' @param x a matrix of male female kmer counts -#' @param theta list of parameters -#' @param cluster_coef, the base mean coefficients for the cluster -#' @param sex = c("XY", "XO") the sex of the specie -#' @return the loglikelihood -#' @examples -#' add(1, 1) -#' add(10, 1) -loglik <- function(x, theta, cluster_coef, sex) { - sum(log(rowSums(proba_point(x, theta, cluster_coef, sex)))) -} - -#' Compute the normalized probability of a kmers to belong to a cluster -#' -#' @param x a matrix of male female kmer counts -#' @param theta list of parameters -#' @param cluster_coef, the base mean coefficients for the cluster -#' @param sex = c("XY", "XO") the sex of the specie -#' @return matrix of cluster probability -E_proba <- function(x, theta, cluster_coef, sex) { - proba <- proba_point(x, theta, cluster_coef, sex) - proba_norm <- rowSums(proba) - for (cluster in 1:ncol(proba)) { - proba[, cluster] <- proba[, cluster] / proba_norm - proba[proba_norm == 0, cluster] <- 1 / ncol(proba) - } - return(proba) -} - -#' Sum the proba of each points for each clusters -#' -#' @param proba a matrix of autosomal female and male chromosome probability -#' @return a vector -E_N_clust <- function(proba) { - colSums(proba) -} - -#' Maximize the base mean -#' -#' @param x a matrix of male female kmer counts -#' @param proba a matrix of autosomal female and male chromosome probability -#' @param mu, the base mean parameter -#' @param N_clust the size of the clusters -#' @param sex = c("XY", "XO") the sex of the specie -#' @return a mean mu -#' @examples -M_mean <- function(x, proba, N_clust, sex) { - mu <- 0 - for (cluster in 1:ncol(proba)) { - if (cluster == 1) { - mu <- mu + - mean(colSums(x * c(0.5, 0.5) * proba[, cluster]) / N_clust[cluster]) - } - if (cluster == 2) { - mu <- mu + - mean(colSums(x * c(1, 0.5) * proba[, cluster]) / N_clust[cluster]) - } - if (cluster == 3) { - mu <- mu + - (colSums(x * c(1, 0) * proba[, cluster]) / N_clust[cluster])[1] - } - } - if (sex == "XY") { - return(mu / 3) - } - return(mu / 2) -} - -#' Maximize covariance matric parameters -#' -#' @param x a matrix of male female kmer counts -#' @param proba a matrix of autosomal female and male chromosome probability -#' @param mu, the base mean parameter -#' @param N_clust the size of the clusters -#' @param cluster_coef, the base mean coefficients for the cluster -#' @param sex = c("XY", "XO") the sex of the specie -#' @return a list of covarance matrices. -M_cov <- function(x, proba, mu, N_clust, cluster_coef, sex) { - cov_clust <- list() - for (cluster in 1:ncol(proba)) { - cov_clust[[cluster]] <- t(proba[, cluster] * (x - mu * cluster_coef[[cluster]])) %*% (x - mu * cluster_coef[[cluster]]) / N_clust[cluster] - } - sigma <- list() - sigma$a <- cov_clust[[1]] - sigma$f <- cov_clust[[2]] - if (sex == "XY") { - sigma$m <- cov_clust[[3]] - } - return(sigma) -} - -#' Plot the kmer counts and colorize by cluster proba -#' -#' @param x a matrix of male female kmer counts -#' @param proba a matrix of autosomal female and male chromosome probability -#' @param sex = c("XY", "XO") the sex of the specie -#' @return A number. -#' @examples -#' add(1, 1) -#' add(10, 1) -plot_proba <- function(x, proba, sex = "XY") { - if (sex == "XY") { - as_tibble(x) %>% - mutate( - proba_a = proba[, 1], - proba_f = proba[, 2], - proba_m = proba[, 3], - clust_proba = rgb(proba_f, proba_m, proba_a, maxColorValue = 1) - ) %>% - ggplot(aes(x = count_m, y = count_f, color = clust_proba)) + - geom_point() + - scale_color_identity() - } else { - as_tibble(x) %>% - mutate( - proba_a = proba[, 1], - proba_f = proba[, 2], - clust_proba = rgb(proba_f, 0, proba_a, maxColorValue = 1) - ) %>% - ggplot(aes(x = count_m, y = count_f, color = clust_proba)) + - geom_point() + - scale_color_identity() - } -} - -#' initialize EM parameters -#' -#' @param x a matrix of male female kmer counts -#' @param sex = c("XY", "XO") the sex of the specie -#' @return a list of parameters -init_param <- function(x, sex) { - cluster_coef <- list( - "a" = c(2, 2), - "f" = c(1, 2) - ) - theta <- list( - "pi" = c(.85, .1, .05), - "mu" = mean(colMeans(x)) * .5 - ) - theta$sigma <- list( - "a" = matrix(c(1, 1, 1, 1) * theta$mu, ncol = 2), - "f" = matrix(c(1, 1, 1, 1) * theta$mu, ncol = 2) - ) - if (sex == "XY") { - cluster_coef$m <- c(1, 0) - theta$sigma$m <- matrix(c(1, 1, 1, 1) * theta$mu, ncol = 2) - } - return(list(cluster_coef = cluster_coef, theta = theta)) -} - -#' compute the BIC of the GMM -#' -#' @param x A number. -#' @param loglik A number. -#' @param sex = c("XY", "XO") the sex of the specie -#' @return the BIC -compute_bic <- function(x, loglik, sex = "XY") { - k <- 1 + 4 * 2 - if (sex == "YX") { - k <- k + 4 - } - return(k * log(nrow(x)) - 2 * loglik) -} - -#' Fix a XY or XO GMM on kmer counts -#' -#' @param x a matrix of male female counts -#' @param threshold the differences in loglik of the model to stop the EM -#' @param sex = c("XY", "XO") the sex of the specie -#' @return a list with model informations -#' @examples -#' data <- sim_kmer(1e2, 1000, "XY") -#' model_XY <- data %>% -#' dplyr::select(count_m, count_f) %>% -#' as.matrix() %>% -#' EM_clust() -#' model_XO <- data %>% -#' dplyr::select(count_m, count_f) %>% -#' as.matrix() %>% -#' EM_clust(sex = "XO") -EM_clust <- function(x, threshold = 1, sex = "XY") { - old_loglik <- -Inf - new_loglik <- 0 - param <- init_param(x, sex) - while (abs(new_loglik - old_loglik) > threshold) { - old_loglik <- loglik(x, param$theta, param$cluster_coef, sex) - proba <- E_proba(x, param$theta, param$cluster_coef, sex) - param$theta$pi <- E_N_clust(proba) - param$theta$mu <- M_mean(x, proba, param$theta$pi, sex) - param$theta$sigma <- M_cov(x, proba, param$theta$mu, param$theta$pi, param$cluster_coef, sex) - param$theta$pi <- param$theta$pi / nrow(x) - new_loglik <- loglik(x, param$theta, param$cluster_coef, sex) - if (is.infinite(new_loglik)) { - break - } - } - cluster <- proba %/% apply(proba, 1, max) - if (sex == "XY") { - cluster <- as.vector(cluster %*% c(1, 2, 3)) - } else { - cluster <- as.vector(cluster %*% c(1, 2)) - } - return(list( - proba = proba, - cluster = cluster, - theta = param$theta, - loglik = new_loglik, - BIC = compute_bic(x, new_loglik, sex), - XSS = BSS_WSS(x, cluster) - )) -} - -#' Function to compile statistics bootstrap samples of the data -#' -#' @param x a matrix of male female kmers counts -#' @param sex = c("XY", "XO") the sex of the specie -#' @param threshold theshold to stop the EM algorithm -#' @param nboot number of boostrap sample -#' @param bootsize size of the boostrap sample (if < 0 we take a percentage of x) -#' @param core number of cpus to use for the computation -#' @return tibble of the statistics -#' @examples -#' data <- sim_kmer(1e7, 1000, "XY") -#' res <- boostrap_EM(data, nboot = 10, bootsize = 0.01) -boostrap_EM <- function(x, sex = "XY", threshold = 1, nboot = 100, bootsize = 1000, core = 6) { - parallel::mclapply(as.list(1:nboot), function(iter, x, bootsize, sex) { - if (bootsize > 1) { - res <- x %>% - dplyr::select(count_m, count_f) %>% - sample_n(bootsize, replace = T) %>% - as.matrix() %>% - EM_clust(sex = sex) - } else { - res <- x %>% - dplyr::select(count_m, count_f) %>% - sample_frac(bootsize, replace = T) %>% - as.matrix() %>% - EM_clust(sex = sex) - } - if (sex == "XY") { - tibble(loglik = res$loglik, BIC = res$BIC, TSS = res$XSS$TSS, BSS = res$XSS$BSS, WSS_a = res$XSS$WSS[1], WSS_f = res$XSS$WSS[2], WSS_m = res$XSS$WSS[3]) - } else { - - tibble(loglik = res$loglik, BIC = res$BIC, TSS = res$XSS$TSS, BSS = res$XSS$BSS, WSS_a = res$XSS$WSS[1], WSS_f = res$XSS$WSS[2], WSS_m = NA) - } - }, x = x, bootsize = bootsize, sex = sex, mc.cores = 6) -} - -#' Function to compile statistics differences between model XY and XO on boostrap -#' samples of the data -#' -#' @param x a matrix of male female kmers counts -#' @param threshold theshold to stop the EM algorithm -#' @param nboot number of boostrap sample -#' @param bootsize size of the boostrap sample (if < 0 we take a percentage of x) -#' @param core number of cpus to use for the computation -#' @param sex = c("XY", "XO") the sex of the specie -#' @return tibble with the statistics -#' @examples -#' data <- sim_kmer(1e7, 1000, "XY") -#' res <- compare_models(data, nboot = 10, bootsize = 0.01) -compare_models <- function(x, threshold = 1, nboot = 100, bootsize = 1000, core = 6) { - tibble( - model_XY = boostrap_EM(x, sex = "XY", threshold = threshold, nboot = nboot, bootsize = bootsize, core = core), - model_XO = boostrap_EM(x, sex = "X0", threshold = threshold, nboot = nboot, bootsize = bootsize, core = core) - ) %>% - pivot_longer(cols = c(model_XO, model_XY)) %>% - unnest(value) -} - -#' Compute the BSS, WSS et TSS -#' -#' @param x matrix of data for the male and female counts -#' @param cluster a vector of cluster annotation (by row) -#' @param sex = c("XY", "XO") the sex of the specie -#' @return a list of metrics -#' @examples -#' add(1, 1) -#' add(10, 1) -BSS_WSS <- function(x, cluster) { - TSS <- sum((x - colMeans(x))^2) - WSS <- sapply(split(as_tibble(x), cluster), function(x) sum((x - colMeans(x))^2)) - BSS <- TSS - sum(WSS) - return(list(TSS = TSS, WSS = WSS, BSS = BSS)) -} -``` - -# clustering XY - -```{r} -data <- sim_kmer(1e4, 1000, "XY") -model_XY <- data %>% - dplyr::select(count_m, count_f) %>% - as.matrix() %>% - EM_clust() -data %>% - dplyr::select(count_m, count_f) %>% - as.matrix() %>% - plot_proba(model_XY$proba) -``` - -# clustering XO - -```{r} -data <- sim_kmer(1e4, 1000, "XO") -model_XO <- data %>% - dplyr::select(count_m, count_f) %>% - as.matrix() %>% - EM_clust(sex = "X0") -data %>% - dplyr::select(count_m, count_f) %>% - as.matrix() %>% - plot_proba(model_XO$proba, sex = "X0") -``` - -# LRT - -## For XY - -```{r} -data <- sim_kmer(1e2, 1000, "XY") -model_XY <- data %>% - dplyr::select(count_m, count_f) %>% - as.matrix() %>% - EM_clust() -model_XO <- data %>% - dplyr::select(count_m, count_f) %>% - as.matrix() %>% - EM_clust(sex = "XO") -pchisq(-2 * (model_XY$loglik - model_XO$loglik), 5) -``` - -## For XO - -```{r} -data <- sim_kmer(1e2, 1000, "XO") -model_XY <- data %>% - dplyr::select(count_m, count_f) %>% - as.matrix() %>% - EM_clust() -model_XO <- data %>% - dplyr::select(count_m, count_f) %>% - as.matrix() %>% - EM_clust(sex = "XO") -pchisq(-2 * (model_XY$loglik - model_XO$loglik), 5) -``` - -# BIC analysis - -## For XY - -```{r} -data <- sim_kmer(1e7, 1000, "XY") -res <- compare_models(data, nboot = 10, bootsize = 0.01) -res %>% - ggplot(aes(x = name, y = BIC)) + - geom_violin() -res %>% - ggplot(aes(x = name, y = WSS_f / BSS)) + - geom_violin() -``` - -## For XO - -```{r} -data <- sim_kmer(1e7, 1000, "XO") -res <- compare_models(data, nboot = 10, bootsize = 0.01) -res %>% - ggplot(aes(x = name, y = BIC)) + - geom_violin() -res %>% - ggplot(aes(x = name, y = WSS_f / BSS)) + - geom_violin() -``` - -# With real data - -```{r} -data <- read_tsv("results/12/mbelari/mbelari.csv", show_col_types = FALSE) -format(object.size(data), units = "Mb") -``` -```{r} -annotation <- read_csv("data/sample.csv", show_col_types = FALSE) %>% - pivot_longer(!c(sex, specie), names_to = "read", values_to = "file") %>% - mutate( - file = gsub("/scratch/Bio/lmodolo/kmer_diff/data/.*/", "", file, perl = T), - file = gsub("\\.fasta\\.gz", "", file, perl = T) - ) %>% - mutate( - file = paste0(file, ".csv") - ) %>% - select(!c(read)) %>% - group_by(specie, sex) %>% - nest(.key = "files") -``` - -```{r} -count <- annotation %>% - group_by(specie) %>% - nest(.key = "sex") %>% - mutate(count = lapply(sex, function(files, data){ - files_f <- files %>% filter(sex == "female") %>% unnest(files) %>% pull(file) %>% as.vector() - files_m <- files %>% filter(sex == "male") %>% unnest(files) %>% pull(file) %>% as.vector() - data %>% - select(kmer) %>% - mutate( - female = data %>% select(any_of(files_f)) %>% rowMeans(), - male = data %>% select(any_of(files_m)) %>% rowMeans() - ) - }, data = data)) %>% - unnest(sex) %>% - unnest(count) -save(count, file = "results/12/mbelari/counts.Rdata") -``` - - -## M belari data - -```{r} -load("results/12/mbelari/counts.Rdata") -``` - -```{r} -s_count <- count %>% - ungroup() %>% - sample_frac(0.01) %>% - dplyr::select(male, female) %>% - mutate( - count_m = log1p(male), - count_f = log1p(female) - ) -model_XY <- s_count %>% - as.matrix() %>% - EM_clust() -model_XO <- s_count %>% - as.matrix() %>% - EM_clust(sex = "XO") -model_XO$BIC -model_XY$BIC - -s_count %>% - as.matrix() %>% - plot_proba(model_XO$proba, sex = "XO") -``` \ No newline at end of file -- GitLab