Skip to content
Snippets Groups Projects
Verified Commit c6daedd9 authored by Laurent Modolo's avatar Laurent Modolo
Browse files

clustering: add roxygen

parent 1f87de79
No related branches found
No related tags found
No related merge requests found
...@@ -11,9 +11,9 @@ knitr::opts_chunk$set(echo = TRUE) ...@@ -11,9 +11,9 @@ knitr::opts_chunk$set(echo = TRUE)
```{r} ```{r}
library(mclust)
library(tidyverse) library(tidyverse)
library(mvtnorm) library(mvtnorm)
library(conflicted)
``` ```
## Simulation ## Simulation
...@@ -34,6 +34,15 @@ Which becomes on the $log$ scale: ...@@ -34,6 +34,15 @@ Which becomes on the $log$ scale:
Test slope for a given sigma matrice Test slope for a given sigma matrice
```{r} ```{r}
#' Function to dislay the slope of a 2d normal distribution with covariance matrix
#' `matrix(c(x^2, rho*x*y, rho*x*y, y^2)`
#'
#' @param x A number.
#' @param y A number.
#' @param rho A number.
#' @return A ggplot plot
#' @examples
#' test_slope(1.05, 2.05, 0.95)
test_slope <- function(x, y, rho) { test_slope <- function(x, y, rho) {
test <- mvtnorm::rmvnorm(1e4, mean = c(0, 0), sigma = matrix(c(x^2, rho*x*y, rho*x*y, y^2), ncol = 2), checkSymmetry = F, method = "svd") %>% test <- mvtnorm::rmvnorm(1e4, mean = c(0, 0), sigma = matrix(c(x^2, rho*x*y, rho*x*y, y^2), ncol = 2), checkSymmetry = F, method = "svd") %>%
as_tibble() as_tibble()
...@@ -49,6 +58,16 @@ test_slope(1.05, 2.05, 0.95) ...@@ -49,6 +58,16 @@ test_slope(1.05, 2.05, 0.95)
Simulate k-mer counts data Simulate k-mer counts data
```{r} ```{r}
#' function to simulate kmer count under a gaussian model, with the correct
#' sex bias for XY or XO species
#'
#' @param n_kmer the total number of kmers
#' @param mean_coef the base mean of the kmers counts
#' @param sex = c("XY", "XO") the sex of the specie
#' @return a tibble with a c("sex, count_m, count_f) columns
#' @examples
#' sim_kmer(1e6, 1000, "XY")
#' sim_kmer(1e6, 1000, "XO")
sim_kmer <- function(n_kmer, mean_coef, sex = "XY") { sim_kmer <- function(n_kmer, mean_coef, sex = "XY") {
data <- tibble( data <- tibble(
sex = "F", sex = "F",
...@@ -78,6 +97,10 @@ sim_kmer <- function(n_kmer, mean_coef, sex = "XY") { ...@@ -78,6 +97,10 @@ sim_kmer <- function(n_kmer, mean_coef, sex = "XY") {
rename(count_m = V1, rename(count_m = V1,
count_f = V2) count_f = V2)
} }
```
## XY data
```{r}
data <- sim_kmer(1e4, 1000, "XY") data <- sim_kmer(1e4, 1000, "XY")
data %>% data %>%
ggplot(aes(x = count_m, y = count_f, color = sex)) + ggplot(aes(x = count_m, y = count_f, color = sex)) +
...@@ -85,16 +108,24 @@ data %>% ...@@ -85,16 +108,24 @@ data %>%
coord_fixed() coord_fixed()
``` ```
## XO data
```{r} ```{r}
data_clust = data %>% select(-c("sex")) %>% mclust::Mclust(G = 3) data <- sim_kmer(1e4, 1000, "XO")
``` data %>%
```{r} ggplot(aes(x = count_m, y = count_f, color = sex)) +
summary(data_clust) geom_point() +
plot(data_clust, what = "classification") coord_fixed()
plot(data_clust, what = "uncertainty")
``` ```
# Clustering
```{r} ```{r}
#' Function to transform GMM parameters theta format into a format easily iterable
#'
#' @param theta a list of paramaters
#' @param cluster_coef a list of cluster coeficient
#' @param sex = c("XY", "XO") the sex of the specie
#' @return a list of parameters
expand_theta <- function(theta, cluster_coef, sex) { expand_theta <- function(theta, cluster_coef, sex) {
theta_ref <- list( theta_ref <- list(
"a" = list( "a" = list(
...@@ -117,6 +148,12 @@ expand_theta <- function(theta, cluster_coef, sex) { ...@@ -117,6 +148,12 @@ expand_theta <- function(theta, cluster_coef, sex) {
return(theta_ref) return(theta_ref)
} }
#' Compute the threshold to stop the EM algorithm
#'
#' @param old_theta list of paramters
#' @param theta list of paramters.
#' @param threshold threshold
#' @return bool
params_diff <- function(old_theta, theta, threshold) { params_diff <- function(old_theta, theta, threshold) {
result <- max( result <- max(
max(abs(old_theta$pi - theta$pi)), max(abs(old_theta$pi - theta$pi)),
...@@ -128,6 +165,13 @@ params_diff <- function(old_theta, theta, threshold) { ...@@ -128,6 +165,13 @@ params_diff <- function(old_theta, theta, threshold) {
return(T) return(T)
} }
#' Compute the probability of a kmers to belong to a cluster
#'
#' @param x a matrix of male female kmer counts
#' @param theta list of parameters
#' @param cluster_coef, the base mean coefficients for the cluster
#' @param sex = c("XY", "XO") the sex of the specie
#' @return matrix of cluster probability
proba_point <- function(x, theta, cluster_coef, sex) { proba_point <- function(x, theta, cluster_coef, sex) {
proba <- c() proba <- c()
for (params in expand_theta(theta, cluster_coef, sex)) { for (params in expand_theta(theta, cluster_coef, sex)) {
...@@ -138,11 +182,27 @@ proba_point <- function(x, theta, cluster_coef, sex) { ...@@ -138,11 +182,27 @@ proba_point <- function(x, theta, cluster_coef, sex) {
return(proba) return(proba)
} }
#' compute the log-likelihood of the model
#'
#' @param x a matrix of male female kmer counts
#' @param theta list of parameters
#' @param cluster_coef, the base mean coefficients for the cluster
#' @param sex = c("XY", "XO") the sex of the specie
#' @return the loglikelihood
#' @examples
#' add(1, 1)
#' add(10, 1)
loglik <- function(x, theta, cluster_coef, sex) { loglik <- function(x, theta, cluster_coef, sex) {
sum(log(rowSums(proba_point(x, theta, cluster_coef, sex)))) sum(log(rowSums(proba_point(x, theta, cluster_coef, sex))))
} }
# EM function #' Compute the normalized probability of a kmers to belong to a cluster
#'
#' @param x a matrix of male female kmer counts
#' @param theta list of parameters
#' @param cluster_coef, the base mean coefficients for the cluster
#' @param sex = c("XY", "XO") the sex of the specie
#' @return matrix of cluster probability
E_proba <- function(x, theta, cluster_coef, sex) { E_proba <- function(x, theta, cluster_coef, sex) {
proba <- proba_point(x, theta, cluster_coef, sex) proba <- proba_point(x, theta, cluster_coef, sex)
proba_norm <- rowSums(proba) proba_norm <- rowSums(proba)
...@@ -153,11 +213,23 @@ E_proba <- function(x, theta, cluster_coef, sex) { ...@@ -153,11 +213,23 @@ E_proba <- function(x, theta, cluster_coef, sex) {
return(proba) return(proba)
} }
#' Sum the proba of each points for each clusters
#'
#' @param proba a matrix of autosomal female and male chromosome probability
#' @return a vector
E_N_clust <- function(proba) { E_N_clust <- function(proba) {
colSums(proba) colSums(proba)
} }
# Function for mean update #' Maximize the base mean
#'
#' @param x a matrix of male female kmer counts
#' @param proba a matrix of autosomal female and male chromosome probability
#' @param mu, the base mean parameter
#' @param N_clust the size of the clusters
#' @param sex = c("XY", "XO") the sex of the specie
#' @return a mean mu
#' @examples
M_mean <- function(x, proba, N_clust, sex) { M_mean <- function(x, proba, N_clust, sex) {
mu <- 0 mu <- 0
for (cluster in 1:ncol(proba)) { for (cluster in 1:ncol(proba)) {
...@@ -180,6 +252,15 @@ M_mean <- function(x, proba, N_clust, sex) { ...@@ -180,6 +252,15 @@ M_mean <- function(x, proba, N_clust, sex) {
return(mu / 2) return(mu / 2)
} }
#' Maximize covariance matric parameters
#'
#' @param x a matrix of male female kmer counts
#' @param proba a matrix of autosomal female and male chromosome probability
#' @param mu, the base mean parameter
#' @param N_clust the size of the clusters
#' @param cluster_coef, the base mean coefficients for the cluster
#' @param sex = c("XY", "XO") the sex of the specie
#' @return a list of covarance matrices.
M_cov <- function(x, proba, mu, N_clust, cluster_coef, sex) { M_cov <- function(x, proba, mu, N_clust, cluster_coef, sex) {
cov_clust <- list() cov_clust <- list()
for (cluster in 1:ncol(proba)) { for (cluster in 1:ncol(proba)) {
...@@ -194,6 +275,15 @@ M_cov <- function(x, proba, mu, N_clust, cluster_coef, sex) { ...@@ -194,6 +275,15 @@ M_cov <- function(x, proba, mu, N_clust, cluster_coef, sex) {
return(sigma) return(sigma)
} }
#' Plot the kmer counts and colorize by cluster proba
#'
#' @param x a matrix of male female kmer counts
#' @param proba a matrix of autosomal female and male chromosome probability
#' @param sex = c("XY", "XO") the sex of the specie
#' @return A number.
#' @examples
#' add(1, 1)
#' add(10, 1)
plot_proba <- function(x, proba, sex = "XY") { plot_proba <- function(x, proba, sex = "XY") {
if (sex == "XY") { if (sex == "XY") {
as_tibble(x) %>% as_tibble(x) %>%
...@@ -219,6 +309,11 @@ plot_proba <- function(x, proba, sex = "XY") { ...@@ -219,6 +309,11 @@ plot_proba <- function(x, proba, sex = "XY") {
} }
} }
#' initialize EM parameters
#'
#' @param x a matrix of male female kmer counts
#' @param sex = c("XY", "XO") the sex of the specie
#' @return a list of parameters
init_param <- function(x, sex) { init_param <- function(x, sex) {
cluster_coef <- list( cluster_coef <- list(
"a" = c(2, 2), "a" = c(2, 2),
...@@ -239,6 +334,12 @@ init_param <- function(x, sex) { ...@@ -239,6 +334,12 @@ init_param <- function(x, sex) {
return(list(cluster_coef = cluster_coef, theta = theta)) return(list(cluster_coef = cluster_coef, theta = theta))
} }
#' compute the BIC of the GMM
#'
#' @param x A number.
#' @param loglik A number.
#' @param sex = c("XY", "XO") the sex of the specie
#' @return the BIC
compute_bic <- function(x, loglik, sex = "XY") { compute_bic <- function(x, loglik, sex = "XY") {
k <- 1 + 4 * 2 k <- 1 + 4 * 2
if (sex == "YX") { if (sex == "YX") {
...@@ -247,7 +348,22 @@ compute_bic <- function(x, loglik, sex = "XY") { ...@@ -247,7 +348,22 @@ compute_bic <- function(x, loglik, sex = "XY") {
return(k * log(nrow(x)) - 2 * loglik) return(k * log(nrow(x)) - 2 * loglik)
} }
#' Fix a XY or XO GMM on kmer counts
#'
#' @param x a matrix of male female counts
#' @param threshold the differences in loglik of the model to stop the EM
#' @param sex = c("XY", "XO") the sex of the specie
#' @return a list with model informations
#' @examples
#' data <- sim_kmer(1e2, 1000, "XY")
#' model_XY <- data %>%
#' dplyr::select(count_m, count_f) %>%
#' as.matrix() %>%
#' EM_clust()
#' model_XO <- data %>%
#' dplyr::select(count_m, count_f) %>%
#' as.matrix() %>%
#' EM_clust(sex = "XO")
EM_clust <- function(x, threshold = 1, sex = "XY") { EM_clust <- function(x, threshold = 1, sex = "XY") {
old_loglik <- -Inf old_loglik <- -Inf
new_loglik <- 0 new_loglik <- 0
...@@ -264,32 +380,101 @@ EM_clust <- function(x, threshold = 1, sex = "XY") { ...@@ -264,32 +380,101 @@ EM_clust <- function(x, threshold = 1, sex = "XY") {
break break
} }
} }
return(list(proba = proba, theta = param$theta, loglik = new_loglik, BIC = compute_bic(x, new_loglik, sex))) cluster <- proba %/% apply(proba, 1, max)
if (sex == "XY") {
cluster <- as.vector(cluster %*% c(1, 2, 3))
} else {
cluster <- as.vector(cluster %*% c(1, 2))
}
return(list(
proba = proba,
cluster = cluster,
theta = param$theta,
loglik = new_loglik,
BIC = compute_bic(x, new_loglik, sex),
XSS = BSS_WSS(x, cluster)
))
} }
boostrap_BIC <- function(x, sex = "XY", threshold = 1, nboot = 100, bootsize = 1000, core = 6) { #' Function to compile statistics bootstrap samples of the data
#'
#' @param x a matrix of male female kmers counts
#' @param sex = c("XY", "XO") the sex of the specie
#' @param threshold theshold to stop the EM algorithm
#' @param nboot number of boostrap sample
#' @param bootsize size of the boostrap sample (if < 0 we take a percentage of x)
#' @param core number of cpus to use for the computation
#' @return tibble of the statistics
#' @examples
#' data <- sim_kmer(1e7, 1000, "XY")
#' res <- boostrap_EM(data, nboot = 10, bootsize = 0.01)
boostrap_EM <- function(x, sex = "XY", threshold = 1, nboot = 100, bootsize = 1000, core = 6) {
parallel::mclapply(as.list(1:nboot), function(iter, x, bootsize, sex) { parallel::mclapply(as.list(1:nboot), function(iter, x, bootsize, sex) {
if (bootsize > 1) {
res <- x %>% res <- x %>%
dplyr::select(count_m, count_f) %>% dplyr::select(count_m, count_f) %>%
sample_n(bootsize, replace = T) %>% sample_n(bootsize, replace = T) %>%
as.matrix() %>% as.matrix() %>%
EM_clust(sex = sex) EM_clust(sex = sex)
res$BIC } else {
}, x = x, bootsize = bootsize, sex = sex, mc.cores = 6) %>% res <- x %>%
unlist() dplyr::select(count_m, count_f) %>%
sample_frac(bootsize, replace = T) %>%
as.matrix() %>%
EM_clust(sex = sex)
} }
if (sex == "XY") {
tibble(loglik = res$loglik, BIC = res$BIC, TSS = res$XSS$TSS, BSS = res$XSS$BSS, WSS_a = res$XSS$WSS[1], WSS_f = res$XSS$WSS[2], WSS_m = res$XSS$WSS[3])
} else {
compare_BIC <- function(x, threshold = 1, nboot = 100, bootsize = 1000, core = 6) { tibble(loglik = res$loglik, BIC = res$BIC, TSS = res$XSS$TSS, BSS = res$XSS$BSS, WSS_a = res$XSS$WSS[1], WSS_f = res$XSS$WSS[2], WSS_m = NA)
}
}, x = x, bootsize = bootsize, sex = sex, mc.cores = 6)
}
#' Function to compile statistics differences between model XY and XO on boostrap
#' samples of the data
#'
#' @param x a matrix of male female kmers counts
#' @param threshold theshold to stop the EM algorithm
#' @param nboot number of boostrap sample
#' @param bootsize size of the boostrap sample (if < 0 we take a percentage of x)
#' @param core number of cpus to use for the computation
#' @param sex = c("XY", "XO") the sex of the specie
#' @return tibble with the statistics
#' @examples
#' data <- sim_kmer(1e7, 1000, "XY")
#' res <- compare_models(data, nboot = 10, bootsize = 0.01)
compare_models <- function(x, threshold = 1, nboot = 100, bootsize = 1000, core = 6) {
tibble( tibble(
BIC_XY = boostrap_BIC(x, sex = "XY", threshold = threshold, nboot = nboot, bootsize = bootsize, core = core), model_XY = boostrap_EM(x, sex = "XY", threshold = threshold, nboot = nboot, bootsize = bootsize, core = core),
BIC_XO = boostrap_BIC(x, sex = "X0", threshold = threshold, nboot = nboot, bootsize = bootsize, core = core) model_XO = boostrap_EM(x, sex = "X0", threshold = threshold, nboot = nboot, bootsize = bootsize, core = core)
) ) %>%
pivot_longer(cols = c(model_XO, model_XY)) %>%
unnest(value)
}
#' Compute the BSS, WSS et TSS
#'
#' @param x matrix of data for the male and female counts
#' @param cluster a vector of cluster annotation (by row)
#' @param sex = c("XY", "XO") the sex of the specie
#' @return a list of metrics
#' @examples
#' add(1, 1)
#' add(10, 1)
BSS_WSS <- function(x, cluster) {
TSS <- sum((x - colMeans(x))^2)
WSS <- sapply(split(as_tibble(x), cluster), function(x) sum((x - colMeans(x))^2))
BSS <- TSS - sum(WSS)
return(list(TSS = TSS, WSS = WSS, BSS = BSS))
} }
``` ```
# clustering XY # clustering XY
```{r} ```{r}
data <- sim_kmer(1e4, 1000, "XY")
model_XY <- data %>% model_XY <- data %>%
dplyr::select(count_m, count_f) %>% dplyr::select(count_m, count_f) %>%
as.matrix() %>% as.matrix() %>%
...@@ -303,6 +488,7 @@ data %>% ...@@ -303,6 +488,7 @@ data %>%
# clustering XO # clustering XO
```{r} ```{r}
data <- sim_kmer(1e4, 1000, "XO")
model_XO <- data %>% model_XO <- data %>%
dplyr::select(count_m, count_f) %>% dplyr::select(count_m, count_f) %>%
as.matrix() %>% as.matrix() %>%
...@@ -327,8 +513,7 @@ model_XO <- data %>% ...@@ -327,8 +513,7 @@ model_XO <- data %>%
dplyr::select(count_m, count_f) %>% dplyr::select(count_m, count_f) %>%
as.matrix() %>% as.matrix() %>%
EM_clust(sex = "XO") EM_clust(sex = "XO")
pchisq(-2 * (model_XY$loglik - model_XO$loglik), 5)
data <- sim_kmer(1e6, 1000, "XY")
``` ```
## For XO ## For XO
...@@ -343,26 +528,38 @@ model_XO <- data %>% ...@@ -343,26 +528,38 @@ model_XO <- data %>%
dplyr::select(count_m, count_f) %>% dplyr::select(count_m, count_f) %>%
as.matrix() %>% as.matrix() %>%
EM_clust(sex = "XO") EM_clust(sex = "XO")
pchisq(-2 * (model_XY$loglik - model_XO$loglik), 4) pchisq(-2 * (model_XY$loglik - model_XO$loglik), 5)
``` ```
## Get Y k-mer # BIC analysis
## For XY
```{r} ```{r}
res <- compare_BIC(data) data <- sim_kmer(1e7, 1000, "XY")
res <- compare_models(data, nboot = 10, bootsize = 0.01)
res %>% res %>%
pivot_longer(cols = 1:2) %>% ggplot(aes(x = name, y = BIC)) +
ggplot(aes(x = name, y = value)) +
geom_violin() geom_violin()
res %>%
ggplot(aes(x = name, y = WSS_f / BSS)) +
geom_violin()
```
data %>% ## For XO
mutate(y_proba = model_XY$proba[,3]) %>%
ggplot(aes(x = count_m, count_f, color = y_proba)) + ```{r}
geom_point() + data <- sim_kmer(1e7, 1000, "XO")
theme_bw() res <- compare_models(data, nboot = 10, bootsize = 0.01)
res %>%
ggplot(aes(x = name, y = BIC)) +
geom_violin()
res %>%
ggplot(aes(x = name, y = WSS_f / BSS)) +
geom_violin()
``` ```
## With real data # With real data
```{r} ```{r}
data <- read_tsv("results/12/mbelari/mbelari.csv", show_col_types = FALSE) data <- read_tsv("results/12/mbelari/mbelari.csv", show_col_types = FALSE)
......
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Please register or to comment