Skip to content
GitLab
Explore
Sign in
Primary navigation
Search or go to…
Project
K
kmer diff
Manage
Activity
Members
Labels
Plan
Issues
Issue boards
Milestones
Wiki
Code
Merge requests
Repository
Branches
Commits
Tags
Repository graph
Compare revisions
Snippets
Build
Pipelines
Jobs
Pipeline schedules
Artifacts
Deploy
Releases
Package registry
Model registry
Operate
Environments
Terraform modules
Monitor
Incidents
Analyze
Value stream analytics
Contributor analytics
CI/CD analytics
Repository analytics
Model experiments
Help
Help
Support
GitLab documentation
Compare GitLab plans
GitLab community forum
Contribute to GitLab
Provide feedback
Keyboard shortcuts
?
Snippets
Groups
Projects
Show more breadcrumbs
LBMC
Delattre
kmer diff
Commits
c6daedd9
Verified
Commit
c6daedd9
authored
Jun 6, 2023
by
Laurent Modolo
Browse files
Options
Downloads
Patches
Plain Diff
clustering: add roxygen
parent
1f87de79
No related branches found
No related tags found
No related merge requests found
Changes
1
Show whitespace changes
Inline
Side-by-side
Showing
1 changed file
src/clustering.Rmd
+236
-39
236 additions, 39 deletions
src/clustering.Rmd
with
236 additions
and
39 deletions
src/clustering.Rmd
+
236
−
39
View file @
c6daedd9
...
@@ -11,9 +11,9 @@ knitr::opts_chunk$set(echo = TRUE)
...
@@ -11,9 +11,9 @@ knitr::opts_chunk$set(echo = TRUE)
```{r}
```{r}
library(mclust)
library(tidyverse)
library(tidyverse)
library(mvtnorm)
library(mvtnorm)
library(conflicted)
```
```
## Simulation
## Simulation
...
@@ -34,6 +34,15 @@ Which becomes on the $log$ scale:
...
@@ -34,6 +34,15 @@ Which becomes on the $log$ scale:
Test slope for a given sigma matrice
Test slope for a given sigma matrice
```{r}
```{r}
#' Function to dislay the slope of a 2d normal distribution with covariance matrix
#' `matrix(c(x^2, rho*x*y, rho*x*y, y^2)`
#'
#' @param x A number.
#' @param y A number.
#' @param rho A number.
#' @return A ggplot plot
#' @examples
#' test_slope(1.05, 2.05, 0.95)
test_slope <- function(x, y, rho) {
test_slope <- function(x, y, rho) {
test <- mvtnorm::rmvnorm(1e4, mean = c(0, 0), sigma = matrix(c(x^2, rho*x*y, rho*x*y, y^2), ncol = 2), checkSymmetry = F, method = "svd") %>%
test <- mvtnorm::rmvnorm(1e4, mean = c(0, 0), sigma = matrix(c(x^2, rho*x*y, rho*x*y, y^2), ncol = 2), checkSymmetry = F, method = "svd") %>%
as_tibble()
as_tibble()
...
@@ -49,6 +58,16 @@ test_slope(1.05, 2.05, 0.95)
...
@@ -49,6 +58,16 @@ test_slope(1.05, 2.05, 0.95)
Simulate k-mer counts data
Simulate k-mer counts data
```{r}
```{r}
#' function to simulate kmer count under a gaussian model, with the correct
#' sex bias for XY or XO species
#'
#' @param n_kmer the total number of kmers
#' @param mean_coef the base mean of the kmers counts
#' @param sex = c("XY", "XO") the sex of the specie
#' @return a tibble with a c("sex, count_m, count_f) columns
#' @examples
#' sim_kmer(1e6, 1000, "XY")
#' sim_kmer(1e6, 1000, "XO")
sim_kmer <- function(n_kmer, mean_coef, sex = "XY") {
sim_kmer <- function(n_kmer, mean_coef, sex = "XY") {
data <- tibble(
data <- tibble(
sex = "F",
sex = "F",
...
@@ -78,6 +97,10 @@ sim_kmer <- function(n_kmer, mean_coef, sex = "XY") {
...
@@ -78,6 +97,10 @@ sim_kmer <- function(n_kmer, mean_coef, sex = "XY") {
rename(count_m = V1,
rename(count_m = V1,
count_f = V2)
count_f = V2)
}
}
```
## XY data
```{r}
data <- sim_kmer(1e4, 1000, "XY")
data <- sim_kmer(1e4, 1000, "XY")
data %>%
data %>%
ggplot(aes(x = count_m, y = count_f, color = sex)) +
ggplot(aes(x = count_m, y = count_f, color = sex)) +
...
@@ -85,16 +108,24 @@ data %>%
...
@@ -85,16 +108,24 @@ data %>%
coord_fixed()
coord_fixed()
```
```
## XO data
```{r}
```{r}
data_clust = data %>% select(-c("sex")) %>% mclust::Mclust(G = 3)
data <- sim_kmer(1e4, 1000, "XO")
```
data %>%
```{r}
ggplot(aes(x = count_m, y = count_f, color = sex)) +
summary(data_clust)
geom_point() +
plot(data_clust, what = "classification")
coord_fixed()
plot(data_clust, what = "uncertainty")
```
```
# Clustering
```{r}
```{r}
#' Function to transform GMM parameters theta format into a format easily iterable
#'
#' @param theta a list of paramaters
#' @param cluster_coef a list of cluster coeficient
#' @param sex = c("XY", "XO") the sex of the specie
#' @return a list of parameters
expand_theta <- function(theta, cluster_coef, sex) {
expand_theta <- function(theta, cluster_coef, sex) {
theta_ref <- list(
theta_ref <- list(
"a" = list(
"a" = list(
...
@@ -117,6 +148,12 @@ expand_theta <- function(theta, cluster_coef, sex) {
...
@@ -117,6 +148,12 @@ expand_theta <- function(theta, cluster_coef, sex) {
return(theta_ref)
return(theta_ref)
}
}
#' Compute the threshold to stop the EM algorithm
#'
#' @param old_theta list of paramters
#' @param theta list of paramters.
#' @param threshold threshold
#' @return bool
params_diff <- function(old_theta, theta, threshold) {
params_diff <- function(old_theta, theta, threshold) {
result <- max(
result <- max(
max(abs(old_theta$pi - theta$pi)),
max(abs(old_theta$pi - theta$pi)),
...
@@ -128,6 +165,13 @@ params_diff <- function(old_theta, theta, threshold) {
...
@@ -128,6 +165,13 @@ params_diff <- function(old_theta, theta, threshold) {
return(T)
return(T)
}
}
#' Compute the probability of a kmers to belong to a cluster
#'
#' @param x a matrix of male female kmer counts
#' @param theta list of parameters
#' @param cluster_coef, the base mean coefficients for the cluster
#' @param sex = c("XY", "XO") the sex of the specie
#' @return matrix of cluster probability
proba_point <- function(x, theta, cluster_coef, sex) {
proba_point <- function(x, theta, cluster_coef, sex) {
proba <- c()
proba <- c()
for (params in expand_theta(theta, cluster_coef, sex)) {
for (params in expand_theta(theta, cluster_coef, sex)) {
...
@@ -138,11 +182,27 @@ proba_point <- function(x, theta, cluster_coef, sex) {
...
@@ -138,11 +182,27 @@ proba_point <- function(x, theta, cluster_coef, sex) {
return(proba)
return(proba)
}
}
#' compute the log-likelihood of the model
#'
#' @param x a matrix of male female kmer counts
#' @param theta list of parameters
#' @param cluster_coef, the base mean coefficients for the cluster
#' @param sex = c("XY", "XO") the sex of the specie
#' @return the loglikelihood
#' @examples
#' add(1, 1)
#' add(10, 1)
loglik <- function(x, theta, cluster_coef, sex) {
loglik <- function(x, theta, cluster_coef, sex) {
sum(log(rowSums(proba_point(x, theta, cluster_coef, sex))))
sum(log(rowSums(proba_point(x, theta, cluster_coef, sex))))
}
}
# EM function
#' Compute the normalized probability of a kmers to belong to a cluster
#'
#' @param x a matrix of male female kmer counts
#' @param theta list of parameters
#' @param cluster_coef, the base mean coefficients for the cluster
#' @param sex = c("XY", "XO") the sex of the specie
#' @return matrix of cluster probability
E_proba <- function(x, theta, cluster_coef, sex) {
E_proba <- function(x, theta, cluster_coef, sex) {
proba <- proba_point(x, theta, cluster_coef, sex)
proba <- proba_point(x, theta, cluster_coef, sex)
proba_norm <- rowSums(proba)
proba_norm <- rowSums(proba)
...
@@ -153,11 +213,23 @@ E_proba <- function(x, theta, cluster_coef, sex) {
...
@@ -153,11 +213,23 @@ E_proba <- function(x, theta, cluster_coef, sex) {
return(proba)
return(proba)
}
}
#' Sum the proba of each points for each clusters
#'
#' @param proba a matrix of autosomal female and male chromosome probability
#' @return a vector
E_N_clust <- function(proba) {
E_N_clust <- function(proba) {
colSums(proba)
colSums(proba)
}
}
# Function for mean update
#' Maximize the base mean
#'
#' @param x a matrix of male female kmer counts
#' @param proba a matrix of autosomal female and male chromosome probability
#' @param mu, the base mean parameter
#' @param N_clust the size of the clusters
#' @param sex = c("XY", "XO") the sex of the specie
#' @return a mean mu
#' @examples
M_mean <- function(x, proba, N_clust, sex) {
M_mean <- function(x, proba, N_clust, sex) {
mu <- 0
mu <- 0
for (cluster in 1:ncol(proba)) {
for (cluster in 1:ncol(proba)) {
...
@@ -180,6 +252,15 @@ M_mean <- function(x, proba, N_clust, sex) {
...
@@ -180,6 +252,15 @@ M_mean <- function(x, proba, N_clust, sex) {
return(mu / 2)
return(mu / 2)
}
}
#' Maximize covariance matric parameters
#'
#' @param x a matrix of male female kmer counts
#' @param proba a matrix of autosomal female and male chromosome probability
#' @param mu, the base mean parameter
#' @param N_clust the size of the clusters
#' @param cluster_coef, the base mean coefficients for the cluster
#' @param sex = c("XY", "XO") the sex of the specie
#' @return a list of covarance matrices.
M_cov <- function(x, proba, mu, N_clust, cluster_coef, sex) {
M_cov <- function(x, proba, mu, N_clust, cluster_coef, sex) {
cov_clust <- list()
cov_clust <- list()
for (cluster in 1:ncol(proba)) {
for (cluster in 1:ncol(proba)) {
...
@@ -194,6 +275,15 @@ M_cov <- function(x, proba, mu, N_clust, cluster_coef, sex) {
...
@@ -194,6 +275,15 @@ M_cov <- function(x, proba, mu, N_clust, cluster_coef, sex) {
return(sigma)
return(sigma)
}
}
#' Plot the kmer counts and colorize by cluster proba
#'
#' @param x a matrix of male female kmer counts
#' @param proba a matrix of autosomal female and male chromosome probability
#' @param sex = c("XY", "XO") the sex of the specie
#' @return A number.
#' @examples
#' add(1, 1)
#' add(10, 1)
plot_proba <- function(x, proba, sex = "XY") {
plot_proba <- function(x, proba, sex = "XY") {
if (sex == "XY") {
if (sex == "XY") {
as_tibble(x) %>%
as_tibble(x) %>%
...
@@ -219,6 +309,11 @@ plot_proba <- function(x, proba, sex = "XY") {
...
@@ -219,6 +309,11 @@ plot_proba <- function(x, proba, sex = "XY") {
}
}
}
}
#' initialize EM parameters
#'
#' @param x a matrix of male female kmer counts
#' @param sex = c("XY", "XO") the sex of the specie
#' @return a list of parameters
init_param <- function(x, sex) {
init_param <- function(x, sex) {
cluster_coef <- list(
cluster_coef <- list(
"a" = c(2, 2),
"a" = c(2, 2),
...
@@ -239,6 +334,12 @@ init_param <- function(x, sex) {
...
@@ -239,6 +334,12 @@ init_param <- function(x, sex) {
return(list(cluster_coef = cluster_coef, theta = theta))
return(list(cluster_coef = cluster_coef, theta = theta))
}
}
#' compute the BIC of the GMM
#'
#' @param x A number.
#' @param loglik A number.
#' @param sex = c("XY", "XO") the sex of the specie
#' @return the BIC
compute_bic <- function(x, loglik, sex = "XY") {
compute_bic <- function(x, loglik, sex = "XY") {
k <- 1 + 4 * 2
k <- 1 + 4 * 2
if (sex == "YX") {
if (sex == "YX") {
...
@@ -247,7 +348,22 @@ compute_bic <- function(x, loglik, sex = "XY") {
...
@@ -247,7 +348,22 @@ compute_bic <- function(x, loglik, sex = "XY") {
return(k * log(nrow(x)) - 2 * loglik)
return(k * log(nrow(x)) - 2 * loglik)
}
}
#' Fix a XY or XO GMM on kmer counts
#'
#' @param x a matrix of male female counts
#' @param threshold the differences in loglik of the model to stop the EM
#' @param sex = c("XY", "XO") the sex of the specie
#' @return a list with model informations
#' @examples
#' data <- sim_kmer(1e2, 1000, "XY")
#' model_XY <- data %>%
#' dplyr::select(count_m, count_f) %>%
#' as.matrix() %>%
#' EM_clust()
#' model_XO <- data %>%
#' dplyr::select(count_m, count_f) %>%
#' as.matrix() %>%
#' EM_clust(sex = "XO")
EM_clust <- function(x, threshold = 1, sex = "XY") {
EM_clust <- function(x, threshold = 1, sex = "XY") {
old_loglik <- -Inf
old_loglik <- -Inf
new_loglik <- 0
new_loglik <- 0
...
@@ -264,32 +380,101 @@ EM_clust <- function(x, threshold = 1, sex = "XY") {
...
@@ -264,32 +380,101 @@ EM_clust <- function(x, threshold = 1, sex = "XY") {
break
break
}
}
}
}
return(list(proba = proba, theta = param$theta, loglik = new_loglik, BIC = compute_bic(x, new_loglik, sex)))
cluster <- proba %/% apply(proba, 1, max)
if (sex == "XY") {
cluster <- as.vector(cluster %*% c(1, 2, 3))
} else {
cluster <- as.vector(cluster %*% c(1, 2))
}
return(list(
proba = proba,
cluster = cluster,
theta = param$theta,
loglik = new_loglik,
BIC = compute_bic(x, new_loglik, sex),
XSS = BSS_WSS(x, cluster)
))
}
}
boostrap_BIC <- function(x, sex = "XY", threshold = 1, nboot = 100, bootsize = 1000, core = 6) {
#' Function to compile statistics bootstrap samples of the data
#'
#' @param x a matrix of male female kmers counts
#' @param sex = c("XY", "XO") the sex of the specie
#' @param threshold theshold to stop the EM algorithm
#' @param nboot number of boostrap sample
#' @param bootsize size of the boostrap sample (if < 0 we take a percentage of x)
#' @param core number of cpus to use for the computation
#' @return tibble of the statistics
#' @examples
#' data <- sim_kmer(1e7, 1000, "XY")
#' res <- boostrap_EM(data, nboot = 10, bootsize = 0.01)
boostrap_EM <- function(x, sex = "XY", threshold = 1, nboot = 100, bootsize = 1000, core = 6) {
parallel::mclapply(as.list(1:nboot), function(iter, x, bootsize, sex) {
parallel::mclapply(as.list(1:nboot), function(iter, x, bootsize, sex) {
if (bootsize > 1) {
res <- x %>%
res <- x %>%
dplyr::select(count_m, count_f) %>%
dplyr::select(count_m, count_f) %>%
sample_n(bootsize, replace = T) %>%
sample_n(bootsize, replace = T) %>%
as.matrix() %>%
as.matrix() %>%
EM_clust(sex = sex)
EM_clust(sex = sex)
res$BIC
} else {
}, x = x, bootsize = bootsize, sex = sex, mc.cores = 6) %>%
res <- x %>%
unlist()
dplyr::select(count_m, count_f) %>%
sample_frac(bootsize, replace = T) %>%
as.matrix() %>%
EM_clust(sex = sex)
}
}
if (sex == "XY") {
tibble(loglik = res$loglik, BIC = res$BIC, TSS = res$XSS$TSS, BSS = res$XSS$BSS, WSS_a = res$XSS$WSS[1], WSS_f = res$XSS$WSS[2], WSS_m = res$XSS$WSS[3])
} else {
compare_BIC <- function(x, threshold = 1, nboot = 100, bootsize = 1000, core = 6) {
tibble(loglik = res$loglik, BIC = res$BIC, TSS = res$XSS$TSS, BSS = res$XSS$BSS, WSS_a = res$XSS$WSS[1], WSS_f = res$XSS$WSS[2], WSS_m = NA)
}
}, x = x, bootsize = bootsize, sex = sex, mc.cores = 6)
}
#' Function to compile statistics differences between model XY and XO on boostrap
#' samples of the data
#'
#' @param x a matrix of male female kmers counts
#' @param threshold theshold to stop the EM algorithm
#' @param nboot number of boostrap sample
#' @param bootsize size of the boostrap sample (if < 0 we take a percentage of x)
#' @param core number of cpus to use for the computation
#' @param sex = c("XY", "XO") the sex of the specie
#' @return tibble with the statistics
#' @examples
#' data <- sim_kmer(1e7, 1000, "XY")
#' res <- compare_models(data, nboot = 10, bootsize = 0.01)
compare_models <- function(x, threshold = 1, nboot = 100, bootsize = 1000, core = 6) {
tibble(
tibble(
BIC_XY = boostrap_BIC(x, sex = "XY", threshold = threshold, nboot = nboot, bootsize = bootsize, core = core),
model_XY = boostrap_EM(x, sex = "XY", threshold = threshold, nboot = nboot, bootsize = bootsize, core = core),
BIC_XO = boostrap_BIC(x, sex = "X0", threshold = threshold, nboot = nboot, bootsize = bootsize, core = core)
model_XO = boostrap_EM(x, sex = "X0", threshold = threshold, nboot = nboot, bootsize = bootsize, core = core)
)
) %>%
pivot_longer(cols = c(model_XO, model_XY)) %>%
unnest(value)
}
#' Compute the BSS, WSS et TSS
#'
#' @param x matrix of data for the male and female counts
#' @param cluster a vector of cluster annotation (by row)
#' @param sex = c("XY", "XO") the sex of the specie
#' @return a list of metrics
#' @examples
#' add(1, 1)
#' add(10, 1)
BSS_WSS <- function(x, cluster) {
TSS <- sum((x - colMeans(x))^2)
WSS <- sapply(split(as_tibble(x), cluster), function(x) sum((x - colMeans(x))^2))
BSS <- TSS - sum(WSS)
return(list(TSS = TSS, WSS = WSS, BSS = BSS))
}
}
```
```
# clustering XY
# clustering XY
```{r}
```{r}
data <- sim_kmer(1e4, 1000, "XY")
model_XY <- data %>%
model_XY <- data %>%
dplyr::select(count_m, count_f) %>%
dplyr::select(count_m, count_f) %>%
as.matrix() %>%
as.matrix() %>%
...
@@ -303,6 +488,7 @@ data %>%
...
@@ -303,6 +488,7 @@ data %>%
# clustering XO
# clustering XO
```{r}
```{r}
data <- sim_kmer(1e4, 1000, "XO")
model_XO <- data %>%
model_XO <- data %>%
dplyr::select(count_m, count_f) %>%
dplyr::select(count_m, count_f) %>%
as.matrix() %>%
as.matrix() %>%
...
@@ -327,8 +513,7 @@ model_XO <- data %>%
...
@@ -327,8 +513,7 @@ model_XO <- data %>%
dplyr::select(count_m, count_f) %>%
dplyr::select(count_m, count_f) %>%
as.matrix() %>%
as.matrix() %>%
EM_clust(sex = "XO")
EM_clust(sex = "XO")
pchisq(-2 * (model_XY$loglik - model_XO$loglik), 5)
data <- sim_kmer(1e6, 1000, "XY")
```
```
## For XO
## For XO
...
@@ -343,26 +528,38 @@ model_XO <- data %>%
...
@@ -343,26 +528,38 @@ model_XO <- data %>%
dplyr::select(count_m, count_f) %>%
dplyr::select(count_m, count_f) %>%
as.matrix() %>%
as.matrix() %>%
EM_clust(sex = "XO")
EM_clust(sex = "XO")
pchisq(-2 * (model_XY$loglik - model_XO$loglik),
4
)
pchisq(-2 * (model_XY$loglik - model_XO$loglik),
5
)
```
```
## Get Y k-mer
# BIC analysis
## For XY
```{r}
```{r}
res <- compare_BIC(data)
data <- sim_kmer(1e7, 1000, "XY")
res <- compare_models(data, nboot = 10, bootsize = 0.01)
res %>%
res %>%
pivot_longer(cols = 1:2) %>%
ggplot(aes(x = name, y = BIC)) +
ggplot(aes(x = name, y = value)) +
geom_violin()
geom_violin()
res %>%
ggplot(aes(x = name, y = WSS_f / BSS)) +
geom_violin()
```
data %>%
## For XO
mutate(y_proba = model_XY$proba[,3]) %>%
ggplot(aes(x = count_m, count_f, color = y_proba)) +
```{r}
geom_point() +
data <- sim_kmer(1e7, 1000, "XO")
theme_bw()
res <- compare_models(data, nboot = 10, bootsize = 0.01)
res %>%
ggplot(aes(x = name, y = BIC)) +
geom_violin()
res %>%
ggplot(aes(x = name, y = WSS_f / BSS)) +
geom_violin()
```
```
#
# With real data
# With real data
```{r}
```{r}
data <- read_tsv("results/12/mbelari/mbelari.csv", show_col_types = FALSE)
data <- read_tsv("results/12/mbelari/mbelari.csv", show_col_types = FALSE)
...
...
This diff is collapsed.
Click to expand it.
Preview
0%
Loading
Try again
or
attach a new file
.
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Save comment
Cancel
Please
register
or
sign in
to comment