Skip to content
GitLab
Explore
Sign in
Primary navigation
Search or go to…
Project
K
kmer diff
Manage
Activity
Members
Labels
Plan
Issues
Issue boards
Milestones
Wiki
Code
Merge requests
Repository
Branches
Commits
Tags
Repository graph
Compare revisions
Snippets
Build
Pipelines
Jobs
Pipeline schedules
Artifacts
Deploy
Releases
Package Registry
Model registry
Operate
Environments
Terraform modules
Monitor
Incidents
Analyze
Value stream analytics
Contributor analytics
CI/CD analytics
Repository analytics
Model experiments
Help
Help
Support
GitLab documentation
Compare GitLab plans
Community forum
Contribute to GitLab
Provide feedback
Keyboard shortcuts
?
Snippets
Groups
Projects
Show more breadcrumbs
LBMC
Delattre
kmer diff
Commits
13c5b824
Verified
Commit
13c5b824
authored
1 year ago
by
Laurent Modolo
Browse files
Options
Downloads
Patches
Plain Diff
create new repository for clustering
parent
c6daedd9
No related branches found
No related tags found
No related merge requests found
Changes
1
Hide whitespace changes
Inline
Side-by-side
Showing
1 changed file
src/clustering.Rmd
+0
-630
0 additions, 630 deletions
src/clustering.Rmd
with
0 additions
and
630 deletions
src/clustering.Rmd
deleted
100644 → 0
+
0
−
630
View file @
c6daedd9
---
title: "kmer clustering"
author: "Laurent Modolo"
date: "`r Sys.Date()`"
output: html_document
---
```{r setup, include=FALSE}
knitr::opts_chunk$set(echo = TRUE)
```
```{r}
library(tidyverse)
library(mvtnorm)
library(conflicted)
```
## Simulation
If we expect to have with $X_f$ the female k-mer count and $X_m$ the male k-mer count, the following relation for a XY species:
- For the autosomal chromosomes: $X_m = X_f$
- For the X chromosomes: $X_m = 2 X_m$
- For the Y chromosomes: $X_m = 0^+ X_f$
Which becomes on the $log$ scale:
- For the autosomal chromosomes: $\log(X_m) = \log(X_f)$
- For the X chromosomes: $\log(X_m) = \log(2) + \log(X_m)$
- For the Y chromosomes: $\log(X_m) = log(0^+) + log(X_f)$
Test slope for a given sigma matrice
```{r}
#' Function to dislay the slope of a 2d normal distribution with covariance matrix
#' `matrix(c(x^2, rho*x*y, rho*x*y, y^2)`
#'
#' @param x A number.
#' @param y A number.
#' @param rho A number.
#' @return A ggplot plot
#' @examples
#' test_slope(1.05, 2.05, 0.95)
test_slope <- function(x, y, rho) {
test <- mvtnorm::rmvnorm(1e4, mean = c(0, 0), sigma = matrix(c(x^2, rho*x*y, rho*x*y, y^2), ncol = 2), checkSymmetry = F, method = "svd") %>%
as_tibble()
ggplot(data = test, aes(x = V1, y = V2)) +
geom_point() +
geom_smooth(method = lm) +
labs(title = lm(test$V2 ~ test$V1)$coef[2]) +
coord_fixed()
}
test_slope(1.05, 2.05, 0.95)
```
Simulate k-mer counts data
```{r}
#' function to simulate kmer count under a gaussian model, with the correct
#' sex bias for XY or XO species
#'
#' @param n_kmer the total number of kmers
#' @param mean_coef the base mean of the kmers counts
#' @param sex = c("XY", "XO") the sex of the specie
#' @return a tibble with a c("sex, count_m, count_f) columns
#' @examples
#' sim_kmer(1e6, 1000, "XY")
#' sim_kmer(1e6, 1000, "XO")
sim_kmer <- function(n_kmer, mean_coef, sex = "XY") {
data <- tibble(
sex = "F",
count = mvtnorm::rmvnorm(n_kmer * .1, mean = c(1, 2)*mean_coef, sigma = matrix(c(1.05, 2, 2, 4.05) * mean_coef^1.5, ncol = 2), checkSymmetry = F, method = "svd") %>%
as_tibble()
) %>%
unnest(count) %>%
bind_rows(
tibble(
sex = "A",
count = mvtnorm::rmvnorm(n_kmer * 0.85, mean = c(2, 2)*mean_coef, sigma = matrix(c(1, .95, .95, 1) * mean_coef^1.5 * 4, ncol = 2), method = "svd")
%>% as_tibble()
) %>% unnest(count)
)
if (sex == "XY") {
data <- data %>%
bind_rows(
tibble(
sex = "M",
count = mvtnorm::rmvnorm(n_kmer * .05, mean = c(1, 0)*mean_coef, sigma = matrix(c(.9, .05, .05, .05) * mean_coef^1.5, ncol = 2), method = "svd")
%>% as_tibble()
) %>%
unnest(count)
)
}
data %>%
rename(count_m = V1,
count_f = V2)
}
```
## XY data
```{r}
data <- sim_kmer(1e4, 1000, "XY")
data %>%
ggplot(aes(x = count_m, y = count_f, color = sex)) +
geom_point() +
coord_fixed()
```
## XO data
```{r}
data <- sim_kmer(1e4, 1000, "XO")
data %>%
ggplot(aes(x = count_m, y = count_f, color = sex)) +
geom_point() +
coord_fixed()
```
# Clustering
```{r}
#' Function to transform GMM parameters theta format into a format easily iterable
#'
#' @param theta a list of paramaters
#' @param cluster_coef a list of cluster coeficient
#' @param sex = c("XY", "XO") the sex of the specie
#' @return a list of parameters
expand_theta <- function(theta, cluster_coef, sex) {
theta_ref <- list(
"a" = list(
"pi" = theta$pi[1],
"mu" = cluster_coef$a * theta$mu,
"sigma" = theta$sigma$a
),
"f" = list(
"pi" = theta$pi[2],
"mu" = cluster_coef$f * theta$mu,
"sigma" = theta$sigma$f
))
if (sex == "XY") {
theta_ref[["m"]] <- list(
"pi" = theta$pi[3],
"mu" = cluster_coef$m * theta$mu,
"sigma" = theta$sigma$m
)
}
return(theta_ref)
}
#' Compute the threshold to stop the EM algorithm
#'
#' @param old_theta list of paramters
#' @param theta list of paramters.
#' @param threshold threshold
#' @return bool
params_diff <- function(old_theta, theta, threshold) {
result <- max(
max(abs(old_theta$pi - theta$pi)),
max(abs(old_theta$mu - theta$mu))
)
if (is.finite(result)) {
return(results > threshold)
}
return(T)
}
#' Compute the probability of a kmers to belong to a cluster
#'
#' @param x a matrix of male female kmer counts
#' @param theta list of parameters
#' @param cluster_coef, the base mean coefficients for the cluster
#' @param sex = c("XY", "XO") the sex of the specie
#' @return matrix of cluster probability
proba_point <- function(x, theta, cluster_coef, sex) {
proba <- c()
for (params in expand_theta(theta, cluster_coef, sex)) {
proba <- cbind(proba, params$pi *
mvtnorm::dmvnorm(x, mean = params$mu, sigma = params$sigma)
)
}
return(proba)
}
#' compute the log-likelihood of the model
#'
#' @param x a matrix of male female kmer counts
#' @param theta list of parameters
#' @param cluster_coef, the base mean coefficients for the cluster
#' @param sex = c("XY", "XO") the sex of the specie
#' @return the loglikelihood
#' @examples
#' add(1, 1)
#' add(10, 1)
loglik <- function(x, theta, cluster_coef, sex) {
sum(log(rowSums(proba_point(x, theta, cluster_coef, sex))))
}
#' Compute the normalized probability of a kmers to belong to a cluster
#'
#' @param x a matrix of male female kmer counts
#' @param theta list of parameters
#' @param cluster_coef, the base mean coefficients for the cluster
#' @param sex = c("XY", "XO") the sex of the specie
#' @return matrix of cluster probability
E_proba <- function(x, theta, cluster_coef, sex) {
proba <- proba_point(x, theta, cluster_coef, sex)
proba_norm <- rowSums(proba)
for (cluster in 1:ncol(proba)) {
proba[, cluster] <- proba[, cluster] / proba_norm
proba[proba_norm == 0, cluster] <- 1 / ncol(proba)
}
return(proba)
}
#' Sum the proba of each points for each clusters
#'
#' @param proba a matrix of autosomal female and male chromosome probability
#' @return a vector
E_N_clust <- function(proba) {
colSums(proba)
}
#' Maximize the base mean
#'
#' @param x a matrix of male female kmer counts
#' @param proba a matrix of autosomal female and male chromosome probability
#' @param mu, the base mean parameter
#' @param N_clust the size of the clusters
#' @param sex = c("XY", "XO") the sex of the specie
#' @return a mean mu
#' @examples
M_mean <- function(x, proba, N_clust, sex) {
mu <- 0
for (cluster in 1:ncol(proba)) {
if (cluster == 1) {
mu <- mu +
mean(colSums(x * c(0.5, 0.5) * proba[, cluster]) / N_clust[cluster])
}
if (cluster == 2) {
mu <- mu +
mean(colSums(x * c(1, 0.5) * proba[, cluster]) / N_clust[cluster])
}
if (cluster == 3) {
mu <- mu +
(colSums(x * c(1, 0) * proba[, cluster]) / N_clust[cluster])[1]
}
}
if (sex == "XY") {
return(mu / 3)
}
return(mu / 2)
}
#' Maximize covariance matric parameters
#'
#' @param x a matrix of male female kmer counts
#' @param proba a matrix of autosomal female and male chromosome probability
#' @param mu, the base mean parameter
#' @param N_clust the size of the clusters
#' @param cluster_coef, the base mean coefficients for the cluster
#' @param sex = c("XY", "XO") the sex of the specie
#' @return a list of covarance matrices.
M_cov <- function(x, proba, mu, N_clust, cluster_coef, sex) {
cov_clust <- list()
for (cluster in 1:ncol(proba)) {
cov_clust[[cluster]] <- t(proba[, cluster] * (x - mu * cluster_coef[[cluster]])) %*% (x - mu * cluster_coef[[cluster]]) / N_clust[cluster]
}
sigma <- list()
sigma$a <- cov_clust[[1]]
sigma$f <- cov_clust[[2]]
if (sex == "XY") {
sigma$m <- cov_clust[[3]]
}
return(sigma)
}
#' Plot the kmer counts and colorize by cluster proba
#'
#' @param x a matrix of male female kmer counts
#' @param proba a matrix of autosomal female and male chromosome probability
#' @param sex = c("XY", "XO") the sex of the specie
#' @return A number.
#' @examples
#' add(1, 1)
#' add(10, 1)
plot_proba <- function(x, proba, sex = "XY") {
if (sex == "XY") {
as_tibble(x) %>%
mutate(
proba_a = proba[, 1],
proba_f = proba[, 2],
proba_m = proba[, 3],
clust_proba = rgb(proba_f, proba_m, proba_a, maxColorValue = 1)
) %>%
ggplot(aes(x = count_m, y = count_f, color = clust_proba)) +
geom_point() +
scale_color_identity()
} else {
as_tibble(x) %>%
mutate(
proba_a = proba[, 1],
proba_f = proba[, 2],
clust_proba = rgb(proba_f, 0, proba_a, maxColorValue = 1)
) %>%
ggplot(aes(x = count_m, y = count_f, color = clust_proba)) +
geom_point() +
scale_color_identity()
}
}
#' initialize EM parameters
#'
#' @param x a matrix of male female kmer counts
#' @param sex = c("XY", "XO") the sex of the specie
#' @return a list of parameters
init_param <- function(x, sex) {
cluster_coef <- list(
"a" = c(2, 2),
"f" = c(1, 2)
)
theta <- list(
"pi" = c(.85, .1, .05),
"mu" = mean(colMeans(x)) * .5
)
theta$sigma <- list(
"a" = matrix(c(1, 1, 1, 1) * theta$mu, ncol = 2),
"f" = matrix(c(1, 1, 1, 1) * theta$mu, ncol = 2)
)
if (sex == "XY") {
cluster_coef$m <- c(1, 0)
theta$sigma$m <- matrix(c(1, 1, 1, 1) * theta$mu, ncol = 2)
}
return(list(cluster_coef = cluster_coef, theta = theta))
}
#' compute the BIC of the GMM
#'
#' @param x A number.
#' @param loglik A number.
#' @param sex = c("XY", "XO") the sex of the specie
#' @return the BIC
compute_bic <- function(x, loglik, sex = "XY") {
k <- 1 + 4 * 2
if (sex == "YX") {
k <- k + 4
}
return(k * log(nrow(x)) - 2 * loglik)
}
#' Fix a XY or XO GMM on kmer counts
#'
#' @param x a matrix of male female counts
#' @param threshold the differences in loglik of the model to stop the EM
#' @param sex = c("XY", "XO") the sex of the specie
#' @return a list with model informations
#' @examples
#' data <- sim_kmer(1e2, 1000, "XY")
#' model_XY <- data %>%
#' dplyr::select(count_m, count_f) %>%
#' as.matrix() %>%
#' EM_clust()
#' model_XO <- data %>%
#' dplyr::select(count_m, count_f) %>%
#' as.matrix() %>%
#' EM_clust(sex = "XO")
EM_clust <- function(x, threshold = 1, sex = "XY") {
old_loglik <- -Inf
new_loglik <- 0
param <- init_param(x, sex)
while (abs(new_loglik - old_loglik) > threshold) {
old_loglik <- loglik(x, param$theta, param$cluster_coef, sex)
proba <- E_proba(x, param$theta, param$cluster_coef, sex)
param$theta$pi <- E_N_clust(proba)
param$theta$mu <- M_mean(x, proba, param$theta$pi, sex)
param$theta$sigma <- M_cov(x, proba, param$theta$mu, param$theta$pi, param$cluster_coef, sex)
param$theta$pi <- param$theta$pi / nrow(x)
new_loglik <- loglik(x, param$theta, param$cluster_coef, sex)
if (is.infinite(new_loglik)) {
break
}
}
cluster <- proba %/% apply(proba, 1, max)
if (sex == "XY") {
cluster <- as.vector(cluster %*% c(1, 2, 3))
} else {
cluster <- as.vector(cluster %*% c(1, 2))
}
return(list(
proba = proba,
cluster = cluster,
theta = param$theta,
loglik = new_loglik,
BIC = compute_bic(x, new_loglik, sex),
XSS = BSS_WSS(x, cluster)
))
}
#' Function to compile statistics bootstrap samples of the data
#'
#' @param x a matrix of male female kmers counts
#' @param sex = c("XY", "XO") the sex of the specie
#' @param threshold theshold to stop the EM algorithm
#' @param nboot number of boostrap sample
#' @param bootsize size of the boostrap sample (if < 0 we take a percentage of x)
#' @param core number of cpus to use for the computation
#' @return tibble of the statistics
#' @examples
#' data <- sim_kmer(1e7, 1000, "XY")
#' res <- boostrap_EM(data, nboot = 10, bootsize = 0.01)
boostrap_EM <- function(x, sex = "XY", threshold = 1, nboot = 100, bootsize = 1000, core = 6) {
parallel::mclapply(as.list(1:nboot), function(iter, x, bootsize, sex) {
if (bootsize > 1) {
res <- x %>%
dplyr::select(count_m, count_f) %>%
sample_n(bootsize, replace = T) %>%
as.matrix() %>%
EM_clust(sex = sex)
} else {
res <- x %>%
dplyr::select(count_m, count_f) %>%
sample_frac(bootsize, replace = T) %>%
as.matrix() %>%
EM_clust(sex = sex)
}
if (sex == "XY") {
tibble(loglik = res$loglik, BIC = res$BIC, TSS = res$XSS$TSS, BSS = res$XSS$BSS, WSS_a = res$XSS$WSS[1], WSS_f = res$XSS$WSS[2], WSS_m = res$XSS$WSS[3])
} else {
tibble(loglik = res$loglik, BIC = res$BIC, TSS = res$XSS$TSS, BSS = res$XSS$BSS, WSS_a = res$XSS$WSS[1], WSS_f = res$XSS$WSS[2], WSS_m = NA)
}
}, x = x, bootsize = bootsize, sex = sex, mc.cores = 6)
}
#' Function to compile statistics differences between model XY and XO on boostrap
#' samples of the data
#'
#' @param x a matrix of male female kmers counts
#' @param threshold theshold to stop the EM algorithm
#' @param nboot number of boostrap sample
#' @param bootsize size of the boostrap sample (if < 0 we take a percentage of x)
#' @param core number of cpus to use for the computation
#' @param sex = c("XY", "XO") the sex of the specie
#' @return tibble with the statistics
#' @examples
#' data <- sim_kmer(1e7, 1000, "XY")
#' res <- compare_models(data, nboot = 10, bootsize = 0.01)
compare_models <- function(x, threshold = 1, nboot = 100, bootsize = 1000, core = 6) {
tibble(
model_XY = boostrap_EM(x, sex = "XY", threshold = threshold, nboot = nboot, bootsize = bootsize, core = core),
model_XO = boostrap_EM(x, sex = "X0", threshold = threshold, nboot = nboot, bootsize = bootsize, core = core)
) %>%
pivot_longer(cols = c(model_XO, model_XY)) %>%
unnest(value)
}
#' Compute the BSS, WSS et TSS
#'
#' @param x matrix of data for the male and female counts
#' @param cluster a vector of cluster annotation (by row)
#' @param sex = c("XY", "XO") the sex of the specie
#' @return a list of metrics
#' @examples
#' add(1, 1)
#' add(10, 1)
BSS_WSS <- function(x, cluster) {
TSS <- sum((x - colMeans(x))^2)
WSS <- sapply(split(as_tibble(x), cluster), function(x) sum((x - colMeans(x))^2))
BSS <- TSS - sum(WSS)
return(list(TSS = TSS, WSS = WSS, BSS = BSS))
}
```
# clustering XY
```{r}
data <- sim_kmer(1e4, 1000, "XY")
model_XY <- data %>%
dplyr::select(count_m, count_f) %>%
as.matrix() %>%
EM_clust()
data %>%
dplyr::select(count_m, count_f) %>%
as.matrix() %>%
plot_proba(model_XY$proba)
```
# clustering XO
```{r}
data <- sim_kmer(1e4, 1000, "XO")
model_XO <- data %>%
dplyr::select(count_m, count_f) %>%
as.matrix() %>%
EM_clust(sex = "X0")
data %>%
dplyr::select(count_m, count_f) %>%
as.matrix() %>%
plot_proba(model_XO$proba, sex = "X0")
```
# LRT
## For XY
```{r}
data <- sim_kmer(1e2, 1000, "XY")
model_XY <- data %>%
dplyr::select(count_m, count_f) %>%
as.matrix() %>%
EM_clust()
model_XO <- data %>%
dplyr::select(count_m, count_f) %>%
as.matrix() %>%
EM_clust(sex = "XO")
pchisq(-2 * (model_XY$loglik - model_XO$loglik), 5)
```
## For XO
```{r}
data <- sim_kmer(1e2, 1000, "XO")
model_XY <- data %>%
dplyr::select(count_m, count_f) %>%
as.matrix() %>%
EM_clust()
model_XO <- data %>%
dplyr::select(count_m, count_f) %>%
as.matrix() %>%
EM_clust(sex = "XO")
pchisq(-2 * (model_XY$loglik - model_XO$loglik), 5)
```
# BIC analysis
## For XY
```{r}
data <- sim_kmer(1e7, 1000, "XY")
res <- compare_models(data, nboot = 10, bootsize = 0.01)
res %>%
ggplot(aes(x = name, y = BIC)) +
geom_violin()
res %>%
ggplot(aes(x = name, y = WSS_f / BSS)) +
geom_violin()
```
## For XO
```{r}
data <- sim_kmer(1e7, 1000, "XO")
res <- compare_models(data, nboot = 10, bootsize = 0.01)
res %>%
ggplot(aes(x = name, y = BIC)) +
geom_violin()
res %>%
ggplot(aes(x = name, y = WSS_f / BSS)) +
geom_violin()
```
# With real data
```{r}
data <- read_tsv("results/12/mbelari/mbelari.csv", show_col_types = FALSE)
format(object.size(data), units = "Mb")
```
```{r}
annotation <- read_csv("data/sample.csv", show_col_types = FALSE) %>%
pivot_longer(!c(sex, specie), names_to = "read", values_to = "file") %>%
mutate(
file = gsub("/scratch/Bio/lmodolo/kmer_diff/data/.*/", "", file, perl = T),
file = gsub("\\.fasta\\.gz", "", file, perl = T)
) %>%
mutate(
file = paste0(file, ".csv")
) %>%
select(!c(read)) %>%
group_by(specie, sex) %>%
nest(.key = "files")
```
```{r}
count <- annotation %>%
group_by(specie) %>%
nest(.key = "sex") %>%
mutate(count = lapply(sex, function(files, data){
files_f <- files %>% filter(sex == "female") %>% unnest(files) %>% pull(file) %>% as.vector()
files_m <- files %>% filter(sex == "male") %>% unnest(files) %>% pull(file) %>% as.vector()
data %>%
select(kmer) %>%
mutate(
female = data %>% select(any_of(files_f)) %>% rowMeans(),
male = data %>% select(any_of(files_m)) %>% rowMeans()
)
}, data = data)) %>%
unnest(sex) %>%
unnest(count)
save(count, file = "results/12/mbelari/counts.Rdata")
```
## M belari data
```{r}
load("results/12/mbelari/counts.Rdata")
```
```{r}
s_count <- count %>%
ungroup() %>%
sample_frac(0.01) %>%
dplyr::select(male, female) %>%
mutate(
count_m = log1p(male),
count_f = log1p(female)
)
model_XY <- s_count %>%
as.matrix() %>%
EM_clust()
model_XO <- s_count %>%
as.matrix() %>%
EM_clust(sex = "XO")
model_XO$BIC
model_XY$BIC
s_count %>%
as.matrix() %>%
plot_proba(model_XO$proba, sex = "XO")
```
\ No newline at end of file
This diff is collapsed.
Click to expand it.
Preview
0%
Loading
Try again
or
attach a new file
.
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Save comment
Cancel
Please
register
or
sign in
to comment