Skip to content
Snippets Groups Projects
Verified Commit 13c5b824 authored by Laurent Modolo's avatar Laurent Modolo
Browse files

create new repository for clustering

parent c6daedd9
No related branches found
No related tags found
No related merge requests found
---
title: "kmer clustering"
author: "Laurent Modolo"
date: "`r Sys.Date()`"
output: html_document
---
```{r setup, include=FALSE}
knitr::opts_chunk$set(echo = TRUE)
```
```{r}
library(tidyverse)
library(mvtnorm)
library(conflicted)
```
## Simulation
If we expect to have with $X_f$ the female k-mer count and $X_m$ the male k-mer count, the following relation for a XY species:
- For the autosomal chromosomes: $X_m = X_f$
- For the X chromosomes: $X_m = 2 X_m$
- For the Y chromosomes: $X_m = 0^+ X_f$
Which becomes on the $log$ scale:
- For the autosomal chromosomes: $\log(X_m) = \log(X_f)$
- For the X chromosomes: $\log(X_m) = \log(2) + \log(X_m)$
- For the Y chromosomes: $\log(X_m) = log(0^+) + log(X_f)$
Test slope for a given sigma matrice
```{r}
#' Function to dislay the slope of a 2d normal distribution with covariance matrix
#' `matrix(c(x^2, rho*x*y, rho*x*y, y^2)`
#'
#' @param x A number.
#' @param y A number.
#' @param rho A number.
#' @return A ggplot plot
#' @examples
#' test_slope(1.05, 2.05, 0.95)
test_slope <- function(x, y, rho) {
test <- mvtnorm::rmvnorm(1e4, mean = c(0, 0), sigma = matrix(c(x^2, rho*x*y, rho*x*y, y^2), ncol = 2), checkSymmetry = F, method = "svd") %>%
as_tibble()
ggplot(data = test, aes(x = V1, y = V2)) +
geom_point() +
geom_smooth(method = lm) +
labs(title = lm(test$V2 ~ test$V1)$coef[2]) +
coord_fixed()
}
test_slope(1.05, 2.05, 0.95)
```
Simulate k-mer counts data
```{r}
#' function to simulate kmer count under a gaussian model, with the correct
#' sex bias for XY or XO species
#'
#' @param n_kmer the total number of kmers
#' @param mean_coef the base mean of the kmers counts
#' @param sex = c("XY", "XO") the sex of the specie
#' @return a tibble with a c("sex, count_m, count_f) columns
#' @examples
#' sim_kmer(1e6, 1000, "XY")
#' sim_kmer(1e6, 1000, "XO")
sim_kmer <- function(n_kmer, mean_coef, sex = "XY") {
data <- tibble(
sex = "F",
count = mvtnorm::rmvnorm(n_kmer * .1, mean = c(1, 2)*mean_coef, sigma = matrix(c(1.05, 2, 2, 4.05) * mean_coef^1.5, ncol = 2), checkSymmetry = F, method = "svd") %>%
as_tibble()
) %>%
unnest(count) %>%
bind_rows(
tibble(
sex = "A",
count = mvtnorm::rmvnorm(n_kmer * 0.85, mean = c(2, 2)*mean_coef, sigma = matrix(c(1, .95, .95, 1) * mean_coef^1.5 * 4, ncol = 2), method = "svd")
%>% as_tibble()
) %>% unnest(count)
)
if (sex == "XY") {
data <- data %>%
bind_rows(
tibble(
sex = "M",
count = mvtnorm::rmvnorm(n_kmer * .05, mean = c(1, 0)*mean_coef, sigma = matrix(c(.9, .05, .05, .05) * mean_coef^1.5, ncol = 2), method = "svd")
%>% as_tibble()
) %>%
unnest(count)
)
}
data %>%
rename(count_m = V1,
count_f = V2)
}
```
## XY data
```{r}
data <- sim_kmer(1e4, 1000, "XY")
data %>%
ggplot(aes(x = count_m, y = count_f, color = sex)) +
geom_point() +
coord_fixed()
```
## XO data
```{r}
data <- sim_kmer(1e4, 1000, "XO")
data %>%
ggplot(aes(x = count_m, y = count_f, color = sex)) +
geom_point() +
coord_fixed()
```
# Clustering
```{r}
#' Function to transform GMM parameters theta format into a format easily iterable
#'
#' @param theta a list of paramaters
#' @param cluster_coef a list of cluster coeficient
#' @param sex = c("XY", "XO") the sex of the specie
#' @return a list of parameters
expand_theta <- function(theta, cluster_coef, sex) {
theta_ref <- list(
"a" = list(
"pi" = theta$pi[1],
"mu" = cluster_coef$a * theta$mu,
"sigma" = theta$sigma$a
),
"f" = list(
"pi" = theta$pi[2],
"mu" = cluster_coef$f * theta$mu,
"sigma" = theta$sigma$f
))
if (sex == "XY") {
theta_ref[["m"]] <- list(
"pi" = theta$pi[3],
"mu" = cluster_coef$m * theta$mu,
"sigma" = theta$sigma$m
)
}
return(theta_ref)
}
#' Compute the threshold to stop the EM algorithm
#'
#' @param old_theta list of paramters
#' @param theta list of paramters.
#' @param threshold threshold
#' @return bool
params_diff <- function(old_theta, theta, threshold) {
result <- max(
max(abs(old_theta$pi - theta$pi)),
max(abs(old_theta$mu - theta$mu))
)
if (is.finite(result)) {
return(results > threshold)
}
return(T)
}
#' Compute the probability of a kmers to belong to a cluster
#'
#' @param x a matrix of male female kmer counts
#' @param theta list of parameters
#' @param cluster_coef, the base mean coefficients for the cluster
#' @param sex = c("XY", "XO") the sex of the specie
#' @return matrix of cluster probability
proba_point <- function(x, theta, cluster_coef, sex) {
proba <- c()
for (params in expand_theta(theta, cluster_coef, sex)) {
proba <- cbind(proba, params$pi *
mvtnorm::dmvnorm(x, mean = params$mu, sigma = params$sigma)
)
}
return(proba)
}
#' compute the log-likelihood of the model
#'
#' @param x a matrix of male female kmer counts
#' @param theta list of parameters
#' @param cluster_coef, the base mean coefficients for the cluster
#' @param sex = c("XY", "XO") the sex of the specie
#' @return the loglikelihood
#' @examples
#' add(1, 1)
#' add(10, 1)
loglik <- function(x, theta, cluster_coef, sex) {
sum(log(rowSums(proba_point(x, theta, cluster_coef, sex))))
}
#' Compute the normalized probability of a kmers to belong to a cluster
#'
#' @param x a matrix of male female kmer counts
#' @param theta list of parameters
#' @param cluster_coef, the base mean coefficients for the cluster
#' @param sex = c("XY", "XO") the sex of the specie
#' @return matrix of cluster probability
E_proba <- function(x, theta, cluster_coef, sex) {
proba <- proba_point(x, theta, cluster_coef, sex)
proba_norm <- rowSums(proba)
for (cluster in 1:ncol(proba)) {
proba[, cluster] <- proba[, cluster] / proba_norm
proba[proba_norm == 0, cluster] <- 1 / ncol(proba)
}
return(proba)
}
#' Sum the proba of each points for each clusters
#'
#' @param proba a matrix of autosomal female and male chromosome probability
#' @return a vector
E_N_clust <- function(proba) {
colSums(proba)
}
#' Maximize the base mean
#'
#' @param x a matrix of male female kmer counts
#' @param proba a matrix of autosomal female and male chromosome probability
#' @param mu, the base mean parameter
#' @param N_clust the size of the clusters
#' @param sex = c("XY", "XO") the sex of the specie
#' @return a mean mu
#' @examples
M_mean <- function(x, proba, N_clust, sex) {
mu <- 0
for (cluster in 1:ncol(proba)) {
if (cluster == 1) {
mu <- mu +
mean(colSums(x * c(0.5, 0.5) * proba[, cluster]) / N_clust[cluster])
}
if (cluster == 2) {
mu <- mu +
mean(colSums(x * c(1, 0.5) * proba[, cluster]) / N_clust[cluster])
}
if (cluster == 3) {
mu <- mu +
(colSums(x * c(1, 0) * proba[, cluster]) / N_clust[cluster])[1]
}
}
if (sex == "XY") {
return(mu / 3)
}
return(mu / 2)
}
#' Maximize covariance matric parameters
#'
#' @param x a matrix of male female kmer counts
#' @param proba a matrix of autosomal female and male chromosome probability
#' @param mu, the base mean parameter
#' @param N_clust the size of the clusters
#' @param cluster_coef, the base mean coefficients for the cluster
#' @param sex = c("XY", "XO") the sex of the specie
#' @return a list of covarance matrices.
M_cov <- function(x, proba, mu, N_clust, cluster_coef, sex) {
cov_clust <- list()
for (cluster in 1:ncol(proba)) {
cov_clust[[cluster]] <- t(proba[, cluster] * (x - mu * cluster_coef[[cluster]])) %*% (x - mu * cluster_coef[[cluster]]) / N_clust[cluster]
}
sigma <- list()
sigma$a <- cov_clust[[1]]
sigma$f <- cov_clust[[2]]
if (sex == "XY") {
sigma$m <- cov_clust[[3]]
}
return(sigma)
}
#' Plot the kmer counts and colorize by cluster proba
#'
#' @param x a matrix of male female kmer counts
#' @param proba a matrix of autosomal female and male chromosome probability
#' @param sex = c("XY", "XO") the sex of the specie
#' @return A number.
#' @examples
#' add(1, 1)
#' add(10, 1)
plot_proba <- function(x, proba, sex = "XY") {
if (sex == "XY") {
as_tibble(x) %>%
mutate(
proba_a = proba[, 1],
proba_f = proba[, 2],
proba_m = proba[, 3],
clust_proba = rgb(proba_f, proba_m, proba_a, maxColorValue = 1)
) %>%
ggplot(aes(x = count_m, y = count_f, color = clust_proba)) +
geom_point() +
scale_color_identity()
} else {
as_tibble(x) %>%
mutate(
proba_a = proba[, 1],
proba_f = proba[, 2],
clust_proba = rgb(proba_f, 0, proba_a, maxColorValue = 1)
) %>%
ggplot(aes(x = count_m, y = count_f, color = clust_proba)) +
geom_point() +
scale_color_identity()
}
}
#' initialize EM parameters
#'
#' @param x a matrix of male female kmer counts
#' @param sex = c("XY", "XO") the sex of the specie
#' @return a list of parameters
init_param <- function(x, sex) {
cluster_coef <- list(
"a" = c(2, 2),
"f" = c(1, 2)
)
theta <- list(
"pi" = c(.85, .1, .05),
"mu" = mean(colMeans(x)) * .5
)
theta$sigma <- list(
"a" = matrix(c(1, 1, 1, 1) * theta$mu, ncol = 2),
"f" = matrix(c(1, 1, 1, 1) * theta$mu, ncol = 2)
)
if (sex == "XY") {
cluster_coef$m <- c(1, 0)
theta$sigma$m <- matrix(c(1, 1, 1, 1) * theta$mu, ncol = 2)
}
return(list(cluster_coef = cluster_coef, theta = theta))
}
#' compute the BIC of the GMM
#'
#' @param x A number.
#' @param loglik A number.
#' @param sex = c("XY", "XO") the sex of the specie
#' @return the BIC
compute_bic <- function(x, loglik, sex = "XY") {
k <- 1 + 4 * 2
if (sex == "YX") {
k <- k + 4
}
return(k * log(nrow(x)) - 2 * loglik)
}
#' Fix a XY or XO GMM on kmer counts
#'
#' @param x a matrix of male female counts
#' @param threshold the differences in loglik of the model to stop the EM
#' @param sex = c("XY", "XO") the sex of the specie
#' @return a list with model informations
#' @examples
#' data <- sim_kmer(1e2, 1000, "XY")
#' model_XY <- data %>%
#' dplyr::select(count_m, count_f) %>%
#' as.matrix() %>%
#' EM_clust()
#' model_XO <- data %>%
#' dplyr::select(count_m, count_f) %>%
#' as.matrix() %>%
#' EM_clust(sex = "XO")
EM_clust <- function(x, threshold = 1, sex = "XY") {
old_loglik <- -Inf
new_loglik <- 0
param <- init_param(x, sex)
while (abs(new_loglik - old_loglik) > threshold) {
old_loglik <- loglik(x, param$theta, param$cluster_coef, sex)
proba <- E_proba(x, param$theta, param$cluster_coef, sex)
param$theta$pi <- E_N_clust(proba)
param$theta$mu <- M_mean(x, proba, param$theta$pi, sex)
param$theta$sigma <- M_cov(x, proba, param$theta$mu, param$theta$pi, param$cluster_coef, sex)
param$theta$pi <- param$theta$pi / nrow(x)
new_loglik <- loglik(x, param$theta, param$cluster_coef, sex)
if (is.infinite(new_loglik)) {
break
}
}
cluster <- proba %/% apply(proba, 1, max)
if (sex == "XY") {
cluster <- as.vector(cluster %*% c(1, 2, 3))
} else {
cluster <- as.vector(cluster %*% c(1, 2))
}
return(list(
proba = proba,
cluster = cluster,
theta = param$theta,
loglik = new_loglik,
BIC = compute_bic(x, new_loglik, sex),
XSS = BSS_WSS(x, cluster)
))
}
#' Function to compile statistics bootstrap samples of the data
#'
#' @param x a matrix of male female kmers counts
#' @param sex = c("XY", "XO") the sex of the specie
#' @param threshold theshold to stop the EM algorithm
#' @param nboot number of boostrap sample
#' @param bootsize size of the boostrap sample (if < 0 we take a percentage of x)
#' @param core number of cpus to use for the computation
#' @return tibble of the statistics
#' @examples
#' data <- sim_kmer(1e7, 1000, "XY")
#' res <- boostrap_EM(data, nboot = 10, bootsize = 0.01)
boostrap_EM <- function(x, sex = "XY", threshold = 1, nboot = 100, bootsize = 1000, core = 6) {
parallel::mclapply(as.list(1:nboot), function(iter, x, bootsize, sex) {
if (bootsize > 1) {
res <- x %>%
dplyr::select(count_m, count_f) %>%
sample_n(bootsize, replace = T) %>%
as.matrix() %>%
EM_clust(sex = sex)
} else {
res <- x %>%
dplyr::select(count_m, count_f) %>%
sample_frac(bootsize, replace = T) %>%
as.matrix() %>%
EM_clust(sex = sex)
}
if (sex == "XY") {
tibble(loglik = res$loglik, BIC = res$BIC, TSS = res$XSS$TSS, BSS = res$XSS$BSS, WSS_a = res$XSS$WSS[1], WSS_f = res$XSS$WSS[2], WSS_m = res$XSS$WSS[3])
} else {
tibble(loglik = res$loglik, BIC = res$BIC, TSS = res$XSS$TSS, BSS = res$XSS$BSS, WSS_a = res$XSS$WSS[1], WSS_f = res$XSS$WSS[2], WSS_m = NA)
}
}, x = x, bootsize = bootsize, sex = sex, mc.cores = 6)
}
#' Function to compile statistics differences between model XY and XO on boostrap
#' samples of the data
#'
#' @param x a matrix of male female kmers counts
#' @param threshold theshold to stop the EM algorithm
#' @param nboot number of boostrap sample
#' @param bootsize size of the boostrap sample (if < 0 we take a percentage of x)
#' @param core number of cpus to use for the computation
#' @param sex = c("XY", "XO") the sex of the specie
#' @return tibble with the statistics
#' @examples
#' data <- sim_kmer(1e7, 1000, "XY")
#' res <- compare_models(data, nboot = 10, bootsize = 0.01)
compare_models <- function(x, threshold = 1, nboot = 100, bootsize = 1000, core = 6) {
tibble(
model_XY = boostrap_EM(x, sex = "XY", threshold = threshold, nboot = nboot, bootsize = bootsize, core = core),
model_XO = boostrap_EM(x, sex = "X0", threshold = threshold, nboot = nboot, bootsize = bootsize, core = core)
) %>%
pivot_longer(cols = c(model_XO, model_XY)) %>%
unnest(value)
}
#' Compute the BSS, WSS et TSS
#'
#' @param x matrix of data for the male and female counts
#' @param cluster a vector of cluster annotation (by row)
#' @param sex = c("XY", "XO") the sex of the specie
#' @return a list of metrics
#' @examples
#' add(1, 1)
#' add(10, 1)
BSS_WSS <- function(x, cluster) {
TSS <- sum((x - colMeans(x))^2)
WSS <- sapply(split(as_tibble(x), cluster), function(x) sum((x - colMeans(x))^2))
BSS <- TSS - sum(WSS)
return(list(TSS = TSS, WSS = WSS, BSS = BSS))
}
```
# clustering XY
```{r}
data <- sim_kmer(1e4, 1000, "XY")
model_XY <- data %>%
dplyr::select(count_m, count_f) %>%
as.matrix() %>%
EM_clust()
data %>%
dplyr::select(count_m, count_f) %>%
as.matrix() %>%
plot_proba(model_XY$proba)
```
# clustering XO
```{r}
data <- sim_kmer(1e4, 1000, "XO")
model_XO <- data %>%
dplyr::select(count_m, count_f) %>%
as.matrix() %>%
EM_clust(sex = "X0")
data %>%
dplyr::select(count_m, count_f) %>%
as.matrix() %>%
plot_proba(model_XO$proba, sex = "X0")
```
# LRT
## For XY
```{r}
data <- sim_kmer(1e2, 1000, "XY")
model_XY <- data %>%
dplyr::select(count_m, count_f) %>%
as.matrix() %>%
EM_clust()
model_XO <- data %>%
dplyr::select(count_m, count_f) %>%
as.matrix() %>%
EM_clust(sex = "XO")
pchisq(-2 * (model_XY$loglik - model_XO$loglik), 5)
```
## For XO
```{r}
data <- sim_kmer(1e2, 1000, "XO")
model_XY <- data %>%
dplyr::select(count_m, count_f) %>%
as.matrix() %>%
EM_clust()
model_XO <- data %>%
dplyr::select(count_m, count_f) %>%
as.matrix() %>%
EM_clust(sex = "XO")
pchisq(-2 * (model_XY$loglik - model_XO$loglik), 5)
```
# BIC analysis
## For XY
```{r}
data <- sim_kmer(1e7, 1000, "XY")
res <- compare_models(data, nboot = 10, bootsize = 0.01)
res %>%
ggplot(aes(x = name, y = BIC)) +
geom_violin()
res %>%
ggplot(aes(x = name, y = WSS_f / BSS)) +
geom_violin()
```
## For XO
```{r}
data <- sim_kmer(1e7, 1000, "XO")
res <- compare_models(data, nboot = 10, bootsize = 0.01)
res %>%
ggplot(aes(x = name, y = BIC)) +
geom_violin()
res %>%
ggplot(aes(x = name, y = WSS_f / BSS)) +
geom_violin()
```
# With real data
```{r}
data <- read_tsv("results/12/mbelari/mbelari.csv", show_col_types = FALSE)
format(object.size(data), units = "Mb")
```
```{r}
annotation <- read_csv("data/sample.csv", show_col_types = FALSE) %>%
pivot_longer(!c(sex, specie), names_to = "read", values_to = "file") %>%
mutate(
file = gsub("/scratch/Bio/lmodolo/kmer_diff/data/.*/", "", file, perl = T),
file = gsub("\\.fasta\\.gz", "", file, perl = T)
) %>%
mutate(
file = paste0(file, ".csv")
) %>%
select(!c(read)) %>%
group_by(specie, sex) %>%
nest(.key = "files")
```
```{r}
count <- annotation %>%
group_by(specie) %>%
nest(.key = "sex") %>%
mutate(count = lapply(sex, function(files, data){
files_f <- files %>% filter(sex == "female") %>% unnest(files) %>% pull(file) %>% as.vector()
files_m <- files %>% filter(sex == "male") %>% unnest(files) %>% pull(file) %>% as.vector()
data %>%
select(kmer) %>%
mutate(
female = data %>% select(any_of(files_f)) %>% rowMeans(),
male = data %>% select(any_of(files_m)) %>% rowMeans()
)
}, data = data)) %>%
unnest(sex) %>%
unnest(count)
save(count, file = "results/12/mbelari/counts.Rdata")
```
## M belari data
```{r}
load("results/12/mbelari/counts.Rdata")
```
```{r}
s_count <- count %>%
ungroup() %>%
sample_frac(0.01) %>%
dplyr::select(male, female) %>%
mutate(
count_m = log1p(male),
count_f = log1p(female)
)
model_XY <- s_count %>%
as.matrix() %>%
EM_clust()
model_XO <- s_count %>%
as.matrix() %>%
EM_clust(sex = "XO")
model_XO$BIC
model_XY$BIC
s_count %>%
as.matrix() %>%
plot_proba(model_XO$proba, sex = "XO")
```
\ No newline at end of file
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment