Skip to content
Snippets Groups Projects
Verified Commit 1f87de79 authored by Laurent Modolo's avatar Laurent Modolo
Browse files

clustering: add boostrap for BIC

parent f9cc4071
Branches
No related tags found
No related merge requests found
...@@ -95,7 +95,6 @@ plot(data_clust, what = "uncertainty") ...@@ -95,7 +95,6 @@ plot(data_clust, what = "uncertainty")
``` ```
```{r} ```{r}
expand_theta <- function(theta, cluster_coef, sex) { expand_theta <- function(theta, cluster_coef, sex) {
theta_ref <- list( theta_ref <- list(
"a" = list( "a" = list(
...@@ -267,6 +266,25 @@ EM_clust <- function(x, threshold = 1, sex = "XY") { ...@@ -267,6 +266,25 @@ EM_clust <- function(x, threshold = 1, sex = "XY") {
} }
return(list(proba = proba, theta = param$theta, loglik = new_loglik, BIC = compute_bic(x, new_loglik, sex))) return(list(proba = proba, theta = param$theta, loglik = new_loglik, BIC = compute_bic(x, new_loglik, sex)))
} }
boostrap_BIC <- function(x, sex = "XY", threshold = 1, nboot = 100, bootsize = 1000, core = 6) {
parallel::mclapply(as.list(1:nboot), function(iter, x, bootsize, sex) {
res <- x %>%
dplyr::select(count_m, count_f) %>%
sample_n(bootsize, replace = T) %>%
as.matrix() %>%
EM_clust(sex = sex)
res$BIC
}, x = x, bootsize = bootsize, sex = sex, mc.cores = 6) %>%
unlist()
}
compare_BIC <- function(x, threshold = 1, nboot = 100, bootsize = 1000, core = 6) {
tibble(
BIC_XY = boostrap_BIC(x, sex = "XY", threshold = threshold, nboot = nboot, bootsize = bootsize, core = core),
BIC_XO = boostrap_BIC(x, sex = "X0", threshold = threshold, nboot = nboot, bootsize = bootsize, core = core)
)
}
``` ```
# clustering XY # clustering XY
...@@ -309,7 +327,8 @@ model_XO <- data %>% ...@@ -309,7 +327,8 @@ model_XO <- data %>%
dplyr::select(count_m, count_f) %>% dplyr::select(count_m, count_f) %>%
as.matrix() %>% as.matrix() %>%
EM_clust(sex = "XO") EM_clust(sex = "XO")
pchisq(-2 * (model_XY$loglik - model_XO$loglik), 4)
data <- sim_kmer(1e6, 1000, "XY")
``` ```
## For XO ## For XO
...@@ -330,6 +349,12 @@ pchisq(-2 * (model_XY$loglik - model_XO$loglik), 4) ...@@ -330,6 +349,12 @@ pchisq(-2 * (model_XY$loglik - model_XO$loglik), 4)
## Get Y k-mer ## Get Y k-mer
```{r} ```{r}
res <- compare_BIC(data)
res %>%
pivot_longer(cols = 1:2) %>%
ggplot(aes(x = name, y = value)) +
geom_violin()
data %>% data %>%
mutate(y_proba = model_XY$proba[,3]) %>% mutate(y_proba = model_XY$proba[,3]) %>%
ggplot(aes(x = count_m, count_f, color = y_proba)) + ggplot(aes(x = count_m, count_f, color = y_proba)) +
...@@ -340,11 +365,11 @@ data %>% ...@@ -340,11 +365,11 @@ data %>%
## With real data ## With real data
```{r} ```{r}
data <- read_tsv("../results/12/mbelari/mbelari.csv", show_col_types = FALSE) data <- read_tsv("results/12/mbelari/mbelari.csv", show_col_types = FALSE)
format(object.size(data), units = "Mb") format(object.size(data), units = "Mb")
``` ```
```{r} ```{r}
annotation <- read_csv("../data/sample.csv", show_col_types = FALSE) %>% annotation <- read_csv("data/sample.csv", show_col_types = FALSE) %>%
pivot_longer(!c(sex, specie), names_to = "read", values_to = "file") %>% pivot_longer(!c(sex, specie), names_to = "read", values_to = "file") %>%
mutate( mutate(
file = gsub("/scratch/Bio/lmodolo/kmer_diff/data/.*/", "", file, perl = T), file = gsub("/scratch/Bio/lmodolo/kmer_diff/data/.*/", "", file, perl = T),
...@@ -374,31 +399,35 @@ count <- annotation %>% ...@@ -374,31 +399,35 @@ count <- annotation %>%
}, data = data)) %>% }, data = data)) %>%
unnest(sex) %>% unnest(sex) %>%
unnest(count) unnest(count)
save(count, file = "../results/12/mbelari/counts.Rdata") save(count, file = "results/12/mbelari/counts.Rdata")
``` ```
## M belari data ## M belari data
```{r} ```{r}
mb_data <- data %>% load("results/12/mbelari/counts.Rdata")
select(kmer) %>%
mutate(
female = data %>% select(any_of(mb_f)) %>% rowMeans(),
male = data %>% select(any_of(mb_m)) %>% rowMeans()
)
save(mb_data, file = "../results/mb_data.Rdata")
``` ```
```{r} ```{r}
load("../results/mb_data.Rdata") s_count <- count %>%
``` ungroup() %>%
sample_frac(0.01) %>%
dplyr::select(male, female) %>%
mutate(
count_m = log1p(male),
count_f = log1p(female)
)
model_XY <- s_count %>%
as.matrix() %>%
EM_clust()
model_XO <- s_count %>%
as.matrix() %>%
EM_clust(sex = "XO")
model_XO$BIC
model_XY$BIC
```{r} s_count %>%
mb_data %>% as.matrix() %>%
sample_frac(0.1) %>% plot_proba(model_XO$proba, sex = "XO")
ggplot(aes(x = log1p(male), y = log1p(female))) +
geom_point() +
coord_fixed() +
theme_bw()
``` ```
\ No newline at end of file
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Please register or to comment