Skip to content
Snippets Groups Projects
Verified Commit 1f87de79 authored by Laurent Modolo's avatar Laurent Modolo
Browse files

clustering: add boostrap for BIC

parent f9cc4071
No related branches found
No related tags found
No related merge requests found
......@@ -95,7 +95,6 @@ plot(data_clust, what = "uncertainty")
```
```{r}
expand_theta <- function(theta, cluster_coef, sex) {
theta_ref <- list(
"a" = list(
......@@ -267,6 +266,25 @@ EM_clust <- function(x, threshold = 1, sex = "XY") {
}
return(list(proba = proba, theta = param$theta, loglik = new_loglik, BIC = compute_bic(x, new_loglik, sex)))
}
boostrap_BIC <- function(x, sex = "XY", threshold = 1, nboot = 100, bootsize = 1000, core = 6) {
parallel::mclapply(as.list(1:nboot), function(iter, x, bootsize, sex) {
res <- x %>%
dplyr::select(count_m, count_f) %>%
sample_n(bootsize, replace = T) %>%
as.matrix() %>%
EM_clust(sex = sex)
res$BIC
}, x = x, bootsize = bootsize, sex = sex, mc.cores = 6) %>%
unlist()
}
compare_BIC <- function(x, threshold = 1, nboot = 100, bootsize = 1000, core = 6) {
tibble(
BIC_XY = boostrap_BIC(x, sex = "XY", threshold = threshold, nboot = nboot, bootsize = bootsize, core = core),
BIC_XO = boostrap_BIC(x, sex = "X0", threshold = threshold, nboot = nboot, bootsize = bootsize, core = core)
)
}
```
# clustering XY
......@@ -309,7 +327,8 @@ model_XO <- data %>%
dplyr::select(count_m, count_f) %>%
as.matrix() %>%
EM_clust(sex = "XO")
pchisq(-2 * (model_XY$loglik - model_XO$loglik), 4)
data <- sim_kmer(1e6, 1000, "XY")
```
## For XO
......@@ -330,6 +349,12 @@ pchisq(-2 * (model_XY$loglik - model_XO$loglik), 4)
## Get Y k-mer
```{r}
res <- compare_BIC(data)
res %>%
pivot_longer(cols = 1:2) %>%
ggplot(aes(x = name, y = value)) +
geom_violin()
data %>%
mutate(y_proba = model_XY$proba[,3]) %>%
ggplot(aes(x = count_m, count_f, color = y_proba)) +
......@@ -340,11 +365,11 @@ data %>%
## With real data
```{r}
data <- read_tsv("../results/12/mbelari/mbelari.csv", show_col_types = FALSE)
data <- read_tsv("results/12/mbelari/mbelari.csv", show_col_types = FALSE)
format(object.size(data), units = "Mb")
```
```{r}
annotation <- read_csv("../data/sample.csv", show_col_types = FALSE) %>%
annotation <- read_csv("data/sample.csv", show_col_types = FALSE) %>%
pivot_longer(!c(sex, specie), names_to = "read", values_to = "file") %>%
mutate(
file = gsub("/scratch/Bio/lmodolo/kmer_diff/data/.*/", "", file, perl = T),
......@@ -374,31 +399,35 @@ count <- annotation %>%
}, data = data)) %>%
unnest(sex) %>%
unnest(count)
save(count, file = "../results/12/mbelari/counts.Rdata")
save(count, file = "results/12/mbelari/counts.Rdata")
```
## M belari data
```{r}
mb_data <- data %>%
select(kmer) %>%
mutate(
female = data %>% select(any_of(mb_f)) %>% rowMeans(),
male = data %>% select(any_of(mb_m)) %>% rowMeans()
)
save(mb_data, file = "../results/mb_data.Rdata")
load("results/12/mbelari/counts.Rdata")
```
```{r}
load("../results/mb_data.Rdata")
```
s_count <- count %>%
ungroup() %>%
sample_frac(0.01) %>%
dplyr::select(male, female) %>%
mutate(
count_m = log1p(male),
count_f = log1p(female)
)
model_XY <- s_count %>%
as.matrix() %>%
EM_clust()
model_XO <- s_count %>%
as.matrix() %>%
EM_clust(sex = "XO")
model_XO$BIC
model_XY$BIC
```{r}
mb_data %>%
sample_frac(0.1) %>%
ggplot(aes(x = log1p(male), y = log1p(female))) +
geom_point() +
coord_fixed() +
theme_bw()
s_count %>%
as.matrix() %>%
plot_proba(model_XO$proba, sex = "XO")
```
\ No newline at end of file
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Please register or to comment