diff --git a/src/clustering.Rmd b/src/clustering.Rmd index d6a2af52690eee6e441e6e416f2fd4e3d5587150..467c2f85b06f33f793fbc99b1362204c5f9145fa 100644 --- a/src/clustering.Rmd +++ b/src/clustering.Rmd @@ -95,7 +95,6 @@ plot(data_clust, what = "uncertainty") ``` ```{r} - expand_theta <- function(theta, cluster_coef, sex) { theta_ref <- list( "a" = list( @@ -267,6 +266,25 @@ EM_clust <- function(x, threshold = 1, sex = "XY") { } return(list(proba = proba, theta = param$theta, loglik = new_loglik, BIC = compute_bic(x, new_loglik, sex))) } + +boostrap_BIC <- function(x, sex = "XY", threshold = 1, nboot = 100, bootsize = 1000, core = 6) { + parallel::mclapply(as.list(1:nboot), function(iter, x, bootsize, sex) { + res <- x %>% + dplyr::select(count_m, count_f) %>% + sample_n(bootsize, replace = T) %>% + as.matrix() %>% + EM_clust(sex = sex) + res$BIC + }, x = x, bootsize = bootsize, sex = sex, mc.cores = 6) %>% + unlist() +} + +compare_BIC <- function(x, threshold = 1, nboot = 100, bootsize = 1000, core = 6) { + tibble( + BIC_XY = boostrap_BIC(x, sex = "XY", threshold = threshold, nboot = nboot, bootsize = bootsize, core = core), + BIC_XO = boostrap_BIC(x, sex = "X0", threshold = threshold, nboot = nboot, bootsize = bootsize, core = core) + ) +} ``` # clustering XY @@ -309,7 +327,8 @@ model_XO <- data %>% dplyr::select(count_m, count_f) %>% as.matrix() %>% EM_clust(sex = "XO") -pchisq(-2 * (model_XY$loglik - model_XO$loglik), 4) + +data <- sim_kmer(1e6, 1000, "XY") ``` ## For XO @@ -330,6 +349,12 @@ pchisq(-2 * (model_XY$loglik - model_XO$loglik), 4) ## Get Y k-mer ```{r} +res <- compare_BIC(data) +res %>% + pivot_longer(cols = 1:2) %>% + ggplot(aes(x = name, y = value)) + + geom_violin() + data %>% mutate(y_proba = model_XY$proba[,3]) %>% ggplot(aes(x = count_m, count_f, color = y_proba)) + @@ -340,11 +365,11 @@ data %>% ## With real data ```{r} -data <- read_tsv("../results/12/mbelari/mbelari.csv", show_col_types = FALSE) +data <- read_tsv("results/12/mbelari/mbelari.csv", show_col_types = FALSE) format(object.size(data), units = "Mb") ``` ```{r} -annotation <- read_csv("../data/sample.csv", show_col_types = FALSE) %>% +annotation <- read_csv("data/sample.csv", show_col_types = FALSE) %>% pivot_longer(!c(sex, specie), names_to = "read", values_to = "file") %>% mutate( file = gsub("/scratch/Bio/lmodolo/kmer_diff/data/.*/", "", file, perl = T), @@ -374,31 +399,35 @@ count <- annotation %>% }, data = data)) %>% unnest(sex) %>% unnest(count) -save(count, file = "../results/12/mbelari/counts.Rdata") +save(count, file = "results/12/mbelari/counts.Rdata") ``` ## M belari data ```{r} -mb_data <- data %>% - select(kmer) %>% - mutate( - female = data %>% select(any_of(mb_f)) %>% rowMeans(), - male = data %>% select(any_of(mb_m)) %>% rowMeans() - ) -save(mb_data, file = "../results/mb_data.Rdata") +load("results/12/mbelari/counts.Rdata") ``` ```{r} -load("../results/mb_data.Rdata") -``` +s_count <- count %>% + ungroup() %>% + sample_frac(0.01) %>% + dplyr::select(male, female) %>% + mutate( + count_m = log1p(male), + count_f = log1p(female) + ) +model_XY <- s_count %>% + as.matrix() %>% + EM_clust() +model_XO <- s_count %>% + as.matrix() %>% + EM_clust(sex = "XO") +model_XO$BIC +model_XY$BIC -```{r} -mb_data %>% - sample_frac(0.1) %>% - ggplot(aes(x = log1p(male), y = log1p(female))) + - geom_point() + - coord_fixed() + - theme_bw() +s_count %>% + as.matrix() %>% + plot_proba(model_XO$proba, sex = "XO") ``` \ No newline at end of file