diff --git a/src/clustering.Rmd b/src/clustering.Rmd
index 60c80324df7ab3f4f0dd5c27856b600d75044238..eae4a0dbe744ade50ecb8dc6c55fd157cf89f8c2 100644
--- a/src/clustering.Rmd
+++ b/src/clustering.Rmd
@@ -265,5 +265,65 @@ data %>%
 ## With real data
 
 ```{r}
-data <- read_csv("../results/fusion.csv")
+data <- read_tsv(file = "../results/12/mbelari/mbelari.csv", show_col_types = FALSE)
+data %>% object.size() %>% format(units = "Mb") # in-memory size of `data`
+```
+```{r}
+# One row per (specie, sex); `files` nests that group's `file` values.
+annotation <- read_csv("../data/sample.csv", show_col_types = FALSE) %>%
+  pivot_longer(!c(sex, specie), names_to = "read", values_to = "file") %>%
+  mutate(
+    # Drop the directory prefix and the .fasta.gz suffix, then append .csv.
+    file = gsub("/scratch/Bio/lmodolo/kmer_diff/data/.*/", "", file, perl = TRUE),
+    file = gsub("\\.fasta\\.gz", "", file, perl = TRUE),
+    file = paste0(file, ".csv")
+  ) %>%
+  select(!c(read)) %>%
+  group_by(specie, sex) %>%
+  nest(.key = "files")
+```
+
+```{r}
+# Per specie, average the k-mer counts over the female and male file columns.
+count <- annotation %>%
+  group_by(specie) %>%
+  nest(.key = "sex") %>%
+  mutate(count = lapply(sex, function(by_sex, kmers) {
+    cols_f <- by_sex %>% filter(sex == "female") %>% unnest(files) %>% pull(file)
+    cols_m <- by_sex %>% filter(sex == "male") %>% unnest(files) %>% pull(file)
+    kmers %>% select(kmer) %>% mutate(
+      female = rowMeans(select(kmers, any_of(cols_f))),
+      male = rowMeans(select(kmers, any_of(cols_m)))
+    )
+  }, kmers = data)) %>%
+  # NOTE(review): unnest(sex) repeats each k-mer row per sex row — intended?
+  unnest(sex) %>%
+  unnest(count)
+save(count, file = "../results/12/mbelari/counts.Rdata")
+```
+
+
+## M. belari data
+
+```{r}
+# Mean k-mer count per sex over the columns named in mb_f / mb_m.
+mb_data <- mutate(
+  select(data, kmer),
+  female = rowMeans(select(data, any_of(mb_f))),
+  male = rowMeans(select(data, any_of(mb_m)))
+)
+save(mb_data, file = "../results/mb_data.Rdata")
+```
+
+```{r}
+load("../results/mb_data.Rdata") # restores the object(s) saved at this path
+```
+
+```{r}
+# Scatter a random 10% of the rows: log1p(male) against log1p(female).
+ggplot(
+  sample_frac(mb_data, 0.1),
+  aes(x = log1p(male), y = log1p(female))
+) +
+  geom_point() + coord_fixed() + theme_bw()
 ```
\ No newline at end of file