diff --git a/src/clustering.Rmd b/src/clustering.Rmd
index 4a77beb83693b03c912cc7a0345d89e00f0c828e..e4fef2d34b569b7fdfe3575b157c174359161467 100644
--- a/src/clustering.Rmd
+++ b/src/clustering.Rmd
@@ -275,5 +275,93 @@ data %>%
 ## With real data
 
 ```{r}
-data <- read_csv("../results/fusion.csv")
+# Load the k-mer table and report its in-memory size.
+# NOTE(review): read_tsv() is applied to a `.csv` path - presumably the
+# file is tab-separated; confirm.
+data <- read_tsv(
+  "../results/12/mbelari/mbelari.csv",
+  show_col_types = FALSE
+)
+format(object.size(data), units = "Mb")
+```
+
+```{r}
+# Build one row per (specie, sex) whose `files` list-column holds the
+# `.csv` file names derived from the paths listed in sample.csv.
+annotation <- read_csv("../data/sample.csv", show_col_types = FALSE) %>%
+  pivot_longer(!c(sex, specie), names_to = "read", values_to = "file") %>%
+  mutate(
+    # Drop the directory prefix, then the `.fasta.gz` extension.
+    file = gsub(
+      "/scratch/Bio/lmodolo/kmer_diff/data/.*/", "", file,
+      perl = TRUE
+    ),
+    file = gsub("\\.fasta\\.gz", "", file, perl = TRUE)
+  ) %>%
+  mutate(
+    file = paste0(file, ".csv")
+  ) %>%
+  select(!c(read)) %>%
+  group_by(specie, sex) %>%
+  nest(.key = "files")
+```
+
+```{r}
+# For every specie, average the columns of `data` named after the female
+# files and after the male files, keeping one row per k-mer.
+count <- annotation %>%
+  group_by(specie) %>%
+  nest(.key = "sex") %>%
+  mutate(count = lapply(sex, function(files, data) {
+    files_f <- files %>%
+      filter(sex == "female") %>%
+      unnest(files) %>%
+      pull(file) %>%
+      as.vector()
+    files_m <- files %>%
+      filter(sex == "male") %>%
+      unnest(files) %>%
+      pull(file) %>%
+      as.vector()
+    # any_of() ignores file names that are not columns of `data`.
+    data %>%
+      select(kmer) %>%
+      mutate(
+        female = data %>% select(any_of(files_f)) %>% rowMeans(),
+        male = data %>% select(any_of(files_m)) %>% rowMeans()
+      )
+  }, data = data)) %>%
+  unnest(sex) %>%
+  unnest(count)
+save(count, file = "../results/12/mbelari/counts.Rdata")
+```
+
+
+## M belari data
+
+```{r}
+# Mean count per k-mer over the female and the male columns.
+# NOTE(review): `mb_f` and `mb_m` are not defined in this hunk -
+# presumably vectors of column names set earlier; confirm.
+mb_data <- data %>%
+  select(kmer) %>%
+  mutate(
+    female = data %>% select(any_of(mb_f)) %>% rowMeans(),
+    male = data %>% select(any_of(mb_m)) %>% rowMeans()
+  )
+save(mb_data, file = "../results/mb_data.Rdata")
+```
+
+```{r}
+load("../results/mb_data.Rdata")
+```
+
+```{r}
+# Plot a random 10% of the rows: log1p(male) against log1p(female).
+mb_data %>%
+  sample_frac(0.1) %>%
+  ggplot(aes(x = log1p(male), y = log1p(female))) +
+  geom_point() +
+  coord_fixed() +
+  theme_bw()
 ```
\ No newline at end of file