From f0e1cbc9e35b897afb1010af3b90c3d48b78ca7a Mon Sep 17 00:00:00 2001
From: Laurent Modolo <laurent@modolo.fr>
Date: Fri, 30 Oct 2020 17:35:54 +0100
Subject: [PATCH] 0_1_Clone_size.R: add boostrapp test for clone diversity

---
 src/0_1_Clone_size.R | 273 +++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 273 insertions(+)

diff --git a/src/0_1_Clone_size.R b/src/0_1_Clone_size.R
index abb68aa..558c7a7 100644
--- a/src/0_1_Clone_size.R
+++ b/src/0_1_Clone_size.R
@@ -4,6 +4,7 @@ library("tidyverse")
 library("readxl")
 library("vegan")
 library("broom")
+library("pbmcapply")
 theme_set(theme_classic())
 pal = "Set1"
 scale_colour_discrete <-  function(palname=pal, ...){
@@ -391,3 +392,275 @@ fish_plot(data, timepoints=c(15,136,593), "Donor A A2", min_size = function(x){a
 data <- clone_size(clone=clone, select_donor="Donor D", select_antigen="A2") %>%
   select(D15, D90, D720)
 fish_plot(data, timepoints=c(15,90,720), "Donor D A2", min_size = function(x){any(x > 3)})
+
+
+library(lme4)
+
+# fisher alpha subsampling experiment
+data <- clone %>% 
+  mutate(day = fct_reorder(day, as.numeric(as.vector(day)))) %>% 
+  group_by(donor, day, antigen) %>% 
+  select(-percent) %>% 
+  nest() %>% 
+  mutate(alpha = lapply(data, function(x){
+      n_sample <- 100
+      tibble(
+        sampling = seq(from = 0.1, to = 1, by = 0.1) %>% rep(n_sample),
+        sample = rep(1:n_sample, each = 10)
+        ) %>%
+        mutate(
+          alpha = lapply(sampling, function(y, x){
+            x %>%
+            pull(n) %>%
+            sample(round(length(.) * y)) %>%
+            fisherfit(.) %>%
+            .$estimate
+          }, x = x) %>% unlist()
+        )
+    })
+  ) %>% 
+  unnest(alpha) %>% 
+  mutate(
+    sample = as.factor(sample),
+    percent = sampling * 100
+  ) %>% 
+  filter(alpha <= 2.0e9) %>% 
+  ggplot() +
+  geom_line(aes(
+    x = alpha,
+    y = percent,
+    color = day,
+    group = str_c(sample, day)
+    ),
+    alpha = 0.1
+  ) +
+  geom_smooth(aes(
+    x = alpha,
+    y = percent,
+    color = day,
+    group = day
+    ),
+    se = F
+  ) +
+  facet_wrap(~donor + antigen, scales = "free")
+  
+
+# abs number of cells
+clone %>%
+  mutate(day = fct_reorder(day, as.numeric(as.vector(day)))) %>% 
+  select(-percent) %>% 
+  group_by(donor, antigen, day) %>% 
+  nest() %>% 
+  mutate(alpha = pbmcapply::pbmclapply(data, function(data){
+      n_sample <- 10
+      tibble(
+        sampling = seq(
+          from = 20,
+          to = 2000,
+          length.out = 20) %>% rep(n_sample),
+        sample = rep(1:n_sample, each = 20)
+        ) %>%
+        mutate(
+          alpha = lapply(sampling, function(sampling, data){
+            data %>%
+            pull(n) %>%
+            sample(round(sampling), replace = T) %>%
+            fisherfit(.) %>%
+            .$estimate
+          }, data = data) %>% unlist()
+        )
+    },
+    mc.cores = 10,
+    ignore.interactive = T )) %>% 
+  unnest(alpha) %>% 
+  mutate(
+    sample = as.factor(sample),
+    n_cell = sampling
+  ) %>% 
+  filter(alpha <= 2.0e9) %>% 
+  ggplot() +
+  geom_line(aes(
+    x = alpha,
+    y = n_cell,
+    color = day,
+    group = str_c(sample, day)
+    ),
+    alpha = 0.1
+  ) +
+  geom_smooth(aes(
+    x = alpha,
+    y = n_cell,
+    color = day,
+    group = day
+    ),
+    se = F
+  ) +
+  facet_wrap(~donor + antigen, scales = "free")
+  
+
+# clone number subsampling experiment
+data <- clone %>% 
+  mutate(day = fct_reorder(day, as.numeric(as.vector(day)))) %>% 
+  group_by(donor, day, antigen) %>% 
+  select(-percent) %>% 
+  nest() %>% 
+  mutate(alpha = lapply(data, function(x){
+      n_sample <- 100
+      tibble(
+        sampling = seq(from = 0.1, to = 1, by = 0.1) %>% rep(n_sample),
+        sample = rep(1:n_sample, each = 10)
+        ) %>%
+        mutate(
+          alpha = lapply(sampling, function(y, x){
+            x %>%
+            pull(n) %>%
+            sample(round(length(.) * y)) %>%
+            fisherfit(.) %>%
+            .$estimate
+          }, x = x) %>% unlist()
+        )
+    })
+  ) %>% 
+  unnest(alpha) %>% 
+  mutate(
+    sample = as.factor(sample),
+    percent = sampling * 100
+  )
+data %>% 
+  ggplot() +
+  geom_line(aes(
+    x = alpha,
+    y = percent,
+    color = day,
+    group = str_c(sample, day)
+    ),
+    alpha = 0.1
+  ) +
+  geom_smooth(aes(
+    x = alpha,
+    y = percent,
+    color = day,
+    group = day
+    ),
+    se = F
+  ) +
+  facet_wrap(~donor + antigen, scales = "free")
+
+
+# abs number of cells
+clone
+data <- clone %>% 
+  mutate(day = fct_reorder(day, as.numeric(as.vector(day)))) %>% 
+  group_by(donor, day, antigen) %>% 
+  select(-percent) %>% 
+  nest() %>% 
+  mutate(detected_clone = lapply(data, function(data){
+      n_sample <- 1000
+      tibble(
+        n_cell = seq(
+          from = 100,
+          to = max(500, nrow(data) + 10),
+          step = 1) %>%
+          rep(n_sample),
+        sample = rep(
+          1:n_sample,
+          each = (
+            seq(
+              from = 100,
+              to = max(500, nrow(data) + 10),
+              step = 1) %>%
+                length()
+          )),
+        day_size = nrow(data)
+      ) %>%
+        mutate(
+          detected_clone = pbmcapply::pbmclapply(n_cell, function(n_cell, data){
+            data %>%  
+            select(clone) %>% 
+            .[sample(1:nrow(.), round(n_cell), replace = T), ] %>%
+            distinct() %>% 
+            nrow()
+          }, data = data,
+        mc.cores = 10,
+        ignore.interactive = T) %>% unlist(),
+          day_clone = data %>%  
+            select(clone) %>% 
+            distinct() %>% 
+            nrow()
+        )
+    }
+    )) %>% 
+  unnest(detected_clone) %>% 
+  mutate(
+    sample = as.factor(sample),
+  ) %>% 
+  group_by(donor, antigen, n_cell) %>% 
+  nest() %>% 
+  mutate(pval = lapply(data, function(data){
+    data %>% 
+    group_by(day) %>% 
+    mutate(
+      ecdf = ecdf(detected_clone)(detected_clone)
+      ) %>% 
+    filter(!duplicated(detected_clone)) %>% 
+    group_by(detected_clone) %>% 
+    mutate(
+      ecdf = ecdf / length(levels(day)),
+      s_ecdf = sum(ecdf)) %>% 
+    group_by(day) %>%
+    mutate(s_ecdf = s_ecdf - ecdf) %>%
+    group_by(detected_clone) %>% 
+    mutate(pval = max(sum(s_ecdf))) %>%
+    pull(pval) %>% 
+    max()
+  })) %>% 
+  unnest(data, pval) %>% 
+  group_by(donor, antigen) %>% 
+  mutate(pval_signif = max(n_cell[pval > 0.05])) %>% 
+  select(-data)
+
+save(data, file = "results/2020_10_30_clone_diversity_bootstrap.Rdata")
+
+
+p <- ggplot(data %>%
+         filter(n_cell < max(pval_signif, day_size))) +
+  geom_vline(
+    aes(
+      xintercept = pval_signif
+    ),
+    color = "gray50",
+    linetype = 1,
+    size = 1.5
+  ) +
+  geom_line(data = data %>% 
+              filter(n_cell < day_size),
+    aes(
+      x = n_cell,
+      y = detected_clone,
+      color = day,
+      group = str_c(sample, day)
+    ),
+    alpha = 0.1,
+  ) +
+  scale_fill_viridis_d() +
+  geom_smooth(data = data %>%
+         filter(n_cell < max(pval_signif, day_size) + 10),
+    aes(
+      x = n_cell,
+      y = detected_clone,
+      group = day
+    ),
+    method = "loess",
+    formula = 'y ~ x',
+    color = "black",
+    se = F
+  ) +
+  labs(x = "number of cells",
+       y = "number of clone detected") +
+  guides(colour = guide_legend(override.aes = list(alpha = 1))) +
+  facet_wrap(~ antigen + donor, scales = "free", ncol = 4)
+
+ggsave(plot = p, filename = "results/2020_10_30_clone_diversity_bootstrap.png", width = 30, height = 15, units = "cm")
+ggsave(plot = p, filename = "results/2020_10_30_clone_diversity_bootstrap.pdf", width = 30, height = 15, units = "cm")
+
+load(file = "results/2020_10_29_clone_diversity_bootstrap.Rdata")
-- 
GitLab