clonality_paper.R: test on heatmap

fde1f52a · Laurent Modolo · 17425f65 · fde1f52a
Unverified Commit fde1f52a authored Oct 8, 2020 by Laurent Modolo
--- a/src/clonality_paper.R
+++ b/src/clonality_paper.R
@@ -417,40 +417,6 @@ for (day in names(sce_day)) {
 save(sce_day, file = "results/2020_01_02_clonality_paper_sce.Rdata")
 load(file = "results/2020_01_02_clonality_paper_sce.Rdata")
-DEA_DEA_clone_cell_type_size <- list()
-min_clone_size <- 3
-for (day in names(sce_day)) {
-  colData(sce_day[[day]])$clone_id <- colData(sce_day[[day]]) %>% 
-    as_tibble() %>% 
-    mutate(clone_id = ifelse(
-      clone_id == 0,
-      (max(clone_id) + 1):(max(clone_id) + length(which(clone_id == 0))),
-      clone_id)) %>% 
-    pull(clone_id)
-  colData(sce_day[[day]])$clone_size <- colData(sce_day[[day]]) %>% 
-    as_tibble() %>% 
-    left_join(
-      colData(sce_day[[day]]) %>%
-        as_tibble() %>%
-        group_by(clone_id) %>%
-        dplyr::summarise(clone_size = n())
-    ) %>%
-    pull(clone_size)
-  DEA_DEA_clone_cell_type_size[[day]] <- DEA(
-    sce_day[[day]][, colData(sce_day[[day]])$clone_size >= min_clone_size],
-    test = "~ (1|clone_id)",
-    formula = "count ~ p_PLS_DEA_cell_type + (1|clone_id)",
-    assay_name = "counts_vst",
-    cpus = 10
-  )
-  save(
-    DEA_DEA_clone_cell_type_size,
-    file = "results/2020_01_01_DEA_DEA_clone_size.Rdata"
-  )
-}
 genes_PLS <- read_csv("data/2017_11_28_List_Laurent_Genes_PLS.csv") %>% 
  pivot_longer(cols = c("Genes_EFF", "Genes_MEM"),
               names_to = "type",
@@ -509,25 +475,25 @@ for (day in names(sce_day)) {
 min_clone_size <- 3
 load(file = "results/2020_01_02_clonality_paper_sce.Rdata")
 load(file = "results/2020_01_01_DEA_DEA_clone_size.Rdata")
+load(file = "results/2020_01_01_DEA_clone_PCA_cell_type_size.Rdata", v=T)
 ## adj pvalue
 for (day in names(sce_day)) {
  print(day)
-  rowData(sce_day[[day]])$pval_DEA_clone_cell_type_size <- NA
+  rowData(sce_day[[day]])$pval_DEA_clone_PCA_cell_type_size <- NA
-  rowData(sce_day[[day]])$pval_DEA_clone_cell_type_size <- 
+  rowData(sce_day[[day]])$pval_DEA_clone_PCA_cell_type_size <- 
-    get_genes_pval(DEA_DEA_clone_cell_type_size[[day]], sce_day[[day]])
+    get_genes_pval(DEA_clone_PCA_cell_type_size[[day]], sce_day[[day]])
-  rowData(sce_day[[day]])$pval_DEA_clone_cell_type_size %>% 
+  rowData(sce_day[[day]])$pval_DEA_clone_PCA_cell_type_size %>% 
    is.na() %>% 
    table() %>%
    print()
-  rowData(sce_day[[day]])$pval_DEA_clone_cell_type_size_adj <- p.adjust(
+  rowData(sce_day[[day]])$pval_DEA_clone_PCA_cell_type_size_adj <- p.adjust(
-    rowData(sce_day[[day]])$pval_DEA_clone_cell_type_size,
+    rowData(sce_day[[day]])$pval_DEA_clone_PCA_cell_type_size,
    method = "BH"
  )
-  table(rowData(sce_day[[day]])$pval_DEA_clone_cell_type_size_adj < 0.05) %>% print()
+  table(rowData(sce_day[[day]])$pval_DEA_clone_PCA_cell_type_size_adj < 0.05) %>% print()
 }
  day <- "593"
  assays(sce_day[[day]])$logcounts <- scater::logNormCounts(
@@ -538,8 +504,8 @@ for (day in names(sce_day)) {
    assay(., "logcounts") %>% 
    Matrix::Matrix(sparse = T)
  sce_DEA_hm <- sce_day[[day]][
-      !is.na(rowData(sce_day[[day]])$pval_DEA_clone_cell_type_size_adj) &
+      !is.na(rowData(sce_day[[day]])$pval_DEA_clone_PCA_cell_type_size_adj) &
-      rowData(sce_day[[day]])$pval_DEA_clone_cell_type_size_adj < 0.05
+      rowData(sce_day[[day]])$pval_DEA_clone_PCA_cell_type_size_adj < 0.05
    ]
  sce_DEA_hm %>% dim()
  colData(sce_DEA_hm) <- colData(sce_DEA_hm) %>% 
@@ -568,7 +534,6 @@ rm_genes <- readr::read_delim(
  future::plan("multiprocess", workers = 10)
  test <- assay(sce_DEA_hm, "logcounts") %>% 
-    log1p() %>% 
    as.matrix() %>% 
    as_tibble(rownames = "gene_name") %>% 
    tidyr::nest(counts = !c(gene_name)) %>% 
@@ -584,17 +549,23 @@ rm_genes <- readr::read_delim(
    mutate(
      count_var = purrr::map(.x = counts, .f = function(.x){
          var(.x$count)
+      }),
+      count_mean = purrr::map(.x = counts, .f = function(.x){
+          (.x$count)
      })
    ) %>%  
    unnest(count_var) %>% 
-    filter(count_var > 0.2) %>% 
+    unnest(count_mean) %>% 
+    select(count_var, count_mean) %>% summary()
+    filter(!(gene_name %in% rm_genes$geneid)) %>% 
+    filter(count_var > 0.1) %>% 
    mutate(
      model = furrr::future_map(.x = counts, .f = function(.x){
        model.matrix(~-1 + clone_id, data = .x) %>% 
-        glmnet(
+        glmnet::glmnet(
          x = .,
          y = (.x %>% pull(count)),
-          lambda = cv.glmnet(
+          lambda = glmnet::cv.glmnet(
            .,
            (.x %>% pull(count))
          )$lambda.1se
@@ -606,57 +577,103 @@ rm_genes <- readr::read_delim(
    select(-c(counts, model)) %>%
    tidyr::unnest(coefs)
-  gene_name <- test %>% 
-    janitor::clean_names() %>% 
-    dplyr::filter(
-      dev_ratio > 0
-    ) %>% 
-    group_by(gene_name) %>% 
-    dplyr::summarize(
-      estimate = max(abs(estimate)),
-    ) %>% 
-    # filter(estimate >= quantile(estimate, 0.90)) %>% 
-    pull(gene_name)
-  # cluster_row <- 
-  test %>% 
+ cluster_row <- assay(sce_DEA_hm, "logcounts") %>% 
+    as.matrix() %>% 
+    as_tibble(rownames = "gene_name") %>% 
+    filter(!(gene_name %in% rm_genes$geneid)) %>% 
+    tidyr::nest(counts = !c(gene_name)) %>% 
+    mutate(
+      counts = purrr::map(.x = counts, .f = function(.x){
+        tibble(
+          id = colnames(.x),
+          count = t(.x)[, 1],
+          clone_id = colData(sce_DEA_hm)$clone_id)
+        }
+      )
+    ) %>%
+    mutate(
+      count_var = purrr::map(.x = counts, .f = function(.x){
+          var(.x$count)
+      })
+    ) %>%  
+    unnest(count_var) %>% 
+    mutate(
+      model = furrr::future_map(.x = counts, .f = function(.x){
+        lm(count ~ clone_id, data = .x)
+      },
+      .progress = TRUE)
+    ) %>%
+    mutate(coefs = map(model, tidy)) %>%
+    select(-c(counts, model)) %>%
+    tidyr::unnest(coefs) %>% 
    janitor::clean_names() %>% 
-    filter(gene_name %in% gene_name) %>% 
+    dplyr::mutate(term = ifelse(
-    mutate(term = ifelse(
      term == "(Intercept)",
      colData(sce_DEA_hm)$clone_id %>% as.factor() %>% levels() %>% .[1] %>% 
        str_c("clone_id", .),
      term)
    ) %>% 
-    mutate(
+    dplyr::mutate(
      term = str_replace(term, "clone_id(.*)", "\\1")
    ) %>% 
-    select(gene_name, term, estimate) %>% 
+    dplyr::select(gene_name, term, estimate) %>% 
-    mutate(id = 1:nrow(.),
+    tidyr::pivot_wider(
-           estimate = ifelse(is.na(estimate), 0, estimate)) %>% 
+      id_cols = gene_name,
-    group_by(c(gene_name, id) %>% 
-    summarise(estimate = sum(estimate))
-    summarize()
-    pivot_wider(
-      id_cols = c(id, gene_name),
      names_from = term,
-      values_from = estimate
+      values_from = estimate,
+      values_fill = 0,
+      values_fn = sum
    ) %>% 
-    select(-id) %>% 
    as.data.frame()
  rownames(cluster_row) <- cluster_row$gene_name
-  cluster_row %>% dim()
+cluster_row <- 
-  sce_DEA_hm %>% dim()
+  assay(sce_DEA_hm, "logcounts") %>% 
+  as.matrix() %>% 
+  as_tibble(rownames = "gene_name") %>% 
+  filter(!(gene_name %in% rm_genes$geneid)) %>% 
+  tidyr::nest(counts = !c(gene_name)) %>% 
+  mutate(
+    counts = purrr::map(.x = counts, .f = function(.x){
+      tibble(
+        id = colnames(.x),
+        count = t(.x)[, 1],
+        clone_id = as.factor(colData(sce_DEA_hm)$clone_id)
+        ) %>% 
+        group_by(clone_id) %>% 
+        dplyr::summarise(
+          id = id,
+          count = count,
+          clone_id = clone_id,
+          count_mean = max(count)
+        ) %>% 
+        ungroup()
+      }
+    )
+  ) %>% 
+  tidyr::unnest(counts) %>% 
+  janitor::clean_names() %>% 
+  dplyr::select(gene_name, clone_id, count_mean) %>% 
+  tidyr::pivot_wider(
+    id_cols = gene_name,
+    names_from = clone_id,
+    values_from = count_mean,
+    values_fill = 0,
+    values_fn = sum
+  ) %>% 
+  as.data.frame()
+rownames(cluster_row) <- cluster_row$gene_name
-sce_DEA_hm_plot <- sce_DEA_hm[rownames(sce_DEA_hm) %in% gene_name, ]
+sce_DEA_hm_plot <- sce_DEA_hm[rownames(sce_DEA_hm) %in% cluster_row$gene_name, ]
 rowData(sce_DEA_hm_plot)$gene_order <- cluster_row[, -1] %>%
-  dist() %>% 
+  dist(method = "canberra") %>% 
  hclust() %>% .$order
+rownames(sce_DEA_hm_plot) <- rowData(sce_DEA_hm_plot)$gene_name
+sce_DEA_hm_plot <- sce_DEA_hm_plot[rowData(sce_DEA_hm_plot)$gene_order, ]
 plotHeatmap(
-  sce_DEA_hm_plot,
+  sce_DEA_hm_plot[!(rowData(sce_DEA_hm_plot)$gene_name %in% rm_genes),],
  features = rownames(sce_DEA_hm_plot),
  order_columns_by = c("clone_id", "p_PLS_DEA_cell_type"),
  colour_columns_by = c("clone_id", "p_PLS_DEA_cell_type"),
@@ -665,7 +682,4 @@ plotHeatmap(
  zlim = c(-5, 5),
  main = day,
  cluster_rows = F,
-  # order_rows_by = c("gene_order")
 )