From 045898f3dd8d497aeb0c296765b97ddd324ec55f Mon Sep 17 00:00:00 2001
From: ahugues <alice.hugues@ens-lyon.fr>
Date: Mon, 20 Mar 2023 15:04:43 +0100
Subject: [PATCH] add variability analysis

---
 src/variability_analysis/02_dimred_fie.R      | 197 ++++++++++++++++++
 .../02_dimred_fie_functions.R                 |  16 ++
 2 files changed, 213 insertions(+)
 create mode 100644 src/variability_analysis/02_dimred_fie.R
 create mode 100644 src/variability_analysis/02_dimred_fie_functions.R

diff --git a/src/variability_analysis/02_dimred_fie.R b/src/variability_analysis/02_dimred_fie.R
new file mode 100644
index 0000000..f161f78
--- /dev/null
+++ b/src/variability_analysis/02_dimred_fie.R
@@ -0,0 +1,197 @@
+# Use only FIE targets 
+
+library(egg)
+library(factoextra)
+library(tibble)
+library(tidyr)
+
+source('src/00_utils.R')
+source('src/02_dimred_fie_functions.R')
+
+rds.name <- 'seuObj_wt_preprocessed_filtered'
+output.dir <- 'results/filtered/230220_wt'
+
+# load data
+seuObj <- readRDS(file = sprintf('data/%s.rds', rds.name))
+seuObj 
+
+
+# filter genes
+rmve <- c(grep('AT[CM]', rownames(seuObj), value = F),
+          which(rownames(seuObj) %in% ribo), 
+          which(rownames(seuObj) %in% cc))
+seuObj <- subset(seuObj, features = rownames(seuObj)[-unique(rmve)])
+seuObj
+
+# 
+glist <- c(FIE, K27_genes, K27K4_genes, K27_wr)
+features <- intersect(glist, root_genes)
+#features <- intersect(features, rownames(seuObj))
+#features <- intersect(root_genes, rownames(seuObj))
+length(features)
+
+# PcG targets among variable features
+seuObj.sub <- FindVariableFeatures(seuObj[root_genes,], assay = 'RNA', selection.method = "vst", 
+                                   nfeatures = length(root_genes),
+                     clip.max = 20, mean.cutoff = c(1, 20), dispersion.cutoff = c(1, Inf))
+N <- c(100, 200, 300, 400, 600, 1000, 2000, 4000, 7000, 8000, length(root_genes))
+df <- sapply(N, function(n, set, features){
+  proportions(table(head(set, n) %in% features, deparse.level = FALSE))
+}, set = VariableFeatures(seuObj.sub), features)[2,]
+p <- df %>% data.frame %>%
+  mutate(n = N) %>%
+  ggplot(aes(x = n, y = .)) + geom_point(pch = 4) + 
+  labs(y = 'Proportions of PRC2 target genes',
+       x = 'Top-N variable features') +
+  lims(y = c(0, 1)) +
+  geom_hline(yintercept = length(features)/dim(seuObj.sub)[1], lty = 'dashed') +
+  theme_article()
+png(file = sprintf('%s/dimred_pcgamongvf.png', output.dir),
+    width = 140, height = 80, res = 300, units = 'mm')
+p + geom_vline(xintercept = 300, color = 'red') +
+  geom_point(x = N[3], y = df[3], color = 'red', size = 2)
+dev.off()
+
+# Plot vst plot
+seuObj.sub <- FindVariableFeatures(seuObj[root_genes,], assay = 'RNA', selection.method = "vst", 
+                                    nfeatures = 300,
+                                    clip.max = 20, mean.cutoff = c(1, 20), dispersion.cutoff = c(1, Inf))
+png(file = sprintf('%s/dimred_varplotSeurat.png', output.dir),
+    width = 180, height = 140, res = 300, units = 'mm')
+VariableFeaturePlot(seuObj.sub, assay = 'RNA', cols = c(alpha('black', 0.2), 'red')) 
+dev.off()
+
+png(file = sprintf('%s/dimred_varplotSeurat_nohighlight.png', output.dir),
+    width = 180, height = 140, res = 300, units = 'mm')
+VariableFeaturePlot(seuObj.sub, assay = 'RNA', cols = c(alpha('black', 0.2), alpha('black', 0.2))) 
+dev.off()
+
+# Contribution of PcG vs non-PcG targets to variance in each PC
+pca <- 'pca'
+seuObj <- FindVariableFeatures(seuObj[root_genes,], assay = 'SCT', selection.method = "vst", 
+                                   nfeatures = length(root_genes),
+                                   clip.max = 20, mean.cutoff = c(1, 20), dispersion.cutoff = c(1, Inf))
+seuObj <- RunPCA(seuObj, assay = 'SCT', features = rownames(seuObj),
+                 reduction.name = pca)
+png(file = sprintf('%s/dimred_plotpca.png', output.dir),
+    width = 140, height = 140, res = 300, units = 'mm')
+DimPlot(seuObj, reduction = pca, group.by = NULL, cols = rep(alpha('black', 0.4), dim(seuObj)[2])) +
+  theme(legend.position = 'none')
+dev.off()
+Loadings(object = seuObj[[pca]])[1:5, 1:5] # = seuObj[[pca]]@feature.loadings[1:5, 1:5]
+TopFeatures(object = seuObj[["pca"]], dim = 1, nfeatures = 20, balanced = FALSE)
+names(tail(sort(Loadings(object = seuObj[[pca]])[,1]), 20)) # = rev(names(tail(sort(var.contrib[,1]), 20)))
+
+var_coord_func <- function(loadings, comp.sdev){
+  loadings*comp.sdev
+}
+var.coord <- t(apply(seuObj[[pca]]@feature.loadings, 1, var_coord_func, comp.sdev = seuObj[[pca]]@stdev)) 
+var.cos2 <- var.coord^2
+comp.cos2 <- apply(var.cos2, 2, sum)
+contrib <- function(var.cos2, comp.cos2){ var.cos2*100/comp.cos2 }
+var.contrib <- t(apply(var.cos2, 1, contrib, comp.cos2))
+
+# how much PcG targets among the top-n genes explaining variance in each PC?
+df <- prop_contrib(n = 1000, var.contrib, labels = c('PcG', 'non-PcG'), 
+             features)
+df <- do.call(rbind, sapply(N, function(n){
+  df <- prop_contrib(n = n, var.contrib, labels = c('PcG', 'non-PcG'), features)
+  df$n <- n
+  #print(head(df))
+  return(df)
+}, simplify = F))
+png(file = sprintf('%s/dimred_fie_topnprop.png', output.dir),
+    width = 140, height = 80, res = 300, units = 'mm')
+#df %>% ggplot(aes(x = PC, y = PcG)) + geom_point() +
+#  coord_flip() +
+#  geom_hline(yintercept = length(features)/length(rownames(var.contrib)), lty = 'dashed') +
+#  labs(y = 'Proportion of PcG targets among the top-1000 features in each PC')
+df %>% ggplot(aes(x = n, y = PcG, group = n)) + geom_boxplot() + 
+  geom_hline(yintercept = length(features)/length(rownames(var.contrib)), lty = 'dashed') +
+  labs(y = 'Proportions of PRC2 target genes', x = 'Top-N variable features') +
+  lims(y = c(0, 0.6)) +
+  theme_article()
+dev.off()
+
+# % variance explained
+contrib_glist <- apply(var.contrib, 2, function(x){ 
+  #tapply(x, bin, quantile, 0.75)
+  l <- split(x, bin)
+  #print(length(l[[2]]))
+  #sapply(l, function(x){ sum(head(sort(x, decreasing = T), length(l[[2]]))) })
+  sapply(l, function(x){ sum(head(sort(x, decreasing = T), 100)) })
+  #tapply(x, bin, sum)
+  })
+df <- data.frame(t(contrib_glist)) %>% 
+  rownames_to_column('PC') %>%
+  pivot_longer(cols = c(2,3)) %>%
+  mutate(PC = as.factor(gsub('PC_', '', PC)))
+levels(df$PC) <- order(levels(df$PC), as.numeric(levels(df$PC)))
+pl <- df %>% ggplot(aes(x = PC, fill = name, y = value)) + geom_col(position = 'dodge') +
+  coord_flip() + labs(y = 'Contribution to variance in each PC (%)') +
+  scale_fill_manual(values = c('grey', 'coral')) +
+  theme(legend.position = 'none')
+pr <- data.frame('stdev' = seuObj[[pca]]@stdev, 'PC' = as.factor(1:50), 'name' = 'PcG') %>% 
+  ggplot(aes(x = PC, y = stdev)) + geom_point() + coord_flip() +
+  labs(y = 'Total variance explained')
+pl
+
+var.contrib %>%
+  data.frame() %>%
+  rownames_to_column('gene_id') %>%
+  mutate('class' = bin) %>%
+  pivot_longer(cols = contains('PC'), names_to = 'PC', values_to = 'contrib') %>%
+  ggplot(aes(x = PC, y = log(contrib), fill = class)) + geom_boxplot() + coord_flip()
+
+pdf(file = sprintf('%s/dimred_fie.pdf', output.dir))
+egg::ggarrange(pr, pl + 
+                 theme(axis.text.y = element_blank(),
+                       axis.ticks.y = element_blank(),
+                       axis.title.y = element_blank() ), nrow = 1)
+dev.off()
+
+####
+# Dimension reduction using only FIE targets
+length(features)/length(glist) # huge filtering!
+features_ctrl <- sample(setdiff(rownames(seuObj), glist), 
+                   size = length(intersect(glist, rownames(seuObj))))
+features_ctrl <- setdiff(rownames(seuObj), glist)
+any(features_ctrl %in% glist) # should be FALSE
+length(features_ctrl)
+
+seuObj <- RunPCA(seuObj, assay = 'SCT', features = features,
+                 reduction.name = "pca_PcG")
+seuObj <- RunPCA(seuObj, assay = 'SCT', features = features_ctrl,
+                 reduction.name = "pca_ctrl")
+#DimPlot(seuObj, reduction = "pca")
+#ElbowPlot(object = seuObj, ndims = 40, reduction = "pca_PcG") +
+#  ElbowPlot(object = seuObj, ndims = 40, reduction = "pca_ctrl")
+
+
+pca <- 'pca_ctrl'
+pct <- seuObj[[pca]]@stdev / sum(seuObj[[pca]]@stdev) * 100
+cumu <- cumsum(pct)
+co1 <- which(cumu > 90 & pct < 5)[1]
+co2 <- sort(which((pct[1:length(pct) - 1] - pct[2:length(pct)]) > 0.1), decreasing = T)[1] + 1
+min(co1, co2)
+
+npc <- 50
+data.frame(stdev = c(seuObj[["pca_PcG"]]@stdev, seuObj[["pca_ctrl"]]@stdev),
+           spl = rep(c('PcG', 'ctrl'), each = npc),
+           pc = rep(1:npc, 2)) %>%
+  ggplot(aes(x = pc, y = stdev, color = spl)) + geom_point() +
+  labs(x = 'PC', y = 'Standard deviation')
+
+pca <- c('pca_ctrl', 'pca_PcG')
+umap <- gsub(pattern = 'pca', replacement = 'umap', pca)
+umap
+seuObj <- RunUMAP(seuObj, reduction = pca[1], dims = 1:20, a = 0.8, b = 1.2, 
+                  umap.method = "uwot", metric = "euclidean",
+                  reduction.name = umap[1])
+seuObj <- RunUMAP(seuObj, reduction = pca[2], dims = 1:20, a = 0.8, b = 1.2, 
+                  umap.method = "uwot", metric = "euclidean",
+                  reduction.name = umap[2])
+
+DimPlot(seuObj, reduction = umap[1], group.by = "orig.ident") +
+  DimPlot(seuObj, reduction = umap[2], group.by = "orig.ident")
+
diff --git a/src/variability_analysis/02_dimred_fie_functions.R b/src/variability_analysis/02_dimred_fie_functions.R
new file mode 100644
index 0000000..ce5f5d4
--- /dev/null
+++ b/src/variability_analysis/02_dimred_fie_functions.R
@@ -0,0 +1,16 @@
+# functions associated with 02_dimred_fie.R
+
+prop_contrib <- function(n = 1000, var.contrib, labels = c('PcG', 'non-PcG'), 
+                         features){
+  bin <- ifelse(rownames(var.contrib) %in% features, labels[1], labels[2])
+  names(bin) <- rownames(var.contrib)
+  contrib_glist <- apply(var.contrib, 2, function(x, n){ 
+    top <- names(tail(sort(x), n))
+    table(bin[top]) }, n) %>% prop.table(margin = 2)
+  df <- data.frame(t(contrib_glist)) %>% 
+    rownames_to_column('PC') %>%
+    mutate(PC = as.factor(gsub('PC_', '', PC)))
+  #pivot_longer(cols = c(2,3)) %>%
+  levels(df$PC) <- order(levels(df$PC), as.numeric(levels(df$PC)))
+  return(df)
+}
\ No newline at end of file
-- 
GitLab