From 045898f3dd8d497aeb0c296765b97ddd324ec55f Mon Sep 17 00:00:00 2001 From: ahugues <alice.hugues@ens-lyon.fr> Date: Mon, 20 Mar 2023 15:04:43 +0100 Subject: [PATCH] add variability analysis --- src/variability_analysis/02_dimred_fie.R | 197 ++++++++++++++++++ .../02_dimred_fie_functions.R | 16 ++ 2 files changed, 213 insertions(+) create mode 100644 src/variability_analysis/02_dimred_fie.R create mode 100644 src/variability_analysis/02_dimred_fie_functions.R diff --git a/src/variability_analysis/02_dimred_fie.R b/src/variability_analysis/02_dimred_fie.R new file mode 100644 index 0000000..f161f78 --- /dev/null +++ b/src/variability_analysis/02_dimred_fie.R @@ -0,0 +1,197 @@ +# Use only FIE targets + +library(egg) +library(factoextra) +library(tibble) +library(tidyr) + +source('src/00_utils.R') +source('src/02_dimred_fie_functions.R') + +rds.name <- 'seuObj_wt_preprocessed_filtered' +output.dir <- 'results/filtered/230220_wt' + +# load data +seuObj <- readRDS(file = sprintf('data/%s.rds', rds.name)) +seuObj + + +# filter genes +rmve <- c(grep('AT[CM]', rownames(seuObj), value = F), + which(rownames(seuObj) %in% ribo), + which(rownames(seuObj) %in% cc)) +seuObj <- subset(seuObj, features = rownames(seuObj)[-unique(rmve)]) +seuObj + +# +glist <- c(FIE, K27_genes, K27K4_genes, K27_wr) +features <- intersect(glist, root_genes) +#features <- intersect(features, rownames(seuObj)) +#features <- intersect(root_genes, rownames(seuObj)) +length(features) + +# PcG targets among variable features +seuObj.sub <- FindVariableFeatures(seuObj[root_genes,], assay = 'RNA', selection.method = "vst", + nfeatures = length(root_genes), + clip.max = 20, mean.cutoff = c(1, 20), dispersion.cutoff = c(1, Inf)) +N <- c(100, 200, 300, 400, 600, 1000, 2000, 4000, 7000, 8000, length(root_genes)) +df <- sapply(N, function(n, set, features){ + proportions(table(head(set, n) %in% features, deparse.level = FALSE)) +}, set = VariableFeatures(seuObj.sub), features)[2,] +p <- df %>% data.frame %>% + mutate(n = N) %>% + ggplot(aes(x = n, y = .)) + geom_point(pch = 4) + + labs(y = 'Proportions of PRC2 target genes', + x = 'Top-N variable features') + + lims(y = c(0, 1)) + + geom_hline(yintercept = length(features)/dim(seuObj.sub)[1], lty = 'dashed') + + theme_article() +png(file = sprintf('%s/dimred_pcgamongvf.png', output.dir), + width = 140, height = 80, res = 300, units = 'mm') +p + geom_vline(xintercept = 300, color = 'red') + + geom_point(x = N[3], y = df[3], color = 'red', size = 2) +dev.off() + +# Plot vst plot +seuObj.sub <- FindVariableFeatures(seuObj[root_genes,], assay = 'RNA', selection.method = "vst", + nfeatures = 300, + clip.max = 20, mean.cutoff = c(1, 20), dispersion.cutoff = c(1, Inf)) +png(file = sprintf('%s/dimred_varplotSeurat.png', output.dir), + width = 180, height = 140, res = 300, units = 'mm') +VariableFeaturePlot(seuObj.sub, assay = 'RNA', cols = c(alpha('black', 0.2), 'red')) +dev.off() + +png(file = sprintf('%s/dimred_varplotSeurat_nohighlight.png', output.dir), + width = 180, height = 140, res = 300, units = 'mm') +VariableFeaturePlot(seuObj.sub, assay = 'RNA', cols = c(alpha('black', 0.2), alpha('black', 0.2))) +dev.off() + +# Contribution of PcG vs non-PcG targets to variance in each PC +pca <- 'pca' +seuObj <- FindVariableFeatures(seuObj[root_genes,], assay = 'SCT', selection.method = "vst", + nfeatures = length(root_genes), + clip.max = 20, mean.cutoff = c(1, 20), dispersion.cutoff = c(1, Inf)) +seuObj <- RunPCA(seuObj, assay = 'SCT', features = rownames(seuObj), + reduction.name = pca) +png(file = sprintf('%s/dimred_plotpca.png', output.dir), + width = 140, height = 140, res = 300, units = 'mm') +DimPlot(seuObj, reduction = pca, group.by = NULL, cols = rep(alpha('black', 0.4), dim(seuObj)[2])) + + theme(legend.position = 'none') +dev.off() +Loadings(object = seuObj[[pca]])[1:5, 1:5] # = seuObj[[pca]]@feature.loadings[1:5, 1:5] +TopFeatures(object = seuObj[["pca"]], dim = 1, nfeatures = 20, balanced = FALSE) +names(tail(sort(Loadings(object = seuObj[[pca]])[,1]), 20)) # = rev(names(tail(sort(var.contrib[,1]), 20))) + +var_coord_func <- function(loadings, comp.sdev){ + loadings*comp.sdev +} +var.coord <- t(apply(seuObj[[pca]]@feature.loadings, 1, var_coord_func, comp.sdev = seuObj[[pca]]@stdev)) +var.cos2 <- var.coord^2 +comp.cos2 <- apply(var.cos2, 2, sum) +contrib <- function(var.cos2, comp.cos2){ var.cos2*100/comp.cos2 } +var.contrib <- t(apply(var.cos2, 1, contrib, comp.cos2)) + +# how much PcG targets among the top-n genes explaining variance in each PC? +df <- prop_contrib(n = 1000, var.contrib, labels = c('PcG', 'non-PcG'), + features) +df <- do.call(rbind, sapply(N, function(n){ + df <- prop_contrib(n = n, var.contrib, labels = c('PcG', 'non-PcG'), features) + df$n <- n + #print(head(df)) + return(df) +}, simplify = F)) +png(file = sprintf('%s/dimred_fie_topnprop.png', output.dir), + width = 140, height = 80, res = 300, units = 'mm') +#df %>% ggplot(aes(x = PC, y = PcG)) + geom_point() + +# coord_flip() + +# geom_hline(yintercept = length(features)/length(rownames(var.contrib)), lty = 'dashed') + +# labs(y = 'Proportion of PcG targets among the top-1000 features in each PC') +df %>% ggplot(aes(x = n, y = PcG, group = n)) + geom_boxplot() + + geom_hline(yintercept = length(features)/length(rownames(var.contrib)), lty = 'dashed') + + labs(y = 'Proportions of PRC2 target genes', x = 'Top-N variable features') + + lims(y = c(0, 0.6)) + + theme_article() +dev.off() + +# % variance explained +contrib_glist <- apply(var.contrib, 2, function(x){ + #tapply(x, bin, quantile, 0.75) + l <- split(x, bin) + #print(length(l[[2]])) + #sapply(l, function(x){ sum(head(sort(x, decreasing = T), length(l[[2]]))) }) + sapply(l, function(x){ sum(head(sort(x, decreasing = T), 100)) }) + #tapply(x, bin, sum) + }) +df <- data.frame(t(contrib_glist)) %>% + rownames_to_column('PC') %>% + pivot_longer(cols = c(2,3)) %>% + mutate(PC = as.factor(gsub('PC_', '', PC))) +levels(df$PC) <- order(levels(df$PC), as.numeric(levels(df$PC))) +pl <- df %>% ggplot(aes(x = PC, fill = name, y = value)) + geom_col(position = 'dodge') + + coord_flip() + labs(y = 'Contribution to variance in each PC (%)') + + scale_fill_manual(values = c('grey', 'coral')) + + theme(legend.position = 'none') +pr <- data.frame('stdev' = seuObj[[pca]]@stdev, 'PC' = as.factor(1:50), 'name' = 'PcG') %>% + ggplot(aes(x = PC, y = stdev)) + geom_point() + coord_flip() + + labs(y = 'Total variance explained') +pl + +var.contrib %>% + data.frame() %>% + rownames_to_column('gene_id') %>% + mutate('class' = bin) %>% + pivot_longer(cols = contains('PC'), names_to = 'PC', values_to = 'contrib') %>% + ggplot(aes(x = PC, y = log(contrib), fill = class)) + geom_boxplot() + coord_flip() + +pdf(file = sprintf('%s/dimred_fie.pdf', output.dir)) +egg::ggarrange(pr, pl + + theme(axis.text.y = element_blank(), + axis.ticks.y = element_blank(), + axis.title.y = element_blank() ), nrow = 1) +dev.off() + +#### +# Dimension reduction using only FIE targets +length(features)/length(glist) # huge filtering! +features_ctrl <- sample(setdiff(rownames(seuObj), glist), + size = length(intersect(glist, rownames(seuObj)))) +features_ctrl <- setdiff(rownames(seuObj), glist) +any(features_ctrl %in% glist) # should be FALSE +length(features_ctrl) + +seuObj <- RunPCA(seuObj, assay = 'SCT', features = features, + reduction.name = "pca_PcG") +seuObj <- RunPCA(seuObj, assay = 'SCT', features = features_ctrl, + reduction.name = "pca_ctrl") +#DimPlot(seuObj, reduction = "pca") +#ElbowPlot(object = seuObj, ndims = 40, reduction = "pca_PcG") + +# ElbowPlot(object = seuObj, ndims = 40, reduction = "pca_ctrl") + + +pca <- 'pca_ctrl' +pct <- seuObj[[pca]]@stdev / sum(seuObj[[pca]]@stdev) * 100 +cumu <- cumsum(pct) +co1 <- which(cumu > 90 & pct < 5)[1] +co2 <- sort(which((pct[1:length(pct) - 1] - pct[2:length(pct)]) > 0.1), decreasing = T)[1] + 1 +min(co1, co2) + +npc <- 50 +data.frame(stdev = c(seuObj[["pca_PcG"]]@stdev, seuObj[["pca_ctrl"]]@stdev), + spl = rep(c('PcG', 'ctrl'), each = npc), + pc = rep(1:npc, 2)) %>% + ggplot(aes(x = pc, y = stdev, color = spl)) + geom_point() + + labs(x = 'PC', y = 'Standard deviation') + +pca <- c('pca_ctrl', 'pca_PcG') +umap <- gsub(pattern = 'pca', replacement = 'umap', pca) +umap +seuObj <- RunUMAP(seuObj, reduction = pca[1], dims = 1:20, a = 0.8, b = 1.2, + umap.method = "uwot", metric = "euclidean", + reduction.name = umap[1]) +seuObj <- RunUMAP(seuObj, reduction = pca[2], dims = 1:20, a = 0.8, b = 1.2, + umap.method = "uwot", metric = "euclidean", + reduction.name = umap[2]) + +DimPlot(seuObj, reduction = umap[1], group.by = "orig.ident") + + DimPlot(seuObj, reduction = umap[2], group.by = "orig.ident") + diff --git a/src/variability_analysis/02_dimred_fie_functions.R b/src/variability_analysis/02_dimred_fie_functions.R new file mode 100644 index 0000000..ce5f5d4 --- /dev/null +++ b/src/variability_analysis/02_dimred_fie_functions.R @@ -0,0 +1,16 @@ +# functions associated with 02_dimred_fie.R + +prop_contrib <- function(n = 1000, var.contrib, labels = c('PcG', 'non-PcG'), + features){ + bin <- ifelse(rownames(var.contrib) %in% features, labels[1], labels[2]) + names(bin) <- rownames(var.contrib) + contrib_glist <- apply(var.contrib, 2, function(x, n){ + top <- names(tail(sort(x), n)) + table(bin[top]) }, n) %>% prop.table(margin = 2) + df <- data.frame(t(contrib_glist)) %>% + rownames_to_column('PC') %>% + mutate(PC = as.factor(gsub('PC_', '', PC))) + #pivot_longer(cols = c(2,3)) %>% + levels(df$PC) <- order(levels(df$PC), as.numeric(levels(df$PC))) + return(df) +} \ No newline at end of file -- GitLab