Skip to content
Snippets Groups Projects
Commit 045898f3 authored by Alice Hugues's avatar Alice Hugues
Browse files

add variability analysis

parent e756647c
No related branches found
No related tags found
No related merge requests found
# Use only FIE targets
library(egg)
library(factoextra)
library(tibble)
library(tidyr)
source('src/00_utils.R')
source('src/02_dimred_fie_functions.R')
rds.name <- 'seuObj_wt_preprocessed_filtered'
output.dir <- 'results/filtered/230220_wt'
# load data
seuObj <- readRDS(file = sprintf('data/%s.rds', rds.name))
seuObj
# filter genes
rmve <- c(grep('AT[CM]', rownames(seuObj), value = F),
which(rownames(seuObj) %in% ribo),
which(rownames(seuObj) %in% cc))
seuObj <- subset(seuObj, features = rownames(seuObj)[-unique(rmve)])
seuObj
#
glist <- c(FIE, K27_genes, K27K4_genes, K27_wr)
features <- intersect(glist, root_genes)
#features <- intersect(features, rownames(seuObj))
#features <- intersect(root_genes, rownames(seuObj))
length(features)
# PcG targets among variable features
seuObj.sub <- FindVariableFeatures(seuObj[root_genes,], assay = 'RNA', selection.method = "vst",
nfeatures = length(root_genes),
clip.max = 20, mean.cutoff = c(1, 20), dispersion.cutoff = c(1, Inf))
N <- c(100, 200, 300, 400, 600, 1000, 2000, 4000, 7000, 8000, length(root_genes))
df <- sapply(N, function(n, set, features){
proportions(table(head(set, n) %in% features, deparse.level = FALSE))
}, set = VariableFeatures(seuObj.sub), features)[2,]
p <- df %>% data.frame %>%
mutate(n = N) %>%
ggplot(aes(x = n, y = .)) + geom_point(pch = 4) +
labs(y = 'Proportions of PRC2 target genes',
x = 'Top-N variable features') +
lims(y = c(0, 1)) +
geom_hline(yintercept = length(features)/dim(seuObj.sub)[1], lty = 'dashed') +
theme_article()
png(file = sprintf('%s/dimred_pcgamongvf.png', output.dir),
width = 140, height = 80, res = 300, units = 'mm')
p + geom_vline(xintercept = 300, color = 'red') +
geom_point(x = N[3], y = df[3], color = 'red', size = 2)
dev.off()
# Plot vst plot
seuObj.sub <- FindVariableFeatures(seuObj[root_genes,], assay = 'RNA', selection.method = "vst",
nfeatures = 300,
clip.max = 20, mean.cutoff = c(1, 20), dispersion.cutoff = c(1, Inf))
png(file = sprintf('%s/dimred_varplotSeurat.png', output.dir),
width = 180, height = 140, res = 300, units = 'mm')
VariableFeaturePlot(seuObj.sub, assay = 'RNA', cols = c(alpha('black', 0.2), 'red'))
dev.off()
png(file = sprintf('%s/dimred_varplotSeurat_nohighlight.png', output.dir),
width = 180, height = 140, res = 300, units = 'mm')
VariableFeaturePlot(seuObj.sub, assay = 'RNA', cols = c(alpha('black', 0.2), alpha('black', 0.2)))
dev.off()
# Contribution of PcG vs non-PcG targets to variance in each PC
pca <- 'pca'
seuObj <- FindVariableFeatures(seuObj[root_genes,], assay = 'SCT', selection.method = "vst",
nfeatures = length(root_genes),
clip.max = 20, mean.cutoff = c(1, 20), dispersion.cutoff = c(1, Inf))
seuObj <- RunPCA(seuObj, assay = 'SCT', features = rownames(seuObj),
reduction.name = pca)
png(file = sprintf('%s/dimred_plotpca.png', output.dir),
width = 140, height = 140, res = 300, units = 'mm')
DimPlot(seuObj, reduction = pca, group.by = NULL, cols = rep(alpha('black', 0.4), dim(seuObj)[2])) +
theme(legend.position = 'none')
dev.off()
Loadings(object = seuObj[[pca]])[1:5, 1:5] # = seuObj[[pca]]@feature.loadings[1:5, 1:5]
TopFeatures(object = seuObj[["pca"]], dim = 1, nfeatures = 20, balanced = FALSE)
names(tail(sort(Loadings(object = seuObj[[pca]])[,1]), 20)) # = rev(names(tail(sort(var.contrib[,1]), 20)))
var_coord_func <- function(loadings, comp.sdev){
loadings*comp.sdev
}
var.coord <- t(apply(seuObj[[pca]]@feature.loadings, 1, var_coord_func, comp.sdev = seuObj[[pca]]@stdev))
var.cos2 <- var.coord^2
comp.cos2 <- apply(var.cos2, 2, sum)
contrib <- function(var.cos2, comp.cos2){ var.cos2*100/comp.cos2 }
var.contrib <- t(apply(var.cos2, 1, contrib, comp.cos2))
# how much PcG targets among the top-n genes explaining variance in each PC?
df <- prop_contrib(n = 1000, var.contrib, labels = c('PcG', 'non-PcG'),
features)
df <- do.call(rbind, sapply(N, function(n){
df <- prop_contrib(n = n, var.contrib, labels = c('PcG', 'non-PcG'), features)
df$n <- n
#print(head(df))
return(df)
}, simplify = F))
png(file = sprintf('%s/dimred_fie_topnprop.png', output.dir),
width = 140, height = 80, res = 300, units = 'mm')
#df %>% ggplot(aes(x = PC, y = PcG)) + geom_point() +
# coord_flip() +
# geom_hline(yintercept = length(features)/length(rownames(var.contrib)), lty = 'dashed') +
# labs(y = 'Proportion of PcG targets among the top-1000 features in each PC')
df %>% ggplot(aes(x = n, y = PcG, group = n)) + geom_boxplot() +
geom_hline(yintercept = length(features)/length(rownames(var.contrib)), lty = 'dashed') +
labs(y = 'Proportions of PRC2 target genes', x = 'Top-N variable features') +
lims(y = c(0, 0.6)) +
theme_article()
dev.off()
# % variance explained
contrib_glist <- apply(var.contrib, 2, function(x){
#tapply(x, bin, quantile, 0.75)
l <- split(x, bin)
#print(length(l[[2]]))
#sapply(l, function(x){ sum(head(sort(x, decreasing = T), length(l[[2]]))) })
sapply(l, function(x){ sum(head(sort(x, decreasing = T), 100)) })
#tapply(x, bin, sum)
})
df <- data.frame(t(contrib_glist)) %>%
rownames_to_column('PC') %>%
pivot_longer(cols = c(2,3)) %>%
mutate(PC = as.factor(gsub('PC_', '', PC)))
levels(df$PC) <- order(levels(df$PC), as.numeric(levels(df$PC)))
pl <- df %>% ggplot(aes(x = PC, fill = name, y = value)) + geom_col(position = 'dodge') +
coord_flip() + labs(y = 'Contribution to variance in each PC (%)') +
scale_fill_manual(values = c('grey', 'coral')) +
theme(legend.position = 'none')
pr <- data.frame('stdev' = seuObj[[pca]]@stdev, 'PC' = as.factor(1:50), 'name' = 'PcG') %>%
ggplot(aes(x = PC, y = stdev)) + geom_point() + coord_flip() +
labs(y = 'Total variance explained')
pl
var.contrib %>%
data.frame() %>%
rownames_to_column('gene_id') %>%
mutate('class' = bin) %>%
pivot_longer(cols = contains('PC'), names_to = 'PC', values_to = 'contrib') %>%
ggplot(aes(x = PC, y = log(contrib), fill = class)) + geom_boxplot() + coord_flip()
pdf(file = sprintf('%s/dimred_fie.pdf', output.dir))
egg::ggarrange(pr, pl +
theme(axis.text.y = element_blank(),
axis.ticks.y = element_blank(),
axis.title.y = element_blank() ), nrow = 1)
dev.off()
####
# Dimension reduction using only FIE targets
length(features)/length(glist) # huge filtering!
features_ctrl <- sample(setdiff(rownames(seuObj), glist),
size = length(intersect(glist, rownames(seuObj))))
features_ctrl <- setdiff(rownames(seuObj), glist)
any(features_ctrl %in% glist) # should be FALSE
length(features_ctrl)
seuObj <- RunPCA(seuObj, assay = 'SCT', features = features,
reduction.name = "pca_PcG")
seuObj <- RunPCA(seuObj, assay = 'SCT', features = features_ctrl,
reduction.name = "pca_ctrl")
#DimPlot(seuObj, reduction = "pca")
#ElbowPlot(object = seuObj, ndims = 40, reduction = "pca_PcG") +
# ElbowPlot(object = seuObj, ndims = 40, reduction = "pca_ctrl")
pca <- 'pca_ctrl'
pct <- seuObj[[pca]]@stdev / sum(seuObj[[pca]]@stdev) * 100
cumu <- cumsum(pct)
co1 <- which(cumu > 90 & pct < 5)[1]
co2 <- sort(which((pct[1:length(pct) - 1] - pct[2:length(pct)]) > 0.1), decreasing = T)[1] + 1
min(co1, co2)
npc <- 50
data.frame(stdev = c(seuObj[["pca_PcG"]]@stdev, seuObj[["pca_ctrl"]]@stdev),
spl = rep(c('PcG', 'ctrl'), each = npc),
pc = rep(1:npc, 2)) %>%
ggplot(aes(x = pc, y = stdev, color = spl)) + geom_point() +
labs(x = 'PC', y = 'Standard deviation')
pca <- c('pca_ctrl', 'pca_PcG')
umap <- gsub(pattern = 'pca', replacement = 'umap', pca)
umap
seuObj <- RunUMAP(seuObj, reduction = pca[1], dims = 1:20, a = 0.8, b = 1.2,
umap.method = "uwot", metric = "euclidean",
reduction.name = umap[1])
seuObj <- RunUMAP(seuObj, reduction = pca[2], dims = 1:20, a = 0.8, b = 1.2,
umap.method = "uwot", metric = "euclidean",
reduction.name = umap[2])
DimPlot(seuObj, reduction = umap[1], group.by = "orig.ident") +
DimPlot(seuObj, reduction = umap[2], group.by = "orig.ident")
# functions associated with 02_dimred_fie.R
prop_contrib <- function(n = 1000, var.contrib, labels = c('PcG', 'non-PcG'),
features){
bin <- ifelse(rownames(var.contrib) %in% features, labels[1], labels[2])
names(bin) <- rownames(var.contrib)
contrib_glist <- apply(var.contrib, 2, function(x, n){
top <- names(tail(sort(x), n))
table(bin[top]) }, n) %>% prop.table(margin = 2)
df <- data.frame(t(contrib_glist)) %>%
rownames_to_column('PC') %>%
mutate(PC = as.factor(gsub('PC_', '', PC)))
#pivot_longer(cols = c(2,3)) %>%
levels(df$PC) <- order(levels(df$PC), as.numeric(levels(df$PC)))
return(df)
}
\ No newline at end of file
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Please register or to comment