---
# https://www.gnu.org/licenses/agpl-3.0.txt
title: "Practice: Introduction to clustering"
author: "Ghislain Durif, Laurent Modolo, Franck Picard"
---

This work is licensed under a Creative Commons Attribution-ShareAlike 4.0 International License

if (!require("remotes"))
  install.packages("remotes")
if (!require("tidyverse"))
  install.packages("tidyverse")
library(tidyverse) # to manipulate data and make plots
if (!require("Seurat"))
  install.packages("Seurat")
if (!require("factoextra"))
  install.packages("factoextra")
library(factoextra) # manipulate pca results
if (!require("fontawesome"))
  install.packages("fontawesome")
library(fontawesome)
if (!require("igraph"))
  install.packages("igraph")
library(igraph)
if (!require("cccd"))
  install.packages("cccd")
library(cccd)
if (!require("mclust"))
  install.packages("mclust")
library(mclust)
if (!require("umap"))
  install.packages("umap")
rm(list = ls())
knitr::opts_chunk$set(echo = TRUE)
knitr::opts_chunk$set(comment = NA)

if("conflicted" %in% .packages()) conflicted::conflicts_prefer(dplyr::filter)

Introduction

The goal of single-cell transcriptomics is to measure the transcriptional states of large numbers of cells simultaneously. The input to a single-cell RNA sequencing (scRNAseq) method is a collection of cells. Formally, the desired output is a transcripts or genes ($M$) $\times$ cells ($N$) matrix $X^{N \times M}$ that describes, for each cell, the abundance of its constituent transcripts or genes. More generally, single-cell genomics methods seek to measure not just transcriptional state, but other modalities in cells, e.g., protein abundances, epigenetic states, cellular morphology, etc.

We will be analyzing a dataset of Peripheral Blood Mononuclear Cells (PBMC) freely available from 10X Genomics. There are 2,700 single cells that were sequenced on the Illumina NextSeq 500. The raw data can be found here.

Loading the data

if (!file.exists("practical_b.Rdata")) {
  if (!require("SeuratData"))
    remotes::install_github('satijalab/seurat-data', upgrade = T)
  library(SeuratData)
  InstallData("pbmc3k")
  pbmc <- LoadData("pbmc3k", type = "pbmc3k.final")
  pbmc <- NormalizeData(pbmc, normalization.method = "LogNormalize", scale.factor = 10000)
  pbmc <- ScaleData(pbmc)
  pbmc <- FindVariableFeatures(pbmc, selection.method = "vst", nfeatures = 2000)
  data <- Assays(pbmc, slot = "RNA")@scale.data
  cell_annotation <- pbmc@meta.data$seurat_annotations
  colnames(data) <- str_c(1:ncol(data), "_", cell_annotation)
  var_gene_2000 <- VariableFeatures(pbmc)
  save(data, cell_annotation, var_gene_2000, file = "practical_b.Rdata")
}
load("practical_b.Rdata", verbose = T)
library(tidyverse)
library(factoextra)
library(igraph)
library(cccd)
library(mclust)
library(umap)
load(url("https://lbmc.gitbiopages.ens-lyon.fr/hub/formations/ens_m1_ml/practical_b.Rdata"), verbose = T)

You loaded 3 variables

  • data a single-cell RNA-Sequencing count matrix
  • cell_annotation a vector containing the cell-type of the cells
  • var_gene_2000 an ordered vector of the 2000 most variable genes
Check the dimensions of your data
Solution

The scRNASeq data

dim(data)

The number of cell types

length(cell_annotation)
table(cell_annotation)
length(var_gene_2000)
head(var_gene_2000)
Why do you think we need a list of the 2000 most variable genes?

Distances

The clustering algorithms seen this morning rely on Gram matrices. You can compute the Euclidean distance matrix of `data` with the `dist()` function (but don't try to run it on the `r nrow(data)` genes).

The following code computes the cell-to-cell Euclidean distances for the 10 most variable genes and the first 100 cells

c2c_dist_10 <- data[var_gene_2000[1:10], 1:100] %>% 
  t() %>% 
  dist()
Use the following code to study the impact of the number of genes on the distances
c2c_dist_10 %>%
  as.vector() %>% 
  as_tibble() %>% 
  ggplot() +
  geom_histogram(aes(x = value))

What happens when the number of dimensions increases?

Solution

```{r, cache=T}
c2c_dist_n <- tibble(
  n_var = c(seq(from = 10, to = 200, by = 50), seq(from = 200, to = 2000, by = 500), 2000)
) %>%
  mutate(
    cell_dist = purrr::map(n_var, function(n_var, data, var_gene_2000){
      data[var_gene_2000[1:n_var], 1:100] %>%
        t() %>%
        dist() %>%
        as.vector() %>%
        as_tibble()
    }, data = data, var_gene_2000 = var_gene_2000)
  )
c2c_dist_n %>%
  unnest(c(cell_dist)) %>%
  ggplot() +
  geom_histogram(aes(x = value)) +
  facet_wrap(~n_var)
```
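The same concentration effect shows up on purely synthetic data. Below is a minimal sketch (the standard normal coordinates and the dimensions 2, 10, 100 and 1000 are arbitrary choices for illustration only): as the number of dimensions grows, pairwise Euclidean distances pile up around a common value.

```{r, eval=F}
# simulate 100 random points in increasing dimensions and look at their pairwise distances
set.seed(42)
tibble(
  n_dim = c(2, 10, 100, 1000)
) %>%
  mutate(
    cell_dist = purrr::map(n_dim, function(n_dim){
      matrix(rnorm(100 * n_dim), nrow = 100) %>%
        dist() %>%
        as.vector() %>%
        as_tibble()
    })
  ) %>%
  unnest(c(cell_dist)) %>%
  ggplot() +
  geom_histogram(aes(x = value)) +
  facet_wrap(~n_dim, scales = "free_x")
```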

To circumvent this problem, we are going to use PCA as a dimension-reduction technique.

Use the `prcomp()` function to compute `data_pca` from the 600 most variable genes. You can check the results with the following code; the `cell_annotation` variable is the cell-type label of each cell in the dataset:
  fviz_pca_ind(
    data_pca,
    geom = "point",
    col.ind = cell_annotation
  )
Solution

```{r}
data_pca <- data[var_gene_2000[1:600], ] %>%
  t() %>%
  prcomp()
```

data_pca %>%
  fviz_pca_ind(
    geom = "point",
    col.ind = cell_annotation
  )
You can use the `fviz_eig()` function to choose the number of dimensions that you are going to use to compute your distance matrix. Save this matrix in the `data_dist` variable.
Solution

Check the variability explained by the axes of the PCA

fviz_eig(data_pca)

Compute the distance matrix on the first 2 PCs

data_dist <- dist(data_pca$x[,1:2])
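As a complement to the scree plot, a short sketch using `factoextra::get_eig()` (already loaded above) lists the percentage of variance carried by each PC, which can make the choice of a cut-off easier to justify:

```{r, eval=F}
# eigenvalues, percentage of variance and cumulative percentage for the first 10 PCs
get_eig(data_pca) %>%
  head(10)
```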

Hierarchical Clustering

The hclust() function performs a hierarchical clustering analysis from a distance matrix.

data_hclust <- hclust(data_dist)
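Note that `hclust()` agglomerates with complete linkage by default. As a hedged aside (the choice of `"ward.D2"` below is only an example), you can pass another linkage criterion through the `method` argument and compare the resulting dendrograms:

```{r, eval=F}
# same distance matrix, Ward linkage instead of the default complete linkage
data_hclust_ward <- hclust(data_dist, method = "ward.D2")
```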
You can use the `plot()` function to plot the resulting dendrogram.
Solution

```{r}
data_hclust %>%
  plot()
```

Too much information can drown the information; the `cutree()` function can help you solve this problem.
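As a minimal sketch (the value `k = 4` is arbitrary), `cutree()` turns the dendrogram into a flat clustering whose sizes you can tabulate:

```{r, eval=F}
# cut the tree into 4 clusters and count how many cells fall in each one
cutree(data_hclust, k = 4) %>%
  table()
```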

Which choice of `k` would you take?
Modify the following code to display your clustering.
data_pca %>%
  fviz_pca_ind(
    geom = "point",
    col.ind = cell_annotation # we want an as.factor() of your clusters here
  )
Solution

```{r}
data_pca %>%
  fviz_pca_ind(
    geom = "point",
    col.ind = as.factor(cutree(data_hclust, k = 9))
  )
```

The adjusted Rand index can be computed to compare two classifications. This index has an expected value of zero in the case of random partitions, and it is bounded above by 1 in the case of perfect agreement between two partitions.
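As a quick sanity check on synthetic labels (the toy vector below is invented for illustration, it is not part of the dataset): identical partitions give an index of 1, while shuffling one of them brings it close to 0.

```{r, eval=F}
# toy illustration of the adjusted Rand index
toy_labels <- rep(c("A", "B", "C"), each = 50)
adjustedRandIndex(toy_labels, toy_labels)         # perfect agreement: 1
adjustedRandIndex(toy_labels, sample(toy_labels)) # random shuffle: close to 0
```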

Use the `adjustedRandIndex()` function to compare the `cell_annotation` to your hierarchical clustering
Solution

```{r}
adjustedRandIndex(
  cutree(data_hclust, k = 9),
  cell_annotation
)
```

Modify the following code to study the relation between the adjusted Rand index and the number of PCs used to compute the distance
data_pca$x[,1:3] %>% 
  dist() %>% 
  hclust() %>% 
  cutree(k = 9) %>% 
  adjustedRandIndex(cell_annotation)

What can you conclude?

Solution

tibble(
  n_pcs = seq(from = 3, to = 100, by = 10)
) %>% 
  mutate(
    ari = purrr::map(n_pcs, function(n_pcs, data_pca, cell_annotation){
      data_pca$x[,1:n_pcs] %>% 
        dist() %>% 
        hclust() %>% 
        cutree(k = 9) %>% 
        adjustedRandIndex(cell_annotation)
    }, data_pca = data_pca, cell_annotation = cell_annotation)
  ) %>% 
  unnest(ari) %>% 
  ggplot() +
  geom_line(aes(x = n_pcs, y = ari))
Is it a PCA problem?
Solution

```{r}
tibble(
  n_gene = seq(from = 3, to = 600, by = 20)
) %>%
  mutate(
    ari = purrr::map(n_gene, function(n_gene, data, var_gene_2000, cell_annotation){
      data[var_gene_2000[1:n_gene], ] %>%
        t() %>%
        dist() %>%
        hclust() %>%
        cutree(k = 9) %>%
        adjustedRandIndex(cell_annotation)
    }, data = data, var_gene_2000 = var_gene_2000, cell_annotation = cell_annotation)
  ) %>%
  unnest(ari) %>%
  ggplot() +
  geom_line(aes(x = n_gene, y = ari))
```

k-means clustering

The kmeans() function performs a k-means clustering analysis from a distance matrix.

data_kmeans <- kmeans(data_dist, centers = 9)
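As a hedged aside, `kmeans()` starts from random centroids, so two runs can give different partitions; the `nstart` argument (the value 10 and the name `data_kmeans_nstart` below are only illustrative) repeats the initialisation and keeps the run with the lowest total within-cluster sum of squares:

```{r, eval=F}
# repeat the random initialisation 10 times and keep the best run
data_kmeans_nstart <- kmeans(data_dist, centers = 9, nstart = 10)
```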
Why is the `centers` parameter required for `kmeans()` and not for the `hclust()` function?

We want to compare the cells annotation to our clustering.

data_pca %>%
  fviz_pca_ind(
    geom = "point",
    col.ind = cell_annotation
  )
Using the `str()` function to explore the `data_kmeans` result, make the following plot from your k-means results.
data_pca %>%
  fviz_pca_ind(
    geom = "point",
    col.ind = as.factor(data_kmeans$cluster)
  )
Solution

```{r, eval=F}
data_pca %>%
  fviz_pca_ind(
    geom = "point",
    col.ind = as.factor(data_kmeans$cluster)
  )
```

Use the `adjustedRandIndex()` function to compare the `cell_annotation` to your k-means clustering.
Solution

```{r}
adjustedRandIndex(
  data_kmeans$cluster,
  cell_annotation
)
```

Maybe the real number of clusters in the PCs data is not $k = 9$. We can use different metrics to evaluate the effect of the number of clusters on the quality of the clustering.

  • WSS: the Within-Cluster Sum of Squared Errors (each point vs its centroid) for different values of $k$: $\sum_{i=1}^n (x_i - c_i)^2$
  • The silhouette value measures how similar a point is to its own cluster (cohesion) compared to other clusters (separation): $s(i) = \frac{b(i) - a(i)}{\max\{a(i), b(i)\}}$, with $a(i)$ the mean distance between $i$ and cells in the same cluster and $b(i)$ the mean distance between $i$ and cells in different clusters. We plot $\frac{1}{n}\sum_{i=1}^n s(i)$ (both quantities are sketched just below).
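Before plotting them across many values of $k$, here is a minimal sketch of both quantities for a single clustering (it assumes the `cluster` package is installed, which is not loaded above): the total WSS is reported directly by `kmeans()`, and the average silhouette width can be computed from the cluster labels and the distance matrix.

```{r, eval=F}
# total within-cluster sum of squared errors and average silhouette width for k = 9
km <- kmeans(data_pca$x[, 1:3], centers = 9)
km$tot.withinss                                                 # total WSS
sil <- cluster::silhouette(km$cluster, dist(data_pca$x[, 1:3]))
mean(sil[, "sil_width"])                                        # average silhouette value
```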
Use the `fviz_nbclust()` function to plot these two metrics as a function of the number of clusters.
Solution

For the total within-cluster-sum of squared errors

fviz_nbclust(data_pca$x[, 1:3], kmeans, method = "wss")

For the average silhouette width

fviz_nbclust(data_pca$x[, 1:3], kmeans, method = "silhouette")
With `fviz_nbclust()` you can perform the same analysis for your hierarchical clustering. The `hcut()` function allows you to perform `hclust()` and `cutree()` in one step.
Solution

```{r}
fviz_nbclust(data_pca$x[, 1:3], hcut, method = "wss")
```

fviz_nbclust(data_pca$x[, 1:3], hcut, method = "silhouette")
Explain the discrepancy between these results and $k=9$

Graph-based clustering

We are going to use the cluster_louvain() function to perform a graph-based clustering. This function takes as input an undirected graph instead of a distance matrix.

The nng() function computes a k-nearest neighbor graph. With the mutual = T option, this graph is undirected.

Check the effect of `mutual = T` on `data_knn` with the following code:
data_knn <- data_dist %>%
  as.matrix() %>% 
  nng(k = 30, mutual = T)
Solution

```{r, eval=F}
data_knn <- data_dist %>%
  as.matrix() %>%
  nng(k = 30, mutual = T)

data_knn_F <- data_dist %>%
  as.matrix() %>%
  nng(k = 30, mutual = F)

str(data_knn)
str(data_knn_F)
```

Why do we need a kNN graph?
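One way to get an intuition (a rough illustration only, not a complete answer) is to compare how many edges the mutual kNN graph keeps with the number of pairwise distances stored in `data_dist`:

```{r, eval=F}
# the kNN graph is a sparse summary of the full distance matrix
ecount(data_knn)   # edges kept in the mutual kNN graph
length(data_dist)  # pairwise distances in the full distance matrix
```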


The `cluster_louvain()` function implements the multi-level modularity optimization algorithm for finding community structure in a graph. Use this function on `data_knn` to create a `data_louvain` variable.

You can check the clustering results with `membership(data_louvain)`.

<div class="pencadre">
For which `resolution` value do you get 9 clusters ?
</div>

<details><summary>Solution</summary>
<p>
```{r}
data_louvain <- data_knn %>%
  cluster_louvain(resolution = 0.41)
data_pca %>%
  fviz_pca_ind(
    geom = "point",
    col.ind = as.factor(membership(data_louvain))
  )
```
Use the `adjustedRandIndex()` function to compare the `cell_annotation` to your graph-based clustering.
Solution

```{r}
adjustedRandIndex(
  membership(data_louvain),
  cell_annotation
)
```

Graph-based dimension reduction

Uniform Manifold Approximation and Projection (UMAP) is an algorithm for dimension reduction. Its details are described by McInnes, Healy, and Melville, and its official implementation is available through the Python package umap-learn.

library(umap)
data_umap <- umap(data_pca$x[, 1:10])
data_umap$layout %>% 
  as_tibble(.name_repair = "universal") %>% 
  mutate(cell_type = cell_annotation) %>% 
  ggplot() +
  geom_point(aes(x = ...1, y = ...2, color = cell_type))
What can you say about the axes of this plot?
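As a hedged aside on the `umap()` interface itself, the package exposes its hyper-parameters through a configuration object; for example (the value 30 for `n_neighbors` and the name `data_umap_30` are arbitrary illustrations):

```{r, eval=F}
# run UMAP with a custom number of neighbors, starting from the package defaults
custom_config <- umap.defaults
custom_config$n_neighbors <- 30
data_umap_30 <- umap(data_pca$x[, 1:10], config = custom_config)
```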

The .Rmd file corresponding to this page is available here under the AGPL3 Licence

Implementing your own $k$-means clustering algorithm

You are going to implement your own $k$-means algorithm in this section. The $k$-means algorithm follows these steps:

  • Assign each point to the cluster with the nearest centroid
  • Compute the new cluster centroids

Justify each of your functions.

Think about the starting state of your algorithm and the stopping condition
Start by implementing a `kmeans_initiation(x, k)` function, returning $k$ centroids as a starting point.
Implement a `compute_distance(x, centroid)` function that computes the distance of each point (row of `x`) to each centroid
Implement a `cluster_assignment(distance)` function returning the assignment of each point (row of x), based on the results of your `compute_distance(x, centroid)` function.
Implement a `centroid_update(x, cluster, k)` function returning the updated centroid for your clusters.
Implement a `metric_example(x, k)` function to compute a criterion for the goodness of your clustering.
Implement a `kmeans_example(x, k)` function, wrapping everything and test it.
data_pca %>%
  fviz_pca_ind(
    geom = "point",
    col.ind = as.factor(kmeans_example(data_pca$x[,1:2], k = 9))
  )
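For reference, here is one possible minimal sketch of such an implementation. It is only one design among many (random points as initial centroids, Euclidean distances, stable assignments as the stopping condition, WSS as the quality criterion), and empty clusters are deliberately not handled:

```{r, eval=F}
kmeans_initiation <- function(x, k) {
  # starting state: pick k distinct points at random as initial centroids
  x[sample(nrow(x), k), , drop = FALSE]
}

compute_distance <- function(x, centroid) {
  # n x k matrix of Euclidean distances between points (rows of x) and centroids
  apply(centroid, 1, function(c_j) sqrt(rowSums(sweep(x, 2, c_j)^2)))
}

cluster_assignment <- function(distance) {
  # assign each point to its nearest centroid
  apply(distance, 1, which.min)
}

centroid_update <- function(x, cluster, k) {
  # new centroid = mean of the points currently assigned to each cluster
  do.call(rbind, lapply(1:k, function(j) colMeans(x[cluster == j, , drop = FALSE])))
}

kmeans_example <- function(x, k, max_iter = 100) {
  centroid <- kmeans_initiation(x, k)
  cluster <- rep(0L, nrow(x))
  for (i in 1:max_iter) {
    new_cluster <- cluster_assignment(compute_distance(x, centroid))
    if (all(new_cluster == cluster)) break  # stopping condition: assignments are stable
    cluster <- new_cluster
    centroid <- centroid_update(x, cluster, k)
  }
  cluster
}

metric_example <- function(x, k) {
  # within-cluster sum of squared errors of one run (lower is better)
  cluster <- kmeans_example(x, k)
  centroid <- centroid_update(x, cluster, k)
  sum((x - centroid[cluster, ])^2)
}
```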