From 7892871b659244b793855c9f3b061eab7945529c Mon Sep 17 00:00:00 2001
From: Laurent Modolo <laurent.modolo@ens-lyon.fr>
Date: Wed, 11 Oct 2023 15:34:26 +0200
Subject: [PATCH] Practical b: add kmeans implementation

---
 Practical_b.Rmd | 123 ++++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 123 insertions(+)

diff --git a/Practical_b.Rmd b/Practical_b.Rmd
index cf16b5c..01ff3eb 100644
--- a/Practical_b.Rmd
+++ b/Practical_b.Rmd
@@ -480,6 +480,129 @@ fviz_nbclust(data_pca$x[, 1:3], hcut, method = "silhouette")
 Explain the discrepancy between these results and $k=9$
 </div>
 
+### Implementing your own $k$-means clustering algorithm
+
+The $k$-means algorithm iterates over the two following steps:
+
+- Assign each point to the cluster with the nearest centroid
+- Compute the new cluster centroids
+
+<div class="pencadre">
+Think about the starting state of your algorithm and the stopping condition
+</div>
+
+<details><summary>Solution</summary>
+<p>
+We have no prior information about the centroids, so we can draw them at random.
+We then iterate over the two steps of the algorithm until the centroids stay the same.
+</p>
+</details>
+
+<div class="pencadre">
+Start by implementing a `kmeans_initiation(x, k)` function for your algorithm, returning $k$ centroids
+</div>
+
+<details><summary>Solution</summary>
+<p>
+```{r}
+kmeans_initiation <- function(x, k) {
+  # draw each centroid coordinate uniformly within the range of the data
+  centroid <- matrix(0, k, ncol(x))
+  for (i in seq_len(ncol(x))) {
+    centroid[, i] <- runif(k, min = min(x[, i]), max = max(x[, i]))
+  }
+  return(centroid)
+}
+```
+</p>
+</details>
+
+<div class="pencadre">
+Implement a `compute_distance(x, centroid)` function for your algorithm, returning the distance of each point (row of `x`) to each centroid, based on the squared Euclidean distance
+</div>
+
+<details><summary>Solution</summary>
+<p>
+```{r}
+compute_distance <- function(x, centroid) {
+  distance <- matrix(0, nrow(x), nrow(centroid))
+  for (i in seq_len(nrow(centroid))) {
+    # sweep() subtracts centroid[i, ] from every row of x; a plain
+    # x - centroid[i, ] would recycle the vector down the columns
+    distance[, i] <- rowSums(sweep(x, 2, centroid[i, ])^2)
+  }
+  return(distance)
+}
+```
+</p>
+</details>
+
+<div class="pencadre">
+Implement a `cluster_assignment(distance)` function for your algorithm, returning the cluster assignment of each point (row of `x`), based on the squared Euclidean distance
+</div>
+
+<details><summary>Solution</summary>
+<p>
+```{r}
+cluster_assignment <- function(distance) {
+  cluster <- c()
+  for (i in seq_len(nrow(distance))) {
+    # which.min() returns the index of the first minimum, so ties are
+    # settled in favor of the first cluster
+    cluster[i] <- which.min(distance[i, ])
+  }
+  return(cluster)
+}
+```
+</p>
+</details>
+
+<div class="pencadre">
+Implement a `centroid_update(x, cluster, k)` function for your algorithm, returning the updated centroids of your clusters
+</div>
+
+<details><summary>Solution</summary>
+<p>
+```{r}
+centroid_update <- function(x, cluster, k) {
+  centroid <- matrix(0, k, ncol(x))
+  for (i in 1:k) {
+    # the new centroid of cluster i is the column-wise mean of the
+    # points assigned to it (drop = FALSE keeps a one-row matrix)
+    centroid[i, ] <- colMeans(x[cluster == i, , drop = FALSE])
+  }
+  return(centroid)
+}
+```
+</p>
+</details>
+
+<div class="pencadre">
+Implement a `kmeans_example(x, k)` function for your algorithm, wrapping everything together, and test it
+</div>
+
+<details><summary>Solution</summary>
+<p>
+```{r}
+kmeans_example <- function(x, k) {
+  centroid <- kmeans_initiation(x, k)
+  stop_condition <- TRUE
+  while (stop_condition) {
+    old_centroid <- centroid
+    cluster <- cluster_assignment(compute_distance(x, centroid))
+    centroid <- centroid_update(x, cluster, k)
+    # stop when the centroids no longer move
+    if (max(abs(centroid - old_centroid)) < 1e-8) {
+      stop_condition <- FALSE
+    }
+  }
+  return(cluster)
+}
+```
+</p>
+</details>
+
+```{r, echo = F}
+data_pca %>%
+  fviz_pca_ind(
+    geom = "point",
+    col.ind = as.factor(kmeans_example(data_pca$x[, 1:2], k = 9))
+  )
+```
+
 ## Graph-based clustering
 
 We are going to use the `cluster_louvain()` function to perform a graph-based clustering.
-- 
GitLab
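
The assignment and update steps of the patch can be sanity-checked on a toy dataset. The sketch below re-states `compute_distance`, `cluster_assignment`, and `centroid_update` so it runs on its own; the two-group data and the starting centroids are made up for illustration and are not part of the practical:

```r
# Standalone sanity check of the k-means helper functions on a toy
# dataset with two well-separated groups (hypothetical data).
compute_distance <- function(x, centroid) {
  distance <- matrix(0, nrow(x), nrow(centroid))
  for (i in seq_len(nrow(centroid))) {
    # sweep() subtracts centroid[i, ] from every row of x
    distance[, i] <- rowSums(sweep(x, 2, centroid[i, ])^2)
  }
  return(distance)
}

cluster_assignment <- function(distance) {
  # index of the nearest centroid for each point
  apply(distance, 1, which.min)
}

centroid_update <- function(x, cluster, k) {
  centroid <- matrix(0, k, ncol(x))
  for (i in 1:k) {
    centroid[i, ] <- colMeans(x[cluster == i, , drop = FALSE])
  }
  return(centroid)
}

# three points around (0, 0) and three around (10, 10)
x <- rbind(
  matrix(c(0, 0, 0.1, 0.1, -0.1, 0.1), ncol = 2, byrow = TRUE),
  matrix(c(10, 10, 10.1, 9.9, 9.9, 10.1), ncol = 2, byrow = TRUE)
)
centroid <- rbind(c(0, 0), c(10, 10))

cluster <- cluster_assignment(compute_distance(x, centroid))
cluster
# → 1 1 1 2 2 2: each group is assigned to its own cluster
centroid_update(x, cluster, k = 2)
# the updated centroids sit at the mean of each group
```

With centroids already placed near the two groups, a single assignment/update pass is enough to recover the grouping, which is the fixed point the `kmeans_example()` loop converges to.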