From 32c8edfddc768c80d1aa61ff5ac71e1265055a2f Mon Sep 17 00:00:00 2001
From: Laurent Modolo <laurent.modolo@ens-lyon.fr>
Date: Fri, 28 Oct 2022 10:55:17 +0200
Subject: [PATCH] Practical a & b add red pencadre

---
 Practical_a.Rmd | 31 +++++++++++++++++++++++--------
 Practical_b.Rmd | 24 ++++++++++++++++++------
 2 files changed, 41 insertions(+), 14 deletions(-)

diff --git a/Practical_a.Rmd b/Practical_a.Rmd
index f10baa8..1e08020 100644
--- a/Practical_a.Rmd
+++ b/Practical_a.Rmd
@@ -132,6 +132,10 @@ data %>%
   pairs()
 ```
 
+<div class="red_pencadre">
+What can you say about the covariation structure of the data?
+</div>
+
 To explore the PCA algorithm, we are first going to focus on two continuous variables in this data set: the bill length and depth (`bill_length_mm`, `bill_depth_mm`) for the female penguins (`sex`).
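+
+As a rough sketch, assuming the penguin data is in `data` with the `palmerpenguins` column names (the `data_f` object used below may be built slightly differently), this subsetting could look like:
+
+```{r, eval=F}
+library(dplyr)
+
+# keep only the female penguins and the two bill measurements (plus species for later plots)
+data_f <- data %>%
+  filter(sex == "female") %>%
+  select(species, bill_length_mm, bill_depth_mm)
+```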
 
 
@@ -200,7 +204,7 @@ The package `factoextra` provides us with functions to manipulate and plot the o
 
 You can do this with the `fviz_pca_ind` function.
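+
+A minimal call might look like this (the name `data_pca_f` is only an assumption for the `prcomp()` result computed on this subset; adapt it to your own object):
+
+```{r, eval=F}
+library(factoextra)
+
+# plot the individuals (rows) in the plane of the first two principal components
+fviz_pca_ind(data_pca_f, geom = "point")
+```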
 
-<div class="pencadre">
+<div class="red_pencadre">
 Compare the `fviz_pca_ind` output with the `bill_length_mm` and `bill_depth_mm` scatter plot.
 </div>
  
@@ -265,8 +269,8 @@ diy_data_f <- data_f %>%
 </p>
 </details>
 
-<div class="pencadre">
-! Explain the importance of the centering and scaling steps of the data
+<div class="red_pencadre">
+Explain the importance of the centering and scaling steps of the data.
 </div>
 
 
@@ -350,6 +354,10 @@ first_pc_projection_code <- function(line_slope, x, y){
 
 We can minimize the distance from each point to the red line, or we can maximize the distance from each projected point to the origin.
 
+<div class="red_pencadre">
+Explain why these two approaches are equivalent.
+</div>
+
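+As a hint, consider a centered point $x_i$ and a candidate line through the origin with unit direction $u$. By the Pythagorean theorem,
+
+$$
+\lVert x_i \rVert^2 = d_i^2 + p_i^2,
+$$
+
+where $d_i$ is the distance from $x_i$ to the line and $p_i = x_i^\top u$ is the distance from its projection to the origin. Since $\lVert x_i \rVert^2$ does not depend on the line, minimizing $\sum_i d_i^2$ over all candidate lines is the same as maximizing $\sum_i p_i^2$.
+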
 The following code is the same as before, but with:
 
 - `SS`: the sum of squared distances of the projected points to the origin
@@ -534,7 +542,7 @@ geom_abline(slope = eigen(diy_cov)$vector[2, 1] / eigen(diy_cov)$vector[1, 1], c
 </p>
 </details>
 
-<div class="pencadre">
+<div class="red_pencadre">
 Do you have the same results as your neighbors?
 </div>
 
@@ -644,7 +652,7 @@ For the slope value:
 
 For the PCA construction, we want a PC2 orthogonal to PC1.
 
-<div class="pencadre">
+<div class="red_pencadre">
 How many lines are compatible with the orthogonality constraint for 2 variables?
 For 3 variables?
 </div>
@@ -813,7 +821,7 @@ diy_data_f %>%
   geom_point(aes(x = pc1_x_ref, y = pc2_y_ref, color = species), size = 0.5) 
 ```
 
-<div class="pencadre">
+<div class="red_pencadre">
 What could be the problem when comparing these two PCA results?
 </div>
 
@@ -890,6 +898,11 @@ pc_var / sum(pc_var)
 </p>
 </details>
 
+<div class="red_pencadre">
+Why are these metrics important when analyzing PCA outputs?
+</div>
+
+
 ### Scree plot
 
 The `fviz_eig` function creates a scree plot of your PCA.
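+
+For example (assuming the PCA object computed earlier is named `data_pca`):
+
+```{r, eval=F}
+library(factoextra)
+
+# scree plot: percentage of variance explained by each principal component
+fviz_eig(data_pca, addlabels = TRUE)
+```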
@@ -921,7 +934,7 @@ tibble(
 
 The axes of the PCA are linear combinations of the variable axes.
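+
+For instance, a small sketch reusing the eigen decomposition of `diy_cov` from above: the direction of PC1 in the (`bill_length_mm`, `bill_depth_mm`) plane is given by the ratio of its two loadings.
+
+```{r, eval=F}
+# loadings of PC1 on the two original variables
+pc1 <- eigen(diy_cov)$vector[, 1]
+
+# slope of PC1 in the original variable plane
+pc1[2] / pc1[1]
+```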
 
-<div class="pencadre">
+<div class="red_pencadre">
 What does it mean to find a slope of `0.5` for PC1 in the `bill_length_mm`, `bill_depth_mm` plot?
 </div>
 
@@ -956,7 +969,7 @@ fviz_pca_biplot(
   )
 ```
 
-<div class="pencadre">
+<div class="red_pencadre">
 Remember that we scaled all the variables before computing the PCA. What is the consequence of this scaling for the contribution of each variable and its unit?
 </div>
 
@@ -972,6 +985,8 @@ Explore the results of these two functions.
 
 <div class="pencadre">
 Which variable contributes the most to PC4?
+</div>
+<div class="red_pencadre">
 How do you interpret the `cos2` slot?
 </div>
 
diff --git a/Practical_b.Rmd b/Practical_b.Rmd
index ca18149..cf16b5c 100644
--- a/Practical_b.Rmd
+++ b/Practical_b.Rmd
@@ -132,7 +132,7 @@ head(var_gene_2000)
 </details>
 
 
-<div class="pencadre">
+<div class="red_pencadre">
 Why do you think we need a list of the 2000 most variable genes?
 </div>
 
@@ -149,7 +149,7 @@ c2c_dist_10 <- data[var_gene_2000[1:10], 1:100] %>%
   dist()
 ```
 
-<div class="pencadre">
+<div class="red_pencadre">
 Use the following code to study the impact of the number of genes on the distances:
 
 ```{r, eval=F}
@@ -265,9 +265,11 @@ data_hclust %>% plot()
 
 Too much information drowns the information; the `cutree()` function can help you solve this problem.
 
-<div class="pencadre">
+<div class="red_pencadre">
 Which value of `k` would you choose?
+</div>
 
+<div class="pencadre">
 Modify the following code to display your clustering.
 
 ```{r}
@@ -309,7 +311,7 @@ adjustedRandIndex(
 </details>
 
 
-<div class="pencadre">
+<div class="red_pencadre">
 Modify the following code to study the relationship between the adjusted Rand index and the number of PCs used to compute the distance:
 
 ```{r, eval=F}
@@ -379,7 +381,7 @@ The `kmeans()` function performs a k-means clustering analysis from a distance m
 data_kmeans <- kmeans(data_dist, centers = 9)
 ```
 
-<div class="pencadre">
+<div class="red_pencadre">
 Why is the `centers` parameter required for `kmeans()` and not for the `hclust()` function?
 </div>
 
@@ -474,6 +476,10 @@ fviz_nbclust(data_pca$x[, 1:3], hcut, method = "silhouette")
 </p>
 </details>
 
+<div class="red_pencadre">
+Explain the discrepancy between these results and $k=9$.
+</div>
+
 ## Graph-based clustering
 
 We are going to use the `cluster_louvain()` function to perform a graph-based clustering.
@@ -505,6 +511,12 @@ str(data_knn_F)
 </p>
 </details>
 
+
+<div class="red_pencadre">
+Why do we need a kNN graph?
+</div>
+
+
 The `cluster_louvain()` function implements the multi-level modularity optimization algorithm for finding community structure in a graph. Use this function on `data_knn` to create a `data_louvain` variable.
 
 You can check the clustering results with `membership(data_louvain)`.
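+
+A minimal sketch, assuming `data_knn` is an undirected `igraph` graph built in the previous step:
+
+```{r, eval=F}
+library(igraph)
+
+# Louvain community detection on the kNN graph
+data_louvain <- cluster_louvain(data_knn)
+
+# one community label per cell
+membership(data_louvain)
+```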
@@ -559,7 +571,7 @@ data_umap$layout %>%
   geom_point(aes(x = V1, y = V2, color = cell_type))
 ```
 
-<div class="pencadre">
+<div class="red_pencadre">
 What can you say about the axes of this plot?
 </div>
 
-- 
GitLab