From 32c8edfddc768c80d1aa61ff5ac71e1265055a2f Mon Sep 17 00:00:00 2001 From: Laurent Modolo <laurent.modolo@ens-lyon.fr> Date: Fri, 28 Oct 2022 10:55:17 +0200 Subject: [PATCH] Practical a & b add red pencadre --- Practical_a.Rmd | 31 +++++++++++++++++++++++-------- Practical_b.Rmd | 24 ++++++++++++++++++------ 2 files changed, 41 insertions(+), 14 deletions(-) diff --git a/Practical_a.Rmd b/Practical_a.Rmd index f10baa8..1e08020 100644 --- a/Practical_a.Rmd +++ b/Practical_a.Rmd @@ -132,6 +132,10 @@ data %>% pairs() ``` +<div class="red_pencadre"> +What can you say about the covariation structure of the data ? +</div> + To explore the PCA algorithm we are first going to focus on 2 two continuous variable in this data set: the bill length and depth (`bill_length_mm`, `bill_depth_mm`) for the female penguins (`sex`). @@ -200,7 +204,7 @@ The package `factoextra` provides us with functions to manipulate and plot the o You can do this with the `fviz_pca_ind` function. -<div class="pencadre"> +<div class="red_pencadre"> Compare the `fviz_pca_ind` output with the `bill_length_mm` and `bill_depth_mm` scatter plot </div> @@ -265,8 +269,8 @@ diy_data_f <- data_f %>% </p> </details> -<div class="pencadre"> -! Explain the importance of the centering and scaling steps of the data +<div class="red_pencadre"> +Explain the importance of the centering and scaling steps of the data </div> @@ -350,6 +354,10 @@ first_pc_projection_code <- function(line_slope, x, y){ We can minimize the distance from each point to the red line. Or we can maximize the distance from each point to the point of origin. +<div class="red_pencadre"> +Explain why these two approaches are equivalent. 
+</div> + The following code is the same as before but with the - `SS` : Sum-square of distances of the projected point to the origin @@ -534,7 +542,7 @@ geom_abline(slope = eigen(diy_cov)$vector[2, 1] / eigen(diy_cov)$vector[1, 1], c </p> </details> -<div class="pencadre"> +<div class="red_pencadre"> Do you have the same results as your neighbors ? </div> @@ -644,7 +652,7 @@ For the slope value: For the PCA construction we want, a PC2 orthogonal to the PC1. -<div class="pencadre"> +<div class="red_pencadre"> How many lines are compatible with the orthogonality constraint for 2 variables ? For 3 variables ? </div> @@ -813,7 +821,7 @@ diy_data_f %>% geom_point(aes(x = pc1_x_ref, y = pc2_y_ref, color = species), size = 0.5) ``` -<div class="pencadre"> +<div class="red_pencadre"> What could be the problem when comparing these two PCA results ? </div> @@ -890,6 +898,11 @@ pc_var / sum(pc_var) </p> </details> +<div class="red_pencadre"> +Why are these metrics important when analyzing PCA outputs ? +</div> + + ### Scree plot The `fviz_eig` function create a scree plot of your PCA. @@ -921,7 +934,7 @@ tibble( The axis of the PCA are linear combinations of the variable axis. -<div class="pencadre"> +<div class="red_pencadre"> What does it mean to find a slope of `0.5` for PC1 in the `bill_length_mm`, `bill_depth_mm` plot ? </div> @@ -956,7 +969,7 @@ fviz_pca_biplot( ) ``` -<div class="pencadre"> +<div class="red_pencadre"> Remember that we scaled all the variables before computing the PCA. What are the results of the scaling in terms of variable unit contribution ? </div> @@ -972,6 +985,8 @@ Explore the results of these two functions. <div class="pencadre"> What is the variable contributing the most to PC4 ? 
+</div> +<div class="red_pencadre"> How do you interpret the `cos2` slot </div> diff --git a/Practical_b.Rmd b/Practical_b.Rmd index ca18149..cf16b5c 100644 --- a/Practical_b.Rmd +++ b/Practical_b.Rmd @@ -132,7 +132,7 @@ head(var_gene_2000) </details> -<div class="pencadre"> +<div class="red_pencadre"> Why do you think that we need a list of the 2000 most variable genes ? </div> @@ -149,7 +149,7 @@ c2c_dist_10 <- data[var_gene_2000[1:10], 1:100] %>% dist() ``` -<div class="pencadre"> +<div class="red_pencadre"> Use the following code to study the impact of the number of genes on the distances ```{r, eval=F} @@ -265,9 +265,11 @@ data_hclust %>% plot() Too much information drawn the information, the function `cutree()` can help you solve this problem. -<div class="pencadre"> +<div class="red_pencadre"> Which choice of `k` would you take ? +</div> +<div class="pencadre"> Modify the following code to display your clustering. ```{r} @@ -309,7 +311,7 @@ adjustedRandIndex( </details> -<div class="pencadre"> +<div class="red_pencadre"> Modify the following code to study the relation between the adjusted Rand index and the number of PCs used to compute the distance ```{r, eval=F} @@ -379,7 +381,7 @@ The `kmeans()` function performs a k-means clustering analysis from a distance m data_kmeans <- kmeans(data_dist, centers = 9) ``` -<div class="pencadre"> +<div class="red_pencadre"> Why is the `centers` parameter required for `kmeans()` and not for the `hclust()` function ? </div> @@ -474,6 +476,10 @@ fviz_nbclust(data_pca$x[, 1:3], hcut, method = "silhouette") </p> </details> +<div class="red_pencadre"> +Explain the discrepancy between these results and $k=9$ +</div> + ## Graph-based clustering We are going to use the `cluster_louvain()` function to perform a graph-based clustering. @@ -505,6 +511,12 @@ str(data_knn_F) </p> </details> + +<div class="red_pencadre"> +Why do we need a knn ? 
+</div> + + The `cluster_louvain()` function implements the multi-level modularity optimization algorithm for finding community structure in a graph. Use this function on `data_knn` to create a `data_louvain` variable. You can check the clustering results with `membership(data_louvain)`. @@ -559,7 +571,7 @@ data_umap$layout %>% geom_point(aes(x = V1, y = V2, color = cell_type)) ``` -<div class="pencadre"> +<div class="red_pencadre"> What can you say about the axes of this plot ? </div> -- GitLab