From bbc65f73b2cc366b362c5ea86daf1f3d9c61ebef Mon Sep 17 00:00:00 2001 From: Laurent Modolo <laurent.modolo@ens-lyon.fr> Date: Mon, 16 Oct 2023 15:54:17 +0200 Subject: [PATCH] Practical a: add answers --- Practical_a.Rmd | 62 +++++++++++++++++++++++++++++++------------------ 1 file changed, 39 insertions(+), 23 deletions(-) diff --git a/Practical_a.Rmd b/Practical_a.Rmd index 1e08020..84860a6 100644 --- a/Practical_a.Rmd +++ b/Practical_a.Rmd @@ -71,13 +71,13 @@ library(palmerpenguins) # we load the data ## loading the data -We are going to work on the famous Palmer penguins dataset. This dataset is an integrative study of the breeding ecology and population structure of Pygoscelis penguins along the western Antarctic Peninsula. These data were collected from 2007 - 2009 by Dr. Kristen Gorman with the Palmer Station Long Term Ecological Research Program, part of the US Long Term Ecological Research Network. +We are going to work on the famous Palmer penguins dataset. This dataset is an integrative study of the breeding ecology and population structure of Pygoscelis penguins along the western Antarctic Peninsula. These data were collected from 2007 to 2009 by Dr. Kristen Gorman with the Palmer Station Long Term Ecological Research Program (part of the US Long Term Ecological Research Network). The `palmerpenguins` data contains size measurements for three penguin species observed on three islands in the Palmer Archipelago, Antarctica.  -The `palmerpenguins` library load the `penguins` dataset into your R environment. If you are not familiar with `tibble`, you just have to know that they are equivalent to `data.frame`. +The `palmerpenguins` library loads the `penguins` dataset into your R environment. If you are not familiar with `tibble`, you just have to know that they are equivalent to `data.frame` (but easier to work with). ```{r} penguins @@ -95,7 +95,7 @@ The data is tidy: - Each observation has its own row. - Each value must have its own cell. 
-Meeting these 3 criteria for your data will simplify most of your data processing and analysis. +Meeting these 3 criteria for your data will simplify your data processing and analysis as most algorithms expect this format. ```{r} summary(penguins) @@ -138,6 +138,12 @@ What can you say about the covariation structure of the data ? To explore the PCA algorithm we are first going to focus on 2 two continuous variable in this data set: the bill length and depth (`bill_length_mm`, `bill_depth_mm`) for the female penguins (`sex`). +```{r} +data %>% + filter(sex == "female") %>% + ggplot() + + geom_point(aes(x = bill_length_mm, y = bill_depth_mm, color = species)) +``` <div class="pencadre"> Using the `filter` and `select` functions, create a `data_f` data set that meet the @@ -155,13 +161,6 @@ data_f <- data %>% </p> </details> -```{r} -data %>% - filter(sex == "female") %>% - ggplot() + - geom_point(aes(x = bill_length_mm, y = bill_depth_mm, color = species)) -``` - ## Performing your first PCA The `prcomp` and `princomp` functions are implementations of the PCA methods @@ -173,7 +172,7 @@ data_f_pca <- prcomp(data_f, scale = T) You can use the `str()` function to explore the `data_f_pca` object. <div class="pencadre"> -Compare the `center` and `scale` slot with the `data_f` table +Compare the `center` and `scale` slots with the moments (mean and standard deviation) of the `data_f` table </div> <details><summary>Solution</summary> @@ -205,18 +204,24 @@ The package `factoextra` provides us with functions to manipulate and plot the o You can do this with the `fviz_pca_ind` function. 
<div class="red_pencadre"> -Compare the `fviz_pca_ind` output with the `bill_length_mm` and `bill_depth_mm` scatter plot +Compare the `fviz_pca_ind` output with the `bill_length_mm` and `bill_depth_mm` scatter plot (you need the species variable) + +```{r} +species_f <- data %>% filter(sex == "female") %>% pull(species) +``` </div> <details><summary>Solution</summary> <p> ```{r} -species_f <- data %>% filter(sex == "female") %>% pull(species) # we get the species variable fviz_pca_ind(data_f_pca, geom = "point", col.ind = species_f ) ``` + +We have a rotation of the point coordinates + </p> </details> @@ -225,8 +230,10 @@ What are the percentages in the Dim1 and Dim2 axes ? </div> <details><summary>Solution</summary> -71.3 % of total variation explained by the first principal component -28.7 % of total variation explained by the second principal component + +- 71.3 % of total variation is explained by the first principal component +- 28.7 % of total variation is explained by the second principal component + </p> </details> @@ -273,6 +280,15 @@ diy_data_f <- data_f %>% Explain the importance of the centering and scaling steps of the data </div> +<details><summary>Solution</summary> +<p> + +- centering sets the empirical mean to 0; if the data are not centered, the distance matrix is not the covariance matrix +- scaling puts every variable on the same scale + +</p> +</details> + ### Data projection @@ -315,7 +331,7 @@ diy_pca %>% aes(x = bill_length_mm, y = bill_depth_mm, xend = projection_x, - yend = projection_y), color = "red", size = 0.1) + + yend = projection_y), color = "red", linewidth = 0.1) + coord_equal() ``` @@ -394,7 +410,7 @@ diy_pca %>% aes(x = bill_length_mm, y = bill_depth_mm, xend = projection_x, - yend = projection_y), color = "red", size = 0.1) + + yend = projection_y), color = "red", linewidth = 0.1) + labs(title = str_c("SS = ", round(sum(diy_pca$S_dist), 2), ", SR = ", round(sum(diy_pca$Residuals), 2))) + coord_equal() @@ -477,7 +493,7 @@ diy_pca %>% 
aes(x = bill_length_mm, y = bill_depth_mm, xend = pc1_x, - yend = pc1_y), color = "red", size = 0.1) + + yend = pc1_y), color = "red", linewidth = 0.1) + labs(title = str_c("SS = ", round(sum(diy_pca$S_dist), 2), ", SR = ", round(sum(diy_pca$Residuals), 2))) + coord_equal() @@ -513,7 +529,7 @@ diy_pca %>% aes(x = bill_length_mm, y = bill_depth_mm, xend = pc1_x, - yend = pc1_y), color = "red", size = 0.1) + + yend = pc1_y), color = "red", linewidth = 0.1) + labs(title = str_c("SS = ", round(sum(diy_pca$S_dist), 2), ", SR = ", round(sum(diy_pca$Residuals), 2))) + coord_equal() @@ -585,7 +601,7 @@ diy_pca %>% aes(x = bill_length_mm, y = bill_depth_mm, xend = pc2_x, - yend = pc2_y), color = "red", size = 0.1) + + yend = pc2_y), color = "red", linewidth = 0.1) + labs(title = str_c("SS = ", round(sum(diy_pca$S_dist), 2), ", SR = ", round(sum(diy_pca$Residuals), 2))) + coord_equal() @@ -622,7 +638,7 @@ diy_pca %>% aes(x = bill_length_mm, y = bill_depth_mm, xend = pc2_x, - yend = pc2_y), color = "red", size = 0.1) + + yend = pc2_y), color = "red", linewidth = 0.1) + labs(title = str_c("SS = ", round(sum(diy_pca$S_dist), 2), ", SR = ", round(sum(diy_pca$Residuals), 2))) + coord_equal() @@ -702,12 +718,12 @@ diy_pca %>% aes(x = bill_length_mm, y = bill_depth_mm, xend = pc1_x, - yend = pc1_y,), color = "red", size = 0.1) + + yend = pc1_y,), color = "red", linewidth = 0.1) + geom_segment( aes(x = bill_length_mm, y = bill_depth_mm, xend = pc2_x, - yend = pc2_y,), color = "red", size = 0.1) + + yend = pc2_y,), color = "red", linewidth = 0.1) + coord_equal() ``` -- GitLab