Practical_a.Rmd

# https://www.gnu.org/licenses/agpl-3.0.txt
title: "Practice: Introduction to Principal Component Analysis"
author: "Ghislain Durif, Laurent Modolo, Franck Picard"
# Install each dependency when it is missing, then attach it.
# `requireNamespace()` only tests availability (it attaches nothing), so
# attaching is left to the explicit `library()` call that follows each test.
if (!requireNamespace("tidyverse", quietly = TRUE)) {
  install.packages("tidyverse")
}
library(tidyverse) # to manipulate data and make plots
if (!requireNamespace("factoextra", quietly = TRUE)) {
  install.packages("factoextra")
}
library(factoextra) # manipulate pca results
if (!requireNamespace("palmerpenguins", quietly = TRUE)) {
  install.packages("palmerpenguins")
}
library(palmerpenguins) # we load the data
if (!requireNamespace("fontawesome", quietly = TRUE)) {
  install.packages("fontawesome")
}
library(fontawesome)
# NOTE(review): this wipes every object of the global environment; kept to
# preserve the original behavior, but consider dropping it.
rm(list = ls())
knitr::opts_chunk$set(echo = TRUE)
knitr::opts_chunk$set(comment = NA)

# When the `conflicted` package is attached, declare that `filter` means
# `dplyr::filter`.
if ("conflicted" %in% .packages()) {
  conflicted::conflicts_prefer(dplyr::filter)
}

# Orthogonal projection of the point (x, y) onto the line through the origin
# whose direction vector is (1, line_slope).
#
# line_slope: slope of the line.
# x, y: coordinates of the point to project.
# Returns a numeric vector holding the x and y coordinates of the projection.
first_pc_projection_code <- function(line_slope, x, y){
  direction <- c(1, line_slope)
  # normalise the direction so that the dot product below is a coordinate
  unit_direction <- direction / sqrt(sum(direction^2))
  point <- c(x, y)
  coordinate <- as.numeric(point %*% unit_direction)
  coordinate * unit_direction
}
# Packages used throughout the practical.
library(tidyverse) # data manipulation and plotting
library(factoextra) # helpers to explore PCA results
library(palmerpenguins) # provides the `penguins` data set

# First look at the raw data: content, dimensions and per-column summary.
penguins
dim(penguins)
summary(penguins)

# Keep only the rows without any missing value.
data <- drop_na(penguins)
dim(data)

# Scatter-plot matrix of the four numeric measurements.
pairs(
  select(data, bill_length_mm, bill_depth_mm, flipper_length_mm, body_mass_g)
)

# Bill length against bill depth for the females, colored by species.
ggplot(filter(data, sex == "female")) +
  geom_point(aes(x = bill_length_mm, y = bill_depth_mm, color = species))
# Numeric measurements of the female penguins: the input of the PCA.
# `data_f` was used below without being defined anywhere in this script; the
# selection matches the one applied to the females elsewhere in this file.
data_f <- data %>%
  filter(sex == "female") %>%
  select(bill_length_mm, bill_depth_mm, flipper_length_mm, body_mass_g)

# PCA on the scaled variables. The argument of prcomp() is `scale.` (with a
# dot); it is spelled out instead of relying on partial matching, and `TRUE`
# replaces the reassignable shortcut `T`.
data_f_pca <- prcomp(data_f, scale. = TRUE)
str(data_f_pca)

# Centering and scaling values stored in the PCA object, next to the column
# means and standard deviations computed directly from the data.
data_f_pca$center
data_f_pca$scale
map(data_f, mean)
map(data_f, sd)

# Species of each female penguin.
species_f <- data %>% filter(sex == "female") %>% pull(species)

# Share of the total variance carried by each principal component.
data_f_pca$sdev^2 / sum(data_f_pca$sdev^2)

# Individuals in the (PC1, PC2) plane.
as_tibble(data_f_pca$x) %>%
  bind_cols(data %>% filter(sex == "female")) %>% # we add all the other variables to color by species
  ggplot() +
  geom_point(aes(x = PC1, y = PC2, color = species))
# Slope of the candidate line (try other values).
line_slope <- 2 # to change

# Projection of the point (x, y) for a given `line_slope`.
# Delegates the computation to first_pc_projection_code() and repackages the
# first two elements of its result as a list with elements `x` and `y`.
point_projection <- function(line_slope, x, y){
  projected <- first_pc_projection_code(line_slope, x, y) # to replace with your code
  list(x = projected[1], y = projected[2])
}

# For every observation, coordinates of its projection for the current
# `line_slope`. The table stays row-wise after this step.
diy_pca <- mutate(
  rowwise(diy_data_f), # evaluate the expressions below one row at a time
  projection_x = point_projection(
    line_slope = line_slope, x = bill_length_mm, y = bill_depth_mm
  )$x,
  projection_y = point_projection(
    line_slope = line_slope, x = bill_length_mm, y = bill_depth_mm
  )$y
)

# Observations (black), the line and the projections (red), and one thin
# segment joining each observation to its projection.
ggplot(diy_pca) +
  geom_point(aes(x = bill_length_mm, y = bill_depth_mm)) +
  geom_abline(slope = line_slope, color = "red") +
  geom_point(aes(x = projection_x, y = projection_y), color = "red") +
  geom_segment(
    aes(
      x = bill_length_mm, xend = projection_x,
      y = bill_depth_mm, yend = projection_y
    ),
    color = "red",
    linewidth = 0.1
  ) +
  coord_equal()
# Exercise template: same projection as before with another slope, plus two
# per-row quantities (`S_dist` and `Residuals`) whose formulas are left blank
# on purpose and must be filled in before running this chunk.
line_slope <- 0.2
diy_pca <- diy_data_f %>%
  rowwise() %>% # perform the subsequent operation row by row
  mutate(
    projection_x = point_projection(
      line_slope = line_slope,
      x = bill_length_mm,
      y = bill_depth_mm)$x,
    projection_y = point_projection(
      line_slope = line_slope,
      x = bill_length_mm,
      y = bill_depth_mm)$y,
    S_dist = ,# right formula (to be written)
    Residuals = # right formula (to be written)
  )

# Observations, line and projections; the title reports the sum of `S_dist`
# ("SS") and the sum of `Residuals` ("SR"), both rounded to 2 digits.
diy_pca %>%
  ggplot() +
  geom_point(aes(x = bill_length_mm, y = bill_depth_mm)) +
  geom_abline(slope = line_slope, color = "red") +
  geom_point(aes(x = projection_x, y = projection_y), color = "red") +
  geom_segment(
    aes(x = bill_length_mm,
        y = bill_depth_mm,
        xend = projection_x,
        yend = projection_y), color = "red", linewidth = 0.1) +
  labs(title = str_c("SS = ", round(sum(diy_pca$S_dist), 2),
                     ", SR = ", round(sum(diy_pca$Residuals), 2))) +
  coord_equal()
# Projection of the point (x, y) onto the direction given by the first
# eigenvector of `diy_cov`.
#
# diy_cov: square matrix passed to eigen().
# x, y: coordinates of the point to project.
# Returns a list with elements `x` and `y`, the coordinates of the projection.
point_projection <- function(diy_cov, x, y){
  a <- c(x, y)
  # eigen() names this component `vectors`; the full name is used instead of
  # relying on partial matching by `$`
  b <- eigen(diy_cov)$vectors[, 1]
  # (a . b) is the coordinate of the point along b
  results <- c(a %*% b) * b
  list(x = results[1], y = results[2])
}

# Project every observation with point_projection() and compute, per row, the
# squared distance of the projection to the origin (`S_dist`) and the distance
# between the observation and its projection (`Residuals`).
diy_pca <- diy_data_f %>%
  rowwise() %>% # perform the subsequent operation row by row
  mutate(
    pc1_x = point_projection(
      diy_cov = diy_cov,
      x = bill_length_mm,
      y = bill_depth_mm)$x,
    pc1_y = point_projection(
      diy_cov = diy_cov,
      x = bill_length_mm,
      y = bill_depth_mm)$y,
    S_dist = pc1_x^2 + pc1_y^2,
    Residuals = sqrt((pc1_x - bill_length_mm)^2 + (pc1_y - bill_depth_mm)^2)
  )

# Observations, axis and projections; the title reports the sum of `S_dist`
# ("SS") and the sum of `Residuals` ("SR").
diy_pca %>%
  ggplot() +
  geom_point(aes(x = bill_length_mm, y = bill_depth_mm)) +
  # slope of the axis: y component over x component of the first eigenvector
  # (`$vectors` spelled in full rather than partially matched)
  geom_abline(
    slope = eigen(diy_cov)$vectors[2, 1] / eigen(diy_cov)$vectors[1, 1],
    color = "red") +
  geom_point(aes(x = pc1_x, y = pc1_y), color = "red") +
  geom_segment(
    aes(x = bill_length_mm,
        y = bill_depth_mm,
        xend = pc1_x,
        yend = pc1_y), color = "red", linewidth = 0.1) +
  labs(title = str_c("SS = ", round(sum(diy_pca$S_dist), 2),
                     ", SR = ", round(sum(diy_pca$Residuals), 2))) +
  coord_equal()
# Exercise template: projection of the point (x, y) computed from `diy_cov`.
# The body is to be written; it must create `results`, which is not defined
# in this function as it stands. Returns a list with elements `x` and `y`
# taken from the first two elements of `results`.
point_projection <- function(diy_cov, x, y){
  # your code (must define `results`)
  list(x = results[1], y = results[2])
}

# Exercise template: projection of every observation with point_projection().
# The formulas of `S_dist` and `Residuals`, and the slope of the line drawn
# below, are left blank on purpose and must be filled in before running.
diy_pca <- diy_data_f %>%
  rowwise() %>% # perform the subsequent operation row by row
  mutate(
    pc1_x = point_projection(
      diy_cov = diy_cov,
      x = bill_length_mm,
      y = bill_depth_mm)$x,
    pc1_y = point_projection(
      diy_cov = diy_cov,
      x = bill_length_mm,
      y = bill_depth_mm)$y,
    S_dist = , # right formula (to be written)
    Residuals = # right formula (to be written)
  )

# Observations, line and projections; the title reports the sum of `S_dist`
# ("SS") and the sum of `Residuals` ("SR"), both rounded to 2 digits.
diy_pca %>%
  ggplot() +
  geom_point(aes(x = bill_length_mm, y = bill_depth_mm)) +
  geom_abline(slope = , color = "red") + # missing slope value here
  geom_point(aes(x = pc1_x, y = pc1_y), color = "red") +
  geom_segment(
    aes(x = bill_length_mm,
        y = bill_depth_mm,
        xend = pc1_x,
        yend = pc1_y), color = "red", linewidth = 0.1) +
  labs(title = str_c("SS = ", round(sum(diy_pca$S_dist), 2),
                     ", SR = ", round(sum(diy_pca$Residuals), 2))) +
  coord_equal()
# Projection of the point (x, y) onto the direction given by the first
# eigenvector of `diy_cov`.
#
# diy_cov: square matrix passed to eigen().
# x, y: coordinates of the point to project.
# Returns a list with elements `x` and `y`, the coordinates of the projection.
point_projection <- function(diy_cov, x, y){
  a <- c(x, y)
  # `$vectors` is the full component name returned by eigen(); no partial
  # matching
  b <- eigen(diy_cov)$vectors[, 1]
  # (a . b) is the coordinate of the point along b
  results <- c(a %*% b) * b
  list(x = results[1], y = results[2])
}
# Solution layer: line whose slope is the y component over the x component of
# the first eigenvector of `diy_cov`. It is written without a trailing `+`,
# which would chain this expression with the statement that follows it.
geom_abline(slope = eigen(diy_cov)$vectors[2, 1] / eigen(diy_cov)$vectors[1, 1], color = "red")
# Projection of the point (x, y) onto the direction given by the second
# eigenvector of `diy_cov`.
#
# diy_cov: square matrix passed to eigen().
# x, y: coordinates of the point to project.
# Returns a list with elements `x` and `y`, the coordinates of the projection.
point_projection <- function(diy_cov, x, y){
  a <- c(x, y)
  # `$vectors` is the full component name returned by eigen(); no partial
  # matching
  b <- eigen(diy_cov)$vectors[, 2]
  # (a . b) is the coordinate of the point along b
  results <- c(a %*% b) * b
  return(list(x = results[1], y = results[2]))
}

# Project every observation with point_projection() and compute, per row, the
# squared distance of the projection to the origin (`S_dist`) and the distance
# between the observation and its projection (`Residuals`).
diy_pca <- diy_data_f %>%
  rowwise() %>% # perform the subsequent operation row by row
  mutate(
    pc2_x = point_projection(
      diy_cov = diy_cov,
      x = bill_length_mm,
      y = bill_depth_mm)$x,
    pc2_y = point_projection(
      diy_cov = diy_cov,
      x = bill_length_mm,
      y = bill_depth_mm)$y,
    S_dist = pc2_x^2 + pc2_y^2,
    Residuals = sqrt((pc2_x - bill_length_mm)^2 + (pc2_y - bill_depth_mm)^2)
  )

# Observations, both axes and the projections; the title reports the sum of
# `S_dist` ("SS") and the sum of `Residuals` ("SR").
diy_pca %>%
  ggplot() +
  geom_point(aes(x = bill_length_mm, y = bill_depth_mm)) +
  # slopes: y component over x component of the first (red) and second (blue)
  # eigenvectors; `$vectors` is spelled in full rather than partially matched
  geom_abline(
    slope = eigen(diy_cov)$vectors[2, 1] / eigen(diy_cov)$vectors[1, 1],
    color = "red") +
  geom_abline(
    slope = eigen(diy_cov)$vectors[2, 2] / eigen(diy_cov)$vectors[1, 2],
    color = "blue") +
  geom_point(aes(x = pc2_x, y = pc2_y), color = "blue") +
  geom_segment(
    aes(x = bill_length_mm,
        y = bill_depth_mm,
        xend = pc2_x,
        yend = pc2_y), color = "red", linewidth = 0.1) +
  labs(title = str_c("SS = ", round(sum(diy_pca$S_dist), 2),
                     ", SR = ", round(sum(diy_pca$Residuals), 2))) +
  coord_equal()
# Exercise template: projection of the point (x, y) computed from `diy_cov`.
# The body is to be written; it must create `results`, which is not defined
# in this function as it stands. Returns a list with elements `x` and `y`
# taken from the first two elements of `results`.
point_projection <- function(diy_cov, x, y){
  # your code (must define `results`)
  return(list(x = results[1], y = results[2]))
}

# Exercise template: adds the `pc2_x` / `pc2_y` projections to the existing
# `diy_pca` table (the pipeline starts from `diy_pca`, not `diy_data_f`).
# The formulas of `S_dist` and `Residuals`, and the two slopes of the lines
# drawn below, are left blank on purpose and must be filled in before running.
diy_pca <- diy_pca %>%
  rowwise() %>% # perform the subsequent operation row by row
  mutate(
    pc2_x = point_projection(
      diy_cov = diy_cov,
      x = bill_length_mm,
      y = bill_depth_mm)$x,
    pc2_y = point_projection(
      diy_cov = diy_cov,
      x = bill_length_mm,
      y = bill_depth_mm)$y,
    S_dist = , # your code
    Residuals = # your code
  )

# Observations, the two lines and the projections; the title reports the sum
# of `S_dist` ("SS") and the sum of `Residuals` ("SR"), rounded to 2 digits.
diy_pca %>%
  ggplot() +
  geom_point(aes(x = bill_length_mm, y = bill_depth_mm)) +
  geom_abline(slope = , color = "red") + # slope of the PC1
  geom_abline(slope = , color = "blue") + # slope of the PC2
  geom_point(aes(x = pc2_x, y = pc2_y), color = "blue") +
  geom_segment(
    aes(x = bill_length_mm,
        y = bill_depth_mm,
        xend = pc2_x,
        yend = pc2_y), color = "red", linewidth = 0.1) +
  labs(title = str_c("SS = ", round(sum(diy_pca$S_dist), 2),
                     ", SR = ", round(sum(diy_pca$Residuals), 2))) +
  coord_equal()
# Projection of the point (x, y) onto the direction given by the second
# eigenvector of `diy_cov`.
#
# diy_cov: square matrix passed to eigen().
# x, y: coordinates of the point to project.
# Returns a list with elements `x` and `y`, the coordinates of the projection.
point_projection <- function(diy_cov, x, y){
  a <- c(x, y)
  # `$vectors` is the full component name returned by eigen(); no partial
  # matching
  b <- eigen(diy_cov)$vectors[, 2]
  # (a . b) is the coordinate of the point along b
  results <- c(a %*% b) * b
  return(list(x = results[1], y = results[2]))
}
  # Solution layers: lines whose slopes are the y component over the x
  # component of the first (red) and second (blue) eigenvectors of `diy_cov`.
  # The last layer has no trailing `+`, which would chain this expression
  # with the statement that follows it.
  geom_abline(slope = eigen(diy_cov)$vectors[2, 1] / eigen(diy_cov)$vectors[1, 1], color = "red") +
  geom_abline(slope = eigen(diy_cov)$vectors[2, 2] / eigen(diy_cov)$vectors[1, 2], color = "blue")
# Projection of the point (x, y) onto the direction given by one eigenvector
# of `diy_cov`.
#
# diy_cov: square matrix passed to eigen().
# x, y: coordinates of the point to project.
# PC: index of the eigenvector (column of `eigen(diy_cov)$vectors`) to use.
# Returns a list with elements `x` and `y`, the coordinates of the projection.
point_projection <- function(diy_cov, x, y, PC){
  a <- c(x, y)
  # `$vectors` is the full component name returned by eigen(); no partial
  # matching
  b <- eigen(diy_cov)$vectors[, PC]
  # (a . b) is the coordinate of the point along b
  results <- c(a %*% b) * b
  return(list(x = results[1], y = results[2]))
}

# Projections of every observation for PC = 1 (`pc1_x`, `pc1_y`) and
# PC = 2 (`pc2_x`, `pc2_y`). `PC` is passed by name for readability.
diy_pca <- diy_data_f %>%
  rowwise() %>% # perform the subsequent operation row by row
  mutate(
    pc1_x = point_projection(
      diy_cov = diy_cov,
      x = bill_length_mm,
      y = bill_depth_mm,
      PC = 1)$x,
    pc1_y = point_projection(
      diy_cov = diy_cov,
      x = bill_length_mm,
      y = bill_depth_mm,
      PC = 1)$y,
    pc2_x = point_projection(
      diy_cov = diy_cov,
      x = bill_length_mm,
      y = bill_depth_mm,
      PC = 2)$x,
    pc2_y = point_projection(
      diy_cov = diy_cov,
      x = bill_length_mm,
      y = bill_depth_mm,
      PC = 2)$y
  )

# Observations (black), both axes, both sets of projections, and thin
# segments joining each observation to its two projections.
diy_pca %>%
  ggplot() +
  geom_point(aes(x = bill_length_mm, y = bill_depth_mm)) +
  # slopes: y component over x component of the first (red) and second (blue)
  # eigenvectors; `$vectors` is spelled in full rather than partially matched
  geom_abline(
    slope = eigen(diy_cov)$vectors[2, 1] / eigen(diy_cov)$vectors[1, 1],
    color = "red") +
  geom_abline(
    slope = eigen(diy_cov)$vectors[2, 2] / eigen(diy_cov)$vectors[1, 2],
    color = "blue") +
  geom_point(aes(x = pc1_x, y = pc1_y), color = "red") +
  geom_point(aes(x = pc2_x, y = pc2_y), color = "blue") +
  geom_segment(
    aes(x = bill_length_mm,
        y = bill_depth_mm,
        xend = pc1_x,
        yend = pc1_y), color = "red", linewidth = 0.1) +
  geom_segment(
    aes(x = bill_length_mm,
        y = bill_depth_mm,
        xend = pc2_x,
        yend = pc2_y), color = "red", linewidth = 0.1) +
  coord_equal()
# Exercise template: the body is to be written. As it stands the function
# has an empty body. The code that follows stores its result directly in the
# `pc1_x` (PC = 1) and `pc2_y` (PC = 2) columns.
point_projection <- function(diy_cov, x, y, PC){
  # your code
}

# Value returned by point_projection() for PC = 1 (`pc1_x`) and PC = 2
# (`pc2_y`), computed for every observation. The table stays row-wise.
diy_pca <- mutate(
  rowwise(diy_data_f), # evaluate the expressions below one row at a time
  pc1_x = point_projection(
    diy_cov = diy_cov, x = bill_length_mm, y = bill_depth_mm, PC = 1
  ),
  pc2_y = point_projection(
    diy_cov = diy_cov, x = bill_length_mm, y = bill_depth_mm, PC = 2
  )
)

# Add the remaining variables of the female penguins (the first two columns
# of `diy_pca` are dropped from `data` to avoid duplicates), then plot
# `pc2_y` against `pc1_x`, colored by species.
diy_pca %>%
  bind_cols(
    filter(
      select(data, -colnames(diy_pca)[1:2]),
      sex == "female"
    )
  ) %>%
  ggplot() +
  geom_point(aes(x = pc1_x, y = pc2_y, color = species))
# Coordinate of the point (x, y) along one eigenvector of `diy_cov`.
#
# diy_cov: square matrix passed to eigen().
# x, y: coordinates of the point.
# PC: index of the eigenvector (column of `eigen(diy_cov)$vectors`) to use.
# Returns the dot product of (x, y) with that eigenvector, as the 1 x 1
# matrix produced by `%*%`.
point_projection <- function(diy_cov, x, y, PC){
  a <- c(x, y)
  # `$vectors` is the full component name returned by eigen(); no partial
  # matching
  b <- eigen(diy_cov)$vectors[, PC]
  a %*% b
}

# Value returned by point_projection() for PC = 1 (`pc1_x`) and PC = 2
# (`pc2_y`), computed for every observation. The table stays row-wise.
diy_pca <- mutate(
  rowwise(diy_data_f), # evaluate the expressions below one row at a time
  pc1_x = point_projection(
    diy_cov = diy_cov, x = bill_length_mm, y = bill_depth_mm, PC = 1
  ),
  pc2_y = point_projection(
    diy_cov = diy_cov, x = bill_length_mm, y = bill_depth_mm, PC = 2
  )
)

# Add the remaining variables of the female penguins (the first two columns
# of `diy_pca` are dropped from `data` to avoid duplicates), then plot
# `pc2_y` against `pc1_x`, colored by species.
diy_pca %>%
  bind_cols(
    filter(
      select(data, -colnames(diy_pca)[1:2]),
      sex == "female"
    )
  ) %>%
  ggplot() +
  geom_point(aes(x = pc1_x, y = pc2_y, color = species))
# Overlay the hand-made coordinates (large, semi-transparent points) and the
# first two columns of `data_f_pca$x` (small points), colored by species.
diy_data_f %>%
  rowwise() %>% # evaluate the two calls below one row at a time
  mutate(
    pc1_x = point_projection(
      diy_cov = diy_cov, x = bill_length_mm, y = bill_depth_mm, PC = 1
    ),
    pc2_y = point_projection(
      diy_cov = diy_cov, x = bill_length_mm, y = bill_depth_mm, PC = 2
    )
  ) %>%
  ungroup() %>% # back to a regular table before adding whole columns
  mutate(
    pc1_x_ref = data_f_pca$x[, 1],
    pc2_y_ref = data_f_pca$x[, 2]
  ) %>%
  bind_cols(
    # remaining variables of the females, minus the first two columns of
    # `diy_pca`
    filter(
      select(data, -colnames(diy_pca)[1:2]),
      sex == "female"
    )
  ) %>%
  ggplot() +
  geom_point(aes(x = pc1_x, y = pc2_y, color = species), alpha = 0.5) +
  geom_point(aes(x = pc1_x_ref, y = pc2_y_ref, color = species), size = 0.5)
# Species of each female penguin, used to color the individuals.
species_f <- pull(filter(data, sex == "female"), species) # we get the species variable
fviz_pca_ind(data_f_pca, geom = "point", col.ind = species_f)

# Eigenvalues as reported by factoextra, then the squared standard deviations
# stored in the PCA object.
get_eigenvalue(data_f_pca)
data_f_pca$sdev^2

# Eigen decomposition of the covariance matrix of the scaled measurements of
# the females.
eigen(
  cov(
    scale(
      select(
        filter(data, sex == "female"),
        bill_length_mm, bill_depth_mm, flipper_length_mm, body_mass_g
      )
    )
  )
)
# Squared standard deviation of each component divided by (number of columns
# of `data_f_pca$x` - 1); the outer parentheses print the assigned value.
# NOTE(review): this constant divisor cancels out in the proportions below;
# confirm whether this denominator is the intended one.
(pc_var <- data_f_pca$sdev^2 / (ncol(data_f_pca$x) - 1))
# Proportion of the total carried by each component.
pc_var / sum(pc_var)
fviz_eig(data_f_pca)
# Same proportions drawn by hand. The component index follows the length of
# `pc_var` instead of a hard-coded 1:4, so it works for any number of
# components.
tibble(
  pc = seq_along(pc_var),
  var = pc_var / sum(pc_var)
) %>%
  ggplot() +
  geom_point(aes(x = pc, y = var))

# Variables plot, colored by the "contrib" option of factoextra.
fviz_pca_var(data_f_pca, col.var = "contrib")
# Biplot with the individuals drawn as points and colored by species.
# The trailing empty argument of the original call was removed: unlike
# tidyverse functions, regular R functions are not guaranteed to ignore it.
fviz_pca_biplot(
  data_f_pca, geom = "point",
  col.ind = (data %>% filter(sex == "female") %>% pull(species))
  )
# Per-individual results of the PCA; individuals colored by the third column
# of their `contrib` table.
res_ind <- get_pca_ind(data_f_pca)
species_f <- pull(filter(data, sex == "female"), species) # we get the species variable
fviz_pca_ind(data_f_pca, geom = "point", col.ind = res_ind$contrib[, 3])

# Per-individual results of the PCA; individuals colored by the third column
# of their `cos2` table.
res_ind <- get_pca_ind(data_f_pca)
species_f <- pull(filter(data, sex == "female"), species) # we get the species variable
fviz_pca_ind(data_f_pca, geom = "point", col.ind = res_ind$cos2[, 3])
# Measurements of the male penguins, centered and scaled with the values
# stored in the PCA fitted earlier (`data_f_pca$center`, `data_f_pca$scale`).
data_m_scale <- scale(
  select(
    filter(data, sex == "male"),
    bill_length_mm, bill_depth_mm, flipper_length_mm, body_mass_g
  ),
  center = data_f_pca$center,
  scale = data_f_pca$scale
)
# Coordinates of one individual on the components described by `loadings`.
#
# ind: numeric vector with one value per row of `loadings`.
# loadings: matrix with one column per component.
# Returns, for each column of `loadings`, the sum of its entries weighted by
# `ind` (named after the columns when the matrix has column names).
coord_func <- function(ind, loadings){
  # `ind` is recycled down each column, so every column is weighted by `ind`
  colSums(loadings * ind)
}
# Coordinates of every scaled male observation on the components: coord_func()
# is applied to each row, and the result is transposed to get one row per
# individual.
data_m_pca <- t(
  apply(data_m_scale, MARGIN = 1, FUN = coord_func, loadings = data_f_pca$rotation)
)

# Stack the coordinates of the females and of the males, each completed with
# the original variables, and plot PC2 against PC1.
bind_rows(
  bind_cols(as_tibble(data_f_pca$x), filter(data, sex == "female")),
  bind_cols(as_tibble(data_m_pca), filter(data, sex == "male"))
) %>%
  ggplot() +
  geom_point(aes(x = PC1, y = PC2, color = species, shape = sex))
# Singular value decomposition of the measurements of the females, after
# subtracting the mean of each column and dividing by its standard deviation.
data_f_svd <- data %>%
  filter(sex == "female") %>%
  select(bill_length_mm, bill_depth_mm, flipper_length_mm, body_mass_g) %>%
  mutate(
    across(
      everything(),
      function(column) (column - mean(column)) / sd(column)
    )
  ) %>%
  svd()