diff --git a/Practical_c.Rmd b/Practical_c.Rmd
index 896792d2ebd9ef1867865ba685ac55dd1d071a92..8c3ab9d9c2e7b53da164c60ad978f512ecfba1f8 100644
--- a/Practical_c.Rmd
+++ b/Practical_c.Rmd
@@ -26,6 +26,9 @@ if (!requireNamespace("remotes"))
 if (!requireNamespace("tidyverse"))
   install.packages("tidyverse")
 library(tidyverse) # to manipulate data and make plot
+if (!requireNamespace("FactoMineR"))
+  install.packages("FactoMineR")
+library(FactoMineR) # for dimension reduction
 if (!requireNamespace("factoextra"))
   install.packages("factoextra")
 library(factoextra) # manipulate pca results
@@ -67,6 +70,7 @@ klippy::klippy(
 $$
  \def\H{{\mathcal{H}}}
  \def\PP{{\mathbb{P}}}
+ \def\EE{{\mathbb{E}}}
 $$
 :::
 
@@ -122,6 +126,8 @@ We will need the following packages
 ```{r echo=T, message=F}
 library(tidyverse)   # to manipulate data and make plot
 library(performance) # to evaluate model
+library(FactoMineR)  # for dimension reduction
+library(factoextra)  # to plot dimension reduction output
 ```
 
 ---
@@ -1034,24 +1040,24 @@ Write the linear model for the ANOVA regarding the morphological trait `A101` de
 
 Notations:
 
-- $i=1,2$ is the genotype associated to the SNP `YAL069W_1`
-- $y_{ir}$ is the value of the morphological trait `A101` in the $r^\text{th}$ individual with genotype $i$. The associated random variable is $Y_{ir}$
+- $j=1,2$ is the genotype associated to the SNP `YAL069W_1` (i.e. `0` for $j=1$ and `1` for $j=2$)
+- $y_{jr}$ is the value of the morphological trait `A101` in the $r^\text{th}$ individual with genotype $j$. The associated random variable is $Y_{jr}$
 
 Model:
 
 \[
-Y_{ir} = \mu_i + E_{ir}
+Y_{jr} = \mu_j + \varepsilon_{jr}
 \]
 
 Assumptions:
 
-- we assume that $E_{ir} \sim \mathcal{N}(0, \sigma^2)$
-- we assume that $Y_{ir}$ are independent (hence that $E_{ir}$ are independent)
+- we assume that $\varepsilon_{jr} \sim \mathcal{N}(0, \sigma^2)$
+- we assume that $Y_{jr}$ are independent (hence that $\varepsilon_{jr}$ are independent)
 
 Equivalent formulation:
 
 \[
-Y_{ir} \sim \mathcal{N}(\mu_i, \sigma^2)
+Y_{jr} \sim \mathcal{N}(\mu_j, \sigma^2)
 \]
 
 ---
@@ -1059,44 +1065,6 @@ Y_{ir} \sim \mathcal{N}(\mu_i, \sigma^2)
 </p>
 </details>
 
-<!-- <div class="pencadre"> -->
-<!-- Question ? -->
-<!-- </div> -->
-
-<!-- <details><summary>Solution</summary> -->
-<!-- <p> -->
-
-<!-- Solution -->
-
-<!-- </p> -->
-<!-- </details> -->
-
-
-<!-- <div class="pencadre"> -->
-<!-- Question ? -->
-<!-- </div> -->
-
-<!-- <details><summary>Solution</summary> -->
-<!-- <p> -->
-
-<!-- Solution -->
-
-<!-- </p> -->
-<!-- </details> -->
-
-
-<!-- <div class="pencadre"> -->
-<!-- Question ? -->
-<!-- </div> -->
-
-<!-- <details><summary>Solution</summary> -->
-<!-- <p> -->
-
-<!-- Solution -->
-
-<!-- </p> -->
-<!-- </details> -->
-
 **IMPORTANT**: We will first consider only a single cell cycle phase (e.g. the `"C"` phase) to avoid dealing with the measure heterogeneity across cell cycles (c.f. [later](#two-factor-anova))
 
 ```{r}
@@ -1134,7 +1102,9 @@ What should we check before interpreting the results of the ANOVA?
 <details><summary>Solution</summary>
 <p>
 
-Check the normality of the residuals and the homoskedasticity (constant variance)
+Check the normality of the residuals and the homoskedasticity (constant variance).
+
+> **Note:** you can get the residuals of a model with the `residuals()` function.
 
 Between `A101` and the SNP `YAL069W_1`:
 
@@ -1142,8 +1112,13 @@ Between `A101` and the SNP `YAL069W_1`:
 # linear model
 mod1 <- lm(A101 ~ factor(YAL069W_1), data = yeast_av_subdata)
 
-# residuals
-residuals(mod1)
+# scatter plot of the residuals
+plot(residuals(mod1))
+abline(h=0, col = "red")
+
+# empirical distribution of the residuals
+hist(residuals(mod1), prob = TRUE) # histogram of empirical frequency (instead of counts)
+lines(density(residuals(mod1)), col = "red") # empirical density
 
 # qq-plot
 plot(mod1, which = 2)
@@ -1159,8 +1134,13 @@ Between `A101` and the SNP `YAL069W_1`:
 # linear model
 mod2 <- lm(A101 ~ factor(YDL200C_427), data = yeast_av_subdata)
 
-# residuals
-residuals(mod2)
+# scatter plot of the residuals
+plot(residuals(mod2))
+abline(h=0, col = "red")
+
+# empirical distribution of the residuals
+hist(residuals(mod2), prob = TRUE) # histogram of empirical frequency (instead of counts)
+lines(density(residuals(mod2)), col = "red") # empirical density
 
 # qq-plot
 plot(mod2, which = 2)
@@ -1195,28 +1175,35 @@ yeast_av_subdata %>% group_by(YAL069W_1) %>% summarise(mu=mean(A101))
 lm(A101 ~ factor(YAL069W_1), data = yeast_av_subdata)
 ```
 
-Model considered in R:
+**Model considered in R** (with the previous notations):
 
 \[
-Y_{ir} = \mu + \nu_i + E_{ir}
+Y_{jr} = \mu + \beta_j + \varepsilon_{jr}
 \]
 
 with:
 
 - $\mu$ is the intercept
-- $\sum_i \nu_i = 0$ (think $\nu_i = \mu_i - \mu$), hence you get only $\nu_1$
+- $\beta_j$ is the coefficient associated to genotype $j$, with the constraint: $\sum_j \beta_j = 0$ (think $\beta_j = \mu_j - \mu$), hence you get only $\beta_j$ for $j=2$ (i.e. genotype = 1)
 
-Matrix notation:
+**Corresponding matrix notation** (which requires to reindex $Y$ and $E$):
 
 \[
 Y = \mu + X \times B + E
 \]
 
 where:
-- $Y$ is the vector containing the values for the output variable
-- $X = [x_{ij}]$ is the $\{0,1\}$ indicator that individual $j$ has genotype $i$
-- $B = [\nu_i]$ is the vector of genotype coefficients
-- $E$ is the vector of residuals
+
+- $Y = [y_i]$ is the vector containing the values for the output variable for all $n$ individuals indexed by $i=1:n$
+- $B = [\beta_j]$ is the vector of coefficients for the $p$ different genotypes indexed by $j = 1:p$ (here $p=2$ in our example)
+- $X = [x_{ij}]$ is the $\{0,1\}$ indicator that individual $i$ has genotype $j$
+- $E = [\varepsilon_i]$ is the vector of residuals for all individuals indexed by $i$
+
+Conditional expectation:
+
+\[
+\EE(Y_i\ \vert\ X_{ij} = x_{ij}) = \mu + \sum_{i=1}^n x_{ij}\ \beta_j
+\]
 
 Then $\mu$ and $B$ are estimated by a **least square linear regression** (see [here](https://en.wikipedia.org/wiki/Simple_linear_regression) [here](https://en.wikipedia.org/wiki/Linear_regression) and [here](https://en.wikipedia.org/wiki/Least_squares) for more details and references).
 
@@ -1244,74 +1231,612 @@ After verifying the normality and homoskedasticity of the residuals (**if either
 
 ### Two-factor ANOVA
 
-Between `A101` and the SNP `YAL069W_1`:
+We would like to avoid restricting the analysis to a single cell cycle phase, and account for the potential variability of the morphological trait `A101` across the different cell cycle phases, which is not explained by the genotype.
+
+<div class="pencadre">
+Write the linear model for the two-factor ANOVA regarding the morphological trait `A101` depending on genotype associated to the SNP `YAL069W_1` and the cell cycle phase (stored in the column `cell_cycle)?
+</div>
+
+<details><summary>Solution</summary>
+<p>
+
+Notation:
+
+- $j=1,2$ indexes the genotype associated to the SNP `YAL069W_1` (i.e. `0` for $j=1$ and `1` for $j=2$)
+- $k=1,2,3$ indexes the cell cycle phase (among `"A"`, `"A1B"`, `"C"`)
+- $y_{jkr}$ is the value of the morphological trait `A101` in the $r^\text{th}$ individual with genotype $j$ in cell cycle $k$. The associated random variable is $Y_{jkr}$
+
+Two-factor ANOVA without interaction:
+
+\[
+Y_{jkr} = \mu + \beta_j + \alpha_k + \varepsilon_{jkr}
+\]
+
+where:
+
+- $\mu$ is the intercept
+- $\beta_j$ is the coefficient associated to genotype $j$, with the constraint: $\sum_j \beta_j = 0$
+- $\alpha_k$ is the coefficient associated to genotype $k$, with the constraint: $\sum_k \alpha_k = 0$
+
+Two-factor ANOVA with interaction:
+
+\[
+Y_{jkr} = \mu + \beta_j + \alpha_k + \gamma_{jk} + \varepsilon_{jkr}
+\]
+
+where:
+
+- $\beta_j$ is the coefficient associated to the interaction between genotype $j$ and cell cycle phase $k$, with the constraints: $\sum_j \gamma_{jk} = 0$ and $\sum_k \gamma_{jk} = 0$
+
+---
+
+</p>
+</details>
+
+
+<div class="pencadre">
+Conduct a one-factor ANOVA between the considered output variable (`A101`) and a combination of one of the considered SNPs (`YAL069W_1` or `YDL200C_427`) and the cell cycle? (for the moment, we do not consider both SNPs in the same model)
+</div>
+
+<details><summary>Solution</summary>
+<p>
+
+- `A101` depending on SNP `YAL069W_1` and `cell_cycle` (without interaction):
 
 ```{r}
-anova(lm(A101 ~ factor(YAL069W_1) + factor(cell_cycle), data = yeast_av_data))
+anova(lm(A101 ~ factor(YAL069W_1) + cell_cycle, data = yeast_av_data))
 ```
 
-Between `A101` and the SNP `YAL069W_1`:
+> **Note:** when the variable is numerical but with integer (discrete) values (e.g. `YAL069W_1` encoded with 0 and 1), we specify `factor()` in the model so that the `lm()` function understands this function as a qualitative and not quantitative variable. When the variable type is not ambiguous (e.g. `cell_cycle` which is a character variable), it is not necessary to specify that it is a `factor()` in the linear model, the `lm()` function assumes it automatically.
+
+- `A101` depending on SNP `YDL200C_427` and `cell_cycle` (without interaction):
 ```{r}
-anova(lm(A101 ~ factor(YDL200C_427) + factor(cell_cycle), data = yeast_av_data))
+anova(lm(A101 ~ factor(YDL200C_427) + cell_cycle, data = yeast_av_data))
 ```
 
-Interaction?
+**Interaction?**
 
-Between `A101` and the SNP `YAL069W_1`:
+- `A101` depending on SNP `YAL069W_1` and `cell_cycle` (with interaction):
 
 ```{r}
-anova(lm(A101 ~ factor(YAL069W_1) * factor(cell_cycle), data = yeast_av_data))
+anova(lm(A101 ~ factor(YAL069W_1) * cell_cycle, data = yeast_av_data))
 ```
 
-Between `A101` and the SNP `YAL069W_1`:
+- `A101` depending on SNP `YDL200C_427` and `cell_cycle` (with interaction):
+
 ```{r}
-anova(lm(A101 ~ factor(YDL200C_427) * factor(cell_cycle), data = yeast_av_data))
+anova(lm(A101 ~ factor(YDL200C_427) * cell_cycle, data = yeast_av_data))
 ```
 
+> **Note:** in a linear formula in R, `factor1 * factor2` is equivalent to `factor1 + factor2 + factor1:factor2`. The `:` is used to explicitely specify the interaction between factors.
 
-<!-- <div class="pencadre"> -->
-<!-- Question ? -->
-<!-- </div> -->
+---
 
-<!-- <details><summary>Solution</summary> -->
-<!-- <p> -->
+</p>
+</details>
 
-<!-- Solution -->
 
-<!-- </p> -->
-<!-- </details> -->
+
+
+
+
+<div class="pencadre">
+After doing regular verification for an ANOVA model, interpret the previous results?
+</div>
+
+<details><summary>Solution</summary>
+<p>
+
+**Regular verification (normality and homoskedasticity for the residuals):**
+
+- `A101` depending on SNP `YAL069W_1` and `cell_cycle` (without interaction):
+
+```{r}
+# linear model
+mod1 <- lm(A101 ~ factor(YAL069W_1) + cell_cycle, data = yeast_av_data)
+
+# check normality
+performance::check_normality(mod1)
+# check homoskedasticity
+performance::check_heteroskedasticity(mod1)
+```
+
+- `A101` depending on SNP `YAL069W_1` and `cell_cycle` (with interaction):
+
+```{r}
+# linear model
+mod2 <- lm(A101 ~ factor(YAL069W_1) * cell_cycle, data = yeast_av_data)
+
+# check normality
+performance::check_normality(mod2)
+# check homoskedasticity
+performance::check_heteroskedasticity(mod2)
+```
+
+- `A101` depending on SNP `YDL200C_427` and `cell_cycle` (without interaction):
+
+```{r}
+# linear model
+mod3 <- lm(A101 ~ factor(YDL200C_427) + cell_cycle, data = yeast_av_data)
+
+# check normality
+performance::check_normality(mod3)
+# check homoskedasticity
+performance::check_heteroskedasticity(mod3)
+```
+
+- `A101` depending on SNP `YDL200C_427` and `cell_cycle` (with interaction):
+
+```{r}
+# linear model
+mod4 <- lm(A101 ~ factor(YDL200C_427) * cell_cycle, data = yeast_av_data)
+
+# check normality
+performance::check_normality(mod4)
+# check homoskedasticity
+performance::check_heteroskedasticity(mod4)
+```
+
+**Interpretation and comments:**
+
+1. The prerequisite Gaussian and homoskedasticity assumptions are not verified therefore, in theory, we **cannot** use the p-values computed by R to assess which factor (or interaction) has an effect or not on the output variable `A101`.
+
+What can we do in that case?
+
+- use variance stabilizing function to transform the output response, like `log1p()`, i.e. $x \mapsto \log(x+1)$, or the [Anscombe transform](https://en.wikipedia.org/wiki/Anscombe_transform)
+- switch to a generalized linear model if relevant, e.g. in case of specific non-Gaussian output variables, like categorical or count variables
+
+2. We can at least verify the values of the computed coefficients returned by the `lm()` functions (using the `coefficient) for the different model:
+
+```{r}
+coefficients(mod1)
+coefficients(mod2)
+coefficients(mod3)
+coefficients(mod4)
+```
+
+The closer to zero coefficient, the lower effect. However, we don't have any idea of the statistical significance of theses results.
+
+In practice, you need to choose if you are going to illicitly use these p-values or not (knowing that the conditions for the results to be interpretable are not fully verified).
+
+> **Note on model selection:**
+> 
+> - When comparing multiple model, e.g. with or without interaction, a simple rule is to start by training with the "richest" model (with all factors and interactions) and then train a more simple model where we remove factors and/or interactions that are not significant (if we can use the p-values).
+> - There exist automatic methods, called [**model selection**](https://en.wikipedia.org/wiki/Model_selection) approaches to allow to automatically compare and choose the "best" model among a set of candidate models (these types of methods and the comparison criteria they are based on is beyond the scope of the course).
+
+---
+
+</p>
+</details>
 
 ## Dimension reduction
 
-PCA vs Correspondence analysis
+In the following, we want to consider all the SNPs in the dataset and not just one or two.
 
-<!-- ### Genotype data -->
+### Genotype data
 
-<!-- Here is a heatmap representation (each cell of the image represent the encoded value for the corresponding SNP) of the genotype table regarding all SNPs for each strain. -->
+Here is a heatmap representation of the genotype table regarding all SNPs (in columns) for each strain (in rows): each cell of the image represent the genotype for the corresponding SNP in the corresponding strain.
 
-<!-- ```{r, echo=FALSE, fig.height = 12, fig.align = "center"} -->
-<!-- gt_data %>% select(any_of(strain_id$morpho_id)) %>%  -->
-<!--     t() %>% -->
-<!--     pheatmap( -->
-<!--         scale = "none", cluster_rows = FALSE, legend_breaks = c(0,1,2), -->
-<!--         cluster_cols = FALSE -->
-<!--     ) -->
+```{r, echo=FALSE, message=FALSE, warning=FALSE, fig.height = 12, fig.align = "center"}
+gt_data %>% select(any_of(strain_id$morpho_id)) %>%
+    t() %>%
+    pheatmap(
+        scale = "none", cluster_rows = FALSE, legend_breaks = c(0,1,2),
+        cluster_cols = FALSE
+    )
 
-<!-- ``` -->
+```
 
-<!-- <div class="pencadre"> -->
-<!-- Qhat can you say about the heatmap ? -->
-<!-- </div> -->
+<div class="pencadre">
+What can you say about the heatmap ?
+</div>
+
+<details><summary>Solution</summary>
+<p>
+
+We observe a lot of variability in the genotypes along all SNPs between the different strains. Some SNPs seems to be less variable than other. It appears quite difficult to extract information from this representation. We would need to summarize the information contained in the genotype table to get an overview of these information.
+
+---
+
+</p>
+</details>
+
+
+<div class="pencadre">
+How could we visualize both the information contained in the SNP data and the morphological trait data (we will still focus on a single morphological variable, e.g. `A101`)?
+</div>
+
+<details><summary>Solution</summary>
+<p>
+
+We can use a dimension reduction approach, like PCA.
+
+
+---
+
+</p>
+</details>
+
+
+### PCA vs Correspondence analysis
+
+<div class="pencadre">
+Use a PCA dimension reduction on the SNP data and use a visualization of the results colored by the corresponding value of the morphological trait `A101` in each strain?
+
+Here is the code to extract the data. Again, we focus on a specific cell cycle.
+
+```{r}
+# extract genotype data regarding all SNP
+data_for_pca <- yeast_av_data %>%
+    # consider a specific cell cycle
+    filter(cell_cycle == "A") %>%
+    # select genotype data
+    select(c(26:ncol(.))) %>%
+    # remove SNPs with too many NAs
+    dplyr::select(which(colSums(is.na(.)) == 0))
+
+# extract corresponding value for morphological trait
+A101_value <- yeast_av_data %>%
+    # consider a specific cell cycle
+    filter(cell_cycle == "A") %>%
+    # extract result
+    select(A101) %>% unlist()
+```
+
+</div>
+
+<details><summary>Solution</summary>
+<p>
+
+```{r}
+# run PCA
+pca_res <- prcomp(data_for_pca)
+
+# PCA indiv plot
+fviz_pca_ind(
+    pca_res,
+    geom = "point",
+    col.ind = A101_value, 
+    gradient.cols = c("#2E9FDF", "#FC4E07")
+)
+
+```
+
+We can use a discretized scale to get a better visualization effect:
+
+```{r}
+# discretize the continuous morphological variable
+discrete_A101 <- yeast_av_data %>%
+    # consider a specific cell cycle
+    filter(cell_cycle == "A") %>%
+    # discretize
+    mutate(A101 = cut(A101, breaks = seq(0,40,by=5)/100)) %>%
+    # extract result
+    select(A101) %>% unlist() %>% as.character()
+
+# PCA indiv plot
+fviz_pca_ind(
+    pca_res,
+    geom = "point",
+    col.ind = discrete_A101
+)
+```
+
+---
+
+</p>
+</details>
+
+
+<div class="pencadre">
+What can you say about this representation?
+</div>
+
+<details><summary>Solution</summary>
+<p>
+
+The structure of the SNP data extracted by the first two components of PCA does not allow to discriminate between the different level of the morphological trait `A101`. The variability in the data explained by the first two components is not very high though.
+
+---
+
+</p>
+</details>
+
+Maybe we can use a different dimension reduction approach that is more suitable for the SNP data. If we consider SNP genotype as categorical variables, we can use the [multiple correspondence analysis (MCA)](https://en.wikipedia.org/wiki/Multiple_correspondence_analysis) method which is another linear dimension reduction approach but adatped to categorical variables.
+
+Here are the results of such an approach:
+
+```{r}
+# extract genotype data regarding all SNP and convert them to factors
+data_for_mca <- yeast_av_data %>%
+    # consider a specific cell cycle
+    filter(cell_cycle == "A") %>%
+    # select genotype data
+    select(c(26:ncol(.))) %>%
+    # remove SNPs with too many NAs
+    dplyr::select(which(colSums(is.na(.)) == 0)) %>%
+    # transform all columns in factors
+    mutate(across(everything(), ~as.factor(.x)))
+
+# run MCA
+mca_res <- MCA(data_for_mca, graph=FALSE)
+
+# MCA indiv plot
+fviz_mca_ind(
+    mca_res,
+    geom = "point",
+    col.ind = discrete_A101
+)
+```
+
+<div class="pencadre">
+Is the representation obtained by MCA better than the one obtained with PCA?
+</div>
+
+<details><summary>Solution</summary>
+<p>
+
+Not better, maybe we could try some supervised dimension reduction approaches (c.f. next note) or some non-linear dimension reduction approaches (c.f. previous practical subject).
+
+---
+
+</p>
+</details>
+
+> **Note on supervised dimension reduction and mulitple output variables:** here we consider a single morphological trait as output variable, however this output variable is not used in the dimension reduction process. One could aim at finding the reduced-dimension representation of the data that best relate to this output variable. To solve this problem, one can use the [partial least squares regression (PLS)](https://en.wikipedia.org/wiki/Partial_least_squares_regression) method which is specifically able to find a latent dimension that relate to an output variable. In addition, there exist specific approaches, such as PLS and also [canonical correlation analysis (CCA)](https://en.wikipedia.org/wiki/Canonical_correlation), that allows to perform this supervised dimension reduction by accouting for multiple output variables (e.g. multiple morphological traits in our example).
+
+---
 
 ## Multiple testing
 
+The exploration of the SNP data by linear dimension reduction approaches did not allow to detect a specific global structure linking the SNP genotypes and the morphological trait measure `A101`. However, we still want to investigate the possible effect of the genotypes of all the recorded SNP on the morphological trait `A101`.
+
+To do so, we are going to run an ANOVA for each SNP.
+
+<div class="pencadre">
+Given the number of SNPs in the data (i.e. `r nrow(gt_data)`), what could be the risk when running such an analysis?
+</div>
+
+<details><summary>Solution</summary>
+<p>
+
+We are going to do thousands of tests, computing and using thousands of p-values to assess the potential significant effect of each SNP on the considered morphological trait.
+
+We have a non-negligible risk to wrong reject the null hypothesis for many of the SNPs, and conclude to a non-existing significant effect.
+
+Thus, we have to use p-values correction (or adjustment) procedure adapted to the case of multiple testing.
+
+---
+
+</p>
+</details>
+
+We run the two-factor ANOVA model for all SNP (accounting for the considered SNP effect and the cell cycle effect).
+
+```{r}
+test_result <- yeast_av_data %>%
+    # run all ANOVA tests
+    summarize(
+        across(
+            any_of(gt_data$RQTL_name), 
+               ~anova(lm(A101~factor(.x) + cell_cycle))$`Pr(>F)`[1]
+        )
+    ) %>%
+    # reformat the output (to get a data.frame [SNP, p_values])
+    t() %>% as.data.frame() %>% dplyr::rename("p_values" = "V1") %>% 
+    rownames_to_column("SNP") %>%
+    # extract SNP index (to facilitate graphical representation)
+    mutate(SNP_index = as.integer(as.factor(SNP))) 
+```
+
+<div class="pencadre">
+What should we verify?
+</div>
+
+<details><summary>Solution</summary>
+<p>
+
+In theory, we should verify the Gaussian and homoskedasticity assumptions for the residuals in each model (i.e. `r nrow(test_result)` models). In practice, it can be cumbersome... (but it would be a flaw in the analysis).
+
+---
+
+</p>
+</details>
+
+Here is the result of the analysis. We plot the p_values for all SNPs.
+
+```{r}
+ggplot(test_result) + geom_point(aes(x=SNP_index, y=p_values)) +
+    geom_hline(yintercept = 0.05, col = "red") +
+    annotate("text", label = "alpha = 5%", x=100, y=0.1, col = "red") +
+    theme_bw()
+
+```
+
+```{r}
+
+
+```
+
+
+
+<div class="pencadre">
+What can you say about these results?
+</div>
+
+<details><summary>Solution</summary>
+<p>
+
+All SNPs for which the p-value is below the horizontal line are found to have a significant effect on the morphological trait `A101`.
+
+It corresponding to hundred of genes:
+
+```{r}
+sum(test_result$p_values <= 0.05)
+```
+
+**But what about the p-value correction?**
+
+---
+
+</p>
+</details>
+
+
+
+
+<div class="pencadre">
+The p-values should be adjusted in the context of multiple testing. Try different method to adjust the p-values (see the function `p.adjust()`).
+</div>
+
+<details><summary>Solution</summary>
+<p>
+
+```{r}
+test_result <- test_result %>%
+    mutate(
+        bonf_adj_p_values = p.adjust(p_values, method="bonferroni"),
+        fdr_adj_p_values = p.adjust(p_values, method="fdr"),
+    )
+
+```
+
+---
+
+</p>
+</details>
+
+
+Here is the distribution of the p-values after and before adjustement (we used the Bonferroni and the FDR adjustment approaches):
+
+```{r}
+ggplot(
+    test_result %>% pivot_longer(!c(SNP, SNP_index), names_to = "type", values_to = "p_values")
+) + geom_histogram(aes(x=p_values)) +
+    facet_wrap(~type) + 
+    geom_vline(xintercept = 0.05, col = "red") +
+    annotate("text", label = "alpha = 5%", x=0.1, y=1000, angle=90, col = "red") +
+    theme_bw()
+```
+
+
+<div class="pencadre">
+What can you say about the different corrections?
+</div>
+
+<details><summary>Solution</summary>
+<p>
+
+After the Bonferronni correction, no significant effect is found anymore. It tend to be very conservative.
+
+The FDR correction reduces the number of found significant effects:
+
+```{r}
+sum(test_result$fdr_adj_p_values <= 0.05)
+```
+
+Here is another way to compare the FDR-adjusted and original p-values:
+
+```{r}
+ggplot(test_result) + geom_line(aes(x=p_values, y=fdr_adj_p_values)) + 
+    geom_abline(slope=1, intercept=0, col = "red") +
+    annotate("text", label = "y=x", x=0.5, y=0.45, angle=45, col = "red") +
+    theme_bw()
+```
+
+---
+
+</p>
+</details>
+
+Eventually, we can also compare the number of significant SNPs found before and after p-value correction depending on the chosen type I risk alpha.
+
+```{r}
+# compute the number of significant SNPs
+signif_result <- data.frame(alpha = seq(0, 1, 0.001)) %>%
+    rowwise() %>%
+    mutate(
+        signif = sum(test_result$p_values <= alpha),
+        signif_adj = sum(test_result$fdr_adj_p_values <= alpha)
+    )
+
+# graphical representation of the number of significant SNPs depending on alpha
+ggplot(
+    signif_result %>% pivot_longer(!alpha, names_to = "type", values_to = "number_snp")
+) + 
+    geom_line(aes(x = alpha, y=number_snp, col=type)) +
+    geom_vline(xintercept = 0.05, col = "red") +
+    annotate("text", label = "alpha = 5%", x=0.1, y=2000, angle=90, col = "red") +
+    theme_bw()
+```
+
+<div class="pencadre">
+Extract the top50 SNPs with the most significant effects.
+
+</div>
+
+<details><summary>Solution</summary>
+<p>
+
+```{r}
+test_result %>%
+    arrange(fdr_adj_p_values) %>%
+    select(!bonf_adj_p_values) %>%
+    head(50)
+```
+
+---
+
+</p>
+</details>
+
+
+<div class="pencadre">
+
+What can we do with these results?
+
+</div>
+
+<details><summary>Solution</summary>
+<p>
+
+- investigate the corresponding genomic region containing these SNPs (is it a gene, etc.)
+
+- run a confirmation study (for instance by inducing mutation in these SNPs)
+
+- etc.
+
+</p>
+</details>
+
+
 ---
 
 
 ## Full data analysis
 
 
+<div class="pencadre">
+Open (and optional) question: run the previous analysis to find SNPs with significant effect on the morphological trait `A101` with the full dataset `yeast_data`, i.e. without the average by strain for the morphological traits.
+
+In this case, you will have to account for the `strain_id` factor in the ANOVA.
+
+You could also try to consider linear model integrating multiple SNPs (instead of testing each SNP independently). However be aware of potential dimension issues. You will also encounter the question of model selection in this particular setting (which SNP do I integrate in my model?).
 
+</div>
+
+<details><summary>Solution</summary>
+<p>
+
+Write me!
+
+---
 
+</p>
+</details>
+
+
+
+---
 
 [The .Rmd file corresponding to this page is available here under the AGPL3 Licence](https://lbmc.gitbiopages.ens-lyon.fr/hub/formations/ens_m1_ml/Practical_b.Rmd)
diff --git a/practical_c_prepa.R b/practical_c_prepa.R
index 15ac09f932d0f8ad8937fc5b288d900798e49291..f61bd6c189e9c49cc33a0403622578d755b22427 100644
--- a/practical_c_prepa.R
+++ b/practical_c_prepa.R
@@ -46,7 +46,7 @@ gt_data <- read_table(
     rename_with(str_replace_all, pattern = "\\.", replacement = "_") %>%
     # select non genotype encoding columns and columns of individuals 
     # present in morphological data
-    select(!matches("^[0-9]+_[0-9]+_[a-z]$") | any_of(strain_id$gt_id)) %>%
+    dplyr::select(!matches("^[0-9]+_[0-9]+_[a-z]$") | any_of(strain_id$gt_id)) %>%
     # rename strain id columns to ids used in morphological trat data
     rename_with(function(col_names) {
         return(ifelse(
@@ -63,10 +63,10 @@ gt_data <- read_table(
 # gt_data %>% colnames()
 # gt_data %>% dim()
 # 
-# gt_data %>% select(any_of(strain_id$morpho_id)) %>% 
+# gt_data %>% dplyr::select(any_of(strain_id$morpho_id)) %>% 
 #     mutate_all(as.numeric) %>% unlist() %>% hist()
 # 
-# gt_data %>% select(any_of(strain_id$morpho_id)) %>% 
+# gt_data %>% dplyr::select(any_of(strain_id$morpho_id)) %>% 
 #     t() %>%
 #     pheatmap(
 #         scale = "none", cluster_rows = FALSE, legend_breaks = c(0,1,2),
@@ -216,7 +216,7 @@ morpho_data <- bind_rows(morpho_rm, morpho_seg, morpho_k) %>% as_tibble()
 #     summarize(across(matches("[ABC][0-9]+\\-?[0-9]*"), ~sum(.x<0, na.rm = TRUE))) %>% 
 #     unlist()
 # 
-# tmp <- morpho_data %>% select(matches("[ABC][0-9]+\\-?[0-9]*")) %>%
+# tmp <- morpho_data %>% dplyr::select(matches("[ABC][0-9]+\\-?[0-9]*")) %>%
 #     filter(if_any(everything(), ~.x<0)) %>% drop_na() %>%
 #     unlist()
 # hist(tmp[tmp<0])
@@ -226,22 +226,16 @@ morpho_data <- morpho_data %>%
     # forget about parental strain for the moment
     filter(!strain_id %in% c("A1", "A2")) %>%
     # remove morphological traits with too many NAs
-    select(which(colSums(is.na(.)) == 0)) %>%
+    dplyr::select(which(colSums(is.na(.)) == 0)) %>%
     # remove negative values in morphological trait
     mutate(across(matches("[ABC][0-9]+\\-?[0-9]*"), ~na_if(., -1)))
 
-#### additional preprocessing ####
-
-# morpho_data <- morpho_data %>% 
-#     # apply log1p transform to morphological trait measures
-#     mutate(across(matches("[ABC][0-9]+\\-?[0-9]*"), log1p))
-
 #### merge morpho and genotype data ####
 
 # prepare genotype data
 transpose_gt_data <- gt_data %>% 
     # keep only genotype column (one for each individual)
-    select(any_of(strain_id$morpho_id)) %>% 
+    dplyr::select(any_of(strain_id$morpho_id)) %>% 
     # transpose
     t() %>% 
     # convert back to data.frame
@@ -261,7 +255,7 @@ yeast_data <- morpho_data %>% left_join(transpose_gt_data, by = "strain_id")
 
 # # check merge
 # dim(yeast_data)
-# yeast_data %>% filter(strain_id == "A3") %>% select(any_of(gt_data$RQTL_name)) %>% pheatmap(
+# yeast_data %>% filter(strain_id == "A3") %>% dplyr::select(any_of(gt_data$RQTL_name)) %>% pheatmap(
 #     scale = "none", cluster_rows = FALSE, legend_breaks = c(0,1,2),
 #     cluster_cols = FALSE
 # )
@@ -284,11 +278,31 @@ yeast_av_data <- morpho_data %>% group_by(strain_id, cell_cycle) %>%
 dim(yeast_av_data)
 
 
+#### data transformation ####
+
+# log1p transform
+morpho_transform_data <- morpho_data %>%
+    # apply log1p transform to morphological trait measures
+    mutate(across(matches("[ABC][0-9]+\\-?[0-9]*"), log1p))
+
+# merge
+yeast_transform_data <- morpho_transform_data %>% left_join(transpose_gt_data, by = "strain_id")
+
+# average data by clone and cell cycle
+yeast_av_transform_data <- morpho_transform_data %>% group_by(strain_id, cell_cycle) %>%
+    summarize(across(matches("[ABC][0-9]+\\-?[0-9]*"), ~mean(.x, na.rm = TRUE))) %>% 
+    ungroup() %>%
+    left_join(transpose_gt_data, by = "strain_id")
+
+
 #### save data ####
 file_name <- "practical_c.Rdata"
 save(
     list = c("gt_data", "morpho_data", "strain_id", 
-             "yeast_data", "yeast_av_data"), 
+             "yeast_data", "yeast_av_data", 
+             # "morpho_transform_data",
+             # "yeast_transform_data", "yeast_av_transform_data"
+             ), 
     file = file_name
 )