From b83046f9d09ef91d5c2d5468b43e4e2f62f2983f Mon Sep 17 00:00:00 2001 From: Carine Rey <carine.rey@ens-lyon.fr> Date: Tue, 4 Oct 2022 18:03:37 +0200 Subject: [PATCH] finish to implement Antoine's comments --- session_4/session_4.Rmd | 128 ++++++++++++++++++++++------------------ 1 file changed, 72 insertions(+), 56 deletions(-) diff --git a/session_4/session_4.Rmd b/session_4/session_4.Rmd index 4ebf9d2..1839c1a 100644 --- a/session_4/session_4.Rmd +++ b/session_4/session_4.Rmd @@ -503,7 +503,7 @@ First let's create a thiner dataset to work on `flights_thin` that contains - the `distance` and `air_time` columns - the `dep_time` and `sched_dep_time` columns -Then let's create an even smaller dataset as toy dataset to test your commands before using them on the large dataset (It a good reflex to take). For that you can use the function `head` +Then let's create an even smaller dataset as toy dataset to test your commands before using them on the large dataset (It a good reflex to take). For that you can use the function `head` or `sample_n` for a more random sampling. - select only 5 rows @@ -516,6 +516,7 @@ Then let's create an even smaller dataset as toy dataset to test your commands b ```{r mutate, include=TRUE} (flights_thin <- select(flights, year:day, ends_with("delay"), distance, air_time, contains("dep_time"))) (flights_thin_toy <- head(flights_thin, n=5)) +(flights_thin_toy2 <- sample_n(flights_thin, size=5)) ``` </p> </details> @@ -602,7 +603,18 @@ mutate( .after = "dep_time" ) ``` -In one row (or you can also remove column HH and MM using select): +or `.keep = "used"` to keep only the columns used for the calculus which can be usefull for debugging + +```{r mutate_challenges_a21, include=TRUE} +mutate( + flights_thin_toy, + HH = dep_time %/% 100, + MM = dep_time %% 100, + dep_time2 = HH * 60 + MM, + .keep = "used" ) +``` + +In one row (or you can also remove columns HH and MM using select): ```{r mutate_challenges_a3, include=TRUE, eval = F} mutate( @@ -642,13 +654,13 @@ mutate( ## Useful creation functions -- Offsets: `lead()` and `lag()` allow you to refer to leading or lagging values. This allows you to compute running differences (e.g. `x - lag(x)`) or find when values change (`x != lag(x)`). -- Cumulative and rolling aggregates: R provides functions for running sums, products, mins and maxes: `cumsum()`, `cumprod()`, `cummin()`, `cummax()`; and dplyr provides `cummean()` for cumulative means. +- Offsets: lead(x) and lag(x) allow you to refer to the previous or next values of the column x. This allows you to compute running differences (e.g. `x - lag(x)`) or find when values change (`x != lag(x)`). +- R provides functions for running cumulative sums, products, mins and maxes: `cumsum()`, `cumprod()`, `cummin()`, `cummax()`; and dplyr provides `cummean()` for cumulative means. - Logical comparisons, `<`, `<=`, `>`, `>=`, `!=`, and `==` -- Ranking: there are a number of ranking functions, but you should start with `min_rank()`. There is also `row_number()`, `dense_rank()`, `percent_rank()`, `cume_dist()`, `ntile()` +- Ranking: there are a number of ranking functions, the most frequently used being min_rank(). They differ by the way ties are treated, etc. Try ?mutate, ?min_rank, ?rank, for more information. -## See you in [R#5: Pipping and grouping](https://can.gitbiopages.ens-lyon.fr/R_basis/session_5.html) +## See you in [R#5: Pipping and grouping](https://can.gitbiopages.ens-lyon.fr/R_basis/session_5.html) # To go further: Data transformation and color sets. @@ -718,13 +730,21 @@ Open the csv file using the `read_csv2()` function. The file is located at "http <p> Download the Expression_matrice_pivot_longer_DEGs_GSE86356.csv file and save it in your working directory. +You may have to set you working directory using `setwd()` ```{r read_csv1} expr_DM1 <- read_csv2("Expression_matrice_pivot_longer_DEGs_GSE86356.csv") expr_DM1 ``` - </p> + +or you can read it from the url + +```{r read_csv1_url, eval = F} +(expr_DM1 <- read_csv2("https://can.gitbiopages.ens-lyon.fr/R_basis/session_4/Expression_matrice_pivot_longer_DEGs_GSE86356.csv")) +``` + +</p> </details> With this tibble, use `ggplot2` and the `geom_tile()` function to make a heatmap. @@ -736,13 +756,14 @@ Fit the samples on the x-axis and the genes on the y-axis. <p> ```{r heatmap1} -ggplot(expr_DM1, aes(samples, Genes, fill= log1p(counts))) + +(DM1_tile_base <- + ggplot(expr_DM1, aes(samples, Genes, fill = log1p(counts))) + geom_tile() + labs(y="Genes", x = "Samples") + theme( - axis.text.y = element_text(size= 4), - axis.text.x = element_text(size = 4, angle = 90) - ) + axis.text.y = element_text(size= 6), + axis.text.x = element_text(size = 6, angle = 90) + )) ``` **Nota bene :** The elements of the axes, and the theme in general, are modified in the `theme()` function. </p> @@ -762,14 +783,8 @@ With `scale_fill_gradient2()` function, change the colors of the gradient, takin <p> ```{r heatmapGreen} -ggplot(expr_DM1, aes(samples, Genes, fill= log1p(counts))) + - geom_tile() + - scale_fill_gradient2(low = "white", high = "springgreen4") + - labs(y="Genes", x = "Samples") + - theme( - axis.text.y = element_text(size= 4), - axis.text.x = element_text(size = 4, angle = 90) - ) +DM1_tile_base + scale_fill_gradient2(low = "white", high = "springgreen4") + ``` </p> </details> @@ -781,14 +796,7 @@ Now let s use the [viridis color gradient](https://gotellilab.github.io/GotelliL <p> ```{r heatmapViridis} -ggplot(expr_DM1, aes(samples, Genes, fill= log1p(counts))) + - geom_tile() + - scale_fill_viridis_c() + - labs(y="Genes", x = "Samples") + - theme( - axis.text.y = element_text(size= 4), - axis.text.x = element_text(size = 4, angle = 90) - ) +DM1_tile_base + scale_fill_viridis_c() ``` </p> </details> @@ -809,6 +817,15 @@ tab <- read_csv2("EWang_Tibialis_DEGs_GRCH37-87_GSE86356.csv") tab ``` + +```{r read_csv2_url, eval = F} +tab <- read_csv2("https://can.gitbiopages.ens-lyon.fr/R_basis/session_4/EWang_Tibialis_DEGs_GRCH37-87_GSE86356.csv") +tab +``` + + + + </p> </details> @@ -819,30 +836,24 @@ With `mutate()` and `ifelse()` [fonctions](https://dplyr.tidyverse.org/reference - a column 'sig' : it indicates if the gene is significant ( TRUE or FALSE ). **Thresholds :** baseMean > 20 and padj < 0.05 and abs(log2FoldChange) >= 1.5 -- a column 'UpDown' : it indicates if the gene is Up regulated or Down regulated. +- a column 'UpDown' : it indicates if the gene is significantly up-regulated (Up), down-regulated (Down), or not significantly regulated (NO). <details><summary>Solution</summary> <p> ```{r sig} -tab.sig <- tab %>% - mutate(sig = baseMean > 20 & padj < 0.05 & abs(log2FoldChange) >= 1.5 ) %>% - mutate(UpDown = ifelse( - baseMean > 20 & padj < 0.05 & log2FoldChange >= 1.5, - "Up", - ifelse( - baseMean > 20 & padj < 0.05 & log2FoldChange <= -1.5, - "Down", - "NO" - ))) - -tab.sig +(tab.sig <- mutate(tab, + sig = baseMean > 20 & padj < 0.05 & abs(log2FoldChange) >= 1.5, + UpDown = ifelse(sig, ### we can use in the same mutate a column created by a previous line + ifelse(log2FoldChange > 0, "Up", "Down"), "NO") + ) +) ``` </p> </details> We want to see the top10 DEGs on the graph. For this, we will use the package `ggrepel`. -Install and load the `ggrepl` package. +Install and load the `ggrepel` package. <details><summary>Solution</summary> <p> @@ -857,19 +868,26 @@ library(ggrepel) </p> </details> -Let s **filter** our table into a new variable, top10, to keep only the top 10 according to the adjusted pvalue. The **smaller** the adjusted pvalue, the more significant. + +Let's **filter** out table into a new variable, top10, to keep only the significant differentialy expressed genes with the top 10 adjusted pvalue. The **smaller** the adjusted pvalue, the more significant. + **Tips :** You can use the [function](https://dplyr.tidyverse.org/reference/slice.html) `slice_min()` <details><summary>Solution</summary> - <p> +<p> + +```{r top10_1} +(top10 <- arrange(tab.sig, desc(sig), padj)) +(top10 <- mutate(top10, row_N = row_number())) +(top10 <- filter(top10, row_N <= 10)) +``` ```{r top10} -top10 <- tab.sig %>% - filter(sig == TRUE) %>% - slice_min(n = 10, padj) +(top10 <- filter(tab.sig, sig == TRUE)) +(top10 <- slice_min(top10, padj, n = 10)) ``` - </p> +</p> </details> The data is ready to be used to make a volcano plot! @@ -878,9 +896,11 @@ The data is ready to be used to make a volcano plot! To make the graph below, use `ggplot2`, the functions `geom_point()`, `geom_hline()`, `geom_vline()`, `theme_minimal()`, `theme()` (to remove the legend), `geom_label_repel()` and the function `scale_color_manual()` for the colors. </div> -**Tips 1 :** Don t forget the transformation of the adjusted pvalue. -**Tips 2 :** Feel free to search your favorite Web browser for help. -**Tips 3 :** `geom_label_repel()` function needs a new parameter 'data' and 'label' in aes parameters. + +- **Tips 1 :** Don t forget the transformation of the adjusted pvalue. +- **Tips 2 :** Feel free to search your favorite Web browser for help. +- **Tips 3 :** `geom_label_repel()` function needs a new parameter 'data' and 'label' in aes parameters. + ```{r VolcanoPlotDemo, echo = FALSE} ggplot(tab.sig, aes(x = log2FoldChange, y = -log10(padj), color = UpDown)) + @@ -889,9 +909,7 @@ ggplot(tab.sig, aes(x = log2FoldChange, y = -log10(padj), color = UpDown)) + geom_hline(yintercept=-log10(0.05), col="black") + geom_vline(xintercept=c(-1.5, 1.5), col="black") + theme_minimal() + - theme( - legend.position="none" - ) + + theme(legend.position="none") + labs(y="-log10(p-value)", x = "log2(FoldChange)") + geom_label_repel(data = top10, mapping = aes(label = gene_symbol)) @@ -907,9 +925,7 @@ ggplot(tab.sig, aes(x = log2FoldChange, y = -log10(padj), color = UpDown)) + geom_hline(yintercept=-log10(0.05), col="black") + geom_vline(xintercept=c(-1.5, 1.5), col="black") + theme_minimal() + - theme( - legend.position="none" - ) + + theme(legend.position="none") + labs(y="-log10(p-value)", x = "log2(FoldChange)") + geom_label_repel(data = top10, mapping = aes(label = gene_symbol)) -- GitLab