From 9adeb3c1dbd5515b6720835681eb852d80ea81a8 Mon Sep 17 00:00:00 2001 From: Gilquin <laurent.gilquin@ens-lyon.fr> Date: Fri, 12 Jul 2024 13:44:15 +0200 Subject: [PATCH] fix: correct code formatting and warnings correct inconsistent code formatting with styler package correct deprecation warnings --- session_1/session_1.Rmd | 113 +++++++++++++++-------------- session_2/session_2.Rmd | 112 +++++++++++++++-------------- session_3/session_3.Rmd | 59 ++++++++------- session_4/session_4.Rmd | 130 +++++++++++++++++---------------- session_5/session_5.Rmd | 154 ++++++++++++++++++++++------------------ session_6/session_6.Rmd | 56 ++++++++------- session_7/session_7.Rmd | 19 ++--- session_8/session_8.Rmd | 20 +++--- 8 files changed, 352 insertions(+), 311 deletions(-) diff --git a/session_1/session_1.Rmd b/session_1/session_1.Rmd index 0739122..a5b5154 100644 --- a/session_1/session_1.Rmd +++ b/session_1/session_1.Rmd @@ -9,7 +9,7 @@ library(fontawesome) ``` ```{r setup, include=FALSE , echo=FALSE} -rm(list=ls()) +rm(list = ls()) knitr::opts_chunk$set(echo = TRUE) knitr::opts_chunk$set(comment = NA) ``` @@ -66,13 +66,16 @@ Reasons to use it: ```{r echo=F, message=F} cran_packages <- nrow(available.packages(repos = "http://cran.us.r-project.org")) -if (! require("rvest")) { +if (!require("rvest")) { install.packages("rvest", quiet = T) } library(rvest) -url <- 'https://www.bioconductor.org/packages/release/bioc/' -biocPackages <- url %>% read_html() %>% html_table() %>%.[[1]] +url <- "https://www.bioconductor.org/packages/release/bioc/" +biocPackages <- url %>% + read_html() %>% + html_table() %>% + .[[1]] bioconductor_packages <- nrow(biocPackages) ``` @@ -219,8 +222,8 @@ You can use parenthesis `(` `)` to change this order. 
But too much parenthesis can be hard to read ```{r calculatorstep5, include=TRUE} -(3 + (5 * (2 ^ 2))) # hard to read -3 + 5 * (2 ^ 2) # if you forget some rules, this might help +(3 + (5 * (2^2))) # hard to read +3 + 5 * (2^2) # if you forget some rules, this might help ``` **Note :** The text following a `#` is a comment. It will not be interpreted by R. In the future, I advise you to use comments a lot to explain in your own words what the command means. @@ -230,7 +233,7 @@ But too much parenthesis can be hard to read For small of large numbers, R will automatically switch to scientific notation. ```{r calculatorstep6, include=TRUE} -2/10000 +2 / 10000 ``` `2e-4` is shorthand for `2 * 10^(-4)` @@ -248,7 +251,7 @@ To call a mathematical function, you must use `function_name(<number>)`. For example, for the natural logarithm: ```{r calculatorstep8, include=TRUE} -log(2) # natural logarithm +log(2) # natural logarithm ``` ```{r calculatorstep9, include=TRUE} @@ -375,9 +378,9 @@ A variable can be assigned a `numeric` value as well as a `character` value. Just put the character (or string) between double quote `"` when you assign this value. ```{r VandAstep6, include=TRUE} -z <- "x" # One character +z <- "x" # One character z -a <- "Hello world" # Multiple characters == String +a <- "Hello world" # Multiple characters == String a ``` @@ -541,32 +544,32 @@ function_name <- function(a, b){ Predict the result of R1, R2 and R3. 
```R -minus <- function(a, b){ +minus <- function(a, b) { result_1 <- a - b return(result_1) } -##R1: -R1 <- minus(4,2) +## R1: +R1 <- minus(4, 2) -##R2 -R2 <- minus(2,4) +## R2 +R2 <- minus(2, 4) -##R3 +## R3 a <- 2 b <- 10 -R3 <- minus(b,a) +R3 <- minus(b, a) ``` </div> <details><summary>Solution 1</summary> <p> ```{r minus1, include=TRUE} -minus <- function(a, b){ +minus <- function(a, b) { result_1 <- a - b return(result_1) } -minus(4,2) +minus(4, 2) ``` </p> </details> @@ -575,7 +578,7 @@ minus(4,2) <details><summary>Solution 2</summary> <p> ```{r minus2, include=TRUE} -minus(2,4) +minus(2, 4) ``` </p> </details> @@ -586,7 +589,7 @@ minus(2,4) ```{r minus3, include=TRUE} a <- 2 b <- 10 -minus(b,a) +minus(b, a) ``` </p> </details> @@ -599,22 +602,22 @@ Predict the result of R1, R2, R3 and R4. a <- 10 b <- 2 -minus <- function(a, b){ +minus <- function(a, b) { result_1 <- a - b return(result_1) } -##R1: -R1 <- minus(a=6,b=3) +## R1: +R1 <- minus(a = 6, b = 3) -##R2 -R2 <- minus(b=3,a=6) +## R2 +R2 <- minus(b = 3, a = 6) -##R3 +## R3 R3 <- a -##R4 -R4 <- minus(b=b,a=a) +## R4 +R4 <- minus(b = b, a = a) ``` </div> @@ -623,11 +626,11 @@ R4 <- minus(b=b,a=a) ```{r minus21, include=TRUE} a <- 10 b <- 2 -minus <- function(a, b){ +minus <- function(a, b) { result_1 <- a - b return(result_1) } -R1 <- minus(a=6,b=3) +R1 <- minus(a = 6, b = 3) R1 ``` </p> @@ -637,7 +640,7 @@ R1 <details><summary>Solution 2</summary> <p> ```{r minus22, include=TRUE} -R2 <- minus(b=3,a=6) +R2 <- minus(b = 3, a = 6) R2 ``` </p> @@ -656,7 +659,7 @@ R3 <details><summary>Solution 4</summary> <p> ```{r minus24, include=TRUE} -R4 <- minus(b=b,a=a) +R4 <- minus(b = b, a = a) R4 ``` </p> @@ -666,18 +669,18 @@ R4 - Default values for arguments may be set at definition and the default value is used when argument is not provided. 
```{r minus10, include=TRUE} -minus_10 <- function(a, b=10){ +minus_10 <- function(a, b = 10) { result_1 <- a - b return(result_1) } minus_10(40) -minus_10(40,b=5) -minus_10(40,5) +minus_10(40, b = 5) +minus_10(40, 5) ``` - Functions can be define without argument ```{r print_hw, include=TRUE} -print_hw <- function(){ +print_hw <- function() { print("Hello world!") print("How R U?") } @@ -726,11 +729,11 @@ of the modulo is equal to `0`. <p> ```{r rect_area, include=TRUE} -rect_area <- function(L,W){ +rect_area <- function(L, W) { area <- L * W return(area) } -rect_area(4,3) +rect_area(4, 3) ``` </p> </details> @@ -739,7 +742,7 @@ rect_area(4,3) <details><summary>Solution 2 </summary> <p> ```{r VandAstep11, include=TRUE} -even_test <- function(x){ +even_test <- function(x) { modulo_result <- x %% 2 is_even <- modulo_result == 0 return(is_even) @@ -751,7 +754,7 @@ even_test(3) **Note :** A function can be written in several forms. ```{r VandAstep11smal2, include=TRUE} -even_test2 <- function(x){ +even_test2 <- function(x) { (x %% 2) == 0 } even_test(4) @@ -765,8 +768,8 @@ even_test(3) <details><summary>Solution 3 </summary> <p> ```{r VandAstep13, include=TRUE} -even_print <- function(x){ - if(even_test(x) == TRUE) { +even_print <- function(x) { + if (even_test(x) == TRUE) { print("This number is even") } else { print("This number is odd") @@ -779,8 +782,8 @@ even_print(3) **Note :** There is no need to test whether a boolean variable (TRUE/FALSE) is TRUE or FALSE. 
```{r VandAstep11small14, include=TRUE} -even_print <- function(x){ - if(even_test(x)) { +even_print <- function(x) { + if (even_test(x)) { print("This number is even") } else { print("This number is odd") @@ -919,36 +922,36 @@ Here are some examples that show how elements of vectors can be obtained by inde You can use the position(s) of the value(s) in the vector: ```{r index1, include=TRUE} -x <- c(1,5,7,8) +x <- c(1, 5, 7, 8) x[4] -x[c(1,3,4)] +x[c(1, 3, 4)] ``` You can use booleans to define which values should be kept: ```{r index2, include=TRUE} -x <- c(1,5,7,8,15) -x[c(TRUE,FALSE,TRUE,FALSE,TRUE)] -x[c(FALSE,TRUE)] # Bolean vector is reused if it is not of the same size of the vector to index +x <- c(1, 5, 7, 8, 15) +x[c(TRUE, FALSE, TRUE, FALSE, TRUE)] +x[c(FALSE, TRUE)] # Bolean vector is reused if it is not of the same size of the vector to index -y <- c(TRUE,FALSE,FALSE,FALSE,TRUE) +y <- c(TRUE, FALSE, FALSE, FALSE, TRUE) x[y] ``` You can use names in the case of a named vector: ```{r index3, include=TRUE} -x <-c(a = 1, b = 2, c = 3, d = 4, e = 5) -x[c("a","c")] +x <- c(a = 1, b = 2, c = 3, d = 4, e = 5) +x[c("a", "c")] ``` You can also use an index to change values: ```{r index4, include=TRUE} -x <- c(1,5,7,8,15) +x <- c(1, 5, 7, 8, 15) x[1] <- 3 x -x[x>5] <- 13 +x[x > 5] <- 13 x ``` diff --git a/session_2/session_2.Rmd b/session_2/session_2.Rmd index 16c7ba6..96516d4 100644 --- a/session_2/session_2.Rmd +++ b/session_2/session_2.Rmd @@ -7,12 +7,13 @@ date: "2022" ```{r include=FALSE} library(fontawesome) -if("conflicted" %in% .packages()) - conflicted::conflicts_prefer(dplyr::filter) +if ("conflicted" %in% .packages()) { + conflicted::conflicts_prefer(dplyr::filter) +} ``` ```{r setup, include=FALSE} -rm(list=ls()) +rm(list = ls()) knitr::opts_chunk$set(echo = TRUE) knitr::opts_chunk$set(comment = NA) ``` @@ -21,8 +22,9 @@ knitr::opts_chunk$set(comment = NA) library("tidyverse") tmp <- tempfile(fileext = ".zip") 
download.file("http://www.fueleconomy.gov/feg/epadata/vehicles.csv.zip", - tmp, - quiet = TRUE) + tmp, + quiet = TRUE +) unzip(tmp, exdir = "data-raw") new_class_level <- c( "Compact Cars", @@ -210,7 +212,7 @@ We are going to make the simplest plot possible to study the relationship betwee The following command generates a plot between engine size `displ` and fuel efficiency `hwy` from the `new_mpg` `tibble`. ```{r new_mpg_plot_a, cache = TRUE, fig.width=8, fig.height=4.5} -ggplot(data = new_mpg) + +ggplot(data = new_mpg) + geom_point(mapping = aes(x = displ, y = hwy)) ``` @@ -239,7 +241,7 @@ What happened when you only use the command `ggplot(data = mpg)` ? <details><summary>Solution</summary> <p> ```{r only_ggplot, cache = TRUE, fig.width=4.5, fig.height=2} -ggplot(data = new_mpg) +ggplot(data = new_mpg) ``` </p> </details> @@ -253,7 +255,7 @@ Make a scatterplot of `hwy` ( fuel efficiency ) vs. `cyl` ( number of cylinders <details><summary>Solution</summary> <p> ```{r new_mpg_plot_b, cache = TRUE, fig.width=8, fig.height=4.5} -ggplot(data = new_mpg, mapping = aes(x = hwy, y = cyl)) + +ggplot(data = new_mpg, mapping = aes(x = hwy, y = cyl)) + geom_point() ``` @@ -286,7 +288,7 @@ Try the following aesthetics: ### `color` mapping {#sec-color-mapping} ```{r new_mpg_plot_e, cache = TRUE, fig.width=8, fig.height=4.5} -ggplot(data = new_mpg, mapping = aes(x = displ, y = hwy, color = class)) + +ggplot(data = new_mpg, mapping = aes(x = displ, y = hwy, color = class)) + geom_point() ``` @@ -294,29 +296,29 @@ ggplot(data = new_mpg, mapping = aes(x = displ, y = hwy, color = class)) + ### `size` mapping ```{r new_mpg_plot_f, cache = TRUE, fig.width=8, fig.height=4.5, warning=FALSE} -ggplot(data = new_mpg, mapping = aes(x = displ, y = hwy, size = class)) + +ggplot(data = new_mpg, mapping = aes(x = displ, y = hwy, size = class)) + geom_point() ``` ### `alpha` mapping ```{r new_mpg_plot_g, cache = TRUE, fig.width=8, fig.height=4.5, warning=FALSE} -ggplot(data = new_mpg, mapping 
= aes(x = displ, y = hwy, alpha = class)) + +ggplot(data = new_mpg, mapping = aes(x = displ, y = hwy, alpha = class)) + geom_point() ``` ### `shape` mapping ```{r new_mpg_plot_h, cache = TRUE, fig.width=8, fig.height=4.5, warning=FALSE} -ggplot(data = new_mpg, mapping = aes(x = displ, y = hwy, shape = class)) + +ggplot(data = new_mpg, mapping = aes(x = displ, y = hwy, shape = class)) + geom_point() ``` You can also set the aesthetic properties of your **geom** manually. For example, we can make all of the points in our plot blue and squares: ```{r new_mpg_plot_i, cache = TRUE, fig.width=8, fig.height=4.5} -ggplot(data = new_mpg, mapping = aes(x = displ, y = hwy)) + - geom_point(color = "blue", shape=0) +ggplot(data = new_mpg, mapping = aes(x = displ, y = hwy)) + + geom_point(color = "blue", shape = 0) ``` Here is a list of different shapes available in R: @@ -329,14 +331,14 @@ What's gone wrong with this code? Why are the points not blue? </div> ```{r new_mpg_plot_not_blue, cache = TRUE, fig.width=8, fig.height=4.5} -ggplot(data = new_mpg, mapping = aes(x = displ, y = hwy, color = "blue")) + +ggplot(data = new_mpg, mapping = aes(x = displ, y = hwy, color = "blue")) + geom_point() ``` <details><summary>Solution</summary> <p> ```{r new_mpg_plot_blue, cache = TRUE, fig.width=8, fig.height=4.5} -ggplot(data = new_mpg, mapping = aes(x = displ, y = hwy)) + +ggplot(data = new_mpg, mapping = aes(x = displ, y = hwy)) + geom_point(color = "blue") ``` </p> @@ -347,7 +349,7 @@ ggplot(data = new_mpg, mapping = aes(x = displ, y = hwy)) + You can also map continuous variable to a color ```{r continu, cache = TRUE, fig.width=8, fig.height=4.5} -ggplot(data = new_mpg, mapping = aes(x = displ, y = hwy, color = cyl)) + +ggplot(data = new_mpg, mapping = aes(x = displ, y = hwy, color = cyl)) + geom_point() ``` @@ -358,7 +360,7 @@ What happens if you map an aesthetic to something other than a variable name, li <details><summary>Solution</summary> <p> ```{r condiColor, cache = TRUE, 
fig.width=8, fig.height=4.5} -ggplot(data = new_mpg, mapping = aes(x = displ, y = hwy, color = displ < 5)) + +ggplot(data = new_mpg, mapping = aes(x = displ, y = hwy, color = displ < 5)) + geom_point() ``` </p> @@ -373,8 +375,8 @@ We will come back to formulas in R later, for now, you just have to know that fo To make a scatterplot of `displ` versus `hwy` per car `class` you can use the following code: ```{r new_mpg_plot_k, cache = TRUE, fig.width=8, fig.height=4.5} -ggplot(data = new_mpg, mapping = aes(x = displ, y = hwy)) + - geom_point() + +ggplot(data = new_mpg, mapping = aes(x = displ, y = hwy)) + + geom_point() + facet_wrap(~class, nrow = 2) ``` @@ -400,14 +402,14 @@ ggplot(data = new_mpg, mapping = aes(x = displ, y = hwy)) + There are different ways to represent the information: ```{r new_mpg_plot_o, cache = TRUE, fig.width=8, fig.height=4.5} -ggplot(data = new_mpg, mapping = aes(x = displ, y = hwy)) + +ggplot(data = new_mpg, mapping = aes(x = displ, y = hwy)) + geom_point() ``` \ ```{r new_mpg_plot_p, cache = TRUE, fig.width=8, fig.height=4.5, message=FALSE} -ggplot(data = new_mpg, mapping = aes(x = displ, y = hwy)) + +ggplot(data = new_mpg, mapping = aes(x = displ, y = hwy)) + geom_smooth() ``` @@ -416,7 +418,7 @@ ggplot(data = new_mpg, mapping = aes(x = displ, y = hwy)) + We can add as many layers as we want: ```{r new_mpg_plot_q, cache = TRUE, fig.width=8, fig.height=4.5, message=FALSE} -ggplot(data = new_mpg, mapping = aes(x = displ, y = hwy)) + +ggplot(data = new_mpg, mapping = aes(x = displ, y = hwy)) + geom_point() + geom_smooth() ``` @@ -436,7 +438,7 @@ ggplot(data = new_mpg, mapping = aes(x = displ, y = hwy)) + We can use different `data` (here new_mpg and mpg tables) for different layers (you will lean more on `filter()` later) ```{r new_mpg_plot_t, cache = TRUE, fig.width=8, fig.height=4.5, message=FALSE} -ggplot(data = new_mpg, mapping = aes(x = displ, y = hwy)) + +ggplot(data = new_mpg, mapping = aes(x = displ, y = hwy)) + geom_point(mapping = 
aes(color = class)) + geom_smooth(data = filter(mpg, class == "subcompact")) ``` @@ -448,8 +450,8 @@ ggplot(data = new_mpg, mapping = aes(x = displ, y = hwy)) + Run this code in your head and predict what the output will look like. Then, run the code in R and check your predictions. </div> ```R -ggplot(data = new_mpg, mapping = aes(x = displ, y = hwy, color = drive)) + - geom_point(show.legend = FALSE) + +ggplot(data = new_mpg, mapping = aes(x = displ, y = hwy, color = drive)) + + geom_point(show.legend = FALSE) + geom_smooth(se = FALSE) ``` @@ -462,8 +464,8 @@ ggplot(data = new_mpg, mapping = aes(x = displ, y = hwy, color = drive)) + <details><summary>Solution</summary> <p> ```{r soluce_challenge_1, cache = TRUE, fig.width=8, fig.height=4.5} -ggplot(data = new_mpg, mapping = aes(x = displ, y = hwy, color = drive)) + - geom_point(show.legend = FALSE) + +ggplot(data = new_mpg, mapping = aes(x = displ, y = hwy, color = drive)) + + geom_point(show.legend = FALSE) + geom_smooth(se = FALSE) ``` </p> @@ -486,7 +488,7 @@ How being a `Two Seaters` car (*class column*) impact the engine size (*displ co <p> ```{r new_mpg_plot_color_2seater1, cache = TRUE, fig.width=8, fig.height=4.5} -ggplot(data = new_mpg, mapping = aes(x = displ, y = hwy)) + +ggplot(data = new_mpg, mapping = aes(x = displ, y = hwy)) + geom_point() ``` @@ -496,7 +498,7 @@ ggplot(data = new_mpg, mapping = aes(x = displ, y = hwy)) + <details><summary>Solution 2</summary> <p> ```{r new_mpg_plot_color_2seater2, cache = TRUE, fig.width=8, fig.height=4.5} -ggplot(data = new_mpg, mapping = aes(x = displ, y = hwy)) + +ggplot(data = new_mpg, mapping = aes(x = displ, y = hwy)) + geom_point() + geom_point(data = filter(new_mpg, class == "Two Seaters"), color = "red") ``` @@ -508,7 +510,7 @@ ggplot(data = new_mpg, mapping = aes(x = displ, y = hwy)) + <p> ```{r new_mpg_plot_color_2seater_facet, cache = TRUE, fig.width=8, fig.height=4.5} -ggplot(data = new_mpg, mapping = aes(x = displ, y = hwy)) + +ggplot(data = new_mpg, 
mapping = aes(x = displ, y = hwy)) + geom_point() + geom_point(data = filter(new_mpg, class == "Two Seaters"), color = "red") + facet_wrap(~class) @@ -526,7 +528,7 @@ Write a `function` called `plot_color_a_class` that can take as argument the cla <p> ```{r new_mpg_plot_color_2seater_fx, cache = TRUE, fig.width=8, fig.height=4.5} plot_color_a_class <- function(my_class) { - ggplot(data = new_mpg, mapping = aes(x = displ, y = hwy)) + + ggplot(data = new_mpg, mapping = aes(x = displ, y = hwy)) + geom_point() + geom_point(data = filter(new_mpg, class == my_class), color = "red") + facet_wrap(~class) @@ -573,8 +575,8 @@ You can do it with the `ggsave` function. First save your plot in a variable : ```{r} -p1 <- ggplot(data = new_mpg, mapping = aes(x = displ, y = hwy, color = class)) + - geom_point() +p1 <- ggplot(data = new_mpg, mapping = aes(x = displ, y = hwy, color = class)) + + geom_point() ``` Then save it in the format of your choice: @@ -606,7 +608,7 @@ install.packages("cowplot") ``` ```{r, include=F, echo =F} -if (! require("cowplot")) { +if (!require("cowplot")) { install.packages("cowplot") } ``` @@ -618,27 +620,27 @@ library(cowplot) ``` ```{r,fig.width=8, fig.height=4.5, message=FALSE} -p1 <- ggplot(data = new_mpg) + +p1 <- ggplot(data = new_mpg) + geom_point(mapping = aes(x = displ, y = hwy)) p1 ``` ```{r,fig.width=8, fig.height=4.5, message=FALSE} -p2 <- ggplot(data = new_mpg, mapping = aes(x = cty, y = hwy)) + +p2 <- ggplot(data = new_mpg, mapping = aes(x = cty, y = hwy)) + geom_point() p2 ``` ```{r,fig.width=8, fig.height=4.5, message=FALSE} -plot_grid(p1, p2, labels = c('A', 'B'), label_size = 12) +plot_grid(p1, p2, labels = c("A", "B"), label_size = 12) ``` You can also save it in a file. 
```{r, eval=F} -p_final = plot_grid(p1, p2, labels = c('A', 'B'), label_size = 12) +p_final <- plot_grid(p1, p2, labels = c("A", "B"), label_size = 12) ggsave("test_plot_1_and_2.png", p_final, width = 20, height = 8, units = "cm") ``` @@ -649,31 +651,35 @@ Use the `cowplot` documentation to reproduce this plot and save it. </div> ```{r, echo=F} -p1 <- ggplot(data = new_mpg, mapping = aes(x = displ, y = hwy, color = class)) + - geom_point() + theme_bw() +p1 <- ggplot(data = new_mpg, mapping = aes(x = displ, y = hwy, color = class)) + + geom_point() + + theme_bw() -p2 <- ggplot(data = new_mpg, mapping = aes(x = cty, y = hwy, color = class)) + - geom_point() + theme_bw() +p2 <- ggplot(data = new_mpg, mapping = aes(x = cty, y = hwy, color = class)) + + geom_point() + + theme_bw() -p_row <- plot_grid(p1 + theme(legend.position = "none"), p2 + theme(legend.position = "none"), labels = c('A', 'B'), label_size = 12) -p_legend <- get_legend(p1 + theme(legend.position = "top")) +p_row <- plot_grid(p1 + theme(legend.position = "none"), p2 + theme(legend.position = "none"), labels = c("A", "B"), label_size = 12) +p_legend <- get_plot_component(p1 + theme(legend.position = "top"), "guide-box-top", return_all = TRUE) -plot_grid(p_row, p_legend, nrow = 2, rel_heights = c(1,0.2)) +plot_grid(p_row, p_legend, nrow = 2, rel_heights = c(1, 0.2)) ``` <details><summary>Solution</summary> <p> ```{r , echo = TRUE, eval = F} -p1 <- ggplot(data = new_mpg, mapping = aes(x = displ, y = hwy, color = class)) + - geom_point() + theme_bw() +p1 <- ggplot(data = new_mpg, mapping = aes(x = displ, y = hwy, color = class)) + + geom_point() + + theme_bw() -p2 <- ggplot(data = new_mpg, mapping = aes(x = cty, y = hwy, color = class)) + - geom_point() + theme_bw() +p2 <- ggplot(data = new_mpg, mapping = aes(x = cty, y = hwy, color = class)) + + geom_point() + + theme_bw() -p_row <- plot_grid(p1 + theme(legend.position = "none"), p2 + theme(legend.position = "none"), labels = c('A', 'B'), label_size = 12) -p_legend <- get_legend(p1 + 
theme(legend.position = "top")) +p_row <- plot_grid(p1 + theme(legend.position = "none"), p2 + theme(legend.position = "none"), labels = c("A", "B"), label_size = 12) +p_legend <- get_plot_component(p1 + theme(legend.position = "top"), "guide-box-top", return_all = TRUE) -p_final <- plot_grid(p_row, p_legend, nrow = 2, rel_heights = c(1,0.2)) +p_final <- plot_grid(p_row, p_legend, nrow = 2, rel_heights = c(1, 0.2)) p_final ``` diff --git a/session_3/session_3.Rmd b/session_3/session_3.Rmd index 2b57277..3cca44e 100644 --- a/session_3/session_3.Rmd +++ b/session_3/session_3.Rmd @@ -9,7 +9,7 @@ library(fontawesome) ``` ```{r setup, include=FALSE} -rm(list=ls()) +rm(list = ls()) knitr::opts_chunk$set(echo = TRUE) knitr::opts_chunk$set(comment = NA) ``` @@ -63,7 +63,7 @@ We saw scatterplot (`geom_point()`) and smoothplot (`geom_smooth()`). We can also use `geom_bar()` to draw barplot: ```{r diamonds_barplot, cache = TRUE, fig.width=8, fig.height=4.5} -ggplot(data = diamonds, mapping = aes(x = cut)) + +ggplot(data = diamonds, mapping = aes(x = cut)) + geom_bar() ``` @@ -82,7 +82,7 @@ The figure below describes how this process works with `geom_bar()`. You can generally use **geoms** and **stats** interchangeably. For example, you can recreate the previous plot using `stat_count()` instead of `geom_bar()`: ```{r diamonds_stat_count, include=TRUE, fig.width=8, fig.height=4.5} -ggplot(data = diamonds, mapping = aes(x = cut)) + +ggplot(data = diamonds, mapping = aes(x = cut)) + stat_count() ``` @@ -125,7 +125,7 @@ ggplot(data = demo, mapping = aes(x = cut, y = freq)) + You might want to override the default mapping from transformed variables to aesthetics ( e.g., proportion). ```{r 3_b, include=TRUE, fig.width=8, fig.height=4.5} -ggplot(data = diamonds, mapping = aes(x = cut, y = ..prop.., group = 1)) + +ggplot(data = diamonds, mapping = aes(x = cut, y = after_stat(prop), group = 1)) + geom_bar() ``` @@ -136,7 +136,7 @@ In our proportion bar chart, we need to set `group = 1`. Why? 
<details><summary>Solution</summary> <p> ```{r diamonds_stats_challenge, include=TRUE, message=FALSE, fig.width=8, fig.height=4.5} -ggplot(data = diamonds, mapping = aes(x = cut, y = ..prop..)) + +ggplot(data = diamonds, mapping = aes(x = cut, y = after_stat(prop))) + geom_bar() ``` @@ -155,8 +155,7 @@ value, to draw attention to the summary that you are computing. <details><summary>Solution</summary> <p> ```{r 3_c, include=TRUE, fig.width=8, fig.height=4.5, message=FALSE} - -ggplot(data = diamonds, mapping = aes(x = cut, y = depth)) + +ggplot(data = diamonds, mapping = aes(x = cut, y = depth)) + stat_summary() ``` </p> @@ -169,7 +168,7 @@ Set the `fun.min`, `fun.max` and `fun` to the `min`, `max` and `median` function <details><summary>Solution</summary> <p> ```{r 3_d, include=TRUE, fig.width=8, fig.height=4.5, message=FALSE} -ggplot(data = diamonds, mapping = aes(x = cut, y = depth)) + +ggplot(data = diamonds, mapping = aes(x = cut, y = depth)) + stat_summary( fun.min = min, fun.max = max, @@ -190,12 +189,12 @@ Try both approaches on a `cut`, histogram. <details><summary>Solution</summary> <p> ```{r diamonds_barplot_color, cache = TRUE, fig.width=8, fig.height=4.5, message=FALSE} -ggplot(data = diamonds, mapping = aes(x = cut, color = cut)) + +ggplot(data = diamonds, mapping = aes(x = cut, color = cut)) + geom_bar() ``` ```{r diamonds_barplot_fill, cache = TRUE, fig.width=8, fig.height=4.5, message=FALSE} -ggplot(data = diamonds, mapping = aes(x = cut, fill = cut)) + +ggplot(data = diamonds, mapping = aes(x = cut, fill = cut)) + geom_bar() ``` </p> @@ -210,7 +209,7 @@ Try to color by `clarity`. Is `clarity` a continuous or categorical variable ? 
<details><summary>Solution</summary> <p> ```{r diamonds_barplot_fill_clarity, cache = TRUE, fig.width=8, fig.height=4.5, message=FALSE} -ggplot(data = diamonds, mapping = aes(x = cut, fill = clarity)) + +ggplot(data = diamonds, mapping = aes(x = cut, fill = clarity)) + geom_bar() ``` </p> @@ -228,18 +227,18 @@ Try the following `position` parameter for `geom_bar`: `"fill"`, `"dodge"` and ` <details><summary>Solution</summary> <p> ```{r diamonds_barplot_pos_fill, cache = TRUE, fig.width=8, fig.height=4.5, message=FALSE} -ggplot(data = diamonds, mapping = aes(x = cut, fill = clarity)) + - geom_bar( position = "fill") +ggplot(data = diamonds, mapping = aes(x = cut, fill = clarity)) + + geom_bar(position = "fill") ``` ```{r diamonds_barplot_pos_dodge, cache = TRUE, fig.width=8, fig.height=4.5, message=FALSE} -ggplot(data = diamonds, mapping = aes(x = cut, fill = clarity)) + - geom_bar( position = "dodge") +ggplot(data = diamonds, mapping = aes(x = cut, fill = clarity)) + + geom_bar(position = "dodge") ``` ```{r diamonds_barplot_pos_jitter, cache = TRUE, fig.width=8, fig.height=4.5, message=FALSE} -ggplot(data = diamonds, mapping = aes(x = cut, fill = clarity)) + - geom_bar( position = "jitter") +ggplot(data = diamonds, mapping = aes(x = cut, fill = clarity)) + + geom_bar(position = "jitter") ``` </p> </details> @@ -253,12 +252,12 @@ Compare `geom_point` to `geom_jitter` plot `cut` versus `depth` and color by `c <details><summary>Solution</summary> <p> ```{r dia_jitter2, cache = TRUE, fig.width=8, fig.height=4.5, message=FALSE} -ggplot(data = diamonds, mapping = aes(x = cut, y = depth, color = clarity)) + +ggplot(data = diamonds, mapping = aes(x = cut, y = depth, color = clarity)) + geom_point() ``` ```{r dia_jitter3, cache = TRUE, fig.width=8, fig.height=4.5, message=FALSE} -ggplot(data = diamonds, mapping = aes(x = cut, y = depth, color = clarity)) + +ggplot(data = diamonds, mapping = aes(x = cut, y = depth, color = clarity)) + geom_jitter() ``` </p> @@ -271,7 +270,7 
@@ What parameters of `geom_jitter` control the amount of jittering ? <details><summary>Solution</summary> <p> ```{r dia_jitter4, cache = TRUE, fig.width=8, fig.height=4.5, message=FALSE} -ggplot(data = diamonds, mapping = aes(x = cut, y = depth, color = clarity)) + +ggplot(data = diamonds, mapping = aes(x = cut, y = depth, color = clarity)) + geom_jitter(width = .1, height = .1) ``` </p> @@ -282,7 +281,7 @@ In the `geom_jitter` plot that we made, we cannot really see the limits of the d <details><summary>Solution</summary> <p> ```{r dia_violon, cache = TRUE, fig.width=8, fig.height=4.5, message=FALSE} -ggplot(data = diamonds, mapping = aes(x = cut, y = depth, color = clarity)) + +ggplot(data = diamonds, mapping = aes(x = cut, y = depth, color = clarity)) + geom_violin() ``` </p> @@ -293,7 +292,7 @@ ggplot(data = diamonds, mapping = aes(x = cut, y = depth, color = clarity)) + A Cartesian coordinate system is a coordinate system where the x and y positions act independently to determine the location of each point. There are a number of other coordinate systems that are occasionally helpful. ```{r dia_boxplot, cache = TRUE, fig.width=8, fig.height=4.5, message=FALSE} -ggplot(data = diamonds, mapping = aes(x = cut, y = depth, color = clarity)) + +ggplot(data = diamonds, mapping = aes(x = cut, y = depth, color = clarity)) + geom_boxplot() ``` @@ -304,7 +303,7 @@ Add the `coord_flip()` layer to the previous plot. 
<details><summary>Solution</summary> <p> ```{r dia_boxplot_flip, cache = TRUE, fig.width=8, fig.height=4.5, message=FALSE} -ggplot(data = diamonds, mapping = aes(x = cut, y = depth, color = clarity)) + +ggplot(data = diamonds, mapping = aes(x = cut, y = depth, color = clarity)) + geom_boxplot() + coord_flip() ``` @@ -315,8 +314,8 @@ ggplot(data = diamonds, mapping = aes(x = cut, y = depth, color = clarity)) + Add the `coord_polar()` layer to this plot: ```{r diamonds_bar, cache = TRUE, fig.width=8, fig.height=4.5, message=FALSE, eval=FALSE} -ggplot(data = diamonds, mapping = aes(x = cut, fill = cut)) + - geom_bar( show.legend = FALSE, width = 1 ) + +ggplot(data = diamonds, mapping = aes(x = cut, fill = cut)) + + geom_bar(show.legend = FALSE, width = 1) + theme(aspect.ratio = 1) + labs(x = NULL, y = NULL) ``` @@ -325,8 +324,8 @@ ggplot(data = diamonds, mapping = aes(x = cut, fill = cut)) + <details><summary>Solution</summary> <p> ```{r diamonds_bar2, cache = TRUE, fig.width=8, fig.height=4.5, message=FALSE} -ggplot(data = diamonds, mapping = aes(x = cut, fill = cut)) + - geom_bar( show.legend = FALSE, width = 1 ) + +ggplot(data = diamonds, mapping = aes(x = cut, fill = cut)) + + geom_bar(show.legend = FALSE, width = 1) + theme(aspect.ratio = 1) + labs(x = NULL, y = NULL) + coord_polar() @@ -402,7 +401,7 @@ You can use the `scale_x_log10()` to display the `gdpPercap` on the `log10` scal <p> ```{r gapminder_plot_b} ggplot(gapminder, aes(gdpPercap, lifeExp, size = pop, color = continent)) + - geom_point() + + geom_point() + scale_x_log10() ``` </p> @@ -418,10 +417,10 @@ For this we need to add a `transition_time` layer that will take as an argument <p> ```{r gapminder_plot_c} ggplot(gapminder, aes(gdpPercap, lifeExp, size = pop, color = continent)) + - geom_point() + + geom_point() + scale_x_log10() + transition_time(year) + - labs(title = 'Year: {as.integer(frame_time)}') + labs(title = "Year: {as.integer(frame_time)}") ``` </p> </details> \ No newline at end of file 
diff --git a/session_4/session_4.Rmd b/session_4/session_4.Rmd index 3f2c517..96ea42b 100644 --- a/session_4/session_4.Rmd +++ b/session_4/session_4.Rmd @@ -7,12 +7,13 @@ date: "2022" ```{r include=FALSE} library(fontawesome) -if("conflicted" %in% .packages()) - conflicted::conflicts_prefer(dplyr::filter) +if ("conflicted" %in% .packages()) { + conflicted::conflicts_prefer(dplyr::filter) +} ``` ```{r setup, include=FALSE} -rm(list=ls()) +rm(list = ls()) knitr::opts_chunk$set(echo = TRUE) knitr::opts_chunk$set(comment = NA) ``` @@ -49,7 +50,7 @@ library("nycflights13") ### Data set : nycflights13 -`nycflights13::flights` contains all $336 \ 776$ flights that departed from New York City in 2013. +`nycflights13::flights` contains all 336,776 flights that departed from New York City in 2013. The data comes from the US Bureau of Transportation Statistics, and is documented in `?flights` ```R @@ -104,14 +105,14 @@ You can use the relational operators (`<`,`>`,`==`,`<=`,`>=`,`!=`) to make a tes ```{r filter_sup_eq, include=TRUE, eval=FALSE} filter(flights, air_time >= 680) -filter(flights, carrier == "HA") -filter(flights, origin != "JFK") +filter(flights, carrier == "HA") +filter(flights, origin != "JFK") ``` The operator `%in%` is very useful to test if a value is in a list. ```{r filter_sup_inf, include=TRUE, eval=FALSE} -filter(flights, carrier %in% c("OO","AS")) -filter(flights, month %in% c(5,6,7,12)) +filter(flights, carrier %in% c("OO", "AS")) +filter(flights, month %in% c(5, 6, 7, 12)) ``` @@ -149,9 +150,9 @@ In R you can use the symbols `&` (and), `|` (or), `!` (not) and the function `xo Display the `long_flights` variable and predict the results of: ```{r logical_operators_exemples2, eval=FALSE} -filter(long_flights, day <= 15 & carrier == "HA") -filter(long_flights, day <= 15 | carrier == "HA") -filter(long_flights, (day <= 15 | carrier == "HA") & (! 
month > 2)) +filter(long_flights, day <= 15 & carrier == "HA") +filter(long_flights, day <= 15 | carrier == "HA") +filter(long_flights, (day <= 15 | carrier == "HA") & (!month > 2)) ``` @@ -163,9 +164,9 @@ filter(long_flights, (day <= 15 | carrier == "HA") & (! month > 2)) ```{r logical_operators_exemples2_sol, include=TRUE} long_flights -filter(long_flights, day <= 15 & carrier == "HA") -filter(long_flights, day <= 15 | carrier == "HA") -filter(long_flights, (day <= 15 | carrier == "HA") & (! month > 2)) +filter(long_flights, day <= 15 & carrier == "HA") +filter(long_flights, day <= 15 | carrier == "HA") +filter(long_flights, (day <= 15 | carrier == "HA") & (!month > 2)) ``` </p> </details> @@ -233,9 +234,10 @@ is.na(NA) `filter()` only includes rows where the condition is `TRUE`; it excludes both `FALSE` and `NA` values. If you want to preserve missing values, ask for them explicitly: ```{r filter_logical_operators_test_NA2, include=TRUE} -df <- tibble( x = c("A","B","C"), - y = c(1, NA, 3) - ) +df <- tibble( + x = c("A", "B", "C"), + y = c(1, NA, 3) +) df filter(df, y > 1) filter(df, is.na(y) | y > 1) @@ -279,7 +281,7 @@ Why is `NA ^ 0` not missing? Why is `NA | TRUE` not missing? 
Why is `FALSE & NA` <p> ```{r filter_chalenges_d, eval=TRUE} -NA ^ 0 # ^ 0 is always 1 it's an arbitrary rule not a computation +NA^0 # ^ 0 is always 1 it's an arbitrary rule not a computation NA | TRUE # if a member of a OR operation is TRUE the results is TRUE FALSE & NA # if a member of a AND operation is FALSE the results is FALSE NA * 0 # here we have a true computation @@ -308,9 +310,10 @@ arrange(flights, distance, desc(dep_delay)) Missing values are always sorted at the end: ```{r arrange_NA, include=TRUE} -df <- tibble( x = c("A","B","C"), - y = c(1, NA, 3) - ) +df <- tibble( + x = c("A", "B", "C"), + y = c(1, NA, 3) +) df arrange(df, y) @@ -406,10 +409,10 @@ colnames(df_dep_arr) ```{r challenge_select_a1, eval=FALSE} select(flights, dep_time, dep_delay, arr_time, arr_delay) -select(flights, starts_with("dep"), starts_with("arr") ) -select(flights, starts_with("dep") | starts_with("arr") ) -select(flights, matches("^(dep|arr)") ) -select(flights, dep_time : arr_delay & !starts_with("sched")) +select(flights, starts_with("dep"), starts_with("arr")) +select(flights, starts_with("dep") | starts_with("arr")) +select(flights, matches("^(dep|arr)")) +select(flights, dep_time:arr_delay & !starts_with("sched")) ``` </p> </details> @@ -485,7 +488,7 @@ It's often useful to add new columns that are functions of existing columns. 
Tha First let's create a thinner dataset to work on `flights_thin` that contains: - columns from `year` to `day` -- columns that ends with `delays` +- columns that end with `delay` - the `distance` and `air_time` columns - the `dep_time` and `sched_dep_time` columns @@ -500,9 +503,9 @@ Then let's create an even smaller toy dataset to test your commands before using <p> ```{r mutate, include=TRUE} -(flights_thin <- select(flights, year:day, ends_with("delay"), distance, air_time, contains("dep_time"))) -(flights_thin_toy <- head(flights_thin, n=5)) -(flights_thin_toy2 <- sample_n(flights_thin, size=5)) +(flights_thin <- select(flights, year:day, ends_with("delay"), distance, air_time, contains("dep_time"))) +(flights_thin_toy <- head(flights_thin, n = 5)) +(flights_thin_toy2 <- sample_n(flights_thin, size = 5)) ``` </p> </details> @@ -556,7 +559,7 @@ Currently `dep_time` and `sched_dep_time` are convenient to look at, but difficu ```{r mutate_exemple, include=TRUE} HH <- 2003 %/% 100 HH -MM <- 2003 %% 100 +MM <- 2003 %% 100 MM HH * 60 + MM ``` @@ -585,7 +588,8 @@ mutate( HH = dep_time %/% 100, MM = dep_time %% 100, dep_time2 = HH * 60 + MM, - .after = "dep_time" ) + .after = "dep_time" +) ``` or `.keep = "used"` to keep only the columns used for the calculus which can be usefull for debugging, @@ -596,7 +600,8 @@ mutate( HH = dep_time %/% 100, MM = dep_time %% 100, dep_time2 = HH * 60 + MM, - .keep = "used" ) + .keep = "used" +) ``` In one row (or you can also remove columns HH and MM using select): @@ -604,8 +609,9 @@ In one row (or you can also remove columns HH and MM using select): ```{r mutate_challenges_a3, include=TRUE, eval = F} mutate( flights_thin_toy, - dep_time2 = dep_time %/% 100 * 60 + dep_time %% 100, - .after = "dep_time" ) + dep_time2 = dep_time %/% 100 * 60 + dep_time %% 100, + .after = "dep_time" +) ``` **Note**: You can also directly replace a column by the result of the mutate operation, @@ -613,7 +619,8 @@ mutate( ```{r mutate_challenges_a4, 
include=TRUE, eval = F} mutate( flights_thin_toy, - dep_time = dep_time * 60 + dep_time) + dep_time = dep_time * 60 + dep_time +) ``` </p> </details> @@ -624,10 +631,8 @@ mutate( ```{r mutate_challenges_b, eval=F, message=F, cache=T} mutate( flights, - dep_time = (dep_time %/% 100) * 60 + - dep_time %% 100, - sched_dep_time = (sched_dep_time %/% 100) * 60 + - sched_dep_time %% 100 + dep_time = (dep_time %/% 100) * 60 + dep_time %% 100, + sched_dep_time = (sched_dep_time %/% 100) * 60 + sched_dep_time %% 100 ) ``` @@ -671,7 +676,7 @@ Using `mpg` and the ggplot2 package, reproduce the graph studied in @sec-color-m Modify the colors representing the class of cars with the palettes `Dark2` of [RColorBrewer](https://www.datanovia.com/en/fr/blog/palette-de-couleurs-rcolorbrewer-de-a-a-z/), then `MononokeMedium` from [Ghibli](https://github.com/ewenme/ghibli). ```{r mpg_color} -ggplot(data = mpg, mapping = aes(x = displ, y = hwy, color = class)) + +ggplot(data = mpg, mapping = aes(x = displ, y = hwy, color = class)) + geom_point() ``` Go to the links to find the appropriate function: they are very similar between the two packages. @@ -680,13 +685,13 @@ Go to the links to find the appropriate function: they are very similar between <p> ```{r mpg_color1} -ggplot(data = mpg, mapping = aes(x = displ, y = hwy, color = class)) + +ggplot(data = mpg, mapping = aes(x = displ, y = hwy, color = class)) + geom_point() + scale_color_brewer(palette = "Dark2") ``` ```{r mpg_color2} -ggplot(data = mpg, mapping = aes(x = displ, y = hwy, color = class)) + +ggplot(data = mpg, mapping = aes(x = displ, y = hwy, color = class)) + geom_point() + scale_colour_ghibli_d("MononokeMedium") ``` @@ -744,9 +749,9 @@ Fit the samples on the x-axis and the genes on the y-axis. 
(DM1_tile_base <- ggplot(expr_DM1, aes(samples, Genes, fill = log1p(counts))) + geom_tile() + - labs(y="Genes", x = "Samples") + + labs(y = "Genes", x = "Samples") + theme( - axis.text.y = element_text(size= 6), + axis.text.y = element_text(size = 6), axis.text.x = element_text(size = 6, angle = 90) )) ``` @@ -825,11 +830,14 @@ With `mutate()` and `ifelse()` [fonctions](https://dplyr.tidyverse.org/reference <p> ```{r sig} -(tab.sig <- mutate(tab, - sig = baseMean > 20 & padj < 0.05 & abs(log2FoldChange) >= 1.5, - UpDown = ifelse(sig, ### we can use in the same mutate a column created by a previous line - ifelse(log2FoldChange > 0, "Up", "Down"), "NO") - ) +( + tab.sig <- mutate( + tab, + sig = baseMean > 20 & padj < 0.05 & abs(log2FoldChange) >= 1.5, + UpDown = ifelse(sig, ### we can use in the same mutate a column created by a previous line + ifelse(log2FoldChange > 0, "Up", "Down"), "NO" + ) + ) ) ``` </p> @@ -888,13 +896,13 @@ To make the graph below, use `ggplot2`, the functions `geom_point()`, `geom_hlin ```{r VolcanoPlotDemo, echo = FALSE} ggplot(tab.sig, aes(x = log2FoldChange, y = -log10(padj), color = UpDown)) + geom_point() + - scale_color_manual(values=c("steelblue", "lightgrey", "firebrick" )) + - geom_hline(yintercept=-log10(0.05), col="black") + - geom_vline(xintercept=c(-1.5, 1.5), col="black") + + scale_color_manual(values = c("steelblue", "lightgrey", "firebrick")) + + geom_hline(yintercept = -log10(0.05), col = "black") + + geom_vline(xintercept = c(-1.5, 1.5), col = "black") + theme_minimal() + - theme(legend.position="none") + - labs(y="-log10(p-value)", x = "log2(FoldChange)") + - geom_label_repel(data = top10, mapping = aes(label = gene_symbol)) + theme(legend.position = "none") + + labs(y = "-log10(p-value)", x = "log2(FoldChange)") + + geom_label_repel(data = top10, mapping = aes(label = gene_symbol)) ``` @@ -904,13 +912,13 @@ ggplot(tab.sig, aes(x = log2FoldChange, y = -log10(padj), color = UpDown)) + ```{r VolcanoPlotSolut, echo = TRUE, 
results = 'hide'} ggplot(tab.sig, aes(x = log2FoldChange, y = -log10(padj), color = UpDown)) + geom_point() + - scale_color_manual(values=c("steelblue", "lightgrey", "firebrick" )) + - geom_hline(yintercept=-log10(0.05), col="black") + - geom_vline(xintercept=c(-1.5, 1.5), col="black") + + scale_color_manual(values = c("steelblue", "lightgrey", "firebrick")) + + geom_hline(yintercept = -log10(0.05), col = "black") + + geom_vline(xintercept = c(-1.5, 1.5), col = "black") + theme_minimal() + - theme(legend.position="none") + - labs(y="-log10(p-value)", x = "log2(FoldChange)") + - geom_label_repel(data = top10, mapping = aes(label = gene_symbol)) + theme(legend.position = "none") + + labs(y = "-log10(p-value)", x = "log2(FoldChange)") + + geom_label_repel(data = top10, mapping = aes(label = gene_symbol)) ``` </p> diff --git a/session_5/session_5.Rmd b/session_5/session_5.Rmd index 7dbb74f..b2d1843 100644 --- a/session_5/session_5.Rmd +++ b/session_5/session_5.Rmd @@ -7,12 +7,13 @@ date: "2022" ```{r include=FALSE} library(fontawesome) -if("conflicted" %in% .packages()) - conflicted::conflicts_prefer(dplyr::filter) +if ("conflicted" %in% .packages()) { + conflicted::conflicts_prefer(dplyr::filter) +} ``` ```{r setup, include=FALSE} -rm(list=ls()) +rm(list = ls()) knitr::opts_chunk$set(echo = TRUE) knitr::opts_chunk$set(comment = NA) ``` @@ -49,8 +50,10 @@ Find the 10 most delayed flights using the ranking function `min_rank()`. 
<details><summary>Solution</summary> <p> ```{r pipe_example_a, include=TRUE} -flights_md <- mutate(flights, - most_delay = min_rank(desc(dep_delay))) +flights_md <- mutate( + flights, + most_delay = min_rank(desc(dep_delay)) +) flights_md <- filter(flights_md, most_delay < 10) flights_md <- arrange(flights_md, most_delay) ``` @@ -70,9 +73,9 @@ Try to pipe operators to rewrite your precedent code with only **one** variable <p> ```{r pipe_example_b, include=TRUE} flights_md2 <- flights %>% - mutate(most_delay = min_rank(desc(dep_delay))) %>% - filter(most_delay < 10) %>% - arrange(most_delay) + mutate(most_delay = min_rank(desc(dep_delay))) %>% + filter(most_delay < 10) %>% + arrange(most_delay) ``` </p> </details> @@ -97,9 +100,9 @@ The `summarise()` function collapses a data frame to a single row. Check the difference between `summarise()` and `mutate()` with the following commands: ```{r load_data, eval=FALSE} -flights %>% +flights %>% mutate(delay = mean(dep_delay, na.rm = TRUE)) -flights %>% +flights %>% summarise(delay = mean(dep_delay, na.rm = TRUE)) ``` @@ -114,14 +117,14 @@ Then, when you use aggregation functions on the grouped data frame, they'll be a You can use the following code to compute the average delay per months across years. 
```{r summarise_group_by, include=TRUE, fig.width=8, fig.height=3.5} -flights_delay <- flights %>% - group_by(year, month) %>% - summarise(delay = mean(dep_delay, na.rm = TRUE), sd = sd(dep_delay, na.rm = TRUE)) %>% +flights_delay <- flights %>% + group_by(year, month) %>% + summarise(delay = mean(dep_delay, na.rm = TRUE), sd = sd(dep_delay, na.rm = TRUE)) %>% arrange(month) ggplot(data = flights_delay, mapping = aes(x = month, y = delay)) + - geom_bar(stat="identity", color="black", fill = "#619CFF") + - geom_errorbar(mapping = aes( ymin=0, ymax=delay+sd)) + + geom_bar(stat = "identity", color = "black", fill = "#619CFF") + + geom_errorbar(mapping = aes(ymin = 0, ymax = delay + sd)) + theme(axis.text.x = element_blank()) ``` @@ -136,8 +139,8 @@ You may have wondered about the `na.rm` argument we used above. What happens if </div> ```{r summarise_group_by_NA, include=TRUE} -flights %>% - group_by(dest) %>% +flights %>% + group_by(dest) %>% summarise( dist = mean(distance), delay = mean(arr_delay) @@ -151,20 +154,20 @@ Aggregation functions obey the usual rule of missing values: **if there's any mi Whenever you do any aggregation, it's always a good idea to include a count (`n()`). This way, you can check that you're not drawing conclusions based on very small amounts of data. 
```{r summarise_group_by_count, include = T, echo=F, warning=F, message=F, fig.width=8, fig.height=3.5} -summ_delay_filghts <- flights %>% - group_by(dest) %>% +summ_delay_filghts <- flights %>% + group_by(dest) %>% summarise( n_flights = n(), avg_distance = mean(distance, na.rm = TRUE), avg_delay = mean(arr_delay, na.rm = TRUE) - ) %>% - filter(dest != "HNL") %>% + ) %>% + filter(dest != "HNL") %>% filter(avg_delay < 40 & avg_delay > -20) ggplot(summ_delay_filghts, mapping = aes(x = avg_distance, y = avg_delay, size = n_flights)) + geom_point() + geom_smooth(method = lm, se = FALSE) + - theme(legend.position='none') + theme(legend.position = "none") ``` <div class="pencadre"> @@ -183,19 +186,19 @@ Here are three steps to prepare those data: <details><summary>Solution</summary> <p> ```{r summarise_group_by_count_b, include = T, eval=F, warning=F, message=F, fig.width=8, fig.height=3.5} -flights %>% - group_by(dest) %>% +flights %>% + group_by(dest) %>% summarise( n_flights = n(), avg_distance = mean(distance, na.rm = TRUE), avg_delay = mean(arr_delay, na.rm = TRUE) - ) %>% - filter(dest != "HNL") %>% - filter(avg_delay < 40 & avg_delay > -20) %>% + ) %>% + filter(dest != "HNL") %>% + filter(avg_delay < 40 & avg_delay > -20) %>% ggplot(mapping = aes(x = avg_distance, y = avg_delay, size = n_flights)) + geom_point() + geom_smooth(method = lm, se = FALSE) + - theme(legend.position='none') + theme(legend.position = "none") ``` </p> </details> @@ -210,8 +213,8 @@ Try the following example </div> ```{r ungroup, eval=T, message=FALSE, cache=T} -flights %>% - group_by(year, month, day) %>% +flights %>% + group_by(year, month, day) %>% ungroup() %>% summarise(delay = mean(dep_delay, na.rm = TRUE)) ``` @@ -241,13 +244,13 @@ Look at the number of canceled flights per day. Is there a pattern? 
<details><summary>Solution</summary> <p> ```{r grouping_challenges_a, eval=T, message=FALSE, cache=T} -flights %>% +flights %>% mutate( canceled = is.na(dep_time) | is.na(arr_time) - ) %>% - filter(canceled) %>% - mutate(wday = strftime(time_hour,'%A')) %>% - group_by(wday) %>% + ) %>% + filter(canceled) %>% + mutate(wday = strftime(time_hour, "%A")) %>% + group_by(wday) %>% summarise( cancel_day = n() ) %>% @@ -266,17 +269,17 @@ Is the proportion of canceled flights by day of the week related to the average <details><summary>Solution</summary> <p> ```{r grouping_challenges_b1, eval=T, message=FALSE, cache=T, fig.width=8, fig.height=3.5} -flights %>% +flights %>% mutate( canceled = is.na(dep_time) | is.na(arr_time) - ) %>% - mutate(wday = strftime(time_hour,'%A')) %>% - group_by(wday) %>% + ) %>% + mutate(wday = strftime(time_hour, "%A")) %>% + group_by(wday) %>% summarise( - prop_cancel_day = sum(canceled)/n(), + prop_cancel_day = sum(canceled) / n(), av_delay = mean(dep_delay, na.rm = TRUE) ) %>% - ungroup() %>% + ungroup() %>% ggplot(mapping = aes(x = av_delay, y = prop_cancel_day, color = wday)) + geom_point() ``` @@ -293,33 +296,37 @@ Brainstorm a way to have access to the mean and standard deviation or the `prop_ <details><summary>Solution</summary> <p> ```{r grouping_challenges_b2, eval=T, message=FALSE, cache=T, fig.width=8, fig.height=3.5} -flights %>% +flights %>% mutate( canceled = is.na(dep_time) | is.na(arr_time) - ) %>% - mutate(wday = strftime(time_hour,'%A')) %>% - group_by(day) %>% + ) %>% + mutate(wday = strftime(time_hour, "%A")) %>% + group_by(day) %>% mutate( - prop_cancel_day = sum(canceled)/sum(!canceled), + prop_cancel_day = sum(canceled) / sum(!canceled), av_delay = mean(dep_delay, na.rm = TRUE) ) %>% - group_by(wday) %>% + group_by(wday) %>% summarize( mean_cancel_day = mean(prop_cancel_day, na.rm = TRUE), sd_cancel_day = sd(prop_cancel_day, na.rm = TRUE), mean_av_delay = mean(av_delay, na.rm = TRUE), sd_av_delay = sd(av_delay, na.rm = 
TRUE) - ) %>% + ) %>% ggplot(mapping = aes(x = mean_av_delay, y = mean_cancel_day, color = wday)) + geom_point() + - geom_errorbarh(mapping = aes( - xmin = -sd_av_delay + mean_av_delay, - xmax = sd_av_delay + mean_av_delay - )) + - geom_errorbar(mapping = aes( - ymin = -sd_cancel_day + mean_cancel_day, - ymax = sd_cancel_day + mean_cancel_day - )) + geom_errorbarh( + mapping = aes( + xmin = -sd_av_delay + mean_av_delay, + xmax = sd_av_delay + mean_av_delay + ) + ) + + geom_errorbar( + mapping = aes( + ymin = -sd_cancel_day + mean_cancel_day, + ymax = sd_cancel_day + mean_cancel_day + ) + ) ``` </p> </details> @@ -331,21 +338,26 @@ Now that you are aware of the interest of using `geom_errorbar`, what `hour` of <details><summary>Solution</summary> <p> ```{r group_filter_b3, eval=T, warning=F, message=FALSE, cache=T, fig.width=8, fig.height=3.5} -flights %>% - group_by(hour) %>% +flights %>% + group_by(hour) %>% summarise( mean_delay = mean(arr_delay, na.rm = T), sd_delay = sd(arr_delay, na.rm = T), - ) %>% + ) %>% ggplot() + - geom_errorbar(mapping = aes( - x = hour, - ymax = mean_delay + sd_delay, - ymin = mean_delay - sd_delay)) + - geom_point(mapping = aes( - x = hour, - y = mean_delay, - )) + geom_errorbar( + mapping = aes( + x = hour, + ymax = mean_delay + sd_delay, + ymin = mean_delay - sd_delay + ) + ) + + geom_point( + mapping = aes( + x = hour, + y = mean_delay, + ) + ) ``` </p> </details> @@ -359,8 +371,8 @@ Which carrier has the worst delays? <details><summary>Solution</summary> <p> ```{r grouping_challenges_c2, eval=F, echo = T, message=FALSE, cache=T} -flights %>% - group_by(carrier) %>% +flights %>% + group_by(carrier) %>% summarise( carrier_delay = mean(arr_delay, na.rm = T) ) %>% @@ -378,8 +390,8 @@ Can you disentangle the effects of bad airports vs. bad carriers? 
(Hint: think a <details><summary>Solution</summary> <p> ```{r grouping_challenges_c1, eval=F, echo = T, message=FALSE, cache=T} -flights %>% - group_by(carrier, dest) %>% +flights %>% + group_by(carrier, dest) %>% summarise( carrier_delay = mean(arr_delay, na.rm = T), number_of_flight = n() diff --git a/session_6/session_6.Rmd b/session_6/session_6.Rmd index 7d0c2e9..449f989 100644 --- a/session_6/session_6.Rmd +++ b/session_6/session_6.Rmd @@ -9,7 +9,7 @@ library(fontawesome) ``` ```{r setup, include=FALSE} -rm(list=ls()) +rm(list = ls()) knitr::opts_chunk$set(echo = TRUE) knitr::opts_chunk$set(comment = NA) ``` @@ -72,10 +72,12 @@ knitr::include_graphics('img/pivot_longer.png') ``` ```{r, eval = F} -wide_example <- tibble(X1 = c("A","B"), - X2 = c(1,2), - X3 = c(0.1,0.2), - X4 = c(10,20)) +wide_example <- tibble( + X1 = c("A", "B"), + X2 = c(1, 2), + X3 = c(0.1, 0.2), + X4 = c(10, 20) +) ``` If you have a wide dataset, such as `wide_example`, that you want to make longer, you will use the `pivot_longer()` function. @@ -84,13 +86,14 @@ You have to specify the names of the columns you want to pivot into longer forma ```{r, eval = F} wide_example %>% - pivot_longer(c(X2,X3,X4)) + pivot_longer(c(X2, X3, X4)) ``` ... 
or the reverse selection (-X1): ```{r, eval = F} -wide_example %>% pivot_longer(-X1) +wide_example %>% + pivot_longer(-X1) ``` You can specify the names of the columns where the data will be tidy (by default, it is `names` and `value`): @@ -129,10 +132,11 @@ For this we need to : - give the name of the variable stored in the cells of the columns years (`case`) ```{r pivot_longer, eval=T, message=T} -table4a %>% +table4a %>% pivot_longer(-country, - names_to = "year", - values_to = "case") + names_to = "year", + values_to = "case" + ) ``` </p> </details> @@ -148,8 +152,10 @@ If you have a long dataset, that you want to make wider, you will use the `pivot You have to specify which column contains the name of the output column (`names_from`), and which column contains the cell values from (`values_from`). ```{r, eval = F} -long_example %>% pivot_wider(names_from = V1, - values_from = V2) +long_example %>% pivot_wider( + names_from = V1, + values_from = V2 +) ``` @@ -167,9 +173,11 @@ The column `count` store two types of information: the `population` size of the You can use the `pivot_wider` function to make your table wider and have one observation per row and one variable per column. ```{r pivot_wider, eval=T, message=T} -table2 %>% - pivot_wider(names_from = type, - values_from = count) +table2 %>% + pivot_wider( + names_from = type, + values_from = count + ) ``` </p> </details> @@ -188,7 +196,7 @@ flights airlines airports weather -flights2 <- flights %>% +flights2 <- flights %>% select(year:day, hour, origin, dest, tailnum, carrier) ``` @@ -257,7 +265,7 @@ flights2 %>% The default, `by = NULL`, uses all variables that appear in both tables, the so called natural join. ```{r , eval=T} -flights2 %>% +flights2 %>% left_join(weather) ``` @@ -271,24 +279,24 @@ flights2 %>% If you want to join by data that are in two columns with different names, you must specify the correspondence with a named character vector: `by = c("a" = "b")`. 
This will match variable `a` in table `x` to variable `b` in table `y`. ```{r , eval=T, echo = T} -flights2 %>% +flights2 %>% left_join(airports, c("dest" = "faa")) ``` If two columns have identical names in the input tables but are not used in the join, they are automatically renamed with the suffix `.x` and `.y` because all column names must be different in the output table. ```{r , eval=T, echo = T} -flights2 %>% - left_join(airports, c("dest" = "faa")) %>% +flights2 %>% + left_join(airports, c("dest" = "faa")) %>% left_join(airports, c("origin" = "faa")) ``` You can change the suffix using the option `suffix` ```{r , eval=T, echo = T} -flights2 %>% - left_join(airports, by = c("dest" = "faa")) %>% - left_join(airports, by = c("origin" = "faa"), suffix = c(".dest",".origin")) +flights2 %>% + left_join(airports, by = c("dest" = "faa")) %>% + left_join(airports, by = c("origin" = "faa"), suffix = c(".dest", ".origin")) ``` ### Filtering joins @@ -302,7 +310,7 @@ Filtering joins match observations in the same way as mutating joins, but affect top_dest <- flights %>% count(dest, sort = TRUE) %>% head(10) -flights %>% +flights %>% semi_join(top_dest) ``` diff --git a/session_7/session_7.Rmd b/session_7/session_7.Rmd index 79dc194..3521cba 100644 --- a/session_7/session_7.Rmd +++ b/session_7/session_7.Rmd @@ -7,12 +7,13 @@ date: "2022" ```{r include=FALSE} library(fontawesome) -if("conflicted" %in% .packages()) - conflicted::conflicts_prefer(dplyr::filter) +if ("conflicted" %in% .packages()) { + conflicted::conflicts_prefer(dplyr::filter) +} ``` ```{r setup, include=FALSE} -rm(list=ls()) +rm(list = ls()) knitr::opts_chunk$set(echo = TRUE) knitr::opts_chunk$set(comment = NA) ``` @@ -350,10 +351,10 @@ identical(no_vowels_1, no_vowels_2) ```{r str_detecttibble, eval=T, cache=T} df <- tibble( - word = words, + word = words, i = seq_along(word) ) -df %>% +df %>% filter(str_detect(word, "x$")) ``` @@ -390,14 +391,14 @@ noun <- "(a|the) ([^ ]+)" has_noun <- sentences %>% 
str_subset(noun) %>% head(10) -has_noun %>% +has_noun %>% str_extract(noun) ``` `str_extract()` gives us the complete match; `str_match()` gives each individual component. ```{r noun_regex_match, eval=T, cache=T} -has_noun %>% +has_noun %>% str_match(noun) ``` @@ -410,7 +411,7 @@ has_noun %>% Instead of replacing with a fixed string, you can use back references to insert components of the match. In the following code, I flip the order of the second and third words. ```{r replacing_matches, eval=T, cache=T} -sentences %>% +sentences %>% str_replace("([^ ]+) ([^ ]+) ([^ ]+)", "\\1 \\3 \\2") %>% head(5) ``` @@ -425,7 +426,7 @@ sentences %>% ```{r splitting, eval=T, cache=T} sentences %>% - head(5) %>% + head(5) %>% str_split("\\s") ``` diff --git a/session_8/session_8.Rmd b/session_8/session_8.Rmd index a6aae5a..e875124 100644 --- a/session_8/session_8.Rmd +++ b/session_8/session_8.Rmd @@ -7,12 +7,13 @@ date: "2022" ```{r include=FALSE} library(fontawesome) -if("conflicted" %in% .packages()) - conflicted::conflicts_prefer(dplyr::filter) +if ("conflicted" %in% .packages()) { + conflicted::conflicts_prefer(dplyr::filter) +} ``` ```{r setup, include=FALSE} -rm(list=ls()) +rm(list = ls()) knitr::opts_chunk$set(echo = TRUE) knitr::opts_chunk$set(comment = NA) ``` @@ -61,7 +62,7 @@ You can fix both of these problems with a factor. ```{r sort_month_factor, eval=T, cache=T} month_levels <- c( - "Jan", "Feb", "Mar", "Apr", "May", "Jun", + "Jan", "Feb", "Mar", "Apr", "May", "Jun", "Jul", "Aug", "Sep", "Oct", "Nov", "Dec" ) y1 <- factor(x1, levels = month_levels) @@ -79,7 +80,9 @@ y2 Sometimes you'd prefer that the order of the levels match the order of the first appearance in the data. 
```{r inorder_month_factor, eval=T, cache=T} -f2 <- x1 %>% factor() %>% fct_inorder() +f2 <- x1 %>% + factor() %>% + fct_inorder() f2 levels(f2) ``` @@ -111,7 +114,8 @@ relig_summary <- gss_cat %>% tvhours = mean(tvhours, na.rm = TRUE), n = n() ) -ggplot(relig_summary, aes(x = tvhours, y = relig)) + geom_point() +ggplot(relig_summary, aes(x = tvhours, y = relig)) + + geom_point() ``` It is difficult to interpret this plot because there's no overall pattern. We can improve it by reordering the levels of the factor relig using `fct_reorder()`. `fct_reorder()` takes three arguments: @@ -131,7 +135,7 @@ As you start making more complicated transformations, I would recommend moving t relig_summary %>% mutate(relig = fct_reorder(relig, tvhours)) %>% ggplot(aes(x = tvhours, y = relig)) + - geom_point() + geom_point() ``` ## `fct_reorder2()` @@ -174,6 +178,6 @@ For example [rmarkdown](https://rmarkdown.rstudio.com/) is a great way to turn y - [a comprehensive guide](https://bookdown.org/yihui/rmarkdown/) - [the cheatsheet](https://raw.githubusercontent.com/rstudio/cheatsheets/main/rmarkdown-2.0.pdf) -In addition most packages will provide **vignette**s on how to perform an analysis from scratch. On the [cran.r-project.org](https://cran.r-project.org/web/packages/ggplot2/index.html) or [bioconductor.org](http://www.bioconductor.org/packages/release/bioc/html/DESeq2.html) websites (specialised on R packages for biologists), you will have direct links to a package vignettes. +In addition, most packages will provide **vignette**s on how to perform an analysis from scratch. On the [cran.r-project.org](https://cran.r-project.org/web/packages/ggplot2/index.html) or [bioconductor.org](http://www.bioconductor.org/packages/release/bioc/html/DESeq2.html) (specialised in R packages for biologists) websites, you will find direct links to a package's vignettes. 
Finally, don't forget to search the web for your problems or error in R, for instance [stackoverflow](https://stackoverflow.com/) contains high quality and well-curated answers. \ No newline at end of file -- GitLab