From ebf0db051e6f5ca0cc6a9e00c669b8d0791c6288 Mon Sep 17 00:00:00 2001 From: Laurent Modolo <laurent.modolo@ens-lyon.fr> Date: Thu, 9 Sep 2021 15:31:31 +0200 Subject: [PATCH] code cleanup --- session_2/slides.Rmd | 404 ------------------------------------ session_2/tp.R | 0 session_2/tp.md | 0 session_3/slides.Rmd | 279 ------------------------- session_3/tp.R | 0 session_3/tp.md | 0 session_4/challengeTime.Rmd | 139 ------------- session_4/slides.Rmd | 315 ---------------------------- 8 files changed, 1137 deletions(-) delete mode 100644 session_2/slides.Rmd delete mode 100644 session_2/tp.R delete mode 100644 session_2/tp.md delete mode 100644 session_3/slides.Rmd delete mode 100644 session_3/tp.R delete mode 100644 session_3/tp.md delete mode 100644 session_4/challengeTime.Rmd delete mode 100644 session_4/slides.Rmd diff --git a/session_2/slides.Rmd b/session_2/slides.Rmd deleted file mode 100644 index 3695714..0000000 --- a/session_2/slides.Rmd +++ /dev/null @@ -1,404 +0,0 @@ ---- -title: "R#2: introduction to Tidyverse" -author: "Laurent Modolo [laurent.modolo@ens-lyon.fr](mailto:laurent.modolo@ens-lyon.fr)" -date: "24 Oct 2019" -output: - slidy_presentation: - highlight: tango - beamer_presentation: - theme: metropolis - slide_level: 3 - fig_caption: no - df_print: tibble - highlight: tango - latex_engine: xelatex ---- - -```{r setup, include=FALSE, cache=TRUE} -knitr::opts_chunk$set(echo = FALSE) -library(tidyverse) -tmp <- tempfile(fileext = ".zip") -download.file("http://www.fueleconomy.gov/feg/epadata/vehicles.csv.zip", - tmp, - quiet = TRUE) -unzip(tmp, exdir = "data-raw") -new_class_level <- c( - "Compact Cars", - "Large Cars", - "Midsize Cars", - "Midsize Cars", - "Midsize Cars", - "Compact Cars", - "Minivan", - "Minivan", - "Pickup Trucks", - "Pickup Trucks", - "Pickup Trucks", - "Sport Utility Vehicle", - "Sport Utility Vehicle", - "Compact Cars", - "Special Purpose Vehicle", - "Special Purpose Vehicle", - "Special Purpose Vehicle", - "Special 
Purpose Vehicle", - "Special Purpose Vehicle", - "Special Purpose Vehicle", - "Sport Utility Vehicle", - "Sport Utility Vehicle", - "Pickup Trucks", - "Pickup Trucks", - "Pickup Trucks", - "Pickup Trucks", - "Sport Utility Vehicle", - "Sport Utility Vehicle", - "Compact Cars", - "Two Seaters", - "Vans", - "Vans", - "Vans", - "Vans" -) -new_fuel_level <- c( - "gas", - "Diesel", - "Regular", - "gas", - "gas", - "Regular", - "Regular", - "Hybrid", - "Hybrid", - "Regular", - "Regular", - "Hybrid", - "Hybrid" -) -read_csv("data-raw/vehicles.csv") %>% - select( - "id", - "make", - "model", - "year", - "VClass", - "trany", - "drive", - "cylinders", - "displ", - "fuelType", - "highway08", - "city08" - ) %>% - rename( - "class" = "VClass", - "trans" = "trany", - "drive" = "drive", - "cyl" = "cylinders", - "displ" = "displ", - "fuel" = "fuelType", - "hwy" = "highway08", - "cty" = "city08" - ) %>% - filter(drive != "") %>% - drop_na() %>% - arrange(make, model, year) %>% - mutate(class = factor(as.factor(class), labels = new_class_level)) %>% - mutate(fuel = factor(as.factor(fuel), labels = new_fuel_level)) %>% - write_csv("2_data.csv") -``` - -## R#2: introduction to Tidyverse -The goal of this practical is to familiarize yourself with `ggplot2`. - -The objectives of this session will be to: - -- Create basic plot with `ggplot2` -- Understand the `tibble` type -- Learn the different aesthetics in R plots -- Compose graphics - -## Tidyverse - -The tidyverse is a collection of R packages designed for data science. - -All packages share an underlying design philosophy, grammar, and data structures. - -```{r install_tidyverse, cache = TRUE, eval = FALSE} -install.packages("tidyverse") -``` - -```{r load_tidyverse, cache = TRUE} -library("tidyverse") -``` - - -## Toy data set `mpg` - -This dataset contains a subset of the fuel economy data that the EPA makes available on **http://fueleconomy.gov**. It contains only models which had a new release every year between 1999 and 2008. 
- - -```{r mpg_inspect, cache = TRUE, eval=FALSE} -?mpg -mpg -dim(mpg) -View(mpg) -``` - -## Updated version of the data - -`mpg` is loaded with tidyverse, we want to be able to read our own data from -**http://perso.ens-lyon.fr/laurent.modolo/R/2_data.csv** - -```{r mpg_download, cache=TRUE, message=FALSE} -new_mpg <- read_csv( - "http://perso.ens-lyon.fr/laurent.modolo/R/2_data.csv" - ) -``` - -**http://perso.ens-lyon.fr/laurent.modolo/R/2_a** - -## First plot with `ggplot2` - -Relationship between engine size `displ` and fuel efficiency `hwy`. -```{r new_mpg_plot_a, cache = TRUE, fig.width=8, fig.height=4.5} -ggplot(data = new_mpg) + - geom_point(mapping = aes(x = displ, y = hwy)) -``` - -## Composition of plot with `ggplot2` - -Composition of plot with `ggplot2` - -```R -ggplot(data = <DATA>) + - <GEOM_FUNCTION>(mapping = aes(<MAPPINGS>)) -``` - -- you begin a plot with the function `ggplot()` -- you complete your graph by adding one or more layers -- `geom_point()` adds a layer with a scatterplot -- each geom function in `ggplot2` takes a `mapping` argument -- the `mapping` argument is always paired with `aes()` - - -## First challenge - -- Run `ggplot(data = new_mpg)`. What do you see? -- How many rows are in `new_mpg`? How many columns? -- What does the `cty` variable describe? Read the help for `?mpg` to find out. -- Make a scatterplot of `hwy` vs. `cyl`. -- What happens if you make a scatterplot of `class` vs. `drive`? Why is the plot not useful? - -## Run `ggplot(data = mpg)`. What do you see? - -```{r empty_plot, cache = TRUE, fig.width=8, fig.height=4.5} -ggplot(data = new_mpg) -``` - -## How many rows are in `new_mpg`? How many columns? - -```{r size_of_mpg, cache = TRUE, fig.width=8, fig.height=4.5} -new_mpg -``` - -## Make a scatterplot of `hwy` vs. `cyl`. 
- -```{r new_mpg_plot_b, cache = TRUE, fig.width=8, fig.height=4.5} -ggplot(data = new_mpg) + - geom_point(mapping = aes(x = hwy, y = cyl)) -``` - -## What happens if you make a scatterplot of `class` vs. `drive`? -Why is the plot not useful? - -```{r new_mpg_plot_c, cache = TRUE, fig.width=8, fig.height=4.5} -ggplot(data = new_mpg) + - geom_point(mapping = aes(x = class, y = drive)) -``` - -## Aesthetic mappings - -How can you explain these cars? - -```{r new_mpg_plot_d, echo = FALSE, cache = TRUE, fig.width=8, fig.height=4.5} -ggplot(data = mpg) + - geom_point(mapping = aes(x = displ, y = hwy)) + - geom_point(data = mpg %>% filter(class == "2seater"), - mapping = aes(x = displ, y = hwy), color = "red") -``` - -## Aesthetic mapping `color` - -```{r new_mpg_plot_e, cache = TRUE, fig.width=8, fig.height=4.5} -ggplot(data = mpg) + - geom_point(mapping = aes(x = displ, y = hwy, color = class)) -``` - -## Aesthetic mappings - -`ggplot2` will automatically assign a unique level of the aesthetic (here a unique color) to each unique value of the variable, a process known as scaling. `ggplot2` will also add a legend that explains which levels correspond to which values. - -Try the following aesthetic: - -- `size` -- `alpha` -- `shape` - -## Aesthetic mapping `size` - -```{r new_mpg_plot_f, cache = TRUE, fig.width=8, fig.height=4.5, warning=FALSE} -ggplot(data = mpg) + - geom_point(mapping = aes(x = displ, y = hwy, size = class)) -``` - -## Aesthetic mapping `alpha` - -```{r new_mpg_plot_g, cache = TRUE, fig.width=8, fig.height=4.5, warning=FALSE} -ggplot(data = mpg) + - geom_point(mapping = aes(x = displ, y = hwy, alpha = class)) -``` - -## Aesthetic mapping `shape` - -```{r new_mpg_plot_h, cache = TRUE, fig.width=8, fig.height=4.5, warning=FALSE} -ggplot(data = mpg) + - geom_point(mapping = aes(x = displ, y = hwy, shape = class)) -``` - -## Aesthetic - -You can also set the aesthetic properties of your geom manually. 
For example, we can make all of the points in our plot blue: - -```{r new_mpg_plot_i, cache = TRUE, fig.width=8, fig.height=4.5} -ggplot(data = mpg) + - geom_point(mapping = aes(x = displ, y = hwy), color = "blue") -``` - -## Second challenge - -- What’s gone wrong with this code? Why are the points not blue? - -```R -ggplot(data = mpg) + - geom_point(mapping = aes(x = displ, y = hwy, color = "blue")) -``` - -- Which variables in `mpg` are **categorical**? Which variables are **continuous**? (Hint: type `mpg`) -- Map a **continuous** variable to color, size, and shape. -- What does the `stroke` aesthetic do? What shapes does it work with? (Hint: use ?geom_point) -- What happens if you map an aesthetic to something other than a variable name, like `color = displ < 5`? - -## Facets - -```{r new_mpg_plot_j, cache = TRUE, fig.width=8, fig.height=4.5} -ggplot(data = mpg) + - geom_point(mapping = aes(x = displ, y = hwy)) + - facet_wrap(~class) -``` - -## Facets - -```{r new_mpg_plot_k, cache = TRUE, fig.width=8, fig.height=4.5} -ggplot(data = mpg) + - geom_point(mapping = aes(x = displ, y = hwy)) + - facet_wrap(~class, nrow = 2) -``` - -## Facets - -```{r new_mpg_plot_l, cache = TRUE, fig.width=8, fig.height=4.5} -ggplot(data = mpg) + - geom_point(mapping = aes(x = displ, y = hwy)) + - facet_wrap(~ fl + class, nrow = 2) -``` - -## Composition - -There are different ways to represent the information - -```{r new_mpg_plot_o, cache = TRUE, fig.width=8, fig.height=4.5} -ggplot(data = mpg) + - geom_point(mapping = aes(x = displ, y = hwy)) -``` - -## Composition - -There are different ways to represent the information - -```{r new_mpg_plot_p, cache = TRUE, fig.width=8, fig.height=4.5, message=FALSE} -ggplot(data = mpg) + - geom_smooth(mapping = aes(x = displ, y = hwy)) -``` - - -## Composition - -We can add as many layers as we want - -```{r new_mpg_plot_q, cache = TRUE, fig.width=8, fig.height=4.5, message=FALSE} -ggplot(data = mpg) + - geom_point(mapping = aes(x = displ, y = 
hwy)) + - geom_smooth(mapping = aes(x = displ, y = hwy)) -``` - - -## Composition - -We can avoid code duplication - -```{r new_mpg_plot_r, cache = TRUE, fig.width=8, fig.height=4.5, message=FALSE} -ggplot(data = mpg, mapping = aes(x = displ, y = hwy)) + - geom_point() + - geom_smooth() -``` - - -## Composition - -We can make `mapping` layer specific - -```{r new_mpg_plot_s, cache = TRUE, fig.width=8, fig.height=4.5, message=FALSE} -ggplot(data = mpg, mapping = aes(x = displ, y = hwy)) + - geom_point(mapping = aes(color = class)) + - geom_smooth() -``` - -## Composition - -We can use different `data` for different layers (you will learn more about `filter()` later) - -```{r new_mpg_plot_t, cache = TRUE, fig.width=8, fig.height=4.5, message=FALSE} -ggplot(data = mpg, mapping = aes(x = displ, y = hwy)) + - geom_point(mapping = aes(color = class)) + - geom_smooth(data = filter(mpg, class == "subcompact")) -``` - -## Third challenge - -- Run this code in your head and predict what the output will look like. Then, run the code in R and check your predictions. -```R -ggplot(data = mpg, mapping = aes(x = displ, y = hwy, color = drv)) + - geom_point() + - geom_smooth(se = FALSE) -``` -**http://perso.ens-lyon.fr/laurent.modolo/R/2_d** - -- What does `show.legend = FALSE` do? -- What does the `se` argument to `geom_smooth()` do?
- -## Third challenge - -- Recreate the R code necessary to generate the following graph - -```{r new_mpg_plot_u, echo = FALSE, cache = TRUE, fig.width=8, fig.height=4.5, message=FALSE} -ggplot(data = mpg, mapping = aes(x = displ, y = hwy, color = drv)) + - geom_point() + - geom_smooth(mapping = aes(linetype = drv)) -``` - -## Third challenge - -```{r new_mpg_plot_v, cache = TRUE, fig.width=8, fig.height=4.5, message=FALSE} -ggplot(data = mpg, mapping = aes(x = displ, y = hwy, color = drv)) + - geom_point() + - geom_smooth(mapping = aes(linetype = drv)) -``` \ No newline at end of file diff --git a/session_2/tp.R b/session_2/tp.R deleted file mode 100644 index e69de29..0000000 diff --git a/session_2/tp.md b/session_2/tp.md deleted file mode 100644 index e69de29..0000000 diff --git a/session_3/slides.Rmd b/session_3/slides.Rmd deleted file mode 100644 index 78123e1..0000000 --- a/session_3/slides.Rmd +++ /dev/null @@ -1,279 +0,0 @@ ---- -title: "R#3: stats with ggplot2" -author: "Laurent Modolo [laurent.modolo@ens-lyon.fr](mailto:laurent.modolo@ens-lyon.fr)" -date: "08 Nov 2019" -output: - slidy_presentation: - highlight: tango - beamer_presentation: - theme: metropolis - slide_level: 3 - fig_caption: no - df_print: tibble - highlight: tango - latex_engine: xelatex ---- - -```{r setup, include=FALSE, cache=TRUE} -knitr::opts_chunk$set(echo = FALSE) -library(tidyverse) -``` - -## R#3: stats with ggplot2 -The goal of this practical is to practice advanced features of `ggplot2`. - -The objectives of this session will be to: - -- learn about statistical transformations -- practice position adjustments -- change the coordinate systems - -## `ggplot2` statistical transformations - -We are going to use the `diamonds` data set included in `tidyverse`. - -- Use the `help` and `View` commands to explore this data set. -- Try the `str` command, what information is displayed?
- -## `ggplot2` statistical transformations - -```{r diamonds_barplot, cache = TRUE, fig.width=8, fig.height=4.5, message=FALSE} -ggplot(data = diamonds) + - geom_bar(mapping = aes(x = cut)) -``` - -More diamonds are available with high quality cuts. - -## `ggplot2` statistical transformations - -On the x-axis, the chart displays cut, a variable from diamonds. On the y-axis, it displays count, but count is not a variable in diamonds! - -The algorithm used to calculate new values for a graph is called a **stat**, short for statistical transformation. The figure below describes how this process works with `geom_bar()`. - -\includegraphics[width=\textwidth]{img/visualization-stat-bar.png} - -## `ggplot2` statistical transformations - -You can generally use geoms and stats interchangeably. For example, you can recreate the previous plot using `stat_count()` instead of `geom_bar()`: - -```{r diamonds_stat_count, eval=FALSE, message=FALSE} -ggplot(data = diamonds) + - stat_count(mapping = aes(x = cut)) -``` - -## `ggplot2` statistical transformations - -Every geom has a default stat; and every stat has a default geom. This means that you can typically use geoms without worrying about the underlying statistical transformation. There are three reasons you might need to use a stat explicitly: - -- You might want to override the default stat. **3_a** -- You might want to override the default mapping from transformed variables to aesthetics. **3_b** -- You might want to draw greater attention to the statistical transformation in your code. **3_c** - -## Statistical transformation challenge - -- What does `geom_col()` do? How is it different to `geom_bar()`? -- What variables does `stat_smooth()` compute? What parameters control its behaviour? -- In our proportion bar chart, we need to set `group = 1`. Why? In other words what is the problem with these two graphs? 
- -```{r diamonds_stats_challenge, eval=FALSE, message=FALSE} -ggplot(data = diamonds) + - geom_bar(mapping = aes(x = cut, y = ..prop..)) -ggplot(data = diamonds) + - geom_bar(mapping = aes(x = cut, fill = color, y = ..prop..)) -``` - -## Position adjustments -You can colour a bar chart using either the `colour` aesthetic, or, more usefully, `fill`: - -```{r diamonds_barplot_color, cache = TRUE, fig.width=8, fig.height=4.5, message=FALSE} -ggplot(data = diamonds) + - geom_bar(mapping = aes(x = cut, colour = cut)) -``` - -## Position adjustments -You can colour a bar chart using either the `colour` aesthetic, or, more usefully, `fill`: - -```{r diamonds_barplot_fill, cache = TRUE, fig.width=8, fig.height=4.5, message=FALSE} -ggplot(data = diamonds) + - geom_bar(mapping = aes(x = cut, fill = cut)) -``` - -## Position adjustments - -You can also use `fill` with another variable: - -```{r diamonds_barplot_fill_clarity, cache = TRUE, fig.width=8, fig.height=4.5, message=FALSE} -ggplot(data = diamonds) + - geom_bar(mapping = aes(x = cut, fill = clarity)) -``` - -## Position adjustments - -The stacking is performed by the position adjustment `position` - -```{r diamonds_barplot_pos_identity, cache = TRUE, fig.width=8, fig.height=4.5, message=FALSE} -ggplot(data = diamonds, - mapping = aes(x = cut, colour = clarity)) + - geom_bar(fill = NA, position = "identity") -``` - -## Position adjustments - -The stacking is performed by the position adjustment `position` - -```{r diamonds_barplot_pos_fill, cache = TRUE, fig.width=8, fig.height=4.5, message=FALSE} -ggplot(data = diamonds) + - geom_bar(mapping = aes(x = cut, fill = clarity), - position = "fill") -``` - -## Position adjustments - -The stacking is performed by the position adjustment `position` - -```{r diamonds_barplot_pos_dodge, cache = TRUE, fig.width=8, fig.height=4.5, message=FALSE} -ggplot(data = diamonds) + - geom_bar(mapping = aes(x = cut, fill = clarity), - position = "dodge") -``` - -## Position adjustments - 
-The stacking is performed by the position adjustment `position` - -```{r mpg_point_pos_jitter, cache = TRUE, fig.width=8, fig.height=4.5, message=FALSE} -ggplot(data = mpg) + - geom_point(mapping = aes(x = displ, y = hwy), - position = "jitter") -``` - -## Position adjustments - -The stacking is performed by the position adjustment `position` - -```{r mpg_jitter, cache = TRUE, fig.width=8, fig.height=4.5, message=FALSE} -ggplot(data = mpg) + - geom_jitter(mapping = aes(x = displ, y = hwy)) -``` - -## Position adjustments challenges - -- What is the problem with this plot? How could you improve it? -```{r mpg_point, eval=F, message=FALSE} -ggplot(data = mpg, mapping = aes(x = cty, y = hwy)) + - geom_point() -``` -- What parameters to `geom_jitter()` control the amount of jittering? -- Compare and contrast `geom_jitter()` with `geom_count()` -- What’s the default position adjustment for `geom_boxplot()` ? Create a visualisation of the `mpg` dataset that demonstrates it. - -## Coordinate systems - -Cartesian coordinate system where the x and y positions act independently to determine the location of each point. There are a number of other coordinate systems that are occasionally helpful. 
- -## Coordinate systems - -```{r mpg_boxplot, cache = TRUE, fig.width=8, fig.height=4.5, message=FALSE} -ggplot(data = mpg, mapping = aes(x = class, y = hwy)) + - geom_boxplot() -``` - -## Coordinate systems - -```{r mpg_boxplot_flip, cache = TRUE, fig.width=8, fig.height=4.5, message=FALSE} -ggplot(data = mpg, mapping = aes(x = class, y = hwy)) + - geom_boxplot() + - coord_flip() -``` - -## Coordinate systems - -```{r diamonds_bar, cache = TRUE, fig.width=8, fig.height=4.5, message=FALSE} -bar <- ggplot(data = diamonds) + - geom_bar( - mapping = aes(x = cut, fill = cut), - show.legend = FALSE, - width = 1 - ) + - theme(aspect.ratio = 1) + - labs(x = NULL, y = NULL) -``` -**3_d** - -## Coordinate systems - -```{r diamonds_bar_plot, echo=F, cache = TRUE, fig.width=8, fig.height=4.5, message=FALSE} -bar -``` - -**3_d** - -## Coordinate systems -```{r diamonds_bar_flip, cache = TRUE, fig.width=8, fig.height=4.5, message=FALSE} -bar + coord_flip() -``` - -## Coordinate systems - -```{r mpg_jitter_noquickmap, cache = TRUE, fig.width=8, fig.height=4.5, message=FALSE} -ggplot(data = mpg) + - geom_jitter(mapping = aes(x = cty, y = hwy)) -``` - -## Coordinate systems - -```{r mpg_jitter_quickmap, cache = TRUE, fig.width=3.5, fig.height=3.5, message=FALSE} -ggplot(data = mpg) + - geom_jitter(mapping = aes(x = cty, y = hwy)) + - coord_quickmap() -``` - -## Coordinate systems - -```{r mpg_jitter_log, cache = TRUE, fig.width=8.5, fig.height=3.5, message=FALSE} -ggplot(data = mpg) + - geom_jitter(mapping = aes(x = cty, y = hwy)) + - scale_y_log10() + - scale_x_log10() -``` - -## Coordinate systems -```{r diamonds_bar_polar, cache = TRUE, fig.width=5, fig.height=3.5, message=FALSE} -bar + coord_polar() -``` - -## Coordinate systems challenges - -- Turn a stacked bar chart into a pie chart using `coord_polar()`. -- What does `labs()` do? Read the documentation. -- What does the plot below tell you about the relationship between `city` and highway `mpg`? 
Why is `coord_fixed()` important? What does `geom_abline()` do? - -```{r mpg_point_fixed, eval = F, cache = TRUE, fig.width=4.5, fig.height=3.5, message=FALSE} -ggplot(data = mpg, mapping = aes(x = cty, y = hwy)) + - geom_point() + - geom_abline() + - coord_fixed() -``` - -## Coordinate systems challenges - -```{r diamonds_barplot_pos_fill_polar, cache = TRUE, fig.width=8, fig.height=4.5, message=FALSE} -ggplot(data = diamonds) + - geom_bar(mapping = aes(x = cut, fill = clarity), - position = "fill") + - coord_polar() -``` - -## Coordinate systems challenges - -```{r mpg_point_nofixed_plot, eval = T, cache = TRUE, fig.width=8, fig.height=3.5, message=FALSE} -ggplot(data = mpg, mapping = aes(x = cty, y = hwy)) + - geom_point() + geom_abline() -``` - -## Coordinate systems challenges - -```{r mpg_point_fixed_plot, eval = T, cache = TRUE, fig.width=8, fig.height=3.5, message=FALSE} -ggplot(data = mpg, mapping = aes(x = cty, y = hwy)) + - geom_point() + geom_abline() + coord_fixed() -``` diff --git a/session_3/tp.R b/session_3/tp.R deleted file mode 100644 index e69de29..0000000 diff --git a/session_3/tp.md b/session_3/tp.md deleted file mode 100644 index e69de29..0000000 diff --git a/session_4/challengeTime.Rmd b/session_4/challengeTime.Rmd deleted file mode 100644 index 1986436..0000000 --- a/session_4/challengeTime.Rmd +++ /dev/null @@ -1,139 +0,0 @@ ---- -title: "Challenge time!" 
-author: "Laurent Modolo [laurent.modolo@ens-lyon.fr](mailto:laurent.modolo@ens-lyon.fr), Hélène Polvèche [hpolveche@istem.fr](mailto:hpolveche@istem.fr)" -date: "Mars 2020" -output: - html_document: default - pdf_document: default ---- - <style type="text/css"> - h3 { /* Header 3 */ - position: relative ; - color: #729FCF ; - left: 5%; - } -h2 { /* Header 2 */ - color: darkblue ; - left: 10%; -} -h1 { /* Header 1 */ - color: #034b6f ; -} -#pencadre{ -border:1px; -border-style:solid; -border-color: #034b6f; - background-color: #EEF3F9; - padding: 1em; -text-align: center ; -border-radius : 5px 4px 3px 2px; -} -legend{ - color: #034b6f ; -} -#pquestion { -color: darkgreen; -font-weight: bold; -} -</style> - - ```{r setup, include=FALSE, cache=TRUE} -knitr::opts_chunk$set(echo = TRUE) -``` - - -### Filter challenges : - -Find all flights that: - - - Had an arrival delay of two or more hours -- Were operated by United, American, or Delta -- Departed between midnight and 6am (inclusive) - -Another useful dplyr filtering helper is `between()`. What does it do? Can you use it to simplify the code needed to answer the previous challenges? - -How many flights have a missing `dep_time`? What other variables are missing? What might these rows represent? - -Why is `NA ^ 0` not `NA`? Why is `NA | TRUE` not `NA`? Why is `FALSE & NA` not `NA`? Can you figure out the general rule? (`NA * 0` is a tricky counter-example!) - -### Arrange challenges : - -- Sort flights to find the most delayed flights. Find the flights that left earliest. -- Sort flights to find the fastest flights. -- Which flights traveled the longest? Which traveled the shortest? - -### Select challenges : - -- Brainstorm as many ways as possible to select `dep_time`, `dep_delay`, `arr_time`, and `arr_delay` from `flights`. -- What does the `one_of()` function do? Why might it be helpful in conjunction with this vector? 
-```{r select_one_of, eval=F, message=F, cache=T} -vars <- c("year", "month", "day", "dep_delay", "arr_delay") -``` -- Does the result of running the following code surprise you? How do the select helpers deal with case by default? How can you change that default? -```{r select_contains, eval=F, message=F, cache=T} -select(flights, contains("TIME")) -``` - - -### Mutate challenges : - -- Currently `dep_time` and `sched_dep_time` are convenient to look at, but hard to compute with because they’re not really continuous numbers. Convert them to a more convenient representation of number of minutes since midnight. - - -```{r mutate_challenges_a, eval=F, message=F, cache=T} -mutate( - flights, - dep_time = (dep_time %/% 100) * 60 + - dep_time %% 100, - sched_dep_time = (sched_dep_time %/% 100) * 60 + - sched_dep_time %% 100 -) -``` - -\ - -- Compare `dep_time`, `sched_dep_time`, and `dep_delay`. How would you expect those three numbers to be related? - -```{r mutate_challenge_b, eval=F, message=F, cache=T} -mutate( - flights, - dep_time = (dep_time %/% 100) * 60 + - dep_time %% 100, - sched_dep_time = (sched_dep_time %/% 100) * 60 + - sched_dep_time %% 100 -) -``` - -\ - -### Challenge with `summarise()` and `group_by()` - -Imagine that we want to explore the relationship between the distance and average delay for each location. -There are three steps to prepare this data: - -- Group flights by destination. -- Summarise to compute distance, average delay, and number of flights. -- Filter to remove noisy points and Honolulu airport, which is almost twice as far away as the next closest airport. - -```{r summarise_group_by_ggplot_a, eval = F} -flights %>% - group_by(dest) -``` - - \ - -Imagine that we want to explore the relationship between the distance and average delay for each location. - -- Filter to remove noisy points and Honolulu airport, which is almost twice as far away as the next closest airport.
- -```{r summarise_group_by_ggplot_b, eval = F} -flights %>% - group_by(dest) %>% - summarise( - count = n(), - dist = mean(distance, na.rm = TRUE), - delay = mean(arr_delay, na.rm = TRUE) - ) -``` - - diff --git a/session_4/slides.Rmd b/session_4/slides.Rmd deleted file mode 100644 index 3aec2bd..0000000 --- a/session_4/slides.Rmd +++ /dev/null @@ -1,315 +0,0 @@ ---- -title: "R#4: data transformation" -author: "Laurent Modolo [laurent.modolo@ens-lyon.fr](mailto:laurent.modolo@ens-lyon.fr)" -date: "08 Nov 2019" -output: - beamer_presentation: - theme: metropolis - slide_level: 3 - fig_caption: no - df_print: tibble - highlight: tango - latex_engine: xelatex - slidy_presentation: - highlight: tango ---- -```{r setup, include=FALSE, cache=TRUE} -knitr::opts_chunk$set(echo = FALSE) -library(tidyverse) -``` - -## R#4: data transformation -The goal of this practical is to practice data transformation with `tidyverse`. -The objectives of this session will be to: - -- Filter rows with `filter()` -- Arrange rows with `arrange()` -- Select columns with `select()` -- Add new variables with `mutate()` -- Combine multiple operations with the pipe `%>%` - -## **nycflights13** - -`nycflights13::flights` contains all 336,776 flights that departed from New York City in 2013. The data comes from the US Bureau of Transportation Statistics, and is documented in `?flights` - -```{r load_data, eval=T, message=FALSE, cache=T} -library(nycflights13) -library(tidyverse) -``` - - - -## **nycflights13** - -```{r display_data, eval=F, message=FALSE, cache=T} -flights -``` - -- **int** stands for integers. -- **dbl** stands for doubles, or real numbers. -- **chr** stands for character vectors, or strings. -- **dttm** stands for date-times (a date + a time). -- **lgl** stands for logical, vectors that contain only TRUE or FALSE. -- **fctr** stands for factors, which R uses to represent categorical variables with fixed possible values. -- **date** stands for dates.
- -## Filter rows with `filter()` - -`filter()` allows you to subset observations based on their values. - -```{r filter_month_day, eval=T, message=T, cache=T} -filter(flights, month == 1, day == 1) -``` - -## Filter rows with `filter()` - -`dplyr` functions never modify their inputs, so if you want to save the result, you’ll need to use the assignment operator, `<-` - -```{r filter_month_day_sav, eval=T, message=F, cache=T} -jan1 <- filter(flights, month == 1, day == 1) -``` - -R either prints out the results, or saves them to a variable. - -```{r filter_month_day_sav_display, eval=T, message=F, cache=T} -(dec25 <- filter(flights, month == 12, day == 25)) -``` - -## Logical operators - -Multiple arguments to `filter()` are combined with “and”: every expression must be true in order for a row to be included in the output. - -```{r logical_operator, echo=FALSE, out.width='100%'} -knitr::include_graphics('img/transform-logical.png') -``` - -## Logical operators - -Test the following operations: - -```{r filter_logical_operators, eval=T, message=F, cache=T} -filter(flights, month == 11 | month == 12) -filter(flights, month %in% c(11, 12)) -filter(flights, !(arr_delay > 120 | dep_delay > 120)) -filter(flights, arr_delay <= 120, dep_delay <= 120) -``` - -## Missing values - -One important feature of R that can make comparison tricky are missing values, or `NA`s (“not available”). - -```{r filter_logical_operators_NA, eval=T, message=T, cache=T} -NA > 5 -10 == NA -NA + 10 -NA / 2 -``` - -## Missing values - -```{r filter_logical_operators_test_NA, eval=T, message=T, cache=T} -NA == NA -is.na(NA) -``` - -## Filter challenges - -Find all flights that: - -- Had an arrival delay of two or more hours -- Were operated by United, American, or Delta -- Departed between midnight and 6am (inclusive) - -Another useful dplyr filtering helper is `between()`. What does it do? Can you use it to simplify the code needed to answer the previous challenges?
- -How many flights have a missing `dep_time`? What other variables are missing? What might these rows represent? - -Why is `NA ^ 0` not `NA`? Why is `NA | TRUE` not `NA`? Why is `FALSE & NA` not `NA`? Can you figure out the general rule? (`NA * 0` is a tricky counter-example!) - - -## Arrange rows with `arrange()` - -`arrange()` works similarly to `filter()` except that instead of selecting rows, it changes their order. - -```{r arrange_ymd, eval=F, message=F, cache=T} -arrange(flights, year, month, day) -``` - -Use `desc()` to re-order by a column in descending order: - -```{r arrange_desc, eval=F, message=F, cache=T} -arrange(flights, desc(dep_delay)) -``` - -Missing values are always sorted at the end: - -```{r arrange_NA, eval=F, message=F, cache=T} -arrange(tibble(x = c(5, 2, NA)), x) -arrange(tibble(x = c(5, 2, NA)), desc(x)) -``` - -## Arrange challenges - -- Sort flights to find the most delayed flights. Find the flights that left earliest. -- Sort flights to find the fastest flights. -- Which flights traveled the longest? Which traveled the shortest? - -## Select columns with `select()` - -`select()` allows you to rapidly zoom in on a useful subset using operations based on the names of the variables. - -```{r select_ymd, eval=F, message=F, cache=T} -select(flights, year, month, day) -select(flights, year:day) -select(flights, -(year:day)) -``` - -## Select columns with `select()` - -There are a number of helper functions you can use within `select()`: - -- `starts_with("abc")`: matches names that begin with “abc”. -- `ends_with("xyz")`: matches names that end with “xyz”. -- `contains("ijk")`: matches names that contain “ijk”. -- `matches("(.)\\1")`: selects variables that match a regular expression. This one matches any variables that contain repeated characters. You’ll learn more about regular expressions in strings. -- `num_range("x", 1:3)`: matches `x1`, `x2` and `x3`. - -See `?select` for more details.
- -## Select challenges - -- Brainstorm as many ways as possible to select `dep_time`, `dep_delay`, `arr_time`, and `arr_delay` from `flights`. -- What does the `one_of()` function do? Why might it be helpful in conjunction with this vector? -```{r select_one_of, eval=F, message=F, cache=T} -vars <- c("year", "month", "day", "dep_delay", "arr_delay") -``` -- Does the result of running the following code surprise you? How do the select helpers deal with case by default? How can you change that default? -```{r select_contains, eval=F, message=F, cache=T} -select(flights, contains("TIME")) -``` - -## Add new variables with `mutate()` - -It’s often useful to add new columns that are functions of existing columns. That’s the job of `mutate()`. - -```{r mutate, eval=F, message=F, cache=T} -flights_sml <- select(flights, - year:day, - ends_with("delay"), - distance, - air_time -) -mutate(flights_sml, - gain = dep_delay - arr_delay, - speed = distance / air_time * 60 -) -``` - -**4_a** - -## Add new variables with `mutate()` - -You can refer to columns that you’ve just created: - -```{r mutate_reuse, eval=F, message=F, cache=T} -mutate(flights, - gain = dep_delay - arr_delay, - hours = air_time / 60, - gain_per_hour = gain / hours -) -``` - -## Useful creation functions - -- Offsets: `lead()` and `lag()` allow you to refer to leading or lagging values. This allows you to compute running differences (e.g. `x - lag(x)`) or find when values change (`x != lag(x)`). -- Cumulative and rolling aggregates: R provides functions for running sums, products, mins and maxes: `cumsum()`, `cumprod()`, `cummin()`, `cummax()`; and dplyr provides `cummean()` for cumulative means. -- Logical comparisons, `<`, `<=`, `>`, `>=`, `!=`, and `==` -- Ranking: there are a number of ranking functions, but you should start with `min_rank()`. 
There is also `row_number()`, `dense_rank()`, `percent_rank()`, `cume_dist()`, `ntile()` - -## Mutate challenges - -- Currently `dep_time` and `sched_dep_time` are convenient to look at, but hard to compute with because they’re not really continuous numbers. Convert them to a more convenient representation of number of minutes since midnight. - -\pause - -```{r mutate_challenges_a, eval=F, message=F, cache=T} -mutate( - flights, - dep_time = (dep_time %/% 100) * 60 + - dep_time %% 100, - sched_dep_time = (sched_dep_time %/% 100) * 60 + - sched_dep_time %% 100 -) -``` - -**4_b** - -## Mutate challenges - -- Compare `dep_time`, `sched_dep_time`, and `dep_delay`. How would you expect those three numbers to be related? - -\pause - -```{r mutate_challenge_b, eval=F, message=F, cache=T} -mutate( - flights, - dep_time = (dep_time %/% 100) * 60 + - dep_time %% 100, - sched_dep_time = (sched_dep_time %/% 100) * 60 + - sched_dep_time %% 100 -) -``` - -**4_c** - -## Mutate challenges - -- Find the 10 most delayed flights using a ranking function. How do you want to handle ties? Carefully read the documentation for `min_rank()` - -\pause - -```{r mutate_challenge_c, eval=F, message=F, cache=T} -flights_md <- mutate(flights, most_delay = min_rank(desc(dep_delay))) -filter(flights_md, most_delay < 10) -``` - -**4_d** - -## Combining multiple operations with the pipe - -We don't want to create useless intermediate variables so we can use the pipe operator: `%>%` -(`ctrl + shift + M`). - -```{r pipe_example_a, eval=F, message=F, cache=T} -flights_md <- mutate(flights, - most_delay = min_rank(desc(dep_delay))) -flights_md <- filter(flights_md, most_delay < 10) -flights_md <- arrange(flights_md, most_delay) -``` - -## Combining multiple operations with the pipe - -We don't want to create useless intermediate variables so we can use the pipe operator: `%>%` -(`ctrl + shift + M`). 
- -```{r pipe_example_b, eval=F, message=F, cache=T} -flights %>% - mutate(most_delay = min_rank(desc(dep_delay))) %>% - filter(most_delay < 10) %>% - arrange(most_delay) -``` - -## Combining multiple operations with the pipe - -Behind the scenes, `x %>% f(y)` turns into `f(x, y)`, and `x %>% f(y) %>% g(z)` turns into `g(f(x, y), z)` and so on. You can use the pipe to rewrite multiple operations in a way that you can read left-to-right, top-to-bottom. - -You can access the transmitted variables with `.` - -```{r pipe_example_c, eval=F, message=F, cache=T} -flights %>% - mutate(most_delay = min_rank(desc(dep_delay))) %>% - filter(., most_delay < 10) %>% - arrange(., most_delay) -``` - -Working with the pipe is one of the key criteria for belonging to the `tidyverse`. The only exception is `ggplot2`: it was written before the pipe was discovered. Unfortunately, the next iteration of `ggplot2`, `ggvis`, which does use the pipe, isn’t quite ready for prime time yet. \ No newline at end of file -- GitLab