From b92b95d942574aa82a2f148c53beb01a3c98d0d8 Mon Sep 17 00:00:00 2001
From: Laurent Modolo <laurent.modolo@ens-lyon.fr>
Date: Thu, 9 Sep 2021 15:31:03 +0200
Subject: [PATCH] session_4.Rmd: fix typo

---
 session_1/example_1.R      | 118 --------
 session_1/tp.R             |   0
 session_4/HTML_toto_s4.Rmd | 342 ----------------------
 session_4/session_4.Rmd    | 566 +++++++++++++++++++++++++++++++++++++
 4 files changed, 566 insertions(+), 460 deletions(-)
 delete mode 100644 session_1/example_1.R
 delete mode 100644 session_1/tp.R
 delete mode 100644 session_4/HTML_toto_s4.Rmd
 create mode 100644 session_4/session_4.Rmd

diff --git a/session_1/example_1.R b/session_1/example_1.R
deleted file mode 100644
index c275a97..0000000
--- a/session_1/example_1.R
+++ /dev/null
@@ -1,118 +0,0 @@
-# Does news coverage boost support for presidential candidates in the Democratic primary?
-# https://www.jacob-long.com/post/news-coverage-candidate-support/ 
-
-library(tidyverse)
-library(jtools)
-library(tsibble)
-
-################################ Getting the data ##############################
-cable_mentions <- read_csv("https://github.com/fivethirtyeight/data/raw/master/media-mentions-2020/cable_weekly.csv")
-online_mentions <- read_csv("https://github.com/fivethirtyeight/data/raw/master/media-mentions-2020/online_weekly.csv")
-# Immediately convert `end_date` to date class
-polls <- read_csv("https://projects.fivethirtyeight.com/polls-page/president_primary_polls.csv")
-
-
-candidates <- c("Amy Klobuchar", "Andrew Yang", "Bernard Sanders",
-                "Beto O'Rourke", "Bill de Blasio", "Cory A. Booker",
-                "Elizabeth Warren", "Eric Swalwell", "Jay Robert Inslee",
-                "Joe Sestak", "John Hickenlooper", "John K. Delaney",
-                "Joseph R. Biden Jr.", "Julián Castro", "Kamala D. Harris",
-                "Kirsten E. Gillibrand", "Marianne Williamson",
-                "Michael F. Bennet", "Pete Buttigieg", "Seth Moulton",
-                "Steve Bullock", "Tim Ryan", "Tom Steyer", "Tulsi Gabbard",
-                "Wayne Messam")
-
-candidates_clean <- c("Amy Klobuchar", "Andrew Yang", "Bernie Sanders",
-                      "Beto O'Rourke", "Bill de Blasio", "Cory Booker",
-                      "Elizabeth Warren", "Eric Swalwell", "Jay Inslee",
-                      "Joe Sestak", "John Hickenlooper", "John Delaney",
-                      "Joe Biden", "Julian Castro", "Kamala Harris",
-                      "Kirsten Gillibrand", "Marianne Williamson",
-                      "Michael Bennet", "Pete Buttigieg", "Seth Moulton",
-                      "Steve Bullock", "Tim Ryan", "Tom Steyer",
-                      "Tulsi Gabbard", "Wayne Messam")
-
-
-########################### formating data #####################################
-
-polls <- polls %>%
-  # Convert date to date format
-  mutate(end_date = as.Date(end_date, format = "%m/%d/%y")) %>%
-  filter(
-    # include only polls of at least modest quality
-    fte_grade %in% c("C-", "C", "C+", "B-", "B", "B+", "A-", "A", "A+"),
-    # only include polls ending on or after 12/30/2018
-    end_date >= as.Date("12/30/2018", "%m/%d/%Y"),
-    # only include *Democratic* primary polls
-    party == "DEM",
-    # only include the selected candidates
-    candidate_name %in% candidates,
-    # only national polls
-    is.na(state),
-    # Exclude some head-to-head results, etc.
-    notes %nin% c("head-to-head poll",
-                  "HarrisX/SR Democrat LV, definite voter",
-                  "open-ended question")
-  ) %>%
-  mutate(
-    # Have to add 1 to the date to accommodate tsibble's yearweek()
-    # starting on Monday rather than Sunday like our other data sources
-    week = as.Date(yearweek(end_date + 1)) - 1,
-    # Convert candidate names to factor so I can relabel them
-    candidate_name = factor(candidate_name, levels = candidates, labels = candidates_clean)
-  )
-
-polls_agg <- polls %>%
-  group_by(week, candidate_name) %>%
-  summarize(
-    pct_polls = weighted.mean(pct, log(sample_size))
-  )
-
-library(ggplot2)
-top_candidates <- c("Joe Biden", "Elizabeth Warren", "Bernie Sanders",
-                    "Pete Buttigieg", "Kamala Harris", "Beto O'Rourke",
-                    "Cory Booker")
-ggplot(filter(polls_agg, candidate_name %in% top_candidates),
-       aes(x = week, y = pct_polls, color = candidate_name)) +
-  geom_line() +
-  theme_nice()
-
-
-media <-
-  inner_join(cable_mentions, online_mentions, by = c("date", "name")) %>%
-  mutate(
-    # Create new variables that put the media coverage variables on
-    # same scale as poll numbers
-    pct_cable = 100 * pct_of_all_candidate_clips,
-    pct_online = 100 * pct_of_all_candidate_stories
-  )
-
-top_candidates <- c("Joe Biden", "Elizabeth Warren", "Bernie Sanders",
-                    "Pete Buttigieg", "Kamala Harris", "Beto O'Rourke",
-                    "Cory Booker")
-ggplot(filter(media, name %in% top_candidates),
-       aes(x = date, y = pct_cable, color = name)) +
-  geom_line() +
-  theme_nice()
-
-top_candidates <- c("Joe Biden", "Elizabeth Warren", "Bernie Sanders",
-                    "Pete Buttigieg", "Kamala Harris", "Beto O'Rourke",
-                    "Cory Booker")
-ggplot(filter(media, name %in% top_candidates),
-       aes(x = date, y = pct_online, color = name)) +
-  geom_line() +
-  theme_nice()
-
-
-######################### Combine the data #####################################
-joined <- inner_join(polls_agg, media,
-                     by = c("candidate_name" = "name", "week" = "date"))
-
-library(panelr)
-
-# panel_data needs a number or ordered factor as wave variable
-joined$wave <- as.ordered(joined$week)
-joined_panel <- panel_data(ungroup(joined), id = candidate_name, wave = wave)
-joined_pdata <- as_pdata.frame(joined_panel)
-
-
diff --git a/session_1/tp.R b/session_1/tp.R
deleted file mode 100644
index e69de29..0000000
diff --git a/session_4/HTML_toto_s4.Rmd b/session_4/HTML_toto_s4.Rmd
deleted file mode 100644
index cb620ee..0000000
--- a/session_4/HTML_toto_s4.Rmd
+++ /dev/null
@@ -1,342 +0,0 @@
----
-title: "R#4: data transformation"
-author: "Laurent Modolo [laurent.modolo@ens-lyon.fr](mailto:laurent.modolo@ens-lyon.fr), Hélène Polvèche [hpolveche@istem.fr](mailto:hpolveche@istem.fr)"
-date: "Mars 2020"
-output:
-  html_document: default
-  pdf_document: default
----
-<style type="text/css">
-h3 { /* Header 3 */
-  position: relative ;
-  color: #729FCF ;
-  left: 5%;
-}
-h2 { /* Header 2 */
-  color: darkblue ;
-  left: 10%;
-} 
-h1 { /* Header 1 */
-  color: #034b6f ;
-} 
-#pencadre{
-  border:1px; 
-  border-style:solid; 
-  border-color: #034b6f; 
-  background-color: #EEF3F9; 
-  padding: 1em;
-  text-align: center ;
-  border-radius : 5px 4px 3px 2px;
-}
-legend{
-  color: #034b6f ;
-}
-#pquestion {
-  color: darkgreen;
-  font-weight: bold;
-}
-</style>
-
-```{r setup, include=FALSE, cache=TRUE}
-knitr::opts_chunk$set(echo = TRUE)
-```
-
-The goal of this practical is to practices data transformation with `tidyverse`.
-The objectives of this session will be to:
-
-- Filter rows with `filter()`
-- Arrange rows with `arrange()`
-- Select columns with `select()`
-- Add new variables with `mutate()`
-- Combining multiple operations with the pipe `%>%`
-
-```R
-install.packages("nycflights13")
-```
-
-```{r packageloaded, include=TRUE, message=FALSE}
-library("tidyverse")
-library("nycflights13")
-```
-
- \ 
- 
-# Data set : nycflights13
-
-`nycflights13::flights`contains all 336,776 flights that departed from New York City in 2013. The data comes from the US Bureau of Transportation Statistics, and is documented in `?flights`
-
-
-```{r display_data, include=TRUE}
-flights
-```
-
-- **int** stands for integers.
-- **dbl** stands for doubles, or real numbers.
-- **chr** stands for character vectors, or strings.
-- **dttm** stands for date-times (a date + a time).
-- **lgl** stands for logical, vectors that contain only TRUE or FALSE.
-- **fctr** stands for factors, which R uses to represent categorical variables with fixed possible values.
-- **date** stands for dates.
-
- \ 
- 
-# Filter rows with `filter()`
-
-`filter()` allows you to subset observations based on their values. 
-
-```{r filter_month_day, include=TRUE}
-filter(flights, month == 1, day == 1)
-```
-
- \ 
- 
-`dplyr` functions never modify their inputs, so if you want to save the result, you’ll need to use the assignment operator, `<-`
-
-```{r filter_month_day_sav, include=TRUE}
-jan1 <- filter(flights, month == 1, day == 1)
-```
-
- \ 
- 
-R either prints out the results, or saves them to a variable.
-
-```{r filter_month_day_sav_display, include=TRUE}
-(dec25 <- filter(flights, month == 12, day == 25))
-```
-
- \ 
- 
-# Logical operators
-
-Multiple arguments to `filter()` are combined with “and”: every expression must be true in order for a row to be included in the output.
-
-![](./img/transform-logical.png)
-
- \ 
-
-Test the following operations:
-
-```{r filter_logical_operators, include=TRUE}
-filter(flights, month == 11 | month == 12)
-filter(flights, month %in% c(11, 12))
-filter(flights, !(arr_delay > 120 | dep_delay > 120))
-filter(flights, arr_delay <= 120, dep_delay <= 120)
-```
-
- \ 
- 
-# Missing values
-
-One important feature of R that can make comparison tricky are missing values, or `NA`s (“not availables”). 
-
-```{r filter_logical_operators_NA, include=TRUE}
-NA > 5
-10 == NA
-NA + 10
-```
-
-
-```{r filter_logical_operators_test_NA, include=TRUE}
-is.na(NA)
-```
-
- \ 
- 
-# Arrange rows with `arrange()`
-
- \ 
-
-`arrange()` works similarly to `filter()` except that instead of selecting rows, it changes their order.
-
-```{r arrange_ymd, include=TRUE}
-arrange(flights, year, month, day)
-```
-
- \ 
-Use `desc()` to re-order by a column in descending order:
-
-```{r arrange_desc, include=TRUE}
-arrange(flights, desc(dep_delay))
-```
-
-Missing values are always sorted at the end:
-
-```{r arrange_NA, include=TRUE}
-arrange(tibble(x = c(5, 2, NA)), x)
-arrange(tibble(x = c(5, 2, NA)), desc(x))
-```
-
- \ 
-
-# Select columns with `select()`
-
- \ 
- 
-`select()` allows you to rapidly zoom in on a useful subset using operations based on the names of the variables.
-
-```{r select_ymd, , include=TRUE}
-select(flights, year, month, day)
-select(flights, year:day)
-select(flights, -(year:day))
-```
-
- \ 
-
-here are a number of helper functions you can use within `select()`:
-
-- `starts_with("abc")`: matches names that begin with “abc”.
-- `ends_with("xyz")`: matches names that end with “xyz”.
-- `contains("ijk")`: matches names that contain “ijk”.
-- `num_range("x", 1:3)`: matches `x1`, `x2` and `x3`.
-
-See `?select` for more details.
-
- \ 
- 
-# Add new variables with `mutate()`
-
- \ 
- 
-It’s often useful to add new columns that are functions of existing columns. That’s the job of `mutate()`.
-
-```{r mutate, include=TRUE}
-flights_sml <- select(flights,  year:day, ends_with("delay"), distance, air_time)
-
-flights_sml
-
-mutate(flights_sml, gain = dep_delay - arr_delay,
-            speed = distance / air_time * 60)
-```
-
- \ 
-
-```{r mutate_reuse, include=TRUE}
-flights_sml <- mutate(flights_sml, gain = dep_delay - arr_delay,
-            speed = distance / air_time * 60)
-
-```
-
- \ 
- 
-### Useful creation functions
-
-- Offsets: `lead()` and `lag()` allow you to refer to leading or lagging values. This allows you to compute running differences (e.g. `x - lag(x)`) or find when values change (`x != lag(x)`).
-- Cumulative and rolling aggregates: R provides functions for running sums, products, mins and maxes: `cumsum()`, `cumprod()`, `cummin()`, `cummax()`; and dplyr provides `cummean()` for cumulative means. 
-- Logical comparisons, `<`, `<=`, `>`, `>=`, `!=`, and `==`
-- Ranking: there are a number of ranking functions, but you should start with `min_rank()`. There is also `row_number()`, `dense_rank()`, `percent_rank()`, `cume_dist()`, `ntile()`
-
- \ 
- 
-# Combining multiple operations with the pipe
-
- \ 
- 
-We don't want to create useless intermediate variables so we can use the pipe operator: `%>%`
-( or `ctrl + shift + M`). 
-
-<div id="pquestion"> - Find the 10 most delayed flights using a ranking function. `min_rank()` </div>
-
-```{r pipe_example_a, include=TRUE}
-flights_md <- mutate(flights,
-                     most_delay = min_rank(desc(dep_delay)))
-flights_md <- filter(flights_md, most_delay < 10)
-flights_md <- arrange(flights_md, most_delay)
-```
-
- \ 
- 
-
-```{r pipe_example_b, include=TRUE}
-flights_md2 <- flights %>%
-    mutate(most_delay = min_rank(desc(dep_delay))) %>% 
-    filter(most_delay < 10) %>% 
-    arrange(most_delay)
-
-select(flights_md2, year:day, flight, origin, dest, dep_delay, most_delay)
-```
-
- \ 
-
-Behind the scenes, `x %>% f(y)` turns into `f(x, y)`, and `x %>% f(y) %>% g(z)` turns into `g(f(x, y), z)` and so on. You can use the pipe to rewrite multiple operations in a way that you can read left-to-right, top-to-bottom. 
-
- \ 
-
-Working with the pipe is one of the key criteria for belonging to the `tidyverse`. The only exception is `ggplot2`: it was written before the pipe was discovered. Unfortunately, the next iteration of `ggplot2`, `ggvis`, which does use the pipe, isn’t quite ready for prime time yet.
-
-# Grouped summaries with `summarise()`
-
-`summarise()` collapses a data frame to a single row:
-
-```{r load_data, include=TRUE}
-flights %>% 
-  summarise(delay = mean(dep_delay, na.rm = TRUE))
-```
-
-### The power of `summarise()` with `group_by()`
-
-This changes the unit of analysis from the complete dataset to individual groups. Then, when you use the `dplyr` verbs on a grouped data frame they’ll be automatically applied “by group”.
-
-```{r summarise_group_by, include=TRUE, fig.width=8, fig.height=3.5}
-flights_delay <- flights %>% 
-  group_by(year, month) %>% 
-  summarise(delay = mean(dep_delay, na.rm = TRUE), sd = sd(dep_delay, na.rm = TRUE)) %>% 
-  arrange(month)
-
-flights_delay
-
-ggplot(data = flights_delay, mapping = aes(x = month, y = delay)) +
-  geom_bar(stat="identity", color="black", fill = "#619CFF") +
-  geom_errorbar(mapping = aes( ymin=0, ymax=delay+sd)) + 
-  theme(axis.text.x = element_blank())
-
-```
-
-
-### Missing values
-
-You may have wondered about the na.rm argument we used above. What happens if we don’t set it?
-
-```{r summarise_group_by_NA, include=TRUE}
-flights %>% 
-  group_by(dest) %>% 
-  summarise(
-    dist = mean(distance),
-    delay = mean(arr_delay)
-  )
-```
-
-Aggregation functions obey the usual rule of missing values: if there’s any missing value in the input, the output will be a missing value.
-
-
-# Counts
-
-Whenever you do any aggregation, it’s always a good idea to include either a count (`n()`). That way you can check that you’re not drawing conclusions based on very small amounts of data.
-
-```{r summarise_group_by_count, include = TRUE, warning=F, message=F, fig.width=8, fig.height=3.5}
-summ_delay_filghts <- flights %>% 
-                      group_by(dest) %>% 
-                      summarise(
-                          count = n(),
-                          dist = mean(distance, na.rm = TRUE),
-                          delay = mean(arr_delay, na.rm = TRUE)
-                      )
-summ_delay_filghts
-
-ggplot(data = summ_delay_filghts, mapping = aes(x = dist, y = delay, size = count)) +
-  geom_point() +
-  geom_smooth(method = lm, se = FALSE) +
-  theme(legend.position='none')
-
-```
-
-## Thank you !
-
- \ 
- 
-## For curious or motivated people: Challenge time!
-
- \ 
- 
- \ 
- 
- 
diff --git a/session_4/session_4.Rmd b/session_4/session_4.Rmd
new file mode 100644
index 0000000..93d0a36
--- /dev/null
+++ b/session_4/session_4.Rmd
@@ -0,0 +1,566 @@
+---
+title: "R.4: data transformation"
+author: "Laurent Modolo [laurent.modolo@ens-lyon.fr](mailto:laurent.modolo@ens-lyon.fr), Hélène Polvèche [hpolveche@istem.fr](mailto:hpolveche@istem.fr)"
+date: "2021"
+output:
+  rmdformats::downcute:
+    self_contained: true
+    use_bookdown: true
+    default_style: "dark"
+    lightbox: true
+    css: "http://perso.ens-lyon.fr/laurent.modolo/R/src/style.css"
+---
+
+```{r setup, include=FALSE}
+rm(list=ls())
+knitr::opts_chunk$set(echo = TRUE)
+knitr::opts_chunk$set(comment = NA)
+```
+```{r klippy, echo=FALSE, include=TRUE}
+klippy::klippy(
+  position = c('top', 'right'),
+  color = "white",
+  tooltip_message = 'Click to copy',
+  tooltip_success = 'Copied !')
+```
+
+# Introduction
+
+The goal of this practical is to practice data transformation with `tidyverse`.
+The objectives of this session will be to:
+
+- Filter rows with `filter()`
+- Arrange rows with `arrange()`
+- Select columns with `select()`
+- Add new variables with `mutate()`
+- Combining multiple operations with the pipe `%>%`
+
+<div class="pencadre">
+For this session we are going to work with a new dataset included in the `nycflights13` package.
+Install this package and load it.
+As usual you will also need the `tidyverse` library.
+</div>
+
+<details><summary>Solution</summary>
+<p>
+```R
+install.packages("nycflights13")
+```
+
+```{r packageloaded, include=TRUE, message=FALSE}
+library("tidyverse")
+library("nycflights13")
+```
+</p>
+</details>
+ 
+## Data set : nycflights13
+
+`nycflights13::flights` contains all 336,776 flights that departed from New York City in 2013.
+The data comes from the US Bureau of Transportation Statistics, and is documented in `?flights`
+
+```{r display_data, include=TRUE}
+flights
+```
+
+## Data type
+
+In programming languages, not all variables are equal.
+When you display a `tibble` you can see the **type** of a column.
+Here is a list of common variable **types** that you will encounter:
+
+- **int** stands for integers.
+- **dbl** stands for doubles or real numbers.
+- **chr** stands for character vectors or strings.
+- **dttm** stands for date-times (a date + a time).
+- **lgl** stands for logical, vectors that contain only `TRUE` or `FALSE`.
+- **fctr** stands for factors, which R uses to represent categorical variables with fixed possible values.
+- **date** stands for dates.
+
+You cannot add an **int** to a **chr**, but you can add an **int** to a **dbl**: the result will be a **dbl**.
+ 
+# `filter` rows
+
+Variable **types** are important to keep in mind for comparisons.
+The `filter()` function allows you to subset observations based on their values. 
+
+<div class="pencadre">
+
+What is the result of the following `filter` command ?
+
+```{r filter_month_day, include=TRUE, eval=FALSE}
+filter(flights, month == 1, day == 1)
+```
+</div>
+ 
+`dplyr` functions never modify their inputs, so if you want to save the result, you’ll need to use the assignment operator, `<-`
+
+<div class="pencadre">
+Save the previous command in a `jan1` variable
+</div>
+<details><summary>Solution</summary>
+<p>
+```{r filter_month_day_sav, include=TRUE}
+jan1 <- filter(flights, month == 1, day == 1)
+```
+</p>
+</details>
+ 
+<div class="pencadre">
+R either prints out the results, or saves them to a variable.
+What happens when you put your variable assignment code between parentheses `(` `)` ?
+
+```{r filter_month_day_sav_display, eval=FALSE}
+(dec25 <- filter(flights, month == 12, day == 25))
+```
+</div>
+ 
+## Logical operators
+
+Multiple arguments to `filter()` are combined with **AND**: every expression must be `TRUE` in order for a row to be included in the output.
+
+In R you can use the symbols `&`, `|`, `!` and the function `xor()` to build other kinds of tests.
+
+![](./img/transform-logical.png)
+<div class="pencadre">
+Test the following operations:
+
+```{r filter_logical_operators_a, eval=FALSE}
+filter(flights, month == 11 | month == 12)
+```
+```{r filter_logical_operators_b, eval=FALSE}
+filter(flights, month %in% c(11, 12))
+```
+```{r filter_logical_operators_c, eval=FALSE}
+filter(flights, !(arr_delay > 120 | dep_delay > 120))
+```
+```{r filter_logical_operators_d, eval=FALSE}
+filter(flights, arr_delay <= 120, dep_delay <= 120)
+```
+</div>
+
+Combining logical operators is a powerful programmatic way to select subsets of data.
+Keep in mind, however, that long logical expressions can be hard to read and understand, so it may be easier to apply successive small filters instead of one long one.
+
+## Missing values
+
+One important feature of R that can make comparison tricky is missing values, or `NA`s (for **Not Available**).
+Indeed, each variable type can contain either a value of that type (e.g., `2` for an **int**) or nothing.
+The *nothing recorded in a variable* status is represented with the `NA` symbol.
+
+As operations with `NA` values don't make sense, if you have `NA` somewhere in your operation, the result will be `NA`:
+
+```{r filter_logical_operators_NA, include=TRUE}
+NA > 5
+10 == NA
+NA + 10
+```
+
+However, you can test for `NA`s with the function `is.na()`:
+
+```{r filter_logical_operators_test_NA, include=TRUE}
+is.na(NA)
+```
+
+`filter()` only includes rows where the condition is `TRUE`; it excludes both `FALSE` and `NA` values. If you want to preserve missing values, ask for them explicitly:
+
+```{r filter_logical_operators_test_NA2, include=TRUE}
+df <- tibble(x = c(1, NA, 3))
+filter(df, x > 1)
+filter(df, is.na(x) | x > 1)
+```
+
+## Challenges
+
+<div class="pencadre">
+Find all flights that:
+- Had an arrival delay of two or more hours (you can check `?flights`)
+- Flew to Houston (IAH or HOU)
+</div>
+
+<details><summary>Solution</summary>
+<p>
+```{r filter_chalenges_a, eval=TRUE}
+filter(flights, arr_delay >= 120) # arr_delay is in minutes: two hours or more = 120 or more
+```
+```{r filter_chalenges_b, eval=TRUE}
+filter(flights, dest %in% c("IAH", "HOU"))
+```
+</p>
+</details>
+
+<div class="pencadre">
+How many flights have a missing `dep_time` ?
+</div>
+
+<details><summary>Solution</summary>
+<p>
+```{r filter_chalenges_c, eval=TRUE}
+filter(flights, is.na(dep_time))
+```
+</p>
+</details>
+
+<div class="pencadre">
+Why is `NA ^ 0` not missing? Why is `NA | TRUE` not missing? Why is `FALSE & NA` not missing? Can you figure out the general rule? (`NA * 0` is a tricky counterexample!)
+</div>
+
+<details><summary>Solution</summary>
+<p>
+```{r filter_chalenges_d, eval=TRUE}
+NA ^ 0 # any number to the power 0 is 1, so the result is known whatever the missing value is
+NA | TRUE # if one member of an OR operation is TRUE, the result is TRUE
+FALSE & NA # if one member of an AND operation is FALSE, the result is FALSE
+NA * 0 # not 0: the missing value could be infinite, and Inf * 0 is NaN, so the result stays NA
+```
+</p>
+</details>
+
+# Arrange rows with `arrange()`
+
+`arrange()` works similarly to `filter()` except that instead of selecting rows, it changes their order.
+
+```{r arrange_ymd, include=TRUE}
+arrange(flights, year, month, day)
+```
+
+<div class="pencadre">
+Use `desc()` to reorder by a column in descending order:
+</div>
+
+<details><summary>Solution</summary>
+<p>
+```{r arrange_desc, include=TRUE}
+arrange(flights, desc(dep_delay))
+```
+</p>
+</details>
+
+## Missing values
+
+Missing values are always sorted at the end:
+
+```{r arrange_NA, include=TRUE}
+arrange(tibble(x = c(5, 2, NA)), x)
+arrange(tibble(x = c(5, 2, NA)), desc(x))
+```
+
+## Challenges
+<div class="pencadre">
+
+- Find the most delayed flight.
+- Find the flight that left earliest.
+- How could you arrange all missing values to the start ?
+
+</div>
+
+<details><summary>Solution</summary>
+<p>
+Find the most delayed flight.
+```{r chalange_arrange_desc_a, include=TRUE}
+arrange(flights, desc(dep_delay))
+```
+Find the flight that left earliest.
+```{r chalange_arrange_desc_b, include=TRUE}
+arrange(flights, dep_delay)
+```
+How could you arrange all missing values to the start
+```{r chalange_arrange_desc_c, include=TRUE}
+arrange(tibble(x = c(5, 2, NA)), desc(is.na(x)))
+```
+</p>
+</details>
+
+
+# Select columns with `select()`
+
+`select()` allows you to rapidly zoom in on a useful subset using operations based on the names of the variables.
+
+You can select by column names
+```{r select_ymd_a, , include=TRUE}
+select(flights, year, month, day)
+```
+
+By defining a range of columns
+```{r select_ymd_b, , include=TRUE}
+select(flights, year:day)
+```
+
+Or you can do a negative (`-`) to remove columns.
+```{r select_ymd_c, , include=TRUE}
+select(flights, -(year:day))
+```
+
+## Helper functions
+
+There are a number of helper functions you can use within `select()`:
+
+- `starts_with("abc")`: matches names that begin with `"abc"`.
+- `ends_with("xyz")`: matches names that end with `"xyz"`.
+- `contains("ijk")`: matches names that contain `"ijk"`.
+- `num_range("x", 1:3)`: matches `x1`, `x2` and `x3`.
+
+See `?select` for more details.
+
+## Challenges
+
+<div class="pencadre">
+
+- Brainstorm as many ways as possible to select `dep_time`, `dep_delay`, `arr_time`, and `arr_delay` from `flights`.
+<details><summary>Solution</summary>
+<p>
+```{r challenge_select_a, eval=FALSE}
+select(flights, starts_with("dep_") | starts_with("arr_")) # dep_time, dep_delay, arr_time, arr_delay
+select(flights, contains("_") & !starts_with("sched") & !starts_with("time") & !starts_with("air")) # drop sched_*, time_hour, air_time
+```
+</p>
+</details>
+- What does the `one_of()` function do? Why might it be helpful in conjunction with this vector?
+```{r select_one_of, eval=T, message=F, cache=T}
+vars <- c("year", "month", "day", "dep_delay", "arr_delay")
+```
+<details><summary>Solution</summary>
+<p>
+```{r challenge_select_b, eval=FALSE}
+select(flights, one_of(vars))
+```
+</p>
+</details>
+- Does the result of running the following code surprise you? How do the select helpers deal with case by default? How can you change that default?
+```{r select_contains, eval=F, message=F, cache=T}
+select(flights, contains("TIME"))
+```
+<details><summary>Solution</summary>
+<p>
+```{r challenge_select_c, eval=FALSE}
+select(flights, contains("TIME", ignore.case = FALSE))
+```
+</p>
+</details>
+
+</div>
+ 
+# Add new variables with `mutate()`
+
+It’s often useful to add new columns that are functions of existing columns. That’s the job of `mutate()`.
+
+<div class="pencadre">
+First let's create a smaller dataset to work on `flights_sml` that contains
+- columns from `year` to `day`
+- columns that end with `delay`
+- the `distance` and `air_time` columns
+</div>
+
+<details><summary>Solution</summary>
+<p>
+```{r mutate, include=TRUE}
+(flights_sml <- select(flights,  year:day, ends_with("delay"), distance, air_time))
+```
+</p>
+</details>
+
+## `mutate()`
+
+```R
+mutate(tbl, new_var_a = operation_a, ..., new_var_n = operation_n)
+```
+`mutate()` allows you to add new columns (`new_var_a`, ... , `new_var_n`) and to fill them with the results of an operation.
+
+We can create a `gain` column to check whether the pilot managed to make up for the departure delay:
+```{r mutate_gain}
+mutate(flights_sml, gain = dep_delay - arr_delay)
+```
+
+<div class="pencadre">
+Using `mutate()`, add two new columns to the `flights_sml` tibble: `gain`, and `speed`, which contains the average speed of the plane.
+</div>
+
+<details><summary>Solution</summary>
+<p>
+```{r mutate_reuse, include=TRUE}
+flights_sml <- mutate(flights_sml,
+  gain = dep_delay - arr_delay,
+  speed = distance / air_time * 60
+)
+```
+</p>
+</details>
+
+
+<div class="pencadre">
+Currently `dep_time` and `sched_dep_time` are convenient to look at, but hard to compute with because they’re not really continuous numbers. Convert them to a more convenient representation of the number of minutes since midnight.
+</div>
+
+<details><summary>Solution</summary>
+<p>
+```{r mutate_challenges_a, eval=F, message=F, cache=T}
+mutate(
+  flights,
+  dep_time = ((dep_time %/% 100) * 60 + # HHMM -> hours * 60 + minutes
+    dep_time %% 100) %% 1440, # %% 1440 maps midnight (stored as 2400) to 0 instead of 1440
+  sched_dep_time = ((sched_dep_time %/% 100) * 60 +
+    sched_dep_time %% 100) %% 1440
+)
+```
+</p>
+</details>
+
+## Useful creation functions
+
+- Offsets: `lead()` and `lag()` allow you to refer to leading or lagging values. This allows you to compute running differences (e.g. `x - lag(x)`) or find when values change (`x != lag(x)`).
+- Cumulative and rolling aggregates: R provides functions for running sums, products, mins and maxes: `cumsum()`, `cumprod()`, `cummin()`, `cummax()`; and dplyr provides `cummean()` for cumulative means. 
+- Logical comparisons, `<`, `<=`, `>`, `>=`, `!=`, and `==`
+- Ranking: there are a number of ranking functions, but you should start with `min_rank()`. There is also `row_number()`, `dense_rank()`, `percent_rank()`, `cume_dist()`, `ntile()`
+
+ 
+# Combining multiple operations with the pipe
+
+
+<div class="pencadre">
+Find the 10 most delayed flights using the ranking function `min_rank()`.
+</div>
+
+<details><summary>Solution</summary>
+<p>
+```{r pipe_example_a, include=TRUE}
+flights_md <- mutate(flights,
+                     most_delay = min_rank(desc(dep_delay))) # rank 1 = largest departure delay
+flights_md <- filter(flights_md, most_delay <= 10) # ranks 1 to 10, i.e. the 10 most delayed flights
+flights_md <- arrange(flights_md, most_delay)
+```
+</p>
+</details>
+
+
+We don't want to create useless intermediate variables so we can use the pipe operator: `%>%`
+( or `ctrl + shift + M`). 
+
+Behind the scenes, `x %>% f(y)` turns into `f(x, y)`, and `x %>% f(y) %>% g(z)` turns into `g(f(x, y), z)` and so on. You can use the pipe to rewrite multiple operations in a way that you can read left-to-right, top-to-bottom. 
+
+<div class="pencadre">
+Try to use the pipe operator to rewrite your previous code with only **one** variable assignment.
+</div>
+ 
+
+<details><summary>Solution</summary>
+<p>
+```{r pipe_example_b, include=TRUE}
+flights_md2 <- flights %>%
+    mutate(most_delay = min_rank(desc(dep_delay))) %>% # rank 1 = largest departure delay
+    filter(most_delay <= 10) %>% # ranks 1 to 10, i.e. the 10 most delayed flights
+    arrange(most_delay)
+```
+</p>
+</details>
+
+Working with the pipe is one of the key criteria for belonging to the `tidyverse`. The only exception is `ggplot2`: it was written before the pipe was discovered and uses `+` instead of `%>%`. Unfortunately, the next iteration of `ggplot2`, `ggvis`, which does use the pipe, isn’t quite ready for prime time yet.
+
+# Grouped summaries with `summarise()`
+
+`summarise()` collapses a data frame to a single row.
+
+Check the difference between `summarise()` and `mutate()` with the following commands:
+
+```{r load_data, eval=FALSE}
+flights %>% 
+  mutate(delay = mean(dep_delay, na.rm = TRUE))
+flights %>% 
+  summarise(delay = mean(dep_delay, na.rm = TRUE))
+```
+
+## The power of `summarise()` with `group_by()`
+
+The `group_by()` function changes the unit of analysis from the complete dataset to individual groups.
+Then, when you use the functions you already know on a grouped data frame, they’ll be automatically applied *by group*.
+
+You can use the following code to compute the average delay per month across years.
+
+```{r summarise_group_by, include=TRUE, fig.width=8, fig.height=3.5}
+flights_delay <- flights %>% 
+  group_by(year, month) %>% 
+  summarise(delay = mean(dep_delay, na.rm = TRUE), sd = sd(dep_delay, na.rm = TRUE)) %>% 
+  arrange(month)
+
+ggplot(data = flights_delay, mapping = aes(x = month, y = delay)) +
+  geom_bar(stat="identity", color="black", fill = "#619CFF") +
+  geom_errorbar(mapping = aes( ymin=0, ymax=delay+sd)) + 
+  theme(axis.text.x = element_blank())
+```
+<div class="pencadre">
+Why did we `group_by` `year` and `month` and not only `year` ?
+</div>
+
+
+## Missing values
+
+<div class="pencadre">
+You may have wondered about the `na.rm` argument we used above. What happens if we don’t set it?
+</div>
+
+```{r summarise_group_by_NA, include=TRUE}
+flights %>% 
+  group_by(dest) %>% 
+  summarise(
+    dist = mean(distance),
+    delay = mean(arr_delay)
+  )
+```
+
+Aggregation functions obey the usual rule of missing values: **if there’s any missing value in the input, the output will be a missing value**.
+
+# Counts
+
+Whenever you do any aggregation, it’s always a good idea to include a count (`n()`). That way you can check that you’re not drawing conclusions based on very small amounts of data.
+
+```{r summarise_group_by_count, include = T, echo=F, warning=F, message=F, fig.width=8, fig.height=3.5}
+summ_delay_filghts <- flights %>% 
+  group_by(dest) %>% 
+  summarise(
+    count = n(),
+    dist = mean(distance, na.rm = TRUE),
+    delay = mean(arr_delay, na.rm = TRUE)
+  ) %>% 
+  filter(dest != "HNL") %>% 
+  filter(delay < 40 & delay > -20)
+
+  
+
+ggplot(data = summ_delay_filghts, mapping = aes(x = dist, y = delay, size = count)) +
+  geom_point() +
+  geom_smooth(method = lm, se = FALSE) +
+  theme(legend.position='none')
+```
+
+<div class="pencadre">
+Imagine that we want to explore the relationship between the distance and average delay for each location and recreate the above figure. 
+Here are the steps to prepare the data and draw the figure:
+
+1. Group flights by destination.
+2. Summarize to compute distance, average delay, and number of flights using `n()`.
+3. Filter to remove Honolulu airport, which is almost twice as far away as the next closest airport.
+4. Filter to remove noisy points with delay superior to 40 or inferior to -20
+5. Create a `mapping` on `dist`, `delay` and `count` as `size`.
+6. Use the layer `geom_point()` and `geom_smooth()`
+7. We can hide the legend with the layer `theme(legend.position='none')`
+</div>
+
+<details><summary>Solution</summary>
+<p>
+```{r summarise_group_by_count_b, include = T, eval=F, warning=F, message=F, fig.width=8, fig.height=3.5}
+flights %>% 
+  group_by(dest) %>% 
+  summarise(
+    count = n(),
+    dist = mean(distance, na.rm = TRUE),
+    delay = mean(arr_delay, na.rm = TRUE)
+  ) %>% 
+  filter(dest != "HNL") %>% 
+  filter(delay < 40 & delay > -20) %>% 
+  ggplot(mapping = aes(x = dist, y = delay, size = count)) +
+  geom_point() +
+  geom_smooth(method = lm, se = FALSE) +
+  theme(legend.position='none')
+```
+</p>
+</details>
+ 
\ No newline at end of file
-- 
GitLab