Compare revisions

c9537afc · c9537afc · c9537afc · 06d9eb16 · c9537afc · 06d9eb16
--- a/session_1/bioconductor.Rmd
+++ b/session_1/bioconductor.Rmd
---
-title: 'R.1: Installing packages from Bioconductor'
-author: "Laurent Modolo [laurent.modolo@ens-lyon.fr](mailto:laurent.modolo@ens-lyon.fr)"
-date: "2021"
-output:
-  rmdformats::downcute:
-    self_contain: false
-    use_bookdown: true
-    default_style: "dark"
-    lightbox: true
-    css: "../src/style.css"
---
-
-```{r setup, include=FALSE}
-rm(list=ls())
-knitr::opts_chunk$set(echo = TRUE)
-knitr::opts_chunk$set(comment = NA)
-```
-```{r klippy, echo=FALSE, include=TRUE}
-klippy::klippy(
-  position = c('top', 'right'),
-  color = "white",
-  tooltip_message = 'Click to copy',
-  tooltip_success = 'Copied !')
-```
-
-To install packages from [Bioconductor](http://www.bioconductor.org) you need to
\ No newline at end of file
--- a/session_1/example_1.R
+++ b/session_1/example_1.R
-# Does news coverage boost support for presidential candidates in the Democratic primary?
-# https://www.jacob-long.com/post/news-coverage-candidate-support/ 
-
-library(tidyverse)
-library(jtools)
-library(tsibble)
-
-################################ Getting the data ##############################
-cable_mentions <- read_csv("https://github.com/fivethirtyeight/data/raw/master/media-mentions-2020/cable_weekly.csv")
-online_mentions <- read_csv("https://github.com/fivethirtyeight/data/raw/master/media-mentions-2020/online_weekly.csv")
-# Immediately convert `end_date` to date class
-polls <- read_csv("https://projects.fivethirtyeight.com/polls-page/president_primary_polls.csv")
-
-
-candidates <- c("Amy Klobuchar", "Andrew Yang", "Bernard Sanders",
-                "Beto O'Rourke", "Bill de Blasio", "Cory A. Booker",
-                "Elizabeth Warren", "Eric Swalwell", "Jay Robert Inslee",
-                "Joe Sestak", "John Hickenlooper", "John K. Delaney",
-                "Joseph R. Biden Jr.", "Julián Castro", "Kamala D. Harris",
-                "Kirsten E. Gillibrand", "Marianne Williamson",
-                "Michael F. Bennet", "Pete Buttigieg", "Seth Moulton",
-                "Steve Bullock", "Tim Ryan", "Tom Steyer", "Tulsi Gabbard",
-                "Wayne Messam")
-
-candidates_clean <- c("Amy Klobuchar", "Andrew Yang", "Bernie Sanders",
-                      "Beto O'Rourke", "Bill de Blasio", "Cory Booker",
-                      "Elizabeth Warren", "Eric Swalwell", "Jay Inslee",
-                      "Joe Sestak", "John Hickenlooper", "John Delaney",
-                      "Joe Biden", "Julian Castro", "Kamala Harris",
-                      "Kirsten Gillibrand", "Marianne Williamson",
-                      "Michael Bennet", "Pete Buttigieg", "Seth Moulton",
-                      "Steve Bullock", "Tim Ryan", "Tom Steyer",
-                      "Tulsi Gabbard", "Wayne Messam")
-
-
-########################### formating data #####################################
-
-polls <- polls %>%
-  # Convert date to date format
-  mutate(end_date = as.Date(end_date, format = "%m/%d/%y")) %>%
-  filter(
-    # include only polls of at least modest quality
-    fte_grade %in% c("C-", "C", "C+", "B-", "B", "B+", "A-", "A", "A+"),
-    # only include polls ending on or after 12/30/2018
-    end_date >= as.Date("12/30/2018", "%m/%d/%Y"),
-    # only include *Democratic* primary polls
-    party == "DEM",
-    # only include the selected candidates
-    candidate_name %in% candidates,
-    # only national polls
-    is.na(state),
-    # Exclude some head-to-head results, etc.
-    notes %nin% c("head-to-head poll",
-                  "HarrisX/SR Democrat LV, definite voter",
-                  "open-ended question")
-  ) %>%
-  mutate(
-    # Have to add 1 to the date to accommodate tsibble's yearweek()
-    # starting on Monday rather than Sunday like our other data sources
-    week = as.Date(yearweek(end_date + 1)) - 1,
-    # Convert candidate names to factor so I can relabel them
-    candidate_name = factor(candidate_name, levels = candidates, labels = candidates_clean)
-  )
-
-polls_agg <- polls %>%
-  group_by(week, candidate_name) %>%
-  summarize(
-    pct_polls = weighted.mean(pct, log(sample_size))
-  )
-
-library(ggplot2)
-top_candidates <- c("Joe Biden", "Elizabeth Warren", "Bernie Sanders",
-                    "Pete Buttigieg", "Kamala Harris", "Beto O'Rourke",
-                    "Cory Booker")
-ggplot(filter(polls_agg, candidate_name %in% top_candidates),
-       aes(x = week, y = pct_polls, color = candidate_name)) +
-  geom_line() +
-  theme_nice()
-
-
-media <-
-  inner_join(cable_mentions, online_mentions, by = c("date", "name")) %>%
-  mutate(
-    # Create new variables that put the media coverage variables on
-    # same scale as poll numbers
-    pct_cable = 100 * pct_of_all_candidate_clips,
-    pct_online = 100 * pct_of_all_candidate_stories
-  )
-
-top_candidates <- c("Joe Biden", "Elizabeth Warren", "Bernie Sanders",
-                    "Pete Buttigieg", "Kamala Harris", "Beto O'Rourke",
-                    "Cory Booker")
-ggplot(filter(media, name %in% top_candidates),
-       aes(x = date, y = pct_cable, color = name)) +
-  geom_line() +
-  theme_nice()
-
-top_candidates <- c("Joe Biden", "Elizabeth Warren", "Bernie Sanders",
-                    "Pete Buttigieg", "Kamala Harris", "Beto O'Rourke",
-                    "Cory Booker")
-ggplot(filter(media, name %in% top_candidates),
-       aes(x = date, y = pct_online, color = name)) +
-  geom_line() +
-  theme_nice()
-
-
-######################### Combine the data #####################################
-joined <- inner_join(polls_agg, media,
-                     by = c("candidate_name" = "name", "week" = "date"))
-
-library(panelr)
-
-# panel_data needs a number or ordered factor as wave variable
-joined$wave <- as.ordered(joined$week)
-joined_panel <- panel_data(ungroup(joined), id = candidate_name, wave = wave)
-joined_pdata <- as_pdata.frame(joined_panel)
-
-
--- a/session_1/github.Rmd
+++ b/session_1/github.Rmd
---
-title: 'R.1: Installing packages from github'
-author: "Laurent Modolo [laurent.modolo@ens-lyon.fr](mailto:laurent.modolo@ens-lyon.fr)"
-date: "2021"
-output:
-  rmdformats::downcute:
-    self_contain: false
-    use_bookdown: true
-    default_style: "dark"
-    lightbox: true
-    css: "../src/style.css"
---
-
-```{r setup, include=FALSE}
-rm(list=ls())
-knitr::opts_chunk$set(echo = TRUE)
-knitr::opts_chunk$set(comment = NA)
-```
-```{r klippy, echo=FALSE, include=TRUE}
-klippy::klippy(
-  position = c('top', 'right'),
-  color = "white",
-  tooltip_message = 'Click to copy',
-  tooltip_success = 'Copied !')
-```
-
-To install packages from [github](https://github.com/) you need to
\ No newline at end of file
--- a/session_1/session_1.Rmd
+++ b/session_1/session_1.Rmd
--- a/session_1/tp.R
+++ b/session_1/tp.R
--- a/session_2/session_2.Rmd
+++ b/session_2/session_2.Rmd
--- a/session_2/slides.Rmd
+++ b/session_2/slides.Rmd
---
-title: "R#2: introduction to Tidyverse"
-author: "Laurent Modolo [laurent.modolo@ens-lyon.fr](mailto:laurent.modolo@ens-lyon.fr)"
-date: "24 Oct 2019"
-output:
-  slidy_presentation:
-    highlight: tango
-  beamer_presentation:
-    theme: metropolis
-    slide_level: 3
-    fig_caption: no
-    df_print: tibble
-    highlight: tango
-    latex_engine: xelatex
---
-
-```{r setup, include=FALSE, cache=TRUE}
-knitr::opts_chunk$set(echo = FALSE)
-library(tidyverse)
-tmp <- tempfile(fileext = ".zip")
-download.file("http://www.fueleconomy.gov/feg/epadata/vehicles.csv.zip",
-              tmp,
-              quiet = TRUE)
-unzip(tmp, exdir = "data-raw")
-new_class_level <- c(
-  "Compact Cars",
-  "Large Cars",
-  "Midsize Cars",
-  "Midsize Cars",
-  "Midsize Cars",
-  "Compact Cars",
-  "Minivan",
-  "Minivan",
-  "Pickup Trucks",
-  "Pickup Trucks",
-  "Pickup Trucks",
-  "Sport Utility Vehicle",
-  "Sport Utility Vehicle",
-  "Compact Cars",
-  "Special Purpose Vehicle",
-  "Special Purpose Vehicle",
-  "Special Purpose Vehicle",
-  "Special Purpose Vehicle",
-  "Special Purpose Vehicle",
-  "Special Purpose Vehicle",
-  "Sport Utility Vehicle",
-  "Sport Utility Vehicle",
-  "Pickup Trucks",
-  "Pickup Trucks",
-  "Pickup Trucks",
-  "Pickup Trucks",
-  "Sport Utility Vehicle",
-  "Sport Utility Vehicle",
-  "Compact Cars",
-  "Two Seaters",
-  "Vans",
-  "Vans",
-  "Vans",
-  "Vans"
-)
-new_fuel_level <- c(
-  "gas",
-  "Diesel",
-  "Regular",
-  "gas",
-  "gas",
-  "Regular",
-  "Regular",
-  "Hybrid",
-  "Hybrid",
-  "Regular",
-  "Regular",
-  "Hybrid",
-  "Hybrid"
-)
-read_csv("data-raw/vehicles.csv") %>%
-  select(
-    "id",
-    "make",
-    "model",
-    "year",
-    "VClass",
-    "trany",
-    "drive",
-    "cylinders",
-    "displ",
-    "fuelType",
-    "highway08",
-    "city08"
-  ) %>% 
-  rename(
-    "class" = "VClass",
-    "trans" = "trany",
-    "drive" = "drive",
-    "cyl" = "cylinders",
-    "displ" = "displ",
-    "fuel" = "fuelType",
-    "hwy" = "highway08",
-    "cty" = "city08"
-  ) %>%
-  filter(drive != "") %>%
-  drop_na() %>% 
-  arrange(make, model, year) %>%
-  mutate(class = factor(as.factor(class), labels = new_class_level)) %>%
-  mutate(fuel = factor(as.factor(fuel), labels = new_fuel_level)) %>%
-  write_csv("2_data.csv")
-```
-
-## R#2: introduction to Tidyverse
-The goal of this practical is to familiarize yourself with `ggplot2`.
-
-The objectives of this session will be to:
-
- Create basic plot with `ggplot2`
- Understand the `tibble` type
- Learn the different aesthetics in R plots
- Compose graphics
-
-## Tidyverse
-
-The tidyverse is a collection of R packages designed for data science.
-
-All packages share an underlying design philosophy, grammar, and data structures.
-
-```{r install_tidyverse, cache = TRUE, eval = FALSE}
-install.packages("tidyverse")
-```
-
-```{r load_tidyverse, cache = TRUE}
-library("tidyverse")
-```
-
-
-## Toy data set `mpg`
-
-This dataset contains a subset of the fuel economy data that the EPA makes available on **http://fueleconomy.gov**. It contains only models which had a new release every year between 1999 and 2008.
-
-
-```{r mpg_inspect, cache = TRUE, eval=FALSE}
-?mpg
-mpg
-dim(mpg)
-View(mpg)
-```
-
-## Updated version of the data
-
-`mpg` is loaded with tidyverse, we want to be able to read our own data from
-**http://perso.ens-lyon.fr/laurent.modolo/R/2_data.csv**
-
-```{r mpg_download, cache=TRUE, message=FALSE}
-new_mpg <- read_csv(
-  "http://perso.ens-lyon.fr/laurent.modolo/R/2_data.csv"
-  )
-```
-
-**http://perso.ens-lyon.fr/laurent.modolo/R/2_a**
-
-## First plot with `ggplot2`
-
-Relationship between engine size `displ` and fuel efficiency `hwy`.
-```{r new_mpg_plot_a, cache = TRUE, fig.width=8, fig.height=4.5}
-ggplot(data = new_mpg) + 
-  geom_point(mapping = aes(x = displ, y = hwy))
-```
-
-## Composition of plot with `ggplot2`
-
-Composition of plot with `ggplot2`
-
-```R
-ggplot(data = <DATA>) + 
-  <GEOM_FUNCTION>(mapping = aes(<MAPPINGS>))
-```
-
- you begin a plot with the function `ggplot()`
- you complete your graph by adding one or more layers
- `geom_point()` adds a layer with a scatterplot
- each geom function in `ggplot2` takes a `mapping` argument
- the `mapping` argument is always paired with `aes()`
-
-
-## First challenge
-
- Run `ggplot(data = new_mpg)`. What do you see?
- How many rows are in `new_mpg`? How many columns?
- What does the `cty` variable describe? Read the help for `?mpg` to find out.
- Make a scatterplot of `hwy` vs. `cyl`.
- What happens if you make a scatterplot of `class` vs. `drive`? Why is the plot not useful?
-
-## Run `ggplot(data = mpg)`. What do you see?
-
-```{r empty_plot, cache = TRUE, fig.width=8, fig.height=4.5}
-ggplot(data = new_mpg)
-```
-
-## How many rows are in `new_mpg`? How many columns?
-
-```{r size_of_mpg, cache = TRUE, fig.width=8, fig.height=4.5}
-new_mpg
-```
-
-## Make a scatterplot of `hwy` vs. `cyl`.
-
-```{r new_mpg_plot_b, cache = TRUE, fig.width=8, fig.height=4.5}
-ggplot(data = new_mpg) + 
-  geom_point(mapping = aes(x = hwy, y = cyl))
-```
-
-## What happens if you make a scatterplot of `class` vs. `drive`?
-Why is the plot not useful?
-
-```{r new_mpg_plot_c, cache = TRUE, fig.width=8, fig.height=4.5}
-ggplot(data = new_mpg) + 
-  geom_point(mapping = aes(x = class, y = drive))
-```
-
-## Aesthetic mappings
-
-How can you explain these cars?
-
-```{r new_mpg_plot_d, echo = FALSE, cache = TRUE, fig.width=8, fig.height=4.5}
-ggplot(data = mpg) + 
-  geom_point(mapping = aes(x = displ, y = hwy)) +
-  geom_point(data = mpg %>% filter(class == "2seater"),
-             mapping = aes(x = displ, y = hwy), color = "red")
-```
-
-##  Aesthetic mapping `color`
-
-```{r new_mpg_plot_e, cache = TRUE, fig.width=8, fig.height=4.5}
-ggplot(data = mpg) + 
-  geom_point(mapping = aes(x = displ, y = hwy, color = class))
-```
-
-##  Aesthetic mappings
-
-`ggplot2` will automatically assign a unique level of the aesthetic (here a unique color) to each unique value of the variable, a process known as scaling. `ggplot2` will also add a legend that explains which levels correspond to which values.
-
-Try the following aesthetic:
-
- `size`
- `alpha`
- `shape`
-
-##  Aesthetic mapping `size`
-
-```{r new_mpg_plot_f, cache = TRUE, fig.width=8, fig.height=4.5, warning=FALSE}
-ggplot(data = mpg) + 
-  geom_point(mapping = aes(x = displ, y = hwy, size = class))
-```
-
-##  Aesthetic mapping `alpha`
-
-```{r new_mpg_plot_g, cache = TRUE, fig.width=8, fig.height=4.5, warning=FALSE}
-ggplot(data = mpg) + 
-  geom_point(mapping = aes(x = displ, y = hwy, alpha = class))
-```
-
-##  Aesthetic mapping `shape`
-
-```{r new_mpg_plot_h, cache = TRUE, fig.width=8, fig.height=4.5, warning=FALSE}
-ggplot(data = mpg) + 
-  geom_point(mapping = aes(x = displ, y = hwy, shape = class))
-```
-
-##  Aesthetic
-
-You can also set the aesthetic properties of your geom manually. For example, we can make all of the points in our plot blue:
-
-```{r new_mpg_plot_i, cache = TRUE, fig.width=8, fig.height=4.5}
-ggplot(data = mpg) + 
-  geom_point(mapping = aes(x = displ, y = hwy), color = "blue")
-```
-
-## Second challenge
-
- What’s gone wrong with this code? Why are the points not blue?
-
-```R
-ggplot(data = mpg) + 
-  geom_point(mapping = aes(x = displ, y = hwy, color = "blue"))
-```
-
- Which variables in `mpg` are **categorical**? Which variables are **continuous**? (Hint: type `mpg`)
- Map a **continuous** variable to color, size, and shape.
- What does the `stroke` aesthetic do? What shapes does it work with? (Hint: use ?geom_point)
- What happens if you map an aesthetic to something other than a variable name, like `color = displ < 5`?
-
-## Facets
-
-```{r new_mpg_plot_j, cache = TRUE, fig.width=8, fig.height=4.5}
-ggplot(data = mpg) + 
-  geom_point(mapping = aes(x = displ, y = hwy)) + 
-  facet_wrap(~class)
-```
-
-## Facets
-
-```{r new_mpg_plot_k, cache = TRUE, fig.width=8, fig.height=4.5}
-ggplot(data = mpg) + 
-  geom_point(mapping = aes(x = displ, y = hwy)) + 
-  facet_wrap(~class, nrow = 2)
-```
-
-## Facets
-
-```{r new_mpg_plot_l, cache = TRUE, fig.width=8, fig.height=4.5}
-ggplot(data = mpg) + 
-  geom_point(mapping = aes(x = displ, y = hwy)) + 
-  facet_wrap(~ fl + class, nrow = 2)
-```
-
-## Composition
-
-There are different ways to represent the information
-
-```{r new_mpg_plot_o, cache = TRUE, fig.width=8, fig.height=4.5}
-ggplot(data = mpg) + 
-  geom_point(mapping = aes(x = displ, y = hwy))
-```
-
-## Composition
-
-There are different ways to represent the information
-
-```{r new_mpg_plot_p, cache = TRUE, fig.width=8, fig.height=4.5, message=FALSE}
-ggplot(data = mpg) + 
-  geom_smooth(mapping = aes(x = displ, y = hwy))
-```
-
-
-## Composition
-
-We can add as many layers as we want
-
-```{r new_mpg_plot_q, cache = TRUE, fig.width=8, fig.height=4.5, message=FALSE}
-ggplot(data = mpg) + 
-  geom_point(mapping = aes(x = displ, y = hwy)) +
-  geom_smooth(mapping = aes(x = displ, y = hwy))
-```
-
-
-## Composition
-
-We can avoid code duplication
-
-```{r new_mpg_plot_r, cache = TRUE, fig.width=8, fig.height=4.5, message=FALSE}
-ggplot(data = mpg, mapping = aes(x = displ, y = hwy)) + 
-  geom_point() +
-  geom_smooth()
-```
-
-
-## Composition
-
-We can make `mapping` layer specific
-
-```{r new_mpg_plot_s, cache = TRUE, fig.width=8, fig.height=4.5, message=FALSE}
-ggplot(data = mpg, mapping = aes(x = displ, y = hwy)) + 
-  geom_point(mapping = aes(color = class)) +
-  geom_smooth()
-```
-
-## Composition
-
-We can use different `data` for different layers (you will learn more about `filter()` later)
-
-```{r new_mpg_plot_t, cache = TRUE, fig.width=8, fig.height=4.5, message=FALSE}
-ggplot(data = mpg, mapping = aes(x = displ, y = hwy)) + 
-  geom_point(mapping = aes(color = class)) +
-  geom_smooth(data = filter(mpg, class == "subcompact"))
-```
-
-## Third challenge
-
- Run this code in your head and predict what the output will look like. Then, run the code in R and check your predictions.
-```R
-ggplot(data = mpg, mapping = aes(x = displ, y = hwy, color = drv)) + 
-  geom_point() + 
-  geom_smooth(se = FALSE)
-```
-**http://perso.ens-lyon.fr/laurent.modolo/R/2_d**
-
- What does `show.legend = FALSE` do?
- What does the `se` argument to `geom_smooth()` do?
-
-## Third challenge
-
- Recreate the R code necessary to generate the following graph
-
-```{r new_mpg_plot_u, echo = FALSE, cache = TRUE, fig.width=8, fig.height=4.5, message=FALSE}
-ggplot(data = mpg, mapping = aes(x = displ, y = hwy, color = drv)) + 
-  geom_point() +
-  geom_smooth(mapping = aes(linetype = drv))
-```
-
-## Third challenge
-
-```{r new_mpg_plot_v, cache = TRUE, fig.width=8, fig.height=4.5, message=FALSE}
-ggplot(data = mpg, mapping = aes(x = displ, y = hwy, color = drv)) + 
-  geom_point() +
-  geom_smooth(mapping = aes(linetype = drv))
-```
\ No newline at end of file
--- a/session_2/tp.R
+++ b/session_2/tp.R
--- a/session_2/tp.md
+++ b/session_2/tp.md
--- a/session_3/gapminder.xlsx
+++ b/session_3/gapminder.xlsx
--- a/session_3/HTML_tuto_s3.Rmd
+++ b/session_3/HTML_tuto_s3.Rmd
 ---
-title: 'R#3: Transformations with ggplot2'
+title: 'R.3: Transformations with ggplot2'
 author: "Laurent Modolo [laurent.modolo@ens-lyon.fr](mailto:laurent.modolo@ens-lyon.fr), Hélène Polvèche [hpolveche@istem.fr](mailto:hpolveche@istem.fr)"
-date: "Mars 2020"
-output:
-  html_document: default
-  pdf_document: default
+date: "2022"
 ---
-<style type="text/css">
-h3 { /* Header 3 */
-  position: relative ;
-  color: #729FCF ;
-  left: 5%;
-}
-h2 { /* Header 2 */
-  color: darkblue ;
-  left: 10%;
-} 
-h1 { /* Header 1 */
-  color: #034b6f ;
-} 
-#pencadre{
-  border:1px; 
-  border-style:solid; 
-  border-color: #034b6f; 
-  background-color: #EEF3F9; 
-  padding: 1em;
-  text-align: center ;
-  border-radius : 5px 4px 3px 2px;
-}
-legend{
-  color: #034b6f ;
-}
-#pquestion {
-  color: darkgreen;
-  font-weight: bold;
-}
-</style>
-
-```{r setup, include=FALSE, cache=TRUE}
+
+```{r include=FALSE}
+library(fontawesome)
+``` 
+
+```{r setup, include=FALSE}
+rm(list=ls())
 knitr::opts_chunk$set(echo = TRUE)
+knitr::opts_chunk$set(comment = NA)
 ```

-The goal of this practical is to practices advanced features of `ggplot2`.
+## Introduction
+
+In the last session, we have seen how to use `ggplot2` and [The Grammar of Graphics](https://www.amazon.com/Grammar-Graphics-Statistics-Computing/dp/0387245448/ref=as_li_ss_tl). The goal of this practical is to practice more advanced features of `ggplot2`.

 The objectives of this session will be to:

@@ -49,71 +24,74 @@ The objectives of this session will be to:
 - practices position adjustments
 - change the coordinate systems

- \ 
- 
-# `ggplot2` statistical transformations

- \ 
- 
+The first step is to load the `tidyverse`.
+
+<details><summary>Solution</summary>
+<p>
 ```{r packageloaded, include=TRUE, message=FALSE}
 library("tidyverse")
 ```
+</p>
+</details>

- \ 
+Like in the previous sessions, it's good practice to create a new **.R** file to write your code instead of using the R terminal directly.
 
+## `ggplot2` statistical transformations
+
+In the previous session, we have plotted the data as they are by using the variable values as **x** or **y** coordinates, color shade, size or transparency.
+When dealing with categorical variables, also called **factors**, it can be interesting to perform some simple statistical transformations.
+For example, we may want to have coordinates on an axis proportional to the number of records for a given category.
+
 We are going to use the `diamonds` data set included in `tidyverse`.

- Use the `help` and `view` command to explore this data set.
+<div class="pencadre">
+
+- Use the `help` and `View` command to explore this data set.
+- How many records does this dataset contain ?
 - Try the `str` command: what information is displayed ?

-```R
+</div>
+
+```{r str_diamon}
 str(diamonds)
 ```

-```
-## Classes 'tbl_df', 'tbl' and 'data.frame':    53940 obs. of  10 variables:
-##  $ carat  : num  0.23 0.21 0.23 0.29 0.31 0.24 0.24 0.26 0.22 0.23 ...
-##  $ cut    : Ord.factor w/ 5 levels "Fair"<"Good"<..: 5 4 2 4 2 3 3 3 1 3 ...
-##  $ color  : Ord.factor w/ 7 levels "D"<"E"<"F"<"G"<..: 2 2 2 6 7 7 6 5 2 5 ...
-##  $ clarity: Ord.factor w/ 8 levels "I1"<"SI2"<"SI1"<..: 2 3 5 4 2 6 7 3 4 5 ...
-##  $ depth  : num  61.5 59.8 56.9 62.4 63.3 62.8 62.3 61.9 65.1 59.4 ...
-##  $ table  : num  55 61 65 58 58 57 57 55 61 61 ...
-##  $ price  : int  326 326 327 334 335 336 336 337 337 338 ...
-##  $ x      : num  3.95 3.89 4.05 4.2 4.34 3.94 3.95 4.07 3.87 4 ...
-##  $ y      : num  3.98 3.84 4.07 4.23 4.35 3.96 3.98 4.11 3.78 4.05 ...
-##  $ z      : num  2.43 2.31 2.31 2.63 2.75 2.48 2.47 2.53 2.49 2.39 ...
-```
+### Introduction to `geom_bar`

- \ 
- 
-We saw scatterplot (`geom_point()`), smoothplot (`geom_smooth()`). Now barplot with `geom_bar()` : 
+We saw scatterplot (`geom_point()`), smoothplot (`geom_smooth()`).
+Now barplot with `geom_bar()` : 

-```{r diamonds_barplot, cache = TRUE, fig.width=8, fig.height=4.5, message=FALSE}
+```{r diamonds_barplot, cache = TRUE, fig.width=8, fig.height=4.5}
 ggplot(data = diamonds, mapping = aes(x = cut)) + 
  geom_bar()
 ```

 More diamonds are available with high quality cuts.

-On the x-axis, the chart displays cut, a variable from diamonds. On the y-axis, it displays count, but count is not a variable in diamonds!
+On the x-axis, the chart displays **cut**, a variable from diamonds. On the y-axis, it displays **count**, **but count is not a variable in diamonds!**

-The algorithm used to calculate new values for a graph is called a **stat**, short for statistical transformation. The figure below describes how this process works with `geom_bar()`.

-![](img/visualization-stat-bar.png)
+### **geom** and **stat**

+The algorithm used to calculate new values for a graph is called a **stat**, short for statistical transformation.
+The figure below describes how this process works with `geom_bar()`.
+
+![](img/visualization-stat-bar.png)

-You can generally use geoms and stats interchangeably. For example, you can recreate the previous plot using `stat_count()` instead of `geom_bar()`:
+You can generally use **geoms** and **stats** interchangeably. For example, you can recreate the previous plot using `stat_count()` instead of `geom_bar()`:

-```{r diamonds_stat_count, include=TRUE, fig.width=8, fig.height=4.5, message=FALSE}
+```{r diamonds_stat_count, include=TRUE, fig.width=8, fig.height=4.5}
 ggplot(data = diamonds, mapping = aes(x = cut)) + 
  stat_count()
 ```

- \ 
+Every **geom** has a default **stat**; and every **stat** has a default **geom**. This means that you can typically use **geoms** without worrying about the underlying statistical transformation. There are three reasons you might need to use a **stat** explicitly:

-Every geom has a default stat; and every stat has a default geom. This means that you can typically use geoms without worrying about the underlying statistical transformation. There are three reasons you might need to use a stat explicitly:
+### Why **stat** ?

- You might want to override the default stat. 
+You might want to override the default stat.
+For example, in the following `demo` dataset we already have a variable for the **counts** per `cut`.

 ```{r 3_a, include=TRUE, fig.width=8, fig.height=4.5}
 demo <- tribble(
@@ -124,43 +102,72 @@ demo <- tribble(
  "Premium",    13791,
  "Ideal",      21551
 )
+```
+
+(Don't worry that you haven't seen `tribble()` before. You might be able
+to guess at its meaning from the context, and you will learn exactly what
+it does soon!)

-# (Don't worry that you haven't seen <- or tribble() before. You might be able
-# to guess at their meaning from the context, and you will learn exactly what
-# they do soon!)
+<div class="pencadre">
+So instead of using the default `geom_bar` parameter `stat = "count"` try to use `"identity"`
+</div>

+<details><summary>Solution</summary>
+<p>
+```{r 3_ab, include=TRUE, fig.width=8, fig.height=4.5}
 ggplot(data = demo, mapping = aes(x = cut, y = freq)) +
  geom_bar(stat = "identity")
-
 ```
+</p>
+</details>
+
+You might want to override the default mapping from transformed variables to aesthetics ( e.g., proportion). 

- You might want to override the default mapping from transformed variables to aesthetics ( e.g. proportion). 
 ```{r 3_b, include=TRUE, fig.width=8, fig.height=4.5}
 ggplot(data = diamonds, mapping = aes(x = cut, y = ..prop.., group = 1)) + 
  geom_bar()
 ```
  
- In our proportion bar chart, we need to set `group = 1`. Why?
+<div class="pencadre">
+In our proportion bar chart, we need to set `group = 1`. Why?
+</div>

+<details><summary>Solution</summary>
+<p>
 ```{r diamonds_stats_challenge, include=TRUE, message=FALSE, fig.width=8, fig.height=4.5}
 ggplot(data = diamonds, mapping = aes(x = cut, y = ..prop..)) + 
  geom_bar()
 ```

-If group is not used, the proportion is calculated with respect to the data that contains that field and is ultimately going to be 100% in any case. For instance, The proportion of an ideal cut in the ideal cut specific data will be 1.
+If group is not used, the proportion is calculated with respect to the data that contains that field and is ultimately going to be 100% in any case. For instance, the proportion of an ideal cut in the ideal cut specific data will be 1.
+</p>
+</details>

- \ 
- 
- You might want to draw greater attention to the statistical transformation in your code. 
+### More details with `stat_summary`
+
+<div class="pencadre">
+You might want to draw greater attention to the statistical transformation in your code. 
+You might use `stat_summary()`, which summarizes the **y** values for each unique **x**
+value, to draw attention to the summary that you are computing.
+</div>

+<details><summary>Solution</summary>
+<p>
 ```{r 3_c, include=TRUE, fig.width=8, fig.height=4.5, message=FALSE}
-# you might use stat_summary(), which summarises the y values for each unique x
-# value, to draw attention to the summary that you are computing:

 ggplot(data = diamonds, mapping = aes(x = cut, y = depth)) + 
  stat_summary()
+```
+</p>
+</details>

-  
+<div class="pencadre">
+Set the `fun.min`, `fun.max` and `fun` to the `min`, `max` and `median` functions, respectively.
+</div>
+
+<details><summary>Solution</summary>
+<p>
+```{r 3_d, include=TRUE, fig.width=8, fig.height=4.5, message=FALSE}
 ggplot(data = diamonds, mapping = aes(x = cut, y = depth)) + 
  stat_summary(
    fun.min = min,
@@ -168,64 +175,80 @@ ggplot(data = diamonds, mapping = aes(x = cut, y = depth)) +
    fun = median
  )
 ```
+</p>
+</details>

+## Coloring area plots

-# Position adjustments
-
- \ 
- 
-You can colour a bar chart using either the `color` aesthetic, 
+<div class="pencadre">
+You can color a bar chart using either the `color` aesthetic or, more usefully, `fill`.
+Try both solutions on a `cut` histogram.
+</div>

+<details><summary>Solution</summary>
+<p>
 ```{r diamonds_barplot_color, cache = TRUE, fig.width=8, fig.height=4.5, message=FALSE}
 ggplot(data = diamonds, mapping = aes(x = cut, color = cut)) + 
  geom_bar()
 ```

- \ 
-
-or, more usefully, `fill`:
-
 ```{r diamonds_barplot_fill, cache = TRUE, fig.width=8, fig.height=4.5, message=FALSE}
 ggplot(data = diamonds, mapping = aes(x = cut, fill = cut)) + 
  geom_bar()
 ```
+</p>
+</details>

-
-
+<div class="pencadre">
 You can also use `fill` with another variable:
+Try to color by `clarity`. Is `clarity` a continuous or categorical variable ?
+</div>

+<details><summary>Solution</summary>
+<p>
 ```{r diamonds_barplot_fill_clarity, cache = TRUE, fig.width=8, fig.height=4.5, message=FALSE}
 ggplot(data = diamonds, mapping = aes(x = cut, fill = clarity)) + 
  geom_bar()
 ```
+</p>
+</details>

+## Position adjustments

+The stacking of the `fill` parameter is performed by the position adjustment `position`

-The stacking is performed by the position adjustment `position`
+<div class="pencadre">
+Try the following `position` parameter for `geom_bar`: `"fill"`, `"dodge"` and `"jitter"`
+</div>

-### fill

+<details><summary>Solution</summary>
+<p>
 ```{r diamonds_barplot_pos_fill, cache = TRUE, fig.width=8, fig.height=4.5, message=FALSE}
 ggplot(data = diamonds, mapping = aes(x = cut, fill = clarity)) + 
  geom_bar( position = "fill")
 ```

-### dodge
-
 ```{r diamonds_barplot_pos_dodge, cache = TRUE, fig.width=8, fig.height=4.5, message=FALSE}
 ggplot(data = diamonds, mapping = aes(x = cut, fill = clarity)) + 
  geom_bar( position = "dodge")
 ```

-### jitter
-
 ```{r diamonds_barplot_pos_jitter, cache = TRUE, fig.width=8, fig.height=4.5, message=FALSE}
 ggplot(data = diamonds, mapping = aes(x = cut, fill = clarity)) + 
  geom_bar( position = "jitter")
 ```
+</p>
+</details>

+`jitter` is often used for plotting points when they are stacked on top of each other.

+<div class="pencadre">
+Compare `geom_point` to `geom_jitter`: plot `cut` versus `depth` and color by `clarity`.
+</div>

+<details><summary>Solution</summary>
+<p>
 ```{r dia_jitter2, cache = TRUE, fig.width=8, fig.height=4.5, message=FALSE}
 ggplot(data = diamonds, mapping = aes(x = cut, y = depth, color = clarity)) + 
  geom_point()
@@ -235,64 +258,166 @@ ggplot(data = diamonds, mapping = aes(x = cut, y = depth, color = clarity)) +
 ggplot(data = diamonds, mapping = aes(x = cut, y = depth, color = clarity)) + 
  geom_jitter()
 ```
+</p>
+</details>
+
+<div class="pencadre">
+What parameters of `geom_jitter` control the amount of jittering ?
+</div>
+
+<details><summary>Solution</summary>
+<p>
+```{r dia_jitter4, cache = TRUE, fig.width=8, fig.height=4.5, message=FALSE}
+ggplot(data = diamonds, mapping = aes(x = cut, y = depth, color = clarity)) + 
+  geom_jitter(width = .1, height = .1)
+```
+</p>
+</details>

-### violin
+In the `geom_jitter` plot that we made, we cannot really see the limits of the different clarity groups. Instead we can use the `geom_violin` to see their density.

+<details><summary>Solution</summary>
+<p>
 ```{r dia_violon, cache = TRUE, fig.width=8, fig.height=4.5, message=FALSE}
 ggplot(data = diamonds, mapping = aes(x = cut, y = depth, color = clarity)) + 
  geom_violin()
 ```
+</p>
+</details>

-
-# Coordinate systems
+## Coordinate systems

 Cartesian coordinate system where the x and y positions act independently to determine the location of each point. There are a number of other coordinate systems that are occasionally helpful.

-
 ```{r dia_boxplot, cache = TRUE, fig.width=8, fig.height=4.5, message=FALSE}
 ggplot(data = diamonds, mapping = aes(x = cut, y = depth, color = clarity)) + 
  geom_boxplot()
 ```

+<div class="pencadre">
+Add the `coord_flip()` layer to the previous plot
+</div>

-
+<details><summary>Solution</summary>
+<p>
 ```{r dia_boxplot_flip, cache = TRUE, fig.width=8, fig.height=4.5, message=FALSE}
 ggplot(data = diamonds, mapping = aes(x = cut, y = depth, color = clarity)) + 
  geom_boxplot() +
  coord_flip()
 ```
+</p>
+</details>

+<div class="pencadre">
+Add the `coord_polar()` layer to this  plot:

-```{r dia_12, cache = TRUE, fig.width=8, fig.height=4.5, message=FALSE}
-ggplot(data = diamonds, mapping = aes(x = depth, y = table)) + 
-  geom_point() +
-  geom_abline()
+```{r diamonds_bar, cache = TRUE, fig.width=8, fig.height=4.5, message=FALSE, eval=FALSE}
+ggplot(data = diamonds, mapping = aes(x = cut, fill = cut)) + 
+  geom_bar( show.legend = FALSE,  width = 1 ) + 
+  theme(aspect.ratio = 1) +
+  labs(x = NULL, y = NULL)
 ```
+</div>

-
-```{r dia_quickmap, cache = TRUE, fig.width=8, fig.height=4.5, message=FALSE}
-ggplot(data = diamonds, mapping = aes(x = depth, y = table)) + 
-  geom_point() +
-  geom_abline() +
-  coord_quickmap()
+<details><summary>Solution</summary>
+<p>
+```{r diamonds_bar2, cache = TRUE, fig.width=8, fig.height=4.5, message=FALSE}
+ggplot(data = diamonds, mapping = aes(x = cut, fill = cut)) + 
+  geom_bar( show.legend = FALSE,  width = 1 ) + 
+  theme(aspect.ratio = 1) +
+  labs(x = NULL, y = NULL) +
+  coord_polar()
 ```
+</p>
+</details>

+By combining the right **geom**, **coordinates** and **faceting** functions, you can build a large number of different plots to present your results.

+## See you in [R.4: data transformation](/session_4/session_4.html)

+## To go further:  animated plots from xls files

-```{r diamonds_bar, cache = TRUE, fig.width=8, fig.height=4.5, message=FALSE}
-bar <- ggplot(data = diamonds, mapping = aes(x = cut, fill = cut)) + 
-  geom_bar( show.legend = FALSE,  width = 1 ) + 
-  theme(aspect.ratio = 1) +
-  labs(x = NULL, y = NULL)
+In order to read information from an xlsx file, we will use the `openxlsx` package. To generate animations, we will use the `gganimate` package. The additional `gifski` package will allow R to save your animation in the GIF format (Graphics Interchange Format).
+
+```{r install_readxl, eval=F}
+install.packages(c("openxlsx", "gganimate", "gifski"))
+```
+```{r load_readxl}
+library(openxlsx)
+library(gganimate)
+library(gifski)
+```
+
+<div class="pencadre">
+Use the `openxlsx` package to save the [https://can.gitbiopages.ens-lyon.fr/R_basis/session_3/gapminder.xlsx](https://can.gitbiopages.ens-lyon.fr/R_basis/session_3/gapminder.xlsx) file to the `gapminder` variable
+</div>
+
+<details><summary>Solution</summary>
+<p>
+2 solutions :

-bar
+Use directly the url
+```{r load_xlsx_url, eval = F}
+gapminder <- read.xlsx("https://can.gitbiopages.ens-lyon.fr/R_basis/session_3/gapminder.xlsx")
 ```

+Download the file, save it in the same directory as your script, then use the local path
+```{r load_xlsx}
+gapminder <- read.xlsx("gapminder.xlsx")
+```
+
+</p>
+</details>
+
+This dataset contains 4 variables of interest for us to display per country:
+
+- `gdpPercap` the GDP per capita (US$, inflation-adjusted)
+- `lifeExp` the life expectancy at birth, in years
+- `pop` the population size
+- `continent` a factor with 5 levels
+
+<div class="pencadre">
+Using `ggplot2`, build a scatterplot of the `gdpPercap` vs `lifeExp`. Add the `pop` and `continent` information to this plot.
+</div>

-```{r diamonds_bar_polar, cache = TRUE, fig.width=8, fig.height=4.5, message=FALSE}
-bar + coord_polar()
+<details><summary>Solution</summary>
+<p>
+```{r gapminder_plot_a}
+ggplot(gapminder, aes(gdpPercap, lifeExp, size = pop, color = continent)) +
+  geom_point()
 ```
+</p>
+</details>
+
+<div class="pencadre">
+What's wrong ?
+You can use the `scale_x_log10()` to display the `gdpPercap` on the `log10` scale.
+</div>


-##See you to Session#4 : "data transformation"
\ No newline at end of file
+<details><summary>Solution</summary>
+<p>
+```{r gapminder_plot_b}
+ggplot(gapminder, aes(gdpPercap, lifeExp, size = pop, color = continent)) +
+  geom_point() + 
+  scale_x_log10()
+```
+</p>
+</details>
+
+<div class="pencadre">
+We would like to add the `year` information to the plots. We could use a `facet_wrap`, but instead we are going to use the `gganimate` package.
+
+For this, we need to add to our plot a `transition_time` layer that takes `year` as its argument.
+</div>
+
+<details><summary>Solution</summary>
+<p>
+```{r gapminder_plot_c}
+ggplot(gapminder, aes(gdpPercap, lifeExp, size = pop, color = continent)) +
+  geom_point() + 
+  scale_x_log10() +
+  transition_time(year) +
+  labs(title = 'Year: {as.integer(frame_time)}')
+```
+</p>
+</details>
\ No newline at end of file
--- a/session_3/slides.Rmd
+++ b/session_3/slides.Rmd
---
-title: "R#3: stats with ggplot2"
-author: "Laurent Modolo [laurent.modolo@ens-lyon.fr](mailto:laurent.modolo@ens-lyon.fr)"
-date: "08 Nov 2019"
-output:
-  slidy_presentation:
-    highlight: tango
-  beamer_presentation:
-    theme: metropolis
-    slide_level: 3
-    fig_caption: no
-    df_print: tibble
-    highlight: tango
-    latex_engine: xelatex
---
-
-```{r setup, include=FALSE, cache=TRUE}
-knitr::opts_chunk$set(echo = FALSE)
-library(tidyverse)
-```
-
-## R#3: stats with ggplot2
-The goal of this practical is to practices advanced features of `ggplot2`.
-
-The objectives of this session will be to:
-
- learn about statistical transformations
- practices position adjustments
- change the coordinate systems
-
-## `ggplot2` statistical transformations
-
-We are going to use the `diamonds` data set included in `tidyverse`.
-
- Use the `help` and `View` command to explore this data set.
- Try the `str` command, which information are displayed ?
-
-## `ggplot2` statistical transformations
-
-```{r diamonds_barplot, cache = TRUE, fig.width=8, fig.height=4.5, message=FALSE}
-ggplot(data = diamonds) + 
-  geom_bar(mapping = aes(x = cut))
-```
-
-More diamonds are available with high quality cuts.
-
-## `ggplot2` statistical transformations
-
-On the x-axis, the chart displays cut, a variable from diamonds. On the y-axis, it displays count, but count is not a variable in diamonds!
-
-The algorithm used to calculate new values for a graph is called a **stat**, short for statistical transformation. The figure below describes how this process works with `geom_bar()`.
-
-\includegraphics[width=\textwidth]{img/visualization-stat-bar.png}
-
-## `ggplot2` statistical transformations
-
-You can generally use geoms and stats interchangeably. For example, you can recreate the previous plot using `stat_count()` instead of `geom_bar()`:
-
-```{r diamonds_stat_count, eval=FALSE, message=FALSE}
-ggplot(data = diamonds) + 
-  stat_count(mapping = aes(x = cut))
-```
-
-## `ggplot2` statistical transformations
-
-Every geom has a default stat; and every stat has a default geom. This means that you can typically use geoms without worrying about the underlying statistical transformation. There are three reasons you might need to use a stat explicitly:
-
- You might want to override the default stat. **3_a**
- You might want to override the default mapping from transformed variables to aesthetics. **3_b**
- You might want to draw greater attention to the statistical transformation in your code. **3_c**
-
-## Statistical transformation challenge
-
- What does `geom_col()` do? How is it different to `geom_bar()`?
- What variables does `stat_smooth()` compute? What parameters control its behaviour?
- In our proportion bar chart, we need to set `group = 1`. Why? In other words what is the problem with these two graphs?
-
-```{r diamonds_stats_challenge, eval=FALSE, message=FALSE}
-ggplot(data = diamonds) + 
-  geom_bar(mapping = aes(x = cut, y = ..prop..))
-ggplot(data = diamonds) + 
-  geom_bar(mapping = aes(x = cut, fill = color, y = ..prop..))
-```
-
-## Position adjustments
-You can colour a bar chart using either the `colour` aesthetic, or, more usefully, `fill`:
-
-```{r diamonds_barplot_color, cache = TRUE, fig.width=8, fig.height=4.5, message=FALSE}
-ggplot(data = diamonds) + 
-  geom_bar(mapping = aes(x = cut, colour = cut))
-```
-
-## Position adjustments
-You can colour a bar chart using either the `colour` aesthetic, or, more usefully, `fill`:
-
-```{r diamonds_barplot_fill, cache = TRUE, fig.width=8, fig.height=4.5, message=FALSE}
-ggplot(data = diamonds) + 
-  geom_bar(mapping = aes(x = cut, fill = cut))
-```
-
-## Position adjustments
-
-You can also use `fill` with another variable:
-
-```{r diamonds_barplot_fill_clarity, cache = TRUE, fig.width=8, fig.height=4.5, message=FALSE}
-ggplot(data = diamonds) + 
-  geom_bar(mapping = aes(x = cut, fill = clarity))
-```
-
-## Position adjustments
-
-The stacking is performed by the position adjustment `position`
-
-```{r diamonds_barplot_pos_identity, cache = TRUE, fig.width=8, fig.height=4.5, message=FALSE}
-ggplot(data = diamonds,
-       mapping = aes(x = cut, colour = clarity)) + 
-  geom_bar(fill = NA, position = "identity")
-```
-
-## Position adjustments
-
-The stacking is performed by the position adjustment `position`
-
-```{r diamonds_barplot_pos_fill, cache = TRUE, fig.width=8, fig.height=4.5, message=FALSE}
-ggplot(data = diamonds) + 
-  geom_bar(mapping = aes(x = cut, fill = clarity),
-           position = "fill")
-```
-
-## Position adjustments
-
-The stacking is performed by the position adjustment `position`
-
-```{r diamonds_barplot_pos_dodge, cache = TRUE, fig.width=8, fig.height=4.5, message=FALSE}
-ggplot(data = diamonds) + 
-  geom_bar(mapping = aes(x = cut, fill = clarity),
-           position = "dodge")
-```
-
-## Position adjustments
-
-The stacking is performed by the position adjustment `position`
-
-```{r mpg_point_pos_jitter, cache = TRUE, fig.width=8, fig.height=4.5, message=FALSE}
-ggplot(data = mpg) + 
-  geom_point(mapping = aes(x = displ, y = hwy),
-             position = "jitter")
-```
-
-## Position adjustments
-
-The stacking is performed by the position adjustment `position`
-
-```{r mpg_jitter, cache = TRUE, fig.width=8, fig.height=4.5, message=FALSE}
-ggplot(data = mpg) + 
-  geom_jitter(mapping = aes(x = displ, y = hwy))
-```
-
-## Position adjustments challenges
-
- What is the problem with this plot? How could you improve it?
-```{r mpg_point, eval=F, message=FALSE}
-ggplot(data = mpg, mapping = aes(x = cty, y = hwy)) + 
-  geom_point()
-```
- What parameters to `geom_jitter()` control the amount of jittering?
- Compare and contrast `geom_jitter()` with `geom_count()`
- What’s the default position adjustment for `geom_boxplot()` ? Create a visualisation of the `mpg` dataset that demonstrates it.
-
-## Coordinate systems
-
-Cartesian coordinate system where the x and y positions act independently to determine the location of each point. There are a number of other coordinate systems that are occasionally helpful.
-
-## Coordinate systems
-
-```{r mpg_boxplot, cache = TRUE, fig.width=8, fig.height=4.5, message=FALSE}
-ggplot(data = mpg, mapping = aes(x = class, y = hwy)) + 
-  geom_boxplot()
-```
-
-## Coordinate systems
-
-```{r mpg_boxplot_flip, cache = TRUE, fig.width=8, fig.height=4.5, message=FALSE}
-ggplot(data = mpg, mapping = aes(x = class, y = hwy)) + 
-  geom_boxplot() +
-  coord_flip()
-```
-
-## Coordinate systems
-
-```{r diamonds_bar, cache = TRUE, fig.width=8, fig.height=4.5, message=FALSE}
-bar <- ggplot(data = diamonds) + 
-  geom_bar(
-    mapping = aes(x = cut, fill = cut), 
-    show.legend = FALSE,
-    width = 1
-  ) + 
-  theme(aspect.ratio = 1) +
-  labs(x = NULL, y = NULL)
-```
-**3_d**
-
-## Coordinate systems
-
-```{r diamonds_bar_plot, echo=F, cache = TRUE, fig.width=8, fig.height=4.5, message=FALSE}
-bar
-```
-
-**3_d**
-
-## Coordinate systems
-```{r diamonds_bar_flip, cache = TRUE, fig.width=8, fig.height=4.5, message=FALSE}
-bar + coord_flip()
-```
-
-## Coordinate systems
-
-```{r mpg_jitter_noquickmap, cache = TRUE, fig.width=8, fig.height=4.5, message=FALSE}
-ggplot(data = mpg) + 
-  geom_jitter(mapping = aes(x = cty, y = hwy))
-```
-
-## Coordinate systems
-
-```{r mpg_jitter_quickmap, cache = TRUE, fig.width=3.5, fig.height=3.5, message=FALSE}
-ggplot(data = mpg) + 
-  geom_jitter(mapping = aes(x = cty, y = hwy)) +
-  coord_quickmap()
-```
-
-## Coordinate systems
-
-```{r mpg_jitter_log, cache = TRUE, fig.width=8.5, fig.height=3.5, message=FALSE}
-ggplot(data = mpg) + 
-  geom_jitter(mapping = aes(x = cty, y = hwy)) +
-  scale_y_log10() +
-  scale_x_log10()
-```
-
-## Coordinate systems
-```{r diamonds_bar_polar, cache = TRUE, fig.width=5, fig.height=3.5, message=FALSE}
-bar + coord_polar()
-```
-
-## Coordinate systems challenges
-
- Turn a stacked bar chart into a pie chart using `coord_polar()`.
- What does `labs()` do? Read the documentation.
- What does the plot below tell you about the relationship between `city` and highway `mpg`? Why is `coord_fixed()` important? What does `geom_abline()` do?
-
-```{r mpg_point_fixed, eval = F, cache = TRUE, fig.width=4.5, fig.height=3.5, message=FALSE}
-ggplot(data = mpg, mapping = aes(x = cty, y = hwy)) +
-  geom_point() + 
-  geom_abline() +
-  coord_fixed()
-```
-
-## Coordinate systems challenges
-
-```{r diamonds_barplot_pos_fill_polar, cache = TRUE, fig.width=8, fig.height=4.5, message=FALSE}
-ggplot(data = diamonds) + 
-  geom_bar(mapping = aes(x = cut, fill = clarity),
-           position = "fill") +
-  coord_polar()
-```
-
-## Coordinate systems challenges
-
-```{r mpg_point_nofixed_plot, eval = T, cache = TRUE, fig.width=8, fig.height=3.5, message=FALSE}
-ggplot(data = mpg, mapping = aes(x = cty, y = hwy)) +
-  geom_point() +  geom_abline()
-```
-
-## Coordinate systems challenges
-
-```{r mpg_point_fixed_plot, eval = T, cache = TRUE, fig.width=8, fig.height=3.5, message=FALSE}
-ggplot(data = mpg, mapping = aes(x = cty, y = hwy)) +
-  geom_point() +  geom_abline() + coord_fixed()
-```
--- a/session_3/tp.R
+++ b/session_3/tp.R
--- a/session_3/tp.md
+++ b/session_3/tp.md
--- a/session_4/EWang_Tibialis_DEGs_GRCH37-87_GSE86356.csv
+++ b/session_4/EWang_Tibialis_DEGs_GRCH37-87_GSE86356.csv
--- a/session_4/Expression_matrice_pivot_longer_DEGs_GSE86356.csv
+++ b/session_4/Expression_matrice_pivot_longer_DEGs_GSE86356.csv
--- a/session_4/HTML_toto_s4.Rmd
+++ b/session_4/HTML_toto_s4.Rmd
---
-title: "R#4: data transformation"
-author: "Laurent Modolo [laurent.modolo@ens-lyon.fr](mailto:laurent.modolo@ens-lyon.fr), Hélène Polvèche [hpolveche@istem.fr](mailto:hpolveche@istem.fr)"
-date: "Mars 2020"
-output:
-  html_document: default
-  pdf_document: default
---
-<style type="text/css">
-h3 { /* Header 3 */
-  position: relative ;
-  color: #729FCF ;
-  left: 5%;
-}
-h2 { /* Header 2 */
-  color: darkblue ;
-  left: 10%;
-} 
-h1 { /* Header 1 */
-  color: #034b6f ;
-} 
-#pencadre{
-  border:1px; 
-  border-style:solid; 
-  border-color: #034b6f; 
-  background-color: #EEF3F9; 
-  padding: 1em;
-  text-align: center ;
-  border-radius : 5px 4px 3px 2px;
-}
-legend{
-  color: #034b6f ;
-}
-#pquestion {
-  color: darkgreen;
-  font-weight: bold;
-}
-</style>
-
-```{r setup, include=FALSE, cache=TRUE}
-knitr::opts_chunk$set(echo = TRUE)
-```
-
-The goal of this practical is to practices data transformation with `tidyverse`.
-The objectives of this session will be to:
-
- Filter rows with `filter()`
- Arrange rows with `arrange()`
- Select columns with `select()`
- Add new variables with `mutate()`
- Combining multiple operations with the pipe `%>%`
-
-```R
-install.packages("nycflights13")
-```
-
-```{r packageloaded, include=TRUE, message=FALSE}
-library("tidyverse")
-library("nycflights13")
-```
-
- \ 
- 
-# Data set : nycflights13
-
-`nycflights13::flights`contains all 336,776 flights that departed from New York City in 2013. The data comes from the US Bureau of Transportation Statistics, and is documented in `?flights`
-
-
-```{r display_data, include=TRUE}
-flights
-```
-
- **int** stands for integers.
- **dbl** stands for doubles, or real numbers.
- **chr** stands for character vectors, or strings.
- **dttm** stands for date-times (a date + a time).
- **lgl** stands for logical, vectors that contain only TRUE or FALSE.
- **fctr** stands for factors, which R uses to represent categorical variables with fixed possible values.
- **date** stands for dates.
-
- \ 
- 
-# Filter rows with `filter()`
-
-`filter()` allows you to subset observations based on their values. 
-
-```{r filter_month_day, include=TRUE}
-filter(flights, month == 1, day == 1)
-```
-
- \ 
- 
-`dplyr` functions never modify their inputs, so if you want to save the result, you’ll need to use the assignment operator, `<-`
-
-```{r filter_month_day_sav, include=TRUE}
-jan1 <- filter(flights, month == 1, day == 1)
-```
-
- \ 
- 
-R either prints out the results, or saves them to a variable.
-
-```{r filter_month_day_sav_display, include=TRUE}
-(dec25 <- filter(flights, month == 12, day == 25))
-```
-
- \ 
- 
-# Logical operators
-
-Multiple arguments to `filter()` are combined with “and”: every expression must be true in order for a row to be included in the output.
-
-![](./img/transform-logical.png)
-
- \ 
-
-Test the following operations:
-
-```{r filter_logical_operators, include=TRUE}
-filter(flights, month == 11 | month == 12)
-filter(flights, month %in% c(11, 12))
-filter(flights, !(arr_delay > 120 | dep_delay > 120))
-filter(flights, arr_delay <= 120, dep_delay <= 120)
-```
-
- \ 
- 
-# Missing values
-
-One important feature of R that can make comparison tricky are missing values, or `NA`s (“not availables”). 
-
-```{r filter_logical_operators_NA, include=TRUE}
-NA > 5
-10 == NA
-NA + 10
-```
-
-
-```{r filter_logical_operators_test_NA, include=TRUE}
-is.na(NA)
-```
-
- \ 
- 
-# Arrange rows with `arrange()`
-
- \ 
-
-`arrange()` works similarly to `filter()` except that instead of selecting rows, it changes their order.
-
-```{r arrange_ymd, include=TRUE}
-arrange(flights, year, month, day)
-```
-
- \ 
-Use `desc()` to re-order by a column in descending order:
-
-```{r arrange_desc, include=TRUE}
-arrange(flights, desc(dep_delay))
-```
-
-Missing values are always sorted at the end:
-
-```{r arrange_NA, include=TRUE}
-arrange(tibble(x = c(5, 2, NA)), x)
-arrange(tibble(x = c(5, 2, NA)), desc(x))
-```
-
- \ 
-
-# Select columns with `select()`
-
- \ 
- 
-`select()` allows you to rapidly zoom in on a useful subset using operations based on the names of the variables.
-
-```{r select_ymd, , include=TRUE}
-select(flights, year, month, day)
-select(flights, year:day)
-select(flights, -(year:day))
-```
-
- \ 
-
-here are a number of helper functions you can use within `select()`:
-
- `starts_with("abc")`: matches names that begin with “abc”.
- `ends_with("xyz")`: matches names that end with “xyz”.
- `contains("ijk")`: matches names that contain “ijk”.
- `num_range("x", 1:3)`: matches `x1`, `x2` and `x3`.
-
-See `?select` for more details.
-
- \ 
- 
-# Add new variables with `mutate()`
-
- \ 
- 
-It’s often useful to add new columns that are functions of existing columns. That’s the job of `mutate()`.
-
-```{r mutate, include=TRUE}
-flights_sml <- select(flights,  year:day, ends_with("delay"), distance, air_time)
-
-flights_sml
-
-mutate(flights_sml, gain = dep_delay - arr_delay,
-            speed = distance / air_time * 60)
-```
-
- \ 
-
-```{r mutate_reuse, include=TRUE}
-flights_sml <- mutate(flights_sml, gain = dep_delay - arr_delay,
-            speed = distance / air_time * 60)
-
-```
-
- \ 
- 
-### Useful creation functions
-
- Offsets: `lead()` and `lag()` allow you to refer to leading or lagging values. This allows you to compute running differences (e.g. `x - lag(x)`) or find when values change (`x != lag(x)`).
- Cumulative and rolling aggregates: R provides functions for running sums, products, mins and maxes: `cumsum()`, `cumprod()`, `cummin()`, `cummax()`; and dplyr provides `cummean()` for cumulative means. 
- Logical comparisons, `<`, `<=`, `>`, `>=`, `!=`, and `==`
- Ranking: there are a number of ranking functions, but you should start with `min_rank()`. There is also `row_number()`, `dense_rank()`, `percent_rank()`, `cume_dist()`, `ntile()`
-
- \ 
- 
-# Combining multiple operations with the pipe
-
- \ 
- 
-We don't want to create useless intermediate variables so we can use the pipe operator: `%>%`
-( or `ctrl + shift + M`). 
-
-<div id="pquestion"> - Find the 10 most delayed flights using a ranking function. `min_rank()` </div>
-
-```{r pipe_example_a, include=TRUE}
-flights_md <- mutate(flights,
-                     most_delay = min_rank(desc(dep_delay)))
-flights_md <- filter(flights_md, most_delay < 10)
-flights_md <- arrange(flights_md, most_delay)
-```
-
- \ 
- 
-
-```{r pipe_example_b, include=TRUE}
-flights_md2 <- flights %>%
-    mutate(most_delay = min_rank(desc(dep_delay))) %>% 
-    filter(most_delay < 10) %>% 
-    arrange(most_delay)
-
-select(flights_md2, year:day, flight, origin, dest, dep_delay, most_delay)
-```
-
- \ 
-
-Behind the scenes, `x %>% f(y)` turns into `f(x, y)`, and `x %>% f(y) %>% g(z)` turns into `g(f(x, y), z)` and so on. You can use the pipe to rewrite multiple operations in a way that you can read left-to-right, top-to-bottom. 
-
- \ 
-
-Working with the pipe is one of the key criteria for belonging to the `tidyverse`. The only exception is `ggplot2`: it was written before the pipe was discovered. Unfortunately, the next iteration of `ggplot2`, `ggvis`, which does use the pipe, isn’t quite ready for prime time yet.
-
-# Grouped summaries with `summarise()`
-
-`summarise()` collapses a data frame to a single row:
-
-```{r load_data, include=TRUE}
-flights %>% 
-  summarise(delay = mean(dep_delay, na.rm = TRUE))
-```
-
-### The power of `summarise()` with `group_by()`
-
-This changes the unit of analysis from the complete dataset to individual groups. Then, when you use the `dplyr` verbs on a grouped data frame they’ll be automatically applied “by group”.
-
-```{r summarise_group_by, include=TRUE, fig.width=8, fig.height=3.5}
-flights_delay <- flights %>% 
-  group_by(year, month) %>% 
-  summarise(delay = mean(dep_delay, na.rm = TRUE), sd = sd(dep_delay, na.rm = TRUE)) %>% 
-  arrange(month)
-
-flights_delay
-
-ggplot(data = flights_delay, mapping = aes(x = month, y = delay)) +
-  geom_bar(stat="identity", color="black", fill = "#619CFF") +
-  geom_errorbar(mapping = aes( ymin=0, ymax=delay+sd)) + 
-  theme(axis.text.x = element_blank())
-
-```
-
-
-### Missing values
-
-You may have wondered about the na.rm argument we used above. What happens if we don’t set it?
-
-```{r summarise_group_by_NA, include=TRUE}
-flights %>% 
-  group_by(dest) %>% 
-  summarise(
-    dist = mean(distance),
-    delay = mean(arr_delay)
-  )
-```
-
-Aggregation functions obey the usual rule of missing values: if there’s any missing value in the input, the output will be a missing value.
-
-
-# Counts
-
-Whenever you do any aggregation, it’s always a good idea to include either a count (`n()`). That way you can check that you’re not drawing conclusions based on very small amounts of data.
-
-```{r summarise_group_by_count, include = TRUE, warning=F, message=F, fig.width=8, fig.height=3.5}
-summ_delay_filghts <- flights %>% 
-                      group_by(dest) %>% 
-                      summarise(
-                          count = n(),
-                          dist = mean(distance, na.rm = TRUE),
-                          delay = mean(arr_delay, na.rm = TRUE)
-                      )
-summ_delay_filghts
-
-ggplot(data = summ_delay_filghts, mapping = aes(x = dist, y = delay, size = count)) +
-  geom_point() +
-  geom_smooth(method = lm, se = FALSE) +
-  theme(legend.position='none')
-
-```
-
-## Thank you !
-
- \ 
- 
-## For curious or motivated people: Challenge time!
-
- \ 
- 
- \ 
- 
- 
--- a/session_4/challengeTime.Rmd
+++ b/session_4/challengeTime.Rmd
---
-title: "Challenge time!"
-author: "Laurent Modolo [laurent.modolo@ens-lyon.fr](mailto:laurent.modolo@ens-lyon.fr), Hélène Polvèche [hpolveche@istem.fr](mailto:hpolveche@istem.fr)"
-date: "Mars 2020"
-output:
-  html_document: default
-  pdf_document: default
---
-  <style type="text/css">
-  h3 { /* Header 3 */
-      position: relative ;
-    color: #729FCF ;
-      left: 5%;
-  }
-h2 { /* Header 2 */
-    color: darkblue ;
-  left: 10%;
-} 
-h1 { /* Header 1 */
-    color: #034b6f ;
-} 
-#pencadre{
-border:1px; 
-border-style:solid; 
-border-color: #034b6f; 
-  background-color: #EEF3F9; 
-  padding: 1em;
-text-align: center ;
-border-radius : 5px 4px 3px 2px;
-}
-legend{
-  color: #034b6f ;
-}
-#pquestion {
-color: darkgreen;
-font-weight: bold;
-}
-</style>
-  
-  ```{r setup, include=FALSE, cache=TRUE}
-knitr::opts_chunk$set(echo = TRUE)
-```
-
-
-### Filter challenges :
-
-Find all flights that:
-  
-  - Had an arrival delay of two or more hours
- Were operated by United, American, or Delta
- Departed between midnight and 6am (inclusive)
-
-Another useful dplyr filtering helper is `between()`. What does it do? Can you use it to simplify the code needed to answer the previous challenges?
-
-How many flights have a missing `dep_time`? What other variables are missing? What might these rows represent?
-
-Why is `NA ^ 0` not `NA`? Why is `NA | TRUE` not `NA`? Why is `FALSE & NA` not `NA`? Can you figure out the general rule? (`NA * 0` is a tricky counter-example!)
-
-### Arrange challenges :
-
- Sort flights to find the most delayed flights. Find the flights that left earliest.
- Sort flights to find the fastest flights.
- Which flights traveled the longest? Which traveled the shortest?
-
-### Select challenges :
-
- Brainstorm as many ways as possible to select `dep_time`, `dep_delay`, `arr_time`, and `arr_delay` from `flights`.
- What does the `one_of()` function do? Why might it be helpful in conjunction with this vector?
-```{r select_one_of, eval=F, message=F, cache=T}
-vars <- c("year", "month", "day", "dep_delay", "arr_delay")
-```
- Does the result of running the following code surprise you? How do the select helpers deal with case by default? How can you change that default?
-```{r select_contains, eval=F, message=F, cache=T}
-select(flights, contains("TIME"))
-```
-
-
-### Mutate challenges :
-
- Currently `dep_time` and `sched_dep_time` are convenient to look at, but hard to compute with because they’re not really continuous numbers. Convert them to a more convenient representation of number of minutes since midnight.
-
-
-```{r mutate_challenges_a, eval=F, message=F, cache=T}
-mutate(
-  flights,
-  dep_time = (dep_time %/% 100) * 60 +
-    dep_time %% 100,
-  sched_dep_time = (sched_dep_time %/% 100) * 60 +
-    sched_dep_time %% 100
-)
-```
-
-\ 
-
- Compare `dep_time`, `sched_dep_time`, and `dep_delay`. How would you expect those three numbers to be related?
-
-```{r mutate_challenge_b, eval=F, message=F, cache=T}
-mutate(
-  flights,
-  dep_time = (dep_time %/% 100) * 60 + 
-    dep_time %% 100,
-  sched_dep_time = (sched_dep_time %/% 100) * 60 +
-    sched_dep_time %% 100
-)
-```
-
-\ 
-
-### Challenge with `summarise()` and `group_by()`
-
-Imagine that we want to explore the relationship between the distance and average delay for each location. 
-here are three steps to prepare this data: 
-
- Group flights by destination.
- Summarise to compute distance, average delay, and number of flights.
- Filter to remove noisy points and Honolulu airport, which is almost twice as far away as the next closest airport.
-
-```{r summarise_group_by_ggplot_a, eval = F}
-flights %>% 
-  group_by(dest)
-```
-
- \ 
-
-Imagine that we want to explore the relationship between the distance and average delay for each location. 
-
- Filter to remove noisy points and Honolulu airport, which is almost twice as far away as the next closest airport.
-
-```{r summarise_group_by_ggplot_b, eval = F}
-flights %>% 
-  group_by(dest) %>% 
-  summarise(
-    count = n(),
-    dist = mean(distance, na.rm = TRUE),
-    delay = mean(arr_delay, na.rm = TRUE)
-  )
-```
-
-
--- a/session_4/img/colorsR.png
+++ b/session_4/img/colorsR.png
--- a/session_4/img/transform-logical.png
+++ b/session_4/img/transform-logical.png
No results found