From 878f2d8fbb961e4c733d3ac1372fce62ea1ffb89 Mon Sep 17 00:00:00 2001 From: Laurent Modolo <laurent.modolo@ens-lyon.fr> Date: Tue, 23 Nov 2021 10:11:10 +0100 Subject: [PATCH] update session 7 and 8 --- session_1/session_1.Rmd | 2 +- session_2/session_2.Rmd | 2 +- session_3/session_3.Rmd | 2 +- session_4/session_4.Rmd | 2 +- session_5/session_5.Rmd | 4 +- session_6/session_6.Rmd | 2 +- session_7/{slides.Rmd => session_7.Rmd} | 201 +++++++++++++----------- session_8/{slides.Rmd => session_8.Rmd} | 87 +++++----- 8 files changed, 160 insertions(+), 142 deletions(-) rename session_7/{slides.Rmd => session_7.Rmd} (59%) rename session_8/{slides.Rmd => session_8.Rmd} (76%) diff --git a/session_1/session_1.Rmd b/session_1/session_1.Rmd index 83fb4f4..ab478db 100644 --- a/session_1/session_1.Rmd +++ b/session_1/session_1.Rmd @@ -6,7 +6,7 @@ output: rmdformats::downcute: self_contain: true use_bookdown: true - default_style: "dark" + default_style: "light" lightbox: true css: "http://perso.ens-lyon.fr/laurent.modolo/R/src/style.css" --- diff --git a/session_2/session_2.Rmd b/session_2/session_2.Rmd index 252e010..0a147ba 100644 --- a/session_2/session_2.Rmd +++ b/session_2/session_2.Rmd @@ -6,7 +6,7 @@ output: rmdformats::downcute: self_contain: true use_bookdown: true - default_style: "dark" + default_style: "light" lightbox: true css: "http://perso.ens-lyon.fr/laurent.modolo/R/src/style.css" --- diff --git a/session_3/session_3.Rmd b/session_3/session_3.Rmd index 3d8ff21..180bb69 100644 --- a/session_3/session_3.Rmd +++ b/session_3/session_3.Rmd @@ -6,7 +6,7 @@ output: rmdformats::downcute: self_contain: true use_bookdown: true - default_style: "dark" + default_style: "light" lightbox: true css: "http://perso.ens-lyon.fr/laurent.modolo/R/src/style.css" --- diff --git a/session_4/session_4.Rmd b/session_4/session_4.Rmd index 471665e..88e4e55 100644 --- a/session_4/session_4.Rmd +++ b/session_4/session_4.Rmd @@ -6,7 +6,7 @@ output: rmdformats::downcute: self_contain: true 
use_bookdown: true - default_style: "dark" + default_style: "light" lightbox: true css: "http://perso.ens-lyon.fr/laurent.modolo/R/src/style.css" --- diff --git a/session_5/session_5.Rmd b/session_5/session_5.Rmd index a5ddc1c..9f8d63c 100644 --- a/session_5/session_5.Rmd +++ b/session_5/session_5.Rmd @@ -6,7 +6,7 @@ output: rmdformats::downcute: self_contain: true use_bookdown: true - default_style: "dark" + default_style: "light" lightbox: true css: "http://perso.ens-lyon.fr/laurent.modolo/R/src/style.css" --- @@ -276,7 +276,7 @@ flights %>% mutate(wday = strftime(time_hour,'%A')) %>% group_by(wday) %>% mutate( - prop_cancel_day = sum(canceled)/sum(!canceled), + prop_cancel_day = sum(canceled)/n(), av_delay = mean(dep_delay, na.rm = TRUE) ) %>% ungroup() %>% diff --git a/session_6/session_6.Rmd b/session_6/session_6.Rmd index a0c6f32..52bcaad 100644 --- a/session_6/session_6.Rmd +++ b/session_6/session_6.Rmd @@ -6,7 +6,7 @@ output: rmdformats::downcute: self_contain: true use_bookdown: true - default_style: "dark" + default_style: "light" lightbox: true css: "http://perso.ens-lyon.fr/laurent.modolo/R/src/style.css" --- diff --git a/session_7/slides.Rmd b/session_7/session_7.Rmd similarity index 59% rename from session_7/slides.Rmd rename to session_7/session_7.Rmd index 27d61a2..3b2774f 100644 --- a/session_7/slides.Rmd +++ b/session_7/session_7.Rmd @@ -1,28 +1,55 @@ --- -title: '#7 String & RegExp' +title: "R.7: String & RegExp" author: "Laurent Modolo [laurent.modolo@ens-lyon.fr](mailto:laurent.modolo@ens-lyon.fr)" -date: "08 Nov 2019" -always_allow_html: yes +date: "2021" output: - beamer_presentation: - theme: metropolis - slide_level: 3 - fig_caption: no - df_print: tibble - highlight: tango - latex_engine: xelatex - slidy_presentation: - highlight: tango + rmdformats::downcute: + self_contain: true + use_bookdown: true + default_style: "light" + lightbox: true + css: "http://perso.ens-lyon.fr/laurent.modolo/R/src/style.css" --- -```{r setup, include=FALSE, 
cache=TRUE} -knitr::opts_chunk$set(echo = FALSE) -library(tidyverse) + +```{r setup, include=FALSE} +rm(list=ls()) +knitr::opts_chunk$set(echo = TRUE) +knitr::opts_chunk$set(comment = NA) ``` +```{r klippy, echo=FALSE, include=TRUE} +klippy::klippy( + position = c('top', 'right'), + color = "white", + tooltip_message = 'Click to copy', + tooltip_success = 'Copied !') +``` + +# Introduction +In the previous sessions, we have often overlooked a particular type of data, the **string**. +In R a sequence of characters is stored as a string. -## String basics +In this session you will learn the distinctive features of the string type and how to use strings of characters within a programming language that is itself composed of particular strings of characters, such as function names and variables. +<div class="pencadre"> +As usual we will need the `tidyverse` library. +</div> + +<details><summary>Solution</summary> +<p> +```{r load_data, eval=T, message=F} +library(tidyverse) ``` +</p> +</details> + +# String basics + +## String definition + +A string can be defined within double `"` or single `'` quotes + +```{r string_def, eval=F, message=T} string1 <- "This is a string" string2 <- 'If I want to include a "quote" inside a string, I use single quotes' @@ -37,38 +64,35 @@ If you forget to close a quote, you’ll see +, the continuation character: + HELP I'M STUCK ``` -If this happen to you, press Escape and try again! +If this happens to you, press `Escape` and try again! -## String basics +To include a literal single or double quote in a string you can use \\ to *escape* it: -To include a literal single or double quote in a string you can use \ to “escape” it: - -``` +```{r string_def_escape, eval=F, message=T} double_quote <- "\"" # or '"' single_quote <- '\'' # or "'" ``` -if you want to include a literal backslash, you’ll need to double it up: `"\\"`. +If you want to include a literal backslash, you’ll need to double it up: `"\\"`. 
-## String basics + +## String representation -the printed representation of a string is not the same as string itself +The printed representation of a string is not the same as the string itself -``` +```{r string_rep_escape_a, eval=T, message=T} x <- c("\"", "\\") x -#> [1] "\"" "\\" +``` +```{r string_rep_escape_b, eval=T, message=T} writeLines(x) -#> " -#> \ ``` -## String basics +Some characters have a special representation; they are called **special characters**. +The most common are `"\n"`, newline, and `"\t"`, tabulation, but you can see the complete list by requesting help on `"`: `?'"'` -Special characters: +## String operation -The most common are `"\n"`, newline, and `"\t"`, tab, but you can see the complete list by requesting help on `"`: `?'"'` - -## String basics +You can perform basic operations on strings like - String length @@ -87,9 +111,8 @@ x <- c("Apple", "Banana", "Pear") str_sub(x, 1, 3) ``` -## String basics - Subsetting strings -negative numbers count backwards from end +negative numbers count backwards from the end ```{r str_sub2, eval=T, message=FALSE, cache=T} str_sub(x, -3, -1) ``` @@ -104,13 +127,25 @@ str_to_lower(x) str_sort(x) ``` -## Matching patterns with regular expressions +# Matching patterns with regular expressions -Regexps are a very terse language that allow you to describe patterns in strings. +Regexps are a very terse language that allows you to describe patterns in strings. To learn regular expressions, we’ll use `str_view()` and `str_view_all()`. These functions take a character vector and a regular expression, and show you how they match. -## Matching patterns with regular expressions +<div class="pencadre"> +You need to install the `htmlwidgets` package to use these functions. +</div> + +<details><summary>Solution</summary> +<p> +```{r load_htmlwidgets, eval=T, message=F} +library(htmlwidgets) +``` +</p> +</details> + +The most basic regular expression is the exact match. 
```{r str_view, eval=T, message=FALSE, cache=T} x <- c("apple", "banana", "pear") @@ -124,12 +159,14 @@ x <- c("apple", "banana", "pear") str_view(x, ".a.") ``` +But if “`.`” matches any character, how do you match the character “`.`”? +You need to use an “escape” to tell the regular expression you want to match it exactly, not use its special behaviour. -## Matching patterns with regular expressions +Like strings, regexps use the backslash, `\`, to escape special behaviour. +So to match a `.`, you need the regexp `\.`. Unfortunately this creates a problem. -But if “`.`” matches any character, how do you match the character “`.`”? You need to use an “escape” to tell the regular expression you want to match it exactly, not use its special behaviour. Like strings, regexps use the backslash, `\`, to escape special behaviour. So to match an ., you need the regexp `\.`. Unfortunately this creates a problem. We use strings to represent regular expressions, and `\` is also used as an escape symbol in strings. So to create the regular expression `\.` we need the string "`\\.`". - -## Matching patterns with regular expressions +We use strings to represent regular expressions, and `\` is also used as an escape symbol in strings. +So to create the regular expression `\.` we need the string "`\\.`". ```{r str_viewdotescape, eval=T, message=FALSE, cache=T} dot <- "\\." @@ -137,12 +174,7 @@ writeLines(dot) str_view(c("abc", "a.c", "bef"), "a\\.c") ``` -## Matching patterns with regular expressions - -If `\` is used as an escape character in regular expressions, how do you match a literal `\`? Well you need to escape it, creating the regular expression `\\`. To create that regular expression, you need to use a string, which also needs to escape `\`. That means to match a literal `\` you need to write "`\\\\`" — you need four backslashes to match one! 
- - -## Matching patterns with regular expressions +If `\` is used as an escape character in regular expressions, how do you match a literal `\`? Well, you need to escape it, creating the regular expression `\\`. To create that regular expression, you need to use a string, which also needs to escape `\`. That means to match a literal `\` you need to write "`\\\\`" — you need four backslashes to match one! ```{r str_viewbackslashescape, eval=T, message=FALSE, cache=T} x <- "a\\b" @@ -152,34 +184,26 @@ str_view(x, "\\\\") ## Exercises -- Explain why each of these strings don’t match a \: "`\`", "`\\`", "`\\\`". +- Explain why each of these strings doesn’t match a \: "`\`", "`\\`", "`\\\`". - How would you match the sequence `"'\`? - What patterns will the regular expression `\..\..\..` match? How would you represent it as a string? ## Anchors -- `^` match the start of the string. -- `$` match the end of the string. +Until now we searched for patterns anywhere in the target string. But we can use anchors to be more precise. + +- `^` Match the start of the string. +- `$` Match the end of the string. ```{r str_viewanchors, eval=T, cache=T} x <- c("apple", "banana", "pear") str_view(x, "^a") ``` -## Anchors - -- `^` match the start of the string. -- `$` match the end of the string. - ```{r str_viewanchorsend, eval=T, cache=T} str_view(x, "a$") ``` -## Anchors - -- `^` match the start of the string. -- `$` match the end of the string. - ```{r str_viewanchorsstartend, eval=T, cache=T} x <- c("apple pie", "apple", "apple cake") str_view(x, "^apple$") @@ -187,36 +211,33 @@ str_view(x, "^apple$") ## Exercices - - - How would you match the literal string `"$^$"`? - - Given the corpus of common words in stringr::words, create regular expressions that find all words that: -Start with “y”. - End with “x” - Are exactly three letters long. (Don’t cheat by using `str_length()`!) - Have seven letters or more. 
-Since this list is long, you might want to use the match argument to str_view() to show only the matching or non-matching words. +Since this list is long, you might want to use the match argument to `str_view()` to show only the matching or non-matching words. ## Character classes and alternatives +In regular expressions we have special characters and patterns that match groups of characters. + - `\d`: matches any digit. - `\s`: matches any whitespace (e.g. space, tab, newline). - `[abc]`: matches a, b, or c. - `[^abc]`: matches anything except a, b, or c. -``` +```{r str_viewanchorsstartend_b, eval=T, cache=T} str_view(c("abc", "a.c", "a*c", "a c"), "a[.]c") str_view(c("abc", "a.c", "a*c", "a c"), ".[*]c") str_view(c("abc", "a.c", "a*c", "a c"), "a[ ]") ``` -## Character classes and alternatives - -You can use alternation to pick between one or more alternative patterns. For example, abc|d..f will match either ‘“abc”’, or "deaf". Note that the precedence for | is low, so that abc|xyz matches abc or xyz not abcyz or abxyz. Like with mathematical expressions, if precedence ever gets confusing, use parentheses to make it clear what you want: +You can use alternations to pick between one or more alternative patterns. For example, `abc|d..f` will match either `abc`, or `deaf`. Note that the precedence for `|` is low, so that `abc|xyz` matches `abc` or `xyz` not `abcyz` or `abxyz`. Like with mathematical expressions, if precedence ever gets confusing, use parentheses to make it clear what you want: -``` +```{r str_viewanchorsstartend_c, eval=T, cache=T} str_view(c("grey", "gray"), "gr(e|a)y") ``` @@ -225,25 +246,25 @@ str_view(c("grey", "gray"), "gr(e|a)y") Create regular expressions to find all words that: - Start with a vowel. -- That only contain consonants. (Hint: thinking about matching “not”-vowels.) +- Only contain consonants. (Hint: think about matching “not”-vowels.) - End with ed, but not with eed. - End with ing or ise. 
## Repetition +Now that you know how to search for groups of characters you can define the number of times you want to see them. + - `?`: 0 or 1 - `+`: 1 or more - `*`: 0 or more -``` +```{r str_view_repetition, eval=T, cache=T} x <- "1888 is the longest year in Roman numerals: MDCCCLXXXVIII" str_view(x, "CC?") str_view(x, "CC+") str_view(x, 'C[LX]+') ``` -## Repetition - You can also specify the number of matches precisely: - `{n}`: exactly n @@ -251,7 +272,7 @@ You can also specify the number of matches precisely: - `{,m}`: at most m - `{n,m}`: between n and m -``` +```{r str_view_repetition_b, eval=T, cache=T} str_view(x, "C{2}") str_view(x, "C{2,}") str_view(x, "C{2,3}") @@ -272,16 +293,14 @@ str_view(x, "C{2,3}") ## Grouping -You learned about parentheses as a way to disambiguate complex expressions. Parentheses also create a numbered capturing group (number 1, 2 etc.). A capturing group stores the part of the string matched by the part of the regular expression inside the parentheses. You can refer to the same text as previously matched by a capturing group with backreferences, like `\1`, `\2` etc. +You learned about parentheses as a way to disambiguate complex expressions. Parentheses also create a numbered capturing group (number 1, 2 etc.). A capturing group stores the part of the string matched by the part of the regular expression inside the parentheses. You can refer to the same text as previously matched by a capturing group with back references, like `\1`, `\2` etc. -``` +```{r str_view_grouping, eval=T, cache=T} str_view(fruit, "(..)\\1", match = TRUE) ``` ## Exercices - - - Describe, in words, what these expressions will match: - `"(.)\1\1"` - `"(.)(.)\\2\\1"` @@ -295,32 +314,34 @@ str_view(fruit, "(..)\\1", match = TRUE) ## Detect matches -``` +```{r str_view_match, eval=T, cache=T} x <- c("apple", "banana", "pear") str_detect(x, "e") ``` How many common words start with t? 
-``` +```{r str_view_match_b, eval=T, cache=T} sum(str_detect(words, "^t")) ``` -What proportion of common words end with a vowel? +What proportion of common words ends with a vowel? -``` +```{r str_view_match_c, eval=T, cache=T} mean(str_detect(words, "[aeiou]$")) ``` ## Combining detection Find all words containing at least one vowel, and negate -``` + +```{r str_view_detection, eval=T, cache=T} no_vowels_1 <- !str_detect(words, "[aeiou]") ``` Find all words consisting only of consonants (non-vowels) -``` + +```{r str_view_detection_b, eval=T, cache=T} no_vowels_2 <- str_detect(words, "^[^aeiou]+$") identical(no_vowels_1, no_vowels_2) ``` @@ -373,8 +394,6 @@ has_noun %>% str_extract(noun) ``` -## Grouped matches - `str_extract()` gives us the complete match; `str_match()` gives each individual component. ```{r noun_regex_match, eval=T, cache=T} @@ -384,11 +403,11 @@ has_noun %>% ## Exercises -- Find all words that come after a “number†like “oneâ€, “twoâ€, “three†etc. Pull out both the number and the word. +- Find all words that come after a `number` like `one`, `two`, `three` etc. Pull out both the number and the word. ## Replacing matches -Instead of replacing with a fixed string you can use backreferences to insert components of the match. In the following code, I flip the order of the second and third words. +Instead of replacing with a fixed string, you can use back references to insert components of the match. In the following code, I flip the order of the second and third words. 
```{r replacing_matches, eval=T, cache=T} sentences %>% @@ -408,4 +427,6 @@ sentences %>% sentences %>% head(5) %>% str_split("\\s") -``` \ No newline at end of file +``` + +## See you in [R.8: Factors](http://perso.ens-lyon.fr/laurent.modolo/R/session_8/) \ No newline at end of file diff --git a/session_8/slides.Rmd b/session_8/session_8.Rmd similarity index 76% rename from session_8/slides.Rmd rename to session_8/session_8.Rmd index add9984..ac76a79 100644 --- a/session_8/slides.Rmd +++ b/session_8/session_8.Rmd @@ -1,25 +1,48 @@ --- -title: '#8 Factors' +title: "R.8: Factors" author: "Laurent Modolo [laurent.modolo@ens-lyon.fr](mailto:laurent.modolo@ens-lyon.fr)" -date: "31 Jan 2020" -always_allow_html: yes +date: "2021" output: - slidy_presentation: - highlight: tango - beamer_presentation: - theme: metropolis - slide_level: 3 - fig_caption: no - df_print: tibble - highlight: tango - latex_engine: xelatex + rmdformats::downcute: + self_contain: true + use_bookdown: true + default_style: "light" + lightbox: true + css: "http://perso.ens-lyon.fr/laurent.modolo/R/src/style.css" --- -```{r setup, include=FALSE, cache=TRUE} + +```{r setup, include=FALSE} +rm(list=ls()) knitr::opts_chunk$set(echo = TRUE) +knitr::opts_chunk$set(comment = NA) +``` +```{r klippy, echo=FALSE, include=TRUE} +klippy::klippy( + position = c('top', 'right'), + color = "white", + tooltip_message = 'Click to copy', + tooltip_success = 'Copied !') +``` + +# Introduction + +In this session, you will learn more about the factor type in R. +Factors can be very useful, but you have to be mindful of the implicit conversions from simple vectors to factors! +They are the source of a lot of pain for R programmers. + +<div class="pencadre"> +As usual we will need the `tidyverse` library. 
+</div> + +<details><summary>Solution</summary> +<p> +```{r load_data, eval=T, message=F} library(tidyverse) ``` +</p> +</details> -## Creating factors +# Creating factors Imagine that you have a variable that records month: @@ -41,8 +64,6 @@ x2 <- c("Dec", "Apr", "Jam", "Mar") sort(x1) ``` -## Creating factors - You can fix both of these problems with a factor. ```{r sort_month_factor, eval=T, cache=T} @@ -55,8 +76,6 @@ y1 sort(y1) ``` -## Creating factors - And any values not in the set will be converted to NA: ```{r sort_month_factor2, eval=T, cache=T} @@ -72,16 +91,14 @@ f2 levels(f2) ``` -## General Social Survey +# General Social Survey ```{r race_count, eval=T, cache=T} gss_cat %>% count(race) ``` -## General Social Survey - -By default, ggplot2 will drop levels that don’t have any values. You can force them to display with: +By default, `ggplot2` will drop levels that don’t have any values. You can force them to display with: ```{r race_plot, cache = TRUE, fig.width=8, fig.height=4.5, message=FALSE} ggplot(gss_cat, aes(race)) + @@ -89,7 +106,7 @@ ggplot(gss_cat, aes(race)) + scale_x_discrete(drop = FALSE) ``` -## Modifying factor order +# Modifying factor order It’s often useful to change the order of the factor levels in a visualisation. @@ -104,27 +121,17 @@ relig_summary <- gss_cat %>% ggplot(relig_summary, aes(tvhours, relig)) + geom_point() ``` -**8_a** - -## Modifying factor order - It is difficult to interpret this plot because there’s no overall pattern. We can improve it by reordering the levels of relig using `fct_reorder()`. `fct_reorder()` takes three arguments: - `f`, the factor whose levels you want to modify. - `x`, a numeric vector that you want to use to reorder the levels. - Optionally, `fun`, a function that’s used if there are multiple values of `x` for each value of `f`. The default value is `median`. 
-## Modifying factor order - ```{r tv_hour_order, cache = TRUE, fig.width=8, fig.height=4.5, message=FALSE} ggplot(relig_summary, aes(tvhours, fct_reorder(relig, tvhours))) + geom_point() ``` -**8_b** - -## Modifying factor order - As you start making more complicated transformations, I’d recommend moving them out of `aes()` and into a separate `mutate()` step. For example, you could rewrite the plot above as: ```{r tv_hour_order_mutate, cache = TRUE, fig.width=8, fig.height=4.5, message=FALSE} @@ -133,9 +140,8 @@ relig_summary %>% ggplot(aes(tvhours, relig)) + geom_point() ``` -**8_c** -## `fct_reorder2()` +# `fct_reorder2()` Another type of reordering is useful when you are colouring the lines on a plot. `fct_reorder2()` reorders the factor by the `y` values associated with the largest `x` values. This makes the plot easier to read because the line colours line up with the legend. @@ -146,23 +152,14 @@ by_age <- gss_cat %>% group_by(age) %>% mutate(prop = n / sum(n)) ``` -**8_d** - -## `fct_reorder2()` ```{r fct_reorder2a, cache = TRUE, fig.width=8, fig.height=4.5, message=FALSE} ggplot(by_age, aes(age, prop, colour = marital)) + geom_line(na.rm = TRUE) ``` -**8_e** - -## `fct_reorder2()` - ```{r fct_reorder2b, cache = TRUE, fig.width=8, fig.height=4.5, message=FALSE} ggplot(by_age, aes(age, prop, colour = fct_reorder2(marital, age, prop))) + geom_line() + labs(colour = "marital") -``` - -**8_f** \ No newline at end of file +``` \ No newline at end of file -- GitLab