session_4.Rmd

title: "R.4: data transformation"
author: "Laurent Modolo [laurent.modolo@ens-lyon.fr](mailto:laurent.modolo@ens-lyon.fr), Hélène Polvèche [hpolveche@istem.fr](mailto:hpolveche@istem.fr)"
date: "2022"
library(fontawesome)

# Session setup ----
# When the {conflicted} package is attached, declare that `filter` means
# dplyr's verb so that the calls below are not reported as ambiguous.
if ("conflicted" %in% .packages()) {
  conflicted::conflicts_prefer(dplyr::filter)
}
# NOTE: the former `rm(list = ls())` was removed on purpose. It silently wiped
# every object of the caller's workspace whenever this script was sourced, and
# nothing below relies on starting from an empty environment.
# knitr chunk defaults: echo the code of each chunk and print its output
# without a comment prefix.
knitr::opts_chunk$set(echo = TRUE, comment = NA)
# Packages and data set ----
# Install {nycflights13} only when it is missing: re-running the script then
# neither reinstalls the package nor needs network access.
if (!requireNamespace("nycflights13", quietly = TRUE)) {
  install.packages("nycflights13")
}
library("tidyverse")
library("nycflights13")
# Open the help page of the `flights` table, print the table, then list its
# column names.
?flights
flights
colnames(flights)
# Open the help for `filter`, the first verb used below.
?filter
# filter(): keep the rows that satisfy a condition ----
# One condition at a time.
flights %>% filter(air_time >= 680)
flights %>% filter(carrier == "HA")
flights %>% filter(origin != "JFK")
# `%in%` tests membership in a set of values.
flights %>% filter(carrier %in% c("OO", "AS"))
flights %>% filter(month %in% c(5, 6, 7, 12))
# Store the long flights; they are reused in the combined conditions below.
long_flights <- flights %>% filter(air_time >= 680)
# Several conditions given as separate arguments.
flights %>% filter(month == 12, day == 25)
# Conditions combined with `&` (and), `|` (or) and `!` (not).
# `!month > 2` is parsed as `!(month > 2)`; the parentheses make it explicit.
long_flights %>% filter(day <= 15 & carrier == "HA")
long_flights %>% filter(day <= 15 | carrier == "HA")
long_flights %>% filter((day <= 15 | carrier == "HA") & !(month > 2))
long_flights

# Same three queries again, shown after printing the table they start from.
long_flights %>% filter(day <= 15 & carrier == "HA")
long_flights %>% filter(day <= 15 | carrier == "HA")
long_flights %>% filter((day <= 15 | carrier == "HA") & !(month > 2))
# Two spellings of "November or December".
flights %>% filter(month == 11 | month == 12)
flights %>% filter(month %in% c(11, 12))
# Three spellings of "neither delay exceeds 120".
flights %>% filter(!(arr_delay > 120 | dep_delay > 120))
flights %>% filter(arr_delay <= 120 & dep_delay <= 120)
flights %>% filter(arr_delay <= 120, dep_delay <= 120)
# Assign the result, then print it.
dec25 <- flights %>% filter(month == 12, day == 25)
dec25
# Missing values (NA) ----
# Comparisons and arithmetic that involve NA.
NA > 5
10 == NA
NA + 10
# is.na() is the dedicated test for a missing value.
is.na(NA)
# Small table with one missing value in `y`.
df <- tibble(x = LETTERS[1:3], y = c(1, NA, 3))
df
# Filtering on `y > 1` alone, then also asking explicitly for the NA row.
df %>% filter(y > 1)
df %>% filter(y > 1 | is.na(y))
# Flights whose departure time is missing.
flights %>% filter(is.na(dep_time))
# Cases where NA is combined with another value:
NA^0 # anything to the power 0 is 1: a convention, not a computation
NA | TRUE # an OR with one TRUE member is TRUE
FALSE & NA # an AND with one FALSE member is FALSE
NA * 0 # here a real computation is needed
# arrange(): reorder the rows ----
# Sort by distance, ties broken by departure delay (ascending, then
# descending with desc()).
flights %>% arrange(distance, dep_delay)
flights %>% arrange(distance, desc(dep_delay))
# Small table with one missing value in `y`, to look at how NA is sorted.
df <- tibble(x = LETTERS[1:3], y = c(1, NA, 3))
df

df %>% arrange(y)
df %>% arrange(desc(y))
# Flights by decreasing arrival delay, then by increasing departure delay.
flights %>% arrange(desc(arr_delay))
flights %>% arrange(dep_delay)
# Sort on is.na(y) in decreasing order, so rows with a missing `y` come first.
df %>% arrange(desc(is.na(y)))
# select(): choose columns ----
# By name, by range (`year:day`), and by excluding a range.
flights %>% select(year, month, day)
flights %>% select(year:day)
flights %>% select(-(year:day))
# Columns can be renamed while they are selected.
flights %>% select(Y = year, M = month, D = day)
# Keep the departure/arrival time and delay columns and check their names.
df_dep_arr <- flights %>%
  select(dep_time, dep_delay, arr_time, arr_delay)
colnames(df_dep_arr)
# Other selections built on the "dep"/"arr" prefixes: explicit names, name
# prefixes (two arguments or combined with `|`), a regular expression, and a
# column range minus the columns starting with "sched".
flights %>% select(dep_time, dep_delay, arr_time, arr_delay)
flights %>% select(starts_with("dep"), starts_with("arr"))
flights %>% select(starts_with("dep") | starts_with("arr"))
flights %>% select(matches("^(dep|arr)"))
flights %>% select(dep_time:arr_delay & !starts_with("sched"))
# select() with column names stored in a character vector ----
vars <- c("year", "month", "day", "dep_delay", "arr_delay")
select(flights, any_of(vars))
select(flights, all_of(vars))
# Add a name that is not a column of `flights`.
vars <- c(vars, "toto")
# any_of() ignores names that are not columns of the table.
select(flights, any_of(vars))
# all_of() is strict and raises an error for the unknown column "toto".
# The error is the point of the example, so it is caught and reported as a
# message instead of aborting the rest of the script.
tryCatch(
  select(flights, all_of(vars)),
  error = function(e) {
    message("Expected error: ", conditionMessage(e))
    invisible(NULL)
  }
)
# Select columns by type with a predicate function.
select(flights, where(is.character))
select(flights, where(is.numeric))
# contains() with an upper-case pattern: default matching, then with
# case-sensitive matching requested explicitly.
select(flights, contains("TIME"))
select(flights, contains("TIME", ignore.case = FALSE))
# Thin working tables used by the mutate() examples ----
# Keep the date, the delay columns, distance, air time and the departure time
# columns; the outer parentheses print the result of the assignment.
(flights_thin <- select(
  flights,
  year:day, ends_with("delay"), distance, air_time, contains("dep_time")
))
# First five rows, and five rows drawn at random (slice_sample() replaces the
# superseded sample_n()).
(flights_thin_toy <- head(flights_thin, n = 5))
(flights_thin_toy2 <- slice_sample(flights_thin, n = 5))
# General form of a mutate() call. This is a template, not runnable code
# (`...` cannot be evaluated at top level), so it is kept as a comment:
# mutate(tbl, new_var_a = operation_a, ..., new_var_n = operation_n)
mutate(flights_thin_toy, gain = dep_delay - arr_delay)
# Add both `gain` and `speed`, and store the result for the next examples.
flights_thin_toy <- mutate(
  flights_thin_toy,
  gain = dep_delay - arr_delay,
  speed = distance / air_time * 60
)
flights_thin_toy
# Worked example on the single value 2003 ----
# Integer division by 100 gives the leading part (HH) and the modulo gives
# the remainder (MM); the outer parentheses print each assigned value.
(HH <- 2003 %/% 100)
(MM <- 2003 %% 100)
# Recombine both parts as HH * 60 + MM.
60 * HH + MM
# Apply the HH * 60 + MM conversion to `dep_time` with mutate() ----
# A column created by one argument (HH, MM) can be used by the next one.
flights_thin_toy %>%
  mutate(
    HH = dep_time %/% 100,
    MM = dep_time %% 100,
    dep_time2 = HH * 60 + MM
  )
# Same computation, with `.after` set to "dep_time".
flights_thin_toy %>%
  mutate(
    HH = dep_time %/% 100,
    MM = dep_time %% 100,
    dep_time2 = HH * 60 + MM,
    .after = "dep_time"
  )
# Same computation, with `.keep` set to "used".
flights_thin_toy %>%
  mutate(
    HH = dep_time %/% 100,
    MM = dep_time %% 100,
    dep_time2 = HH * 60 + MM,
    .keep = "used"
  )
# Single-expression version, without the intermediate HH and MM columns.
flights_thin_toy %>%
  mutate(
    dep_time2 = (dep_time %/% 100) * 60 + dep_time %% 100,
    .after = "dep_time"
  )
# Assigning to an existing name overwrites that column.
# NOTE(review): this formula differs from the HH * 60 + MM conversion used
# around it; presumably it only illustrates overwriting - confirm.
flights_thin_toy %>%
  mutate(dep_time = dep_time * 60 + dep_time)
# Overwrite both departure-time columns of the full table with the converted
# values.
flights %>%
  mutate(
    dep_time = (dep_time %/% 100) * 60 + dep_time %% 100,
    sched_dep_time = (sched_dep_time %/% 100) * 60 + sched_dep_time %% 100
  )
# Colour palettes ----
# Install only the palette packages that are missing, so that re-running the
# script does not reinstall them. local() keeps the helper variables out of
# the workspace.
local({
  palette_pkgs <- c("ghibli", "RColorBrewer", "viridis")
  is_installed <- vapply(
    palette_pkgs, requireNamespace, logical(1),
    quietly = TRUE
  )
  if (!all(is_installed)) {
    install.packages(palette_pkgs[!is_installed])
  }
})
library(tidyverse)

library(RColorBrewer)
library(ghibli)
library(viridis)
# Scatter plot of `mpg` coloured by class with the default discrete palette.
ggplot(data = mpg, mapping = aes(x = displ, y = hwy, color = class)) +
  geom_point()
# Same plot with the "Dark2" palette of RColorBrewer.
ggplot(data = mpg, mapping = aes(x = displ, y = hwy, color = class)) +
  geom_point() +
  scale_color_brewer(palette = "Dark2")
# Same plot with the "MononokeMedium" palette of ghibli.
ggplot(data = mpg, mapping = aes(x = displ, y = hwy, color = class)) +
  geom_point() +
  scale_colour_ghibli_d("MononokeMedium")
# Show the RColorBrewer palettes flagged as colour-blind friendly.
display.brewer.all(colorblindFriendly = TRUE)
# Expression heat map ----
# Read the local copy of the expression table only when it is present in the
# working directory: the unconditional read stopped the script when the file
# had not been downloaded, even though the online copy is read right after.
if (file.exists("Expression_matrice_pivot_longer_DEGs_GSE86356.csv")) {
  expr_DM1 <- read_csv2("Expression_matrice_pivot_longer_DEGs_GSE86356.csv")
  print(expr_DM1)
}
# Read the table from the course web site; the outer parentheses print it.
(expr_DM1 <- read_csv2("https://can.gitbiopages.ens-lyon.fr/R_basis/session_4/Expression_matrice_pivot_longer_DEGs_GSE86356.csv"))
# Base heat map: one tile per (samples, Genes) pair, filled by log1p(counts),
# with small axis labels and the x labels rotated by 90 degrees.
(DM1_tile_base <-
  ggplot(expr_DM1, aes(samples, Genes, fill = log1p(counts))) +
  geom_tile() +
  labs(y = "Genes", x = "Samples") +
  theme(
    axis.text.y = element_text(size = 6),
    axis.text.x = element_text(size = 6, angle = 90)
  ))
# Same heat map with a white-to-green gradient, then with the viridis scale.
DM1_tile_base + scale_fill_gradient2(low = "white", high = "springgreen4")

DM1_tile_base + scale_fill_viridis_c()
# Differential expression table ----
# Read the local copy only when it is present in the working directory: the
# unconditional read stopped the script when the file had not been
# downloaded, even though the online copy is read right after.
if (file.exists("EWang_Tibialis_DEGs_GRCH37-87_GSE86356.csv")) {
  tab <- read_csv2("EWang_Tibialis_DEGs_GRCH37-87_GSE86356.csv")
  print(tab)
}
# Read the table from the course web site and print it.
tab <- read_csv2("https://can.gitbiopages.ens-lyon.fr/R_basis/session_4/EWang_Tibialis_DEGs_GRCH37-87_GSE86356.csv")
tab
# Flag the significant genes and their direction of change ----
# `sig` combines the three thresholds on baseMean, padj and |log2FoldChange|.
# `UpDown` reuses `sig`, created just above in the same mutate() call: "Up" or
# "Down" according to the sign of log2FoldChange when `sig` is TRUE, "NO"
# when it is FALSE.
tab.sig <- tab %>%
  mutate(
    sig = baseMean > 20 & padj < 0.05 & abs(log2FoldChange) >= 1.5,
    UpDown = ifelse(
      sig,
      ifelse(log2FoldChange > 0, "Up", "Down"),
      "NO"
    )
  )
tab.sig
# Genes to label on the volcano plot ----
# Install {ggrepel} only when it is missing, so that re-running the script
# does not reinstall it.
if (!requireNamespace("ggrepel", quietly = TRUE)) {
  install.packages("ggrepel")
}
library(ggrepel)
# First approach: put the significant genes first, ordered by increasing
# padj, number the rows and keep the first ten. Each step is printed.
(top10 <- arrange(tab.sig, desc(sig), padj))
(top10 <- mutate(top10, row_N = row_number()))
(top10 <- filter(top10, row_N <= 10))
# Second approach: keep the significant genes, then the ten smallest padj.
# This overwrites the `top10` built above, so it is this version (without
# the `row_N` column) that the plots use.
(top10 <- filter(tab.sig, sig == TRUE))
(top10 <- slice_min(top10, padj, n = 10))
# Volcano plot ----
# Points are coloured by `UpDown`. The colours are given as a named vector so
# that each category keeps its colour even when one of them is absent from
# the data; unnamed values are assigned by position among the categories
# actually present.
# Horizontal line: padj = 0.05; vertical lines: log2FoldChange = -1.5 and 1.5.
# The genes of `top10` are labelled with their `gene_symbol`.
# NOTE(review): the y aesthetic is -log10(padj) while the axis label reads
# "p-value"; the label is kept unchanged here - confirm the wording.
volcano_plot <-
  ggplot(tab.sig, aes(x = log2FoldChange, y = -log10(padj), color = UpDown)) +
  geom_point() +
  scale_color_manual(
    values = c(Down = "steelblue", NO = "lightgrey", Up = "firebrick")
  ) +
  geom_hline(yintercept = -log10(0.05), col = "black") +
  geom_vline(xintercept = c(-1.5, 1.5), col = "black") +
  theme_minimal() +
  theme(legend.position = "none") +
  labs(y = "-log10(p-value)", x = "log2(FoldChange)") +
  geom_label_repel(data = top10, mapping = aes(label = gene_symbol))
# The original script drew this plot twice; both renderings are kept.
volcano_plot

volcano_plot