diff --git a/session_1/example_1.R b/session_1/example_1.R new file mode 100644 index 0000000000000000000000000000000000000000..c275a973c2b52e9d0bb2c6997fa2edb20e0a1b01 --- /dev/null +++ b/session_1/example_1.R @@ -0,0 +1,118 @@ +# Does news coverage boost support for presidential candidates in the Democratic primary? +# https://www.jacob-long.com/post/news-coverage-candidate-support/ + +library(tidyverse) +library(jtools) +library(tsibble) + +################################ Getting the data ############################## +cable_mentions <- read_csv("https://github.com/fivethirtyeight/data/raw/master/media-mentions-2020/cable_weekly.csv") +online_mentions <- read_csv("https://github.com/fivethirtyeight/data/raw/master/media-mentions-2020/online_weekly.csv") +# Immediately convert `end_date` to date class +polls <- read_csv("https://projects.fivethirtyeight.com/polls-page/president_primary_polls.csv") + + +candidates <- c("Amy Klobuchar", "Andrew Yang", "Bernard Sanders", + "Beto O'Rourke", "Bill de Blasio", "Cory A. Booker", + "Elizabeth Warren", "Eric Swalwell", "Jay Robert Inslee", + "Joe Sestak", "John Hickenlooper", "John K. Delaney", + "Joseph R. Biden Jr.", "Julián Castro", "Kamala D. Harris", + "Kirsten E. Gillibrand", "Marianne Williamson", + "Michael F. Bennet", "Pete Buttigieg", "Seth Moulton", + "Steve Bullock", "Tim Ryan", "Tom Steyer", "Tulsi Gabbard", + "Wayne Messam") + +candidates_clean <- c("Amy Klobuchar", "Andrew Yang", "Bernie Sanders", + "Beto O'Rourke", "Bill de Blasio", "Cory Booker", + "Elizabeth Warren", "Eric Swalwell", "Jay Inslee", + "Joe Sestak", "John Hickenlooper", "John Delaney", + "Joe Biden", "Julian Castro", "Kamala Harris", + "Kirsten Gillibrand", "Marianne Williamson", + "Michael Bennet", "Pete Buttigieg", "Seth Moulton", + "Steve Bullock", "Tim Ryan", "Tom Steyer", + "Tulsi Gabbard", "Wayne Messam") + + +########################### formating data ##################################### + +polls <- polls %>% + # Convert date to date format + mutate(end_date = as.Date(end_date, format = "%m/%d/%y")) %>% + filter( + # include only polls of at least modest quality + fte_grade %in% c("C-", "C", "C+", "B-", "B", "B+", "A-", "A", "A+"), + # only include polls ending on or after 12/30/2018 + end_date >= as.Date("12/30/2018", "%m/%d/%Y"), + # only include *Democratic* primary polls + party == "DEM", + # only include the selected candidates + candidate_name %in% candidates, + # only national polls + is.na(state), + # Exclude some head-to-head results, etc. + notes %nin% c("head-to-head poll", + "HarrisX/SR Democrat LV, definite voter", + "open-ended question") + ) %>% + mutate( + # Have to add 1 to the date to accommodate tsibble's yearweek() + # starting on Monday rather than Sunday like our other data sources + week = as.Date(yearweek(end_date + 1)) - 1, + # Convert candidate names to factor so I can relabel them + candidate_name = factor(candidate_name, levels = candidates, labels = candidates_clean) + ) + +polls_agg <- polls %>% + group_by(week, candidate_name) %>% + summarize( + pct_polls = weighted.mean(pct, log(sample_size)) + ) + +library(ggplot2) +top_candidates <- c("Joe Biden", "Elizabeth Warren", "Bernie Sanders", + "Pete Buttigieg", "Kamala Harris", "Beto O'Rourke", + "Cory Booker") +ggplot(filter(polls_agg, candidate_name %in% top_candidates), + aes(x = week, y = pct_polls, color = candidate_name)) + + geom_line() + + theme_nice() + + +media <- + inner_join(cable_mentions, online_mentions, by = c("date", "name")) %>% + mutate( + # Create new variables that put the media coverage variables on + # same scale as poll numbers + pct_cable = 100 * pct_of_all_candidate_clips, + pct_online = 100 * pct_of_all_candidate_stories + ) + +top_candidates <- c("Joe Biden", "Elizabeth Warren", "Bernie Sanders", + "Pete Buttigieg", "Kamala Harris", "Beto O'Rourke", + "Cory Booker") +ggplot(filter(media, name %in% top_candidates), + aes(x = date, y = pct_cable, color = name)) + + geom_line() + + theme_nice() + +top_candidates <- c("Joe Biden", "Elizabeth Warren", "Bernie Sanders", + "Pete Buttigieg", "Kamala Harris", "Beto O'Rourke", + "Cory Booker") +ggplot(filter(media, name %in% top_candidates), + aes(x = date, y = pct_online, color = name)) + + geom_line() + + theme_nice() + + +######################### Combine the data ##################################### +joined <- inner_join(polls_agg, media, + by = c("candidate_name" = "name", "week" = "date")) + +library(panelr) + +# panel_data needs a number or ordered factor as wave variable +joined$wave <- as.ordered(joined$week) +joined_panel <- panel_data(ungroup(joined), id = candidate_name, wave = wave) +joined_pdata <- as_pdata.frame(joined_panel) + +