
Beyoncé and Taylor Swift Lyrics
The data this week comes from Rosie Baillie and Dr. Sara Stoudt.
Beyoncé’s top 100 - Billboard. Taylor Swift’s top 100 - Billboard.
Rosie put together a wonderful analysis of Taylor Swift lyrics! Can you do a similar analysis of Beyoncé’s lyrics?
For text analysis guides, see tidytext or Supervised Machine Learning for Text Analysis in R.
Get the data here
# Get the Data
# Option 1: read in with the tidytuesdayR package
# Install from CRAN via: install.packages("tidytuesdayR")
# tt_load() loads the readme and all the datasets for the week of interest.
# Either an ISO-8601 date or year/week works -- the two calls below are
# alternatives that assign to the same object, so running one is enough.
tuesdata <- tidytuesdayR::tt_load('2020-09-29')
tuesdata <- tidytuesdayR::tt_load(2020, week = 40)
# Pull a single dataset out of the loaded object by name.
beyonce_lyrics <- tuesdata$beyonce_lyrics
# Option 2: read in the data manually, one CSV per dataset
beyonce_lyrics <- readr::read_csv('https://raw.githubusercontent.com/rfordatascience/tidytuesday/main/data/2020/2020-09-29/beyonce_lyrics.csv')
taylor_swift_lyrics <- readr::read_csv('https://raw.githubusercontent.com/rfordatascience/tidytuesday/main/data/2020/2020-09-29/taylor_swift_lyrics.csv')
sales <- readr::read_csv('https://raw.githubusercontent.com/rfordatascience/tidytuesday/main/data/2020/2020-09-29/sales.csv')
charts <- readr::read_csv('https://raw.githubusercontent.com/rfordatascience/tidytuesday/main/data/2020/2020-09-29/charts.csv')
Data Dictionary
beyonce_lyrics.csv
| variable | class | description |
|---|---|---|
| line | character | Lyric line |
| song_id | double | Song ID |
| song_name | character | Song Name |
| artist_id | double | Artist ID |
| artist_name | character | Artist Name |
| song_line | double | Song line number |
taylor_swift_lyrics.csv
| variable | class | description |
|---|---|---|
| Artist | character | Artist |
| Album | character | Album name |
| Title | character | Title of song |
| Lyrics | character | Lyrics |
sales.csv
| variable | class | description |
|---|---|---|
| artist | character | Artist name |
| title | character | Album title |
| country | character | Country for sales |
| sales | double | Sales in dollars |
| released | character | Release date |
| re_release | character | Re-release date |
| label | character | Label released under |
| formats | character | Formats released as |
charts.csv
| variable | class | description |
|---|---|---|
| artist | character | Artist name |
| title | character | Album title |
| released | character | Release date |
| re_release | character | Re-release date |
| label | character | Label released under |
| formats | character | Formats released as |
| chart | character | Country Chart |
| chart_position | character | Highest Chart position |
Cleaning Script
library(tidyverse)
library(rvest)
# Taylor Swift -------------------------------------------------------------
# Fetch and parse the Taylor Swift discography page from Wikipedia.
ts_url <- "https://en.wikipedia.org/wiki/Taylor_Swift_discography"
raw_ts_html <- read_html(ts_url)
# Tidy one table of the Taylor Swift discography page into one row per title.
# The `album_details` cell holds several "Type: value" entries separated by
# newlines; each entry type becomes its own column. `sales` is only split on
# newlines here and stays a list-column of strings for later unnesting.
ts_raw <- raw_ts_html %>%
  # Positional selector: it picks the table by its child index in the article
  # body, so it depends on the page layout at scrape time.
  html_node("#mw-content-text > div.mw-parser-output > table:nth-child(10)") %>%
  html_table(fill = TRUE) %>%
  # NOTE(review): data.frame() presumably makes the repeated "Peak chart
  # positions" headers unique (suffixes .1, .2, ...) -- confirm.
  data.frame() %>%
  janitor::clean_names() %>%
  as_tibble() %>%
  # Drop the first and last scraped rows (presumably a repeated header row
  # and the table footer -- confirm against the page).
  slice(-1, -nrow(.)) %>%
  mutate(
    album_details = str_split(album_details, "\n"),
    sales = str_split(sales, "\n")
  ) %>%
  select(-certifications) %>%
  unnest_longer(album_details) %>%
  separate(album_details, into = c("album_detail_type", "album_details"), sep = ": ") %>%
  # Treat "Re-edition" entries as "Re-release" so they share one column.
  mutate(album_detail_type = if_else(album_detail_type == "Re-edition", "Re-release", album_detail_type)) %>%
  pivot_wider(names_from = album_detail_type, values_from = album_details) %>%
  # Remove the placeholder column created for entries without a usable detail
  # type. any_of() accepts either spelling of the column name and, unlike a
  # bare column reference, does not abort the pipeline when it is absent.
  select(-any_of(c("na", "NA"))) %>%
  janitor::clean_names()
# Sales table for Taylor Swift: one row per title and country. Each element
# of the `sales` list-column is a "Country: amount" string; the amount is
# trimmed and parsed as a number.
ts_sales <- ts_raw %>%
  unnest_longer(sales) %>%
  separate(sales, into = c("country", "sales"), sep = ": ") %>%
  mutate(
    artist = "Taylor Swift",
    sales = parse_number(str_trim(sales))
  ) %>%
  # Put the artist first, then the same columns in the same order.
  select(artist, title, country, sales, released:formats)
# Chart table for Taylor Swift: one row per title and chart. The peak-chart
# columns are stacked and their name suffix is translated to a chart code.
ts_chart <- ts_raw %>%
  select(title, released:formats, contains("peak_chart")) %>%
  pivot_longer(
    cols = contains("peak_chart"),
    names_to = "chart",
    values_to = "chart_position"
  ) %>%
  mutate(
    chart = str_remove(chart, "peak_chart_positions"),
    # Positional lookup: the first column has an empty suffix, which cannot be
    # used as a name, so suffixes are matched by position. Any unknown suffix
    # yields NA.
    chart = c("US", "AUS", "CAN", "FRA", "GER", "IRE", "JPN", "NZ", "SWE", "UK")[
      match(chart, c("", "_1", "_2", "_3", "_4", "_5", "_6", "_7", "_8", "_9"))
    ]
  ) %>%
  mutate(artist = "Taylor Swift", .before = title)
# Beyonce -----------------------------------------------------------------
# Fetch and parse the Beyoncé discography page from Wikipedia.
bey_url <- "https://en.wikipedia.org/wiki/Beyonc%C3%A9_discography"
raw_bey_html <- read_html(bey_url)
# Tidy one table of the Beyoncé discography page into one row per title.
# The newline-separated "Type: value" entries in `album_details` are spread
# into one column per entry type; `sales` stays a list-column of strings.
bey_raw <- raw_bey_html %>%
  # Positional selector: depends on the page layout at scrape time.
  html_node("#mw-content-text > div.mw-parser-output > table:nth-child(14)") %>%
  # Kept for reference -- selector of a link nested inside that same table:
  #mw-content-text > div.mw-parser-output > table:nth-child(14) > tbody > tr:nth-child(3) > th > i > a
  html_table(fill = TRUE) %>%
  data.frame() %>%
  janitor::clean_names() %>%
  as_tibble() %>%
  # Drop the first and last scraped rows.
  slice(-c(1, nrow(.))) %>%
  select(!certifications) %>%
  # Split both multi-line cells into list-columns of single entries.
  mutate(across(c(album_details, sales), ~ str_split(.x, "\n"))) %>%
  unnest_longer(album_details) %>%
  separate(album_details, c("album_detail_type", "album_details"), sep = ": ") %>%
  # Treat "Re-edition" entries as "Re-release" so they share one column.
  mutate(
    album_detail_type = replace(
      album_detail_type,
      album_detail_type %in% "Re-edition",
      "Re-release"
    )
  ) %>%
  pivot_wider(names_from = album_detail_type, values_from = album_details) %>%
  janitor::clean_names()
# Sales table for Beyoncé: one row per title and country. Each element of
# the `sales` list-column is a "Country: amount" string; the amount is
# trimmed and parsed as a number.
bey_sales <- bey_raw %>%
  unnest_longer(sales) %>%
  separate(sales, into = c("country", "sales"), sep = ": ") %>%
  mutate(
    artist = "Beyoncé",
    sales = parse_number(str_trim(sales))
  ) %>%
  # Artist first; `format` is renamed to `formats` during selection.
  select(artist, title, country, sales, released:label, formats = format)
# Chart table for Beyoncé: one row per title and chart. The peak-chart
# columns are stacked and their name suffix is translated to a chart code.
bey_chart <- bey_raw %>%
  select(title, released:label, formats = format, contains("peak_chart")) %>%
  pivot_longer(
    cols = contains("peak_chart"),
    names_to = "chart",
    values_to = "chart_position"
  ) %>%
  mutate(
    chart = str_remove(chart, "peak_chart_positions"),
    # Positional lookup: the first column has an empty suffix, which cannot be
    # used as a name, so suffixes are matched by position. Any unknown suffix
    # yields NA.
    chart = c("US", "AUS", "CAN", "FRA", "GER", "IRE", "JPN", "NZ", "SWE", "UK")[
      match(chart, c("", "_1", "_2", "_3", "_4", "_5", "_6", "_7", "_8", "_9"))
    ]
  ) %>%
  mutate(artist = "Beyoncé", .before = title)
# Stack the per-artist tables (Taylor Swift rows first) and write each
# combined table to its CSV file; paths are relative to the working directory.
all_sales <- bind_rows(list(ts_sales, bey_sales))
all_charts <- bind_rows(list(ts_chart, bey_chart))

write_csv(all_sales, "2020/2020-09-29/sales.csv")
write_csv(all_charts, "2020/2020-09-29/charts.csv")