TidyTuesday
    • About TidyTuesday
    • Datasets
      • 2025
      • 2024
      • 2023
      • 2022
      • 2021
      • 2020
      • 2019
      • 2018
    • Useful links

    On this page

    • Beyoncé and Taylor Swift Lyrics
      • Get the data here
      • Data Dictionary
    • beyonce_lyrics.csv
    • taylor_swift_lyrics.csv
    • sales.csv
    • charts.csv
      • Cleaning Script

    Beyoncé and Taylor Swift Lyrics

    The data this week comes from Rosie Baillie and Dr. Sara Stoudt.

    Beyoncé’s top 100 - Billboard. Taylor Swift’s top 100 - Billboard.

    Rosie put together a wonderful analysis of Taylor Swift lyrics! Can you do some similar work with Beyoncé’s work?

    Text analysis guides in tidytext or Supervised Machine Learning for Text Analysis in R.

    The beyonce palettes R pkg.

    Get the data here

    # Get the Data
    
    # Read in with tidytuesdayR package 
    # Install from CRAN via: install.packages("tidytuesdayR")
    # This loads the readme and all the datasets for the week of interest
    
    # Either ISO-8601 date or year/week works!
    
    tuesdata <- tidytuesdayR::tt_load('2020-09-29')
    tuesdata <- tidytuesdayR::tt_load(2020, week = 40)
    
    beyonce_lyrics <- tuesdata$beyonce_lyrics
    
    # Or read in the data manually
    
    beyonce_lyrics <- readr::read_csv('https://raw.githubusercontent.com/rfordatascience/tidytuesday/main/data/2020/2020-09-29/beyonce_lyrics.csv')
    taylor_swift_lyrics <- readr::read_csv('https://raw.githubusercontent.com/rfordatascience/tidytuesday/main/data/2020/2020-09-29/taylor_swift_lyrics.csv')
    sales <- readr::read_csv('https://raw.githubusercontent.com/rfordatascience/tidytuesday/main/data/2020/2020-09-29/sales.csv')
    charts <- readr::read_csv('https://raw.githubusercontent.com/rfordatascience/tidytuesday/main/data/2020/2020-09-29/charts.csv')

    Data Dictionary

    beyonce_lyrics.csv

    variable class description
    line character Lyric line
    song_id double Song ID
    song_name character Song Name
    artist_id double Artist ID
    artist_name character Artist Name
    song_line double Song line number

    taylor_swift_lyrics.csv

    variable class description
    Artist character Artist
    Album character Album name
    Title character Title of song
    Lyrics character Lyrics

    sales.csv

    variable class description
    artist character Artist name
    title character Song title
    country character Country for sales
    sales double Sales in dollars
    released character released date
    re_release character Re-released date
    label character Label released under
    formats character Formats released as

    charts.csv

    variable class description
    artist character Artist name
    title character Song title
    released character released date
    re_release character Re-released date
    label character Label released under
    formats character Formats released as
    chart character Country Chart
    chart_position character Highest Chart position

    Cleaning Script

    library(tidyverse)
    library(rvest)
    
    ts_url <- "https://en.wikipedia.org/wiki/Taylor_Swift_discography"
    
    raw_ts_html <- ts_url %>% 
      read_html()
    
    ts_raw <- raw_ts_html %>% 
      html_node("#mw-content-text > div.mw-parser-output > table:nth-child(10)") %>% 
      html_table(fill = TRUE) %>% 
      data.frame() %>% 
      janitor::clean_names() %>% 
      tibble() %>% 
      slice(-1, -nrow(.)) %>% 
      mutate(album_details = str_split(album_details, "\n"),
             sales = str_split(sales, "\n"),
      ) %>% 
      select(-certifications) %>% 
      unnest_longer(album_details)  %>% 
      separate(album_details, into = c("album_detail_type", "album_details"), sep = ": ") %>% 
      mutate(album_detail_type = if_else(album_detail_type == "Re-edition", "Re-release", album_detail_type)) %>% 
      pivot_wider(names_from = album_detail_type, values_from = album_details) %>% 
      select(-`na`) %>% 
      janitor::clean_names() 
    
    ts_sales <- ts_raw %>% 
      unnest_longer(sales) %>% 
      separate(sales, into = c("country", "sales"), sep = ": ") %>% 
      mutate(sales = str_trim(sales),
             sales = parse_number(sales)) %>% 
      select(title, country, sales, released:formats) %>% 
      mutate(artist = "Taylor Swift", .before = title)
    
    
    ts_chart <- ts_raw %>% 
      select(title, released:formats, contains("peak_chart")) %>% 
      pivot_longer(cols = contains("peak_chart"), names_to = "chart", values_to = "chart_position") %>% 
      mutate(
        chart = str_remove(chart, "peak_chart_positions"),
      chart = case_when(
        chart == "" ~ "US",
        chart == "_1" ~ "AUS",
        chart == "_2" ~ "CAN",
        chart == "_3" ~ "FRA",
        chart == "_4" ~ "GER",
        chart == "_5" ~ "IRE",
        chart == "_6" ~ "JPN",
        chart == "_7" ~ "NZ",
        chart == "_8" ~ "SWE",
        chart == "_9" ~ "UK",
        TRUE ~ NA_character_
      )
      )  %>% 
      mutate(artist = "Taylor Swift", .before = title)
    
    
    # Beyonce -----------------------------------------------------------------
    
    
    bey_url <- "https://en.wikipedia.org/wiki/Beyonc%C3%A9_discography"
    
    raw_bey_html <- bey_url %>% 
      read_html()
    
    bey_raw <- raw_bey_html %>% 
      html_node("#mw-content-text > div.mw-parser-output > table:nth-child(14)") %>% 
      #mw-content-text > div.mw-parser-output > table:nth-child(14) > tbody > tr:nth-child(3) > th > i > a
      html_table(fill = TRUE) %>% 
      data.frame() %>% 
      janitor::clean_names() %>% 
      tibble() %>% 
      slice(-1, -nrow(.)) %>% 
      mutate(album_details = str_split(album_details, "\n"),
             sales = str_split(sales, "\n"),
      ) %>% 
      select(-certifications) %>% 
      unnest_longer(album_details)  %>% 
      separate(album_details, into = c("album_detail_type", "album_details"), sep = ": ") %>% 
      mutate(album_detail_type = if_else(album_detail_type == "Re-edition", "Re-release", album_detail_type)) %>% 
      pivot_wider(names_from = album_detail_type, values_from = album_details) %>% 
      janitor::clean_names() 
    
    bey_sales <- bey_raw %>% 
      unnest_longer(sales) %>% 
      separate(sales, into = c("country", "sales"), sep = ": ") %>% 
      mutate(sales = str_trim(sales),
             sales = parse_number(sales)) %>% 
      select(title, country, sales, released:label, formats = format)  %>% 
      mutate(artist = "Beyoncé", .before = title)
    
    bey_chart <- bey_raw %>% 
      select(title, released:label, formats = format, contains("peak_chart")) %>% 
      pivot_longer(cols = contains("peak_chart"), names_to = "chart", values_to = "chart_position") %>% 
      mutate(
        chart = str_remove(chart, "peak_chart_positions"),
        chart = case_when(
          chart == "" ~ "US",
          chart == "_1" ~ "AUS",
          chart == "_2" ~ "CAN",
          chart == "_3" ~ "FRA",
          chart == "_4" ~ "GER",
          chart == "_5" ~ "IRE",
          chart == "_6" ~ "JPN",
          chart == "_7" ~ "NZ",
          chart == "_8" ~ "SWE",
          chart == "_9" ~ "UK",
          TRUE ~ NA_character_
        )
      ) %>% 
      mutate(artist = "Beyoncé", .before = title)
    
    all_sales <- bind_rows(ts_sales, bey_sales)
    all_charts <- bind_rows(ts_chart, bey_chart)
    
    write_csv(all_sales, "2020/2020-09-29/sales.csv")
    write_csv(all_charts, "2020/2020-09-29/charts.csv")