TidyTuesday
    • About TidyTuesday
    • Datasets
      • 2025
      • 2024
      • 2023
      • 2022
      • 2021
      • 2020
      • 2019
      • 2018
    • Useful links

    On this page

    • Employed Status
      • Get the data here
      • Data Dictionary
    • employed.csv
    • earn.csv
      • Cleaning Script

    Employed Status

    Please note that the #DuboisChallenge is still on going! It might be interesting to revisit that data from last week OR apply the same visual techniques to this modern data.

    The data this week comes from the BLS, specifically table cpsaat17 across several years.

    Employed persons by industry, sex, race, and occupation

    I went ahead and downloaded the data by year for the past 6 years (2015 to 2020).

    I also scraped some earnings data from their self-service data tool, which provided an interesting challenge! If you want to play around with some “offline” web scraping it’s worth a shot.

    Weekly and hourly earnings data from the Current Population Survey

    The BLS has a corresponding article with similar summary data.

    Get the data here

    # Get the Data
    
    # Read in with tidytuesdayR package 
    # Install from CRAN via: install.packages("tidytuesdayR")
    # This loads the readme and all the datasets for the week of interest
    
    # Either ISO-8601 date or year/week works!
    
    tuesdata <- tidytuesdayR::tt_load('2021-02-23')
    tuesdata <- tidytuesdayR::tt_load(2021, week = 9)
    
    employed <- tuesdata$employed
    
    # Or read in the data manually
    
    employed <- readr::read_csv('https://raw.githubusercontent.com/rfordatascience/tidytuesday/main/data/2021/2021-02-23/employed.csv')
    
    earn <- readr::read_csv('https://raw.githubusercontent.com/rfordatascience/tidytuesday/main/data/2021/2021-02-23/earn.csv')

    Data Dictionary

    employed.csv

    Employed persons by industry, sex, race, and occupation

    variable class description
    industry character Industry group
    major_occupation character Major Occupation category
    minor_occupation character Minor Occupation Categoru
    race_gender character Race or gender group (total, men/women, race)
    industry_total double Industry total
    employ_n double Employed number
    year integer Year

    earn.csv

    Weekly median earnings and number of persons employed by race/gender/age group over time

    variable class description
    sex character Gender
    race character Racial group
    ethnic_origin character Ethnic origin (hispanic or non-hispanic)
    age character Age group
    year integer Year
    quarter integer Quarter
    n_persons double Number of persons employed by group
    median_weekly_earn integer Median weekly earning in current dollars

    Cleaning Script

    This was an interesting problem, and I think I have a decent reproducible pipeline for this one!

    Note the data is already cleaned, but feel free to test the below out for yourself.

    #basic url
    # https://www.bls.gov/cps/aa2020/cpsaat17.xlsx
    
    library(tidyverse)
    library(glue)
    
    
    get_bls_report <- function(year){
      
      report_url <- glue::glue("https://www.bls.gov/cps/aa{year}/cpsaat17.xlsx")
      
      download.file(report_url, destfile = glue("2021/2021-02-23/bls-{year}.xlsx"))
    }
    
    ex_2019 <- readxl::read_excel("2021/2021-02-23/bls-2019.xlsx")
    
    2015:2019 %>% 
      walk(get_bls_report)
    
    # 2020 has no year in front of it
    download.file(
      "https://www.bls.gov/cps/cpsaat17.xlsx", 
      destfile = "2021/2021-02-23/bls-2020.xlsx"
      )
    
    # Raw BLS -----------------------------------------------------------------
    
    raw_2020 <- readxl::read_excel("2021/2021-02-23/bls-2020.xlsx", skip = 3) %>% 
      slice(1:(n()-2))
    
    major_grp <- raw_2020 %>% 
      slice(1) %>% 
      select(3:last_col()) %>% 
      set_names(nm = glue::glue("...{1:ncol(.)}")) %>% 
      pivot_longer(cols = everything(), values_to = "major_grp") 
    
    minor_grp <- raw_2020 %>% 
      slice(2) %>% 
      select(3:last_col()) %>% 
      set_names(nm = glue::glue("...{1:ncol(.)}")) %>% 
      pivot_longer(cols = everything(), values_to = "minor_grp") 
    
    combo_grp <- left_join(major_grp, minor_grp, by = "name") %>% 
      mutate(across(.fns = ~str_replace_all(.x, "\n", " "))) %>% 
      mutate(across(.fns = ~str_remove_all(.x, "\r"))) %>% 
      mutate(across(.fns = ~str_replace_all(.x, "- ", ""))) %>% 
      tidyr::fill(major_grp)
    
    name_fill <- c("race_gender", "category", "total", glue("...{1:11}"))
    
    clean_2020 <- raw_2020 %>% 
      rename(category = 1) %>% 
      mutate(
        race_gender = if_else(
          str_detect(category, "Agriculture and related"),
          lag(category),
          NA_character_
          ),
        .before = category
        ) %>% 
      fill(race_gender) %>% 
      slice(5:n()) %>% 
      set_names(nm = name_fill) %>% 
      pivot_longer(cols = contains("..."), names_to = "name", values_to = "employ_n") %>% 
      left_join(combo_grp, by = "name") %>% 
      mutate(year = 2020) %>% 
      select(category, major_grp, minor_grp, race_gender, cat_total = total, employ_n)
    
    # Make a function!
    
    clean_bls <- function(year){
      
      raw_df <- readxl::read_excel(glue("2021/2021-02-23/bls-{year}.xlsx"), skip = 3) %>% 
        slice(1:(n()-2))
      
      major_grp <- raw_df %>% 
        slice(1) %>% 
        select(3:last_col()) %>% 
        set_names(nm = glue::glue("...{1:ncol(.)}")) %>% 
        pivot_longer(cols = everything(), values_to = "major_grp") 
      
      minor_grp <- raw_df %>% 
        slice(2) %>% 
        select(3:last_col()) %>% 
        set_names(nm = glue::glue("...{1:ncol(.)}")) %>% 
        pivot_longer(cols = everything(), values_to = "minor_grp") 
      
      combo_grp <- left_join(major_grp, minor_grp, by = "name") %>% 
        mutate(across(.fns = ~str_replace_all(.x, "\n", " "))) %>% 
        mutate(across(.fns = ~str_remove_all(.x, "\r"))) %>% 
        mutate(across(.fns = ~str_replace_all(.x, "- ", ""))) %>% 
        tidyr::fill(major_grp)
      
      name_fill <- c("race_gender", "category", "total", glue("...{1:11}"))
      
      clean_df <- raw_df %>% 
        rename(category = 1) %>% 
        mutate(
          race_gender = if_else(
            str_detect(category, "Agriculture and related"),
            lag(category),
            NA_character_
          ),
          .before = category
        ) %>% 
        fill(race_gender) %>% 
        slice(5:n()) %>% 
        set_names(nm = name_fill) %>% 
        pivot_longer(cols = contains("..."), names_to = "name", values_to = "employ_n") %>% 
        left_join(combo_grp, by = "name") %>% 
        select(industry = category, major_occupation = major_grp, minor_occupation = minor_grp, race_gender, industry_total = total, employ_n) %>% 
        mutate(year = year)
       
      clean_df
      
    }
    
    # combine the data
    
    all_bls <- 2015:2020 %>% 
      map_dfr(clean_bls) %>% 
      arrange(desc(year)) %>% 
      mutate(
        industry_total = as.integer(industry_total) * 1000,
        employ_n = as.integer(employ_n) * 1000
        )
    
    # sanity check plot
    
    all_bls %>% 
      filter(race_gender == "Black or African American") %>% 
      filter(minor_occupation == "Sales and related occupations") %>% 
      ggplot(aes(x = year, y = employ_n, group = industry)) +
      geom_line()
    
    all_bls %>% 
      write_csv("2021/2021-02-23/employed.csv")
      
    ### Here I'm reading against the HTML file included
    
    library(rvest)
    
    raw_html <- read_html("2021/2021-02-23/bls-all.htm")
    
    raw_html %>% 
      html_nodes("table.catalog") %>% 
      html_table()
    
    all_catalog_raw <- raw_html %>% 
      html_nodes("table.catalog") %>% 
      html_table() 
    
    
    all_catalog_clean <- all_catalog_raw %>% 
      map(clean_catalog)
    
    clean_catalog <- function(table){
      
      table %>% 
        pivot_wider(names_from = X1, values_from = X2) %>% 
        janitor::clean_names()
      
    }
    
    clean_catalog(all_catalog_raw[[2]])
    
    
    all_table_raw <- raw_html %>% 
      html_nodes("table.regular-data")%>% 
      html_table() 
    
    all_table_clean <- all_table_raw %>% 
      map_dfr(clean_table, .id = "id") %>% 
      mutate(id = as.integer(id))
    
    clean_table <- function(table){
      
      table %>% 
        filter(str_detect(Year, "Corrected", negate = TRUE)) %>% 
        mutate(across(everything(), ~str_remove(.x, "\\(C\\)"))) %>% 
        pivot_longer(Qtr1:Qtr4, names_to = "quarter", values_to = "value") %>% 
        rename(year = Year)
      
    }
    
    combine_tables <- bind_rows(all_table_clean) %>% 
      mutate(id = 1:n())
    
    combined_data <- all_catalog_clean %>% 
      bind_rows() %>% 
      mutate(id = row_number()) %>% 
      left_join(all_table_clean, by = "id") %>% 
      mutate(
        year = as.integer(year), 
        quarter = str_remove(quarter, "Qtr") %>% as.integer(),
        value = as.integer(value)
      ) 
    
    data_earn <- combined_data %>% 
      filter(earnings != "Person counts (number in thousands)") %>% 
      rename(median_weekly_earn = value) %>% 
      select(industry:last_col(), -id)
    
    data_earn 
    
    final_bls_earn <- combined_data %>%
      filter(earnings == "Person counts (number in thousands)") %>%
      rename(n_persons = value) %>%
      mutate(n_persons = n_persons * 1000) %>%
      select(industry:quarter, n_persons, -id) %>%
      left_join(
        data_earn,
        by = c(
          "industry", "occupation", "sex", "race", "ethnic_origin", "age", 
          "education", "class_of_worker", "labor_force_status", "year", "quarter"
          )
      ) %>% 
      select(sex, race, ethnic_origin, age, year:median_weekly_earn)
    
    final_bls_earn %>% 
      write_csv("2021/2021-02-23/earn.csv")
    
    # sanity check
    final_bls_earn %>% 
      filter(quarter == 2, sex == "Both Sexes", race != "All Races") %>% 
      ggplot(aes(x = year, y = median_weekly_earn, color = race)) +
      geom_line() +
      facet_wrap(~age)
      distinct(quarter)