TidyTuesday
    • About TidyTuesday
    • Datasets
      • 2025
      • 2024
      • 2023
      • 2022
      • 2021
      • 2020
      • 2019
      • 2018
    • Useful links

    On this page

    • Food Consumption and CO2 Emissions
      • Get the data here
      • Data Dictionary
    • food_consumption.csv
      • Cleaning Script

    Food Consumption and CO2 Emissions

    The data this week comes from nu3 and was contributed by Kasia Kulma.

    Kasia has put together a great guide on web scraping, data cleaning, and data organization! Make sure to check out her blog post; the raw code from it is also reproduced in the cleaning script below.

    Get the data here

    # Get the Data
    
    # Option 1: read the CSV straight from GitHub.
    food_consumption <- readr::read_csv(
      "https://raw.githubusercontent.com/rfordatascience/tidytuesday/main/data/2020/2020-02-18/food_consumption.csv"
    )
    
    # Option 2: read in with the tidytuesdayR package
    # (https://github.com/dslc-io/tidytuesdayR).
    # NOTE: to use 2020 data you need the GitHub version of tidytuesdayR
    # (the minimum version is not stated here).
    # Install via pak::pak("dslc-io/tidytuesdayR")
    
    # Either an ISO-8601 date or year/week works. The two calls are
    # alternatives: run only one, otherwise the same data is downloaded twice
    # and `tuesdata` is simply overwritten.
    tuesdata <- tidytuesdayR::tt_load("2020-02-18")
    # tuesdata <- tidytuesdayR::tt_load(2020, week = 8)
    
    # tt_load() returns one element per file in the week; pull out the table.
    food_consumption <- tuesdata$food_consumption

    Data Dictionary

    food_consumption.csv

    variable class description
    country character Country Name
    food_category character Food Category
    consumption double Consumption (kg/person/year)
    co2_emmission double CO2 emission (kg CO2/person/year)

    Cleaning Script

    library(tidyverse)
    library(janitor)
    library(rvest)
    
    # Credit to Kasia and minorly edited to create output file and test plot
    # Blog post at https://r-tastic.co.uk/post/from-messy-to-tidy/
    
    # Page that hosts the food carbon footprint table.
    url <- "https://www.nu3.de/blogs/nutrition/food-carbon-footprint-index-2018"
    
    # Download and parse the page once; `url_html` is queried again later in
    # the script when the column headers are scraped.
    url_html <- read_html(url)
    
    # Parse every <table> on the page into a data frame and keep the first one.
    # html_elements() is the current name for the superseded html_nodes().
    whole_table <- url_html %>%
      html_elements("table") %>%
      html_table(fill = TRUE) %>%
      .[[1]]
    
    # Strip the parts of the scraped table that carry no data.
    table_content <- whole_table %>%
      select(-X1) %>% # remove redundant column
      slice(-(1:3))   # remove the three redundant leading rows
    
    # The header text is not in the table cells; it is read from the `title`
    # attribute of the elements matching ".thead-icon".
    raw_headers <- url_html %>%
      html_elements(".thead-icon") %>%
      html_attr("title")
    
    # The slices below use fixed positions. Fail early if the page no longer
    # has enough header icons, because `28:length(x)` would otherwise count
    # backwards and silently return the wrong elements.
    if (length(raw_headers) < 28) {
      stop(
        "Expected at least 28 '.thead-icon' titles, found ",
        length(raw_headers), "; the page layout may have changed.",
        call. = FALSE
      )
    }
    
    # Positions 28..end: the per-column metric labels (these become the
    # "Metric" part of each combined column name).
    tidy_bottom_header <- raw_headers[28:length(raw_headers)]
    tidy_bottom_header[1:10]
    
    # Positions 17..27: the food-category labels.
    raw_middle_header <- raw_headers[17:27]
    raw_middle_header
    
    # Each food category spans two columns, hence `each = 2`. The total columns
    # have no icon of their own, so their names are inserted by hand.
    # NOTE(review): presumably the first 7 categories are animal products and
    # the remaining ones are not - confirm against the source page.
    tidy_headers <- c(
      rep(raw_middle_header[1:7], each = 2),
      "animal_total",
      rep(raw_middle_header[8:length(raw_middle_header)], each = 2),
      "non_animal_total",
      "country_total")
    
    tidy_headers
    
    # Join category and metric with ";" so they can be split apart again later.
    combined_colnames <- paste(tidy_headers, tidy_bottom_header, sep = ";")
    
    # Every column except the first (country) needs exactly one rebuilt name.
    if (length(combined_colnames) != ncol(table_content) - 1) {
      stop(
        "Built ", length(combined_colnames), " column names for ",
        ncol(table_content) - 1, " data columns; the page layout may have changed.",
        call. = FALSE
      )
    }
    
    colnames(table_content) <- c("Country", combined_colnames)
    glimpse(table_content[, 1:10])
    
    # Wide -> long: one row per country and combined "category;metric" column.
    long_table <- table_content %>%
      # make column names observations of Category variable
      tidyr::pivot_longer(
        cols = -Country,
        names_to = "Category",
        values_to = "Values"
      ) %>%
      # separate food-related information from the metric
      tidyr::separate(
        col = Category,
        into = c("Food Category", "Metric"),
        sep = ";"
      )
    
    glimpse(long_table)
    
    # Long -> tidy: one column per metric, then snake_case all column names
    # ("Food Category" becomes `food_category`).
    tidy_table <- long_table %>%
      tidyr::pivot_wider(names_from = Metric, values_from = Values) %>%
      janitor::clean_names("snake")
    
    glimpse(tidy_table)
    
    # Columns 3 and 4 are the two metric columns created by pivot_wider();
    # they are renamed by position. `co2_emmission` (sic) is the column name
    # used in the published CSV, so the spelling is kept as is.
    # Rows whose food category contains "total" are the hand-named total
    # columns and are dropped.
    final_table <- tidy_table %>%
      rename(consumption = 3,
             co2_emmission = 4) %>%
      filter(!stringr::str_detect(food_category, "total"))
    
    # Convert the two metric columns from text to numbers.
    # across() replaces the superseded mutate_at(vars(...), fn).
    clean_table <- final_table %>%
      mutate(across(c(consumption, co2_emmission), parse_number))
    
    # Save the cleaned data as the week's CSV file.
    write_csv(
      clean_table,
      here::here("2020/2020-02-18", "food_consumption.csv")
    )
    
    # Test plot: consumption per food category (categories ordered by
    # consumption), one jittered point per row, coloured by country. The
    # legend is hidden and the axes are flipped.
    ggplot(
      data = clean_table,
      mapping = aes(
        x = fct_reorder(food_category, consumption),
        y = consumption,
        color = country
      )
    ) +
      geom_jitter() +
      theme(legend.position = "none") +
      coord_flip()