Tidy Tuesday Datasets

Size of the Tidy Tuesday Datasets visualized

#TidyTuesday
Author

Aditya Dahiya

Published

June 27, 2024

The figure shows 644 overlapping red rectangles, each representing one #TidyTuesday dataset. Both axes are on log scales: the width of a rectangle shows the dataset's number of variables and its height shows the number of observations. The largest, smallest, widest and longest datasets are labelled.

How I made this graphic

Loading required libraries, data import & creating custom functions

Code
# Data Import and Wrangling Tools
library(tidyverse)            # All things tidy

# Final plot tools
library(scales)               # Nice Scales for ggplot2
library(fontawesome)          # Icons display in ggplot2
library(ggtext)               # Markdown text support for ggplot2
library(showtext)             # Display fonts in ggplot2
library(colorspace)           # Lighten and Darken colours
library(patchwork)            # Combining plots

# Import the dataset-level table of the 2024-07-02 #TidyTuesday release
# (the columns used later are dataset_name, year, variables, observations)
tt_datasets <- "https://raw.githubusercontent.com/rfordatascience/tidytuesday/master/data/2024/2024-07-02/tt_datasets.csv" |>
  readr::read_csv()

# Companion tables from the same release, left commented out for reference:
# tt_summary <- readr::read_csv('https://raw.githubusercontent.com/rfordatascience/tidytuesday/master/data/2024/2024-07-02/tt_summary.csv')
# tt_urls <- readr::read_csv('https://raw.githubusercontent.com/rfordatascience/tidytuesday/master/data/2024/2024-07-02/tt_urls.csv')
# tt_variables <- readr::read_csv('https://raw.githubusercontent.com/rfordatascience/tidytuesday/master/data/2024/2024-07-02/tt_variables.csv')

Exploratory Data Analysis & Data Wrangling

Code
# Distribution of dataset lengths (number of observations), log10 x-axis
ggplot(data = tt_datasets, mapping = aes(x = observations)) +
  geom_histogram() +
  scale_x_log10()

# Distribution of dataset widths (number of variables), log10 x-axis
ggplot(data = tt_datasets, mapping = aes(x = variables)) +
  geom_histogram() +
  scale_x_log10()

# Plotting data: one rectangle per dataset, centred on the origin, whose
# width is log10(variables) and whose height is log10(observations).
plotdf1 <- tt_datasets |>
  # Biggest tables (cells = variables x observations) first
  arrange(desc(variables * observations)) |>
  mutate(
    overall = variables * observations,
    # Side lengths on the log10 scale
    variables1 = log10(variables),
    observations1 = log10(observations),
    # Rectangle corners: half of each side on either side of zero
    x_min = -variables1 / 2,
    x_max = variables1 / 2,
    y_min = -observations1 / 2,
    y_max = observations1 / 2,
    year = as_factor(year),
    # Flag the datasets to highlight: most cells, fewest cells, most
    # variables, most observations. A missing comparison counts as FALSE.
    var_col = coalesce(
      overall == max(overall) |
        overall == min(overall) |
        variables == max(variables) |
        observations == max(observations),
      FALSE
    )
  ) |>
  # Highlighted rows go last so they are drawn on top of the others
  arrange(var_col)

# Look up the highlighted datasets (used to write the annotation text).
# Each label is derived from the property that makes the dataset extreme,
# rather than from the row order left behind by the earlier `arrange()`
# calls, so it stays correct if the data (or the number of flagged rows)
# changes. The extremes of the flagged subset equal the global extremes,
# because every globally extreme row is flagged.
plotdf1 |>
  filter(var_col) |>
  mutate(
    type = case_when(
      overall == max(overall) ~ "largest",
      overall == min(overall) ~ "smallest",
      observations == max(observations) ~ "longest",
      variables == max(variables) ~ "widest"
    )
  ) |>
  select(
    type, dataset_name, variables1, observations1,
    variables, observations
  ) |>
  # Bring back the remaining columns of the raw table; the keys are spelled
  # out (they are the columns the natural join used implicitly)
  inner_join(
    tt_datasets,
    by = join_by(dataset_name, variables, observations)
  )

Visualization Parameters

Code
# Fonts ----
# Font for titles
font_add_google("Xanh Mono",
  family = "title_font"
)

# Font for the caption
font_add_google("Saira Extra Condensed",
  family = "caption_font"
)

# Font for plot text
font_add_google("M PLUS 1 Code",
  family = "body_font"
)

# Let showtext render text with the fonts registered above
showtext_auto()

# Colours ----
# Palette: `cavaliers_retro` from {nbapalettes}, fetched through {paletteer}
mypal <- paletteer::paletteer_d("nbapalettes::cavaliers_retro")

bg_col <- "white"       # plot background
text_col <- mypal[2]    # all plot text
text_hil <- mypal[2]    # user names in the caption
bts <- 80               # base text size; other text sizes are multiples of it

# Caption stuff for the plot ----
# Register the local Font Awesome brands font so the icons can be drawn
sysfonts::font_add(
  family = "Font Awesome 6 Brands",
  regular = here::here("docs", "Font Awesome 6 Brands-Regular-400.otf")
)
# Character references for the brand icons; the closing ";" is supplied by
# the glue templates below
github <- "&#xf09b"
github_username <- "aditya-dahiya"
xtwitter <- "&#xe61b"
xtwitter_username <- "@adityadahiyaias"
# Icon (in the Font Awesome family) followed by the highlighted user name
social_caption_1 <- glue::glue("<span style='font-family:\"Font Awesome 6 Brands\";'>{github};</span> <span style='color: {text_hil}'>{github_username}  </span>")
social_caption_2 <- glue::glue("<span style='font-family:\"Font Awesome 6 Brands\";'>{xtwitter};</span> <span style='color: {text_hil}'>{xtwitter_username}</span>")

Annotation Text for the Plot

Code
# Title in markdown: the hashtag is wrapped in *...* (italics)
plot_title <- "Size of *#TidyTuesday* Datasets"

# Caption: data source followed by the two social-media credit snippets,
# concatenated into one string
plot_caption <- paste0(
  c(
    "**Data:** #TidyTuesday Datasets",
    " |  **Code:** ",
    social_caption_1,
    " |  **Graphics:** ",
    social_caption_2
  ),
  collapse = ""
)

The static plot

Code
# Base plot: one semi-transparent rectangle per dataset, all centred on the
# origin. The highlighted datasets (var_col == TRUE) get a coloured, thicker
# outline; all others get a thin white one.
#
# Axis geometry: a dataset with n variables spans x = -log10(n) / 2 to
# +log10(n) / 2 (likewise for observations on y). A rectangle edge at
# position p therefore corresponds to n = 10^(2 * |p|), and the centre
# (p = 0) to n = 1. The tick labels below follow that mapping so that a
# rectangle's edge can be read directly off the axis.
g <- plotdf1 |>
  ggplot() +
  geom_rect(
    mapping = aes(
      ymin = y_min,
      ymax = y_max,
      xmin = x_min,
      xmax = x_max,
      colour = var_col,
      linewidth = var_col
    ),
    alpha = 0.15,
    fill = mypal[1]
  ) +
  # Values are given in the order FALSE, TRUE
  scale_colour_manual(values = c("white", mypal[2])) +
  scale_linewidth_manual(values = c(0.1, 0.75)) +
  scale_x_continuous(
    # Edge positions of datasets with 100, 10, 1, 10 and 100 variables
    breaks = c(-1, -0.5, 0, 0.5, 1),
    labels = c(100, 10, 1, 10, 100),
    expand = expansion(c(0.05, 0))
  ) +
  scale_y_continuous(
    # Edge positions of datasets with 1, 100, 10 thousand and 1 million
    # observations, mirrored around the centre
    breaks = -3:3,
    labels = c("1M", "10k", "100", "1", "100", "10k", "1M")
  ) +
  labs(
    title = plot_title,
    subtitle = "Each rectangle represents one of the 644 datasets !",
    caption = plot_caption,
    # Variables are the columns of a dataset, observations its rows
    x = "Number of Variables (Columns)",
    y = "Number of Observations (Rows)"
  ) +
  theme_classic(
    base_family = "body_font",
    base_size = bts
  ) +
  theme(
    legend.position = "none",
    plot.title.position = "plot",
    axis.line = element_blank(),
    plot.title = element_markdown(
      colour = text_col,
      margin = margin(10, 0, 15, 0, "mm"),
      hjust = 0.5,
      size = bts * 3,
      family = "title_font"
    ),
    plot.subtitle = element_text(
      colour = text_col,
      margin = margin(0, 0, 15, 0, "mm"),
      hjust = 0.5,
      size = 1.5 * bts
    ),
    # Text box so the HTML/markdown caption (icons, colours) is rendered
    plot.caption = element_textbox(
      hjust = 0.5,
      margin = margin(5, 0, 0, 0, "mm"),
      colour = text_col,
      family = "caption_font"
    ),
    plot.margin = margin(5, 5, 5, 5, "mm"),
    axis.title = element_text(
      colour = text_col,
      margin = margin(0, 0, 0, 0, "mm")
    ),
    axis.text = element_text(
      colour = text_col,
      margin = margin(0, 0, 0, 0, "mm")
    ),
    axis.ticks = element_line(
      colour = text_col,
      linewidth = 0.2
    ),
    axis.ticks.length = unit(2, "mm")
  )

# Add Annotations
# All four call-outs share the same text styling; only the position, the
# label, the horizontal justification and (for one of them) the rotation
# differ. Extra arguments such as `angle` are forwarded to annotate().
annotate_dataset <- function(x, y, label, hjust = 0.5, ...) {
  annotate(
    geom = "text",
    x = x,
    y = y,
    label = label,
    hjust = hjust,
    vjust = 0,
    lineheight = 0.3,
    colour = text_col,
    family = "caption_font",
    size = bts / 3,
    ...
  )
}

# Positions are half of the log10 side lengths of the labelled rectangles,
# i.e. the labels sit on the rectangles' edges
g_full <- g +
  # Top edge of the tallest rectangle
  annotate_dataset(
    x = 0, y = 6.62 / 2,
    label = "Longest Dataset: Drought FIPS.\n2022, Week 24. 3.7 million observations, 4 variables."
  ) +
  # Top-right corner of the rectangle with the most cells
  annotate_dataset(
    x = 1.94 / 2, y = 5.67 / 2,
    label = "Largest Dataset: Rstats Tweets.\n2019, Week 1. 429k observations, 88 variables.",
    hjust = 0.8
  ) +
  # Left edge of the widest rectangle, written vertically
  annotate_dataset(
    x = -2.13 / 2, y = 0,
    label = "Widest Dataset: OWID Energy.\n2023, Week 23. 21k observations, 129 variables.",
    angle = 90
  ) +
  # Top edge of the smallest rectangle
  annotate_dataset(
    x = 0, y = 0.63 / 2,
    label = "Smallest Dataset: City Rural.\n2021, Week 8. 4 observations, 2 variables."
  )

Saving the graphics

Code
library(magick)

# Output locations: the full-size plot and its thumbnail for the webpage
plot_path <- here::here("data_vizs", "tidy_tuesday_datasets.png")
thumb_path <- here::here(
  "data_vizs", "thumbnails", "tidy_tuesday_datasets.png"
)

# Make sure both output folders exist before anything is written; creating
# the thumbnail folder recursively also creates its parent. Existing
# folders are left untouched.
dir.create(dirname(thumb_path), recursive = TRUE, showWarnings = FALSE)

ggsave(
  filename = plot_path,
  plot = g_full,
  width = 400,    # Portrait, width:height = 4:5
  height = 500,
  units = "mm",
  bg = bg_col
)

# Saving a thumbnail for the webpage: re-read the saved plot and scale it
# to a width of 400 (geometry string "400")
image_read(plot_path) |>
  image_resize(geometry = "400") |>
  image_write(thumb_path)