Average Age of Olympians over the years

A ridgeline density plot of the ages of Olympics participants and medal winners, faceted by the medal outcome. Plotted from 1896 to 2016 Olympics (120 years). The central dot shows the mean age for that year.

#TidyTuesday
Sports
{ggridges}
Author

Aditya Dahiya

Published

August 6, 2024

This week, we’re diving into the rich history of the modern Olympic Games with a dataset from Kaggle’s RGriffin. The dataset, titled “120 years of Olympic history: athletes and results,” offers an extensive collection of data on athletes and their medal results from the Athens 1896 Olympics up to the Rio 2016 Games. This dataset, which we previously explored during TidyTuesday on July 27, 2021, includes detailed biographical data on athletes.

The data was sourced from sports-reference.com in May 2018. This site provides comprehensive data from both the Summer and Winter Games, which were held in the same year until 1992. The subsequent staggering of these events, with the Winter Games starting in 1994 and the Summer Games in 1996, is an important historical detail often overlooked in analyses. For a deeper exploration, check out the Olympics results page and the blog on sports-reference.com.

Figure 1: A ridgeline density plot of the ages of Olympics participants and medal winners, faceted by the medal outcome. Plotted from 1896 to 2016 Olympics (120 years). The central dot shows the mean age for that year.

How I made this graphic?

Loading libraries & data

Code
# Data Import and Wrangling Tools
library(tidyverse)            # All things tidy

# Final plot tools
library(scales)               # Nice Scales for ggplot2
library(fontawesome)          # Icons display in ggplot2
library(ggtext)               # Markdown text support for ggplot2
library(showtext)             # Display fonts in ggplot2
library(colorspace)           # Lighten and Darken colours
library(seecolor)             # To print and view colours
library(patchwork)            # Combining plots

# Option 2: Read directly from GitHub

olympics <- readr::read_csv('https://raw.githubusercontent.com/rfordatascience/tidytuesday/master/data/2024/2024-08-06/olympics.csv')

Data Wrangling (1)

Code
years_select <- seq(2000, 2016, 4)

continents <- rnaturalearth::ne_countries() |> 
  as_tibble() |> 
  select(iso_a3, continent, name)

df <- olympics |> 
  # filter(year %in% years_select) |> 
  filter(season == "Summer") |> 
  group_by(year) |> 
  count(team, noc, sort = TRUE)

df1 <- df |> 
  left_join(continents, by = join_by(team == name)) |> 
  select(-iso_a3)
  
df2 <- df |> 
  left_join(continents, by = join_by(noc == iso_a3)) |> 
  select(-name)

df3 <- df1 |> 
  bind_rows(df2) |> 
  ungroup() |> 
  drop_na() |> 
  distinct() |> 
  arrange(desc(n))

df4 <- df3 |> 
  group_by(year, continent) |> 
  summarise(n = sum(n))

df5 <- df4 |> 
  group_by(year) |> 
  mutate(perc = round(100 * n / sum(n), 1))
  
plot_donut <- function(yr){
  df5 |> 
  filter(year == yr) |> 
  ggplot(aes(1, n, fill = continent)) +
  geom_col(
    colour = bg_col,
    width = 0.15
  ) +
  geom_text(
    mapping = aes(
      label = paste0(perc, " %")
    ),
    position = position_stack(vjust = 0.5),
    family = "body_font"
  ) +
  annotate(
    geom = "text",
    x = 0 , y = 0,
    label = yr,
    size = bts * 1.5,
    family = "body_font"
  ) +
  # facet_wrap(~ year, scales = "free") +
  scale_x_continuous(limits = c(0, 1.5)) +
  coord_polar(theta = "y") +
  theme_void(
    base_family = "body_font",
    base_size = bts
  ) +
  theme(
    legend.position = "none",
    plot.background = element_rect(
      fill = "transparent",
      colour = "transparent"
    ),
    panel.background = element_rect(
      fill = "transparent",
      colour = "transparent"
    )
  )
}

bmi_df <- olympics |> 
  filter(season == "Summer") |> 
  mutate(
    medal = replace_na(medal, "Others"),
    medal = fct(
      medal, 
      levels = c("Gold", "Silver", "Bronze", "Others")
    )
  ) |>
  mutate(bmi = (weight / ((height/100)^2)))

# What percentages are NA for each medal type
bmi_df |> 
  group_by(medal) |> 
  summarise(na_percentage = mean(is.na(bmi)) * 100) %>%
  pull(na_percentage)


library(ggridges)

bmi_df |> 
  ggplot(
    mapping = aes(
      x = weight,
      y = as_factor(year)
    )
  ) +
  geom_density_ridges(
    alpha = 0.5
  ) +
  scale_x_continuous(limits = c(40, 120)) +
  # scale_x_continuous(limits = c(150, 210)) +
  # scale_x_continuous(limits = c(18, 30)) +
  facet_wrap(~ medal, nrow = 1)


olympics |> 
  filter(!is.na(medal)) |> 
  group_by(name) |> 
  count(sort = T)

olympics |>
  filter(season == "Summer") |> 
  filter(!is.na(medal)) |> 
  group_by(year, team, noc, medal) |> 
  count() |> 
  pivot_wider(
    id_cols = c(year, noc, team),
    names_from = medal,
    values_from = n,
    values_fill = 0
  ) |> 
  janitor::clean_names() |> 
  mutate(total = bronze + silver + gold) |> 
  group_by(year) |> 
  slice_max(order_by = total, n = 10) |> 
  mutate(rank = row_number())

Data Wrangling (2)

Code
continents <- rnaturalearth::ne_countries() |> 
  as_tibble() |> 
  select(iso_a3, continent, name)

df <- olympics |> 
  filter(!is.na(medal)) |> 
  filter(season == "Summer") |> 
  group_by(year) |> 
  count(team, noc, medal)

df1 <- df |> 
  left_join(continents, by = join_by(team == name)) |> 
  select(-iso_a3)
  
df2 <- df |> 
  left_join(continents, by = join_by(noc == iso_a3)) |> 
  select(-name)

df3 <- df1 |> 
  bind_rows(df2) |> 
  ungroup() |> 
  drop_na() |> 
  distinct() |> 
  arrange(desc(n))

df4 <- df3 |> 
  group_by(year, continent, medal) |> 
  summarise(n = sum(n)) |> 
  group_by(year) |> 
  mutate(perc = round(100 * n / sum(n), 1)) |> 
  ungroup() |>
  mutate(
    medal = fct(
      medal, 
      levels = c("Gold", "Silver", "Bronze")
    )
  )
  
df4 |> 
  ggplot(
    mapping = aes(
      x = year, 
      y = n,
      fill = continent,
      alpha = medal
    )
  ) +
  # geom_area(
  #   position = "fill",
  #   colour = "white"
  # ) +
  ggstream::geom_stream(
    type = "proportional",
    bw = 0.5
  ) +
  scale_alpha_manual(
    values = c(0.9, 0.7, 0.5)
  )

Data Wrangling (3) - Age Distribution amongst medal winners

Code
mypal <- c("#FFD700", "#C0C0C0", "#ad8a56", "grey20")

age_df <- olympics |> 
  filter(season == "Summer") |> 
  mutate(
    medal = replace_na(medal, "Other Participants"),
    medal = if_else(
      medal == "Other Participants",
      medal,
      paste0(medal, " Medalists")
    ),
    medal = fct(
      medal, 
      levels = c("Gold Medalists", 
                 "Silver Medalists", 
                 "Bronze Medalists", 
                 "Other Participants")
    )
  )

age_med <- age_df |> 
  group_by(year, medal) |> 
  summarise(
    median_age = median(age, na.rm = TRUE),
    mean_age = mean(age, na.rm = TRUE)
  )

Visualization Parameters

Code
# Font for titles
font_add_google("Bilbo",
  family = "title_font"
) 

# Font for the caption
font_add_google("Barlow Semi Condensed",
  family = "caption_font",
  regular.wt = 400,
  bold.wt = 700
) 

# Font for plot text
font_add_google("Ubuntu Condensed",
  family = "body_font"
) 

showtext_auto()

text_col <- "grey20"
text_hil <- "grey30"

bts <- 80

# Caption stuff for the plot
sysfonts::font_add(
  family = "Font Awesome 6 Brands",
  regular = here::here("docs", "Font Awesome 6 Brands-Regular-400.otf")
)
github <- "&#xf09b"
github_username <- "aditya-dahiya"
xtwitter <- "&#xe61b"
xtwitter_username <- "@adityadahiyaias"
social_caption_1 <- glue::glue("<span style='font-family:\"Font Awesome 6 Brands\";'>{github};</span> <span style='color: {text_hil}'>{github_username}  </span>")
social_caption_2 <- glue::glue("<span style='font-family:\"Font Awesome 6 Brands\";'>{xtwitter};</span> <span style='color: {text_hil}'>{xtwitter_username}</span>")

plot_title <- "Average Age of Olympians"

plot_subtitle <- "Since the start of Modern Summer Olympics in 1896, the average age **(represented by**<br>**the dots)** of Olympics' medallists and participants has stayed around 25 years<br>**(represented by the vertical line)**. Of late, average age has been increasing slightly."
str_view(plot_subtitle)

plot_caption <- paste0(
  "**Data:** _#TidyTuesday_ & RGriffin's Kaggle Dataset: ", 
  " |  **Code:** ", 
  social_caption_1, 
  " |  **Graphics:** ", 
  social_caption_2
  )

rm(github, github_username, xtwitter, 
   xtwitter_username, social_caption_1, 
   social_caption_2)

The static plot: Attempt 1

Code
df5 |> 
  ggplot(
    mapping = aes(
      x = 1, 
      y = n, 
      fill = continent
    )
  ) +
  geom_col(
    colour = bg_col,
    width = 0.15
  ) +
  geom_text(
    mapping = aes(
      label = paste0(perc, " %")
    ),
    position = position_stack(vjust = 0.5),
    family = "body_font"
  ) +
  facet_wrap(~ year, scales = "free") +
  scale_x_continuous(
    limits = c(0, 1.5), 
    expand = expansion(c(0, -0.3))
  ) +
  coord_polar(theta = "y") +
  theme_void(
    base_family = "body_font",
    base_size = bts
  ) +
  theme(
    legend.position = "none",
    plot.background = element_rect(
      fill = "transparent",
      colour = "transparent"
    ),
    panel.background = element_rect(
      fill = "transparent",
      colour = "transparent"
    )
  )

The static plot: Attempt 2

Code
df5 |> 
  ggplot(aes(year, n, fill = continent)) +
  ggstream::geom_stream(
    type = "proportional",
    colour = "white"
  )

The static plot: Attempt 3 produced the Figure 1

Code
bts = 80

g <- age_df |> 
  ggplot(
    mapping = aes(
      x = age,
      y = as_factor(year),
      fill = medal
    )
  ) +
  ggridges::geom_density_ridges(
    alpha = 0.5,
    panel_scaling = TRUE
  ) +
  scale_fill_manual(
    values = mypal
  ) +
  scale_colour_manual(
    values = darken(mypal, 0.3)
  ) +
  geom_point(
    data = age_med,
    mapping = aes(
      x = mean_age,
      y = as_factor(year),
      colour = medal
    ),
    position = position_nudge(y = 0.5),
    size = 3
  ) +
  geom_vline(
    mapping = aes(colour = medal),
    xintercept = 25,
    linetype = 1,
    linewidth = 0.75,
    alpha = 0.1
  ) +
  scale_x_continuous(
    limits = c(15, 35),
    expand = expansion(0)
  ) +
  scale_y_discrete(
    expand = expansion(0)
  ) +
  coord_cartesian(clip = "off") +
  facet_wrap(~ medal, nrow = 1) +
  labs(
    title = plot_title,
    subtitle = plot_subtitle,
    caption = plot_caption,
    y = "Year of the Summer Olympics",
    x = "Age (in years)"
  ) +
  theme_minimal(
    base_family = "body_font",
    base_size = bts
  ) +
  theme(
    legend.position = "none",
    plot.title.position = "plot",
    plot.margin = margin(10,15,10,15, "mm"),
    plot.title = element_text(
      colour = text_hil,
      size = 5 * bts,
      margin = margin(5,0,5,0, "mm"), 
      hjust = 0.5,
      family = "title_font",
      face = "bold"
    ),
    plot.subtitle = element_markdown(
      colour = text_hil,
      size = 1.2 * bts,
      lineheight = 0.35,
      margin = margin(0,0,5,0, "mm"),
      family = "caption_font",
      hjust = 0.5
    ),
    plot.caption = element_textbox(
      hjust = 0.5,
      colour = text_hil
    ),
    strip.text = element_text(
      margin = margin(10,0,5,0, "mm"),
      colour = text_hil,
      size = 1.2 * bts
    ),
    panel.grid = element_blank(),
    axis.text.y = element_text(
      vjust = -0.2,
      colour = text_hil
    ),
    axis.ticks = element_blank(),
    axis.ticks.length = unit(0, "mm"),
    panel.spacing = unit(15, "mm"),
    axis.text.x = element_text(
      colour = text_hil,
      margin = margin(5,0,0,0, "mm")
    ),
    axis.title = element_text(
      colour = text_hil
    )
  )

Savings the graphics

Code
ggsave(
  filename = here::here("data_vizs", "tidy_olympics.png"),
  plot = g,
  width = 400,    # Best Twitter Aspect Ratio = 5:4
  height = 500,   
  units = "mm",
  bg = "white"
)

library(magick)
# Saving a thumbnail for the webpage
image_read(here::here("data_vizs", "tidy_olympics.png")) |> 
  image_resize(geometry = "400") |> 
  image_write(here::here("data_vizs", "thumbnails", "tidy_olympics.png"))