Flint Water Lead Levels: Distribution Patterns

Statistical distributions visualized through combined boxplots and violin plots.

#TidyTuesday

Author

Aditya Dahiya

Published

November 3, 2025

About the Data

This dataset examines lead concentration levels in water samples collected from Flint, Michigan in 2015, during the city’s water crisis. The data comes from a 2018 paper by Loux and Gibson who advocate for using this as a teaching example in introductory statistics courses. The dataset includes two separate collections: samples gathered by the Michigan Department of Environment Quality (MDEQ) and samples from a citizen science project coordinated by Professor Marc Edwards and colleagues at Virginia Tech. The community-sourced samples were collected after concerns emerged about the MDEQ excluding certain samples from their analysis. You can read about the “murky” story behind this data and the ethical questions it raises about data handling. The data is part of the #TidyTuesday project, a weekly social data project in the R community, and was curated by Jen Richmond. All lead measurements are reported in parts per billion (ppb).

Figure 1: This visualization compares lead concentrations across three water sample collections from Flint, Michigan (2015). The y-axis shows lead levels in parts per billion (ppb) on a logarithmic scale. Each panel combines a half-boxplot (left) displaying the median, quartiles, and whiskers (range excluding outliers), with a half-violin plot (right) showing the probability density. Individual measurements appear as colored dots. The MDEQ dataset with all samples includes two high outliers (104 and 58 ppb) that were controversially removed in the second MDEQ analysis. Virginia Tech’s citizen science project collected 271 samples, showing higher variability and maximum lead levels reaching 158 ppb.

How I Made This Graphic

Loading required libraries

Code

pacman::p_load(
  tidyverse, # All things tidy

  scales, # Nice Scales for ggplot2
  fontawesome, # Icons display in ggplot2
  ggtext, # Markdown text support for ggplot2
  showtext, # Display fonts in ggplot2
  colorspace, # Lighten and Darken colours

  patchwork,  # Composing Plots
  gghalves # For half geoms with ggplot2
)

flint_mdeq <- readr::read_csv('https://raw.githubusercontent.com/rfordatascience/tidytuesday/main/data/2025/2025-11-04/flint_mdeq.csv')

flint_vt <- readr::read_csv('https://raw.githubusercontent.com/rfordatascience/tidytuesday/main/data/2025/2025-11-04/flint_vt.csv')

Visualization Parameters

Code

# Font for titles
font_add_google("Saira",
  family = "title_font"
)

# Font for the caption
font_add_google("Saira Condensed",
  family = "body_font"
)

# Font for plot text
font_add_google("Saira Extra Condensed",
  family = "caption_font"
)

showtext_auto()

# A base Colour
bg_col <- "white"
seecolor::print_color(bg_col)

# Colour for highlighted text
text_hil <- "grey40"
seecolor::print_color(text_hil)

# Colour for the text
text_col <- "grey30"
seecolor::print_color(text_col)

line_col <- "grey30"

# Define Base Text Size
bts <- 80

# Caption stuff for the plot
sysfonts::font_add(
  family = "Font Awesome 6 Brands",
  regular = here::here("docs", "Font Awesome 6 Brands-Regular-400.otf")
)
github <- "&#xf09b"
github_username <- "aditya-dahiya"
xtwitter <- "&#xe61b"
xtwitter_username <- "@adityadahiyaias"
social_caption_1 <- glue::glue("<span style='font-family:\"Font Awesome 6 Brands\";'>{github};</span> <span style='color: {text_hil}'>{github_username}  </span>")
social_caption_2 <- glue::glue("<span style='font-family:\"Font Awesome 6 Brands\";'>{xtwitter};</span> <span style='color: {text_hil}'>{xtwitter_username}</span>")
plot_caption <- paste0(
  "**Data:**  Michigan Department of Environment (MDEQ) | Virginia Tech",
  " |  **Code:** ",
  social_caption_1,
  " |  **Graphics:** ",
  social_caption_2
)
rm(
  github, github_username, xtwitter,
  xtwitter_username, social_caption_1,
  social_caption_2
)

# Add text to plot-------------------------------------------------
plot_title <- "Lead Levels in Flint Water Samples (2015)"

plot_subtitle <- "Half-boxplots reveal statistical summaries (median, quartiles, whiskers) while half-violin plots display probability densities. Individual dots represent actual measurements, showing the spread and concentration of lead levels across three sample collections." |> str_wrap(90)

plot_subtitle |> str_view()

Exploratory Data Analysis and Wrangling

Code

# pacman::p_load(summarytools)
# 
# flint_mdeq |> 
#   dfSummary() |> 
#   view()
# 
# flint_vt |> 
#   dfSummary() |> 
#   view()
# 
# flint_mdeq |> 
#   summary()
# 
# flint_vt |> 
#   summary()
# pacman::p_unload(summarytools)

plotdf <- bind_rows(
  flint_mdeq |> 
    rename(
      sample_id = sample,
      mdeq = lead,
      mdeq2 = lead2
    ) |>
    select(-notes) |> 
    pivot_longer(
      cols = c(mdeq, mdeq2),
      values_to = "value",
      names_to = "sample"
    ),
  
  flint_vt |> 
    mutate(
      sample_id = sample,
      sample = "vt"
    ) |> 
  rename(value = lead)
) |> 
  mutate(
    sample_num = case_when(
      sample == "mdeq" ~ 1,
      sample == "mdeq2" ~ 2,
      sample == "vt" ~ 3
    ),
    sample_num = as_factor(sample_num)
  )

# plotdf |> 
#   ggplot(aes(x = value, y = sample)) +
#   geom_boxplot() +
#   scale_x_continuous(
#     transform = "log10"
#   )
# 
# plotdf |> 
#   ggplot(aes(x = value, group = sample, fill = sample)) +
#   geom_histogram(
#     alpha = 0.3,
#     binwidth = 0.1
#   ) +
#   scale_x_continuous(
#     transform = "log10"
#   )
# Add factor and numeric columns to plotdf
plotdf <- plotdf |>
  mutate(
    sample_factor = factor(sample, levels = c("mdeq", "mdeq2", "vt")),
    sample_num = as.numeric(sample_factor)
  )

# Calculate boxplot statistics correctly (using the same method ggplot uses)
stats_df <- plotdf |>
  group_by(sample, sample_factor) |>
  summarise(
    sample_num = first(sample_num),
    median = median(value, na.rm = TRUE),
    q1 = quantile(value, 0.25, na.rm = TRUE),
    q3 = quantile(value, 0.75, na.rm = TRUE),
    iqr = IQR(value, na.rm = TRUE),
    lower_whisker = max(min(value, na.rm = TRUE), q1 - 1.5 * iqr),
    upper_whisker = min(max(value, na.rm = TRUE), q3 + 1.5 * iqr),
    .groups = "drop"
  )

The Plot

Code

g <- plotdf |>
  ggplot(
    mapping = aes(
      x = sample_factor, 
      y = value, 
      fill = sample_factor
    )
  ) +
  
  # Half box-plot (left half)
  geom_half_boxplot(
    side = "left",
    width = 1,
    staplewidth = 0.4,
    varwidth = TRUE,
    alpha = 0.75,
    outlier.shape = NA,
    errorbar.draw = TRUE,
    colour = "grey10",
    linewidth = 0.5
  ) +
  # Violin Plot (right half)
  geom_half_violin(
    side = "right",
    alpha = 0.5,
    trim = FALSE,
    width = 0.75,
    colour = "grey10",
    linewidth = 0.5
  ) +
  
  # Dotplot (right half)
  geom_half_dotplot(
    mapping = aes(colour = sample_factor),
    side = "right",
    nudge = 0.1,
    dotsize = 0.7,
    alpha = 0.6,
    binwidth = 0.05,
    stackratio = 1.1
  ) +
  paletteer::scale_colour_paletteer_d("khroma::highcontrast") +
  paletteer::scale_fill_paletteer_d("khroma::highcontrast") +
  
  
  scale_y_continuous(
    trans = "log10",
    breaks = c(0.1, 0.5, 1, 2, 5, 10, 20, 50, 100),
    labels = label_number(suffix = " ppb")
  ) +
  
  scale_x_discrete(
    labels = c("MDEQ\n(All samples)", 
               "MDEQ\n(2 samples removed)", 
               "Virginia Tech")
  ) +
  coord_cartesian(
    clip = "off"
  ) +
  labs(
    y = "Lead Concentration (ppb) (Log Scale)",
    x = NULL,
    title = plot_title,
    subtitle = plot_subtitle,
    caption = plot_caption,
    fill = NULL, colour = NULL
  ) +

  theme_minimal(
    base_family = "body_font",
    base_size = bts
  ) +
  theme(
    legend.position = "none",
    
    # Overall
    text = element_text(
      margin = margin(0, 0, 0, 0, "mm"),
      colour = text_col,
      lineheight = 0.3
    ),
    
    # Axes
    axis.text.x.bottom = element_text(
      size = bts * 1.2,
      margin = margin(0,0,0,0, "mm")
    ),
    axis.text.y.left = element_text(
      size = bts,
      margin = margin(0,0,0,0, "mm")
    ),
    axis.ticks.x.bottom = element_blank(),
    axis.ticks.length.x = unit(0, "mm"),
    axis.ticks.length.y.left = unit(0, "mm"),
    axis.line = element_blank(),
    panel.grid = element_blank(),
    panel.grid.major.y = element_line(
      linetype = 3,
      linewidth = 0.6,
      colour = text_hil
    ),
    panel.grid.minor.y = element_line(
      linetype = 3,
      linewidth = 0.3,
      colour = text_hil
    ),
    
    # Labels and Strip Text
    plot.title = element_text(
      margin = margin(5, 0, 5, 0, "mm"),
      hjust = 0.5,
      vjust = 0.5,
      colour = text_hil,
      size = 2.3 * bts,
      family = "body_font",
      face = "bold",
      lineheight = 0.25
    ),
    plot.subtitle = element_text(
      margin = margin(2, 0, -20, 0, "mm"),
      vjust = 0.5,
      colour = text_hil,
      size = 1.3 * bts,
      hjust = 0.5,
      halign = 0.5,
      family = "body_font",
      lineheight = 0.3
    ),
    plot.caption = element_markdown(
      family = "caption_font",
      hjust = 0.5,
      margin = margin(5,0,0,0, "mm"),
      colour = text_hil
    ),
    plot.caption.position = "plot",
    plot.title.position = "plot",
    plot.margin = margin(5, 5, 5, 5, "mm")
  )

ggsave(
  filename = here::here(
    "data_vizs",
    "tidy_water_lead_samples.png"
  ),
  plot = g,
  width = 400,
  height = 500,
  units = "mm",
  bg = bg_col
)

Savings the thumbnail for the webpage

Code

# Saving a thumbnail

library(magick)

# Saving a thumbnail for the webpage
image_read(here::here(
  "data_vizs",
  "tidy_water_lead_samples.png"
)) |>
  image_resize(geometry = "x400") |>
  image_write(
    here::here(
      "data_vizs",
      "thumbnails",
      "tidy_water_lead_samples.png"
    )
  )

Session Info

Code

sessioninfo::session_info()$packages |>
  as_tibble() |>
  
  # The attached column is TRUE for packages that were 
  # explicitly loaded with library()
  dplyr::filter(attached == TRUE) |>
  dplyr::select(package,
    version = loadedversion,
    date, source
  ) |>
  dplyr::arrange(package) |>
  janitor::clean_names(
    case = "title"
  ) |>
  gt::gt() |>
  gt::opt_interactive(
    use_search = TRUE
  ) |>
  gtExtras::gt_theme_espn()

Table 1: R Packages and their versions used in the creation of this page and graphics

About the Data

How I Made This Graphic

Loading required libraries

Visualization Parameters

Exploratory Data Analysis and Wrangling

The Plot

Savings the thumbnail for the webpage

Session Info

Links