Chapter 8



Aditya Dahiya


December 17, 2023


This chapter has no exercises, so we explore annotations and related packages using the Holiday Episodes data from #TidyTuesday. All code is annotated to explain the steps.

Loading Libraries and the Data-Set: —

library(tidyverse)         # data wrangling and ggplot2
library(summarytools)      # EDA
library(gt)                # Tables
library(ggthemes)          # Themes
library(ggtext)            # Text, Annotations and Markdown
library(magick)            # Images

# Download the #TidyTuesday data released for 2023-12-19
tuesdata <- tidytuesdayR::tt_load("2023-12-19")
#>     Downloading file 1 of 2: `holiday_episodes.csv`
#>     Downloading file 2 of 2: `holiday_episode_genres.csv`

# Keep the two tables of the release as separate objects
holep <- tuesdata$holiday_episodes
holep_genres <- tuesdata$holiday_episode_genres

A look at 5 rows of the complete data set: —

# Show the most-voted holiday episodes as a formatted gt table.
# The steps are chained with the native pipe operator |>.
holep |>
  # Keep the 5 episodes with the highest number of votes
  slice_max(num_votes, n = 5) |>
  # Create a gt table
  gt() |>
  # Label columns using janitor::make_clean_names function
  cols_label_with(fn = ~ janitor::make_clean_names(., case = "title")) |>
  # Apply styling to the table cells to make the text small
  tab_style(
    style = cell_text(size = "small"),
    locations = cells_body()
  ) |>
  # Apply a gt theme from the gtExtras package
  # NOTE(review): the theme call itself is missing from this copy of the
  # chapter; gt_theme_538() is assumed - confirm against the original.
  gtExtras::gt_theme_538()
Tconst Parent Tconst Season Number Episode Number Primary Title Original Title Year Runtime Minutes Genres Simple Title Average Rating Num Votes Parent Title Type Parent Primary Title Parent Original Title Parent Start Year Parent End Year Parent Runtime Minutes Parent Genres Parent Simple Title Parent Average Rating Parent Num Votes Christmas Hanukkah Kwanzaa Holiday
tt3973198 tt2085059 2 4 White Christmas White Christmas 2014 73 Drama,Mystery,Sci-Fi white christmas 9.1 66843 tvSeries Black Mirror Black Mirror 2011 NA 60 Drama,Mystery,Sci-Fi black mirror 8.7 620664 TRUE FALSE FALSE FALSE
tt10166582 tt10160804 1 6 So This Is Christmas? So This Is Christmas? 2021 61 Action,Adventure,Crime so this is christmas 8.0 11460 tvMiniSeries Hawkeye Hawkeye 2021 2021 339 Action,Adventure,Crime hawkeye 7.5 206915 TRUE FALSE FALSE FALSE
tt1672218 tt0436992 6 0 A Christmas Carol A Christmas Carol 2010 62 Adventure,Drama,Sci-Fi a christmas carol 8.5 8109 tvSeries Doctor Who Doctor Who 2005 NA 45 Adventure,Drama,Sci-Fi doctor who 8.6 239270 TRUE FALSE FALSE FALSE
tt0562994 tt0436992 2 0 The Christmas Invasion The Christmas Invasion 2005 60 Adventure,Drama,Sci-Fi the christmas invasion 8.0 8089 tvSeries Doctor Who Doctor Who 2005 NA 45 Adventure,Drama,Sci-Fi doctor who 8.6 239270 TRUE FALSE FALSE FALSE
tt0664513 tt0386676 2 10 Christmas Party Christmas Party 2005 22 Comedy christmas party 8.7 7369 tvSeries The Office The Office 2005 2013 22 Comedy the office 9.0 680216 TRUE FALSE FALSE FALSE

8.1 Plot and axis titles

# Scatterplot of rating vs. run-time for the 200 most-voted episodes,
# with the 20 most-voted episodes labelled by title.
# The steps are chained with the native pipe operator |>.
holep |>
  # Select the top 200 rows based on the 'num_votes' column
  slice_max(order_by = num_votes, n = 200) |>
  # Arrange the data in descending order of 'num_votes'
  arrange(desc(num_votes)) |>
  # Add 'id' (row number) and blank out 'primary_title' beyond the first
  # 20 rows so that only those episodes receive a text label
  mutate(
    id = row_number(),
    primary_title = if_else(id <= 20, primary_title, NA_character_)
  ) |>
  # Create a ggplot scatterplot
  ggplot(aes(x = runtime_minutes, y = average_rating)) +
  geom_smooth(method = "lm", col = "lightblue", se = FALSE) +
  geom_jitter(aes(size = num_votes), alpha = 0.2) +
  geom_text(
    aes(label = primary_title),
    check_overlap = TRUE,
    col = "black"
  ) +
  # Customize axis scales and size scale
  scale_x_continuous(
    breaks = c(0, 15, 30, 45, 60, 90, 120),
    limits = c(0, 120)
  ) +
  scale_size_continuous(
    range = c(1, 10),
    # label_number_si() is deprecated in scales; this is its replacement
    labels = scales::label_number(scale_cut = scales::cut_short_scale()),
    trans = "sqrt"
  ) +
  # Set themes for the plot
  cowplot::theme_half_open() +
  theme(
    legend.position = "bottom",
    legend.direction = "horizontal",
    # element_markdown() (ggtext) renders the markdown used in labs()
    axis.title.x = element_markdown(hjust = 1),
    axis.title.y = element_markdown(hjust = 1),
    plot.title = element_markdown(face = "plain"),
    legend.title = element_markdown(),
    axis.line = element_line(
      arrow = arrow(angle = 15, length = unit(4, "mm"))
    )
  ) +
  # Add labels and annotations
  labs(
    x = "Run-time *(in min.)*",
    y = "Average Rating",
    size = "Number of Votes (on **IMDb**)",
    title = "**IMDb** Holiday TV Show Episodes: Ratings _vs._ Run-time",
    subtitle = "A Scatterplot of the top-200 episodes (by number of votes on IMDb) of holiday TV shows,\nwith top 20 labelled with names of episodes"
  ) +
  annotate(
    geom = "text",
    label = quote(y == a.x + b),
    x = 90, y = 6,
    col = "blue",
    fontface = "italic"
  )

Figure 1: Scatterplot of TV Episodes Ratings vs. Runtime - demonstrating ‘labs’ of ggplot2 - markdown elements

8.2 Text labels

# Scatterplot of the 40 most-voted TV series, drawn with text labels
# instead of points. The steps are chained with the native pipe |>.
holep |>
  # Group the data by 'parent_primary_title'
  group_by(parent_primary_title) |>
  # Summarize per series: count of episodes, mean votes, mean ratings,
  # and the unique episode genres concatenated into one string
  summarise(
    n = n(),
    votes = mean(parent_num_votes),
    ratings = mean(parent_average_rating),
    genre = paste(unique(genres), collapse = ",")
  ) |>
  # Select the top 40 rows based on 'votes'
  slice_max(order_by = votes, n = 40) |>
  # Arrange the data in descending order of 'votes'
  arrange(desc(votes)) |>
  # Add a new column 'gen_col'; the first matching genre wins
  mutate(gen_col = case_when(
    str_detect(genre, "Comedy") ~ "Comedy",
    str_detect(genre, "Horror") ~ "Horror",
    str_detect(genre, "Action") ~ "Action",
    str_detect(genre, "Drama") ~ "Drama",
    .default = "Others"
  )) |>
  # Create a ggplot scatterplot made of text labels
  ggplot(
    aes(
      x = votes,
      y = ratings,
      size = n,
      label = parent_primary_title,
      color = gen_col
    )
  ) +
  geom_text(check_overlap = TRUE, hjust = "inward") +
  # Customize labels, titles, and scales
  labs(
    x = "Number of votes (on IMDb) for the TV Series",
    y = "Average Rating of episodes",
    size = "Number of Holiday Season episodes",
    color = "Genre",
    title = "Most popular TV Series with at least 1 holiday episode",
    subtitle = "A scatter-plot replaced by Text Labels"
  ) +
  scale_size_continuous(range = c(4, 7)) +
  scale_x_continuous(
    # label_number_si() is deprecated in scales; this is its replacement
    labels = scales::label_number(scale_cut = scales::cut_short_scale()),
    limits = c(0, 1100000)
  ) +
  # NOTE(review): only four colours are supplied while 'gen_col' can take
  # five values; presumably one category is absent from the top 40 - verify.
  scale_color_manual(values = c("blue", "red", "darkgreen", "purple")) +
  # Set themes for the plot
  theme_classic() +
  theme(
    legend.position = "bottom",
    # NOTE(review): the argument name is missing from this copy of the
    # chapter; `legend.box` (stack the two legends) is assumed - it may
    # have been `legend.direction`. Confirm against the original.
    legend.box = "vertical",
    legend.margin = margin(0, 0, 0, 0),
    legend.spacing = unit(0, "pt"),
    axis.line.x = element_line(arrow = arrow(angle = 15))
  )

Figure 2: Demonstrating the use of Text Labels in place of points in a scatterplot

8.3 Building custom annotations

# IMDb logo image for annotation in the plot
# NOTE(review): the path/URL passed to image_read() is empty in this copy of
# the chapter; supply the location of the logo image before running.
img <- image_read("")

# Extract the top 10 TV series with at least 1 holiday episode based on
# IMDb votes: one row per series with its run (start/end year), votes,
# runtime, rating and number of holiday episodes
tv10 <- holep |>
  group_by(parent_tconst, parent_primary_title) |>
  summarise(
    start = mean(parent_start_year, na.rm = TRUE),
    end = mean(parent_end_year, na.rm = TRUE),
    votes = mean(parent_num_votes, na.rm = TRUE),
    runtime = mean(parent_runtime_minutes, na.rm = TRUE),
    rating = mean(parent_average_rating, na.rm = TRUE),
    num_episodes = n()
  ) |>
  ungroup() |>
  # Length of the series' run in years
  mutate(years = end - start) |>
  # Drop series with any missing summary value before ranking
  drop_na() |>
  slice_max(order_by = votes, n = 10)

# Filter the 'holep' dataframe to see only the holiday episodes of the
# top 10 series, then plot them on a timeline of each series' run
holep |>
  filter(parent_primary_title %in% (tv10 |> pull(parent_primary_title))) |>
  # Turn the series title into a factor whose levels follow the tv10 order
  mutate(
    parent_primary_title = factor(
      parent_primary_title,
      levels = (tv10 |> pull(parent_primary_title))
    )
  ) |>
  # Create a ggplot scatterplot
  ggplot(aes(x = year, y = fct_rev(parent_primary_title))) +
  ggrepel::geom_text_repel(
    aes(label = primary_title),
    vjust = +1
  ) +
  # One horizontal bar per series, spanning its start and end year
  geom_segment(
    data = tv10,
    aes(
      x = start,
      xend = end,
      y = parent_primary_title,
      yend = parent_primary_title
    ),
    alpha = 0.3, lineend = "round", linewidth = 3
  ) +
  # One dot per holiday episode
  geom_point(
    aes(color = average_rating, size = num_votes),
    alpha = 0.7
  ) +
  # Customize labels, titles, and scales
  labs(
    x = NULL, y = NULL,
    title = "Holiday Episodes of IMDb's top 10 TV Series",
    subtitle = "Size of dot indicates number of votes, Color indicates Average Rating on IMDb",
    colour = "Average Rating\n(IMDb)",
    size = "Number of Votes (IMDb)"
  ) +
  scale_color_gradient(low = "pink", high = "purple") +
  scale_size_continuous(range = c(2, 8)) +
  theme_minimal() +
  theme(
    legend.position = "bottom",
    plot.title.position = "plot"
  ) +
  # Text annotation: data source label inside the panel
  annotate(
    geom = "label",
    x = 1993,
    y = 1.5,
    label = "Source: IMDb Non-Commercial Datasets",
    fontface = "italic",
    hjust = 0,
    fill = "#f2c522"
  ) +
  # Image annotation: the logo held in 'img', placed in data coordinates
  annotation_custom(
    grob = grid::rasterGrob(img),
    xmin = 1993,
    xmax = 1998,
    ymin = 2,
    ymax = 5
  )

Figure 3: Text Annotations within a plot’s panel area

Building a Visualization with Image annotations on the y-axis

An attempt to make a nice visualization with annotations for #TidyTuesday: —

# Libraries --------------------------------------------------------------------
library(tidyverse)      # Data Wrangling and Plotting
library(here)           # Files location and loading
library(summarytools)   # Exploratory Data Analysis
library(colorfindr)     # To get colour palettes for the Viz
library(showtext)       # Using Fonts More Easily in R Graphs
library(ggimage)        # Using Images in ggplot2
library(fontawesome)    # Social Media icons
library(ggtext)         # Markdown Text in ggplot2
library(patchwork)      # For compiling plots
library(figpatch)       # Images in patchwork
library(magick)         # Work with Images and Logos
library(ggimage)        # Background Image
library(cropcircles)    # Crop Images
library(cowplot)        # Images on axis ticks

# Data Load-in------------------------------------------------------------------

# Fetch the 2023-12-19 #TidyTuesday release and keep the episodes table
tuesdata <- tidytuesdayR::tt_load("2023-12-19")
holep <- tuesdata[["holiday_episodes"]]

# Data Wrangling----------------------------------------------------------------
# Find Top 10 series of IMDb (by votes): one row per series with its run
# (start/end year), votes, runtime, rating and number of holiday episodes
tv10 <- holep |>
  group_by(parent_tconst, parent_primary_title) |>
  summarise(
    start = mean(parent_start_year, na.rm = TRUE),
    end = mean(parent_end_year, na.rm = TRUE),
    votes = mean(parent_num_votes, na.rm = TRUE),
    runtime = mean(parent_runtime_minutes, na.rm = TRUE),
    rating = mean(parent_average_rating, na.rm = TRUE),
    num_episodes = n()
  ) |>
  ungroup() |>
  # Length of the series' run in years
  mutate(years = end - start) |>
  # Drop series with any missing summary value before ranking
  drop_na() |>
  slice_max(order_by = votes, n = 10)

# The Actual Data to be plotted
df <- holep |>
  # See only the holiday episodes of top 10 series
  filter(
    parent_primary_title %in% (tv10 |> pull(parent_primary_title))
  ) |>
  # A factor whose levels follow the tv10 ranking, so that the series are
  # displayed ranking-wise in the plot
  mutate(
    parent_primary_title = factor(
      parent_primary_title,
      levels = (tv10 |> pull(parent_primary_title))
    )
  )

# Options & Visualization Parameters--------------------------------------------

# Load fonts
font_add_google("Ubuntu Condensed",
                family = "title_font")       # Font for titles
font_add_google("Saira Extra Condensed",
                family = "caption_font")     # Font for the caption
# NOTE(review): the Google font name for "body_font" is missing from this
# copy of the chapter; "Roboto Condensed" is a placeholder - confirm.
font_add_google("Roboto Condensed",
                family = "body_font")        # Font for plot text
# Turn on showtext rendering so the families registered above are used
# by the graphics devices
showtext_auto()

# Creating Images for 10 Series Titles
# Image to extract
# NOTE(review): the image location is empty in this copy of the chapter.
img <- ""

# Color Palette
# MetBrewer is not attached by the library() calls of this script, so the
# function is called through its namespace
mypal <- MetBrewer::met.brewer("Tam")

# Define colours
low_col <- mypal[2]                   # Heat map: low colour
hi_col <- mypal[5]                    # Heat map: high colour
bg_col <- "white"                     # Background Colour
text_col <- mypal[8]                  # Colour for the text
text_hil <- mypal[7]                  # Colour for highlighted text

# Define Text Size
ts <- 24                              # Text Size

# Caption stuff
# Register the icon font from the local "docs" folder
sysfonts::font_add(
  family = "Font Awesome 6 Brands",
  regular = here::here("docs", "Font Awesome 6 Brands-Regular-400.otf")
)

# Icon character references (the closing ";" is appended by the template)
github <- "&#xf09b"
xtwitter <- "&#xe61b"
linkedin <- "&#xf08c"

# Account names shown next to each icon
github_username <- "aditya-dahiya"
xtwitter_username <- "@adityadahiyaias"
linkedin_username <- "dr-aditya-dahiya-ias"

# glue() joins its unnamed pieces without a separator, so the pieces
# below form one string: icon span, then name span, for each account
social_caption <- glue::glue(
  "<span style='font-family:\"Font Awesome 6 Brands\";'>{github};</span> ",
  "<span style='color: {text_col}'>{github_username}  </span> ",
  "<span style='font-family:\"Font Awesome 6 Brands\";'>{xtwitter};</span> ",
  "<span style='color: {text_col}'>{xtwitter_username}</span> ",
  "<span style='font-family:\"Font Awesome 6 Brands\";'>{linkedin};</span> ",
  "<span style='color: {text_col}'>{linkedin_username}</span>"
)

# Add text to plot--------------------------------------------------------------
plot_title <- "Holiday Episodes of IMDb's top-10 TV Series"

subtitle_text <- "The Office had the most (6) holiday season episodes, while the highest rated episode is Dexter's Dex Takes a Holiday."
# Wrap the subtitle at 150 characters and rejoin the pieces with newlines
plot_subtitle <- subtitle_text |>
  strwrap(width = 150) |>
  paste(collapse = "\n")

# Data credit followed by the social-media caption
plot_caption <- paste0(
  "**Data:** IMDb Non-Commercial Datasets. | ",
  "**Graphics:** ",
  social_caption
)

# Images for Y-Axis ------------------------------------------------------------
# One image location per series; url1..url10 are passed to mk_logo() when the
# y-axis image strip is built.
# NOTE(review): all ten locations are empty in this copy of the chapter;
# fill them in before running.
url1 <- ""
url2 <- ""
url3 <- ""
url4 <- ""
url5 <- ""
url6 <- ""
url7 <- ""
url8 <- ""
url9 <- ""
url10 <- ""

# Build a circular logo from an image.
#
# url: location of the image, passed to magick::image_read().
#
# The image is read, resized to a height of 300 px, cropped to a circle with
# a thin black border, and read back in.
mk_logo <- function(url) {
  image_read(url) |>
    image_resize("x300") |>
    circle_crop(
      border_size = 1,
      border_colour = "black"
    ) |>
    # NOTE(review): the final step is missing from this copy of the chapter;
    # circle_crop() presumably returns a file path, so the cropped image is
    # read back with image_read() - confirm against the original.
    image_read()
}

# Data Visualization------------------------------------------------------------

# Main plot: holiday episodes of the top-10 series on a timeline, with
# dot colour for the episode rating and dot size for the number of votes
p <- df |>
  ggplot(
    aes(
      x = year,
      y = fct_rev(parent_primary_title)
    )
  ) +
  # Episode titles, nudged below their dots
  # NOTE(review): the geom name is missing from this copy of the chapter;
  # geom_text() is assumed (it may have been a ggrepel geom) - confirm.
  geom_text(
    aes(label = primary_title),
    family = "body_font",
    col = mypal[7],
    size = 3, nudge_y = -0.5
  ) +
  # One horizontal bar per series, spanning its start and end year
  geom_segment(
    data = tv10,
    aes(
      x = start,
      xend = end,
      y = parent_primary_title,
      yend = parent_primary_title
    ),
    alpha = 0.3,
    lineend = "round",
    linewidth = 4,
    col = mypal[7]
  ) +
  # One dot per holiday episode
  geom_point(
    aes(
      color = average_rating,
      size = num_votes
    ),
    alpha = 0.96
  ) +
  scale_color_gradient(
    low = low_col,
    high = hi_col
  ) +
  scale_size_continuous(
    range = c(4, 12),
    # label_number_si() is deprecated in scales; this is its replacement
    labels = scales::label_number(scale_cut = scales::cut_short_scale())
  ) +
  scale_x_continuous(
    limits = c(1993, 2023),
    breaks = seq(1995, 2020, 5),
    expand = c(0, 0)
  ) +
  theme_minimal() +
  theme(
    legend.position = "bottom"
  ) +
  labs(
    title = plot_title,
    caption = plot_caption,
    subtitle = NULL,
    x = NULL, y = NULL,
    color = "Episode Rating (IMDb)",
    size = "Number of Votes (IMDb)"
  ) +
  # Draw the size-legend keys in the highlight colour
  guides(
    size = guide_legend(override.aes = list(colour = text_hil)),
    alpha = "none"
  ) +
  theme(
    # element_textbox() (ggtext) renders the HTML spans of the caption
    plot.caption = element_textbox(
      family = "caption_font",
      hjust = 0.5,
      colour = text_col,
      size = ts / 2
    ),
    plot.title = element_text(
      hjust = 0.5,
      size = 2 * ts,
      family = "title_font",
      face = "bold",
      colour = text_col
    ),
    plot.subtitle = element_text(
      hjust = 0,
      size = ts / 2,
      family = "body_font",
      colour = text_col
    ),
    plot.background = element_rect(
      fill = bg_col,
      color = bg_col,
      linewidth = 0
    ),
    panel.grid.major.x = element_blank(),
    panel.grid.minor.x = element_blank(),
    panel.grid.major.y = element_line(linetype = 2),
    axis.text = element_text(
      hjust = 0.5,
      size = ts / 2,
      family = "body_font",
      colour = text_col
    ),
    legend.title = element_text(
      family = "body_font",
      colour = text_col,
      vjust = 0.5
    ),
    legend.key.height = unit(2, "mm"),
    legend.text = element_text(
      family = "body_font",
      colour = text_col
    ),
    plot.title.position = "plot",
    plot.caption.position = "plot"
  )
# Relative size of each logo within its slot on the axis
scale_fac <- 0.9

# Image strip for the y-axis: one circular logo per series.
# axis_canvas() needs the plot whose axis the strip is built for, so `p`
# is passed as its first argument.
pimage <- axis_canvas(p, axis = "y") +
  draw_image(mk_logo(url1), y = 0.5, scale = scale_fac) +
  draw_image(mk_logo(url2), y = 1.5, scale = scale_fac) +
  draw_image(mk_logo(url3), y = 2.5, scale = scale_fac) +
  draw_image(mk_logo(url4), y = 3.5, scale = scale_fac) +
  draw_image(mk_logo(url5), y = 4.5, scale = scale_fac) +
  draw_image(mk_logo(url6), y = 5.5, scale = scale_fac) +
  draw_image(mk_logo(url7), y = 6.5, scale = scale_fac) +
  draw_image(mk_logo(url8), y = 7.5, scale = scale_fac) +
  draw_image(mk_logo(url9), y = 8.5, scale = scale_fac) +
  draw_image(mk_logo(url10), y = 9.5, scale = scale_fac)

# insert the image strip into the plot, on the left of the y-axis
ggdraw(
  insert_yaxis_grob(
    p, pimage,
    position = "left",
    width = unit(15, "mm")
  )
)

8.4 Direct labelling

An example using the directlabels package (Hocking 2023) is shown in Figure 4.


# Titles of the TV series with more than 10 holiday season episodes,
# sorted from most to fewest episodes
names_series <- holep |>
  count(parent_primary_title, sort = TRUE) |>
  filter(n > 10) |>
  # Keep only the titles, as a character vector
  pull(parent_primary_title)

# Series to highlight, and how many of the leading series to plot
select_name <- "Holiday Baking Championship"
n <- 5

# Lowest-rated episode of the highlighted series: the annotation arrow
# starts at this point
lowest_rated <- holep |>
  filter(parent_primary_title == select_name) |>
  arrange(average_rating) |>
  slice_head(n = 1)

# Votes vs. rating for the leading series, labelled directly (no legend)
holep |>
  filter(parent_primary_title %in% head(names_series, n)) |>
  ggplot(
    aes(x = num_votes,
        y = average_rating,
        color = parent_primary_title)
  ) +
  # Background Highlighting of specific series
  geom_point(
    data = (holep |> filter(parent_primary_title == select_name)),
    size = 5,
    color = "lightgrey"
  ) +
  # Plotting all the points
  geom_point() +
  # Text Annotation Arrow
  annotate(
    geom = "curve",
    x = pull(lowest_rated, num_votes),
    y = pull(lowest_rated, average_rating),
    xend = 80, yend = 4,
    arrow = arrow(length = unit(2, "mm")),
    col = "darkgrey"
  ) +
  # Text Annotation
  annotate(
    geom = "label",
    x = 80, y = 4,
    hjust = 0,
    vjust = 0.5,
    label = paste0("TV Episodes of\n", select_name),
    fill = "grey",
    fontface = "italic",
    # NOTE(review): geom_label() documents `label.padding` and `label.size`
    # (dots, not underscores); the two spellings below are kept as written
    # and are presumably ignored by ggplot2 - confirm the intended look.
    label_padding = unit(15, "mm"),
    label_size = unit(0, "mm")
  ) +
  # Labels and Titles
  labs(
    x = "Number of Votes for the episode on IMDb",
    y = "Average Rating of the Episode",
    title = paste0("Ratings and Votes for episodes of ", select_name),
    subtitle = paste0("Comparison with other series in the top ", n, " TV Series by number of holiday episodes")
  ) +
  scale_x_continuous(trans = "log10") +
  scale_color_brewer(palette = "Set1") +
  cowplot::theme_half_open() +
  theme(
    axis.title = element_text(hjust = 1),
    legend.position = "none",
    axis.line = element_line(arrow = arrow(length = unit(3, "mm")))
  ) +
  # Using directlabels
  directlabels::geom_dl(
    aes(label = parent_primary_title),
    method = "smart.grid"
  )

Figure 4: Using directlabels and annotations to make reading the scatterplot easier, instead of a legend

Another example uses geom_mark_ellipse() from the ggforce package (Pedersen 2022) to focus on specific groups within a scatter-plot, as shown in Figure 5.

# Series to be enclosed in labelled ellipses
names_highlight <- c("Holiday Baking Championship",
                     "Thomas & Friends")

# Votes vs. rating for the leading series, with ellipses drawn around
# the highlighted ones
holep |>
  filter(parent_primary_title %in% head(names_series, n)) |>
  ggplot(
    aes(x = num_votes,
        y = average_rating,
        color = parent_primary_title)
  ) +
  # Background Highlighting of specific series
  geom_point(
    data = (holep |> filter(parent_primary_title %in% names_highlight)),
    size = 5,
    color = "lightgrey"
  ) +
  # Plotting all the points
  geom_point() +
  # Labels and Titles
  labs(
    x = "Number of Votes for the episode on IMDb",
    y = "Average Rating of the Episode",
    title = paste0("Ratings and Votes for TV Series with most holiday episodes"),
    subtitle = paste0("Highlighting the TV Series: ", paste0(names_highlight, collapse = ", "))
  ) +
  # One labelled ellipse per highlighted series
  ggforce::geom_mark_ellipse(
    data = (holep |> filter(parent_primary_title %in% names_highlight)),
    aes(label = parent_primary_title,
        group = parent_primary_title,
        fill = parent_primary_title),
    linetype = 2,
    alpha = 0.2,
    label.margin = margin(0, 0, 0, 0),
    con.linetype = 2,
    label.fill = "lightgrey"
  ) +
  scale_x_continuous(trans = "log10") +
  scale_color_brewer(palette = "Set1") +
  scale_fill_brewer(palette = "Set1") +
  cowplot::theme_half_open() +
  theme(
    axis.title = element_text(hjust = 1),
    legend.position = "none",
    axis.line = element_line(arrow = arrow(length = unit(3, "mm")))
  )

Figure 5: Using ellipses to highlight areas of specific groups in a scatterplot

8.5 Annotation across facets

Similarly, using gghighlight package (Yutani 2022), we can annotate different facets in one go, as shown in Figure 6.

# Votes vs. rating for the first four series of 'names_series', one panel
# per series, annotated with gghighlight
holep |>
  filter(parent_primary_title %in% head(names_series, 4)) |>
  ggplot(
    aes(x = num_votes,
        y = average_rating,
        color = parent_primary_title)
  ) +
  # Plotting all the points
  geom_point(size = 2) +
  # Faceting by TV Series
  facet_wrap(~ parent_primary_title) +
  # gghighlight to annotate
  gghighlight::gghighlight() +
  # Labels and Titles
  labs(
    x = "Number of Votes for the episode on IMDb",
    y = "Average Rating of the Episode",
    title = paste0("Ratings and Votes for 4 TV Series with most holiday episodes"),
    subtitle = "Highlighting the TV Series in each panel."
  ) +
  scale_x_continuous(trans = "log10") +
  scale_color_brewer(palette = "Set1") +
  scale_fill_brewer(palette = "Set1") +
  cowplot::theme_half_open() +
  theme(
    axis.title = element_text(hjust = 1),
    legend.position = "none",
    axis.line = element_line(arrow = arrow(length = unit(3, "mm"))),
    strip.background = element_rect(fill = "white")
  )

Figure 6: Annotating different facets by using gghighlight


Hocking, Toby Dylan. 2023. “Directlabels: Direct Labels for Multicolor Plots.”
Pedersen, Thomas Lin. 2022. “Ggforce: Accelerating ’Ggplot2’.”
Yutani, Hiroaki. 2022. “Gghighlight: Highlight Lines and Points in ’Ggplot2’.”