Chapter 24

Hierarchical data

Author

Aditya Dahiya

Published

October 5, 2023

library(tidyverse)
library(jsonlite)
library(repurrrsive)
library(gt)
library(gtExtras)

24.3.5 Exercises

Question 1

What happens when you use unnest_wider() with unnamed list-columns like df2? What argument is now necessary? What happens to missing values?

When we use unnest_wider() with unnamed list columns, it results in an error because unnest_wider() cannot un-nest columns with missing names of the children of the list-column. The error message tells us that the argument names_sep = "_" is now necessary, where "_" is any separator that we may wish to use. The result names the new columns as y_1 , y_2 etc. The missing values are represented with an NA .

# Create df2 with an unnamed list-column
df2 <- tribble(
  ~x, ~y,
  1, list(11, 12, 13),
  2, list(21),
  3, list(31, 32),
)

# Running unnest_wider()
# df2 |>
#   unnest_wider(y)
# Error in `unnest_wider()`:
# ℹ In column: `y`.
# ℹ In row: 1.
# Caused by error:
# ! Can't unnest elements with missing names.
# ℹ Supply `names_sep` to generate automatic names.
# Run `rlang::last_trace()` to see where the error occurred.

df2 |>
  unnest_wider(y, names_sep = "_")
# A tibble: 3 × 4
      x   y_1   y_2   y_3
  <dbl> <dbl> <dbl> <dbl>
1     1    11    12    13
2     2    21    NA    NA
3     3    31    32    NA

Question 2

What happens when you use unnest_longer() with named list-columns like df1? What additional information do you get in the output? How can you suppress that extra detail?

When we use unnest_longer() with named list-columns like df1 , we get a new row for each value of the list, and in addition, we get a new column named "<name of list column>_id" which tells us the name of the list child from which the value has been picked up.

We can suppress this extra detail by adding the argument indices_include = FALSE .

# Create a named list-column like df1
df1 <- tribble(
  ~x, ~y,
  1, list(a = 11, b = 12),
  2, list(a = 21, b = 22),
  3, list(a = 31, b = 32),
)

# Unnest Longer function on a named list-column
df1 |>
  unnest_longer(y)
# A tibble: 6 × 3
      x     y y_id 
  <dbl> <dbl> <chr>
1     1    11 a    
2     1    12 b    
3     2    21 a    
4     2    22 b    
5     3    31 a    
6     3    32 b    
# Additional argument to suppress the extra detail of _id column
df1 |>
  unnest_longer(y, indices_include = FALSE)
# A tibble: 6 × 2
      x     y
  <dbl> <dbl>
1     1    11
2     1    12
3     2    21
4     2    22
5     3    31
6     3    32

Question 3

From time-to-time you encounter data frames with multiple list-columns with aligned values. For example, in the following data frame, the values of y and z are aligned (i.e. y and z will always have the same length within a row, and the first value of y corresponds to the first value of z). What happens if you apply two unnest_longer() calls to this data frame? How can you preserve the relationship between x and y? (Hint: carefully read the docs).

df4 <- tribble(
  ~x, ~y, ~z,
  "a", list("y-a-1", "y-a-2"), list("z-a-1", "z-a-2"),
  "b", list("y-b-1", "y-b-2", "y-b-3"), list("z-b-1", "z-b-2", "z-b-3")
)

If we apply two consequtive unnest_longer() calls to this data-frame it results in a permutation-combination like situation, where each newly created row of y is treated a un-linked to each element of lists in z and thus the resulting data-frame produces a combination of all possible values of y and z .

df4 <- tribble(
  ~x, ~y, ~z,
  "a", list("y-a-1", "y-a-2"), list("z-a-1", "z-a-2"),
  "b", list("y-b-1", "y-b-2", "y-b-3"), list("z-b-1", "z-b-2", "z-b-3")
)

# Result if we apply two unnest_longer() calls to this data frame
df4 |>
  unnest_longer(y) |>
  unnest_longer(z)
# A tibble: 13 × 3
   x     y     z    
   <chr> <chr> <chr>
 1 a     y-a-1 z-a-1
 2 a     y-a-1 z-a-2
 3 a     y-a-2 z-a-1
 4 a     y-a-2 z-a-2
 5 b     y-b-1 z-b-1
 6 b     y-b-1 z-b-2
 7 b     y-b-1 z-b-3
 8 b     y-b-2 z-b-1
 9 b     y-b-2 z-b-2
10 b     y-b-2 z-b-3
11 b     y-b-3 z-b-1
12 b     y-b-3 z-b-2
13 b     y-b-3 z-b-3

We can preserve the relationship between x and y (and z ) by including multiple columns in a single call of unnest_longer() . The help documentation with unnest_longer() provides that

col : List-column(s) to unnest. When selecting multiple columns, values from the same row will be recycled to their common size.

df4 |>
  unnest_longer(c(y, z))
# A tibble: 5 × 3
  x     y     z    
  <chr> <chr> <chr>
1 a     y-a-1 z-a-1
2 a     y-a-2 z-a-2
3 b     y-b-1 z-b-1
4 b     y-b-2 z-b-2
5 b     y-b-3 z-b-3

24.4.4 Exercises

Question 1

Roughly estimate when gh_repos was created. Why can you only roughly estimate the date?

Using the code shown below, and viewing the results of the maximum (i.e. the latest time stamps) we find that the latest data observation in the data-set is of

2016-10-25 03:09:53

Thus, the data-set gh_repos was created on or after 25 October, 2016 3:09 AM UTC. We can only find the estimate, and not the exact time of its creation since the data-set does not explicitly contain the information of date and time of its creation. We can only infer it from the contents of the data.

# Convert the gh_repos into a tibble for easy viewing
ghrepos = tibble(json = gh_repos)
ghrepos |>
  
  # Rectangling the data
  unnest_longer(json) |>
  unnest_wider(json) |>
  
  # Selecting the time variables to find the latest date in the data
  select(created_at, updated_at, pushed_at) |>
  
  # Covert to date-time objects
  mutate(across(.cols = 1:3, .fns = ymd_hms)) |>
  
  # Find the maximum (i.e. the latest) time in the three columns
  summarize(
    max_created_at = max(created_at),
    max_updated_at = max(updated_at),
    max_pushed_at = max(pushed_at)
  ) |>
  
  # Nice display
  gt() |>
  cols_label_with(fn = ~ janitor::make_clean_names(., case = "title")) |>
  gt_theme_538()
Max Created at Max Updated at Max Pushed at
2016-10-24 08:52:01 2016-10-25 03:09:53 2016-10-24 19:29:48

Question 2

The owner column of gh_repo contains a lot of duplicated information because each owner can have many repos. Can you construct an owners data frame that contains one row for each owner? (Hint: does distinct() work with list-cols?)

The code below shows the two ways of creating an an owners data frame that contains one row for each owner. The Method 1 uses the unnest_wider() first, and then uses distinct() on the resulting data frame. The Method 2 first uses distinct() and then unnest_wider() . The results are the same.

Yes, we observe that surprisingly, the function distinct() does work with list-cols .

########### Method 1: The long way, after unnesting
ghrepos |>
  
  # Rectangling the data
  unnest_longer(json) |>
  unnest_wider(json) |>
  
  # Selecting the owner variable
  select(owner) |>
  
  # Unnesting the named list i.e., owner column, 
  # using names_sep to ensure id is not duplicated original id column
  # when joining the data frames later
  unnest_wider(owner, names_sep = "_") |>
  
  # Checking the nature of data, what are the contents of each column like
  # visdat::vis_dat()
  
  # Keeping only distinct columns, dropping duplicated rows
  distinct()
# A tibble: 6 × 17
  owner_login owner_id owner_avatar_url              owner_gravatar_id owner_url
  <chr>          <int> <chr>                         <chr>             <chr>    
1 gaborcsardi   660288 https://avatars.githubuserco… ""                https://…
2 jennybc       599454 https://avatars.githubuserco… ""                https://…
3 jtleek       1571674 https://avatars.githubuserco… ""                https://…
4 juliasilge  12505835 https://avatars.githubuserco… ""                https://…
5 leeper       3505428 https://avatars.githubuserco… ""                https://…
6 masalmon     8360597 https://avatars.githubuserco… ""                https://…
# ℹ 12 more variables: owner_html_url <chr>, owner_followers_url <chr>,
#   owner_following_url <chr>, owner_gists_url <chr>, owner_starred_url <chr>,
#   owner_subscriptions_url <chr>, owner_organizations_url <chr>,
#   owner_repos_url <chr>, owner_events_url <chr>,
#   owner_received_events_url <chr>, owner_type <chr>, owner_site_admin <lgl>
######### Method 2: The short way - using distinct() on the owner list-column
owners <- ghrepos |>
   # Rectangling the data
  unnest_longer(json) |>
  unnest_wider(json) |>
  distinct(owner) |>
  unnest_wider(owner, names_sep = "_")
owners
# A tibble: 6 × 17
  owner_login owner_id owner_avatar_url              owner_gravatar_id owner_url
  <chr>          <int> <chr>                         <chr>             <chr>    
1 gaborcsardi   660288 https://avatars.githubuserco… ""                https://…
2 jennybc       599454 https://avatars.githubuserco… ""                https://…
3 jtleek       1571674 https://avatars.githubuserco… ""                https://…
4 juliasilge  12505835 https://avatars.githubuserco… ""                https://…
5 leeper       3505428 https://avatars.githubuserco… ""                https://…
6 masalmon     8360597 https://avatars.githubuserco… ""                https://…
# ℹ 12 more variables: owner_html_url <chr>, owner_followers_url <chr>,
#   owner_following_url <chr>, owner_gists_url <chr>, owner_starred_url <chr>,
#   owner_subscriptions_url <chr>, owner_organizations_url <chr>,
#   owner_repos_url <chr>, owner_events_url <chr>,
#   owner_received_events_url <chr>, owner_type <chr>, owner_site_admin <lgl>

Question 3

Follow the steps used for titles to create similar tables for the aliases, allegiances, books, and TV series for the Game of Thrones characters.

# A tibble for aliases
aliases = tibble(json = got_chars) |>
  unnest_wider(json) |>
  select(id, aliases) |>
  unnest_longer(aliases) |>
  filter(aliases != "")
aliases
# A tibble: 107 × 2
      id aliases           
   <int> <chr>             
 1  1022 Prince of Fools   
 2  1022 Theon Turncloak   
 3  1022 Reek              
 4  1022 Theon Kinslayer   
 5  1052 The Imp           
 6  1052 Halfman           
 7  1052 The boyman        
 8  1052 Giant of Lannister
 9  1052 Lord Tywin's Doom 
10  1052 Lord Tywin's Bane 
# ℹ 97 more rows
# A tibble for allegiances
allegiances = tibble(json = got_chars) |>
  unnest_wider(json) |>
  select(id, allegiances) |>
  unnest_longer(allegiances) |>
  filter(allegiances != "")
allegiances
# A tibble: 33 × 2
      id allegiances                      
   <int> <chr>                            
 1  1022 House Greyjoy of Pyke            
 2  1052 House Lannister of Casterly Rock 
 3  1074 House Greyjoy of Pyke            
 4  1166 House Nymeros Martell of Sunspear
 5   130 House Nymeros Martell of Sunspear
 6  1303 House Targaryen of King's Landing
 7  1319 House Baratheon of Dragonstone   
 8  1319 House Seaworth of Cape Wrath     
 9   148 House Stark of Winterfell        
10   149 House Oakheart of Old Oak        
# ℹ 23 more rows
# A tibble for aliases
books = tibble(json = got_chars) |>
  unnest_wider(json) |>
  select(id, books) |>
  unnest_longer(books) |>
  filter(books != "")
books
# A tibble: 77 × 2
      id books                    
   <int> <chr>                    
 1  1022 A Game of Thrones        
 2  1022 A Storm of Swords        
 3  1022 A Feast for Crows        
 4  1052 A Feast for Crows        
 5  1052 The World of Ice and Fire
 6  1074 A Game of Thrones        
 7  1074 A Clash of Kings         
 8  1074 A Storm of Swords        
 9  1109 A Clash of Kings         
10  1166 A Game of Thrones        
# ℹ 67 more rows
# A tibble for aliases
tvSeries = tibble(json = got_chars) |>
  unnest_wider(json) |>
  select(id, tvSeries) |>
  unnest_longer(tvSeries) |>
  filter(tvSeries != "")
tvSeries
# A tibble: 93 × 2
      id tvSeries
   <int> <chr>   
 1  1022 Season 1
 2  1022 Season 2
 3  1022 Season 3
 4  1022 Season 4
 5  1022 Season 5
 6  1022 Season 6
 7  1052 Season 1
 8  1052 Season 2
 9  1052 Season 3
10  1052 Season 4
# ℹ 83 more rows

We could also try to create a function for this: –

# Create a function to do the same job for each variable in the nested data
got_unnest <- function(variable, nested_tibble){
  
  tibble(json = nested_tibble) |>
  unnest_wider(json) |>
  select(id, variable) |>
  unnest_longer(variable) |>
  filter(variable != "")
}

Question 4

Explain the following code line-by-line. Why is it interesting? Why does it work for got_chars but might not work in general?

The following annotations to the code explain it line by line. This is interesting because it provides us a way to directly create a tidy data, in long form, for plotting by ggplot2 etc. from the list.

# Convert lsit into tibble for easy viewing and rectangling operations
tibble(json = got_chars) |> 
  
  # Unnest the named column json into 18 different columns
  unnest_wider(json) |> 
  
  # Select to view only the id column, and nested columns which are list-columns
  select(id, where(is.list)) |> 
  
  # Convert into long format by making different row for each column
  pivot_longer(
    where(is.list), 
    names_to = "name", 
    values_to = "value"
  ) |>  
  
  # Now unnest the value column to display one row for each list item
  unnest_longer(value)
# A tibble: 479 × 3
      id name        value                                               
   <int> <chr>       <chr>                                               
 1  1022 titles      Prince of Winterfell                                
 2  1022 titles      Lord of the Iron Islands (by law of the green lands)
 3  1022 aliases     Prince of Fools                                     
 4  1022 aliases     Theon Turncloak                                     
 5  1022 aliases     Reek                                                
 6  1022 aliases     Theon Kinslayer                                     
 7  1022 allegiances House Greyjoy of Pyke                               
 8  1022 books       A Game of Thrones                                   
 9  1022 books       A Storm of Swords                                   
10  1022 books       A Feast for Crows                                   
# ℹ 469 more rows

This worked for got_chars because luckily, each list that we pivoted in pivot_longer() had exactly the same level of nesting. That is, none of the contents of the created column value has more than one level of nesting. This may not work in general because different data sets and lists may have different levels of nesting for each column. For example, one column may have 3 levels of nesting, while other may have 10 levels of nesting.

Question 5

In gmaps_cities, what does address_components contain? Why does the length vary between rows? Un-nest it appropriately to figure it out. (Hint: types always appears to contain two elements. Does unnest_wider() make it easier to work with than unnest_longer()?) .

The address_components contain the complete words and parts (city, county, state and country) that come for the complete address of the city. The contents are the following: –

  • long_name : The actual complete name of the component (city, or county or state or country)

  • short_name : The abbreviation for the name (for states and country)

  • types: The address_components further sub-component types contains two elements, one is the type of address, (i..e one of the Locality, Administrative Area Level 2, Administrative Area Level 1, Country) and second is same for all, i.e. type of address political .

The length varies between rows because some address have only two levels, for example:--

  • Washington, United States

But, others have three levels in the address such as: –

  • New York City, New York State, United States

And, lastly, others have four levels in the address such as: –

  • Houston, Harris County, Texas, United States

Yes, unnest_wider() make it easier to work with than unnest_longer() because we can create columns of the components for easy display. Tidy data in long form is good for ggplot2 visualization, but not easy to read. The output is shown below in Figure 1.

Code
gmaps_cities |>
  unnest_wider(json) |>
  
  # Remove status as it adds no info
  select(-status) |>
  
  # Unnamed lists, so unnest longer - make rows
  unnest_longer(results) |>
  
  # Named lists to unnest wider into columns
  unnest_wider(results) |>
  
  # Select an id variable and address_components
  select(formatted_address, address_components) |>
  
  # Since we know the address components have City, County, State and Country
  # Names, lets try to create a column for each by unnest_wider
  unnest_wider(address_components, names_sep = "_") |>
  
  # To create tidy data of address levels
  pivot_longer(cols = -formatted_address,
               names_to = "level",
               values_to = "address_components") |>
  mutate(level = parse_number(level)) |>
  
  # Further, making new columns from remaining list-columns
  unnest_wider(address_components) |>
  unnest_wider(types, names_sep = "_") |>
  
  # Remove types_2 ("political) as it does not add any information
  select(-types_2) |>
  
  # Tidying up final display
  rename(level_type = types_1) |>
  relocate(level_type, .before = long_name) |>
  drop_na() |>
  mutate(level_type = snakecase::to_any_case(level_type, "title")) |>
  
  # gt() to display the output nicely
  gt(rowname_col = NULL,
     groupname_col = "formatted_address") |>
  tab_options(row_group.as_column = TRUE) |>
  cols_label_with(fn = ~ janitor::make_clean_names(., case = "title")) |>
  gt_theme_538()
Level Level Type Long Name Short Name
Houston, TX, USA 1 Locality Houston Houston
2 Administrative Area Level 2 Harris County Harris County
3 Administrative Area Level 1 Texas TX
4 Country United States US
Washington, USA 1 Administrative Area Level 1 Washington WA
2 Country United States US
Washington, DC, USA 1 Locality Washington Washington
2 Administrative Area Level 2 District of Columbia District of Columbia
3 Administrative Area Level 1 District of Columbia DC
4 Country United States US
New York, NY, USA 1 Locality New York New York
2 Administrative Area Level 1 New York NY
3 Country United States US
Chicago, IL, USA 1 Locality Chicago Chicago
2 Administrative Area Level 2 Cook County Cook County
3 Administrative Area Level 1 Illinois IL
4 Country United States US
Arlington, TX, USA 1 Locality Arlington Arlington
2 Administrative Area Level 2 Tarrant County Tarrant County
3 Administrative Area Level 1 Texas TX
4 Country United States US
Arlington, VA, USA 1 Locality Arlington Arlington
2 Administrative Area Level 2 Arlington County Arlington County
3 Administrative Area Level 1 Virginia VA
4 Country United States US
Figure 1: The components of addresses in gmaps_cities

24.5.4 Exercises

Question 1

Rectangle the df_col and df_row below. They represent the two ways of encoding a data frame in JSON.

json_col <- parse_json('
  {
    "x": ["a", "x", "z"],
    "y": [10, null, 3]
  }
')


json_row <- parse_json('
  [
    {"x": "a", "y": 10},
    {"x": "x", "y": null},
    {"x": "z", "y": 3}
  ]
')

df_col <- tibble(json = list(json_col)) 

df_row <- tibble(json = json_row)

# Rectangling df_col
df_col |>
  unnest_wider(json) |>
  unnest_longer(c(x, y))
# A tibble: 3 × 2
  x         y
  <chr> <int>
1 a        10
2 x        NA
3 z         3
# Reactangling df_row
df_row |>
  unnest_wider(json)
# A tibble: 3 × 2
  x         y
  <chr> <int>
1 a        10
2 x        NA
3 z         3