# Loading Required Packages
library(tidyverse)
library(arrow)
library(dbplyr, warn.conflicts = FALSE)
library(duckdb)
# Chapter 23: Arrow
# 23.3 Opening a dataset
# Working with the Seattle library 9 GB data!
# Download the Seattle library checkouts CSV (~9 GB) into data/.
# resume = TRUE lets an interrupted download pick up where it left off,
# so rerunning the script does not restart from scratch.
dir.create("data", showWarnings = FALSE)
curl::multi_download(
  urls = "https://r4ds.s3.us-west-2.amazonaws.com/seattle-library-checkouts.csv",
  destfiles = "data/seattle-library-checkouts.csv",
  resume = TRUE
)
# Open the CSV as an Arrow dataset. This scans the file lazily rather
# than reading all 9 GB into memory.
seattle_csv <- open_dataset(
  sources = "data/seattle-library-checkouts.csv",
  format = "csv"
)

# Peek at the columns and a few values without loading the full file
seattle_csv |>
  glimpse()
# 23.4 The Parquet Format
# Convert the 9 GB CSV file into smaller parquet files,
# partitioned by checkout year (one directory per CheckoutYear).
pq_path <- "data/seattle-library-checkouts"

seattle_csv |>
  group_by(CheckoutYear) |>
  write_dataset(path = pq_path, format = "parquet")

# Viewing the results: the files written and their sizes in MB
tibble(
  files = list.files(pq_path, recursive = TRUE),
  size_MB = file.size(file.path(pq_path, files)) / 1024^2
)
# 23.5 Using dplyr with arrow
# Read in the partitioned parquet files as an Arrow dataset
seattle_pq <- open_dataset(pq_path)

# Build a lazy query: total number of books checked out in
# each month for the last five years. Nothing is computed yet —
# arrow records the dplyr pipeline as a query plan.
query <- seattle_pq |>
  filter(CheckoutYear >= 2018, MaterialType == "BOOK") |>
  group_by(CheckoutYear, CheckoutMonth) |>
  summarize(TotalCheckouts = sum(Checkouts)) |>
  arrange(CheckoutYear, CheckoutMonth)

# Printing the query shows the plan, not the data
query

# Get the results by calling collect(), which executes the query
query |>
  collect()
# Comparing performance: run the same aggregation against the raw
# CSV dataset and against the parquet dataset, timing each with
# system.time() (the pipeline inside is evaluated before timing is
# reported, so the elapsed time covers the whole query).
seattle_csv |>
  filter(CheckoutYear == 2021, MaterialType == "BOOK") |>
  group_by(CheckoutMonth) |>
  summarize(TotalCheckouts = sum(Checkouts)) |>
  arrange(desc(CheckoutMonth)) |>
  collect() |>
  system.time()

# Same query against the partitioned parquet files
seattle_pq |>
  filter(CheckoutYear == 2021, MaterialType == "BOOK") |>
  group_by(CheckoutMonth) |>
  summarize(TotalCheckouts = sum(Checkouts)) |>
  arrange(desc(CheckoutMonth)) |>
  collect() |>
  system.time()
# Important: whenever we want to output the results of our dplyr
# code, we use collect().