Chapter 5 (Lab): Resampling Methods

Author: Aditya Dahiya
Published: February 13, 2021

5.3 Lab: Cross-Validation and the Bootstrap

5.3.1 The Validation Set Approach

We try out the validation set approach on the Auto data set, predicting mpg from horsepower.

library(ISLR)
set.seed(1)
data(Auto)
attach(Auto)
library(tidyverse)
library(kableExtra)

# Draw a random sample of half the 392 observations to use as the training set
train <- sample(1:392, size = 196, replace = FALSE)

# Fit a linear regression on the training set and compute the validation-set MSE
fit.lm <- lm(mpg ~ horsepower, data = Auto, subset = train)
pred.lm <- predict(fit.lm, newdata = Auto[-train, ])
MSE.lin <- mean((pred.lm - Auto$mpg[-train])^2)
MSE.lin
[1] 23.26601
# Fit polynomial regressions of degree 2 and 3
fit.lm2 <- lm(mpg ~ poly(horsepower, 2), data = Auto, subset = train)
pred.lm2 <- predict(fit.lm2, newdata = Auto[-train, ])
MSE.lin2 <- mean((pred.lm2 - Auto$mpg[-train])^2)
MSE.lin2
[1] 18.71646
fit.lm3 <- lm(mpg ~ poly(horsepower, 3), data = Auto, subset = train)
pred.lm3 <- predict(fit.lm3, newdata = Auto[-train, ])
MSE.lin3 <- mean((pred.lm3 - Auto$mpg[-train])^2)
MSE.lin3
[1] 18.79401
# Repeat with 10 different validation sets and plot the MSE for polynomials up to degree 10
Result <- data.frame(matrix(NA, nrow = 10, ncol = 10,
                            dimnames = list(NULL, paste0("Poly", 1:10))),
                     Repetition = 1:10)
for (p in 1:10) {
  for (i in 1:10) {
    set.seed(i)
    train <- sample(1:392, size = 196, replace = FALSE)
    fit.lm <- lm(mpg ~ poly(horsepower, p), data = Auto, subset = train)
    pred.lm <- predict(fit.lm, newdata = Auto[-train, ])
    Result[i, p] <- mean((pred.lm - Auto$mpg[-train])^2)
  }
}
kable(Result, digits = 1) |> kable_classic_2()
Poly1 Poly2 Poly3 Poly4 Poly5 Poly6 Poly7 Poly8 Poly9 Poly10 Repetition
23.3 18.7 18.8 19.2 19.4 19.6 19.0 19.1 19.1 22.9 1
25.7 20.4 20.4 20.3 19.8 19.6 19.5 19.9 20.8 22.1 2
22.0 17.7 17.7 17.6 17.1 17.1 17.2 17.2 17.2 18.2 3
24.3 19.0 19.0 19.3 19.1 19.0 18.9 19.3 19.5 19.5 4
20.9 16.8 17.0 17.3 16.8 16.6 16.6 16.5 16.6 18.2 5
23.6 17.7 17.7 18.0 17.6 17.3 17.1 17.2 17.2 17.5 6
22.3 19.7 20.0 19.9 19.2 18.9 18.7 19.7 22.4 26.5 7
24.9 18.9 18.9 19.2 19.2 20.4 21.7 19.8 25.0 22.4 8
27.5 22.1 22.5 22.4 21.6 21.3 21.1 21.2 21.2 21.2 9
26.4 19.9 20.3 20.2 19.3 19.0 18.7 18.7 18.7 21.1 10

These results show that the quadratic fit is much better than the linear fit, but higher-order fits offer no clear further improvement: the validation MSE stays roughly the same beyond degree 2.
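
To put a number on this, we can average the validation MSE over the ten repetitions for each degree, reusing the Result data frame built above (a quick check of my own, not part of the original lab output):

# Mean validation MSE across the 10 repetitions, one value per polynomial degree
round(colMeans(Result[, paste0("Poly", 1:10)]), 1)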

# Attempt to reproduce the graph in Figure 5.2 (right-hand panel)
ResultLong <- gather(Result, key = "Poly", value = "MSE", -Repetition)
ResultLong <- ResultLong %>%
  mutate(Poly = parse_number(Poly)) %>%
  mutate(Repetition = paste0("Rep", Repetition))
# Note: theme_bw() must come before theme(), otherwise it resets the legend option
ggplot(data = ResultLong) +
  geom_line(aes(x = Poly, y = MSE, color = Repetition), lwd = 1) +
  theme_bw() +
  theme(legend.position = "none")
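
As an aside, the same long-format table could be built directly, skipping the wide Result data frame and the gather() step. This is just a sketch under my own naming (val_mse is not from the lab); because it reuses the same seeds, the MSE values should match those from the loop above.

# Validation-set MSE for a polynomial of degree p, using a given seed
val_mse <- function(p, seed) {
  set.seed(seed)
  train <- sample(1:392, size = 196, replace = FALSE)
  fit <- lm(mpg ~ poly(horsepower, p), data = Auto, subset = train)
  mean((predict(fit, Auto[-train, ]) - Auto$mpg[-train])^2)
}
ResultLong2 <- expand.grid(Repetition = 1:10, Poly = 1:10) %>%
  mutate(MSE = mapply(val_mse, Poly, Repetition))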

5.3.2 Leave-One-Out Cross Validation (LOOCV)

We perform LOOCV using the cv.glm() function from the boot library.

library(boot)
fit.glm <- glm(mpg ~ horsepower, data = Auto)
cv.err <- cv.glm(data = Auto, glmfit = fit.glm)
names(cv.err)
[1] "call"  "K"     "delta" "seed" 
cv.err$delta
[1] 24.23151 24.23114
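
The delta component holds two numbers: the raw LOOCV estimate of the test MSE and an adjusted (bias-corrected) version. For a least-squares fit, the raw estimate can also be computed without refitting the model n times, using the leverage-based shortcut of equation (5.2) in ISLR; a minimal sketch of my own, which should reproduce delta[1] above:

# LOOCV via the shortcut CV = mean(((y - yhat) / (1 - h))^2), where h are the leverage values
fit <- lm(mpg ~ horsepower, data = Auto)
h <- hatvalues(fit)
mean(((Auto$mpg - fitted(fit)) / (1 - h))^2)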

Now, we write a for loop to compute the LOOCV error for polynomial fits of degree 1 to 10.

set.seed(17)
cv.error <- rep(NA, 10)
for (i in 1:10) {
  fit.glm <- glm(mpg ~ poly(horsepower, i), data = Auto)
  cv.error[i] <- cv.glm(data = Auto, glmfit = fit.glm)$delta[1]
}
round(cv.error, 2)
 [1] 24.23 19.25 19.33 19.42 19.03 18.98 18.83 18.96 19.07 19.49
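
From these values, the big drop comes between degrees 1 and 2, after which the error changes little; a quick way of my own to pull out the minimizing degree (here degree 7, by a small margin):

# Degree with the smallest LOOCV error, and its value
which.min(cv.error)
round(min(cv.error), 2)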

5.3.3 k-Fold Cross Validation

We now use the cv.glm() function to perform k-fold CV with k = 10, passing K = 10 explicitly (without it, cv.glm() defaults to LOOCV).

set.seed(17)
cv.error.10 <- rep(NA, 10)
for (i in 1:10) {
  fit.glm <- glm(mpg ~ poly(horsepower, i), data = Auto)
  # K = 10 requests 10-fold CV; leaving K out would repeat the LOOCV above
  cv.error.10[i] <- cv.glm(data = Auto, glmfit = fit.glm, K = 10)$delta[1]
}
round(cv.error.10, 2)
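
The practical advantage of 10-fold CV over LOOCV is speed: cv.glm() refits the model K times instead of n = 392 times. A rough way to see this (an illustration of my own; timings depend on the machine):

# Compare running times of LOOCV and 10-fold CV for a quadratic fit
fit.glm <- glm(mpg ~ poly(horsepower, 2), data = Auto)
system.time(cv.glm(data = Auto, glmfit = fit.glm))          # LOOCV: 392 refits
system.time(cv.glm(data = Auto, glmfit = fit.glm, K = 10))  # 10-fold: 10 refits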

5.3.4 The Bootstrap

library(ISLR)
data("Portfolio")
summary(Portfolio)
       X                  Y           
 Min.   :-2.43276   Min.   :-2.72528  
 1st Qu.:-0.88847   1st Qu.:-0.88572  
 Median :-0.26889   Median :-0.22871  
 Mean   :-0.07713   Mean   :-0.09694  
 3rd Qu.: 0.55809   3rd Qu.: 0.80671  
 Max.   : 2.46034   Max.   : 2.56599  
# Create a function that takes the data and an index vector as inputs, so that
# it can later be used with the boot() function. alpha.fn() returns the value
# of alpha that minimizes the variance of a portfolio split between X and Y:
# alpha = (var(Y) - cov(X, Y)) / (var(X) + var(Y) - 2 * cov(X, Y))
alpha.fn <- function(data, index){
  X <- data$X[index]
  Y <- data$Y[index]
  return((var(Y) - cov(X,Y)) / (var(X) + var(Y) - 2*cov(X,Y) ) )
}

# Using alpha.fn to find alpha using the full data set
alpha.fn(Portfolio, 1:100)
[1] 0.5758321
# Using alpha.fn to find alpha from a random sample of 100 observations drawn with replacement
set.seed(2)
alpha.fn(Portfolio, sample(100, 100, replace = TRUE))
[1] 0.5539577
# We could repeat this many times with different seeds, e.g. as sketched below.
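# (Illustrative sketch of my own, not part of the original lab:) do the
# repetition by hand with replicate(); the standard deviation of the replicated
# alphas is a bootstrap estimate of SE(alpha) and should come out close to the
# std. error reported by boot() below.
set.seed(3)
alpha.reps <- replicate(1000, alpha.fn(Portfolio, sample(100, 100, replace = TRUE)))
sd(alpha.reps)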

# Or, we can use the boot() function to automate this
library(boot)
boot(Portfolio, alpha.fn, R = 1000)

ORDINARY NONPARAMETRIC BOOTSTRAP


Call:
boot(data = Portfolio, statistic = alpha.fn, R = 1000)


Bootstrap Statistics :
     original       bias    std. error
t1* 0.5758321 0.0009480333  0.09303824
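
Since boot() returns the individual replicates in the $t component, we can also go beyond the standard error. As a sketch of my own (not in the original lab), a rough 95% percentile interval for alpha can be obtained with boot.ci() from the same package:

# Save the boot object and compute a percentile confidence interval for alpha
set.seed(4)
boot.alpha <- boot(Portfolio, alpha.fn, R = 1000)
boot.ci(boot.alpha, type = "perc")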

Estimating the accuracy of a linear regression model

# Compare the coefficient standard errors from a linear regression of mpg onto
# horsepower obtained by the standard least squares formulas and by the bootstrap.

# Create a function for use with boot(): it takes the data and an index vector
# as input and returns the coefficients of interest.
boot.fn <- function(data, index) {
  return(coef(lm(mpg ~ horsepower, data = data, subset = index)))
}

# Testing the boot.fn on the entire Auto data set
boot.fn(Auto, 1:392)
(Intercept)  horsepower 
 39.9358610  -0.1578447 
# Getting coefficient estimates using a random sample of 392 with replacement
boot.fn(Auto, sample(392, 392, replace = TRUE))
(Intercept)  horsepower 
 39.5910901  -0.1524194 
boot.fn(Auto, sample(392, 392, replace = TRUE))
(Intercept)  horsepower 
 41.1261106  -0.1690852 
# We can repeat this command a thousand times, or

# Use boot() to automate this process
boot(data = Auto, statistic = boot.fn, R = 1000)

ORDINARY NONPARAMETRIC BOOTSTRAP


Call:
boot(data = Auto, statistic = boot.fn, R = 1000)


Bootstrap Statistics :
      original        bias    std. error
t1* 39.9358610  0.0907930895  0.82249078
t2* -0.1578447 -0.0007940743  0.00708581
# Compare these with standard errors from least squares estimates
summary(lm(mpg ~ horsepower, data = Auto))$coef
              Estimate  Std. Error   t value      Pr(>|t|)
(Intercept) 39.9358610 0.717498656  55.65984 1.220362e-187
horsepower  -0.1578447 0.006445501 -24.48914  7.031989e-81
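
To line up the two sets of standard errors directly, we can save the boot object and take column-wise standard deviations of its replicates (boot() stores them in the $t component). A small sketch of my own, not part of the original lab:

# Bootstrap SEs (sd of the 1,000 replicated coefficients) next to the formula-based SEs
set.seed(1)
lin.boot <- boot(data = Auto, statistic = boot.fn, R = 1000)
data.frame(
  boot_se = apply(lin.boot$t, 2, sd),
  lm_se   = summary(lm(mpg ~ horsepower, data = Auto))$coef[, "Std. Error"],
  row.names = names(lin.boot$t0)
)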

Now, we use the bootstrap to estimate the standard errors of the coefficients from a quadratic regression of mpg on horsepower, and compare them with the usual least squares estimates. Since the quadratic model fits the data better than the linear one, the two sets of standard errors should agree more closely here.

# Create a boot.fn() for the quadratic model, to be used in boot() later
boot.fn <- function(data, index){
  return(coefficients(lm(mpg ~ horsepower + I(horsepower^2), data = data, subset = index)))
}
boot.fn(Auto, 1:392)
    (Intercept)      horsepower I(horsepower^2) 
   56.900099702    -0.466189630     0.001230536 
set.seed(1)

# Using boot() to estimate standard errors of coefficients
boot(data = Auto, statistic = boot.fn, R = 1000)

ORDINARY NONPARAMETRIC BOOTSTRAP


Call:
boot(data = Auto, statistic = boot.fn, R = 1000)


Bootstrap Statistics :
        original        bias     std. error
t1* 56.900099702  3.511640e-02 2.0300222526
t2* -0.466189630 -7.080834e-04 0.0324241984
t3*  0.001230536  2.840324e-06 0.0001172164
# Compare with standard errors of least square estimates
summary(lm(mpg ~ horsepower + I(horsepower^2), data = Auto))$coef
                    Estimate   Std. Error   t value      Pr(>|t|)
(Intercept)     56.900099702 1.8004268063  31.60367 1.740911e-109
horsepower      -0.466189630 0.0311246171 -14.97816  2.289429e-40
I(horsepower^2)  0.001230536 0.0001220759  10.08009  2.196340e-21
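
As a quick check of the closer agreement noted above, we can compute the ratio of the bootstrap to the formula-based standard errors; reusing the seed set before the boot() call above, all three ratios come out close to 1. A sketch of my own:

# Ratio of bootstrap SEs to least-squares SEs for the quadratic fit
set.seed(1)
quad.boot <- boot(data = Auto, statistic = boot.fn, R = 1000)
round(apply(quad.boot$t, 2, sd) /
        summary(lm(mpg ~ horsepower + I(horsepower^2), data = Auto))$coef[, "Std. Error"], 2)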