Chapter 6 (Lab)

Linear Model Selection and Regularization

Author

Aditya Dahiya

Published

February 20, 2021

Lab 1: Subset Selection Methods

6.5.1 Best Subset Selection

Load the ISLR and leaps libraries to perform Best Subset Selection on the Hitters data (kableExtra is loaded for formatting tables).

library(ISLR)
library(leaps)
library(kableExtra)
data(Hitters)

# Examine the data set
dim(Hitters)
[1] 322  20
names(Hitters)
 [1] "AtBat"     "Hits"      "HmRun"     "Runs"      "RBI"       "Walks"    
 [7] "Years"     "CAtBat"    "CHits"     "CHmRun"    "CRuns"     "CRBI"     
[13] "CWalks"    "League"    "Division"  "PutOuts"   "Assists"   "Errors"   
[19] "Salary"    "NewLeague"
# Find and remove missing data
sum(is.na(Hitters))
[1] 59
Hitters <- na.omit(Hitters)
dim(Hitters)
[1] 263  20
# Create an object with Best Subset Selection performed on the Hitters data
# (by default, regsubsets() considers models of up to 8 variables)
regfit.trial <- regsubsets(Salary ~ ., data = Hitters)
# Examine the contents of the regsubsets output, and of its summary
names(regfit.trial)
 [1] "np"        "nrbar"     "d"         "rbar"      "thetab"    "first"    
 [7] "last"      "vorder"    "tol"       "rss"       "bound"     "nvmax"    
[13] "ress"      "ir"        "nbest"     "lopt"      "il"        "ier"      
[19] "xnames"    "method"    "force.in"  "force.out" "sserr"     "intercept"
[25] "lindep"    "nullrss"   "nn"        "call"     
names(summary(regfit.trial))
[1] "which"  "rsq"    "rss"    "adjr2"  "cp"     "bic"    "outmat" "obj"   
# Now run Best Subset Selection allowing models with up to all 19 variables
regfit.full <- regsubsets(Salary ~ ., data = Hitters, nvmax = 19)
summary(regfit.full)
Subset selection object
Call: regsubsets.formula(Salary ~ ., data = Hitters, nvmax = 19)
19 Variables  (and intercept)
           Forced in Forced out
AtBat          FALSE      FALSE
Hits           FALSE      FALSE
HmRun          FALSE      FALSE
Runs           FALSE      FALSE
RBI            FALSE      FALSE
Walks          FALSE      FALSE
Years          FALSE      FALSE
CAtBat         FALSE      FALSE
CHits          FALSE      FALSE
CHmRun         FALSE      FALSE
CRuns          FALSE      FALSE
CRBI           FALSE      FALSE
CWalks         FALSE      FALSE
LeagueN        FALSE      FALSE
DivisionW      FALSE      FALSE
PutOuts        FALSE      FALSE
Assists        FALSE      FALSE
Errors         FALSE      FALSE
NewLeagueN     FALSE      FALSE
1 subsets of each size up to 19
Selection Algorithm: exhaustive
          AtBat Hits HmRun Runs RBI Walks Years CAtBat CHits CHmRun CRuns CRBI
1  ( 1 )  " "   " "  " "   " "  " " " "   " "   " "    " "   " "    " "   "*" 
2  ( 1 )  " "   "*"  " "   " "  " " " "   " "   " "    " "   " "    " "   "*" 
3  ( 1 )  " "   "*"  " "   " "  " " " "   " "   " "    " "   " "    " "   "*" 
4  ( 1 )  " "   "*"  " "   " "  " " " "   " "   " "    " "   " "    " "   "*" 
5  ( 1 )  "*"   "*"  " "   " "  " " " "   " "   " "    " "   " "    " "   "*" 
6  ( 1 )  "*"   "*"  " "   " "  " " "*"   " "   " "    " "   " "    " "   "*" 
7  ( 1 )  " "   "*"  " "   " "  " " "*"   " "   "*"    "*"   "*"    " "   " " 
8  ( 1 )  "*"   "*"  " "   " "  " " "*"   " "   " "    " "   "*"    "*"   " " 
9  ( 1 )  "*"   "*"  " "   " "  " " "*"   " "   "*"    " "   " "    "*"   "*" 
10  ( 1 ) "*"   "*"  " "   " "  " " "*"   " "   "*"    " "   " "    "*"   "*" 
11  ( 1 ) "*"   "*"  " "   " "  " " "*"   " "   "*"    " "   " "    "*"   "*" 
12  ( 1 ) "*"   "*"  " "   "*"  " " "*"   " "   "*"    " "   " "    "*"   "*" 
13  ( 1 ) "*"   "*"  " "   "*"  " " "*"   " "   "*"    " "   " "    "*"   "*" 
14  ( 1 ) "*"   "*"  "*"   "*"  " " "*"   " "   "*"    " "   " "    "*"   "*" 
15  ( 1 ) "*"   "*"  "*"   "*"  " " "*"   " "   "*"    "*"   " "    "*"   "*" 
16  ( 1 ) "*"   "*"  "*"   "*"  "*" "*"   " "   "*"    "*"   " "    "*"   "*" 
17  ( 1 ) "*"   "*"  "*"   "*"  "*" "*"   " "   "*"    "*"   " "    "*"   "*" 
18  ( 1 ) "*"   "*"  "*"   "*"  "*" "*"   "*"   "*"    "*"   " "    "*"   "*" 
19  ( 1 ) "*"   "*"  "*"   "*"  "*" "*"   "*"   "*"    "*"   "*"    "*"   "*" 
          CWalks LeagueN DivisionW PutOuts Assists Errors NewLeagueN
1  ( 1 )  " "    " "     " "       " "     " "     " "    " "       
2  ( 1 )  " "    " "     " "       " "     " "     " "    " "       
3  ( 1 )  " "    " "     " "       "*"     " "     " "    " "       
4  ( 1 )  " "    " "     "*"       "*"     " "     " "    " "       
5  ( 1 )  " "    " "     "*"       "*"     " "     " "    " "       
6  ( 1 )  " "    " "     "*"       "*"     " "     " "    " "       
7  ( 1 )  " "    " "     "*"       "*"     " "     " "    " "       
8  ( 1 )  "*"    " "     "*"       "*"     " "     " "    " "       
9  ( 1 )  "*"    " "     "*"       "*"     " "     " "    " "       
10  ( 1 ) "*"    " "     "*"       "*"     "*"     " "    " "       
11  ( 1 ) "*"    "*"     "*"       "*"     "*"     " "    " "       
12  ( 1 ) "*"    "*"     "*"       "*"     "*"     " "    " "       
13  ( 1 ) "*"    "*"     "*"       "*"     "*"     "*"    " "       
14  ( 1 ) "*"    "*"     "*"       "*"     "*"     "*"    " "       
15  ( 1 ) "*"    "*"     "*"       "*"     "*"     "*"    " "       
16  ( 1 ) "*"    "*"     "*"       "*"     "*"     "*"    " "       
17  ( 1 ) "*"    "*"     "*"       "*"     "*"     "*"    "*"       
18  ( 1 ) "*"    "*"     "*"       "*"     "*"     "*"    "*"       
19  ( 1 ) "*"    "*"     "*"       "*"     "*"     "*"    "*"       
# Display R-squared, Adjusted R-squared, Mallows's Cp and BIC for the best model of each size
names(summary(regfit.full))
[1] "which"  "rsq"    "rss"    "adjr2"  "cp"     "bic"    "outmat" "obj"   
summary(regfit.full)$rsq
 [1] 0.3214501 0.4252237 0.4514294 0.4754067 0.4908036 0.5087146 0.5141227
 [8] 0.5285569 0.5346124 0.5404950 0.5426153 0.5436302 0.5444570 0.5452164
[15] 0.5454692 0.5457656 0.5459518 0.5460945 0.5461159
summary(regfit.full)$adjr2
 [1] 0.3188503 0.4208024 0.4450753 0.4672734 0.4808971 0.4972001 0.5007849
 [8] 0.5137083 0.5180572 0.5222606 0.5225706 0.5217245 0.5206736 0.5195431
[15] 0.5178661 0.5162219 0.5144464 0.5126097 0.5106270
summary(regfit.full)$cp
 [1] 104.281319  50.723090  38.693127  27.856220  21.613011  14.023870
 [7]  13.128474   7.400719   6.158685   5.009317   5.874113   7.330766
[13]   8.888112  10.481576  12.346193  14.187546  16.087831  18.011425
[19]  20.000000
summary(regfit.full)$bic
 [1]  -90.84637 -128.92622 -135.62693 -141.80892 -144.07143 -147.91690
 [7] -145.25594 -147.61525 -145.44316 -143.21651 -138.86077 -133.87283
[13] -128.77759 -123.64420 -118.21832 -112.81768 -107.35339 -101.86391
[19]  -96.30412
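
Before plotting, a quick numeric check shows the model size favored by each criterion; from the vectors printed above, adjusted R-squared is maximized at 11 variables, while Cp and BIC are minimized at 10 and 6 variables respectively.

# Model size favored by each criterion (RSS always favors the largest model)
which.max(summary(regfit.full)$adjr2)
which.min(summary(regfit.full)$cp)
which.min(summary(regfit.full)$bic)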

Now, we plot the model-selection statistics computed by regsubsets() and stored in the summary of regfit.full, marking the best model under each criterion with a red point.

par(mfrow = c(2, 2))
# Plotting Residual Sum of Squares
plot(summary(regfit.full)$rss,
  xlab = "Number of Variables",
  ylab = "Residual Sum of Squares", type = "l"
)
points(
  x = which.min(summary(regfit.full)$rss),
  y = min(summary(regfit.full)$rss),
  col = "red", cex = 2, pch = 20
)

# Plotting Adjusted R-Squared
plot(summary(regfit.full)$adjr2,
  xlab = "Number of variables",
  ylab = "Adjusted R-Squared", type = "l"
)
points(
  x = which.max(summary(regfit.full)$adjr2),
  y = max(summary(regfit.full)$adjr2),
  col = "red", cex = 2, pch = 20
)

# Plotting Mallows's Cp
plot(summary(regfit.full)$cp,
  xlab = "Number of variables",
  ylab = "Mallows's Cp", type = "l"
)
points(
  x = which.min(summary(regfit.full)$cp),
  y = min(summary(regfit.full)$cp),
  col = "red", cex = 2, pch = 20
)

# Plotting the Bayesian Information Criterion (BIC)
plot(summary(regfit.full)$bic,
  xlab = "Number of variables",
  ylab = "BIC", type = "l"
)
points(
  x = which.min(summary(regfit.full)$bic),
  y = min(summary(regfit.full)$bic),
  col = "red", cex = 2, pch = 20
)

Now, we use the built-in plot() method for regsubsets objects to examine the selected models under each criterion.

plot(regfit.full, scale = "r2")

plot(regfit.full, scale = "adjr2")

plot(regfit.full, scale = "Cp")

plot(regfit.full, scale = "bic")

The coef() function extracts the coefficient estimates for the best model of any given size; here we examine the six-variable model, which BIC favors.

coef(regfit.full, 6)
 (Intercept)        AtBat         Hits        Walks         CRBI    DivisionW 
  91.5117981   -1.8685892    7.6043976    3.6976468    0.6430169 -122.9515338 
     PutOuts 
   0.2643076 
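
As a quick check (a sketch, separate from the regsubsets() workflow; lm.six is an illustrative name), the six-variable model can be re-fit with lm() to obtain standard errors for these coefficients.

# Re-fit the BIC-selected six-variable model with lm() for standard errors
lm.six <- lm(Salary ~ AtBat + Hits + Walks + CRBI + Division + PutOuts,
  data = Hitters
)
summary(lm.six)$coefficients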

6.5.2 Forward and Backward Stepwise Selection

We can use the regsubsets() function to perform forward and backward stepwise selection as well. We use the same Hitters data set and compare the seven-variable models selected by the three approaches: (1) Best Subset Selection, (2) Forward Stepwise Selection, and (3) Backward Stepwise Selection.

# Forward stepwise selection; list the variables in the 7-variable model
regfit.fwd <- regsubsets(Salary ~ ., data = Hitters, method = "forward")
names(coef(regfit.fwd, 7))
[1] "(Intercept)" "AtBat"       "Hits"        "Walks"       "CRBI"       
[6] "CWalks"      "DivisionW"   "PutOuts"    
# Backward stepwise selection; list the variables in the 7-variable model
regfit.bwd <- regsubsets(Salary ~ ., data = Hitters, method = "backward")
names(coef(regfit.bwd, 7))
[1] "(Intercept)" "AtBat"       "Hits"        "Walks"       "CRuns"      
[6] "CWalks"      "DivisionW"   "PutOuts"    
# Create a table to display and compare the selected variable names
ComCoef7 <- data.frame(
  BestSubsetSelection = names(coef(regfit.full, 7)),
  ForwardSelection = names(coef(regfit.fwd, 7)),
  BackwardSelection = names(coef(regfit.bwd, 7))
)
ComCoef7
  BestSubsetSelection ForwardSelection BackwardSelection
1         (Intercept)      (Intercept)       (Intercept)
2                Hits            AtBat             AtBat
3               Walks             Hits              Hits
4              CAtBat            Walks             Walks
5               CHits             CRBI             CRuns
6              CHmRun           CWalks            CWalks
7           DivisionW        DivisionW         DivisionW
8             PutOuts          PutOuts           PutOuts
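
Since kableExtra was loaded above, the comparison can also be rendered as a formatted table; this is one possible styling, assuming the document is knitted to HTML.

# Render the comparison data frame with kableExtra (output omitted)
kbl(ComCoef7, caption = "Seven-variable models selected by each method") %>%
  kable_styling(full_width = FALSE)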

6.5.3 Choosing Among Models Using the Validation Set Approach and Cross-Validation

We create train and test boolean vectors to split the Hitters data set into a training subset and a validation subset. Then, we use regsubsets() on the training data to find the best model containing \(k = 1, \dots, 19\) variables. Finally, we calculate the Mean Squared Error (MSE) of each model on the validation set, which shows us which model size has the lowest estimated test error.

# Creating Training and Test boolean vectors
set.seed(5)
train <- sample(c(TRUE, FALSE), size = nrow(Hitters), replace = TRUE)
test <- !train

# Run the Best Subset Selection on the training data
regfit.best <- regsubsets(Salary ~ ., data = Hitters[train, ], nvmax = 19)

# Creating a test matrix (X) with which we can multiply the coefficients of the
# best model to generate predicted values of Y (i.e. Salary). The model.matrix()
# creates matrices based on formulas (eg: including dummy variables etc.)
test.matrix <- model.matrix(Salary ~ ., data = Hitters[test, ])

# Create an empty vector to store the validation-set MSE of each best model
ValSetErrors <- rep(NA, 19)

# Calculate the validation-set MSE of each model size in a loop
for (i in 1:19) {
  coeffs <- coef(regfit.best, id = i) # Extract coefficients in ith model
  pred <- test.matrix[, names(coeffs)] %*% coeffs # Calculate predicted Y
  ValSetErrors[i] <- mean((Hitters$Salary[test] - pred)^2)
}

# Find which Model has minimum MSE
which.min(ValSetErrors)
[1] 10
# Display the coefficients used in the model with minimum MSE
coef(regfit.best, id = which.min(ValSetErrors))
(Intercept)       AtBat        Hits       Walks       CHits       CRuns 
-57.3166343  -1.1352411   5.6853121   4.3289280  -0.5825003   1.3342097 
       CRBI      CWalks   DivisionW     PutOuts  NewLeagueN 
  0.6996060  -0.2490187 -58.2957815   0.1895886  33.3294093 
# Finally, we re-run best subset selection on the complete data set and extract
# the 10-variable model, since using all observations gives more accurate
# coefficient estimates. Note that its variables differ slightly from those of
# the 10-variable model chosen on the training data above.
regfit.best <- regsubsets(Salary ~ ., data = Hitters, nvmax = 19)
round(coef(regfit.best, id = 10),
  digits = 3
)
(Intercept)       AtBat        Hits       Walks      CAtBat       CRuns 
    162.535      -2.169       6.918       5.773      -0.130       1.408 
       CRBI      CWalks   DivisionW     PutOuts     Assists 
      0.774      -0.831    -112.380       0.297       0.283 

The leaps package provides no predict() method for regsubsets objects, so we write one ourselves: a function that computes predicted values for the model of any given size. We can then use it inside loops when we perform cross-validation below.

predict.regsubsets <- function(object, newdata, id, ...){
  formula <- as.formula(object$call[[2]]) # Extracting formula from regsubsets object 
  coeffs <- coef(object, id) # Extracting coefficients with their names
  testmat <- model.matrix(formula, data = newdata) # Create test X matrix
  testmat[, names(coeffs)] %*% coeffs # Predicted values
}
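
As a usage example (a sketch; regfit.train is an illustrative name), the validation-set errors computed manually above can now be obtained with plain predict(), which dispatches to predict.regsubsets for regsubsets objects.

# Re-fit on the training rows, then predict on the validation rows
regfit.train <- regsubsets(Salary ~ ., data = Hitters[train, ], nvmax = 19)
for (i in 1:19) {
  pred <- predict(regfit.train, newdata = Hitters[test, ], id = i)
  ValSetErrors[i] <- mean((Hitters$Salary[test] - pred)^2)
}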

We now use k-fold cross-validation, instead of the validation set approach, to estimate the test MSE.

# Assign each observation to one of k folds for k-fold cross-validation
k <- 10
set.seed(1)
folds <- sample(1:k, size = nrow(Hitters), replace = TRUE)
# table(folds) # To check : Each observation has been assigned one of the k folds

# Create a matrix to store k Test MSEs of each of the 19 models
cv.errors <- matrix(
  data = NA,
  nrow = k, ncol = 19,
  dimnames = list(NULL, 1:19)
)

# Two nested loops: (1) the outer loop holds out the j-th fold and runs
# regsubsets() on the remaining data; (2) the inner loop computes the test MSE
# of each of the 1- to 19-variable models on the held-out fold
for (j in 1:k) {
  reg.fit <- regsubsets(Salary ~ .,
    nvmax = 19,
    data = Hitters[folds != j, ]
  )
  for (i in 1:19) {
    pred <- predict.regsubsets(
      object = reg.fit, id = i,
      newdata = Hitters[folds == j, ]
    )
    cv.errors[j, i] <- mean((Hitters$Salary[folds == j] - pred)^2) # Store MSE
  }
}

# Calculate mean MSE for each of 1 to 19 variable models. Plot MSEs.
mean.cv.errors <- apply(cv.errors, MARGIN = 2, FUN = mean)
par(mfrow = c(1, 1))
plot(mean.cv.errors,
  type = "b",
  xlab = "Number of variables in the Model",
  ylab = paste0(k, "-fold Cross-Validation MSE")
)
points(
  x = which.min(mean.cv.errors),
  y = min(mean.cv.errors),
  col = "red", pch = 20, cex = 2
)

# Now, find coefficients for best model selected by CV, using regsubsets() on
# full data set
round(coef(regsubsets(Salary ~ ., Hitters, nvmax = 19),
  id = which.min(mean.cv.errors)
), 2)
(Intercept)       AtBat        Hits       Walks      CAtBat       CRuns 
     162.54       -2.17        6.92        5.77       -0.13        1.41 
       CRBI      CWalks   DivisionW     PutOuts     Assists 
       0.77       -0.83     -112.38        0.30        0.28 

Lab 2: Ridge Regression and the Lasso

We load the glmnet library to use the glmnet() function, which can perform both ridge regression and the lasso. glmnet() does not accept a formula interface, so we must create an x matrix and a y vector to pass to it.

library(glmnet)
x = model.matrix(Salary ~ ., data = Hitters)[, -1] # remove intercept column with [,-1]
y = Hitters$Salary

6.6.1 Ridge Regression

We can fit ridge regression using glmnet() with alpha = 0. First, we create lamvec, a grid of 100 candidate lambda values ranging from 10^10 down to 10^-2.

lamvec = 10^seq(from = 10, to = -2, length.out = 100)

# Fitting ridge regression model
ridge.mod = glmnet(x, y, alpha = 0, lambda = lamvec)

# Examine some of the contents of a ridge regression object 
class(ridge.mod)
[1] "elnet"  "glmnet"
names(ridge.mod)
 [1] "a0"        "beta"      "df"        "dim"       "lambda"    "dev.ratio"
 [7] "nulldev"   "npasses"   "jerr"      "offset"    "call"      "nobs"     
# The coefficients for each value of lambda are stored in a 20 x 100 matrix
dim(coef(ridge.mod))
[1]  20 100
# Examine the ell-2 norm of the coefficients at selected lambda values to verify
# that large lambda values shrink the ell-2 norm towards zero

# Get the 25th Lambda value i.e. a large lambda value (we expect ell-2 norm to be low)
ridge.mod$lambda[25]
[1] 12328467
sqrt(sum(coef(ridge.mod)[-1, 25]^2))
[1] 0.006553409
# Get the 75th Lambda value i.e. a small lambda value (we expect ell-2 norm to be high)
ridge.mod$lambda[75]
[1] 10.72267
sqrt(sum(coef(ridge.mod)[-1, 75]^2))
[1] 140.3536
# Plot the coefficient paths by calling plot() on the glmnet object
plot(ridge.mod)
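
By default, plot() on a glmnet object draws the coefficient paths against the ell-1 norm of the coefficients; the xvar argument can draw them against log(lambda) instead.

# Coefficient paths versus log(lambda)
plot(ridge.mod, xvar = "lambda")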

We can now obtain the coefficient estimates for an arbitrary value of lambda, say lambda = 50, even though it was not among the values used in the original ridge regression fit with glmnet().

predict(ridge.mod, s = 50, type = "coefficients")
20 x 1 sparse Matrix of class "dgCMatrix"
                       s1
(Intercept)  4.876610e+01
AtBat       -3.580999e-01
Hits         1.969359e+00
HmRun       -1.278248e+00
Runs         1.145892e+00
RBI          8.038292e-01
Walks        2.716186e+00
Years       -6.218319e+00
CAtBat       5.447837e-03
CHits        1.064895e-01
CHmRun       6.244860e-01
CRuns        2.214985e-01
CRBI         2.186914e-01
CWalks      -1.500245e-01
LeagueN      4.592589e+01
DivisionW   -1.182011e+02
PutOuts      2.502322e-01
Assists      1.215665e-01
Errors      -3.278600e+00
NewLeagueN  -9.496680e+00
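
Because lambda = 50 was not in lamvec, the coefficients above are linearly interpolated between the fitted lambda values. If exact values are preferred, recent versions of glmnet can re-fit the model at s = 50 when the original data are supplied; a sketch:

# Exact re-fit at lambda = 50 (requires passing the original x and y)
predict(ridge.mod, s = 50, type = "coefficients", exact = TRUE, x = x, y = y)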

Now, which value of lambda gives the best predictions? We can answer this question using cross-validation. First, we randomly split the observations into a training set and a test set of roughly equal size.

# Setting seed for replicability, and creating training and testing sets
set.seed(1)
train = sample(c(TRUE, FALSE), size = nrow(x), replace = TRUE)
test = !train

# Running Ridge Regression on Training Set
ridge.mod = glmnet(x[train,], y[train], alpha = 0, lambda = lamvec, 
                   thresh = 1e-12)

# Calculate predicted MSE for some arbitrary lambda value, say 4
ridge.pred = predict(ridge.mod, s = 4, newx = x[test,])
mean((ridge.pred - y[test])^2)
[1] 143937.4
# Calculate MSE for a very large lambda (i.e. the null model: all coefficients shrunk to nearly zero)
ridge.pred = predict(ridge.mod, s = 1e10, newx = x[test,])
mean((ridge.pred - y[test])^2)
[1] 208338.9
# Check that this MSE equals that of predicting every test observation by the mean of the training y
mean( (mean(y[train]) - y[test] )^2 )
[1] 208338.9
# Both values match. Further, at lambda = 4, the test MSE is lower than that of the null model

# Calculate MSE for lambda = 0 (i.e. least squares regression). With
# exact = TRUE, predict() re-fits at s = 0 and needs the original training
# data (left commented out here)
# ridge.pred <- predict(ridge.mod, s = 0, newx = x[test, ], exact = TRUE,
#                       x = x[train, ], y = y[train])
# mean((ridge.pred - y[test])^2)

# Verify that this MSE matches that of least squares regression
# lmfit <- lm(Salary ~ ., Hitters, subset = train)
# lm.pred <- predict(lmfit, newdata = Hitters[test, ])
# mean((lm.pred - y[test])^2)

Now, we use cross-validation to select the lambda value with the lowest MSE. The function cv.glmnet() performs this automatically, with a default of nfolds = 10.

set.seed(3)
cv.out = cv.glmnet(x[train, ], y[train], alpha = 0)

# Examine the output of cv.glmnet()
class(cv.out)
[1] "cv.glmnet"
names(cv.out)
 [1] "lambda"     "cvm"        "cvsd"       "cvup"       "cvlo"      
 [6] "nzero"      "call"       "name"       "glmnet.fit" "lambda.min"
[11] "lambda.1se" "index"     
# Plotting the output of cv.glmnet()
plot(cv.out)
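
Besides lambda.min, cv.glmnet() also reports lambda.1se, the largest lambda whose CV error is within one standard error of the minimum; it is a more conservative, more heavily regularized choice.

# Compare the CV-minimizing lambda with the one-standard-error choice
cv.out$lambda.min
cv.out$lambda.1se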

# Finding the test MSE associated with the best lambda value
bestlam = cv.out$lambda.min
ridge.pred = predict(ridge.mod, s = bestlam, newx = x[test,])
mean( (ridge.pred - y[test])^2 )
[1] 143953.8
# Finally, we re-fit ridge regression on the full data set and obtain the
# coefficient estimates at the lambda value bestlam
out = glmnet(x, y, alpha = 0)
round(predict(out, type = "coefficients", s = bestlam),3)[1:20, ]
(Intercept)       AtBat        Hits       HmRun        Runs         RBI 
     36.159      -0.246       1.696      -1.104       1.172       0.829 
      Walks       Years      CAtBat       CHits      CHmRun       CRuns 
      2.481      -4.869       0.008       0.097       0.596       0.193 
       CRBI      CWalks     LeagueN   DivisionW     PutOuts     Assists 
      0.198      -0.102      42.316    -115.086       0.242       0.102 
     Errors  NewLeagueN 
     -3.041      -5.554 
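
Note that ridge regression shrinks the coefficients but performs no variable selection; a quick check confirms that none of the estimates is exactly zero.

# Count coefficients shrunk exactly to zero by ridge (expected: none)
sum(predict(out, type = "coefficients", s = bestlam)[1:20, ] == 0)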

6.6.2 The Lasso

The lasso can be fit using the same glmnet() function, but with the argument alpha = 1. Unlike ridge regression, the lasso can shrink coefficients to exactly zero, so it also performs variable selection.

lasso.mod <- glmnet(x[train, ], y[train], alpha = 1, lambda = lamvec)
plot(lasso.mod)

# Using cross-validation to find the best lambda value
set.seed(3)
cv.out <- cv.glmnet(x[train, ], y[train], alpha = 1)
plot(cv.out)

bestlam <- cv.out$lambda.min
bestlam
[1] 1.179335
# Finding Lasso predicted values on test set using best lambda value
lasso.pred <- predict(lasso.mod, s = bestlam, newx = x[test, ])

# Calculating MSE at best lambda value with Lasso
mean((lasso.pred - y[test])^2)
[1] 141723.2
# Using the full data to find the coefficient values at the best lambda value
out <- glmnet(x, y, alpha = 1)
coeffs <- predict(out, s = bestlam, type = "coefficients")[1:20, ]
coeffs[coeffs != 0]
  (Intercept)         AtBat          Hits         HmRun          Runs 
 150.65616208   -1.89777785    6.63473771    1.01307454   -0.73367711 
        Walks         Years        CAtBat        CHmRun         CRuns 
   5.49534050   -8.26890842   -0.05204784    0.30697694    1.04151736 
         CRBI        CWalks       LeagueN     DivisionW       PutOuts 
   0.53045013   -0.70560505   43.84887506 -117.16220845    0.28144497 
      Assists        Errors    NewLeagueN 
   0.27463999   -2.76569697   -7.82822649 
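
In contrast to ridge regression, the lasso solution is sparse: from the output above, RBI and CHits have been dropped. As a quick check:

# Number of coefficients set exactly to zero by the lasso
sum(coeffs == 0)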

Lab 3: PCR and PLS Regression

6.7.1 Principal Components Regression

We can fit a Principal Components Regression using the pcr() function, which is part of the pls library.

library(pls)
library(ISLR)
set.seed(1)

# Recreating data sets once again
data("Hitters")
Hitters <- na.omit(Hitters)
train <- sample(1:nrow(Hitters), size = nrow(Hitters) / 2)
test <- -train
x <- model.matrix(Salary ~ ., data = Hitters)[, -1]
y <- Hitters$Salary

# Fitting Principal Components Regression on the Hitters data
pcr.fit <- pcr(Salary ~ ., data = Hitters, scale = TRUE, validation = "CV")

# Examining the pcr() object and its summary
class(pcr.fit)
[1] "mvr"
names(pcr.fit)
 [1] "coefficients"  "scores"        "loadings"      "Yloadings"    
 [5] "projection"    "Xmeans"        "Ymeans"        "fitted.values"
 [9] "residuals"     "Xvar"          "Xtotvar"       "fit.time"     
[13] "ncomp"         "method"        "center"        "scale"        
[17] "validation"    "call"          "terms"         "model"        
summary(pcr.fit)
Data:   X dimension: 263 19 
    Y dimension: 263 1
Fit method: svdpc
Number of components considered: 19

VALIDATION: RMSEP
Cross-validated using 10 random segments.
       (Intercept)  1 comps  2 comps  3 comps  4 comps  5 comps  6 comps
CV             452    350.6    352.3    352.4    349.7    344.7    342.7
adjCV          452    350.3    351.9    351.9    349.1    344.1    342.0
       7 comps  8 comps  9 comps  10 comps  11 comps  12 comps  13 comps
CV       344.3    345.1    345.7     346.5     347.2     349.4     349.1
adjCV    343.4    344.2    344.8     345.4     346.1     348.1     347.8
       14 comps  15 comps  16 comps  17 comps  18 comps  19 comps
CV        342.8     342.7     335.4     337.3     336.9     340.5
adjCV     341.2     341.3     333.9     335.6     335.1     338.4

TRAINING: % variance explained
        1 comps  2 comps  3 comps  4 comps  5 comps  6 comps  7 comps  8 comps
X         38.31    60.16    70.84    79.03    84.29    88.63    92.26    94.96
Salary    40.63    41.58    42.17    43.22    44.90    46.48    46.69    46.75
        9 comps  10 comps  11 comps  12 comps  13 comps  14 comps  15 comps
X         96.28     97.26     97.98     98.65     99.15     99.47     99.75
Salary    46.86     47.76     47.82     47.85     48.10     50.40     50.55
        16 comps  17 comps  18 comps  19 comps
X          99.89     99.97     99.99    100.00
Salary     53.01     53.85     54.61     54.61
# Use validationplot() to visualize the pcr() cross-validation results as RMSEP and MSEP plots
par(mfrow = c(1, 2))
validationplot(pcr.fit)
validationplot(pcr.fit, val.type = "MSEP")

# Performing PCR on training set and evaluating MSE on test set
pcr.fit <- pcr(Salary ~ .,
  data = Hitters, subset = train,
  scale = TRUE, validation = "CV"
)
validationplot(pcr.fit, val.type = "MSEP")
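
The component count minimizing the CV error can also be extracted programmatically; a sketch using the pls function MSEP(), whose first entry is the intercept-only model (hence the subtraction).

# Number of components with the lowest cross-validated MSEP
which.min(MSEP(pcr.fit, estimate = "CV")$val) - 1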

# Using M = 7 components, compute predicted values and the test MSE
pcr.pred <- predict(pcr.fit, newdata = x[test, ], ncomp = 7)
mean((pcr.pred - y[test])^2)
[1] 140751.3
# Finally, we fit the PCR model with M=7 to the entire data set
pcr.fit <- pcr(Salary ~ ., data = Hitters, scale = TRUE, ncomp = 7)
summary(pcr.fit)
Data:   X dimension: 263 19 
    Y dimension: 263 1
Fit method: svdpc
Number of components considered: 7
TRAINING: % variance explained
        1 comps  2 comps  3 comps  4 comps  5 comps  6 comps  7 comps
X         38.31    60.16    70.84    79.03    84.29    88.63    92.26
Salary    40.63    41.58    42.17    43.22    44.90    46.48    46.69
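
The percentage of predictor variance explained by each component (the X row of the summary) can also be retrieved directly with explvar() from the pls library.

# Per-component and cumulative % of X variance explained
explvar(pcr.fit)
cumsum(explvar(pcr.fit))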

6.7.2 Partial Least Squares

We now implement Partial Least Squares using the plsr() function in the pls library. The syntax is the same as that of pcr().

pls.fit <- plsr(Salary ~ .,
  data = Hitters, subset = train,
  scale = TRUE, validation = "CV"
)
summary(pls.fit)
Data:   X dimension: 131 19 
    Y dimension: 131 1
Fit method: kernelpls
Number of components considered: 19

VALIDATION: RMSEP
Cross-validated using 10 random segments.
       (Intercept)  1 comps  2 comps  3 comps  4 comps  5 comps  6 comps
CV           428.3    326.7    330.3    326.4    335.6    336.1    338.9
adjCV        428.3    326.0    328.4    324.7    333.4    333.6    335.7
       7 comps  8 comps  9 comps  10 comps  11 comps  12 comps  13 comps
CV       342.7    354.1    351.6     349.6     343.7     347.9     351.2
adjCV    339.7    349.9    347.7     346.0     340.3     344.1     346.2
       14 comps  15 comps  16 comps  17 comps  18 comps  19 comps
CV        349.6     348.4     350.2     344.8     344.7     345.3
adjCV     345.4     344.4     346.2     341.0     340.9     341.4

TRAINING: % variance explained
        1 comps  2 comps  3 comps  4 comps  5 comps  6 comps  7 comps  8 comps
X         39.13    48.80    60.09    75.07    78.58    81.12    88.21    90.71
Salary    46.36    50.72    52.23    53.03    54.07    54.77    55.05    55.66
        9 comps  10 comps  11 comps  12 comps  13 comps  14 comps  15 comps
X         93.17     96.05     97.08     97.61     97.97     98.70     99.12
Salary    55.95     56.12     56.47     56.68     57.37     57.76     58.08
        16 comps  17 comps  18 comps  19 comps
X          99.61     99.70     99.95    100.00
Salary     58.17     58.49     58.56     58.62
# Create validation plots of RMSEP and MSEP versus the number of components M
par(mfrow = c(1, 2))
validationplot(pls.fit)
validationplot(pls.fit, val.type = "MSEP")

# The CV error is low for a small number of components; following the text, we
# use M = 2 to compute the test MSE. Note that predict() for pls models takes
# a newdata argument
pred.pls <- predict(pls.fit, newdata = x[test, ], ncomp = 2)
mean((pred.pls - y[test])^2)
# Finally, perform PLS on the entire data set, with M=2
pls.fit <- plsr(Salary ~ ., data = Hitters, scale = TRUE, ncomp = 2)
summary(pls.fit)
Data:   X dimension: 263 19 
    Y dimension: 263 1
Fit method: kernelpls
Number of components considered: 2
TRAINING: % variance explained
        1 comps  2 comps
X         38.08    51.03
Salary    43.05    46.40
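
Notice that the two-component PLS fit explains 46.40% of the variance in Salary, almost as much as the seven-component PCR fit above (46.69%); this is because PLS searches for directions that explain variance in both the predictors and the response. The training R-squared values by component count can also be retrieved directly:

# Training R-squared for Salary by number of components
R2(pls.fit, estimate = "train")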