Chapter 8 (Lab)

Tree-Based Methods

Author

Aditya Dahiya

Published

March 11, 2021

8.3 Lab: Decision Trees

8.3.1 Fitting Classification Trees

# Loading libraries and data set
library(tree)
library(ISLR)
library(tidyverse)
data("Carseats")
options(digits = 2)
# Creating a binary variable for Sales; removing Sales variable from data set
Carseats <- Carseats %>%
  mutate(SalesHigh = as.factor(ifelse(Sales > 8, yes = "Yes", no = "No"))) %>%
  select(-Sales)
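
Before fitting a tree, it is worth checking how the new binary response is distributed; a minimal sketch (output omitted here):

# Checking the class balance of the new binary response
table(Carseats$SalesHigh)
prop.table(table(Carseats$SalesHigh))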

# Fitting a classification tree model to the data set for predicting SalesHigh
tree.carseats <- tree(SalesHigh ~ ., data = Carseats)

# Displaying the summary of the classification tree
summary(tree.carseats)

Classification tree:
tree(formula = SalesHigh ~ ., data = Carseats)
Variables actually used in tree construction:
[1] "ShelveLoc"   "Price"       "Income"      "CompPrice"   "Population" 
[6] "Advertising" "Age"         "US"         
Number of terminal nodes:  27 
Residual mean deviance:  0.46 = 171 / 373 
Misclassification error rate: 0.09 = 36 / 400 
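
The residual mean deviance reported above is the total deviance divided by n minus the number of terminal nodes; a quick check of the arithmetic:

# Residual mean deviance = total deviance / (n - number of terminal nodes)
171 / (400 - 27)
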
# Plotting the Classification tree
plot(tree.carseats, col = "brown", type = "proportional")
text(tree.carseats, pretty = 0, cex = 0.5)

# Examining the components of the fitted tree object
names(tree.carseats)
[1] "frame"   "where"   "terms"   "call"    "y"       "weights"
names(tree.carseats$frame)
[1] "var"    "n"      "dev"    "yval"   "splits" "yprob" 
# Creating training and test sets from the Carseats data set
nrow(Carseats)
[1] 400
set.seed(3)
train <- sample(c(TRUE, FALSE), size = 200, replace = TRUE)
Test.Carseats <- Carseats[!train, ]
Truth <- Carseats[!train, "SalesHigh"]

# Fitting a classification tree to the training data
tree.carseats <- tree(SalesHigh ~ ., data = Carseats, subset = train)

# Obtaining predicted values in test data
pred <- predict(tree.carseats, newdata = Test.Carseats, type = "class")

# Creating Confusion Matrix; calculating test error rate
table(Predicted = pred, Truth)
         Truth
Predicted No Yes
      No  95  55
      Yes 22  42
# Prediction Accuracy
mean(pred == Truth)
[1] 0.64
# Test error rate
mean(pred != Truth)
[1] 0.36
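
The same accuracy can also be recovered from the confusion matrix itself, which is a useful cross-check; a minimal sketch (the object name conf.mat is just illustrative):

# Accuracy as the proportion of correctly classified test observations
conf.mat <- table(Predicted = pred, Truth)
sum(diag(conf.mat)) / sum(conf.mat)
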
# Now, we prune the classification tree, finding the optimal amount of pruning by C.V.
cv.carseats <- cv.tree(tree.carseats, FUN = prune.misclass)

# Examining the Cross Validation Pruning Tree Object
names(cv.carseats)
[1] "size"   "dev"    "k"      "method"
cv.carseats
$size
[1] 18 13  8  7  5  4  2  1

$dev
[1] 65 65 64 63 65 71 78 68

$k
[1] -Inf  0.0  1.0  2.0  3.0  5.0  6.5 13.0

$method
[1] "misclass"

attr(,"class")
[1] "prune"         "tree.sequence"
# Plotting the error rate as a function of k (alpha) and the size of the tree (size)
par(mfrow = c(1, 2))
plot(
  x = cv.carseats$size, y = cv.carseats$dev, type = "b",
  ylab = "Error Rate", xlab = "Size of pruned tree"
)
points(
  x = 7, y = min(cv.carseats$dev), col = "red",
  pch = 20, cex = 1.5
)
abline(v = 7, col = "red", lty = 2)
plot(
  x = cv.carseats$k, y = cv.carseats$dev, type = "b",
  ylab = "Error Rate", xlab = "alpha (k)"
)
points(
  x = 2, y = min(cv.carseats$dev), col = "red",
  pch = 20, cex = 1.5
)
abline(v = 2, col = "red", lty = 2)
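
Rather than reading the optimum off the plots, the best size can be pulled out of cv.carseats programmatically; a minimal sketch (with the dev values printed above, this gives 7, the size used for pruning below):

# Extracting the tree size with the lowest cross-validation error
best.size <- cv.carseats$size[which.min(cv.carseats$dev)]
best.size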

# Pruning the tree at the best size chosen by C.V., i.e. 7 terminal nodes
prune.carseats <- prune.tree(tree.carseats, best = 7)

# Plotting the best (as per C.V.) pruned classification tree
plot(prune.carseats, col = "brown")
text(prune.carseats, cex = 0.9, pretty = 0)

# Calculating the test error rate for the best pruned classification tree
pred <- predict(prune.carseats, newdata = Test.Carseats, type = "class")
table(Predicted = pred, Truth)
         Truth
Predicted  No Yes
      No  107  49
      Yes  10  48
# Prediction Accuracy
mean(pred == Truth)
[1] 0.72
# Test error rate
mean(pred != Truth)
[1] 0.28
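
To see how sensitive the test error is to the chosen size, the tree can be pruned at several different sizes and compared; a minimal sketch (the sizes below are illustrative, and results depend on the seed and split used above):

# Test error rate for a range of pruned tree sizes (deviance-based pruning, as above)
sizes <- c(2, 5, 7, 10, 13)
err <- sapply(sizes, function(s) {
  pruned <- prune.tree(tree.carseats, best = s)
  p <- predict(pruned, newdata = Test.Carseats, type = "class")
  mean(p != Truth)
})
data.frame(size = sizes, test.error = err)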

8.3.2 Fitting Regression Trees

# Loading libraries and data set
library(MASS)
library(tree)
data(Boston)
dim(Boston)
[1] 506  14
# Creating a training and test subset of the data
set.seed(3)
train <- sample(c(TRUE, FALSE), size = nrow(Boston) / 2, replace = TRUE)
Truth <- Boston[!train, "medv"]

# Fitting a regression tree on the training data set
tree.boston <- tree(medv ~ ., data = Boston, subset = train)

# Displaying the fitted regression tree
summary(tree.boston)

Regression tree:
tree(formula = medv ~ ., data = Boston, subset = train)
Variables actually used in tree construction:
[1] "lstat" "rm"    "nox"   "crim" 
Number of terminal nodes:  9 
Residual mean deviance:  13 = 2910 / 223 
Distribution of residuals:
   Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
  -12.5    -2.0    -0.1     0.0     2.0    15.0 
names(tree.boston)
[1] "frame"   "where"   "terms"   "call"    "y"       "weights"
names(tree.boston$frame)
[1] "var"    "n"      "dev"    "yval"   "splits"
# Plotting the regression tree
plot(tree.boston, col = "brown")
text(tree.boston, pretty = 0, cex = 0.8)

# Finding the best pruned regression tree using cross validation
cv.boston <- cv.tree(tree.boston, K = 10)
cv.boston
$size
[1] 9 8 7 6 5 4 3 2 1

$dev
[1]  6092  6337  6701  6950  7558  7781  9069 12520 18862

$k
[1] -Inf  247  329  460  664  783 1588 3077 8702

$method
[1] "deviance"

attr(,"class")
[1] "prune"         "tree.sequence"
plot(
  x = cv.boston$size, y = cv.boston$dev, xlab = "Size of tree (no. of terminal nodes)",
  ylab = "Cross-Validation Deviance", type = "b"
)

# Using the unpruned tree to make predictions on the test data set
pred <- predict(tree.boston, newdata = Boston[!train, ])

# Plotting Predicted vs. True values of medv
plot(
  x = Truth, y = pred, xlab = "True values of medv",
  ylab = "Predicted values of medv"
)
abline(a = 0, b = 1, col = "red")

# Calculating Test MSE
options(digits = 4)
mean((pred - Truth)^2)
[1] 21.18
sqrt(mean((pred - Truth)^2))
[1] 4.602
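
Although cross-validation favours the unpruned tree here, a pruned regression tree can be evaluated in exactly the same way; a minimal sketch using, say, 5 terminal nodes (the object names are illustrative):

# Pruning the regression tree and recomputing the test MSE
prune.boston <- prune.tree(tree.boston, best = 5)
pred.prune <- predict(prune.boston, newdata = Boston[!train, ])
mean((pred.prune - Truth)^2)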

8.3.3 Bagging and Random Forests

# Loading libraries and Data Set
library(MASS)
library(randomForest)
data(Boston)
set.seed(3)

# Fitting a bagging model with randomForest(), using m = p (all predictors at each split)
length(names(Boston)) # 14 columns: 13 predictors plus the response (medv)
[1] 14
bag.boston <- randomForest(medv ~ .,
  data = Boston, subset = train,
  mtry = 13, importance = TRUE
)
bag.boston

Call:
 randomForest(formula = medv ~ ., data = Boston, mtry = 13, importance = TRUE,      subset = train) 
               Type of random forest: regression
                     Number of trees: 500
No. of variables tried at each split: 13

          Mean of squared residuals: 15.6
                    % Var explained: 80.71
# Calculating test set predictions and plotting them
pred <- predict(bag.boston, newdata = Boston[!train, ])
plot(
  x = Truth, y = pred, xlab = "True response values (medv)",
  ylab = "Predicted response values (medv)"
)
abline(a = 0, b = 1, col = "red")

# Calculating the Test MSE
mean((pred - Truth)^2)
[1] 13.26
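
The number of trees grown can also be changed via the ntree argument; a minimal sketch with 25 trees (the resulting test MSE will differ from the 500-tree fit above, and the object names are illustrative):

# Bagging with a smaller number of trees
bag.boston.25 <- randomForest(medv ~ .,
  data = Boston, subset = train,
  mtry = 13, ntree = 25
)
pred.25 <- predict(bag.boston.25, newdata = Boston[!train, ])
mean((pred.25 - Truth)^2)
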
# Fitting a Random Forest model with mtry = 6 variables
rf.boston <- randomForest(medv ~ .,
  data = Boston, subset = train,
  mtry = 6, importance = TRUE
)
rf.boston

Call:
 randomForest(formula = medv ~ ., data = Boston, mtry = 6, importance = TRUE,      subset = train) 
               Type of random forest: regression
                     Number of trees: 500
No. of variables tried at each split: 6

          Mean of squared residuals: 13.74
                    % Var explained: 83.01
# Calculating test set predictions and plotting them
pred <- predict(rf.boston, newdata = Boston[!train, ])
plot(
  x = Truth, y = pred, xlab = "True response values (medv)",
  ylab = "Predicted response values (medv)"
)
abline(a = 0, b = 1, col = "red")

# Calculating the Test MSE
mean((pred - Truth)^2)
[1] 12.44
# Showing the importance of variables as table and plot
round(importance(rf.boston), 2)
        %IncMSE IncNodePurity
crim      11.18       1082.37
zn         3.25        117.71
indus      8.77        866.36
chas       0.01         21.87
nox       13.55       1022.72
rm        30.67       5883.64
age       10.69        504.33
dis       10.23        760.49
rad        3.93        124.16
tax        7.53        637.97
ptratio    9.17        654.75
black      8.66        330.35
lstat     31.83       6239.90
varImpPlot(rf.boston)
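
The choice of mtry can be tuned by refitting over a grid of values and comparing test MSEs; a minimal sketch (the grid is illustrative, and this refits the forest once per value, so it is slow):

# Test MSE for a range of mtry values
mtry.grid <- c(2, 4, 6, 8, 10, 13)
mse <- sapply(mtry.grid, function(m) {
  fit <- randomForest(medv ~ ., data = Boston, subset = train, mtry = m)
  mean((predict(fit, newdata = Boston[!train, ]) - Truth)^2)
})
data.frame(mtry = mtry.grid, test.MSE = mse)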

8.3.4 Boosting

# Loading libraries
library(gbm)
set.seed(3)

# Fitting a boosting model
boost.boston <- gbm(medv ~ ., data = Boston[train, ], distribution = "gaussian",
                    n.trees = 5000, interaction.depth = 4)
boost.boston
gbm(formula = medv ~ ., distribution = "gaussian", data = Boston[train, 
    ], n.trees = 5000, interaction.depth = 4)
A gradient boosted model with gaussian loss function.
5000 iterations were performed.
There were 13 predictors of which 13 had non-zero influence.
names(boost.boston)
 [1] "initF"             "fit"               "train.error"      
 [4] "valid.error"       "oobag.improve"     "trees"            
 [7] "c.splits"          "bag.fraction"      "distribution"     
[10] "interaction.depth" "n.minobsinnode"    "num.classes"      
[13] "n.trees"           "nTrain"            "train.fraction"   
[16] "response.name"     "shrinkage"         "var.levels"       
[19] "var.monotone"      "var.names"         "var.type"         
[22] "verbose"           "data"              "Terms"            
[25] "cv.folds"          "call"              "m"                
summary(boost.boston)

            var  rel.inf
lstat     lstat 35.06153
rm           rm 32.09065
dis         dis  6.92124
crim       crim  6.19716
nox         nox  4.33675
age         age  4.31233
black     black  3.94817
ptratio ptratio  2.51777
tax         tax  2.22554
indus     indus  1.18808
rad         rad  0.86941
zn           zn  0.25900
chas       chas  0.07236
# Plotting the marginal effects of selected variables on the response
par(mfrow = c(1,2))
plot(boost.boston, i = "rm")

plot(boost.boston, i = "lstat")

# Using the boosted model to predict medv on the test data set
pred <- predict(boost.boston, newdata = Boston[!train, ], n.trees = 5000)

# Calculating the test MSE
mean((pred - Truth)^2)
[1] 13.89
# Using a boosting model with a different shrinkage (lambda) value of 0.2 (instead of the default)
boost.boston <- gbm(medv ~ ., data = Boston[train, ], distribution = "gaussian",
                    n.trees = 5000, interaction.depth = 4, shrinkage = 0.2)
pred <- predict(boost.boston, newdata = Boston[!train, ], n.trees = 5000)
mean((pred - Truth)^2)
[1] 13.32
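
The number of boosting iterations can itself be chosen by cross-validation within gbm, using the cv.folds argument and gbm.perf(); a minimal sketch (the object names and the choice of 5 folds are illustrative):

# Choosing the number of trees by cross-validation inside gbm
boost.cv <- gbm(medv ~ ., data = Boston[train, ], distribution = "gaussian",
                n.trees = 5000, interaction.depth = 4, shrinkage = 0.2,
                cv.folds = 5)
best.iter <- gbm.perf(boost.cv, method = "cv")
pred.cv <- predict(boost.cv, newdata = Boston[!train, ], n.trees = best.iter)
mean((pred.cv - Truth)^2)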