Chapter 10 (Lab)

Unsupervised Learning

Author: Aditya Dahiya
Published: March 1, 2021

Lab 1: Principal Components Analysis

In this lab, we use the USArrests data set from base R and perform principal components analysis with the prcomp() function.

# Loading data
data("USArrests")
options(digits = 2)

# Examining data
dim(USArrests)
[1] 50  4
# Names of variables
colnames(USArrests)
[1] "Murder"   "Assault"  "UrbanPop" "Rape"    
# Names of States
row.names(USArrests)
 [1] "Alabama"        "Alaska"         "Arizona"        "Arkansas"      
 [5] "California"     "Colorado"       "Connecticut"    "Delaware"      
 [9] "Florida"        "Georgia"        "Hawaii"         "Idaho"         
[13] "Illinois"       "Indiana"        "Iowa"           "Kansas"        
[17] "Kentucky"       "Louisiana"      "Maine"          "Maryland"      
[21] "Massachusetts"  "Michigan"       "Minnesota"      "Mississippi"   
[25] "Missouri"       "Montana"        "Nebraska"       "Nevada"        
[29] "New Hampshire"  "New Jersey"     "New Mexico"     "New York"      
[33] "North Carolina" "North Dakota"   "Ohio"           "Oklahoma"      
[37] "Oregon"         "Pennsylvania"   "Rhode Island"   "South Carolina"
[41] "South Dakota"   "Tennessee"      "Texas"          "Utah"          
[45] "Vermont"        "Virginia"       "Washington"     "West Virginia" 
[49] "Wisconsin"      "Wyoming"       
# Computing means and variances of variables
colMeans(USArrests)
  Murder  Assault UrbanPop     Rape 
     7.8    170.8     65.5     21.2 
apply(X = USArrests, MARGIN = 2, FUN = var)
  Murder  Assault UrbanPop     Rape 
      19     6945      210       88 
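
Since the variances differ by orders of magnitude (about 6,945 for Assault versus 19 for Murder), the variables should be scaled before PCA. As a quick check, running prcomp() without scaling yields a first loading vector dominated almost entirely by Assault:

# Without scaling (the default), Assault dominates PC1
prcomp(USArrests, scale = FALSE)$rotation[, 1]
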
# Performing Principal Components Analysis
prUSA <- prcomp(USArrests, scale = TRUE)

# Examining contents of output
class(prUSA)
[1] "prcomp"
names(prUSA)
[1] "sdev"     "rotation" "center"   "scale"    "x"       
prUSA$center # means of original variables
  Murder  Assault UrbanPop     Rape 
     7.8    170.8     65.5     21.2 
prUSA$scale # standard deviations of the original variables (used for scaling)
  Murder  Assault UrbanPop     Rape 
     4.4     83.3     14.5      9.4 
prUSA$scale^2 # variances of the original variables
  Murder  Assault UrbanPop     Rape 
      19     6945      210       88 
# The Principal Component Loading Vectors (columns of Rotation Matrix)
prUSA$rotation
           PC1   PC2   PC3    PC4
Murder   -0.54 -0.42  0.34  0.649
Assault  -0.58 -0.19  0.27 -0.743
UrbanPop -0.28  0.87  0.38  0.134
Rape     -0.54  0.17 -0.82  0.089
# The Principal Component Score Vectors (columns of x matrix)
head(prUSA$x, 2)
          PC1  PC2   PC3   PC4
Alabama -0.98 -1.1  0.44  0.15
Alaska  -1.93 -1.1 -2.02 -0.43
# Standard Deviation of Principal Component Score Vectors
prUSA$sdev
[1] 1.57 0.99 0.60 0.42
# Computing Proportion of variance explained
pr.var <- prUSA$sdev^2 / sum(prUSA$sdev^2)
pr.var
[1] 0.620 0.247 0.089 0.043
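
As a sanity check, the squared standard deviations should match the column variances of the score matrix x:

# Each sdev^2 equals the variance of the corresponding score vector
all.equal(prUSA$sdev^2, unname(apply(prUSA$x, MARGIN = 2, FUN = var)))
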
# Creating a Scree Plot and a Cumulative Variance Explained plot
par(mfrow = c(1, 2))
plot(pr.var,
  type = "b", xlab = "Principal Component",
  ylab = "Proportion of Variance Explained", ylim = c(0, 1)
)
plot(cumsum(pr.var),
  type = "b", xlab = "Principal Component",
  ylab = "Cumulative Prop. of Variance Explained", ylim = c(0, 1)
)

# Creating a biplot
par(mfrow = c(1, 1))
biplot(prUSA, scale = 0)
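
Principal component loadings are unique only up to a sign flip, so negating both the rotation matrix and the scores produces an equivalent, mirror-image biplot; a minimal sketch using a copy, so that prUSA itself is left unchanged:

# Flipping the signs of the loadings and scores gives an equivalent biplot
prUSA_flip <- prUSA
prUSA_flip$rotation <- -prUSA_flip$rotation
prUSA_flip$x <- -prUSA_flip$x
biplot(prUSA_flip, scale = 0)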

Lab 2: Clustering

In this part, we will use K-means clustering and hierarchical clustering.

10.5.1 K-Means Clustering

# Generate a simulated 50 x 2 data matrix with two clusters
# (the first 25 observations are mean-shifted)
set.seed(3)
x <- matrix(rnorm(100), ncol = 2)
x[1:25, 1] <- x[1:25, 1] + 3
x[1:25, 2] <- x[1:25, 2] - 4

# Perform K-means clustering with K = 2
km2 <- kmeans(x = x, centers = 2, nstart = 20)
km2
K-means clustering with 2 clusters of sizes 25, 25

Cluster means:
  [,1]  [,2]
1 0.16  0.12
2 2.71 -3.95

Clustering vector:
 [1] 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 1 1 1 1 1 1 1 1 1 1 1 1 1
[39] 1 1 1 1 1 1 1 1 1 1 1 1

Within cluster sum of squares by cluster:
[1] 37 32
 (between_SS / total_SS =  80.6 %)

Available components:

[1] "cluster"      "centers"      "totss"        "withinss"     "tot.withinss"
[6] "betweenss"    "size"         "iter"         "ifault"      
# Displaying identified clusters and plotting them
km2$cluster
 [1] 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 1 1 1 1 1 1 1 1 1 1 1 1 1
[39] 1 1 1 1 1 1 1 1 1 1 1 1
plot(x,
  col = (km2$cluster + 1), pch = 20, xlab = "", ylab = "", cex = 1.5,
  main = "K-means clustering results with K = 2"
)

# Performing K-means clustering with K = 3
km3 <- kmeans(x = x, centers = 3, nstart = 20)
km3
K-means clustering with 3 clusters of sizes 12, 13, 25

Cluster means:
   [,1]   [,2]
1 -0.68  0.017
2  0.93  0.221
3  2.71 -3.951

Clustering vector:
 [1] 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 1 2 2 1 1 2 2 2 2 1 2 2 1
[39] 1 2 2 1 2 1 2 1 1 2 1 1

Within cluster sum of squares by cluster:
[1] 10.7  9.9 32.3
 (between_SS / total_SS =  85.2 %)

Available components:

[1] "cluster"      "centers"      "totss"        "withinss"     "tot.withinss"
[6] "betweenss"    "size"         "iter"         "ifault"      
plot(x,
  col = (km3$cluster + 1), pch = 20, xlab = "", ylab = "", cex = 1.5,
  main = "K-means clustering results with K = 3"
)

# Demonstrating the use of the nstart argument (number of random starts)
km3_1 <- kmeans(x = x, centers = 3, nstart = 1)
km3$tot.withinss
[1] 53
km3_1$tot.withinss
[1] 53
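
Here both runs happen to reach the same optimum. With a single random start, however, kmeans() can settle in a poorer local optimum, which is why a large nstart (such as 20 or 50) is recommended. A quick check under a different, arbitrary seed (the exact values depend on the seed):

set.seed(4)
kmeans(x = x, centers = 3, nstart = 1)$tot.withinss # may exceed the nstart = 20 value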

10.5.2 Hierarchical Clustering

# Creating a distance matrix with dist()
options(digits = 2)
class(dist(x))
[1] "dist"
# Using hclust() to perform hierarchical clustering on the same data set,
# with three different linkage methods
hc.complete <- hclust(d = dist(x), method = "complete")
hc.average <- hclust(d = dist(x), method = "average")
hc.single <- hclust(d = dist(x), method = "single")

# Plotting the dendrograms from each linkage method
par(mfrow = c(1, 3))
plot(hc.complete,
  cex = 0.7, xlab = "", ylab = "", sub = "",
  main = "Complete Linkage"
)
plot(hc.average,
  cex = 0.7, xlab = "", ylab = "", sub = "",
  main = "Average Linkage"
)
plot(hc.single,
  cex = 0.7, xlab = "", ylab = "", sub = "",
  main = "Single Linkage"
)

# Comparing clusters generated using the cutree() function
cutree(tree = hc.complete, k = 2)
 [1] 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 2 2 2 2 2 2 2 2 2 2 2 2 2
[39] 2 2 2 2 2 2 2 2 2 2 2 2
cutree(tree = hc.average, k = 2)
 [1] 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 2 2 2 2 2 2 2 2 2 2 2 2 2
[39] 2 2 2 2 2 2 2 2 2 2 2 2
cutree(tree = hc.single, k = 2)
 [1] 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 2 2 2 2 2 2 2 2 2 2 2 2 2
[39] 2 2 2 2 2 2 2 2 2 2 2 2
cutree(tree = hc.complete, k = 2) == cutree(tree = hc.average, k = 2)
 [1] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
[16] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
[31] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
[46] TRUE TRUE TRUE TRUE TRUE
cutree(tree = hc.complete, k = 2) == cutree(tree = hc.single, k = 2)
 [1] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
[16] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
[31] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
[46] TRUE TRUE TRUE TRUE TRUE
# Scaling variables before performing hierarchical clustering
xsc <- scale(x)
plot(hclust(d = dist(xsc), method = "complete"),
  cex = 0.7, xlab = "",
  ylab = "", sub = "", main = "Complete Linkage and Scaled Features"
)
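
To check whether scaling changes the grouping here, we can cross-tabulate the 2-cluster cuts before and after scaling (a quick check):

table(
  unscaled = cutree(hc.complete, k = 2),
  scaled = cutree(hclust(d = dist(xsc), method = "complete"), k = 2)
)
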
# Using correlation-based distance (this requires at least three features,
# because any two observations measured on just two features are perfectly
# correlated)

# Simulated data set of 3 dimensions (with three clusters)
x <- matrix(rnorm(90), ncol = 3)
x[1:10, ] <- x[1:10, ] - 2
x[20:30, ] <- x[20:30, ] + 2

# Euclidean-distance-based clustering (partial success with both methods)
kmeans(x = x, centers = 3, nstart = 20)$cluster
 [1] 3 3 3 3 3 3 3 3 3 3 1 1 1 1 1 1 2 2 1 2 2 2 2 2 2 2 2 2 2 2
cutree(hclust(d = dist(x), method = "complete"), k = 3)
 [1] 1 1 1 1 1 1 1 1 1 1 2 3 2 2 2 1 3 3 2 3 3 3 3 3 3 3 3 3 3 2
# Creating a matrix of correlation-based distances
dd <- as.dist(m = 1 - cor(t(x)))
par(mfrow = c(1, 1))

plot(hclust(dd, method = "complete"),
  xlab = "", sub = "",
  main = "Complete Linkage with Correlation Based Distance"
)
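
Cutting the correlation-based tree into three clusters allows a direct comparison with the Euclidean-distance results above:

cutree(hclust(dd, method = "complete"), k = 3)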

Lab 3: NCI60 Data Example

PCA on NCI60 data

In this lab, we use the NCI60 cancer cell line microarray data from the ISLR package. We will perform principal components analysis, K-means clustering, and hierarchical clustering on the data set.

# Loading data set and storing it locally
library(ISLR)
library(tidyverse)
names(NCI60)
[1] "data" "labs"
nci.data <- NCI60$data
nci.labs <- NCI60$labs

dim(nci.data)
[1]   64 6830
length(nci.labs)
[1] 64
nci.data[1:5, 1:5]
      1     2     3     4     5
V1 0.30  1.18  0.55  1.14 -0.26
V2 0.68  1.29  0.17  0.38  0.46
V3 0.94 -0.04 -0.17 -0.04 -0.60
V4 0.28 -0.31  0.68 -0.81  0.62
V5 0.48 -0.46  0.40  0.90  0.20
n_distinct(nci.labs)
[1] 14
# Performing PCA on the NCI60 data set
prnci <- prcomp(x = nci.data, scale = TRUE)

# Plotting the data on the first few principal components,
# using a custom function to color the observation points by cell line
Cols <- function(vec) {
  cols <- rainbow(length(unique(vec)))
  return(cols[as.numeric(as.factor(vec))])
}
par(mfrow = c(1, 2))
plot(prnci$x[, 1:2],
  col = Cols(nci.labs),
  pch = 19, xlab = "Z1", ylab = "Z2"
)
plot(prnci$x[, c(1, 3)],
  col = Cols(nci.labs),
  pch = 19, xlab = "Z1", ylab = "Z3"
)

# Plotting the Scree Plot and Proportion of Variance Explained
par(mfrow = c(1, 3))
plot(prnci, main = "Default plot : prcomp()")

pve <- 100 * (prnci$sdev^2) / sum(prnci$sdev^2)
plot(pve,
  xlab = "Principal Component", ylab = "Proportion Variance Explained",
  main = "Scree Plot", col = "blue", type = "o"
)
plot(cumsum(pve),
  xlab = "Principal Component", ylab = "Cumulative Prop. Var. Explained",
  main = "Cum. Prop. Var. Explained", col = "brown3", type = "o"
)

# Examining summary of a prcomp() object
summary(prnci)$importance[1:3, 1:7]
                         PC1    PC2    PC3    PC4    PC5    PC6    PC7
Standard deviation     27.85 21.481 19.820 17.033 15.972 15.721 14.471
Proportion of Variance  0.11  0.068  0.058  0.042  0.037  0.036  0.031
Cumulative Proportion   0.11  0.181  0.239  0.281  0.318  0.355  0.385
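
From the cumulative proportions we can read off how many components are needed to reach a given share of the variance; for instance, with an (arbitrary) 80% threshold, reusing pve from above:

# Smallest number of principal components explaining at least 80% of the variance
min(which(cumsum(pve) >= 80))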

Clustering the observations of the NCI60 data

# Scaling the data
sd.data <- scale(nci.data)

# Plotting hierarchical clustering with three different linkage methods
data.dist <- dist(sd.data)

par(mfrow = c(1, 3))
plot(hclust(data.dist, method = "complete"),
  labels = nci.labs,
  main = "Complete Linkage", xlab = "", ylab = "", sub = "", cex = 0.6
)
plot(hclust(data.dist, method = "average"),
  labels = nci.labs,
  main = "Average Linkage", xlab = "", ylab = "", sub = "", cex = 0.6
)
plot(hclust(data.dist, method = "single"),
  labels = nci.labs,
  main = "Single Linkage", xlab = "", ylab = "", sub = "", cex = 0.6
)

# Plotting the complete-linkage dendrogram, with a cut that yields 4 clusters
hcnci <- hclust(data.dist, method = "complete")
par(mfrow = c(1, 1))
plot(hcnci, labels = nci.labs, cex = 0.6)
abline(h = 139, col = "red")

# Comparing the clusters from hclust() with the actual cancer types
hc.clusters <- cutree(hcnci, 4)
table(hc.clusters, nci.labs)
           nci.labs
hc.clusters BREAST CNS COLON K562A-repro K562B-repro LEUKEMIA MCF7A-repro
          1      2   3     2           0           0        0           0
          2      3   2     0           0           0        0           0
          3      0   0     0           1           1        6           0
          4      2   0     5           0           0        0           1
           nci.labs
hc.clusters MCF7D-repro MELANOMA NSCLC OVARIAN PROSTATE RENAL UNKNOWN
          1           0        8     8       6        2     8       1
          2           0        0     1       0        0     1       0
          3           0        0     0       0        0     0       0
          4           1        0     0       0        0     0       0
# Printing the output of an hclust object
hcnci

Call:
hclust(d = data.dist, method = "complete")

Cluster method   : complete 
Distance         : euclidean 
Number of objects: 64 
# Comparing hierarchical clustering with k-means clustering
set.seed(3)
km.clusters <- kmeans(sd.data, centers = 4, nstart = 20)$cluster
table(km.clusters, hc.clusters)
           hc.clusters
km.clusters  1  2  3  4
          1 20  7  0  0
          2  9  0  0  0
          3 11  0  0  9
          4  0  0  8  0
# Performing hierarchical and K-means clustering on only the first 5 principal components
hc.pr.nci <- hclust(dist(scale(prnci$x[, 1:5])))
hc.pr.clusters <- cutree(hc.pr.nci, 4)
km.pr.clusters <- kmeans(prnci$x[, 1:5], centers = 4, nstart = 20)$cluster
table(km.pr.clusters, hc.pr.clusters)
              hc.pr.clusters
km.pr.clusters  1  2  3  4
             1 24  3  0  0
             2  0  0  5  2
             3  6  4 11  0
             4  9  0  0  0
# Plotting dendrogram with 5-Principal Components Hierarchical Clustering
plot(hc.pr.nci,
  labels = nci.labs, cex = 0.6, xlab = "", ylab = "", sub = "",
  main = "Hierarchical Clustering on first 5 PCs"
)
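
Finally, a cross-tabulation of the PC-based hierarchical clusters against the actual cell lines, analogous to the earlier table for the clustering on the full data:

table(hc.pr.clusters, nci.labs)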