ISL: Chapter 6 (Linear Model Selection and Regularization)


Exercise 8

set.seed(123)
x=rnorm(100,mean=10)
set.seed(124)
ep=rnorm(100)

y=1+2*x+3*x^2+4*x^3+ep

plot(x,y)

[Figure: scatterplot of x versus y]

xy=data.frame(x,y)
library(leaps)
reg.full=regsubsets(y~poly(x,10),data=xy,nvmax=10)
reg.sum=summary(reg.full)

par(mfrow=c(2,2))
par(mar=rep(4,4))

plot(reg.sum$rss,xlab="Number of Variables",ylab="RSS",type="l")

plot(reg.sum$adjr2,xlab="Number of Variables",ylab="Adjusted RSq",type="l")
m=which.max(reg.sum$adjr2)
points(m,reg.sum$adjr2[m], col="red",cex=2,pch=20)

plot(reg.sum$cp,xlab="Number of Variables",ylab="Cp",type='l')
m=which.min(reg.sum$cp)
points(m,reg.sum$cp[m], col="red",cex=2,pch=20)

plot(reg.sum$bic,xlab="Number of Variables",ylab="BIC",type='l')
m=which.min(reg.sum$bic)
points(m,reg.sum$bic[m], col="red",cex=2,pch=20)

[Figure: RSS, adjusted R-squared, Cp, and BIC versus number of variables for best subset selection]
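The same four-panel code is repeated verbatim for forward and backward selection below; a small helper of my own (not part of the original script) would keep it in one place:

# Sketch of a refactoring: plot RSS, Adjusted RSq, Cp and BIC for any
# summary.regsubsets object, marking the best size on each criterion.
plot.criteria=function(reg.sum){
  par(mfrow=c(2,2),mar=rep(4,4))
  plot(reg.sum$rss,xlab="Number of Variables",ylab="RSS",type="l")
  crits=list(list(s=reg.sum$adjr2,lab="Adjusted RSq",best=which.max),
             list(s=reg.sum$cp,lab="Cp",best=which.min),
             list(s=reg.sum$bic,lab="BIC",best=which.min))
  for(cr in crits){
    plot(cr$s,xlab="Number of Variables",ylab=cr$lab,type="l")
    m=cr$best(cr$s)
    points(m,cr$s[m],col="red",cex=2,pch=20)
  }
}
# Usage: plot.criteria(summary(reg.full))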

par(mfrow=c(1,1))
plot(reg.full,scale="r2")

[Figure: regsubsets variable-membership plot, scale = R-squared]

plot(reg.full,scale="adjr2")

[Figure: regsubsets variable-membership plot, scale = adjusted R-squared]

plot(reg.full,scale="Cp")

[Figure: regsubsets variable-membership plot, scale = Cp]

plot(reg.full,scale="bic")

[Figure: regsubsets variable-membership plot, scale = BIC]

coef(reg.full,3)
##  (Intercept) poly(x, 10)1 poly(x, 10)2 poly(x, 10)3 
##         4539        11813         1380           60
# The code below is wrong in this context (poly() builds an orthogonal
# basis, so these coefficients do not multiply raw powers of x),
# but the syntax is correct, so I keep it here for later use.

# z=sapply(x,function(z) coef(reg.full,3)*c(1,z,z^2,z^3))
# plot(x,apply(z,2,sum),col="red")
# lines(x,y,col="green")
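The correct way to draw the fitted curve for the selected three-variable model is to refit with lm() and let fitted() handle the orthogonal basis; a minimal sketch (my addition, not in the original):

# Refit the degree-3 model and overlay its fitted curve on the data
reg3=lm(y~poly(x,3),data=xy)
ord=order(x)  # sort x so lines() draws one smooth curve
plot(x,y)
lines(x[ord],fitted(reg3)[ord],col="red",lwd=2)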
reg.fwd=regsubsets(y~poly(x,10),data=xy,nvmax=10,method="forward")
reg.sum=summary(reg.fwd)

par(mfrow=c(2,2))
par(mar=rep(4,4))

plot(reg.sum$rss,xlab="Number of Variables",ylab="RSS",type="l")

plot(reg.sum$adjr2,xlab="Number of Variables",ylab="Adjusted RSq",type="l")
m=which.max(reg.sum$adjr2)
points(m,reg.sum$adjr2[m], col="red",cex=2,pch=20)

plot(reg.sum$cp,xlab="Number of Variables",ylab="Cp",type='l')
m=which.min(reg.sum$cp)
points(m,reg.sum$cp[m], col="red",cex=2,pch=20)

plot(reg.sum$bic,xlab="Number of Variables",ylab="BIC",type='l')
m=which.min(reg.sum$bic)
points(m,reg.sum$bic[m], col="red",cex=2,pch=20)

[Figure: RSS, adjusted R-squared, Cp, and BIC versus number of variables for forward selection]

coef(reg.fwd,3)
##  (Intercept) poly(x, 10)1 poly(x, 10)2 poly(x, 10)3 
##         4539        11813         1380           60
reg.bwd=regsubsets(y~poly(x,10),data=xy,nvmax=10,method="backward")
reg.sum=summary(reg.bwd)

plot(reg.sum$rss,xlab="Number of Variables",ylab="RSS",type="l")

[Figure: RSS versus number of variables for backward selection]

plot(reg.sum$adjr2,xlab="Number of Variables",ylab="Adjusted RSq",type="l")
m=which.max(reg.sum$adjr2)
points(m,reg.sum$adjr2[m], col="red",cex=2,pch=20)

[Figure: adjusted R-squared versus number of variables for backward selection]

plot(reg.sum$cp,xlab="Number of Variables",ylab="Cp",type='l')
m=which.min(reg.sum$cp)
points(m,reg.sum$cp[m], col="red",cex=2,pch=20)

[Figure: Cp versus number of variables for backward selection]

plot(reg.sum$bic,xlab="Number of Variables",ylab="BIC",type='l')
m=which.min(reg.sum$bic)
points(m,reg.sum$bic[m], col="red",cex=2,pch=20)

[Figure: BIC versus number of variables for backward selection]

coef(reg.bwd,3)
##  (Intercept) poly(x, 10)1 poly(x, 10)2 poly(x, 10)3 
##         4539        11813         1380           60
library(glmnet)
set.seed(123)
train=sample(c(TRUE,FALSE), 100, replace=T)
test=!train

xm=model.matrix(y~poly(x,10))[,-1]
y.test=y[test]

grid=10^seq(10,-2,length=100)

# Ridge regression, for comparison
ridge.mod=glmnet(xm[train,],y[train],alpha=0,lambda=grid, thresh=1e-13)
# Test MSE at an arbitrarily chosen lambda = 1
ridge.pred=predict(ridge.mod,s=1,newx=xm[test,])
mean((ridge.pred-y.test)^2)
## [1] 141136
# Test MSE with lambda chosen by cross-validation
set.seed(123)
cv.out=cv.glmnet(xm[train,],y[train],alpha=0)
bestlam=cv.out$lambda.min
ridge.pred=predict(ridge.mod,s=bestlam,newx=xm[test,])
mean((ridge.pred-y.test)^2)
## [1] 2e+05
# Compare with the unpenalized lm() fit on the same training data.
# What's wrong here?
lm(y~xm,subset=train)
## 
## Call:
## lm(formula = y ~ xm, subset = train)
##
## Coefficients:
##     (Intercept)  xmpoly(x, 10)1  xmpoly(x, 10)2  xmpoly(x, 10)3  
##        4.54e+03        1.18e+04        1.37e+03        7.18e+01  
##  xmpoly(x, 10)4  xmpoly(x, 10)5  xmpoly(x, 10)6  xmpoly(x, 10)7  
##       -1.07e+01        7.85e+00       -7.38e+00        3.88e+00  
##  xmpoly(x, 10)8  xmpoly(x, 10)9  xmpoly(x, 10)10  
##       -2.31e+00        7.05e-01       -5.16e-02
predict(ridge.mod,s=0,exact=T,type="coefficients")[1:11,]
## Error in drop(y): error in evaluating the argument 'x' in selecting a method for function 'drop': Error: object 'y' not found
# Unpenalized fit (lambda = 0): this call fails for the same reason,
# so ridge.pred below still holds the CV-lambda predictions from above
ridge.pred=predict(ridge.mod,s=0,newx=xm[test,],exact=T)
## Error in drop(y): error in evaluating the argument 'x' in selecting a method for function 'drop': Error: object 'y' not found
summary(ridge.pred)
##        1       
##  Min.   :2629  
##  1st Qu.:3805  
##  Median :4484  
##  Mean   :4509  
##  3rd Qu.:5004  
##  Max.   :7353
mean((ridge.pred-y.test)^2)
## [1] 2e+05
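The two errors above come from exact=TRUE: recent versions of glmnet refit the model at the requested s, and for that they need the original training data passed again through the x and y arguments. A sketch of the fix (assuming glmnet >= 2.0):

# Re-supply the training data so the exact lambda=0 refit can be computed
predict(ridge.mod,s=0,exact=TRUE,x=xm[train,],y=y[train],
        type="coefficients")[1:11,]
ridge.pred=predict(ridge.mod,s=0,newx=xm[test,],exact=TRUE,
                   x=xm[train,],y=y[train])
mean((ridge.pred-y.test)^2)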
# LASSO
lasso.mod=glmnet(xm[train,],y[train],alpha=1,lambda=grid)
plot(lasso.mod)

[Figure: lasso coefficient paths]

# Test MSE with lambda chosen by cross-validation
set.seed(123)
cv.out=cv.glmnet(xm[train,],y[train],alpha=1)
plot(cv.out)

[Figure: lasso cross-validation curve]

bestlam=cv.out$lambda.min
lasso.pred=predict(lasso.mod,s=bestlam,newx=xm[test,])
mean((lasso.pred-y.test)^2)
## [1] 994
out=glmnet(xm,y,alpha=1,lambda=grid)
lasso.coef=predict(out,type="coefficients",s=bestlam)[1:11,]
lasso.coef
##   (Intercept)  poly(x, 10)1  poly(x, 10)2  poly(x, 10)3  poly(x, 10)4 
##          4539         11512          1079             0             0 
##  poly(x, 10)5  poly(x, 10)6  poly(x, 10)7  poly(x, 10)8  poly(x, 10)9 
##             0             0             0             0             0 
## poly(x, 10)10 
##             0
lasso.coef[lasso.coef!=0]
##  (Intercept) poly(x, 10)1 poly(x, 10)2 
##         4539        11512         1079
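As a side check (my own addition, not in the original), fitting the lasso on raw rather than orthogonal polynomials makes the coefficients directly comparable to the generating model y = 1 + 2x + 3x^2 + 4x^3; the nonzero pattern should concentrate on the low-degree terms:

# Raw polynomial basis; glmnet standardizes the columns internally
xm.raw=model.matrix(y~poly(x,10,raw=TRUE))[,-1]
set.seed(123)
cv.raw=cv.glmnet(xm.raw,y,alpha=1)
coef(cv.raw,s="lambda.min")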

Exercise 9

college=read.csv("./ISL/College.csv")

# Note: the first column holds the college names; use it as row names, then drop it
rownames(college)=college[,1]
college=college[,-1]

set.seed(234)
train=sample(c(TRUE,FALSE),nrow(college),replace=TRUE)
test=!train
x=model.matrix(Apps~.,data=college)[,-1]
y=college$Apps
y.test=y[test]
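A trivial baseline (my addition) helps calibrate the test MSEs below: predict every test response with the training mean.

# Test MSE of the constant (intercept-only) predictor
mean((mean(y[train])-y.test)^2)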

# Linear regression
reg.lm=lm(Apps~.,data=college,subset=train)
pred.lm=predict(reg.lm,college[test,])
mean((pred.lm-y.test)^2)
## [1] 957560
# Ridge regression
library(glmnet)
grid=10^seq(10,-2,length=100)
ridge.mod=glmnet(x[train,],y[train],alpha=0,lambda=grid, thresh=1e-12)
set.seed(123)
cv.out=cv.glmnet(x[train,],y[train],alpha=0)
bestlam=cv.out$lambda.min
ridge.pred=predict(ridge.mod,s=bestlam,newx=x[test,])
mean((ridge.pred-y.test)^2)
## [1] 9e+05
out=glmnet(x,y,alpha=0)
predict(out,type="coefficients",s=bestlam)[1:17,]
## (Intercept)  PrivateYes      Accept      Enroll   Top10perc   Top25perc 
##    -1.5e+03    -5.3e+02     9.8e-01     4.7e-01     2.5e+01     1.0e+00 
## F.Undergrad P.Undergrad    Outstate  Room.Board       Books    Personal 
##     7.7e-02     2.4e-02    -2.1e-02     2.0e-01     1.4e-01    -9.0e-03 
##         PhD    Terminal   S.F.Ratio perc.alumni      Expend 
##    -3.8e+00    -4.7e+00     1.3e+01    -8.8e+00     7.5e-02
# Lasso model
lasso.mod=glmnet(x[train,],y[train],alpha=1,lambda=grid)
plot(lasso.mod)

[Figure: lasso coefficient paths (College data)]

set.seed(456)
cv.out=cv.glmnet(x[train,],y[train],alpha=1)
plot(cv.out)

[Figure: lasso cross-validation curve (College data)]

bestlam=cv.out$lambda.min
lasso.pred=predict(lasso.mod,s=bestlam,newx=x[test,])
mean((lasso.pred-y.test)^2)
## [1] 950941
out=glmnet(x,y,alpha=1,lambda=grid)
lasso.coef=predict(out,type="coefficients",s=bestlam)[1:17,]
lasso.coef
## (Intercept)  PrivateYes      Accept      Enroll   Top10perc   Top25perc 
##    -471.885    -491.011       1.569      -0.751      47.983     -12.735 
## F.Undergrad P.Undergrad    Outstate  Room.Board       Books    Personal 
##       0.041       0.044      -0.083       0.149       0.015       0.029 
##         PhD    Terminal   S.F.Ratio perc.alumni      Expend 
##      -8.379      -3.253      14.450      -0.052       0.077
lasso.coef[lasso.coef!=0]
## (Intercept)  PrivateYes      Accept      Enroll   Top10perc   Top25perc 
##    -471.885    -491.011       1.569      -0.751      47.983     -12.735 
## F.Undergrad P.Undergrad    Outstate  Room.Board       Books    Personal 
##       0.041       0.044      -0.083       0.149       0.015       0.029 
##         PhD    Terminal   S.F.Ratio perc.alumni      Expend 
##      -8.379      -3.253      14.450      -0.052       0.077
# PCR model
library(pls)
set.seed(789)
pcr.fit=pcr(Apps~., data=college,subset=train,scale=TRUE,validation="CV")
validationplot(pcr.fit,val.type="MSEP")

[Figure: PCR cross-validated MSEP versus number of components]

pcr.pred=predict(pcr.fit,x[test,],ncomp=5)
mean((pcr.pred-y.test)^2)
## [1] 2197160
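Instead of reading ncomp off the validation plot, the pls package offers selectNcomp() (a sketch; the "onesigma" rule picks the smallest model within one standard error of the CV optimum):

selectNcomp(pcr.fit,method="onesigma",plot=TRUE)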
pcr.fit=pcr(y~x,scale=TRUE,ncomp=5)
summary(pcr.fit)
## Data: 	X dimension: 777 17 
## Y dimension: 777 1
## Fit method: svdpc
## Number of components considered: 5
## TRAINING: % variance explained
##    1 comps  2 comps  3 comps  4 comps  5 comps
## X   31.670    57.30    64.30    69.90    75.39
## y    2.316    73.06    73.07    82.08    84.08
# PLS model
set.seed(101112)
pls.fit=plsr(Apps~.,data=college,subset=train,scale=TRUE,validation="CV")
summary(pls.fit)
## Data: 	X dimension: 385 17 
## Y dimension: 385 1
## Fit method: kernelpls
## Number of components considered: 17
##
## VALIDATION: RMSEP
## Cross-validated using 10 random segments.
##        (Intercept)  1 comps  2 comps  3 comps  4 comps  5 comps  6 comps
## CV            4263     2216     1826     1767     1663     1480     1350
## adjCV         4263     2208     1811     1759     1638     1455     1336
##        7 comps  8 comps  9 comps  10 comps  11 comps  12 comps  13 comps
## CV        1319     1311     1317      1322      1325      1327      1326
## adjCV     1308     1300     1305      1310      1313      1314      1314
##        14 comps  15 comps  16 comps  17 comps
## CV         1326      1325      1325      1325
## adjCV      1314      1313      1313      1313
##
## TRAINING: % variance explained
##       1 comps  2 comps  3 comps  4 comps  5 comps  6 comps  7 comps
## X       25.60    34.60    63.20    66.60    70.13    74.39    78.71
## Apps    75.49    84.93    86.42    90.32    92.50    92.88    92.94
##       8 comps  9 comps  10 comps  11 comps  12 comps  13 comps  14 comps
## X       81.41    83.63     85.74     89.24     91.44     93.37     95.19
## Apps    93.00    93.05     93.09     93.10     93.10     93.11     93.11
##       15 comps  16 comps  17 comps
## X        96.94     98.00    100.00
## Apps     93.11     93.11     93.11
validationplot(pls.fit,val.type="MSEP")

[Figure: PLS cross-validated MSEP versus number of components]

pls.pred=predict(pls.fit,x[test,],ncomp=5)
mean((pls.pred-y.test)^2)
## [1] 1e+06
pls.fit=plsr(Apps~.,data=college,scale=TRUE,ncomp=5)
summary(pls.fit)
## Data: 	X dimension: 777 17 
## Y dimension: 777 1
## Fit method: kernelpls
## Number of components considered: 5
## TRAINING: % variance explained
##       1 comps  2 comps  3 comps  4 comps  5 comps
## X       25.76    40.33    62.59    64.97    66.87
## Apps    78.01    85.14    87.67    90.73    92.63
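To wrap up (my own addition), the test MSEs computed above can be collected from the prediction vectors still in the session for a side-by-side comparison:

# Gather the test MSEs of the five models fit on the College data
mse=c(lm=mean((pred.lm-y.test)^2),
      ridge=mean((ridge.pred-y.test)^2),
      lasso=mean((lasso.pred-y.test)^2),
      pcr=mean((pcr.pred-y.test)^2),
      pls=mean((pls.pred-y.test)^2))
sort(mse)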