Linear Model Selection and Regularization


Các phương pháp lựa chọn tập con

Lựa chọn tập con tốt nhất

library(ISLR)
names(Hitters)
##  [1] "AtBat"     "Hits"      "HmRun"     "Runs"      "RBI"      
## [6] "Walks" "Years" "CAtBat" "CHits" "CHmRun"
## [11] "CRuns" "CRBI" "CWalks" "League" "Division"
## [16] "PutOuts" "Assists" "Errors" "Salary" "NewLeague"
dim(Hitters)
## [1] 322  20
sum(is.na(Hitters$Salary))
## [1] 59
Hitters=na.omit(Hitters)
dim(Hitters)
## [1] 263  20
sum(is.na(Hitters))
## [1] 0
library(leaps)
regfit.full=regsubsets(Salary~.,Hitters)
summary(regfit.full)
## Subset selection object
## Call: jekyll_it()
## 19 Variables (and intercept)
## Forced in Forced out
## AtBat FALSE FALSE
## Hits FALSE FALSE
## HmRun FALSE FALSE
## Runs FALSE FALSE
## RBI FALSE FALSE
## Walks FALSE FALSE
## Years FALSE FALSE
## CAtBat FALSE FALSE
## CHits FALSE FALSE
## CHmRun FALSE FALSE
## CRuns FALSE FALSE
## CRBI FALSE FALSE
## CWalks FALSE FALSE
## LeagueN FALSE FALSE
## DivisionW FALSE FALSE
## PutOuts FALSE FALSE
## Assists FALSE FALSE
## Errors FALSE FALSE
## NewLeagueN FALSE FALSE
## 1 subsets of each size up to 8
## Selection Algorithm: exhaustive
## AtBat Hits HmRun Runs RBI Walks Years CAtBat CHits CHmRun CRuns
## 1 ( 1 ) " " " " " " " " " " " " " " " " " " " " " "
## 2 ( 1 ) " " "*" " " " " " " " " " " " " " " " " " "
## 3 ( 1 ) " " "*" " " " " " " " " " " " " " " " " " "
## 4 ( 1 ) " " "*" " " " " " " " " " " " " " " " " " "
## 5 ( 1 ) "*" "*" " " " " " " " " " " " " " " " " " "
## 6 ( 1 ) "*" "*" " " " " " " "*" " " " " " " " " " "
## 7 ( 1 ) " " "*" " " " " " " "*" " " "*" "*" "*" " "
## 8 ( 1 ) "*" "*" " " " " " " "*" " " " " " " "*" "*"
## CRBI CWalks LeagueN DivisionW PutOuts Assists Errors NewLeagueN
## 1 ( 1 ) "*" " " " " " " " " " " " " " "
## 2 ( 1 ) "*" " " " " " " " " " " " " " "
## 3 ( 1 ) "*" " " " " " " "*" " " " " " "
## 4 ( 1 ) "*" " " " " "*" "*" " " " " " "
## 5 ( 1 ) "*" " " " " "*" "*" " " " " " "
## 6 ( 1 ) "*" " " " " "*" "*" " " " " " "
## 7 ( 1 ) " " " " " " "*" "*" " " " " " "
## 8 ( 1 ) " " "*" " " "*" "*" " " " " " "
regfit.full=regsubsets(Salary~.,data=Hitters,nvmax=19)
reg.summary=summary(regfit.full)
names(reg.summary)
## [1] "which"  "rsq"    "rss"    "adjr2"  "cp"     "bic"    "outmat" "obj"
reg.summary$rsq
##  [1] 0.32 0.43 0.45 0.48 0.49 0.51 0.51 0.53 0.53 0.54 0.54 0.54 0.54 0.55
## [15] 0.55 0.55 0.55 0.55 0.55
par(mfrow=c(2,2))
par(mar=rep(1,4))
plot(reg.summary$rss,xlab="Number of Variables",ylab="RSS",type="l")
plot(reg.summary$adjr2,xlab="Number of Variables",ylab="Adjusted RSq",type="l")
which.max(reg.summary$adjr2)
## [1] 11
points(11,reg.summary$adjr2[11], col="red",cex=2,pch=20)
plot(reg.summary$cp,xlab="Number of Variables",ylab="Cp",type='l')
which.min(reg.summary$cp)
## [1] 10
points(10,reg.summary$cp[10],col="red",cex=2,pch=20)
which.min(reg.summary$bic)
## [1] 6
plot(reg.summary$bic,xlab="Number of Variables",ylab="BIC",type='l')
points(6,reg.summary$bic[6],col="red",cex=2,pch=20)

plot of chunk Best Subset Selection

par(mfrow=c(1,1))
plot(regfit.full,scale="r2")

plot of chunk Best Subset Selection

plot(regfit.full,scale="adjr2")

plot of chunk Best Subset Selection

plot(regfit.full,scale="Cp")

plot of chunk Best Subset Selection

plot(regfit.full,scale="bic")

plot of chunk Best Subset Selection

coef(regfit.full,6)
## (Intercept)       AtBat        Hits       Walks        CRBI   DivisionW 
## 91.51 -1.87 7.60 3.70 0.64 -122.95
## PutOuts
## 0.26

Lựa chọn bậc thang: thuận và nghịch

regfit.fwd=regsubsets(Salary~.,data=Hitters,nvmax=19,method="forward")
summary(regfit.fwd)
## Subset selection object
## Call: jekyll_it()
## 19 Variables (and intercept)
## Forced in Forced out
## AtBat FALSE FALSE
## Hits FALSE FALSE
## HmRun FALSE FALSE
## Runs FALSE FALSE
## RBI FALSE FALSE
## Walks FALSE FALSE
## Years FALSE FALSE
## CAtBat FALSE FALSE
## CHits FALSE FALSE
## CHmRun FALSE FALSE
## CRuns FALSE FALSE
## CRBI FALSE FALSE
## CWalks FALSE FALSE
## LeagueN FALSE FALSE
## DivisionW FALSE FALSE
## PutOuts FALSE FALSE
## Assists FALSE FALSE
## Errors FALSE FALSE
## NewLeagueN FALSE FALSE
## 1 subsets of each size up to 19
## Selection Algorithm: forward
## AtBat Hits HmRun Runs RBI Walks Years CAtBat CHits CHmRun CRuns
## 1 ( 1 ) " " " " " " " " " " " " " " " " " " " " " "
## 2 ( 1 ) " " "*" " " " " " " " " " " " " " " " " " "
## 3 ( 1 ) " " "*" " " " " " " " " " " " " " " " " " "
## 4 ( 1 ) " " "*" " " " " " " " " " " " " " " " " " "
## 5 ( 1 ) "*" "*" " " " " " " " " " " " " " " " " " "
## 6 ( 1 ) "*" "*" " " " " " " "*" " " " " " " " " " "
## 7 ( 1 ) "*" "*" " " " " " " "*" " " " " " " " " " "
## 8 ( 1 ) "*" "*" " " " " " " "*" " " " " " " " " "*"
## 9 ( 1 ) "*" "*" " " " " " " "*" " " "*" " " " " "*"
## 10 ( 1 ) "*" "*" " " " " " " "*" " " "*" " " " " "*"
## 11 ( 1 ) "*" "*" " " " " " " "*" " " "*" " " " " "*"
## 12 ( 1 ) "*" "*" " " "*" " " "*" " " "*" " " " " "*"
## 13 ( 1 ) "*" "*" " " "*" " " "*" " " "*" " " " " "*"
## 14 ( 1 ) "*" "*" "*" "*" " " "*" " " "*" " " " " "*"
## 15 ( 1 ) "*" "*" "*" "*" " " "*" " " "*" "*" " " "*"
## 16 ( 1 ) "*" "*" "*" "*" "*" "*" " " "*" "*" " " "*"
## 17 ( 1 ) "*" "*" "*" "*" "*" "*" " " "*" "*" " " "*"
## 18 ( 1 ) "*" "*" "*" "*" "*" "*" "*" "*" "*" " " "*"
## 19 ( 1 ) "*" "*" "*" "*" "*" "*" "*" "*" "*" "*" "*"
## CRBI CWalks LeagueN DivisionW PutOuts Assists Errors NewLeagueN
## 1 ( 1 ) "*" " " " " " " " " " " " " " "
## 2 ( 1 ) "*" " " " " " " " " " " " " " "
## 3 ( 1 ) "*" " " " " " " "*" " " " " " "
## 4 ( 1 ) "*" " " " " "*" "*" " " " " " "
## 5 ( 1 ) "*" " " " " "*" "*" " " " " " "
## 6 ( 1 ) "*" " " " " "*" "*" " " " " " "
## 7 ( 1 ) "*" "*" " " "*" "*" " " " " " "
## 8 ( 1 ) "*" "*" " " "*" "*" " " " " " "
## 9 ( 1 ) "*" "*" " " "*" "*" " " " " " "
## 10 ( 1 ) "*" "*" " " "*" "*" "*" " " " "
## 11 ( 1 ) "*" "*" "*" "*" "*" "*" " " " "
## 12 ( 1 ) "*" "*" "*" "*" "*" "*" " " " "
## 13 ( 1 ) "*" "*" "*" "*" "*" "*" "*" " "
## 14 ( 1 ) "*" "*" "*" "*" "*" "*" "*" " "
## 15 ( 1 ) "*" "*" "*" "*" "*" "*" "*" " "
## 16 ( 1 ) "*" "*" "*" "*" "*" "*" "*" " "
## 17 ( 1 ) "*" "*" "*" "*" "*" "*" "*" "*"
## 18 ( 1 ) "*" "*" "*" "*" "*" "*" "*" "*"
## 19 ( 1 ) "*" "*" "*" "*" "*" "*" "*" "*"
regfit.bwd=regsubsets(Salary~.,data=Hitters,nvmax=19,method="backward")
summary(regfit.bwd)
## Subset selection object
## Call: jekyll_it()
## 19 Variables (and intercept)
## Forced in Forced out
## AtBat FALSE FALSE
## Hits FALSE FALSE
## HmRun FALSE FALSE
## Runs FALSE FALSE
## RBI FALSE FALSE
## Walks FALSE FALSE
## Years FALSE FALSE
## CAtBat FALSE FALSE
## CHits FALSE FALSE
## CHmRun FALSE FALSE
## CRuns FALSE FALSE
## CRBI FALSE FALSE
## CWalks FALSE FALSE
## LeagueN FALSE FALSE
## DivisionW FALSE FALSE
## PutOuts FALSE FALSE
## Assists FALSE FALSE
## Errors FALSE FALSE
## NewLeagueN FALSE FALSE
## 1 subsets of each size up to 19
## Selection Algorithm: backward
## AtBat Hits HmRun Runs RBI Walks Years CAtBat CHits CHmRun CRuns
## 1 ( 1 ) " " " " " " " " " " " " " " " " " " " " "*"
## 2 ( 1 ) " " "*" " " " " " " " " " " " " " " " " "*"
## 3 ( 1 ) " " "*" " " " " " " " " " " " " " " " " "*"
## 4 ( 1 ) "*" "*" " " " " " " " " " " " " " " " " "*"
## 5 ( 1 ) "*" "*" " " " " " " "*" " " " " " " " " "*"
## 6 ( 1 ) "*" "*" " " " " " " "*" " " " " " " " " "*"
## 7 ( 1 ) "*" "*" " " " " " " "*" " " " " " " " " "*"
## 8 ( 1 ) "*" "*" " " " " " " "*" " " " " " " " " "*"
## 9 ( 1 ) "*" "*" " " " " " " "*" " " "*" " " " " "*"
## 10 ( 1 ) "*" "*" " " " " " " "*" " " "*" " " " " "*"
## 11 ( 1 ) "*" "*" " " " " " " "*" " " "*" " " " " "*"
## 12 ( 1 ) "*" "*" " " "*" " " "*" " " "*" " " " " "*"
## 13 ( 1 ) "*" "*" " " "*" " " "*" " " "*" " " " " "*"
## 14 ( 1 ) "*" "*" "*" "*" " " "*" " " "*" " " " " "*"
## 15 ( 1 ) "*" "*" "*" "*" " " "*" " " "*" "*" " " "*"
## 16 ( 1 ) "*" "*" "*" "*" "*" "*" " " "*" "*" " " "*"
## 17 ( 1 ) "*" "*" "*" "*" "*" "*" " " "*" "*" " " "*"
## 18 ( 1 ) "*" "*" "*" "*" "*" "*" "*" "*" "*" " " "*"
## 19 ( 1 ) "*" "*" "*" "*" "*" "*" "*" "*" "*" "*" "*"
## CRBI CWalks LeagueN DivisionW PutOuts Assists Errors NewLeagueN
## 1 ( 1 ) " " " " " " " " " " " " " " " "
## 2 ( 1 ) " " " " " " " " " " " " " " " "
## 3 ( 1 ) " " " " " " " " "*" " " " " " "
## 4 ( 1 ) " " " " " " " " "*" " " " " " "
## 5 ( 1 ) " " " " " " " " "*" " " " " " "
## 6 ( 1 ) " " " " " " "*" "*" " " " " " "
## 7 ( 1 ) " " "*" " " "*" "*" " " " " " "
## 8 ( 1 ) "*" "*" " " "*" "*" " " " " " "
## 9 ( 1 ) "*" "*" " " "*" "*" " " " " " "
## 10 ( 1 ) "*" "*" " " "*" "*" "*" " " " "
## 11 ( 1 ) "*" "*" "*" "*" "*" "*" " " " "
## 12 ( 1 ) "*" "*" "*" "*" "*" "*" " " " "
## 13 ( 1 ) "*" "*" "*" "*" "*" "*" "*" " "
## 14 ( 1 ) "*" "*" "*" "*" "*" "*" "*" " "
## 15 ( 1 ) "*" "*" "*" "*" "*" "*" "*" " "
## 16 ( 1 ) "*" "*" "*" "*" "*" "*" "*" " "
## 17 ( 1 ) "*" "*" "*" "*" "*" "*" "*" "*"
## 18 ( 1 ) "*" "*" "*" "*" "*" "*" "*" "*"
## 19 ( 1 ) "*" "*" "*" "*" "*" "*" "*" "*"
coef(regfit.full,7)
## (Intercept)        Hits       Walks      CAtBat       CHits      CHmRun 
## 79.45 1.28 3.23 -0.38 1.50 1.44
## DivisionW PutOuts
## -129.99 0.24
coef(regfit.fwd,7)
## (Intercept)       AtBat        Hits       Walks        CRBI      CWalks 
## 109.79 -1.96 7.45 4.91 0.85 -0.31
## DivisionW PutOuts
## -127.12 0.25
coef(regfit.bwd,7)
## (Intercept)       AtBat        Hits       Walks       CRuns      CWalks 
## 105.65 -1.98 6.76 6.06 1.13 -0.72
## DivisionW PutOuts
## -116.17 0.30

Chọn lựa mô hình

set.seed(1)
train=sample(c(TRUE,FALSE), nrow(Hitters),rep=TRUE)
test=(!train)
regfit.best=regsubsets(Salary~.,data=Hitters[train,],nvmax=19)
test.mat=model.matrix(Salary~.,data=Hitters[test,])
val.errors=rep(NA,19)
for(i in 1:19){
coefi=coef(regfit.best,id=i)
pred=test.mat[,names(coefi)]%*%coefi
val.errors[i]=mean((Hitters$Salary[test]-pred)^2)
}
val.errors
##  [1] 220968 169157 178518 163426 168418 171271 162377 157909 154056 148162
## [11] 151156 151742 152214 157359 158541 158743 159973 159860 160106
which.min(val.errors)
## [1] 10
coef(regfit.best,10)
## (Intercept)       AtBat        Hits       Walks      CAtBat       CHits 
## -80.28 -1.47 7.16 3.64 -0.19 1.11
## CHmRun CWalks LeagueN DivisionW PutOuts
## 1.38 -0.75 84.56 -53.03 0.24
# Writing function for latter uses
predict.regsubsets=function(object,newdata,id,...){
form=as.formula(object$call[[2]])
mat=model.matrix(form,newdata)
coefi=coef(object,id=id)
xvars=names(coefi)
mat[,xvars]%*%coefi
}
regfit.best=regsubsets(Salary~.,data=Hitters,nvmax=19)
coef(regfit.best,10)
## (Intercept)       AtBat        Hits       Walks      CAtBat       CRuns 
## 162.54 -2.17 6.92 5.77 -0.13 1.41
## CRBI CWalks DivisionW PutOuts Assists
## 0.77 -0.83 -112.38 0.30 0.28
k=10
set.seed(1)
folds=sample(1:k,nrow(Hitters),replace=TRUE)
cv.errors=matrix(NA,k,19, dimnames=list(NULL, paste(1:19)))
for(j in 1:k){
best.fit=regsubsets(Salary~.,data=Hitters[folds!=j,],nvmax=19)
for(i in 1:19){
pred=predict(best.fit,Hitters[folds==j,],id=i)
cv.errors[j,i]=mean( (Hitters$Salary[folds==j]-pred)^2)
}
}
## Error in object$call[[2]]: subscript out of bounds
mean.cv.errors=apply(cv.errors,2,mean)
mean.cv.errors
##  1  2  3  4  5  6  7  8  9 10 11 12 13 14 15 16 17 18 19 
## NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA
which.min(mean.cv.errors)
## integer(0)
par(mfrow=c(1,1))
plot(mean.cv.errors,type='b')
## Warning in min(x): no non-missing arguments to min; returning Inf
## Warning in max(x): no non-missing arguments to max; returning -Inf
## Error in plot.window(...): need finite 'ylim' values

plot of chunk Choosing Among Models

reg.best=regsubsets(Salary~.,data=Hitters, nvmax=19)
coef(reg.best,11)
## (Intercept)       AtBat        Hits       Walks      CAtBat       CRuns 
## 135.75 -2.13 6.92 5.62 -0.14 1.46
## CRBI CWalks LeagueN DivisionW PutOuts Assists
## 0.79 -0.82 43.11 -111.15 0.29 0.27

Ridge Regression and the Lasso

x=model.matrix(Salary~.,Hitters)[,-1]
y=Hitters$Salary

Ridge Regression

library(glmnet)
## Loading required package: Matrix
## Loaded glmnet 1.9-8
grid=10^seq(10,-2,length=100)
ridge.mod=glmnet(x,y,alpha=0,lambda=grid)
dim(coef(ridge.mod))
## [1]  20 100
ridge.mod$lambda[50]
## [1] 11498
coef(ridge.mod)[,50]
## (Intercept)       AtBat        Hits       HmRun        Runs         RBI 
## 407.3561 0.0370 0.1382 0.5246 0.2307 0.2398
## Walks Years CAtBat CHits CHmRun CRuns
## 0.2896 1.1077 0.0031 0.0117 0.0875 0.0234
## CRBI CWalks LeagueN DivisionW PutOuts Assists
## 0.0241 0.0250 0.0850 -6.2154 0.0165 0.0026
## Errors NewLeagueN
## -0.0205 0.3014
sqrt(sum(coef(ridge.mod)[-1,50]^2))
## [1] 6.4
ridge.mod$lambda[60]
## [1] 705
coef(ridge.mod)[,60]
## (Intercept)       AtBat        Hits       HmRun        Runs         RBI 
## 54.325 0.112 0.656 1.180 0.938 0.847
## Walks Years CAtBat CHits CHmRun CRuns
## 1.320 2.596 0.011 0.047 0.338 0.094
## CRBI CWalks LeagueN DivisionW PutOuts Assists
## 0.098 0.072 13.684 -54.659 0.119 0.016
## Errors NewLeagueN
## -0.704 8.612
sqrt(sum(coef(ridge.mod)[-1,60]^2))
## [1] 57
predict(ridge.mod,s=50,type="coefficients")[1:20,]
## (Intercept)       AtBat        Hits       HmRun        Runs         RBI 
## 4.9e+01 -3.6e-01 2.0e+00 -1.3e+00 1.1e+00 8.0e-01
## Walks Years CAtBat CHits CHmRun CRuns
## 2.7e+00 -6.2e+00 5.4e-03 1.1e-01 6.2e-01 2.2e-01
## CRBI CWalks LeagueN DivisionW PutOuts Assists
## 2.2e-01 -1.5e-01 4.6e+01 -1.2e+02 2.5e-01 1.2e-01
## Errors NewLeagueN
## -3.3e+00 -9.5e+00
set.seed(1)
train=sample(1:nrow(x), nrow(x)/2)
test=(-train)
y.test=y[test]
ridge.mod=glmnet(x[train,],y[train],alpha=0,lambda=grid, thresh=1e-12)
ridge.pred=predict(ridge.mod,s=4,newx=x[test,])
mean((ridge.pred-y.test)^2)
## [1] 1e+05
mean((mean(y[train])-y.test)^2)
## [1] 193253
ridge.pred=predict(ridge.mod,s=1e10,newx=x[test,])
mean((ridge.pred-y.test)^2)
## [1] 193253
ridge.pred=predict(ridge.mod,s=0,newx=x[test,],exact=T)
## Error in drop(y): error in evaluating the argument 'x' in selecting a method for function 'drop': Error: object 'y' not found
mean((ridge.pred-y.test)^2)
## [1] 193253
lm(y~x, subset=train)
## 
## Call:
## lm(formula = y ~ x, subset = train)
##
## Coefficients:
## (Intercept) xAtBat xHits xHmRun xRuns
## 299.4285 -2.5403 8.3668 11.6451 -9.0992
## xRBI xWalks xYears xCAtBat xCHits
## 2.4410 9.2344 -22.9367 -0.1815 -0.1160
## xCHmRun xCRuns xCRBI xCWalks xLeagueN
## -1.3389 3.3284 0.0754 -1.0784 59.7607
## xDivisionW xPutOuts xAssists xErrors xNewLeagueN
## -98.8623 0.3409 0.3416 -0.6421 -0.6744
predict(ridge.mod,s=0,exact=T,type="coefficients")[1:20,]
## Error in drop(y): error in evaluating the argument 'x' in selecting a method for function 'drop': Error: object 'y' not found
set.seed(1)
cv.out=cv.glmnet(x[train,],y[train],alpha=0)
plot(cv.out)

plot of chunk ridge

bestlam=cv.out$lambda.min
bestlam
## [1] 212
ridge.pred=predict(ridge.mod,s=bestlam,newx=x[test,])
mean((ridge.pred-y.test)^2)
## [1] 96016
out=glmnet(x,y,alpha=0)
predict(out,type="coefficients",s=bestlam)[1:20,]
## (Intercept)       AtBat        Hits       HmRun        Runs         RBI 
## 9.885 0.031 1.009 0.139 1.113 0.873
## Walks Years CAtBat CHits CHmRun CRuns
## 1.804 0.131 0.011 0.065 0.452 0.129
## CRBI CWalks LeagueN DivisionW PutOuts Assists
## 0.137 0.029 27.182 -91.634 0.191 0.043
## Errors NewLeagueN
## -1.812 7.212

The Lasso

lasso.mod=glmnet(x[train,],y[train],alpha=1,lambda=grid)
plot(lasso.mod)

plot of chunk lasso

set.seed(1)
cv.out=cv.glmnet(x[train,],y[train],alpha=1)
plot(cv.out)

plot of chunk lasso

bestlam=cv.out$lambda.min
lasso.pred=predict(lasso.mod,s=bestlam,newx=x[test,])
mean((lasso.pred-y.test)^2)
## [1] 1e+05
out=glmnet(x,y,alpha=1,lambda=grid)
lasso.coef=predict(out,type="coefficients",s=bestlam)[1:20,]
lasso.coef
## (Intercept)       AtBat        Hits       HmRun        Runs         RBI 
## 18.54 0.00 1.87 0.00 0.00 0.00
## Walks Years CAtBat CHits CHmRun CRuns
## 2.22 0.00 0.00 0.00 0.00 0.21
## CRBI CWalks LeagueN DivisionW PutOuts Assists
## 0.41 0.00 3.27 -103.48 0.22 0.00
## Errors NewLeagueN
## 0.00 0.00
lasso.coef[lasso.coef!=0]
## (Intercept)        Hits       Walks       CRuns        CRBI     LeagueN 
## 18.54 1.87 2.22 0.21 0.41 3.27
## DivisionW PutOuts
## -103.48 0.22

PCR and PLS Regression

Principal Components Regression

library(pls)
## 
## Attaching package: 'pls'
##
## The following object is masked from 'package:stats':
##
## loadings
set.seed(2)
pcr.fit=pcr(Salary~., data=Hitters,scale=TRUE,validation="CV")
summary(pcr.fit)
## Data: 	X dimension: 263 19 
## Y dimension: 263 1
## Fit method: svdpc
## Number of components considered: 19
##
## VALIDATION: RMSEP
## Cross-validated using 10 random segments.
## (Intercept) 1 comps 2 comps 3 comps 4 comps 5 comps 6 comps
## CV 452 348.9 352.2 353.5 352.8 350.1 349.1
## adjCV 452 348.7 351.8 352.9 352.1 349.3 348.0
## 7 comps 8 comps 9 comps 10 comps 11 comps 12 comps 13 comps
## CV 349.6 350.9 352.9 353.8 355.0 356.2 363.5
## adjCV 348.5 349.8 351.6 352.3 353.4 354.5 361.6
## 14 comps 15 comps 16 comps 17 comps 18 comps 19 comps
## CV 355.2 357.4 347.6 350.1 349.2 352.6
## adjCV 352.8 355.2 345.5 347.6 346.7 349.8
##
## TRAINING: % variance explained
## 1 comps 2 comps 3 comps 4 comps 5 comps 6 comps 7 comps
## X 38.31 60.16 70.84 79.03 84.29 88.63 92.26
## Salary 40.63 41.58 42.17 43.22 44.90 46.48 46.69
## 8 comps 9 comps 10 comps 11 comps 12 comps 13 comps 14 comps
## X 94.96 96.28 97.26 97.98 98.65 99.15 99.47
## Salary 46.75 46.86 47.76 47.82 47.85 48.10 50.40
## 15 comps 16 comps 17 comps 18 comps 19 comps
## X 99.75 99.89 99.97 99.99 100.00
## Salary 50.55 53.01 53.85 54.61 54.61
validationplot(pcr.fit,val.type="MSEP")

plot of chunk PCR

set.seed(1)
pcr.fit=pcr(Salary~., data=Hitters,subset=train,scale=TRUE, validation="CV")
validationplot(pcr.fit,val.type="MSEP")

plot of chunk PCR

pcr.pred=predict(pcr.fit,x[test,],ncomp=7)
mean((pcr.pred-y.test)^2)
## [1] 96556
pcr.fit=pcr(y~x,scale=TRUE,ncomp=7)
summary(pcr.fit)
## Data: 	X dimension: 263 19 
## Y dimension: 263 1
## Fit method: svdpc
## Number of components considered: 7
## TRAINING: % variance explained
## 1 comps 2 comps 3 comps 4 comps 5 comps 6 comps 7 comps
## X 38.31 60.16 70.84 79.03 84.29 88.63 92.26
## y 40.63 41.58 42.17 43.22 44.90 46.48 46.69

Partial Least Squares

set.seed(1)
pls.fit=plsr(Salary~., data=Hitters,subset=train,scale=TRUE, validation="CV")
summary(pls.fit)
## Data: 	X dimension: 131 19 
## Y dimension: 131 1
## Fit method: kernelpls
## Number of components considered: 19
##
## VALIDATION: RMSEP
## Cross-validated using 10 random segments.
## (Intercept) 1 comps 2 comps 3 comps 4 comps 5 comps 6 comps
## CV 464.6 394.2 391.5 393.1 395.0 415.0 424.0
## adjCV 464.6 393.4 390.2 391.1 392.9 411.5 418.8
## 7 comps 8 comps 9 comps 10 comps 11 comps 12 comps 13 comps
## CV 424.5 415.8 404.6 407.1 412.0 414.4 410.3
## adjCV 418.9 411.4 400.7 402.2 407.2 409.3 405.6
## 14 comps 15 comps 16 comps 17 comps 18 comps 19 comps
## CV 406.2 408.6 410.5 408.8 407.8 410.2
## adjCV 401.8 403.9 405.6 404.1 403.2 405.5
##
## TRAINING: % variance explained
## 1 comps 2 comps 3 comps 4 comps 5 comps 6 comps 7 comps
## X 38.12 53.46 66.05 74.49 79.33 84.56 87.09
## Salary 33.58 38.96 41.57 42.43 44.04 45.59 47.05
## 8 comps 9 comps 10 comps 11 comps 12 comps 13 comps 14 comps
## X 90.74 92.55 93.94 97.23 97.88 98.35 98.85
## Salary 47.53 48.42 49.68 50.04 50.54 50.78 50.92
## 15 comps 16 comps 17 comps 18 comps 19 comps
## X 99.11 99.43 99.78 99.99 100.00
## Salary 51.04 51.11 51.15 51.16 51.18
validationplot(pls.fit,val.type="MSEP")

plot of chunk PLS

pls.pred=predict(pls.fit,x[test,],ncomp=2)
mean((pls.pred-y.test)^2)
## [1] 1e+05
pls.fit=plsr(Salary~., data=Hitters,scale=TRUE,ncomp=2)
summary(pls.fit)
## Data: 	X dimension: 263 19 
## Y dimension: 263 1
## Fit method: kernelpls
## Number of components considered: 2
## TRAINING: % variance explained
## 1 comps 2 comps
## X 38.08 51.03
## Salary 43.05 46.40