set.seed(123)
x=rnorm(100,mean=10)
set.seed(124)
ep=rnorm(100)
y=1+2*x+3*x^2+4*x^3+ep
plot(x,y)
xy=data.frame(x,y)
library(leaps)
reg.full=regsubsets(y~poly(x,10),data=xy,nvmax=10)
reg.sum=summary(reg.full)

par(mfrow=c(2,2))
par(mar=rep(4,4))
plot(reg.sum$rss,xlab="Number of Variables",ylab="RSS",type="l")
plot(reg.sum$adjr2,xlab="Number of Variables",ylab="Adjusted RSq",type="l")
m=which.max(reg.sum$adjr2)
points(m,reg.sum$adjr2[m],col="red",cex=2,pch=20)
plot(reg.sum$cp,xlab="Number of Variables",ylab="Cp",type='l')
m=which.min(reg.sum$cp)
points(m,reg.sum$cp[m],col="red",cex=2,pch=20)
plot(reg.sum$bic,xlab="Number of Variables",ylab="BIC",type='l')
m=which.min(reg.sum$bic)
points(m,reg.sum$bic[m],col="red",cex=2,pch=20)
par(mfrow=c(1,1))
plot(reg.full,scale="r2")
plot(reg.full,scale="adjr2")
plot(reg.full,scale="Cp")
plot(reg.full,scale="bic")
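The scale plots above show graphically which terms enter each model; the same information can be pulled out programmatically. A small sketch, using the reg.sum object from above, for the BIC-best model:

# Which terms are included in the model with the lowest BIC?
best.bic=which.min(reg.sum$bic)
reg.sum$which[best.bic,]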
coef(reg.full,3)
##  (Intercept) poly(x, 10)1 poly(x, 10)2 poly(x, 10)3 
##         4539        11813         1380           60
# The commented-out lines below are wrong in this context (though the syntax
# is correct), so they are kept here for later use.
# z=sapply(x,function(z) coef(reg.full,3)*c(1,z,z^2,z^3))
# plot(x,apply(z,2,sum),col="red")
# lines(x,y,col="green")
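The reason those lines are wrong: poly() returns orthogonal polynomials, not raw powers, so the coefficients cannot be applied to c(1, z, z^2, z^3). A minimal sketch of a correct reconstruction, assuming (as the coef() output above shows) that the 3-variable model uses the first three poly() terms:

# Fitted values of the 3-variable model on the orthogonal basis
X3=cbind(1,poly(x,10)[,1:3])
yhat=X3 %*% coef(reg.full,3)
plot(x,y)
points(x,yhat,col="red",pch=20)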
reg.fwd=regsubsets(y~poly(x,10),data=xy,nvmax=10,method="forward")
reg.sum=summary(reg.fwd)

par(mfrow=c(2,2))
par(mar=rep(4,4))
plot(reg.sum$rss,xlab="Number of Variables",ylab="RSS",type="l")
plot(reg.sum$adjr2,xlab="Number of Variables",ylab="Adjusted RSq",type="l")
m=which.max(reg.sum$adjr2)
points(m,reg.sum$adjr2[m],col="red",cex=2,pch=20)
plot(reg.sum$cp,xlab="Number of Variables",ylab="Cp",type='l')
m=which.min(reg.sum$cp)
points(m,reg.sum$cp[m],col="red",cex=2,pch=20)
plot(reg.sum$bic,xlab="Number of Variables",ylab="BIC",type='l')
m=which.min(reg.sum$bic)
points(m,reg.sum$bic[m],col="red",cex=2,pch=20)
coef(reg.fwd,3)
reg.bwd=regsubsets(y~poly(x,10),data=xy,nvmax=10,method="backward")
reg.sum=summary(reg.bwd)

plot(reg.sum$rss,xlab="Number of Variables",ylab="RSS",type="l")
plot(reg.sum$adjr2,xlab="Number of Variables",ylab="Adjusted RSq",type="l")
m=which.max(reg.sum$adjr2)
points(m,reg.sum$adjr2[m],col="red",cex=2,pch=20)
plot(reg.sum$cp,xlab="Number of Variables",ylab="Cp",type='l')
m=which.min(reg.sum$cp)
points(m,reg.sum$cp[m],col="red",cex=2,pch=20)
plot(reg.sum$bic,xlab="Number of Variables",ylab="BIC",type='l')
m=which.min(reg.sum$bic)
points(m,reg.sum$bic[m],col="red",cex=2,pch=20)
coef(reg.bwd,3)
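As a quick sanity check, a sketch (using the three fitted objects above) comparing the 3-variable models chosen by exhaustive search, forward stepwise, and backward stepwise; on this data they should agree:

all.equal(coef(reg.full,3),coef(reg.fwd,3))
all.equal(coef(reg.full,3),coef(reg.bwd,3))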
library(glmnet)
set.seed(123)
train=sample(c(TRUE,FALSE),100,replace=TRUE)
test=!train
xm=model.matrix(y~poly(x,10))[,-1]
y.test=y[test]
grid=10^seq(10,-2,length=100)

# Ridge, for comparison
ridge.mod=glmnet(xm[train,],y[train],alpha=0,lambda=grid,thresh=1e-13)
# Test MSE at lambda=1 (an arbitrary choice of lambda)
ridge.pred=predict(ridge.mod,s=1,newx=xm[test,])
mean((ridge.pred-y.test)^2)
## [1] 141136
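For context, two reference points (a sketch, using the objects above): the null model that always predicts the training mean, and a very large lambda, which shrinks the ridge fit toward that same mean:

# Null model: predict the training mean for every test point
mean((mean(y[train])-y.test)^2)
# Very heavy shrinkage behaves almost like the null model
ridge.big=predict(ridge.mod,s=1e10,newx=xm[test,])
mean((ridge.big-y.test)^2)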
# Cross-validated ridge MSE
set.seed(123)
cv.out=cv.glmnet(xm[train,],y[train],alpha=0)
bestlam=cv.out$lambda.min
ridge.pred=predict(ridge.mod,s=bestlam,newx=xm[test,])
mean((ridge.pred-y.test)^2)
## [1] 2e+05
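cv.glmnet also reports lambda.1se, the largest lambda within one standard error of the minimum; a sketch of the same prediction under that more conservative choice:

lam1se=cv.out$lambda.1se
ridge.pred.1se=predict(ridge.mod,s=lam1se,newx=xm[test,])
mean((ridge.pred.1se-y.test)^2)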
# vs. the lm() function
# What's wrong here?
lm(y~xm,subset=train)
## 
## Call:
## lm(formula = y ~ xm, subset = train)
## 
## Coefficients:
##     (Intercept)   xmpoly(x, 10)1   xmpoly(x, 10)2   xmpoly(x, 10)3  
##        4.54e+03         1.18e+04         1.37e+03         7.18e+01  
##  xmpoly(x, 10)4   xmpoly(x, 10)5   xmpoly(x, 10)6   xmpoly(x, 10)7  
##       -1.07e+01         7.85e+00        -7.38e+00         3.88e+00  
##  xmpoly(x, 10)8   xmpoly(x, 10)9  xmpoly(x, 10)10  
##       -2.31e+00         7.05e-01        -5.16e-02
# With exact=TRUE, glmnet refits the model at the exact s value; recent
# versions require re-supplying the training data (x=, y=) to predict(),
# which is why the call below errors.
predict(ridge.mod,s=0,exact=T,type="coefficients")[1:11,]
## Error in drop(y): error in evaluating the argument 'x' in selecting a method for function 'drop': Error: object 'y' not found
# Linear (unpenalized) fit again: ridge with s=0
ridge.pred=predict(ridge.mod,s=0,newx=xm[test,],exact=T)
summary(ridge.pred)
##        1       
##  Min.   :2629  
##  1st Qu.:3805  
##  Median :4484  
##  Mean   :4509  
##  3rd Qu.:5004  
##  Max.   :7353
mean((ridge.pred-y.test)^2)
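For reference, a sketch (using the objects above) of the same unpenalized fit obtained directly with lm(); its test MSE should be close to the ridge fit at s=0:

lm.fit=lm(y~xm,subset=train)
# Build test predictions by hand from the coefficient vector
lm.pred=cbind(1,xm[test,]) %*% coef(lm.fit)
mean((lm.pred-y.test)^2)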
# LASSO
lasso.mod=glmnet(xm[train,],y[train],alpha=1,lambda=grid)
plot(lasso.mod)
# Cross-validated lasso MSE
set.seed(123)
cv.out=cv.glmnet(xm[train,],y[train],alpha=1)
plot(cv.out)
bestlam=cv.out$lambda.min
lasso.pred=predict(lasso.mod,s=bestlam,newx=xm[test,])
mean((lasso.pred-y.test)^2)
## [1] 994
out=glmnet(xm,y,alpha=1,lambda=grid)
lasso.coef=predict(out,type="coefficients",s=bestlam)[1:11,]
lasso.coef
##   (Intercept)  poly(x, 10)1  poly(x, 10)2  poly(x, 10)3  poly(x, 10)4 
##          4539         11512          1079             0             0 
##  poly(x, 10)5  poly(x, 10)6  poly(x, 10)7  poly(x, 10)8  poly(x, 10)9 
##             0             0             0             0             0 
## poly(x, 10)10 
##             0
lasso.coef[lasso.coef!=0]
##  (Intercept) poly(x, 10)1 poly(x, 10)2 
##         4539        11512         1079
college=read.csv("./ISL/College.csv")
# Important: the first column holds the college names; use it as the row
# names, then drop it from the data
rownames(college)=college[,1]
college=college[,-1]

set.seed(234)
train=sample(c(TRUE,FALSE),nrow(college),replace=TRUE)
test=!train
x=model.matrix(Apps~.,data=college)[,-1]
y=college$Apps
y.test=y[test]

# Linear regression
reg.lm=lm(Apps~.,data=college,subset=train)
pred.lm=predict(reg.lm,college[test,])
mean((pred.lm-y.test)^2)
## [1] 957560
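To make the MSE figures below easier to interpret, a sketch of a test-set R-squared for the linear model (using pred.lm, y, train, and y.test from above):

# Test R^2: 1 minus test MSE over the MSE of predicting the training mean
1 - mean((pred.lm-y.test)^2)/mean((mean(y[train])-y.test)^2)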
# Ridge regression
library(glmnet)
grid=10^seq(10,-2,length=100)
ridge.mod=glmnet(x[train,],y[train],alpha=0,lambda=grid,thresh=1e-12)
set.seed(123)
cv.out=cv.glmnet(x[train,],y[train],alpha=0)
bestlam=cv.out$lambda.min
ridge.pred=predict(ridge.mod,s=bestlam,newx=x[test,])
mean((ridge.pred-y.test)^2)
## [1] 9e+05
out=glmnet(x,y,alpha=0)
predict(out,type="coefficients",s=bestlam)[1:17,]
## (Intercept)  PrivateYes      Accept      Enroll   Top10perc   Top25perc 
##    -1.5e+03    -5.3e+02     9.8e-01     4.7e-01     2.5e+01     1.0e+00 
## F.Undergrad P.Undergrad    Outstate  Room.Board       Books    Personal 
##     7.7e-02     2.4e-02    -2.1e-02     2.0e-01     1.4e-01    -9.0e-03 
##         PhD    Terminal   S.F.Ratio perc.alumni      Expend 
##    -3.8e+00    -4.7e+00     1.3e+01    -8.8e+00     7.5e-02
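Ridge shrinks coefficients but never sets them exactly to zero, so all 17 predictors stay in the model; a quick check (a sketch using the fitted object above):

sum(predict(out,type="coefficients",s=bestlam)[1:17,]!=0)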
# Lasso model
lasso.mod=glmnet(x[train,],y[train],alpha=1,lambda=grid)
plot(lasso.mod)
set.seed(456)
cv.out=cv.glmnet(x[train,],y[train],alpha=1)
plot(cv.out)
bestlam=cv.out$lambda.min
lasso.pred=predict(lasso.mod,s=bestlam,newx=x[test,])
mean((lasso.pred-y.test)^2)
## [1] 950941
out=glmnet(x,y,alpha=1,lambda=grid)
lasso.coef=predict(out,type="coefficients",s=bestlam)[1:17,]
lasso.coef
## (Intercept)  PrivateYes      Accept      Enroll   Top10perc   Top25perc 
##    -471.885    -491.011       1.569      -0.751      47.983     -12.735 
## F.Undergrad P.Undergrad    Outstate  Room.Board       Books    Personal 
##       0.041       0.044      -0.083       0.149       0.015       0.029 
##         PhD    Terminal   S.F.Ratio perc.alumni      Expend 
##      -8.379      -3.253      14.450      -0.052       0.077
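Unlike ridge, the lasso can drop predictors entirely, although at this particular lambda.min none happen to be zeroed out; a quick count (a sketch using lasso.coef from above):

sum(lasso.coef!=0)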
# PCR model
library(pls)
set.seed(789)
pcr.fit=pcr(Apps~.,data=college,subset=train,scale=TRUE,validation="CV")
validationplot(pcr.fit,val.type="MSEP")
pcr.pred=predict(pcr.fit,x[test,],ncomp=5)
mean((pcr.pred-y.test)^2)
## [1] 2197160
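Rather than eyeballing the validation plot, the CV-optimal number of components can be read off programmatically; a sketch using the pls package's RMSEP() (the "adjCV" row name, the "Apps" response name, and the intercept-column offset are assumptions to verify on the actual object):

rmsep=RMSEP(pcr.fit)
# The first entry of the model dimension is the zero-component
# (intercept-only) model, hence the subtraction of 1
which.min(rmsep$val["adjCV","Apps",]) - 1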
pcr.fit=pcr(y~x,scale=TRUE,ncomp=5)
summary(pcr.fit)
## Data:   X dimension: 777 17 
##         Y dimension: 777 1
## Fit method: svdpc
## Number of components considered: 5
## TRAINING: % variance explained
##    1 comps  2 comps  3 comps  4 comps  5 comps
## X   31.670    57.30    64.30    69.90    75.39
## y    2.316    73.06    73.07    82.08    84.08
# PLS model
set.seed(101112)
pls.fit=plsr(Apps~.,data=college,subset=train,scale=TRUE,validation="CV")
summary(pls.fit)
## Data:   X dimension: 385 17 
##         Y dimension: 385 1
## Fit method: kernelpls
## Number of components considered: 17
## 
## VALIDATION: RMSEP
## Cross-validated using 10 random segments.
##        (Intercept)  1 comps  2 comps  3 comps  4 comps  5 comps  6 comps
## CV            4263     2216     1826     1767     1663     1480     1350
## adjCV         4263     2208     1811     1759     1638     1455     1336
##        7 comps  8 comps  9 comps  10 comps  11 comps  12 comps  13 comps
## CV        1319     1311     1317      1322      1325      1327      1326
## adjCV     1308     1300     1305      1310      1313      1314      1314
##        14 comps  15 comps  16 comps  17 comps
## CV         1326      1325      1325      1325
## adjCV      1314      1313      1313      1313
## 
## TRAINING: % variance explained
##       1 comps  2 comps  3 comps  4 comps  5 comps  6 comps  7 comps
## X       25.60    34.60    63.20    66.60    70.13    74.39    78.71
## Apps    75.49    84.93    86.42    90.32    92.50    92.88    92.94
##       8 comps  9 comps  10 comps  11 comps  12 comps  13 comps  14 comps
## X       81.41    83.63     85.74     89.24     91.44     93.37     95.19
## Apps    93.00    93.05     93.09     93.10     93.10     93.11     93.11
##       15 comps  16 comps  17 comps
## X        96.94     98.00    100.00
## Apps     93.11     93.11     93.11
validationplot(pls.fit,val.type="MSEP")
pls.pred=predict(pls.fit,x[test,],ncomp=5)
mean((pls.pred-y.test)^2)
## [1] 1e+06
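To wrap up, a sketch collecting the test MSEs computed above into one named vector for a side-by-side comparison (assumes pred.lm, ridge.pred, lasso.pred, pcr.pred, and pls.pred are still in the workspace):

c(lm    = mean((pred.lm-y.test)^2),
  ridge = mean((ridge.pred-y.test)^2),
  lasso = mean((lasso.pred-y.test)^2),
  pcr   = mean((pcr.pred-y.test)^2),
  pls   = mean((pls.pred-y.test)^2))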
pls.fit=plsr(Apps~.,data=college,scale=TRUE,ncomp=5)
summary(pls.fit)
## Data:   X dimension: 777 17 
##         Y dimension: 777 1
## Fit method: kernelpls
## Number of components considered: 5
## TRAINING: % variance explained
##       1 comps  2 comps  3 comps  4 comps  5 comps
## X       25.76    40.33    62.59    64.97    66.87
## Apps    78.01    85.14    87.67    90.73    92.63