Decision trees


Cây phân loại

# Packages: `tree` for fitting/pruning trees, `ISLR` for the Carseats data.
library(tree)
library(ISLR)
# Carseats is deliberately not attach()ed: its columns are referenced through
# the data frame, so re-running this section does not stack masked copies of
# the data on the search path.
#
# Binary response: "No" when Sales <= 8, "Yes" otherwise. It is converted to a
# factor explicitly because data.frame() does not turn strings into factors by
# default in R >= 4.0, and tree() needs a factor response to grow a
# classification tree.
High <- factor(ifelse(Carseats$Sales <= 8, "No", "Yes"))
Carseats <- data.frame(Carseats, High)
# Classification tree for High on every other column; Sales is excluded
# because High is derived directly from it.
tree.carseats <- tree(High ~ . - Sales, data = Carseats)
summary(tree.carseats)
## 
## Classification tree:
## tree(formula = High ~ . - Sales, data = Carseats)
## Variables actually used in tree construction:
## [1] "ShelveLoc" "Price" "Income" "CompPrice" "Population"
## [6] "Advertising" "Age" "US"
## Number of terminal nodes: 27
## Residual mean deviance: 0.46 = 171 / 373
## Misclassification error rate: 0.09 = 36 / 400
# Draw the tree, then label its splits; pretty = 0 keeps the factor level
# names unabbreviated in the labels.
plot(tree.carseats)
text(tree.carseats, pretty = 0)

plot of chunk Fitting Classification Trees

# Print the fitted tree node by node. The recorded run failed here with
# "invalid 'digits' argument": print.tree() defaults to
# getOption("digits") - 3, which goes negative when the session uses a small
# `digits` option. An explicit, clamped value avoids that.
print(tree.carseats, digits = max(1L, getOption("digits") - 3L))
## node), split, n, deviance, yval, (yprob)
## * denotes terminal node
# (The node listing itself was not captured in the recorded run.)

# Hold out half of the 400 stores: 200 for training, the rest for testing.
set.seed(2)
train <- sample(seq_len(nrow(Carseats)), 200)
Carseats.test <- Carseats[-train, ]
High.test <- High[-train]
# model = TRUE stores the model frame inside the fit, so cv.tree() below can
# reuse it instead of re-evaluating the original call. The recorded run failed
# in cv.tree() with "object 'High' not found", presumably because that
# re-evaluation picked up a copy of Carseats without the High column.
# TODO confirm by re-knitting.
tree.carseats <- tree(High ~ . - Sales, data = Carseats, subset = train,
                      model = TRUE)
tree.pred <- predict(tree.carseats, Carseats.test, type = "class")
table(tree.pred, High.test)
##          High.test
## tree.pred No Yes
## No 86 27
## Yes 30 57
# Test accuracy computed from the predictions rather than hand-copied counts.
mean(tree.pred == High.test)
## [1] 0.71

# Cross-validate the cost-complexity pruning sequence, scoring candidate
# subtrees by misclassification count (FUN = prune.misclass).
set.seed(3)
cv.carseats <- cv.tree(tree.carseats, FUN = prune.misclass)
names(cv.carseats)
cv.carseats
# CV error against tree size and against the cost-complexity parameter k,
# side by side.
par(mfrow = c(1, 2))
plot(cv.carseats$size, cv.carseats$dev, type = "b")
plot(cv.carseats$k, cv.carseats$dev, type = "b")
# NOTE: no cross-validation output was captured in the recorded run because
# cv.tree() aborted there; re-knit to regenerate it and to confirm the subtree
# size chosen below.

# Prune to a 9-leaf subtree and evaluate it on the held-out stores.
prune.carseats <- prune.misclass(tree.carseats, best = 9)
plot(prune.carseats)
text(prune.carseats, pretty = 0)
tree.pred <- predict(prune.carseats, Carseats.test, type = "class")
table(tree.pred, High.test)
##          High.test
## tree.pred No Yes
## No 94 24
## Yes 22 60
mean(tree.pred == High.test)
## [1] 0.77
# For comparison, a larger 15-leaf subtree.
prune.carseats <- prune.misclass(tree.carseats, best = 15)
plot(prune.carseats)
text(prune.carseats, pretty = 0)

plot of chunk Fitting Classification Trees

# Evaluate the current pruned tree (prune.carseats) on the held-out stores.
tree.pred <- predict(prune.carseats, Carseats.test, type = "class")
table(tree.pred, High.test)
##          High.test
## tree.pred No Yes
## No 86 22
## Yes 30 62
# Test accuracy computed from the predictions, so it cannot drift out of sync
# with the confusion matrix when the split or the seed changes.
mean(tree.pred == High.test)
## [1] 0.74

Cây hồi quy

# MASS supplies the Boston housing data used below.
library(MASS)
# Random half of the rows becomes the training set.
set.seed(1)
train <- sample(seq_len(nrow(Boston)), nrow(Boston) / 2)
# Regression tree for medv on all other columns, fitted on the training rows.
tree.boston <- tree(medv ~ ., data = Boston, subset = train)
summary(tree.boston)
## 
## Regression tree:
## tree(formula = medv ~ ., data = Boston, subset = train)
## Variables actually used in tree construction:
## [1] "lstat" "rm" "dis"
## Number of terminal nodes: 8
## Residual mean deviance: 13 = 3100 / 245
## Distribution of residuals:
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## -14.1 -2.0 -0.1 0.0 2.0 12.6
# Draw the tree and label its splits.
plot(tree.boston)
text(tree.boston, pretty = 0)

plot of chunk Fitting Regression Trees

# Cross-validate the pruning sequence of the regression tree and plot the CV
# deviance against the number of terminal nodes.
cv.boston <- cv.tree(tree.boston)
plot(cv.boston$size, cv.boston$dev, type = "b")

plot of chunk Fitting Regression Trees

# Prune the regression tree to 5 terminal nodes and draw the result.
prune.boston <- prune.tree(tree.boston, best = 5)
plot(prune.boston)
text(prune.boston, pretty = 0)

plot of chunk Fitting Regression Trees

# Observed medv on the held-out rows, and the unpruned tree's predictions for
# those same rows.
boston.test <- Boston[-train, "medv"]
yhat <- predict(tree.boston, newdata = Boston[-train, ])
# Predicted vs. observed, with the identity line for reference.
plot(yhat, boston.test)
abline(0, 1)

plot of chunk Fitting Regression Trees

# Test-set mean squared error of the predictions held in yhat.
mean((yhat - boston.test)^2)
## [1] 25

Bagging và Random Forests

library(randomForest)
## randomForest 4.6-10
## Type rfNews() to see new features/changes/bug fixes.
# Bagging: a random forest that tries 13 variables at every split
# (mtry = 13), fitted on the training rows with importance measures kept.
set.seed(1)
bag.boston <- randomForest(
  medv ~ .,
  data = Boston,
  subset = train,
  mtry = 13,
  importance = TRUE
)
bag.boston
## 
## Call:
## randomForest(formula = medv ~ ., data = Boston, mtry = 13, importance = TRUE, subset = train)
## Type of random forest: regression
## Number of trees: 500
## No. of variables tried at each split: 13
##
## Mean of squared residuals: 11
## % Var explained: 87
# Predictions for the held-out rows, plotted against the observed values with
# the identity line for reference.
yhat.bag <- predict(bag.boston, newdata = Boston[-train, ])
plot(yhat.bag, boston.test)
abline(0, 1)

plot of chunk Bagging and Random Forests

# Test-set MSE of the bagged model.
mean((yhat.bag - boston.test)^2)
## [1] 13
# Same bagging setup, but with only 25 trees.
bag.boston <- randomForest(
  medv ~ .,
  data = Boston,
  subset = train,
  mtry = 13,
  ntree = 25
)
yhat.bag <- predict(bag.boston, newdata = Boston[-train, ])
mean((yhat.bag - boston.test)^2)
## [1] 13
# Random forest proper: only 6 candidate variables per split.
set.seed(1)
rf.boston <- randomForest(
  medv ~ .,
  data = Boston,
  subset = train,
  mtry = 6,
  importance = TRUE
)
yhat.rf <- predict(rf.boston, newdata = Boston[-train, ])
mean((yhat.rf - boston.test)^2)
## [1] 11
# Variable importance table, followed by its plot.
importance(rf.boston)
##         %IncMSE IncNodePurity
## crim 12.5 1095
## zn 1.4 64
## indus 9.3 1086
## chas 2.5 76
## nox 12.8 1009
## rm 31.6 6705
## age 10.0 575
## dis 12.8 1351
## rad 3.9 94
## tax 7.6 453
## ptratio 12.0 919
## black 7.4 359
## lstat 27.7 6928
varImpPlot(rf.boston)

plot of chunk Bagging and Random Forests

Boosting

library(gbm)
## Loading required package: survival
##
## Attaching package: 'survival'
##
## The following object is masked from 'package:boot':
##
## aml
##
## Loading required package: lattice
##
## Attaching package: 'lattice'
##
## The following object is masked from 'package:boot':
##
## melanoma
##
## Loading required package: parallel
## Loaded gbm 2.1
# Boosted regression trees on the training rows: Gaussian loss, 5000 trees,
# interaction depth 4.
set.seed(1)
boost.boston <- gbm(
  medv ~ .,
  data = Boston[train, ],
  distribution = "gaussian",
  n.trees = 5000,
  interaction.depth = 4
)
# Relative influence of each predictor (also drawn as a plot).
summary(boost.boston)

plot of chunk Boosting

##             var rel.inf
## lstat lstat 45.963
## rm rm 31.224
## dis dis 6.809
## crim crim 4.074
## nox nox 2.561
## ptratio ptratio 2.275
## black black 1.797
## age age 1.649
## tax tax 1.360
## indus indus 1.271
## chas chas 0.801
## rad rad 0.203
## zn zn 0.015
# Partial dependence of the boosted fit on rm and on lstat, side by side.
# The variable is passed as `i.var`, spelled out in full: the previous `i = `
# only worked through partial argument matching.
par(mfrow = c(1, 2))
plot(boost.boston, i.var = "rm")
plot(boost.boston, i.var = "lstat")

plot of chunk Boosting

# Test-set MSE of the boosted model, using all 5000 trees.
yhat.boost <- predict(boost.boston, newdata = Boston[-train, ], n.trees = 5000)
mean((yhat.boost - boston.test)^2)
## [1] 12
# Refit with shrinkage = 0.2 and evaluate again. `verbose` is given as FALSE
# rather than F, because F is an ordinary variable that can be reassigned.
boost.boston <- gbm(
  medv ~ .,
  data = Boston[train, ],
  distribution = "gaussian",
  n.trees = 5000,
  interaction.depth = 4,
  shrinkage = 0.2,
  verbose = FALSE
)
yhat.boost <- predict(boost.boston, newdata = Boston[-train, ], n.trees = 5000)
mean((yhat.boost - boston.test)^2)
## [1] 12