Decision trees
Cây phân loại
# Packages: tree provides tree(); ISLR ships the Carseats data set.
library(tree)
library(ISLR)

# Put the Carseats columns on the search path so they can be used by bare name.
# NOTE(review): attach() is generally discouraged; the masking messages recorded
# in the output suggest it has been run several times in the same session.
attach(Carseats)
## The following objects are masked from Carseats (pos = 21): ## ## Advertising, Age, CompPrice, Education, Income, Population, ## Price, Sales, ShelveLoc, Urban, US ## ## The following objects are masked from Carseats (pos = 22): ## ## Advertising, Age, CompPrice, Education, Income, Population, ## Price, Sales, ShelveLoc, Urban, US
# Binary response: "No" when Sales is at most 8, "Yes" otherwise.
# It is built as a factor explicitly: since R 4.0.0 data.frame() no longer
# converts character columns to factors, and tree() needs a factor response to
# fit a classification tree. Sales is read from the data frame directly so the
# line does not depend on attach().
High <- factor(ifelse(Carseats$Sales <= 8, "No", "Yes"))

# Add the response as a column. Assigning the column (rather than calling
# data.frame(Carseats, High)) keeps a re-run in the same session from appending
# a duplicate High.1 column that would leak the response into the predictors.
Carseats$High <- High

# Classification tree on every column except Sales, from which High is derived.
tree.carseats <- tree(High ~ . - Sales, data = Carseats)
summary(tree.carseats)
## ## Classification tree: ## tree(formula = High ~ . - Sales, data = Carseats) ## Variables actually used in tree construction: ## [1] "ShelveLoc" "Price" "Income" "CompPrice" "Population" ## [6] "Advertising" "Age" "US" ## Number of terminal nodes: 27 ## Residual mean deviance: 0.46 = 171 / 373 ## Misclassification error rate: 0.09 = 36 / 400
# Draw the fitted tree and label its splits.
# pretty = 0 is passed to text(); presumably this shows factor levels by their
# full names -- see the tree package documentation for text.tree.
plot(tree.carseats)
text(tree.carseats, pretty = 0)

# Print the node-by-node listing of the tree.
# print.tree() defaults to digits = getOption("digits") - 3, which is not a
# valid value when options(digits) is 3 or less and makes format() stop with
# "invalid 'digits' argument". Passing a value clamped to at least 1 keeps the
# default formatting under normal settings and avoids that failure.
print(tree.carseats, digits = max(1L, getOption("digits") - 3L))
## node), split, n, deviance, yval, (yprob) ## * denotes terminal node
## Error in prettyNum(.Internal(format(x, trim, digits, nsmall, width, 3L, : invalid 'digits' argument
# Reproducible split: 200 randomly chosen rows for training, the rest for test.
set.seed(2)
train <- sample(seq_len(nrow(Carseats)), 200)
Carseats.test <- Carseats[-train, ]
High.test <- High[-train]

# Refit the classification tree on the training rows only.
# model = TRUE stores the model frame in the fitted object so that cv.tree()
# can reuse it instead of re-evaluating this call in another environment.
# NOTE(review): that re-evaluation appears to be what raises the recorded
# "object 'High' not found" error -- confirm in the rendering environment.
tree.carseats <- tree(
  High ~ . - Sales,
  data = Carseats,
  subset = train,
  model = TRUE
)

# Predicted classes on the held-out rows, cross-tabulated against the truth.
tree.pred <- predict(tree.carseats, Carseats.test, type = "class")
table(tree.pred, High.test)
## High.test ## tree.pred No Yes ## No 86 27 ## Yes 30 57
# Test-set accuracy of the unpruned tree: the share of held-out rows whose
# predicted class equals the true class. It is computed from the predictions
# rather than from hand-copied confusion-matrix counts, so it stays correct
# when the seed, the split or the R version changes.
mean(tree.pred == High.test)
## [1] 0.71
# Cross-validate the tree, using misclassification-based pruning
# (FUN = prune.misclass) to guide the pruning sequence.
# The seed is fixed first so the result is reproducible.
set.seed(3)
cv.carseats <- cv.tree(tree.carseats, FUN = prune.misclass)
## Error in eval(expr, envir, enclos): object 'High' not found
# List the components of the cross-validation result.
names(cv.carseats)
## Error in eval(expr, envir, enclos): object 'cv.carseats' not found
# Show the full cross-validation result.
cv.carseats
## Error in eval(expr, envir, enclos): object 'cv.carseats' not found
# Two plotting panels side by side; this call fills the first one with the
# cross-validated deviance (dev) against tree size (size).
par(mfrow = c(1, 2))
plot(cv.carseats$size, cv.carseats$dev, type = "b")
## Error in plot(cv.carseats$size, cv.carseats$dev, type = "b"): error in evaluating the argument 'x' in selecting a method for function 'plot': Error: object 'cv.carseats' not found
# Cross-validated deviance (dev) plotted against the k component of the result.
plot(cv.carseats$k, cv.carseats$dev, type = "b")
## Error in plot(cv.carseats$k, cv.carseats$dev, type = "b"): error in evaluating the argument 'x' in selecting a method for function 'plot': Error: object 'cv.carseats' not found
# Prune the classification tree with best = 9 and draw the result.
prune.carseats <- prune.misclass(tree.carseats, best = 9)
plot(prune.carseats)
text(prune.carseats, pretty = 0)

# Re-score the held-out rows with the pruned tree and tabulate against truth.
tree.pred <- predict(prune.carseats, Carseats.test, type = "class")
table(tree.pred, High.test)
## High.test ## tree.pred No Yes ## No 94 24 ## Yes 22 60
# Test-set accuracy of the most recently pruned tree: the share of held-out
# rows whose predicted class equals the true class. It is computed from the
# predictions rather than from hand-copied confusion-matrix counts, so it
# cannot drift out of sync with the table above.
mean(tree.pred == High.test)
## [1] 0.77
# Prune with a larger target, best = 15, for comparison, and draw the result.
prune.carseats <- prune.misclass(tree.carseats, best = 15)
plot(prune.carseats)
text(prune.carseats, pretty = 0)

# Held-out predictions from this larger pruned tree, tabulated against truth.
tree.pred <- predict(prune.carseats, Carseats.test, type = "class")
table(tree.pred, High.test)
## High.test ## tree.pred No Yes ## No 86 22 ## Yes 30 62
# Test-set accuracy of the most recently pruned tree: the share of held-out
# rows whose predicted class equals the true class. It is computed from the
# predictions rather than from hand-copied confusion-matrix counts, so it
# cannot drift out of sync with the table above.
mean(tree.pred == High.test)
## [1] 0.74
Cây hồi quy
# MASS provides the Boston housing data.
library(MASS)

# Reproducible split: half of the rows are drawn at random for training.
# seq_len() is used instead of 1:nrow() so that an empty data frame yields an
# empty index rather than c(1, 0).
set.seed(1)
train <- sample(seq_len(nrow(Boston)), nrow(Boston) / 2)

# Regression tree for medv on all other columns, fitted on the training rows.
tree.boston <- tree(medv ~ ., data = Boston, subset = train)
summary(tree.boston)
## ## Regression tree: ## tree(formula = medv ~ ., data = Boston, subset = train) ## Variables actually used in tree construction: ## [1] "lstat" "rm" "dis" ## Number of terminal nodes: 8 ## Residual mean deviance: 13 = 3100 / 245 ## Distribution of residuals: ## Min. 1st Qu. Median Mean 3rd Qu. Max. ## -14.1 -2.0 -0.1 0.0 2.0 12.6
# Draw the fitted regression tree with split labels.
plot(tree.boston)
text(tree.boston, pretty = 0)

# Cross-validate and plot the deviance against tree size.
cv.boston <- cv.tree(tree.boston)
plot(cv.boston$size, cv.boston$dev, type = 'b')

# Prune with best = 5 and draw the result.
prune.boston <- prune.tree(tree.boston, best = 5)
plot(prune.boston)
text(prune.boston, pretty = 0)

# Predict medv on the held-out rows. Note that the predictions come from the
# unpruned tree.boston, not from prune.boston.
yhat <- predict(tree.boston, newdata = Boston[-train, ])
boston.test <- Boston[-train, "medv"]

# Predicted versus observed values, with the y = x reference line.
plot(yhat, boston.test)
abline(0, 1)

# Test-set mean squared error.
mean((yhat - boston.test)^2)
## [1] 25
Bagging và Random Forests
library ( randomForest )
## randomForest 4.6-10 ## Type rfNews() to see new features/changes/bug fixes.
# Random forest for medv with mtry = 13 candidate variables at each split,
# fitted on the training rows. The importance table further down lists 13
# predictors, so every predictor is considered at each split, i.e. bagging.
# importance = TRUE asks randomForest to record variable-importance measures.
set.seed(1)
bag.boston <- randomForest(
  medv ~ .,
  data = Boston,
  subset = train,
  mtry = 13,
  importance = TRUE
)
bag.boston
## ## Call: ## randomForest(formula = medv ~ ., data = Boston, mtry = 13, importance = TRUE, subset = train) ## Type of random forest: regression ## Number of trees: 500 ## No. of variables tried at each split: 13 ## ## Mean of squared residuals: 11 ## % Var explained: 87
# Held-out predictions from the bagged model.
yhat.bag <- predict(bag.boston, newdata = Boston[-train, ])

# Predicted versus observed values, with the y = x reference line.
plot(yhat.bag, boston.test)
abline(0, 1)

# Test-set mean squared error.
mean((yhat.bag - boston.test)^2)
## [1] 13
# Same bagging setup (mtry = 13) but with ntree = 25 trees.
# NOTE(review): no set.seed() precedes this fit, so its result depends on the
# current state of the random number generator.
bag.boston <- randomForest(
  medv ~ .,
  data = Boston,
  subset = train,
  mtry = 13,
  ntree = 25
)

# Held-out predictions and test-set mean squared error.
yhat.bag <- predict(bag.boston, newdata = Boston[-train, ])
mean((yhat.bag - boston.test)^2)
## [1] 13
# Random forest that tries only mtry = 6 variables at each split, fitted on
# the training rows with variable-importance measures recorded.
set.seed(1)
rf.boston <- randomForest(
  medv ~ .,
  data = Boston,
  subset = train,
  mtry = 6,
  importance = TRUE
)

# Held-out predictions and test-set mean squared error.
yhat.rf <- predict(rf.boston, newdata = Boston[-train, ])
mean((yhat.rf - boston.test)^2)
## [1] 11
# Table of variable-importance measures from the random forest.
importance(rf.boston)
## %IncMSE IncNodePurity ## crim 12.5 1095 ## zn 1.4 64 ## indus 9.3 1086 ## chas 2.5 76 ## nox 12.8 1009 ## rm 31.6 6705 ## age 10.0 575 ## dis 12.8 1351 ## rad 3.9 94 ## tax 7.6 453 ## ptratio 12.0 919 ## black 7.4 359 ## lstat 27.7 6928
# Plot of the variable-importance measures from the random forest.
varImpPlot(rf.boston)
Boosting
library ( gbm )
## Loading required package: survival ## ## Attaching package: 'survival' ## ## The following object is masked from 'package:boot': ## ## aml ## ## Loading required package: lattice ## ## Attaching package: 'lattice' ## ## The following object is masked from 'package:boot': ## ## melanoma ## ## Loading required package: parallel ## Loaded gbm 2.1
# Boosted regression trees for medv on the training rows: Gaussian loss,
# n.trees = 5000 and interaction.depth = 4.
set.seed(1)
boost.boston <- gbm(
  medv ~ .,
  data = Boston[train, ],
  distribution = "gaussian",
  n.trees = 5000,
  interaction.depth = 4
)

# Summary of the fit; the recorded output shows a relative-influence table.
summary(boost.boston)
## var rel.inf ## lstat lstat 45.963 ## rm rm 31.224 ## dis dis 6.809 ## crim crim 4.074 ## nox nox 2.561 ## ptratio ptratio 2.275 ## black black 1.797 ## age age 1.649 ## tax tax 1.360 ## indus indus 1.271 ## chas chas 0.801 ## rad rad 0.203 ## zn zn 0.015
# Two side-by-side panels: plots of the boosted model for rm and for lstat.
par(mfrow = c(1, 2))
plot(boost.boston, i = "rm")
plot(boost.boston, i = "lstat")

# Held-out predictions using all 5000 trees, and test-set mean squared error.
yhat.boost <- predict(boost.boston, newdata = Boston[-train, ], n.trees = 5000)
mean((yhat.boost - boston.test)^2)
## [1] 12
# Refit the boosted model with shrinkage = 0.2.
# verbose is spelled FALSE rather than F: F is an ordinary variable that can
# be reassigned, whereas FALSE is a reserved constant.
boost.boston <- gbm(
  medv ~ .,
  data = Boston[train, ],
  distribution = "gaussian",
  n.trees = 5000,
  interaction.depth = 4,
  shrinkage = 0.2,
  verbose = FALSE
)

# Held-out predictions using all 5000 trees, and test-set mean squared error.
yhat.boost <- predict(boost.boston, newdata = Boston[-train, ], n.trees = 5000)
mean((yhat.boost - boston.test)^2)
## [1] 12