ISL: Chapter 5


Exercise 5

# Fit a logistic model
library(ISLR)
names(Default)
## [1] "default" "student" "balance" "income"
attach(Default)
glmout=glm(default~income+balance,data=Default,family=binomial)
summary(glmout)
## 
## Call:
## glm(formula = default ~ income + balance, family = binomial,
## data = Default)
##
## Deviance Residuals:
##    Min     1Q Median     3Q    Max
## -2.473 -0.144 -0.057 -0.021  3.724
##
## Coefficients:
##              Estimate Std. Error z value Pr(>|z|)
## (Intercept) -1.15e+01   4.35e-01  -26.54   <2e-16 ***
## income       2.08e-05   4.99e-06    4.17    3e-05 ***
## balance      5.65e-03   2.27e-04   24.84   <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 2920.6 on 9999 degrees of freedom
## Residual deviance: 1579.0 on 9997 degrees of freedom
## AIC: 1585
##
## Number of Fisher Scoring iterations: 8
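# Sketch (not part of the exercise): turn the fitted coefficients into a
# predicted default probability via the inverse logit; income = 40000 and
# balance = 2000 below are purely illustrative values
b=coef(glmout)
plogis(b[1]+b[2]*40000+b[3]*2000)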
glmpro=predict(glmout,type="response")
contrasts(default)
##     Yes
## No 0
## Yes 1
glmpre=rep("No",10000)
glmpre[glmpro>.5]="Yes"
table(glmpre,default)
##       default
## glmpre   No  Yes
##    No  9629  225
##    Yes   38  108
mean(glmpre==default)
## [1] 0.97
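# Sketch (derived from the confusion matrix above): the 97% accuracy hides
# that most actual defaulters are missed
tab=table(glmpre,default)
(tab["Yes","No"]+tab["No","Yes"])/sum(tab)  # overall training error rate, about 0.026
tab["No","Yes"]/sum(tab[,"Yes"])            # share of true defaulters missed, 225/333, about 0.68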
# Validation approach
set.seed(777)
train=sample(10000,5000)
valid.set=Default[-train,]
glmout=glm(default~income+balance,data=Default,family=binomial,subset=train)
glmpro=predict(glmout,valid.set,type="response")
glmpre=rep("No",5000)
glmpre[glmpro>.5]="Yes"
mean(glmpre==valid.set$default)
## [1] 0.98
# Repeat #2
set.seed(298)
train=sample(10000,5000)
valid.set=Default[-train,]
glmout=glm(default~income+balance,data=Default,family=binomial,subset=train)
glmpro=predict(glmout,valid.set,type="response")
glmpre=rep("No",5000)
glmpre[glmpro>.5]="Yes"
mean(glmpre==valid.set$default)
## [1] 0.97
# Repeat #3
set.seed(777298)
train=sample(10000,5000)
valid.set=Default[-train,]
glmout=glm(default~income+balance,data=Default,family=binomial,subset=train)
glmpro=predict(glmout,valid.set,type="response")
glmpre=rep("No",5000)
glmpre[glmpro>.5]="Yes"
mean(glmpre==valid.set$default)
## [1] 0.97
# Add dummy variable 'student'
set.seed(777)
train=sample(10000,5000)
valid.set=Default[-train,]
glmout=glm(default~income+balance+student,data=Default,family=binomial,subset=train)
contrasts(student)
##     Yes
## No 0
## Yes 1
glmpro=predict(glmout,valid.set,type="response")
glmpre=rep("No",5000)
glmpre[glmpro>.5]="Yes"
mean(glmpre==valid.set$default)
## [1] 0.98
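The three validation runs above differ only in the seed. As a sketch (not part of the original solution), the same steps can be wrapped in a small helper function so each seed is a one-line call:

# Sketch: validation-set accuracy for a given seed, same steps as above
val.err=function(seed,formula=default~income+balance) {
  set.seed(seed)
  train=sample(10000,5000)
  valid.set=Default[-train,]
  fit=glm(formula,data=Default,family=binomial,subset=train)
  pre=ifelse(predict(fit,valid.set,type="response")>.5,"Yes","No")
  mean(pre==valid.set$default)  # validation accuracy, as reported above
}
# sapply(c(777,298,777298),val.err)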

Exercise 6

library(ISLR)
library(boot)
attach(Default)
glmout=glm(default~income+balance,data=Default,family=binomial)
summary(glmout)$coef
##              Estimate Std. Error z value  Pr(>|z|)
## (Intercept) -1.2e+01    4.3e-01   -26.5  3.0e-155
## income       2.1e-05    5.0e-06     4.2   3.0e-05
## balance      5.6e-03    2.3e-04    24.8  3.6e-136
coef(glmout)
## (Intercept)      income     balance
##    -1.2e+01     2.1e-05     5.6e-03
# Bootstrap statistic: coefficient estimates refit on the observations in 'index'
boot.fn=function(data,index)
  return(coef(glm(default~income+balance,data=data,family=binomial,subset=index)))
set.seed(298)
# boot(Default,boot.fn,1000)
#
# ORDINARY NONPARAMETRIC BOOTSTRAP
#
#
# Call:
# boot(data = Default, statistic = boot.fn, R = 1000)
#
#
# Bootstrap Statistics :
#          original        bias     std. error
# t1* -1.154047e+01 -2.227518e-02 4.364461e-01
# t2*  2.080898e-05  4.609849e-08 4.774216e-06
# t3*  5.647103e-03  1.223872e-05 2.299849e-04
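The bootstrap standard errors are very close to the formula-based ones from summary(glmout). Assuming the boot() call above has been run and stored, e.g. as boot.out=boot(Default,boot.fn,1000), the two can be compared side by side:

# Sketch: formula-based vs. bootstrap standard errors
# (assumes boot.out=boot(Default,boot.fn,1000) has been run)
# cbind(formula=summary(glmout)$coef[,2],bootstrap=apply(boot.out$t,2,sd))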

Exercise 7

library(ISLR)
attach(Weekly)
glmout=glm(Direction~Lag1+Lag2,data=Weekly,family=binomial) # full-data fit, used below with cv.glm()
glmout1=glm(Direction~Lag1+Lag2,data=Weekly,family=binomial,subset=(2:1089)) # fit leaving out observation 1
glmpre=if(predict.glm(glmout1,Weekly[1,],type="response")>0.5) "Up" else "Down"
glmpre
## [1] "Up"
Weekly[1,]$Direction
## [1] Down
## Levels: Down Up
glmpre==as.character(Weekly[1,]$Direction)
## [1] FALSE

Loop over every observation and compute the LOOCV error from the individual prediction errors. Remove the leading # to run the commands; the loop refits the model 1089 times.

# n=1089
# err=rep(0,n)
# for (i in 1:n) {
# glmouti=glm(Direction~Lag1+Lag2,data=Weekly,family=binomial,subset=(1:1089)!=i)
# glmproi=predict.glm(glmouti,Weekly[i,],type="response")
# glmprei=if (glmproi>.5) "Up" else "Down"
# err[i]=ifelse(glmprei==as.character(Weekly[i,]$Direction),0,1)
# }
# mean(err)
# [1] 0.4499541

# library(boot)
# cost <- function(r, pi = 0) mean(abs(r-pi) > 0.5)
# cv.glm(Weekly,glmout,cost, K=1089)$delta[1]
# [1] 0.4499541

Note that when calling cv.glm() on a glm() fit with the binomial family, the following cost function must be supplied (otherwise cv.glm() returns the average squared error rather than the misclassification rate):

cost <- function(r, pi = 0) mean(abs(r-pi) > 0.5)
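Here r is the vector of observed 0/1 responses and pi the corresponding fitted probabilities, so the cost is the misclassification rate at the 0.5 cut-off. As a quick sanity check (not in the original solution), applying the same cost to the full fit gives the training error rate:

# Sketch: training misclassification rate of glmout at the 0.5 cut-off
# (Direction is coded Down = 0, Up = 1)
cost(as.numeric(Direction=="Up"),fitted(glmout))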

Exercise 8

set.seed(1)
y=rnorm(100)
x=rnorm(100)
y=x-2*x^2+rnorm(100)
plot(x,y)

[Figure: scatter plot of y against x]

xy=data.frame(x,y)
set.seed(777)
library(boot)
glm.fit=glm(y~x,data=xy)
cv.err=cv.glm(xy,glm.fit)
cv.err$delta
## [1] 5.9 5.9
# LOOCV error for polynomial degrees 1 through 4
for (i in 1:4) {
  glm.fit=glm(y~poly(x,i),data=xy)
  cv.err=cv.glm(xy,glm.fit)
  print(paste("delta for i =",i,": ",cv.err$delta[1]))
}
## [1] "delta for i = 1 :  5.89097855988843"
## [1] "delta for i = 2 : 1.0865955642745"
## [1] "delta for i = 3 : 1.10258509387339"
## [1] "delta for i = 4 : 1.11477226814508"