ISL: Chapter 3


Bài 8

# Load the Auto data and fit the simple regression mpg ~ horsepower.
#
# The CSV can be fetched once with:
# fileUrl <- "http://www-bcf.usc.edu/~gareth/ISL/Auto.csv"
# download.file(fileUrl,destfile="./ISL/Auto.csv")
#
# The ISLR copy of Auto.csv writes missing horsepower values as "?" (the ISLR
# lab reads it with na.strings = "?"). Without that argument the column is not
# parsed as numeric, na.omit() has nothing to drop, and as.numeric() applied
# to the resulting factor returns level codes instead of the real values.
# stringsAsFactors = TRUE keeps `name` a factor on every R version, as in the
# recorded str() output further down.
auto <- read.csv("./ISL/Auto.csv", na.strings = "?", stringsAsFactors = TRUE)
auto <- na.omit(auto)
names(auto)
## [1] "mpg"          "cylinders"    "displacement" "horsepower"  
## [5] "weight" "acceleration" "year" "origin"
## [9] "name"
# Fail loudly if horsepower still contains a non-numeric token, rather than
# silently regressing on factor codes.
stopifnot(is.numeric(auto$horsepower))
auto$horsepower <- as.numeric(auto$horsepower)  # integer -> double only
lmout <- lm(mpg ~ horsepower, data = auto)
summary(lmout)
# NOTE(review): the recorded output below, and the other horsepower-dependent
# output in this file, was produced before this fix: it reports 397
# observations, horsepower values such as 17 35 29, and a positive slope.
# Re-run the script to refresh it.
## 
## Call:
## lm(formula = mpg ~ horsepower, data = auto)
##
## Residuals:
## Min 1Q Median 3Q Max
## -13.351 -6.004 -0.391 4.952 22.982
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 17.8076 0.7113 25.04 <2e-16 ***
## horsepower 0.1108 0.0119 9.27 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 7.1 on 395 degrees of freedom
## Multiple R-squared: 0.179, Adjusted R-squared: 0.177
## F-statistic: 86 on 1 and 395 DF, p-value: <2e-16
# Predicted mpg for a car with horsepower = 98: the point estimate, then the
# confidence interval for the mean response, then the prediction interval for
# a single new observation (both at predict()'s default level).
predict(lmout, newdata = data.frame(horsepower = 98))
##  1 
## 29
predict(lmout, newdata = data.frame(horsepower = 98), interval = "confidence")
##   fit lwr upr
## 1 29 27 30
predict(lmout, newdata = data.frame(horsepower = 98), interval = "prediction")
##   fit lwr upr
## 1 29 15 43
# Scatter of mpg against horsepower with the fitted regression line on top.
plot(auto$horsepower, auto$mpg, col = 3)
abline(lmout, col = 2, lwd = 3)

plot of chunk mpg vs horsepower

# Raw residuals against fitted values for lmout. The argument expressions are
# kept as written because plot() uses them as the default axis labels.
plot(predict(lmout), residuals(lmout))

plot of chunk mpg vs horsepower

# Studentized residuals (rstudent) against fitted values for lmout; as above,
# the argument expressions double as the default axis labels.
plot(predict(lmout), rstudent(lmout))

plot of chunk mpg vs horsepower

# Draw the lm diagnostic plots for lmout on a 2x2 grid. par() returns the
# previous settings; they are restored afterwards so that later plots in a
# plain script run are not squeezed into one cell of this grid.
old_par <- par(mfrow = c(2, 2))
plot(lmout)
par(old_par)

plot of chunk mpg vs horsepower

Bài 9

# Scatterplot matrix of every column of auto.
pairs(auto)

plot of chunk mul

# Print the structure of auto: dimensions, column types and leading values.
str(auto)
## 'data.frame':	397 obs. of  9 variables:
## $ mpg : num 18 15 18 16 17 15 14 14 14 15 ...
## $ cylinders : int 8 8 8 8 8 8 8 8 8 8 ...
## $ displacement: num 307 350 318 304 302 429 454 440 455 390 ...
## $ horsepower : num 17 35 29 29 24 42 47 46 48 40 ...
## $ weight : int 3504 3693 3436 3433 3449 4341 4354 4312 4425 3850 ...
## $ acceleration: num 12 11.5 11 12 10.5 10 9 8.5 10 8.5 ...
## $ year : int 70 70 70 70 70 70 70 70 70 70 ...
## $ origin : int 1 1 1 1 1 1 1 1 1 1 ...
## $ name : Factor w/ 304 levels "amc ambassador brougham",..: 49 36 231 14 161 141 54 223 241 2 ...
# Scatterplot matrix restricted to the first six columns
# (mpg through acceleration).
pairs(auto[, 1:6])

plot of chunk mul

# Pairwise correlation matrix of the first six columns
# (mpg through acceleration).
cor(auto[, 1:6])
##                mpg cylinders displacement horsepower weight acceleration
## mpg 1.00 -0.78 -0.80 0.42 -0.83 0.42
## cylinders -0.78 1.00 0.95 -0.55 0.90 -0.50
## displacement -0.80 0.95 1.00 -0.48 0.93 -0.54
## horsepower 0.42 -0.55 -0.48 1.00 -0.48 0.27
## weight -0.83 0.90 0.93 -0.48 1.00 -0.42
## acceleration 0.42 -0.50 -0.54 0.27 -0.42 1.00
# Multiple regression of mpg on every other column of auto except `name`.
lmout2 <- lm(mpg ~ . - name, data = auto)
summary(lmout2)
## 
## Call:
## lm(formula = mpg ~ . - name, data = auto)
##
## Residuals:
## Min 1Q Median 3Q Max
## -9.629 -2.034 -0.046 1.801 13.010
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -21.28440 4.25878 -5.00 8.8e-07 ***
## cylinders -0.29265 0.33819 -0.87 0.387
## displacement 0.01603 0.00728 2.20 0.028 *
## horsepower 0.00794 0.00681 1.17 0.244
## weight -0.00687 0.00058 -11.85 < 2e-16 ***
## acceleration 0.15391 0.07750 1.99 0.048 *
## year 0.77344 0.04939 15.66 < 2e-16 ***
## origin 1.34644 0.26907 5.00 8.5e-07 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 3.3 on 389 degrees of freedom
## Multiple R-squared: 0.822, Adjusted R-squared: 0.819
## F-statistic: 257 on 7 and 389 DF, p-value: <2e-16
# Draw the lm diagnostic plots for lmout2 on a 2x2 grid, then restore the
# previous graphics settings (returned by par()) so that later plots in a
# plain script run use the full device again.
old_par <- par(mfrow = c(2, 2))
plot(lmout2)
par(old_par)

plot of chunk mul

Bài 10

# Carseats (from the ISLR package): model Sales on Price, Urban and US.
library(ISLR)
names(Carseats)
##  [1] "Sales"       "CompPrice"   "Income"      "Advertising" "Population" 
## [6] "Price" "ShelveLoc" "Age" "Education" "Urban"
## [11] "US"
# The data frame is passed explicitly (data = Carseats) instead of being
# attach()-ed. In the original run attach(Carseats) reported that all eleven
# columns were "masked from Carseats (pos = 3)", i.e. the data was already on
# the search path from an earlier attach. Explicit data arguments avoid that
# masking and keep the column names out of the global lookup.
lmsales <- lm(Sales ~ Price + Urban + US, data = Carseats)
summary(lmsales)
## 
## Call:
## lm(formula = Sales ~ Price + Urban + US, data = Carseats)
##
## Residuals:
## Min 1Q Median 3Q Max
## -6.921 -1.622 -0.056 1.579 7.058
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 13.04347 0.65101 20.04 < 2e-16 ***
## Price -0.05446 0.00524 -10.39 < 2e-16 ***
## UrbanYes -0.02192 0.27165 -0.08 0.94
## USYes 1.20057 0.25904 4.63 4.9e-06 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 2.5 on 396 degrees of freedom
## Multiple R-squared: 0.239, Adjusted R-squared: 0.234
## F-statistic: 41.5 on 3 and 396 DF, p-value: <2e-16
# Dummy coding used for Urban: "No" is the baseline (0), "Yes" is 1.
contrasts(Carseats$Urban)
##     Yes
## No 0
## Yes 1
# Urban on its own, to confirm it carries no signal for Sales.
summary(lm(Sales ~ Urban, data = Carseats))
## 
## Call:
## lm(formula = Sales ~ Urban, data = Carseats)
##
## Residuals:
## Min 1Q Median 3Q Max
## -7.564 -2.111 -0.011 1.791 8.802
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 7.5636 0.2603 29.06 <2e-16 ***
## UrbanYes -0.0954 0.3100 -0.31 0.76
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 2.8 on 398 degrees of freedom
## Multiple R-squared: 0.000238, Adjusted R-squared: -0.00227
## F-statistic: 0.0947 on 1 and 398 DF, p-value: 0.759
# Reduced model without the non-significant Urban term.
lmsales2 <- lm(Sales ~ Price + US, data = Carseats)
summary(lmsales2)
## 
## Call:
## lm(formula = Sales ~ Price + US, data = Carseats)
##
## Residuals:
## Min 1Q Median 3Q Max
## -6.927 -1.629 -0.057 1.577 7.052
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 13.03079 0.63098 20.65 < 2e-16 ***
## Price -0.05448 0.00523 -10.42 < 2e-16 ***
## USYes 1.19964 0.25846 4.64 4.7e-06 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 2.5 on 397 degrees of freedom
## Multiple R-squared: 0.239, Adjusted R-squared: 0.235
## F-statistic: 62.4 on 2 and 397 DF, p-value: <2e-16
# Coefficient table of the reduced model as a plain numeric matrix.
coef(summary(lmsales2))
##             Estimate Std. Error t value Pr(>|t|)
## (Intercept) 13.031 0.6310 20.7 7.0e-65
## Price -0.054 0.0052 -10.4 1.3e-22
## USYes 1.200 0.2585 4.6 4.7e-06

Bài 11

# Simulate 100 points with y = 2 * x + standard-normal noise, then regress
# y on x with the intercept removed.
set.seed(1)
x <- rnorm(n = 100)
y <- 2 * x + rnorm(n = 100)
# "- 1" drops the intercept; lm(y ~ x + 0) is an equivalent spelling.
lmyx <- lm(y ~ x - 1)
summary(lmyx)
## 
## Call:
## lm(formula = y ~ x - 1)
##
## Residuals:
## Min 1Q Median 3Q Max
## -1.915 -0.647 -0.177 0.506 2.311
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## x 1.994 0.106 18.7 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.96 on 99 degrees of freedom
## Multiple R-squared: 0.78, Adjusted R-squared: 0.778
## F-statistic: 351 on 1 and 99 DF, p-value: <2e-16
# Reverse regression: x on y, again without an intercept.
lmxy <- lm(x ~ y - 1)
summary(lmxy)
## 
## Call:
## lm(formula = x ~ y - 1)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.870 -0.237 0.103 0.286 0.894
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## y 0.3911 0.0209 18.7 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.42 on 99 degrees of freedom
## Multiple R-squared: 0.78, Adjusted R-squared: 0.778
## F-statistic: 351 on 1 and 99 DF, p-value: <2e-16
# Same reverse regression (x on y), this time with an intercept.
summary(lm(x ~ y))
## 
## Call:
## lm(formula = x ~ y)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.9085 -0.2810 0.0627 0.2457 0.8574
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 0.0388 0.0427 0.91 0.37
## y 0.3894 0.0210 18.56 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.42 on 98 degrees of freedom
## Multiple R-squared: 0.778, Adjusted R-squared: 0.776
## F-statistic: 344 on 1 and 98 DF, p-value: <2e-16
# Forward regression (y on x) with an intercept.
summary(lm(y ~ x))
## 
## Call:
## lm(formula = y ~ x)
##
## Residuals:
## Min 1Q Median 3Q Max
## -1.877 -0.614 -0.140 0.539 2.346
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -0.0377 0.0970 -0.39 0.7
## x 1.9989 0.1077 18.56 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.96 on 98 degrees of freedom
## Multiple R-squared: 0.778, Adjusted R-squared: 0.776
## F-statistic: 344 on 1 and 98 DF, p-value: <2e-16

Bài 13

# Simulate 100 points from y = -1 + 0.5 * x + eps, where x is standard normal
# and eps is normal noise with sd = 0.25, then fit y on x by least squares.
set.seed(28)
x <- rnorm(n = 100)
eps <- rnorm(n = 100, sd = 0.25)
y <- -1 + 0.5 * x + eps
lm13 <- lm(y ~ x)
summary(lm13)
## 
## Call:
## lm(formula = y ~ x)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.6328 -0.1530 0.0172 0.1745 0.5443
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -1.0563 0.0261 -40.5 <2e-16 ***
## x 0.4644 0.0259 17.9 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.26 on 98 degrees of freedom
## Multiple R-squared: 0.766, Adjusted R-squared: 0.764
## F-statistic: 321 on 1 and 98 DF, p-value: <2e-16
# Scatter of the simulated data with two lines on top: the fitted
# least-squares line (col = 2) and the line with intercept -1 and slope 0.5
# (col = 3). The legend is anchored at data coordinates (-2.5, 0.3).
plot(x, y)
abline(lm13, lwd = 2, col = 2)
abline(a = -1, b = 0.5, lwd = 2, col = 3)
legend(
  x = -2.5, y = 0.3,
  legend = c("Least square lines", "Population regression line"),
  col = c(2, 3), lwd = 2
)

plot of chunk noise

# Add a quadratic term and compare the two nested fits with an F-test.
# The larger model is listed first, so the table reports a negative Df.
lmx2 <- lm(y ~ x + I(x^2))
anova(lmx2, lm13)
## Analysis of Variance Table
##
## Model 1: y ~ x + I(x^2)
## Model 2: y ~ x
## Res.Df RSS Df Sum of Sq F Pr(>F)
## 1 97 6.65
## 2 98 6.66 -1 -0.00114 0.02 0.9

Bài 14

# Simulate two correlated predictors: x1 is uniform on [0, 1], and x2 is half
# of x1 plus small normal noise. The response is y = 2 + 2 * x1 + 0.3 * x2
# plus standard-normal noise.
set.seed(1)
x1 <- runif(n = 100)
x2 <- 0.5 * x1 + rnorm(n = 100) / 10
y <- 2 + 2 * x1 + 0.3 * x2 + rnorm(n = 100)
# Correlation between the two predictors, followed by a scatterplot matrix.
cor(x1, x2)
## [1] 0.84
pairs(data.frame(x1, x2, y), col = 2)

plot of chunk collinearity

# Regress y on both correlated predictors at once.
summary(lm(y ~ x1 + x2))
## 
## Call:
## lm(formula = y ~ x1 + x2)
##
## Residuals:
## Min 1Q Median 3Q Max
## -2.8311 -0.7273 -0.0537 0.6338 2.3359
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 2.130 0.232 9.19 7.6e-15 ***
## x1 1.440 0.721 2.00 0.049 *
## x2 1.010 1.134 0.89 0.375
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 1.1 on 97 degrees of freedom
## Multiple R-squared: 0.209, Adjusted R-squared: 0.193
## F-statistic: 12.8 on 2 and 97 DF, p-value: 1.16e-05
# Regress y on x1 alone.
summary(lm(y ~ x1))
## 
## Call:
## lm(formula = y ~ x1)
##
## Residuals:
## Min 1Q Median 3Q Max
## -2.8950 -0.6687 -0.0779 0.5922 2.4556
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 2.112 0.231 9.15 8.3e-15 ***
## x1 1.976 0.396 4.99 2.7e-06 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 1.1 on 98 degrees of freedom
## Multiple R-squared: 0.202, Adjusted R-squared: 0.194
## F-statistic: 24.9 on 1 and 98 DF, p-value: 2.66e-06
# Regress y on x2 alone.
summary(lm(y ~ x2))
## 
## Call:
## lm(formula = y ~ x2)
##
## Residuals:
## Min 1Q Median 3Q Max
## -2.627 -0.752 -0.036 0.724 2.449
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 2.390 0.195 12.26 < 2e-16 ***
## x2 2.900 0.633 4.58 1.4e-05 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 1.1 on 98 degrees of freedom
## Multiple R-squared: 0.176, Adjusted R-squared: 0.168
## F-statistic: 21 on 1 and 98 DF, p-value: 1.37e-05
# Append one extra observation (x1 = 0.1, x2 = 0.8, y = 6) to the simulated
# data and refit the two-predictor model. Running this block again appends
# the point again.
x1 <- c(x1, 0.1)
x2 <- c(x2, 0.8)
y <- c(y, 6)
summary(lm(y ~ x1 + x2))
## 
## Call:
## lm(formula = y ~ x1 + x2)
##
## Residuals:
## Min 1Q Median 3Q Max
## -2.7335 -0.6932 -0.0526 0.6638 2.3062
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 2.227 0.231 9.62 7.9e-16 ***
## x1 0.539 0.592 0.91 0.3646
## x2 2.515 0.898 2.80 0.0061 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 1.1 on 98 degrees of freedom
## Multiple R-squared: 0.219, Adjusted R-squared: 0.203
## F-statistic: 13.7 on 2 and 98 DF, p-value: 5.56e-06
# Refit y on x1 alone using the current x1 and y.
summary(lm(y ~ x1))
## 
## Call:
## lm(formula = y ~ x1)
##
## Residuals:
## Min 1Q Median 3Q Max
## -2.890 -0.656 -0.091 0.568 3.567
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 2.257 0.239 9.44 1.8e-15 ***
## x1 1.766 0.412 4.28 4.3e-05 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 1.1 on 99 degrees of freedom
## Multiple R-squared: 0.156, Adjusted R-squared: 0.148
## F-statistic: 18.3 on 1 and 99 DF, p-value: 4.29e-05
# Refit y on x2 alone using the current x2 and y.
summary(lm(y ~ x2))
## 
## Call:
## lm(formula = y ~ x2)
##
## Residuals:
## Min 1Q Median 3Q Max
## -2.647 -0.710 -0.069 0.727 2.381
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 2.345 0.191 12.26 < 2e-16 ***
## x2 3.119 0.604 5.16 1.3e-06 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 1.1 on 99 degrees of freedom
## Multiple R-squared: 0.212, Adjusted R-squared: 0.204
## F-statistic: 26.7 on 1 and 99 DF, p-value: 1.25e-06
# Diagnostic plots for the three fits, each set drawn on a 2x2 grid. The grid
# must stay active across all three plot() calls, so the previous graphics
# settings (returned by par()) are restored only after the last one.
old_par <- par(mfrow = c(2, 2))
plot(lm(y ~ x1 + x2))

plot of chunk collinearity

plot(lm(y ~ x1))

plot of chunk collinearity

plot(lm(y ~ x2))
par(old_par)

plot of chunk collinearity