ggplot2: Toolbox

Các loại biểu đồ cơ bản

library(ggplot2)
df <- data.frame(
  x = c(3, 1, 5), 
  y = c(2, 4, 6), 
  label = c("a","b","c")
)
p <- ggplot(df, aes(x, y, label = label)) + 
  xlab(NULL) + ylab(NULL)
p + geom_point() + labs(title = "geom_point")

plot of chunk 7 geom_xxx

p + geom_bar(stat="identity") + 
  labs(title = "geom_bar(stat=\"identity\")")

plot of chunk 7 geom_xxx

p + geom_line() + labs(title = "geom_line")

plot of chunk 7 geom_xxx

p + geom_area() + labs(title = "geom_area")

plot of chunk 7 geom_xxx

p + geom_path() + labs(title = "geom_path")

plot of chunk 7 geom_xxx

p + geom_text() + labs(title = "geom_text")

plot of chunk 7 geom_xxx

p + geom_tile() + labs(title = "geom_tile")

plot of chunk 7 geom_xxx

p + geom_polygon() + labs(title = "geom_polygon")

plot of chunk 7 geom_xxx

Các phân phối xác suất

Sử dụng histogram

# Mặc định:
qplot(depth, data=diamonds, geom="histogram")

## stat_bin: binwidth defaulted to range/30. Use 'binwidth = x' to adjust this.

plot of chunk histo

# Phóng to đoạn (55, 70), tăng mức chi tiết xuống 0.1:
qplot(depth, data=diamonds, geom="histogram", xlim=c(55, 70), binwidth=0.1)

## Warning: position_stack requires constant width: output may be incorrect

plot of chunk histo

So sánh phân phối giữa các nhóm

depth_dist <- ggplot(diamonds, aes(depth)) + xlim(58, 68)
 
# Tạo nhiều histogram
depth_dist + 
  geom_histogram(aes(y = ..density..), binwidth = 0.1) +
  facet_grid(cut ~ .)

## Warning: position_stack requires constant width: output may be incorrect

## Warning: position_stack requires constant width: output may be incorrect

## Warning: position_stack requires constant width: output may be incorrect

## Warning: position_stack requires constant width: output may be incorrect

## Warning: position_stack requires constant width: output may be incorrect

plot of chunk compare dist

# Sử dụng đồ thị mật độ có điều kiện
depth_dist + geom_histogram(aes(fill = cut), binwidth = 0.1, 
  position = "fill")

## Warning: position_fill requires constant width: output may be incorrect

plot of chunk compare dist

# Sử dụng các đường tần suất
depth_dist + geom_freqpoly(aes(y = ..density.., colour = cut), 
  binwidth = 0.1)

## Warning: Removed 2 rows containing missing values (geom_path).

## Warning: Removed 2 rows containing missing values (geom_path).

## Warning: Removed 2 rows containing missing values (geom_path).

## Warning: Removed 2 rows containing missing values (geom_path).

## Warning: Removed 2 rows containing missing values (geom_path).

plot of chunk compare dist

Phân phối liên tục có điều kiện

# conditional on a discrete variable: `cut`
qplot(cut, depth, data=diamonds, geom="boxplot")

plot of chunk boxplot

# conditional on a continuous variable: `carat`
library(plyr)
qplot(carat, depth, data=diamonds, geom="boxplot", 
  group = round_any(carat, 0.1, floor), xlim = c(0, 3))

## Warning: position_dodge requires constant width: output may be incorrect

## Warning: Removed 2 rows containing missing values (geom_segment).

## Warning: Removed 1 rows containing missing values (geom_segment).

## Warning: Removed 2 rows containing missing values (geom_segment).

## Warning: Removed 1 rows containing missing values (geom_segment).

## Warning: Removed 2 rows containing missing values (geom_segment).

## Warning: Removed 1 rows containing missing values (geom_segment).

## Warning: Removed 2 rows containing missing values (geom_segment).

## Warning: Removed 1 rows containing missing values (geom_segment).

## Warning: Removed 2 rows containing missing values (geom_segment).

## Warning: Removed 1 rows containing missing values (geom_segment).

## Warning: Removed 2 rows containing missing values (geom_segment).

## Warning: Removed 1 rows containing missing values (geom_segment).

## Warning: Removed 2 rows containing missing values (geom_segment).

## Warning: Removed 1 rows containing missing values (geom_segment).

## Warning: Removed 2 rows containing missing values (geom_segment).

## Warning: Removed 1 rows containing missing values (geom_segment).

## Warning: Removed 2 rows containing missing values (geom_segment).

## Warning: Removed 1 rows containing missing values (geom_segment).

## Warning: Removed 2 rows containing missing values (geom_segment).

## Warning: Removed 1 rows containing missing values (geom_segment).

plot of chunk boxplot

Thêm độ nhiễu (jitter) vào phân phối

# Phân phối liên tục
qplot(class, cty, data=mpg, geom="jitter")

## Error: ggplot2 doesn't know how to deal with data of class numeric

# Phân phối rời rạc
qplot(class, drv, data=mpg, geom="jitter")

## Error: ggplot2 doesn't know how to deal with data of class numeric

Đường mật độ

qplot(depth, data=diamonds, geom="density", xlim = c(54, 70))

## Warning: Removed 38 rows containing non-finite values (stat_density).

plot of chunk density

qplot(depth, data=diamonds, geom="density", xlim = c(54, 70), 
  fill = cut, alpha = I(0.2))

## Warning: Removed 37 rows containing non-finite values (stat_density).

## Warning: Removed 1 rows containing non-finite values (stat_density).

plot of chunk density

Biểu diễn các điểm trùng lặp

Làm nhỏ các điểm biểu diễn

df <- data.frame(x = rnorm(2000), y = rnorm(2000))
norm <- ggplot(df, aes(x, y))
norm + geom_point()

plot of chunk make smaller

norm + geom_point(shape = 1)

plot of chunk make smaller

norm + geom_point(shape = ".") # Pixel sized

plot of chunk make smaller

Làm mờ các điểm biểu diễn

library(scales)
norm + geom_point(colour = alpha("black", 1/3))

plot of chunk make transparency

norm + geom_point(colour = alpha("black", 1/5))

plot of chunk make transparency

norm + geom_point(colour = alpha("black", 1/10))

plot of chunk make transparency

Làm nhiễu

td <- ggplot(diamonds, aes(table, depth)) + 
  xlim(50, 70) + ylim(50, 70)
# geom point
td + geom_point()

## Warning: Removed 36 rows containing missing values (geom_point).

plot of chunk make jitter

# geo jitter mặc định
td + geom_jitter()

## Warning: Removed 41 rows containing missing values (geom_point).

plot of chunk make jitter

# geo jitter theo phương ngang
jit <- position_jitter(width = 0.5)
td + geom_jitter(position = jit)

## Warning: Removed 41 rows containing missing values (geom_point).

plot of chunk make jitter

td + geom_jitter(position = jit, colour = alpha("black", 1/10))

## Warning: Removed 45 rows containing missing values (geom_point).

plot of chunk make jitter

td + geom_jitter(position = jit, colour = alpha("black", 1/50))

## Warning: Removed 39 rows containing missing values (geom_point).

plot of chunk make jitter

td + geom_jitter(position = jit, colour = alpha("black", 1/200))

## Warning: Removed 43 rows containing missing values (geom_point).

plot of chunk make jitter

Phân nhóm

d <- ggplot(diamonds, aes(carat, price)) + xlim(1,3) +
  labs(legend.position = "none")
d + stat_bin2d()

plot of chunk bin

d + stat_bin2d(bins = 10)

plot of chunk bin

d + stat_bin2d(binwidth=c(0.02, 200))

plot of chunk bin

d + stat_binhex()

## Warning: Removed 34912 rows containing missing values (stat_hexbin).

plot of chunk bin

d + stat_binhex(bins = 10)

## Warning: Removed 34912 rows containing missing values (stat_hexbin).

plot of chunk bin

d + stat_binhex(binwidth=c(0.02, 200))

## Warning: Removed 34912 rows containing missing values (stat_hexbin).

plot of chunk bin

Sử dụng các đường đồng mức mật độ

d <- ggplot(diamonds, aes(carat, price)) + xlim(1,3) + 
  labs(legend.position = "none")
 
d + geom_point() + geom_density2d()

## Warning: Removed 34912 rows containing non-finite values (stat_density2d).

## Warning: Removed 34912 rows containing missing values (geom_point).

plot of chunk overlay contours

d + stat_density2d(geom = "point", aes(size = ..density..), 
 contour = F) + scale_size_area(0.2, 1.5)

## Warning: Removed 34912 rows containing non-finite values (stat_density2d).

plot of chunk overlay contours

d + stat_density2d(geom = "tile", aes(fill = ..density..), 
  contour = F)

## Warning: Removed 34912 rows containing non-finite values (stat_density2d).

plot of chunk overlay contours

last_plot() + scale_fill_gradient(limits = c(1e-5,8e-4))

## Warning: Removed 34912 rows containing non-finite values (stat_density2d).

plot of chunk overlay contours

Vẽ bản đồ

Gói maps Chỉ mới hỗ trợ bản đồ Pháp, Ý, Tân Tây Lan, Mỹ và bản đồ thế giới.

library(maps)
# Chú thích bằng điểm
data(us.cities)
big_cities <- subset(us.cities, pop > 500000)
qplot(long, lat, data = big_cities) + borders("state", size = 0.5)

plot of chunk maps

tx_cities <- subset(us.cities, country.etc == "TX")
ggplot(tx_cities, aes(long, lat)) +
  borders("county", "texas", colour = "grey70") +
  geom_point(colour = alpha("black", 0.5))

plot of chunk maps

# Tô màu bản đồ
states <- map_data("state")
arrests <- USArrests
names(arrests) <- tolower(names(arrests))
arrests$region <- tolower(rownames(USArrests))
 
choro <- merge(states, arrests, by = "region")
# Reorder the rows because order matters when drawing polygons
# and merge destroys the original ordering
choro <- choro[order(choro$order), ]
qplot(long, lat, data = choro, group = group, 
  fill = assault, geom = "polygon")

plot of chunk maps

qplot(long, lat, data = choro, group = group, 
  fill = assault / murder, geom = "polygon")

plot of chunk maps

# Vài tùy chọn khác
ia <- map_data("county", "iowa")
mid_range <- function(x) mean(range(x, na.rm = TRUE))
centres <- ddply(ia, .(subregion), 
  colwise(mid_range, .(lat, long)))
ggplot(ia, aes(long, lat)) + 
  geom_polygon(aes(group = group), 
    fill = NA, colour = "grey60") +
  geom_text(aes(label = subregion), data = centres, 
    size = 2, angle = 45)

plot of chunk maps

Độ bất định

d <- subset(diamonds, carat < 2.5 & 
  rbinom(nrow(diamonds), 1, 0.2) == 1)
# carat và price sau khi log10 sẽ có quan hệ tuyến tính
d$lcarat <- log10(d$carat)
d$lprice <- log10(d$price)
 
# Loại bỏ quan hệ tuyến tính bằng cách sử dụng 
# giá trị residual trong mô hình tuyến tính
# làm biến dự báo mới
detrend <- lm(lprice ~ lcarat, data = d)
d$lprice2 <- resid(detrend)
 
# Nhớ lcarat * color <=> lcarat + color + lcarat:color
# lcarat:color là interaction term giữa lcarat và color.
# Chạy mô hình với biến dự báo mới để xem xét độ bất định
# trong mô hình cũ là do nhân tố nào gây ra.
mod <- lm(lprice2 ~ lcarat * color, data = d)
 
library(effects)

## Loading required package: grid
## Loading required package: colorspace
## 
## Attaching package: 'effects'
## 
## The following object is masked from 'package:car':
## 
##     Prestige

effectdf <- function(...) {
  suppressWarnings(as.data.frame(effect(...)))
}
color <- effectdf("color", mod)

## NOTE: color is not a high-order term in the model

both1 <- effectdf("lcarat:color", mod)
 
carat <- effectdf("lcarat", mod, default.levels = 50)

## NOTE: lcarat is not a high-order term in the model

both2 <- effectdf("lcarat:color", mod, default.levels = 3)
 
# Đồ thị tuyến tính
qplot(lcarat, lprice, data=d, colour = color)

plot of chunk uncertainly

# Đồ thị sau khi loại bỏ quan hệ tuyến tính,
# đây chính là đồ thị biểu diễn độ bất định
qplot(lcarat, lprice2, data=d, colour = color)

plot of chunk uncertainly

# Khởi tạo đồ thị biểu diễn độ bất định trong việc 
# tính toán mô hình đối với biến colour
fplot <- ggplot(mapping = aes(y = fit, ymin = lower, ymax = upper)) + 
  ylim(range(both2$lower, both2$upper))
 
# Ảnh hưởng biên của colour
fplot %+% color + aes(x = color) + geom_point() + geom_errorbar()

plot of chunk uncertainly

# Ảnh hưởng có điều kiện của colour đối với các mức carat
fplot %+% both2 + 
  aes(x = color, colour = lcarat, group = interaction(color, lcarat)) + 
  geom_errorbar() + geom_line(aes(group=lcarat)) + 
  scale_colour_gradient()

plot of chunk uncertainly

# Đồ thị biểu diễn độ bất định trong việc tính toán mô hình
# đối với biến carat.
 
# Ảnh hưởng biên của carat
fplot %+% carat + aes(x = lcarat) + geom_smooth(stat="identity")

plot of chunk uncertainly

ends <- subset(both1, lcarat == max(lcarat))
# Ảnh hưởng có điều kiện của carat với các mức colour khác nhau
fplot %+% both1 + aes(x = lcarat, colour = color) +
 geom_smooth(stat="identity") + 
 scale_colour_hue() + labs(legend.position = "none") +
 geom_text(aes(label = color, x = lcarat + 0.02), ends)

plot of chunk uncertainly