ggplot2: Toolbox


Các loại biểu đồ cơ bản

library(ggplot2)
df <- data.frame(
x = c(3, 1, 5),
y = c(2, 4, 6),
label = c("a","b","c")
)
p <- ggplot(df, aes(x, y, label = label)) +
xlab(NULL) + ylab(NULL)
p + geom_point() + labs(title = "geom_point")

plot of chunk 7 geom_xxx

p + geom_bar(stat="identity") + 
labs(title = "geom_bar(stat=\"identity\")")

plot of chunk 7 geom_xxx

p + geom_line() + labs(title = "geom_line")

plot of chunk 7 geom_xxx

p + geom_area() + labs(title = "geom_area")

plot of chunk 7 geom_xxx

p + geom_path() + labs(title = "geom_path")

plot of chunk 7 geom_xxx

p + geom_text() + labs(title = "geom_text")

plot of chunk 7 geom_xxx

p + geom_tile() + labs(title = "geom_tile")

plot of chunk 7 geom_xxx

p + geom_polygon() + labs(title = "geom_polygon")

plot of chunk 7 geom_xxx

Các phân phối xác suất

Sử dụng histogram

# Mặc định:
qplot(depth, data=diamonds, geom="histogram")
## stat_bin: binwidth defaulted to range/30. Use 'binwidth = x' to adjust this.

plot of chunk histo

# Phóng to đoạn (55, 70), tăng mức chi tiết xuống 0.1:
qplot(depth, data=diamonds, geom="histogram", xlim=c(55, 70), binwidth=0.1)
## Warning: position_stack requires constant width: output may be incorrect

plot of chunk histo

So sánh phân phối giữa các nhóm

depth_dist <- ggplot(diamonds, aes(depth)) + xlim(58, 68)

# Tạo nhiều histogram
depth_dist +
geom_histogram(aes(y = ..density..), binwidth = 0.1) +
facet_grid(cut ~ .)
## Warning: position_stack requires constant width: output may be incorrect
## Warning: position_stack requires constant width: output may be incorrect
## Warning: position_stack requires constant width: output may be incorrect
## Warning: position_stack requires constant width: output may be incorrect
## Warning: position_stack requires constant width: output may be incorrect

plot of chunk compare dist

# Sử dụng đồ thị mật độ có điều kiện
depth_dist + geom_histogram(aes(fill = cut), binwidth = 0.1,
position = "fill")
## Warning: position_fill requires constant width: output may be incorrect

plot of chunk compare dist

# Sử dụng các đường tần suất
depth_dist + geom_freqpoly(aes(y = ..density.., colour = cut),
binwidth = 0.1)
## Warning: Removed 2 rows containing missing values (geom_path).
## Warning: Removed 2 rows containing missing values (geom_path).
## Warning: Removed 2 rows containing missing values (geom_path).
## Warning: Removed 2 rows containing missing values (geom_path).
## Warning: Removed 2 rows containing missing values (geom_path).

plot of chunk compare dist

Phân phối liên tục có điều kiện

# conditional on a discrete variable: `cut`
qplot(cut, depth, data=diamonds, geom="boxplot")

plot of chunk boxplot

# conditional on a continuous variable: `carat`
library(plyr)
qplot(carat, depth, data=diamonds, geom="boxplot",
group = round_any(carat, 0.1, floor), xlim = c(0, 3))
## Warning: position_dodge requires constant width: output may be incorrect
## Warning: Removed 2 rows containing missing values (geom_segment).
## Warning: Removed 1 rows containing missing values (geom_segment).
## Warning: Removed 2 rows containing missing values (geom_segment).
## Warning: Removed 1 rows containing missing values (geom_segment).
## Warning: Removed 2 rows containing missing values (geom_segment).
## Warning: Removed 1 rows containing missing values (geom_segment).
## Warning: Removed 2 rows containing missing values (geom_segment).
## Warning: Removed 1 rows containing missing values (geom_segment).
## Warning: Removed 2 rows containing missing values (geom_segment).
## Warning: Removed 1 rows containing missing values (geom_segment).
## Warning: Removed 2 rows containing missing values (geom_segment).
## Warning: Removed 1 rows containing missing values (geom_segment).
## Warning: Removed 2 rows containing missing values (geom_segment).
## Warning: Removed 1 rows containing missing values (geom_segment).
## Warning: Removed 2 rows containing missing values (geom_segment).
## Warning: Removed 1 rows containing missing values (geom_segment).
## Warning: Removed 2 rows containing missing values (geom_segment).
## Warning: Removed 1 rows containing missing values (geom_segment).
## Warning: Removed 2 rows containing missing values (geom_segment).
## Warning: Removed 1 rows containing missing values (geom_segment).

plot of chunk boxplot

Thêm độ nhiễu (jitter) vào phân phối

# Phân phối liên tục
qplot(class, cty, data=mpg, geom="jitter")
## Error: ggplot2 doesn't know how to deal with data of class numeric
# Phân phối rời rạc
qplot(class, drv, data=mpg, geom="jitter")
## Error: ggplot2 doesn't know how to deal with data of class numeric

Đường mật độ

qplot(depth, data=diamonds, geom="density", xlim = c(54, 70))
## Warning: Removed 38 rows containing non-finite values (stat_density).

plot of chunk density

qplot(depth, data=diamonds, geom="density", xlim = c(54, 70), 
fill = cut, alpha = I(0.2))
## Warning: Removed 37 rows containing non-finite values (stat_density).
## Warning: Removed 1 rows containing non-finite values (stat_density).

plot of chunk density

Biểu diễn các điểm trùng lặp

Làm nhỏ các điểm biểu diễn

df <- data.frame(x = rnorm(2000), y = rnorm(2000))
norm <- ggplot(df, aes(x, y))
norm + geom_point()

plot of chunk make smaller

norm + geom_point(shape = 1)

plot of chunk make smaller

norm + geom_point(shape = ".") # Pixel sized

plot of chunk make smaller

Làm mờ các điểm biểu diễn

library(scales)
norm + geom_point(colour = alpha("black", 1/3))

plot of chunk make transparency

norm + geom_point(colour = alpha("black", 1/5))

plot of chunk make transparency

norm + geom_point(colour = alpha("black", 1/10))

plot of chunk make transparency

Làm nhiễu

td <- ggplot(diamonds, aes(table, depth)) + 
xlim(50, 70) + ylim(50, 70)
# geom point
td + geom_point()
## Warning: Removed 36 rows containing missing values (geom_point).

plot of chunk make jitter

# geo jitter mặc định
td + geom_jitter()
## Warning: Removed 41 rows containing missing values (geom_point).

plot of chunk make jitter

# geo jitter theo phương ngang
jit <- position_jitter(width = 0.5)
td + geom_jitter(position = jit)
## Warning: Removed 41 rows containing missing values (geom_point).

plot of chunk make jitter

td + geom_jitter(position = jit, colour = alpha("black", 1/10))
## Warning: Removed 45 rows containing missing values (geom_point).

plot of chunk make jitter

td + geom_jitter(position = jit, colour = alpha("black", 1/50))
## Warning: Removed 39 rows containing missing values (geom_point).

plot of chunk make jitter

td + geom_jitter(position = jit, colour = alpha("black", 1/200))  
## Warning: Removed 43 rows containing missing values (geom_point).

plot of chunk make jitter

Phân nhóm

d <- ggplot(diamonds, aes(carat, price)) + xlim(1,3) +
labs(legend.position = "none")
d + stat_bin2d()

plot of chunk bin

d + stat_bin2d(bins = 10)

plot of chunk bin

d + stat_bin2d(binwidth=c(0.02, 200))

plot of chunk bin

d + stat_binhex()
## Warning: Removed 34912 rows containing missing values (stat_hexbin).

plot of chunk bin

d + stat_binhex(bins = 10)
## Warning: Removed 34912 rows containing missing values (stat_hexbin).

plot of chunk bin

d + stat_binhex(binwidth=c(0.02, 200))
## Warning: Removed 34912 rows containing missing values (stat_hexbin).

plot of chunk bin

Sử dụng các đường đồng mức mật độ

d <- ggplot(diamonds, aes(carat, price)) + xlim(1,3) + 
labs(legend.position = "none")

d + geom_point() + geom_density2d()
## Warning: Removed 34912 rows containing non-finite values (stat_density2d).
## Warning: Removed 34912 rows containing missing values (geom_point).

plot of chunk overlay contours

d + stat_density2d(geom = "point", aes(size = ..density..), 
contour = F) + scale_size_area(0.2, 1.5)
## Warning: Removed 34912 rows containing non-finite values (stat_density2d).

plot of chunk overlay contours

d + stat_density2d(geom = "tile", aes(fill = ..density..), 
contour = F)
## Warning: Removed 34912 rows containing non-finite values (stat_density2d).

plot of chunk overlay contours

last_plot() + scale_fill_gradient(limits = c(1e-5,8e-4))
## Warning: Removed 34912 rows containing non-finite values (stat_density2d).

plot of chunk overlay contours

Vẽ bản đồ

Gói maps Chỉ mới hỗ trợ bản đồ Pháp, Ý, Tân Tây Lan, Mỹ và bản đồ thế giới.

library(maps)
# Chú thích bằng điểm
data(us.cities)
big_cities <- subset(us.cities, pop > 500000)
qplot(long, lat, data = big_cities) + borders("state", size = 0.5)

plot of chunk maps

tx_cities <- subset(us.cities, country.etc == "TX")
ggplot(tx_cities, aes(long, lat)) +
borders("county", "texas", colour = "grey70") +
geom_point(colour = alpha("black", 0.5))

plot of chunk maps

# Tô màu bản đồ
states <- map_data("state")
arrests <- USArrests
names(arrests) <- tolower(names(arrests))
arrests$region <- tolower(rownames(USArrests))

choro <- merge(states, arrests, by = "region")
# Reorder the rows because order matters when drawing polygons
# and merge destroys the original ordering
choro <- choro[order(choro$order), ]
qplot(long, lat, data = choro, group = group,
fill = assault, geom = "polygon")

plot of chunk maps

qplot(long, lat, data = choro, group = group, 
fill = assault / murder, geom = "polygon")

plot of chunk maps

# Vài tùy chọn khác
ia <- map_data("county", "iowa")
mid_range <- function(x) mean(range(x, na.rm = TRUE))
centres <- ddply(ia, .(subregion),
colwise(mid_range, .(lat, long)))
ggplot(ia, aes(long, lat)) +
geom_polygon(aes(group = group),
fill = NA, colour = "grey60") +
geom_text(aes(label = subregion), data = centres,
size = 2, angle = 45)

plot of chunk maps

Độ bất định

d <- subset(diamonds, carat < 2.5 & 
rbinom(nrow(diamonds), 1, 0.2) == 1)
# carat và price sau khi log10 sẽ có quan hệ tuyến tính
d$lcarat <- log10(d$carat)
d$lprice <- log10(d$price)

# Loại bỏ quan hệ tuyến tính bằng cách sử dụng
# giá trị residual trong mô hình tuyến tính
# làm biến dự báo mới
detrend <- lm(lprice ~ lcarat, data = d)
d$lprice2 <- resid(detrend)

# Nhớ lcarat * color <=> lcarat + color + lcarat:color
# lcarat:color là interaction term giữa lcarat và color.
# Chạy mô hình với biến dự báo mới để xem xét độ bất định
# trong mô hình cũ là do nhân tố nào gây ra.
mod <- lm(lprice2 ~ lcarat * color, data = d)

library(effects)
## Loading required package: grid
## Loading required package: colorspace
##
## Attaching package: 'effects'
##
## The following object is masked from 'package:car':
##
## Prestige
effectdf <- function(...) {
suppressWarnings(as.data.frame(effect(...)))
}
color <- effectdf("color", mod)
## NOTE: color is not a high-order term in the model
both1 <- effectdf("lcarat:color", mod)

carat <- effectdf("lcarat", mod, default.levels = 50)
## NOTE: lcarat is not a high-order term in the model
both2 <- effectdf("lcarat:color", mod, default.levels = 3)

# Đồ thị tuyến tính
qplot(lcarat, lprice, data=d, colour = color)

plot of chunk uncertainly

# Đồ thị sau khi loại bỏ quan hệ tuyến tính,
# đây chính là đồ thị biểu diễn độ bất định
qplot(lcarat, lprice2, data=d, colour = color)

plot of chunk uncertainly

# Khởi tạo đồ thị biểu diễn độ bất định trong việc 
# tính toán mô hình đối với biến colour
fplot <- ggplot(mapping = aes(y = fit, ymin = lower, ymax = upper)) +
ylim(range(both2$lower, both2$upper))

# Ảnh hưởng biên của colour
fplot %+% color + aes(x = color) + geom_point() + geom_errorbar()

plot of chunk uncertainly

# Ảnh hưởng có điều kiện của colour đối với các mức carat
fplot %+% both2 +
aes(x = color, colour = lcarat, group = interaction(color, lcarat)) +
geom_errorbar() + geom_line(aes(group=lcarat)) +
scale_colour_gradient()

plot of chunk uncertainly

# Đồ thị biểu diễn độ bất định trong việc tính toán mô hình
# đối với biến carat.

# Ảnh hưởng biên của carat
fplot %+% carat + aes(x = lcarat) + geom_smooth(stat="identity")

plot of chunk uncertainly

ends <- subset(both1, lcarat == max(lcarat))
# Ảnh hưởng có điều kiện của carat với các mức colour khác nhau
fplot %+% both1 + aes(x = lcarat, colour = color) +
geom_smooth(stat="identity") +
scale_colour_hue() + labs(legend.position = "none") +
geom_text(aes(label = color, x = lcarat + 0.02), ends)

plot of chunk uncertainly

Tóm tắt các giá trị thống kê

# x rời rạc
m <- ggplot(movies, aes(year, rating))
m + stat_summary(fun.y = "median", geom = "line")

plot of chunk statistical summaries

m + stat_summary(fun.data = "median_hilow", geom = "smooth")

plot of chunk statistical summaries

m + stat_summary(fun.y = "mean", geom = "line")

plot of chunk statistical summaries

m + stat_summary(fun.data = "mean_cl_boot", geom = "smooth")

plot of chunk statistical summaries

# x liên tục
m2 <- ggplot(movies, aes(round(rating), log10(votes)))
m2 + stat_summary(fun.y = "mean", geom = "point")

plot of chunk statistical summaries

m2 + stat_summary(fun.data = "mean_cl_normal", geom = "errorbar")

plot of chunk statistical summaries

m2 + stat_summary(fun.data = "median_hilow", geom = "pointrange")

plot of chunk statistical summaries

m2 + stat_summary(fun.data = "median_hilow", geom = "crossbar")

plot of chunk statistical summaries

Các hàm tóm tắt lẻ

midm <- function(x) mean(x, trim = 0.5)
m2 +
stat_summary(aes(colour = "trimmed"), fun.y = midm,
geom = "point") +
stat_summary(aes(colour = "raw"), fun.y = mean,
geom = "point") +
scale_colour_hue("Mean")

plot of chunk individual sum

Các hàm tóm tắt đơn

iqr <- function(x, ...) {
qs <- quantile(as.numeric(x), c(0.25, 0.75), na.rm = T)
names(qs) <- c("ymin", "ymax")
qs
}
m + stat_summary(fun.data = "iqr", geom="ribbon")

plot of chunk single

Chú thích đồ thị

(unemp <- qplot(date, unemploy, data=economics, geom="line", 
xlab = "", ylab = "No. unemployed (1000s)"))

plot of chunk annotate

presidential <- presidential[-(1:3), ]

yrng <- range(economics$unemploy)
xrng <- range(economics$date)
unemp + geom_vline(aes(xintercept = as.numeric(start)),
data = presidential)

plot of chunk annotate

unemp + geom_rect(aes(NULL, NULL, xmin = start, xmax = end, 
fill = party), ymin = yrng[1], ymax = yrng[2],
data = presidential) + scale_fill_manual(values =
alpha(c("blue", "red"), 0.2))

plot of chunk annotate

last_plot() + geom_text(aes(x = start, y = yrng[1], label = name), 
data = presidential, size = 3, hjust = 0, vjust = 0)

plot of chunk annotate

caption <- paste(strwrap("Unemployment rates in the US have 
varied a lot over the years"
, 40), collapse="\n")
unemp + geom_text(aes(x, y, label = caption),
data = data.frame(x = xrng[2], y = yrng[2]),
hjust = 1, vjust = 1, size = 4)

plot of chunk annotate

highest <- subset(economics, unemploy == max(unemploy))
unemp + geom_point(data = highest,
size = 3, colour = alpha("red", 0.5))

plot of chunk annotate

Dữ liệu có trọng số

# Không có trọng số
qplot(percwhite, percbelowpoverty, data = midwest)

plot of chunk weighted data

# Trọng số dân số
qplot(percwhite, percbelowpoverty, data = midwest,
size = poptotal / 1e6) + scale_size_area("Population\n(millions)",
breaks = c(0.5, 1, 2, 4))

plot of chunk weighted data

# Trọng số khu vực
qplot(percwhite, percbelowpoverty, data = midwest, size = area) +
scale_size_area()

plot of chunk weighted data

# Vẽ đường tương quan
lm_smooth <- geom_smooth(method = lm, size = 1)
qplot(percwhite, percbelowpoverty, data = midwest) + lm_smooth

plot of chunk weighted data

qplot(percwhite, percbelowpoverty, data = midwest, 
weight = popdensity, size = popdensity) + lm_smooth

plot of chunk weighted data

# Histogram không trọng số
qplot(percbelowpoverty, data = midwest, binwidth = 1)

plot of chunk weighted data

# Histogram có trọng số
qplot(percbelowpoverty, data = midwest, weight = poptotal,
binwidth = 1) + ylab("population")

plot of chunk weighted data