写在前面
之前曾抱怨过complexheatmap太复杂了,现在我想说是我太菜了。折腾了一圈complexheatmap后,发现complexheatmap真的是极致全面的一个热图包了。
TidyTuesday
如果你想入门R或者python,但是没有数据给你折腾而发愁。那么我推荐你去看看TidyTueday。点这里
每周二一个新的数据包,覆盖各种类型。比如今天我们折腾的数据就是来自TidyTuesday。想要获取数据非常简单,只需要安装TidyTuesday的包。
install.packages("tidytuesdayR")
首先读入需要的包。
#Loading necessary packages
library(tidytuesdayR)
library(tidyverse)
library(DataExplorer) # easy way for data exploring
然后
# Import data from tidytuesday
tuesdata <- tidytuesdayR::tt_load('2020-09-01')
# or
tuesdata <- tidytuesdayR::tt_load(2020, week = 36)
--- Compiling #TidyTuesday Information for 2020-09-01 ----
--- There are 5 files available ---
--- Starting Download ---
Downloading file 1 of 5: `arable_land_pin.csv`
Downloading file 2 of 5: `cereal_crop_yield_vs_fertilizer_application.csv`
Downloading file 3 of 5: `cereal_yields_vs_tractor_inputs_in_agriculture.csv`
Downloading file 4 of 5: `key_crop_yields.csv`
Downloading file 5 of 5: `land_use_vs_yield_change_in_cereal_production.csv`
--- Download complete ---
当然,具体请考虑自己的网速。因为数据保存在github上……
数据可视化
key_crop_yields <- tuesdata$key_crop_yields
key_crop_yields %>%
filter(Code != ""& Entity != "World") %>%
select(-(2)) %>%
create_report(
config = configure_report(
add_plot_bar = FALSE
)
)
这次使用DataExplorer自动生成报告,当然你可以通过函数查看更具体的内容。
key_crop_yields %>%
filter(
Entity %in% c(
"Japan",
"China",
"United States",
"Brazil",
"Indonesia",
"India",
"Pakistan",
"Nigeria",
"Bangladesh",
"Russia",
"Mexico"
)
) %>%
select(-(2)) %>%
mutate(Year = as.factor(Year),
Entity = as.factor(Entity)) %>% plot_boxplot(by = c("Year"))
可可的产量很有意思啊。
或者
key_crop_yields %>%
drop_na() %>%
filter(Code != "" & Entity != "World" & Year == 2018) %>%
select(-(1:3)) %>%
plot_correlation(type = "c")
当然,也能用ggplot2
key_crop_yields %>%
filter(
Entity %in% c(
"Japan",
"China",
"United States",
"Brazil",
"Indonesia",
"India",
"Pakistan",
"Nigeria",
"Bangladesh",
"Russia",
"Mexico"
)
) %>%
ggplot(., aes(x = Year, y = `Wheat (tonnes per hectare)`, color = Entity)) +
geom_point(size = .1) +
geom_smooth(method = 'lm') +
theme_bw()
尼日利亚的小麦产量居然是下降的,听说他们主粮是一种豆子。中国小麦公顷产增长超级快,那么中国小麦公顷产达到世界前列了么?一张热图告诉你。
注:这里选择的是人口前11的国家。
世界小麦公顷产量热图
hmmm 看来不是…… 什么你问热图怎么做的?complexheatmap 啊!
代码具体参考kevinblighe/E-MTAB-6141: Data from Lewis, Barnes, Blighe et al., Cell Rep. 2019 Aug 27; 28(9): 2455–2470.e5. (github.com)
#载入包
require(RColorBrewer)
require(ComplexHeatmap)
require(circlize)
require(cluster)
require(countrycode)
然后整理一下数据
set.seed(2021)
yield_heatmap <- key_crop_yields %>%
filter(!is.na(Code)) %>%
select(Entity, Year, `Wheat (tonnes per hectare)`) %>%
spread(Year, `Wheat (tonnes per hectare)`) %>%
filter(Entity != "World") %>%
column_to_rownames(var = "Entity") %>%
drop_na()
key_crop_yields %>%
filter(str_detect(Entity,"Sudan")) %>%
select(Entity, Year, `Wheat (tonnes per hectare)`) %>%
drop_na() %>%
mutate(Entity = "Sudan") %>%
arrange(Year) %>%
spread(Year, `Wheat (tonnes per hectare)`) %>%
column_to_rownames(var = "Entity") %>%
bind_rows(yield_heatmap) -> heat
上面做了两件事,一个是把数据里各种大洲和世界数据去掉了,另一个就是把苏丹数据跟前苏丹的数据合并了。
注:这里指的是Entity不是Country,防杠还是去掉了一些有争议的地区。
然后joint一下地理信息,后来发现起始直接通过Code就能获得准确信息了……以后说吧。
rownames(heat) %>% data.frame(country = .) -> df
df$Continent <- countrycode(sourcevar = df[, "country"],
origin = "country.name",
destination = "continent")
接下来是定义五大洲板块的色块注释。
ann <- data.frame(Continent = df$Continent,
row.names = df$country,
stringsAsFactors = FALSE)
colours <- list(
Continent = c('Africa' = '#2171B5', 'Asia' = '#6A51A3', 'Europe' = 'red2', 'Americas' = 'green3', 'Oceania' = 'orange')
)
colAnn <- HeatmapAnnotation(
df = ann,
which = 'row', # 'col' (samples) or 'row' (gene) annotation?
col = colours,
annotation_height = 0.6,
annotation_width = unit(1, 'cm'),
gap = unit(1, 'mm'),
annotation_legend_param = list(
Continent = list(
nrow = 5,
title = 'Continent',
title_position = 'topcenter',
legend_direction = 'vertical',
title_gp = gpar(fontsize = 12, fontface = 'bold'),
labels_gp = gpar(fontsize = 12, fontface = 'bold'))
)
)
顶部的箱线图注释
pick.col <- brewer.pal(5, 'Greens')
boxplotCol <- HeatmapAnnotation(
Average = anno_boxplot(
heat,
border = FALSE,
gp = gpar(fill = colorRampPalette(pick.col)(length(1:58))),
pch = '.',
size = unit(2, 'mm'),
axis = TRUE,
axis_param = list(
gp = gpar(fontsize = 12),
side = 'left')),
annotation_width = unit(c(2.0), 'cm'),
which = 'col')
然后是顶部线图的注释
heat_anno <- HeatmapAnnotation(
Average = anno_lines(
apply(heat, 2, mean),
border = TRUE,
gp = gpar(col = "red2"),
size = unit(2, 'mm'),
axis = TRUE,
axis_param = list(
gp = gpar(fontsize = 10),
side = 'left')),
Distribution = anno_boxplot(
heat,
border = FALSE,
gp = gpar(fill = colorRampPalette(pick.col)(length(1:58))),
pch = '.',
size = unit(2, 'mm'),
axis = TRUE,
axis_param = list(
gp = gpar(fontsize = 12),
side = 'left')),
annotation_width = unit(c(2.0), 'cm'),
which = 'col')
当然你也可以换成柱状图
barplotCol <- HeatmapAnnotation(
Average = anno_barplot(
apply(heat, 2, mean),
border = TRUE,
gp = gpar(fill = colorRampPalette(pick.col)(length(1:58))),
size = unit(2, 'mm'),
axis = TRUE,
axis_param = list(
gp = gpar(fontsize = 12),
side = 'left')),
annotation_width = unit(c(2.0), 'cm'),
which = 'col')
然后开始画图热图吧
说个题外话,如果你对魔改complexheatmap有兴趣,强烈建议看看这儿
ComplexHeatmap Complete Reference (jokergoo.github.io)
虽然看的人想喊师傅别念了别念了……
myCol <- colorRampPalette(c('dodgerblue', 'black', 'yellow'))(100) #三色
myBreaks <- seq(1, 8, length.out = 100) #设置色彩范围
pamClusters <- cluster::pam(heat, k = 5) #设置聚类数
Heatmap(
heat,
name = 'Wheat
(tonnes per hectare)',
col = colorRamp2(myBreaks, myCol), #这个颜色设定值得学习,做出来很高级
row_split = factor(pamClusters$clustering, levels = c(5,4,3,1,2)), #聚类排序,
cluster_row_slices = FALSE, #不解释,自己看manual
heatmap_legend_param = list( #设置标签
color_bar = 'continuous',
legend_direction = 'vertical',
legend_width = unit(8, 'cm'),
legend_height = unit(5.0, 'cm'),
title_position = 'topcenter',
title_gp = gpar(fontsize = 12, fontface = 'bold'),
labels_gp = gpar(fontsize = 12, fontface = 'bold')
),
# row parameters
cluster_rows = TRUE, #聚类
show_row_dend = TRUE, #显示聚类
row_title = "Cluster",
row_title_gp = gpar(fontsize = 10, fontface = 'bold'), #字号,粗体
row_title_rot = 90,
show_row_names = TRUE,
row_names_gp = gpar(fontsize = 10, fontface = 'bold'),
row_names_side = 'right',
row_dend_width = unit(25, 'mm'),
# column parameters
cluster_columns = FALSE,
column_title = 'Year',
column_title_side = 'bottom',
column_title_gp = gpar(fontsize = 10, fontface = 'bold'),
column_title_rot = 0,
show_column_names = TRUE,
column_names_gp = gpar(fontsize = 10, fontface = 'bold'),
column_names_max_height = unit(10, 'cm'),
column_dend_height = unit(25, 'mm'),
# cluster methods for rows and columns
clustering_distance_rows = function(m) dist(m),
row_dend_reorder = T,
clustering_method_rows = 'ward.D2',
top_annotation = heat_anno,
right_annotation = colAnn
) -> heat_map
heat_map
是不是很简单呢!做完我都不记得那些参数是干嘛的了……
那么有没有简单的办法呢!有个低配版,效果嘛……
yield_mean <- apply(heat, 2, mean) %>% data.frame(average = .) %>% mutate(year = rownames(.))
heat %>%
mutate(country = rownames(.)) %>%
left_join(., df, by = "country") %>%
gather(year, yield, -country, -Continent ) %>%
left_join(., yield_mean, by = "year") %>%
as.tibble() %>%
tidyHeatmap::heatmap(country, year, yield,
cluster_columns = FALSE,
.scale = "none",
transform = NULL) %>%
tidyHeatmap::add_tile(Continent) %>%
tidyHeatmap::add_bar(average)
怎么说呢,又不是不能用!