我正在尝试使用 Plotly via R 制作旭日图。我正在努力解决层次结构所需的数据模型,无论是概念化它的工作原理,还是看看是否有任何简单的方法来转换常规数据框,其中的列代表不同的层次级别,转换为所需的格式。
我查看了 R 中的绘图旭日图示例,例如,here,并查看了参考页,但没有完全获得数据格式化的模型。
# Create some fake data - say ownership and land use data with acreage
df <- data.frame(ownership=c(rep("private", 3), rep("public",3),rep("mixed", 3)),
landuse=c(rep(c("residential", "recreation", "commercial"),3)),
acres=c(108,143,102, 300,320,500, 37,58,90))
# Just try some quick pie charts of acreage by landuse and ownership
plot_ly(data=df, labels= ~landuse, values= ~acres, type='pie')
plot_ly(data=df, labels= ~ownership, values= ~acres, type='pie')
# This doesn't render anything... not that I'd expect it to given the data format doesn't seem to match what's needed,
# but this is what I'd intuitively expect to work
plot_ly(data=df, labels= ~landuse, parents = ~ownership, values= ~acres, type='sunburst')
鉴于上面的示例代码或类似代码,了解如何从数据 (
df
) 转换为绘图旭日图所需的格式会很有帮助。
你是绝对正确的,与plotly的R API的其余直观用法相比,为旭日(或树状图)图表准备数据是相当烦人的。
我遇到了同样的问题,并根据
library(data.table)
编写了一个函数来准备数据,接受两种不同的 data.frame
输入格式。
使用与您的结构类似的数据生成旭日图所需的格式可以在此处在具有重复标签的旭日图部分下看到。
对于您的示例,它应该如下所示:
labels values parents ids
1: total 1658 <NA> total
2: private 353 total total - private
3: public 1120 total total - public
4: mixed 185 total total - mixed
5: residential 108 total - private total - private - residential
6: recreation 143 total - private total - private - recreation
7: commercial 102 total - private total - private - commercial
8: residential 300 total - public total - public - residential
9: recreation 320 total - public total - public - recreation
10: commercial 500 total - public total - public - commercial
11: residential 37 total - mixed total - mixed - residential
12: recreation 58 total - mixed total - mixed - recreation
13: commercial 90 total - mixed total - mixed - commercial
这是到达那里的代码:
library(data.table)
library(plotly)
DF <- data.table(ownership=c(rep("private", 3), rep("public",3),rep("mixed", 3)),
landuse=c(rep(c("residential", "recreation", "commercial"),3)),
acres=c(108, 143, 102, 300, 320, 500, 37, 58, 90))
as.sunburstDF <- function(DF, value_column = NULL, add_root = FALSE){
require(data.table)
colNamesDF <- names(DF)
if(is.data.table(DF)){
DT <- copy(DF)
} else {
DT <- data.table(DF, stringsAsFactors = FALSE)
}
if(add_root){
DT[, root := "Total"]
}
colNamesDT <- names(DT)
hierarchy_columns <- setdiff(colNamesDT, value_column)
DT[, (hierarchy_columns) := lapply(.SD, as.factor), .SDcols = hierarchy_columns]
if(is.null(value_column) && add_root){
setcolorder(DT, c("root", colNamesDF))
} else if(!is.null(value_column) && !add_root) {
setnames(DT, value_column, "values", skip_absent=TRUE)
setcolorder(DT, c(setdiff(colNamesDF, value_column), "values"))
} else if(!is.null(value_column) && add_root) {
setnames(DT, value_column, "values", skip_absent=TRUE)
setcolorder(DT, c("root", setdiff(colNamesDF, value_column), "values"))
}
hierarchyList <- list()
for(i in seq_along(hierarchy_columns)){
current_columns <- colNamesDT[1:i]
if(is.null(value_column)){
currentDT <- unique(DT[, ..current_columns][, values := .N, by = current_columns], by = current_columns)
} else {
currentDT <- DT[, lapply(.SD, sum, na.rm = TRUE), by=current_columns, .SDcols = "values"]
}
setnames(currentDT, length(current_columns), "labels")
hierarchyList[[i]] <- currentDT
}
hierarchyDT <- rbindlist(hierarchyList, use.names = TRUE, fill = TRUE)
parent_columns <- setdiff(names(hierarchyDT), c("labels", "values", value_column))
hierarchyDT[, parents := apply(.SD, 1, function(x){fifelse(all(is.na(x)), yes = NA_character_, no = paste(x[!is.na(x)], sep = ":", collapse = " - "))}), .SDcols = parent_columns]
hierarchyDT[, ids := apply(.SD, 1, function(x){paste(x[!is.na(x)], collapse = " - ")}), .SDcols = c("parents", "labels")]
hierarchyDT[, c(parent_columns) := NULL]
return(hierarchyDT)
}
sunburstDF <- as.sunburstDF(DF, value_column = "acres", add_root = TRUE)
plot_ly(data = sunburstDF, ids = ~ids, labels= ~labels, parents = ~parents, values= ~values, type='sunburst', branchvalues = 'total')
这里是函数接受的第二种
data.frame
格式的示例(value_column = NULL
,因为它是根据数据计算出来的):
DF2 <- data.frame(sample(LETTERS[1:3], 100, replace = TRUE),
sample(LETTERS[4:6], 100, replace = TRUE),
sample(LETTERS[7:9], 100, replace = TRUE),
sample(LETTERS[10:12], 100, replace = TRUE),
sample(LETTERS[13:15], 100, replace = TRUE),
stringsAsFactors = FALSE)
plot_ly(data = as.sunburstDF(DF2, add_root = TRUE), ids = ~ids, labels= ~labels, parents = ~parents, values= ~values, type='sunburst', branchvalues = 'total')
另请参阅库(sunburstR)作为替代方案。
编辑: 添加了关于来自
count_to_sunburst()
的基于 dplyr 的 library(plotme)
函数的基准(见下文),在我的系统上,该基准比 data.table
版本慢大约 5 倍。
Unit: milliseconds
expr min lq mean median uq max neval
plotme 50.4618 53.09425 60.92404 55.37815 63.62315 122.3842 100
ismirsehregal 8.6553 10.28870 12.63881 11.53760 12.26620 108.2025 100
重现基准测试的代码:
# devtools::install_github("yogevherz/plotme")
library(microbenchmark)
library(plotme)
library(dplyr)
library(data.table)
library(plotly)
DF <- data.frame(ownership=c(rep("private", 3), rep("public",3),rep("mixed", 3)),
landuse=c(rep(c("residential", "recreation", "commercial"),3)),
acres=c(108, 143, 102, 300, 320, 500, 37, 58, 90))
as.sunburstDF <- function(DF, value_column = NULL, add_root = FALSE){
require(data.table)
colNamesDF <- names(DF)
if(is.data.table(DF)){
DT <- copy(DF)
} else {
DT <- data.table(DF, stringsAsFactors = FALSE)
}
if(add_root){
DT[, root := "Total"]
}
colNamesDT <- names(DT)
hierarchy_columns <- setdiff(colNamesDT, value_column)
DT[, (hierarchy_columns) := lapply(.SD, as.factor), .SDcols = hierarchy_columns]
if(is.null(value_column) && add_root){
setcolorder(DT, c("root", colNamesDF))
} else if(!is.null(value_column) && !add_root) {
setnames(DT, value_column, "values", skip_absent=TRUE)
setcolorder(DT, c(setdiff(colNamesDF, value_column), "values"))
} else if(!is.null(value_column) && add_root) {
setnames(DT, value_column, "values", skip_absent=TRUE)
setcolorder(DT, c("root", setdiff(colNamesDF, value_column), "values"))
}
hierarchyList <- list()
for(i in seq_along(hierarchy_columns)){
current_columns <- colNamesDT[1:i]
if(is.null(value_column)){
currentDT <- unique(DT[, ..current_columns][, values := .N, by = current_columns], by = current_columns)
} else {
currentDT <- DT[, lapply(.SD, sum, na.rm = TRUE), by=current_columns, .SDcols = "values"]
}
setnames(currentDT, length(current_columns), "labels")
hierarchyList[[i]] <- currentDT
}
hierarchyDT <- rbindlist(hierarchyList, use.names = TRUE, fill = TRUE)
parent_columns <- setdiff(names(hierarchyDT), c("labels", "values", value_column))
hierarchyDT[, parents := apply(.SD, 1, function(x){fifelse(all(is.na(x)), yes = NA_character_, no = paste(x[!is.na(x)], sep = ":", collapse = " - "))}), .SDcols = parent_columns]
hierarchyDT[, ids := apply(.SD, 1, function(x){paste(x[!is.na(x)], collapse = " - ")}), .SDcols = c("parents", "labels")]
hierarchyDT[, c(parent_columns) := NULL]
return(hierarchyDT)
}
microbenchmark(plotme = {
DF %>%
rename(n = acres) %>%
count_to_sunburst()
}, ismirsehregal = {
plot_ly(data = as.sunburstDF(DF, value_column = "acres", add_root = TRUE), ids = ~ids, labels= ~labels, parents = ~parents, values= ~values, type='sunburst', branchvalues = 'total')
})
有专门用于此任务的
plotme
包:
library(plotme)
library(dplyr)
df %>%
rename(n = acres) %>%
count_to_sunburst()
要安装软件包,请运行:
devtools::install_github("yogevherz/plotme")
有关包装的更多信息这里。
我已经尝试了 ismirsehregal 上面建议的方法,最终将数据帧转换为旭日图的适当格式真是太棒了。 然而,正如您在创建的图中所看到的,存在不应该存在的间隙。 这是输入数据:
labels values parents ids
1: Total 2347 <NA> Total
2: External User 1264 Total Total - External User
3: UHB 1083 Total Total - UHB
4: ARI 3 Total - External User Total - External User - ARI
5: BCH 223 Total - External User Total - External User - BCH
6: Castle Vale RU 2 Total - External User Total - External User - Castle Vale RU
7: City Hospital Birmingham 150 Total - External User Total - External User - City Hospital Birmingham
8: DRI 1 Total - External User Total - External User - DRI
9: GP 7 Total - External User Total - External User - GP
10: HCH WVT 13 Total - External User Total - External User - HCH WVT
11: Jersey Path lab 3 Total - External User Total - External User - Jersey Path lab
12: LCH 1 Total - External User Total - External User - LCH
13: PCH 1 Total - External User Total - External User - PCH
14: PEH 33 Total - External User Total - External User - PEH
15: PHB 25 Total - External User Total - External User - PHB
16: Priory Hospital 1 Total - External User Total - External User - Priory Hospital
17: RHH 10 Total - External User Total - External User - RHH
18: RSH 20 Total - External User Total - External User - RSH
19: RWHT 183 Total - External User Total - External User - RWHT
20: SFH 1 Total - External User Total - External User - SFH
21: SHB 2 Total - External User Total - External User - SHB
22: Sandwell GH 4 Total - External User Total - External User - Sandwell GH
23: Scunthorpe GH 45 Total - External User Total - External User - Scunthorpe GH
24: Spire Little Aston 6 Total - External User Total - External User - Spire Little Aston
25: UHCW 12 Total - External User Total - External User - UHCW
26: UHNM 297 Total - External User Total - External User - UHNM
27: WAH 216 Total - External User Total - External User - WAH
28: WMH 1 Total - External User Total - External User - WMH
29: WoSSVC 4 Total - External User Total - External User - WoSSVC
30: BHH 544 Total - UHB Total - UHB - BHH
31: GHH 279 Total - UHB Total - UHB - GHH
32: QE 87 Total - UHB Total - UHB - QE
33: SOL 90 Total - UHB Total - UHB - SOL
34: UHB (no location) 83 Total - UHB Total - UHB - UHB (no location)
labels values parents ids
有谁知道为什么我似乎无法覆盖整个圆圈并且只能覆盖 50%? 我仔细检查了我的数据,没有发现总计和小计之间有任何差异。