我意识到这个问题已经在这里以类似的方式被问过多次。我并不是要求包含数据密度热图的散点图,因为这将 both 变量的密度捕获为平滑函数。我正在寻找的是这样的东西,它获取结果变量分布的“切片”并将其覆盖在散点图上:
我能想出的最好的办法是这样的:
#### Load Library ####
library(tidyverse)
#### Get IQR ####
q <- quantile(iris$Sepal.Length,
probs = c(.25,.5,.75))
q
#### Label Quantile Regions ####
qiris <- iris %>%
mutate(qs = ifelse(Sepal.Length >= q[3],
"Q75",
ifelse(Sepal.Length >= q[2],
"Q50","Q25")))
#### Plot Density and Scatter ####
ggplot()+
geom_point(aes(x=Sepal.Width,
y=Sepal.Length),
data=iris)+
geom_density(aes(y=Sepal.Length,
fill=qs),
data=qiris)
但可以预见的是,这会失败,因为它没有将分布的“切片”与预测变量值相关联。
然后我想出了一个稍微好一点的解决方案,可以正确定位值的分布:
library(ggridges)
ggplot(qiris,
aes(x = Sepal.Length,
y = qs)) +
stat_density_ridges(quantiles = c(0.25,0.5,0.75),
geom="density_ridges_gradient",
jittered_points = TRUE,
position = "raincloud",
alpha = 0.6,
scale = 0.6)+
coord_flip()
这给了我这个:
但是这里仍然存在三个问题。首先,我无法通过它拟合回归线。其次,我希望数据点像普通散点图一样彼此相邻,而不是按分位数在空间上分开,这样它们就太远了。第三,这根本不包括其他变量,这一点很重要。
艾伦的答案乍一看不错,但我认为有些东西我在他的代码中没有看到。为了尝试解决这个问题,我尝试使用另一个数据集并将输入保存为 R 中的对象,以便更轻松地交换所有内容。当我这样做时,我会在绘图上得到平坦的线条。
#### Load Library ####
library(tidyverse)
#### Save Objects ####
dfy <- mtcars$mpg # y var
dfx <- mtcars$hp # x var
data <- mtcars # dataset
#### QDATA ####
qdata <- data %>%
mutate(cut_group = cut(dfy,
quantile(dfy, c(0.125, 0.375, 0.625, 0.875)),
labels = c('Q25', 'Q50', 'Q75')),
baseline = quantile(dfy,
c(0.25, 0.5, 0.75))[as.numeric(cut_group)]) %>%
filter(complete.cases(.)) %>%
group_by(cut_group) %>%
reframe(dfxx = density(dfx)$x,
dfy = first(baseline) - density(dfx, bw = 0.5)$y/3) %>%
rename(dfx = dfxx)
ggplot(data,
aes(dfy,
dfx)) +
geom_smooth(method = 'lm',
color = 'gray',
se = FALSE) +
geom_point(color = 'navy',
shape = 21,
fill = NA) +
geom_path(data = qdata,
aes(group = cut_group),
color = 'darkgreen',
linewidth = 1.5) +
theme_classic() +
theme(panel.border = element_rect(fill = NA,
linewidth = 1))
像这样:
我可能会通过预先计算分位数的密度并将它们绘制为
geom_path
:来做到这一点
quartiles <- quantile(iris$Sepal.Width)
midpoints <- quartiles[-5] + 0.5 * diff(quartiles)
qiris <- iris %>%
mutate(Q = cut(Sepal.Width, quartiles, labels = paste0('Q', 1:4)),
baseline = midpoints[as.numeric(Q)]) %>%
filter(complete.cases(.)) %>%
group_by(Q) %>%
reframe(SepalLength = density(Sepal.Length)$x,
Sepal.Width = first(baseline) - density(Sepal.Length, bw = 0.5)$y/3) %>%
rename(Sepal.Length = SepalLength)
ggplot(iris, aes(Sepal.Width, Sepal.Length)) +
annotate('rect', xmin = quartiles[-5], xmax = quartiles[-1], ymin = -Inf,
ymax = Inf, fill = c('gray', NA, 'gray', NA), alpha = 0.2) +
annotate('text', x = midpoints, y = 9, label = paste0('Q', 1:4)) +
geom_smooth(method = 'lm', color = 'gray', se = FALSE) +
geom_point(color = 'navy', shape = 21, fill = NA) +
geom_path(data = qiris, aes(group = Q), color = 'darkgreen',
linewidth = 1.5, alpha = 0.5) +
theme_classic() +
theme(panel.border = element_rect(fill = NA, linewidth = 1))