当我使用 geom_point() 和 geom_line() 绘图时,中间的一些值没有显示出来,我不明白为什么。
我有一个包含 3 列的数据框(K_value、CV_error 和 Run),其中 K_value 是分类变量(因子)。
dput() 输出:
structure(list(K_value = structure(c(10L, 11L, 12L, 13L, 14L,
1L, 2L, 3L, 4L, 5L, 6L, 7L, 8L, 9L, 10L, 11L, 12L, 13L, 14L,
1L, 2L, 3L, 4L, 5L, 6L, 7L, 8L, 9L, 10L, 11L, 12L, 13L, 14L,
1L, 2L, 3L, 4L, 5L, 6L, 7L, 8L, 9L, 10L, 11L, 12L, 13L, 14L,
1L, 2L, 3L, 4L, 5L, 6L, 7L, 8L, 9L, 10L, 11L, 12L, 13L, 14L,
1L, 2L, 3L, 4L, 5L, 6L, 7L, 8L, 9L, 10L, 11L, 12L, 13L, 14L,
1L, 2L, 3L, 4L, 5L, 6L, 7L, 8L, 9L, 10L, 11L, 12L, 13L, 14L,
1L, 2L, 3L, 4L, 5L, 6L, 7L, 8L, 9L, 10L, 11L, 12L, 13L, 14L,
1L, 2L, 3L, 4L, 5L, 6L, 7L, 8L, 9L, 10L, 11L, 12L, 13L, 14L,
1L, 2L, 3L, 4L, 5L, 6L, 7L, 8L, 9L, 10L, 11L, 12L, 13L, 14L,
1L, 2L, 3L, 4L, 5L, 6L, 7L, 8L, 9L), levels = c("1", "2", "3",
"4", "5", "6", "7", "8", "9", "10", "11", "12", "13", "14"), class = "factor"),
CV_error = c(0.3496, 0.48953, 0.22838, 0.3241, 0.48187, 0.81215,
0.64932, 0.48208, 0.34502, 0.29175, 0.38106, 0.34349, 0.29372,
0.31848, 0.28904, 0.36266, 0.35706, 0.40682, 0.22942, 0.81252,
0.66357, 0.48312, 0.34643, 0.29845, 0.33101, 0.44156, 0.32816,
0.26834, 0.38874, 0.32601, 0.33054, 0.5124, 0.4978, 0.81195,
0.62714, 0.49569, 0.34549, 0.29434, 0.21027, 0.35551, 0.23482,
0.36595, 0.33906, 0.30915, 0.38615, 0.42463, 0.38548, 0.81222,
0.64116, 0.48115, 0.34543, 0.31653, 0.39421, 0.23617, 0.26476,
0.30773, 0.29044, 0.23667, 0.40504, 0.24453, 0.38279, 0.81107,
0.62831, 0.48073, 0.34307, 0.25076, 0.18189, 0.24538, 0.30349,
0.31099, 0.26404, 0.26664, 0.37712, 0.38249, 0.27946, 0.81362,
0.66236, 0.48343, 0.34475, 0.29682, 0.20412, 0.20799, 0.25753,
0.28842, 0.25157, 0.41521, 0.34065, 0.24796, 0.30641, 0.81291,
0.65986, 0.4821, 0.34447, 0.24829, 0.20115, 0.22076, 0.31345,
0.39544, 0.40846, 0.26986, 0.27907, 0.33826, 0.37872, 0.81762,
0.65032, 0.48309, 0.34895, 0.31037, 0.39639, 0.222, 0.33737,
0.23645, 0.35719, 0.42435, 0.2783, 0.41588, 0.43157, 0.81294,
0.6575, 0.47089, 0.34488, 0.24524, 0.29636, 0.22649, 0.23698,
0.30698, 0.40407, 0.3819, 0.31701, 0.47138, 0.34162, 0.81551,
0.66211, 0.49685, 0.34662, 0.23958, 0.32928, 0.19703, 0.25929,
0.29533), Run = c("1", "1", "1", "1", "1", "1", "1", "1",
"1", "1", "1", "1", "1", "1", "2", "2", "2", "2", "2", "2",
"2", "2", "2", "2", "2", "2", "2", "2", "3", "3", "3", "3",
"3", "3", "3", "3", "3", "3", "3", "3", "3", "3", "4", "4",
"4", "4", "4", "4", "4", "4", "4", "4", "4", "4", "4", "4",
"5", "5", "5", "5", "5", "5", "5", "5", "5", "5", "5", "5",
"5", "5", "6", "6", "6", "6", "6", "6", "6", "6", "6", "6",
"6", "6", "6", "6", "7", "7", "7", "7", "7", "7", "7", "7",
"7", "7", "7", "7", "7", "7", "8", "8", "8", "8", "8", "8",
"8", "8", "8", "8", "8", "8", "8", "8", "9", "9", "9", "9",
"9", "9", "9", "9", "9", "9", "9", "9", "9", "9", "10", "10",
"10", "10", "10", "10", "10", "10", "10", "10", "10", "10",
"10", "10")), class = "data.frame", row.names = c(NA, -140L
))
我想绘制 K_value 的平均变化率,如下所示:
# Row-to-row change in CV_error; lag() runs over the whole data frame in its
# current row order (no grouping, no re-sorting).
df %>%
  mutate(rate = CV_error - lag(CV_error)) %>%
  # Per-K_value spread and mean of that change, attached to every row.
  mutate(
    sd = sd(rate),
    meanrate = mean(rate),
    .by = K_value
  ) %>%
  # Mean rate per K_value with +/- 1 sd error bars, joined by a single line.
  ggplot(aes(x = K_value, y = meanrate)) +
  geom_point() +
  geom_errorbar(
    aes(ymin = meanrate - sd, ymax = meanrate + sd),
    width = 0.2
  ) +
  geom_line(aes(group = 1))
这产生:
知道为什么 K=10 的数据没有出现吗?K=10 的数据存在于原始 df 中,而且当我绘制其他内容(例如 CV_error 的平均值)时,它可以正常显示。我想知道这是否与 lag() 有关,是不是我的用法不对?
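运行您的 mutate 语句并查看结果后可以发现,所有 K_value = 10 的行在 sd 和 meanrate 列上都是 NA。您的 K_value 是一个因子(factor),虽然其水平(levels)的顺序是正确的,但数据行本身是按 K_value 的字母顺序而不是数字顺序排列的,这意味着 "10" 是每个 Run 组中的第一行。数据框的第一行(K_value = 10)没有前一个值,因此 lag() 在该行返回缺失值;而 mean() 和 sd() 在输入中含有 NA 时默认返回 NA,所以整个 K_value = 10 组的 sd 和 meanrate 都变成了 NA: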
# Same mutate steps as in the question, without the plot, so the intermediate
# columns (rate, sd, meanrate) can be inspected directly.
df %>%
  mutate(rate = CV_error - lag(CV_error)) %>%
  mutate(
    sd = sd(rate),
    meanrate = mean(rate),
    .by = K_value
  )
K_value CV_error Run rate sd meanrate
1 10 0.34960 1 NA NA NA
2 11 0.48953 1 0.13993 0.095314894 0.013977
3 12 0.22838 1 -0.26115 0.125867371 -0.018266
4 13 0.32410 1 0.09572 0.107924558 0.046913
5 14 0.48187 1 0.15777 0.110399314 -0.005331
6 1 0.81215 1 0.33028 0.085966983 0.441737
7 2 0.64932 1 -0.16283 0.013051558 -0.163086
8 3 0.48208 1 -0.16724 0.016702958 -0.166252
9 4 0.34502 1 -0.13706 0.007179984 -0.138402
10 5 0.29175 1 -0.05327 0.029021407 -0.066298
11 6 0.38106 1 0.08931 0.077438538 0.013361
12 7 0.34349 1 -0.03757 0.111384103 -0.022936
13 8 0.29372 1 -0.04977 0.082100970 0.013319
14 9 0.31848 1 0.02476 0.066958840 0.026454
15 10 0.28904 2 -0.02944 NA NA
16 11 0.36266 2 0.07362 0.095314894 0.013977
...
如果您先对数据进行正确排序(df |> arrange(Run, K_value) |> ...),那么缺失的将是 K_value = 1,而不是 K_value = 10(除此之外,其余的值都会正确显示,因为行的顺序已经正确)。
# Plot the mean change in CV_error between consecutive K_value levels,
# with +/- 1 sd error bars.
df %>%
  # Order rows numerically by factor level within each Run so that lag()
  # compares K with K - 1 (the raw data has "10".."14" before "1".."9").
  arrange(Run, K_value) %>%
  # Compute the difference within each Run: without `.by = Run`, the first
  # row of runs 2..n would be differenced against the last row (K_value = 14)
  # of the previous run. With it, the first row of every Run is NA.
  mutate(rate = CV_error - lag(CV_error), .by = Run) %>%
  # Per-K_value spread and mean of the rate. K_value = 1 stays NA because
  # sd()/mean() propagate NA; supply `default =` to lag() if a value is wanted.
  mutate(
    sd = sd(rate),
    meanrate = mean(rate),
    .by = K_value
  ) %>%
  ggplot(aes(x = K_value, y = meanrate)) +
  geom_point() +
  geom_errorbar(
    aes(ymin = meanrate - sd, ymax = meanrate + sd),
    width = 0.2
  ) +
  geom_line(aes(group = 1))
如果您希望 K_value = 1 也有值,可以设置 lag() 的 default 参数,用您需要的任意值来填充每组中的第一行。