我有一个变量 x,我想将其分为观测数量(尽可能)相等的三组。然而,由于数据中存在大量相同的值(ties),直接使用分位数并不能得到最均衡的分组,因为分位数截断点可能落在一组相同值之中。我正在寻找一种函数或算法,可以找到最佳分界点,同时确保相同的值不会被分散到多个组中。
# Example data: a numeric vector with many tied (repeated) values that is to be
# split into three groups of roughly equal size.
x = c(26, 34, 27, 26, 38, 40, 34, 28, 27, 36, 29, 30, 29, 44, 30,
34, 32, 30, 26, 29, 34, 32, 38, 27, 35, 29, 28, 34, 26, 27, 27,
30, 27, 28, 27, 28, 28, 27, 29, 29, 28, 29, 29, 28, 29, 29, 28,
27, 29, 27, 36, 34, 34, 39, 34, 31, 31, 33, 35, 31, 31, 32, 37,
38, 32, 31, 28, 33, 33, 28, 27, 27, 30, 31, 32, 28, 27, 31, 36,
27, 33, 31, 34, 31, 35, 38, 37, 36, 39, 33, 33, 28, 41, 34, 35,
37, 37, 41, 32, 37, 30, 34, 38, 30, 40, 35, 31, 30, 30, 29, 29,
30, 29, 35, 28, 27, 27, 27, 29, 27, 28, 27, 27, 27, 26, 28, 28,
27, 29, 29, 27, 27, 27, 27, 29, 27, 28, 27, 28, 34, 29, 28, 28,
28, 29, 38, 33, 39, 28, 27, 28, 27, 29, 34, 29, 32, 70, 26, 29,
43, 48, 30, 30, 27, 26, 29, 27, 27, 27, 27, 28, 28, 27, 28, 28,
27, 28, 28, 38, 52, 26, 31, 56, 29, 29, 36, 28, 35, 32, 34, 35,
28, 27, 37, 26, 26, 32, 26, 27, 30, 28, 28, 30, 29, 30, 29, 29,
28, 26, 33, 39, 26, 31, 27, 28, 30, 30, 28, 28, 29, 26, 27, 26,
29, 28, 28, 27, 27, 27, 28, 27, 28, 28, 28, 28, 28, 27, 27, 29,
27, 26, 28, 28, 27, 27, 28, 27, 28, 28, 30, 27, 30, 28, 32, 34,
28, 27, 28, 28, 27, 28, 27, 27, 27, 28, 27, 28, 27, 27, 28, 27,
27, 27, 27, 27, 28, 27, 27, 27, 26, 27, 27, 30, 28, 27, 30, 30,
42, 26, 27, 40, 33, 29, 29, 29, 52, 58, 44, 32, 43, 30, 27, 38,
30, 27, 30, 27, 31, 39, 35, 32, 32, 34, 45, 31, 44, 42, 29, 29,
30, 30, 50, 30, 33, 31, 35, 27, 28, 27, 28, 55, 28, 28, 28, 27,
27, 28, 29, 27, 28, 27, 28, 28, 28, 28, 27, 28, 29, 34, 45, 27,
29, 61, 38, 62, 29, 36, 36, 30, 31, 45, 27, 30, 28, 29, 44, 45,
42, 52, 50, 52, 42, 38, 42, 32, 27, 37, 40, 52, 27, 36, 38, 39,
34, 30, 29, 34, 29, 26, 35, 43, 33, 40, 35, 33, 41, 61, 45, 35,
52, 50, 38, 43, 29, 35, 38, 39, 31, 28, 28, 29, 34, 27, 30, 32,
28, 26, 28, 27, 26, 29, 27, 26, 29, 29, 27, 29, 27, 27, 29, 27,
30, 29, 25, 30, 27, 29, 29, 30, 30, 27, 30, 28, 28, 27, 29, 29,
30, 29, 27, 28, 28, 28, 29, 28, 28, 27, 28, 29, 28, 29, 27, 28,
28, 28, 30, 27, 27, 28, 26, 28, 27, 27, 28, 28, 28, 28, 27, 27,
28, 27, 28, 27, 35, 27, 27, 28, 29, 27, 27, 28, 26, 27, 28, 28,
28, 27, 27, 27, 28, 32, 27, 28, 28, 29, 28, 28, 27, 28, 28, 30,
29, 28, 25, 27, 28, 30, 28, 30, 30, 28, 30, 30, 28, 29, 30, 28,
28, 26, 27, 28, 45, 36, 40, 28, 50, 45, 30, 45, 40, 30, 45, 45,
29, 45, 35, 40, 40, 30, 30, 30, 45, 40, 40, 40, 40, 40, 40, 35,
34, 49, 40, 30, 61, 35, 40, 30, 36, 35, 29, 27, 48, 28, 27, 27,
26, 27, 29, 27, 26, 27, 31, 27, 27, 28, 29, 28, 27, 28, 29, 38,
30, 26, 36, 40, 58, 57, 30, 33, 56, 35, 39, 37, 38, 46, 37, 39,
39, 45, 35, 46, 58, 65, 60, 45, 32, 36, 43, 32, 68, 39, 28, 31,
27, 28, 27, 37, 38, 30, 30, 28, 36, 45, 28, 26, 28, 28, 28, 27,
26, 28, 27, 26, 26, 27, 28, 31, 32, 37, 35, 29, 33, 35, 29, 41,
32, 36, 29, 28, 28, 28, 37, 36, 37, 35, 31, 32, 30, 27, 31, 32,
31, 33, 28, 33, 29, 27, 28, 31, 28, 31, 28, 34, 27, 27, 28, 27,
27, 27, 27, 26, 26, 26, 27, 27, 28, 26, 31, 26, 29, 31, 29, 29,
30, 29, 30, 31, 32, 29, 30, 27, 32, 27, 26, 31, 31, 31, 27, 27,
33, 27, 28, 28, 28, 26, 27, 27, 28, 30, 27, 27, 30, 29, 26, 27,
28, 27, 26, 26, 28, 27, 26, 28, 28, 26, 28, 27, 29, 27, 28, 28,
26, 26, 29, 28, 27, 27, 27, 28, 26, 25, 27, 29, 30, 36, 40, 28,
38, 26, 27, 27, 50, 27, 45, 27, 28, 26, 25, 35, 35, 44, 30, 27,
31, 27, 28, 27, 27, 28, 28, 28, 35, 33, 30, 28, 28, 29, 29, 36,
32, 36, 34, 32, 28, 28, 29, 28, 28, 32, 30, 35, 33, 36, 32, 30,
32, 36, 34)
# Naive approach: take the 33.3% and 66.6% quantiles of x as the cut-off points.
quantile(x, probs = c(0.333, 0.666))
#> 33.3% 66.6%
#> 28 31
# Bin x into three right-closed intervals at those cut-offs and count each bin.
# The resulting group sizes differ by 387 - 185 = 202.
l = cut(x, breaks = c(-Inf, 28, 31, Inf))
table(l)
#> l
#> (-Inf,28] (28,31] (31, Inf]
#> 387 185 246
# Moving the upper cut-off from 31 to 32 yields more equal groups
# (sizes now differ by 387 - 214 = 173).
l = cut(x, breaks = c(-Inf, 28, 32, Inf))
table(l)
#> l
#> (-Inf,28] (28,32] (32, Inf]
#> 387 214 217
# Also moving the lower cut-off from 28 to 27 narrows the gap between the
# largest and smallest group further (379 - 217 = 162).
l = cut(x, breaks = c(-Inf, 27, 32, Inf))
table(l)
#> l
#> (-Inf,27] (27,32] (32, Inf]
#> 222 379 217
创建于 2024-10-07,使用 reprex v2.1.1
编辑:我意识到“相等”这个词不够清楚,因此补充说明:我寻求的是最合适的划分,使观测数最多的组与观测数最少的组之间的数量差最小;每组只包含连续的数值,并且相同的值不会被分到多个组中。
这不是一个很聪明的方法,但希望能有所帮助:
# Brute-force search over the number of quantile groups: for k = 2, 3, ...
# bin x at its k-quantile break points and record the per-group counts,
# stopping as soon as two of the break points coincide.
k <- 2
count_tables <- list()
while (TRUE) {
  breaks_k <- quantile(x, probs = seq_len(k - 1) / k)
  if (anyDuplicated(breaks_k) > 0) {
    # Duplicate break points: no further distinct quantile partitions to try.
    break
  }
  binned <- cut(x, breaks = c(-Inf, breaks_k, Inf))
  count_tables <- append(count_tables, list(table(binned)))
  k <- k + 1
}
# Report the partition whose group counts have the smallest variance.
count_variances <- sapply(count_tables, var)
count_tables[[which.min(count_variances)]]
结果如下(注意:该方法会自行选择组数,这里得到的是五组而不是三组):
(-Inf,27] (27,28] (28,30] (30,35] (35, Inf]
222 165 153 132 146