我有一个大data.table
。我要求:
library(data.table)
set.seed(101)
data <- data.table(group=c(rep("A",10),rep("B",7)), value=rnorm(17))
> data
group value
1: A -0.3260365
2: A 0.5524619
3: A -0.6749438
4: A 0.2143595
5: A 0.3107692
6: A 1.1739663
7: A 0.6187899
8: A -0.1127343
9: A 0.9170283
10: A -0.2232594
11: B 0.5264481
12: B -0.7948444
13: B 1.4277555
14: B -1.4668197
15: B -0.2366834
16: B -0.1933380
17: B -0.8497547
要确定“值”列的滚动分位数,我正在使用软件包runquantile()
中的函数caTools
,这需要大约1分钟才能在我的数据集上执行。
对于相同的滚动窗口(此处为k=4
),当需要考虑计算时间时,如何获得高于移动分位数的滚动平均值?在示例中,结果应类似于“ mean_above_q”列。
library(caTools)
data[,rolling_q := c(rep(NA,3),runquantile(value,k=4,0.4,endrule="trim")),group]
> data
group value rolling_q mean_above_q
1: A -0.3260365 NA NA
2: A 0.5524619 NA NA
3: A -0.6749438 NA NA
4: A 0.2143595 -0.21795730 -0.50049020
5: A 0.3107692 0.23364141 -0.23029220
6: A 1.1739663 0.23364141 -0.23029220
7: A 0.6187899 0.37237334 0.26256430
8: A -0.1127343 0.37237334 0.09901745
9: A 0.9170283 0.67843754 0.25302780
10: A -0.2232594 0.03357052 -0.16799680
11: B 0.5264481 NA NA
12: B -0.7948444 NA NA
13: B 1.4277555 NA NA
14: B -1.4668197 -0.53058593 -1.13083200
15: B -0.2366834 -0.68321222 -1.13083200
16: B -0.1933380 -0.22801430 -0.85175150
17: B -0.8497547 -0.72714047 -1.15828700
非常感谢。
我相信会有错别字,因为数据显示的是低于百分位数而不是高于百分位数的值的平均值。
这里有2个选项:
1]使用frollapply
(因为它返回长度为1的值),因此您需要另一个非等式联接来计算低于百分位数的值的均值。
data[, rolling_q := frollapply(value, sz, function(x) {
x <- sort(x, partial = unique(c(lo, hi)))
qs <- x[lo]
i <- which(index > lo & x[hi] != qs)
h <- (index - lo)[i]
(1 - h) * qs[i] + h * x[hi[i]]
#.(q, sum(x[1L:lo]) / lo)
}), group]
data[, mean_below_q :=
data[data, on=.(group, er>=sr, er<=er), allow.cartesian=TRUE,
by=.EACHI, mean(value[value<=i.rolling_q])]$V1
]
2)使用1个非等式联接来计算两个值
data[, c("rolling_q", "mean_below_q") :=
.SD[.SD, on=.(group, er>=sr, er<=er), allow.cartesian=TRUE,
by=.EACHI, {
if (.N >= sz) {
x <- x.value
x <- sort(x, partial = unique(c(lo, hi)))
qs <- x[lo]
i <- which(index > lo & x[hi] != qs)
h <- (index - lo)[i]
q <- (1 - h) * qs[i] + h * x[hi[i]]
.(q, sum(x[1L:lo]) / lo)
} else
.(NA_real_, NA_real_)
}][, (1L:3L) := NULL]
]
输出:
group value sr er rolling_q mean_below_q
1: A -0.3260365 -2 1 NA NA
2: A 0.5524619 -1 2 NA NA
3: A -0.6749438 0 3 NA NA
4: A 0.2143595 1 4 -0.21795730 -0.50049017
5: A 0.3107692 2 5 0.23364141 -0.23029219
6: A 1.1739663 3 6 0.23364141 -0.23029219
7: A 0.6187899 4 7 0.37237334 0.26256434
8: A -0.1127343 5 8 0.37237334 0.09901745
9: A 0.9170283 6 9 0.67843754 0.25302777
10: A -0.2232594 7 10 0.03357052 -0.16799684
11: B 0.5264481 8 11 NA NA
12: B -0.7948444 9 12 NA NA
13: B 1.4277555 10 13 NA NA
14: B -1.4668197 11 14 -0.53058593 -1.13083206
15: B -0.2366834 12 15 -0.68321222 -1.13083206
16: B -0.1933380 13 16 -0.22801430 -0.85175154
17: B -0.8497547 14 17 -0.72714047 -1.15828722
数据:
library(data.table)
set.seed(101)
data <- data.table(group=c(rep("A",10),rep("B",7)), value=rnorm(17))
sz <- 4L
prob <- 0.4
#see stats:::quantile.default
index <- 1 + (sz - 1) * prob
lo <- floor(index)
hi <- ceiling(index)
data[, c("sr", "er") := .(.I - sz + 1L, .I)]