id random count
a 0 -1
a 1 1
a 1 2
a 0 -1
a 0 -2
a 1 1
a 0 -1
a 1 1
a 0 -1
b 0 -1
b 0 -2
b 1 1
b 0 -1
b 1 1
b 0 -1
b 0 -2
b 0 -3
`id` 表示玩家,`random` 是二进制值 0 或 1。我想创建一个 `count` 列,按玩家统计连续的 1 和连续的 0 的序列长度(如上所示,0 的序列用负数计数),最好不要使用循环,因为数据量非常大。
下面是一个 dplyr 解决方案:
# Signed run counter per player, built in three steps:
#   1. idx   - run id that increments whenever `random` differs from the previous row
#   2. group - by player and run id, so each run is counted on its own
#   3. count - within a run: (number of 1s so far) minus (number of 0s so far)
dat %>%
  transform(idx = c(0, cumsum(tail(random, -1L) != head(random, -1L)))) %>%
  group_by(id, idx) %>%
  mutate(count = cumsum(random == 1) - 1 * cumsum(random == 0)) %>%
  ungroup() %>%
  select(-idx)
Source: local data frame [17 x 3]
id random count
1 a 0 -1
2 a 1 1
3 a 1 2
4 a 0 -1
5 a 0 -2
6 a 1 1
7 a 0 -1
8 a 1 1
9 a 0 -1
10 b 0 -1
11 b 0 -2
12 b 1 1
13 b 0 -1
14 b 1 1
15 b 0 -1
16 b 0 -2
17 b 0 -3
我想这就是你要找的东西:
library(data.table)
# Convert DF to a data.table in place, then number the rows 1..N inside every
# (player, run of identical `random` values) group.
setDT(DF)
DF[, count := seq_len(.N), by = .(id, rleid(random))]
结果如下:
id random count
1: a 0 1
2: a 1 1
3: a 1 2
4: a 0 1
5: a 0 2
6: a 1 1
7: a 0 1
8: a 1 1
9: a 0 1
10: b 0 1
11: b 0 2
12: b 1 1
13: b 0 1
14: b 1 1
15: b 0 1
16: b 0 2
17: b 0 3
(在 data.table 包的下一个版本 1.9.8 中,将会有一个更简短的写法:`setDT(DF)[, count := rowid(rleid(random)), by=id]`。我在这里留下这条备注,以便日后更新答案。)
您可能还需要每个连续段(run)的标识符:
# Run identifier: restarts at 1 for every player and increases by one each time
# `random` changes value.
DF[, `:=`(rid = rleid(random)), by = id]
结果如下:
id random count rid
1: a 0 1 1
2: a 1 1 2
3: a 1 2 2
4: a 0 1 3
5: a 0 2 3
6: a 1 1 4
7: a 0 1 5
8: a 1 1 6
9: a 0 1 7
10: b 0 1 1
11: b 0 2 1
12: b 1 1 2
13: b 0 1 3
14: b 1 1 4
15: b 0 1 5
16: b 0 2 5
17: b 0 3 5
如果你读过该包的入门资料(introductory materials),就会发现这些变量也可以在一个步骤中创建。
我认为最简单的方法是使用 runner 包中的 `streak_run` 函数。`streak_run` 也是最快的,见下面的基准测试一节。
解决方案
library(runner)
# Toy data: ten rows with a random 0/1 flag (no seed is set, so values differ
# between runs).
df <- data.frame(id = 1:10, random = sample(c(0, 1), 10, replace = TRUE))
# Position of each row inside its run of identical consecutive values
df$count <- streak_run(df$random)
# Report runs of zeros as negative counts
df$count[df$random == 0] <- -df$count[df$random == 0]
df
# id random count
#1 1 0 -1
#2 2 0 -2
#3 3 1 1
#4 4 1 2
#5 5 1 3
#6 6 1 4
#7 7 0 -1
#8 8 0 -2
#9 9 0 -3
#10 10 0 -4
基准测试
runner_example <- function(df) {
  # Benchmark wrapper for the runner approach.
  # Adds a `count` column: the streak position from streak_run(), negated on
  # rows where `random` equals 0. Returns the augmented data frame.
  is_zero <- df$random == 0
  df$count <- streak_run(df$random)
  df$count[is_zero] <- -df$count[is_zero]
  df
}
dplyr_example <- function(df) {
  # Benchmark wrapper for the dplyr approach.
  #
  # Builds a run id (`idx`) that increments whenever `random` differs from the
  # previous row, groups by player and run id, and sets `count` to
  # (number of 1s so far) minus (number of 0s so far) within each group.
  #
  # The pipeline result is the function's value. Previously it was discarded
  # and the untouched input was returned, so callers never saw `count`.
  df %>%
    transform(idx = c(0, cumsum(random[-1L] != random[-length(random)]))) %>%
    group_by(id, idx) %>%
    mutate(count = -1 * cumsum(random == 0) + cumsum(random == 1)) %>%
    ungroup() %>%
    select(-idx)
}
dt_example <- function(df) {
  # Benchmark wrapper for the data.table approach.
  #
  # Numbers the rows 1..N inside every (id, run of identical `random` values)
  # group and returns the result as a data.table.
  #
  # as.data.table() works on a copy; setDT() would convert the caller's object
  # by reference, changing the input shared by the other benchmarked functions.
  dt <- as.data.table(df)
  dt[, count := seq_len(.N), by = .(id, rleid(random))]
  dt
}
library(dplyr);library(data.table)
library(microbenchmark); library(magrittr)
# Benchmark input: 2000 rows, each with its own id.
# NOTE(review): `random` holds the letters "a"/"b" here, so the `random == 0`
# and `random == 1` comparisons in runner_example() and dplyr_example() never
# match - confirm this is the intended benchmark data.
df <- data.frame(
  id = 1:2000L,
  random = sample(letters[1:2], 2000L, replace = TRUE)
)
# Time each implementation 100 times on the same input
microbenchmark(
  dplyr = dplyr_example(df),
  dt = dt_example(df),
  runner = runner_example(df),
  times = 100
)
#Unit: microseconds
# expr min lq mean median uq max neval
# dplyr 134388.839 164274.611 204478.048 188548.4975 222777.298 526019.563 100
# dt 1306.139 1710.665 2181.989 1941.3420 2380.953 5581.682 100
# runner 284.522 741.145 1022.456 853.5715 1004.553 7398.019 100