在 R 的 data.table 中,要检查某一整列的每个值是否出现在另一个 data.table 的列中,最快的方法是什么?
示例问题:
创建示例大数据:
# Example data: dt1 has 1,000,000 rows and dt2 has 500,000 rows. Each `code`
# is five distinct lowercase letters drawn at random and pasted together.
dt1 <- data.table(
  dt1row = seq_len(1000000),
  code = vapply(
    seq_len(1000000),
    function(i) paste(sample(letters, 5), collapse = ""),
    character(1)
  )
)
dt2 <- data.table(
  dt2row = seq_len(500000),
  code = vapply(
    seq_len(500000),
    function(i) paste(sample(letters, 5), collapse = ""),
    character(1)
  )
)
我想替换的慢速实现(虽然慢,但结果正确):
# SLOW ON BIG DATA: tests one row of dt1 at a time against the whole dt2$code
# column, so the lookup is repeated nrow(dt1) times. Kept as the baseline
# that the faster approaches are meant to replace.
# seq_len() avoids the c(1, 0) index that 1:nrow(dt1) produces for an empty
# table, and vapply() guarantees exactly one logical value per row.
dt1$in_dt2 <- vapply(
  seq_len(nrow(dt1)),
  FUN = function(i) dt1$code[i] %in% dt2$code,
  FUN.VALUE = logical(1)
)
@thelatemail 给出了目前为止更快的方法:
# Key both tables on `code` (the original first call was missing the comma
# between its arguments and did not parse).
setkey(dt1, code)
setkey(dt2, code)
# Default every row to FALSE, then use an update join: rows of dt1 whose
# `code` is matched by a row of dt2 are set to TRUE by reference.
dt1[, in_dt2 := FALSE][dt2, on = .(code), in_dt2 := TRUE]
我认为您正在寻找连接操作,并且设置键应该可以加快速度:
# Key both tables on the `code` column.
setkeyv(dt1, "code")
setkeyv(dt2, "code")
# Inner join (nomatch = 0L drops dt1 rows without a partner in dt2); the
# result carries the dt1row of every dt1 row whose code was found in dt2.
existing <- dt2[dt1, on = "code", nomatch = 0L]
# A dt1 row is flagged when its row id survived the join.
dt1[, in_dt2 := dt1row %in% existing[["dt1row"]]]
另一个更快的选择是使用base::match
# Benchmark candidate: base::match() combined with `$<-` assignment.
# Reads the tables DT10 and dt2 from the enclosing environment. The
# assignment below creates a function-local DT10, which is returned with
# the extra logical column `in_dt2`.
m0 <- function() {
  # match() returns 0L for codes absent from dt2, so `> 0L` means "found".
  found <- match(DT10$code, dt2$code, nomatch = 0L) > 0L
  DT10$in_dt2 <- found
  DT10
}
# Benchmark candidate: keyed inner join followed by `%in%` on the row id.
# Operates on the tables DT11 and dt2 from the enclosing environment; DT11
# is keyed and gains the column `in_dt2` by reference.
m1 <- function() {
  setkeyv(DT11, "code")
  # nomatch = 0L keeps only DT11 rows whose code exists in dt2;
  # mult = "first" keeps at most one dt2 match per DT11 row.
  existing <- dt2[DT11, on = "code", nomatch = 0L, mult = "first"]
  matched_rows <- existing[["dt1row"]]
  DT11[, in_dt2 := dt1row %in% matched_rows]
}
# Benchmark candidate: base::match() inside a data.table `:=` assignment.
# Adds the logical column `in_dt2` to DT12 by reference, using the codes of
# dt2 (both tables come from the enclosing environment).
m2 <- function() {
  DT12[
    ,
    # match() yields 0L for codes that are absent from dt2$code.
    in_dt2 := match(code, dt2$code, nomatch = 0L) > 0L
  ]
}
# Benchmark candidate: @thelatemail's update join.
# Keys DT13 on `code`, defaults `in_dt2` to FALSE, then sets it to TRUE for
# the rows matched by dt2. All steps modify DT13 by reference.
m_thelatemail <- function() {
  setkeyv(DT13, "code")
  DT13[, in_dt2 := FALSE]
  DT13[dt2, on = "code", in_dt2 := TRUE]
}
# Time the four candidates. check = FALSE because they do not return
# identical objects (m1 and m_thelatemail key their tables on `code`).
# NOTE(review): m1 and m_thelatemail call setkey() on global tables, so
# iterations after the first run on already-keyed data.
bench::mark(
  m0(),
  m1(),
  m2(),
  m_thelatemail(),
  check = FALSE
)
# Put each result back into dt1row order before comparing it with m0().
identical(DT11$in_dt2[order(DT11$dt1row)], m0()[["in_dt2"]])
identical(DT12$in_dt2[order(DT12$dt1row)], m0()[["in_dt2"]])
identical(DT13$in_dt2[order(DT13$dt1row)], m0()[["in_dt2"]])
#[1] TRUE
时间:
# A tibble: 4 x 13
expression min median `itr/sec` mem_alloc `gc/sec` n_itr n_gc total_time result memory time gc
<bch:expr> <bch:tm> <bch:tm> <dbl> <bch:byt> <dbl> <int> <dbl> <bch:tm> <list> <list> <list> <list>
1 m0() 914ms 914ms 1.09 38.3MB 1.09 1 1 914ms <df[,3] [1,000,000 x 3]> <df[,3] [10 x 3]> <bch:tm> <tibble [1 x 3]>
2 m1() 252ms 273ms 3.66 36.8MB 1.83 2 1 547ms <df[,3] [1,000,000 x 3]> <df[,3] [33 x 3]> <bch:tm> <tibble [2 x 3]>
3 m2() 198ms 252ms 4.14 23.1MB 2.76 3 2 724ms <df[,3] [1,000,000 x 3]> <df[,3] [10 x 3]> <bch:tm> <tibble [3 x 3]>
4 m_thelatemail() 148ms 158ms 6.38 15.4MB 0 4 0 627ms <df[,3] [1,000,000 x 3]> <df[,3] [28 x 3]> <bch:tm> <tibble [4 x 3]>
m0() 的输出:
dt1row code in_dt2
1: 1 nydga FALSE
2: 2 bwknr FALSE
3: 3 sauxj FALSE
4: 4 vnjgi FALSE
5: 5 ouein FALSE
---
999996: 999996 wiucs FALSE
999997: 999997 yqjrp FALSE
999998: 999998 elort FALSE
999999: 999999 asjyh FALSE
1000000: 1000000 lmbjw FALSE
数据:
library(data.table)
# Reproducible benchmark data: fix the RNG seed, then build dt1 (nr rows)
# and dt2 (nr / 2 rows). Each `code` is five distinct lowercase letters
# drawn at random and pasted together.
set.seed(0L)
nr <- 1e6
dt1 <- data.table(
  dt1row = seq_len(nr),
  code = vapply(
    seq_len(nr),
    function(i) paste(sample(letters, 5), collapse = ""),
    character(1)
  )
)
dt2 <- data.table(
  dt2row = seq_len(nr / 2),
  code = vapply(
    seq_len(nr / 2),
    function(i) paste(sample(letters, 5), collapse = ""),
    character(1)
  )
)
# One independent copy per candidate, because several of them modify their
# table by reference.
DT10 <- copy(dt1)
DT11 <- copy(dt1)
DT12 <- copy(dt1)
DT13 <- copy(dt1)
随机生成数据时,建议使用 set.seed。