findFragment<-function(Dataset){
df1 <<- data.frame(Col9=character(),aid=character(),month=as.Date(character()),year=as.Date(character()),Outcome=character(),ser_no=character(),Similar=character(),stringsAsFactors=FALSE)
rowind<<-0
start.time <- Sys.time()
apply(Dataset,1,function(slic){
rowind<<-rowind+1
fragment<-subset(Dataset, subset = ser_no %in% slic[1] &
Outcome %in% slic[2] &
year %in% slic[3] &
month %in% slic[4] &
code %in% slic[5] &
name %in% slic[6] &
!(aid %in% slic[7]) &
((as.numeric(Percentage)<=(as.numeric(slic[8])+0.01) &
as.numeric(Percentage)>=as.numeric(slic[8])-0.01)
)
)
#Refiltering results
#If result includes more than 3 rows then refilter back on these rows and include only those rows that have percentage+-0.0001
if(nrow(fragment)>3){
fragment<<-subset(fragment, subset = ((as.numeric(Percentage)<=(as.numeric(slic[8])+0.0001) &
as.numeric(Percentage)>=as.numeric(slic[8])-0.0001)
))
}
#Writing data is extremely slow in below way(takes 30+ minutes).
#fragmentize$Similiar[rowind]<<-paste(as.character(unlist(fragment[7])),collapse=",")
#Writing data this way takes total execution time to 9 minutes
# df1<<-rbind(df1,data.frame(Col9=slic[9],
# aid=slic[7],
# ser_no=slic[1],
# Outcome=slic[2],
# month=slic[4],
# year=slic[3],
# Similar=paste(as.character(unlist(fragment[7])),collapse=",")),make.row.names = FALSE)
})
# df1<<-merge(x = Dataset, y = df1, by = c("Col9","aid","ser_no","Outcome","month","year"), all = TRUE)
cat("Completed in",Sys.time()-start.time)
}
fragmentize$Similiar<-0
findFragment(fragmentize)
fragmentize<-data.frame(ser_no=rep("A1",35243),Outcome=rep("A2",35243),year=rep("A3",35243),month=rep("A4",35243),code=rep("A5",35243),name=rep("A6",35243),aid=rep(letters[1:4],35243),Percentage=rep(1,35243),col9=rep("A9",35243),col10=rep("A10",35243),col11=rep("A11",35243),col12=rep("A12",35243),col13=rep("A13",35243),col4=rep("A14",35243),col15=rep("A15",35243),col16=rep("A16",35243),col7=rep("A17",35243),col8=rep("A18",35243),col19=rep("A19",35243),col20=rep("A20",35243),col21=rep("A21",35243),col22=rep("A22",35243),col23=rep("A23",35243),col24=rep("A24",35243),col25=rep("A25",35243),col26=rep("A26",35243),col27=rep("A27",35243),col28=rep("A28",35243),col29=rep("A29",35243))
fragmentize<-data.frame(col9=rep("A9",35243),col10=rep("A10",35243),col11=rep("A11",35243),col12=rep("A12",35243),col13=rep("A13",35243),col4=rep("A14",35243),col15=rep("A15",35243),col16=rep("A16",35243),col7=rep("A17",35243),col8=rep("A18",35243),col19=rep("A19",35243),col20=rep("A20",35243),col21=rep("A21",35243),col22=rep("A22",35243),col23=rep("A23",35243),col24=rep("A24",35243),col25=rep("A25",35243),col26=rep("A26",35243),col27=rep("A27",35243),col28=rep("A28",35243),col29=rep("A29",35243))
library(random)
ser_noVal<-rep(1:831)
OutcomeVal<-c("Aggressive","Balanced","Positive","Negative","Neutral","Conservative")
yearVal<-c(2013:2017)
monthVal<-c(1:12)
codeVal <- c("A", "B", "C")
nameVal<-randomStrings(n=33, len=2, digits=FALSE,loweralpha=TRUE, unique=TRUE, check=TRUE)
aidVal<-randomStrings(n=222, len=4, digits=TRUE,loweralpha=TRUE, unique=TRUE, check=TRUE)
percentVal<-c(1:1561)
fragmentize$ser_no[sample(1:nrow(fragmentize), nrow(fragmentize), FALSE)] <- rep(ser_noVal, c(6,70,4,83,1,92,1,1,6,16,8,3,376,63,735,23,28,3,24,1,84,13,119,7,5,4,1,29,1,27,7,3,9,7,4,11,7,14,2,1,1,16,5,150,31,10,1,1049,2,47,36,2,41,37,6,81,55,6,11,22,3,10,30,4,8,4,175,9,6,1,1,83,20,1,34,38,1,3,41,6,19,1,13,65,42,115,53,18,19,36,5,16,20,38,1,36,1,1,1,4,7,5,19,7,8,39,113,4,1,21,21,2,12,7,6,11,33,19,1,1,53,2,195,79,1,1,2,2,3,1,7,3,11,5,2,1,16,2,14,2,2,15,4,54,4,3,2,40,49,2,1,3,22,9,25,5,42,8,5,6,8,8,3,179,2,4,16,131,113,20,1,13,27,57,52,34,7,4,1,3,22,21,577,16,28,31,82,1,1,74,26,25,1,23,1,29,116,33,1,3,9,8,11,12,1,2,3,11,1,1,13,3,22,13,1,15,2,4,20,1,2,7,2,2,18,147,8,2,50,5,25,2,12,1,98,6,6,37,55,20,9,6,3,8,4,2,2,9,2,32,6,183,10,141,755,34,1,13,3,1,83,1,10,1,566,27,1,38,1,45,7,44,43,11,18,259,36,64,6,19,31,33,355,70,14,26,41,619,139,1,2,45,76,2,49,5,19,51,30,16,32,12,10,1,4,2,80,25,45,84,50,346,125,60,61,321,6,14,17,13,37,7,4,61,79,207,68,111,49,75,425,92,50,329,4,22,2,7,88,1265,3,22,41,10,29,1,37,3,1,13,20,35,10,33,26,5,1,1,1,1,1,2,3,6,14,2,4,2,20,921,132,9,8,114,438,57,37,10,1778,21,10,44,1,4,3,10,48,1,100,123,6,15,234,3,15,3,14,13,46,39,2,72,3,97,97,10,13,2,38,3,4,17,49,143,5,76,61,11,17,16,40,1,1,1,1,1,9,6,1,2,20,28,30,4,30,14,9,80,1,32,7,20,4,26,2,66,4,2,1,2,12,2,8,2,12,56,9,1023,33,19,1,3,46,1,6,88,40,84,85,35,28,314,3,7,61,79,34,55,2,23,1,10,1,2,77,6,70,40,1,4,93,1,48,3,5,17,2,8,1,2,1,7,27,13,23,4,4,4,7,1,2,1,1,2,18,13,44,32,1,2,2,8,103,1,6,366,4,4,5,2,6,15,6,30,10,1,3,1,2,4,20,8,1,86,3,3,3,2,4,76,3,436,4,1,10,28,17,39,1,1,896,21,12,24,1,177,29,8,3,36,14,2,6,9,1,17,5,2,113,48,2,8,15,155,34,465,23,1,222,1,22,14,23,4,11,3,18,12,17,2,5,3,7,4,2,1,1,1,2,2,9,185,22,11,1,1,14,3,3,2,11,2,4,2,1,4,17,4,213,7,62,1,210,126,38,1,391,2,6,67,44,21,19,16,98,14,4,1,1,2,197,8,31,1,48,1,10,9,36,24,54,65,1,5,5,12,224,13,41,28,7,339,50,5,9,2,3,3,1,1,1,2,7,1,35,11,25,1,2,12,23,4,14,6,2,3,20,36,7,2,6,10,22,1,2,6,2,18,14,15,10,24,11,3,78,2,1,10,236,293,25,43,5,14,4,32,29,4,1,6,6,9,1,202,173,1,12,1,18,1,55,56,3,9,4,3,12,4,2,32,3,22,7,45,15,4,5,4,3,2,1,7,7,12,4,1,2,8,166,1,10,9,15,1,1,11,8,26,67,1,288,39,3,31,4,25,6,7,4,22,5,3,1,71,19,3,5,19,4,27,21,4,22,5,1,52,1,7,70,27,277,1,4,1,80,1,141,10,4,6,3,11,5,6,15,1,1,1,6,1,2))
fragmentize$Outcome[sample(1:nrow(fragmentize), nrow(fragmentize), FALSE)] <- rep(OutcomeVal, c(21775,3034,126,10,10277,21))
fragmentize$year[sample(1:nrow(fragmentize), nrow(fragmentize), FALSE)] <- rep(yearVal, c(11,2709,8476,11308,12739))
fragmentize$month[sample(1:nrow(fragmentize), nrow(fragmentize), FALSE)] <- rep(monthVal, c(2536, 2535, 2780, 2616, 2902, 3190, 3274, 3553, 3623, 3515, 2339, 2380))
fragmentize$code[sample(1:nrow(fragmentize), nrow(fragmentize), FALSE)] <- rep(codeVal, c(7610,24718,2915))
fragmentize$name[sample(1:nrow(fragmentize), nrow(fragmentize), FALSE)] <- rep(nameVal, c(218, 917, 1736, 555, 42, 76, 79, 267, 1988, 116, 194, 161, 12, 353, 261, 285, 382, 6050, 2053, 45, 1, 276, 4598, 7543, 337, 14, 1, 591, 1020, 657, 139, 3995, 281))
fragmentize$aid[sample(1:nrow(fragmentize), nrow(fragmentize), FALSE)] <- rep(aidVal, c(310, 82, 26, 6, 493, 175, 31, 4, 19, 160, 263, 248, 68, 20, 666, 303, 6, 125, 190, 8, 108, 93, 206, 11, 278, 2, 273, 3, 3, 4, 285, 1, 555, 44, 93, 21, 94, 5309, 46, 25, 7, 249, 67, 20, 3, 15, 15, 16, 5, 12, 5, 17, 67, 44, 332, 57, 358, 25, 204, 8, 612, 108, 47, 273, 16, 20, 516, 16, 344, 33, 153, 4, 43, 73, 14, 37, 88, 7, 26, 23, 116, 33, 28, 66, 24, 21, 18, 32, 96, 6, 16, 3, 176, 121, 109, 177, 8, 30, 156, 117, 24, 90, 199, 236, 24, 25, 34, 20, 50, 14, 19, 30, 8, 20, 3, 10, 55, 24, 26, 17, 17, 29, 147, 148, 6, 2031, 65, 1135, 632, 91, 544, 1073, 11, 617, 15, 18, 2, 226, 182, 89, 513, 23, 149, 6, 398, 148, 13, 129, 323, 26, 4, 4, 155, 63, 32, 64, 23, 2, 120, 1, 2, 1, 10, 25, 120, 993, 5, 335, 40, 539, 413, 116, 78, 15, 38, 2, 15, 34, 271, 3, 604, 375, 52, 47, 459, 457, 177, 28, 293, 49, 266, 96, 1836, 18, 127, 18, 246, 5, 8, 4, 11, 102, 24, 21, 63, 57, 25, 22, 2, 1, 1, 51, 74, 56, 154, 97, 21, 31, 4, 3, 1, 11))
fragmentize$Percentage[sample(1:nrow(fragmentize), nrow(fragmentize), FALSE)] <- rep(percentVal, c(116,84,64,108,25,36,104,6,17,21,129,70,32,34,18,234,37,14,102,4,5,24,57,19,130,7,22,81,123,9,1,6,4,7,103,22,30,2,17,18,44,176,3,12,71,7,20,52,11,10,7,81,7,6,5,3,45,15,9,116,10,78,5,39,36,7,34,7,44,5,14,58,7,23,386,13,46,1,79,12,18,4,15,6,1009,6,47,55,36,18,15,2,1,2,297,39,6,18,50,33,18,37,632,5,26,28,31,187,15,26,9,1,38,27,9,25,2,4,486,49,11,104,130,6,3,9,6,3,16,5,9,392,96,9,4,7,1,39,35,8,3,12,14,94,309,59,3,15,1,18,85,277,13,6,3,4,68,204,2,7,59,5,19,189,1,440,2,44,109,151,2,45,6,3,131,18,23,17,334,1,103,27,18,2,27,2,75,98,7,19,2,72,1,10,82,17,256,20,17,1,92,2,1,13,71,3,21,13,86,1,16,1,83,103,226,1,26,20,1,63,1,7,9,10,51,2,155,70,11,4,10,2,49,152,9,2,42,9,21,53,33,11,1,101,8,49,1,3,1,2,4,141,9,17,163,44,7,36,121,17,32,6,4,2,26,9,4,72,1,22,70,3,1,4,1,74,24,41,39,30,8,1,27,4,30,1,73,4,21,10,9,8,117,9,65,3,6,24,14,2,4,89,6,2,20,49,40,266,3,4,1,23,1,28,14,17,22,28,20,1,6,58,25,10,4,6,37,168,11,8,3,58,4,99,62,20,22,15,2,20,1,32,3,3,9,4,19,1,7,33,1,18,4,1,13,13,11,38,27,1,20,176,18,10,1,1,15,20,3,21,13,4,49,6,10,22,2,1,12,10,78,7,5,4,13,7,22,5,8,10,72,2,17,1,9,1,13,14,129,21,2,12,1,13,51,12,138,3,3,9,9,6,17,11,13,4,1,6,15,11,1,102,15,2,1,4,5,48,7,12,4,2,2,6,20,9,2,28,25,1,1,12,16,30,12,10,4,3,2,88,13,1,216,13,9,1,3,11,12,9,7,1,1,48,6,2,16,2,1,4,2,12,11,16,11,1,7,67,3,2,1,6,323,23,1,25,5,2,5,57,10,50,5,97,4,4,19,5,2,1,5,5,4,7,4,2,6,4,1,1,2,1,5,2,13,13,1,7,1,6,3,43,3,1,47,8,5,1,179,97,5,10,40,1,5,4,3,11,1,4,2,8,1,1,3,7,5,1,54,1,7,2,3,43,1,1,3,3,1,191,27,1,3,1,19,51,3,3,3,33,4,41,2,15,2,2,6,114,1,1,1,2,2,13,1,1,3,1,1,3,3,1,1,107,2,7,2,10,2,1,1,30,1,42,1,1,67,1,1,11,3,48,32,1,4,2,58,1,1,149,2,17,1,40,97,1,2,6,1,20,1,1,28,127,30,1,1,14,13,5,84,5,2,4,1,86,4,13,15,18,18,11,8,3,1,12,49,92,5,3,2,100,12,81,1,6,64,7,15,6,20,13,82,46,19,26,7,67,2,6,9,1,29,3,1,10,2,64,5,18,107,203,9,2,2,101,52,2,6,1,70,7,10,86,1,1,1,7,1,15,1,1,51,4,44,5,15,2,50,79,27,12,64,1,17,32,54,44,114,1,34,4,12,96,56,1,31,5,1,3,37,4,130,5,4,3,1,26,2,20,41,9,1,37,4,18,1,2,157,30,5,5,27,6,30,1,20,42,1,51,34,7,12,16,1,57,1,1,40,29,1,37,36,32,2,14,43,3,4,10,2,2,17,36,27,10,1,53,101,111,1,10,56,2,1,43,152,8,103,2,29,2,44,2,18,44,87,49,5,43,16,13,1,53,26,30,6,17,7,2,24,36,4,41,2,17,1,24,1,7,5,8,14,1,38,45,14,38,22,10,2,11,8,13,3,28,1,19,1,18,14,15,2,26,2,15,27,1,3,22,28,49,10,2,1,20,22,77,1,2,4,122,1,1,44,1,14,15,1,70,4,4,25,54,10,34,13,17,2,2,23,30,13,1,2,10,15,1,14,30,23,1,1,21,15,12,1,13,2,1,6,26,3,12,1,62,10,15,3,21,34,14,3,10,2,35,18,6,1,90,44,1,1,172,3,7,79,13,37,2,13,23,8,2,10,59,1,12,1,107,6,11,9,25,4,1,2,26,14,18,8,322,1,19,1,6,162,1,9,19,2,9,5,3,12,50,4,16,9,5,34,14,10,2,1,46,40,15,3,13,55,20,93,8,1,2,25,8,7,58,14,17,3,1,7,2,5,3,4,1,131,3,1,2,7,18,45,6,3,12,8,11,18,5,13,7,35,7,1,8,4,5,15,49,6,1,1,80,11,2,5,1,5,19,20,179,22,1,10,1,9,48,111,3,21,1,3,17,20,2,2,2,7,3,6,1,6,8,12,5,5,1,129,1,1,2,10,6,8,16,8,2,2,9,20,1,74,5,42,5,1,1,1,2,14,4,12,9,47,12,38,1,1,3,8,34,1,5,1,4,4,21,2,1,1,14,95,23,14,2,1,90,7,7,32,8,6,1,2,19,12,1,2,7,30,4,1,10,1,2,7,1,7,3,16,1,9,4,3,5,1,76,3,17,8,1,6,70,3,1,11,3,7,27,1,2,40,2,3,7,6,8,3,1,49,14,56,1,17,2,5,5,70,5,13,3,10,2,3,10,1,2,4,5,94,1,3,2,1,5,2,6,4,4,5,6,12,1,16,68,1,4,11,4,4,1,9,1,6,3,9,5,4,50,3,1,12,4,1,5,2,24,35,2,3,2,60,1,3,2,6,3,2,2,9,7,1,11,12,5,4,3,56,7,1,5,1,1,4,1,18,5,1,1,9,159,1,11,2,8,2,3,1,1,9,3,7,2,68,2,5,43,2,4,38,1,5,2,26,1,4,2,1,5,10,1,4,2,1,8,2,6,86,2,2,1,10,3,1,4,10,3,35,17,3,4,14,1,1,17,4,6,39,3,13,50,6,3,3,38,4,1,3,2,26,1,5,28,2,5,1,1,21,1,13,6,2,4,6,13,3,5,9,3,2,1,32,1,8,20,2,2,8,2,2,30,1,9,2,4,4,4,1,13,1,45,2,5,3,1,1,23,12,1,2,1,1,1,26,1,14,1,1,6,1,10,1,10,7,2,2,1,1,1,4,11,4,2,2,1,3,2,19,8,5,4,3,1,1,52,4,1,1,2,3,4,3,1,23,23,2,2,2,1,1,9,6,2,26,1,1,2,2,1,1,1,1,10,4,7,27,4,2,1,1,24,3,3,2,1,3,5,2,4,14,1,1,4,3,2,1,18,1,1,2,4,2,1,5,2,1,5,1,4,1,1,5,1,5,1,1,3,2,1,5,1,3,1,1,1,3,3,2,1,5,1,4,5,4,3,2,1,1,1,4,6,2,1,1,1,9,1,2,1,3,1,1,1,5,5,8,1,1,1,2,6,2,2,4,1,3,2,2,1,9,1,2,4,1,3,25))
rm(ser_noVal,OutcomeVal,yearVal,monthVal,codeVal,nameVal,aidVal,percentVal)
据我所知,OP希望在他的生产数据集中找到类似的记录,这些记录在ser_no
,Outcome
,year
,month
,code
和name
中具有相同的值,并且在Percentage
中具有相等的值(在给定的容差范围内) 。 OP已请求附加除实际行的aid
值之外的任何匹配行的aid
值。
可能的方法是使用data.table
的非equi自联接:
library(data.table)
eps <- 0.01
system.time(
setDT(fragmentize, key = c("ser_no", "Outcome", "year", "month", "code", "name", "aid"))[
, Percentage := as.numeric(Percentage)][
, similar := fragmentize[
.(ser_no = ser_no, Outcome = Outcome, year = year, month = month,
code = code, name = name, aid = aid,
lb = Percentage * (1 - eps), ub = Percentage * (1 + eps)),
on = .(ser_no, Outcome, year, month, code, name,
Percentage >= lb, Percentage <= ub),
by = .EACHI, toString(setdiff(unique(x.aid), i.aid))][, V1]]
)
在我的系统上,OP的测试数据帧II采用了
User System Elapsed 0.61 0.00 0.64
这比OP报告的样本数据集的21分钟更快。
结果,fragmentize
获得了额外的列similar
:
str(fragmentize)
Classes ‘data.table’ and 'data.frame': 35243 obs. of 30 variables: $ col9 : Factor w/ 1 level "A9": 1 1 1 1 1 1 1 1 1 1 ... $ col10 : Factor w/ 1 level "A10": 1 1 1 1 1 1 1 1 1 1 ... $ col11 : Factor w/ 1 level "A11": 1 1 1 1 1 1 1 1 1 1 ... $ col12 : Factor w/ 1 level "A12": 1 1 1 1 1 1 1 1 1 1 ... $ col13 : Factor w/ 1 level "A13": 1 1 1 1 1 1 1 1 1 1 ... $ col4 : Factor w/ 1 level "A14": 1 1 1 1 1 1 1 1 1 1 ... $ col15 : Factor w/ 1 level "A15": 1 1 1 1 1 1 1 1 1 1 ... $ col16 : Factor w/ 1 level "A16": 1 1 1 1 1 1 1 1 1 1 ... $ col7 : Factor w/ 1 level "A17": 1 1 1 1 1 1 1 1 1 1 ... $ col8 : Factor w/ 1 level "A18": 1 1 1 1 1 1 1 1 1 1 ... $ col19 : Factor w/ 1 level "A19": 1 1 1 1 1 1 1 1 1 1 ... $ col20 : Factor w/ 1 level "A20": 1 1 1 1 1 1 1 1 1 1 ... $ col21 : Factor w/ 1 level "A21": 1 1 1 1 1 1 1 1 1 1 ... $ col22 : Factor w/ 1 level "A22": 1 1 1 1 1 1 1 1 1 1 ... $ col23 : Factor w/ 1 level "A23": 1 1 1 1 1 1 1 1 1 1 ... $ col24 : Factor w/ 1 level "A24": 1 1 1 1 1 1 1 1 1 1 ... $ col25 : Factor w/ 1 level "A25": 1 1 1 1 1 1 1 1 1 1 ... $ col26 : Factor w/ 1 level "A26": 1 1 1 1 1 1 1 1 1 1 ... $ col27 : Factor w/ 1 level "A27": 1 1 1 1 1 1 1 1 1 1 ... $ col28 : Factor w/ 1 level "A28": 1 1 1 1 1 1 1 1 1 1 ... $ col29 : Factor w/ 1 level "A29": 1 1 1 1 1 1 1 1 1 1 ... $ ser_no : int 1 1 1 1 1 1 2 2 2 2 ... $ Outcome : chr "Aggressive" "Aggressive" "Aggressive" "Aggressive" ... $ year : int 2015 2015 2016 2017 2015 2016 2014 2014 2015 2015 ... $ month : int 11 11 5 5 2 10 5 10 2 5 ... $ code : chr "A" "B" "B" "B" ... $ name : chr "wt" "Ds" "UF" "Of" ... $ aid : chr "UuaR" "uwIL" "9WAx" "h5eH" ... $ Percentage: num 255 1295 168 549 85 ... $ similar : chr "" "" "" "" ... - attr(*, ".internal.selfref")=<externalptr> - attr(*, "sorted")= chr "ser_no" "Outcome" "year" "month" ...
由于similar
对于绝大多数行都是空的,我们只显示非空行,也只显示相关列。设置键已经对fragmentize
进行了排序,这样可以更容易地验证结果:
fragmentize[similar != "", .(ser_no, Outcome, year, month, code, name, aid,
Percentage, similar)]
ser_no Outcome year month code name aid Percentage similar 1: 13 Aggressive 2016 3 B gZ 21So 525 59PL 2: 13 Aggressive 2016 3 B gZ 59PL 529 21So 3: 15 Aggressive 2017 1 B nt C2i4 1311 uwIL 4: 15 Aggressive 2017 1 B nt uwIL 1323 C2i4 5: 15 Aggressive 2017 6 B Wj hMo4 308 mrDx 6: 15 Aggressive 2017 6 B Wj mrDx 308 hMo4 7: 48 Aggressive 2016 11 B gZ 4LVK 1216 FtSG 8: 48 Aggressive 2016 11 B gZ FtSG 1205 4LVK 9: 48 Aggressive 2017 5 B nt 59PL 85 f1Fh 10: 48 Aggressive 2017 5 B nt f1Fh 85 59PL 11: 48 Aggressive 2017 7 B Wj lVpw 1021 mz3h 12: 48 Aggressive 2017 7 B Wj mz3h 1021 lVpw 13: 252 Aggressive 2016 6 B gZ bkk6 75 spPd 14: 252 Aggressive 2016 6 B gZ spPd 75 bkk6 15: 255 Aggressive 2015 9 B Wj 59PL 29 dceG 16: 255 Aggressive 2015 9 B Wj dceG 29 59PL 17: 265 Aggressive 2017 9 B FB FodL 756 twvT 18: 265 Aggressive 2017 9 B FB twvT 759 FodL 19: 276 Aggressive 2016 11 A gZ 59PL 949 M6sO 20: 276 Aggressive 2016 11 A gZ M6sO 944 59PL 21: 288 Aggressive 2017 6 B gZ 21So 878 Y9gk 22: 288 Aggressive 2017 6 B gZ Y9gk 882 21So 23: 340 Aggressive 2015 7 B nt FtSG 763 kBpV 24: 340 Aggressive 2015 7 B nt kBpV 767 FtSG 25: 340 Aggressive 2016 4 B Ds 21So 731 bkk6 26: 340 Aggressive 2016 4 B Ds bkk6 727 21So 27: 340 Aggressive 2017 10 B nt B4fM 673 M6sO 28: 340 Aggressive 2017 10 B nt M6sO 678 B4fM 29: 340 Neutral 2017 8 A Oa 59PL 872 Vyl1 30: 340 Neutral 2017 8 A Oa Vyl1 872 59PL 31: 340 Neutral 2017 9 B FB 59PL 723 75iU 32: 340 Neutral 2017 9 B FB 75iU 723 59PL 33: 370 Aggressive 2015 6 A gZ 3Xre 132 DWZh 34: 370 Aggressive 2015 6 A gZ DWZh 132 3Xre 35: 370 Aggressive 2016 5 B gZ 1reu 1162 jSL1 36: 370 Aggressive 2016 5 B gZ jSL1 1158 1reu 37: 370 Aggressive 2017 3 B Wj 21So 872 spPd 38: 370 Aggressive 2017 3 B Wj spPd 867 21So 39: 370 Aggressive 2017 4 B FB 0Xza 1547 NXGE 40: 370 Aggressive 2017 4 B FB NXGE 1535 0Xza 41: 379 Aggressive 2015 2 B FB mJAy 133 zQZw 42: 379 Aggressive 2015 2 B FB zQZw 133 mJAy 43: 379 Aggressive 2015 7 B gZ FtSG 201 spPd 44: 379 Aggressive 2015 7 B gZ spPd 201 FtSG 45: 379 Aggressive 2016 8 B Wj 75iU 95 HzTb 46: 379 Aggressive 2016 8 B Wj HzTb 95 75iU 47: 379 Aggressive 2016 9 B gZ F9c3 244 LpB1 48: 379 Aggressive 2016 9 B gZ LpB1 246 F9c3 49: 379 Aggressive 2016 12 B nt 4DGD 507 zYVN 50: 379 Aggressive 2016 12 B nt zYVN 504 4DGD 51: 379 Aggressive 2017 1 B Wj LpB1 85 gzvo 52: 379 Aggressive 2017 1 B Wj gzvo 85 LpB1 53: 379 Aggressive 2017 9 B FB Xo8U 60 hSJN 54: 379 Aggressive 2017 9 B FB hSJN 60 Xo8U 55: 379 Aggressive 2017 9 B Wj 75iU 12 Puss 56: 379 Aggressive 2017 9 B Wj Puss 12 75iU 57: 379 Aggressive 2017 11 B Wj 1reu 817 N7dg, SCPN 58: 379 Aggressive 2017 11 B Wj N7dg 809 SCPN, 1reu 59: 379 Aggressive 2017 11 B Wj SCPN 809 N7dg, 1reu 60: 379 Aggressive 2017 12 B gZ B4fM 17 hMo4 61: 379 Aggressive 2017 12 B gZ hMo4 17 B4fM 62: 379 Neutral 2016 9 B Wj L58K 103 hMo4 63: 379 Neutral 2016 9 B Wj hMo4 103 L58K 64: 379 Neutral 2017 6 B gZ 21So 1016 I46B 65: 379 Neutral 2017 6 B gZ I46B 1012 21So 66: 379 Neutral 2017 9 B Wj 21So 1244 LpB1 67: 379 Neutral 2017 9 B Wj LpB1 1240 21So 68: 379 Neutral 2017 11 B gZ 3Vpo 483 spPd 69: 379 Neutral 2017 11 B gZ spPd 483 3Vpo 70: 393 Aggressive 2015 2 B FB 8SzN 323 cKuN 71: 393 Aggressive 2015 2 B FB cKuN 322 8SzN 72: 458 Aggressive 2015 1 B FB 75iU 972 GWLn 73: 458 Aggressive 2015 1 B FB GWLn 977 75iU 74: 458 Neutral 2017 1 B Wj 21So 483 59PL 75: 458 Neutral 2017 1 B Wj 59PL 483 21So 76: 458 Neutral 2017 6 B iN hMo4 802 spPd 77: 458 Neutral 2017 6 B iN spPd 807 hMo4 78: 526 Aggressive 2017 3 B Wj 4DGD 992 59PL 79: 526 Aggressive 2017 3 B Wj 59PL 991 4DGD 80: 552 Aggressive 2015 7 B Wj 9oyt 95 OWxi 81: 552 Aggressive 2015 7 B Wj OWxi 95 9oyt 82: 552 Aggressive 2017 10 B Ds 59PL 890 9WAx 83: 552 Aggressive 2017 10 B Ds 9WAx 894 59PL 84: 561 Aggressive 2015 1 B gZ f1Fh 949 spPd 85: 561 Aggressive 2015 1 B gZ spPd 952 f1Fh 86: 561 Aggressive 2016 4 B Wj I46B 776 hpRD 87: 561 Aggressive 2016 4 B Wj hpRD 771 I46B 88: 561 Aggressive 2016 8 B gZ eKpA 809 rp75 89: 561 Aggressive 2016 8 B gZ rp75 807 eKpA 90: 561 Aggressive 2016 9 B Wj 4LVK 882 CF4V, M6sO 91: 561 Aggressive 2016 9 B Wj CF4V 878 4LVK, M6sO 92: 561 Aggressive 2016 9 B Wj M6sO 882 CF4V, 4LVK 93: 651 Aggressive 2017 2 B Ds 59PL 179 SCPN 94: 651 Aggressive 2017 2 B Ds SCPN 179 59PL 95: 735 Aggressive 2017 8 B iN M6sO 760 tNgx 96: 735 Aggressive 2017 8 B iN tNgx 758 M6sO 97: 817 Neutral 2016 6 B gZ I46B 197 SCPN 98: 817 Neutral 2016 6 B gZ SCPN 198 I46B ser_no Outcome year month code name aid Percentage similar
从第1行和第2行可以看出,检测到的相似性是对称的,即第1行指向59PL
,而第2行指向21So
。还有两种情况,其中已识别出3个相似的行。
setDT()
将fragmentize
强制转换为data.table
对象,从而在某些列上设置键。这不是连接所必需的,而是对fragmentize
进行排序,这有助于验证结果的正确性。此外,它可以加快加入。Percentage
被强制键入double
以防止在连接期间进行类型转换。在测试数据帧II中,OP创建了Percentage
作为integer
类型,而用于范围连接的下限和上限是double
类型。请注意,Percentage
通过引用或就地更新,即不复制整个数据对象以节省时间和内存。similar
。fragmentize
正好加入了自己选定的专栏。这些使用缩写list
指定为.()
。此外,使用lb
的相对容差,ub
和Percentage
被创建为与eps
近似匹配的下限和上限。on
子句指定了在连接中应该完全匹配的列以及非equi连接条件。 AFAIK,无法在单个列上指定反连接。因此,条件aid != aid
必须以另一种方式对待。by = .EACHI
参数请求为与连接条件匹配的每组行同时加入和聚合。这避免了创建一个包含所有多个匹配的潜在大型中间表。toString(setdiff(unique(x.aid), i.aid))
给出。在多个匹配的情况下,每个aid
值应该只出现一次。然后,setdiff()
从实现OP的要求aid
的结果中删除实际行的aid != aid
值。最后,结果折叠为单个字符串。[, V1]
表达式只提取具有聚合值的列,最终成为新列similar
。R优化的第一步是尽可能多地向量化操作。在这里,我们对应该相同的列的所有比较进行矢量化,并且仅对辅助和百分比执行行方式操作。后者可以通过自我连接和过滤而不是mapply
进行矢量化,但我们已经低于目标速度。
library(dplyr)
start.time <- Sys.time()
fragmentize <- fragmentize %>%
# group by all the columns that should match
group_by(ser_no, Outcome, year, month, code, name) %>%
#row-wise within-group filter for different aid and close percentage
mutate(similar = mapply(function (aid_i, Percentage_i) {
aid[aid != aid_i & abs(Percentage_i - Percentage) <= 1]
}, aid_i = aid, Percentage_i = Percentage, SIMPLIFY = FALSE)) %>%
ungroup %>%
mutate(similar = sapply(similar, paste, collapse = ", "))
cat("Completed in", Sys.time() - start.time)
> Completed in 1.856045
使用您问题中的35K行示例数据集不到2秒。这里的诀窍是,在分组数据帧上的mutate
调用内的任何位置的裸变量名称将评估为仅适用于该组的值的向量,因此mapply
调用执行逐行搜索以将每行的值进行比较其他用于匹配的,但仅在已经被识别为在所有分组变量上匹配的行的较小搜索空间内。
我建议省略最后的mutate
以保持similar
作为列表列而不是折叠字符串以使其更容易使用,但我已经包含了折叠步骤,就像在示例代码中所做的那样,以保持时间可比性。另请注意,您的代码在百分比上的过滤器在+/- 0.01之内,但示例数据在Percentage
中只有整数,所以我做了+/- 1。你想用<= 1
替换<= 0.01
。