假设我有以下两个数据框:
set.seed(123)
df_1 <- data.frame(
name_1 = c("john", "david", "alex", "kevin", "trevor", "xavier", "tom", "michael", "troy", "kelly", "chris", "henry", "taylor", "ryan", "peter"),
lon = rnorm(15, mean = -74.0060, sd = 0.01),
lat = rnorm(15, mean = 40.7128, sd = 0.01)
)
df_2 <- data.frame(
name_2 = c("matthew", "tyler", "sebastian", "julie", "anna", "tim", "david", "nigel", "sarah", "steph", "sylvia", "boris", "theo", "malcolm"),
lon = rnorm(14, mean = -74.0060, sd = 0.01),
lat = rnorm(14, mean = 40.7128, sd = 0.01)
)
我的问题:对于
df_1
中的每个人,我试图找出df_2
中与此人最接近的5个人(半正矢距离),并记录各种距离统计数据(例如平均值,中位数,最大值,最小值)标准差)。
尝试
首先,我定义了距离函数:
library(geosphere)
haversine_distance <- function(lon1, lat1, lon2, lat2) {
distHaversine(c(lon1, lat1), c(lon2, lat2))
}
然后,我计算了
df_1
中每个人与df_2
中所有人之间的距离:
# Create a matrix to store results
distances <- matrix(nrow = nrow(df_1), ncol = nrow(df_2))
# calculate the distances
for (i in 1:nrow(df_1)) {
for (j in 1:nrow(df_2)) {
distances[i, j] <- haversine_distance(df_1$lon[i], df_1$lat[i], df_2$lon[j], df_2$lat[j])
}
}
# Create final
final <- data.frame(
name_1 = rep(df_1$name_1, each = nrow(df_2)),
lon_1 = rep(df_1$lon, each = nrow(df_2)),
lat_1 = rep(df_1$lat, each = nrow(df_2)),
name_2 = rep(df_2$name_2, nrow(df_1)),
lon_2 = rep(df_2$lon, nrow(df_1)),
lat_2 = rep(df_2$lat, nrow(df_1)),
distance = c(distances)
)
最后,对于df_1中的每个人,我保留了5个最小距离,并记录了距离统计:
# Keep only first 5 rows for each unique value of final$name_1
final <- final[order(final$name_1, final$distance), ]
final <- final[ave(final$distance, final$name_1, FUN = seq_along) <= 5, ]
# Calculate summary statistics for each unique person in final$name_1
final_summary <- aggregate(distance ~ name_1,
data = final,
FUN = function(x) c(min = min(x),
max = max(x),
mean = mean(x),
median = median(x),
sd = sd(x)))
final_summary <- do.call(data.frame, final_summary)
names(final_summary)[-(1)] <- c("min_distance", "max_distance", "mean_distance", "median_distance", "sd_distance")
final_summary$closest_people <- tapply(final$name_2,
final$name_1,
FUN = function(x) paste(sort(x), collapse = ", "))
# break closest_people column into multiple columns
n <- 5
closest_people_split <- strsplit(final_summary$closest_people, ", ")
final_summary[paste0("closest_", seq_len(n))] <- do.call(rbind, closest_people_split)
最终结果如下:
name_1 min_distance max_distance mean_distance median_distance sd_distance closest_people closest_1 closest_2 closest_3 closest_4 closest_5
1 alex 342.8375 1158.1408 717.0810 650.9167 358.7439 boris, david, matthew, nigel, sarah boris david matthew nigel sarah
2 chris 195.4891 1504.8199 934.6618 895.8301 489.5175 boris, david, malcolm, nigel, steph boris david malcolm nigel steph
3 david 549.4500 830.2758 716.3839 807.6626 143.9571 matthew, sarah, steph, sylvia, tim matthew sarah steph sylvia tim
4 henry 423.1875 975.1733 639.5657 560.1101 223.2389 anna, boris, matthew, sebastian, tim anna boris matthew sebastian tim
5 john 415.8956 1174.1631 849.4313 965.2928 313.2616 boris, julie, matthew, theo, tyler boris julie matthew theo tyler
6 kelly 489.7949 828.5550 657.5908 658.7015 120.6485 david, julie, matthew, sebastian, steph david julie matthew sebastian steph
我的问题:虽然这段代码似乎运行没有错误,但我感觉当
df_1
和df_2
的大小开始增长时,这段代码将开始需要很长时间才能运行。因此,我正在寻找提高此代码效率的方法。有人可以建议大型数据框的例程吗?
解决此问题的 data.table 方法可能如下:
funcs <- function(d,n) {
c(setNames(lapply(c(min,max,mean,median,sd), \(f) f(d)), c("min", "max", "mean", "median", "sd")),
list("names" = paste0(n, collapse=", "))
)
}
library(data.table)
setDT(cross_join(df_1, df_2))[
,dist:=distHaversine(c(lon.x, lat.x), c(lon.y, lat.y)), .(name_1, name_2)
][order(dist), .SD[1:5, funcs(dist, name_2)], name_1]
输出:
name_1 min max mean median sd names
1: taylor 170.5171 746.6206 470.0857 439.8022 227.39141 david, tim, nigel, sarah, sebastian
2: peter 195.4891 1455.0204 834.2543 830.2758 539.69009 steph, boris, matthew, anna, david
3: tom 243.6729 530.4778 426.2490 447.8639 110.26649 tim, sebastian, julie, nigel, david
4: ryan 342.8375 1243.7473 970.0721 1052.6759 367.08513 tyler, julie, sebastian, sylvia, nigel
5: henry 394.8684 894.5358 647.1996 670.9220 236.69562 anna, matthew, david, steph, boris
6: john 423.1875 1948.9521 1106.4374 1052.8789 674.69139 boris, steph, matthew, anna, david
7: kelly 491.6430 1130.9239 717.7716 658.7015 248.96974 sylvia, tyler, sarah, nigel, julie
8: trevor 520.1834 650.9167 609.4363 631.9494 52.96026 nigel, sarah, julie, tim, sebastian
9: troy 549.4500 1035.0599 782.8799 828.5550 220.72034 tyler, sylvia, sarah, nigel, theo
10: michael 581.9209 1504.5642 1057.1773 1012.5247 378.81712 theo, tyler, sylvia, sarah, nigel
11: david 602.9369 941.3102 752.1558 715.3872 159.37550 nigel, sarah, david, sylvia, anna
12: kevin 638.9259 834.5504 715.5252 644.2898 102.23793 matthew, anna, david, nigel, steph
13: xavier 972.9730 1767.1953 1369.5604 1396.8569 371.03190 julie, sebastian, tim, tyler, david
14: chris 1389.1659 2106.7084 1644.0448 1455.8430 316.31565 julie, tyler, sebastian, tim, theo
15: alex 1765.7750 2428.5429 2013.7843 1828.6055 294.37805 julie, tyler, sebastian, tim, theo
使用 dplyr 的另一种方法是使用
cross_join
和 rowwise()
来获取距离,然后使用 slice_head(n=5, by=name_1)
通过 name_1
来获取五个最小距离,然后重新构建或总结通常的方法:
cross_join(df_1, df_2) %>%
rowwise() %>%
mutate(dist=distHaversine(c(lon.x, lat.x), c(lon.y, lat.y))) %>%
ungroup() %>%
arrange(dist) %>%
slice_head(n = 5, by=name_1) %>%
reframe(
min_distance = min(dist),
max_distance = max(dist),
mean_distance=mean(dist),
median_distance=median(dist),
sd_distance = sd(dist),
names = paste0(name_2, collapse=","),
.by=name_1
)
输出:
# A tibble: 15 × 7
name_1 min_distance max_distance mean_distance median_distance sd_distance names
<chr> <dbl> <dbl> <dbl> <dbl> <dbl> <chr>
1 taylor 171. 747. 470. 440. 227. david,tim,nigel,sarah,sebastian
2 peter 195. 1455. 834. 830. 540. steph,boris,matthew,anna,david
3 tom 244. 530. 426. 448. 110. tim,sebastian,julie,nigel,david
4 ryan 343. 1244. 970. 1053. 367. tyler,julie,sebastian,sylvia,nigel
5 henry 395. 895. 647. 671. 237. anna,matthew,david,steph,boris
6 john 423. 1949. 1106. 1053. 675. boris,steph,matthew,anna,david
7 kelly 492. 1131. 718. 659. 249. sylvia,tyler,sarah,nigel,julie
8 trevor 520. 651. 609. 632. 53.0 nigel,sarah,julie,tim,sebastian
9 troy 549. 1035. 783. 829. 221. tyler,sylvia,sarah,nigel,theo
10 michael 582. 1505. 1057. 1013. 379. theo,tyler,sylvia,sarah,nigel
11 david 603. 941. 752. 715. 159. nigel,sarah,david,sylvia,anna
12 kevin 639. 835. 716. 644. 102. matthew,anna,david,nigel,steph
13 xavier 973. 1767. 1370. 1397. 371. julie,sebastian,tim,tyler,david
14 chris 1389. 2107. 1644. 1456. 316. julie,tyler,sebastian,tim,theo
15 alex 1766. 2429. 2014. 1829. 294. julie,tyler,sebastian,tim,theo
工作进行中
此解决方案既不比用户@langtang给出的one更简洁也更快,但提请注意
geosphere:.distm()
以及{Rfast}
、{psych}
和{collpase}
。
(1) 计算距离矩阵 (
MD
)
MD = geosphere::distm(df_1[-1], df_2[-1], fun = geosphere::distHaversine)
(2) 对于每一行(来自
df_1
的点),找到前五个最近的点(存储在矩阵 X
中)
rowMins = \(D, k) matrix(D[order(row(D), D)], ncol = ncol(D), byrow = TRUE)[, k]
X = rowMins(MD, 1:5)
(3) 按行计算汇总特征 (
S
)
即 min、max、mean、median 和 sd。
虽然被认为相对较慢,但诀窍是首先
t
转置 X
,这样我们就可以使用完善的函数。不幸的是,summary(t(X))
缺少sd,而collapse::qsut(t(X))
缺少median。如果有选项请评论。
S = cbind(collapse::qsu(t(X))[, -1], Median = Rfast::rowMedians(X))
这已经产生了开销,因为
qsu()
和 rowMedians()
都运行在每一行上。另一种选择可能是
psych::describe(t(X), skew = FALSE)[3:7] |> # S2
`row.names<-`(df_1$name_1)
给予
mean sd median min max
john 1877.49 526.79 2086.03 965.66 2241.15
david 763.62 160.63 831.82 562.65 910.51
alex 1518.81 192.12 1561.73 1225.09 1721.88
kevin 892.37 290.81 922.51 582.00 1236.57
trevor 623.79 226.16 592.48 359.11 857.30
xavier 741.49 130.98 677.02 621.83 932.36
tom 530.70 189.60 597.44 205.96 663.98
michael 1109.90 146.69 1097.67 893.32 1295.68
troy 861.05 188.89 801.55 616.48 1059.15
kelly 802.43 291.93 800.38 432.64 1118.09
chris 1184.69 233.42 1233.05 840.16 1457.28
henry 963.14 257.45 994.43 649.05 1337.23
taylor 594.71 386.41 757.00 118.70 1003.30
ryan 720.59 217.00 772.56 407.34 957.74
peter 1333.79 552.69 1509.01 374.18 1718.11
可能有一个选项来指定应计算哪些汇总统计数据。我没有阅读完整的文档。
但是,所有这些并没有真正的帮助,因为您也希望名称与五个最近的点相关联。这里发生了很多开销。
X2 = t(apply(MD, 1, \(i) names(sort(i)[1:5])))
# collapse::dapply(MD, \(i) names(sort(i)[1:5])), 1) does not work
旁注。在我看来,没有比以下更好的基本 R 解决方案了:
f = \(X, k) t(apply(X, 1, \(i) names(sort(i)[1:k])))
终于给了
> cbind(data.frame(S), data.frame(X2))
Mean SD Min Max Median X1 X2 X3 X4 X5
john 1879.5928 527.3832 966.7430 2243.6565 2088.3664 steph tyler malcolm boris tim
david 764.4733 160.8068 563.2785 911.5332 832.7494 steph tyler sebastian tim malcolm
alex 1520.5110 192.3321 1226.4584 1723.8043 1563.4727 anna matthew theo sylvia david
kevin 893.3704 291.1345 582.6504 1237.9577 923.5420 tyler steph tim sebastian boris
trevor 624.4905 226.4107 359.5081 858.2593 593.1462 david sebastian tim tyler matthew
xavier 742.3244 131.1247 622.5262 933.4064 677.7796 anna matthew sylvia sarah julie
tom 531.2896 189.8111 206.1902 664.7259 598.1114 tim tyler julie david sebastian
michael 1111.1394 146.8538 894.3224 1297.1271 1098.9009 nigel theo malcolm sebastian david
troy 862.0110 189.0964 617.1660 1060.3376 802.4499 sebastian david malcolm nigel theo
kelly 803.3236 292.2534 433.1232 1119.3422 801.2745 sebastian david malcolm tim theo
chris 1186.0200 233.6779 841.1021 1458.9119 1234.4328 anna matthew theo david sarah
henry 964.2193 257.7372 649.7776 1338.7294 995.5426 tyler steph tim boris sebastian
taylor 595.3773 386.8399 118.8290 1004.4187 757.8502 tyler tim sebastian julie sarah
ryan 721.3972 217.2475 407.7949 958.8076 773.4229 david theo anna nigel matthew
peter 1335.2818 553.3045 374.5971 1720.0334 1510.6984 steph tyler malcolm sebastian tim
注意
可重复格式的数据。
df_1 = structure(list(
name_1 = c(
"john",
"david",
"alex",
"kevin",
"trevor",
"xavier",
"tom",
"michael",
"troy",
"kelly",
"chris",
"henry",
"taylor",
"ryan",
"peter"
),
lon = c(
-74.0116047564655,
-74.0083017748948,
-73.9904129168585,
-74.0052949160858,
-74.0047071226484,
-73.9888493501312,
-74.0013908379401,
-74.0186506123461,
-74.0128685285189,
-74.010456619701,
-73.9937591820256,
-74.0024018617294,
-74.001992285494,
-74.0048931728406,
-74.0115584113475
),
lat = c(
40.730669131368,
40.7177785047823,
40.6931338284337,
40.7198135590156,
40.7080720859227,
40.7021217629401,
40.7106202508534,
40.7025399555169,
40.7055110877071,
40.7065496073215,
40.6959330668926,
40.721177870445,
40.7143337311784,
40.7014186306299,
40.7253381492107
)
),
class = "data.frame",
row.names = c(NA, -15L))
df_2 = structure(list(
name_2 = c(
"matthew",
"tyler",
"sebastian",
"julie",
"anna",
"tim",
"david",
"nigel",
"sarah",
"steph",
"sylvia",
"boris",
"theo",
"malcolm"
),
lon = c(
-73.9950316098685,
-74.0016481850917,
-74.0092593158553,
-73.9945119238155,
-73.9960649614404,
-74.0005160304049,
-74.0036126826489,
-74.0122790607604,
-73.9923934755147,
-74.0120025958715,
-73.9841266700698,
-73.9906738937381,
-74.008357003591,
-74.0162642090031
),
lat = c(
40.705695934363,
40.7153688370916,
40.7103330812154,
40.709324574006,
40.7032838143273,
40.7123497227519,
40.7049509553054,
40.6961205806341,
40.7089977347971,
40.7219899660906,
40.7070465303739,
40.7188796432223,
40.6966211729171,
40.7122443803448
)
),
class = "data.frame",
row.names = c(NA, -14L))