我有两个需要匹配的独立数据集 df1 和 df2(两者的行数不同),匹配之后需要把 df2 中的特定值作为列附加上去。我的数据集较大,我很难写出一个循环,将 df2 的每一行与 df1 匹配,然后附加所需的值……
df1 中的数据根据日期与 df2 匹配。每次迭代返回的行数不同,取决于 df1 中与 df2 匹配的记录数。然后,我需要从 df2 中获取包含所需值的相关列,并将它们附加到按日期匹配的输出中。这是一个手动示例(可以正常工作,并能得到我需要的结果):
# Step 1: Get the desired values from df2 that need to be appended to the
# matching output.
df2_values <- df2[, c("trip.start", "trip.end", "b", "ln_b", "c", "ln_c")]

# The date columns hold mm-dd-yyyy strings. Comparing them as text orders by
# month before year, so parse them to Date once and compare those instead.
date_format <- "%m-%d-%Y"
df1_dates <- as.Date(df1$Date, format = date_format)
trip_starts <- as.Date(df2$trip.start, format = date_format)
trip_ends <- as.Date(df2$trip.end, format = date_format)

# Step 2: Find the df1 records inclusive of the dates for the first record in
# df2. which() drops comparisons that are NA (unparseable dates).
in_trip <- df1_dates >= trip_starts[1] & df1_dates <= trip_ends[1]
df1_rec <- df1[which(in_trip), ]

# Step 3: Append the date matching output and the df2 values. The single df2
# row is recycled across every matching df1 row; cbind() errors if df1_rec has
# zero rows.
output1 <- cbind(df1_rec, df2_values[1, ])

# Doing this again for df2, second date
in_trip <- df1_dates >= trip_starts[2] & df1_dates <= trip_ends[2]
df1_rec <- df1[which(in_trip), ]
output2 <- cbind(df1_rec, df2_values[2, ])
我需要对 df2 的 40 行重复此过程,然后用 rbind 将所有输出合并在一起,之后再对 737 个单独的情形重复一遍。到目前为止,这是我尝试过的方法,但都没有成功:
# Attempt #1
# Runs one foreach task per row of df2 and row-binds the per-task results.
# It fails when a trip has no df1 rows inside its date range: subset() then
# returns a 0-row data frame, and cbind() cannot combine that with the 1-row
# df2_values[j, ] ("differing number of rows: 0, 1").
library(foreach)
output = foreach (j=1:nrow(df2), .combine=rbind) %do% {
cbind(subset(df1, Date >= df2$trip.start[j] & Date <= df2$trip.end[j]), df2_values[j,]) }
Error in { :
task 7 failed - "arguments imply differing number of rows: 0, 1"
# Attempt #2
for (j in 1:nrow(df2)) {
df1_rec = subset(df1, Date >= df2$trip.start[j] & Date <= df2$trip.end[j])
output = cbind(df1_rec, df2_values[j,]) }
Error in data.frame(..., check.names = FALSE) :
arguments imply differing number of rows: 0, 1
# Attempt #3
library(foreach)
output = foreach(i=1:nrow(df2), .combine=rbind) %do% {
for (j in 1:nrow(df2)) {
df1_rec = subset(df1, Date >= df2$trip.start[j] & Date <= df2$trip.end[j])
df2_rec = df2_values[j,]
cbind(df1_rec, df2_rec)}
}
Error in { :
task 1 failed - "arguments imply differing number of rows: 0, 1"
这是我的可用数据:
# df1: 30 rows. Date is a character column holding mm-dd-yyyy strings (not
# Date objects); LONGITUDE and LATITUDE are numeric.
> dput(df1)
structure(list(Date = c("01-06-2008", "01-12-2008", "01-12-2008",
"03-29-2008", "03-29-2008", "03-30-2008", "03-30-2008", "03-30-2008",
"04-03-2008", "04-03-2008", "04-22-2008", "04-22-2008", "04-25-2008",
"04-25-2008", "04-26-2008", "04-26-2008", "06-01-2010", "06-02-2010",
"06-02-2010", "06-02-2010", "06-11-2010", "06-13-2010", "06-19-2010",
"06-25-2010", "06-26-2010", "09-06-2010", "09-10-2010", "09-11-2010",
"09-11-2010", "09-11-2010"), LONGITUDE = c(-83.93867, -84.11667,
-84.158, -83.89933, -83.90333, -84.03533, -84.038, -84.01267,
-84.14267, -84.18933, -84.01267, -84.02733, -83.49867, -83.98133,
-84.194, -84.148, -83.99933, -83.75733, -83.76333, -83.802, -83.79333,
-83.836, -83.78733, -83.92333, -84.23267, -83.79867, -84.15,
-84.07533, -84.082, -84.10333), LATITUDE = c(29.38467, 29.262,
29.19333, 28.89467, 28.866, 29.05933, 29.1, 29.074, 29.056, 29.01267,
29.52467, 29.488, 29.62267, 29.05867, 29.17067, 29.208, 29.29933,
29.08333, 29.05, 29.05267, 29.114, 29.09933, 29.18533, 29.04333,
28.83533, 29.16133, 29.256, 29.254, 29.30867, 29.32667)), .Names = c("Date",
"LONGITUDE", "LATITUDE"), row.names = c(1L, 20L, 21L, 211L, 212L,
213L, 214L, 215L, 249L, 250L, 339L, 340L, 367L, 368L, 369L, 370L,
72196L, 72197L, 72198L, 72199L, 72229L, 72237L, 72261L, 72302L,
72303L, 72456L, 72493L, 72494L, 72495L, 72496L), class = "data.frame")
# df2: 20 rows. trip.start and trip.end are character columns holding
# mm-dd-yyyy strings (not Date objects); b, ln_b, c and ln_c are numeric.
> dput(df2)
structure(list(trip.start = c("01-06-2008", "01-12-2008", "03-29-2008",
"04-03-2008", "04-21-2008", "04-25-2008", "05-30-2008", "06-12-2008",
"06-27-2008", "12-19-2008", "12-20-2008", "12-28-2008", "01-02-2009",
"05-08-2009", "05-30-2009", "06-05-2009", "06-11-2009", "06-14-2009",
"06-19-2009", "07-11-2009"), trip.end = c("01-06-2008", "01-12-2008",
"03-30-2008", "04-04-2008", "04-22-2008", "04-26-2008", "05-31-2008",
"06-13-2008", "06-28-2008", "12-19-2008", "12-20-2008", "12-28-2008",
"01-03-2009", "05-09-2009", "05-30-2009", "06-06-2009", "06-12-2009",
"06-16-2009", "06-20-2009", "07-11-2009"), b = c(29, 80.2, 50.3589743589744,
34.6666666666667, 46.4117647058824, 44.0555555555556, 28.6511627906977,
34.7179487179487, 35.4871794871795, 37.3846153846154, 34.5555555555556,
103.764705882353, 25.6875, 25.9333333333333, 34, 40.3529411764706,
28.3448275862069, 25.1764705882353, 30.6896551724138, 47.3333333333333
), ln_b = c(3.36729582998647, 4.38452351487247, 3.91917684278476,
3.54577861047326, 3.83755297678966, 3.78545146373868, 3.3551940283999,
3.54725680734257, 3.56917149004797, 3.62125926643896, 3.54256833484301,
4.64212589251052, 3.24600449225645, 3.25552914251624, 3.52636052461616,
3.69766428366967, 3.34444456506971, 3.22590985152558, 3.42392563273971,
3.85721476893315), c = c(680.8023262, 2656.955794, 7593.859146,
4124.485734, 5422.506164, 5251.09887, 3739.569761, 3577.070057,
3572.918367, 1604.339982, 2090.724546, 5934.445367, 2302.743139,
2574.448603, 1095.623765, 3377.839478, 2109.856165, 3065.508244,
2118.826924, 1424.613609), ln_c = c(6.52327199411584, 7.88493630806025,
8.93509519255547, 8.32469662045131, 8.59831337944543, 8.56619264225512,
8.22672584662552, 8.18229932447399, 8.18113801057517, 7.38046772481922,
7.64526595760902, 8.68852885145536, 7.74185636038285, 7.85339065541043,
6.99907912846437, 8.12499157640442, 7.65437505590011, 8.02796865648309,
7.65861787678239, 7.26165590435557)), .Names = c("trip.start",
"trip.end", "b", "ln_b", "c", "ln_c"), row.names = c(16699L,
16724L, 17055L, 17081L, 17162L, 17187L, 17383L, 17461L, 17558L,
18176L, 18191L, 18223L, 18255L, 18732L, 18832L, 18888L, 18941L,
18979L, 19015L, 19124L), class = "data.frame")
您可以使用 apply 系列函数:
# Match every row of df2 (a trip) against df1 by date and build one combined
# data frame, df12, holding each matching df1 row with that trip's df2 values
# appended as extra columns. Trips with no matching df1 rows contribute
# nothing; df12 is NULL if no trip matches at all.

# The date columns hold mm-dd-yyyy strings. Comparing them as text orders by
# month before year (e.g. "06-11-2010" would fall inside the trip
# "06-11-2009".."06-12-2009"), so parse them to Date once and compare those.
date_format <- "%m-%d-%Y"
df1_dates <- as.Date(df1$Date, format = date_format)
trip_starts <- as.Date(df2$trip.start, format = date_format)
trip_ends <- as.Date(df2$trip.end, format = date_format)

# Iterate over row indices with lapply() rather than apply(df2, 1, ...):
# apply() converts the data frame to a character matrix first, which would
# turn the numeric columns b, ln_b, c and ln_c into strings.
df12 <- lapply(seq_len(nrow(df2)), function(j) {
  # which() drops comparisons that are NA (unparseable dates).
  matches <- which(df1_dates >= trip_starts[j] & df1_dates <= trip_ends[j])
  if (length(matches) == 0L) {
    # cbind() cannot combine a 0-row data frame with the 1-row trip record,
    # so skip trips without matches; rbind() ignores the NULL below.
    return(NULL)
  }
  # The single df2 row is recycled across all matching df1 rows and keeps its
  # original column types.
  cbind(df1[matches, , drop = FALSE], df2[j, , drop = FALSE])
})

# Now bind all the data frames in the list above into a single data.frame.
df12 <- do.call(rbind, df12)
使用 dplyr 有更好的方法(值得学习!),但是如果您想坚持使用基础 R(base),那么以上就是您可以做到的一种方法。