我正在尝试对两个数据帧进行列绑定:第一个是具有大量列和约 10,000 行的时间序列。第二个是第一个的导数,具有其他相关向量之间的相关系数。
第二个 df 明显比第一个短得多。两个 dfs 都有
species
和 region
的分组列。
目标:我想将较短的第二个 df 的结果添加到第一个,通过
species
和 region
列进行匹配。
我尝试过
merge
、full_join
和其他 join
函数,但这些函数要么导致 NAs
,要么向第一个 df 添加行。我将如何在 tidy
宇宙中匹配和合并这些 df?
下面是 dfs 和所需输出的最小可重现示例:
set.seed(123)
ebird <- data.frame(
region = c("one", "one", "one", "one", "one",
"one", "one", "one", "one", "one",
"one", "one", "one", "one", "one",
"two", "two", "two", "two", "two",
"two", "two", "two", "two", "two",
"two", "two", "two", "two", "two",
"three", "three", "three", "three", "three",
"three", "three", "three", "three", "three",
"three", "three", "three", "three", "three"),
Species = c("A", "A", "A", "A", "A",
"B", "B", "B", "B", "B",
"C", "C", "C", "C", "C",
"A", "A", "A", "A", "A",
"B", "B", "B", "B", "B",
"C", "C", "C", "C", "C",
"A", "A", "A", "A", "A",
"B", "B", "B", "B", "B",
"C", "C", "C", "C", "C"),
value = sample(seq(from = 1, to = 45, by = 1), replace = TRUE), # abundance
date = seq(from = 1, to = 5, by = 1)
)
cor <- data.frame(
region = c("one", "one", "one",
"two", "two", "two",
"three", "three", "three"),
Species = c("A", "B", "C",
"A", "B", "C",
"A", "B", "C"),
rho = sample(seq(from = -1, to = 1, by = 0.1), size = 9, replace = TRUE),
date_max_value = sample(seq(from = 1, to = 5, by = 1), size = 9, replace = TRUE) # date of max abundance (value)
)
desired_output <- data.frame(
region = c("one", "one", "one", "one", "one",
"one", "one", "one", "one", "one",
"one", "one", "one", "one", "one",
"two", "two", "two", "two", "two",
"two", "two", "two", "two", "two",
"two", "two", "two", "two", "two",
"three", "three", "three", "three", "three",
"three", "three", "three", "three", "three",
"three", "three", "three", "three", "three"),
Species = c("A", "A", "A", "A", "A",
"B", "B", "B", "B", "B",
"C", "C", "C", "C", "C",
"A", "A", "A", "A", "A",
"B", "B", "B", "B", "B",
"C", "C", "C", "C", "C",
"A", "A", "A", "A", "A",
"B", "B", "B", "B", "B",
"C", "C", "C", "C", "C"),
value = sample(seq(from = 1, to = 45, by = 1), replace = TRUE), # abundance
date = seq(from = 1, to = 5, by = 1),
rho = c(rep(-0.6, 5), rep(-0.3, 5), rep(0.1, 5), rep(-0.2, 5), rep(0.7, 5),
rep(-1.0, 5), rep(-0.5, 5), rep(1.0, 5), rep(0.4, 5)),
date_max_value = c(rep(1, 5), rep(2, 5), rep(4, 5), rep(4, 5), rep(3, 5),
rep(1, 5), rep(2, 5), rep(1, 5), rep(2, 5))
)
编辑添加
dput(head(df1))
和dput(head(df2))
以便读者了解数据的结构。注意:我试图在 left_join
列上添加 Species
,而不是 species
列 - 我知道这可能会造成一些混乱,因为我的物种在我的玩具示例中没有大写:
dput(head(ebird))
structure(list(species = c("Mallard", "Mallard", "Mallard", "Mallard",
"Mallard", "Mallard"), week = structure(c(19255, 19262, 19276,
19283, 19290, 19304), class = "Date"), median = c(3.28408646583557,
2.67390370368958, 6.20513391494751, 16.557181596756, 26.7760531902313,
118.688171863556), region = c("Adams", "Adams", "Adams", "Adams",
"Adams", "Adams"), month = c(9, 9, 10, 10, 10, 11), lower = c(2.58288896083832,
2.33652949333191, 5.62750267982483, 15.5113918781281, 24.3405115604401,
102.083569049835), upper = c(3.94177603721619, 2.92097306251526,
7.13815760612488, 17.6045758724213, 29.9772963523865, 141.712877750397
), Date = structure(c(19255, 19262, 19276, 19283, 19290, 19304
), class = "Date"), week1 = c(38, 39, 41, 42, 43, 45), Species = c("MALL",
"MALL", "MALL", "MALL", "MALL", "MALL"), survey = c(0, 0, 0,
0, 0, 100), year = c(2022, 2022, 2022, 2022, 2022, 2022), day = c(20L,
27L, 11L, 18L, 25L, 8L), Date1 = structure(c(19255, 19262, 19276,
19283, 19290, 19304), class = "Date"), difference = c(3.28408646583557,
2.67390370368958, 6.20513391494751, 16.557181596756, 26.7760531902313,
18.6881718635559), max_survey = c(500, 500, 500, 500, 500, 500
), max_ebird = c(231.35857963562, 231.35857963562, 231.35857963562,
231.35857963562, 231.35857963562, 231.35857963562), max_ebird_upper = c(262.414058685303,
262.414058685303, 262.414058685303, 262.414058685303, 262.414058685303,
262.414058685303), max_ebird_lower = c(211.088241577148, 211.088241577148,
211.088241577148, 211.088241577148, 211.088241577148, 211.088241577148
), scaled_ebird = c(0.0141947900570961, 0.0115574002394934, 0.0268204184375627,
0.0715650209420927, 0.115733997124301, 0.513005275406189), scaled_ebird_upper = c(0.0150212075411071,
0.0111311607203873, 0.0272018871316847, 0.0670870149283174, 0.114236624754684,
0.540035387053498), scaled_ebird_lower = c(0.0122360627078999,
0.011068970378807, 0.0266594796459475, 0.0734829745239929, 0.115309651445195,
0.483606136879613), scaled_survey = c(0, 0, 0, 0, 0, 0.2), zscaled_ebird = c(-1.15368075571164,
-1.16038986114502, -1.12156314025646, -1.00773989965177, -0.895380961995726,
0.115214769380754), zscaled_ebird_upper = c(-1.14916660942026,
-1.15919230741443, -1.11777371864876, -1.01497900584015, -0.893461766987318,
0.203936283432415), zscaled_ebird_lower = c(-1.15213092209424,
-1.15506773303076, -1.11583657969332, -0.998012352409944, -0.892761860089624,
0.0340004185595489), zscaled_survey = c(-0.639688641399654, -0.639688641399654,
-0.639688641399654, -0.639688641399654, -0.639688641399654, -0.0156021619853574
)), row.names = c(NA, -6L), class = c("tbl_df", "tbl", "data.frame"
))
> dput(head(cor))
structure(list(Date1 = structure(c(19255, 19241, 19234, 19234,
19234, 19234), class = "Date"), survey = c(0, 0, 0, 0, 0, 0),
Species = c("ABDU ", "ABDU ", "ABDU ", "ABDU ", "ABDU ",
"ABDU "), region = c(" Adams", " Appanoose/Lucas", " Bremer",
" Buena Vista", " Butler", " Calhoun"), Parameter1 = c("scaled_ebird",
"scaled_ebird", "scaled_ebird", "scaled_ebird", "scaled_ebird",
"scaled_ebird"), Parameter2 = c("scaled_survey", "scaled_survey",
"scaled_survey", "scaled_survey", "scaled_survey", "scaled_survey"
), rho = c(NA_real_, NA_real_, NA_real_, NA_real_, NA_real_,
NA_real_), CI = c(0.95, 0.95, 0.95, 0.95, 0.95, 0.95), CI_low = c(NA_real_,
NA_real_, NA_real_, NA_real_, NA_real_, NA_real_), CI_high = c(NA_real_,
NA_real_, NA_real_, NA_real_, NA_real_, NA_real_), S = c(NA_real_,
NA_real_, NA_real_, NA_real_, NA_real_, NA_real_), p = c(NA_real_,
NA_real_, NA_real_, NA_real_, NA_real_, NA_real_), Method = c("NA correlation",
"NA correlation", "NA correlation", "NA correlation", "NA correlation",
"NA correlation"), n_Obs = c(0L, 0L, 0L, 0L, 0L, 0L), state = c("IA",
"IA", "IA", "IA", "IA", "IA")), row.names = c(NA, -6L), class = c("tbl_df",
"tbl", "data.frame"))
您提供的真实数据样本中没有匹配的记录,所以我将
cor$Species
的值更改为“MALL”,保留原始数据中的尾部空格。
我还注意到
cor$region
有一个前导空格。
试试这个:
ebird %>%
inner_join(
cor %>%
mutate(
region = str_trim(region),
Species = str_trim(Species)
),
by = c("region", "Species")
)
它会修剪
cor
中关键列的前导和尾随空格。
这给出了玩具数据的合理结果(尽管
value
列中的值在 ebird
和 desired_output
之间似乎不匹配)。
使用修改后的真实数据,它仅产生 6 行(与
ebird
中的匹配行)并从 cor
中挑选出相应的值。
如果这不起作用,也许您可以提供记录之间存在匹配的真实数据的子集?