用分组向量绑定 R tidyverse 中两个不同长度数据帧的列

Question

我正在尝试对两个数据帧进行列绑定：第一个是具有大量列和约 10,000 行的时间序列。第二个是由第一个派生出来的数据帧，包含其他相关向量之间的相关系数。

第二个 df 明显比第一个短得多。两个 dfs 都有

species

和

region

的分组列。

目标：我想将较短的第二个 df 的结果添加到第一个，通过

species

和

region

列进行匹配。

我尝试过

merge

、

full_join

和其他

join

函数，但这些函数要么导致

NAs

，要么向第一个 df 添加行。我该如何在

tidyverse

中匹配并合并这些 df？

下面是 dfs 和所需输出的最小可重现示例：

# Toy time series: 3 regions x 3 species x 5 dates = 45 rows.
set.seed(123)

ebird <- data.frame(
  # each region label fills 15 consecutive rows
  region = rep(c("one", "two", "three"), each = 15),
  # inside every region: species A, B, C with five rows apiece
  Species = rep(rep(c("A", "B", "C"), each = 5), times = 3),
  # abundance: 45 draws with replacement from 1..45
  value = sample(seq(1, 45, by = 1), replace = TRUE),
  # dates 1..5; data.frame() recycles them across the 45 rows
  date = seq(1, 5, by = 1)
)

# Per-group summary table: one row per region/Species pair (9 rows).
cor <- data.frame(
  region = rep(c("one", "two", "three"), each = 3),
  Species = rep(c("A", "B", "C"), times = 3),
  # correlation coefficient, drawn with replacement from -1, -0.9, ..., 1
  rho = sample(seq(-1, 1, by = 0.1), size = 9, replace = TRUE),
  # date of max abundance (value), drawn with replacement from 1..5
  date_max_value = sample(seq(1, 5, by = 1), size = 9, replace = TRUE)
)

# Target shape: the 45-row layout of the time series with the per-group
# rho / date_max_value repeated on each of the five rows of a group.
desired_output <- data.frame(
  region = rep(c("one", "two", "three"), each = 15),
  Species = rep(rep(c("A", "B", "C"), each = 5), times = 3),
  # abundance: a fresh random draw of 45 values from 1..45
  value = sample(seq(1, 45, by = 1), replace = TRUE),
  # dates 1..5; data.frame() recycles them across the 45 rows
  date = seq(1, 5, by = 1),
  # one value per region/Species group, each held for that group's 5 rows
  rho = rep(c(-0.6, -0.3, 0.1, -0.2, 0.7, -1.0, -0.5, 1.0, 0.4), each = 5),
  date_max_value = rep(c(1, 2, 4, 4, 3, 1, 2, 1, 2), each = 5)
)

编辑添加

dput(head(df1))

和

dput(head(df2))

以便读者了解数据的结构。注意：我试图用

left_join

按

Species

列进行连接，而不是按

species

列——我知道这可能会造成一些混淆，因为在我的玩具示例中 species 没有大写：

dput(head(ebird))
structure(list(species = c("Mallard", "Mallard", "Mallard", "Mallard", 
"Mallard", "Mallard"), week = structure(c(19255, 19262, 19276, 
19283, 19290, 19304), class = "Date"), median = c(3.28408646583557, 
2.67390370368958, 6.20513391494751, 16.557181596756, 26.7760531902313, 
118.688171863556), region = c("Adams", "Adams", "Adams", "Adams", 
"Adams", "Adams"), month = c(9, 9, 10, 10, 10, 11), lower = c(2.58288896083832, 
2.33652949333191, 5.62750267982483, 15.5113918781281, 24.3405115604401, 
102.083569049835), upper = c(3.94177603721619, 2.92097306251526, 
7.13815760612488, 17.6045758724213, 29.9772963523865, 141.712877750397
), Date = structure(c(19255, 19262, 19276, 19283, 19290, 19304
), class = "Date"), week1 = c(38, 39, 41, 42, 43, 45), Species = c("MALL", 
"MALL", "MALL", "MALL", "MALL", "MALL"), survey = c(0, 0, 0, 
0, 0, 100), year = c(2022, 2022, 2022, 2022, 2022, 2022), day = c(20L, 
27L, 11L, 18L, 25L, 8L), Date1 = structure(c(19255, 19262, 19276, 
19283, 19290, 19304), class = "Date"), difference = c(3.28408646583557, 
2.67390370368958, 6.20513391494751, 16.557181596756, 26.7760531902313, 
18.6881718635559), max_survey = c(500, 500, 500, 500, 500, 500
), max_ebird = c(231.35857963562, 231.35857963562, 231.35857963562, 
231.35857963562, 231.35857963562, 231.35857963562), max_ebird_upper = c(262.414058685303, 
262.414058685303, 262.414058685303, 262.414058685303, 262.414058685303, 
262.414058685303), max_ebird_lower = c(211.088241577148, 211.088241577148, 
211.088241577148, 211.088241577148, 211.088241577148, 211.088241577148
), scaled_ebird = c(0.0141947900570961, 0.0115574002394934, 0.0268204184375627, 
0.0715650209420927, 0.115733997124301, 0.513005275406189), scaled_ebird_upper = c(0.0150212075411071, 
0.0111311607203873, 0.0272018871316847, 0.0670870149283174, 0.114236624754684, 
0.540035387053498), scaled_ebird_lower = c(0.0122360627078999, 
0.011068970378807, 0.0266594796459475, 0.0734829745239929, 0.115309651445195, 
0.483606136879613), scaled_survey = c(0, 0, 0, 0, 0, 0.2), zscaled_ebird = c(-1.15368075571164, 
-1.16038986114502, -1.12156314025646, -1.00773989965177, -0.895380961995726, 
0.115214769380754), zscaled_ebird_upper = c(-1.14916660942026, 
-1.15919230741443, -1.11777371864876, -1.01497900584015, -0.893461766987318, 
0.203936283432415), zscaled_ebird_lower = c(-1.15213092209424, 
-1.15506773303076, -1.11583657969332, -0.998012352409944, -0.892761860089624, 
0.0340004185595489), zscaled_survey = c(-0.639688641399654, -0.639688641399654, 
-0.639688641399654, -0.639688641399654, -0.639688641399654, -0.0156021619853574
)), row.names = c(NA, -6L), class = c("tbl_df", "tbl", "data.frame"
))
> dput(head(cor))
structure(list(Date1 = structure(c(19255, 19241, 19234, 19234, 
19234, 19234), class = "Date"), survey = c(0, 0, 0, 0, 0, 0), 
    Species = c("ABDU ", "ABDU ", "ABDU ", "ABDU ", "ABDU ", 
    "ABDU "), region = c(" Adams", " Appanoose/Lucas", " Bremer", 
    " Buena Vista", " Butler", " Calhoun"), Parameter1 = c("scaled_ebird", 
    "scaled_ebird", "scaled_ebird", "scaled_ebird", "scaled_ebird", 
    "scaled_ebird"), Parameter2 = c("scaled_survey", "scaled_survey", 
    "scaled_survey", "scaled_survey", "scaled_survey", "scaled_survey"
    ), rho = c(NA_real_, NA_real_, NA_real_, NA_real_, NA_real_, 
    NA_real_), CI = c(0.95, 0.95, 0.95, 0.95, 0.95, 0.95), CI_low = c(NA_real_, 
    NA_real_, NA_real_, NA_real_, NA_real_, NA_real_), CI_high = c(NA_real_, 
    NA_real_, NA_real_, NA_real_, NA_real_, NA_real_), S = c(NA_real_, 
    NA_real_, NA_real_, NA_real_, NA_real_, NA_real_), p = c(NA_real_, 
    NA_real_, NA_real_, NA_real_, NA_real_, NA_real_), Method = c("NA correlation", 
    "NA correlation", "NA correlation", "NA correlation", "NA correlation", 
    "NA correlation"), n_Obs = c(0L, 0L, 0L, 0L, 0L, 0L), state = c("IA", 
    "IA", "IA", "IA", "IA", "IA")), row.names = c(NA, -6L), class = c("tbl_df", 
"tbl", "data.frame"))

Answer 1

您提供的真实数据样本中没有匹配的记录，所以我将

cor$Species

的值更改为“MALL”，保留原始数据中的尾部空格。

我还注意到

cor$region

有一个前导空格。

试试这个：

# Strip leading/trailing whitespace from both join keys in `cor`, then keep
# only the `ebird` rows that have a matching region/Species pair.
inner_join(
  ebird,
  mutate(cor, across(c(region, Species), str_trim)),
  by = c("region", "Species")
)

它会修剪

cor

中关键列的前导和尾随空格。

这给出了玩具数据的合理结果（尽管

value

列中的值在

ebird

和

desired_output

之间似乎不匹配）。

使用修改后的真实数据，它仅产生 6 行（即

ebird

中匹配的行）并从

cor

中挑选出相应的值。

如果这不起作用，也许您可以提供记录之间存在匹配的真实数据的子集？

用分组向量绑定 R tidyverse 中两个不同长度数据帧的列

问题描述投票：0回答：1

1个回答

最新问题

用分组向量绑定 R tidyverse 中两个不同长度数据帧的列

问题描述 投票：0回答：1

1个回答

最新问题

问题描述投票：0回答：1