我想计算该数据集中 chla 变量的连续两个月中位数。我创建了一个 ID 键(ID = 当前月份,ID2 = 连续月份)来帮助计算。我尝试使用
lead()
和 median()
,但是我使用中值函数的方式有些不正确。如果可能的话,我更喜欢 dplyr 方法,但对任何方法都持开放态度。
注意:5月之前或10月之后没有样品。
数据集:
df <- structure(list(Date = structure(c(1495166400, 1498536000, 1499659200,
1503288000, 1504843200, 1507003200, 1526270400, 1528862400, 1531800000,
1533528000, 1536552000, 1540180800, 1558324800, 1560139200, 1562040000,
1565668800, 1568174400, 1570680000, 1588564800, 1592193600, NA,
1596427200, 1599537600, 1602043200, 1621224000, 1624334400, 1626753600,
1629086400, 1631592000, 1634702400, 1652673600, 1656302400, 1658721600,
1661745600, 1663560000, 1666843200), tzone = "", class = c("POSIXct",
"POSIXt")), Month = c(5, 6, 7, 8, 9, 10, 5, 6, 7, 8, 9, 10, 5,
6, 7, 8, 9, 10, 5, 6, 7, 8, 9, 10, 5, 6, 7, 8, 9, 10, 5, 6, 7,
8, 9, 10), Year = c(2017L, 2017L, 2017L, 2017L, 2017L, 2017L,
2018L, 2018L, 2018L, 2018L, 2018L, 2018L, 2019L, 2019L, 2019L,
2019L, 2019L, 2019L, 2020L, 2020L, 2020L, 2020L, 2020L, 2020L,
2021L, 2021L, 2021L, 2021L, 2021L, 2021L, 2022L, 2022L, 2022L,
2022L, 2022L, 2022L), ID = c("5_2017", "6_2017", "7_2017", "8_2017",
"9_2017", "10_2017", "5_2018", "6_2018", "7_2018", "8_2018",
"9_2018", "10_2018", "5_2019", "6_2019", "7_2019", "8_2019",
"9_2019", "10_2019", "5_2020", "6_2020", "7_2020", "8_2020",
"9_2020", "10_2020", "5_2021", "6_2021", "7_2021", "8_2021",
"9_2021", "10_2021", "5_2022", "6_2022", "7_2022", "8_2022",
"9_2022", "10_2022"), ID2 = c("6_2017", "7_2017", "8_2017", "9_2017",
"10_2017", "11_2017", "6_2018", "7_2018", "8_2018", "9_2018",
"10_2018", "11_2018", "6_2019", "7_2019", "8_2019", "9_2019",
"10_2019", "11_2019", "6_2020", "7_2020", "8_2020", "9_2020",
"10_2020", "11_2020", "6_2021", "7_2021", "8_2021", "9_2021",
"10_2021", "11_2021", "6_2022", "7_2022", "8_2022", "9_2022",
"10_2022", "11_2022"), chla = c(34, 34, 34, 34, 92.5, 156, 34,
34, 20, 34, 34, 34, 34, 34, 34, 34, 34, 176, 34, 34, NA, 34,
34, 34, 34, 34, 34, 34, 34, 30, 34, 34, 34, 34, 34, 151)), out.attrs = list(
dim = c(6L, 6L), dimnames = list(Var1 = c("Var1= 5", "Var1= 6",
"Var1= 7", "Var1= 8", "Var1= 9", "Var1=10"), Var2 = c("Var2=2017",
"Var2=2018", "Var2=2019", "Var2=2020", "Var2=2021", "Var2=2022"
))), class = "data.frame", row.names = c(NA, -36L))
我的做法:
Trial <- df %>%
mutate(Consec_2month_median =
case_when(ID2 == lead(ID) ~ median(chla, lead(chla), na.rm = TRUE)))
所需输出:
Desired <- structure(list(Date = structure(c(1495166400, 1498536000, 1499659200,
1503288000, 1504843200, 1507003200, 1526270400, 1528862400, 1531800000,
1533528000, 1536552000, 1540180800, 1558324800, 1560139200, 1562040000,
1565668800, 1568174400, 1570680000, 1588564800, 1592193600, NA,
1596427200, 1599537600, 1602043200, 1621224000, 1624334400, 1626753600,
1629086400, 1631592000, 1634702400, 1652673600, 1656302400, 1658721600,
1661745600, 1663560000, 1666843200), class = c("POSIXct", "POSIXt"
), tzone = ""), Month = c(5, 6, 7, 8, 9, 10, 5, 6, 7, 8, 9, 10,
5, 6, 7, 8, 9, 10, 5, 6, 7, 8, 9, 10, 5, 6, 7, 8, 9, 10, 5, 6,
7, 8, 9, 10), Year = c(2017, 2017, 2017, 2017, 2017, 2017, 2018,
2018, 2018, 2018, 2018, 2018, 2019, 2019, 2019, 2019, 2019, 2019,
2020, 2020, 2020, 2020, 2020, 2020, 2021, 2021, 2021, 2021, 2021,
2021, 2022, 2022, 2022, 2022, 2022, 2022), ID = c("5_2017", "6_2017",
"7_2017", "8_2017", "9_2017", "10_2017", "5_2018", "6_2018",
"7_2018", "8_2018", "9_2018", "10_2018", "5_2019", "6_2019",
"7_2019", "8_2019", "9_2019", "10_2019", "5_2020", "6_2020",
"7_2020", "8_2020", "9_2020", "10_2020", "5_2021", "6_2021",
"7_2021", "8_2021", "9_2021", "10_2021", "5_2022", "6_2022",
"7_2022", "8_2022", "9_2022", "10_2022"), ID2 = c("6_2017", "7_2017",
"8_2017", "9_2017", "10_2017", "11_2017", "6_2018", "7_2018",
"8_2018", "9_2018", "10_2018", "11_2018", "6_2019", "7_2019",
"8_2019", "9_2019", "10_2019", "11_2019", "6_2020", "7_2020",
"8_2020", "9_2020", "10_2020", "11_2020", "6_2021", "7_2021",
"8_2021", "9_2021", "10_2021", "11_2021", "6_2022", "7_2022",
"8_2022", "9_2022", "10_2022", "11_2022"), chla = c(34, 34, 34,
34, 92.5, 156, 34, 34, 20, 34, 34, 34, 34, 34, 34, 34, 34, 176,
34, 34, NA, 34, 34, 34, 34, 34, 34, 34, 34, 30, 34, 34, 34, 34,
34, 151), Consec_2month_median = c(34, 34, 34, 63.25, 124.25,
NA, 34, 27, 27, 34, 34, NA, 34, 34, 34, 34, 105, NA, 34, NA,
NA, 34, 34, NA, 34, 34, 34, 34, 32, NA, 34, 34, 34, 34, 92.5,
NA)), row.names = c(NA, -36L), class = "data.frame")
在缺少数据或连续月份超出时间范围的情况下,我可以接受将单个值报告为中位数(例如,由于缺少第 11 个月,因此使用第 10 个月的 chla 值而不是 NA)数据)
library(dplyr)
left_join(df, df[,c("ID", "chla")],
by = c("ID2" = "ID"), suffix = c("", "2")) %>%
rowwise() %>%
mutate(Consec_2month_median = median(c(chla, chla2))) %>%
select(-chla2)
#> # A tibble: 36 × 7
#> # Rowwise:
#> Date Month Year ID ID2 chla Consec_2month_median
#> <dttm> <dbl> <int> <chr> <chr> <dbl> <dbl>
#> 1 2017-05-19 00:00:00 5 2017 5_2017 6_2017 34 34
#> 2 2017-06-27 00:00:00 6 2017 6_2017 7_2017 34 34
#> 3 2017-07-10 00:00:00 7 2017 7_2017 8_2017 34 34
#> 4 2017-08-21 00:00:00 8 2017 8_2017 9_2017 34 63.2
#> 5 2017-09-08 00:00:00 9 2017 9_2017 10_2017 92.5 124.
#> 6 2017-10-03 00:00:00 10 2017 10_2017 11_2017 156 NA
#> 7 2018-05-14 00:00:00 5 2018 5_2018 6_2018 34 34
#> 8 2018-06-13 00:00:00 6 2018 6_2018 7_2018 34 27
#> 9 2018-07-17 00:00:00 7 2018 7_2018 8_2018 20 27
#> 10 2018-08-06 00:00:00 8 2018 8_2018 9_2018 34 34
#> # ℹ 26 more rows
创建于 2024 年 10 月 16 日,使用 reprex v2.0.2