如何在 R 中使用 lag()/lead() 计算连续两个月的中位数

问题描述 投票:0回答:1

我想计算该数据集中 chla 变量的连续两个月中位数。我创建了一个 ID 键(ID = 当前月份,ID2 = 下一个连续月份)来帮助计算。我尝试使用 `lead()` 和 `median()`,但是我使用 `median()` 函数的方式有些不正确。如果可能的话,我更喜欢 dplyr 方法,但对任何方法都持开放态度。

注意:5月之前或10月之后没有样品。

数据集:

# Example input (dput output): 36 rows, months 5-10 for each of 2017-2022.
#   Date  - POSIXct sampling time (row 21, "7_2020", is NA)
#   ID    - "<month>_<year>" key of the row's own month
#   ID2   - "<month + 1>_<year>" key of the following month; the October rows
#           point at "11_<year>", which never occurs in ID
#   chla  - measured value (row 21 is NA)
# The out.attrs attribute is presumably left over from expand.grid() and is
# not used by the analysis.
df <- structure(list(Date = structure(c(1495166400, 1498536000, 1499659200, 
1503288000, 1504843200, 1507003200, 1526270400, 1528862400, 1531800000, 
1533528000, 1536552000, 1540180800, 1558324800, 1560139200, 1562040000, 
1565668800, 1568174400, 1570680000, 1588564800, 1592193600, NA, 
1596427200, 1599537600, 1602043200, 1621224000, 1624334400, 1626753600, 
1629086400, 1631592000, 1634702400, 1652673600, 1656302400, 1658721600, 
1661745600, 1663560000, 1666843200), tzone = "", class = c("POSIXct", 
"POSIXt")), Month = c(5, 6, 7, 8, 9, 10, 5, 6, 7, 8, 9, 10, 5, 
6, 7, 8, 9, 10, 5, 6, 7, 8, 9, 10, 5, 6, 7, 8, 9, 10, 5, 6, 7, 
8, 9, 10), Year = c(2017L, 2017L, 2017L, 2017L, 2017L, 2017L, 
2018L, 2018L, 2018L, 2018L, 2018L, 2018L, 2019L, 2019L, 2019L, 
2019L, 2019L, 2019L, 2020L, 2020L, 2020L, 2020L, 2020L, 2020L, 
2021L, 2021L, 2021L, 2021L, 2021L, 2021L, 2022L, 2022L, 2022L, 
2022L, 2022L, 2022L), ID = c("5_2017", "6_2017", "7_2017", "8_2017", 
"9_2017", "10_2017", "5_2018", "6_2018", "7_2018", "8_2018", 
"9_2018", "10_2018", "5_2019", "6_2019", "7_2019", "8_2019", 
"9_2019", "10_2019", "5_2020", "6_2020", "7_2020", "8_2020", 
"9_2020", "10_2020", "5_2021", "6_2021", "7_2021", "8_2021", 
"9_2021", "10_2021", "5_2022", "6_2022", "7_2022", "8_2022", 
"9_2022", "10_2022"), ID2 = c("6_2017", "7_2017", "8_2017", "9_2017", 
"10_2017", "11_2017", "6_2018", "7_2018", "8_2018", "9_2018", 
"10_2018", "11_2018", "6_2019", "7_2019", "8_2019", "9_2019", 
"10_2019", "11_2019", "6_2020", "7_2020", "8_2020", "9_2020", 
"10_2020", "11_2020", "6_2021", "7_2021", "8_2021", "9_2021", 
"10_2021", "11_2021", "6_2022", "7_2022", "8_2022", "9_2022", 
"10_2022", "11_2022"), chla = c(34, 34, 34, 34, 92.5, 156, 34, 
34, 20, 34, 34, 34, 34, 34, 34, 34, 34, 176, 34, 34, NA, 34, 
34, 34, 34, 34, 34, 34, 34, 30, 34, 34, 34, 34, 34, 151)), out.attrs = list(
    dim = c(6L, 6L), dimnames = list(Var1 = c("Var1= 5", "Var1= 6", 
    "Var1= 7", "Var1= 8", "Var1= 9", "Var1=10"), Var2 = c("Var2=2017", 
    "Var2=2018", "Var2=2019", "Var2=2020", "Var2=2021", "Var2=2022"
    ))), class = "data.frame", row.names = c(NA, -36L))

我的做法:

# Asker's non-working attempt, kept exactly as posted (assumes dplyr is
# attached). median() takes ONE vector as `x`: lead(chla) lands in `...` and is
# ignored, so median(chla, ...) is the median of the entire chla column rather
# than of each row's (current, next) pair. Rows where ID2 == lead(ID) is not
# TRUE fall through case_when() and become NA.
Trial <- df %>%
  mutate(Consec_2month_median = 
  case_when(ID2 == lead(ID) ~ median(chla, lead(chla), na.rm = TRUE)))

所需输出:

# Expected result (dput output): the same 36 rows and columns as the input,
# plus Consec_2month_median = median of chla for the row's month and the
# following month. It is NA where the following month is absent (every
# October row) or where either of the two chla values is NA (rows 20 and 21).
# Note that Year is stored as double here, not integer.
Desired <- structure(list(Date = structure(c(1495166400, 1498536000, 1499659200, 
1503288000, 1504843200, 1507003200, 1526270400, 1528862400, 1531800000, 
1533528000, 1536552000, 1540180800, 1558324800, 1560139200, 1562040000, 
1565668800, 1568174400, 1570680000, 1588564800, 1592193600, NA, 
1596427200, 1599537600, 1602043200, 1621224000, 1624334400, 1626753600, 
1629086400, 1631592000, 1634702400, 1652673600, 1656302400, 1658721600, 
1661745600, 1663560000, 1666843200), class = c("POSIXct", "POSIXt"
), tzone = ""), Month = c(5, 6, 7, 8, 9, 10, 5, 6, 7, 8, 9, 10, 
5, 6, 7, 8, 9, 10, 5, 6, 7, 8, 9, 10, 5, 6, 7, 8, 9, 10, 5, 6, 
7, 8, 9, 10), Year = c(2017, 2017, 2017, 2017, 2017, 2017, 2018, 
2018, 2018, 2018, 2018, 2018, 2019, 2019, 2019, 2019, 2019, 2019, 
2020, 2020, 2020, 2020, 2020, 2020, 2021, 2021, 2021, 2021, 2021, 
2021, 2022, 2022, 2022, 2022, 2022, 2022), ID = c("5_2017", "6_2017", 
"7_2017", "8_2017", "9_2017", "10_2017", "5_2018", "6_2018", 
"7_2018", "8_2018", "9_2018", "10_2018", "5_2019", "6_2019", 
"7_2019", "8_2019", "9_2019", "10_2019", "5_2020", "6_2020", 
"7_2020", "8_2020", "9_2020", "10_2020", "5_2021", "6_2021", 
"7_2021", "8_2021", "9_2021", "10_2021", "5_2022", "6_2022", 
"7_2022", "8_2022", "9_2022", "10_2022"), ID2 = c("6_2017", "7_2017", 
"8_2017", "9_2017", "10_2017", "11_2017", "6_2018", "7_2018", 
"8_2018", "9_2018", "10_2018", "11_2018", "6_2019", "7_2019", 
"8_2019", "9_2019", "10_2019", "11_2019", "6_2020", "7_2020", 
"8_2020", "9_2020", "10_2020", "11_2020", "6_2021", "7_2021", 
"8_2021", "9_2021", "10_2021", "11_2021", "6_2022", "7_2022", 
"8_2022", "9_2022", "10_2022", "11_2022"), chla = c(34, 34, 34, 
34, 92.5, 156, 34, 34, 20, 34, 34, 34, 34, 34, 34, 34, 34, 176, 
34, 34, NA, 34, 34, 34, 34, 34, 34, 34, 34, 30, 34, 34, 34, 34, 
34, 151), Consec_2month_median = c(34, 34, 34, 63.25, 124.25, 
NA, 34, 27, 27, 34, 34, NA, 34, 34, 34, 34, 105, NA, 34, NA, 
NA, 34, 34, NA, 34, 34, 34, 34, 32, NA, 34, 34, 34, 34, 92.5, 
NA)), row.names = c(NA, -36L), class = "data.frame")

在缺少数据或连续月份超出时间范围的情况下,我也可以接受将单个值报告为中位数(例如,由于缺少第 11 个月的数据,因此使用第 10 个月的 chla 值而不是 NA)。

r dplyr median
1个回答
0
投票
library(dplyr)

# Self-join: look up each row's following month by matching its ID2 against
# ID, bringing that month's chla in as `chla2`. Rows whose following month is
# not in the data (e.g. October -> "11_<year>") get chla2 = NA.
left_join(df, df[, c("ID", "chla")],
          by = c("ID2" = "ID"), suffix = c("", "2")) %>%
  # median() summarises a single vector, so evaluate it one row at a time on
  # the pair (chla, chla2).
  rowwise() %>%
  # An NA in either month yields NA, matching the desired output. To report
  # the single available value instead, use
  # median(c(chla, chla2), na.rm = TRUE).
  mutate(Consec_2month_median = median(c(chla, chla2))) %>%
  # Drop the row-wise grouping so later verbs work on whole columns again.
  ungroup() %>%
  select(-chla2)

#> # A tibble: 36 × 7
#>    Date                Month  Year ID      ID2      chla Consec_2month_median
#>    <dttm>              <dbl> <int> <chr>   <chr>   <dbl>                <dbl>
#>  1 2017-05-19 00:00:00     5  2017 5_2017  6_2017   34                   34  
#>  2 2017-06-27 00:00:00     6  2017 6_2017  7_2017   34                   34  
#>  3 2017-07-10 00:00:00     7  2017 7_2017  8_2017   34                   34  
#>  4 2017-08-21 00:00:00     8  2017 8_2017  9_2017   34                   63.2
#>  5 2017-09-08 00:00:00     9  2017 9_2017  10_2017  92.5                124. 
#>  6 2017-10-03 00:00:00    10  2017 10_2017 11_2017 156                   NA  
#>  7 2018-05-14 00:00:00     5  2018 5_2018  6_2018   34                   34  
#>  8 2018-06-13 00:00:00     6  2018 6_2018  7_2018   34                   27  
#>  9 2018-07-17 00:00:00     7  2018 7_2018  8_2018   20                   27  
#> 10 2018-08-06 00:00:00     8  2018 8_2018  9_2018   34                   34  
#> # ℹ 26 more rows

创建于 2024 年 10 月 16 日,使用 reprex v2.0.2

© www.soinside.com 2019 - 2024. All rights reserved.