我们的气象站每15分钟记录一次温度数据。我想计算每小时
min
imum,每日max
imum和每日mean
温度。我怎样才能用 R 编程语言做到这一点,最好是用 dplyr
语法?
我的数据是这样的
date temperature
1 2014-05-26 11:45:00 25.0
2 2014-05-26 12:00:00 25.2
3 2014-05-26 12:15:00 25.3
4 2014-05-26 12:30:00 25.1
5 2014-05-26 12:45:00 25.4
[...]
96 2014-05-27 11:30:00 26.3
97 2014-05-27 11:45:00 25.7
98 2014-05-27 12:00:00 24.9
99 2014-05-27 12:15:00 24.9
100 2014-05-27 12:30:00 26.0
这是可重现的例子。
dput
函数更改日期格式,mdy_hm 将无法正确解析它。我认为这就是某些答案对我不起作用的原因。
df <- structure(list(date = structure(c(1401104700, 1401105600, 1401106500,
1401107400, 1401108300, 1401109200, 1401110100, 1401111000, 1401111900,
1401112800, 1401113700, 1401114600, 1401115500, 1401116400, 1401117300,
1401118200, 1401119100, 1401120000, 1401120900, 1401121800, 1401122700,
1401123600, 1401124500, 1401125400, 1401126300, 1401127200, 1401128100,
1401129000, 1401129900, 1401130800, 1401131700, 1401132600, 1401133500,
1401134400, 1401135300, 1401136200, 1401137100, 1401138000, 1401138900,
1401139800, 1401140700, 1401141600, 1401142500, 1401143400, 1401144300,
1401145200, 1401146100, 1401147000, 1401147900, 1401148800, 1401149700,
1401150600, 1401151500, 1401152400, 1401153300, 1401154200, 1401155100,
1401156000, 1401156900, 1401157800, 1401158700, 1401159600, 1401160500,
1401161400, 1401162300, 1401163200, 1401164100, 1401165000, 1401165900,
1401166800, 1401167700, 1401168600, 1401169500, 1401170400, 1401171300,
1401172200, 1401173100, 1401174000, 1401174900, 1401175800, 1401176700,
1401177600, 1401178500, 1401179400, 1401180300, 1401181200, 1401182100,
1401183000, 1401183900, 1401184800, 1401185700, 1401186600, 1401187500,
1401188400, 1401189300, 1401190200, 1401191100, 1401192000, 1401192900,
1401193800), tzone = "UTC", class = c("POSIXct", "POSIXt")),
temperature = c(25, 25.2, 25.3, 25.1, 25.4, 26, 25.9, 25.6,
26.8, 27.8, 26.8, 26, 26, 26.3, 27, 27, 26.2, 25.8, 24.9,
25.1, 26.3, 25.6, 25.3, 25.2, 25.1, 24.8, 24.7, 24, 23, 22.7,
22.5, 22.5, 22.2, 21.9, 21.5, 21.1, 20.8, 20.5, 20.3, 20.3,
20.2, 20, 19.8, 19.6, 19.2, 19.1, 19.1, 18.9, 18.8, 18.6,
18.3, 18.2, 18.2, 18.2, 18.1, 17.9, 17.8, 17.7, 17.8, 18,
18.1, 18, 18.1, 18.6, 18.7, 18.5, 18.3, 18.1, 18.1, 18.6,
18.8, 18.6, 18.6, 18.3, 18.2, 18, 17.8, 18, 18.2, 18.9, 19.8,
19.6, 19.5, 19.7, 20.2, 21.5, 22.4, 23, 24, 23.3, 23.2, 23.7,
24.5, 24.8, 24.9, 26.3, 25.7, 24.9, 24.9, 26)), row.names = c(NA,
-100L), class = c("tbl_df", "tbl", "data.frame"))
df$date = as.POSIXct(df$date)
我用下面的代码将15分钟记录的数据转换为daily,但我想改为hourly.
library(dplyr)
library(lubridate)
df %>%
group_by(date = as.Date(date, "%m/%d/%Y")) %>% # how to group by hour?
summarise(min_temp = min(temperature, na.rm = TRUE),
max_temp = max(temperature, na.rm = TRUE),
mean_temp = mean(temperature, na.rm = TRUE)) %>%
ungroup()
使用
lubridate::hour
添加额外的分组变量,例如
> '4/20/2017 7:30 PM' |>
+ strptime(format = '%m/%d/%Y %I:%M %p', tz = 'UTC') |>
+ lubridate::hour()
[1] 19
(确保时区正确,以防万一)
这里有一个解决方案:
lubridate
.library(dplyr)
library(lubridate)
library(stringr)
df <- tibble(
date = c('4/20/2017 1:40 PM', '4/20/2017 1:45 PM',
'4/20/2017 6:45 PM', '4/20/2017 7:45 PM'),
temperature = c(14.5, 14.8, 19, 21.0)
)
# create date time, hour
df <- df %>%
mutate(
datetime = parse_date_time(df$date, '%m/%d/%Y %H:%M %p'),
hour = lubridate::hour(datetime))
# group by hour
df %>%
group_by(hour) %>%
summarise(min_temp = min(temperature, na.rm = TRUE),
max_temp = max(temperature, na.rm = TRUE),
mean_temp = mean(temperature, na.rm = TRUE))
退货:
# A tibble: 3 × 4
hour min_temp max_temp mean_temp
<int> <dbl> <dbl> <dbl>
1 13 14.5 14.8 14.6
2 18 19 19 19
3 19 21 21 21
我们这里先用
mdy_hm()
函数转成datetime格式。然后
2. 在计算下一行的差异后,我们应用cumsum()
3.我们用ceiling(diff/60)
创建小时然后我们使用现有代码:
library(lubridate)
library(dplyr)
df %>%
mutate(date = mdy_hm(date),
diff = cumsum(as.numeric(difftime(date,lag(date, default = date[1]), units = "mins"))),
hour = ceiling(diff / 60)) %>%
group_by(hour) %>%
summarise(min_temp = min(temperature, na.rm = TRUE),
max_temp = max(temperature, na.rm = TRUE),
mean_temp = mean(temperature, na.rm = TRUE)) %>%
ungroup()
hour min_temp max_temp mean_temp
<dbl> <dbl> <dbl> <dbl>
1 0 14.5 14.5 14.5
2 1 14.8 19.1 16.8
3 2 20.2 21.8 21.3
4 3 22.4 25.6 24.1
5 4 24.7 26.8 25.8
6 5 24.1 25.3 24.8
7 6 23 26.3 24.5
8 7 24.6 26.1 25.4
9 8 24.7 27.7 26.2
数据
df <- structure(list(date = c("4/20/2017 1:30 PM", "4/20/2017 1:45 PM",
"4/20/2017 2:00 PM", "4/20/2017 2:15 PM", "4/20/2017 2:30 PM",
"4/20/2017 2:45 PM", "4/20/2017 3:00 PM", "4/20/2017 3:15 PM",
"4/20/2017 3:30 PM", "4/20/2017 3:45 PM", "4/20/2017 4:00 PM",
"4/20/2017 4:15 PM", "4/20/2017 4:30 PM", "4/20/2017 4:45 PM",
"4/20/2017 5:00 PM", "4/20/2017 5:15 PM", "4/20/2017 5:30 PM",
"4/20/2017 5:45 PM", "4/20/2017 6:00 PM", "4/20/2017 6:15 PM",
"4/20/2017 6:30 PM", "4/20/2017 6:45 PM", "4/20/2017 7:00 PM",
"4/20/2017 7:15 PM", "4/20/2017 7:30 PM", "4/20/2017 7:45 PM",
"4/20/2017 8:00 PM", "4/20/2017 8:15 PM", "4/20/2017 8:30 PM",
"4/20/2017 8:45 PM", "4/20/2017 9:00 PM", "4/20/2017 9:15 PM",
"4/20/2017 9:30 PM"), temperature = c(14.5, 14.8, 15.8, 17.3,
19.1, 20.2, 21.4, 21.8, 21.7, 22.4, 23.2, 25.3, 25.6, 26.5, 26.8,
24.7, 25.2, 25, 25.3, 24.7, 24.1, 23, 23.1, 25.6, 26.3, 26.1,
25.8, 25.2, 24.6, 24.7, 25.6, 26.8, 27.7)), class = "data.frame", row.names = c(NA,
-33L))
使用
round_date
进行每小时分组的方法。使用玩具数据。
(...以确保您的date是一流的
date
使用:df$date <- as.POSIXct(df$date, format="%m/%d/%Y %I:%M %p")
)
library(dplyr)
library(lubridate)
df %>%
group_by(grp = cumsum(format(round_date(date, "hour"), "%H:%M") ==
format(date, "%H:%M"))) %>%
summarize(date = unique(format(date, "%m/%d/%Y %I:00 %p")),
max_temperature = max(temperature),
min_temerature = min(temperature),
mean_temperature = mean(temperature)) %>%
select(-grp)
# A tibble: 3 × 4
date max_temperature min_temerature mean_temperature
<chr> <int> <int> <dbl>
1 04/20/2017 01:00 PM 26 14 20
2 04/20/2017 02:00 PM 27 10 17.2
3 04/20/2017 03:00 PM 35 16 26.5
set.seed(42)
df <- data.frame(date = seq(as.POSIXct("4/20/2017 01:30 PM",
format="%m/%d/%Y %I:%M %p"), as.POSIXct("4/20/2017 03:45 PM",
format="%m/%d/%Y %I:%M %p"), 900), temperature = sample(10:35, 10))
使用非 tidyvert 方法,我们可以使用
by
并将 "POSIXct"
转换为 "Date"
as INDICES=
将数据按日期拆分为类似列表的对象。然后我们可以在 aggregate
d strftime
ours %H
获得每小时 min
imums。最后只是data.frame
min
的h_mins
以及日常max
和min
,以及rbind
的东西。
by(df, as.Date(df$date), \(x) {
h_mins <- aggregate(temperature ~ strftime(x$date, '%H'), x, min)[, 'temperature']
data.frame(min_temp=min(h_mins), max_temp=max(x$temperature), mean_temp=mean(x$temperature))
}) |> do.call(what=rbind)
# min_temp max_temp mean_temp
# 2017-04-20 12.09854 19.94601 17.01504
# 2017-04-21 12.09854 19.94601 17.01504
# 2017-04-22 12.09854 19.94601 17.01504
注意: 我使用下面的数据来获得更多可用天数。这也适用于您的
dput
,但您需要 df$date <- as.POSIXct(df$date)
因为您有 "POSIXct"
格式。
资料:
df <- data.frame(
date=seq.POSIXt(as.POSIXct('2017-04-20', tz='GMT'), as.POSIXct('2017-04-22 23:45', tz='GMT'), by='15 mins'),
temperature=dnorm(seq.int(-1, 1, length.out=96))*.5e2
)