如何将每隔15分钟记录的天气数据转换为每小时数据?

问题描述 投票:0回答:5

我们的气象站每 15 分钟记录一次温度数据。我想计算每小时的最低温度(`min`imum)、每日的最高温度(`max`imum)和每日的平均温度(`mean`)。我怎样才能用 R 编程语言做到这一点,最好是用 `dplyr` 语法?

我的数据是这样的

                   date temperature
1   2014-05-26 11:45:00        25.0
2   2014-05-26 12:00:00        25.2
3   2014-05-26 12:15:00        25.3
4   2014-05-26 12:30:00        25.1
5   2014-05-26 12:45:00        25.4
[...]
96  2014-05-27 11:30:00        26.3
97  2014-05-27 11:45:00        25.7
98  2014-05-27 12:00:00        24.9
99  2014-05-27 12:15:00        24.9
100 2014-05-27 12:30:00        26.0

这是可重现的例子。

`dput` 函数输出的日期格式与原始数据不同(日期以 POSIXct 数值的形式保存),因此 `mdy_hm` 无法正确解析它。我认为这就是某些答案对我不起作用的原因。

# Reproducible sample: 100 temperature readings whose timestamps are 900
# seconds (15 minutes) apart, stored as POSIXct with time zone UTC in a tibble.
df <- structure(list(date = structure(c(1401104700, 1401105600, 1401106500, 
1401107400, 1401108300, 1401109200, 1401110100, 1401111000, 1401111900, 
1401112800, 1401113700, 1401114600, 1401115500, 1401116400, 1401117300, 
1401118200, 1401119100, 1401120000, 1401120900, 1401121800, 1401122700, 
1401123600, 1401124500, 1401125400, 1401126300, 1401127200, 1401128100, 
1401129000, 1401129900, 1401130800, 1401131700, 1401132600, 1401133500, 
1401134400, 1401135300, 1401136200, 1401137100, 1401138000, 1401138900, 
1401139800, 1401140700, 1401141600, 1401142500, 1401143400, 1401144300, 
1401145200, 1401146100, 1401147000, 1401147900, 1401148800, 1401149700, 
1401150600, 1401151500, 1401152400, 1401153300, 1401154200, 1401155100, 
1401156000, 1401156900, 1401157800, 1401158700, 1401159600, 1401160500, 
1401161400, 1401162300, 1401163200, 1401164100, 1401165000, 1401165900, 
1401166800, 1401167700, 1401168600, 1401169500, 1401170400, 1401171300, 
1401172200, 1401173100, 1401174000, 1401174900, 1401175800, 1401176700, 
1401177600, 1401178500, 1401179400, 1401180300, 1401181200, 1401182100, 
1401183000, 1401183900, 1401184800, 1401185700, 1401186600, 1401187500, 
1401188400, 1401189300, 1401190200, 1401191100, 1401192000, 1401192900, 
1401193800), tzone = "UTC", class = c("POSIXct", "POSIXt")), 
    temperature = c(25, 25.2, 25.3, 25.1, 25.4, 26, 25.9, 25.6, 
    26.8, 27.8, 26.8, 26, 26, 26.3, 27, 27, 26.2, 25.8, 24.9, 
    25.1, 26.3, 25.6, 25.3, 25.2, 25.1, 24.8, 24.7, 24, 23, 22.7, 
    22.5, 22.5, 22.2, 21.9, 21.5, 21.1, 20.8, 20.5, 20.3, 20.3, 
    20.2, 20, 19.8, 19.6, 19.2, 19.1, 19.1, 18.9, 18.8, 18.6, 
    18.3, 18.2, 18.2, 18.2, 18.1, 17.9, 17.8, 17.7, 17.8, 18, 
    18.1, 18, 18.1, 18.6, 18.7, 18.5, 18.3, 18.1, 18.1, 18.6, 
    18.8, 18.6, 18.6, 18.3, 18.2, 18, 17.8, 18, 18.2, 18.9, 19.8, 
    19.6, 19.5, 19.7, 20.2, 21.5, 22.4, 23, 24, 23.3, 23.2, 23.7, 
    24.5, 24.8, 24.9, 26.3, 25.7, 24.9, 24.9, 26)), row.names = c(NA, 
-100L), class = c("tbl_df", "tbl", "data.frame"))

# `date` already has class POSIXct from the structure() call above, so this
# conversion is expected to leave the column unchanged.
df$date = as.POSIXct(df$date)

我用下面的代码将15分钟记录的数据转换为daily,但我想改为hourly.

library(dplyr)
library(lubridate)

# Daily summary of the 15-minute readings: one row per calendar day holding
# the minimum, maximum and mean temperature.
# `date` is POSIXct, so as.Date() needs no format string here; for POSIXct
# input the argument after `x` is the time zone, which is given explicitly
# to match the time zone of the data.
df %>%
  group_by(date = as.Date(date, tz = "UTC")) %>%       # how to group by hour?
  summarise(
    min_temp = min(temperature, na.rm = TRUE),
    max_temp = max(temperature, na.rm = TRUE),
    mean_temp = mean(temperature, na.rm = TRUE)
  ) %>%
  ungroup()
r dataframe dplyr aggregate posixct
5个回答
0
投票

使用

lubridate::hour
添加额外的分组变量,例如

# Console session: parse a 12-hour clock string (%I together with %p) as UTC,
# then extract the hour of day with lubridate::hour().
> '4/20/2017 7:30 PM' |> 
+     strptime(format = '%m/%d/%Y %I:%M %p', tz = 'UTC') |> 
+     lubridate::hour()
[1] 19

(确保时区正确,以防万一)


0
投票

这里有一个解决方案:

  1. 使用
    lubridate
    .
  2. 将日期转换为日期时间对象
  3. 从日期时间对象中提取小时。
  4. 按小时分组并总结。
library(dplyr)
library(lubridate)
library(stringr)

# Toy data: timestamps are 12-hour clock strings with an AM/PM marker.
df <- tibble(
  date = c('4/20/2017 1:40 PM', '4/20/2017 1:45 PM',
           '4/20/2017 6:45 PM', '4/20/2017 7:45 PM'),
  temperature = c(14.5, 14.8, 19, 21.0)
)

# create date time, hour
# %I (12-hour clock) is the conversion that pairs with %p. The column is
# referenced directly rather than through df$ so that mutate() works on the
# data it was handed.
df <- df %>%
  mutate(
    datetime = parse_date_time(date, '%m/%d/%Y %I:%M %p'),
    hour = lubridate::hour(datetime))

# group by hour
# NOTE: hour() keeps only the clock hour, so readings taken in the same hour
# on different days fall into one group. When the data spans more than one
# day, add the day as a second grouping key (e.g. as.Date(datetime)).
df %>%
  group_by(hour) %>%
  summarise(min_temp = min(temperature, na.rm = TRUE),
            max_temp = max(temperature, na.rm = TRUE),
            mean_temp = mean(temperature, na.rm = TRUE))

返回结果:

# A tibble: 3 × 4
   hour min_temp max_temp mean_temp
  <int>    <dbl>    <dbl>     <dbl>
1    13     14.5     14.8      14.6
2    18     19       19        19  
3    19     21       21        21  

0
投票

1. 我们先用 `mdy_hm()` 函数把日期转成 datetime 格式。
2. 然后计算每一行与上一行的时间差,并对其应用 `cumsum()`。
3. 接着用 `ceiling(diff/60)` 创建小时分组,最后沿用现有的汇总代码:

library(lubridate)
library(dplyr)

# Bin the readings into 60-minute windows counted from the first timestamp,
# then report min / max / mean temperature per window.
df %>%
  mutate(
    date = mdy_hm(date),
    # minutes elapsed since the previous row (0 for the first row)
    step = as.numeric(
      difftime(date, lag(date, default = date[1]), units = "mins")
    ),
    # running total = minutes elapsed since the first row
    diff = cumsum(step),
    hour = ceiling(diff / 60)
  ) %>%
  group_by(hour) %>%
  summarise(
    min_temp = min(temperature, na.rm = TRUE),
    max_temp = max(temperature, na.rm = TRUE),
    mean_temp = mean(temperature, na.rm = TRUE)
  ) %>%
  ungroup()


  hour min_temp max_temp mean_temp
  <dbl>    <dbl>    <dbl>     <dbl>
1     0     14.5     14.5      14.5
2     1     14.8     19.1      16.8
3     2     20.2     21.8      21.3
4     3     22.4     25.6      24.1
5     4     24.7     26.8      25.8
6     5     24.1     25.3      24.8
7     6     23       26.3      24.5
8     7     24.6     26.1      25.4
9     8     24.7     27.7      26.2

数据

# Sample data: 33 readings at 15-minute spacing; `date` is stored as
# character strings in month/day/year 12-hour format with an AM/PM marker.
df <- structure(list(date = c("4/20/2017 1:30 PM", "4/20/2017 1:45 PM", 
"4/20/2017 2:00 PM", "4/20/2017 2:15 PM", "4/20/2017 2:30 PM", 
"4/20/2017 2:45 PM", "4/20/2017 3:00 PM", "4/20/2017 3:15 PM", 
"4/20/2017 3:30 PM", "4/20/2017 3:45 PM", "4/20/2017 4:00 PM", 
"4/20/2017 4:15 PM", "4/20/2017 4:30 PM", "4/20/2017 4:45 PM", 
"4/20/2017 5:00 PM", "4/20/2017 5:15 PM", "4/20/2017 5:30 PM", 
"4/20/2017 5:45 PM", "4/20/2017 6:00 PM", "4/20/2017 6:15 PM", 
"4/20/2017 6:30 PM", "4/20/2017 6:45 PM", "4/20/2017 7:00 PM", 
"4/20/2017 7:15 PM", "4/20/2017 7:30 PM", "4/20/2017 7:45 PM", 
"4/20/2017 8:00 PM", "4/20/2017 8:15 PM", "4/20/2017 8:30 PM", 
"4/20/2017 8:45 PM", "4/20/2017 9:00 PM", "4/20/2017 9:15 PM", 
"4/20/2017 9:30 PM"), temperature = c(14.5, 14.8, 15.8, 17.3, 
19.1, 20.2, 21.4, 21.8, 21.7, 22.4, 23.2, 25.3, 25.6, 26.5, 26.8, 
24.7, 25.2, 25, 25.3, 24.7, 24.1, 23, 23.1, 25.6, 26.3, 26.1, 
25.8, 25.2, 24.6, 24.7, 25.6, 26.8, 27.7)), class = "data.frame", row.names = c(NA, 
-33L))

0
投票

一种使用 `round_date` 进行每小时分组的方法,这里用玩具数据演示。

要确保您的 `date` 列属于日期时间类(POSIXct),请使用:
# Make sure `date` is a date-time before grouping.
df$date <- as.POSIXct(df$date, format = "%m/%d/%Y %I:%M %p")

library(dplyr)
library(lubridate)

# A new group starts at every timestamp whose clock time (HH:MM) is unchanged
# by rounding to the nearest hour, i.e. at every reading taken on the hour.
# The running count of such readings serves as the group id.
df %>%
  mutate(
    on_the_hour = format(round_date(date, "hour"), "%H:%M") ==
      format(date, "%H:%M")
  ) %>%
  group_by(grp = cumsum(on_the_hour)) %>%
  summarize(
    # label each group with its hour, minutes forced to :00
    date = unique(format(date, "%m/%d/%Y %I:00 %p")),
    max_temperature = max(temperature),
    min_temerature = min(temperature),
    mean_temperature = mean(temperature)
  ) %>%
  select(-grp)
# A tibble: 3 × 4
  date                max_temperature min_temerature mean_temperature
  <chr>                         <int>          <int>            <dbl>
1 04/20/2017 01:00 PM              26             14             20  
2 04/20/2017 02:00 PM              27             10             17.2
3 04/20/2017 03:00 PM              35             16             26.5

数据

# Toy data: ten timestamps 900 seconds (15 minutes) apart, from 1:30 PM to
# 3:45 PM, with integer temperatures sampled without replacement from 10..35.
set.seed(42)
df <- data.frame(
  date = seq(
    from = as.POSIXct("4/20/2017 01:30 PM", format = "%m/%d/%Y %I:%M %p"),
    to = as.POSIXct("4/20/2017 03:45 PM", format = "%m/%d/%Y %I:%M %p"),
    by = 900
  ),
  temperature = sample(10:35, 10)
)

0
投票

使用非 tidyverse 的方法,我们可以使用 `by`,并把 `"POSIXct"` 转换为 `"Date"` 作为 `INDICES=`,将数据按日期拆分为类似列表的对象。然后我们可以按 `strftime` 提取出的小时(`%H`)对每天的数据做 `aggregate`,得到每小时的最小值(`min`)。最后只需用 `data.frame` 组合 `h_mins` 的 `min` 以及每日的 `max` 和 `mean`,再用 `rbind` 把各天的结果合并起来。

# For every calendar day: collect the hourly minimum temperatures, then report
# the smallest of them together with that day's maximum and mean temperature.
# The one-row data frames produced per day are stacked into a single table.
do.call(
  rbind,
  by(df, as.Date(df$date), function(day) {
    hourly <- aggregate(temperature ~ strftime(day$date, '%H'), day, min)
    h_mins <- hourly[, 'temperature']
    data.frame(
      min_temp = min(h_mins),
      max_temp = max(day$temperature),
      mean_temp = mean(day$temperature)
    )
  })
)
#            min_temp max_temp mean_temp
# 2017-04-20 12.09854 19.94601  17.01504
# 2017-04-21 12.09854 19.94601  17.01504
# 2017-04-22 12.09854 19.94601  17.01504

注意: 我使用下面的数据来获得更多可用天数。这也适用于您的

dput
,但您需要
df$date <- as.POSIXct(df$date)
因为您有
"POSIXct"
格式。


数据:

# Three days of 15-minute timestamps in GMT. The 96 temperature values form a
# scaled normal-density curve and are recycled by data.frame() for each day.
df <- data.frame(
  date = seq(
    from = as.POSIXct('2017-04-20', tz = 'GMT'),
    to = as.POSIXct('2017-04-22 23:45', tz = 'GMT'),
    by = '15 mins'
  ),
  temperature = 50 * dnorm(seq.int(-1, 1, length.out = 96))
)
© www.soinside.com 2019 - 2024. All rights reserved.