我想计算开始日期和结束日期之间的持续时间。这看起来很简单,问题在于日期的格式并不统一。我写了几个函数来清理字符串并处理不同的格式,但据我所知,约有 8% 的日期转换结果不正确。希望有人能帮我找出原因。
# Load the raw dates; empty cells and the literal string "NA" are read as
# missing values.
data <- read.csv(
  file = "justDates.csv",
  na.strings = c("", "NA")
)
## FUNCTIONS -----------------------------------------
# clean text
# don't judge me, I hate regex and this is hella ugly but gets the job done. Maybe.
clean_dates <- function(datetime_string) {
  # Strip timezone decorations and stray whitespace from datetime strings.
  #
  # Args:
  #   datetime_string: vector of raw datetime strings (anything gsub() accepts).
  #
  # Returns:
  #   Character vector of the same length with the substitutions below applied
  #   in order.
  #
  # Each rule is (pattern, replacement, perl flag). The first rule uses R's
  # default regex engine; all others use PCRE. Order matters: later rules
  # operate on the output of earlier ones.
  rules <- list(
    # Drop parenthesised notes together with any whitespace before them.
    list(pattern = "\\s*\\([^\\)]+\\)", replacement = "", perl = FALSE),
    # From the point right after a digit where "UTC" (optionally preceded by
    # one whitespace character) begins, drop the rest of that line.
    list(pattern = "(?<=\\d)(?=UTC|\\sUTC).*", replacement = "", perl = TRUE),
    # Turn newlines and tabs into single spaces.
    list(pattern = "[\n\t]", replacement = " ", perl = TRUE),
    # Collapse whitespace that follows a remaining "UTC".
    list(pattern = "UTC\\s+", replacement = "UTC", perl = TRUE),
    # Remove any remaining UTC / GMT / UT label and one preceding space.
    list(pattern = " ?(UTC|GMT|UT)", replacement = "", perl = TRUE),
    # Trim trailing horizontal and vertical whitespace.
    list(pattern = "[\\h\\v]+$", replacement = "", perl = TRUE)
  )

  cleaned <- datetime_string
  for (rule in rules) {
    cleaned <- gsub(rule$pattern, rule$replacement, cleaned, perl = rule$perl)
  }
  cleaned
}
#convert various date and time formats into one standard format
parse_date <- function(date_string) {
  # Convert datetime strings to seconds since the Unix epoch.
  #
  # Args:
  #   date_string: a datetime string, or a vector of them.
  #
  # Returns:
  #   Numeric vector with one element per input: the epoch seconds produced by
  #   the first parser below that yields a non-missing value, or NA_real_ when
  #   none does. The result is always of type double, including for
  #   unparseable and zero-length input.

  # Parsers are tried in this order and the first non-missing result wins.
  # NOTE(review): if a string is accepted by more than one parser, this order
  # decides the interpretation - confirm it matches the formats in the data.
  parse_attempts <- list(
    function(x) mdy_hms(x),
    function(x) dmy_hms(x),
    function(x) mdy_hm(x),
    function(x) dmy_hm(x),
    function(x) ymd_hms(x),
    function(x) ymd_hm(x)
  )

  # Parse a single string; an error raised by a parser counts as a failed
  # attempt rather than aborting the whole conversion.
  parse_one <- function(x) {
    for (parse_fn in parse_attempts) {
      date <- tryCatch(parse_fn(x), error = function(e) NA)
      # The length guard keeps `if` from failing on a non-scalar result.
      if (length(date) == 1L && !is.na(date)) {
        return(as.numeric(as.POSIXct(date)))
      }
    }
    NA_real_
  }

  vapply(date_string, parse_one, numeric(1), USE.NAMES = FALSE)
}
# Clean the Dates
# clean_dates() only applies gsub(), which is vectorised, so whole columns are
# passed directly instead of going through sapply().
df_dates <- data |>
  mutate(
    start_date_clean = clean_dates(start_date),
    end_date_clean = clean_dates(end_date)
  )

# Convert each cleaned string to numeric epoch seconds.
# vapply() pins the result to double; as.numeric() covers a parse_date() that
# returns a logical NA for unparseable input.
df_dates_posix <- df_dates |>
  mutate(
    start_datePOSIX = vapply(
      start_date_clean,
      \(x) as.numeric(parse_date(x)),
      numeric(1),
      USE.NAMES = FALSE
    ),
    end_datePOSIX = vapply(
      end_date_clean,
      \(x) as.numeric(parse_date(x)),
      numeric(1),
      USE.NAMES = FALSE
    )
  )

# Check dates are correct: flag rows whose end is not after the start and
# turn the epoch seconds back into datetimes for visual comparison with the
# *_date_clean columns.
df_dates_comp <- df_dates_posix |>
  mutate(
    neg = end_datePOSIX <= start_datePOSIX,
    start_date_rev = as_datetime(start_datePOSIX),
    end_date_rev = as_datetime(end_datePOSIX)
  )

# Count the flagged rows. This pipes the table built just above; the original
# referenced `test3`, which is not defined in this script.
df_dates_comp |>
  filter(neg) |>
  nrow()
结果有 119 行的 end_date 早于 start_date。将 *_date_rev 列与对应的 *_date_clean 列对比,可以看到有些日期转换得并不正确,但我目前不明白原因。我做错了什么?非常感谢任何帮助或见解。
谢谢!
问题中的外部托管数据集太大,无法包含。我已在本答案末尾添加了前 30 行作为示例。
您可以向 `lubridate::parse_date_time()` 提供一个格式向量,让它依次尝试解析。将下面这组格式应用于完整数据集时,只有 2 个我无法判断其含义的日期条目没有被解析。
library(tidyverse)
# Specify the datetime formats in the dataset
# Candidate layouts passed as the `orders` argument of parse_date_time().
# Each layout is listed once; the original vector repeated "B d, Y, H:M:S".
orders <- c(
  "d B Y, H:M:S",
  "B d, Y, H:M:S",
  "B d, Y, H:M",
  "d B Y H:M",
  "d b Y H:M",
  "d B Y, H:M"
)
# For every column: cut the string off right after "UTC", then parse it with
# the candidate `orders`. Finally keep the rows whose start is later than
# their end.
df |>
  mutate(
    across(
      everything(),
      \(x) parse_date_time(str_replace(x, "(UTC).*", "\\1"), orders)
    )
  ) |>
  filter(start_date > end_date)
#> # A tibble: 0 × 2
#> # ℹ 2 variables: start_date <dttm>, end_date <dttm>
虽然在这个小样本中没有任何 `start_date` 晚于 `end_date`,但同样的过滤器在完整数据集上会返回 5 行。查看原始数据,您会发现类似下面这样的行,其中开始时间在原始数据里本身就晚于结束时间:
25 May 2023 15:15:00 UTC 5 May 2023 16:37:00 UTC
# Sample data: 30 rows with two character columns, start_date and end_date,
# holding raw datetime strings in several layouts (some with an embedded
# newline). The object carries tibble classes and a "spec" attribute.
# NOTE(review): this looks like dput() output of a readr import - confirm.
df <- structure(list(start_date = c("12 September 2017,\n21:17:02 UTC",
"15 May 2012, 03:01:23 UTC", "March 15, 2009, 23:43 UTC", "July 29, 1985, 21:00:00 UTC",
"August 8, 1989, 12:37:00 UTC", "August 2, 1991, 15:02:00 UTC",
"2 December 1990, 08:13:32 UTC", "January 8, 1994, 10:05:34 UTC",
"February 20, 1999, 04:18:01 UTC", "October 21, 2001, 08:59:35 UTC",
"2 September 2015\n04:37:42 UTC", "October 6, 1990, 11:47:15 UTC",
"May 7, 1992, 23:40:00 UTC", "December 2, 1993, 09:27:00 UTC",
"16 September 1996, 08:54:49 UTC", "2 December 1990, 08:13:32 UTC",
"15 September 1976,\n09:48:30 UTC", "June 5, 1980, 14:19:30 UTC",
"25 September 2019\n13:57:42 UTC", "June 17, 1985, 11:33:00 UTC",
"July 16, 1969, 13:32:00 UTC", "November 11, 1966, 20:46:33 UTC",
"27 June 1983, 09:12:00 UTC", "22 July 1987, 01:59:17 UTC",
"7 June 1988, 14:03:13 UTC", "July 31, 1992, 13:56:48 UTC",
"4 March 1994, 13:53:01 UTC", "22 February 1996, 20:18:00 UTC",
"November 11, 1982, 11:19:00 UTC", "November 8, 1984, 12:15:00 UTC"
), end_date = c("28 February 2018 02:31 UTC", "17 September 2012, 02:53 UTC",
"March 28, 2009, 19:13 UTC", "August 6, 1985, 19:45:26 UTC",
"August 13, 1989, 13:37:08 UTC", "August 11, 1991, 12:23:25 UTC",
"26 May 1991, 10:04:13 UTC", "July 9, 1994, 10:32:35 UTC", "August 28, 1999, 00:34:20 UTC",
"October 31, 2001, 05:00:00 UTC", "12 September 2015\n00:51 UTC",
"October 10, 1990, 13:57:19 UTC", "May 16, 1992, 20:57:39 UTC",
"December 13, 1993, 05:25:33 UTC", "26 September 1996, 12:13:13 UTC",
"10 December 1990, 06:08:12 UTC", "23 September 1976,\n07:40:47 UTC",
"June 9, 1980, 12:39:00 UTC", "3 October 2019, 10:59 UTC", "June 24, 1985, 13:11:52 UTC",
"July 24, 1969, 16:50:35 UTC", "November 15, 1966, 19:21:04 UTC",
"23 November 1983, 19:58:00 UTC", "29 December 1987, 09:16:15 UTC",
"17 June 1988, 10:12:32 UTC", "August 8, 1992, 13:11:50 UTC",
"18 March 1994, 13:10:42 UTC", "9 March 1996, 13:58:22 UTC",
"November 16, 1982, 14:33:26 UTC", "November 16, 1984, 11:59:56 UTC"
)), row.names = c(NA, -30L), spec = structure(list(cols = list(
start_date = structure(list(), class = c("collector_character",
"collector")), end_date = structure(list(), class = c("collector_character",
"collector"))), default = structure(list(), class = c("collector_guess",
"collector")), delim = ","), class = "col_spec"), class = c("spec_tbl_df",
"tbl_df", "tbl", "data.frame"))