我有一个非常大的数据集。以下是 .csv 中的代表性示例。
date, x, y
"03082018", 304, 1
"071999", 305, 1
"04032018", 309, 2
"041997", 400, 0
"05081999", 306, 1
"22052000", 201, 1
"16092002", 201, 1
"121997", 101, 1
"26051999", 203, 1
"16012003", 202, 1
"101998", 300, 2
"02012010", 201, 2
"02112018", 310, 2
"111997", 101, 1
"20032004", 101, 1
date
值被保留为数据框中的字符类对象。
较大样本(20,000 行)的
dput()
输出可在以下位置获取:https://pastebin.com/bAVpJMb5.
我想将
date
列转换为 Date
类对象。 为此,我必须添加缺失的日期或日期和月份。原始数据集中的所有值在 date
列中都有 8、6 或 4 位数字的日期。月份的数字在 01-12
范围内,年份的数字在 1996-2022
范围内。
我希望从完整值中对它们进行采样,而不是任意分配缺失值。例如,在
date
为 2001
的行中,将每个观察结果替换为来自所有完整日期的样本,格式为 ddmm2001
。其中 date
是 061997
,从 dd061997
形式的值中采样。
一开始,我只是随意将缺少的天数设置为
15
,将缺少的月份设置为06
。
我使用了以下代码:
df |>
mutate(
date_new = case_when(
str_length(date) == 6 ~ str_c("15", date),
str_length(date) == 4 ~ str_c("1506", date),
.default = date
)
) |>
mutate(date_unalt = date_new == date) |> # New column that signals whether the date is unaltered.
relocate(date_new, date, date_unalt, x, y)
但是,我不希望将缺失值不成比例地转换为任何日期。
我知道我可以删除所有不完整的值。然而,它们确实携带了“一些”信息。因此,我宁愿在数据中引入一些“知情”噪音,同时保留任何信息。 未解决我的具体问题的相关主题:
library(dplyr)
library(stringr)
library(purrr)
set.seed(1)
df <- mutate(df, rn = row_number())
df |>
filter(str_length(date) != 8) |>
mutate(new_date = map_chr(date, \(x) sample(df$date[endsWith(df$date, x) & str_length(df$date) == 8], 1))) |>
right_join(df) |>
mutate(new_date = coalesce(new_date, date)) |>
select(-rn)
fix_date <- function(d) {
len <- nchar(d)
Date <- as.Date(d, format="%d%m%Y")
yrs <- format(Date, "%Y")
mths <- format(Date, "%m")
if(any(len<8)) { # missing days
yrs[is.na(yrs)] =substr(d[is.na(yrs)], len[is.na(yrs)] - 3, len[is.na(yrs)])
mths[len==6] =substr(d[len==6], 1, 2)
days=format(Date, "%d")
days[len<8] <- sample(days[len==8], size=sum(len<8), replace=TRUE)
Date[len<8] <- as.Date(paste(yrs, mths, days, sep="-"))[len<8]
if(any(len<6)) { # missing months
mths[len<6] <- sample(mths[len>=6], size=sum(len<6), replace=TRUE)
Date[len<6] <- as.Date(paste(yrs, mths, days, sep="-"))[len<6]
}
# There may be invalid dates created due to sampling, ex. 2000-02-31 may occur
# Loop until there are no non-valid dates
while(any(is.na(Date))) {
days[is.na(Date)] <- sample(days[len==8], size=sum(is.na(Date)), replace=TRUE)
mths[is.na(Date)] <- sample(mths[len>=6], size=sum(is.na(Date)), replace=TRUE)
Date[is.na(Date)] <- as.Date(paste(yrs, mths, days, sep="-"))[is.na(Date)]
}
}
Date
}
在玩具数据上进行测试(注意:我将第14行更改为1997):
mutate(df, Date=fix_date(date))
# A tibble: 15 × 4
date x y Date
<chr> <dbl> <dbl> <date>
1 03082018 304 1 2018-08-03
2 071999 305 1 1999-07-16
3 04032018 309 2 2018-03-04
4 041997 400 0 1997-04-02
5 05081999 306 1 1999-08-05
6 22052000 201 1 2000-05-22
7 16092002 201 1 2002-09-16
8 121997 101 1 1997-12-20
9 26051999 203 1 1999-05-26
10 16012003 202 1 2003-01-16
11 101998 300 2 1998-10-16
12 02012010 201 2 2010-01-02
13 02112018 310 2 2018-11-02
14 1997 101 1 1997-09-16
15 20032004 101 1 2004-03-20
df <- read.csv("bAVpJMb5.txt")
df2 <- mutate(df, Date=fix_date(date))
summary(df2$Date)
Min. 1st Qu. Median Mean 3rd Qu. Max.
"1996-01-01" "2000-04-21" "2006-02-11" "2007-04-18" "2013-09-28" "2022-12-31"
hist(df2$Date, breaks=as.numeric(diff(range(df2$Date))))
summary(df2$Date) # note the function was modified
Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
"1996-01-01" "2000-04-21" "2006-02-12" "2007-04-18" "2013-09-28" "2022-12-31" "4"
split
是日期,首先按年份,应该关心闰年,其次按是否需要关心。第三,在
lapply
循环中,我们再次按 nchar
进行分割,因为我们可能会混合缺少日期以及缺少日期和月份。 if
这个nchar
至少包含一个完整的日期,我们从那里采样,else
如果不是的话,我们可能想从全年中采样一个随机日期,我们使用seq.Date
安全地得出。最后我们unsplit
几次才取回原来的订单。在这之间,我们删除年份不合适的失败,这是因为我们有一些缺少年份的日期和月份。> cpl_date <- \(x, start=1300) {
+ # x <- replace(x, nchar(x) < 6L, NA) ## maybe delete ambiguous strings?
+ nx <- nchar(x)
+ nxs <- substr(x, nx - 3L, nx)
+ Z <- split(x, nxs) ## split by year
+ fails <- names(Z) < start ## store fails with unappropriate year
+ sa <- sapply(lapply(lapply(Z, nchar), `<`, 8), any)
+ sZ <- split(Z, sa) ## split by if need care or not
+ sZ$`TRUE` <- lapply(sZ$`TRUE`, \(z) {
+ nz <- nchar(z)
+ sz <- split(z, nz) ## split by nchar
+ szn <- lapply(sz, nchar)
+ sz8 <- sapply(lapply(lapply(sz, nchar), `==`, 8), any)
+ if (any(sz8)) { ## if any complete dates (nz == 8) sample from available
+ sz[!sz8] <- lapply(sz[!sz8], \(zz) {
+ sprintf('%s%s', sample(substr(sz$`8`, 1L, 8L - nchar(zz)), length(zz)),
+ zz)
+ })
+ } else { ## sample from year dates
+ sz[!sz8] <- lapply(sz[!sz8], \(zz) {
+ drg <- seq.Date(
+ as.Date(sprintf('%s-01-01', substr(el(z), nz[1] - 3L, nz[1]))),
+ length.out=2,
+ by='1 year'
+ )
+ dsq <- do.call('seq.Date', c(as.list(drg), by='day')) |>
+ strftime('%d%m')
+ sprintf('%s%s', sample(substr(dsq, 1L, 8L - nchar(zz)), length(zz)), zz)
+ })
+ }
+ unsplit(sz, nz)
+ })
+ Z <- unsplit(sZ, sa)
+ Z[fails] <- lapply(Z[fails], \(x) ## delete fails
+ replicate(length(x), NA_character_))
+ Z |> unsplit(nxs) |> as.Date('%d%m%Y')
+ }
使用方法
> set.seed(42)
> transform(d, dte_new=cpl_date(date))
date x y dte_new
1 03082018 304 1 2018-08-03
2 071999 305 1 1999-07-26
3 04032018 309 2 2018-03-04
4 041997 400 0 1997-04-16
5 05081999 306 1 1999-08-05
6 22052000 201 1 2000-05-22
7 16092002 201 1 2002-09-16
8 121997 101 1 1997-12-26
9 26051999 203 1 1999-05-26
10 16012003 202 1 2003-01-16
11 101998 300 2 1998-10-18
12 02012010 201 2 2010-01-02
13 02112018 310 2 2018-11-02
14 111997 101 1 1997-11-02
15 20032004 101 1 2004-03-20
16 032024 101 1 2024-03-24
17 082024 101 1 2024-08-22
18 2024 101 1 2024-10-29
19 1912 101 1 1912-03-14
20 311220 101 1 <NA>
21 010119 101 1 <NA>
22 020119 101 1 <NA>
测试
> set.seed(42)
> R <- 5
> d <- d |> sort_by(~substr(date, nchar(d$date) - 3L, nchar(d$date)))
> data.frame(original=d$date, replicate(R, cpl_date(d$date), simplify=FALSE) |>
+ setNames(paste('sample', seq_len(R), sep='_')))
original sample_1 sample_2 sample_3 sample_4 sample_5
1 010119 <NA> <NA> <NA> <NA> <NA>
2 020119 <NA> <NA> <NA> <NA> <NA>
3 311220 <NA> <NA> <NA> <NA> <NA>
4 1912 1912-03-14 1912-04-19 1912-11-09 1912-12-13 1912-09-14
5 041997 1997-04-16 1997-04-20 1997-04-25 1997-04-16 1997-04-24
6 121997 1997-12-26 1997-12-24 1997-12-24 1997-12-04 1997-12-05
7 111997 1997-11-02 1997-11-30 1997-11-07 1997-11-14 1997-11-18
8 101998 1998-10-18 1998-10-10 1998-10-26 1998-10-21 1998-10-09
9 071999 1999-07-26 1999-07-26 1999-07-05 1999-07-26 1999-07-05
10 05081999 1999-08-05 1999-08-05 1999-08-05 1999-08-05 1999-08-05
11 26051999 1999-05-26 1999-05-26 1999-05-26 1999-05-26 1999-05-26
12 22052000 2000-05-22 2000-05-22 2000-05-22 2000-05-22 2000-05-22
13 16092002 2002-09-16 2002-09-16 2002-09-16 2002-09-16 2002-09-16
14 16012003 2003-01-16 2003-01-16 2003-01-16 2003-01-16 2003-01-16
15 20032004 2004-03-20 2004-03-20 2004-03-20 2004-03-20 2004-03-20
16 02012010 2010-01-02 2010-01-02 2010-01-02 2010-01-02 2010-01-02
17 03082018 2018-08-03 2018-08-03 2018-08-03 2018-08-03 2018-08-03
18 04032018 2018-03-04 2018-03-04 2018-03-04 2018-03-04 2018-03-04
19 02112018 2018-11-02 2018-11-02 2018-11-02 2018-11-02 2018-11-02
20 032024 2024-03-24 2024-03-05 2024-03-15 2024-03-01 2024-03-12
21 082024 2024-08-22 2024-08-30 2024-08-18 2024-08-23 2024-08-15
22 2024 2024-10-29 2024-04-18 2024-11-09 2024-08-02 2024-02-02
date, x, y
"03082018", 304, 1
"071999", 305, 1
"04032018", 309, 2
"041997", 400, 0
"05081999", 306, 1
"22052000", 201, 1
"16092002", 201, 1
"121997", 101, 1
"26051999", 203, 1
"16012003", 202, 1
"101998", 300, 2
"02012010", 201, 2
"02112018", 310, 2
"111997", 101, 1
"20032004", 101, 1
"032024", 101, 1
"082024", 101, 1
"2024", 101, 1
"1912", 101, 1
"311220", 101, 1
"010119", 101, 1
"020119", 101, 1