由于数据是一个包含一年以上半小时值的时间序列,我计划计算每一天的平均值,并用相应日期的平均值来填充 NA(我意识到这不是统计上最合理的方法,但我目前只需要能运行的代码)。
数据集很大(〜200个变量,约17,000个观测值)
# Reproducible example: ten half-hourly records spread over two days, with
# missing values (NA) scattered across the measurement columns.

# Calendar day of each record (five records per day).
dates <- rep(c("2019-04-27", "2019-04-28"), each = 5)

# Full timestamp of each record.
TIMESTAMP <- c(
  "2019-04-27 17:30:00", "2019-04-27 18:00:00", "2019-04-27 18:30:00",
  "2019-04-27 19:00:00", "2019-04-27 19:30:00",
  "2019-04-28 10:00:00", "2019-04-28 10:30:00", "2019-04-28 11:00:00",
  "2019-04-28 11:30:00", "2019-04-28 12:00:00"
)

# Measurement columns; NA marks a missing reading.
ch4_flux <- c(
  NA, 66.39, 65.39, 64.41, 63.52,
  62.76, 62.16, NA, 61.54, 61.53
)
distance <- c(
  1000, 1000, NA, 125.35, 1000,
  NA, 1000, 5.50, NA, 1000
)
Tau <- c(
  0.0322000, 0.0495000, 0.1737616, 0.1772567, NA,
  0.1246816, 0.1435230, 0.1098670, NA, NA
)

# Source file name of each record (also partly missing).
filename <- c(
  "2019-04-27T173000_AIU-2079.ghg",
  "2019-04-27T180000_AIU-2079.ghg",
  "2019-04-27T183000_AIU-2079.ghg",
  "2019-04-27T190000_AIU-2079.ghg",
  "2019-04-27T193000_AIU-2079.ghg",
  NA,
  "2019-04-28T100000_AIU-2079.ghg",
  "2019-04-28T103000_AIU-2079.ghg",
  "2019-04-28T110000_AIU-2079.ghg",
  NA
)

# Assemble the columns in the same order as the original example.
dd <- data.frame(TIMESTAMP, dates, ch4_flux, distance, Tau, filename)
使用 `ave()`:
# Fill each missing ch4_flux reading with the mean of the non-missing
# readings that share its date.
na_rows <- is.na(dd$ch4_flux)
daily_mean <- ave(
  dd$ch4_flux,
  dd$dates,
  FUN = function(v) mean(v, na.rm = TRUE)
)
# Index both sides with the same mask so every gap receives its own day's mean.
dd$ch4_flux[na_rows] <- daily_mean[na_rows]
> dd
TIMESTAMP dates ch4_flux
1 2019-04-27 17:30:00 2019-04-27 64.9275
2 2019-04-27 18:00:00 2019-04-27 66.3900
3 2019-04-27 18:30:00 2019-04-27 65.3900
4 2019-04-27 19:00:00 2019-04-27 64.4100
5 2019-04-27 19:30:00 2019-04-27 63.5200
6 2019-04-28 10:00:00 2019-04-28 62.7600
7 2019-04-28 10:30:00 2019-04-28 62.1600
8 2019-04-28 11:00:00 2019-04-28 61.9975
9 2019-04-28 11:30:00 2019-04-28 61.5400
10 2019-04-28 12:00:00 2019-04-28 61.5300
使用 `transform` 和 `replace`:
# Fill each missing ch4_flux value with the mean of the non-missing values
# observed on the same date. Prefix the pipeline with `dd <-` to keep the
# result; as written it only prints the transformed data frame.
dd |>
  transform(
    ch4_flux = replace(
      ch4_flux,
      is.na(ch4_flux),
      # replace() consumes `values` in order, so the per-day means must be
      # subset with the same NA mask; passing the full-length vector would
      # fill every gap from the first rows' means instead of its own day's.
      ave(ch4_flux, dates, FUN = \(v) mean(v, na.rm = TRUE))[is.na(ch4_flux)]
    )
  )