我有大约3000个时间序列的医疗数据(就诊产生的诊断),想用
tso
包的tsoutliers
函数做干预分析,然后用STL+arima(stlf
),arima进行预测(auto.arima
) 和 nnetar
模型(在 forecastHybrid
的帮助下,它是著名的 forecast
包的包装器,允许一次运行各种模型并对预测进行平均)......但这一切进程运行非常低
我会用一个时间序列展示我在做什么,因为我当然不能在这里重现整个 3000 个时间序列......让我们创建一个名为
tsdatadx
的时间序列
tsdatadx <- ts(c(1541,972,1069,826,820,745,649,567,747,1086,962,988,1064,1039,924,766,990,
1047,766,576,692,865,852,1046,919,803,1117,1140,1024,1081,1172,984,1351,
1157,1284,1289,1146,923,1180,982,1161,1088,1059,751,994,1144,1066,1241,
1085,899,976,1135,1159,1233,874,682,1013,1009,1070,1202,993,1097,1097,
1066,1290,1416,978,842,1062,1211,1181,1153,1274,1129,1245,1054,1187,
1097,889,654,948,1123,1091,1213,1810,1199,1282,1127,1339,1304,1012,835,
1201,1440,1438,1462,1661,1333,1308,1487,1454,1472,1051,906,1385,1151,
1623,2269,1912,1684,1835,347,229,467,547,378,367,501,610,550,692,478,
596,692,428,619,1022,1044,1065,1270,1703,1642,1355,542,1743,2063,2643,
2467,2028,1743,2055,2330,3539,3336,2541,2073,2594),
frequency = 12, start = c(2011,1))
使用
tso
函数,有时会出错,所以我创建了一个自定义函数,它将尝试一系列函数......首先是所有5种类型的异常值(AO,TC,LS,IO,SLS),如果只有错误尝试 4,如果错误只尝试 3 等等......然后返回预测模型的回归量
library(tidyverse)
library(tsibble)
library(lubridate)
library(fable)
library(data.table)
library(feasts)
library(hts)
library(forecast)
library(forecastHybrid)
library(dplyr)
library(tsoutliers)
library(parallel)
library(future.apply)
library(zoo)
library(purrr)
library(maditr)
lista_func_outl <- list(function(x) { outl <- tso(x, types = c("AO", "LS", "TC", "IO", "SLS"));return(outl)},
function(x) { outl <- tso(x, types = c("AO", "LS", "TC", "IO"));return(outl)},
function(x) { outl <- tso(x, types = c("AO", "LS", "TC"));return(outl)},
function(x) { outl <- tso(x, types = c("AO", "LS"));return(outl)})
#### funcion que recibe una serie, obtiene outliers y devuelve los xreg para arima, nnetar y stlm
xreg_serie <- function(ts, lista_func, n_pred) {
for(i in seq_along(lista_func)) {
try({ outl <- lista_func[[i]](ts); break;}, silent = TRUE)
}
if(nrow(outl$outliers) > 0) {
xreg_return <- outliers.effects(outliers(type = outl$outliers$type,
ind = outl$outliers$ind, weight = outl$outliers$coefhat),
n = length(ts)+n_pred, pars = coefs2poly(outl$fit), weights = TRUE)
} else {xreg_return <- 'no_xreg'}
return(xreg_return)
}
如果我想在我的数据中获得离群回归变量并将它们推断为 9 个月进行预测..我可以在下面的代码行中做到这一点
xreg_input <- xreg_serie(ts = tsdatadx, lista_func = lista_func_outl, n_pred = 9)
另外,我创建了一个自定义函数,使用这些异常值返回预测:
prueba_prono_con_outliers <- function (ts_temp, n_pred, xreg_input, xreg_prono) {
errorvar <- FALSE
if(xreg_input == "no_xreg") {
hmodel <- tryCatch(
hybridModel(ts_temp, models = "ans",
a.args = list(stepwise = FALSE, approximation = FALSE,
method = "ML", lambda = "auto"),
s.args = list(robust = TRUE, s.window = "periodic", lambda = 0,
method = "arima"),
n.args = list(repeats = 50, lambda = "auto"), verbose = FALSE ),
error = function(e) {errorvar = TRUE})
if (errorvar == FALSE) {
hforecast <- forecast(hmodel, h = n_pred)
nombres <- colnames(hforecast$pointForecasts)
forecast_temp <- cbind(hforecast$pointForecasts, hforecast$mean)
colnames(forecast_temp) <- c(nombres, "promedio")
forecast_temp <- as.data.frame(forecast_temp)
forecast_temp$tiempo <- as.Date(time(hforecast$mean))
} else {
columns = c("auto.arima","nnetar","stlm")
hforecast <- data.frame(matrix(nrow = 0, ncol = length(columns)))
colnames(hforecast) = columns
}
} else {
hmodel <- tryCatch(
hybridModel(ts_temp, models = "ans",
a.args = list(stepwise = FALSE, approximation = FALSE,
method = "ML", lambda = "auto", xreg = xreg_input),
s.args = list(robust = TRUE, s.window = "periodic", lambda = 0,
method = "arima", xreg = xreg_input),
n.args = list(repeats = 50, lambda = "auto", xreg = xreg_input), verbose = FALSE),
error = function(e) {errorvar <- TRUE})
if (errorvar == FALSE) {
hforecast <- forecast(hmodel, xreg = xreg_prono)
nombres <- colnames(hforecast$pointForecasts)
forecast_temp <- cbind(hforecast$pointForecasts, hforecast$mean)
colnames(forecast_temp) <- c(nombres, "promedio")
forecast_temp <- as.data.frame(forecast_temp)
forecast_temp$tiempo <- as.Date(time(hforecast$mean))
} else {
columns = c("auto.arima","nnetar","stlm","promedio","tiempo")
forecast_temp <- data.frame(matrix(nrow = 0, ncol = length(columns)))
colnames(forecast_temp) = columns
}
}
return(forecast_temp)
}
我得到了这条线的预测
n_pred <- 9
prono <- prueba_prono_con_outliers(ts_temp = tsdatadx, n_pred = n_pred,
xreg_input = head(xreg_input, length(tsdatadx)),
xreg_prono = tail(xreg_input, n_pred))
1 个时间序列的整个过程在我的笔记本电脑上大约需要 6 分钟......但我有将近 3000 个时间序列
假设我将这 3000 个时间序列存储在一个名为 ts.data.full 的列表中……然后运行以下行
prono_full <- future_lapply(ts.data.full,
function(x) prueba_prono_con_outliers(x, n_pred = n_pred,
xreg_input = head(xreg_serie(x, lista_func = lista_func_outl, n_pred), length(x)),
xreg_prono = tail(xreg_serie(x, lista_func = lista_func_outl, n_pred), n_pred)))
那条线需要很长时间才能完成……超过 1 天而且还在运行……
我知道 forecast 包中的 tsoutliers 函数非常快,但只能检测附加异常值...
我也知道包
fable
存在并且可以非常快速地预测我的3000个时间序列,但不支持干预分析......我认为这个数据真的需要一个很好的异常值处理......你可以看到非常由于 covid,2020 年初诊断为这种诊断的医疗预约大幅下降,并且自 2022 年 2 月以来快速增长
抱歉解释太多...最后我的问题是:
有没有办法加快这个过程?也许是另一种包或另一种方法来一次性检测异常值和预测……或者我的编程效率很低?
包
fable
将来会支持干预分析吗?