I am trying to pass my model and feature matrix to SHAPforxgboost, but I am running into problems because I am using a tunable recipe and model.
Most examples showing how tidymodels interacts with SHAPforxgboost (including this one) have a step that requires prep() and bake(), but that isn't possible with the tunable recipe I'm using.
Any guidance you can share would be greatly appreciated.
# LOAD PACKAGES
if (!require("pacman")) install.packages("pacman")
pacman::p_load(
  tidymodels,
  tidyverse,
  doParallel,
  janitor,
  AmesHousing,
  vip,
  randomForest,
  ranger,    # engine for rand_forest()
  xgboost,   # engine for boost_tree()
  baguette,  # bag_tree()
  finetune,
  lightgbm,
  bonsai,    # lightgbm engine for boost_tree()
  tictoc,
  pdp
)
# load the housing data and clean names
ames_data <- make_ames() %>%
janitor::clean_names()
# split into training and testing datasets. Stratify by Sale price
ames_split <- rsample::initial_split(
  ames_data,
prop = 0.8,
strata = sale_price
)
# CREATE TRAINING AND TESTING OBJECTS FROM THE SPLIT OBJECT
ames_train <- training(ames_split)
ames_test <- testing(ames_split)
# CREATE RESAMPLES TO CHOOSE AND COMPARE MODELS
set.seed(234)
ames_folds <- vfold_cv(ames_train, strata = sale_price, v = 5)
# BASE RECIPE
base_rec <-
recipe(sale_price ~ ., data = ames_train) %>%
step_log(sale_price, base = 10) %>%
step_YeoJohnson(lot_area, gr_liv_area) %>%
step_other(neighborhood, threshold = .1) %>%
step_dummy(all_nominal()) %>%
step_zv(all_predictors()) %>%
step_ns(latitude, longitude, deg_free = tune())
# REDUCED RECIPE WITH LOG TRANSFORMATIONS
bt_rec <-
recipe(sale_price ~ overall_qual + gr_liv_area + bsmt_qual + garage_cars + garage_area + year_built + total_bsmt_sf + exter_qual + first_flr_sf + kitchen_qual, data = ames_train) %>%
step_log(sale_price, base = 10) %>%
  step_log(gr_liv_area, base = 10) %>%
  step_dummy(all_nominal_predictors())
# DEFINE A BAGGED RANDOM FOREST MODEL
bagged_spec <- bag_tree(
tree_depth = tune(),
min_n = tune(),
cost_complexity = tune()
) %>%
set_mode("regression") %>%
set_engine("rpart", times = 25L)
# DEFINE A RANGER RANDOM FOREST MODEL
rf_spec <-
rand_forest(
mtry = tune(),
min_n = tune(),
trees = 500
) %>%
set_engine("ranger") %>%
set_mode("regression")
# DEFINE AN XGBOOST MODEL
xgb_spec <- boost_tree(
trees = 500,
tree_depth = tune(),
min_n = tune(),
loss_reduction = tune(),
sample_size = tune(),
mtry = tune(),
learn_rate = tune()
) %>%
set_engine("xgboost", importance = TRUE) %>%
set_mode("regression")
# DEFINE A BOOSTED TREE ENSEMBLE MODEL
bt_spec <-
boost_tree(
learn_rate = tune(),
stop_iter = tune(),
trees = 500
) %>%
set_engine("lightgbm", num_leaves = tune()) %>%
set_mode("regression")
wflw_set <-
workflow_set(
preproc = list(base = base_rec, bt = bt_rec),
models = list(xgb = xgb_spec, bagged = bagged_spec, rf = rf_spec, bt = bt_spec)
)
# UPDATE MTRY PARAMETER FOR THE BASE XGBOOST
base_xgb_param <- wflw_set %>%
extract_workflow(
id = "base_xgb"
) %>%
hardhat::extract_parameter_set_dials() %>%
update(mtry = mtry(c(1, 277)))
# UPDATE MTRY PARAMETER FOR THE BASE RF MODEL
base_rf_param <- wflw_set %>%
extract_workflow(
id = "base_rf"
) %>%
hardhat::extract_parameter_set_dials() %>%
update(mtry = mtry(c(1, 277)))
# UPDATE MTRY PARAMETER FOR THE BT XGBOOST MODEL
bt_xgb_param <- wflw_set %>%
extract_workflow(
id = "bt_xgb"
) %>%
hardhat::extract_parameter_set_dials() %>%
update(mtry = mtry(c(1, 21)))
# UPDATE MTRY PARAMETER FOR THE BT RF MODEL
bt_rf_param <- wflw_set %>%
extract_workflow(
id = "bt_rf"
) %>%
hardhat::extract_parameter_set_dials() %>%
update(mtry = mtry(c(1, 21)))
# UPDATE THE WORKFLOW SET WITH THE NEW PARAMETERS
wf_set_tune_list_finalize <- wflw_set %>%
option_add(param_info = base_xgb_param, id = "base_xgb") %>%
option_add(param_info = base_rf_param, id = "base_rf") %>%
option_add(param_info = bt_xgb_param, id = "bt_xgb") %>%
option_add(param_info = bt_rf_param, id = "bt_rf")
# SET THE RACING CONTROL OPTIONS
race_ctrl <-
control_race(
save_pred = TRUE,
parallel_over = "everything",
save_workflow = TRUE
)
# DETECT THE NUMBER OF CORES
cores <- parallel::detectCores(logical = FALSE)
# CREATE A SET OF COPIES OF R RUNNING IN PARALLEL AND COMMUNICATING VIA SOCKETS
cl <- makePSOCKcluster(cores)
# REGISTER THE PARALLEL BACKEND
doParallel::registerDoParallel(cl)
# APPLY RACE ANOVA TUNING TO EACH WORKFLOW IN THE WORKFLOW SET
tictoc::tic()
race_results <- wf_set_tune_list_finalize %>%
workflow_map(
"tune_race_anova",
seed = 123,
resamples = ames_folds,
grid = 25,
control = race_ctrl,
verbose = TRUE
)
tictoc::toc()
# STOP THE PARALLEL BACKEND
parallel::stopCluster(cl)
# FINALIZE THE BEST WORKFLOW
best_results <-
race_results %>%
extract_workflow_set_result("base_xgb") %>%
select_best(metric = "rmse")
# EXTRACT THE XGB WORKFLOW FROM THE WORKFLOW SET, FINALIZE IT WITH THE
# NUMERICALLY BEST SETTINGS, FIT IT ON THE TRAINING SET, AND EVALUATE ON THE TEST SET
test_results <-
race_results %>%
extract_workflow("base_xgb") %>%
finalize_workflow(best_results) %>%
  last_fit(split = ames_split, metrics = metric_set(rmse))
"but that isn't possible with the tunable recipe I'm using."
That's the issue: you need to finalize the tuning parameters (by selecting the best values). Once you have a fully specified recipe, you can get the features out of it. You can do this with fit_best().
library(tidymodels)
tidymodels_prefer()
data(Chicago)
Chicago <-
Chicago %>%
select(ridership, date, all_of(stations))
split <- initial_time_split(Chicago, prop = .995)
chi_train <- training(split)
chi_test <- testing(split)
time_val_split <-
sliding_period(
chi_train,
date,
"month",
lookback = 181,
assess_stop = 1
)
date_and_holidays_and_pca <-
recipe(ridership ~ ., data = chi_train) %>%
step_rm(date) %>%
  step_pca(all_of(stations), num_comp = tune())
lm_spec <- linear_reg() %>% set_engine("lm")
knn_spec <- nearest_neighbor(neighbors = tune()) %>% set_mode("regression")
# use this for fit_best()
ctrl_g <- control_grid(save_workflow = TRUE)
chi_set <-
workflow_set(
preproc = list(pca = date_and_holidays_and_pca),
models = list(lm = lm_spec, knn = knn_spec),
cross = TRUE
) %>%
option_add(control = ctrl_g) %>%
workflow_map(resamples = time_val_split, seed = 1)
# Finalized fit for each model/preproc combo
best_workflow_fits <- map(chi_set$result, fit_best)
# Get the predictor training data for the finalized model
feature_sets <-
map(best_workflow_fits,
~ extract_recipe(.x) %>% bake(new_data = chi_train))
Created on 2023-11-26 with reprex v2.0.2
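Carrying the same pattern back to the Ames xgboost workflow in the question, a minimal sketch (not part of the original answer; it assumes the race_results, best_results, and ames_train objects defined above) of handing the finalized fit to SHAPforxgboost could look like this: finalize and fit the workflow, bake the prepped recipe into the predictor matrix the booster actually saw, extract the engine-level xgb.Booster, and pass both to shap.prep().
# SKETCH (ASSUMED, NOT FROM THE ORIGINAL ANSWER): FINALIZE, FIT, AND PASS THE
# BAKED FEATURE MATRIX PLUS THE ENGINE-LEVEL BOOSTER TO SHAPforxgboost
library(SHAPforxgboost)
final_xgb_fit <-
  race_results %>%
  extract_workflow("base_xgb") %>%
  finalize_workflow(best_results) %>%
  fit(data = ames_train)
# BAKE THE PREPPED RECIPE TO GET THE TRAINING PREDICTORS AS A NUMERIC MATRIX
ames_features <-
  extract_recipe(final_xgb_fit) %>%
  bake(new_data = ames_train, all_predictors(), composition = "matrix")
# PULL OUT THE UNDERLYING xgb.Booster OBJECT
xgb_booster <- extract_fit_engine(final_xgb_fit)
# PREPARE AND PLOT SHAP VALUES
shap_long <- shap.prep(xgb_model = xgb_booster, X_train = ames_features)
shap.plot.summary(shap_long)
Because the same prepped recipe that trained the model is baked here, the baked columns and encodings line up with what the booster expects.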