我正在尝试创建一个强大的函数,该函数将使用多个 xgboost 基础学习器和一个随机森林元学习器来训练堆叠集成模型。这是代码:
train_ensemble <- function(data, cpus = 32, n_models = 10) {
# Split
data_train = data[[1]]
data_test = data[[2]]
# Rename the target column
colnames(data_train)[1] = 'target'
colnames(data_test)[1] = 'target'
#data_train$target <- factor(data_train$target, levels = c("0", "1"))
traintask <- makeClassifTask(data = data_train, target = "target")
learners <- list()
for (i in 1:n_models) {
lrn <- makeLearner("classif.xgboost", predict.type = "prob")
lrn$par.vals <- list(objective="binary:logistic", eval_metric="auc",
nrounds=100L, eta=0.1, scale_pos_weight=1)
params <- makeParamSet(
makeDiscreteParam("booster", values = c("gbtree", "gblinear")),
makeNumericParam("eta", lower = 0.01, upper = 0.08),
makeIntegerParam("max_depth", lower = 2L, upper = 11L),
makeNumericParam("alpha", lower = 0L, upper = 8),
makeNumericParam("lambda", lower = 0L, upper = 8),
makeNumericParam("gamma", lower = 0L, upper = 8),
makeNumericParam("min_child_weight", lower = 2L, upper = 8L),
makeNumericParam("subsample", lower = 0.5, upper = 1),
makeNumericParam("colsample_bytree", lower = 0.5, upper = 1),
makeNumericParam("scale_pos_weight", lower = 1, upper = 10)
)
rdesc <- makeResampleDesc("CV", stratify = TRUE, iters = 10L)
ctrl <- makeTuneControlRandom(maxit = 50L)
measures <- mlr3measures::auc()
lrn_tune <- tuneParams(learner = lrn, task = traintask, resampling = rdesc,
measures = list(measures),
par.set = params, control = ctrl,
show.info = TRUE)
learners[[i]] <- setHyperPars(lrn, par.vals = lrn_tune$x)
}
# Create a stacked learner to ensemble the base learners
ensemble_lrn <- makeStackedLearner(base.learners = learners,
predict.type = "prob",
method = "classif.randomForest",
resampling = makeResampleDesc("CV", iters = 10L, stratify = TRUE))
# Train the ensemble on the training set with 10-fold cross-validation
ensemble_model <- train(learner = ensemble_lrn, task = traintask,
train.control = list(method = "repeatedcv",
repeats = 10,
stratify = TRUE,
search = "random",
maximize = TRUE,
cores = cpus))
testtask <- makeClassifTask(data = data_test, target = "target")
ensemble_pred <- predict(ensemble_model, testtask)
return(ensemble_model)
}
但是,这是行不通的。我不太确定问题出在哪里,这是我不断收到的错误:
Error in assert_factor(truth, min.len = 1L, n.levels = 2L, any.missing = FALSE) :
argument "truth" is missing, with no default
我做了一些广泛的超参数调整。我使用交叉验证和参数搜索空间调整每个 xgboost 学习器的超参数。然后,我使用
makeStackedLearner()
创建了一个堆叠学习器,其中 xgboost 学习器作为基础学习器,随机森林作为元学习器。然后,我使用 10 折交叉验证在训练集上训练集成,并为测试数据创建一个 TaskClassif
对象,使用集成模型预测测试数据的目标变量,然后返回经过训练的集成模型。这就是整个故事。
但是代码有什么问题呢?为什么它不起作用?
这里是我训练集的一小部分:
structure(list(target = structure(c(1L, 2L, 1L, 2L, 1L, 1L, 2L, 2L,
1L, 1L, 1L, 2L, 1L, 1L, 1L, 1L, 2L, 1L, 1L, 2L), levels = c("0",
"1"), class = "factor"), Bone = c(0.68, 0.82, 0.6, 0.63,
0.5, 0.72, 0.66, 0.58, 0.6, 0.6, 0.56, 0.7, 0.59, 0.55, 0.47,
0.6, 0.64, 0.67, 0.62, 0.59), Blood = c(0.54, 0.65, 0.48,
0.54, 0.22, 0.58, 0.51, 0.38, 0.46, 0.42, 0.3, 0.62, 0.4, 0.29,
0.2, 0.48, 0.54, 0.55, 0.37, 0.46), Skin = c(0.77, 0.83,
0.74, 0.75, 0.75, 0.78, 0.75, 0.76, 0.76, 0.76, 0.75, 0.76, 0.76,
0.75, 0.74, 0.75, 0.76, 0.75, 0.75, 0.74), Brain = c(0.51,
0.59, 0.46, 0.5, 0.33, 0.56, 0.49, 0.39, 0.45, 0.46, 0.37, 0.57,
0.43, 0.37, 0.32, 0.47, 0.48, 0.55, 0.43, 0.47), Liver = c(0.58,
0.62, 0.55, 0.58, 0.48, 0.58, 0.62, 0.5, 0.51, 0.55, 0.53, 0.65,
0.54, 0.48, 0.46, 0.57, 0.57, 0.65, 0.59, 0.59), Stomach = c(0.58,
0.65, 0.54, 0.59, 0.5, 0.61, 0.6, 0.51, 0.51, 0.55, 0.56, 0.61,
0.55, 0.51, 0.47, 0.59, 0.55, 0.63, 0.57, 0.59), Heart = c(0.62,
0.66, 0.55, 0.61, 0.44, 0.63, 0.58, 0.51, 0.55, 0.55, 0.46, 0.66,
0.53, 0.47, 0.41, 0.56, 0.57, 0.64, 0.51, 0.56), T.cells = c(0.53,
0.75, 0.5, 0.51, 0.47, 0.66, 0.52, 0.52, 0.53, 0.54, 0.47, 0.56,
0.49, 0.48, 0.45, 0.51, 0.52, 0.55, 0.52, 0.53), B.cells = c(0.52,
0.81, 0.46, 0.47, 0.39, 0.7, 0.49, 0.47, 0.5, 0.54, 0.41, 0.54,
0.44, 0.43, 0.37, 0.48, 0.49, 0.53, 0.48, 0.52)), row.names = c("Pt1",
"Pta101", "Pta106", "Ptc11", "Ptc17", "Ptc18", "Ptb2", "Ptm26", "Ptm28",
"Pta29", "Pta3", "Pta34", "Ptb37", "Ptb38", "Ptb39", "Ptb4", "Ptc44",
"Ptc46", "Ptb47", "Pta48"), class = "data.frame")