基于XGboost模型制作集成学习函数

问题描述 投票:0回答:0

我正在尝试创建一个强大的函数,该函数将使用多个 xgboost 基础学习器和一个随机森林元学习器来训练堆叠集成模型。这是代码:

train_ensemble <- function(data, cpus = 32, n_models = 10) {
  
  # Split 
  data_train = data[[1]]
  data_test = data[[2]]
  
  # Rename the target column 
  colnames(data_train)[1] = 'target'
  colnames(data_test)[1] = 'target'
  
  #data_train$target <- factor(data_train$target, levels = c("0", "1"))
  
  traintask <- makeClassifTask(data = data_train, target = "target")
  
  learners <- list()
  for (i in 1:n_models) {
    lrn <- makeLearner("classif.xgboost", predict.type = "prob")
    lrn$par.vals <- list(objective="binary:logistic", eval_metric="auc", 
                         nrounds=100L, eta=0.1, scale_pos_weight=1)
    
    params <- makeParamSet(
      makeDiscreteParam("booster", values = c("gbtree", "gblinear")), 
      makeNumericParam("eta", lower = 0.01, upper = 0.08), 
      makeIntegerParam("max_depth", lower = 2L, upper = 11L), 
      makeNumericParam("alpha", lower = 0L, upper = 8),
      makeNumericParam("lambda", lower = 0L, upper = 8),
      makeNumericParam("gamma", lower = 0L, upper = 8),
      makeNumericParam("min_child_weight", lower = 2L, upper = 8L), 
      makeNumericParam("subsample", lower = 0.5, upper = 1), 
      makeNumericParam("colsample_bytree", lower = 0.5, upper = 1),
      makeNumericParam("scale_pos_weight", lower = 1, upper = 10)
    )
    
    rdesc <- makeResampleDesc("CV", stratify = TRUE, iters = 10L)
    ctrl <- makeTuneControlRandom(maxit = 50L)
    measures <- mlr3measures::auc()
    lrn_tune <- tuneParams(learner = lrn, task = traintask, resampling = rdesc,
                           measures = list(measures),
                           par.set = params, control = ctrl, 
                           show.info = TRUE)
    learners[[i]] <- setHyperPars(lrn, par.vals = lrn_tune$x)
  }
  
  # Create a stacked learner to ensemble the base learners
  ensemble_lrn <- makeStackedLearner(base.learners = learners,
                                     predict.type = "prob",
                                     method = "classif.randomForest",
                                     resampling = makeResampleDesc("CV", iters = 10L, stratify = TRUE))
  
  # Train the ensemble on the training set with 10-fold cross-validation
  ensemble_model <- train(learner = ensemble_lrn, task = traintask,
                          train.control = list(method = "repeatedcv",
                                               repeats = 10,
                                               stratify = TRUE,
                                               search = "random",
                                               maximize = TRUE,
                                               cores = cpus))
  
  testtask <- makeClassifTask(data = data_test, target = "target")
  ensemble_pred <- predict(ensemble_model, testtask)
  
  return(ensemble_model)
}
     

但是,这是行不通的。我不太确定问题出在哪里,这是我不断收到的错误:

Error in assert_factor(truth, min.len = 1L, n.levels = 2L, any.missing = FALSE) :
argument "truth" is missing, with no default

我做了一些广泛的超参数调整。我使用交叉验证和参数搜索空间调整每个 xgboost 学习器的超参数。然后,我使用

makeStackedLearner()
创建了一个堆叠学习器,其中 xgboost 学习器作为基础学习器,随机森林作为元学习器。然后,我使用 10 折交叉验证在训练集上训练集成,并为测试数据创建一个
TaskClassif
对象,使用集成模型预测测试数据的目标变量,然后返回经过训练的集成模型。这就是整个故事。

但是代码有什么问题呢?为什么它不起作用?

这里是我训练集的一小部分:

structure(list(target = structure(c(1L, 2L, 1L, 2L, 1L, 1L, 2L, 2L, 
1L, 1L, 1L, 2L, 1L, 1L, 1L, 1L, 2L, 1L, 1L, 2L), levels = c("0", 
"1"), class = "factor"), Bone = c(0.68, 0.82, 0.6, 0.63, 
0.5, 0.72, 0.66, 0.58, 0.6, 0.6, 0.56, 0.7, 0.59, 0.55, 0.47, 
0.6, 0.64, 0.67, 0.62, 0.59), Blood = c(0.54, 0.65, 0.48, 
0.54, 0.22, 0.58, 0.51, 0.38, 0.46, 0.42, 0.3, 0.62, 0.4, 0.29, 
0.2, 0.48, 0.54, 0.55, 0.37, 0.46), Skin = c(0.77, 0.83, 
0.74, 0.75, 0.75, 0.78, 0.75, 0.76, 0.76, 0.76, 0.75, 0.76, 0.76, 
0.75, 0.74, 0.75, 0.76, 0.75, 0.75, 0.74), Brain = c(0.51, 
0.59, 0.46, 0.5, 0.33, 0.56, 0.49, 0.39, 0.45, 0.46, 0.37, 0.57, 
0.43, 0.37, 0.32, 0.47, 0.48, 0.55, 0.43, 0.47), Liver = c(0.58, 
0.62, 0.55, 0.58, 0.48, 0.58, 0.62, 0.5, 0.51, 0.55, 0.53, 0.65, 
0.54, 0.48, 0.46, 0.57, 0.57, 0.65, 0.59, 0.59), Stomach = c(0.58, 
0.65, 0.54, 0.59, 0.5, 0.61, 0.6, 0.51, 0.51, 0.55, 0.56, 0.61, 
0.55, 0.51, 0.47, 0.59, 0.55, 0.63, 0.57, 0.59), Heart = c(0.62, 
0.66, 0.55, 0.61, 0.44, 0.63, 0.58, 0.51, 0.55, 0.55, 0.46, 0.66, 
0.53, 0.47, 0.41, 0.56, 0.57, 0.64, 0.51, 0.56), T.cells = c(0.53, 
0.75, 0.5, 0.51, 0.47, 0.66, 0.52, 0.52, 0.53, 0.54, 0.47, 0.56, 
0.49, 0.48, 0.45, 0.51, 0.52, 0.55, 0.52, 0.53), B.cells = c(0.52, 
0.81, 0.46, 0.47, 0.39, 0.7, 0.49, 0.47, 0.5, 0.54, 0.41, 0.54, 
0.44, 0.43, 0.37, 0.48, 0.49, 0.53, 0.48, 0.52)), row.names = c("Pt1", 
"Pta101", "Pta106", "Ptc11", "Ptc17", "Ptc18", "Ptb2", "Ptm26", "Ptm28", 
"Pta29", "Pta3", "Pta34", "Ptb37", "Ptb38", "Ptb39", "Ptb4", "Ptc44", 
"Ptc46", "Ptb47", "Pta48"), class = "data.frame")
r function machine-learning xgboost ensemble-learning
© www.soinside.com 2019 - 2024. All rights reserved.