我按照官方文档中的说明进行了操作,也尝试了许多其他变体,但每次都在同一处失败,报错如下:
[LightGBM] [Fatal] Check failed: (split_indices_block_size_data_partition) > (0) at /home/azureuser/localfiles/LightGBM/lightgbm-python/src/treelearner/cuda/cuda_data_partition.cpp, line 280 .
我找不到关于这个错误的原因或解决办法的任何说明。我已经在 conda 环境中分别尝试了 Python 3.10 和 3.11,并使用了最新版本的 lightgbm 和 CUDA。我猜测 CUDA 实现比其他 GPU 实现更快,但并不确定。完整的回溯信息如下:
"name": "LightGBMError",
"message": "Check failed: (split_indices_block_size_data_partition) > (0) at /home/azureuser/localfiles/LightGBM/lightgbm-python/src/treelearner/cuda/cuda_data_partition.cpp, line 280 .
",
"stack": "---------------------------------------------------------------------------
LightGBMError Traceback (most recent call last)
Cell In[2], line 19
12 # Create and train the LightGBM classifier with GPU support
13 clf = lgb.LGBMClassifier(
14 objective='binary',
15 device='cuda',
16 verbose=1,
17 )
---> 19 clf.fit(X_train, y_train)
21 # Predict and evaluate
22 y_pred = clf.predict(X_test)
File /anaconda/envs/py311/lib/python3.11/site-packages/lightgbm/sklearn.py:1421, in LGBMClassifier.fit(self, X, y, sample_weight, init_score, eval_set, eval_names, eval_sample_weight, eval_class_weight, eval_init_score, eval_metric, feature_name, categorical_feature, callbacks, init_model)
1418 else:
1419 valid_sets.append((valid_x, self._le.transform(valid_y)))
-> 1421 super().fit(
1422 X,
1423 _y,
1424 sample_weight=sample_weight,
1425 init_score=init_score,
1426 eval_set=valid_sets,
1427 eval_names=eval_names,
1428 eval_sample_weight=eval_sample_weight,
1429 eval_class_weight=eval_class_weight,
1430 eval_init_score=eval_init_score,
1431 eval_metric=eval_metric,
1432 feature_name=feature_name,
1433 categorical_feature=categorical_feature,
1434 callbacks=callbacks,
1435 init_model=init_model,
1436 )
1437 return self
File /anaconda/envs/py311/lib/python3.11/site-packages/lightgbm/sklearn.py:1015, in LGBMModel.fit(self, X, y, sample_weight, init_score, group, eval_set, eval_names, eval_sample_weight, eval_class_weight, eval_init_score, eval_group, eval_metric, feature_name, categorical_feature, callbacks, init_model)
1012 evals_result: _EvalResultDict = {}
1013 callbacks.append(record_evaluation(evals_result))
-> 1015 self._Booster = train(
1016 params=params,
1017 train_set=train_set,
1018 num_boost_round=self.n_estimators,
1019 valid_sets=valid_sets,
1020 valid_names=eval_names,
1021 feval=eval_metrics_callable, # type: ignore[arg-type]
1022 init_model=init_model,
1023 callbacks=callbacks,
1024 )
1026 # This populates the property self.n_features_, the number of features in the fitted model,
1027 # and so should only be set after fitting.
1028 #
1029 # The related property self._n_features_in, which populates self.n_features_in_,
1030 # is set BEFORE fitting.
1031 self._n_features = self._Booster.num_feature()
File /anaconda/envs/py311/lib/python3.11/site-packages/lightgbm/engine.py:361, in train(params, train_set, num_boost_round, valid_sets, valid_names, feval, init_model, feature_name, categorical_feature, keep_training_booster, callbacks)
349 for cb in callbacks_before_iter:
350 cb(
351 callback.CallbackEnv(
352 model=booster,
(...)
358 )
359 )
--> 361 booster.update(fobj=fobj)
363 evaluation_result_list: List[_LGBM_BoosterEvalMethodResultType] = []
364 # check evaluation result.
File /anaconda/envs/py311/lib/python3.11/site-packages/lightgbm/basic.py:4143, in Booster.update(self, train_set, fobj)
4141 if self.__set_objective_to_none:
4142 raise LightGBMError(\"Cannot update due to null objective function.\")
-> 4143 _safe_call(
4144 _LIB.LGBM_BoosterUpdateOneIter(
4145 self._handle,
4146 ctypes.byref(is_finished),
4147 )
4148 )
4149 self.__is_predicted_cur_iter = [False for _ in range(self.__num_dataset)]
4150 return is_finished.value == 1
File /anaconda/envs/py311/lib/python3.11/site-packages/lightgbm/basic.py:295, in _safe_call(ret)
287 \"\"\"Check the return value from C API call.
288
289 Parameters
(...)
292 The return value from C API calls.
293 \"\"\"
294 if ret != 0:
--> 295 raise LightGBMError(_LIB.LGBM_GetLastError().decode(\"utf-8\"))
LightGBMError: Check failed: (split_indices_block_size_data_partition) > (0) at /home/azureuser/localfiles/LightGBM/lightgbm-python/src/treelearner/cuda/cuda_data_partition.cpp, line 280 .
"
}
事实证明,截至 2024 年末,LightGBM 的 CUDA 版本要求 GPU 的计算能力(compute capability)不低于 6.0。我使用的 NVIDIA Tesla M60 GPU 计算能力为 5.2,因此不兼容。详情请参阅 GitHub 上的相关 issue。