I'm running a HistGradientBoostingClassifier with some categorical features, and I'm getting ValueError: could not convert string to float. I defined the categorical features via the categorical_features parameter, and the documentation for HistGradientBoostingClassifier says it accepts categorical variables as features. Why am I getting this error?
I've tried different categorical_features inputs (booleans, indices), thinking I was doing something wrong, but I still can't get it to work.
Here is a sample of my code:
import numpy as np
import pandas as pd
from sklearn.ensemble import HistGradientBoostingClassifier

X_train = pd.DataFrame({'person_age': {24716: 33.0,
37121: 28.0,
34325: 24.0,
7068: 24.0,
11680: 23.0,
17900: 34.0,
16108: 22.0,
26879: 27.0,
37408: 23.0,
10782: 26.0,
40871: 26.0,
16929: 23.0,
21868: 28.0,
34622: 31.0,
14948: 24.0,
22929: 33.0,
15295: 26.0,
16620: 23.0,
42191: 24.0,
13442: 26.0},
'person_gender': {24716: 'female',
37121: 'male',
34325: 'male',
7068: 'female',
11680: 'male',
17900: 'female',
16108: 'male',
26879: 'female',
37408: 'male',
10782: 'male',
40871: 'male',
16929: 'male',
21868: 'male',
34622: 'male',
14948: 'male',
22929: 'female',
15295: 'female',
16620: 'female',
42191: 'male',
13442: 'female'}})
y_train = np.array([0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0])
X_train['person_gender'] = X_train['person_gender'].astype('category')
clf_hgb = HistGradientBoostingClassifier(categorical_features=['person_gender'])
clf_hgb.fit(X_train, y_train)
The traceback is below:
ValueError Traceback (most recent call last)
/tmp/ipykernel_168/499751463.py in ?()
1 clf_hgb = HistGradientBoostingClassifier(categorical_features=['person_gender'])
----> 2 clf_hgb.fit(X_train, y_train)
/opt/conda/envs/anaconda-ai-2024.04-py310/lib/python3.10/site-packages/sklearn/base.py in ?(estimator, *args, **kwargs)
1147 skip_parameter_validation=(
1148 prefer_skip_nested_validation or global_skip_validation
1149 )
1150 ):
-> 1151 return fit_method(estimator, *args, **kwargs)
/opt/conda/envs/anaconda-ai-2024.04-py310/lib/python3.10/site-packages/sklearn/ensemble/_hist_gradient_boosting/gradient_boosting.py in ?(self, X, y, sample_weight)
367 acc_apply_split_time = 0.0 # time spent splitting nodes
368 acc_compute_hist_time = 0.0 # time spent computing histograms
369 # time spent predicting X for gradient and hessians update
370 acc_prediction_time = 0.0
--> 371 X, y = self._validate_data(X, y, dtype=[X_DTYPE], force_all_finite=False)
372 y = self._encode_y(y)
373 check_consistent_length(X, y)
374 # Do not create unit sample weights by default to later skip some
/opt/conda/envs/anaconda-ai-2024.04-py310/lib/python3.10/site-packages/sklearn/base.py in ?(self, X, y, reset, validate_separately, cast_to_ndarray, **check_params)
617 if "estimator" not in check_y_params:
618 check_y_params = {**default_check_params, **check_y_params}
619 y = check_array(y, input_name="y", **check_y_params)
620 else:
--> 621 X, y = check_X_y(X, y, **check_params)
622 out = X, y
623
624 if not no_val_X and check_params.get("ensure_2d", True):
/opt/conda/envs/anaconda-ai-2024.04-py310/lib/python3.10/site-packages/sklearn/utils/validation.py in ?(X, y, accept_sparse, accept_large_sparse, dtype, order, copy, force_all_finite, ensure_2d, allow_nd, multi_output, ensure_min_samples, ensure_min_features, y_numeric, estimator)
1143 raise ValueError(
1144 f"{estimator_name} requires y to be passed, but the target y is None"
1145 )
1146
-> 1147 X = check_array(
1148 X,
1149 accept_sparse=accept_sparse,
1150 accept_large_sparse=accept_large_sparse,
/opt/conda/envs/anaconda-ai-2024.04-py310/lib/python3.10/site-packages/sklearn/utils/validation.py in ?(array, accept_sparse, accept_large_sparse, dtype, order, copy, force_all_finite, ensure_2d, allow_nd, ensure_min_samples, ensure_min_features, estimator, input_name)
914 )
915 array = xp.astype(array, dtype, copy=False)
916 else:
917 array = _asarray_with_order(array, order=order, dtype=dtype, xp=xp)
--> 918 except ComplexWarning as complex_warning:
919 raise ValueError(
920 "Complex data not supported\n{}\n".format(array)
921 ) from complex_warning
/opt/conda/envs/anaconda-ai-2024.04-py310/lib/python3.10/site-packages/sklearn/utils/_array_api.py in ?(array, dtype, order, copy, xp)
376 # Use NumPy API to support order
377 if copy is True:
378 array = numpy.array(array, order=order, dtype=dtype)
379 else:
--> 380 array = numpy.asarray(array, order=order, dtype=dtype)
381
382 # At this point array is a NumPy ndarray. We convert it to an array
383 # container that is consistent with the input's namespace.
/opt/conda/envs/anaconda-ai-2024.04-py310/lib/python3.10/site-packages/pandas/core/generic.py in ?(self, dtype)
2082 def __array__(self, dtype: npt.DTypeLike | None = None) -> np.ndarray:
2083 values = self._values
-> 2084 arr = np.asarray(values, dtype=dtype)
2085 if (
2086 astype_is_view(values.dtype, arr.dtype)
2087 and using_copy_on_write()
ValueError: could not convert string to float: 'female'
In scikit-learn versions < 1.4, categorical features in the histogram GBMs needed to be pre-encoded as non-negative integers.
From the v1.3.2 user guide:
The cardinality of each categorical feature must be less than the max_bins parameter, and each categorical feature is expected to be encoded in [0, max_bins - 1]. To that end, it might be useful to pre-process the data with an OrdinalEncoder as done in Categorical Feature Support in Gradient Boosting.
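For scikit-learn < 1.4, a minimal sketch of that pre-processing (my own illustration, not from the question, using a ColumnTransformer around OrdinalEncoder and the variable names above) could look like this:

from sklearn.compose import make_column_transformer
from sklearn.ensemble import HistGradientBoostingClassifier
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import OrdinalEncoder

# Encode 'person_gender' to non-negative integer codes; pass the other columns through.
preprocessor = make_column_transformer(
    (OrdinalEncoder(), ['person_gender']),
    remainder='passthrough',
)

# The encoded column comes first in the transformer's output array, so it is
# marked as categorical by position (index 0) rather than by name.
model = make_pipeline(
    preprocessor,
    HistGradientBoostingClassifier(categorical_features=[0]),
)
model.fit(X_train, y_train)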
From the v1.4 changelog:
Categorical features no longer need to be encoded with numbers.
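So upgrading to scikit-learn >= 1.4 should make the code in the question work as written, since 'person_gender' already has a pandas category dtype and is named in categorical_features. As a sketch (assuming version 1.4 or later), the column can also be detected automatically from its dtype instead of being listed by name:

from sklearn.ensemble import HistGradientBoostingClassifier

# 'person_gender' has a pandas 'category' dtype, so it is picked up automatically.
clf_hgb = HistGradientBoostingClassifier(categorical_features="from_dtype")
clf_hgb.fit(X_train, y_train)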