我有一个 OneVsOne 模型,它在文本特征字段和目标字段上运行良好。为了进一步扩展到多类模型(即具有多个文本特征字段),我认为搭配 Logistic 回归的 OneVsRest 比较合适。
但是,当我使用以下管道时:
# Text-classification pipeline: raw token counts -> TF-IDF weighting ->
# one-vs-rest logistic regression, trained in a single job.
model = Pipeline(
    steps=[
        ("vect", CountVectorizer()),
        ("tfidfT", TfidfTransformer()),
        (
            "clf",
            OneVsRestClassifier(estimator=LogisticRegression(), n_jobs=1),
        ),
    ]
)
尝试使用 Logistic 回归运行 OneVsRest 分类器时出现以下错误:
ValueError: Found input variables with inconsistent numbers of samples: [3, 224]
特征字段位于一个 224 行的 pandas DataFrame 中,目标字段是一个长度为 224 的 pandas Series。数据中没有空值。
这是完整的回溯:
ValueError Traceback (most recent call last)
File c:\Users\u363028\OneDrive - IBERDROLA S.A\Tools\sklearn\defect_autocategorisation_main9.py:127
119 model = Pipeline([
120 ('vect', CountVectorizer()),
121 ('tfidfT', TfidfTransformer()),
122 ('clf', OneVsRestClassifier(LogisticRegression(), n_jobs = 1))
123 ])
124 #model = OneVsRestClassifier(LogisticRegression())
125
126 # Initialize the classifier
--> 127 model.fit(X,y)
128 predicted = model.predict(X_test)
129 #predicted = model.predict(X_test, Y_test)
130
131 # creating a confusion matrix
(...)
140
141 # Generate classification report
File c:\Users\u363028\OneDrive - IBERDROLA S.A\Tools\sklearn\sk_learn\Lib\site-packages\sklearn\base.py:1473, in _fit_context.<locals>.decorator.<locals>.wrapper(estimator, *args, **kwargs)
1466 estimator._validate_params()
1468 with config_context(
1469 skip_parameter_validation=(
1470 prefer_skip_nested_validation or global_skip_validation
1471 )
1472 ):
-> 1473 return fit_method(estimator, *args, **kwargs)
File c:\Users\u363028\OneDrive - IBERDROLA S.A\Tools\sklearn\sk_learn\Lib\site-packages\sklearn\pipeline.py:473, in Pipeline.fit(self, X, y, **params)
471 if self._final_estimator != "passthrough":
472 last_step_params = routed_params[self.steps[-1][0]]
--> 473 self._final_estimator.fit(Xt, y, **last_step_params["fit"])
475 return self
File c:\Users\u363028\OneDrive - IBERDROLA S.A\Tools\sklearn\sk_learn\Lib\site-packages\sklearn\base.py:1473, in _fit_context.<locals>.decorator.<locals>.wrapper(estimator, *args, **kwargs)
1466 estimator._validate_params()
1468 with config_context(
1469 skip_parameter_validation=(
1470 prefer_skip_nested_validation or global_skip_validation
1471 )
1472 ):
-> 1473 return fit_method(estimator, *args, **kwargs)
File c:\Users\u363028\OneDrive - IBERDROLA S.A\Tools\sklearn\sk_learn\Lib\site-packages\sklearn\multiclass.py:370, in OneVsRestClassifier.fit(self, X, y, **fit_params)
366 columns = (col.toarray().ravel() for col in Y.T)
367 # In cases where individual estimators are very fast to train setting
368 # n_jobs > 1 in can results in slower performance due to the overhead
369 # of spawning threads. See joblib issue #112.
--> 370 self.estimators_ = Parallel(n_jobs=self.n_jobs, verbose=self.verbose)(
371 delayed(_fit_binary)(
372 self.estimator,
373 X,
374 column,
375 fit_params=routed_params.estimator.fit,
376 classes=[
377 "not %s" % self.label_binarizer_.classes_[i],
378 self.label_binarizer_.classes_[i],
379 ],
380 )
381 for i, column in enumerate(columns)
382 )
384 if hasattr(self.estimators_[0], "n_features_in_"):
385 self.n_features_in_ = self.estimators_[0].n_features_in_
File c:\Users\u363028\OneDrive - IBERDROLA S.A\Tools\sklearn\sk_learn\Lib\site-packages\sklearn\utils\parallel.py:74, in Parallel.__call__(self, iterable)
69 config = get_config()
70 iterable_with_config = (
71 (_with_config(delayed_func, config), args, kwargs)
72 for delayed_func, args, kwargs in iterable
73 )
---> 74 return super().__call__(iterable_with_config)
File c:\Users\u363028\OneDrive - IBERDROLA S.A\Tools\sklearn\sk_learn\Lib\site-packages\joblib\parallel.py:1918, in Parallel.__call__(self, iterable)
1916 output = self._get_sequential_output(iterable)
1917 next(output)
-> 1918 return output if self.return_generator else list(output)
1920 # Let's create an ID that uniquely identifies the current call. If the
1921 # call is interrupted early and that the same instance is immediately
1922 # re-used, this id will be used to prevent workers that were
1923 # concurrently finalizing a task from the previous call to run the
1924 # callback.
1925 with self._lock:
File c:\Users\u363028\OneDrive - IBERDROLA S.A\Tools\sklearn\sk_learn\Lib\site-packages\joblib\parallel.py:1847, in Parallel._get_sequential_output(self, iterable)
1845 self.n_dispatched_batches += 1
1846 self.n_dispatched_tasks += 1
-> 1847 res = func(*args, **kwargs)
1848 self.n_completed_tasks += 1
1849 self.print_progress()
File c:\Users\u363028\OneDrive - IBERDROLA S.A\Tools\sklearn\sk_learn\Lib\site-packages\sklearn\utils\parallel.py:136, in _FuncWrapper.__call__(self, *args, **kwargs)
134 config = {}
135 with config_context(**config):
--> 136 return self.function(*args, **kwargs)
File c:\Users\u363028\OneDrive - IBERDROLA S.A\Tools\sklearn\sk_learn\Lib\site-packages\sklearn\multiclass.py:93, in _fit_binary(estimator, X, y, fit_params, classes)
91 else:
92 estimator = clone(estimator)
---> 93 estimator.fit(X, y, **fit_params)
94 return estimator
File c:\Users\u363028\OneDrive - IBERDROLA S.A\Tools\sklearn\sk_learn\Lib\site-packages\sklearn\base.py:1473, in _fit_context.<locals>.decorator.<locals>.wrapper(estimator, *args, **kwargs)
1466 estimator._validate_params()
1468 with config_context(
1469 skip_parameter_validation=(
1470 prefer_skip_nested_validation or global_skip_validation
1471 )
1472 ):
-> 1473 return fit_method(estimator, *args, **kwargs)
File c:\Users\u363028\OneDrive - IBERDROLA S.A\Tools\sklearn\sk_learn\Lib\site-packages\sklearn\linear_model\_logistic.py:1223, in LogisticRegression.fit(self, X, y, sample_weight)
1220 else:
1221 _dtype = [np.float64, np.float32]
-> 1223 X, y = self._validate_data(
1224 X,
1225 y,
1226 accept_sparse="csr",
1227 dtype=_dtype,
1228 order="C",
1229 accept_large_sparse=solver not in ["liblinear", "sag", "saga"],
1230 )
1231 check_classification_targets(y)
1232 self.classes_ = np.unique(y)
File c:\Users\u363028\OneDrive - IBERDROLA S.A\Tools\sklearn\sk_learn\Lib\site-packages\sklearn\base.py:650, in BaseEstimator._validate_data(self, X, y, reset, validate_separately, cast_to_ndarray, **check_params)
648 y = check_array(y, input_name="y", **check_y_params)
649 else:
--> 650 X, y = check_X_y(X, y, **check_params)
651 out = X, y
653 if not no_val_X and check_params.get("ensure_2d", True):
File c:\Users\u363028\OneDrive - IBERDROLA S.A\Tools\sklearn\sk_learn\Lib\site-packages\sklearn\utils\validation.py:1320, in check_X_y(X, y, accept_sparse, accept_large_sparse, dtype, order, copy, force_writeable, force_all_finite, ensure_2d, allow_nd, multi_output, ensure_min_samples, ensure_min_features, y_numeric, estimator)
1301 X = check_array(
1302 X,
1303 accept_sparse=accept_sparse,
(...)
1315 input_name="X",
1316 )
1318 y = _check_y(y, multi_output=multi_output, y_numeric=y_numeric, estimator=estimator)
-> 1320 check_consistent_length(X, y)
1322 return X, y
File c:\Users\u363028\OneDrive - IBERDROLA S.A\Tools\sklearn\sk_learn\Lib\site-packages\sklearn\utils\validation.py:457, in check_consistent_length(*arrays)
455 uniques = np.unique(lengths)
456 if len(uniques) > 1:
--> 457 raise ValueError(
458 "Found input variables with inconsistent numbers of samples: %r"
459 % [int(l) for l in lengths]
460 )
ValueError: Found input variables with inconsistent numbers of samples: [3, 224]
有一个类似的 Stack Overflow 问题:“ValueError: 模型的特征数量必须与输入匹配”,但该问题中的建议以及其他几个类似问题中的建议对我都不起作用。
虽然我的数据是文本,但顺便说明一下:上述管道在使用 Iris 数据集时也会导致相同的错误,而仅运行分类器(即省略向量化器和转换器)时则能成功完成。但是,仅在文本数据上运行分类器是行不通的,会出现以下预期中的错误:
ValueError: could not convert string to float: 'Jacket'
我知道 OneHot 编码,但这个“样本数量不一致”问题似乎与任何编码问题无关,我想在解决任何其他问题之前解决这个问题。
编辑 22/10/24:这是一个使用 iris 数据集的最小可复现示例:
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split
from sklearn.multiclass import OneVsRestClassifier
from sklearn.linear_model import LogisticRegression
# Load the training set from disk and echo it for inspection.
training_data = pd.read_csv('iris_dataset.csv')
print(training_data)

# Feature matrix: the four measurement columns.
X = training_data[
    ['sepal.length', 'sepal.width', 'petal.length', 'petal.width']
]

# Target vector: the class label of every row.
y = training_data['variety']

# Hold out half of the rows as a test portion (fixed seed for repeatability).
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.5, random_state=42
)

# Vectoriser -> TF-IDF transformer -> one-vs-rest logistic regression.
model = Pipeline(
    steps=[
        ('vect', CountVectorizer()),
        ('tfidfT', TfidfTransformer()),
        ('clf', OneVsRestClassifier(estimator=LogisticRegression(), n_jobs=1)),
    ]
)

# Fit on the full data set, then predict the held-out rows.
# NOTE(review): this fit call is the statement that reproduces the
# "inconsistent numbers of samples" ValueError discussed in the post.
model.fit(X, y)
predicted = model.predict(X_test)
编辑 23/10/24:这是另一个 MRE,使用自包含的文本数据作为输入:
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split
from sklearn.multiclass import OneVsRestClassifier
from sklearn.linear_model import LogisticRegression
# Build a small training set inline so the example is self-contained and
# does not depend on any external file.
training_data = pd.DataFrame({
    'Location': ['Structure', 'Stucture', 'Structure', 'Access systems'],
    'Component': ['Mid bay brace12', 'Mid bay brace10', 'Mid bay brace07', 'First stage ladder'],
    'Defect Description': ['Surface corrosion', 'Coating delamination with minor surface corrosion', 'Corrosion', 'Entangled rope'],
    'Failure Mode': ['Corrosion', 'Corrosion', 'Corrosion', 'Debris'],
})

# Feature data: three free-text columns.
X = training_data[['Location', 'Component', 'Defect Description']]

# Target data: the label to predict for each row.
y = training_data['Failure Mode']

# Split the data into training and test portions (fixed seed for repeatability).
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.5, random_state=42
)

# Pipeline composed of vectoriser, TF-IDF transformer and classifier.
model = Pipeline([
    ('vect', CountVectorizer()),
    ('tfidfT', TfidfTransformer()),
    ('clf', OneVsRestClassifier(LogisticRegression(), n_jobs=1)),
])

# Fit the model, then predict the held-out rows.
# NOTE(review): this fit call is kept unchanged on purpose, because it is the
# statement that reproduces the "inconsistent numbers of samples" ValueError.
# Presumably CountVectorizer iterates the DataFrame's column labels (3 here)
# instead of its rows (4 here) -- confirm against the CountVectorizer docs.
model.fit(X, y)
predicted = model.predict(X_test)
我稍后会对此进行扩展,但主要的解决方案是使用 ColumnTransformer 对每一列分别运行单独的文本转换器(我还把 CountVectorizer + TfidfTransformer 合并成了 TfidfVectorizer):
# Imports for every class this snippet uses; X and y are the feature
# DataFrame and target Series built in the example script.
from sklearn.compose import ColumnTransformer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline

# One TfidfVectorizer per text column. Each column is selected by its scalar
# name (not a one-element list) so the vectoriser receives a 1-D sequence of
# strings rather than a 2-D frame.
preproc = ColumnTransformer([
    (col + "_tfidf", TfidfVectorizer(), col)
    for col in X.columns
])

# Per-column TF-IDF features feed a single logistic-regression classifier.
model = Pipeline([
    ("preproc", preproc),
    ('clf', LogisticRegression()),
])

model.fit(X, y)