我正在使用自定义变压器构建模型(KeyError:“ [Index([('A','B','C')] , dtype='object')] 都不在 [列] 中) 。 当我运行以下代码时,由于 .fit:
出现错误---------------------------------------------------------------------------
AttributeError Traceback (most recent call last)
<ipython-input-165-289e1d466eb9> in <module>
10
11 # fit on the complete pipeline
---> 12 training = full_pipeline.fit(X, y)
13
14 # metrics
~/opt/anaconda3/lib/python3.8/site-packages/sklearn/pipeline.py in fit(self, X, y, **fit_params)
339 """
340 fit_params_steps = self._check_fit_params(**fit_params)
--> 341 Xt = self._fit(X, y, **fit_params_steps)
342 with _print_elapsed_time('Pipeline',
343 self._log_message(len(self.steps) - 1)):
~/opt/anaconda3/lib/python3.8/site-packages/sklearn/pipeline.py in _fit(self, X, y, **fit_params_steps)
301 cloned_transformer = clone(transformer)
302 # Fit or load from cache the current transformer
--> 303 X, fitted_transformer = fit_transform_one_cached(
304 cloned_transformer, X, y, None,
305 message_clsname='Pipeline',
~/opt/anaconda3/lib/python3.8/site-packages/joblib/memory.py in __call__(self, *args, **kwargs)
350
351 def __call__(self, *args, **kwargs):
--> 352 return self.func(*args, **kwargs)
353
354 def call_and_shelve(self, *args, **kwargs):
~/opt/anaconda3/lib/python3.8/site-packages/sklearn/pipeline.py in _fit_transform_one(transformer, X, y, weight, message_clsname, message, **fit_params)
752 with _print_elapsed_time(message_clsname, message):
753 if hasattr(transformer, 'fit_transform'):
--> 754 res = transformer.fit_transform(X, y, **fit_params)
755 else:
756 res = transformer.fit(X, y, **fit_params).transform(X)
~/opt/anaconda3/lib/python3.8/site-packages/sklearn/pipeline.py in fit_transform(self, X, y, **fit_params)
385 fit_params_last_step = fit_params_steps[self.steps[-1][0]]
386 if hasattr(last_step, 'fit_transform'):
--> 387 return last_step.fit_transform(Xt, y, **fit_params_last_step)
388 else:
389 return last_step.fit(Xt, y,
~/opt/anaconda3/lib/python3.8/site-packages/sklearn/pipeline.py in fit_transform(self, X, y, **fit_params)
978 sum of n_components (output dimension) over transformers.
979 """
--> 980 results = self._parallel_func(X, y, fit_params, _fit_transform_one)
981 if not results:
982 # All transformers are None
~/opt/anaconda3/lib/python3.8/site-packages/sklearn/pipeline.py in _parallel_func(self, X, y, fit_params, func)
1000 transformers = list(self._iter())
1001
-> 1002 return Parallel(n_jobs=self.n_jobs)(delayed(func)(
1003 transformer, X, y, weight,
1004 message_clsname='FeatureUnion',
~/opt/anaconda3/lib/python3.8/site-packages/joblib/parallel.py in __call__(self, iterable)
1042 self._iterating = self._original_iterator is not None
1043
-> 1044 while self.dispatch_one_batch(iterator):
1045 pass
1046
~/opt/anaconda3/lib/python3.8/site-packages/joblib/parallel.py in dispatch_one_batch(self, iterator)
857 return False
858 else:
--> 859 self._dispatch(tasks)
860 return True
861
~/opt/anaconda3/lib/python3.8/site-packages/joblib/parallel.py in _dispatch(self, batch)
775 with self._lock:
776 job_idx = len(self._jobs)
--> 777 job = self._backend.apply_async(batch, callback=cb)
778 # A job can complete so quickly than its callback is
779 # called before we get here, causing self._jobs to
~/opt/anaconda3/lib/python3.8/site-packages/joblib/_parallel_backends.py in apply_async(self, func, callback)
206 def apply_async(self, func, callback=None):
207 """Schedule a func to be run"""
--> 208 result = ImmediateResult(func)
209 if callback:
210 callback(result)
~/opt/anaconda3/lib/python3.8/site-packages/joblib/_parallel_backends.py in __init__(self, batch)
570 # Don't delay the application, to avoid keeping the input
571 # arguments in memory
--> 572 self.results = batch()
573
574 def get(self):
~/opt/anaconda3/lib/python3.8/site-packages/joblib/parallel.py in __call__(self)
260 # change the default number of processes to -1
261 with parallel_backend(self._backend, n_jobs=self._n_jobs):
--> 262 return [func(*args, **kwargs)
263 for func, args, kwargs in self.items]
264
~/opt/anaconda3/lib/python3.8/site-packages/joblib/parallel.py in <listcomp>(.0)
260 # change the default number of processes to -1
261 with parallel_backend(self._backend, n_jobs=self._n_jobs):
--> 262 return [func(*args, **kwargs)
263 for func, args, kwargs in self.items]
264
~/opt/anaconda3/lib/python3.8/site-packages/sklearn/utils/fixes.py in __call__(self, *args, **kwargs)
220 def __call__(self, *args, **kwargs):
221 with config_context(**self.config):
--> 222 return self.function(*args, **kwargs)
~/opt/anaconda3/lib/python3.8/site-packages/sklearn/pipeline.py in _fit_transform_one(transformer, X, y, weight, message_clsname, message, **fit_params)
752 with _print_elapsed_time(message_clsname, message):
753 if hasattr(transformer, 'fit_transform'):
--> 754 res = transformer.fit_transform(X, y, **fit_params)
755 else:
756 res = transformer.fit(X, y, **fit_params).transform(X)
~/opt/anaconda3/lib/python3.8/site-packages/sklearn/pipeline.py in fit_transform(self, X, y, **fit_params)
385 fit_params_last_step = fit_params_steps[self.steps[-1][0]]
386 if hasattr(last_step, 'fit_transform'):
--> 387 return last_step.fit_transform(Xt, y, **fit_params_last_step)
388 else:
389 return last_step.fit(Xt, y,
~/opt/anaconda3/lib/python3.8/site-packages/sklearn/feature_extraction/text.py in fit_transform(self, raw_documents, y)
1200 max_features = self.max_features
1201
-> 1202 vocabulary, X = self._count_vocab(raw_documents,
1203 self.fixed_vocabulary_)
1204
~/opt/anaconda3/lib/python3.8/site-packages/sklearn/feature_extraction/text.py in _count_vocab(self, raw_documents, fixed_vocab)
1112 for doc in raw_documents:
1113 feature_counter = {}
-> 1114 for feature in analyze(doc):
1115 try:
1116 feature_idx = vocabulary[feature]
~/opt/anaconda3/lib/python3.8/site-packages/sklearn/feature_extraction/text.py in _analyze(doc, analyzer, tokenizer, ngrams, preprocessor, decoder, stop_words)
102 else:
103 if preprocessor is not None:
--> 104 doc = preprocessor(doc)
105 if tokenizer is not None:
106 doc = tokenizer(doc)
~/opt/anaconda3/lib/python3.8/site-packages/sklearn/feature_extraction/text.py in _preprocess(doc, accent_function, lower)
67 """
68 if lower:
---> 69 doc = doc.lower()
70 if accent_function is not None:
71 doc = accent_function(doc)
AttributeError: 'numpy.ndarray' object has no attribute 'lower'
代码是
# MODEL
from sklearn import tree
# Decision Tree
decision_tree = tree.DecisionTreeClassifier()
# define full pipeline --> preprocessing + model
full_pipeline = Pipeline(steps=[
('preprocess_pipeline', preprocess_pipeline),
('model', decision_tree)])
# fit on the complete pipeline
training = full_pipeline.fit(X, y) # <- this step returns the error
我也尝试过 .fit_transform 但我得到了同样的错误。 我读到了这个:AttributeError:'numpy.ndarray'对象没有属性'lower'拟合逻辑模型数据但似乎我没有像该示例中那样在决策树中传递X或y,但也许我错了。 添加
# Defining the steps in the text pipeline
text_pipeline = Pipeline(steps=[
('text_transformer', TextTransformer()),
('cv', CountVectorizer(analyzer='word', ngram_range=(2, 2), lowercase=False))])
我收到这个新错误:
---------------------------------------------------------------------------
TypeError Traceback (most recent call last)
<ipython-input-159-289e1d466eb9> in <module>
10
11 # fit on the complete pipeline
---> 12 training = full_pipeline.fit(X, y)
13
14 # metrics
~/opt/anaconda3/lib/python3.8/site-packages/sklearn/pipeline.py in fit(self, X, y, **fit_params)
339 """
340 fit_params_steps = self._check_fit_params(**fit_params)
--> 341 Xt = self._fit(X, y, **fit_params_steps)
342 with _print_elapsed_time('Pipeline',
343 self._log_message(len(self.steps) - 1)):
~/opt/anaconda3/lib/python3.8/site-packages/sklearn/pipeline.py in _fit(self, X, y, **fit_params_steps)
301 cloned_transformer = clone(transformer)
302 # Fit or load from cache the current transformer
--> 303 X, fitted_transformer = fit_transform_one_cached(
304 cloned_transformer, X, y, None,
305 message_clsname='Pipeline',
~/opt/anaconda3/lib/python3.8/site-packages/joblib/memory.py in __call__(self, *args, **kwargs)
350
351 def __call__(self, *args, **kwargs):
--> 352 return self.func(*args, **kwargs)
353
354 def call_and_shelve(self, *args, **kwargs):
~/opt/anaconda3/lib/python3.8/site-packages/sklearn/pipeline.py in _fit_transform_one(transformer, X, y, weight, message_clsname, message, **fit_params)
752 with _print_elapsed_time(message_clsname, message):
753 if hasattr(transformer, 'fit_transform'):
--> 754 res = transformer.fit_transform(X, y, **fit_params)
755 else:
756 res = transformer.fit(X, y, **fit_params).transform(X)
~/opt/anaconda3/lib/python3.8/site-packages/sklearn/pipeline.py in fit_transform(self, X, y, **fit_params)
385 fit_params_last_step = fit_params_steps[self.steps[-1][0]]
386 if hasattr(last_step, 'fit_transform'):
--> 387 return last_step.fit_transform(Xt, y, **fit_params_last_step)
388 else:
389 return last_step.fit(Xt, y,
~/opt/anaconda3/lib/python3.8/site-packages/sklearn/pipeline.py in fit_transform(self, X, y, **fit_params)
978 sum of n_components (output dimension) over transformers.
979 """
--> 980 results = self._parallel_func(X, y, fit_params, _fit_transform_one)
981 if not results:
982 # All transformers are None
~/opt/anaconda3/lib/python3.8/site-packages/sklearn/pipeline.py in _parallel_func(self, X, y, fit_params, func)
1000 transformers = list(self._iter())
1001
-> 1002 return Parallel(n_jobs=self.n_jobs)(delayed(func)(
1003 transformer, X, y, weight,
1004 message_clsname='FeatureUnion',
~/opt/anaconda3/lib/python3.8/site-packages/joblib/parallel.py in __call__(self, iterable)
1042 self._iterating = self._original_iterator is not None
1043
-> 1044 while self.dispatch_one_batch(iterator):
1045 pass
1046
~/opt/anaconda3/lib/python3.8/site-packages/joblib/parallel.py in dispatch_one_batch(self, iterator)
857 return False
858 else:
--> 859 self._dispatch(tasks)
860 return True
861
~/opt/anaconda3/lib/python3.8/site-packages/joblib/parallel.py in _dispatch(self, batch)
775 with self._lock:
776 job_idx = len(self._jobs)
--> 777 job = self._backend.apply_async(batch, callback=cb)
778 # A job can complete so quickly than its callback is
779 # called before we get here, causing self._jobs to
~/opt/anaconda3/lib/python3.8/site-packages/joblib/_parallel_backends.py in apply_async(self, func, callback)
206 def apply_async(self, func, callback=None):
207 """Schedule a func to be run"""
--> 208 result = ImmediateResult(func)
209 if callback:
210 callback(result)
~/opt/anaconda3/lib/python3.8/site-packages/joblib/_parallel_backends.py in __init__(self, batch)
570 # Don't delay the application, to avoid keeping the input
571 # arguments in memory
--> 572 self.results = batch()
573
574 def get(self):
~/opt/anaconda3/lib/python3.8/site-packages/joblib/parallel.py in __call__(self)
260 # change the default number of processes to -1
261 with parallel_backend(self._backend, n_jobs=self._n_jobs):
--> 262 return [func(*args, **kwargs)
263 for func, args, kwargs in self.items]
264
~/opt/anaconda3/lib/python3.8/site-packages/joblib/parallel.py in <listcomp>(.0)
260 # change the default number of processes to -1
261 with parallel_backend(self._backend, n_jobs=self._n_jobs):
--> 262 return [func(*args, **kwargs)
263 for func, args, kwargs in self.items]
264
~/opt/anaconda3/lib/python3.8/site-packages/sklearn/utils/fixes.py in __call__(self, *args, **kwargs)
220 def __call__(self, *args, **kwargs):
221 with config_context(**self.config):
--> 222 return self.function(*args, **kwargs)
~/opt/anaconda3/lib/python3.8/site-packages/sklearn/pipeline.py in _fit_transform_one(transformer, X, y, weight, message_clsname, message, **fit_params)
752 with _print_elapsed_time(message_clsname, message):
753 if hasattr(transformer, 'fit_transform'):
--> 754 res = transformer.fit_transform(X, y, **fit_params)
755 else:
756 res = transformer.fit(X, y, **fit_params).transform(X)
~/opt/anaconda3/lib/python3.8/site-packages/sklearn/pipeline.py in fit_transform(self, X, y, **fit_params)
385 fit_params_last_step = fit_params_steps[self.steps[-1][0]]
386 if hasattr(last_step, 'fit_transform'):
--> 387 return last_step.fit_transform(Xt, y, **fit_params_last_step)
388 else:
389 return last_step.fit(Xt, y,
~/opt/anaconda3/lib/python3.8/site-packages/sklearn/feature_extraction/text.py in fit_transform(self, raw_documents, y)
1200 max_features = self.max_features
1201
-> 1202 vocabulary, X = self._count_vocab(raw_documents,
1203 self.fixed_vocabulary_)
1204
~/opt/anaconda3/lib/python3.8/site-packages/sklearn/feature_extraction/text.py in _count_vocab(self, raw_documents, fixed_vocab)
1112 for doc in raw_documents:
1113 feature_counter = {}
-> 1114 for feature in analyze(doc):
1115 try:
1116 feature_idx = vocabulary[feature]
~/opt/anaconda3/lib/python3.8/site-packages/sklearn/feature_extraction/text.py in _analyze(doc, analyzer, tokenizer, ngrams, preprocessor, decoder, stop_words)
104 doc = preprocessor(doc)
105 if tokenizer is not None:
--> 106 doc = tokenizer(doc)
107 if ngrams is not None:
108 if stop_words is not None:
TypeError: cannot use a string pattern on a bytes-like object
如果我删除text_pipeline,则不会发生错误,因此似乎由于使用countVectorizer的方式而出现了问题。 文本示例是
an example
example number 1
this is another small example
我还有其他数字和分类列。
您遇到过类似的问题吗?如果是的话,你是怎么处理的?
sklearn
文本转换器中的一个常见错误涉及数据的形状:与大多数其他
sklearn
预处理器不同,文本转换器通常期望一维输入,而 python 的鸭子类型会导致数组和字符串出现奇怪的错误可迭代。你的
TextTransformer.transform
返回
X[['Tweet']]
,它是二维的,会导致后续的
CountVectorizer
出现问题。 (使用
.values
转换为 numpy 数组不会改变维数问题,但也没有令人信服的理由进行该转换。)返回
X['Tweet']
应该可以解决该问题。