AttributeError and TypeError when using CustomTransformers


I am building a model with custom transformers (related question: KeyError: "None of [Index([('A', 'B', 'C')], dtype='object')] are in the [columns]"). When I run the code below, the call to .fit raises the following error:
---------------------------------------------------------------------------
AttributeError                            Traceback (most recent call last)
<ipython-input-165-289e1d466eb9> in <module>
     10 
     11 # fit on the complete pipeline
---> 12 training = full_pipeline.fit(X, y)
     13 
     14 # metrics

~/opt/anaconda3/lib/python3.8/site-packages/sklearn/pipeline.py in fit(self, X, y, **fit_params)
    339         """
    340         fit_params_steps = self._check_fit_params(**fit_params)
--> 341         Xt = self._fit(X, y, **fit_params_steps)
    342         with _print_elapsed_time('Pipeline',
    343                                  self._log_message(len(self.steps) - 1)):

~/opt/anaconda3/lib/python3.8/site-packages/sklearn/pipeline.py in _fit(self, X, y, **fit_params_steps)
    301                 cloned_transformer = clone(transformer)
    302             # Fit or load from cache the current transformer
--> 303             X, fitted_transformer = fit_transform_one_cached(
    304                 cloned_transformer, X, y, None,
    305                 message_clsname='Pipeline',

~/opt/anaconda3/lib/python3.8/site-packages/joblib/memory.py in __call__(self, *args, **kwargs)
    350 
    351     def __call__(self, *args, **kwargs):
--> 352         return self.func(*args, **kwargs)
    353 
    354     def call_and_shelve(self, *args, **kwargs):

~/opt/anaconda3/lib/python3.8/site-packages/sklearn/pipeline.py in _fit_transform_one(transformer, X, y, weight, message_clsname, message, **fit_params)
    752     with _print_elapsed_time(message_clsname, message):
    753         if hasattr(transformer, 'fit_transform'):
--> 754             res = transformer.fit_transform(X, y, **fit_params)
    755         else:
    756             res = transformer.fit(X, y, **fit_params).transform(X)

~/opt/anaconda3/lib/python3.8/site-packages/sklearn/pipeline.py in fit_transform(self, X, y, **fit_params)
    385             fit_params_last_step = fit_params_steps[self.steps[-1][0]]
    386             if hasattr(last_step, 'fit_transform'):
--> 387                 return last_step.fit_transform(Xt, y, **fit_params_last_step)
    388             else:
    389                 return last_step.fit(Xt, y,

~/opt/anaconda3/lib/python3.8/site-packages/sklearn/pipeline.py in fit_transform(self, X, y, **fit_params)
    978             sum of n_components (output dimension) over transformers.
    979         """
--> 980         results = self._parallel_func(X, y, fit_params, _fit_transform_one)
    981         if not results:
    982             # All transformers are None

~/opt/anaconda3/lib/python3.8/site-packages/sklearn/pipeline.py in _parallel_func(self, X, y, fit_params, func)
   1000         transformers = list(self._iter())
   1001 
-> 1002         return Parallel(n_jobs=self.n_jobs)(delayed(func)(
   1003             transformer, X, y, weight,
   1004             message_clsname='FeatureUnion',

~/opt/anaconda3/lib/python3.8/site-packages/joblib/parallel.py in __call__(self, iterable)
   1042                 self._iterating = self._original_iterator is not None
   1043 
-> 1044             while self.dispatch_one_batch(iterator):
   1045                 pass
   1046 

~/opt/anaconda3/lib/python3.8/site-packages/joblib/parallel.py in dispatch_one_batch(self, iterator)
    857                 return False
    858             else:
--> 859                 self._dispatch(tasks)
    860                 return True
    861 

~/opt/anaconda3/lib/python3.8/site-packages/joblib/parallel.py in _dispatch(self, batch)
    775         with self._lock:
    776             job_idx = len(self._jobs)
--> 777             job = self._backend.apply_async(batch, callback=cb)
    778             # A job can complete so quickly than its callback is
    779             # called before we get here, causing self._jobs to

~/opt/anaconda3/lib/python3.8/site-packages/joblib/_parallel_backends.py in apply_async(self, func, callback)
    206     def apply_async(self, func, callback=None):
    207         """Schedule a func to be run"""
--> 208         result = ImmediateResult(func)
    209         if callback:
    210             callback(result)

~/opt/anaconda3/lib/python3.8/site-packages/joblib/_parallel_backends.py in __init__(self, batch)
    570         # Don't delay the application, to avoid keeping the input
    571         # arguments in memory
--> 572         self.results = batch()
    573 
    574     def get(self):

~/opt/anaconda3/lib/python3.8/site-packages/joblib/parallel.py in __call__(self)
    260         # change the default number of processes to -1
    261         with parallel_backend(self._backend, n_jobs=self._n_jobs):
--> 262             return [func(*args, **kwargs)
    263                     for func, args, kwargs in self.items]
    264 

~/opt/anaconda3/lib/python3.8/site-packages/joblib/parallel.py in <listcomp>(.0)
    260         # change the default number of processes to -1
    261         with parallel_backend(self._backend, n_jobs=self._n_jobs):
--> 262             return [func(*args, **kwargs)
    263                     for func, args, kwargs in self.items]
    264 

~/opt/anaconda3/lib/python3.8/site-packages/sklearn/utils/fixes.py in __call__(self, *args, **kwargs)
    220     def __call__(self, *args, **kwargs):
    221         with config_context(**self.config):
--> 222             return self.function(*args, **kwargs)

~/opt/anaconda3/lib/python3.8/site-packages/sklearn/pipeline.py in _fit_transform_one(transformer, X, y, weight, message_clsname, message, **fit_params)
    752     with _print_elapsed_time(message_clsname, message):
    753         if hasattr(transformer, 'fit_transform'):
--> 754             res = transformer.fit_transform(X, y, **fit_params)
    755         else:
    756             res = transformer.fit(X, y, **fit_params).transform(X)

~/opt/anaconda3/lib/python3.8/site-packages/sklearn/pipeline.py in fit_transform(self, X, y, **fit_params)
    385             fit_params_last_step = fit_params_steps[self.steps[-1][0]]
    386             if hasattr(last_step, 'fit_transform'):
--> 387                 return last_step.fit_transform(Xt, y, **fit_params_last_step)
    388             else:
    389                 return last_step.fit(Xt, y,

~/opt/anaconda3/lib/python3.8/site-packages/sklearn/feature_extraction/text.py in fit_transform(self, raw_documents, y)
   1200         max_features = self.max_features
   1201 
-> 1202         vocabulary, X = self._count_vocab(raw_documents,
   1203                                           self.fixed_vocabulary_)
   1204 

~/opt/anaconda3/lib/python3.8/site-packages/sklearn/feature_extraction/text.py in _count_vocab(self, raw_documents, fixed_vocab)
   1112         for doc in raw_documents:
   1113             feature_counter = {}
-> 1114             for feature in analyze(doc):
   1115                 try:
   1116                     feature_idx = vocabulary[feature]

~/opt/anaconda3/lib/python3.8/site-packages/sklearn/feature_extraction/text.py in _analyze(doc, analyzer, tokenizer, ngrams, preprocessor, decoder, stop_words)
    102     else:
    103         if preprocessor is not None:
--> 104             doc = preprocessor(doc)
    105         if tokenizer is not None:
    106             doc = tokenizer(doc)

~/opt/anaconda3/lib/python3.8/site-packages/sklearn/feature_extraction/text.py in _preprocess(doc, accent_function, lower)
     67     """
     68     if lower:
---> 69         doc = doc.lower()
     70     if accent_function is not None:
     71         doc = accent_function(doc)

AttributeError: 'numpy.ndarray' object has no attribute 'lower'

The code is:

# MODEL
from sklearn import tree
from sklearn.pipeline import Pipeline

# Decision Tree
decision_tree = tree.DecisionTreeClassifier()
# define full pipeline --> preprocessing + model
full_pipeline = Pipeline(steps=[
    ('preprocess_pipeline', preprocess_pipeline),
    ('model', decision_tree)])

# fit on the complete pipeline
training = full_pipeline.fit(X, y) # <- this step returns the error

I also tried .fit_transform but got the same error. I read AttributeError: 'numpy.ndarray' object has no attribute 'lower' fitting logistic model data, but unlike that example I don't think I'm passing X or y directly into the decision tree, though maybe I'm wrong. After adding

# Defining the steps in the text pipeline
text_pipeline = Pipeline(steps=[
    ('text_transformer', TextTransformer()),
    ('cv', CountVectorizer(analyzer='word', ngram_range=(2, 2), lowercase=False))])

I get this new error:

---------------------------------------------------------------------------
TypeError                                 Traceback (most recent call last)
<ipython-input-159-289e1d466eb9> in <module>
     10 
     11 # fit on the complete pipeline
---> 12 training = full_pipeline.fit(X, y)
     13 
     14 # metrics

~/opt/anaconda3/lib/python3.8/site-packages/sklearn/pipeline.py in fit(self, X, y, **fit_params)
    339         """
    340         fit_params_steps = self._check_fit_params(**fit_params)
--> 341         Xt = self._fit(X, y, **fit_params_steps)
    342         with _print_elapsed_time('Pipeline',
    343                                  self._log_message(len(self.steps) - 1)):

~/opt/anaconda3/lib/python3.8/site-packages/sklearn/pipeline.py in _fit(self, X, y, **fit_params_steps)
    301                 cloned_transformer = clone(transformer)
    302             # Fit or load from cache the current transformer
--> 303             X, fitted_transformer = fit_transform_one_cached(
    304                 cloned_transformer, X, y, None,
    305                 message_clsname='Pipeline',

~/opt/anaconda3/lib/python3.8/site-packages/joblib/memory.py in __call__(self, *args, **kwargs)
    350 
    351     def __call__(self, *args, **kwargs):
--> 352         return self.func(*args, **kwargs)
    353 
    354     def call_and_shelve(self, *args, **kwargs):

~/opt/anaconda3/lib/python3.8/site-packages/sklearn/pipeline.py in _fit_transform_one(transformer, X, y, weight, message_clsname, message, **fit_params)
    752     with _print_elapsed_time(message_clsname, message):
    753         if hasattr(transformer, 'fit_transform'):
--> 754             res = transformer.fit_transform(X, y, **fit_params)
    755         else:
    756             res = transformer.fit(X, y, **fit_params).transform(X)

~/opt/anaconda3/lib/python3.8/site-packages/sklearn/pipeline.py in fit_transform(self, X, y, **fit_params)
    385             fit_params_last_step = fit_params_steps[self.steps[-1][0]]
    386             if hasattr(last_step, 'fit_transform'):
--> 387                 return last_step.fit_transform(Xt, y, **fit_params_last_step)
    388             else:
    389                 return last_step.fit(Xt, y,

~/opt/anaconda3/lib/python3.8/site-packages/sklearn/pipeline.py in fit_transform(self, X, y, **fit_params)
    978             sum of n_components (output dimension) over transformers.
    979         """
--> 980         results = self._parallel_func(X, y, fit_params, _fit_transform_one)
    981         if not results:
    982             # All transformers are None

~/opt/anaconda3/lib/python3.8/site-packages/sklearn/pipeline.py in _parallel_func(self, X, y, fit_params, func)
   1000         transformers = list(self._iter())
   1001 
-> 1002         return Parallel(n_jobs=self.n_jobs)(delayed(func)(
   1003             transformer, X, y, weight,
   1004             message_clsname='FeatureUnion',

~/opt/anaconda3/lib/python3.8/site-packages/joblib/parallel.py in __call__(self, iterable)
   1042                 self._iterating = self._original_iterator is not None
   1043 
-> 1044             while self.dispatch_one_batch(iterator):
   1045                 pass
   1046 

~/opt/anaconda3/lib/python3.8/site-packages/joblib/parallel.py in dispatch_one_batch(self, iterator)
    857                 return False
    858             else:
--> 859                 self._dispatch(tasks)
    860                 return True
    861 

~/opt/anaconda3/lib/python3.8/site-packages/joblib/parallel.py in _dispatch(self, batch)
    775         with self._lock:
    776             job_idx = len(self._jobs)
--> 777             job = self._backend.apply_async(batch, callback=cb)
    778             # A job can complete so quickly than its callback is
    779             # called before we get here, causing self._jobs to

~/opt/anaconda3/lib/python3.8/site-packages/joblib/_parallel_backends.py in apply_async(self, func, callback)
    206     def apply_async(self, func, callback=None):
    207         """Schedule a func to be run"""
--> 208         result = ImmediateResult(func)
    209         if callback:
    210             callback(result)

~/opt/anaconda3/lib/python3.8/site-packages/joblib/_parallel_backends.py in __init__(self, batch)
    570         # Don't delay the application, to avoid keeping the input
    571         # arguments in memory
--> 572         self.results = batch()
    573 
    574     def get(self):

~/opt/anaconda3/lib/python3.8/site-packages/joblib/parallel.py in __call__(self)
    260         # change the default number of processes to -1
    261         with parallel_backend(self._backend, n_jobs=self._n_jobs):
--> 262             return [func(*args, **kwargs)
    263                     for func, args, kwargs in self.items]
    264 

~/opt/anaconda3/lib/python3.8/site-packages/joblib/parallel.py in <listcomp>(.0)
    260         # change the default number of processes to -1
    261         with parallel_backend(self._backend, n_jobs=self._n_jobs):
--> 262             return [func(*args, **kwargs)
    263                     for func, args, kwargs in self.items]
    264 

~/opt/anaconda3/lib/python3.8/site-packages/sklearn/utils/fixes.py in __call__(self, *args, **kwargs)
    220     def __call__(self, *args, **kwargs):
    221         with config_context(**self.config):
--> 222             return self.function(*args, **kwargs)

~/opt/anaconda3/lib/python3.8/site-packages/sklearn/pipeline.py in _fit_transform_one(transformer, X, y, weight, message_clsname, message, **fit_params)
    752     with _print_elapsed_time(message_clsname, message):
    753         if hasattr(transformer, 'fit_transform'):
--> 754             res = transformer.fit_transform(X, y, **fit_params)
    755         else:
    756             res = transformer.fit(X, y, **fit_params).transform(X)

~/opt/anaconda3/lib/python3.8/site-packages/sklearn/pipeline.py in fit_transform(self, X, y, **fit_params)
    385             fit_params_last_step = fit_params_steps[self.steps[-1][0]]
    386             if hasattr(last_step, 'fit_transform'):
--> 387                 return last_step.fit_transform(Xt, y, **fit_params_last_step)
    388             else:
    389                 return last_step.fit(Xt, y,

~/opt/anaconda3/lib/python3.8/site-packages/sklearn/feature_extraction/text.py in fit_transform(self, raw_documents, y)
   1200         max_features = self.max_features
   1201 
-> 1202         vocabulary, X = self._count_vocab(raw_documents,
   1203                                           self.fixed_vocabulary_)
   1204 

~/opt/anaconda3/lib/python3.8/site-packages/sklearn/feature_extraction/text.py in _count_vocab(self, raw_documents, fixed_vocab)
   1112         for doc in raw_documents:
   1113             feature_counter = {}
-> 1114             for feature in analyze(doc):
   1115                 try:
   1116                     feature_idx = vocabulary[feature]

~/opt/anaconda3/lib/python3.8/site-packages/sklearn/feature_extraction/text.py in _analyze(doc, analyzer, tokenizer, ngrams, preprocessor, decoder, stop_words)
    104             doc = preprocessor(doc)
    105         if tokenizer is not None:
--> 106             doc = tokenizer(doc)
    107         if ngrams is not None:
    108             if stop_words is not None:

TypeError: cannot use a string pattern on a bytes-like object

If I remove text_pipeline, the error does not occur, so the problem seems to come from the way CountVectorizer is used. The text looks like this:

an example
example number 1
this is another small example

I also have other numeric and categorical columns.

Have you run into a similar problem? If so, how did you deal with it?

python scikit-learn scikit-learn-pipeline
1 Answer

A common error with sklearn text transformers involves the shape of the data: unlike most other sklearn preprocessors, text transformers generally expect 1-D input, and Python's duck typing leads to strange errors because arrays and strings are both iterable.
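To see the shape issue in isolation, here is a minimal sketch that calls CountVectorizer directly (the documents are the ones from the question; the 'Tweet' column name is taken from the transformer discussed below):

import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer

df = pd.DataFrame({'Tweet': ['an example',
                             'example number 1',
                             'this is another small example']})

cv = CountVectorizer(analyzer='word', ngram_range=(2, 2))

# 1-D input -- a Series (or list) of strings -- is what CountVectorizer expects
cv.fit_transform(df['Tweet'])  # works

# 2-D input -- e.g. a one-column selection converted to a numpy array --
# makes CountVectorizer iterate over rows, so the preprocessor receives an
# ndarray instead of a string and .lower() fails
cv.fit_transform(df[['Tweet']].values)
# AttributeError: 'numpy.ndarray' object has no attribute 'lower'
# (with lowercase=False the row gets past the preprocessor and fails in the
# regex tokenizer instead, which is where the TypeError in the second
# traceback comes from)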

Your TextTransformer.transform returns X[['Tweet']], which is 2-D and causes problems for the subsequent CountVectorizer. (Converting to a numpy array with .values does not change the dimensionality problem, and there is no compelling reason to make that conversion anyway.) Returning X['Tweet'] should fix the problem.
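The question does not show TextTransformer itself, so the snippet below is only a minimal sketch of what the fixed transformer could look like; the 'Tweet' column name is taken from your transformer, the rest is assumed:

from sklearn.base import BaseEstimator, TransformerMixin

class TextTransformer(BaseEstimator, TransformerMixin):
    """Select the text column as a 1-D Series so that the downstream
    CountVectorizer receives one string per document."""

    def fit(self, X, y=None):
        return self

    def transform(self, X):
        # X['Tweet'] is 1-D (a Series of strings);
        # X[['Tweet']] would be 2-D and reproduce the errors above
        return X['Tweet']

With that change, each document reaching CountVectorizer is a plain string, so both the default .lower() preprocessing and the regex tokenizer behave as expected.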
