我修改了 GitHub 上的一些 Python 代码,用下面的代码对一部分消费者投诉数据做逻辑回归。文本向量化和分类部分可以正常工作。但我想知道,能否把非文本的二元数值指标(例如 timely_response 和 consumer_disputed. 这两列)与文本向量一起作为特征?当我这样做时,Python 返回一个错误,提示 "input variables with inconsistent numbers of samples"。
# %% load packages and data
import logging
import pandas as pd
import numpy as np
from numpy import random
import gensim
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.metrics import accuracy_score, confusion_matrix
import matplotlib.pyplot as plt
from nltk.corpus import stopwords
import re
import nltk
from nltk.corpus import stopwords
from nltk import word_tokenize
STOPWORDS = set(stopwords.words('english'))
from bs4 import BeautifulSoup
from IPython.core.interactiveshell import InteractiveShell
# Download the example complaints CSV into a DataFrame.
df = pd.read_csv(
    'https://www.dropbox.com/s/obbs000w7knjmys/example_complaints.csv?dl=1'
)
# Keep only the rows that actually have a complaint narrative.
df = df.loc[df['consumer_complaint_narrative'].notnull()]
# Per-class row counts of the target column (the result is not stored).
df['product'].value_counts()
# %% cleaning text
# Separator characters; clean_text replaces each match with a space.
# Raw string literal: '\[', '\]' and '\|' are regex escapes, not Python string
# escapes, and a non-raw literal triggers an "invalid escape sequence" warning.
REPLACE_BY_SPACE_RE = re.compile(r'[/(){}\[\]\|@,;]')
# Anything that is not a digit, a lowercase ASCII letter, a space, '#', '+'
# or '_'; clean_text deletes each match.
BAD_SYMBOLS_RE = re.compile(r'[^0-9a-z #+_]')
STOPWORDS = set(stopwords.words('english'))
def clean_text(text):
    """Normalize one complaint narrative before vectorization.

    Steps, in order: strip HTML markup, lowercase, replace every character
    matched by ``REPLACE_BY_SPACE_RE`` with a space, delete every character
    matched by ``BAD_SYMBOLS_RE``, and drop the words found in ``STOPWORDS``.

    Args:
        text: a string.

    Returns:
        The modified string; the remaining words are joined by single spaces.
    """
    text = BeautifulSoup(text, "lxml").text  # HTML decoding
    text = text.lower()  # lowercase text
    text = REPLACE_BY_SPACE_RE.sub(' ', text)  # separator symbols -> space
    text = BAD_SYMBOLS_RE.sub('', text)  # delete disallowed symbols
    # delete stopwords from text
    text = ' '.join(word for word in text.split() if word not in STOPWORDS)
    return text


# Clean every narrative in place before it is used as model input.
df['consumer_complaint_narrative'] = df['consumer_complaint_narrative'].apply(clean_text)
# %% include only text as features
# Features: the cleaned narrative text only. Target: the product category.
X = df['consumer_complaint_narrative']
y = df['product']
# Hold out 25% of the rows for testing; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.25,
    random_state=42,
)
# %% fit and test with logistic regression
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.metrics import classification_report
# Text-only baseline: token counts -> TF-IDF weights -> logistic regression.
logreg = Pipeline([
    ('vect', CountVectorizer()),
    ('tfidf', TfidfTransformer()),
    ('clf', LogisticRegression(n_jobs=1, C=1e5)),
])
logreg.fit(X_train, y_train)
y_pred = logreg.predict(X_test)
# Arguments given in (y_true, y_pred) order, matching classification_report.
print('accuracy %s' % accuracy_score(y_test, y_pred))
# `my_tags` is never defined in this script, so passing it as target_names
# raised NameError. Without target_names the report rows are labelled with the
# class labels found in y_test / y_pred.
print(classification_report(y_test, y_pred))
# %% including binary numerical indicators as additional features
# Select the narrative text plus the two binary indicator columns. new_X is a
# 3-column DataFrame here, whereas the text-only variant passed a single column.
new_X = df[['consumer_complaint_narrative', 'timely_response', 'consumer_disputed.']]
y = df['product']
# Same test_size and random_state as the text-only split.
X_train, X_test, y_train, y_test = train_test_split(new_X, y, test_size=0.25, random_state = 42)
# %% fit and test again
# NOTE: this cell is the reproduction of the reported error and is kept as-is.
# The same text pipeline is fitted, but X_train is now a 3-column DataFrame.
# fit() raises "Found input variables with inconsistent numbers of samples:
# [3, 529]" -- presumably CountVectorizer iterates over the DataFrame and sees
# its 3 column names as 3 documents, while y_train has 529 rows (TODO confirm).
logreg = Pipeline([('vect', CountVectorizer()),
('tfidf', TfidfTransformer()),
('clf', LogisticRegression(n_jobs=1, C=1e5)),
])
logreg.fit(X_train, y_train)
# The lines below are not reached because fit() raises above.
y_pred = logreg.predict(X_test)
print('accuracy %s' % accuracy_score(y_pred, y_test))
print(classification_report(y_test, y_pred,target_names=my_tags))
返回以下错误消息
---------------------------------------------------------------------------
ValueError Traceback (most recent call last)
<ipython-input-28-455c8fd83ba4> in <module>
8 ('clf', LogisticRegression(n_jobs=1, C=1e5)),
9 ])
---> 10 logreg.fit(X_train, y_train)
11
12 y_pred = logreg.predict(X_test)
~\Anaconda3\lib\site-packages\sklearn\pipeline.py in fit(self, X, y, **fit_params)
265 Xt, fit_params = self._fit(X, y, **fit_params)
266 if self._final_estimator is not None:
--> 267 self._final_estimator.fit(Xt, y, **fit_params)
268 return self
269
~\Anaconda3\lib\site-packages\sklearn\linear_model\logistic.py in fit(self, X, y, sample_weight)
1286
1287 X, y = check_X_y(X, y, accept_sparse='csr', dtype=_dtype, order="C",
-> 1288 accept_large_sparse=solver != 'liblinear')
1289 check_classification_targets(y)
1290 self.classes_ = np.unique(y)
~\Anaconda3\lib\site-packages\sklearn\utils\validation.py in check_X_y(X, y, accept_sparse, accept_large_sparse, dtype, order, copy, force_all_finite, ensure_2d, allow_nd, multi_output, ensure_min_samples, ensure_min_features, y_numeric, warn_on_dtype, estimator)
764 y = y.astype(np.float64)
765
--> 766 check_consistent_length(X, y)
767
768 return X, y
~\Anaconda3\lib\site-packages\sklearn\utils\validation.py in check_consistent_length(*arrays)
233 if len(uniques) > 1:
234 raise ValueError("Found input variables with inconsistent numbers of"
--> 235 " samples: %r" % [int(l) for l in lengths])
236
237
ValueError: Found input variables with inconsistent numbers of samples: [3, 529]
如果有人可以对此有所解释,将非常感谢。
这与 Pipeline 由多个步骤组成有关:传给 fit() 的整个 X 会先经过前面的文本处理步骤。
CountVectorizer() 和 TfidfTransformer() 只适用于文本数据,而您要添加的其他字段不是文本数据。(也就是下面这段代码。)
# Excerpt of the pipeline under discussion: the two text steps run before the
# classifier. The closing "])" was missing, leaving the brackets unbalanced.
logreg = Pipeline([('vect', CountVectorizer()),
                   ('tfidf', TfidfTransformer()),
                   ('clf', LogisticRegression(n_jobs=1, C=1e5)),
                   ])
还有其他一些帖子讨论了如何把额外的 pandas 列添加到文本特征中,但在我看来有些麻烦。(参考 Adding pandas columns to a sparse matrix)。
整合附加数据的另一种办法是:先只用文本数据训练逻辑回归模型,再把该模型的输出连同附加特征一起输入到另一个模型中。
您可以按下面的方式来做。
# First stage uses the narrative text only; the target is the product category.
X = df['consumer_complaint_narrative']
y = df['product']
# Hold out 25% of the rows for testing; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.25,
    random_state=42,
)
# %% fit and test with logistic regression
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.metrics import classification_report
# Stage 1: text-only model (token counts -> TF-IDF -> logistic regression).
logreg = Pipeline([
    ('vect', CountVectorizer()),
    ('tfidf', TfidfTransformer()),
    ('clf', LogisticRegression(n_jobs=1, C=1e5)),
])
logreg.fit(X_train, y_train)
y_pred = logreg.predict(X_test)
# Get the output of this model for the new model: class probabilities for
# every row of X (training and test rows alike).
add_features = logreg.predict_proba(X)
# Arguments given in (y_true, y_pred) order, matching classification_report.
print('accuracy %s' % accuracy_score(y_test, y_pred))
# `my_tags` is never defined in this script, so passing it as target_names
# raised NameError. Without target_names the report rows are labelled with the
# class labels found in y_test / y_pred.
print(classification_report(y_test, y_pred))
# %% including binary numerical indicators as additional features
# add_features is the plain NumPy array returned by predict_proba, and
# pd.concat only accepts pandas objects. Wrap it in a DataFrame first:
#   * reuse df's index, which is non-contiguous after the null-narrative rows
#     were filtered out, so the rows line up with the indicator columns;
#   * use string column names so every feature name in new_X has one type.
proba_columns = ['proba_%d' % i for i in range(add_features.shape[1])]
text_proba = pd.DataFrame(add_features, index=df.index, columns=proba_columns)
new_X = pd.concat([text_proba, df[['timely_response', 'consumer_disputed.']]], axis=1)
y = df['product']
# Same test_size and random_state as the text-only split.
X_train, X_test, y_train, y_test = train_test_split(new_X, y, test_size=0.25, random_state=42)
# %% fit and test again
# Stage 2: every column of new_X is already numeric, so no text pipeline is
# needed; a single logistic regression is fitted directly.
logreg = LogisticRegression(n_jobs=1, C=1e5)
logreg.fit(X_train, y_train)
y_pred = logreg.predict(X_test)
# Arguments given in (y_true, y_pred) order, matching classification_report.
print('accuracy %s' % accuracy_score(y_test, y_pred))
# `my_tags` is never defined in this script, so passing it as target_names
# raised NameError. Without target_names the report rows are labelled with the
# class labels found in y_test / y_pred.
print(classification_report(y_test, y_pred))
我考虑过这里是否可能存在某种过拟合问题,但我认为没有。由于您使用了相同的 random_state,两次拆分应该与之前完全相同,所以应该没有问题。