我正在尝试从 CSV 文本数据文件中提取特征,我有两列“Label”和“text_stemmed”。前几天该项目运行良好并且显示输出。但现在出现了一个错误,我试图找到解决方案,但无法做到这一点。我是 python 初学者,请帮忙。
我的代码:
import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.feature_extraction.text import TfidfVectorizer
import random
df= pd.read_csv('updated1.csv', encoding='UTF-8')
df.head()
df.loc[df["Label"]=='Acquittal',"Label",]=0
df.loc[df["Label"]=='Convictal',"Label",]=1
df_x=df["text_stemmed"]
df_y=df["Label"]
cv = TfidfVectorizer(min_df=1,stop_words='english')
x_train, x_test, y_train, y_test = train_test_split(df_x, df_y, test_size=0.2, random_state=4)
cv = TfidfVectorizer(min_df=1,stop_words='english')
x_traincv = cv.fit_transform(["Hi How are you How are you doing","Hi what's up","Wow that's awesome"])
cv1 = TfidfVectorizer(min_df=1, ngram_range=(1,1), stop_words='english')
x_traincv=cv1.fit_transform(x_train)
cv1 = TfidfVectorizer(min_df=1,stop_words='english', ngram_range = ('1,1'))
a=x_traincv.toarray()
a
cv1.inverse_transform(a)
错误:
NotFittedError Traceback (most recent call last)
~\AppData\Local\Temp/ipykernel_5228/2571295384.py in <module>
----> 1 cv1.inverse_transform(a)
C:\ProgramData\Anaconda3\lib\site-packages\sklearn\feature_extraction\text.py in inverse_transform(self, X)
1270 List of arrays of terms.
1271 """
-> 1272 self._check_vocabulary()
1273 # We need CSR format for fast row manipulations.
1274 X = check_array(X, accept_sparse='csr')
C:\ProgramData\Anaconda3\lib\site-packages\sklearn\feature_extraction\text.py in _check_vocabulary(self)
470 self._validate_vocabulary()
471 if not self.fixed_vocabulary_:
--> 472 raise NotFittedError("Vocabulary not fitted or provided")
473
474 if len(self.vocabulary_) == 0:
NotFittedError: Vocabulary not fitted or provided
您正在此处重新创建 TfidfVectorizer:
cv1 = TfidfVectorizer(min_df=1, ngram_range=(1,1), stop_words='english')
x_traincv=cv1.fit_transform(x_train)
cv1 = TfidfVectorizer(min_df=1,stop_words='english', ngram_range = ('1,1'))
它的第二个版本是写给
cv1
的,而那个版本从未安装过,所以它没有词汇。
您想要训练模型的数据是什么?
`I have saved my random classifier in pickle and the count vectorizer in vectorizer.py as shown below but I am getting Vocabulary not fitted or provided. Kindly help. Please let me know if any errors as well.
from vectorizer import vect
clf = pickle.load(open(os.path.join('pkl_objects', 'r_classifier.pkl'), 'rb'))
example = ["HELLO is racist"]
X = vect.transform(example)
prediction = clf.predict(X)
probability = clf.predict_proba(X)
print('Prediction: %s\nProbability: %.2f%%' % (label[prediction[0]], np.max(probability) * 100))
Vecotrizer.py
%%writefile HateSpeechDetection/vectorizer.py
from sklearn.feature_extraction.text import CountVectorizer
from nltk.stem import WordNetLemmatizer
import re
import os
import pickle
cur_dir = os.path.dirname(__file__)
stop = pickle.load(open(
os.path.join(cur_dir,
'pkl_objects',
'stopwords.pkl'), 'rb'))
lemmatizer = WordNetLemmatizer()
def preprocessor(tweet):
# Removal of user handles
tweet = re.sub('@[\w\-]+','', tweet)
# Coverting the string into lower case
tweet = str(tweet).lower()
tweet = re.sub('\[.*?\]','',tweet)
# Removal of HTML linkups
tweet = re.sub('http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|''[!*\(\),]|(?:%[0-9a-fA-F][0- 9a-fA-F]))+','',tweet)
tweet = re.sub('<.*?>+', '', tweet)
# Removal of punctuations
tweet = re.sub('[%s]' % re.escape(string.punctuation), '', tweet)
tweet = re.sub('\n','',tweet)
tweet = re.sub('\w*\d\w*', '', tweet)
# Removal of stopwords
tweet = [word for word in tweet.split(' ') if word not in stopwords]
#removal of greek characters
tweet = [' '.join([unidecode.unidecode(word) for word in str(t).split()]) if t is not None else t for t in tweet]
#lemmetizing of tweets
tweet = [" ".join(lemmatizer.lemmatize(word) for word in t.split()) for t in tweet]
tweet = " ".join(tweet)
return tweet
vect = CountVectorizer()
def process_tweet(tweet):
# Process the tweet
processed_tweet = preprocessor(tweet)
vect.transform([processed_tweet]) # Pass a list of processed_tweet
return processed_tweet
I want to import the code to flask to create a web embedding but stuck near this one poin
`