I recently started a Python course on sentiment analysis, and my first assignment is to run the IMDB reviews dataset through two different models: the sentiment pipeline from transformers and a Naive Bayes classifier for lexical analysis. The task is to build a table with three columns: the review, the sentiment according to the first model, and the sentiment according to the second model. The first model is straightforward, since the pipeline returns a list that is easy to convert to CSV, but the NBC has me stuck. I have gone through all the steps of cleaning the dataset, training the model, and running it, but I cannot figure out how to present its results as a table, i.e. what to do after the last line of the code below. I have read the documentation and found nothing relevant, and the example I was given (here) only evaluates the model's output, whereas I need the output itself. Sorry if this question is too basic, but I really need help because the course support is not great.
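To make the goal concrete, the final table should look roughly like this (a minimal sketch; the reviews and labels are made up for illustration):

```
import pandas as pd

# hypothetical example of the table I am trying to build:
# one row per review, one column per model's predicted label
target = pd.DataFrame({
    'review': ['one of the best films ever made', 'a complete waste of time'],
    'pipeline_sentiment': ['POSITIVE', 'NEGATIVE'],  # from the transformers pipeline
    'nbc_sentiment': ['positive', 'negative'],       # from the Naive Bayes classifier
})
```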
The code is as follows:

```
import re
import string
import nltk
import pandas as pd
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')   # needed by WordNetLemmatizer
nltk.download('omw-1.4')
stop_words = stopwords.words('english')
def cleaning(text):
    # converting to lowercase, expanding contractions, removing URL links,
    # HTML tags, numbers, punctuation and emoji
    text = text.lower()  # converting to lowercase
    text = text.replace('’', "'").replace('´', "'")  # normalising apostrophes
    # expanding short forms (this has to happen before punctuation removal,
    # otherwise the apostrophes are already gone and nothing matches):
    contractions = {
        "isn't": 'is not', "wasn't": 'was not', "weren't": 'were not',
        "hasn't": 'has not', "haven't": 'have not', "don't": 'do not',
        "couldn't": 'could not', "wouldn't": 'would not',
        "shouldn't": 'should not', "won't": 'will not',
        "he's": 'he is', "she's": 'she is', "it's": 'it is',
        "that's": 'that is', "there's": 'there is', "what's": 'what is',
        "let's": 'let us', "they're": 'they are', "we're": 'we are',
        "i'm": 'i am', "i'd": 'i would', "you'd": 'you would',
        "you've": 'you have', "we've": 'we have', "they've": 'they have',
        "you'll": 'you will',
    }
    for short, full in contractions.items():
        text = text.replace(short, full)
    text = re.sub(r'https?://\S+|www\.\S+', '', text)  # removing URL links
    text = re.sub(r'\b\d+\b', '', text)  # removing numbers
    text = re.sub(r'<.*?>+', '', text)  # removing HTML tags
    text = re.sub('[%s]' % re.escape(string.punctuation), '', text)  # punctuation
    text = re.sub(r'\n', '', text)
    text = re.sub('[“”…]', '', text)  # leftover typographic characters
    # removing emoji:
    emoji_pattern = re.compile('['
                               u'\U0001F600-\U0001F64F'  # emoticons
                               u'\U0001F300-\U0001F5FF'  # symbols & pictographs
                               u'\U0001F680-\U0001F6FF'  # transport & map symbols
                               u'\U0001F1E0-\U0001F1FF'  # flags (iOS)
                               u'\U00002702-\U000027B0'
                               u'\U000024C2-\U0001F251'
                               ']+', flags=re.UNICODE)
    text = emoji_pattern.sub('', text)
    return text
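# load the first five reviews (nrows=5) and clean the text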
df = pd.read_csv('IMDB Dataset.csv', delimiter=',', nrows=5)
df.rename(columns={'review': 'text'}, inplace=True)
dt = df['text'].apply(cleaning)
# data = []
# for i in dt:
# data.append(i)
#
# from transformers import pipeline
# sentiment_pipeline = pipeline("sentiment-analysis")
# sent = sentiment_pipeline(data)
# sent = pd.DataFrame(sent)
# sent.drop('score', axis=1, inplace=True)
# sent.rename(columns={'label':'sentiment'}, inplace = True)
# tab = df
# tab['sentiment'] = sent['sentiment']
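# rebuild a dataframe from the cleaned text, keep the original labels,
# and drop NLTK's English stopwords from every review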
dt = pd.DataFrame(dt)
dt['sentiment'] = df['sentiment']
dt['no_sw'] = dt['text'].apply(lambda x: ' '.join([word for word in x.split() if word not in stop_words]))
from collections import Counter
cnt = Counter()
for text in dt["no_sw"].values:
    for word in text.split():
        cnt[word] += 1
cnt.most_common(10)
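# treat the ten most frequent words as quasi-stopwords and remove them too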
FREQWORDS = set([w for (w, wc) in cnt.most_common(10)])
def remove_freqwords(text):
    """custom function to remove the frequent words"""
    return " ".join([word for word in str(text).split() if word not in FREQWORDS])
dt["wo_stopfreq"] = dt["no_sw"].apply(lambda text: remove_freqwords(text))
wordnet_lem = WordNetLemmatizer()
# lemmatize word by word; applying lemmatize() to the whole string would
# treat the entire review as one token and leave it unchanged
dt['wo_stopfreq_lem'] = dt['wo_stopfreq'].apply(
    lambda s: ' '.join(wordnet_lem.lemmatize(w) for w in s.split()))
nb=dt.drop(columns=['text','no_sw', 'wo_stopfreq'])
nb.columns=['sentiment','review']
nb.sentiment = [0 if each == "negative" else 1 for each in nb.sentiment]
tokenized_review = nb['review'].apply(lambda x: x.split())  # (not used further below)
from sklearn.feature_extraction.text import CountVectorizer
from nltk.tokenize import RegexpTokenizer
token = RegexpTokenizer(r'[a-zA-Z0-9]+')
cv = CountVectorizer(stop_words='english', ngram_range=(1, 1), tokenizer=token.tokenize)
text_counts = cv.fit_transform(nb['review'])
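# text_counts is the document-term matrix: one row per review,
# one column per token, entries are raw counts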
from sklearn.model_selection import train_test_split
X=text_counts
y=nb['sentiment']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20, random_state=30)
from sklearn.naive_bayes import ComplementNB
CNB = ComplementNB()
CNB.fit(X_train, y_train)
```
At this stage, the machine-learning model based on the Naive Bayes algorithm is ready to perform sentiment analysis on its input:

```
from sklearn import metrics
predicted = CNB.predict(X_test)
# predicted holds the model's sentiment predictions for the test split
accuracy_score = metrics.accuracy_score(y_test, predicted)
```
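The closest I have come up with is the sketch below: run `CNB.predict` over the vectorized reviews, map the 0/1 predictions back to labels, and put everything into one dataframe next to the pipeline results. The column names here are my own invention, and I am not sure this is the intended approach:

```
# my rough guess at the missing step (not sure it is correct):
# predict on ALL reviews, not just the test split, so the rows
# stay aligned with the original dataframe
all_preds = CNB.predict(cv.transform(nb['review']))
labels = ['negative' if p == 0 else 'positive' for p in all_preds]

result = pd.DataFrame({
    'review': df['text'],     # the original review text
    'nbc_sentiment': labels,  # Naive Bayes predictions
})
# the pipeline labels from the commented-out block would go in as a
# third column, e.g. result['pipeline_sentiment'] = sent['sentiment']
result.to_csv('sentiments.csv', index=False)
```

Is something like this the right direction, or is there a proper way to get the predictions out of the model as a table?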