这个问题在这里已有答案:
我正在用 Python 做一个情感分析项目(使用自然语言处理)。我从 Twitter 收集了数据并保存为 CSV 文件,其中的推文主要与加密货币有关。我已经清理了数据,并使用分类算法进行了情感分析。
既然数据已经清理干净,我想找出其中最常用的单词。下面是我用来导入库和读取 CSV 文件的代码:
# importing Libraries
from pandas import DataFrame, read_csv
import chardet
import matplotlib.pyplot as plt; plt.rcdefaults()
from matplotlib import rc
%matplotlib inline
import pandas as pd
plt.style.use('ggplot')
import numpy as np
import re
import warnings
#Visualisation
import matplotlib.pyplot as plt
import matplotlib
import seaborn as sns
from IPython.display import display
from mpl_toolkits.basemap import Basemap
from wordcloud import WordCloud, STOPWORDS
#nltk
from nltk.stem import WordNetLemmatizer
from nltk.sentiment.vader import SentimentIntensityAnalyzer
from nltk.sentiment.util import *
from nltk import tokenize
from sklearn.feature_extraction.text import TfidfVectorizer
from nltk.stem.snowball import SnowballStemmer
matplotlib.style.use('ggplot')
pd.options.mode.chained_assignment = None
warnings.filterwarnings("ignore")
# Read the collected tweets from the CSV file into the DataFrame `btweet`.
# The frame must be bound to `btweet` (not `ltweet`), because that is the
# name printed below and used by the rest of the analysis.
btweet = pd.read_csv(
    "C:\\Users\\name\\Documents\\python assignment\\bitcoin1.csv",
    index_col=None,
    skipinitialspace=True,
)
print(btweet)
其余代码很长,所以我没有贴出来。在数据清理阶段,我去掉了超链接、RT(转推)标记、URL 和标点符号,并将文本转换为小写等。
例如,下面是正面推文列表的输出:
In [35]: btweet[btweet.sentiment_type == 'POSITIVE'].Tweets.reset_index(drop = True)[0:5]
Out[35]:
0 anizameddine more than just bitcoin blockchain...
1 bitcoinmagazine icymi wyoming house unanimousl...
2 bitracetoken bitrace published the smart contr...
3 unusual and quite promising ico banca banca_of...
4 airdrop coinstocks link it is a exchange so ge...
Name: Tweets, dtype: object
有没有办法找出数据中最常用的单词?有人能帮我写一下代码吗?
演示:
from nltk import sent_tokenize, word_tokenize, regexp_tokenize, FreqDist
from nltk.corpus import stopwords
from sklearn.datasets import fetch_20newsgroups
from wordcloud import WordCloud, STOPWORDS
def tokenize(text, pat=r'(?u)\b\w\w+\b', stop_words='english', min_len=2):
    """Split *text* into case-folded tokens, dropping stop words and short tokens.

    Args:
        text: The string to tokenize.
        pat: Regular expression passed to ``regexp_tokenize`` to match tokens.
        stop_words: Name of the stop-word list passed to
            ``stopwords.words``; a falsy value (``None`` or ``''``)
            disables stop-word filtering.
        min_len: Minimum number of characters a token must have to be kept.

    Returns:
        A list of tokens in the order they occur in *text*.
    """
    # Always bind `stop`: the original only assigned it when `stop_words`
    # was truthy, so disabling stop words raised UnboundLocalError.
    stop = set(stopwords.words(stop_words)) if stop_words else set()
    return [w
            for w in regexp_tokenize(text.casefold(), pat)
            if w not in stop and len(w) >= min_len]
def get_data():
    """Fetch the train and test subsets of four 20-newsgroups categories.

    Returns:
        A tuple ``(X_train, X_test, y_train, y_test)``: the first two are
        DataFrames with a single ``text`` column built from each subset's
        ``data``, the last two are the subsets' ``target`` attributes.
    """
    categories = ['alt.atheism', 'soc.religion.christian',
                  'comp.graphics', 'sci.med']

    bunches = {}
    frames = {}
    # Fetch 'train' first, then 'test', each restricted to the same categories.
    for subset in ('train', 'test'):
        bunch = fetch_20newsgroups(subset=subset,
                                   categories=categories, shuffle=True)
        bunches[subset] = bunch
        frames[subset] = pd.DataFrame(bunch.data, columns=['text'])

    return (frames['train'], frames['test'],
            bunches['train'].target, bunches['test'].target)
# Demo: draw a word cloud of the most frequent words in the training texts.
X_train, X_test, y_train, y_test = get_data()
# Join every training document into one space-separated string and keep
# only tokens of at least 4 characters.
words = tokenize(X_train.text.str.cat(sep=' '), min_len=4)
# Build a frequency distribution of the tokens.
fdist = FreqDist(words)
# Generate an 800x400 word cloud limited to 100 words from those frequencies.
wc = WordCloud(width=800, height=400, max_words=100).generate_from_frequencies(fdist)
plt.figure(figsize=(12,10))
plt.imshow(wc, interpolation="bilinear")
# Turn the axes off so only the word-cloud image is drawn.
plt.axis("off")
# NOTE(review): the output path is hard-coded; presumably d:/temp must already exist - confirm.
plt.savefig('d:/temp/result.png')
结果:
# Count word frequencies in a sentence with collections.Counter.
from collections import Counter

a = "Hello world and say hello again"
# Split on whitespace; counting is case-sensitive, so "Hello" and "hello"
# are counted as different words.
sp = a.split()
# Use a distinct name for the instance: the original rebound `Counter`
# itself, which shadowed the class (and the class was never imported).
word_counts = Counter(sp)
# The 4 most common (word, count) pairs.
most_occur = word_counts.most_common(4)
print(most_occur)