# Build the cleaned corpus: one preprocessed string per raw tweet.
# Bug fix: the list was declared under a different (machine-translated) name
# than the one appended to below, causing a NameError on `corpus`.
corpus = []
pstem = PorterStemmer()
# Hoist the stopword set out of the loop: stopwords.words() returns a list,
# and rebuilding set(...) once per token made the filter quadratic.
english_stopwords = set(stopwords.words('english'))
for i in range(data_train['text'].shape[0]):
    # Keep letters only; everything else becomes a space
    tweet = re.sub("[^a-zA-Z]", ' ', data_train['text'][i])
    # Transform words to lowercase, then tokenize on whitespace
    tweet = tweet.lower()
    tweet = tweet.split()
    # Remove stopwords then stem each remaining token
    tweet = [pstem.stem(word) for word in tweet if word not in english_stopwords]
    tweet = ' '.join(tweet)
    # Append cleaned tweet to corpus
    corpus.append(tweet)
print("Corpus created successfully")
print(pd.DataFrame(corpus)[0].head(10))
# Show raw vs. cleaned text side by side for the first 10 tweets.
rawTexData = data_train["text"].head(10)
# Bug fix: pd.DataFrame(corpus[0]) passed a single *string* to the DataFrame
# constructor instead of building a frame from the whole corpus and taking
# column 0 (the form line 15 above already uses for printing).
cleanTexData = pd.DataFrame(corpus)[0].head(10)
frames = [rawTexData, cleanTexData]
result = pd.concat(frames, axis=1, sort=False)
result
# Create our dictionary: word -> number of occurrences across the corpus.
# Note: iterate the *list* `corpus` (one string per tweet) and call str.split
# on each string — calling .split() on a pandas Series is what raises
# AttributeError, and using a list as a dict key raises "unhashable type".
uniqueWordFrequents = {}
for tweet in corpus:
    for word in tweet.split():
        # dict.get avoids the double lookup of `word in d.keys()` + `d[word]`
        uniqueWordFrequents[word] = uniqueWordFrequents.get(word, 0) + 1
Hello — I ran into trouble while trying to build the word dictionary from the corpus. The error is: AttributeError: 'Series' object has no attribute 'split'. I then tried using str.split instead — `for word in tweet.str.split():` — and got a different error: TypeError: unhashable type: 'list'. I have been stuck on this for hours. I am a machine-learning beginner and would appreciate any help. Thank you!