Python 文本数据预处理中的停用词问题

问题描述 投票:0回答:1


import pandas as pd
import nltk
import spacy
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
import requests

# Fetch the NLTK resources this script relies on: the 'punkt' tokenizer
# models (used by word_tokenize) and the 'stopwords' corpus.
nltk.download('punkt')
nltk.download('stopwords')

# spaCy English pipeline, used later for lemmatization.
nlp = spacy.load('en_core_web_sm')

# GitHub stopwords
# NOTE(review): the URL is blank in this listing and must be filled in with
# the raw URL of the stop-word file; requests cannot fetch an empty URL.
url = ""
response = requests.get(url, timeout=30)
# Fail loudly on an HTTP error instead of treating an error page as stop words.
response.raise_for_status()
# Lowercase every entry (the text is lowercased before filtering) and skip
# blank lines so the set holds only real words.
github_stopwords = {
    line.strip().lower() for line in response.text.splitlines() if line.strip()
}

# my stopwords (one word per line), normalized the same way
with open('/content/s.txt', encoding='utf-8') as f:
    my_stopwords = {line.strip().lower() for line in f if line.strip()}

# NLTK stopwords
nltk_stopwords = set(stopwords.words('english'))

# All stopwords: union of the three sources
all_stopwords = nltk_stopwords.union(github_stopwords, my_stopwords)

# clean function
def preprocess_text(text):
    """Lowercase, tokenize, stop-word-filter and lemmatize ``text``.

    Tokens that are not purely alphabetic, or that appear in the module-level
    ``all_stopwords`` set, are dropped before lemmatization. The resulting
    lemmas are then checked against ``all_stopwords`` again, because an
    inflected form (e.g. "challenges") passes the first filter but can be
    lemmatized to a stop word (e.g. "challenge").

    Args:
        text: Raw input string.

    Returns:
        The surviving lemmas joined by single spaces.
    """
    # 1. Lowercase so tokens are comparable with lowercase stop-word entries.
    text = text.lower()
    # 2. Tokenization (NLTK)
    words = word_tokenize(text)
    # 3. Keep alphabetic tokens that are not stop words.
    words = [word for word in words if word.isalpha() and word not in all_stopwords]
    print(f"remaining words:{words}")

    # 4. Lemmatization (spaCy used)
    doc = nlp(" ".join(words))
    # 5. Stop-word filter on the lemmas: drops words whose surface form was
    #    not a stop word but whose lemma is.
    lemmatized_words = [
        token.lemma_ for token in doc if token.lemma_ not in all_stopwords
    ]

    print(f"Lemmatized text: {' '.join(lemmatized_words)}\n")
    # Return sanitized text
    return " ".join(lemmatized_words)


 For example, a few words in this topic (challenge, theoretical) are in my stop-word list, yet they still appear in the topic model output:

 Topic 25: challenge, framework, theoretical, provide, offer 
python topic-modeling stop-words

建议:您可以检查停用词表,确保其中只包含小写单词(文本在过滤之前已被转换为小写)。该问题也可能源于您传递给主题建模模型的参数。


import pandas as pd
import nltk
import spacy
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
import requests

# Download the NLTK data needed below: 'punkt' for word_tokenize and the
# 'stopwords' corpus for the English stop-word list.
nltk.download('punkt')
nltk.download('stopwords')

# Load the small English spaCy pipeline (used for lemmatization).
nlp = spacy.load('en_core_web_sm')

# GitHub stopwords
# NOTE(review): the URL is blank in this listing; supply the raw URL of the
# stop-word file before running, since requests cannot fetch an empty URL.
url = ""
response = requests.get(url, timeout=30)
# Raise on an HTTP error rather than parsing an error page as stop words.
response.raise_for_status()
# Normalize entries to lowercase (tokens are lowercased before filtering)
# and ignore blank lines.
github_stopwords = {
    line.strip().lower() for line in response.text.splitlines() if line.strip()
}

# Custom stop words, already lowercase.
my_stopwords = {"challenge", "theoretical"}
# NLTK stopwords
nltk_stopwords = set(stopwords.words('english'))

# All stopwords: union of the three sources
all_stopwords = nltk_stopwords.union(github_stopwords, my_stopwords)

# clean function
def preprocess_text(text):
    """Return ``text`` lowercased, tokenized, stop-word-filtered and lemmatized.

    The stop-word filter (module-level ``all_stopwords``) runs twice: once on
    the raw alphabetic tokens and once on the lemmas. The second pass is
    needed because a token such as "challenges" is not itself in the
    stop-word set but can lemmatize to an entry that is ("challenge").

    Args:
        text: Raw input string.

    Returns:
        The remaining lemmas joined by single spaces.
    """
    # 1. Lowercase the input so it matches lowercase stop-word entries.
    text = text.lower()
    # 2. Tokenization (NLTK)
    words = word_tokenize(text)
    # 3. Keep only alphabetic tokens that are not stop words.
    words = [word for word in words if word.isalpha() and word not in all_stopwords]
    print(f"remaining words:{words}")

    # 4. Lemmatization (spaCy used)
    doc = nlp(" ".join(words))
    # 5. Filter again on the lemma so stop words reached via lemmatization
    #    are removed as well.
    lemmatized_words = [
        token.lemma_ for token in doc if token.lemma_ not in all_stopwords
    ]

    print(f"Lemmatized text: {' '.join(lemmatized_words)}\n")
    # Return sanitized text
    return " ".join(lemmatized_words)

# Demo input: contains the two custom stop words ("theoretical", "challenge")
# together with ordinary content words.
test_sentence = (
    "This is a theoretical challenge to offer some insights and provide a useful framework."
)

# Run the cleaning pipeline on the demo sentence; preprocess_text also prints
# its intermediate results.
preprocessed_test = preprocess_text(test_sentence)


offer insight provide framework
© 2019 - 2024. All rights reserved.