I'm trying to run some nltk functions on the UCI spam messages dataset, but word_tokenize fails even after downloading its dependencies:
import nltk
nltk.download('punkt')
from nltk.tokenize import word_tokenize
# finding no. of words
df['text'].apply(lambda x: len(nltk.word_tokenize(x)))
Here is the error:
{
"name": "LookupError",
"message": "
**********************************************************************
Resource punkt_tab not found.
Please use the NLTK Downloader to obtain the resource:
>>> import nltk
>>> nltk.download('punkt_tab')
For more information see: https://www.nltk.org/data.html
Attempted to load tokenizers/punkt_tab/english/
Searched in:
- 'C:\\Users\\user/nltk_data'
- 'C:\\Program Files\\WindowsApps\\PythonSoftwareFoundation.Python.3.12_3.12.1520.0_x64__qbz5n2kfra8p0\\nltk_data'
- 'C:\\Program Files\\WindowsApps\\PythonSoftwareFoundation.Python.3.12_3.12.1520.0_x64__qbz5n2kfra8p0\\share\\nltk_data'
- 'C:\\Program Files\\WindowsApps\\PythonSoftwareFoundation.Python.3.12_3.12.1520.0_x64__qbz5n2kfra8p0\\lib\\nltk_data'
- 'C:\\Users\\user\\AppData\\Roaming\\nltk_data'
- 'C:\\nltk_data'
- 'D:\\nltk_data'
- 'E:\\nltk_data'
**********************************************************************
",
"stack": "---------------------------------------------------------------------------
LookupError Traceback (most recent call last)
Cell In[1024], line 3
1 #finding no. of words
----> 3 df['text'].apply(lambda x: len(nltk.word_tokenize(x)))
File ~\\AppData\\Local\\Packages\\PythonSoftwareFoundation.Python.3.12_qbz5n2kfra8p0\\LocalCache\\local-packages\\Python312\\site-packages\\pandas\\core\\series.py:4915, in Series.apply(self, func, convert_dtype, args, by_row, **kwargs)
4780 def apply(
4781 self,
4782 func: AggFuncType,
(...)
4787 **kwargs,
4788 ) -> DataFrame | Series:
4789 \"\"\"
4790 Invoke function on values of Series.
4791
(...)
4906 dtype: float64
4907 \"\"\"
4908 return SeriesApply(
4909 self,
4910 func,
4911 convert_dtype=convert_dtype,
4912 by_row=by_row,
4913 args=args,
4914 kwargs=kwargs,
-> 4915 ).apply()
File ~\\AppData\\Local\\Packages\\PythonSoftwareFoundation.Python.3.12_qbz5n2kfra8p0\\LocalCache\\local-packages\\Python312\\site-packages\\pandas\\core\\apply.py:1427, in SeriesApply.apply(self)
1424 return self.apply_compat()
1426 # self.func is Callable
-> 1427 return self.apply_standard()
File ~\\AppData\\Local\\Packages\\PythonSoftwareFoundation.Python.3.12_qbz5n2kfra8p0\\LocalCache\\local-packages\\Python312\\site-packages\\pandas\\core\\apply.py:1507, in SeriesApply.apply_standard(self)
1501 # row-wise access
1502 # apply doesn't have a `na_action` keyword and for backward compat reasons
1503 # we need to give `na_action=\"ignore\"` for categorical data.
1504 # TODO: remove the `na_action=\"ignore\"` when that default has been changed in
1505 # Categorical (GH51645).
1506 action = \"ignore\" if isinstance(obj.dtype, CategoricalDtype) else None
-> 1507 mapped = obj._map_values(
1508 mapper=curried, na_action=action, convert=self.convert_dtype
1509 )
1511 if len(mapped) and isinstance(mapped[0], ABCSeries):
1512 # GH#43986 Need to do list(mapped) in order to get treated as nested
1513 # See also GH#25959 regarding EA support
1514 return obj._constructor_expanddim(list(mapped), index=obj.index)
File ~\\AppData\\Local\\Packages\\PythonSoftwareFoundation.Python.3.12_qbz5n2kfra8p0\\LocalCache\\local-packages\\Python312\\site-packages\\pandas\\core\\base.py:921, in IndexOpsMixin._map_values(self, mapper, na_action, convert)
918 if isinstance(arr, ExtensionArray):
919 return arr.map(mapper, na_action=na_action)
--> 921 return algorithms.map_array(arr, mapper, na_action=na_action, convert=convert)
File ~\\AppData\\Local\\Packages\\PythonSoftwareFoundation.Python.3.12_qbz5n2kfra8p0\\LocalCache\\local-packages\\Python312\\site-packages\\pandas\\core\\algorithms.py:1743, in map_array(arr, mapper, na_action, convert)
1741 values = arr.astype(object, copy=False)
1742 if na_action is None:
-> 1743 return lib.map_infer(values, mapper, convert=convert)
1744 else:
1745 return lib.map_infer_mask(
1746 values, mapper, mask=isna(values).view(np.uint8), convert=convert
1747 )
File lib.pyx:2972, in pandas._libs.lib.map_infer()
Cell In[1024], line 3, in <lambda>(x)
1 #finding no. of words
----> 3 df['text'].apply(lambda x: len(nltk.word_tokenize(x)))
File ~\\AppData\\Local\\Packages\\PythonSoftwareFoundation.Python.3.12_qbz5n2kfra8p0\\LocalCache\\local-packages\\Python312\\site-packages\\nltk\\tokenize\\__init__.py:129, in word_tokenize(text, language, preserve_line)
114 def word_tokenize(text, language=\"english\", preserve_line=False):
115 \"\"\"
116 Return a tokenized copy of *text*,
117 using NLTK's recommended word tokenizer
(...)
127 :type preserve_line: bool
128 \"\"\"
--> 129 sentences = [text] if preserve_line else sent_tokenize(text, language)
130 return [
131 token for sent in sentences for token in _treebank_word_tokenizer.tokenize(sent)
132 ]
File ~\\AppData\\Local\\Packages\\PythonSoftwareFoundation.Python.3.12_qbz5n2kfra8p0\\LocalCache\\local-packages\\Python312\\site-packages\\nltk\\tokenize\\__init__.py:106, in sent_tokenize(text, language)
96 def sent_tokenize(text, language=\"english\"):
97 \"\"\"
98 Return a sentence-tokenized copy of *text*,
99 using NLTK's recommended sentence tokenizer
(...)
104 :param language: the model name in the Punkt corpus
105 \"\"\"
--> 106 tokenizer = PunktTokenizer(language)
107 return tokenizer.tokenize(text)
File ~\\AppData\\Local\\Packages\\PythonSoftwareFoundation.Python.3.12_qbz5n2kfra8p0\\LocalCache\\local-packages\\Python312\\site-packages\\nltk\\tokenize\\punkt.py:1744, in PunktTokenizer.__init__(self, lang)
1742 def __init__(self, lang=\"english\"):
1743 PunktSentenceTokenizer.__init__(self)
-> 1744 self.load_lang(lang)
File ~\\AppData\\Local\\Packages\\PythonSoftwareFoundation.Python.3.12_qbz5n2kfra8p0\\LocalCache\\local-packages\\Python312\\site-packages\\nltk\\tokenize\\punkt.py:1749, in PunktTokenizer.load_lang(self, lang)
1746 def load_lang(self, lang=\"english\"):
1747 from nltk.data import find
-> 1749 lang_dir = find(f\"tokenizers/punkt_tab/{lang}/\")
1750 self._params = load_punkt_params(lang_dir)
1751 self._lang = lang
File ~\\AppData\\Local\\Packages\\PythonSoftwareFoundation.Python.3.12_qbz5n2kfra8p0\\LocalCache\\local-packages\\Python312\\site-packages\\nltk\\data.py:582, in find(resource_name, paths)
580 sep = \"*\" * 70
581 resource_not_found = f\"\\n{sep}\\n{msg}\\n{sep}\\n\"
--> 582 raise LookupError(resource_not_found)
LookupError:
**********************************************************************
Resource punkt_tab not found.
Please use the NLTK Downloader to obtain the resource:
>>> import nltk
>>> nltk.download('punkt_tab')
For more information see: https://www.nltk.org/data.html
Attempted to load tokenizers/punkt_tab/english/
Searched in:
- 'C:\\Users\\user/nltk_data'
- 'C:\\Program Files\\WindowsApps\\PythonSoftwareFoundation.Python.3.12_3.12.1520.0_x64__qbz5n2kfra8p0\\nltk_data'
- 'C:\\Program Files\\WindowsApps\\PythonSoftwareFoundation.Python.3.12_3.12.1520.0_x64__qbz5n2kfra8p0\\share\\nltk_data'
- 'C:\\Program Files\\WindowsApps\\PythonSoftwareFoundation.Python.3.12_3.12.1520.0_x64__qbz5n2kfra8p0\\lib\\nltk_data'
- 'C:\\Users\\user\\AppData\\Roaming\\nltk_data'
- 'C:\\nltk_data'
- 'D:\\nltk_data'
- 'E:\\nltk_data'
**********************************************************************
"
}
I tried reinstalling nltk and downloading some other dependency files, but nothing worked. What am I doing wrong?
Which version are you using? Apparently they are currently having problems with versions above 3.8.1. Take a look at this thread, it may be related: https://github.com/Unstructed-IO/unstructed/issues/3511
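Following what the traceback itself suggests, the missing resource is punkt_tab rather than punkt, so downloading that resource directly should clear the LookupError. A minimal sketch (the sample string is only a hypothetical stand-in for one row of df['text']):

import nltk

print(nltk.__version__)  # per the thread above, versions newer than 3.8.1 look for punkt_tab

# The error explicitly asks for punkt_tab; downloading 'punkt' alone is not enough here.
nltk.download('punkt_tab')

from nltk.tokenize import word_tokenize
# hypothetical sample message standing in for a row of df['text']
print(len(word_tokenize("Free entry in 2 a wkly comp to win tickets")))

Alternatively, if you'd rather not fetch the new resource, pinning the library back with pip install nltk==3.8.1 (based on the version boundary mentioned above) should restore the old punkt-only lookup.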