Unable to use nltk functions


I am trying to run some nltk functions on the UCI spam messages dataset, but I ran into a problem: word_tokenize does not work even after downloading the dependencies.

import nltk
nltk.download('punkt')
from nltk.tokenize import word_tokenize

df['text'].apply(lambda x: len(nltk.word_tokenize(x)))

Here is the error:

---------------------------------------------------------------------------
LookupError                               Traceback (most recent call last)
Cell In[1024], line 3
      1 #finding no. of words
----> 3 df['text'].apply(lambda x: len(nltk.word_tokenize(x)))

File ~\AppData\Local\Packages\PythonSoftwareFoundation.Python.3.12_qbz5n2kfra8p0\LocalCache\local-packages\Python312\site-packages\pandas\core\series.py:4915, in Series.apply(self, func, convert_dtype, args, by_row, **kwargs)
   4780 def apply(
   4781     self,
   4782     func: AggFuncType,
   (...)
   4787     **kwargs,
   4788 ) -> DataFrame | Series:
   4789     """
   4790     Invoke function on values of Series.
   4791 
   (...)
   4906     dtype: float64
   4907     """
   4908     return SeriesApply(
   4909         self,
   4910         func,
   4911         convert_dtype=convert_dtype,
   4912         by_row=by_row,
   4913         args=args,
   4914         kwargs=kwargs,
-> 4915     ).apply()

File ~\AppData\Local\Packages\PythonSoftwareFoundation.Python.3.12_qbz5n2kfra8p0\LocalCache\local-packages\Python312\site-packages\pandas\core\apply.py:1427, in SeriesApply.apply(self)
   1424     return self.apply_compat()
   1426 # self.func is Callable
-> 1427 return self.apply_standard()

File ~\AppData\Local\Packages\PythonSoftwareFoundation.Python.3.12_qbz5n2kfra8p0\LocalCache\local-packages\Python312\site-packages\pandas\core\apply.py:1507, in SeriesApply.apply_standard(self)
   1501 # row-wise access
   1502 # apply doesn't have a `na_action` keyword and for backward compat reasons
   1503 # we need to give `na_action="ignore"` for categorical data.
   1504 # TODO: remove the `na_action="ignore"` when that default has been changed in
   1505 #  Categorical (GH51645).
   1506 action = "ignore" if isinstance(obj.dtype, CategoricalDtype) else None
-> 1507 mapped = obj._map_values(
   1508     mapper=curried, na_action=action, convert=self.convert_dtype
   1509 )
   1511 if len(mapped) and isinstance(mapped[0], ABCSeries):
   1512     # GH#43986 Need to do list(mapped) in order to get treated as nested
   1513     #  See also GH#25959 regarding EA support
   1514     return obj._constructor_expanddim(list(mapped), index=obj.index)

File ~\AppData\Local\Packages\PythonSoftwareFoundation.Python.3.12_qbz5n2kfra8p0\LocalCache\local-packages\Python312\site-packages\pandas\core\base.py:921, in IndexOpsMixin._map_values(self, mapper, na_action, convert)
    918 if isinstance(arr, ExtensionArray):
    919     return arr.map(mapper, na_action=na_action)
--> 921 return algorithms.map_array(arr, mapper, na_action=na_action, convert=convert)

File ~\AppData\Local\Packages\PythonSoftwareFoundation.Python.3.12_qbz5n2kfra8p0\LocalCache\local-packages\Python312\site-packages\pandas\core\algorithms.py:1743, in map_array(arr, mapper, na_action, convert)
   1741 values = arr.astype(object, copy=False)
   1742 if na_action is None:
-> 1743     return lib.map_infer(values, mapper, convert=convert)
   1744 else:
   1745     return lib.map_infer_mask(
   1746         values, mapper, mask=isna(values).view(np.uint8), convert=convert
   1747     )

File lib.pyx:2972, in pandas._libs.lib.map_infer()

Cell In[1024], line 3, in <lambda>(x)
      1 #finding no. of words
----> 3 df['text'].apply(lambda x: len(nltk.word_tokenize(x)))

File ~\AppData\Local\Packages\PythonSoftwareFoundation.Python.3.12_qbz5n2kfra8p0\LocalCache\local-packages\Python312\site-packages\nltk\tokenize\__init__.py:129, in word_tokenize(text, language, preserve_line)
    114 def word_tokenize(text, language="english", preserve_line=False):
    115     """
    116     Return a tokenized copy of *text*,
    117     using NLTK's recommended word tokenizer
   (...)
    127     :type preserve_line: bool
    128     """
--> 129     sentences = [text] if preserve_line else sent_tokenize(text, language)
    130     return [
    131         token for sent in sentences for token in _treebank_word_tokenizer.tokenize(sent)
    132     ]

File ~\AppData\Local\Packages\PythonSoftwareFoundation.Python.3.12_qbz5n2kfra8p0\LocalCache\local-packages\Python312\site-packages\nltk\tokenize\__init__.py:106, in sent_tokenize(text, language)
     96 def sent_tokenize(text, language="english"):
     97     """
     98     Return a sentence-tokenized copy of *text*,
     99     using NLTK's recommended sentence tokenizer
    (...)
    104     :param language: the model name in the Punkt corpus
    105     """
--> 106     tokenizer = PunktTokenizer(language)
    107     return tokenizer.tokenize(text)

File ~\AppData\Local\Packages\PythonSoftwareFoundation.Python.3.12_qbz5n2kfra8p0\LocalCache\local-packages\Python312\site-packages\nltk\tokenize\punkt.py:1744, in PunktTokenizer.__init__(self, lang)
   1742 def __init__(self, lang="english"):
   1743     PunktSentenceTokenizer.__init__(self)
-> 1744     self.load_lang(lang)

File ~\AppData\Local\Packages\PythonSoftwareFoundation.Python.3.12_qbz5n2kfra8p0\LocalCache\local-packages\Python312\site-packages\nltk\tokenize\punkt.py:1749, in PunktTokenizer.load_lang(self, lang)
   1746 def load_lang(self, lang="english"):
   1747     from nltk.data import find
-> 1749     lang_dir = find(f"tokenizers/punkt_tab/{lang}/")
   1750     self._params = load_punkt_params(lang_dir)
   1751     self._lang = lang

File ~\AppData\Local\Packages\PythonSoftwareFoundation.Python.3.12_qbz5n2kfra8p0\LocalCache\local-packages\Python312\site-packages\nltk\data.py:582, in find(resource_name, paths)
    580 sep = "*" * 70
    581 resource_not_found = f"\n{sep}\n{msg}\n{sep}\n"
--> 582 raise LookupError(resource_not_found)

LookupError: 
**********************************************************************
  Resource punkt_tab not found.
  Please use the NLTK Downloader to obtain the resource:

  >>> import nltk
  >>> nltk.download('punkt_tab')

  For more information see: https://www.nltk.org/data.html

  Attempted to load tokenizers/punkt_tab/english/

  Searched in:
    - 'C:\\Users\\user/nltk_data'
    - 'C:\\Program Files\\WindowsApps\\PythonSoftwareFoundation.Python.3.12_3.12.1520.0_x64__qbz5n2kfra8p0\\nltk_data'
    - 'C:\\Program Files\\WindowsApps\\PythonSoftwareFoundation.Python.3.12_3.12.1520.0_x64__qbz5n2kfra8p0\\share\\nltk_data'
    - 'C:\\Program Files\\WindowsApps\\PythonSoftwareFoundation.Python.3.12_3.12.1520.0_x64__qbz5n2kfra8p0\\lib\\nltk_data'
    - 'C:\\Users\\user\\AppData\\Roaming\\nltk_data'
    - 'C:\\nltk_data'
    - 'D:\\nltk_data'
    - 'E:\\nltk_data'
**********************************************************************

I tried reinstalling nltk and downloading some other dependency files, but nothing worked. What am I doing wrong?

python machine-learning nlp nltk
1 Answer

Which version are you using? Apparently they are currently having problems with versions above 3.8.1. Take a look at this thread, it may be related: https://github.com/Unstructured-IO/unstructured/issues/3511
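
For reference, here is a minimal sketch of the two usual ways out. The resource name comes straight from the traceback in the question (it literally tells you to download punkt_tab); the version-boundary comment reflects the claim above about 3.8.1, and the test sentence is made up:

import nltk

# Check the installed release: per the note above, 3.8.1 reportedly
# still works, while newer releases look up the "punkt_tab" resource.
print(nltk.__version__)

# Download the resource the traceback explicitly asks for,
# alongside the older "punkt" package.
nltk.download('punkt')
nltk.download('punkt_tab')

from nltk.tokenize import word_tokenize

# Smoke-test on a single string before applying to the whole column.
print(len(word_tokenize("This is a test sentence.")))

Alternatively, pinning the library to the version mentioned above (pip install nltk==3.8.1) avoids the punkt_tab lookup entirely. If the download succeeds but the lookup still fails, it may be landing outside the directories listed under "Searched in:" in the traceback; nltk.data.path is a plain list you can append a custom data directory to.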
