If I have a text, for example one containing articles from a Catalan newspaper, how can I find all the cities mentioned in that text?
I have been looking at Python's nltk package, and I have downloaded the corpus for the Catalan language (nltk.corpus.cess_cat).
What I have now: I have installed everything needed from nltk.download(). An example of what I have so far:
import nltk

te = nltk.word_tokenize('Tots els gats son de Sant Cugat del Valles.')
nltk.pos_tag(te)
The city here is "Sant Cugat del Valles". What I get as output is:
[('Tots', 'NNS'),
('els', 'NNS'),
('gats', 'NNS'),
('son', 'VBP'),
('de', 'IN'),
('Sant', 'NNP'),
('Cugat', 'NNP'),
('del', 'NN'),
('Valles', 'NNP')]
NNP seems to indicate nouns whose first letter is capitalized. Is there a way to get only the places or cities, rather than all the names? Thank you.
You can use the geotext Python library for this.
pip install geotext
That is all it takes to install the library. Using it is simple:
from geotext import GeoText
places = GeoText("London is a great city")
places.cities
This gives the result ['London'].
The list of cities covered by this library is not exhaustive, but it is a good list.
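As a quick check against the sentence from the question, a minimal sketch could look like this (whether "Sant Cugat del Valles" is in geotext's bundled city list is an assumption; if it is not, cities will simply be empty):

from geotext import GeoText

article = 'Tots els gats son de Sant Cugat del Valles.'
places = GeoText(article)
# May print ['Sant Cugat del Valles'] if the city is in geotext's data,
# or [] if it is not covered.
print(places.cities)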
You would have to train a Named Entity Recognizer (NER), or you can make your own gazetteer.
A simple gazetteer that I made and use for tasks like yours is this one:
# -*- coding: utf-8 -*-
import codecs
import os
import re

from nltk.chunk import ChunkParserI
from nltk.chunk.util import conlltags2tree
from nltk.tag import pos_tag
from nltk.tokenize import wordpunct_tokenize


def sub_leaves(tree, label):
    return [t.leaves() for t in tree.subtrees(lambda s: s.label() == label)]


class Gazetteer(ChunkParserI):
    """
    Find and annotate a list of words that matches patterns.
    Patterns may be regular expressions in the form of a list of tuples.
    Every tuple holds the regular expression and the IOB tag for it.
    Part-of-speech tagging should be performed before the gazetteer
    words are applied, so you have to pass your tagger as a parameter.
    Example:
        >>> patterns = [(u"Αθήνα[ς]?", "LOC"), (u"Νομική[ς]? [Σσ]χολή[ς]?", "ORG")]
        >>> gazetteer = Gazetteer(patterns, nltk.pos_tag, nltk.wordpunct_tokenize)
        >>> text = u"Η Νομική σχολή της Αθήνας"
        >>> t = gazetteer.parse(text)
        >>> print(t)
        (S Η/DT (ORG Νομική/NN σχολή/NN) της/DT (LOC Αθήνας/NN))
    """

    def __init__(self, patterns, pos_tagger, tokenizer):
        """
        Initialize the class.

        :param patterns:
            The patterns to search for in the text, as a list of tuples of
            a regular expression and the tag to apply
        :param pos_tagger:
            The tagger to use for applying part-of-speech tags to the text
        :param tokenizer:
            The tokenizer to use for tokenizing the text
        """
        self.patterns = patterns
        self.pos_tag = pos_tagger
        self.tokenize = tokenizer
        self.lookahead = 0  # the maximum number of extra words a gazetteer entry may span
        self.words = []  # the words found by applying the regular expressions
        self.iobtags = []  # for each set of words, the corresponding tag

    def iob_tags(self, tagged_sent):
        """
        Search the tagged sentence for gazetteer words and apply their IOB tags.

        :param tagged_sent:
            A tokenized text with part-of-speech tags
        :type tagged_sent: list
        :return:
            the (word, pos, IOB tag) triples, e.g. B-LOCATION
        :rtype: list
        """
        i = 0
        l = len(tagged_sent)
        inside = False  # marks the I- tag
        iobs = []

        while i < l:
            word, pos_tag = tagged_sent[i]
            j = i + 1  # the next word
            k = j + self.lookahead  # how many words in a row we may search
            nextwords, nexttags = [], []  # for now, just the ith word
            add_tag = False  # no tag, this is O

            while j <= k:
                words = ' '.join([word] + nextwords)  # expand our word list
                if words in self.words:  # search for words
                    index = self.words.index(words)  # keep the index to use for IOB tags
                    if inside:
                        iobs.append((word, pos_tag, 'I-' + self.iobtags[index]))  # use the indexed tag
                    else:
                        iobs.append((word, pos_tag, 'B-' + self.iobtags[index]))

                    for nword, ntag in zip(nextwords, nexttags):  # there was more than one word
                        iobs.append((nword, ntag, 'I-' + self.iobtags[index]))  # apply the I- tag to all of them

                    add_tag, inside = True, True
                    i = j  # skip the tagged words
                    break

                if j < l:  # we haven't reached the end of the tagged sentence
                    nextword, nexttag = tagged_sent[j]  # get the next word and its tag
                    nextwords.append(nextword)
                    nexttags.append(nexttag)
                    j += 1
                else:
                    break

            if not add_tag:  # unknown words
                inside = False
                i += 1
                iobs.append((word, pos_tag, 'O'))  # it's an Outsider

        return iobs

    def parse(self, text, conlltags=True):
        """
        Given a text, applies tokenization, part-of-speech tagging and the
        gazetteer words with their tags. Returns a CoNLL tree.

        :param text: The text to parse
        :type text: str
        :param conlltags: Whether to return a tree instead of IOB triples
        :type conlltags: bool
        :return: A CoNLL tree, or the IOB triples if conlltags is False
        """
        # apply the regular expressions and find all the
        # gazetteer words in the text
        for pattern, tag in self.patterns:
            words_found = set(re.findall(pattern, text))  # keep the unique words
            if len(words_found) > 0:
                for word in words_found:  # words_found may be more than one
                    self.words.append(word)  # keep the words
                    self.iobtags.append(tag)  # and their tag

        # find the pattern with the maximum number of words;
        # this will be the lookahead variable
        for word in self.words:  # don't care about tags now
            nwords = word.count(' ')
            if nwords > self.lookahead:
                self.lookahead = nwords

        # tokenize and apply part-of-speech tagging
        tagged_sent = self.pos_tag(self.tokenize(text))
        # find the IOB tags
        iobs = self.iob_tags(tagged_sent)

        if conlltags:
            return conlltags2tree(iobs)
        return iobs


if __name__ == "__main__":
    patterns = [(u"Αθήνα[ς]?", "LOC"), (u"Νομική[ς]? [Σσ]χολή[ς]?", "ORG")]
    g = Gazetteer(patterns, pos_tag, wordpunct_tokenize)
    text = u"Η Νομική σχολή της Αθήνας"
    t = g.parse(text)
    print(t)

    dir_with_lists = "Lists"
    patterns = []
    tags = []
    for root, dirs, files in os.walk(dir_with_lists):
        for f in files:
            lines = codecs.open(os.path.join(root, f), 'r', 'utf-8').readlines()
            tag = os.path.splitext(f)[0]
            for l in lines[1:]:
                patterns.append((l.rstrip(), tag))
                tags.append(tag)

    text = codecs.open("sample.txt", 'r', "utf-8").read()
    g = Gazetteer(patterns, pos_tag, wordpunct_tokenize)
    t = g.parse(text.lower())
    print(t)

    for tag in set(tags):
        for gaz_word in sub_leaves(t, tag):
            print(gaz_word[0][0], tag)
In the if __name__ == "__main__": block you can see an example of building the patterns in code: patterns = [(u"Αθήνα[ς]?", "LOC"), (u"Νομική[ς]? [Σσ]χολή[ς]?", "ORG")].
Later in the code, I read files from a directory named Lists (put it in the same folder as the code above). The name of each file becomes the gazetteer's tag. So, make files like LOC.txt with patterns for locations (the LOC tag), PERSON.txt for persons, and so on.
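For the question's Catalan sentence, a minimal sketch of using the class above might look like this (the pattern string, including the Vall[eè]s accent variant, is my assumption; nltk.pos_tag is trained on English, so its tags for Catalan text are rough, but the gazetteer only needs the tokens to line up):

from nltk.tag import pos_tag
from nltk.tokenize import wordpunct_tokenize

# Hypothetical pattern list; add one regular expression per city you care about.
patterns = [(u"Sant Cugat del Vall[eè]s", "LOC")]
g = Gazetteer(patterns, pos_tag, wordpunct_tokenize)
t = g.parse(u"Tots els gats son de Sant Cugat del Valles.")
print(t)  # the tree should contain a LOC chunk spanning Sant Cugat del Valles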
You don't need NLTK for this. Instead, do the following:
Split each city name into its words, then iterate over the elements of the text as a list. Whenever the current element matches a word of a city name, keep checking the following elements against the rest of that name.
Here is a runnable code example:
text = 'Tots els gats son de Sant Cugat del Valles.'

# Prepare your text: remove "." (and other unnecessary marks),
# then split it into a list of words.
text = text.replace('.', '').split(' ')

# Insert the cities you want to search for.
cities = {"Sant Cugat del Valles": ["Sant", "Cugat", "del", "Valles"]}

found_match = False
for word in text:
    if not found_match:
        cityTest = ''  # start over when the previous word was not part of a city
    found_match = False
    for city in cities.keys():
        if word in cities[city]:
            cityTest += word + ' '
            found_match = True
        if cityTest.split(' ')[0:-1] == city.split(' '):
            print(city)  # print when a whole city name has been seen
There is a standard Linux program, fgrep, that does exactly this. Give it a file containing the list of cities, one per line, and a second file to search (or stdin), and it prints every line of the second file that contains any of the cities. There are switches to print only the matched text (just the city), to match case-insensitively, and so on.
You can call fgrep directly from Python.
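A minimal sketch with the subprocess module (cities.txt and article.txt are hypothetical filenames; -f reads the fixed strings from a file and -o prints only the matched text):

import subprocess

# Hypothetical files: cities.txt holds one city name per line,
# article.txt holds the newspaper text to search.
result = subprocess.run(
    ['fgrep', '-o', '-f', 'cities.txt', 'article.txt'],
    capture_output=True, text=True,
)
# fgrep prints one matched city per line; duplicates mean repeated mentions.
print(result.stdout.splitlines())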