我想在不使用外部库的情况下高效地解析 HTML 代码。
我已经尝试用 for 循环逐个字符遍历,并检查每个字符是哪种符号。
是这样的:
# Split an HTML string into chunks of characters: every "<" and every ">"
# starts a new (initially empty) chunk, and every other character is added
# to the most recent chunk.
html = """<html><p>Hello</p></html>"""

# Named ``chunks`` rather than ``list`` so the builtin ``list`` stays usable.
chunks = []
for char in html:
    if char == "<" or char == ">":
        chunks.append([])
    elif chunks:
        # Accumulate into the current chunk; assigning to ``chunks[-1]``
        # would replace the whole chunk with a single character.
        chunks[-1].append(char)
    else:
        # Text that appears before the first "<" still needs a chunk.
        chunks.append([char])
print(chunks)
代码在 50KB 文件上运行速度非常慢。
我建议从下面这个简单的 HTML 解析器入手。它只使用 Python 自带的标准库,没有任何外部依赖。您可能需要根据需要对其进行修改和扩展,但它提供了一个基本的 DOM API,是一个不错的起点。该代码适用于它要解决的简单情况;但根据您的需求,您可能需要添加更多功能才能实现最终目标。
#! /usr/bin/env python3
import html.parser
import pprint
import xml.dom.minidom
def main():
    """Parse a sample HTML document and print several views of its DOM.

    The sample is fed through ``DocumentParser``; the resulting document
    element is normalized and pretty-printed, and then the first ``<title>``,
    the first ``<p>``, every ``<a>``, the element whose ``id`` is ``link3``
    and finally every text node are printed in turn.
    """
    # noinspection PyPep8
    sample_html = '''
<html><head><title>The Dormouse's story</title></head>
<body>
<p class="title"><b>The Dormouse's story</b></p>
<p class="story">Once upon a time there were three little sisters; and their names were
<a href="http://example.com/elsie" class="sister" id="link1">Elsie</a>,
<a href="http://example.com/lacie" class="sister" id="link2">Lacie</a> and
<a href="http://example.com/tillie" class="sister" id="link3">Tillie</a>;
and they lived at the bottom of a well.</p>
<p class="story">...</p>
'''
    builder = DocumentParser()
    builder.feed(sample_html)
    builder.close()

    root = builder.document.documentElement
    root.normalize()
    print(root.toprettyxml())

    # First <title>: its markup, tag name, text content and parent's tag name.
    title = root.getElementsByTagName('title')[0]
    print(
        title.toxml(),
        title.tagName,
        title.firstChild.data,
        title.parentNode.tagName,
        sep='\n',
    )

    # First <p>: its markup and the value of its class attribute.
    paragraph = root.getElementsByTagName('p')[0]
    print(paragraph.toxml(), paragraph.getAttribute('class'), sep='\n')

    # Every <a>: the first one on its own, then all of them as a list.
    anchors = root.getElementsByTagName('a')
    print(anchors[0].toxml())
    anchor_markup = [anchor.toxml() for anchor in anchors]
    pprint.pprint(anchor_markup)

    # Attribute-based lookup through the find() helper.
    link3_markup = [match.toxml() for match in find(root, id='link3')]
    pprint.pprint(link3_markup)

    for anchor in anchors:
        print(anchor.getAttribute('href'))

    # All text found in the document, one text node per line.
    print('\n'.join(get_text(root)))
class DocumentParser(html.parser.HTMLParser):
    """Build an ``xml.dom.minidom`` document from HTML fed to the parser.

    ``document`` is the minidom document being built. ``focus`` is the node
    that newly parsed elements and text are appended to: it starts as the
    document itself, moves into each element when its start tag is seen and
    moves back out when the matching end tag is seen.
    """

    # HTML void elements: they have no content and no end tag, so the parser
    # must not descend into them (otherwise everything that follows would be
    # nested inside e.g. a <br>). ``param`` is kept for legacy markup.
    _VOID_ELEMENTS = frozenset({
        'area', 'base', 'br', 'col', 'embed', 'hr', 'img', 'input',
        'link', 'meta', 'param', 'source', 'track', 'wbr',
    })

    # noinspection SpellCheckingInspection
    def __init__(self, *, convert_charrefs=True):
        """Create a parser with an empty document that holds the focus.

        :param convert_charrefs: forwarded unchanged to ``HTMLParser``.
        """
        super().__init__(convert_charrefs=convert_charrefs)
        implementation = xml.dom.minidom.DOMImplementation()
        self.document = implementation.createDocument(None, None, None)
        self.focus = self.document

    @property
    def document_has_focus(self):
        """Return True when no element is open, i.e. focus is the document."""
        return self.document is self.focus

    def handle_starttag(self, tag, attrs):
        """Append a new element under the focus and, unless void, enter it.

        :param tag: tag name as reported by ``HTMLParser``.
        :param attrs: list of ``(name, value)`` pairs; ``value`` is ``None``
            for an attribute written without a value.
        """
        element = self.document.createElement(tag)
        for name, value in attrs:
            # Store valueless attributes (<input disabled>) as the empty
            # string so getAttribute() always returns a str.
            element.setAttribute(name, '' if value is None else value)
        self.focus.appendChild(element)
        if tag not in self._VOID_ELEMENTS:
            self.focus = element

    def handle_endtag(self, tag):
        """Close the nearest open element named ``tag``.

        Open elements nested inside it are closed implicitly. An end tag
        that matches no open element is ignored instead of walking past the
        document node, which has no ``tagName``.
        """
        node = self.focus
        while node is not self.document and node.tagName != tag:
            node = node.parentNode
        if node is not self.document:
            self.focus = node.parentNode

    def handle_data(self, data):
        """Append stripped text under the focus.

        Whitespace-only text is skipped, and so is any text seen while the
        document itself has the focus.
        """
        if not self.document_has_focus and not data.isspace():
            self.focus.appendChild(self.document.createTextNode(data.strip()))

    def error(self, message):
        """Raise ``RuntimeError`` carrying ``message``."""
        raise RuntimeError(message)

    def close(self):
        """Finish parsing and return the focus to the document.

        Any element still open at this point is treated as closed.
        """
        super().close()
        self.focus = self.document
def find(element, **kwargs):
    """Yield ``element`` and its descendants whose attributes match ``kwargs``.

    Nodes are visited parent first, then children in document order. A node
    is yielded when it provides ``getAttribute`` and, for every keyword
    argument, ``getAttribute(name)`` equals the given value. With no keyword
    arguments every node that provides ``getAttribute`` is yielded.
    """
    # Explicit stack; children are pushed reversed so they pop in order.
    pending = [element]
    while pending:
        node = pending.pop()
        lookup = getattr(node, 'getAttribute', None)
        if lookup:
            matches = all(
                lookup(name) == expected for name, expected in kwargs.items()
            )
            if matches:
                yield node
        pending.extend(reversed(node.childNodes))
def get_nodes_by_type(node, node_type):
    """Yield ``node`` and its descendants whose ``nodeType`` is ``node_type``.

    Nodes are visited parent first, then children in document order.
    """
    # Explicit stack; children are pushed reversed so they pop in order.
    pending = [node]
    while pending:
        current = pending.pop()
        if current.nodeType == node_type:
            yield current
        pending.extend(reversed(current.childNodes))
def get_text(node):
    """Return a generator over the ``data`` of every text node under ``node``.

    Text nodes are those that ``get_nodes_by_type`` reports for
    ``node.TEXT_NODE``, in the order it reports them.
    """
    text_nodes = get_nodes_by_type(node, node.TEXT_NODE)
    return (text_node.data for text_node in text_nodes)
# Run the demo only when this file is executed as a script, not when imported.
if __name__ == '__main__':
    main()