如何在Python中高效解析Html?

问题描述 投票:0回答:1

我将在没有外部库的情况下解析Html代码效率。

我已经准备好尝试 for 并检查了它是哪个符号。

是这样的:

list = []
html = """<html><p>Hello</p></html>"""
m = 0
for a in html:
    if a == "<":
        m = 1
        list.append([])
    elif a == ">":
        m = 0
        list.append([])
    else:
        list[-1] = a
print(list)

代码在 50KB 文件上运行速度非常慢。

python-3.x parsing
1个回答
1
投票

我可以建议从一个简单的 HTML 解析器开始,如下所示?它使用Python自带的标准库,没有外部依赖。您可能需要根据需要更改和扩展它,但它为您提供了一个基本的 DOM API,这应该是一个很好的工作起点。该代码适用于它要解决的简单情况;但根据您的需求,您可能需要添加更多功能来实现您的最终目标。

#! /usr/bin/env python3
import html.parser
import pprint
import xml.dom.minidom


def main():
    # noinspection PyPep8
    document = '''
<html><head><title>The Dormouse's story</title></head>
<body>
<p class="title"><b>The Dormouse's story</b></p>

<p class="story">Once upon a time there were three little sisters; and their names were
<a href="http://example.com/elsie" class="sister" id="link1">Elsie</a>,
<a href="http://example.com/lacie" class="sister" id="link2">Lacie</a> and
<a href="http://example.com/tillie" class="sister" id="link3">Tillie</a>;
and they lived at the bottom of a well.</p>

<p class="story">...</p>
'''
    parser = DocumentParser()
    parser.feed(document)
    parser.close()
    model = parser.document.documentElement
    model.normalize()
    print(model.toprettyxml())
    first_title = model.getElementsByTagName('title')[0]
    print(first_title.toxml())
    print(first_title.tagName)
    print(first_title.firstChild.data)
    print(first_title.parentNode.tagName)
    first_p = model.getElementsByTagName('p')[0]
    print(first_p.toxml())
    print(first_p.getAttribute('class'))
    all_a = model.getElementsByTagName('a')
    print(all_a[0].toxml())
    pprint.pprint([element.toxml() for element in all_a])
    pprint.pprint([element.toxml() for element in find(model, id='link3')])
    for element in all_a:
        print(element.getAttribute('href'))
    print(*get_text(model), sep='\n')


class DocumentParser(html.parser.HTMLParser):
    # noinspection SpellCheckingInspection
    def __init__(self, *, convert_charrefs=True):
        super().__init__(convert_charrefs=convert_charrefs)
        self.document = self.focus = xml.dom.minidom.DOMImplementation() \
            .createDocument(None, None, None)

    @property
    def document_has_focus(self):
        return self.document is self.focus

    def handle_starttag(self, tag, attrs):
        element = self.document.createElement(tag)
        for name, value in attrs:
            element.setAttribute(name, value)
        self.focus.appendChild(element)
        self.focus = element

    def handle_endtag(self, tag):
        while self.focus.tagName != tag:
            self.focus = self.focus.parentNode
        self.focus = self.focus.parentNode

    def handle_data(self, data):
        if not self.document_has_focus and not data.isspace():
            self.focus.appendChild(self.document.createTextNode(data.strip()))

    def error(self, message):
        raise RuntimeError(message)

    def close(self):
        super().close()
        while not self.document_has_focus:
            self.focus = self.focus.parentNode


def find(element, **kwargs):
    get_attribute = getattr(element, 'getAttribute', None)
    if get_attribute and \
            all(get_attribute(key) == value for key, value in kwargs.items()):
        yield element
    for child in element.childNodes:
        yield from find(child, **kwargs)


def get_nodes_by_type(node, node_type):
    if node.nodeType == node_type:
        yield node
    for child in node.childNodes:
        yield from get_nodes_by_type(child, node_type)


def get_text(node):
    return (node.data for node in get_nodes_by_type(node, node.TEXT_NODE))


if __name__ == '__main__':
    main()
© www.soinside.com 2019 - 2024. All rights reserved.