Python 代码修剪 HTML 内容而不丢失 html 结构

问题描述 投票:0回答:1

对 HTML 内容进行修剪之后,由于剩余的标签被截断或没有正确关闭,HTML 结构被破坏,代码被渲染成了可见的标签:

< body> < able> [如果是 mso]> < d> < r> < able> < r> < body> < able> Viac na Právne noviny < d> < r> < body> < able> < d> < r> < body> < able> ...[已修剪]

def trim_html(html: str, max_length: int = 70000) -> str:
    """Trim the text of *html* while keeping its tag structure well-formed.

    Input shorter than *max_length* is returned unchanged.  Otherwise the
    document is parsed and rebuilt node by node: every tag is re-created
    under the copy of its own parent (so nesting is preserved and the
    serializer closes every tag), and string nodes are copied until the
    text budget of ``max_length - len(notice)`` characters is used up.

    Only the characters of string nodes count toward the budget; tag
    markup is not counted, so the returned string can be longer than
    *max_length*.

    Args:
        html: The HTML markup to trim.
        max_length: Length threshold / text budget in characters.

    Returns:
        The (possibly trimmed) HTML.  ``"...[trimmed]"`` is appended only
        when text was actually cut.
    """
    if len(html) < max_length:
        return html
    # Real escape sequences: the forward-slash variants ('/n', '/t', '/r')
    # matched literal two-character strings and never removed whitespace.
    html = html.replace('\n', '').replace('\t', ' ').replace('\r', '')

    notice = "...[trimmed]"
    current_length = 0
    truncated = False
    soup = BeautifulSoup(html, 'html.parser')
    output_soup = BeautifulSoup('', 'html.parser')

    # Maps id() of each source container to its copy in the output tree.
    # ``descendants`` is a pre-order walk, so a parent is always registered
    # before any of its children is visited.
    copies = {id(soup): output_soup}

    for element in soup.descendants:
        parent_copy = copies[id(element.parent)]
        if element.name is None:
            available_length = max_length - current_length - len(notice)
            if available_length <= 0:
                truncated = True
                break
            text_to_add = element[:available_length]
            current_length += len(text_to_add)
            # Re-create the node with its original class so comments
            # (e.g. "[if mso]" conditional comments) and doctypes stay what
            # they were instead of turning into visible text.
            parent_copy.append(
                output_soup.new_string(text_to_add, type(element))
            )
            if len(text_to_add) < len(element):
                truncated = True
                break
        else:
            # Copy the attribute dict so the output tree does not share
            # state with the source tree.
            new_tag = output_soup.new_tag(
                element.name, attrs=dict(element.attrs)
            )
            parent_copy.append(new_tag)
            copies[id(element)] = new_tag

    if truncated:
        output_soup.append(notice)
    return str(output_soup)
html python-3.x
1个回答
0
投票
from html import escape
from html.parser import HTMLParser

"""
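Purpose:
    TrimHtml trims HTML content to a maximum text length while retaining the
    original HTML structure (every tag left open is closed again).
Limitations:
    This works only with HTML content.
Usage:
    parser = TrimHtml(max_len)
    parser.feed(html)
    trimmed = parser.html_text()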
"""


class TrimHtml(HTMLParser):
    """Streaming HTML trimmer that keeps the emitted markup balanced.

    Feed markup with ``feed()`` and read the result with ``html_text()``.
    Text is copied until the accumulated text length would reach the
    effective limit (``max_len`` minus a safety margin); from then on all
    input is ignored, a "[data trimmed ...]" marker is emitted, and every
    element that is still open is closed by ``closing_html()``.

    Only text counts toward the limit; tag markup does not.  Comments,
    doctypes and processing instructions are not copied to the output.
    """

    # Elements that never have an end tag; they must not be put on the
    # open-element stack or every later end tag would fail to match.
    _VOID_ELEMENTS = frozenset({
        'area', 'base', 'br', 'col', 'embed', 'hr', 'img', 'input',
        'link', 'meta', 'param', 'source', 'track', 'wbr',
    })
    # Elements whose content the parser hands over verbatim (no character
    # reference decoding), so it must be written back verbatim as well.
    _RAW_TEXT_ELEMENTS = frozenset({'script', 'style'})

    limit_reached: bool = False
    data_trimmed = '<span style="color:red;font-weight:bold">[data trimmed ...]</span>'
    warning_msg = '<h3 style="color: red;"> &#9888; Warning! Incomplete data, see source.</h3>\n'
    level = prev_level = 0  # unused; retained for backward compatibility
    msg_len = 0

    def __init__(self, max_len=1000):
        """Create a trimmer.

        Args:
            max_len: Nominal maximum text length.  A margin of 5% of it
                (at least 100, at most 50% of ``max_len``) is subtracted
                to obtain the effective limit stored in ``self.max_len``.
        """
        HTMLParser.__init__(self)  # calls reset(), which sets up the buffers
        self.margin = min(max(max_len * 5 / 100, 100), max_len * 50 / 100)
        self.max_len = max_len - self.margin

    def reset(self):
        """Reset parser and output state.

        The buffers are created per instance here; as class-level lists
        they were shared by every ``TrimHtml`` object, so a second parser
        started with the first parser's output.
        """
        HTMLParser.reset(self)
        self.limit_reached = False
        self.msg_len = 0
        self.tags = []        # output fragments, joined by html_text()
        self.tags_stack = []  # names of currently open elements

    def push_tag(self, tag):
        """Record *tag* as open and start a new output line for it."""
        self.tags_stack.append(tag)
        self.tags.append('\n')

    def pop_tag(self):
        """Remove and return the innermost open tag, or None if none is open."""
        if not self.isempty():
            return self.tags_stack.pop()

    def isempty(self):
        """Return True when no element is currently open."""
        return not self.tags_stack

    def peek_tag(self):
        """Return the innermost open tag without removing it, or None."""
        return self.tags_stack[-1] if self.tags_stack else None

    def html_text(self):
        """Return the trimmed markup followed by the required closing tags.

        Safe to call repeatedly: the open-element stack is not modified.
        """
        return ''.join(self.tags).strip() + self.closing_html().strip()

    def handle_starttag(self, tag, attrs):
        """Copy a start tag, including its attributes, to the output."""
        if self.limit_reached:
            return
        # Use the source text of the tag so attributes (href, style, ...)
        # survive; fall back to the bare tag if it is unavailable.
        start_text = self.get_starttag_text() or f'<{tag}>'
        if tag in self._VOID_ELEMENTS:
            self.tags.append('\n')
        else:
            self.push_tag(tag)
        self.tags.append(start_text)

    def handle_endtag(self, tag):
        """Close *tag*, first closing any elements left open inside it.

        End tags that match no open element are ignored.
        """
        if self.limit_reached or tag not in self.tags_stack:
            return
        while True:
            open_tag = self.tags_stack.pop()
            self.tags.append(f'</{open_tag}>')
            if open_tag == tag:
                break

    def handle_data(self, data):
        """Copy a text run, or emit the trim marker once the limit is hit."""
        if self.limit_reached:
            return
        data = data.strip()
        if not data:
            return
        if self.msg_len + len(data) < self.max_len:
            if self.peek_tag() in self._RAW_TEXT_ELEMENTS:
                self.tags.append(data)
            else:
                # The parser decoded character references (&lt; -> <);
                # re-escape so the text cannot turn into markup.
                self.tags.append(escape(data, quote=False))
            self.msg_len += len(data)
        else:
            self.limit_reached = True
            self.tags.append(self.data_trimmed)

    def closing_html(self):
        """Return end tags for every open element, innermost first.

        The warning banner is inserted just before ``</body>`` when the
        body element is still open.  The stack is left untouched.
        """
        closing_tags = []
        for tag in reversed(self.tags_stack):
            if tag == 'body':
                closing_tags.append(self.warning_msg)
            closing_tags.append(f'</{tag}>\n')
        return ''.join(closing_tags)


def _demo():
    """Trim a small sample document with TrimHtml and print the result."""
    html_text = """<html>
    <body>
    <h2>An Unordered HTML List</h2>
    <ul>
      <li>Coffee</li>
      <li>Tea</li>
      <li>Milk</li>
    </ul>  
    <h2>An Ordered HTML List</h2>
    <ol>
      <li>Coffee</li>
      <li>Tea</li>
      <li>Milk</li>
    </ol> 
    </body>
    </html>"""

    # A limit of 105 is small enough that the sample gets cut part-way.
    parser = TrimHtml(105)
    parser.feed(html_text)
    print(parser.html_text())


if __name__ == "__main__":
    _demo()
© www.soinside.com 2019 - 2024. All rights reserved.