对 HTML 内容进行截断后,由于剩余的标签被截断或未被正确关闭,HTML 结构遭到破坏,代码被渲染成了可见的标签:
< body> < able> [if mso]> < d> < r> < able> < r> < body> < able> Viac na Právne noviny < d> < r> < body> < able> < d> < r> < body> < able> ...[trimmed]
def trim_html(html: str, max_length: int = 70000) -> str:
    """Trim *html* to roughly *max_length* characters, keeping it well-formed.

    Input shorter than *max_length* is returned untouched. Otherwise line
    breaks are removed, tabs become spaces, and the document is re-built
    node by node with its nesting intact until the budget is used up. Because
    every copied tag lives inside the output tree, serialising it closes all
    open tags, so a cut never leaves dangling markup behind.

    The budget is charged with the raw text of each string node and with the
    serialised form of each (still empty) tag. Entity escaping during the
    final serialisation can add a few characters on top of that.

    Args:
        html: The HTML document or fragment to trim.
        max_length: Target size in characters, including the trailing notice.

    Returns:
        The original string when it is short enough, otherwise the trimmed
        markup followed by ``...[trimmed]`` when something was cut.

    Note:
        ``BeautifulSoup`` must be available in the enclosing module; it is
        only touched when the input actually needs trimming.
    """
    if len(html) < max_length:
        return html

    # Real control characters. The previous '/n', '/t', '/r' never matched
    # whitespace, and '/t' turned '</table>', '</td>', '</tr>' into '< able>',
    # '< d>', '< r>', which is exactly the visible garbage in the output.
    html = html.replace('\n', '').replace('\t', ' ').replace('\r', '')
    notice = "...[trimmed]"
    remaining = max_length - len(notice)

    soup = BeautifulSoup(html, 'html.parser')
    output_soup = BeautifulSoup('', 'html.parser')

    # Depth-first copy in document order. Each stack entry pairs an iterator
    # over the source children with the output node that receives the copies.
    pending = [(iter(soup.contents), output_soup)]
    truncated = False
    while pending and not truncated:
        siblings, target = pending[-1]
        node = next(siblings, None)
        if node is None:
            # This level is exhausted; continue with the parent's siblings.
            pending.pop()
        elif isinstance(node, str):
            # String nodes (text, comments, doctype, ...) are str subclasses.
            kept = node[:max(remaining, 0)]
            if kept:
                # Re-create the same node class so a comment stays a comment
                # instead of turning into visible text.
                target.append(type(node)(kept))
                remaining -= len(kept)
            truncated = len(kept) < len(node)
        else:
            clone = output_soup.new_tag(node.name, attrs=dict(node.attrs))
            # Serialised empty tag = opening + closing markup (or a void tag).
            cost = len(str(clone))
            if cost > remaining:
                truncated = True
            else:
                target.append(clone)
                remaining -= cost
                pending.append((iter(node.contents), clone))

    if truncated:
        output_soup.append(notice)
    return str(output_soup)
from html import escape
from html.parser import HTMLParser
"""
Purpose:
TrimHtml trims HTML content to a text-length limit while retaining the original HTML structure.
Limitations:
This works only with HTML content.
Usage:
parser = TrimHtml(max_len); parser.feed(html); trimmed = parser.html_text()
"""
class TrimHtml(HTMLParser):
    """Re-emit HTML while capping the amount of text it carries.

    Feed markup with ``feed()`` and read the result with ``html_text()``.
    Tags are copied without their attributes. Text is copied until the
    running total of stripped text length would reach the limit; at that
    point ``data_trimmed`` is inserted, the rest of the input is ignored, and
    ``html_text()`` closes every tag that is still open. If ``<body>`` is
    among the unclosed tags, ``warning_msg`` is placed just before its
    closing tag.

    Comments, doctype declarations and processing instructions are not
    copied. As with any ``HTMLParser``, call ``close()`` after the final
    ``feed()`` if the input may end in text that is not followed by a tag.

    Args:
        max_len: Nominal size limit. A margin of 5 % (at least 100, at most
            half of ``max_len``) is subtracted to obtain the text budget.
    """

    limit_reached: bool = False
    # Marker inserted at the point where text was dropped.
    data_trimmed = '<span style="color:red;font-weight:bold">[data trimmed ...]</span>'
    # Banner emitted before a </body> that had to be synthesised.
    warning_msg = '<h3 style="color: red;"> ⚠ Warning! Incomplete data, see source.</h3>\n'
    # Not used by the parser itself; kept so existing attribute access works.
    level = prev_level = 0
    msg_len = 0

    # Elements that never take an end tag; they must stay off the open-tag
    # stack or every later end tag would fail to match.
    _VOID_TAGS = frozenset((
        'area', 'base', 'br', 'col', 'embed', 'hr', 'img', 'input',
        'link', 'meta', 'param', 'source', 'track', 'wbr',
    ))
    # Elements whose content the parser hands over verbatim (no entity
    # decoding), so it must be written back verbatim as well.
    _RAW_TEXT_TAGS = frozenset(('script', 'style'))

    def __init__(self, max_len=1000):
        super().__init__()
        self.margin = min(max(max_len * 5 / 100, 100), max_len * 50 / 100)
        self.max_len = max_len - self.margin

    def reset(self):
        """Clear parser and output state (also run by ``__init__``)."""
        super().reset()
        # Per-instance containers: class-level lists would be shared by
        # every parser and leak output from one document into the next.
        self.limit_reached = False
        self.msg_len = 0
        self.tags = []
        self.tags_stack = []

    def push_tag(self, tag):
        """Record *tag* as open and start a new output line for it."""
        self.tags_stack.append(tag)
        self.tags.append('\n')

    def pop_tag(self):
        """Remove and return the innermost open tag, or None if none is open."""
        if not self.isempty():
            return self.tags_stack.pop()
        return None

    def isempty(self):
        """Return True when no tag is currently open."""
        return not self.tags_stack

    def peek_tag(self):
        """Return the innermost open tag without removing it, or None."""
        return self.tags_stack[-1] if self.tags_stack else None

    def html_text(self):
        """Return the output so far, with all still-open tags closed."""
        return ''.join(self.tags).strip() + self.closing_html().strip()

    def handle_starttag(self, tag, attrs):
        if self.limit_reached:
            return
        if tag in self._VOID_TAGS:
            # Same line break push_tag() would add, but nothing to close later.
            self.tags.append('\n')
        else:
            self.push_tag(tag)
        self.tags.append(f'<{tag}>')

    def handle_endtag(self, tag):
        # End tags with no matching open tag (stray ones, or '</br>' produced
        # for a self-closing void element) are dropped instead of echoed.
        if self.limit_reached or tag not in self.tags_stack:
            return
        # Close everything opened inside *tag* first so the output nests
        # properly even when the input omitted inner end tags.
        while True:
            open_tag = self.pop_tag()
            self.tags.append(f'</{open_tag}>')
            if open_tag == tag:
                break

    def handle_data(self, data):
        if self.limit_reached:
            return
        text = data.strip()
        if not text:
            # Whitespace-only runs are dropped entirely.
            return
        if self.msg_len + len(text) < self.max_len:
            self.msg_len += len(text)
            if self.peek_tag() not in self._RAW_TEXT_TAGS:
                # The parser already decoded entities; re-escape so a literal
                # '<' or '&' in the text cannot turn into markup.
                text = escape(text, quote=False)
            # Keep one space where the source had whitespace at an edge, so
            # words do not fuse with neighbouring inline elements.
            lead = ' ' if data[0].isspace() else ''
            trail = ' ' if data[-1].isspace() else ''
            self.tags.append(f'{lead}{text}{trail}')
        else:
            self.limit_reached = True
            self.tags.append(self.data_trimmed)

    def closing_html(self):
        """Return closing tags for everything still open, innermost first.

        The open-tag stack is left untouched, so this (and ``html_text()``)
        can be called repeatedly with the same result.
        """
        closing_tags = []
        for tag in reversed(self.tags_stack):
            if tag == 'body':
                closing_tags.append(self.warning_msg)
            closing_tags.append(f'</{tag}>\n')
        return ''.join(closing_tags)
if __name__ == "__main__":
    # Demo: trim a small two-list page with a deliberately tight limit and
    # print the resulting markup.
    html_text = """<html>
<body>
<h2>An Unordered HTML List</h2>
<ul>
<li>Coffee</li>
<li>Tea</li>
<li>Milk</li>
</ul>
<h2>An Ordered HTML List</h2>
<ol>
<li>Coffee</li>
<li>Tea</li>
<li>Milk</li>
</ol>
</body>
</html>"""

    parser = TrimHtml(105)
    parser.feed(html_text)
    out_html = parser.html_text()
    print(out_html)