我有一个代码片段,可以通过Python中的Playwright从网页的DOM树中提取可输入和可点击的节点元素(即交互式元素)。
这段代码几乎可以正常工作,但在某些情况下会错过一些元素,例如谷歌的按钮!事实上,这个按钮被这段代码标记为不可点击。有人可以找出这段代码的问题吗?
这是代码:
from playwright.sync_api import sync_playwright
# Tags that never get a closing tag; DOMNode.__repr__ prints them open-only.
VOID_ELEMENTS = {
    "area", "base", "br", "col", "embed", "hr", "img",
    "input", "link", "meta", "param", "source", "track", "wbr",
}

# Attributes that DOMNode keeps in `readable_attributes` and prints in its repr.
READABLE_ATTRIBUTES = {
    "title", "alt", "href", "placeholder", "label",
    "value", "caption", "summary", "aria-label", "aria-describedby",
    "datetime", "download", "selected", "checked", "type",
}

# Tag-name groups used when classifying nodes as clickable or inputable.
# NOTE(review): UNCLICKABLE_ELEMENTS is not referenced by the code shown here.
UNCLICKABLE_ELEMENTS = {"html", "head", "body"}
CLICKABLE_ELEMENTS = {"a", "button", "img", "details", "summary"}
INPUT_ELEMENTS = {"input", "textarea", "select", "option"}
class DOMNode:
    """A single node decoded from a DOM snapshot, linkable into a tree.

    The constructor only decodes the per-node arrays. ``parent``,
    ``children`` and the optional fields (``bounds``, ``center``,
    ``inputValue``, ``inputChecked``, ``isClickable``, ``optionSelected``)
    start empty and are filled in afterwards by the code that builds the tree.
    """

    def __init__(self, i, nodes, strings):
        """Decode node ``i`` from the snapshot's parallel per-node lists.

        Args:
            i: Position of this node in every list inside ``nodes``.
            nodes: Mapping holding the ``parentIndex``, ``nodeType``,
                ``nodeName``, ``nodeValue``, ``backendNodeId`` and
                ``attributes`` lists.
            strings: String table that the integer entries index into.
        """
        self._on_screen = None
        self.parent = None
        self.children = []
        self.llm_id = None
        # Only some nodes have these; they default to None so that "absent"
        # can be told apart from False.
        self.bounds = None
        self.center = None
        self.inputValue = None
        self.inputChecked = None
        self.isClickable = None
        self.optionSelected = None
        self.parentId = (
            nodes["parentIndex"][i] if nodes["parentIndex"][i] >= 0 else None
        )
        # NOTE(review): the CDP DOMSnapshot docs appear to describe `nodeType`
        # as a plain integer rather than a string-table index -- confirm
        # whether this lookup is intended.
        self.nodeType = strings[nodes["nodeType"][i]]
        self.nodeName = strings[nodes["nodeName"][i]].lower()
        self.nodeValue = (
            strings[nodes["nodeValue"][i]].strip()
            if nodes["nodeValue"][i] >= 0
            else None
        )
        self.backendNodeId = nodes["backendNodeId"][i]
        self.attributes = {}
        # Attributes arrive as a flat [name, value, name, value, ...] list.
        attrs = nodes["attributes"][i]
        for att1, att2 in zip(attrs[::2], attrs[1::2]):
            self.attributes[strings[att1]] = strings[att2][:100]  # cut off long URLs
        self.readable_attributes = {
            k: v for k, v in self.attributes.items() if k in READABLE_ATTRIBUTES
        }

    def __repr__(self, indent=0) -> str:
        """Render this subtree as HTML-like text.

        Args:
            indent: Number of spaces to put in front of this node's lines;
                children are rendered two spaces deeper.
        """
        if self.nodeName == "#text":
            return " " * indent + (self.nodeValue or "")
        attr_str = " ".join([f'{k}="{v}"' for k, v in self.readable_attributes.items()])
        attr_str = " " + attr_str if attr_str else ""
        open_tag = f"<{self.nodeName}{attr_str}>"
        close_tag = f"</{self.nodeName}>"
        if len(self.children) == 0:
            return (" " * indent + open_tag) + (
                close_tag if self.nodeName not in VOID_ELEMENTS else ""
            )
        # Special case: an element with a single text child fits on one line.
        if len(self.children) == 1 and self.children[0].nodeName == "#text":
            return (" " * indent + open_tag) + self.children[0].__repr__() + close_tag
        children_repr = "\n".join(
            [child.__repr__(indent + 2) for child in self.children]
        )
        return (
            (" " * indent + open_tag)
            + "\n"
            + children_repr
            + "\n"
            + (" " * indent + close_tag)
        )

    def on_screen(self, screen_bounds):
        """Return whether this node has visible area inside ``screen_bounds``.

        A node with children is judged only by its descendants: it is on
        screen as soon as any child is. A leaf needs four-element bounds with
        non-zero area that overlap the screen rectangle.

        Args:
            screen_bounds: Screen rectangle as ``[x, y, width, height]``, the
                same layout as ``self.bounds``.
        """
        if self.children:
            # Generator form stops at the first visible child.
            return any(child.on_screen(screen_bounds) for child in self.children)
        if (
            self.bounds is None
            or len(self.bounds) != 4
            or self.bounds[2] * self.bounds[3] == 0
        ):
            return False
        x, y, w, h = self.bounds
        # The rectangle is x-first, like ``self.bounds``. Unpacking it y-first
        # only gave the right answer when the screen origin was (0, 0).
        win_left_bound, win_upper_bound, win_width, win_height = screen_bounds
        win_right_bound = win_left_bound + win_width
        win_lower_bound = win_upper_bound + win_height
        return (
            x < win_right_bound
            and x + w > win_left_bound
            and y < win_lower_bound
            and y + h > win_upper_bound
        )
class Globot:
    """Thin Playwright wrapper that opens a page and lists its interactive nodes."""

    def __init__(self, headless=False):
        """Start Playwright, launch Chromium and open a fresh page.

        Args:
            headless: Passed through to ``chromium.launch``.
        """
        driver = sync_playwright().start()
        self.browser = driver.chromium.launch(headless=headless)
        self.context = self.browser.new_context()
        self.page = self.context.new_page()

    def go_to_page(self, url):
        """Navigate to ``url`` and open a CDP session on the page.

        ``https://`` is prepended when ``url`` contains no ``://``.
        """
        target = url if "://" in url else "https://" + url
        self.page.goto(url=target)
        self.client = self.page.context.new_cdp_session(self.page)
        self.page.wait_for_load_state("domcontentloaded")

    def crawl(self) -> tuple[dict[int, DOMNode], dict[int, DOMNode]]:
        """Snapshot the DOM and collect its input and clickable nodes.

        Returns:
            ``(input_elements, clickable_elements)``. Both map the sequential
            id stored in ``node.llm_id`` to the node; a node can be in both.
        """
        snapshot = self.client.send(
            "DOMSnapshot.captureSnapshot",
            {"computedStyles": [], "includeDOMRects": True, "includePaintOrder": True},
        )
        strings = snapshot["strings"]
        document = snapshot["documents"][0]
        layout = document["layout"]
        raw = document["nodes"]

        # The ratio is derived from the snapshot because, per the original
        # author, `window.devicePixelRatio` sometimes gives the wrong answer.
        first_bounds = layout["bounds"][0]
        pixel_ratio = first_bounds[2] / self.page.evaluate("window.screen.width")

        # Node index -> position lookups (searching naively takes much longer).
        layout_pos = {idx: pos for pos, idx in enumerate(layout["nodeIndex"])}
        input_value_pos = {
            idx: pos for pos, idx in enumerate(raw["inputValue"]["index"])
        }

        nodes = []
        for i in range(len(raw["parentIndex"])):
            node = DOMNode(i, raw, strings)

            pos = layout_pos.get(i)
            if pos is not None:
                scaled = [int(b / pixel_ratio) for b in layout["bounds"][pos]]
                node.bounds = scaled
                node.center = (
                    int(scaled[0] + scaled[2] / 2),
                    int(scaled[1] + scaled[3] / 2),
                )

            if i in raw["isClickable"]["index"]:
                node.isClickable = True

            value_pos = input_value_pos.get(i)
            if value_pos is not None:
                string_idx = raw["inputValue"]["value"][value_pos]
                node.inputValue = strings[string_idx] if string_idx >= 0 else ""

            if i in raw["inputChecked"]["index"]:
                node.inputChecked = True
            if i in raw["optionSelected"]["index"]:
                node.optionSelected = True
            nodes.append(node)

        root = nodes[0] if nodes else None

        # Replace parent indices with real parent/child links.
        for node in nodes:
            if node.parentId is None:
                continue
            owner = nodes[node.parentId]
            node.parent = owner
            owner.children.append(node)

        input_elements = {}
        clickable_elements = {}
        next_id = 0

        # Pre-order walk with an explicit stack; children are pushed in
        # reverse so they are popped in document order.
        pending = [root]
        while pending:
            node = pending.pop()
            clickable = (
                node.nodeName in CLICKABLE_ELEMENTS
                and node.isClickable
                and node.center is not None
            )
            inputable = node.nodeName in INPUT_ELEMENTS or node.inputValue is not None
            # select/option are always reported, whether visible or not.
            select_or_option = node.nodeName in ("select", "option")
            visible = node.on_screen(
                root.bounds
            ) and "visibility: hidden" not in node.attributes.get("style", "")

            # Debug trace for buttons.
            if node.nodeName == "button":
                print(f"Node: {node.nodeName}")
                print(f" Attributes: {node.attributes}")
                print(f" Bounds: {node.bounds}")
                print(f" Clickable: {clickable}")
                print(f" Inputable: {inputable}")
                print(f" Visible: {visible}")
                print(f" Center: {node.center}")

            if select_or_option or (visible and (clickable or inputable)):
                if clickable:
                    clickable_elements[next_id] = node
                if inputable or select_or_option:
                    input_elements[next_id] = node
                node.llm_id = next_id
                next_id += 1

            pending.extend(reversed(node.children))

        return input_elements, clickable_elements
用于重现问题的代码片段(此处的 Next 按钮不可点击):
from pprint import pprint
# Reproduction script: crawl the Google sign-in page and print every
# interactive node that was found.
bot = Globot()
bot.go_to_page(
    "https://accounts.google.com/v3/signin/identifier?authuser=0&continue=https%3A%2F%2Fwww.google.com%2F&ec=GAlAmgQ&hl=en&flowName=GlifWebSignIn&flowEntry=AddSession&dsh=S1040273122%3A1718390580872851&ddm=0"
)
inputs, clickables = bot.crawl()
parts = []
# Iteration order of a set union is not guaranteed; sorting the ids makes the
# report deterministic and follows the order in which crawl() numbered them.
for i in sorted(inputs.keys() | clickables.keys()):
    inputable = i in inputs
    clickable = i in clickables
    # Same precedence as before: the clickable entry wins when both exist.
    node = clickables[i] if clickable else inputs[i]
    parts.append(f"<node id={i} clickable={clickable} inputable={inputable}>\n")
    parts.append(node.__repr__(indent=2))
    parts.append("\n</node>\n")
s = "".join(parts)
html_description = s
pprint(html_description)
这是日志中有关 Next 按钮的部分——正如您所看到的,`Clickable` 的值为 `None`:
Node: button
Attributes: {'class': 'VfPpkd-LgbsSe VfPpkd-LgbsSe-OWXEXe-k8QpJ VfPpkd-LgbsSe-OWXEXe-dgl2Hf nCP5yc AjY5Oe DuMIQc LQeN7 BqKG', 'jscontroller': 'soHxf', 'jsaction': 'click:cOuCgd; mousedown:UX7yZ; mouseup:lbsD7e; mouseenter:tfO1Yc; mouseleave:JywGue; touchstart:p6p2', 'data-idom-class': 'nCP5yc AjY5Oe DuMIQc LQeN7 BqKGqe Jskylb TrZEUc lw1w4b', 'jsname': 'LgbsSe', 'type': 'button'}
Bounds: [965, 453, 78, 40]
Clickable: None
Inputable: False
Visible: True
Center: (1004, 473)
这是 Next 按钮的 RAW HTML:
<button class="VfPpkd-LgbsSe VfPpkd-LgbsSe-OWXEXe-k8QpJ VfPpkd-LgbsSe-OWXEXe-dgl2Hf nCP5yc AjY5Oe DuMIQc LQeN7 BqKGqe Jskylb TrZEUc lw1w4b" jscontroller="soHxf" jsaction="click:cOuCgd; mousedown:UX7yZ; mouseup:lbsD7e; mouseenter:tfO1Yc; mouseleave:JywGue; touchstart:p6p2H; touchmove:FwuNnf; touchend:yfqBxc; touchcancel:JMtRjd; focus:AHmuwe; blur:O22p3e; contextmenu:mg9Pef;mlnRJb:fLiPzd;" data-idom-class="nCP5yc AjY5Oe DuMIQc LQeN7 BqKGqe Jskylb TrZEUc lw1w4b" jsname="LgbsSe" type="button"><div class="VfPpkd-Jh9lGc"></div><div class="VfPpkd-J1Ukfc-LhBDec"></div><div class="VfPpkd-RLmnJb"></div><span jsname="V67aGc" class="VfPpkd-vQzf8d">Next</span></button>
这是相应页面的屏幕截图:
如果代码太长,我深表歉意,并提前感谢您的帮助。
我最终得到了以下代码:它使用 `set` 进行快速查找优化,并添加了一个新条件,通过 `onclick` 属性是否存在、或者节点是否为 `button`,来判断节点是否可点击。
这是改进后的版本:
from playwright.sync_api import sync_playwright
from pprint import pprint
# Tags that never get a closing tag; DOMNode.__repr__ prints them open-only.
VOID_ELEMENTS = {
    "area", "base", "br", "col", "embed", "hr", "img",
    "input", "link", "meta", "param", "source", "track", "wbr",
}

# Attributes that DOMNode keeps in `readable_attributes` and prints in its repr.
READABLE_ATTRIBUTES = {
    "title", "alt", "href", "placeholder", "label",
    "value", "caption", "summary", "aria-label", "aria-describedby",
    "datetime", "download", "selected", "checked", "type",
}

# Tag-name groups used when classifying nodes as clickable or inputable.
# NOTE(review): UNCLICKABLE_ELEMENTS is not referenced by the code shown here.
UNCLICKABLE_ELEMENTS = {"html", "head", "body"}
CLICKABLE_ELEMENTS = {"a", "button", "img", "details", "summary", "ul", "li"}
INPUT_ELEMENTS = {"input", "textarea", "select", "option"}
class DOMNode:
    """A single node decoded from a DOM snapshot, linkable into a tree.

    The constructor only decodes the per-node arrays. ``parent``,
    ``children`` and the optional fields (``bounds``, ``center``,
    ``inputValue``, ``inputChecked``, ``isClickable``, ``optionSelected``)
    start empty and are filled in afterwards by the code that builds the tree.
    """

    def __init__(self, i, nodes, strings):
        """Decode node ``i`` from the snapshot's parallel per-node lists.

        Args:
            i: Position of this node in every list inside ``nodes``.
            nodes: Mapping holding the ``parentIndex``, ``nodeType``,
                ``nodeName``, ``nodeValue``, ``backendNodeId`` and
                ``attributes`` lists.
            strings: String table that the integer entries index into.
        """
        self._on_screen = None
        self.parent = None
        self.children = []
        self.llm_id = None
        # Only some nodes have these; they default to None so that "absent"
        # can be told apart from False.
        self.bounds = None
        self.center = None
        self.inputValue = None
        self.inputChecked = None
        self.isClickable = None
        self.optionSelected = None
        self.parentId = (
            nodes["parentIndex"][i] if nodes["parentIndex"][i] >= 0 else None
        )
        # NOTE(review): the CDP DOMSnapshot docs appear to describe `nodeType`
        # as a plain integer rather than a string-table index -- confirm
        # whether this lookup is intended.
        self.nodeType = strings[nodes["nodeType"][i]]
        self.nodeName = strings[nodes["nodeName"][i]].lower()
        self.nodeValue = (
            strings[nodes["nodeValue"][i]].strip()
            if nodes["nodeValue"][i] >= 0
            else None
        )
        self.backendNodeId = nodes["backendNodeId"][i]
        self.attributes = {}
        # Attributes arrive as a flat [name, value, name, value, ...] list.
        attrs = nodes["attributes"][i]
        for att1, att2 in zip(attrs[::2], attrs[1::2]):
            self.attributes[strings[att1]] = strings[att2][:100]  # cut off long URLs
        self.readable_attributes = {
            k: v for k, v in self.attributes.items() if k in READABLE_ATTRIBUTES
        }

    def __repr__(self, indent=0) -> str:
        """Render this subtree as HTML-like text.

        Args:
            indent: Number of spaces to put in front of this node's lines;
                children are rendered two spaces deeper.
        """
        if self.nodeName == "#text":
            return " " * indent + (self.nodeValue or "")
        attr_str = " ".join([f'{k}="{v}"' for k, v in self.readable_attributes.items()])
        attr_str = " " + attr_str if attr_str else ""
        open_tag = f"<{self.nodeName}{attr_str}>"
        close_tag = f"</{self.nodeName}>"
        if len(self.children) == 0:
            return (" " * indent + open_tag) + (
                close_tag if self.nodeName not in VOID_ELEMENTS else ""
            )
        # Special case: an element with a single text child fits on one line.
        if len(self.children) == 1 and self.children[0].nodeName == "#text":
            return (" " * indent + open_tag) + self.children[0].__repr__() + close_tag
        children_repr = "\n".join(
            [child.__repr__(indent + 2) for child in self.children]
        )
        return (
            (" " * indent + open_tag)
            + "\n"
            + children_repr
            + "\n"
            + (" " * indent + close_tag)
        )

    def on_screen(self, screen_bounds):
        """Return whether this node has visible area inside ``screen_bounds``.

        A node with children is judged only by its descendants: it is on
        screen as soon as any child is. A leaf needs four-element bounds with
        non-zero area that overlap the screen rectangle.

        Args:
            screen_bounds: Screen rectangle as ``[x, y, width, height]``, the
                same layout as ``self.bounds``.
        """
        if self.children:
            # Generator form stops at the first visible child.
            return any(child.on_screen(screen_bounds) for child in self.children)
        if (
            self.bounds is None
            or len(self.bounds) != 4
            or self.bounds[2] * self.bounds[3] == 0
        ):
            return False
        x, y, w, h = self.bounds
        # The rectangle is x-first, like ``self.bounds``. Unpacking it y-first
        # only gave the right answer when the screen origin was (0, 0).
        win_left_bound, win_upper_bound, win_width, win_height = screen_bounds
        win_right_bound = win_left_bound + win_width
        win_lower_bound = win_upper_bound + win_height
        return (
            x < win_right_bound
            and x + w > win_left_bound
            and y < win_lower_bound
            and y + h > win_upper_bound
        )
class Globot:
    """Playwright-driven browser that extracts a page's interactive DOM nodes."""

    def __init__(self, headless=False):
        """Start Playwright, launch Chromium and open a fresh page.

        Args:
            headless: Passed through to ``chromium.launch``.
        """
        # Keep the driver handle: without it the Playwright driver started
        # here can never be stopped.
        self.playwright = sync_playwright().start()
        self.browser = self.playwright.chromium.launch(headless=headless)
        self.context = self.browser.new_context()
        self.page = self.context.new_page()

    def close(self):
        """Close the browser and stop the Playwright driver."""
        try:
            self.browser.close()
        finally:
            # Stop the driver even if closing the browser failed.
            self.playwright.stop()

    def go_to_page(self, url):
        """Navigate to ``url`` and open a CDP session on the page.

        ``https://`` is prepended when ``url`` contains no ``://``.
        """
        self.page.goto(url=url if "://" in url else "https://" + url)
        self.client = self.page.context.new_cdp_session(self.page)
        self.page.wait_for_load_state("domcontentloaded")

    def crawl(self) -> tuple[dict[int, DOMNode], dict[int, DOMNode]]:
        """Snapshot the DOM and collect its input and clickable nodes.

        Returns:
            ``(input_elements, clickable_elements)``. Both map the sequential
            id stored in ``node.llm_id`` to the node; a node can be in both.
        """
        dom = self.client.send(
            "DOMSnapshot.captureSnapshot",
            {"computedStyles": [], "includeDOMRects": True, "includePaintOrder": True},
        )
        dom_strings = dom["strings"]
        document = dom["documents"][0]
        dom_layout = document["layout"]
        dom_nodes = document["nodes"]
        screen_bounds = dom_layout["bounds"][0]
        # The ratio is derived from the snapshot because, per the original
        # author, `window.devicePixelRatio` sometimes gives the wrong answer.
        device_pixel_ratio = screen_bounds[2] / self.page.evaluate(
            "window.screen.width"
        )
        nodes = []
        root = None
        # Sets and dicts give constant-time membership tests inside the
        # per-node loop; a missing key is treated as "no such nodes".
        isClickable_set = set(dom_nodes.get("isClickable", {}).get("index", []))
        inputChecked_set = set(dom_nodes.get("inputChecked", {}).get("index", []))
        optionSelected_set = set(dom_nodes.get("optionSelected", {}).get("index", []))
        inputValue_map = dict(
            zip(
                dom_nodes.get("inputValue", {}).get("index", []),
                dom_nodes.get("inputValue", {}).get("value", []),
            )
        )
        nodeIndex_flipped = {v: k for k, v in enumerate(dom_layout["nodeIndex"])}
        for i in range(len(dom_nodes["parentIndex"])):
            node = DOMNode(i, dom_nodes, dom_strings)
            if i == 0:
                root = node
            if i in nodeIndex_flipped:
                bounds = dom_layout["bounds"][nodeIndex_flipped[i]]
                bounds = [int(b / device_pixel_ratio) for b in bounds]
                node.bounds = bounds
                node.center = (
                    int(bounds[0] + bounds[2] / 2),
                    int(bounds[1] + bounds[3] / 2),
                )
            node.isClickable = i in isClickable_set
            node.inputChecked = i in inputChecked_set
            node.optionSelected = i in optionSelected_set
            if i in inputValue_map:
                v = inputValue_map[i]
                node.inputValue = dom_strings[v] if v >= 0 else ""
            nodes.append(node)

        # Switch node ids to node pointers.
        for node in nodes:
            if node.parentId is not None:
                node.parent = nodes[node.parentId]
                node.parent.children.append(node)

        count = 0
        input_elements = {}
        clickable_elements = {}

        def find_interactive_elements(node):
            """Record ``node`` if it is interactive, then visit its children."""
            nonlocal count
            # A whitelisted tag with a layout box is clickable when the
            # snapshot flags it, when it is a <button>, or when it carries an
            # `onclick` attribute.
            clickable = (
                node.nodeName in CLICKABLE_ELEMENTS
                and node.center is not None
                and (
                    node.isClickable
                    or node.nodeName == "button"
                    or "onclick" in node.attributes
                )
            )
            inputable = node.nodeName in INPUT_ELEMENTS or node.inputValue is not None
            # Special case: select/option are reported even when not visible.
            select_or_option = node.nodeName in {"select", "option"}
            visible = node.on_screen(
                root.bounds
            ) and "visibility: hidden" not in node.attributes.get("style", "")
            # Parenthesised to make the `and`-before-`or` precedence explicit.
            if (visible and (clickable or inputable)) or select_or_option:
                if clickable:
                    clickable_elements[count] = node
                if inputable or select_or_option:
                    input_elements[count] = node
                node.llm_id = count
                count += 1
            for child in node.children:
                find_interactive_elements(child)

        find_interactive_elements(root)
        return input_elements, clickable_elements
# Crawl the Google sign-in page and print every interactive node found.
bot = Globot()
bot.go_to_page(
    "https://accounts.google.com/v3/signin/identifier?authuser=0&continue=https%3A%2F%2Fwww.google.com%2F&ec=GAlAmgQ&hl=en&flowName=GlifWebSignIn&flowEntry=AddSession&dsh=S1040273122%3A1718390580872851&ddm=0"
)
inputs, clickables = bot.crawl()
parts = []
# Iteration order of a set union is not guaranteed; sorting the ids makes the
# report deterministic and follows the order in which crawl() numbered them.
for i in sorted(inputs.keys() | clickables.keys()):
    inputable = i in inputs
    clickable = i in clickables
    # Same precedence as before: the clickable entry wins when both exist.
    node = clickables[i] if clickable else inputs[i]
    parts.append(f"<node id={i} clickable={clickable} inputable={inputable}>\n")
    parts.append(node.__repr__(indent=2))
    parts.append("\n</node>\n")
s = "".join(parts)
html_description = s
pprint(html_description)
输出:
('<node id=0 clickable=False inputable=True>\n'
' <input type="email" aria-label="Email or phone" '
'value="2D7AB92D588040EBA91955F62E1BEE47">\n'
'</node>\n'
'<node id=1 clickable=True inputable=False>\n'
' <button type="button">\n'
' <::before></::before>\n'
' Forgot email?\n'
' </button>\n'
'</node>\n'
'<node id=2 clickable=True inputable=False>\n'
' <a href="https://support.google.com/chrome/answer/6130773?hl=en">\n'
' <::before></::before>\n'
' Learn more about using Guest mode\n'
' </a>\n'
'</node>\n'
'<node id=3 clickable=True inputable=False>\n'
' <button type="button">\n'
' <div>\n'
' <::before></::before>\n'
' <::after></::after>\n'
' </div>\n'
' <div></div>\n'
' <div></div>\n'
' <span>Next</span>\n'
' </button>\n'
'</node>\n'
'<node id=4 clickable=True inputable=False>\n'
' <button type="button">\n'
' <div>\n'
' <::before></::before>\n'
' <::after></::after>\n'
' </div>\n'
' <div></div>\n'
' <div></div>\n'
' <span>Create account</span>\n'
' </button>\n'
'</node>\n'
'<node id=5 clickable=True inputable=False>\n'
' <a href="https://support.google.com/accounts?hl=en&p=account_iph">\n'
' <::before></::before>\n'
' Help\n'
' </a>\n'
'</node>\n'
'<node id=6 clickable=True inputable=False>\n'
' <a href="https://accounts.google.com/TOS?loc=GB&hl=en&privacy=true">\n'
' <::before></::before>\n'
' Privacy\n'
' </a>\n'
'</node>\n'
'<node id=7 clickable=True inputable=False>\n'
' <a href="https://accounts.google.com/TOS?loc=GB&hl=en">\n'
' <::before></::before>\n'
' Terms\n'
' </a>\n'
'</node>\n')
可以看到,提取出的可点击元素中已经包含 “Next” 按钮,以及上面被正确检测到的其他按钮。