如何通过playwright提取google的按钮元素?

问题描述 投票:0回答:1

我有一个代码片段,可以通过Python中的Playwright从网页的DOM树中提取可输入和可点击的节点元素(即交互式元素)。

这段代码几乎可以正常工作,但在某些情况下会错过一些元素,例如谷歌的按钮!事实上,这个按钮被这段代码标记为不可点击。有人可以找出这段代码的问题吗?

这是代码:

from playwright.sync_api import sync_playwright

# Tags that never get a closing tag when a childless node is rendered.
VOID_ELEMENTS = {
    "area", "base", "br", "col", "embed", "hr", "img",
    "input", "link", "meta", "param", "source", "track", "wbr",
}

# Attributes that are kept when a node is rendered as text; every other
# attribute is left out of the rendered tag.
READABLE_ATTRIBUTES = {
    "title", "alt", "href", "placeholder", "label",
    "value", "caption", "summary",
    "aria-label", "aria-describedby",
    "datetime", "download", "selected", "checked", "type",
}

# NOTE(review): not referenced anywhere in the visible code.
UNCLICKABLE_ELEMENTS = {"html", "head", "body"}

# Tag names that may be reported as clickable.
CLICKABLE_ELEMENTS = {"a", "button", "img", "details", "summary"}

# Tag names that are always reported as inputs.
INPUT_ELEMENTS = {"input", "textarea", "select", "option"}


class DOMNode:
    """A single node decoded from a CDP ``DOMSnapshot.captureSnapshot`` result.

    The constructor only decodes the per-node columns of the snapshot. The
    tree links (``parent`` / ``children``) and the optional per-node values
    (``bounds``, ``center``, ``isClickable``, ...) start out empty and are
    left for the caller to fill in.
    """

    def __init__(self, i, nodes, strings):
        """Decode node number *i* from the snapshot's column arrays.

        Args:
            i: Position of this node in each per-node array of ``nodes``.
            nodes: Mapping of column name to per-node array. The columns read
                here are ``parentIndex``, ``nodeType``, ``nodeName``,
                ``nodeValue``, ``backendNodeId`` and ``attributes``.
            strings: String table that ``nodeName``, ``nodeValue`` and the
                attribute name/value entries index into.
        """
        self._on_screen = None
        self.parent = None
        self.children = []
        self.llm_id = None
        # Only some nodes carry the values below. None (rather than False)
        # means "nothing recorded for this node".
        self.bounds = None
        self.center = None
        self.inputValue = None
        self.inputChecked = None
        self.isClickable = None
        self.optionSelected = None

        parent_index = nodes["parentIndex"][i]
        self.parentId = parent_index if parent_index >= 0 else None
        # The snapshot's nodeType column holds the DOM node-type integer
        # itself, not an index into the string table, so store it as-is.
        self.nodeType = nodes["nodeType"][i]
        self.nodeName = strings[nodes["nodeName"][i]].lower()
        value_index = nodes["nodeValue"][i]
        self.nodeValue = strings[value_index].strip() if value_index >= 0 else None
        self.backendNodeId = nodes["backendNodeId"][i]

        # Attributes arrive as a flat [name, value, name, value, ...] list of
        # string-table indices.
        attrs = nodes["attributes"][i]
        self.attributes = {
            strings[name]: strings[value][:100]  # cut off long URLs
            for name, value in zip(attrs[::2], attrs[1::2])
        }

        self.readable_attributes = {
            k: v for k, v in self.attributes.items() if k in READABLE_ATTRIBUTES
        }

    def __repr__(self, indent=0) -> str:
        """Render the node and its subtree as indented pseudo-HTML.

        Args:
            indent: Number of spaces put in front of this node's lines;
                children are rendered two spaces deeper.

        Returns:
            The rendered text. Only ``readable_attributes`` appear in tags.
        """
        pad = " " * indent
        if self.nodeName == "#text":
            return pad + (self.nodeValue or "")

        attr_str = " ".join(f'{k}="{v}"' for k, v in self.readable_attributes.items())
        if attr_str:
            attr_str = " " + attr_str
        open_tag = f"<{self.nodeName}{attr_str}>"
        close_tag = f"</{self.nodeName}>"

        if not self.children:
            # Void elements are written without a closing tag.
            return pad + open_tag + (
                "" if self.nodeName in VOID_ELEMENTS else close_tag
            )

        # An element whose only child is a text node fits on one line.
        if len(self.children) == 1 and self.children[0].nodeName == "#text":
            return pad + open_tag + self.children[0].__repr__() + close_tag

        children_repr = "\n".join(
            child.__repr__(indent + 2) for child in self.children
        )
        return f"{pad}{open_tag}\n{children_repr}\n{pad}{close_tag}"

    def on_screen(self, screen_bounds):
        """Tell whether this node overlaps the given viewport rectangle.

        A node with children is on screen when any of its children is; its
        own bounds are not consulted. A leaf is on screen when it has a
        four-element, non-zero-area ``bounds`` that intersects the viewport.

        Args:
            screen_bounds: Viewport as ``[x, y, width, height]``, the same
                layout as ``self.bounds``.

        Returns:
            True if the node (or a descendant) intersects the viewport.
        """
        if self.children:
            return any(child.on_screen(screen_bounds) for child in self.children)

        if (
            self.bounds is None
            or len(self.bounds) != 4
            or self.bounds[2] * self.bounds[3] == 0
        ):
            return False

        x, y, w, h = self.bounds
        # x comes first, then y: the left edge is element 0 and the upper edge
        # is element 1. Swapping them breaks the test for any viewport whose
        # origin is not (0, 0).
        win_left_bound, win_upper_bound, win_width, win_height = screen_bounds
        win_right_bound = win_left_bound + win_width
        win_lower_bound = win_upper_bound + win_height
        return (
            x < win_right_bound
            and x + w > win_left_bound
            and y < win_lower_bound
            and y + h > win_upper_bound
        )


class Globot:
    """Drives one Chromium page through Playwright and inspects its DOM."""

    def __init__(self, headless=False):
        """Start Playwright, launch Chromium and open a single page.

        Args:
            headless: Passed straight to ``chromium.launch``.
        """
        pw = sync_playwright().start()
        chromium = pw.chromium
        self.browser = chromium.launch(headless=headless)
        self.context = self.browser.new_context()
        self.page = self.context.new_page()

    def go_to_page(self, url):
        """Navigate to *url* and attach a CDP session to the page.

        ``https://`` is prepended when *url* contains no ``://``.
        """
        if "://" not in url:
            url = "https://" + url
        self.page.goto(url=url)
        self.client = self.page.context.new_cdp_session(self.page)
        self.page.wait_for_load_state("domcontentloaded")

    def crawl(self) -> tuple[dict[int, DOMNode], dict[int, DOMNode]]:
        """Snapshot the current page and collect its interactive nodes.

        Returns:
            ``(input_elements, clickable_elements)``. Both dicts are keyed by
            an id handed out during a pre-order walk of the node tree; a node
            that qualifies for both dicts appears in each under the same id.
        """
        snapshot = self.client.send(
            "DOMSnapshot.captureSnapshot",
            {"computedStyles": [], "includeDOMRects": True, "includePaintOrder": True},
        )

        string_table = snapshot["strings"]
        first_document = snapshot["documents"][0]
        layout = first_document["layout"]
        node_table = first_document["nodes"]

        # For some reason `window.devicePixelRatio` gives the wrong answer
        # sometimes, so the ratio is derived from the first layout box and the
        # screen width instead.
        first_box_width = layout["bounds"][0][2]
        pixel_ratio = first_box_width / self.page.evaluate("window.screen.width")

        # Reverse lookups (node index -> row); scanning the lists for every
        # node takes much longer.
        layout_row_of = {}
        for row, node_index in enumerate(layout["nodeIndex"]):
            layout_row_of[node_index] = row
        value_row_of = {}
        for row, node_index in enumerate(node_table["inputValue"]["index"]):
            value_row_of[node_index] = row

        all_nodes = []
        for idx in range(len(node_table["parentIndex"])):
            current = DOMNode(idx, node_table, string_table)

            layout_row = layout_row_of.get(idx)
            if layout_row is not None:
                box = [int(edge / pixel_ratio) for edge in layout["bounds"][layout_row]]
                current.bounds = box
                current.center = (
                    int(box[0] + box[2] / 2),
                    int(box[1] + box[3] / 2),
                )

            # NOTE(review): isClickable is only ever set to True here; for any
            # node the snapshot does not list it stays None.
            if idx in node_table["isClickable"]["index"]:
                current.isClickable = True

            value_row = value_row_of.get(idx)
            if value_row is not None:
                string_index = node_table["inputValue"]["value"][value_row]
                current.inputValue = (
                    string_table[string_index] if string_index >= 0 else ""
                )

            if idx in node_table["inputChecked"]["index"]:
                current.inputChecked = True

            if idx in node_table["optionSelected"]["index"]:
                current.optionSelected = True

            all_nodes.append(current)

        # The first decoded node is the tree root.
        root = all_nodes[0] if all_nodes else None

        # Turn the numeric parent ids into object links.
        for current in all_nodes:
            if current.parentId is None:
                continue
            current.parent = all_nodes[current.parentId]
            current.parent.children.append(current)

        next_id = 0
        input_elements = {}
        clickable_elements = {}

        def visit(node):
            """Register *node* if it is interactive, then walk its children."""
            nonlocal next_id
            # Because isClickable may be None, this expression can evaluate to
            # None instead of False.
            clickable = (
                node.nodeName in CLICKABLE_ELEMENTS
                and node.isClickable
                and node.center is not None
            )
            inputable = node.inputValue is not None or node.nodeName in INPUT_ELEMENTS

            # select/option nodes are always registered, visible or not.
            select_or_option = node.nodeName in ("select", "option")
            visible = node.on_screen(
                root.bounds
            ) and "visibility: hidden" not in node.attributes.get("style", "")

            # Debug trace for every <button> encountered.
            if node.nodeName == "button":
                print(f"Node: {node.nodeName}")
                print(f"  Attributes: {node.attributes}")
                print(f"  Bounds: {node.bounds}")
                print(f"  Clickable: {clickable}")
                print(f"  Inputable: {inputable}")
                print(f"  Visible: {visible}")
                print(f"  Center: {node.center}")

            if (visible and (clickable or inputable)) or select_or_option:
                if clickable:
                    clickable_elements[next_id] = node
                if inputable or select_or_option:
                    input_elements[next_id] = node
                node.llm_id = next_id
                next_id += 1

            for child in node.children:
                visit(child)

        visit(root)

        return input_elements, clickable_elements

用于重现问题的代码片段(此处的 Next 按钮没有被识别为可点击元素):

from pprint import pprint

bot = Globot()
bot.go_to_page(
    "https://accounts.google.com/v3/signin/identifier?authuser=0&continue=https%3A%2F%2Fwww.google.com%2F&ec=GAlAmgQ&hl=en&flowName=GlifWebSignIn&flowEntry=AddSession&dsh=S1040273122%3A1718390580872851&ddm=0"
)
inputs, clickables = bot.crawl()

# A set union has no guaranteed iteration order; sort the ids so the report
# always lists the nodes in ascending id order.
chunks = []
for i in sorted(inputs.keys() | clickables.keys()):
    inputable = i in inputs
    clickable = i in clickables
    # When an id is present in both dicts, the clickable entry is used.
    node = clickables[i] if clickable else inputs[i]

    chunks.append(f"<node id={i} clickable={clickable} inputable={inputable}>\n")
    # Called directly because this __repr__ takes an extra indent argument.
    chunks.append(node.__repr__(indent=2))
    chunks.append("\n</node>\n")
s = "".join(chunks)
html_description = s
pprint(html_description)

下面是日志中有关 Next 元素的部分——正如您所看到的,`Clickable` 的值为 `None`:

Node: button
  Attributes: {'class': 'VfPpkd-LgbsSe VfPpkd-LgbsSe-OWXEXe-k8QpJ VfPpkd-LgbsSe-OWXEXe-dgl2Hf nCP5yc AjY5Oe DuMIQc LQeN7 BqKG', 'jscontroller': 'soHxf', 'jsaction': 'click:cOuCgd; mousedown:UX7yZ; mouseup:lbsD7e; mouseenter:tfO1Yc; mouseleave:JywGue; touchstart:p6p2', 'data-idom-class': 'nCP5yc AjY5Oe DuMIQc LQeN7 BqKGqe Jskylb TrZEUc lw1w4b', 'jsname': 'LgbsSe', 'type': 'button'}
  Bounds: [965, 453, 78, 40]
  Clickable: None
  Inputable: False
  Visible: True
  Center: (1004, 473)

这是 Next 按钮的 RAW HTML:

<button class="VfPpkd-LgbsSe VfPpkd-LgbsSe-OWXEXe-k8QpJ VfPpkd-LgbsSe-OWXEXe-dgl2Hf nCP5yc AjY5Oe DuMIQc LQeN7 BqKGqe Jskylb TrZEUc lw1w4b" jscontroller="soHxf" jsaction="click:cOuCgd; mousedown:UX7yZ; mouseup:lbsD7e; mouseenter:tfO1Yc; mouseleave:JywGue; touchstart:p6p2H; touchmove:FwuNnf; touchend:yfqBxc; touchcancel:JMtRjd; focus:AHmuwe; blur:O22p3e; contextmenu:mg9Pef;mlnRJb:fLiPzd;" data-idom-class="nCP5yc AjY5Oe DuMIQc LQeN7 BqKGqe Jskylb TrZEUc lw1w4b" jsname="LgbsSe" type="button"><div class="VfPpkd-Jh9lGc"></div><div class="VfPpkd-J1Ukfc-LhBDec"></div><div class="VfPpkd-RLmnJb"></div><span jsname="V67aGc" class="VfPpkd-vQzf8d">Next</span></button>

这是相应页面的屏幕截图:

enter image description here

如果代码太长,我深表歉意,并感谢您提前提供帮助。

python web-scraping web-crawler playwright playwright-python
1个回答
0
投票

我最终得到了以下代码:使用 `set` 来优化查找速度,并新增了一个判断条件——如果节点带有 `onclick` 属性,或者节点本身是 `button` 元素,就将其视为可点击。

以下是改进后的版本:

from playwright.sync_api import sync_playwright
from pprint import pprint

# Tags that never get a closing tag when a childless node is rendered.
VOID_ELEMENTS = {
    "area", "base", "br", "col", "embed", "hr", "img",
    "input", "link", "meta", "param", "source", "track", "wbr",
}

# Attributes that are kept when a node is rendered as text; every other
# attribute is left out of the rendered tag.
READABLE_ATTRIBUTES = {
    "title", "alt", "href", "placeholder", "label",
    "value", "caption", "summary",
    "aria-label", "aria-describedby",
    "datetime", "download", "selected", "checked", "type",
}

# NOTE(review): not referenced anywhere in the visible code.
UNCLICKABLE_ELEMENTS = {"html", "head", "body"}

# Tag names that may be reported as clickable.
CLICKABLE_ELEMENTS = {"a", "button", "img", "details", "summary", "ul", "li"}

# Tag names that are always reported as inputs.
INPUT_ELEMENTS = {"input", "textarea", "select", "option"}


class DOMNode:
    """A single node decoded from a CDP ``DOMSnapshot.captureSnapshot`` result.

    The constructor only decodes the per-node columns of the snapshot. The
    tree links (``parent`` / ``children``) and the optional per-node values
    (``bounds``, ``center``, ``isClickable``, ...) start out empty and are
    left for the caller to fill in.
    """

    def __init__(self, i, nodes, strings):
        """Decode node number *i* from the snapshot's column arrays.

        Args:
            i: Position of this node in each per-node array of ``nodes``.
            nodes: Mapping of column name to per-node array. The columns read
                here are ``parentIndex``, ``nodeType``, ``nodeName``,
                ``nodeValue``, ``backendNodeId`` and ``attributes``.
            strings: String table that ``nodeName``, ``nodeValue`` and the
                attribute name/value entries index into.
        """
        self._on_screen = None
        self.parent = None
        self.children = []
        self.llm_id = None
        # Only some nodes carry the values below. None (rather than False)
        # means "nothing recorded for this node".
        self.bounds = None
        self.center = None
        self.inputValue = None
        self.inputChecked = None
        self.isClickable = None
        self.optionSelected = None

        parent_index = nodes["parentIndex"][i]
        self.parentId = parent_index if parent_index >= 0 else None
        # The snapshot's nodeType column holds the DOM node-type integer
        # itself, not an index into the string table, so store it as-is.
        self.nodeType = nodes["nodeType"][i]
        self.nodeName = strings[nodes["nodeName"][i]].lower()
        value_index = nodes["nodeValue"][i]
        self.nodeValue = strings[value_index].strip() if value_index >= 0 else None
        self.backendNodeId = nodes["backendNodeId"][i]

        # Attributes arrive as a flat [name, value, name, value, ...] list of
        # string-table indices.
        attrs = nodes["attributes"][i]
        self.attributes = {
            strings[name]: strings[value][:100]  # cut off long URLs
            for name, value in zip(attrs[::2], attrs[1::2])
        }

        self.readable_attributes = {
            k: v for k, v in self.attributes.items() if k in READABLE_ATTRIBUTES
        }

    def __repr__(self, indent=0) -> str:
        """Render the node and its subtree as indented pseudo-HTML.

        Args:
            indent: Number of spaces put in front of this node's lines;
                children are rendered two spaces deeper.

        Returns:
            The rendered text. Only ``readable_attributes`` appear in tags.
        """
        pad = " " * indent
        if self.nodeName == "#text":
            return pad + (self.nodeValue or "")

        attr_str = " ".join(f'{k}="{v}"' for k, v in self.readable_attributes.items())
        if attr_str:
            attr_str = " " + attr_str
        open_tag = f"<{self.nodeName}{attr_str}>"
        close_tag = f"</{self.nodeName}>"

        if not self.children:
            # Void elements are written without a closing tag.
            return pad + open_tag + (
                "" if self.nodeName in VOID_ELEMENTS else close_tag
            )

        # An element whose only child is a text node fits on one line.
        if len(self.children) == 1 and self.children[0].nodeName == "#text":
            return pad + open_tag + self.children[0].__repr__() + close_tag

        children_repr = "\n".join(
            child.__repr__(indent + 2) for child in self.children
        )
        return f"{pad}{open_tag}\n{children_repr}\n{pad}{close_tag}"

    def on_screen(self, screen_bounds):
        """Tell whether this node overlaps the given viewport rectangle.

        A node with children is on screen when any of its children is; its
        own bounds are not consulted. A leaf is on screen when it has a
        four-element, non-zero-area ``bounds`` that intersects the viewport.

        Args:
            screen_bounds: Viewport as ``[x, y, width, height]``, the same
                layout as ``self.bounds``.

        Returns:
            True if the node (or a descendant) intersects the viewport.
        """
        if self.children:
            return any(child.on_screen(screen_bounds) for child in self.children)

        if (
            self.bounds is None
            or len(self.bounds) != 4
            or self.bounds[2] * self.bounds[3] == 0
        ):
            return False

        x, y, w, h = self.bounds
        # x comes first, then y: the left edge is element 0 and the upper edge
        # is element 1. Swapping them breaks the test for any viewport whose
        # origin is not (0, 0).
        win_left_bound, win_upper_bound, win_width, win_height = screen_bounds
        win_right_bound = win_left_bound + win_width
        win_lower_bound = win_upper_bound + win_height
        return (
            x < win_right_bound
            and x + w > win_left_bound
            and y < win_lower_bound
            and y + h > win_upper_bound
        )


class Globot:
    """Drives one Chromium page through Playwright and lists its interactive nodes.

    Call :meth:`close` when finished so that the browser and the Playwright
    driver started in ``__init__`` are shut down.
    """

    def __init__(self, headless=False):
        """Start Playwright, launch Chromium and open a single page.

        Args:
            headless: Passed straight to ``chromium.launch``.
        """
        # Keep the handle: without it the driver started here could never be
        # stopped again.
        self.playwright = sync_playwright().start()
        self.browser = self.playwright.chromium.launch(headless=headless)
        self.context = self.browser.new_context()
        self.page = self.context.new_page()
        # Set by go_to_page(); None means no page has been opened yet.
        self.client = None

    def close(self):
        """Close the browser and stop the Playwright driver."""
        try:
            self.browser.close()
        finally:
            # Stop the driver even if closing the browser failed.
            self.playwright.stop()

    def go_to_page(self, url):
        """Navigate to *url* and attach a CDP session to the page.

        ``https://`` is prepended when *url* contains no ``://``.
        """
        self.page.goto(url=url if "://" in url else "https://" + url)
        self.client = self.page.context.new_cdp_session(self.page)
        self.page.wait_for_load_state("domcontentloaded")

    def crawl(self) -> tuple[dict[int, DOMNode], dict[int, DOMNode]]:
        """Snapshot the current page and collect its interactive nodes.

        Returns:
            ``(input_elements, clickable_elements)``. Both dicts are keyed by
            an id handed out during a pre-order walk of the node tree; a node
            that qualifies for both dicts appears in each under the same id.

        Raises:
            RuntimeError: If called before :meth:`go_to_page`.
        """
        if self.client is None:
            raise RuntimeError("go_to_page() must be called before crawl()")

        dom = self.client.send(
            "DOMSnapshot.captureSnapshot",
            {"computedStyles": [], "includeDOMRects": True, "includePaintOrder": True},
        )

        dom_strings = dom["strings"]
        document = dom["documents"][0]
        dom_layout = document["layout"]
        dom_nodes = document["nodes"]

        screen_bounds = dom_layout["bounds"][0]
        # For some reason `window.devicePixelRatio` gives the wrong answer
        # sometimes, so the ratio is derived from the first layout box and the
        # screen width instead.
        device_pixel_ratio = screen_bounds[2] / self.page.evaluate(
            "window.screen.width"
        )

        nodes = []
        root = None

        # Sets and dicts give constant-time lookups inside the per-node loop.
        isClickable_set = set(dom_nodes.get("isClickable", {}).get("index", []))
        inputChecked_set = set(dom_nodes.get("inputChecked", {}).get("index", []))
        optionSelected_set = set(dom_nodes.get("optionSelected", {}).get("index", []))

        inputValue_map = dict(
            zip(
                dom_nodes.get("inputValue", {}).get("index", []),
                dom_nodes.get("inputValue", {}).get("value", []),
            )
        )

        # node index -> row in the layout table
        nodeIndex_flipped = {v: k for k, v in enumerate(dom_layout["nodeIndex"])}
        for i in range(len(dom_nodes["parentIndex"])):
            node = DOMNode(i, dom_nodes, dom_strings)
            if i == 0:
                root = node

            if i in nodeIndex_flipped:
                bounds = dom_layout["bounds"][nodeIndex_flipped[i]]
                bounds = [int(b / device_pixel_ratio) for b in bounds]
                node.bounds = bounds
                node.center = (
                    int(bounds[0] + bounds[2] / 2),
                    int(bounds[1] + bounds[3] / 2),
                )

            node.isClickable = i in isClickable_set
            node.inputChecked = i in inputChecked_set
            node.optionSelected = i in optionSelected_set

            if i in inputValue_map:
                v = inputValue_map[i]
                node.inputValue = dom_strings[v] if v >= 0 else ""

            nodes.append(node)

        # Switch node ids to node pointers
        for node in nodes:
            if node.parentId is not None:
                node.parent = nodes[node.parentId]
                node.parent.children.append(node)

        count = 0
        input_elements = {}
        clickable_elements = {}

        def find_interactive_elements(node):
            """Register *node* if it is interactive, then walk its children."""
            nonlocal count
            # A <button> counts as clickable even when the snapshot does not
            # flag it; so does any candidate tag with an onclick attribute.
            clickable = (
                node.nodeName in CLICKABLE_ELEMENTS
                and node.center is not None
                and (
                    node.isClickable
                    or node.nodeName == "button"
                    or "onclick" in node.attributes
                )
            )
            inputable = node.nodeName in INPUT_ELEMENTS or node.inputValue is not None

            # Special case: select and option elements are always registered,
            # visible or not.
            select_or_option = node.nodeName in {"select", "option"}
            # NOTE(review): only the exact text "visibility: hidden" in the
            # inline style is recognised; other spellings are not detected.
            visible = node.on_screen(
                root.bounds
            ) and "visibility: hidden" not in node.attributes.get("style", "")

            # Parentheses spell out the precedence `and`/`or` already had.
            if (visible and (clickable or inputable)) or select_or_option:
                if clickable:
                    clickable_elements[count] = node
                if inputable or select_or_option:
                    input_elements[count] = node
                node.llm_id = count
                count += 1

            for child in node.children:
                find_interactive_elements(child)

        find_interactive_elements(root)

        return input_elements, clickable_elements


bot = Globot()
bot.go_to_page(
    "https://accounts.google.com/v3/signin/identifier?authuser=0&continue=https%3A%2F%2Fwww.google.com%2F&ec=GAlAmgQ&hl=en&flowName=GlifWebSignIn&flowEntry=AddSession&dsh=S1040273122%3A1718390580872851&ddm=0"
)
inputs, clickables = bot.crawl()

# A set union has no guaranteed iteration order; sort the ids so the report
# always lists the nodes in ascending id order.
chunks = []
for i in sorted(inputs.keys() | clickables.keys()):
    inputable = i in inputs
    clickable = i in clickables
    # When an id is present in both dicts, the clickable entry is used.
    node = clickables[i] if clickable else inputs[i]

    chunks.append(f"<node id={i} clickable={clickable} inputable={inputable}>\n")
    # Called directly because this __repr__ takes an extra indent argument.
    chunks.append(node.__repr__(indent=2))
    chunks.append("\n</node>\n")
s = "".join(chunks)
html_description = s
pprint(html_description)

输出:

('<node id=0 clickable=False inputable=True>\n'
 '  <input type="email" aria-label="Email or phone" '
 'value="2D7AB92D588040EBA91955F62E1BEE47">\n'
 '</node>\n'
 '<node id=1 clickable=True inputable=False>\n'
 '  <button type="button">\n'
 '    <::before></::before>\n'
 '    Forgot email?\n'
 '  </button>\n'
 '</node>\n'
 '<node id=2 clickable=True inputable=False>\n'
 '  <a href="https://support.google.com/chrome/answer/6130773?hl=en">\n'
 '    <::before></::before>\n'
 '    Learn more about using Guest mode\n'
 '  </a>\n'
 '</node>\n'
 '<node id=3 clickable=True inputable=False>\n'
 '  <button type="button">\n'
 '    <div>\n'
 '      <::before></::before>\n'
 '      <::after></::after>\n'
 '    </div>\n'
 '    <div></div>\n'
 '    <div></div>\n'
 '    <span>Next</span>\n'
 '  </button>\n'
 '</node>\n'
 '<node id=4 clickable=True inputable=False>\n'
 '  <button type="button">\n'
 '    <div>\n'
 '      <::before></::before>\n'
 '      <::after></::after>\n'
 '    </div>\n'
 '    <div></div>\n'
 '    <div></div>\n'
 '    <span>Create account</span>\n'
 '  </button>\n'
 '</node>\n'
 '<node id=5 clickable=True inputable=False>\n'
 '  <a href="https://support.google.com/accounts?hl=en&p=account_iph">\n'
 '    <::before></::before>\n'
 '    Help\n'
 '  </a>\n'
 '</node>\n'
 '<node id=6 clickable=True inputable=False>\n'
 '  <a href="https://accounts.google.com/TOS?loc=GB&hl=en&privacy=true">\n'
 '    <::before></::before>\n'
 '    Privacy\n'
 '  </a>\n'
 '</node>\n'
 '<node id=7 clickable=True inputable=False>\n'
 '  <a href="https://accounts.google.com/TOS?loc=GB&hl=en">\n'
 '    <::before></::before>\n'
 '    Terms\n'
 '  </a>\n'
 '</node>\n')

可以看到,上面的输出已经正确检测到了“Next”按钮以及其他可点击元素。

© www.soinside.com 2019 - 2024. All rights reserved.