from playwright.sync_api import Playwright, sync_playwright
from playwright.sync_api import Error as PlaywrightError

# Open the eBay tablet deals page in Chromium, keep clicking the "load more"
# button until it can no longer be clicked, then print the image URL and two
# text fields of every item tile, followed by the number of tiles found.
with sync_playwright() as playwright:
    chromium = playwright.chromium
    browser = chromium.launch()
    context = browser.new_context()
    page = context.new_page()
    page.goto("https://www.ebay.com/deals/tech/ipads-tablets-ereaders")

    button = page.locator("button.load-more-btn.btn.btn--secondary")
    # A Locator object is always truthy, so it cannot be used as the loop
    # condition. The loop ends when Playwright reports that the button could
    # not be scrolled to or clicked (for example, its actionability wait
    # timed out because the button is gone).
    while True:
        try:
            button.scroll_into_view_if_needed()
            button.click()
        except PlaywrightError:
            # Only Playwright failures end the loop; anything else (such as
            # KeyboardInterrupt) propagates instead of being swallowed.
            break

    items = page.locator("div.dne-itemtile.dne-itemtile-large").all()
    for item in items:
        print(item.locator("img").get_attribute("src"))
        print(item.locator("span.first").text_content())
        print(item.locator("span.ebayui-ellipsis-2").text_content())
        print()
    print(len(items), "items")

    browser.close()
我正在尝试抓取 eBay 的优惠(Deals)商品。
在我的 try 块中,如果设置
headless = False
,我可以看到浏览器不断点击“加载更多”按钮,直到按钮不再出现;但代码并没有抓取到所有商品,最多似乎只抓取到前 4 页。
eBay 上的优惠商品可能超过 800 件,但我只能抓取到前 96 件。
简而言之,当您点击按钮(或向下滚动)时,浏览器会向服务器发送请求(可以在开发者工具中查看)来获取优惠商品。因此,您只需使用 requests 库就可以获取这些商品,而无需使用 Selenium。
示例:
import time
import json
import requests
from bs4 import BeautifulSoup
# Endpoint that is requested with GET for each page of deal listings.
LISTINGS_URL = "https://www.ebay.com/deals/spoke/ajax/listings"
# Seconds added to the current Unix time when building the dp1 cookie value.
# 63072000 == 2 * 365 * 24 * 60 * 60 (two 365-day years).
# NOTE(review): the name suggests a timezone adjustment — confirm what value
# the site actually expects.
TIMEZONE_OFFSET = 63072000
def get_dp1(now=None, offset=None):
    """Build the value for the ``dp1`` cookie.

    The value has the form ``bbl/DE<hex>^``, where ``<hex>`` is the lowercase
    hexadecimal rendering of ``int(now) + offset``.

    Args:
        now: Unix timestamp in seconds. Defaults to ``time.time()``.
        offset: Whole seconds added to the timestamp. Defaults to the
            module-level ``TIMEZONE_OFFSET``.

    Returns:
        The cookie value as a string.
    """
    if now is None:
        now = time.time()
    if offset is None:
        # Looked up at call time so the module constant can be adjusted.
        offset = TIMEZONE_OFFSET
    stamp = int(now) + offset
    return f"bbl/DE{stamp:x}^"
def parse_deals(content):
    """Extract deal entries from a listings HTML fragment.

    Every ``div`` carrying a ``data-listing-id`` attribute is treated as one
    deal tile.

    Args:
        content: HTML markup to parse.

    Returns:
        A list of dicts with the keys ``"title"``, ``"price"`` and
        ``"image"``. A value is ``None`` when the tile lacks the
        corresponding element (or the image has no ``src`` attribute).
    """
    soup = BeautifulSoup(content, "lxml")
    deals = []
    for tile in soup.select("div[data-listing-id]"):
        # select_one() returns None when nothing matches, so guard each
        # lookup instead of letting one incomplete tile abort the whole parse.
        image_el = tile.select_one("img")
        price_el = tile.select_one("span.first")
        title_el = tile.select_one("span.ebayui-ellipsis-2")
        deals.append(
            {
                "title": title_el.text if title_el is not None else None,
                "price": price_el.text if price_el is not None else None,
                "image": image_el.get("src") if image_el is not None else None,
            }
        )
    return deals
# Page through the listings endpoint, collecting the parsed deals of every
# response, then write the collected deals to data.json.
items = []
with requests.Session() as session:
    session.cookies.set("dp1", get_dp1())
    params = {"_ofs": 0, "category_path_seo": "tech,ipads-tablets-ereaders"}
    # Each response supplies the query parameters for the next page; the loop
    # stops once a response carries no pagination parameters.
    while params:
        print(f"Total: {len(items):<5} | Offset: {params['_ofs']}")
        # A timeout keeps a stalled connection from hanging the script.
        response = session.get(LISTINGS_URL, params=params, timeout=30)
        response.raise_for_status()
        data = response.json().get("fulfillmentValue", {})
        # Parse the listings BEFORE looking at pagination, so the listings of
        # a final response (one without next-page parameters) are not dropped.
        listings_html = data.get("listingsHtml")
        if listings_html:
            items.extend(parse_deals(listings_html))
        params = data.get("pagination", {}).get("params")

# ensure_ascii=False writes non-ASCII characters verbatim, so the file
# encoding is fixed to UTF-8 rather than left to the platform default.
with open("data.json", "w", encoding="utf-8") as f:
    json.dump(items, f, ensure_ascii=False, indent=2)
输出:
[
{
"title": "Samsung Galaxy Tab A9+ 11.0\" 64GB Gray Wi-Fi Tablet Bundle SM-X210NZAYXAR 2023",
"price": "$139.99",
"image": "https://i.ebayimg.com/images/g/qbUAAOSw1o1l1Rtt/s-l300.jpg"
},
...
]
如前所述,要获取优惠商品,需要向服务器发送一个带有必需 cookie
dp1
的 GET 请求,该 cookie 的值基于当前 Unix 时间(例如,bbl/DE6a9839a1^
)。其中,bbl/DE
和 ^
是常量(据我理解),它们之间是十六进制格式的 Unix 时间(当前时间加上偏移量)。
您可能需要调整 Unix 时间偏移量,因为当您访问该站点时,它会按照其自身的时区来设置
dp1
cookie 的值。请求发送之后,服务器会返回一个 JSON 对象,其中包含抓取所需的所有信息。