我只想保留通过 scrapy_playwright 的 playwright_page_event_handlers 捕获到的响应中类型为 xhr 的那些。检查 jsonlines 文件后，我发现它并没有成功限制为仅 xhr。
我知道我可以在写入文件之前进行过滤,但是我想节省获取这些资源所需的时间,而不是之后过滤所有内容。
如何将资源类型限制为仅 xhr？
这是我尝试过的:
from playwright.async_api import Response as PlaywrightResponse, BrowserContext
from scrapy_playwright.page import PageCoroutine
from scrapy import Spider, Request
import jsonlines
class EventSpider(Spider):
    """Spider that loads the infinite-scroll quotes page through Playwright.

    Every Playwright response observed on the page is appended to a JSON
    Lines file, keyed by the resource type of the request that produced it.
    """

    name = "event"

    def start_requests(self):
        """Yield the single Playwright-driven request for the scroll page."""
        yield Request(
            url="http://quotes.toscrape.com/scroll",
            cookies={"foo": "bar", "asdf": "qwerty"},
            meta={
                "playwright": True,
                "playwright_page_coroutines": [
                    PageCoroutine("wait_for_selector", "div.quote"),
                    PageCoroutine(
                        "evaluate",
                        "window.scrollBy(0, document.body.scrollHeight)",
                    ),
                    # 10 quotes per page, so an 11th child means page 2 loaded.
                    PageCoroutine(
                        "wait_for_selector", "div.quote:nth-child(11)"
                    ),
                ],
                "playwright_page_event_handlers": {
                    "response": "handle_response",
                    # NOTE(review): "context" does not look like a Playwright
                    # page event -- confirm this handler is ever invoked.
                    "context": self.configure_context,
                },
            },
        )

    async def configure_context(self, context: BrowserContext) -> None:
        """Install a route handler for ``/api/**`` requests on *context*.

        Requests whose POST body contains ``"quotes"`` are fulfilled with an
        empty response; every other matching request continues unchanged.
        """

        async def handle_route(route):
            # post_data is None for requests without a body; treat that as
            # "no match" instead of raising TypeError on the `in` test.
            post_data = route.request.post_data or ""
            # In the async API these calls return coroutines and must be
            # awaited, otherwise the route is never resolved.
            if "quotes" in post_data:
                await route.fulfill()
            else:
                await route.continue_()

        await context.route("/api/**", handle_route)

    async def handle_response(self, response: PlaywrightResponse) -> None:
        """Append ``{resource_type: [url]}`` for *response* to ``test.jl``."""
        jl_file = "test.jl"
        data = {response.request.resource_type: [response.request.url]}
        with jsonlines.open(jl_file, mode='a') as writer:
            writer.write(data)

    def parse(self, response):
        """Return an item holding the URL of the rendered page."""
        return {"url": response.url}
产生以下输出:
{"document": ["http://quotes.toscrape.com/scroll"]}
{"stylesheet": ["http://quotes.toscrape.com/static/bootstrap.min.css"]}
{"stylesheet": ["http://quotes.toscrape.com/static/main.css"]}
{"script": ["http://quotes.toscrape.com/static/jquery.js"]}
{"stylesheet": ["https://fonts.googleapis.com/css?family=Raleway:400,700"]}
{"font": ["https://fonts.gstatic.com/s/raleway/v26/1Ptug8zYS_SKggPNyC0IT4ttDfA.woff2"]}
{"xhr": ["http://quotes.toscrape.com/api/quotes?page=1"]}
{"xhr": ["http://quotes.toscrape.com/api/quotes?page=2"]}
{"xhr": ["http://quotes.toscrape.com/api/quotes?page=3"]}
预期输出:
{"xhr": ["http://quotes.toscrape.com/api/quotes?page=1"]}
{"xhr": ["http://quotes.toscrape.com/api/quotes?page=2"]}
{"xhr": ["http://quotes.toscrape.com/api/quotes?page=3"]}
针对你的情况，可以定义如下函数：
def should_abort_request(request):
    """Return True when *request* should be aborted, i.e. it is not an XHR.

    The previous expression ``resource_type != "xhr" in request.url`` was a
    chained comparison (``resource_type != "xhr" and "xhr" in request.url``),
    so it only aborted non-XHR requests whose URL happened to contain the
    text "xhr".

    Args:
        request: Object exposing a ``resource_type`` string attribute.

    Returns:
        bool: True for every resource type other than ``"xhr"``.
    """
    # NOTE(review): this also aborts "document" and "script" requests; if the
    # page needs them to issue its XHRs, allow those types too -- confirm.
    return request.resource_type != "xhr"
并在 Scrapy 设置中应用该函数：
# Settings entry: use should_abort_request as the PLAYWRIGHT_ABORT_REQUEST predicate.
PLAYWRIGHT_ABORT_REQUEST = should_abort_request