我正在尝试使用 scrapy 和 scrapy-playwright 访问网页,但是,我不断收到“请启用 JS 并禁用任何广告拦截器”消息以及超时错误。我尝试了各种解决方案,但似乎没有一个有效......
import scrapy
from scrapy.selector import Selector
from scrapy_playwright.page import PageMethod
class WsjNewsJSSpider(scrapy.Spider):
    """Spider that requests the WSJ front page through scrapy-playwright.

    Every start URL is downloaded via the Playwright download handler so
    that the response handed to ``parse`` is the browser-rendered page.
    """

    name = "wsj_newsJS_BACKUP"
    start_urls = ["https://www.wsj.com"]

    # Per-spider settings: route both URL schemes through the Playwright
    # handler and run on the asyncio-based Twisted reactor.
    custom_settings = {
        "DOWNLOAD_HANDLERS": {
            "http": "scrapy_playwright.handler.ScrapyPlaywrightDownloadHandler",
            "https": "scrapy_playwright.handler.ScrapyPlaywrightDownloadHandler",
        },
        "TWISTED_REACTOR": "twisted.internet.asyncioreactor.AsyncioSelectorReactor",
        # Optional: which browser engine to drive (chromium, firefox, webkit).
        "PLAYWRIGHT_BROWSER_TYPE": "chromium",
        # Optional: browser launch options; headless mode is switched off here.
        "PLAYWRIGHT_LAUNCH_OPTIONS": {"headless": False},
    }

    def start_requests(self):
        """Yield one Playwright-enabled request per entry in ``start_urls``."""
        for target in self.start_urls:
            # A fresh meta dict is built for every request.
            playwright_meta = {
                "playwright": True,
                "playwright_page_methods": [
                    # Wait 5000 ms on the page before the response is returned.
                    PageMethod("wait_for_timeout", 5000),
                ],
            }
            yield scrapy.Request(
                target,
                meta=playwright_meta,
                callback=self.parse,
            )

    def parse(self, response):
        """Build a selector over the response text and print a marker line."""
        rendered_markup = response.text
        # The selector is constructed but nothing is extracted from it yet.
        _page_selector = Selector(text=rendered_markup)
        print("Its working")
任何帮助将不胜感激。
通常,这意味着目标站点使用了反机器人(anti-bot)系统。您可以搜索 playwright-stealth 以及高级网页抓取(scraping)技术。