我正在尝试使用 Playwright 的 Python 库编写一个简单的网页抓取器(scraper)。
下面是我使用它的基本示例:
from contextlib import contextmanager
from playwright.sync_api import sync_playwright
import asyncio
def is_async():
    """Report whether an asyncio event loop is running in the current thread.

    Returns:
        bool: True when called while a loop is running in this thread,
        False otherwise.
    """
    # get_running_loop() never creates a loop as a side effect, whereas
    # asyncio.get_event_loop() is deprecated when no loop is running and
    # raises RuntimeError in that situation on the newest Python versions.
    try:
        asyncio.get_running_loop()
    except RuntimeError:
        return False
    return True
class BaseScrapper(object):
    """Base scraper exposing Playwright resources as nested context managers.

    ``playwright`` and ``browser`` are properties that return a fresh context
    manager on every access; ``open_page`` builds on them to yield a page that
    has already navigated to the requested URL. Every step prints the current
    value of ``is_async()`` for diagnostics.
    """

    @property
    @contextmanager
    def playwright(self):
        """Yield the object produced by entering ``sync_playwright()``."""
        print(f'BaseScrapper.playwright - is_async={is_async()}')
        with sync_playwright() as p:
            print(f'BaseScrapper.playwright with - is_async={is_async()}')
            yield p

    @property
    @contextmanager
    def browser(self):
        """Yield a headless Chromium browser and close it when the block exits."""
        print(f'BaseScrapper.browser - is_async={is_async()}')
        with self.playwright as p:
            print(f'BaseScrapper.browser with - is_async={is_async()}')
            launched = p.chromium.launch(headless=True)
            try:
                yield launched
            finally:
                # Close explicitly so the browser is released even when the
                # caller's block raises.
                launched.close()

    @contextmanager
    def open_page(self, url):
        """Yield a new page navigated to ``url`` and close it on exit.

        Args:
            url: Address passed to ``page.goto``; navigation waits for the
                ``domcontentloaded`` event.
        """
        print(f'BaseScrapper.open_page - is_async={is_async()}')
        with self.browser as browser:
            print(f'BaseScrapper.open_page.with - is_async={is_async()}')
            new_page = browser.new_page()
            try:
                # attach response listener
                new_page.on("response", self.intercept_response)
                new_page.goto(url, wait_until="domcontentloaded")
                yield new_page
            finally:
                # Release the page whether navigation or the caller failed.
                new_page.close()

    def intercept_response(self, response):
        """Hook called for every page response; does nothing by default."""
        pass
class ScrapeTest(BaseScrapper):
    """Sample scraper that opens the Google home page."""

    @contextmanager
    def run(self):
        """Yield a page navigated to the Google home page.

        Prints ``is_async()`` before opening the page and again once the page
        is open.
        """
        print(f'ScrapeTest.run - is_async={is_async()}')
        # The URL carries an explicit scheme: page.goto() expects one, and a
        # bare host name is not a valid navigation target.
        with self.open_page(url='https://www.google.com') as page:
            print(f'ScrapeTest.run - is_async={is_async()}')
            yield page
def run():
    """Drive ``ScrapeTest`` once, printing ``is_async()`` before and inside the page context."""
    print(f'running ... is_async={is_async()}')
    scraper = ScrapeTest()
    page_context = scraper.run()
    # The yielded page is not used; only the diagnostics matter here.
    with page_context as _page:
        print(f'run.with is_async={is_async()}')
我希望它在同步上下文中运行,但它尝试切换到异步。运行 run 函数会产生以下输出:
>>> run()
running ... is_async=False
ScrapeTest.run - is_async=False
BaseScrapper.open_page - is_async=False
BaseScrapper.browser - is_async=False
BaseScrapper.playwright - is_async=False
BaseScrapper.playwright with - is_async=True
BaseScrapper.browser with - is_async=True
BaseScrapper.open_page.with - is_async=True
为什么 `playwright` 函数在上下文管理器块内部会切换到异步上下文?我需要在其中执行一些 Django ORM 调用,但调用失败并报错:
django.core.exceptions.SynchronousOnlyOperation: You cannot call this from an async context - use a thread or sync_to_async.
但我正在尝试在同步上下文中运行它......
我错过了什么?
非常感谢!
from playwright.sync_api import sync_playwright
import asyncio
class BaseScrapper(object):
    """Base scraper that owns one Playwright driver and one headless browser.

    Both are started lazily on first use and reused afterwards, so repeated
    property accesses no longer start additional drivers or browsers. Call
    ``close()`` to release them.
    """

    # Lazily created handles; None until first use.
    _playwright = None
    _browser = None

    @property
    def playwright(self):
        """Return the started Playwright object, starting it on first access."""
        print('basescrapper.playwright')
        if self._playwright is None:
            self._playwright = sync_playwright().start()
        return self._playwright

    @property
    def browser(self):
        """Return the headless Chromium browser, launching it on first access."""
        print('basescrapper.browser')
        if self._browser is None:
            self._browser = self.playwright.chromium.launch(headless=True)
        return self._browser

    def open_page(self, url):
        """Open a new page in the shared browser and navigate it to ``url``.

        Args:
            url: Address passed to ``page.goto``.

        Returns:
            The page object after navigation.
        """
        print('basescrapper.open page')
        page = self.browser.new_page()
        page.goto(url)
        return page

    def close(self):
        """Close the browser and stop Playwright if they were started.

        Safe to call when nothing was started, and safe to call repeatedly.
        """
        try:
            if self._browser is not None:
                self._browser.close()
        finally:
            # Always drop the handles and stop the driver, even if closing
            # the browser failed.
            self._browser = None
            if self._playwright is not None:
                self._playwright.stop()
                self._playwright = None
class SampleScrapper(BaseScrapper):
    """Scraper that reports the event-loop state before and after opening a page."""

    @staticmethod
    def _loop_state():
        """Return "async" if an event loop is running in this thread, else "sync"."""
        # get_running_loop() avoids the deprecated no-running-loop path of
        # asyncio.get_event_loop(), which can create a loop or raise.
        try:
            asyncio.get_running_loop()
        except RuntimeError:
            return "sync"
        return "async"

    def run(self):
        """Open the Google home page, printing the loop state before and after."""
        print(f'before - {self._loop_state()}')
        page = self.open_page(url='https://www.google.com')
        try:
            print(f'after - {self._loop_state()}')
        finally:
            # The page is only needed for the measurement above; release it.
            page.close()
def run():
    """Run ``SampleScrapper`` once, printing the event-loop state at three points."""

    def loop_state():
        # get_running_loop() raises RuntimeError when no loop is running in
        # this thread; unlike asyncio.get_event_loop() it never creates one.
        try:
            asyncio.get_running_loop()
        except RuntimeError:
            return "sync"
        return "async"

    print(f'run 1 - {loop_state()}')
    s = SampleScrapper()
    print(f'run 2 - {loop_state()}')
    s.run()
    print(f'run 3 - {loop_state()}')
结果:
>>> run()
run 1 - sync
run 2 - sync
before - sync
basescrapper.open page
basescrapper.browser
basescrapper.playwright
after - async
run 3 - async
Playwright for Python 在内部使用 asyncio 来执行任务,只是通过一层包装器把 API 暴露为同步接口。因此一旦启动 Playwright,当前线程中就会有一个正在运行的事件循环,`is_running()` 返回 True,Django 也就认为代码处于异步上下文中。
注意:我不精通Python。如果我错了请纠正我。