I need to run my scraper in a loop, but if certain errors occur in the spider I want to be able to raise CloseSpider, have that filter up to the looping function, and stop the loop.
Here is my code so far. It works fine with a fully functional spider, but I've put together an MRE to test the CloseSpider case.
from __future__ import print_function

import multiprocessing as mp
import traceback
from time import sleep
from typing import Type

from scrapy import Spider
from scrapy.crawler import CrawlerProcess
from scrapy.exceptions import CloseSpider


class MyTestSpider(Spider):
    name = "my_test_spider"

    def __init__(self) -> None:
        raise CloseSpider


class Process(mp.Process):
    def __init__(self, target: callable, *args, **kwargs):
        mp.Process.__init__(self, target=target, *args, **kwargs)
        self._pconn, self._cconn = mp.Pipe()
        self._exception = None

    def run(self):
        try:
            mp.Process.run(self)
            self._cconn.send(None)
        except Exception as e:
            tb = traceback.format_exc()
            self._cconn.send(tb)

    @property
    def exception(self):
        if self._pconn.poll():
            self._exception = self._pconn.recv()
        return self._exception


def run_crawler_loop(
    spider: Type[Spider],
    loop_wait_secs: int,
    **kwargs,
) -> None:
    while True:
        run_crawler_reactor_safe(spider=spider, **kwargs)
        sleep(loop_wait_secs)


def run_crawler_reactor_safe(spider: Type[Spider], **kwargs) -> None:
    process = Process(target=run_crawler, kwargs={"spider": spider} | kwargs)
    process.start()
    process.join()
    if process.exception:
        error, traceback = process.exception
        # send an email here
        raise error  # close the loop


def run_crawler(spider: Type[Spider], **kwargs) -> None:
    process = CrawlerProcess()
    crawler = process.create_crawler(spider)
    process.crawl(crawler_or_spidercls=crawler, **kwargs)
    process.start()
    # how would I tell here if the spider was closed due to raising a CloseSpider exception?
    # I'd like to raise that exception here so I can stop the loop by raising an error in run_crawler_reactor_safe


if __name__ == "__main__":
    run_crawler_loop(spider=MyTestSpider, loop_wait_secs=0)
Running this produces two log entries from twisted:
Unhandled error in Deferred:
2024-10-09 09:39:01 [twisted] CRITICAL: Unhandled error in Deferred:

Traceback (most recent call last):
  File "/Users/myusername/opt/miniconda3/envs/myenv/lib/python3.11/site-packages/scrapy/crawler.py", line 265, in crawl
    return self._crawl(crawler, *args, **kwargs)
  File "/Users/myusername/opt/miniconda3/envs/myenv/lib/python3.11/site-packages/scrapy/crawler.py", line 269, in _crawl
    d = crawler.crawl(*args, **kwargs)
  File "/Users/myusername/opt/miniconda3/envs/myenv/lib/python3.11/site-packages/twisted/internet/defer.py", line 2260, in unwindGenerator
    return _cancellableInlineCallbacks(gen)
  File "/Users/myusername/opt/miniconda3/envs/myenv/lib/python3.11/site-packages/twisted/internet/defer.py", line 2172, in _cancellableInlineCallbacks
    _inlineCallbacks(None, gen, status, _copy_context())
--- <exception caught here> ---
  File "/Users/myusername/opt/miniconda3/envs/myenv/lib/python3.11/site-packages/twisted/internet/defer.py", line 2003, in _inlineCallbacks
    result = context.run(gen.send, result)
  File "/Users/myusername/opt/miniconda3/envs/myenv/lib/python3.11/site-packages/scrapy/crawler.py", line 155, in crawl
    self.spider = self._create_spider(*args, **kwargs)
  File "/Users/myusername/opt/miniconda3/envs/myenv/lib/python3.11/site-packages/scrapy/crawler.py", line 169, in _create_spider
    return self.spidercls.from_crawler(self, *args, **kwargs)
  File "/Users/myusername/opt/miniconda3/envs/myenv/lib/python3.11/site-packages/scrapy/spiders/__init__.py", line 62, in from_crawler
    spider = cls(*args, **kwargs)
  File "/Users/myusername/GitHub/polgara_v2/so__capture_SpiderClose.py", line 17, in __init__
    raise CloseSpider
scrapy.exceptions.CloseSpider:

2024-10-09 09:39:01 [twisted] CRITICAL:
Traceback (most recent call last):
  File "/Users/myusername/opt/miniconda3/envs/myenv/lib/python3.11/site-packages/twisted/internet/defer.py", line 2003, in _inlineCallbacks
    result = context.run(gen.send, result)
  File "/Users/myusername/opt/miniconda3/envs/myenv/lib/python3.11/site-packages/scrapy/crawler.py", line 155, in crawl
    self.spider = self._create_spider(*args, **kwargs)
  File "/Users/myusername/opt/miniconda3/envs/myenv/lib/python3.11/site-packages/scrapy/crawler.py", line 169, in _create_spider
    return self.spidercls.from_crawler(self, *args, **kwargs)
  File "/Users/myusername/opt/miniconda3/envs/myenv/lib/python3.11/site-packages/scrapy/spiders/__init__.py", line 62, in from_crawler
    spider = cls(*args, **kwargs)
  File "/Users/myusername/GitHub/polgara_v2/so__capture_SpiderClose.py", line 17, in __init__
    raise CloseSpider
scrapy.exceptions.CloseSpider
However, these errors appear to be handled at that point and are never actually re-raised in a way that I can catch further up. Once the crawler has finished inside process.start(), I inspected the crawler and process instances in run_crawler(), but I couldn't find any sign of the spider, let alone the error message.
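For illustration, here is a minimal sketch of the kind of post-run inspection described above (crawler.spider and crawler.crawling are real Crawler attributes, but the snippet is an assumption about how one might probe them rather than the exact code I ran):

def run_crawler(spider: Type[Spider], **kwargs) -> None:
    process = CrawlerProcess()
    crawler = process.create_crawler(spider)
    process.crawl(crawler_or_spidercls=crawler, **kwargs)
    process.start()
    # the CloseSpider raised in __init__ never surfaces here: the Deferred logs it
    # as CRITICAL and moves on, leaving the crawler with nothing useful to inspect
    print(crawler.spider)    # None - _create_spider() failed before assignment
    print(crawler.crawling)  # False - the crawl has already ended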
I've also tried digging into the twisted package and posts on SO (e.g. stopping Twisted from swallowing exceptions), but I quickly got lost...
Any ideas on how I can accomplish what I'm trying to do?
OK, I've found a solution. It does require moving the CloseSpider into a callback, but that is more realistic for real-world use anyway. I've commented all the changes I made to the original script:
from __future__ import print_function

import multiprocessing as mp
import traceback
from time import sleep
from typing import Type

from scrapy import Spider
from scrapy.crawler import CrawlerProcess
from scrapy.exceptions import CloseSpider
from scrapy.settings import Settings


class MyTestSpider(Spider):
    name = "my_test_spider"
    start_urls = ["https://www.flashscore.co.uk/tennis/"]

    # added in a setting to enable me to record if CloseSpider was triggered
    @classmethod
    def update_settings(cls, settings: Settings):
        settings.set(
            name="CLOSESPIDER_TRIGGERED",
            value=False,
            priority="spider",
        )
        super().update_settings(settings)

    # CloseSpider records a reason of "cancelled" when triggered
    def closed(self, reason: str) -> None:
        if reason == "cancelled":
            # need to unfreeze the settings as the object is usually immutable
            self.crawler.settings.frozen = False
            self.crawler.settings.set(
                name="CLOSESPIDER_TRIGGERED",
                value=True,
                priority="spider",
            )
            self.crawler.settings.frozen = True
        return

    def parse(self, response):
        raise CloseSpider


class Process(mp.Process):
    def __init__(self, target: callable, *args, **kwargs):
        mp.Process.__init__(self, target=target, *args, **kwargs)
        self._pconn, self._cconn = mp.Pipe()
        self._exception = None

    def run(self):
        try:
            mp.Process.run(self)
            self._cconn.send(None)
        except Exception as e:
            tb = traceback.format_exc()
            # amended this to send both error and traceback
            self._cconn.send((e, tb))

    @property
    def exception(self):
        if self._pconn.poll():
            self._exception = self._pconn.recv()
        return self._exception


def run_crawler_loop(
    spider: Type[Spider],
    loop_wait_secs: int,
    **kwargs,
) -> None:
    while True:
        run_crawler_reactor_safe(spider=spider, **kwargs)
        sleep(loop_wait_secs)


def run_crawler_reactor_safe(spider: Type[Spider], **kwargs) -> None:
    process = Process(target=run_crawler, kwargs={"spider": spider} | kwargs)
    process.start()
    process.join()
    if process.exception:
        error, traceback = process.exception
        # send an email here
        raise error  # close the loop by re-raising the exception


def run_crawler(spider: Type[Spider], **kwargs) -> None:
    process = CrawlerProcess()
    crawler = process.create_crawler(spider)
    process.crawl(crawler_or_spidercls=crawler, **kwargs)
    process.start()
    # check if CloseSpider was triggered - if it was then raise an exception
    if crawler.settings.getbool("CLOSESPIDER_TRIGGERED"):
        raise CloseSpider


if __name__ == "__main__":
    run_crawler_loop(spider=MyTestSpider, loop_wait_secs=0)
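As a footnote: rather than stashing a flag in the (normally frozen) settings, a lighter variant of the same check could read the crawler's finish_reason stat after the reactor stops. Scrapy's CoreStats extension records the spider close reason there, and CloseSpider defaults its reason to "cancelled". A minimal sketch of run_crawler rewritten that way (the stat name is real; treating "cancelled" as "CloseSpider was raised" is the same assumption the closed() callback above makes):

def run_crawler(spider: Type[Spider], **kwargs) -> None:
    process = CrawlerProcess()
    crawler = process.create_crawler(spider)
    process.crawl(crawler_or_spidercls=crawler, **kwargs)
    process.start()
    # CoreStats records the close reason; CloseSpider's default reason is "cancelled"
    if crawler.stats.get_value("finish_reason") == "cancelled":
        raise CloseSpider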