How to tell if CloseSpider was raised, at the CrawlerProcess level

I need to run my scraper in a loop, but if something goes wrong in the spider I want to be able to raise CloseSpider, have that propagate up to the looping function, and stop the loop.

Here is the code I have so far. It works fine with my fully-featured spider, but I've put together an MRE to test the CloseSpider use case.

from __future__ import print_function

import multiprocessing as mp
import traceback
from time import sleep
from typing import Type

from scrapy import Spider
from scrapy.crawler import CrawlerProcess
from scrapy.exceptions import CloseSpider


class MyTestSpider(Spider):
    name = "my_test_spider"

    def __init__(self) -> None:
        raise CloseSpider


class Process(mp.Process):
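    """mp.Process subclass that relays the traceback of any exception raised
    in the child process back to the parent over a Pipe; see the `exception`
    property."""
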
    def __init__(self, target: callable, *args, **kwargs):
        mp.Process.__init__(self, target=target, *args, **kwargs)

        self._pconn, self._cconn = mp.Pipe()
        self._exception = None

    def run(self):
        try:
            mp.Process.run(self)

            self._cconn.send(None)

        except Exception as e:
            tb = traceback.format_exc()

            self._cconn.send(tb)

    @property
    def exception(self):
        if self._pconn.poll():
            self._exception = self._pconn.recv()

        return self._exception


def run_crawler_loop(
    spider: Type[Spider],
    loop_wait_secs: int,
    **kwargs,
) -> None:
    while True:
        run_crawler_reactor_safe(spider=spider, **kwargs)

        sleep(loop_wait_secs)


def run_crawler_reactor_safe(spider: Type[Spider], **kwargs) -> None:
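    # Run each crawl in its own child process: the Twisted reactor cannot be
    # restarted once it has stopped, so a CrawlerProcess can only be started
    # once per interpreter.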
    process = Process(target=run_crawler, kwargs={"spider": spider} | kwargs)

    process.start()
    process.join()

    if process.exception:
        error, traceback = process.exception

        # send an email here

        raise error # close the loop


def run_crawler(spider: Type[Spider], **kwargs) -> None:
    process = CrawlerProcess()

    crawler = process.create_crawler(spider)

    process.crawl(crawler_or_spidercls=crawler, **kwargs)

    process.start()

    # how would I tell here if the spider was closed due to raising a CloseSpider exception?
    # I'd like to raise that exception here so I can stop the loop by raising an error in run_crawler_reactor_safe


if __name__ == "__main__":
    run_crawler_loop(spider=MyTestSpider, loop_wait_secs=0)

Running this produces two log entries from twisted:

Unhandled error in Deferred:
2024-10-09 09:39:01 [twisted] CRITICAL: Unhandled error in Deferred:

Traceback (most recent call last):
  File "/Users/myusername/opt/miniconda3/envs/myenv/lib/python3.11/site-packages/scrapy/crawler.py", line 265, in crawl
    return self._crawl(crawler, *args, **kwargs)
  File "/Users/myusername/opt/miniconda3/envs/myenv/lib/python3.11/site-packages/scrapy/crawler.py", line 269, in _crawl
    d = crawler.crawl(*args, **kwargs)
  File "/Users/myusername/opt/miniconda3/envs/myenv/lib/python3.11/site-packages/twisted/internet/defer.py", line 2260, in unwindGenerator
    return _cancellableInlineCallbacks(gen)
  File "/Users/myusername/opt/miniconda3/envs/myenv/lib/python3.11/site-packages/twisted/internet/defer.py", line 2172, in _cancellableInlineCallbacks
    _inlineCallbacks(None, gen, status, _copy_context())
--- <exception caught here> ---
  File "/Users/myusername/opt/miniconda3/envs/myenv/lib/python3.11/site-packages/twisted/internet/defer.py", line 2003, in _inlineCallbacks
    result = context.run(gen.send, result)
  File "/Users/myusername/opt/miniconda3/envs/myenv/lib/python3.11/site-packages/scrapy/crawler.py", line 155, in crawl
    self.spider = self._create_spider(*args, **kwargs)
  File "/Users/myusername/opt/miniconda3/envs/myenv/lib/python3.11/site-packages/scrapy/crawler.py", line 169, in _create_spider
    return self.spidercls.from_crawler(self, *args, **kwargs)
  File "/Users/myusername/opt/miniconda3/envs/myenv/lib/python3.11/site-packages/scrapy/spiders/__init__.py", line 62, in from_crawler
    spider = cls(*args, **kwargs)
  File "/Users/myusername/GitHub/polgara_v2/so__capture_SpiderClose.py", line 17, in __init__
    raise CloseSpider
scrapy.exceptions.CloseSpider: 

2024-10-09 09:39:01 [twisted] CRITICAL: 
Traceback (most recent call last):
  File "/Users/myusername/opt/miniconda3/envs/myenv/lib/python3.11/site-packages/twisted/internet/defer.py", line 2003, in _inlineCallbacks
    result = context.run(gen.send, result)
  File "/Users/myusername/opt/miniconda3/envs/myenv/lib/python3.11/site-packages/scrapy/crawler.py", line 155, in crawl
    self.spider = self._create_spider(*args, **kwargs)
  File "/Users/myusername/opt/miniconda3/envs/myenv/lib/python3.11/site-packages/scrapy/crawler.py", line 169, in _create_spider
    return self.spidercls.from_crawler(self, *args, **kwargs)
  File "/Users/myusername/opt/miniconda3/envs/myenv/lib/python3.11/site-packages/scrapy/spiders/__init__.py", line 62, in from_crawler
    spider = cls(*args, **kwargs)
  File "/Users/myusername/GitHub/polgara_v2/so__capture_SpiderClose.py", line 17, in __init__
    raise CloseSpider
scrapy.exceptions.CloseSpider

However, these errors seem to be handled there and aren't actually re-raised in any way that can be caught further up. I've inspected the crawler instance in run_crawler() after process.start() completes, but I can't find any sign of the spider on it, let alone the error message.
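
For illustration, the kind of check I attempted looked roughly like this (a sketch rather than my exact code, reusing the imports from the script above):

def run_crawler(spider: Type[Spider], **kwargs) -> None:
    process = CrawlerProcess()

    crawler = process.create_crawler(spider)

    process.crawl(crawler_or_spidercls=crawler, **kwargs)
    process.start()

    # Nothing here points back at the CloseSpider: the spider was never
    # constructed, so crawler.spider is still None after the reactor stops.
    print(crawler.spider)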

I've also tried digging into the twisted package and posts on SO (Stop Twisted from swallowing exceptions), but quickly got lost...

Any ideas on how to accomplish what I'm trying to do?

python scrapy python-multiprocessing

1 Answer

OK, I've found a solution. It does require moving the CloseSpider into a callback, but that is more realistic for real-world use anyway. I've commented all of the changes I made to the original script:

from __future__ import print_function

import multiprocessing as mp
import traceback
from time import sleep
from typing import Type

from scrapy import Spider
from scrapy.crawler import CrawlerProcess
from scrapy.exceptions import CloseSpider
from scrapy.settings import Settings


class MyTestSpider(Spider):
    name = "my_test_spider"

    start_urls = ["https://www.flashscore.co.uk/tennis/"]

    # added in a setting to enable me to record if CloseSpider was triggered
    @classmethod
    def update_settings(cls, settings: Settings):
        settings.set(
            name="CLOSESPIDER_TRIGGERED",
            value=False,
            priority="spider",
        )

        super().update_settings(settings)

    # CloseSpider records a reason of "cancelled" when triggered
    def closed(self, reason: str) -> None:
        if reason == "cancelled":
            # need to unfreeze the settings as the object is usually immutable
            self.crawler.settings.frozen = False
            self.crawler.settings.set(
                name="CLOSESPIDER_TRIGGERED",
                value=True,
                priority="spider",
            )
            self.crawler.settings.frozen = True
        
        return
    
    def parse(self, response):
        raise CloseSpider


class Process(mp.Process):
    def __init__(self, target: callable, *args, **kwargs):
        mp.Process.__init__(self, target=target, *args, **kwargs)

        self._pconn, self._cconn = mp.Pipe()
        self._exception = None

    def run(self):
        try:
            mp.Process.run(self)

            self._cconn.send(None)

        except Exception as e:
            tb = traceback.format_exc()

            # amended this to send both error and traceback
            self._cconn.send((e, tb))

    @property
    def exception(self):
        if self._pconn.poll():
            self._exception = self._pconn.recv()

        return self._exception


def run_crawler_loop(
    spider: Type[Spider],
    loop_wait_secs: int,
    **kwargs,
) -> None:
    while True:
        run_crawler_reactor_safe(spider=spider, **kwargs)

        sleep(loop_wait_secs)


def run_crawler_reactor_safe(spider: Type[Spider], **kwargs) -> None:
    process = Process(target=run_crawler, kwargs={"spider": spider} | kwargs)

    process.start()
    process.join()

    if process.exception:
        error, traceback = process.exception

        # send an email here

        raise error # close the loop by re-raising the exception


def run_crawler(spider: Type[Spider], **kwargs) -> None:
    process = CrawlerProcess()

    crawler = process.create_crawler(spider)

    process.crawl(crawler_or_spidercls=crawler, **kwargs)
    process.start()

    # check if CloseSpider was triggered - if it was then raise an exception
    if crawler.settings.getbool("CLOSESPIDER_TRIGGERED"):
        raise CloseSpider


if __name__ == "__main__":
    run_crawler_loop(spider=MyTestSpider, loop_wait_secs=0)
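
As a footnote: instead of unfreezing the settings object, the close reason can also be read from the crawler's stats collector after the reactor stops. Something along these lines should work as well (an untested sketch, assuming the default stats collection is enabled so that the finish_reason stat gets recorded):

def run_crawler(spider: Type[Spider], **kwargs) -> None:
    process = CrawlerProcess()

    crawler = process.create_crawler(spider)

    process.crawl(crawler_or_spidercls=crawler, **kwargs)
    process.start()

    # A bare CloseSpider raised in a callback closes the spider with the default
    # reason "cancelled", which the stats collector records as "finish_reason".
    if crawler.stats and crawler.stats.get_value("finish_reason") == "cancelled":
        raise CloseSpider

This avoids mutating a frozen settings object, at the cost of relying on the default "cancelled" close reason.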