Certain middlewares are enabled for all spiders in settings.py.
For one specific spider, how can I append an additional middleware on top of all the middlewares already configured in settings.py?
Assume settings.py contains:
DOWNLOADER_MIDDLEWARES = {
    'myproject.middlewares.CustomDownloaderMiddleware1': 543,
    'myproject.middlewares.CustomDownloaderMiddleware2': 544,
}
If I set that middleware on the spider via custom_settings, all the other middlewares configured in settings.py are ignored.
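For concreteness, presumably the per-spider override looks something like the following minimal sketch (the middleware path is illustrative):

class MySpider(Spider):
    name = 'my_spider'
    # This dict replaces the project-level DOWNLOADER_MIDDLEWARES key wholesale
    # (only the built-in DOWNLOADER_MIDDLEWARES_BASE defaults are still merged in),
    # so CustomDownloaderMiddleware1/2 from settings.py are dropped.
    custom_settings = {
        'DOWNLOADER_MIDDLEWARES': {
            'myproject.middlewares.MyAdditionalMiddleware': 550,
        },
    }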
I tried:
from scrapy import Spider
from scrapy.utils.project import get_project_settings

class MySpider(Spider):
    name = 'my_spider'

    def __init__(self, *args, **kwargs):
        super(MySpider, self).__init__(*args, **kwargs)
        settings = get_project_settings()
        # Get existing middlewares
        middlewares = settings.get('DOWNLOADER_MIDDLEWARES', {})
        # Append or update your additional middleware
        middlewares['myproject.middlewares.MyAdditionalMiddleware'] = 550
        # Apply it to the spider's settings
        self.custom_settings = {
            'DOWNLOADER_MIDDLEWARES': middlewares
        }

    def start_requests(self):
        # Spider logic here
        pass
and:
from scrapy.spiders import Spider
from scrapy.utils.project import get_project_settings

class MySpider(Spider):
    name = 'my_spider'

    @classmethod
    def from_crawler(cls, crawler, *args, **kwargs):
        spider = super(MySpider, cls).from_crawler(crawler, *args, **kwargs)
        # Get the global settings
        settings = crawler.settings
        # Get the existing middlewares
        middlewares = settings.get('DOWNLOADER_MIDDLEWARES', {}).copy()
        # Append or update your additional middleware
        middlewares['myproject.middlewares.MyAdditionalMiddleware'] = 550
        # Update the spider's settings for this instance
        spider.custom_settings = {
            'DOWNLOADER_MIDDLEWARES': middlewares
        }
        return spider

    def start_requests(self):
        # Spider logic here
        pass
but MyAdditionalMiddleware is never activated in my spider.
The above code was generated by ChatGPT: https://chatgpt.com/share/6710b8b3-eda8-800a-a63d-5a244501475b
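Neither attempt can work: Scrapy reads custom_settings from the spider class when the crawler is created, before __init__ runs, and the settings object is already frozen by the time from_crawler returns, so assigning custom_settings on an instance has no effect. A minimal sketch of one workaround, assuming a reasonably recent Scrapy where Spider.update_settings is the classmethod hook that applies custom_settings while the settings are still mutable (the middleware path and priority are the illustrative ones from above):

from scrapy import Spider

class MySpider(Spider):
    name = 'my_spider'

    @classmethod
    def update_settings(cls, settings):
        # Apply custom_settings (if any) the normal way first.
        super().update_settings(settings)
        # Then merge the extra middleware into whatever the project
        # settings already define, instead of replacing the dict.
        middlewares = settings.getdict('DOWNLOADER_MIDDLEWARES')
        middlewares['myproject.middlewares.MyAdditionalMiddleware'] = 550
        settings.set('DOWNLOADER_MIDDLEWARES', middlewares, priority='spider')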
I had a similar problem. I have hundreds of spiders, most of whose settings are identical, with just a few customized per spider. Here is my solution.
First, create a spider_settings.py file:
project_name = 'xxx'

downloader_map = {  # you can list all your downloader middlewares here with an alias
    'json_check': (201, 'xy_spider.middlewares.MustJsonDecodeMiddleware'),
    'abuyun': (202, 'xy_spider.middlewares.AbuyunProxyMiddleware'),
    'charset_change': (203, 'xy_spider.middlewares.CharsetSwitchDownloaderMiddleware'),
}
pipeline_map = {  # priorities must stay in the 0-1000 range
    'print': (301, 'xy_spider.pipelines.printPipeline'),
    'merge': (1, 'xy_spider.pipelines.MergePagesPipelineForNews'),
}

class CommonSettings:
    def __init__(self, **kwargs):
        kwargs = kwargs or {}
        # Middlewares every spider gets; the aliased ones are opt-in.
        _downloader_mids = {'xy_spider.middlewares.XySpiderDownloaderMiddleware': 200}
        for k, v in downloader_map.items():
            if k in kwargs and kwargs[k]:
                _downloader_mids[v[1]] = v[0]
        _pipeline = {
            f'projects.{project_name}.pipelines.MySQLPipeline': 300,
            f'projects.{project_name}.pipelines.MysqlQueuePipeline': 901,
        }
        for k, v in pipeline_map.items():
            if k in kwargs and kwargs[k]:
                _pipeline[v[1]] = v[0]
        # Accept both the short alias and the original Scrapy setting name.
        timeout = kwargs.get('timeout', kwargs.get('DOWNLOAD_TIMEOUT'))
        self._settings = {
            'LOG_LEVEL': 'INFO',
            'SCHEDULER': 'xy_spider.redis.expire_scheduler.ExpireScheduler',
            'DUPEFILTER_CLASS': 'xy_spider.redis.expire_dupefilter.ExpireDupeFilter',
            'DUPEFILTER_EXPIRE_DAYS': int(kwargs.get('expire', kwargs.get('DUPEFILTER_EXPIRE_DAYS', 7))),
            'SCHEDULER_PERSIST': True,
            'SCHEDULER_FLUSH_ON_START': bool(kwargs.get('get_all', True)),
            'DOWNLOAD_DELAY': int(kwargs.get('delay', kwargs.get('DOWNLOAD_DELAY', 1))),
            'CONCURRENT_REQUESTS': int(kwargs.get('thread', kwargs.get('CONCURRENT_REQUESTS', 10))),
            'SPIDER_MIDDLEWARES': {
                'xy_spider.middlewares.RandomParamFilterSpiderMiddleware': 200,
            },
            'DOWNLOADER_MIDDLEWARES': _downloader_mids,
            'ITEM_PIPELINES': _pipeline,
        }
        if timeout is not None:
            self._settings['DOWNLOAD_TIMEOUT'] = int(timeout) or 20  # a timeout of 0 falls back to 20

    def __call__(self):
        return self._settings
Second, you can now customize settings per spider simply by instantiating CommonSettings() in the spider class, for example:
class TestSpider(scrapy.Spider):
    name = "test"
    is_test = not on_server  # on_server is a deployment flag defined elsewhere in the project
    _settings = {
        'delay': 1,
        'json_check': 1,
        'timeout': 10,
        'thread': 1,
    }
    if is_test:
        _settings['print'] = 1
    else:
        _settings['get_all'] = 0
    custom_settings = CommonSettings(**_settings)()
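Because custom_settings is built once at class-definition time, Scrapy still sees a plain dict; CommonSettings only centralizes the shared defaults, so replacing the whole DOWNLOADER_MIDDLEWARES key is harmless here. A quick hypothetical REPL session showing how the aliases from the maps above expand:

>>> cs = CommonSettings(json_check=1, print=1)()
>>> cs['DOWNLOADER_MIDDLEWARES']
{'xy_spider.middlewares.XySpiderDownloaderMiddleware': 200,
 'xy_spider.middlewares.MustJsonDecodeMiddleware': 201}
>>> cs['ITEM_PIPELINES']['xy_spider.pipelines.printPipeline']
301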