ERROR: Signal handler caught an exception: <bound method OffsiteMiddleware.request_scheduled of <scrapy.downloadermiddlewares.offsite.OffsiteMiddleware object at 0x000002C3EBB5DB50>>

I ran into the error above while developing a scraping script that uses Scrapy and Selenium. How can I fix it?
# -*- coding: utf-8 -*-
import scrapy
from scrapy import FormRequest
from scrapy.http import HtmlResponse
from datetime import datetime, timedelta
from bloomberg.items import BloombergItem
import json
from scrapy.shell import inspect_response
import re
from selenium import webdriver
from selenium.webdriver.chrome.options import Options


class HistoricalDataSpider(scrapy.Spider):
    name = 'historical_data'
    # allowed_domains = ['econcal.forexprostools.com']
    start_urls = ['http://econcal.forexprostools.com/']
    output = []
    not_parsed_pages = 0

    def start_requests(self):
        chrome_options = Options()
        # chrome_options.add_argument('--headless')
        # Set up the Chrome WebDriver with options
        driver = webdriver.Chrome(options=chrome_options)
        driver.get('http://econcal.forexprostools.com/')
        page_source = driver.page_source
        # self.start_date, self.end_date and self.scraped_dates are set elsewhere
        for n in range(0, (self.end_date - self.start_date).days + 1, 30):
            start_date = self.start_date + timedelta(n)
            end_date = self.start_date + timedelta(n + 30)
            if end_date > self.end_date:
                end_date = self.end_date
            skip = False
            for n, date in enumerate(self.scraped_dates):
                if start_date <= date <= end_date and (self.end_date - date).days > 90:
                    skip = True
                    self.scraped_dates = self.scraped_dates[n:]
                    break
            if skip:
                continue
            start_date = start_date.strftime('%Y-%m-%d')
            end_date = end_date.strftime('%Y-%m-%d')
            html_response = HtmlResponse(url=driver.current_url, body=page_source, encoding='utf-8')
            rows = html_response.css('button')
            for row in rows:
                if 'USD' in row.css('div::text').extract_first():
                    event_datetime = row.css('button::attr(event_timestamp)').extract_first()
                    event_datetime = datetime.strptime(event_datetime, '%Y-%m-%d %H:%M:%S')
                    date = event_datetime.strftime('%m/%d/%Y')
                    time = event_datetime.strftime('%H:%M')
                    event_name = row.css('.left.event::text').extract_first().strip()
                    actual = row.css('.act span::text').extract()
                    if actual:
                        actual = actual[1].strip()
                    if actual:
                        actual = re.sub(',', '', actual)
                        actual = re.search('[-0-9.]+', actual).group()
                    else:
                        actual = None
                    forecast = row.css('.fore span::text').extract()
                    if forecast:
                        forecast = forecast[1].strip()
                    if forecast:
                        forecast = re.sub(',', '', forecast)
                        forecast = re.search('[-0-9.]+', forecast).group()
                    else:
                        forecast = None
                    prev = row.css('.prev span::text').extract()
                    if prev:
                        prev = prev[1].strip()
                    if prev:
                        prev = re.sub(',', '', prev)
                        prev = re.search('[-0-9.]+', prev).group()
                    else:
                        prev = None
                    new_row = [date, time, event_name, actual, forecast, prev]
                    if new_row not in self.output:
                        self.output.append(new_row)
            # self.not_parsed_pages -= 1
            if self.not_parsed_pages == 0:
                item = BloombergItem()
                item['data'] = self.output
                # NOTE: yielding an item from start_requests (instead of a
                # Request) is what triggers the error above
                yield item
        driver.quit()
settings.py
# -*- coding: utf-8 -*-
# Scrapy settings for bloomberg project
#
# For simplicity, this file contains only settings considered important or
# commonly used. You can find more settings consulting the documentation:
#
# https://docs.scrapy.org/en/latest/topics/settings.html
# https://docs.scrapy.org/en/latest/topics/downloader-middleware.html
# https://docs.scrapy.org/en/latest/topics/spider-middleware.html
BOT_NAME = 'bloomberg'
SPIDER_MODULES = ['bloomberg.spiders']
NEWSPIDER_MODULE = 'bloomberg.spiders'
# Crawl responsibly by identifying yourself (and your website) on the user-agent
USER_AGENT = 'Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:70.0) Gecko/20100101 Firefox/70.0'
# Obey robots.txt rules
ROBOTSTXT_OBEY = False
SPIDER_MIDDLEWARES = {
    'bloomberg.middlewares.BloombergSpiderMiddleware': 543,
}
ITEM_PIPELINES = {
'bloomberg.pipelines.BloombergPipeline': 300,
}
# Filename with .csv
HISTORY_OUTPUT_FILENAME = 'bloomberg_history.csv'
LOG_LEVEL="DEBUG"
# Quandl api key
QUANDL_API_KEY = "X4hf1sbHT6D3xgN6kz7N"
# VX MASTER file path with filename (example/example.csv) vix futures curve
VX_MASTER_FILE_PATH = r"C:/Users/Prisma/Desktop/AdditionalData/VXF.csv"
#Treasury master file path with filename(example/example.csv)
TRES_MASTER_FILE_PATH = r"C:/Users/Prisma/Desktop/AdditionalData/TSY.csv"
EVENTS_DIRECTORY = r"C:/Users/Prisma/Desktop/AdditionalData/"
I want to combine Selenium with Scrapy, but I don't quite understand how process_item gets called from the spider.
Don't start Selenium in start_requests, and don't yield items from there either: start_requests is expected to yield Request objects, and scheduling the BloombergItem instead is what makes the request_scheduled signal handler (OffsiteMiddleware.request_scheduled) raise the error you quoted. Keep the spider limited to generating request URLs and parsing responses, and let a middleware do the browser work, as in the sketch below.
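As a minimal sketch of the spider side (the callback name parse_page and the dict item are illustrative, not your actual fields):

import scrapy

class HistoricalDataSpider(scrapy.Spider):
    name = 'historical_data'
    start_urls = ['http://econcal.forexprostools.com/']

    def start_requests(self):
        # start_requests must yield Request objects, never items
        for url in self.start_urls:
            yield scrapy.Request(url, callback=self.parse_page)

    def parse_page(self, response):
        # response is whatever the downloader middleware returned
        for row in response.css('button'):
            # yield items from a parse callback, not from start_requests
            yield {'event_timestamp': row.attrib.get('event_timestamp')}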
Just add a custom downloader middleware that drives Selenium; for a starting point, you can refer to this repo. A rough outline of such a middleware is sketched below.
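Here is a minimal sketch, assuming headless Chrome and one shared driver; the class name SeleniumMiddleware is illustrative, and real implementations (e.g. the scrapy-selenium package) handle waits, per-request options, and errors more carefully:

from scrapy import signals
from scrapy.http import HtmlResponse
from selenium import webdriver
from selenium.webdriver.chrome.options import Options


class SeleniumMiddleware:
    # Downloader middleware that fetches every request through one
    # shared headless Chrome instance (minimal sketch).

    def __init__(self):
        options = Options()
        options.add_argument('--headless')
        self.driver = webdriver.Chrome(options=options)

    @classmethod
    def from_crawler(cls, crawler):
        middleware = cls()
        # close the browser when the spider finishes
        crawler.signals.connect(middleware.spider_closed, signals.spider_closed)
        return middleware

    def process_request(self, request, spider):
        self.driver.get(request.url)
        # returning a Response short-circuits the normal download;
        # the spider's callback receives this HtmlResponse
        return HtmlResponse(
            url=self.driver.current_url,
            body=self.driver.page_source,
            encoding='utf-8',
            request=request,
        )

    def spider_closed(self):
        self.driver.quit()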
Finally, don't forget to register your custom middleware in the settings (see the snippet below). As for process_item: you never call it from the spider yourself; once a pipeline is listed in ITEM_PIPELINES, Scrapy calls its process_item for every item the spider yields.
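Assuming the middleware above lives in bloomberg/middlewares.py (the module path and the priority value are illustrative):

DOWNLOADER_MIDDLEWARES = {
    'bloomberg.middlewares.SeleniumMiddleware': 543,
}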