I'm scraping with Selenium and Scrapy. The main problem is the link between __init__ and parse: since parse never receives a response, the driver.get(url) call inside parse is never reached.
import scrapy
from scrapy.http import HtmlResponse
from datetime import datetime, timedelta
from bloomberg.items import BloombergItem
import re
from selenium import webdriver
from selenium.webdriver.chrome.options import Options


class HistoricalDataSpider(scrapy.Spider):
    name = 'historical_data'
    allowed_domains = ['econcal.forexprostools.com']
    start_urls = ['http://econcal.forexprostools.com/']
    output = []
    not_parsed_pages = 0

    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
        # chrome_options = Options()
        # chrome_options.add_argument('--headless')
        # Set up the Chrome WebDriver (headless option disabled while debugging)
        self.driver = webdriver.Chrome()

    def parse(self, response):
        # This callback only runs if Scrapy itself downloads the page;
        # when the site answers 403, parse() is never called, so the
        # driver.get() below is never reached.
        self.driver.get(response.url)
        page_source = self.driver.page_source
        # self.start_date, self.end_date and self.scraped_dates are assumed
        # to be set elsewhere (e.g. passed in as spider arguments).
        for n in range(0, (self.end_date - self.start_date).days + 1, 30):
            start_date = self.start_date + timedelta(n)
            end_date = self.start_date + timedelta(n + 30)
            if end_date > self.end_date:
                end_date = self.end_date
            # Skip 30-day windows that were already scraped.
            skip = False
            for i, date in enumerate(self.scraped_dates):
                if start_date <= date <= end_date and (self.end_date - date).days > 90:
                    skip = True
                    self.scraped_dates = self.scraped_dates[i:]
                    break
            if skip:
                continue
            start_date = start_date.strftime('%Y-%m-%d')
            end_date = end_date.strftime('%Y-%m-%d')
            # Wrap the Selenium page source so Scrapy selectors work on it.
            html_response = HtmlResponse(url=self.driver.current_url,
                                         body=page_source, encoding='utf-8')
            rows = html_response.css('button')
            for row in rows:
                if 'USD' in (row.css('div::text').extract_first() or ''):
                    event_datetime = row.css('button::attr(event_timestamp)').extract_first()
                    event_datetime = datetime.strptime(event_datetime, '%Y-%m-%d %H:%M:%S')
                    date = event_datetime.strftime('%m/%d/%Y')
                    event_time = event_datetime.strftime('%H:%M')
                    event_name = row.css('.left.event::text').extract_first().strip()
                    actual = row.css('.act span::text').extract()
                    if actual:
                        actual = actual[1].strip()
                    if actual:
                        actual = re.sub(',', '', actual)
                        actual = re.search('[-0-9.]+', actual).group()
                    else:
                        actual = None
                    forecast = row.css('.fore span::text').extract()
                    if forecast:
                        forecast = forecast[1].strip()
                    if forecast:
                        forecast = re.sub(',', '', forecast)
                        forecast = re.search('[-0-9.]+', forecast).group()
                    else:
                        forecast = None
                    prev = row.css('.prev span::text').extract()
                    if prev:
                        prev = prev[1].strip()
                    if prev:
                        prev = re.sub(',', '', prev)
                        prev = re.search('[-0-9.]+', prev).group()
                    else:
                        prev = None
                    new_row = [date, event_time, event_name, actual, forecast, prev]
                    if new_row not in self.output:
                        self.output.append(new_row)
        if self.not_parsed_pages == 0:
            item = BloombergItem()
            item['data'] = self.output
            yield item
        self.driver.quit()  # quit() closes every window; a separate close() is redundant
With this, I get the error below.
2024-06-09 04:55:54 [scrapy.middleware] INFO: Enabled item pipelines:
['bloomberg.pipelines.BloombergPipeline']
2024-06-09 04:55:54 [scrapy.core.engine] INFO: Spider opened
2024-06-09 04:55:54 [scrapy.extensions.logstats] INFO: Crawled 0 pages (at 0 pages/min), scraped 0 items (at 0 items/min)
2024-06-09 04:55:54 [historical_data] INFO: Spider opened: historical_data
2024-06-09 04:55:54 [scrapy.extensions.telnet] INFO: Telnet console listening on 127.0.0.1:6023
2024-06-09 04:55:55 [scrapy.downloadermiddlewares.redirect] DEBUG: Redirecting (301) to <GET https://econcal.forexprostools.com/> from <GET http://econcal.forexprostools.com/>
2024-06-09 04:55:55 [scrapy.core.engine] DEBUG: Crawled (403) <GET https://econcal.forexprostools.com/> (referer: None)
2024-06-09 04:55:55 [scrapy.spidermiddlewares.httperror] INFO: Ignoring response <403 https://econcal.forexprostools.com/>: HTTP status code is not handled or not allowed
2024-06-09 04:55:55 [scrapy.core.engine] INFO: Closing spider (finished)
Here chromedriver launches without ever navigating to a URL and then shuts down. It looks to me like the crawler fails somewhere between __init__ and parse. More precisely, I occasionally get correct results, but most of the time the URL is never fetched. Any help would be appreciated.
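From the log line "Ignoring response <403 ...>: HTTP status code is not handled or not allowed", it seems Scrapy's HttpError middleware drops the 403 before parse() is ever invoked. A minimal sketch of what I mean, using the standard handle_httpstatus_list spider attribute (I haven't verified this fixes the underlying block, only that it should let parse() run):

class HistoricalDataSpider(scrapy.Spider):
    name = 'historical_data'
    start_urls = ['http://econcal.forexprostools.com/']
    # Let 403 responses reach parse() instead of being filtered out
    # by HttpErrorMiddleware.
    handle_httpstatus_list = [403]

    def parse(self, response):
        # Scrapy's response body is the blocked page, so ignore it and
        # fetch the real page with Selenium instead.
        self.driver.get(response.url)
        ...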
Since you're already using Selenium, you could try integrating with a scraping browser to get around this: https://medium.com/python-in-plain-english/getting-started-with-bright-datas-web-scraping-browser-18892624bbc9
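The integration in that article essentially amounts to pointing Selenium's Remote WebDriver at Bright Data's Scraping Browser endpoint instead of launching a local chromedriver, so the unblocking happens on their side. A rough sketch; the endpoint and credentials below are placeholders you'd replace with your own zone details:

from selenium.webdriver import Remote, ChromeOptions
from selenium.webdriver.chromium.remote_connection import ChromiumRemoteConnection

# Placeholder -- substitute your Bright Data zone's username/password.
SBR_WEBDRIVER = 'https://USER:PASS@brd.superproxy.io:9515'

def fetch_page_source(url):
    # Drive a remote browser session instead of a local chromedriver;
    # the remote side deals with blocks like the 403 above.
    sbr_connection = ChromiumRemoteConnection(SBR_WEBDRIVER, 'goog', 'chrome')
    with Remote(sbr_connection, options=ChromeOptions()) as driver:
        driver.get(url)
        return driver.page_source

In the spider, this would take the place of webdriver.Chrome() in __init__, with parse calling fetch_page_source(response.url) instead of driver.get().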