I am building a crawler. I want it to go through every page available on the site and [i] populate a number of data fields for each product, and [ii] for each product, drill into the corresponding product URL and populate some additional data fields. I want all the data for each product to end up in the same {}. Instead, the crawler currently does [i] and then [ii] separately, so the fields from part [ii] end up in a separate {}.

I want to somehow append the data from [ii] to [i]. Passing the item along with request.meta['item'] = item looks like it should work, but I have not managed to get it working yet.
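In outline, what I am trying to achieve is something like this (a minimal sketch, not my real code; parse_listing, the selectors, and detail_url are placeholders):

def parse_listing(self, response):
    item = CrawlerItem()
    # fill the listing-page fields first
    item['title'] = response.xpath('//h1/text()').extract_first()
    # hand the partially filled item to the detail callback via meta
    yield Request(url=detail_url, callback=self.parse_detail, meta={'item': item})

def parse_detail(self, response):
    # same item object, handed over by the previous request
    item = response.meta['item']
    # keep filling detail-page fields, then emit the completed item
    item['telephone'] = response.xpath('//div[@id="contact"]//text()').extract_first()
    yield item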
I have the following code:
# -*- coding: utf-8 -*-
import scrapy
import re
from scrapy import Spider
from scrapy.spiders import CrawlSpider, Rule
from scrapy.linkextractors import LinkExtractor
from scrapy.selector import Selector
from scrapy.http import Request
from maxine.items import CrawlerItem

class Crawler1Spider(CrawlSpider):
    name = "crawler1"
    allowed_domains = ["website.com"]
    start_urls = (
        'starturl.com',
    )

    rules = [
        # visit each page
        Rule(LinkExtractor(allow=(), restrict_xpaths=('//a[@class="listnavpagenum"]')), callback='parse_item', follow=True),
        # click on each product link
        Rule(LinkExtractor(allow=(), restrict_xpaths=('//a[@class="exhib_status exhib_status_interiors"]')), callback='parse_detail', follow=True),
    ]

    def parse_item(self, response):
        sel = Selector(response)
        elements = sel.xpath('//div[@class="ez_listitem_wrapper"]')
        items = []
        results = []
        n = 0
        for element in elements:
            item = CrawlerItem()
            n = n + 1
            # work out how to put images into image folder
            item['title'] = element.css('a.exhib_status.exhib_status_interiors').xpath('text()').extract_first()
            item['title_code'] = element.xpath('.//div[@class="ez_merge8"]/text()').extract_first()
            item['item_url'] = element.xpath('//div[@class="ez_merge4"]/a/@href').extract_first()
            item['count'] = n
            yield item
            #items.append(item)
            #return items

    def parse_detail(self, response):
        item = CrawlerItem()
        item['telephone'] = response.xpath('//div[@id="ez_entry_contactinfo"]//text()').re('[0-9]{4,}\s*[0-9]{4,}')
        item['website'] = response.xpath('//div[@id="ez_entry_contactinfo"]//text()').re('(?:http://)?www.[a-z0-9\/?_\- ]+.[0-9a-z]+')
        yield item
Any suggestions on how to get all the data for each product into a single {} would be much appreciated.

UPDATE 15/11/20:

I have modified the code as follows:
# -*- coding: utf-8 -*-
import scrapy
import re
from scrapy import Spider
from scrapy.spiders import CrawlSpider, Rule
from scrapy.linkextractors import LinkExtractor
from scrapy.selector import Selector
from scrapy.http import Request
from maxine.items import CrawlItem

class Crawler1Spider(CrawlSpider):
    name = "test"
    allowed_domains = ["website.com"]
    start_urls = (
        'starturl.com',
    )

    rules = [
        Rule(LinkExtractor(allow=(), restrict_xpaths=('//a[@class="listnavpagenum"]')), callback='parse_item', follow=True),
    ]

    def parse_item(self, response):
        item = CrawlItem()
        sel = Selector(response)
        elements = sel.xpath('//div[@class="ez_listitem_wrapper"]')
        items = []
        n = 0
        for element in elements:
            n = n + 1
            # work out how to put images into image folder
            #item['image_urls'] = selector.xpath('//a[@class="exhib_status exhib_status_interiors"]/img/@src').extract()
            item['title'] = element.css('a.exhib_status.exhib_status_interiors').xpath('text()').extract_first()
            item['title_code'] = element.xpath('.//div[@class="ez_merge8"]/text()').extract_first()
            item['item_url'] = element.xpath('//div[@class="ez_merge4"]/a/@href').extract_first()
            item['count'] = n
            item_detail_url = item['item_url'] = element.xpath('//div[@class="ez_merge4"]/a/@href').extract_first()
            # crawl the item and pass the item to the following request with *meta*
            yield Request(url=item_detail_url, callback=self.parse_detail, meta=dict(item=item))

    def parse_detail(self, response):
        # get the item from the previous passed meta
        item = response.meta['item']
        # keep populating the item
        item['telephone'] = response.xpath('//div[@id="ez_entry_contactinfo"]//text()').re('[0-9]{4,}\s*[0-9]{4,}')
        item['website'] = response.xpath('//div[@id="ez_entry_contactinfo"]//text()').re('(?:http://)?www.[a-z0-9\/?_\- ]+.[0-9a-z]+')
        yield item
The data now ends up in the same {}, but the bot only extracts the data of the last item on each page. Any further suggestions?
Don't rely on rules here, because each request is independent by the time it reaches the site you want to crawl. You need to define the crawling behaviour yourself, starting from start_requests:
def start_requests(self):
    yield Request(url=myinitialurl, callback=self.parse)

def parse(self, response):
    # crawl the initial page and then do something with that info
    yield Request(url=producturl, callback=self.parse_item)

def parse_item(self, response):
    item = CrawlerItem()
    # crawl the item and pass the item to the following request with *meta*
    yield Request(url=item_detail_url, callback=self.parse_detail, meta=dict(item=item))

def parse_detail(self, response):
    # get the item from the previous passed meta
    item = response.meta['item']
    # keep populating the item
    yield item
UPDATE: instantiate item = CrawlItem() inside the for loop in parse_item.
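Moving the instantiation into the loop gives every product its own item, so each detail request carries its own copy instead of all of them sharing (and overwriting) a single object. A minimal sketch of the corrected loop, reusing the selectors from the question; the response.urljoin call and the relative .//div XPath for item_url are my additions, assuming the extracted hrefs may be relative and that the URL should come from the current row rather than the first match on the page:

def parse_item(self, response):
    elements = response.xpath('//div[@class="ez_listitem_wrapper"]')
    for n, element in enumerate(elements, start=1):
        # fresh item per product, so the meta hand-off is not shared state
        item = CrawlItem()
        item['title'] = element.css('a.exhib_status.exhib_status_interiors').xpath('text()').extract_first()
        item['title_code'] = element.xpath('.//div[@class="ez_merge8"]/text()').extract_first()
        # .// keeps the lookup inside this row; '//' would match page-wide
        item['item_url'] = element.xpath('.//div[@class="ez_merge4"]/a/@href').extract_first()
        item['count'] = n
        yield Request(url=response.urljoin(item['item_url']),
                      callback=self.parse_detail, meta=dict(item=item))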