在此页面上,我需要从所有选项卡(个人资料、评论、电话号码和路线)获取信息。
wellness.py
def profile(self, response):
    """Yield one dict of practitioner details read from the profile page.

    "Services", "Education" and "Training" are taken from the text nodes of
    the first ``<span>`` sibling that follows the span whose text contains
    that heading. Every other field comes from a single CSS selector.
    """

    def section_values(heading):
        # Locate the heading span, then read the span right after it.
        heading_span = response.xpath(f'.//span[contains(text(),"{heading}")]')
        return heading_span.xpath('following-sibling::span[1]/text()').extract()

    # The address is spread over several spans; strip each piece and join.
    address_fragments = response.css('.office-address span::text').getall()
    address = ' '.join(fragment.strip() for fragment in address_fragments)

    record = {
        'First and Last name': response.css('h1::text').get(),
        'About': response.css('.listing-about::text').get(),
        'Services': section_values('Services'),
        'Primary Specialty': response.css('.normal::text').get(),
        'Address': address,
        'Practice': response.css('.years-in-service::text').get(),
        'Education': section_values('Education'),
        'Training': section_values('Training'),
        'Consumer Feedback': response.css('.item-rating-container a::text').get(),
    }
    yield record
每个选项卡都会加载一个单独的页面,因此您必须先从第一个页面收集所需的数据,再请求第二个页面获取数据,然后请求第三个页面。我会这样做。请注意,页面之间的链接代码是正确的,但您必须自行为每个页面上的数据点编写选择器。
def profile(self, response):
    """Start an item on the profile page and follow the reviews tab.

    The item is passed to ``parse_reviews`` through the request's ``meta``
    so that data from all tabs ends up in one item.

    Yields:
        A ``scrapy.Request`` for the reviews page, or the partially filled
        item when no reviews link is found on the page.
    """
    # The item has to be created here; the later callbacks only read it
    # back from response.meta.
    item = {}
    # TODO: replace the placeholder XPath with the real profile selector.
    item["data1"] = response.xpath('//xpath').get()
    # Get first link for reviews
    review_link = response.css('#reviews_tab a::attr(href)').get()
    if review_link is None:
        # Without a link there is nothing to follow; keep what we have
        # instead of building a request from None.
        yield item
        return
    yield scrapy.Request(
        response.urljoin(review_link),
        callback=self.parse_reviews,
        meta={'item': item},
    )
def parse_reviews(self, response):
    """Add review data to the item and follow the directions tab.

    Reads the item stored under ``response.meta['item']``, fills in
    ``review_data`` and hands the item on to ``parse_directions``.

    Yields:
        A ``scrapy.Request`` for the directions page, or the item itself
        when no directions link is found on the page.
    """
    item = response.meta['item']
    # TODO: replace the placeholder XPath with the real reviews selector.
    # The selector must be called; storing ``response.xpath`` itself would
    # put a bound method into the item.
    item["review_data"] = response.xpath('//xpath').get()
    # ``::attr(href)`` needs the double colon to select the attribute value.
    directions_link = response.css('#directions_tab a::attr(href)').get()
    if directions_link is None:
        # Nothing further to follow; emit what has been collected so far.
        yield item
        return
    yield scrapy.Request(
        response.urljoin(directions_link),
        callback=self.parse_directions,
        meta={'item': item},
    )
def parse_directions(self, response):
    """Add directions data to the item and yield the finished item.

    Reads the item stored under ``response.meta['item']``, fills in
    ``directions`` and yields it; no further request is made.
    """
    item = response.meta['item']
    # TODO: replace the placeholder XPath with the real directions selector.
    # The selector must be called; storing ``response.xpath`` itself would
    # put a bound method into the item.
    item['directions'] = response.xpath('//xpath').get()
    yield item