Scrapy multiple callbacks problem

Problem description

Hi, I wrote this spider to get the new products on the start URL. However, I'm having trouble writing a second callback that parses each new product and then returns to the normal parse callback, where it continues scraping for new products again. This is what I have now:

    def parse(self, response):
        products = Selector(response).xpath(
            '//div[@class="browsing-product-list"]//figure[contains(@class,"browsing-product-item")]')

        for product in products:
            item = StartItem()
            item['name'] = product.xpath('.//a/figcaption/p[2]/text()').extract()[0]
            item['link'] = product.xpath('.//meta[3]/@content').extract()[0]
            # New callback method to parse the new url found, not sure how to implement
            yield Request(StartURL, callback=self.parse, dont_filter=True, priority=70)
            ru = scrapy.Request(url=response.urljoin(item['link']), callback=self.parseProduct)
            ru.meta['item'] = item
            yield ru

    def parseProduct(self, response):
        item = response.meta['item']
        imageUrls = response.xpath('id("img")/option/text()').extract()
        item['image_urls'] = imageUrls
        yield item

So any help with the following would be appreciated:

    SuperURL = "https://www.ssense.com/en-us/men/sneakers"

    class SuperSpider(Spider):
        name = "SuperSpider"
        allowed_domains = ["randomtester.com"]
        start_urls = [SuperURL]

        def __init__(self):
            logging.critical("starting superspider.")

        def parse(self, response):
            products = Selector(response).xpath('//div[@class="browsing-product-list"]//figure[contains(@class,"browsing-product-item")]')

            for product in products:
                item = SuperItem()
                item['name'] = product.xpath('.//a/figcaption/p[2]/text()').extract()[0]
                item['link'] = product.xpath('.//meta[3]/@content').extract()[0]
                # Not sure how to implement this to request the new url to parse
                ru = scrapy.Request(url=response.urljoin(item['link']), callback=self.parseProduct)
                ru.meta['item'] = item
                yield ru
            yield Request(SuperURL, callback=self.parse, dont_filter=True, priority=70)

        def parseProduct(self, response):
            item = response.meta['item']
            imageUrls = response.xpath('id("size")/option/text()').extract()
            item['image_urls'] = imageUrls
            yield item
Tags: python, web-scraping, scrapy
1 Answer

I took your code and created a standalone script that runs without a Scrapy project. It works without problems, so I don't know what issue you are seeing.

It now even downloads the images, because your version was grabbing the wrong data: `id("size")/option/text()` returns the size options, not image URLs. On this page the image URLs only exist inside the JSON assigned to `window.INITIAL_STATE` in an inline script, which is where `parseProduct` below reads them.
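In outline, the flow you want is: yield one request per product with the half-built item carried in `meta`, finish the item in the second callback, then re-queue the listing page after the loop. Here is a stripped-down sketch of just that pattern (placeholder selectors and a plain dict instead of an Item class):

    import scrapy

    class ListingSpider(scrapy.Spider):
        name = "listing"
        start_urls = ["https://example.com/products"]  # placeholder listing URL

        def parse(self, response):
            for product in response.xpath('//figure[contains(@class, "product")]'):
                item = {'name': product.xpath('.//p/text()').extract_first()}
                link = product.xpath('.//a/@href').extract_first()
                # carry the half-built item to the second callback via request meta
                yield scrapy.Request(response.urljoin(link),
                                     callback=self.parse_product,
                                     meta={'item': item})
            # re-queue the listing itself once per pass; dont_filter=True lets
            # the same URL through the duplicate filter
            yield scrapy.Request(response.url, callback=self.parse,
                                 dont_filter=True, priority=70)

        def parse_product(self, response):
            item = response.meta['item']
            item['image_urls'] = response.xpath('//img/@src').extract()
            yield item

The complete standalone script below does exactly this against the real page: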

from scrapy import Spider, Request
from scrapy.selector import Selector
import logging
import json

SuperURL = "https://www.ssense.com/en-us/men/sneakers"

class SuperSpider(Spider):

    name = "SuperSpider"

    start_urls = [SuperURL]

    def __init__(self):
        logging.critical("starting superspider.")

    def parse(self, response):

        products = Selector(response).xpath('//div[@class="browsing-product-list"]//figure[contains(@class,"browsing-product-item")]')
        #products = response.xpath('//figure[@class="browsing-product-item"]')

        for product in products:
            #item = SuperItem()
            item = {}
            item['name'] = product.xpath('.//a/figcaption/p[2]/text()').extract()[0]
            item['link'] = product.xpath('.//meta[3]/@content').extract()[0]
            # Not sure how to implement this to request the new url to parse  
            ru = Request(url=response.urljoin(item['link']), callback=self.parseProduct)
            ru.meta['item'] = item
            yield ru
            #yield Request(url=response.urljoin(item['link']), callback=self.parseProduct, meta={'item': item})

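        # re-queue the listing page after the loop so new products keep being picked up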
        yield Request(SuperURL, callback=self.parse, dont_filter=True, priority=70)

    def parseProduct(self, response):

        item = response.meta['item']

        all_scripts = response.xpath('//script/text()').extract()

        for script in all_scripts:
            if 'window.INITIAL_STATE=' in script:
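                # slice off the 21-character "window.INITIAL_STATE=" prefix, leaving plain JSON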
                images = json.loads(script[21:])["products"]["current"]["images"]
                item['image_urls'] = [x.replace('__IMAGE_PARAMS__', 'b_white,c_lpad,g_center,h_960,w_960/c_scale,h_680/f_auto,dpr_1.0') for x in images]

        yield item

# --- it runs without a project and saves in `output.csv` ---

from scrapy.crawler import CrawlerProcess

c = CrawlerProcess({
    'USER_AGENT': 'Mozilla/5.0',

    # save in file as CSV, JSON or XML
    'FEED_FORMAT': 'csv',     # csv, json, xml
    'FEED_URI': 'output.csv',

    # download images and convert to JPG
    # it needs items that carry an 'image_urls' field (yielded in `parseProduct()`)
    'ITEM_PIPELINES': {'scrapy.pipelines.images.ImagesPipeline': 1},
    'IMAGES_STORE': '.',
})
c.crawl(SuperSpider)
c.start()
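The part worth noting is `parseProduct`: the page stores its product data as JSON assigned to `window.INITIAL_STATE` inside an inline `<script>`, so the callback slices off that 21-character prefix and parses the remainder with `json.loads`. A self-contained sketch of the same technique on made-up HTML (the structure is hypothetical, purely to illustrate the slicing and parsing):

    import json
    from scrapy.selector import Selector

    # made-up fragment mimicking an inline state script
    html = '''
    <html><body>
    <script>var other = 1;</script>
    <script>window.INITIAL_STATE={"products": {"current": {"images": ["a.jpg", "b.jpg"]}}}</script>
    </body></html>
    '''

    for script in Selector(text=html).xpath('//script/text()').extract():
        if 'window.INITIAL_STATE=' in script:
            # equivalent to script[21:] in the spider above
            state = json.loads(script[len('window.INITIAL_STATE='):])
            print(state["products"]["current"]["images"])  # ['a.jpg', 'b.jpg']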

I tested this with Scrapy 1.4.0 on Python 3.6.2, Linux Mint 18.2:

import sys
print('Python:',  sys.version)

import scrapy
print('Scrapy:', scrapy.__version__)