嗨我想在我的脚本中刮掉2个不同的域我已经尝试了我的if语句,但我似乎它不起作用,请问有什么想法吗?
这是我的代码
class SalesitemSpiderSpider(scrapy.Spider):
name = 'salesitem_spider'
allowed_domains = ['www2.hm.com']
start_urls = [
'https://www2.hm.com/en_us/sale/shopbyproductladies/view-all.html?sort=stock&image-size=small&image=stillLife&offset=0&page-size=9999',
'https://www.forever21.com/us/shop/catalog/category/f21/sale',
]
def parse_start_url(response):
if (response.url == 'https://www2.hm.com/en_us/sale/shopbyproductladies/view-all.html?sort=stock&image-size=small&image=stillLife&offset=0&page-size=9999'):
parse_1(response)
if (response.url == 'https://www.forever21.com/us/shop/catalog/category/f21/sale'):
parse_2(response)
def parse_1(self, response):
for product_item in response.css('li.product-item'):
item = {
'title': product_item.css('h3.item-heading a.link::text').extract_first(),
'regular-price': product_item.css('strong.item-price span.price.regular::text').extract_first(),
'sale-price': product_item.css('strong.item-price span.price.sale::text').extract_first(),
'photo-url': product_item.css('.image-container img::attr(data-src)').extract_first(),
'description-url': "https://www2.hm.com/" + product_item.css('h3.item-heading a::attr(href)').extract_first(),
}
yield item
def parse_2(self, response):
#Some code getting item on domain 2
请帮忙谢谢
检查你的allowed_domains
变量。您应该添加新域名,例如['www2.hm.com', 'forever21.com']
或删除它。你也没有parse
功能。
我可以假设用start_urls
删除你的if
并使用start_requests
代替。您的代码将更具可读性。
import scrapy
class SalesitemSpiderSpider(scrapy.Spider):
name = 'salesitem_spider'
allowed_domains = ['www2.hm.com', 'forever21.com']
def start_requests(self):
urls = (
(self.parse_1, 'https://www2.hm.com/en_us/sale/shopbyproductladies/view-all.html?sort=stock&image-size=small&image=stillLife&offset=0&page-size=9999'),
(self.parse_2, 'https://www.forever21.com/us/shop/catalog/category/f21/sale'),
)
for cb, url in urls:
yield scrapy.Request(url, callback=cb)
def parse_1(self, response):
print 111111111
def parse_2(self, response):
print 2222222222