从网页上刮取特定元素

问题描述 投票:0回答:1

大家好,以下是链接:https://www.motionindustries.com/productDetail.jsp?sku=00097433

除了以下几项之外,其他内容我都能抓取到:

`Mi Item #:`、`Mfr Description:`(制造商说明)、`Specifications`(规格)

使用 Scrapy 代码,我能够抓取页面上的大部分字段,但在抓取上面提到的这些元素时遇到了问题:这些字段我什么都取不到。

以下是我正在使用的蜘蛛:

https://www.motionindustries.com/productDetail.jsp?sku=00097433
python-3.x web-scraping scrapy
1个回答
0
投票

# NOTE(review): the page scrape fused the answer's opening words
# ("你可以申请" -- "You can apply", continued by "css selector" further down)
# onto the front of this snippet and flattened the spider onto two lines.
# The indentation below is reconstructed from the statement order.

# XPath of the breadcrumb <p> element(s) on the product page.
_BREADCRUMB_XPATH = '//*[@id="product-catalog-search-form"]/div[2]/div[1]/nav/p'

# Ordered (old, new) substitutions applied to ``str()`` of the collected
# breadcrumb link texts.  The order is significant and some entries repeat on
# purpose: a later pass can match text that an earlier pass joined together.
_LINK_TRAIL_CLEANUP = (
    ("[['", ""),
    ("'], '>', ['", ">"),
    (' \\n ', ""),
    ("\\n ", ""),
    (",", ""),
    ("'>' []", ""),
    ("']", ""),
    (" '>", ""),
    (" '>", ""),
)

# Same idea for the trailing (non-link) breadcrumb segment held in <span><b>.
_SPAN_TRAIL_CLEANUP = (
    ("[['", ""),
    ("'], '>', ['", ">"),
    (' \\n ', ""),
    (" ", ""),
    (",", ""),
    ("'", ""),
    ("]", ""),
    ("[", ""),
    ("[]", ""),
)

# (item field, XPath, fallback when the XPath matches nothing), in the order
# the fields are filled in.
# NOTE(review): the answer on this page recommends the `.item-property-value`
# CSS selector for the "Mi Item #" value; the XPath from the question is kept
# here for 'MIitem' -- verify it against the live page.
_FIELD_QUERIES = (
    (
        'Maufacturer',
        '//*[@id="product-catalog-search-form"]/div[2]/div[1]/div[1]/div[2]/div[1]/img/@alt',
        ' No Manufacturer name',
    ),
    (
        'ItemTitle',
        '//*[@id="product-catalog-search-form"]/div[2]/div[1]/div[1]/div[2]/h1/text()',
        ' No ItemTitle name',
    ),
    (
        'shortdesc',
        '//*[@id="product-catalog-search-form"]/div[2]/div[1]/div[1]/div[2]/h2/text()',
        "No Short desc",
    ),
    (
        'MIitem',
        '//*[@id="productReviewsToast"]/div[2]/div[1]/div[1]/div[2]/table/tbody/tr/td[2]/text()',
        "No MI Item",
    ),
    (
        'price',
        '//*[@id="product-catalog-search-form"]/div[2]/div[1]/div[1]/div[3]/div/div/div[1]/div/text()',
        "No Price",
    ),
    (
        'Availability',
        '//*[@id="product-catalog-search-form"]/div[2]/div[1]/div[1]/div[3]/div/div/div[2]/text()',
        "No Availability",
    ),
    (
        'img',
        "//*[@class='mi-product-image']/@src",
        "No Img",
    ),
)


def _replace_all(text, substitutions):
    """Apply each ``(old, new)`` pair to *text* in order and return the result."""
    for old, new in substitutions:
        text = text.replace(old, new)
    return text


class ProjectnameSpiderSpider(scrapy.Spider):
    """Scrape product detail pages listed, one URL per line, in a text file.

    For every page a ``MotionItem`` is yielded with the page URL, a
    ">"-separated breadcrumb category, and one value per entry of
    ``_FIELD_QUERIES`` (or that entry's fallback text when nothing matches).
    """

    name = 'motion'
    allowed_domains = ['www.motionindustries.com']

    def start_requests(self):
        """Yield one request per non-blank line of the URL list file."""
        # Read inside ``with`` so the file handle is closed before crawling.
        with open('C:/Users/Admin/motion/motion/sachin.txt') as url_file:
            urls = [line.strip() for line in url_file]
        for url in urls:
            # A blank line would otherwise be passed to Request as an empty URL.
            if url:
                yield scrapy.Request(url)

    def parse(self, response):
        """Build and yield a single ``MotionItem`` for *response*."""
        item = MotionItem()
        item['url'] = response.url
        try:
            item['category'] = self._breadcrumb(response)
        except IndexError:
            # Fallback kept from the original snippet.
            item['category'] = "No Category"
        for field, query, default in _FIELD_QUERIES:
            item[field] = self._first_value(response, query, default)
        yield item

    @staticmethod
    def _first_value(response, query, default):
        """Return the first XPath match, stripped, or *default* if none."""
        matches = response.xpath(query).extract()
        return matches[0].strip() if matches else default

    @staticmethod
    def _breadcrumb(response):
        """Return the breadcrumb trail as a single ">"-separated string."""
        crumbs = response.xpath(_BREADCRUMB_XPATH)

        # Text of the 1st..9th <a> child, each followed by a ">" marker.
        links = []
        for position in range(1, 10):
            query = "a[" + str(position) + "]" + "/text()"
            for crumb in crumbs:
                links.append(crumb.xpath(query).extract())
                links.append(">")

        # Bold text of the 1st..9th <span> child (the current, unlinked page).
        spans = []
        for position in range(1, 10):
            query = "span[" + str(position) + "]" + "/b/text()"
            for crumb in crumbs:
                spans.append(crumb.xpath(query).extract())

        link_trail = _replace_all(str(links), _LINK_TRAIL_CLEANUP)
        span_trail = _replace_all(str(spans), _SPAN_TRAIL_CLEANUP)
        return link_trail + ">" + span_trail

css selector

用于 `Mi Item #:` 和 `Mfr Description:`(制造商说明),然后分别提取第一个和第二个位置的值。

EG

.item-property-value

对于规格(Specifications),您可以使用

.css('.item-property-value').getall()[0]

然后迭代列表。

样品:

EG

#specifications tr

`.css('#specifications tr').getall()`

`.get()` 总是返回单个结果,`.getall()` 总是返回所有提取结果的列表。

© www.soinside.com 2019 - 2024. All rights reserved.