我正在抓取一个网站，该网站根据我抓取的项目类型展示不同的表格行。我已经有一个可以正常工作的爬虫（见下面第一段代码），但我希望能在 start_requests(self) 中从数据库获取类型，并把它传递给解析函数。我有 11 种不同的类型：页面上某个表格的行数因类型而异，而页面上其他表格的行则完全相同。我在第二段代码中展示了我的尝试。
如何完成从 start_requests 中的数据库获取类型并将其发送到解析?
第一个区块代码
# -*- coding: utf-8 -*-
from scrapy.spiders import Spider
from scrapy.selector import Selector
from scrapeInfo.items import infoItem
import pyodbc
class scrapeInfo(Spider):
    """Scrape one detail page per InfoID stored in dbo.infostage.

    start_requests() pulls the ids from SQL Server and yields one request
    each; parse() extracts the six table fields into an infoItem.
    """

    name = "info"
    # allowed_domains entries must be bare domain names — a scheme prefix
    # such as "http://" never matches and breaks the offsite middleware.
    allowed_domains = ["www.nevermind.com"]
    start_urls = []

    def start_requests(self):
        """Read every InfoID from the database and yield a request for it."""
        from scrapy import Request  # local import; file only imports Spider/Selector

        self.conn = pyodbc.connect(
            'DRIVER={SQL Server};SERVER=server;DATABASE=dbname;UID=user;PWD=password')
        self.cursor = self.conn.cursor()
        self.cursor.execute("SELECT InfoID FROM dbo.infostage")
        for row in self.cursor.fetchall():
            # str() guards against InfoID being a numeric column; replaces
            # the deprecated make_requests_from_url helper.
            yield Request('http://www.nevermind.com/info/' + str(row[0]),
                          callback=self.parse)

    def parse(self, response):
        """Extract one infoItem per matched table on the detail page."""
        infodata = Selector(response).xpath('div[2]/div[2]')  # item base path
        # The record id sits between the fixed 29-char URL prefix and the
        # trailing character; ''.join(response.url) was a no-op and is dropped.
        info_id = response.url[29:-1]
        for info in infodata:
            item = infoItem()
            item['id'] = info_id
            # field, field2 … field6 map to table rows tr[1] … tr[6].
            for row_no in range(1, 7):
                key = 'field' if row_no == 1 else 'field%d' % row_no
                item[key] = info.xpath('tr[%d]/td[2]/p/b/text()' % row_no).extract()
            yield item
第二个区块代码
这不起作用,但我不确定如何让它工作。我要创建一个全局列表、一个新函数吗?
# -*- coding: utf-8 -*-
from scrapy.spiders import Spider
from scrapy.selector import Selector
from scrapeInfo.items import infoItem
import pyodbc
class scrapeInfo(Spider):
    """Variant spider: each record has one of 11 types.

    The type is read from the database in start_requests() and forwarded to
    parse() through request.meta, so parse() can decide which set of table
    rows to scrape for this record.
    """

    name = "info"
    allowed_domains = ["www.nevermind.com"]  # bare domain, no scheme
    start_urls = []

    def start_requests(self):
        """Read (InfoID, type) pairs and yield one request per record."""
        from scrapy import Request  # local import; file only imports Spider/Selector

        self.conn = pyodbc.connect(
            'DRIVER={SQL Server};SERVER=server;DATABASE=dbname;UID=user;PWD=password')
        self.cursor = self.conn.cursor()
        self.cursor.execute("SELECT InfoID, type FROM dbo.infostage")
        for row in self.cursor.fetchall():
            # request.meta carries the row's type to the parse() callback —
            # this answers "how do I send this value to the parse function?".
            yield Request('http://www.nevermind.com/info/' + str(row[0]),
                          callback=self.parse,
                          meta={'type': row[1]})

    def parse(self, response):
        """Scrape the fields appropriate for this record's type."""
        info_type = response.meta['type']  # value attached in start_requests()
        infodata = Selector(response).xpath('div[2]/div[2]')  # item base path
        info_id = response.url[29:-1]  # id between 29-char prefix and last char
        for info in infodata:
            item = infoItem()
            item['id'] = info_id
            # Original "if type = 'type1'" was a SyntaxError; use == and the
            # meta value instead of the unreachable local from start_requests.
            if info_type == 'type1':
                # Type-1 pages expose all six rows of the variable table.
                item['field'] = info.xpath('tr[1]/td[2]/p/b/text()').extract()
                item['field2'] = info.xpath('tr[2]/td[2]/p/b/text()').extract()
                item['field3'] = info.xpath('tr[3]/td[2]/p/b/text()').extract()
                item['field4'] = info.xpath('tr[4]/td[2]/p/b/text()').extract()
                item['field5'] = info.xpath('tr[5]/td[2]/p/b/text()').extract()
                item['field6'] = info.xpath('tr[6]/td[2]/p/b/text()').extract()
            else:
                # Remaining types only expose the even rows.
                item['field2'] = info.xpath('tr[2]/td[2]/p/b/text()').extract()
                item['field4'] = info.xpath('tr[4]/td[2]/p/b/text()').extract()
                item['field6'] = info.xpath('tr[6]/td[2]/p/b/text()').extract()
            yield item
感谢大家的帮助和见解!
您可以通过 request.meta 把额外数据传递给回调函数：
def make_requests_from_url(self, url, type, callback):
    """Build a Request for *url* that hands *type* to *callback* via meta."""
    return scrapy.Request(url, callback, meta={'type': type})
然后在 parse 中，您可以通过 response.meta['type'] 访问该 type 值。