我正在尝试从此页面中抓取数据,特别是有关产品的所有信息。
使用浏览器的 Inspect 工具,我发现所有产品的数据都来自 JSON 文件;它是对发送到此 URL 的 API 的 GET 请求的响应。查看请求标头,我找到了订阅密钥(即
Ocp-Apim-Subscription-Key
)及其值(即 5e790236c84e46338f4290aa1050cdd4
)。
我尝试使用 Python 的 requests 模块自己发送 GET 请求来获取此 JSON 文件,但得到的响应是一个包含错误消息的 JSON 文件 -
"appMsg": "Search encountered a problem. Please try again OSSR0033-R"
。
看来我能够连接到 API,但另一端的程序无法找到产品数据 JSON 文件。我假设失败是由于我的 GET 请求中的错误造成的。 如果这个假设成立,我该如何正确复制请求以便收到预期的输出?
import requests
import json
# query url
def request_from_api(url, url_params, req_headers, cookies=None):
    """Send a GET request to *url* and return the ``requests`` response.

    Args:
        url: Endpoint to query.
        url_params: Mapping of query-string parameters.
        req_headers: Mapping of HTTP request headers.
        cookies: Optional mapping of cookie names to values. Defaults to
            ``None`` (no cookies), so existing three-argument calls keep
            working.

    Returns:
        The response object returned by ``requests.get``.
    """
    # The script calls this helper with a fourth (cookies) argument; without
    # this parameter that call fails with a TypeError before any request is
    # sent.
    response = requests.get(
        url,
        params=url_params,
        headers=req_headers,
        cookies=cookies,
    )
    return response
def format_cookies(cookie_pairs):
    """Convert a raw ``Cookie`` header string into a name -> value dict.

    Args:
        cookie_pairs: Semicolon-separated name/value pairs, e.g.
            ``"cook1=value1; cook2=val2"``.

    Returns:
        A dict mapping each cookie name to its full value. A pair without
        ``=`` maps the name to an empty string; empty segments are skipped.
    """
    formatted_pairs = {}
    for pair in cookie_pairs.split(';'):
        pair = pair.strip()
        if not pair:
            # Tolerate a trailing "; " or an empty input string.
            continue
        # Split on the FIRST '=' only: cookie values may themselves contain
        # '=' (base64 padding, nested key=value payloads), and splitting on
        # every '=' would silently truncate them.
        name, _, value = pair.partition('=')
        formatted_pairs[name] = value
    return formatted_pairs
if __name__ == '__main__':
    # URL that the product-search API is located at.
    api_url = "https://www.jewelosco.com/abs/pub/xapi/pgmsearch/v1/search/products?"

    # Query-string parameters for api_url, copied from the captured request.
    url_params = {
        # NOTE(review): this id was captured from a single browser request;
        # it may need to be regenerated for every call - confirm.
        "request-id": "1771677643767994529",
        "url": "https://www.jewelosco.com",
        "pageurl": "https://www.jewelosco.com",
        "pagename": "search",
        "rows": "30",
        "start": "0",
        "search-type": "keyword",
        "storeid": "1118",
        "featured": "true",
        "search-uid": "",
        "q": "rice",
        "sort": "",
        "featuredsessionid": "",
        "screenwidth": "1533",
        "dvid": "web-4.1search",
        "channel": "instore",
        "banner": "jewelosco",
    }

    # Request headers, including the API subscription key.
    headers = {
        "Accept": "application/json, text/plain, */*",
        "Accept-Encoding": "gzip, deflate, br",
        "Accept-Language": "en-US,en;q=0.5",
        "Connection": "keep-alive",
        "DNT": "1",
        "Host": "www.jewelosco.com",
        "Ocp-Apim-Subscription-Key": "5e790236c84e46338f4290aa1050cdd4",
        "Referer": "https://www.jewelosco.com/shop/search-results.html?q=rice",
        "Sec-Fetch-Dest": "empty",
        "Sec-Fetch-Mode": "cors",
        "Sec-Fetch-Site": "same-origin",
        "TE": "trailers",
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:109.0) Gecko/20100101 Firefox/110.0",
    }

    # Cookie name=value pairs copied from the browser's Inspect tool.
    # Written as adjacent string literals (one cookie per literal) so the
    # value is a single valid string instead of a literal broken across
    # physical lines.
    raw_form_cookies = (
        "visid_incap_1990338=s+w9h0GrTSqb/iWdgj5yGT7p/2MAAAAAQUIPAAAAAAD+pkwygiCfx/ikABjRUg/L; "
        "nlbi_1990338=mHC1ApVnlTLFJURPzoaznQAAAACG3swCSzQedoLPtuqqPhlT; "
        "incap_ses_8080_1990338=FeLoM/tDE2aUu2sos+0hcD7p/2MAAAAAyClJy0AvAh6rRWqmCVVCcw==; "
        "ECommBanner=jewelosco; "
        "abs_gsession=%7B%22info%22%3A%7B%22COMMON%22%3A%7B%22Selection%22%3A%22user%22%2C%22preference%22%3A%22J4U%22%2C%22userType%22%3A%22G%22%2C%22zipcode%22%3A%2252732%22%2C%22banner%22%3A%22jewelosco%22%7D%2C%22J4U%22%3A%7B%22zipcode%22%3A%2252732%22%2C%22storeId%22%3A%221118%22%7D%2C%22SHOP%22%3A%7B%22zipcode%22%3A%2252732%22%2C%22storeId%22%3A%221118%22%7D%7D%7D; "
        "SWY_SHARED_SESSION_INFO=%7B%22info%22%3A%7B%22COMMON%22%3A%7B%22userType%22%3A%22G%22%2C%22zipcode%22%3A%2252732%22%2C%22banner%22%3A%22jewelosco%22%2C%22preference%22%3A%22J4U%22%2C%22Selection%22%3A%22user%22%2C%22userData%22%3A%7B%7D%7D%2C%22J4U%22%3A%7B%22storeId%22%3A%221118%22%2C%22zipcode%22%3A%2252732%22%2C%22userData%22%3A%7B%7D%7D%2C%22SHOP%22%3A%7B%22storeId%22%3A%221118%22%2C%22zipcode%22%3A%2252732%22%2C%22userData%22%3A%7B%7D%7D%7D%7D; "
        "abs_previouslogin=%7B%22info%22%3A%7B%22COMMON%22%3A%7B%22Selection%22%3A%22user%22%2C%22preference%22%3A%22J4U%22%2C%22userType%22%3A%22G%22%2C%22zipcode%22%3A%2252732%22%2C%22banner%22%3A%22jewelosco%22%7D%2C%22J4U%22%3A%7B%22zipcode%22%3A%2252732%22%2C%22storeId%22%3A%221118%22%7D%2C%22SHOP%22%3A%7B%22zipcode%22%3A%2252732%22%2C%22storeId%22%3A%221118%22%7D%7D%7D; "
        "SWY_SYND_USER_INFO=%7B%22storeAddress%22%3A%22%22%2C%22storeZip%22%3A%2252732%22%2C%22storeId%22%3A%221118%22%2C%22preference%22%3A%22J4U%22%7D; "
        "ECommSignInCount=0; "
        "SAFEWAY_MODAL_LINK=; "
        "OptanonConsent=isGpcEnabled=0&datestamp=Wed+Mar+01+2023+18%3A10%3A25+GMT-0600+(Central+Standard+Time)&version=202212.1.0&isIABGlobal=false&hosts=&consentId=2481ceef-8878-4f3b-924b-3b28079d9b13&interactionCount=1&landingPath=NotLandingPage&groups=C0001%3A1%2CC0002%3A0%2CC0004%3A0%2CC0003%3A1&AwaitingReconsent=false; "
        "nlbi_1990338_2147483392=jYckLK1heGBAHrRyzoaznQAAAACZvsW6rrz3C1oXWBs6UFc8; "
        "reese84=3:Gl8qjGMtFKfV15EgMleAnA==:OIn+iQ/52nnNf5lyREodaDDwUAjg8dDGS98wIlrt5otpbU+Cf8LVvyWEszAKcXR472IFIvx0GqApqQXL+AwRenGrptfNzKJtsu+zlyayIVp5q9BJEyz9T9tIFT2YmnQ+D1rZkBlw2lcnRZqxvVX5dSG6pFJH9nebThXLpHGzKF+j2O1jRKRTanLc72sHU5aqkDgp6aKgzvMI3IQTg9JPnSYW1I0779+gNrb/WfVOID4YT3FLG3OBiMxXsnGGrGQD+3QUsGWzJGXqKkLgErxusDcDI+J82YxLg8Lg7u+qbLFLdUPB4dUsPJJLlHJx8kMBuoRh/47QtMYdykoXYmcZ4PYYLnop7lpDFahVOwcqGmwGCCBjkAnxGuVejNESYc4Yiu5iHFluuEHSDyLxXUmlQWRfDl6axKS+0m6Zm7IqPmvetfC4BsZKbDRk5p/jbFDCIYD/iHbRi8OE/mkzTD03r+un1iC5GFK4BhIQrtBDybXmZYJU1VBwXl+raL8wR0Db3d3I/Mbh4/CK1uT/7CJDRIDznlCZC0/C3gFwXQpfLiA=:XtGGSfw6IB+W6dYIh0iO+xPVdddBfiRA1zwKMhu0OmE=; "
        "mbox=session#2686aefa9dea422db9f92c9b39a01830#1677717696; "
        "at_check=true; "
        "ADRUM_BT=R:57|i:5124367|g:a106a4d3-bbb8-4619-8262-9d3f98852991652436|e:104|n:safeway-loyalty_d99a98d0-07cc-4871-98b7-0beac77d0580"
    )
    formatted_cookies = format_cookies(raw_form_cookies)

    # Make the GET request with the parameters, headers and cookies, then
    # decode the JSON body of the response.
    product_data = request_from_api(api_url, url_params, headers, formatted_cookies).json()

    # Pretty-print the decoded JSON.
    print(json.dumps(product_data, indent=3))
{
"appMsg": "[PS: Success.]",
"primaryProducts": {
"appCode": "400",
"appMsg": "Search encountered a problem. Please try again OSSR0033-R",
"pgmName": "search-products",
"order": "1"
},
"appCode": "[PS: 200]"
}
预期的输出是一个很大的 JSON 文件,包含所有产品信息(例如名称、价格、数量...)。这是它的“片段”:
{
"appMsg":"[PS: Success.]",
"primaryProducts":{
"response":{
"numFound":725,
"start":0,
"isExactMatch":true,
"docs":[
{
"name":"Signature SELECT Rice Enriched Long Grain - 5 Lb",
"pid":"126150030",
"upc":"0002113050205",
"id":"126150030",
"featured":false,
"inventoryAvailable":"1",
"pastPurchased":false,
"restrictedValue":"0",
"salesRank":99999,
"price":4.99,
"basePrice":4.99,
"pricePer":1.0,
"displayType":"-1",
"aisleId":"1_6_9_9",
"aisleName":"Rice|1_6_9",
"departmentName":"Grains, Pasta & Sides",
"shelfName":"White Rice",
"unitOfMeasure":"LB",
"sellByWeight":"I",
"averageWeight":[
"0.00"
],
"unitQuantity":"LB",
"displayUnitQuantityText":"ea",
"previousPurchaseQty":0,
"maxPurchaseQty":0,
"prop65WarningIconRequired":false,
"isArProduct":true,
"isMtoProduct":false,
"customizable":false,
"inStoreShoppingElig":false,
"preparationTime":"0",
"isMarketplaceItem":"N",
"triggerQuantity":0,
"channelEligibility":{
"pickUp":true,
"delivery":true,
"inStore":true,
"shipping":false
},
"channelInventory":{
"delivery":"1",
"pickup":"1",
"instore":"1",
"shipping":"0"
},
"productReview":{
"avgRating":"4.8",
"reviewCount":"64",
"isReviewWriteEligible":"true",
"isReviewDisplayEligible":"true",
"isForOnetimeReview":"true",
"reviewTemplateType":"default"
}
}
},
"appCode":"[PS: 200]"
}
尽管添加了所有请求标头,但响应是相同的。
这是我添加的所有标头:
# Request headers sent with the API call (copied from a Firefox session).
headers = {
    # Content negotiation
    "Accept": "application/json, text/plain, */*",
    "Accept-Encoding": "gzip, deflate, br",
    "Accept-Language": "en-US,en;q=0.5",
    # Connection and origin details
    "Connection": "keep-alive",
    "DNT": "1",
    "Host": "www.jewelosco.com",
    # API subscription key
    "Ocp-Apim-Subscription-Key": "5e790236c84e46338f4290aa1050cdd4",
    "Referer": "https://www.jewelosco.com/shop/search-results.html?q=rice",
    # Fetch metadata
    "Sec-Fetch-Dest": "empty",
    "Sec-Fetch-Mode": "cors",
    "Sec-Fetch-Site": "same-origin",
    "TE": "trailers",
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:109.0) Gecko/20100101 Firefox/110.0",
}
# Cookies sent with the request, keyed by cookie name.
# Values that contain '=' (base64 padding, the OptanonConsent payload, the
# reese84 token) are given in full; cutting them at the first '=' sends a
# different cookie than the browser does.
all_cookies = {
    "visid_incap_1990338": "s+w9h0GrTSqb/iWdgj5yGT7p/2MAAAAAQUIPAAAAAAD+pkwygiCfx/ikABjRUg/L",
    "nlbi_1990338": "mHC1ApVnlTLFJURPzoaznQAAAACG3swCSzQedoLPtuqqPhlT",
    "incap_ses_8080_1990338": "FeLoM/tDE2aUu2sos+0hcD7p/2MAAAAAyClJy0AvAh6rRWqmCVVCcw==",
    "ECommBanner": "jewelosco",
    "abs_gsession": "%7B%22info%22%3A%7B%22COMMON%22%3A%7B%22Selection%22%3A%22user%22%2C%22preference%22%3A%22J4U%22%2C%22userType%22%3A%22G%22%2C%22zipcode%22%3A%2252732%22%2C%22banner%22%3A%22jewelosco%22%7D%2C%22J4U%22%3A%7B%22zipcode%22%3A%2252732%22%2C%22storeId%22%3A%221118%22%7D%2C%22SHOP%22%3A%7B%22zipcode%22%3A%2252732%22%2C%22storeId%22%3A%221118%22%7D%7D%7D",
    "SWY_SHARED_SESSION_INFO": "%7B%22info%22%3A%7B%22COMMON%22%3A%7B%22userType%22%3A%22G%22%2C%22zipcode%22%3A%2252732%22%2C%22banner%22%3A%22jewelosco%22%2C%22preference%22%3A%22J4U%22%2C%22Selection%22%3A%22user%22%2C%22userData%22%3A%7B%7D%7D%2C%22J4U%22%3A%7B%22storeId%22%3A%221118%22%2C%22zipcode%22%3A%2252732%22%2C%22userData%22%3A%7B%7D%7D%2C%22SHOP%22%3A%7B%22storeId%22%3A%221118%22%2C%22zipcode%22%3A%2252732%22%2C%22userData%22%3A%7B%7D%7D%7D%7D",
    "abs_previouslogin": "%7B%22info%22%3A%7B%22COMMON%22%3A%7B%22Selection%22%3A%22user%22%2C%22preference%22%3A%22J4U%22%2C%22userType%22%3A%22G%22%2C%22zipcode%22%3A%2252732%22%2C%22banner%22%3A%22jewelosco%22%7D%2C%22J4U%22%3A%7B%22zipcode%22%3A%2252732%22%2C%22storeId%22%3A%221118%22%7D%2C%22SHOP%22%3A%7B%22zipcode%22%3A%2252732%22%2C%22storeId%22%3A%221118%22%7D%7D%7D",
    "SWY_SYND_USER_INFO": "%7B%22storeAddress%22%3A%22%22%2C%22storeZip%22%3A%2252732%22%2C%22storeId%22%3A%221118%22%2C%22preference%22%3A%22J4U%22%7D",
    "ECommSignInCount": "0",
    "SAFEWAY_MODAL_LINK": "",
    "OptanonConsent": "isGpcEnabled=0&datestamp=Wed+Mar+01+2023+18%3A10%3A25+GMT-0600+(Central+Standard+Time)&version=202212.1.0&isIABGlobal=false&hosts=&consentId=2481ceef-8878-4f3b-924b-3b28079d9b13&interactionCount=1&landingPath=NotLandingPage&groups=C0001%3A1%2CC0002%3A0%2CC0004%3A0%2CC0003%3A1&AwaitingReconsent=false",
    "nlbi_1990338_2147483392": "jYckLK1heGBAHrRyzoaznQAAAACZvsW6rrz3C1oXWBs6UFc8",
    "reese84": "3:Gl8qjGMtFKfV15EgMleAnA==:OIn+iQ/52nnNf5lyREodaDDwUAjg8dDGS98wIlrt5otpbU+Cf8LVvyWEszAKcXR472IFIvx0GqApqQXL+AwRenGrptfNzKJtsu+zlyayIVp5q9BJEyz9T9tIFT2YmnQ+D1rZkBlw2lcnRZqxvVX5dSG6pFJH9nebThXLpHGzKF+j2O1jRKRTanLc72sHU5aqkDgp6aKgzvMI3IQTg9JPnSYW1I0779+gNrb/WfVOID4YT3FLG3OBiMxXsnGGrGQD+3QUsGWzJGXqKkLgErxusDcDI+J82YxLg8Lg7u+qbLFLdUPB4dUsPJJLlHJx8kMBuoRh/47QtMYdykoXYmcZ4PYYLnop7lpDFahVOwcqGmwGCCBjkAnxGuVejNESYc4Yiu5iHFluuEHSDyLxXUmlQWRfDl6axKS+0m6Zm7IqPmvetfC4BsZKbDRk5p/jbFDCIYD/iHbRi8OE/mkzTD03r+un1iC5GFK4BhIQrtBDybXmZYJU1VBwXl+raL8wR0Db3d3I/Mbh4/CK1uT/7CJDRIDznlCZC0/C3gFwXQpfLiA=:XtGGSfw6IB+W6dYIh0iO+xPVdddBfiRA1zwKMhu0OmE=",
    "mbox": "session#2686aefa9dea422db9f92c9b39a01830#1677717696",
    "at_check": "true",
    "ADRUM_BT": "R:57|i:5124367|g:a106a4d3-bbb8-4619-8262-9d3f98852991652436|e:104|n:safeway-loyalty_d99a98d0-07cc-4871-98b7-0beac77d0580",
}
这是我制作的一个函数,它将 cookie 值对列表(例如
Cookie: "c1=v1; c2=v2; c3=v3"
)格式化为字典,其中键是 cookie 名称,值是 cookie 值 - 需要这种格式才能使用 requests.get()
:
def format_cookies(cookie_pairs):
    """Convert a raw ``Cookie`` header string into a name -> value dict.

    Args:
        cookie_pairs: Semicolon-separated name/value pairs, e.g.
            ``"cook1=value1; cook2=val2"``.

    Returns:
        A dict mapping each cookie name to its full value. A pair without
        ``=`` maps the name to an empty string; empty segments are skipped.
    """
    formatted_pairs = {}
    for pair in cookie_pairs.split(';'):
        pair = pair.strip()
        if not pair:
            # Tolerate a trailing "; " or an empty input string.
            continue
        # Split on the FIRST '=' only: cookie values may themselves contain
        # '=' (base64 padding, nested key=value payloads), and splitting on
        # every '=' would silently truncate them.
        name, _, value = pair.partition('=')
        formatted_pairs[name] = value
    return formatted_pairs
我修改了原始脚本以反映这些更改。
我也在尝试抓取 Jewel-Osco 的数据,并且已经有了一些进展。这是我仅复制标头(没有 cookie)得到的响应:
# Request headers copied from a browser session (the sec-ch-ua values name
# Chromium/Brave 116 on macOS). No cookies are included.
headers = {
'authority': 'www.jewelosco.com',
'accept': 'application/json, text/plain, */*',
'accept-language': 'en-US,en;q=0.9',
'dnt': '1',
# API subscription key.
'ocp-apim-subscription-key': '5e790236c84e46338f4290aa1050cdd4',
'referer': 'https://www.jewelosco.com/shop/search-results.html?q=fig%20bars',
'sec-ch-ua': '"Chromium";v="116", "Not)A;Brand";v="24", "Brave";v="116"',
'sec-ch-ua-mobile': '?0',
'sec-ch-ua-platform': '"macOS"',
'sec-fetch-dest': 'empty',
'sec-fetch-mode': 'cors',
'sec-fetch-site': 'same-origin',
'sec-gpc': '1',
'user-agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/116.0.0.0 Safari/537.36',
}
# The whole query string (request-id, storeid, q, channel, ...) is embedded in
# the URL itself, already percent-encoded; no separate params mapping is passed.
response = requests.get(
'https://www.jewelosco.com/abs/pub/xapi/pgmsearch/v1/search/products?request-id=7001696985613407723&url=https://www.jewelosco.com&pageurl=https://www.jewelosco.com&pagename=search&rows=30&start=0&search-type=keyword&storeid=3455&featured=true&search-uid=uid%253D6696364499362%253Av%253D12.0%253Ats%253D1696961074175%253Ahc%253D12&q=fig%20bars&sort=&featuredsessionid=&screenwidth=149&dvid=web-4.1search&channel=pickup&banner=jewelosco',
#cookies=cookies,
headers=headers,
)
# Decode the response body as JSON; the result is not assigned to a name.
response.json()
这让我得到了这个成功的回应:
{'appMsg': '[PS: Success.]',
'primaryProducts': {'response': {'numFound': 8,
'start': 0,
'miscInfo': {'query': 'fig bars',
'filter': '[pickup:"true", isMarketplaceItem:"N"]'},
'isExactMatch': True,
'docs': [{'status': 'active',
'name': 'Natures Bakery Fig Bar Stone Ground Whole Wheat Raspberry - 6-2 Oz',
'pid': '960166981',
'upc': '0004749521004',
'id': '960166981',
'featured': False,
'inventoryAvailable': '1',
'pastPurchased': False,
'restrictedValue': '0',
'salesRank': 1621,
'agreementId': 0,
'featuredProductId': 0,
'imageUrl': 'https://images.albertsons-media.com/is/image/ABS/960166981',
'price': 8.49,
'basePrice': 8.49,
'pricePer': 0.71,
'displayType': '-1',
'aisleId': '1_7_1_2',
'aisleName': 'Breakfast Bars & Bites|1_7_1',
关键是请求 ID,它由该页面上的 JavaScript 生成:https://www.jewelosco.com/etc.clientlibs/wcax-core/clientlibs/clientlib-unified-header/jewelosco.min.be3248d9f590a9c3b6bcaa7d5faf84f3.js
唯一的问题是我认为它是根据我发现的这段代码动态生成的:
'request-id': (Math.floor(1000000000000 + Math.random() * 9000000000000)),
因此,除非您找到另一种获取此请求 ID 的方法,否则可能无法抓取请求。一切都只需要标头和请求 ID,所以如果您找到一种方法来处理请求,请告诉我。