I'm trying to scrape eastbay.com for Jordans. I've set up my scraper using BS4 and it works, but it never finishes or reports an error; at some point it just freezes.

The weird thing is that once it stalls, pressing CTRL+C in the Python console (which is printing output while it runs) does nothing, even though that should abort the run and report that it was stopped by the user. Also, after it stalls, it saves whatever data it managed to scrape up to that point into the .csv file. Stranger still, if I run the program again it collects more data before freezing again. Every time I run it, it gathers more data, albeit with diminishing returns. I've never experienced anything like this.

I've pasted the whole program below, so if anyone knows why it stalls, please let me know.
import requests
import csv
import io
import json
import os
import re
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.chrome.options import Options
from datetime import datetime
from bs4 import BeautifulSoup

url = 'https://www.eastbay.com/api/products/search'
session = requests.Session()
session.max_redirects = 30
headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/73.0.3683.103 Safari/537.36'}
payload = {
    'query': ':relevance:gender:200000:productType:200005:brand:Jordan',
    'currentPage': '0',
    'pageSize': '200',
    'timestamp': '4'}
jsonData = session.get(url, headers=headers, params=payload).json()
totalPages = jsonData['pagination']['totalPages']
totalResults = jsonData['pagination']['totalResults']
print('%s total results to acquire' % totalResults)

# Collect the product page URLs from the search API, one page at a time
container = []
for page in range(0, totalPages + 1):
    payload = {
        'query': ':relevance:gender:200000:productType:200005:brand:Jordan',
        'currentPage': page,
        'pageSize': '200',
        'timestamp': '4'}
    jsonData = session.get(url, headers=headers, params=payload).json()
    try:
        for product in jsonData['products']:
            name = product['name']
            removal_list4 = [" ", "/", "'"]
            for word4 in removal_list4:
                name = name.replace(word4, "")
            url2 = product['url']
            url3 = "https://www.eastbay.com/product/" + name + "/" + url2 + ".html"
            container.append(url3)
    except:
        print('Products not found on this request')
print(container)

timeanddate = datetime.now().strftime("%Y%m%d-%H%M%S")
folder_path = 'my_path'
file_name = 'eastbay_jordans_' + timeanddate + '.csv'
full_name = os.path.join(folder_path, file_name)
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/73.0.3683.103 Safari/537.36'
}

# Visit every product page and write one CSV row per product
with io.open(full_name, 'w', newline='', encoding="utf-8") as file:
    writer = csv.writer(file)
    writer.writerow(["Brand", "Model", "SKU", "Color", "Size", "Price", "Link"])
    for url3 in container:
        data2 = session.get(url3, headers=headers)
        soup2 = BeautifulSoup(data2.text, 'lxml')
        if not soup2.find('script', attrs={'type': 'application/ld+json'}):
            brand = "Unavailable"
            getbrand = "Unavailable"
        else:
            brand = soup2.find('script', attrs={'type': 'application/ld+json'})
            getbrand = json.loads(brand.text)['brand']
        if not soup2.find('span', attrs={'class': 'ProductName-primary'}):
            model = "Unavailable"
        else:
            model = soup2.find('span', attrs={'class': 'ProductName-primary'}).text.strip()
            removal_list2 = [" - ", "NIKE", "Nike", "Jordan", "JORDAN", "REEBOK", "CHAMPION", "TIMBERLANDS", "FILA", "LACOSTE", "CONVERSE", "Adidas", "ADIDAS", "New Balance", "NEW BALANCE", "Vans", "Puma", "UGG", "Saucony", "Reebok", "Women's ", "adidas", "Dr. Martens", "Converse", "Fila", "PUMA", "Champion", "Diadora", "Timberland", "SNKR PROJECT", "Women's ", "Men's ", "Unisex ", "Under Armour", "UNDER ARMOUR"]
            for word2 in removal_list2:
                model = model.replace(word2, "")
        if not soup2.find('div', attrs={'class': 'Tab-panel'}):
            sku = "Unavailable"
            getsku = "Unavailable"
        else:
            sku = soup2.find('div', attrs={'class': 'Tab-panel'})
            for child in sku.findAll("div"):
                child.decompose()
            getsku = sku.get_text()
            removal_list3 = ["Product #: "]
            for word3 in removal_list3:
                getsku = getsku.replace(word3, "")
        if not soup2.find('p', attrs={'class': 'ProductDetails-form__label'}):
            color = "Unavailable"
        else:
            color = soup2.find('p', attrs={'class': 'ProductDetails-form__label'}).text.strip()
        if not soup2.find('div', attrs={'class': 'ProductSize-group'}):
            size = "Unavailable"
            getsize = "Unavailable"
        else:
            size = soup2.find('div', attrs={'class': 'ProductSize-group'})
            getsize = [item.text.strip() for item in size.select('div.c-form-field.c-form-field--radio.ProductSize:not(div.c-form-field.c-form-field--radio.c-form-field--disabled.ProductSize)')]
        if not soup2.find('div', attrs={'class': 'ProductPrice'}):
            price = "Unavailable"
        elif not soup2.find('span', attrs={'class': 'ProductPrice-final'}):
            price = soup2.find('div', attrs={'class': 'ProductPrice'}).text.strip()
        else:
            price = soup2.find('span', attrs={'class': 'ProductPrice-final'}).text.strip()
        productlink = url3
        # Print for test purposes
        print(getbrand, model, getsku, color, getsize, price, productlink)
        writer.writerow([getbrand, model, getsku, color, getsize, price, productlink])
file.close()  # redundant: the with-block has already closed the file
You should take the following into account: if the site starts responding with 429 Too Many Requests, you are being rate-limited. To get around this, there is a simple remedy: pause between requests, for example with time.sleep(2).
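Here is a minimal sketch of how the download loop could guard against that, assuming the site signals throttling with an HTTP 429 status; get_with_backoff is a hypothetical helper, and the retry count and timeout value are illustrative. Passing an explicit timeout is also worth doing given the freeze you describe, since requests calls without a timeout can block indefinitely on a stalled response:

import time
import requests

session = requests.Session()
headers = {'User-Agent': 'Mozilla/5.0'}

def get_with_backoff(url, max_retries=5):
    # Fetch a URL, sleeping and retrying whenever the server throttles us.
    for attempt in range(max_retries):
        # timeout makes a stalled connection raise instead of hanging forever
        resp = session.get(url, headers=headers, timeout=10)
        if resp.status_code == 429:
            time.sleep(2)  # back off as suggested above before retrying
            continue
        resp.raise_for_status()
        return resp
    raise RuntimeError('still throttled after %d retries: %s' % (max_retries, url))

In your script, data2 = session.get(url3, headers=headers) in the CSV loop would become data2 = get_with_backoff(url3), and the same wrapper could serve the paginated search requests. A more polite crawler might also honour the Retry-After response header when the server sends one.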