问题描述:
我正在尝试自动化一个流程,我可以访问一个网站并抓取该页面上前 100 个产品的产品详细信息,并将其放入 Excel 文件中。
代码解释:
我有一个类 WebScraper,我在其中调用两个函数。首先,我调用 scroll_and_click_view_more 函数,该函数只是向下滚动我正在访问的网页。然后,我调用 prod_vitals 函数,该函数从该网页中提取产品代码和产品名称。
错误描述:
每当我针对某个最大产品数量运行下面的代码时,代码都会在某个点之后卡住,并抛出 Index out of range 错误。如果我设置 max_count_of_products=50,代码会卡在某一处;如果我设置 max_count_of_products=100,代码会卡在第 93 个产品处。卡住的位置没有固定的索引:只要我改变 max_count_of_products 的值,代码卡住的位置也会随之改变。
我附上了下面错误的屏幕截图。
请在下面找到我的代码:
# Module-level list; none of the code shown here reads from or appends to it.
products_summary = []
# Shared limit: scroll_and_click_view_more stops scrolling once more than this
# many product tiles are counted, and prod_vitals processes at most this many.
max_count_of_products=100
def scroll_and_click_view_more(driver, href):
    """Scroll the page step by step until enough product tiles are present.

    Args:
        driver: WebDriver-like object; only ``execute_script`` and
            ``page_source`` are used directly here.
        href: Label printed in the "no product tiles" message; it is not
            used for anything else.

    Returns:
        ``driver.page_source`` once more than ``max_count_of_products``
        ``div.product-tile`` elements are counted, or once the scroll
        position stops changing; the string ``"No product tiles found"``
        when the position has stopped changing and no tile was ever
        detected; ``None`` when an unexpected exception ends the loop.
    """
    viewport_bottom_js = "return window.pageYOffset + window.innerHeight"

    tiles_seen = False
    previous_bottom = driver.execute_script(viewport_bottom_js)
    while True:
        try:
            driver.execute_script("window.scrollBy(0, 800);")
            time.sleep(4)
            current_bottom = driver.execute_script(viewport_bottom_js)

            try:
                WebDriverWait(driver, 10).until(
                    EC.presence_of_element_located((By.CSS_SELECTOR, 'div.product-tile'))
                )
            except Exception:
                # No tile showed up within the wait. Give up only if the page
                # did not move either and no tile has been seen so far.
                if current_bottom == previous_bottom and not tiles_seen:
                    print("Reached the end of the page and no product tiles were found: ", href)
                    return "No product tiles found"
                previous_bottom = current_bottom
                continue

            tiles_seen = True
            snapshot = driver.page_source
            parsed = BeautifulSoup(snapshot, 'html.parser')
            tile_total = len(parsed.find_all('div', class_='product-tile'))
            if tile_total > max_count_of_products:
                # The page source is read again for the return value.
                return driver.page_source

            # Not enough tiles yet: nudge the page further and re-measure.
            driver.execute_script("window.scrollBy(0, 300);")
            time.sleep(3)
            current_bottom = driver.execute_script(viewport_bottom_js)
            if current_bottom == previous_bottom:
                # Neither scroll moved the page, so there is nothing more to load.
                return driver.page_source
            previous_bottom = current_bottom
        except Exception as err:
            # Any other failure is printed and ends the loop; the caller
            # receives None in that case.
            print(err)
            return None
def prod_vitals(soup, title, url):
    """Print the product code and name of each product tile found in *soup*.

    Processes ``div.product-tile`` elements in document order and stops
    after ``max_count_of_products`` tiles. Each extracted pair is also
    appended to a local list, which is not returned.

    Args:
        soup: Parsed page exposing ``find_all``.
        title: Not used by this function.
        url: Not used by this function.

    Raises:
        IndexError: When a tile has no match for either CSS selector,
            because the first match is taken with ``[0]``.
    """
    products_data = []  # Collected rows intended for the Excel sheet
    next_item = 1
    for tile in soup.find_all('div', class_='product-tile'):
        if next_item > max_count_of_products:
            break

        # Price/discount placeholders; assigned but never read in this function.
        list_price = sale_price = discount1 = discount2 = 0
        res = "Incorrect"
        next_item += 1

        image_matches = tile.select('div.css-1fg6eq7 img')
        pro_code = image_matches[0]['id']
        name_matches = tile.select('div.product-name a.css-avqw6d p.css-1d5mpur')
        pro_name = name_matches[0].get_text()
        products_data.append({'Product Code': pro_code, 'Product Name': pro_name})

        # The counter was already advanced above, so the first product prints 2.
        print("Count: ", next_item)
        print("Product Code: ", pro_code)
        print("Product Name: ", pro_name)
        print("\n")
    time.sleep(5)
class WebScraper:
    """Drive a Chrome session against one listing page and print its products."""

    def __init__(self):
        """Store the target URL and create the Chrome WebDriver."""
        self.url = "https://staging1-japan.coach.com/shop/new/women/?auto=true"
        chrome_options = Options()
        chrome_options.add_argument("--remote-debugging-port=9222")
        chromedriver = Service(r"c:\Users\DELL\Documents\Self_Project\chromedriver.exe")
        self.driver = webdriver.Chrome(service=chromedriver, options=chrome_options)

    def scrape(self):
        """Open the page, scroll it, extract the products and close the window."""
        browser = self.driver
        browser.get(self.url)
        time.sleep(5)
        # Initial parse of the freshly loaded page; replaced below when the
        # scroll step returns page source.
        soup = BeautifulSoup(browser.page_source, 'html.parser')
        response = scroll_and_click_view_more(browser, 'Link')
        time.sleep(3)

        failure_markers = ("No product tiles found", "Reached the end of the page.")
        if response in failure_markers:
            # Nothing to extract: return to the top of the page.
            browser.execute_script("window.scrollTo(0,0);")
            time.sleep(3)
        else:
            soup = BeautifulSoup(response, 'html.parser')
            prod_vitals(soup, 'TITLE', self.url)
            time.sleep(2)
        browser.close()
# Build the scraper; its constructor creates the Chrome WebDriver.
scraper = WebScraper()
# Load the page, scroll for product tiles and print the extracted products.
scraper.scrape()
# Fixed 5-second pause before the driver is shut down on the next line.
time.sleep(5)
scraper.driver.quit()
这个异常来自浏览器返回的页面内容:代码没有找到某些产品的 id 值(这些产品显示的可能是产品 ID/代码的另一种变体,并不在 div.css-1fg6eq7 之内)。只要在代码中处理这种找不到元素的情况(例如添加异常处理程序),就可以解决这个问题。下面是 prod_vitals 函数修改后的版本:
def prod_vitals(soup, title, url):
    """Extract the product code and name from each product tile in *soup*.

    Tiles that lack the expected image ``id`` or the name element are
    skipped one by one, with a printed notice. A single incomplete tile
    therefore no longer raises, and no longer silently discards every
    product after it.

    Args:
        soup: Parsed page exposing ``find_all``; its ``div.product-tile``
            elements are scanned in document order.
        title: Unused; kept so existing callers keep working.
        url: Unused; kept so existing callers keep working.

    Returns:
        A list of ``{'Product Code': ..., 'Product Name': ...}`` dicts, at
        most ``max_count_of_products`` long. Skipped tiles do not count
        toward that limit.
    """
    products_data = []  # Rows intended for the Excel sheet
    skipped = 0
    tiles = soup.find_all('div', class_='product-tile')
    for position, div in enumerate(tiles, start=1):
        if len(products_data) >= max_count_of_products:
            break

        # select_one returns None instead of raising when nothing matches,
        # so a missing element can be handled per tile.
        image = div.select_one('div.css-1fg6eq7 img')
        name_tag = div.select_one('div.product-name a.css-avqw6d p.css-1d5mpur')
        pro_code = image.get('id') if image is not None else None
        if pro_code is None or name_tag is None:
            skipped += 1
            print("Skipping tile", position, "- product code or name not found")
            continue

        pro_name = name_tag.get_text()
        products_data.append({'Product Code': pro_code, 'Product Name': pro_name})
        # The count is the 1-based number of products extracted so far.
        print("Count: ", len(products_data))
        print("Product Code: ", pro_code)
        print("Product Name: ", pro_name)
        print("\n")

    if skipped:
        print("Skipped", skipped, "incomplete product tile(s)")
    # Pause kept from the previous version.
    # NOTE(review): nothing in this function appears to need it - confirm
    # with the caller before removing.
    time.sleep(3)
    return products_data
你能尝试一下吗?
希望这会有所帮助。
谢谢