I am trying to scrape a list of 100 shirts from the Allen Solly website. Because the site only shows about 30 to 32 products when it first loads, I added logic to click the "LOAD MORE" button. However, when the button is clicked, I get an error saying the element "could not be scrolled into view".
selenium.common.exceptions.ElementNotInteractableException: Message: Element
Stack trace:
RemoteError@chrome://remote/content/shared/RemoteError.sys.mjs:8:8
WebDriverError@chrome://remote/content/shared/webdriver/Errors.sys.mjs:193:5
ElementNotInteractableError@chrome://remote/content/shared/webdriver/Errors.sys.mjs:353:5
webdriverClickElement@chrome://remote/content/marionette/interaction.sys.mjs:166:11
interaction.clickElement@chrome://remote/content/marionette/interaction.sys.mjs:135:11
clickElement@chrome://remote/content/marionette/actors/MarionetteCommandsChild.sys.mjs:204:29
receiveMessage@chrome://remote/content/marionette/actors/MarionetteCommandsChild.sys.mjs:84:31
Code:
import pandas as pd
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from bs4 import BeautifulSoup
import time

# Set up the Selenium WebDriver for Firefox
driver = webdriver.Firefox()

try:
    # Navigate to the main shirts page
    url = 'https://allensolly.abfrl.in/c/men-shirts'
    driver.get(url)

    # Wait for the page to load completely
    WebDriverWait(driver, 10).until(EC.visibility_of_element_located((By.CLASS_NAME, 'ProductCard_productInfo__uZhFN')))

    # Ensure we have at least 100 shirts by clicking the "LOAD MORE" button if needed
    product_info = []
    while len(product_info) < 100:
        # Scroll down to load more products
        last_height = driver.execute_script("return document.body.scrollHeight")
        while True:
            driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
            time.sleep(5)  # Wait for new products to load

            # Click the "LOAD MORE" button
            load_more_button = WebDriverWait(driver, 10).until(EC.element_to_be_clickable((By.XPATH, '//*[@id="__next"]/main/div/div[7]/div/div[2]/button')))
            load_more_button.click()
            print("Clicked 'LOAD MORE' button.")

            # Wait for new content to load
            WebDriverWait(driver, 10).until(EC.staleness_of(load_more_button))

            new_height = driver.execute_script("return document.body.scrollHeight")
            if new_height == last_height:
                break
            last_height = new_height

        # Get the updated page source
        page_source = driver.page_source
        soup = BeautifulSoup(page_source, 'html.parser')

        # Find all elements with the product title class and add to the list
        products = soup.find_all('div', class_='ProductCard_productInfo__uZhFN')
        for product in products:
            title_element = product.find('div', class_='ProductCard_title__9M6wy')
            detail_element = product.find('div', class_='ProductCard_description__BQzle')  # Assuming this class holds details
            if title_element and detail_element:
                title = title_element.text.strip()
                detail = detail_element.text.strip()
                product_info.append((title, detail))

        # If we have reached 100 unique products, break the loop
        if len(product_info) >= 100:
            break

    # Print the number of unique products found
    print(f"Number of unique products found: {len(product_info)}")

    # Click on each product detail to open it in a new tab
    data = []  # List to store scraped data
    for idx, (title, detail) in enumerate(product_info[:100], start=1):
        print(f"{idx}. Title: {title}\n Product: {detail}\n")
        try:
            # Find the product description element using the product title
            description_element = WebDriverWait(driver, 10).until(
                EC.presence_of_element_located((By.XPATH, f'//div[contains(@class, "ProductCard_title__9M6wy") and contains(text(), "{title}")]/ancestor::div[contains(@class, "ProductCard_productInfo__uZhFN")]'))
            )

            # Scroll the element into view and click it using JavaScript
            driver.execute_script("arguments[0].scrollIntoView(true);", description_element)
            driver.execute_script("arguments[0].click();", description_element)
            print("Opened product description.")

            # Wait for the new tab to open
            time.sleep(2)

            # Switch to the new tab
            driver.switch_to.window(driver.window_handles[-1])
            print("Switched to new tab.")

            # Scrape the product description from the new tab using a CSS selector
            product_description_element = WebDriverWait(driver, 10).until(
                EC.presence_of_element_located((By.CSS_SELECTOR, '#__next > main > div > div:nth-child(1) > div > div.col-sm-5.col-md-5.col-lg-5 > div > div.PDPDetails_prodDescDetails__D7ddV > div.ProductDetails_container__0vRlj.ProductDetails_AS__WcrW_ > p.MuiTypography-root.MuiTypography-body1.ProductDetails_description__7hqm9.css-12a9y8i'))
            )
            product_description = product_description_element.text.strip()

            # Append scraped data to the list
            data.append(('Allen Solly', 'Men', title, detail, product_description))
            print(product_description)

            # Close the new tab
            driver.close()
            print("Closed new tab.")

            # Switch back to the original tab
            driver.switch_to.window(driver.window_handles[0])
            print("Switched back to the original tab.")

            # Wait for the page to reload
            WebDriverWait(driver, 10).until(
                EC.presence_of_element_located((By.XPATH, f'//div[contains(@class, "ProductCard_title__9M6wy") and contains(text(), "{title}")]'))
            )
        except Exception as e:
            print(f"Error occurred while processing product detail: {str(e)}")

    # Convert the scraped data into a pandas DataFrame
    df = pd.DataFrame(data, columns=['Brand', 'Category', 'Title', 'Product Detail', 'Product Description'])

    # Export the DataFrame to an Excel file
    df.to_excel('scraped_data.xlsx', index=False)
finally:
    driver.quit()
I rewrote most of the beginning of the script.

I removed the try-except because it just hides the errors. You need to see when something goes wrong so you can fix it. Once everything works, add a try-except that catches only specific exceptions, and only if you intend to handle them.
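As a rough sketch of that pattern (a hypothetical scenario where a product card never appears, reusing the driver, WebDriverWait, EC, and By already set up in your script), you would catch only Selenium's TimeoutException and let everything else propagate:

from selenium.common.exceptions import TimeoutException

try:
    # Wait for one product card; this raises TimeoutException after 10 seconds if it never shows up
    card = WebDriverWait(driver, 10).until(
        EC.presence_of_element_located((By.CLASS_NAME, "ProductCard_productInfo__uZhFN"))
    )
except TimeoutException:
    # Handle only the case we expect and know how to recover from
    print("Product card did not appear within 10 seconds; skipping this item.")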
I removed the time.sleep() calls; they are not good practice. I replaced them with WebDriverWait.
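For illustration, instead of sleeping a fixed 5 seconds and hoping the cards have rendered, an explicit wait returns as soon as the condition is met and fails loudly if it never is (the locator is the one already used in your script; driver is the instance created earlier):

from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait

# Instead of: time.sleep(5)
WebDriverWait(driver, 10).until(
    EC.visibility_of_element_located((By.CLASS_NAME, "ProductCard_productInfo__uZhFN"))
)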
I changed the handling of the "LOAD MORE" button. I added a variable max_pages that tracks how many pages to load; each page contains 32 products.
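If you later want a different total, max_pages can be derived from the target count rather than hard-coded. A small sketch, consistent with the loop below where count starts at 1 (so max_pages pages end up loaded); 32 per page is just what the site currently returns:

import math

target_products = 100
products_per_page = 32  # current page size on the site

# ceil(100 / 32) = 4 pages -> up to 128 cards, enough for 100 products
max_pages = math.ceil(target_products / products_per_page)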
I changed the page-scrolling approach: scrolling the whole page in one jump does not load every product. You need to scroll one viewport at a time to force all of the products to load.

I tested the parts of the code I rewrote and verified that 96 products were collected. I stopped at the point where you start opening new tabs.

The updated code:
import time

import pandas as pd
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait

# Set up the Selenium WebDriver for Chrome
driver = webdriver.Chrome()
driver.maximize_window()

# Navigate to the main shirts page
url = 'https://allensolly.abfrl.in/c/men-shirts'
driver.get(url)
wait = WebDriverWait(driver, 10)

# Wait for the popup to appear and close it (the close button lives in a shadow DOM)
popup_root = wait.until(EC.presence_of_element_located((By.CSS_SELECTOR, "ct-web-popup-imageonly"))).shadow_root
close_button = popup_root.find_element(By.ID, "close")
driver.execute_script("arguments[0].click()", close_button)

# Click LOAD MORE
load_more = wait.until(EC.visibility_of_all_elements_located((By.XPATH, "//button[text()='LOAD MORE']")))
print(f"first len(load_more): {len(load_more)}")
count = 1
max_pages = 3  # each page contains 32 products
while load_more and count < max_pages:
    driver.execute_script("arguments[0].click()", load_more[0])
    wait.until(EC.visibility_of_element_located((By.CSS_SELECTOR, "div.loader-wrapper")))
    wait.until(EC.invisibility_of_element_located((By.CSS_SELECTOR, "div.loader-wrapper")))
    load_more = wait.until(EC.visibility_of_all_elements_located((By.XPATH, "//button[text()='LOAD MORE']")))
    count += 1

# Scroll page by page to the bottom so every product is loaded
current_position = driver.execute_script("return window.scrollY + window.innerHeight")
height = driver.execute_script("return document.body.clientHeight")
while current_position < height:
    driver.execute_script("window.scrollBy(0, window.innerHeight);")
    current_position = driver.execute_script("return window.scrollY + window.innerHeight")
    time.sleep(0.5)

product_info = []

# Get the updated page source
page_source = driver.page_source
soup = BeautifulSoup(page_source, 'html.parser')

# Find all elements with the product title class and add to the list
products = soup.find_all('div', class_='ProductCard_productInfo__uZhFN')
for product in products:
    title_element = product.find('div', class_='ProductCard_title__9M6wy')
    detail_element = product.find('div', class_='ProductCard_description__BQzle')  # Assuming this class holds details
    if title_element and detail_element:
        title = title_element.text.strip()
        detail = detail_element.text.strip()
        product_info.append((title, detail))

# Print the number of unique products found
print(f"Number of unique products found: {len(product_info)}")

# Click on each product detail to open it in a new tab
data = []  # List to store scraped data
for idx, (title, detail) in enumerate(product_info[:100], start=1):
    print(f"{idx}. Title: {title}\n Product: {detail}\n")
    try:
        # Find the product description element using the product title
        description_element = WebDriverWait(driver, 10).until(
            EC.presence_of_element_located((By.XPATH, f'//div[contains(@class, "ProductCard_title__9M6wy") and contains(text(), "{title}")]/ancestor::div[contains(@class, "ProductCard_productInfo__uZhFN")]'))
        )

        # Scroll the element into view and click it using JavaScript
        driver.execute_script("arguments[0].scrollIntoView(true);", description_element)
        driver.execute_script("arguments[0].click();", description_element)
        print("Opened product description.")

        # Wait for the new tab to open
        time.sleep(2)

        # Switch to the new tab
        driver.switch_to.window(driver.window_handles[-1])
        print("Switched to new tab.")

        # Scrape the product description from the new tab using a CSS selector
        product_description_element = WebDriverWait(driver, 10).until(
            EC.presence_of_element_located((By.CSS_SELECTOR, '#__next > main > div > div:nth-child(1) > div > div.col-sm-5.col-md-5.col-lg-5 > div > div.PDPDetails_prodDescDetails__D7ddV > div.ProductDetails_container__0vRlj.ProductDetails_AS__WcrW_ > p.MuiTypography-root.MuiTypography-body1.ProductDetails_description__7hqm9.css-12a9y8i'))
        )
        product_description = product_description_element.text.strip()

        # Append scraped data to the list
        data.append(('Allen Solly', 'Men', title, detail, product_description))
        print(product_description)

        # Close the new tab
        driver.close()
        print("Closed new tab.")

        # Switch back to the original tab
        driver.switch_to.window(driver.window_handles[0])
        print("Switched back to the original tab.")

        # Wait for the page to reload
        WebDriverWait(driver, 10).until(
            EC.presence_of_element_located((By.XPATH, f'//div[contains(@class, "ProductCard_title__9M6wy") and contains(text(), "{title}")]'))
        )
    except Exception as e:
        print(f"Error occurred while processing product detail: {str(e)}")

# Convert the scraped data into a pandas DataFrame
df = pd.DataFrame(data, columns=['Brand', 'Category', 'Title', 'Product Detail', 'Product Description'])

# Export the DataFrame to an Excel file
df.to_excel('scraped_data.xlsx', index=False)

driver.quit()