Error when scrolling down: the 'LOAD MORE' button can't be found or clicked


I am trying to scrape a list of 100 shirts from the Allen Solly website. The site only shows 30 to 32 products when it first loads, so I added logic to click the "LOAD MORE" button. However, when the button is clicked, I get an error saying the element "could not be scrolled into view":

selenium.common.exceptions.ElementNotInteractableException: Message: Element could not be scrolled into view

Stacktrace:
RemoteError@chrome://remote/content/shared/RemoteError.sys.mjs:8:8
WebDriverError@chrome://remote/content/shared/webdriver/Errors.sys.mjs:193:5
ElementNotInteractableError@chrome://remote/content/shared/webdriver/Errors.sys.mjs:353:5
webdriverClickElement@chrome://remote/content/marionette/interaction.sys.mjs:166:11
interaction.clickElement@chrome://remote/content/marionette/interaction.sys.mjs:135:11
clickElement@chrome://remote/content/marionette/actors/MarionetteCommandsChild.sys.mjs:204:29
receiveMessage@chrome://remote/content/marionette/actors/MarionetteCommandsChild.sys.mjs:84:31
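
This exception generally means Selenium located the element but could not bring it into the viewport for a native click. A common generic fallback looks like this (a sketch only, with load_more_button standing in for the button located in the code below):

from selenium.common.exceptions import ElementNotInteractableException

try:
    load_more_button.click()
except ElementNotInteractableException:
    # scroll the button to the middle of the viewport, then click through
    # JavaScript, which does not require the element to be in view
    driver.execute_script("arguments[0].scrollIntoView({block: 'center'});", load_more_button)
    driver.execute_script("arguments[0].click();", load_more_button)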

Code

import pandas as pd
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from bs4 import BeautifulSoup
import time

# Set up the Selenium WebDriver for Firefox
driver = webdriver.Firefox()

try:
    # Navigate to the main shirts page
    url = 'https://allensolly.abfrl.in/c/men-shirts'
    driver.get(url)

    # Wait for the page to load completely
    WebDriverWait(driver, 10).until(EC.visibility_of_element_located((By.CLASS_NAME, 'ProductCard_productInfo__uZhFN')))

    # Ensure we have at least 100 shirts by clicking the "LOAD MORE" button if needed
    product_info = []
    while len(product_info) < 100:
        # Scroll down to load more products
        last_height = driver.execute_script("return document.body.scrollHeight")
        while True:
            driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
            time.sleep(5)  # Wait for new products to load

            # Click the "LOAD MORE" button
            load_more_button = WebDriverWait(driver, 10).until(EC.element_to_be_clickable((By.XPATH, '//*[@id="__next"]/main/div/div[7]/div/div[2]/button')))
            load_more_button.click()
            print("Clicked 'LOAD MORE' button.")
            
            # Wait for new content to load
            WebDriverWait(driver, 10).until(EC.staleness_of(load_more_button))

            new_height = driver.execute_script("return document.body.scrollHeight")
            if new_height == last_height:
                break
            last_height = new_height

        # Get the updated page source
        page_source = driver.page_source
        soup = BeautifulSoup(page_source, 'html.parser')

        # Find all elements with the product title class and add to the list
        products = soup.find_all('div', class_='ProductCard_productInfo__uZhFN')
        for product in products:
            title_element = product.find('div', class_='ProductCard_title__9M6wy')
            detail_element = product.find('div', class_='ProductCard_description__BQzle')  # Assuming this class holds details
            if title_element and detail_element:
                title = title_element.text.strip()
                detail = detail_element.text.strip()
                product_info.append((title, detail))

        # If we have reached 100 unique products, break the loop
        if len(product_info) >= 100:
            break

    # Print the number of unique products found
    print(f"Number of unique products found: {len(product_info)}")

    # Click on each product detail to open it in a new tab
    data = []  # List to store scraped data
    for idx, (title, detail) in enumerate(product_info[:100], start=1):
        print(f"{idx}. Title: {title}\n   Product: {detail}\n")
        try:
            # Find the product description element using the product title
            description_element = WebDriverWait(driver, 10).until(
                EC.presence_of_element_located((By.XPATH, f'//div[contains(@class, "ProductCard_title__9M6wy") and contains(text(), "{title}")]/ancestor::div[contains(@class, "ProductCard_productInfo__uZhFN")]'))
            )

            # Scroll the element into view and click it using JavaScript
            driver.execute_script("arguments[0].scrollIntoView(true);", description_element)
            driver.execute_script("arguments[0].click();", description_element)
            print("Opened product description.")
            
            # Wait for the new tab to open
            time.sleep(2)

            # Switch to the new tab
            driver.switch_to.window(driver.window_handles[-1])
            print("Switched to new tab.")

            # Scraping the product description from the new tab using CSS selector
            product_description_element = WebDriverWait(driver, 10).until(
                EC.presence_of_element_located((By.CSS_SELECTOR, '#__next > main > div > div:nth-child(1) > div > div.col-sm-5.col-md-5.col-lg-5 > div > div.PDPDetails_prodDescDetails__D7ddV > div.ProductDetails_container__0vRlj.ProductDetails_AS__WcrW_ > p.MuiTypography-root.MuiTypography-body1.ProductDetails_description__7hqm9.css-12a9y8i'))
            )
            product_description = product_description_element.text.strip()

            # Append scraped data to the list
            data.append(('Allen Solly', 'Men', title, detail, product_description))
            print(product_description)
            
            # Close the new tab
            driver.close()
            print("Closed new tab.")

            # Switch back to the original tab
            driver.switch_to.window(driver.window_handles[0])
            print("Switched back to the original tab.")

            # Wait for the page to reload
            WebDriverWait(driver, 10).until(
                EC.presence_of_element_located((By.XPATH, f'//div[contains(@class, "ProductCard_title__9M6wy") and contains(text(), "{title}")]'))
            )
        except Exception as e:
            print(f"Error occurred while processing product detail: {str(e)}")

    # Convert the scraped data into a pandas DataFrame
    df = pd.DataFrame(data, columns=['Brand', 'Category', 'Title', 'Product Detail', 'Product Description'])

    # Export the DataFrame to an Excel file
    df.to_excel('scraped_data.xlsx', index=False)

finally:
    driver.quit()

python selenium-webdriver beautifulsoup
1 Answer

I rewrote most of the beginning of your script.

  1. I removed the try-except, because it just hides the errors. You need to see when something goes wrong so that you can fix it. Once everything works, add try-except back, catching only specific exceptions and only when you actually want to handle them (see the sketch after this list).

  2. I removed the time.sleep() calls and replaced them with WebDriverWait; fixed sleeps are not good practice (also shown in the sketch below). A couple of short sleeps remain where there is no simple DOM condition to wait on.

  3. I changed the handling of the "LOAD MORE" button. I added a max_pages variable that controls how many pages to load; each page contains 32 products.

  4. I changed the page-scrolling method, because scrolling the whole page in one jump does not trigger the loading of every product. You need to scroll one viewport at a time to force all of the products to load.
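
A minimal sketch of points 1 and 2 together, assuming the same "LOAD MORE" locator that the updated code below uses (TimeoutException is the specific exception WebDriverWait raises when the condition is not met in time):

from selenium.common.exceptions import TimeoutException
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait

wait = WebDriverWait(driver, 10)
try:
    # explicit wait: returns as soon as the button is clickable, instead of
    # always burning a fixed time.sleep() delay
    load_more = wait.until(EC.element_to_be_clickable((By.XPATH, "//button[text()='LOAD MORE']")))
except TimeoutException:
    # catch only the exception we expect, and handle it deliberately
    load_more = None
    print("No 'LOAD MORE' button appeared within 10 seconds.")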

I tested the part of the code I rewrote and verified that it loads 96 products (max_pages = 3, and 3 × 32 = 96; set max_pages = 4 if you need at least 100). I stopped at the point where your code starts opening new tabs.

Updated code:

import time
import pandas as pd

from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait

# Set up the Selenium WebDriver (I used Chrome for testing)
driver = webdriver.Chrome()
driver.maximize_window()

# Navigate to the main shirts page
url = 'https://allensolly.abfrl.in/c/men-shirts'
driver.get(url)
wait = WebDriverWait(driver, 10)

# wait for the popup to appear and close
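# the popup lives inside the <ct-web-popup-imageonly> element's shadow DOM,
# so its close button has to be located through shadow_root; clicking it via
# JavaScript avoids "element not interactable" complaints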
popup_root = wait.until(EC.presence_of_element_located((By.CSS_SELECTOR, "ct-web-popup-imageonly"))).shadow_root
close_button = popup_root.find_element(By.ID, "close")
driver.execute_script("arguments[0].click()", close_button)

# click LOAD MORE
load_more = wait.until(EC.visibility_of_all_elements_located((By.XPATH, "//button[text()='LOAD MORE']")))
print(f"first len(load_more): {len(load_more)}")
count = 1
max_pages = 3 # each page contains 32 products
while load_more and count < max_pages:
    driver.execute_script("arguments[0].click()", load_more[0])
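    # the click shows a loading spinner (div.loader-wrapper); waiting for it
    # to appear and then disappear confirms the next batch of products rendered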
    wait.until(EC.visibility_of_element_located((By.CSS_SELECTOR, "div.loader-wrapper")))
    wait.until(EC.invisibility_of_element_located((By.CSS_SELECTOR, "div.loader-wrapper")))
    load_more = wait.until(EC.visibility_of_all_elements_located((By.XPATH, "//button[text()='LOAD MORE']")))
    count += 1

# scroll page by page to the bottom
current_position = driver.execute_script("return window.scrollY + window.innerHeight")
height = driver.execute_script("return document.body.clientHeight")
while current_position < height:
    driver.execute_script("window.scrollBy(0, window.innerHeight);")
    current_position = driver.execute_script("return window.scrollY + window.innerHeight")
    time.sleep(0.5)

product_info = []
# Get the updated page source
page_source = driver.page_source
soup = BeautifulSoup(page_source, 'html.parser')

# Find all elements with the product title class and add to the list
products = soup.find_all('div', class_='ProductCard_productInfo__uZhFN')
for product in products:
    title_element = product.find('div', class_='ProductCard_title__9M6wy')
    detail_element = product.find('div', class_='ProductCard_description__BQzle')  # Assuming this class holds details
    if title_element and detail_element:
        title = title_element.text.strip()
        detail = detail_element.text.strip()
        product_info.append((title, detail))
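
# the grid can repeat cards between batches, so drop duplicates while
# preserving order; this assumes a (title, detail) pair identifies a product
product_info = list(dict.fromkeys(product_info))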

# Print the number of unique products found
print(f"Number of unique products found: {len(product_info)}")

# Click on each product detail to open it in a new tab
data = []  # List to store scraped data
for idx, (title, detail) in enumerate(product_info[:100], start=1):
    print(f"{idx}. Title: {title}\n   Product: {detail}\n")
    try:
        # Find the product description element using the product title
        description_element = WebDriverWait(driver, 10).until(
            EC.presence_of_element_located((By.XPATH, f'//div[contains(@class, "ProductCard_title__9M6wy") and contains(text(), "{title}")]/ancestor::div[contains(@class, "ProductCard_productInfo__uZhFN")]'))
        )

        # Scroll the element into view and click it using JavaScript
        driver.execute_script("arguments[0].scrollIntoView(true);", description_element)
        driver.execute_script("arguments[0].click();", description_element)
        print("Opened product description.")
        
        # Wait for the new tab to open
        time.sleep(2)

        # Switch to the new tab
        driver.switch_to.window(driver.window_handles[-1])
        print("Switched to new tab.")

        # Scraping the product description from the new tab using CSS selector
        product_description_element = WebDriverWait(driver, 10).until(
            EC.presence_of_element_located((By.CSS_SELECTOR, '#__next > main > div > div:nth-child(1) > div > div.col-sm-5.col-md-5.col-lg-5 > div > div.PDPDetails_prodDescDetails__D7ddV > div.ProductDetails_container__0vRlj.ProductDetails_AS__WcrW_ > p.MuiTypography-root.MuiTypography-body1.ProductDetails_description__7hqm9.css-12a9y8i'))
        )
        product_description = product_description_element.text.strip()

        # Append scraped data to the list
        data.append(('Allen Solly', 'Men', title, detail, product_description))
        print(product_description)
        
        # Close the new tab
        driver.close()
        print("Closed new tab.")

        # Switch back to the original tab
        driver.switch_to.window(driver.window_handles[0])
        print("Switched back to the original tab.")

        # Wait for the page to reload
        WebDriverWait(driver, 10).until(
            EC.presence_of_element_located((By.XPATH, f'//div[contains(@class, "ProductCard_title__9M6wy") and contains(text(), "{title}")]'))
        )
    except Exception as e:
        print(f"Error occurred while processing product detail: {str(e)}")

# Convert the scraped data into a pandas DataFrame
df = pd.DataFrame(data, columns=['Brand', 'Category', 'Title', 'Product Detail', 'Product Description'])

# Export the DataFrame to an Excel file
df.to_excel('scraped_data.xlsx', index=False)

driver.quit()