BeautifulSoup 未读取页面

问题描述 投票:0回答:1

我有一个简单的页面，使用 selenium 和 BeautifulSoup 来处理。据我所知，该页面是由 Javascript 加载的。页面上有一个"加载更多"按钮，代码会反复点击它，直到按钮不再出现。之后我尝试用 BeautifulSoup 做一些"导航"：先获取链接的 href，再加载该链接。但由于某种原因，它没能读取那个页面。代码如下：


def extrai_dados_prodirectsports():
    """Scrape the ProDirect Sport Predator listing.

    Clicks through every 'view more' page with the module-level selenium
    ``driver``, then fetches each product page with ``requests`` and prints
    its name, price and product code.

    Side effects: navigates ``driver`` and prints to stdout. Returns None.
    """
    from urllib.parse import urljoin  # local import: resolves relative hrefs

    print("entrou")
    # BUG FIX: the original line ended with a trailing comma, which made
    # url_linha a 1-tuple and broke driver.get().
    url_linha = "https://www.prodirectsport.com/soccer/l/adults/departments-boots/activity-football/brand-adidas/silo-predator/"

    driver.get(url_linha)

    # Dismiss the cookie banner and the two overlays that intercept clicks.
    WebDriverWait(driver, 100).until(
        EC.element_to_be_clickable((By.XPATH, "//button[@title='Accept all cookies']"))
    ).click()
    WebDriverWait(driver, 20).until(
        EC.element_to_be_clickable((By.XPATH, "//div[@id='zonos']"))
    )
    botao_close = driver.find_element(By.XPATH, "//a[@class = 'z-close']")
    driver.execute_script("arguments[0].click();", botao_close)
    botao = driver.find_element(By.XPATH, "//button[@aria-label='Close']")
    driver.execute_script("arguments[0].click();", botao)

    x = 0
    driver.maximize_window()

    while True:
        try:
            # Re-locate the button on every iteration to avoid stale references.
            button_seemore = WebDriverWait(driver, 20).until(
                EC.presence_of_element_located(
                    (By.XPATH, "//div[@class='product-listing__view-more']/button")
                )
            )
            if button_seemore.is_displayed():
                driver.execute_script("arguments[0].scrollIntoView();", button_seemore)
                driver.execute_script("arguments[0].click();", button_seemore)
                x += 1
                print(x)
            else:
                break  # Button no longer displayed: all products are loaded.
        except StaleElementReferenceException:
            print("Elemento 'Ver mais' tornou-se stale, reencontrando o botão...")
            continue  # Retry the loop with a fresh lookup.
        except TimeoutException:
            print("Tempo esgotado esperando o botão 'Ver mais'.")
            break

    current_page = driver.page_source
    # BUG FIX: the original parsed the undefined name `pagina_atual`,
    # raising NameError before any product was read.
    soup = BeautifulSoup(current_page, "html.parser")

    list_products = soup.find_all("a", class_="_link_129ai_20")

    for product in list_products:
        # BUG FIX: listing hrefs may be relative; resolve them against the
        # current page URL so requests.get() receives an absolute URL.
        link = urljoin(driver.current_url, product.attrs["href"])
        r = requests.get(link)
        # Use a separate soup for the product page so the listing soup
        # is not clobbered mid-loop.
        product_soup = BeautifulSoup(r.text, "html.parser")
        name_product = product_soup.find("h1", "ml-meta__title").text
        print(name_product)
        preco_elemento = product_soup.find("div", "ml-prices__price").text
        # Strip the currency symbol and any thousands separators before
        # converting to float.
        preco = float(re.sub("[£,]", "", preco_elemento))
        print(preco)
        # BUG FIX: originally printed the undefined name `code_product`, and
        # derived the code from driver.current_url (the listing page, since
        # the product page was fetched with requests, not the driver).
        product_code = link.rstrip("/").split('-')[-1]
        print(product_code)
python selenium-webdriver web-scraping beautifulsoup
1个回答
0
投票

您可以通过API获取产品:

import requests
from urllib.parse import urlparse


def get_products(url, page):
    """Return one page of products from the ProDirect Sport search API.

    Parameters:
        url:  full listing URL; only its path is used to identify the
              category being searched.
        page: 1-based page number to request.

    Returns:
        The ``products`` list decoded from the API's JSON response.
    """
    headers = {
        # A browser-like UA; the API rejects the default requests UA.
        'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/129.0.0.0 Safari/537.36',
    }

    # BUG FIX: the original parsed the global `url_linha` instead of the
    # `url` parameter, silently ignoring the caller's argument.
    path = urlparse(url).path
    params = {
        'location': f'{path}?pg={page}',
    }

    api_url = 'https://www.prodirectsport.com/api/v1/search'
    response = requests.get(api_url, params=params, headers=headers)
    data = response.json()

    return data['products']


# Fetch page 1 of the adidas Predator listing and print "name = price"
# for every product returned by the API.
url_linha = "https://www.prodirectsport.com/soccer/l/adults/departments-boots/activity-football/brand-adidas/silo-predator/"

for item in get_products(url=url_linha, page=1):
    print(f'{item["name"]} = {item["pricing"]["current"]}')
© www.soinside.com 2019 - 2024. All rights reserved.