使用selenium进行分页无限循环

问题描述 投票:0回答:1

我有这个简单的刮擦。它工作正常,但是当进入分页时,当它进入最后一个分页时,它会进入无限循环。最后一页进入无限循环:

#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Mon Nov  4 21:22:10 2024

@author: user
"""
from selenium.common.exceptions import NoSuchElementException
from selenium.common.exceptions import StaleElementReferenceException, TimeoutException
from selenium.webdriver.common.action_chains import ActionChains
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By
import time
from selenium.webdriver.support.wait import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
import json
import requests
import pandas as pd
from bs4 import BeautifulSoup
from urllib.parse import urlparse
import re
from selenium import webdriver
from urllib.parse import urlparse
import undetected_chromedriver as uc
from selenium import webdriver
quantidade_2024 = 0
quantidade_2023 = 0
quantidade_2022 = 0
quantidade_2021 = 0
precos = []
tamanhos = []
options = webdriver.ChromeOptions() 
options.add_argument("start-maximized")
driver = uc.Chrome(options=options)
    #options.add_argument(f"--proxy-server={proxy}")    
    #options.add_argument("start-maximized")
    #options.add_argument("disable-infobars")options.add_argument("--disable-extensions")
actions = ActionChains(driver)
options.add_argument("--disable-dev-shm-usage")
options.add_argument("--no-sandbox")
options.add_argument("--verbose")
        #options.add_argument("--remote-debugging-port=9222") 
options.add_argument('--headless')
options.add_argument("--disable-gpu")    
data_planilha = []
precos_m2 = []

url_base = "https://www.vivareal.com.br/venda/ceara/caucaia/bairros/caucaia/#onde=,Cear%C3%A1,Caucaia,Bairros,Caucaia,,,,BR%3ECeara%3ENULL%3ECaucaia%3EBarrios%3ECaucaia,,,/"
driver.get(url_base)
#driver.find_element(By.XPATH, "//ul[@class = 'header__nav-links']/li[2]/a").click()
#driver.implicitly_wait(30)
resultados = WebDriverWait(driver, 20).until(EC.presence_of_element_located((By.XPATH, "//section[@class = 'results__main']" )))
quantidade_imoveis = WebDriverWait(driver, 20).until(EC.presence_of_element_located((By.XPATH, "//strong[@class = 'results-summary__count js-total-records']"))).text
quantidade_imoveis = int(quantidade_imoveis)
print(quantidade_imoveis)


while True:
    time.sleep(1)    
    imoveis = WebDriverWait(driver, 200).until(EC.visibility_of_all_elements_located((By.XPATH, "//div[@class = 'results-list js-results-list']/div")))
    print("entrou")
    for index, value in enumerate(imoveis):
        driver.implicitly_wait(20)
        imoveis = WebDriverWait(driver, 200).until(EC.visibility_of_all_elements_located((By.XPATH, "//div[@class = 'results-list js-results-list']/div")))
        
        driver.execute_script("arguments[0].scrollIntoView(true);", imoveis[index])
        time.sleep(3)
        
        
        tamanho = imoveis[index].find_element(By.XPATH, ".//span[@class = 'property-card__detail-value js-property-card-value property-card__detail-area js-property-card-detail-area']").text
        #driver.execute_script("arguments[0].scrollIntoView(true);", tamanho)
       # tamanho = WebDriverWait(driver, 20).until(EC.visibility_of_element_located((By.XPATH, ".//span[@class = 'property-card__detail-value js-property-card-value property-card__detail-area js-property-card-detail-area']"))).text
        print(tamanho)
        driver.implicitly_wait(20)
        preco = imoveis[index].find_element(By.XPATH, ".//div[@class = 'property-card__price js-property-card-prices js-property-card__price-small']/p").text
        preco_imovel = "".join(re.findall('\d+', preco))
        price = int(preco_imovel)
#        driver.execute_script("arguments[0].scrollIntoView(true);", preco)
        
        #preco = WebDriverWait(driver, 20).until(EC.visibility_of_element_located((By.XPATH, ".//div[@class = 'property-card__price js-property-card-prices js-property-card__price-small']/p"))).text
        size = int(tamanho)
        print(preco_imovel)
        precos.append(int(price))
        tamanhos.append(size)
        preco_m2 = price/size
        print(preco_m2)
        precos_m2.append(preco_m2)
        driver.save_screenshot("VIVAREAL.png")
    try:
        botao_proximo = WebDriverWait(driver, 100).until(EC.presence_of_element_located((By.XPATH, "//button[@title = 'Próxima página']")))

        if botao_proximo.is_enabled():
            
            driver.execute_script('arguments[0].click()', botao_proximo)            
        else:
            break
           
    except NoSuchElementException:
        print("No more pages available")
        break
tamanho_medio = sum(tamanhos)/quantidade_imoveis
preco_medio_total = sum(precos)/quantidade_imoveis
precos_m2_total = sum(precos_m2)/quantidade_imoveis
data_planilha.append({
        'URL': url_base,
        'Número de anúncios': quantidade_imoveis,
        'Preço Médio': preco_medio_total,
        'Tamanho Médio': tamanho_medio,
        'Preço Médio por m2': precos_m2_total,
        
        })
df_planilha = pd.DataFrame(data_planilha)
df_planilha.to_excel("catalogo_vivareal.xlsx")
driver.quit()
        

在此网站中,最后一页按钮如下所示:

<button class="js-change-page" title="Próxima página" data-page="" data-disabled="">
          Próxima página &gt;        
</button>

data-page 是一个空字符串,并且有一个数据禁用属性,该属性不会出现在除最后一页之外的其他页面上。

python selenium-webdriver web-scraping
1个回答
0
投票

is_enabled
不是您要查找的内容:最后一页上的元素没有
disabled
属性,它具有
data-disabled
属性

这是一个用户定义的数据属性,selenium不会查看它来确定按钮是否被禁用

© www.soinside.com 2019 - 2024. All rights reserved.