我有这个简单的刮擦。它工作正常,但是当进入分页时,当它进入最后一个分页时,它会进入无限循环。最后一页进入无限循环:
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Mon Nov 4 21:22:10 2024
@author: user
"""
from selenium.common.exceptions import NoSuchElementException
from selenium.common.exceptions import StaleElementReferenceException, TimeoutException
from selenium.webdriver.common.action_chains import ActionChains
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By
import time
from selenium.webdriver.support.wait import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
import json
import requests
import pandas as pd
from bs4 import BeautifulSoup
from urllib.parse import urlparse
import re
from selenium import webdriver
from urllib.parse import urlparse
import undetected_chromedriver as uc
from selenium import webdriver
quantidade_2024 = 0
quantidade_2023 = 0
quantidade_2022 = 0
quantidade_2021 = 0
precos = []
tamanhos = []
options = webdriver.ChromeOptions()
options.add_argument("start-maximized")
driver = uc.Chrome(options=options)
#options.add_argument(f"--proxy-server={proxy}")
#options.add_argument("start-maximized")
#options.add_argument("disable-infobars")options.add_argument("--disable-extensions")
actions = ActionChains(driver)
options.add_argument("--disable-dev-shm-usage")
options.add_argument("--no-sandbox")
options.add_argument("--verbose")
#options.add_argument("--remote-debugging-port=9222")
options.add_argument('--headless')
options.add_argument("--disable-gpu")
data_planilha = []
precos_m2 = []
url_base = "https://www.vivareal.com.br/venda/ceara/caucaia/bairros/caucaia/#onde=,Cear%C3%A1,Caucaia,Bairros,Caucaia,,,,BR%3ECeara%3ENULL%3ECaucaia%3EBarrios%3ECaucaia,,,/"
driver.get(url_base)
#driver.find_element(By.XPATH, "//ul[@class = 'header__nav-links']/li[2]/a").click()
#driver.implicitly_wait(30)
resultados = WebDriverWait(driver, 20).until(EC.presence_of_element_located((By.XPATH, "//section[@class = 'results__main']" )))
quantidade_imoveis = WebDriverWait(driver, 20).until(EC.presence_of_element_located((By.XPATH, "//strong[@class = 'results-summary__count js-total-records']"))).text
quantidade_imoveis = int(quantidade_imoveis)
print(quantidade_imoveis)
while True:
time.sleep(1)
imoveis = WebDriverWait(driver, 200).until(EC.visibility_of_all_elements_located((By.XPATH, "//div[@class = 'results-list js-results-list']/div")))
print("entrou")
for index, value in enumerate(imoveis):
driver.implicitly_wait(20)
imoveis = WebDriverWait(driver, 200).until(EC.visibility_of_all_elements_located((By.XPATH, "//div[@class = 'results-list js-results-list']/div")))
driver.execute_script("arguments[0].scrollIntoView(true);", imoveis[index])
time.sleep(3)
tamanho = imoveis[index].find_element(By.XPATH, ".//span[@class = 'property-card__detail-value js-property-card-value property-card__detail-area js-property-card-detail-area']").text
#driver.execute_script("arguments[0].scrollIntoView(true);", tamanho)
# tamanho = WebDriverWait(driver, 20).until(EC.visibility_of_element_located((By.XPATH, ".//span[@class = 'property-card__detail-value js-property-card-value property-card__detail-area js-property-card-detail-area']"))).text
print(tamanho)
driver.implicitly_wait(20)
preco = imoveis[index].find_element(By.XPATH, ".//div[@class = 'property-card__price js-property-card-prices js-property-card__price-small']/p").text
preco_imovel = "".join(re.findall('\d+', preco))
price = int(preco_imovel)
# driver.execute_script("arguments[0].scrollIntoView(true);", preco)
#preco = WebDriverWait(driver, 20).until(EC.visibility_of_element_located((By.XPATH, ".//div[@class = 'property-card__price js-property-card-prices js-property-card__price-small']/p"))).text
size = int(tamanho)
print(preco_imovel)
precos.append(int(price))
tamanhos.append(size)
preco_m2 = price/size
print(preco_m2)
precos_m2.append(preco_m2)
driver.save_screenshot("VIVAREAL.png")
try:
botao_proximo = WebDriverWait(driver, 100).until(EC.presence_of_element_located((By.XPATH, "//button[@title = 'Próxima página']")))
if botao_proximo.is_enabled():
driver.execute_script('arguments[0].click()', botao_proximo)
else:
break
except NoSuchElementException:
print("No more pages available")
break
tamanho_medio = sum(tamanhos)/quantidade_imoveis
preco_medio_total = sum(precos)/quantidade_imoveis
precos_m2_total = sum(precos_m2)/quantidade_imoveis
data_planilha.append({
'URL': url_base,
'Número de anúncios': quantidade_imoveis,
'Preço Médio': preco_medio_total,
'Tamanho Médio': tamanho_medio,
'Preço Médio por m2': precos_m2_total,
})
df_planilha = pd.DataFrame(data_planilha)
df_planilha.to_excel("catalogo_vivareal.xlsx")
driver.quit()
在此网站中,最后一页按钮如下所示:
<button class="js-change-page" title="Próxima página" data-page="" data-disabled="">
Próxima página >
</button>
data-page 是一个空字符串,并且有一个数据禁用属性,该属性不会出现在除最后一页之外的其他页面上。