所以我尝试使用 Selenium 创建一个代码来抓取动态网站。 我目前陷入如何从多个列表中提取数据的困境(该页面有多个列表)。
这是代码:
from selenium import webdriver
#to enable Wait for Page Loading
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
#to enable scrolling
from selenium.webdriver.common.keys import Keys
from bs4 import BeautifulSoup
import requests
#Initialize browser
options = webdriver.ChromeOptions()
#options.add_argument('--headless') # Not headless because there's an error with Hotjar
driver = webdriver.Chrome(options=options)
url = "https://www.archify.com/id/professionals"
driver.get(url)
#Click Load More Button
l = driver.find_element("xpath", "//button[text()='Load More']")
l.click()
# Scroll until bottom of the page
driver.execute_script("window.scrollTo(0, document.body.scrollHeight)")
#Tell browser to wait for for loading (for elements to load / for 60 seconds)
wait = WebDriverWait(driver, 60)
# Extract product details
product_elements = driver.find_elements(By.CLASS_NAME, 'professional-box')
product_data = []
for product_element in product_elements:
link = product_element.find_element(By.NAME, 'href').text
title = product_element.find_element(By.NAME, '.title').text
subtitle = product_element.find_element(By.NAME,"subtitle").text
product_data.append({'title': title, 'subtitle': subtitle, 'link': link})
# Print extracted data
for product in product_data:
print(f"Title: {product['title']}, Subtitle: {product['subtitle']}, link: {product['link']}")
driver.quit
以及下面的回溯:
PS C:\Users\user\Desktop\Code> & C:/Users/user/AppData/Local/Programs/Python/Python311/python.exe c:/Users/user/Desktop/Code/Scrape_Selenium.py
DevTools listening on ws://127.0.0.1:51089/devtools/browser/581a68b5-c26f-42fd-93e0-f4970c14fd1b
Traceback (most recent call last):
File "c:\Users\user\Desktop\Code\Scrape_Selenium.py", line 43, in <module>
link = product_element.find_element(By.NAME, 'href').text
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "C:\Users\user\AppData\Local\Programs\Python\Python311\Lib\site-packages\selenium\webdriver\remote\webelement.py", line 417, in find_element
return self._execute(Command.FIND_CHILD_ELEMENT, {"using": by, "value": value})["value"]
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "C:\Users\user\AppData\Local\Programs\Python\Python311\Lib\site-packages\selenium\webdriver\remote\webelement.py", line 395, in _execute
return self._parent.execute(command, params)
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "C:\Users\user\AppData\Local\Programs\Python\Python311\Lib\site-packages\selenium\webdriver\remote\webdriver.py", line 347, in execute
self.error_handler.check_response(response)
File "C:\Users\user\AppData\Local\Programs\Python\Python311\Lib\site-packages\selenium\webdriver\remote\errorhandler.py", line 229, in check_response
raise exception_class(message, screen, stacktrace)
selenium.common.exceptions.NoSuchElementException: Message: no such element: Unable to locate element: {"method":"css selector","selector":"[name="href"]"}
(Session info: chrome=122.0.6261.112); For documentation on this error, please visit: https://www.selenium.dev/documentation/webdriver/troubleshooting/errors#no-such-element-exception
Stacktrace:
GetHandleVerifier [0x00007FF6B60AAD32+56930]
(No symbol) [0x00007FF6B601F632]
(No symbol) [0x00007FF6B5ED42E5]
(No symbol) [0x00007FF6B5F198ED]
(No symbol) [0x00007FF6B5F19A2C]
(No symbol) [0x00007FF6B5F0F13C]
(No symbol) [0x00007FF6B5F3BCDF]
(No symbol) [0x00007FF6B5F0F09A]
(No symbol) [0x00007FF6B5F3BEB0]
(No symbol) [0x00007FF6B5F581E2]
(No symbol) [0x00007FF6B5F3BA43]
(No symbol) [0x00007FF6B5F0D438]
(No symbol) [0x00007FF6B5F0E4D1]
GetHandleVerifier [0x00007FF6B6426ABD+3709933]
GetHandleVerifier [0x00007FF6B647FFFD+4075821]
GetHandleVerifier [0x00007FF6B647818F+4043455]
GetHandleVerifier [0x00007FF6B6149766+706710]
(No symbol) [0x00007FF6B602B90F]
(No symbol) [0x00007FF6B6026AF4]
(No symbol) [0x00007FF6B6026C4C]
(No symbol) [0x00007FF6B6016904]
BaseThreadInitThunk [0x00007FFEA0697344+20]
RtlUserThreadStart [0x00007FFEA08A26B1+33]
我看到另一个步骤提到我需要将“product_elements”放入for循环中,但我想我已经在上面的代码中做到了这一点。
谢谢你
from selenium import webdriver
from selenium.webdriver.support.ui import WebDriverWait
from bs4 import BeautifulSoup
options = webdriver.ChromeOptions()
driver = webdriver.Chrome(options=options)
url = "https://www.archify.com/id/professionals"
driver.get(url)
l = driver.find_element("xpath", "//button[text()='Load More']")
l.click()
driver.execute_script("window.scrollTo(0, document.body.scrollHeight)")
wait = WebDriverWait(driver, 60)
soup = BeautifulSoup(driver.page_source, 'html.parser')
# Here you made mistake
product_elements = soup.find_all('div', class_='professional-box') # find all div element
product_data = []
for product_element in product_elements:
content = product_element.find('div', class_='text-box type-a')
if content:
href = content.find('a').get('href')
title = content.find('p', class_='title').text
subtitle = content.find('p', class_='subtitle').text
product_data.append({'title': title, 'subtitle': subtitle, 'link': href})
# Print extracted data
for product in product_data:
print(f"Title: {product['title']}, Subtitle: {product['subtitle']}, link: {product['link']}")
driver.quit()