我正在开发一个个人项目,使用 Python 和 Selenium 从中小企业机构的数据库中抓取动态数据。网页以“卡片”格式显示公司信息,单击按钮动态加载包含费用详细信息的表格。
但是,我遇到以下问题:
按钮被检测到并成功点击(通过打印语句验证)。 即使在等待表加载后,动态表数据也不会出现在 driver.page_source 中。
目标页面: https://ma-shienkikan.go.jp/search
下面是点击按钮并尝试检索表数据的代码:
def get_fee_details_optimized(card, fee_type, side):
try:
# Determine button color based on fee_type
if fee_type == "FA":
button_color = "bg-yellow-500"
elif fee_type == "仲介":
button_color = "bg-green-500"
else:
raise ValueError(f"Invalid fee_type: {fee_type}")
# Find the button based on side (either transferor or transferee)
button = card.find_element(
By.XPATH,
f".//a[contains(@class, '{button_color}') and contains(text(), '{side}側')]"
)
# Use JavaScript to click the button
driver.execute_script("arguments[0].click();", button)
# Wait for the table to appear
WebDriverWait(driver, 10).until(EC.presence_of_element_located((By.TAG_NAME, "table")))
# Parse the updated page source with BeautifulSoup
updated_html = driver.page_source
updated_soup = BeautifulSoup(updated_html, "html.parser")
table = updated_soup.find("table")
if table:
return table.prettify()
else:
return "No table found"
except Exception as e:
print(f"Error retrieving fee details for {fee_type} {side}側: {e}")
return "N/A"
使用打印语句验证是否检测到并成功单击了按钮。
手动测试浏览器中的按钮单击,以确保表数据按预期加载。
使用 WebDriverWait 为表加载留出足够的时间。
检查浏览器开发者工具中的网络活动,以识别相关的 API 调用。
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from bs4 import BeautifulSoup
import pandas as pd
import time
import re
# Path to ChromeDriver
chromedriver_path = "/Users/kazatoy/PLEX/chromedriver"
# Chrome options
options = Options()
options.add_argument("--headless") # Uncomment if headless mode is needed
options.add_argument("--no-sandbox")
options.add_argument("--disable-dev-shm-usage")
# Set up ChromeDriver service
service = Service(chromedriver_path)
driver = webdriver.Chrome(service=service, options=options)
# Data storage
data = []
# Base URL
base_url = "https://ma-shienkikan.go.jp/search"
# Format table data
def format_table_data(table_html):
soup = BeautifulSoup(table_html, "html.parser")
rows = soup.find("tbody").find_all("tr")
formatted_rows = []
for row in rows:
cols = row.find_all("td")
if len(cols) == 2:
# Extract range data and fee rate
amount_range = cols[0].get_text(strip=True)
rate = cols[1].get_text(strip=True)
formatted_rows.append(f"{amount_range} {rate}")
return "\n".join(formatted_rows)
# Function to click button and retrieve table data
def get_fee_details_optimized(card, fee_type, side):
try:
# Determine button color based on fee_type
if fee_type == "FA":
button_color = "bg-yellow-500"
elif fee_type == "仲介":
button_color = "bg-green-500"
else:
raise ValueError(f"Invalid fee_type: {fee_type}")
# Find the button based on side (either transferor or transferee)
button = card.find_element(
By.XPATH,
f".//a[contains(@class, '{button_color}') and contains(text(), '{side}側')]"
)
print(button)
# Click the button using JavaScript
driver.execute_script("arguments[0].click();", button)
print(f"Clicked Button: {fee_type} {side}側 using JavaScript.")
# Wait for the table to appear
WebDriverWait(driver, 10).until(EC.presence_of_element_located((By.TAG_NAME, "table")))
# Parse the updated page source
updated_html = driver.page_source
updated_soup = BeautifulSoup(updated_html, "html.parser")
# Retrieve table data
table = updated_soup.find("table")
if table:
return format_table_data(table.prettify())
else:
print(f"No table found for {fee_type} {side}側")
return "No table found"
except Exception as e:
print(f"Error retrieving fee details for {fee_type} {side}側: {e}")
return "N/A"
try:
for page in range(1, 142): # Adjust page range as needed
print(f"Processing page {page}")
driver.get(f"{base_url}?page={page}")
# Wait for cards to be visible
WebDriverWait(driver, 10).until(EC.presence_of_all_elements_located((By.CLASS_NAME, "section-card")))
cards = driver.find_elements(By.CLASS_NAME, "section-card")
for card in cards:
try:
card_html = card.get_attribute('outerHTML')
soup = BeautifulSoup(card_html, "html.parser")
# Extract company name
company_name = soup.find("h1", class_="text-base sm:text-lg py-4 font-bold bg-main text-white px-4 -mx-4 sm:-mx-8")
company_name = company_name.find("div", class_="inline-block").get_text(strip=True) if company_name else "N/A"
if company_name == "N/A":
continue
# Extract additional data
support_type = soup.find("span", text="M&A支援機関の種類")
support_type = support_type.find_next("span").get_text(strip=True) if support_type else "N/A"
# Retrieve table data for each button
mediator_fee_transferee = get_fee_details_optimized(card, "仲介", "譲受")
mediator_fee_transferor = get_fee_details_optimized(card, "仲介", "譲渡")
fa_fee_transferee = get_fee_details_optimized(card, "FA", "譲受")
fa_fee_transferor = get_fee_details_optimized(card, "FA", "譲渡")
# Save data
data.append([
company_name, support_type, mediator_fee_transferee,
mediator_fee_transferor, fa_fee_transferee, fa_fee_transferor
])
except Exception as e:
print(f"Error processing card: {e}")
finally:
driver.quit()
# Save data to CSV
df = pd.DataFrame(data, columns=["Company Name", "Support Type", "Mediator Fee (Transferee)", "Mediator Fee (Transferor)", "FA Fee (Transferee)", "FA Fee (Transferor)"])
output_file = "scraped_data.csv"
df.to_csv(output_file, index=False, encoding="utf-8-sig")
print(f"Data saved to {output_file}")
我要检索的表格是所附图像部分
您还没有处理过卡片不包含绿色或黄色按钮的情况。 在某些情况下,按钮会被禁用并且呈灰色。单击它们不会返回表格。
当您收到错误“不存在这样的元素”时,该按钮被禁用,并且您尝试检索的数据不存在。
为了可靠地获取数据,您必须检查按钮是否被禁用。
不要通过背景颜色查找按钮元素,而是尝试先找到按钮,然后检查它们是否为灰色。这样您就可以验证该按钮是否已禁用并且没有要收集的数据表。 如果它不是灰色的,您可以继续收集数据。否则,继续下一张卡片。
我更新了以下代码中的
get_fee_details_optimized
函数。
更新了按钮 XPath,使其仅根据 Fee_type 值和 side 值获取该行的按钮,而不检查其颜色。 然后获取
button_color
中的类值,并在点击之前检查它是否不包含bg-gray-300
。
试试这个,
# Function to click button and retrieve table data
def get_fee_details_optimized(card, fee_type, side):
try:
# Determine button color based on fee_type
if fee_type == "FA":
button_color = "bg-yellow-500"
elif fee_type == "仲介":
button_color = "bg-green-500"
else:
raise ValueError(f"Invalid fee_type: {fee_type}")
# Find the button based on side (either transferor or transferee)
button = card.find_element(
By.XPATH,
f"//div[@x-ref='container']/div[span[contains(.,'{fee_type}手数料体系')]]/div[@class='flex gap-2']/a[contains(.,'{side}側')]"
)
button_color = button.get_attribute('class')
# Click the button using JavaScript when it is not disabled
if 'bg-gray-300' not in button_color:
driver.execute_script("arguments[0].click();", button)
print(f"Clicked Button: {fee_type} {side}側 using JavaScript.")
# Wait for the table to appear
WebDriverWait(driver, 10).until(EC.presence_of_element_located((By.TAG_NAME, "table")))
# Parse the updated page source
updated_html = driver.page_source
updated_soup = BeautifulSoup(updated_html, "html.parser")
# Retrieve table data
table = updated_soup.find("table")
if table:
return format_table_data(table.prettify())
else:
print(f"No table found for {fee_type} {side}側")
return "No table found"
except Exception as e:
print(f"Error retrieving fee details for {fee_type} {side}側: {e}")
return "N/A"