I am new to Python and web scraping, and a beginner at programming in general, still practising. I am using Python and Selenium for the scraping, with some help from ChatGPT. Please bear in mind that it is still a work in progress :)
I am trying to scrape data from exhibition websites such as Heimtextil. My goal is to find and scrape each company's website link, along with the address and contact information on its exhibitor page, and then write those details into an Excel document.
I am particularly struggling to find the element with class="a-link--no-focus" and its href="link".
I can get the company names just fine; I just cannot get the code to find and open each exhibitor's page from the exhibitor list.
Here is a picture of the final result and the terminal output.
I would really appreciate your help!
import time
import pandas as pd
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import TimeoutException, NoSuchElementException
# Set up Selenium WebDriver
service = Service('/LOCATION/chromedriver')
driver = webdriver.Chrome(service=service)
try:
# Navigate to the exhibitor search page
url = 'https://heimtextil.messefrankfurt.com/frankfurt/en/exhibitor-search.html?country=AUT'
driver.get(url)
# Wait for the main container to load (use explicit wait)
try:
WebDriverWait(driver, 20).until(
EC.presence_of_element_located((By.CLASS_NAME, 'ex-exhibitor-search-result-item'))
)
print("Exhibitor containers loaded")
except TimeoutException:
print("Loading took too much time!")
# Scroll down multiple times to load more elements if lazy loading is present
for _ in range(3):
driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
time.sleep(5) # Wait for new content to load after scrolling
# Locate all exhibitor items
exhibitors = driver.find_elements(By.CLASS_NAME, 'ex-exhibitor-search-result-item')
print(f"Found {len(exhibitors)} exhibitors")
# Extract data
data = []
base_url = 'https://heimtextil.messefrankfurt.com' # Base URL for detail pages
for exhibitor in exhibitors:
# Scroll each exhibitor into view
driver.execute_script("arguments[0].scrollIntoView();", exhibitor)
time.sleep(1) # Small delay to allow scrolling to complete
# Get the company name
try:
company_name = exhibitor.find_element(By.CLASS_NAME, 'ex-exhibitor-search-result-item__headline').text.strip()
except NoSuchElementException:
company_name = 'N/A'
print("Error getting company name")
# Get the link to the exhibitor's detail page
detail_page_url = 'N/A'
try:
detail_page_tag = exhibitor.find_element(By.CSS_SELECTOR, 'a.a-link--no-focus')
            href = detail_page_tag.get_attribute('href')
            # get_attribute('href') normally returns an absolute URL already,
            # so only join with base_url if the value is relative
            detail_page_url = href if href.startswith('http') else base_url + href
print(f"Detail page URL found: {detail_page_url}")
except NoSuchElementException:
print("No detail page link found for exhibitor")
# Now navigate to the detail page if it exists
website_url = 'N/A'
if detail_page_url != 'N/A':
driver.get(detail_page_url)
time.sleep(5) # Wait for the detail page to load
# Try to locate the website URL in the detail page
try:
website_tag = driver.find_element(By.CSS_SELECTOR, 'a.ex-contact-box__website-link')
website_url = website_tag.get_attribute('href')
print(f"Found website URL: {website_url}")
except NoSuchElementException:
print("No website URL found on detail page")
# Return to the main page
driver.back()
time.sleep(5)
# Append extracted data to the list
data.append({
'Company': company_name,
'Website': website_url,
})
# Convert to DataFrame
df = pd.DataFrame(data)
# Check if DataFrame contains the expected data
print("DataFrame columns:", df.columns)
print(df.head()) # Display first few rows to verify data
# Save to Excel
df.to_excel('heimtextil_exhibitors.xlsx', index=False)
finally:
# Close the browser
driver.quit()
print("Browser closed")
I have looked through the website you provided. All of the company website addresses mentioned can be found in the HTML, in anchors with the class icon icon-news-before ex-contact-box__website-link:
<a class="icon icon-news-before ex-contact-box__website-link" href="http://www.bandex.com" rel="noopener noreferrer" target="_blank"><span><strong>Our website</strong></span></a>
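Because a CSS class selector matches an element as long as that class appears anywhere in its class attribute, the last class alone is enough to target these anchors. A minimal check, assuming the detail page is already loaded in driver:

for tag in driver.find_elements(By.CSS_SELECTOR, 'a.ex-contact-box__website-link'):
    print(tag.get_attribute('href'))  # e.g. http://www.bandex.com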
Here is the updated code:
import time
import pandas as pd
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import TimeoutException, NoSuchElementException
service = Service('/LOCATION/chromedriver')
driver = webdriver.Chrome(service=service)
try:
url = 'https://heimtextil.messefrankfurt.com/frankfurt/en/exhibitor-search.html?country=AUT'
driver.get(url)
WebDriverWait(driver, 20).until(
EC.presence_of_element_located((By.CLASS_NAME, 'ex-exhibitor-search-result-item'))
)
for _ in range(3):
driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
time.sleep(5)
exhibitors = driver.find_elements(By.CLASS_NAME, 'ex-exhibitor-search-result-item')
    base_url = 'https://heimtextil.messefrankfurt.com'  # Base URL for detail pages
    # First pass: collect names and detail-page URLs while the result list is
    # still on screen. Navigating away makes the found elements stale, so no
    # page loads happen inside this loop.
    records = []
    for exhibitor in exhibitors:
        driver.execute_script("arguments[0].scrollIntoView();", exhibitor)
        time.sleep(1)  # Small delay to allow scrolling to complete
        try:
            company_name = exhibitor.find_element(By.CLASS_NAME, 'ex-exhibitor-search-result-item__headline').text.strip()
        except NoSuchElementException:
            company_name = 'N/A'
        detail_page_url = 'N/A'
        try:
            detail_page_tag = exhibitor.find_element(By.CSS_SELECTOR, 'a.a-link--no-focus')
            href = detail_page_tag.get_attribute('href')
            # get_attribute('href') normally returns an absolute URL already,
            # so only join with base_url if the value is relative
            detail_page_url = href if href.startswith('http') else base_url + href
        except NoSuchElementException:
            pass
        records.append((company_name, detail_page_url))
    # Second pass: visit each detail page and collect every website link.
    data = []
    for company_name, detail_page_url in records:
        website_url = 'N/A'
        if detail_page_url != 'N/A':
            driver.get(detail_page_url)
            time.sleep(5)
            # find_elements returns an empty list when nothing matches; it never raises
            website_tags = driver.find_elements(By.CSS_SELECTOR, 'a.ex-contact-box__website-link')
            website_urls = [tag.get_attribute('href') for tag in website_tags]
            if website_urls:
                website_url = ', '.join(website_urls)
        data.append({
            'Company': company_name,
            'Website': website_url,
        })
df = pd.DataFrame(data)
df.to_excel('heimtextil_exhibitors.xlsx', index=False)
print("Data successfully saved to heimtextil_exhibitors.xlsx")
finally:
driver.quit()
print("Browser closed")