Unable to find the link element when scraping an exhibition website with Python


I am new to Python and web scraping. I'm a beginner at programming and still practicing; I'm using Python and Selenium for the scraping and ChatGPT to help me along. Please keep in mind that it is still a work in progress :)

I am trying to scrape data from exhibition websites such as Heimtextil. My goal is to find and scrape each company's website link, along with the address and contact information on its exhibitor page, and then write those details to an Excel document.

I am particularly struggling to find the element with class="a-link--no-focus" and read its href attribute.

[Screenshot: exhibitor list]

[Screenshot: exhibitor page]

I can get the company names just fine; I simply cannot get the code to find/open the exhibitor pages from the exhibitor list.
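
The part I am stuck on boils down to this minimal sketch, pulled out of the full script below (it assumes the anchor lives in the regular DOM rather than an iframe or shadow root):

item = driver.find_element(By.CLASS_NAME, 'ex-exhibitor-search-result-item')
link = item.find_element(By.CSS_SELECTOR, 'a.a-link--no-focus')
print(link.get_attribute('href'))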

Here is a picture of the final result and the terminal output:

[Screenshot: final result]

I would really appreciate your help!

import time
import pandas as pd
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import TimeoutException, NoSuchElementException

# Set up Selenium WebDriver
service = Service('/LOCATION/chromedriver')
driver = webdriver.Chrome(service=service)

try:
    # Navigate to the exhibitor search page
    url = 'https://heimtextil.messefrankfurt.com/frankfurt/en/exhibitor-search.html?country=AUT'
    driver.get(url)

    # Wait for the main container to load (use explicit wait)
    try:
        WebDriverWait(driver, 20).until(
            EC.presence_of_element_located((By.CLASS_NAME, 'ex-exhibitor-search-result-item'))
        )
        print("Exhibitor containers loaded")
    except TimeoutException:
        print("Loading took too much time!")

    # Scroll down multiple times to load more elements if lazy loading is present
    for _ in range(3):
        driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
        time.sleep(5)  # Wait for new content to load after scrolling

    # Locate all exhibitor items
    exhibitors = driver.find_elements(By.CLASS_NAME, 'ex-exhibitor-search-result-item')
    print(f"Found {len(exhibitors)} exhibitors")

    # Extract data
    data = []
    base_url = 'https://heimtextil.messefrankfurt.com'  # Base URL for detail pages

    for exhibitor in exhibitors:
        # Scroll each exhibitor into view
        driver.execute_script("arguments[0].scrollIntoView();", exhibitor)
        time.sleep(1)  # Small delay to allow scrolling to complete

        # Get the company name
        try:
            company_name = exhibitor.find_element(By.CLASS_NAME, 'ex-exhibitor-search-result-item__headline').text.strip()
        except NoSuchElementException:
            company_name = 'N/A'
            print("Error getting company name")

        # Get the link to the exhibitor's detail page
        detail_page_url = 'N/A'
        try:
            detail_page_tag = exhibitor.find_element(By.CSS_SELECTOR, 'a.a-link--no-focus')
            detail_page_url = base_url + detail_page_tag.get_attribute('href')
            print(f"Detail page URL found: {detail_page_url}")
        except NoSuchElementException:
            print("No detail page link found for exhibitor")

        # Now navigate to the detail page if it exists
        website_url = 'N/A'
        if detail_page_url != 'N/A':
            driver.get(detail_page_url)
            time.sleep(5)  # Wait for the detail page to load

            # Try to locate the website URL in the detail page
            try:
                website_tag = driver.find_element(By.CSS_SELECTOR, 'a.ex-contact-box__website-link')
                website_url = website_tag.get_attribute('href')
                print(f"Found website URL: {website_url}")
            except NoSuchElementException:
                print("No website URL found on detail page")

            # Return to the main page
            driver.back()
            time.sleep(5)

        # Append extracted data to the list
        data.append({
            'Company': company_name,
            'Website': website_url,
        })

    # Convert to DataFrame
    df = pd.DataFrame(data)

    # Check if DataFrame contains the expected data
    print("DataFrame columns:", df.columns)
    print(df.head())  # Display first few rows to verify data

    # Save to Excel
    df.to_excel('heimtextil_exhibitors.xlsx', index=False)

finally:
    # Close the browser
    driver.quit()
    print("Browser closed")

Tags: python, selenium-webdriver, web-scraping, hyperlink
1 Answer

I have had a look at the website you provided. All of the company website addresses can be found in the HTML under the class icon icon-news-before ex-contact-box__website-link:

<a class="icon icon-news-before ex-contact-box__website-link" href="http://www.bandex.com" rel="noopener noreferrer" target="_blank"><span><strong>Our website</strong></span></a>
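
Once a detail page is open, those anchors can be collected with a CSS selector on the last class. Note that Selenium's get_attribute('href') already returns the resolved absolute URL, so nothing needs to be prepended (a minimal sketch of just this step):

website_tags = driver.find_elements(By.CSS_SELECTOR, 'a.ex-contact-box__website-link')
website_urls = [tag.get_attribute('href') for tag in website_tags]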

Here is the updated code. Two other fixes are worth pointing out: get_attribute('href') already returns an absolute URL, so prepending base_url corrupts it; and the search-result elements go stale as soon as the driver navigates away, so the script now collects all detail-page URLs from the list first and only then visits them.

import time
import pandas as pd
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import TimeoutException, NoSuchElementException

service = Service('/LOCATION/chromedriver') 
driver = webdriver.Chrome(service=service)

try:

    url = 'https://heimtextil.messefrankfurt.com/frankfurt/en/exhibitor-search.html?country=AUT'
    driver.get(url)

    WebDriverWait(driver, 20).until(
        EC.presence_of_element_located((By.CLASS_NAME, 'ex-exhibitor-search-result-item'))
    )

    for _ in range(3):
        driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
        time.sleep(5)

    exhibitors = driver.find_elements(By.CLASS_NAME, 'ex-exhibitor-search-result-item')

    # First pass: collect names and detail-page URLs while the list page
    # is still loaded; the elements go stale after any navigation
    listing = []

    for exhibitor in exhibitors:
        driver.execute_script("arguments[0].scrollIntoView();", exhibitor)
        time.sleep(1)  # Small delay to allow scrolling to complete
        try:
            company_name = exhibitor.find_element(By.CLASS_NAME, 'ex-exhibitor-search-result-item__headline').text.strip()
        except NoSuchElementException:
            company_name = 'N/A'

        detail_page_url = 'N/A'
        try:
            detail_page_tag = exhibitor.find_element(By.CSS_SELECTOR, 'a.a-link--no-focus')
            # get_attribute('href') already returns an absolute URL
            detail_page_url = detail_page_tag.get_attribute('href')
        except NoSuchElementException:
            pass

        listing.append((company_name, detail_page_url))

    # Second pass: visit each detail page and collect the website links
    data = []

    for company_name, detail_page_url in listing:
        website_url = 'N/A'
        if detail_page_url != 'N/A':
            driver.get(detail_page_url)
            time.sleep(5)

            # Find all 'a' elements with the class 'ex-contact-box__website-link';
            # find_elements returns an empty list (no exception) when nothing matches
            website_tags = driver.find_elements(By.CSS_SELECTOR, 'a.ex-contact-box__website-link')
            website_urls = [website_tag.get_attribute('href') for website_tag in website_tags]
            website_url = ', '.join(website_urls) if website_urls else 'N/A'

        data.append({
            'Company': company_name,
            'Website': website_url,
        })

    df = pd.DataFrame(data)

    df.to_excel('heimtextil_exhibitors.xlsx', index=False)

    print("Data successfully saved to heimtextil_exhibitors.xlsx")

finally:
    driver.quit()
    print("Browser closed")