Extracting a paginated HTML table when the URL does not change between pages

Question (0 votes, 1 answer)

I want to extract the station table from the following link: https://www.rfi.it/en/stations.html. However, I can only extract the data from page 1; I need the data from all pages. Could you help me? Thanks.

I'm new to coding, so I used GitHub Copilot, but it couldn't solve this. The code is below:

from selenium import webdriver
from selenium.webdriver.edge.service import Service
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
import pandas as pd
from bs4 import BeautifulSoup
import time

# Initialize WebDriver with the correct path to your Edge WebDriver executable
service = Service('D:/Downloads/edgedriver_win64/msedgedriver.exe')
driver = webdriver.Edge(service=service)

# Navigate to the URL
url = 'https://www.rfi.it/it/stazioni.html'
driver.get(url)

# Accept cookies if the popup appears
try:
    accept_cookies_button = WebDriverWait(driver, 10).until(
        EC.element_to_be_clickable((By.XPATH, '//button[contains(text(), "Accetta tutti i cookie")]'))
    )
    accept_cookies_button.click()
except Exception as e:
    print(f"No cookies button found or unable to click it. Error: {e}")

# Wait for the LISTA tab to be clickable and click it using JavaScript
lista_tab = WebDriverWait(driver, 10).until(
    EC.element_to_be_clickable((By.XPATH, '//a[contains(text(), "LISTA")]'))
)
driver.execute_script("arguments[0].click();", lista_tab)

# Initialize variables
data = []
max_pages = 5  # Set the maximum number of pages to scrape
current_page = 1

while current_page <= max_pages:
    print(f"Processing page {current_page}")
    
    # Parse the HTML content using BeautifulSoup
    soup = BeautifulSoup(driver.page_source, 'html.parser')
    
    # Find the table element
    table = soup.find('table', class_='table table-striped table-hover')
    
    # Check if the table exists
    if table:
        headers = [header.text.strip() for header in table.find_all('th')]
        print(f"Headers: {headers}")  # Debugging statement
        if current_page == 1:
            data.append(headers)  # Add headers only once
        
        rows = table.find_all('tr')
        for row in rows:
            cols = row.find_all('td')
            if cols:  # Ensure the row has columns
                cols = [ele.text.strip() for ele in cols]
                print(f"Row data: {cols}")  # Debugging statement
                data.append(cols)
    else:
        print(f"No table found on page {current_page}. Stopping.")
        break
    
    # Find the "successivi" button and click it
    try:
        next_button = WebDriverWait(driver, 10).until(
            EC.element_to_be_clickable((By.XPATH, '//button[contains(@class, "pagination--next-btn")]'))
        )
        driver.execute_script("arguments[0].click();", next_button)
        time.sleep(2)  # Wait for the page to load
    except Exception as e:
        print(f"No more pages or unable to find the next button. Stopping. Error: {e}")
        break
    
    # Increment the page counter
    current_page += 1

# Close the WebDriver
driver.quit()

# Create a DataFrame from the list of data
if data:
    print(f"Data collected: {data}")  # Debugging statement
    df = pd.DataFrame(data[1:], columns=data[0])  # Use the first row as header
    df = df.dropna(how='all')  # Drop rows where all elements are NaN
else:
    df = pd.DataFrame()

# Display the DataFrame
print(df)

The result is always: "No more pages or unable to find the next button. Stopping. Error: Message:"

Data collected: [['Stazione', 'Indirizzo', 'Comune', 'Provincia', 'Classificazione'], ['Abano Terme', 'Via Della Stazione, 10', 'Abano Terme', ... (page 1 of the table only)

python web-scraping
1 Answer

0 votes

The data you are looking for is embedded directly in the HTML page, so no pagination is needed:

import requests
from bs4 import BeautifulSoup
import json

url = "https://www.rfi.it/en/stations.html"

# Download the page once and parse it
soup = BeautifulSoup(requests.get(url).content, "html.parser")

# The full station list is stored as JSON in the "value" attribute
# of the element with id="stationsJSON"
data = soup.select_one("#stationsJSON")["value"]
data = json.loads(data)

# pretty print the data:
print(json.dumps(data, indent=4))

Prints:

[
    {
        "name": "Borgo S.Martino",
        "loc": {
            "n": 1,
            "add": "Via Stazione, 3",
            "lat": "45.0871605076522",
            "lng": "8.5207003862554007",
            "acc": false
        },
        "pr": "al",
        "rg": "piemonte",
        "ct": "Borgo San Martino",
        "lk": "borgo-s-martino.html"
    },
    {
        "name": "Funo Centergross",
        "loc": {
            "n": 2,
            "add": "Strada Provinciale Galliera",
            "lat": "44.5929334611169",
            "lng": "11.370742907462899",
            "acc": false
        },
        "pr": "bo",
        "rg": "emilia romagna",
        "ct": "Argelato",
        "lk": "funo-centergross.html"
    },

...
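
Since the original goal was a table like the one on the site, a minimal follow-up sketch can flatten this JSON into a pandas DataFrame. Note that the column mapping below (name → Stazione, loc.add → Indirizzo, ct → Comune, pr → Provincia) is an assumption inferred from the field names in the sample above; "Classificazione" does not appear among the JSON keys shown, so it is left out:

import json
import requests
import pandas as pd
from bs4 import BeautifulSoup

url = "https://www.rfi.it/en/stations.html"
soup = BeautifulSoup(requests.get(url).content, "html.parser")
stations = json.loads(soup.select_one("#stationsJSON")["value"])

# json_normalize flattens the nested "loc" dict into dotted columns
# such as "loc.add", "loc.lat", "loc.lng"
df = pd.json_normalize(stations)

# Rename to the headers the site's LISTA table uses; this mapping is
# a guess based on the field names in the JSON sample above
df = df.rename(columns={
    "name": "Stazione",
    "loc.add": "Indirizzo",
    "ct": "Comune",
    "pr": "Provincia",
})

print(df[["Stazione", "Indirizzo", "Comune", "Provincia"]])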