我想从以下链接的页面中提取表格:https://www.rfi.it/en/stations.html。 但我只能提取第1页的数据。我需要提取所有页面的数据。 你们能帮我吗?谢谢
我对编码不熟悉,所以我使用了 github copilot,但它无法帮助我。代码如下:
from selenium import webdriver
from selenium.webdriver.edge.service import Service
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
import pandas as pd
from bs4 import BeautifulSoup
import time

# Scrape the paginated station table from the RFI site with Selenium,
# accumulating every page's rows into a pandas DataFrame.

# Initialize WebDriver with the correct path to your Edge WebDriver executable
service = Service('D:/Downloads/edgedriver_win64/msedgedriver.exe')
driver = webdriver.Edge(service=service)

# Collected rows; the first entry (added on page 1) is the header row.
data = []

try:
    # Navigate to the URL
    url = 'https://www.rfi.it/it/stazioni.html'
    driver.get(url)

    # Accept cookies if the popup appears; failure here is non-fatal,
    # so we log and continue rather than abort the whole run.
    try:
        accept_cookies_button = WebDriverWait(driver, 10).until(
            EC.element_to_be_clickable((By.XPATH, '//button[contains(text(), "Accetta tutti i cookie")]'))
        )
        accept_cookies_button.click()
    except Exception as e:
        print(f"No cookies button found or unable to click it. Error: {e}")

    # Wait for the LISTA tab to be clickable and click it using JavaScript
    # (a plain .click() can fail when another element overlays the tab).
    lista_tab = WebDriverWait(driver, 10).until(
        EC.element_to_be_clickable((By.XPATH, '//a[contains(text(), "LISTA")]'))
    )
    driver.execute_script("arguments[0].click();", lista_tab)

    max_pages = 5  # Safety cap on the number of pages to scrape
    current_page = 1

    while current_page <= max_pages:
        print(f"Processing page {current_page}")

        # Parse the currently rendered HTML content using BeautifulSoup
        soup = BeautifulSoup(driver.page_source, 'html.parser')

        # Find the table element
        table = soup.find('table', class_='table table-striped table-hover')

        if table:
            headers = [header.text.strip() for header in table.find_all('th')]
            print(f"Headers: {headers}")  # Debugging statement
            if current_page == 1:
                data.append(headers)  # Add headers only once
            for row in table.find_all('tr'):
                cols = row.find_all('td')
                if cols:  # Ensure the row has columns (skips the header row)
                    cols = [ele.text.strip() for ele in cols]
                    print(f"Row data: {cols}")  # Debugging statement
                    data.append(cols)
        else:
            print(f"No table found on page {current_page}. Stopping.")
            break

        # Find the "successivi" (next) button and click it.
        # NOTE(review): per the reported output, this XPath never matches an
        # element on this site, so the loop always stops after page 1 —
        # inspect the real pagination control in the browser devtools (it may
        # be an <a>, or the full list may be embedded in the page as JSON with
        # no true pagination) and update the selector accordingly.
        try:
            next_button = WebDriverWait(driver, 10).until(
                EC.element_to_be_clickable((By.XPATH, '//button[contains(@class, "pagination--next-btn")]'))
            )
            driver.execute_script("arguments[0].click();", next_button)
            time.sleep(2)  # Wait for the page to load
        except Exception as e:
            print(f"No more pages or unable to find the next button. Stopping. Error: {e}")
            break

        # Increment the page counter
        current_page += 1
finally:
    # Always release the browser process, even if scraping raised.
    driver.quit()

# Create a DataFrame from the list of data (first row = header)
if data:
    print(f"Data collected: {data}")  # Debugging statement
    df = pd.DataFrame(data[1:], columns=data[0])  # Use the first row as header
    df = df.dropna(how='all')  # Drop rows where all elements are NaN
else:
    df = pd.DataFrame()

# Display the DataFrame
print(df)
结果总是:没有更多页面或找不到下一页按钮。停下来。错误: 消息:
收集的数据:[['Stazione', 'Indirizzo', 'Comune', 'Provincia', 'Classificazione'], ['Abano Terme', 'Via Della Stazione, 10', 'Abano Terme', ....(仅第 1 页的表格数据)
您要查找的数据嵌入在 HTML 页面中,因此不需要分页:
import requests
from bs4 import BeautifulSoup
import json

# The full station list is embedded in the page as JSON (in the element
# with id "stationsJSON"), so no pagination or browser automation is needed.
url = "https://www.rfi.it/en/stations.html"

# A timeout prevents the request from hanging forever, and raise_for_status()
# surfaces HTTP errors (404/500) instead of silently parsing an error page.
response = requests.get(url, timeout=30)
response.raise_for_status()

soup = BeautifulSoup(response.content, "html.parser")

# Fail with a clear message if the site layout changes and the element
# disappears, instead of an opaque TypeError on the subscript.
station_element = soup.select_one("#stationsJSON")
if station_element is None:
    raise RuntimeError("Element #stationsJSON not found - the page layout may have changed")

data = json.loads(station_element["value"])

# pretty print the data:
print(json.dumps(data, indent=4))
打印:
[
{
"name": "Borgo S.Martino",
"loc": {
"n": 1,
"add": "Via Stazione, 3",
"lat": "45.0871605076522",
"lng": "8.5207003862554007",
"acc": false
},
"pr": "al",
"rg": "piemonte",
"ct": "Borgo San Martino",
"lk": "borgo-s-martino.html"
},
{
"name": "Funo Centergross",
"loc": {
"n": 2,
"add": "Strada Provinciale Galliera",
"lat": "44.5929334611169",
"lng": "11.370742907462899",
"acc": false
},
"pr": "bo",
"rg": "emilia romagna",
"ct": "Argelato",
"lk": "funo-centergross.html"
},
...