我目前有一个从特定网站抓取数据的项目。下面是我从 2023 年起一直使用的抓取代码;到了 2024 年,该网站发生了变化,这段代码不再能正常抓取并显示数据,而我不知道该如何修复。希望有人能告诉我怎样让它重新工作;如果问题得到解决,我愿意请您喝一杯学生价的咖啡以表谢意。感谢管理员批准我的帖子。
import requests
from bs4 import BeautifulSoup
import pandas as pd
import numpy as np
import time
from tqdm import tqdm
import re
def get_soup(url, headers, timeout=30):
    """Download ``url`` and parse the response body as HTML.

    Args:
        url: Address of the page to fetch.
        headers: HTTP headers sent with the request (e.g. a User-Agent).
        timeout: Seconds to wait for the server before giving up. Without a
            timeout ``requests.get`` can block indefinitely.

    Returns:
        A ``BeautifulSoup`` tree built from the raw response bytes with the
        ``html.parser`` backend.

    Raises:
        requests.HTTPError: If the server answers with a 4xx/5xx status.
        requests.RequestException: On connection failures or timeouts.
    """
    response = requests.get(url, headers=headers, timeout=timeout)
    # Fail loudly on error responses instead of parsing an error page, which
    # would otherwise just look like "no reviews found" to the caller.
    response.raise_for_status()
    return BeautifulSoup(response.content, "html.parser")
def extract_review_data(row, selectors):
    """Extract one value per configured column from a single review block.

    Args:
        row: Parsed element for one review; must provide ``select_one``.
        selectors: Mapping of column name to the CSS selector, relative to
            ``row``, that locates the column's element.

    Returns:
        A list of strings in the iteration order of ``selectors``. The
        ``"Rating"`` column holds the element's ``aria-label`` attribute;
        every other column holds the element's stripped text. A column whose
        element (or, for the rating, whose attribute) is absent yields ``""``.
    """
    values = []
    for column, selector in selectors.items():
        element = row.select_one(selector)
        if element is None:
            # A missing element must not abort the whole scrape; this also
            # covers the rating, which used to be subscripted unconditionally.
            value = ""
        elif column == "Rating":
            # The rating is exposed through an attribute, not as text.
            value = element.get("aria-label", "")
        else:
            value = element.text.strip()
        values.append(value)
    return values
def scrape_tripadvisor_reviews(base_url, row_selector, selectors, page_range, delay=1):
    """Scrape review pages and collect the extracted fields in a DataFrame.

    Args:
        base_url: URL template containing one ``{}`` placeholder that is
            filled with each value from ``page_range``.
        row_selector: CSS selector matching the individual review blocks on
            a page.
        selectors: Mapping of column name to the CSS selector used inside a
            review block; its keys become the DataFrame columns.
        page_range: Iterable of values substituted into ``base_url``.
        delay: Seconds to sleep after each page to avoid being blocked.

    Returns:
        A ``pandas.DataFrame`` with one row per review block found. If no
        block matches ``row_selector`` on any page, the frame is empty but
        still carries the expected columns.
    """
    headers = {
        "User-Agent": (
            "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
            "AppleWebKit/537.36 (KHTML, like Gecko) "
            "Chrome/91.0.4472.124 Safari/537.36"
        )
    }
    comments = []
    for page in tqdm(page_range):
        soup = get_soup(base_url.format(page), headers)
        comments.extend(
            extract_review_data(row, selectors)
            for row in soup.select(row_selector)
        )
        # Throttle once per downloaded page, not once per review.
        time.sleep(delay)
    return pd.DataFrame(comments, columns=list(selectors.keys()))
# Column name -> CSS selector, evaluated relative to one review block.
# The key order is the column order of the resulting DataFrame, and the
# "Rating" entry is read from the element's aria-label rather than its text.
selectors = dict(
    NameReviewer=".mwPje .ukgoS",
    Country=".XExLl:nth-last-child(2) > .zpDvc > .JINyA",
    ShortReview=".FGwzt .yCeTE",
    FullReview=".pZUbB .yCeTE",
    TotalContributions=".k .IugUm",
    Like=".Vonfv .FwFXZ",
    DateandType=".RpeCd",
    Rating=".UctUV",
)
# Selector that matches each individual review block on a page.
row_selector = ".LbPSX .C"
# URL template; the "{}" after "-or" receives each value of page_range.
base_url = (
    "https://www.tripadvisor.com/"
    "Attraction_Review-g298082-d4507121-Reviews-or{}"
    "-Hoi_An_Ancient_Town-Hoi_An_Quang_Nam_Province.html"
)
# Values substituted into the URL. NOTE: range(0, 10, 10) yields only 0, so
# a single page is fetched; raise the stop value to fetch more pages.
page_range = range(0, 10, 10)
# Run the scraper with the configuration above; df_raw holds one row per
# review block and one column per selector key.
df_raw = scrape_tripadvisor_reviews(
    base_url=base_url,
    row_selector=row_selector,
    selectors=selectors,
    page_range=page_range,
)
def extract_numeric_rating(rating_value):
    """Return the first run of digits in ``rating_value`` as an ``int``.

    The value is converted with ``str()`` first, so any input type is
    accepted. For a label such as ``"5.0 of 5 bubbles"`` this yields ``5``.

    Args:
        rating_value: Raw rating label.

    Returns:
        The parsed integer, or ``None`` when the text contains no digit.
    """
    match = re.search(r'\d+', str(rating_value))
    return int(match.group()) if match else None
# Overwrite the raw rating labels with the integer parsed from each one;
# labels without any digit become missing values.
df_raw['Rating'] = df_raw['Rating'].apply(extract_numeric_rating)
def extract_date_year_type(date_type_value):
    """Split text such as ``"Jan 2024 • Couples"`` into its three parts.

    Args:
        date_type_value: Raw text; converted with ``str()`` before matching.

    Returns:
        A ``(month, year, trip_type)`` tuple of strings. ``trip_type`` is
        ``None`` when the text has a month and year but no ``" • type"``
        suffix; all three are ``None`` when no month/year pair is found.
    """
    # The trip type is optional so that a date without a type still keeps
    # its month and year instead of being discarded entirely.
    match = re.search(r'(\w{3}) (\d{4})(?: • (.+))?', str(date_type_value))
    return match.groups() if match else (None, None, None)
# Expand 'DateandType' into separate 'Month', 'Year' and 'Type' columns.
# The column names are passed explicitly so the helper frame always has
# three columns, even when df_raw is empty; without them an empty scrape
# produces a zero-column frame and the assignment fails.
df_raw[['Month', 'Year', 'Type']] = pd.DataFrame(
    df_raw['DateandType'].apply(extract_date_year_type).tolist(),
    columns=['Month', 'Year', 'Type'],
    index=df_raw.index,
)
# For every reviewer name, replace 'Year' on each of that reviewer's rows
# with a comma-separated string of all years found for that name.
df_raw['Year'] = df_raw.groupby('NameReviewer')['Year'].transform(lambda x: ','.join(x.astype(str)))
# The combined source column is no longer needed once it has been split.
df_raw = df_raw.drop('DateandType', axis=1)
def extract_numeric_contributions(value):
    """Return the first number in ``value`` as an ``int``.

    Comma thousands separators are understood, so ``"1,234 contributions"``
    yields ``1234`` rather than ``1``.

    Args:
        value: Raw contributions text; converted with ``str()`` first.

    Returns:
        The parsed integer, or ``None`` when the text contains no digit.
    """
    # Prefer a properly grouped number (1,234 / 12,345,678); otherwise fall
    # back to a plain run of digits.
    match = re.search(r'\d{1,3}(?:,\d{3})+|\d+', str(value))
    return int(match.group().replace(',', '')) if match else None
# Turn the raw contributions text of every row into a number; rows whose
# text contains no digit end up with a missing value.
df_raw['TotalContributions'] = (
    df_raw['TotalContributions']
    .apply(extract_numeric_contributions)
)
def extract_city_or_country(value):
    """Return the location part of text like ``"London, UK12 contributions"``.

    A trailing contributions count (``"12 contributions"``,
    ``"1,234 contributions"``, ``"1 contribution"``) is removed and the
    remaining text is stripped.

    Args:
        value: Raw text holding an optional location followed by an optional
            contributions count.

    Returns:
        The location string, or ``None`` when ``value`` is ``None`` or
        nothing is left after removing the count.
    """
    if value is None:
        return None
    # Strip the count as a whole, including thousands separators, so that
    # its leading digits never leak into the location. This also avoids
    # returning a stray digit when the text holds only a count.
    location = re.sub(
        r'\s*(?:\d[\d,]*)?\s*contributions?\s*$', '', str(value)
    ).strip()
    return location or None
# Reduce the 'Country' column to the value returned by
# extract_city_or_country for each row.
df_raw['Country'] = (
    df_raw['Country']
    .apply(extract_city_or_country)
)
# Write the cleaned table to disk without the index column.
df_raw.to_csv(path_or_buf="tripadvisor_HoiAn_full_Output.csv", index=False)
# Bare expression, presumably kept so a notebook cell displays the frame;
# it has no effect when run as a plain script.
df_raw
我尝试过自己修复代码,但没有成功。希望大家能够帮助我,谢谢!
这可能是因为该网站不再支持不执行 JavaScript 的客户端,而 BeautifulSoup 并不会执行 JavaScript,它只负责解析已经获取到的 HTML 数据。
我不会详细说明如何针对您的具体情况进行修复,但这里有一篇介绍如何配合 BeautifulSoup 抓取由 JavaScript 动态生成的内容的文章:https://pythonprogramming.net/javascript-dynamic-scraping-parsing-beautiful-soup-tutorial/