I've been trying to scrape this site (https://www.datacenters.com/locations/united-states/virginia) and the pagination isn't working. I've tried all sorts of approaches but can't get it to work; it only ever scrapes the first page. Can anyone help? I'm using PyCharm.
Here is the exact code I'm using:
import requests
from bs4 import BeautifulSoup
import csv

# Base URL of the website
base_url = 'https://www.datacenters.com'

# URL of the main webpage to scrape
main_url = f'{base_url}/locations/united-states/virginia'


def get_data_from_page(url, writer):
    response = requests.get(url)
    if response.status_code == 200:
        soup = BeautifulSoup(response.content, 'html.parser')
        location_tiles = soup.find_all('div', class_='LocationTile__details__sXkB0')
        if not location_tiles:
            print(f"No location tiles found on {url}")
        for tile in location_tiles:
            name = tile.find('div', class_='LocationTile__name__NrDKr').text
            address = tile.find('div', class_='LocationTile__address__Utj30').text
            parent_anchor = tile.find_parent('a', href=True)
            if parent_anchor:
                relative_link = parent_anchor['href']
                link = f'{base_url}{relative_link}'
                detail_response = requests.get(link)
                if detail_response.status_code == 200:
                    detail_soup = BeautifulSoup(detail_response.content, 'html.parser')
                    power_div = detail_soup.find('div', id='power')
                    power = power_div.find('strong').text.strip() if power_div else 'N/A'
                    sqf_div = detail_soup.find('div', id='statInfo')
                    sqf = sqf_div.find('strong').text.strip() if sqf_div else 'N/A'
                    # Write the data to the CSV file
                    writer.writerow([name, address, power, sqf, link])
                    print(f"Scraped data for {name}")
                else:
                    print(f"Failed to retrieve details from {link}. Status code: {detail_response.status_code}")
            else:
                print(f"No link found for {name}")
    else:
        print(f"Failed to retrieve the webpage. Status code: {response.status_code}")


# Open a CSV file to write the data
with open('datacenters.csv', mode='w', newline='', encoding='utf-8') as file:
    writer = csv.writer(file)
    # Write the header row
    writer.writerow(['Name', 'Address', 'Power', 'SQF', 'Link'])

    # Start with the first page
    page_url = main_url
    while True:
        print(f"Scraping page: {page_url}")
        get_data_from_page(page_url, writer)

        # Check if there is a next page link
        response = requests.get(page_url)
        soup = BeautifulSoup(response.content, 'html.parser')
        next_page_button = soup.find('button', class_='Control__control__ijHLR Pagination__pageItem__NsQSw Pagination__symbol__KHv6r')
        if next_page_button and not 'Pagination__disabled__FbUC6' in next_page_button['class']:
            next_page_button.click()
            next_page_link_tag = soup.find('a', href=True)
            if next_page_link_tag:
                next_page_link = next_page_link_tag['href']
                page_url = f"{base_url}{next_page_link}"
                print(f"Next page link found: {page_url}")
            else:
                print("Next page link not found.")
                break
        else:
            print("No more pages.")
            break
I've tried many configurations, but I always get errors indicating that there is no next button.
The page appears to implement pagination through an API with a locations endpoint at https://www.datacenters.com/api/v1/locations; for Virginia it is called as https://www.datacenters.com/api/v1/locations?page=7&query=&country_id=234&state_id=3173.
The page also seems to be built from React and Rails components, which makes it hard to get any working link out of the next-button functionality. So instead I used bs4 to find the state's JSON data and pass that information on to your page method.
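Since the pagination really happens against that endpoint, another option is to call the API directly and step the page parameter yourself. The sketch below is only a starting point, not a verified implementation: the country_id/state_id values come from the Virginia URL above, but the name of the key that holds the results ('locations' here) is a guess you should confirm by inspecting one real response.

import requests

api_url = 'https://www.datacenters.com/api/v1/locations'
params = {'page': 1, 'query': '', 'country_id': 234, 'state_id': 3173}

while True:
    resp = requests.get(api_url, params=params)
    if resp.status_code != 200:
        break
    data = resp.json()
    # 'locations' is an assumed key -- print(data.keys()) on a real
    # response and adjust to whatever the API actually returns.
    items = data.get('locations', [])
    if not items:
        break
    for item in items:
        print(item)
    params['page'] += 1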
The following code seems closest to what you already have:
import requests
from bs4 import BeautifulSoup
import csv
import json

# Base URL of the website
base_url = 'https://www.datacenters.com'

# URL of the main webpage to scrape
main_url = f'{base_url}/locations/united-states/virginia'


def get_data_from_page(relative_link, name, address, writer):
    link = f'{base_url}{relative_link}'
    detail_response = requests.get(link)
    if detail_response.status_code == 200:
        detail_soup = BeautifulSoup(detail_response.content, 'html.parser')
        power_div = detail_soup.find('div', id='power')
        power = power_div.find('strong').text.strip() if power_div else 'N/A'
        sqf_div = detail_soup.find('div', id='statInfo')
        sqf = sqf_div.find('strong').text.strip() if sqf_div else 'N/A'
        # Write the data to the CSV file
        writer.writerow([name, address, power, sqf, link])
        print(f"Scraped data for {name}")


# Open a CSV file to write the data
with open('datacenters.csv', mode='w', newline='', encoding='utf-8') as file:
    writer = csv.writer(file)
    # Write the header row
    writer.writerow(['Name', 'Address', 'Power', 'SQF', 'Link'])

    # Start with the main state page
    page_url = main_url
    print(f"Scraping page: {page_url}")
    response = requests.get(main_url)
    if response.status_code == 200:
        soup = BeautifulSoup(response.content, 'html.parser')
        # The StateShow React-on-Rails component embeds the state's location data as JSON
        state_tag = soup.find('script',
                              attrs={'class': 'js-react-on-rails-component',
                                     'data-component-name': 'StateShow'})
        state_json_data = json.loads(state_tag.text)
        # Iterate over every location listed for the state
        for i, location in enumerate(state_json_data['show']['mapLocations'], 1):
            print(f'location {i}', end=' ')
            get_data_from_page(location['url'], location['name'],
                               location['fullAddress'], writer)
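Note that this works because the StateShow payload appears to embed every location for the state in mapLocations, so there is no pagination loop at all. If that embedded JSON ever stops including everything, you can fall back to paging the /api/v1/locations endpoint mentioned above.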