I've been trying to scrape this site (https://www.datacenters.com/locations/united-states/virginia) and the pagination isn't working. I've tried all sorts of approaches but can't get it to work; it only ever scrapes the first page. Can anyone help? I'm using PyCharm.
Here is the exact code I'm using:
import requests
from bs4 import BeautifulSoup
import csv

# Base URL of the website
base_url = 'https://www.datacenters.com'

# URL of the main webpage to scrape
main_url = f'{base_url}/locations/united-states/virginia'


def get_data_from_page(url, writer):
    response = requests.get(url)
    if response.status_code == 200:
        soup = BeautifulSoup(response.content, 'html.parser')
        location_tiles = soup.find_all('div', class_='LocationTile__details__sXkB0')
        if not location_tiles:
            print(f"No location tiles found on {url}")
        for tile in location_tiles:
            name = tile.find('div', class_='LocationTile__name__NrDKr').text
            address = tile.find('div', class_='LocationTile__address__Utj30').text
            parent_anchor = tile.find_parent('a', href=True)
            if parent_anchor:
                relative_link = parent_anchor['href']
                link = f'{base_url}{relative_link}'
                detail_response = requests.get(link)
                if detail_response.status_code == 200:
                    detail_soup = BeautifulSoup(detail_response.content, 'html.parser')
                    power_div = detail_soup.find('div', id='power')
                    power = power_div.find('strong').text.strip() if power_div else 'N/A'
                    sqf_div = detail_soup.find('div', id='statInfo')
                    sqf = sqf_div.find('strong').text.strip() if sqf_div else 'N/A'
                    # Write the data to the CSV file
                    writer.writerow([name, address, power, sqf, link])
                    print(f"Scraped data for {name}")
                else:
                    print(f"Failed to retrieve details from {link}. Status code: {detail_response.status_code}")
            else:
                print(f"No link found for {name}")
    else:
        print(f"Failed to retrieve the webpage. Status code: {response.status_code}")


# Open a CSV file to write the data
with open('datacenters.csv', mode='w', newline='', encoding='utf-8') as file:
    writer = csv.writer(file)
    # Write the header row
    writer.writerow(['Name', 'Address', 'Power', 'SQF', 'Link'])

    # Start with the first page
    page_url = main_url
    while True:
        print(f"Scraping page: {page_url}")
        get_data_from_page(page_url, writer)

        # Check if there is a next page link
        response = requests.get(page_url)
        soup = BeautifulSoup(response.content, 'html.parser')
        next_page_button = soup.find('button', class_='Control__control__ijHLR Pagination__pageItem__NsQSw Pagination__symbol__KHv6r')
        if next_page_button and not 'Pagination__disabled__FbUC6' in next_page_button['class']:
            next_page_button.click()
            next_page_link_tag = soup.find('a', href=True)
            if next_page_link_tag:
                next_page_link = next_page_link_tag['href']
                page_url = f"{base_url}{next_page_link}"
                print(f"Next page link found: {page_url}")
            else:
                print("Next page link not found.")
                break
        else:
            print("No more pages.")
            break
I've tried many configurations, but I always get errors indicating that there is no next button.
The page appears to implement pagination through an API with a locations endpoint at https://www.datacenters.com/api/v1/locations; for Virginia it is called as https://www.datacenters.com/api/v1/locations?page=7&query=&country_id=234&state_id=3173.
The page also seems to be built from React and Rails components, which makes it hard to get any working link out of the next-button functionality. So instead I used bs4 to find the state's JSON data and pass that information on to your page method.
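Since the pagination really happens against that endpoint, another option is to call the API directly and step the page parameter yourself. The sketch below is only a starting point, not a verified implementation: the country_id/state_id values come from the Virginia URL above, but the name of the key that holds the results ('locations' here) is a guess you should confirm by inspecting one real response.

import requests

api_url = 'https://www.datacenters.com/api/v1/locations'
params = {'page': 1, 'query': '', 'country_id': 234, 'state_id': 3173}

while True:
    resp = requests.get(api_url, params=params)
    if resp.status_code != 200:
        break
    data = resp.json()
    # 'locations' is an assumed key -- print(data.keys()) on a real
    # response and adjust to whatever the API actually returns.
    items = data.get('locations', [])
    if not items:
        break
    for item in items:
        print(item)
    params['page'] += 1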
The following code seems closest to what you already have:
import requests
from bs4 import BeautifulSoup
import csv
import json

# Base URL of the website
base_url = 'https://www.datacenters.com'

# URL of the main webpage to scrape
main_url = f'{base_url}/locations/united-states/virginia'


def get_data_from_page(relative_link, name, address, writer):
    link = f'{base_url}{relative_link}'
    detail_response = requests.get(link)
    if detail_response.status_code == 200:
        detail_soup = BeautifulSoup(detail_response.content, 'html.parser')
        power_div = detail_soup.find('div', id='power')
        power = power_div.find('strong').text.strip() if power_div else 'N/A'
        sqf_div = detail_soup.find('div', id='statInfo')
        sqf = sqf_div.find('strong').text.strip() if sqf_div else 'N/A'
        # Write the data to the CSV file
        writer.writerow([name, address, power, sqf, link])
        print(f"Scraped data for {name}")


# Open a CSV file to write the data
with open('datacenters.csv', mode='w', newline='', encoding='utf-8') as file:
    writer = csv.writer(file)
    # Write the header row
    writer.writerow(['Name', 'Address', 'Power', 'SQF', 'Link'])

    # Start with the main state page
    page_url = main_url
    print(f"Scraping page: {page_url}")
    response = requests.get(main_url)
    if response.status_code == 200:
        soup = BeautifulSoup(response.content, 'html.parser')
        # The StateShow React-on-Rails component embeds the state's location data as JSON
        state_tag = soup.find('script',
                              attrs={'class': 'js-react-on-rails-component',
                                     'data-component-name': 'StateShow'})
        state_json_data = json.loads(state_tag.text)
        # Iterate over every location listed for the state
        for i, location in enumerate(state_json_data['show']['mapLocations'], 1):
            print(f'location {i}', end=' ')
            get_data_from_page(location['url'], location['name'],
                               location['fullAddress'], writer)
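Note that this works because the StateShow payload appears to embed every location for the state in mapLocations, so there is no pagination loop at all. If that embedded JSON ever stops including everything, you can fall back to paging the /api/v1/locations endpoint mentioned above.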