在Python中使用Selenium抓取动态网站的所有数据

问题描述 投票:0回答:1

我正在尝试从下面的网址抓取数据

url=https://www.usaspending.gov/search/?hash=7e5e5a79e871a86ff6b69395e47ab41e

但该页面的表格带有滚动条,导致无法抓取到全部的值。

下面是我尝试过的代码,请大家帮忙看看。

import requests
import time
from selenium import webdriver
from bs4 import BeautifulSoup
from webdriver_manager.chrome import ChromeDriverManager
from selenium.webdriver.common.action_chains import ActionChains

# Start a Chrome browser session controlled by Selenium.
driver=webdriver.Chrome()

# Load the search-results page to be scraped.
url = "https://www.usaspending.gov/search/?hash=7e5e5a79e871a86ff6b69395e47ab41e"
driver.get(url)

# NOTE: `action` is created here but never used in the rest of this script.
action = ActionChains(driver)

from selenium.webdriver.common.by import By
# Fixed 7-second pause before querying the DOM
# (presumably to let the page finish rendering -- confirm).
time.sleep(7)

# Collect the table header cells, the links whose text contains 'FA', and
# the anchors whose href is exactly "/recipient/".
headings_row = driver.find_elements(By.CLASS_NAME, 'award-result-header-cell')
award_link = driver.find_elements(By.PARTIAL_LINK_TEXT, 'FA')
link="/recipient/"
# NOTE(review): this XPath is an exact-match on href; if recipient links carry
# a longer path it will match nothing -- confirm against the page markup.
recp_link = driver.find_elements(By.XPATH,'//a[@href="'+link+'"]')


# Accumulators for the text of each kind of element.
list_headings_row = []
award_id = []
reciepient = []

# Hard-coded count of 15: indexing raises IndexError if any of the three
# element lists holds fewer than 15 items.
for i in range(15):
    # NOTE(review): WebElement.text is documented to return only rendered
    # (visible) text, which presumably explains the empty strings for header
    # cells scrolled out of view -- confirm.
    list_headings_row.append(headings_row[i].text)
    award_id.append(award_link[i].text)
    reciepient.append(recp_link[i].text)

# Only the header texts are printed; award_id and reciepient are not shown.
print(list_headings_row)

下面是打印 list_headings_row 得到的输出:

> ['Award ID',  'Recipient Name',  'Start Date\n(Period of
> Performance)',  'End Date\n(Period of Performance)',  '',  '',  '', 
> '',  '',  '',  '',  '',  '',  '',  '']
python-3.x selenium-webdriver web-scraping
1个回答
0
投票

正如评论中提到的,可以通过官方API获取数据。

完整文档请参阅 spending_by_award(按奖项查询支出)API 端点的官方说明。

具体操作方法如下:

import json
import time

import requests

# Endpoint that the requests below are POSTed to.
api_url = "https://api.usaspending.gov/api/v2/search/spending_by_award/"

# Browser-like headers sent with every request.
headers = {
    "User-Agent": (
        "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) "
        "AppleWebKit/537.36 (KHTML, like Gecko) "
        "Chrome/115.0.0.0 Safari/537.36 Edg/115.0.1901.200"
    ),
    "X-Requested-With": "XMLHttpRequest",
}

# JSON request body: search filters, the columns to return, and paging/sorting
# options. "page" is the number of the first page to request.
payload = {
    "filters": {
        # Three consecutive windows, newest first, each running from
        # 1 October of the previous year to 30 September of `end_year`.
        "time_period": [
            {
                "start_date": f"{end_year - 1}-10-01",
                "end_date": f"{end_year}-09-30",
            }
            for end_year in (2023, 2022, 2021)
        ],
        "award_type_codes": ["A", "B", "C", "D"],
        "agencies": [
            {
                "type": "awarding",
                "tier": "subtier",
                "name": "Department of the Air Force",
                "toptier_name": "Department of Defense",
            },
        ],
        "award_amounts": [{"upper_bound": 75000000}],
    },
    "fields": [
        "Award ID", "Recipient Name", "Start Date", "End Date",
        "Award Amount", "Total Outlays", "Description", "def_codes",
        "COVID-19 Obligations", "COVID-19 Outlays",
        "Infrastructure Obligations", "Infrastructure Outlays",
        "Awarding Agency", "Awarding Sub Agency", "Contract Award Type",
        "recipient_id", "prime_award_recipient_id",
    ],
    "page": 1,
    "limit": 60,
    "sort": "Award Amount",
    "order": "desc",
    "subawards": False,
}


def wait_a_bit(wait_for: int = 1, message: bool = False) -> None:
    """Pause execution for ``wait_for`` seconds.

    Args:
        wait_for: Number of seconds to sleep.
        message: When true, print a notice before sleeping.
    """
    if message:
        print("Waiting for {} seconds...".format(wait_for))
    time.sleep(wait_for)


def get_spending_data():
    """Page through the award search results and print a sample of each page.

    POSTs the module-level ``payload`` to ``api_url`` repeatedly, advancing
    the page number until the response's ``page_metadata.hasNext`` is false.
    For every page the first award (if any) is printed as indented JSON.

    Raises:
        requests.HTTPError: If the API answers with an error status.
        requests.Timeout: If the API does not answer within the timeout.
    """
    # Work on a shallow copy: only the top-level "page" key is changed, and
    # copying keeps the module-level ``payload`` untouched so that a second
    # call starts again from the configured first page.
    request_body = dict(payload)
    with requests.Session() as session:
        while True:
            # A timeout prevents the loop from hanging forever on a stalled
            # connection (requests has no default timeout).
            response = session.post(
                api_url, headers=headers, json=request_body, timeout=30
            )
            response.raise_for_status()
            spending_data = response.json()
            awards = spending_data["results"]
            # This shows only the first award in the list; print ``awards``
            # itself to see all. An empty page is skipped instead of
            # raising IndexError.
            if awards:
                print(json.dumps(awards[0], indent=4))
            # Stop before sleeping: no need to wait after the last page.
            if not spending_data["page_metadata"]["hasNext"]:
                break
            request_body["page"] += 1
            wait_a_bit(wait_for=1, message=True)


# Run the download only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == "__main__":
    get_spending_data()

这样每“滚动”一次(即表格的每一页),都会把该页的第一“行”数据以字典形式打印出来。

{
    "internal_id": 90310986,
    "Award ID": "FA875019C1518",
    "Recipient Name": "INTERNATIONAL BUSINESS MACHINES CORP",
    "Start Date": "2019-08-16",
    "End Date": "2023-08-16",
    "Award Amount": 74999951.0,
    "Total Outlays": 5984934.86,
    "Description": "IBM NORTHPOLE NEURAL INFERENCE MACHINE: ARCHITECTURE, SOFT INTELLECTUAL PROPERTY (IP) CORE TECHNOLOGY, SOFTWARE ECOSYSTEM, PROTOTYPE CHIP&BOARD PHASE 2",
    "def_codes": [
        "N",
        "Q"
    ],
    "COVID-19 Obligations": -3175836.46,
    "COVID-19 Outlays": 3175836.46,
    "Infrastructure Obligations": null,
    "Infrastructure Outlays": null,
    "Awarding Agency": "Department of Defense",
    "Awarding Sub Agency": "Department of the Air Force",
    "Contract Award Type": "DEFINITIVE CONTRACT",
    "recipient_id": "d1776a20-1dbc-351a-8f2e-e20d504a1d3f-C",
    "prime_award_recipient_id": null,
    "awarding_agency_id": 1173,
    "agency_slug": "department-of-defense",
    "generated_internal_id": "CONT_AWD_FA875019C1518_9700_-NONE-_-NONE-"
}
Waiting for 1 seconds...
{
    "internal_id": 15069464,
    "Award ID": "FA867217C0010",
    "Recipient Name": "RAYTHEON COMPANY",
    "Start Date": "2019-06-30",
    "End Date": "2023-06-30",
    "Award Amount": 70866143.0,
    "Total Outlays": 2579829.0,
    "Description": "SMALL DIAMETER BOMB II - LOT 3 PRODUCTION",
    "def_codes": [
        "Q"
    ],
    "COVID-19 Obligations": null,
    "COVID-19 Outlays": null,
    "Infrastructure Obligations": null,
    "Infrastructure Outlays": null,
    "Awarding Agency": "Department of Defense",
    "Awarding Sub Agency": "Department of the Air Force",
    "Contract Award Type": "DEFINITIVE CONTRACT",
    "recipient_id": "01c4a3a3-b4c5-ce4e-822b-d17f09985001-C",
    "prime_award_recipient_id": null,
    "awarding_agency_id": 1173,
    "agency_slug": "department-of-defense",
    "generated_internal_id": "CONT_AWD_FA867217C0010_9700_-NONE-_-NONE-"
}
Waiting for 1 seconds...
{
    "internal_id": 15058192,
    "Award ID": "FA862215F8112",
    "Recipient Name": "HX5 LLC",
    "Start Date": "2015-08-14",
    "End Date": "2020-08-31",
    "Award Amount": 66839178.32,
    "Total Outlays": 0.0,
    "Description": "IGF::CL::IGF SCATI ENGINEERING PROFESSIONAL AND ADMINISTRATIVE SUPPORT SERVICES (EPASS) ADVISORY AND ASSISTANCE SERVICES (A&AS) SUPPORT IN SUPPORT OF AIR FORCE PROGRAM EXECUTIVE OFFICER, AGILE COMBAT SUPPORT (AFPEO/ACS) AIR FORCE LIFE CYCLE MANAGEMENT CENTER (AFLCMC) AGILE COMBAT SUPPORT DIRECTORATE (AFLCMC/WN) WRIGHT-PATTERSON AFB",
    "def_codes": [
        "Q"
    ],
    "COVID-19 Obligations": null,
    "COVID-19 Outlays": null,
    "Infrastructure Obligations": null,
    "Infrastructure Outlays": null,
    "Awarding Agency": "Department of Defense",
    "Awarding Sub Agency": "Department of the Air Force",
    "Contract Award Type": "DELIVERY ORDER",
    "recipient_id": "385dd1df-55cb-ae3f-a24c-0b7430d4ae02-C",
    "prime_award_recipient_id": null,
    "awarding_agency_id": 1173,
    "agency_slug": "department-of-defense",
    "generated_internal_id": "CONT_AWD_FA862215F8112_9700_GS00Q14OADS712_4732"
}
Waiting for 1 seconds...
© www.soinside.com 2019 - 2024. All rights reserved.