Selenium - Unable to download a document

Problem description — votes: 0, answers: 1

I am currently building a web scraper, and every time I try to click a certain link button, or to read its href, I get nothing back. I must point out that when I visit the site myself, the link I need to click works and the data is accessible; through my scraper, however, it is not. Why?

Here is the code I am using:

from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from time import sleep
import urllib.request
import os

WEBSITE_URL = 'https://www.i-de.es/conexion-red-electrica/produccion-energia/mapa-capacidad-acceso'
BUTTON_COOKIE_XPATH = '//*[@id="onetrust-accept-btn-handler"]'
BUTTON_AVISO_XPATH = '//*[@id="MapaCapaciadaModalButton"]/span[1]'
BUTTON_PDF_XPATH = '//*[@id="portlet_com_liferay_journal_content_web_portlet_JournalContentPortlet_INSTANCE_aVVDHaAKM4S6"]/div/div/div/div/div/p/a'
DOWNLOAD_PATH = '/path/to/download/directory'
PROFILE_PATH = 'my personal path to my chrome profile'

def setup_driver(profile_path: str | None = None) -> webdriver.Chrome:
    chrome_options = Options()
    if profile_path:
        # Reuse an existing Chrome profile (cookies, saved preferences, ...)
        chrome_options.add_argument(f"user-data-dir={profile_path}")
    # Download PDFs straight into DOWNLOAD_PATH without a save-as prompt
    chrome_options.add_experimental_option("prefs", {
        "download.default_directory": DOWNLOAD_PATH,
        "download.prompt_for_download": False,
        "download.directory_upgrade": True,
        "safebrowsing.enabled": True
    })
    driver = webdriver.Chrome(options=chrome_options)
    return driver

def wait_and_click(driver: webdriver.Chrome, by: str, value: str):
    # Wait up to 10 s for the element to become clickable, then click it
    element = WebDriverWait(driver, 10).until(
        EC.element_to_be_clickable((by, value))
    )
    element.click()

def get_pdf_url(driver: webdriver.Chrome) -> str:
    # Read the link's href instead of clicking it
    pdf_link_element = WebDriverWait(driver, 10).until(
        EC.presence_of_element_located((By.XPATH, BUTTON_PDF_XPATH))
    )
    url = pdf_link_element.get_attribute('href')
    if not url:
        raise ValueError("Failed to retrieve the PDF URL")
    return url

def download_pdf(url: str, download_path: str) -> str:
    local_pdf_path = os.path.join(download_path, "downloaded_file.pdf")
    # urlretrieve blocks until the download finishes, so no extra sleep is needed
    urllib.request.urlretrieve(url, local_pdf_path)
    if not os.path.isfile(local_pdf_path):
        raise FileNotFoundError("PDF file was not found after downloading")
    return local_pdf_path

def main():
    # Pass PROFILE_PATH here to reuse the personal Chrome profile
    driver = setup_driver()

    try:
        driver.get(WEBSITE_URL)
        sleep(10)  # give the page time to finish loading
        wait_and_click(driver, By.XPATH, BUTTON_COOKIE_XPATH)  # accept cookies
        wait_and_click(driver, By.XPATH, BUTTON_AVISO_XPATH)   # dismiss the notice dialog
        pdf_url = get_pdf_url(driver)
        downloaded_pdf_path = download_pdf(pdf_url, DOWNLOAD_PATH)
        print(f"PDF downloaded to: {downloaded_pdf_path}")
    finally:
        driver.quit()

if __name__ == "__main__":
    main()

As you can see, it is not a big scraper; all it has to do is download the file pointed to by BUTTON_PDF_XPATH.

So I have tried a few things to fix it, such as using my own Chrome profile with the scraper, which sometimes raises an ERR_HTTP2_PROTOCOL_ERROR, sometimes loads forever until it times out, and in some cases loads the site but then cannot click on anything (I can assure you that all the XPaths work).

I also tried slowing the scraper down with some sleep() calls, but that just had me waiting for nothing, and I even tried clicking the link directly, but that only takes me off the page.

Finally, I wanted to try an argument such as options.add_argument('--disable-http2') to get around the ERR_HTTP2_PROTOCOL_ERROR, but I don't know how to use it.
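From what I understand, such a flag is passed through the same Options object that setup_driver already uses; a minimal sketch (whether --disable-http2 actually avoids the protocol error here is something I cannot confirm):

from selenium import webdriver
from selenium.webdriver.chrome.options import Options

chrome_options = Options()
chrome_options.add_argument('--disable-http2')  # ask Chromium to fall back to HTTP/1.1
driver = webdriver.Chrome(options=chrome_options)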

Thank you for your time; I hope you can help me solve this problem.

python selenium-webdriver selenium-chromedriver data-mining
1 Answer
2 votes

You can get the PDF link from the static HTML; Selenium is not needed:

import os
from urllib.parse import urljoin

import requests
from bs4 import BeautifulSoup

WEBSITE_URL = 'https://www.i-de.es/conexion-red-electrica/produccion-energia/mapa-capacidad-acceso'
DOWNLOAD_PATH = ''
# A browser-like user agent, so the request is not served a bot page
HEADERS = {'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/128.0.0.0 Safari/537.36'}


def extract_pdf_link(url):
    # The PDF link is already present in the server-rendered HTML,
    # so a plain GET plus an attribute selector is enough
    response = requests.get(url, headers=HEADERS)
    soup = BeautifulSoup(response.text, 'html.parser')
    pdf_url = urljoin(url, soup.select_one('a[href*=".pdf/"]').get('href'))
    return pdf_url


def download_pdf(url, download_path):
    local_pdf_path = os.path.join(download_path, "downloaded_file.pdf")
    response = requests.get(url, headers=HEADERS)

    with open(local_pdf_path, 'wb') as f:
        f.write(response.content)

    return local_pdf_path


pdf_url = extract_pdf_link(WEBSITE_URL)
downloaded_pdf_path = download_pdf(pdf_url, DOWNLOAD_PATH)
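Note that this works because the anchor to the PDF is part of the server-rendered HTML and does not depend on JavaScript, so no browser, cookie banner, or notice dialog is involved. If you want the script to fail loudly instead of silently saving an error page, requests' standard raise_for_status() can be added after each GET; a small sketch, with an assumed 30-second timeout:

response = requests.get(url, headers=HEADERS, timeout=30)
response.raise_for_status()  # raises HTTPError on 4xx/5xx responses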