I'm currently building a web scraper, and every time I try to click a particular link button, or read its href, I get absolutely nothing back. I should point out that when I visit the site myself, the link I need to click works and the data is accessible, but through my scraper it isn't. Why?
Here is the code I'm using:
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from time import sleep
import urllib.request
import os

WEBSITE_URL = 'https://www.i-de.es/conexion-red-electrica/produccion-energia/mapa-capacidad-acceso'
BUTTON_COOKIE_XPATH = '//*[@id="onetrust-accept-btn-handler"]'
BUTTON_AVISO_XPATH = '//*[@id="MapaCapaciadaModalButton"]/span[1]'
BUTTON_PDF_XPATH = '//*[@id="portlet_com_liferay_journal_content_web_portlet_JournalContentPortlet_INSTANCE_aVVDHaAKM4S6"]/div/div/div/div/div/p/a'
DOWNLOAD_PATH = '/path/to/download/directory'
PROFILE_PATH = 'my personal path to my chrome profile'

def setup_driver(profile_path: str = None) -> webdriver.Chrome:
    chrome_options = Options()
    if profile_path:
        chrome_options.add_argument(f"user-data-dir={profile_path}")
    chrome_options.add_experimental_option("prefs", {
        "download.default_directory": DOWNLOAD_PATH,
        "download.prompt_for_download": False,
        "download.directory_upgrade": True,
        "safebrowsing.enabled": True
    })
    driver = webdriver.Chrome(options=chrome_options)
    return driver

def wait_and_click(driver: webdriver.Chrome, by: By, value: str):
    # Wait until the element is clickable, then click it.
    element = WebDriverWait(driver, 10).until(
        EC.element_to_be_clickable((by, value))
    )
    element.click()

def get_pdf_url(driver: webdriver.Chrome) -> str:
    pdf_link_element = WebDriverWait(driver, 10).until(
        EC.presence_of_element_located((By.XPATH, BUTTON_PDF_XPATH))
    )
    url = pdf_link_element.get_attribute('href')
    if not url:
        raise ValueError("Failed to retrieve the PDF URL")
    return url

def download_pdf(url: str, download_path: str) -> str:
    local_pdf_path = os.path.join(download_path, "downloaded_file.pdf")
    urllib.request.urlretrieve(url, local_pdf_path)
    sleep(10)
    if not os.path.isfile(local_pdf_path):
        raise FileNotFoundError("PDF file was not found after downloading")
    return local_pdf_path

def main():
    # PROFILE_PATH is defined above but not passed here; setup_driver(PROFILE_PATH) would use it.
    driver = setup_driver()
    try:
        driver.get(WEBSITE_URL)
        sleep(10)
        wait_and_click(driver, By.XPATH, BUTTON_COOKIE_XPATH)
        wait_and_click(driver, By.XPATH, BUTTON_AVISO_XPATH)
        pdf_url = get_pdf_url(driver)
        downloaded_pdf_path = download_pdf(pdf_url, DOWNLOAD_PATH)
        print(f"PDF downloaded to: {downloaded_pdf_path}")
    finally:
        driver.quit()

if __name__ == "__main__":
    main()
As you can see, it isn't a big scraper; all I want is to download the file that BUTTON_PDF_XPATH points to.
I've tried a few things to fix it, such as running the scraper with my own Chrome profile, which sometimes produces an ERR_HTTP2_PROTOCOL_ERROR, an infinite load until the timeout, or, in some cases, a page that loads but where nothing is clickable (I can assure you that all the XPaths work).
I also tried slowing the scraper down with some sleep() calls, but that just left me waiting for nothing, and I even tried clicking the element directly, but it simply threw me out.
Finally, I wanted to work around the ERR_HTTP2_PROTOCOL_ERROR with an argument such as options.add_argument('--disable-http2'), but I don't know how to use it.
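My guess is that it would be passed in setup_driver() like any other Chrome flag, roughly like this untested sketch, but I haven't confirmed that this is right:

chrome_options = Options()
# Untested guess: pass --disable-http2 the same way as other Chrome flags
chrome_options.add_argument('--disable-http2')
driver = webdriver.Chrome(options=chrome_options)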
Thanks for your time, and I hope you can help me solve this.
You can get the PDF link from the static HTML; you don't need Selenium for this:
import requests
from bs4 import BeautifulSoup
from urllib.parse import urljoin
import os

WEBSITE_URL = 'https://www.i-de.es/conexion-red-electrica/produccion-energia/mapa-capacidad-acceso'
DOWNLOAD_PATH = ''
HEADERS = {'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/128.0.0.0 Safari/537.36'}

def extract_pdf_link(url):
    response = requests.get(url, headers=HEADERS)
    soup = BeautifulSoup(response.text, 'html.parser')
    pdf_url = urljoin(url, soup.select_one('a[href*=".pdf/"]').get('href'))
    return pdf_url

def download_pdf(url, download_path):
    local_pdf_path = os.path.join(download_path, "downloaded_file.pdf")
    response = requests.get(url, headers=HEADERS)
    with open(local_pdf_path, 'wb') as f:
        f.write(response.content)
    return local_pdf_path

pdf_url = extract_pdf_link(WEBSITE_URL)
downloaded_pdf_path = download_pdf(pdf_url, DOWNLOAD_PATH)
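If you want the script to fail loudly instead of silently saving an error page, you can add basic checks; the variant below is just an illustrative sketch of the same idea (extract_pdf_link_checked is a name I made up, and it assumes the page keeps an anchor whose href contains ".pdf/"):

def extract_pdf_link_checked(url):
    # Raise immediately on HTTP errors instead of parsing an error page.
    response = requests.get(url, headers=HEADERS, timeout=30)
    response.raise_for_status()
    soup = BeautifulSoup(response.text, 'html.parser')
    link = soup.select_one('a[href*=".pdf/"]')
    if link is None:
        raise ValueError("No PDF link found in the page HTML")
    return urljoin(url, link.get('href'))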