我正在尝试抓取一个网页,并把其中的图像下载为 PDF、PNG 或 JPG 格式。我使用的网页是:https://asn.scientificposters.com/epsAbstractASN.cfm?id=6
在此页面上,有一个可点击的图像,其 URL 为:
但是,当我检查该元素时,我找不到 PDF、PNG 或 JPG 文件的直接链接。
这是我迄今为止使用的代码:
import requests
from PyPDF2 import PdfReader, PdfWriter
import io
import json
# Browser-like request headers sent with the GET below.
# NOTE(review): presumably copied from a real browser session; whether the
# server requires any of them has not been verified.
headers = {
    "accept": "*/*",
    "accept-encoding": "gzip, deflate, br, zstd",
    "accept-language": "en-US,en;q=0.9,en-IN;q=0.8",
    "priority": "u=1, i",
    "referer": "https://asn.scientificposters.com/apprizr.cfm?C1A%2F12Y8hALzGGa7XK43k6yc%2BvAzbBWUzMVrtMoqM6BBIsnQV7bYHul%2BzTSg5vOqmtrKjRzudgo%3D",
    "sec-ch-ua": "\"Not)A;Brand\";v=\"99\", \"Microsoft Edge\";v=\"127\", \"Chromium\";v=\"127\"",
    "sec-ch-ua-mobile": "?0",
    "sec-ch-ua-platform": "\"Windows\"",
    "sec-fetch-dest": "empty",
    "sec-fetch-mode": "cors",
    "sec-fetch-site": "same-origin",
    "user-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/127.0.0.0 Safari/537.36 Edg/127.0.0.0",
}

# Fetch the abstract page. The timeout keeps the script from blocking forever
# when the server never answers, and raise_for_status() surfaces HTTP errors
# instead of passing an error page on to the PDF parser.
response = requests.get(
    "https://asn.scientificposters.com/epsAbstractASN.cfm?id=6",
    headers=headers,
    timeout=30,
)
response.raise_for_status()

# Wrap the raw response body in an in-memory stream and parse it as a PDF.
# NOTE(review): this URL presumably returns an HTML page rather than a PDF
# document, which would explain a PdfReadError here — confirm by inspecting
# response.headers.get("content-type") before parsing.
pdf_stream = io.BytesIO(response.content)
pdf_reader = PdfReader(pdf_stream)
但我不断收到以下错误:
PdfReadError: EOF marker not found(未找到 EOF 标记)
有人能帮我弄清楚如何通过 python 正确下载并保存这个文件吗?
这个问题有点棘手,因为该页面似乎是通过 JavaScript 加载内容,并以 Base64 数据的形式获取图像的。
试试这个:
from selenium import webdriver
from webdriver_manager.chrome import ChromeDriverManager
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
import base64
# Start a Chrome session using the driver binary resolved by webdriver_manager.
driver = webdriver.Chrome(service=Service(ChromeDriverManager().install()))
url = 'https://asn.scientificposters.com/apprizr.cfm?C1A%2F12Y8hALzGGa7XK43k6yc%2BvAzbBWUzMVrtMoqM6BBIsnQV7bYHul%2BzTSg5vOqmtrKjRzudgo%3D'

try:
    # Open the URL inside the try block so the browser is still closed by the
    # finally clause when navigation itself fails.
    driver.get(url)

    # Wait up to 10 seconds for the viewer's canvas element to be present in
    # the DOM; a TimeoutException is raised if it never appears.
    canvas = WebDriverWait(driver, 10).until(
        EC.presence_of_element_located((By.ID, "canvas0"))
    )

    # Click the zoom-in button 15 times so the canvas is rendered larger
    # before it is exported.
    # NOTE(review): the viewer may re-render asynchronously after zooming;
    # if the saved image looks stale, wait for the redraw before exporting.
    driver.execute_script("""
// Assuming zoomInBtn is the button to zoom in
for (let i = 0; i < 15; i++) {
document.getElementById('zoomInBtn').click();
}
""")

    # Ask the browser to serialize the canvas as a PNG data URL
    # ("data:image/png;base64,<payload>").
    base64_image = driver.execute_script(
        "return document.getElementById('canvas0').toDataURL('image/png');"
    )

    # Keep only the payload after the first comma. An empty payload means the
    # canvas had nothing to export, so fail loudly instead of writing a
    # zero-byte file and reporting success.
    _, _, payload = base64_image.partition(',')
    if not payload:
        raise RuntimeError("canvas0 returned an empty data URL; nothing to save")
    image_data = base64.b64decode(payload)

    # Write the decoded PNG bytes to disk.
    with open("output.png", "wb") as file:
        file.write(image_data)
    print("Image saved to output.png")
finally:
    # Always close the WebDriver, even when a step above raised.
    driver.quit()