def get_metadata_bs4(url):
    """Download a page and scrape its title, author and publication date.

    The title is taken from the first ``<h1>``; the author and the date are
    taken from the ``dl dd:nth-of-type(1)`` and ``dl dd:nth-of-type(2)``
    matches respectively (presumably the layout of the target notice page --
    confirm against the live HTML). A missing element is replaced by a
    French placeholder string.

    Network errors raised by ``requests.get`` are not caught here and
    propagate to the caller.

    Args:
        url: Address of the page to download.

    Returns:
        A dict with the keys ``"title"``, ``"author"`` and
        ``"Date of publication"``, or ``None`` if extracting the fields
        raised an exception (the error is printed).
    """
    response = requests.get(url)
    soup = BeautifulSoup(response.text, "html.parser")

    def text_or(node, fallback):
        # Look each element up once and fall back when it is absent.
        return node.text.strip() if node is not None else fallback

    try:
        title = text_or(soup.find("h1"), "Titre inconnu")
        author = text_or(soup.select_one("dl dd:nth-of-type(1)"), "Auteur inconnu")
        publication_date = text_or(
            soup.select_one("dl dd:nth-of-type(2)"), "Date inconnue"
        )
        return {
            "title": title,
            "author": author,
            "Date of publication": publication_date,
        }
    except Exception as e:
        # Best-effort scraping: report the problem and signal failure with None.
        print(f"Erreur pour {url}: {e}")
        return None
# Test with a single link
url_test = "https://gallica.bnf.fr/ark:/12148/cb42768809f/date"
print(get_metadata_bs4(url_test))
我尝试了 Selenium,但这是我第一次使用这个 Python 库……我试图在源代码中找到正确的 XPath,用来替换下面代码块中的 "metadata-class":
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
import time
# Selenium configuration
chrome_options = Options()
chrome_options.add_argument("--headless") # Run Chrome without a graphical interface
driver = webdriver.Chrome(options=chrome_options)
def get_metadata_from_notice(
    url, metadata_xpath="//div[@class='metadata-class']", timeout=5
):
    """Open a notice page, expand "Informations détaillées" and read the metadata.

    Uses the module-level ``driver``. Explicit waits are used instead of
    fixed sleeps so the function neither fails when the page is slow nor
    waits longer than necessary when it is fast.

    Args:
        url: Address of the notice page to load.
        metadata_xpath: XPath of the element holding the metadata once the
            dropdown is open. The default is the original placeholder and
            must be replaced by the page's real selector.
        timeout: Maximum number of seconds to wait for the dropdown to be
            clickable and for the metadata element to appear.

    Returns:
        ``{"url": url, "metadata": <element text>}`` on success, or ``None``
        if the dropdown could not be clicked or the metadata element was not
        found (the error is printed in both cases).
    """
    driver.get(url)
    wait = WebDriverWait(driver, timeout)

    try:
        # Click the "Informations détaillées" dropdown once it is clickable.
        dropdown = wait.until(
            EC.element_to_be_clickable(
                (By.XPATH, "//div[contains(text(), 'Informations détaillées')]")
            )
        )
        dropdown.click()
    except Exception as e:
        # Best-effort scraping: report and signal failure with None.
        print(f"⚠️ Erreur lors du clic sur {url} : {e}")
        return None

    try:
        # The metadata is loaded after the click, so poll for the element
        # rather than looking it up once after an arbitrary delay.
        metadata_section = wait.until(
            EC.presence_of_element_located((By.XPATH, metadata_xpath))
        )
        return {"url": url, "metadata": metadata_section.text}
    except Exception as e:
        print(f"⚠️ Impossible de récupérer les métadonnées pour {url} : {e}")
        return None
# Test on a single URL
test_url = "https://gallica.bnf.fr/ark:/12148/cb42768809f/date"
try:
    print(get_metadata_from_notice(test_url))
finally:
    # Always shut Selenium down, even if the call above raises, so the
    # headless browser process is not left running.
    driver.quit()
但它一直给我带来这样的结果:
⚠️ Impossible de récupérer les métadonnées pour https://gallica.bnf.fr/ark:/12148/cb42768809f/date
⚠️ Erreur sur https://gallica.bnf.fr/ark:/12148/cb452698066/date : Message:
Stacktrace:
GetHandleVerifier [0x00007FF7940A02F5+28725]
(No symbol) [0x00007FF794002AE0]
(No symbol) [0x00007FF793E9510A]
(No symbol) [0x00007FF793EE93D2]
(No symbol) [0x00007FF793EE95FC]
(No symbol) [0x00007FF793F33407]
(No symbol) [0x00007FF793F0FFEF]
(No symbol) [0x00007FF793F30181]
(No symbol) [0x00007FF793F0FD53]
(No symbol) [0x00007FF793EDA0E3]
(No symbol) [0x00007FF793EDB471]
GetHandleVerifier [0x00007FF7943CF30D+3366989]
GetHandleVerifier [0x00007FF7943E12F0+3440688]
GetHandleVerifier [0x00007FF7943D78FD+3401277]
GetHandleVerifier [0x00007FF79416AAAB+858091]
(No symbol) [0x00007FF79400E74F]
(No symbol) [0x00007FF79400A304]
(No symbol) [0x00007FF79400A49D]
(No symbol) [0x00007FF793FF8B69]
BaseThreadInitThunk [0x00007FFC0A7D259D+29]
RtlUserThreadStart [0x00007FFC0BA0AF38+40]
无需使用 Selenium,只需一个简单的 curl 请求就能得到结果:
curl https://gallica.bnf.fr/services/ajax/notice/ark:/12148/cb42768809f/date
我是如何找到这个地址的?只需打开浏览器的 DevTools,选择“网络”(Network)选项卡,然后单击“Informations détaillées”,就会出现一条新的 Fetch/XHR 请求记录。