我正在尝试抓取此页面:https://www.lavoro.gov.it/Pagine/Cerca-nel-sito?search=big+data 正如您在页面底部看到的那样,有页数和带有箭头的图标,我用 Selenium 单击。
问题是,每当我复制该对象的 Xpath 时,我都会到达第二页,并且我的代码开始在第一页和第二页之间无休止地循环。
这是我尝试用于打开下一页的代码块:
next_page_link = driver.find_element(By.XPATH, "//li[@class = 'page-item pager__item pager__item--next ']")
# search_results_url = next_page_link.get_attribute("href")
# driver.get(search_results_url) # <--- open next page with results using URL
next_page_link.click() # <--- or click link
当脚本尝试转到第三页时,我收到此错误:
[DEBUG] Exception: Message: element click intercepted: Element <li class="page-item pager__item pager__item--next ">...</li> is not clickable at point (1181, 869). Other element would receive the click: <div class="popup-content info eu-cookie-compliance-content">...</div>
但是我似乎无法越过第二页,有人知道该怎么做吗?
这是完整的代码供参考
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
#from selenium.webdriver.support.ui import WebDriverWait
#from selenium.webdriver.support import expected_conditions as EC
#from selenium.common.exceptions import NoSuchElementException, TimeoutException
import time
# --- debug: report the installed Selenium version at startup ---
import selenium
print('Selenium:', selenium.__version__)
# ---
def scrape_page(driver, keyword):
    """Scrape every search-result row on the currently loaded results page.

    Args:
        driver: a Selenium WebDriver already positioned on a results page.
        keyword: the search term; stored with each record for traceability.

    Returns:
        A list of dicts with keys 'keyword', 'Titolo', 'URL',
        or None if scraping the page fails entirely.
    """
    try:
        elements_dt = driver.find_elements(By.XPATH, "//div[@class='row']")
        print('[DEBUG] len(elements_dt):', len(elements_dt))
        data = []  # records collected from this page
        for index, element in enumerate(elements_dt, 1):
            try:
                # The leading ".//" makes this XPath RELATIVE to `element`.
                # A bare "//a[...]" searches the whole document and would
                # return the SAME first link for every row.
                article_url = element.find_element(
                    By.XPATH, './/a[@class="search-item-link"]'
                ).get_attribute("href")
                article_title = element.text
                # DON'T CLICK the links here: navigating away unloads this
                # page and makes the remaining `elements_dt` entries stale.
                # Collect the hrefs now; visit subpages after the loop with
                # driver.get(item['URL']).
                data.append({
                    'keyword': keyword,
                    'Titolo': article_title,
                    'URL': article_url,
                })
                print('[DEBUG] data:', data[-1])
            except Exception as e:
                print("Errore durante il clic sull'elemento:", e)
    except Exception as e:
        print("Errore durante lo scraping della pagina:", e)
        return None
    return data
# --- main ---
driver = webdriver.Chrome()
driver.maximize_window()
driver.implicitly_wait(10)
# ---
start_url = "https://www.lavoro.gov.it/"
all_data = []
keywords = ['big data', 'data warehouse', 'data lake', 'data science', 'data mining', 'data privacy', 'open data',
            'data governance', 'data literacy', 'data', 'biometric', 'analisi biometrica',
            'machine learning', 'apprendimento automatico', 'algoritm', 'calcolo', 'punteggio', 'valutazione',
            'monitoraggi', 'predittiv', 'cloud', 'statistic', 'reti neurali', 'rete neurale', 'neural network',
            'apprendimento profondo', 'deep learning', 'ranking', 'classificazione', 'apprendimento supervisionato',
            'apprendimento non supervisionato', 'software', 'numeric', 'rango', 'ranghi', 'rank', "Elaborazione del linguaggio naturale",
            'natural language processing', 'NLP', 'graduazione', 'transformer', 'GPT', 'Bard', 'Copilot', 'Gemini', 'DALL-E',
            'automa', 'internet delle cose', 'intelligenza artificiale']
for word in keywords:
    print("Main Page:", start_url)
    # open main page
    driver.get(start_url)
    # Dismiss the cookie banner once per visit: it overlays the pager and is
    # what causes "element click intercepted" when clicking the next arrow.
    try:
        driver.find_element(By.CSS_SELECTOR, "button.agree-button").click()
    except Exception:
        pass  # banner not shown (already accepted earlier)
    print('Search:', word)
    searchbar = driver.find_element(By.ID, "search-input")
    # put keyword in searchbar and press ENTER
    searchbar.send_keys(word)
    searchbar.send_keys(Keys.ENTER)
    time.sleep(5)  # wait for results to render
    # get current url (because it could load a different URL to show results)
    search_results_url = driver.current_url
    # start scraping results (with pagination):
    for _ in range(999):  # hard upper bound on pages per keyword
        print("Scraping:", search_results_url)
        page_data = scrape_page(driver, word)  # scraping only; `word` is just stored in the records
        if page_data:
            all_data.extend(page_data)
        driver.get(search_results_url)  # re-load results before looking for the pager
        try:
            # The <li class="...pager__item--next"> has NO href attribute
            # (get_attribute would return None) — the nested <a rel="next">
            # carries the link, so read the href from the anchor and navigate
            # by URL instead of clicking (immune to overlay interception).
            next_page_link = driver.find_element(
                By.CSS_SELECTOR, "li.pager__item--next a[rel='next']")
            search_results_url = next_page_link.get_attribute("href")
            driver.get(search_results_url)  # open next page of results by URL
        except Exception as e:
            print('[DEBUG] Exception:', e)
            print('[DEBUG] break')
            break  # no next page -> done with this keyword
driver.quit()
import pandas as pd
df = pd.DataFrame(all_data)
print(df)
input("Press ENTER to close")
df.to_excel('lavoro_scrape_tot.xlsx')
问题在于您尝试单击的链接实际上隐藏在 cookie 横幅后面。有以下几种选择:
下面的脚本接受 cookie 并导航到下一页。
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.common.desired_capabilities import DesiredCapabilities
from selenium.common.exceptions import NoSuchElementException
import time
driver = webdriver.Chrome()
driver.maximize_window()
driver.implicitly_wait(10)
start_url = "https://www.lavoro.gov.it/"
all_data = []
keywords = ['big data', 'data warehouse', 'data lake', 'data science', 'data mining', 'data privacy', 'open data',
            'data governance', 'data literacy', 'data', 'biometric', 'analisi biometrica',
            'machine learning', 'apprendimento automatico', 'algoritm', 'calcolo', 'punteggio', 'valutazione',
            'monitoraggi', 'predittiv', 'cloud', 'statistic', 'reti neurali', 'rete neurale', 'neural network',
            'apprendimento profondo', 'deep learning', 'ranking', 'classificazione', 'apprendimento supervisionato',
            'apprendimento non supervisionato', 'software', 'numeric', 'rango', 'ranghi', 'rank', "Elaborazione del linguaggio naturale",
            'natural language processing', 'NLP', 'graduazione', 'transformer', 'GPT', 'Bard', 'Copilot', 'Gemini', 'DALL-E',
            'automa', 'internet delle cose', 'intelligenza artificiale']
for word in keywords:
    driver.get(start_url)
    time.sleep(5)
    # Accept cookies once per visit — the banner otherwise covers the pager
    # and intercepts clicks on the "next" arrow.
    try:
        driver.find_element(By.CSS_SELECTOR, "button.agree-button").click()
    except NoSuchElementException:
        pass  # banner not shown (already accepted)
    print(f"Search: {word}.")
    searchbar = driver.find_element(By.ID, "search-input")
    searchbar.send_keys(word)
    searchbar.send_keys(Keys.ENTER)
    while True:
        time.sleep(10)  # wait for the results page to render
        # `next_link` instead of `next`: don't shadow the builtin next().
        next_link = driver.find_element(By.CSS_SELECTOR, "a[rel='next']")
        # On the last page the site keeps the arrow but marks it aria-hidden,
        # so its presence signals "no more data".
        if next_link.get_attribute("aria-hidden") is not None:
            print("No more data.")
            break
        next_url = next_link.get_attribute("href")
        print(f"Next page: {next_url}.")
        # Navigate by URL rather than clicking — immune to overlays.
        driver.get(next_url)
driver.quit()
输出应如下所示:
Search: big data.
Next page: https://www.lavoro.gov.it/Pagine/Cerca-nel-sito?search=big%20data&page=1.
Next page: https://www.lavoro.gov.it/Pagine/Cerca-nel-sito?search=big%20data&page=2.
Next page: https://www.lavoro.gov.it/Pagine/Cerca-nel-sito?search=big%20data&page=3.
Next page: https://www.lavoro.gov.it/Pagine/Cerca-nel-sito?search=big%20data&page=4.
Next page: https://www.lavoro.gov.it/Pagine/Cerca-nel-sito?search=big%20data&page=5.
Next page: https://www.lavoro.gov.it/Pagine/Cerca-nel-sito?search=big%20data&page=6.
Next page: https://www.lavoro.gov.it/Pagine/Cerca-nel-sito?search=big%20data&page=7.
Next page: https://www.lavoro.gov.it/Pagine/Cerca-nel-sito?search=big%20data&page=8.
No more data.
Search: data warehouse.
Next page: https://www.lavoro.gov.it/Pagine/Cerca-nel-sito?search=data%20warehouse&page=1.
Next page: https://www.lavoro.gov.it/Pagine/Cerca-nel-sito?search=data%20warehouse&page=2.
No more data.