How to loop through a website's pages with Selenium

Problem description

I am trying to scrape this page: https://www.lavoro.gov.it/Pagine/Cerca-nel-sito?search=big+data. As you can see at the bottom of the page, there are page numbers and an icon with an arrow, which I click with Selenium.

The problem is that whenever I copy the XPath of that element, I get to the second page, and then my code starts looping endlessly between the first and second page.

This is the code block I use to open the next page:

            next_page_link = driver.find_element(By.XPATH, "//li[@class = 'page-item pager__item pager__item--next ']")
            # search_results_url = next_page_link.get_attribute("href")
            # driver.get(search_results_url)  # <--- open next page with results using URL
            next_page_link.click()   # <--- or click link 

When the script tries to go to the third page, I get this error:

[DEBUG] Exception: Message: element click intercepted: Element <li class="page-item pager__item pager__item--next ">...</li> is not clickable at point (1181, 869). Other element would receive the click: <div class="popup-content info eu-cookie-compliance-content">...</div>

But I can't seem to get past the second page. Does anyone know how to fix this?
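
One common workaround for an "element click intercepted" error is to dispatch the click with JavaScript, which is not blocked by overlapping elements such as a cookie banner. A minimal sketch, reusing the locator from the snippet above:

next_page_link = driver.find_element(By.XPATH, "//li[@class = 'page-item pager__item pager__item--next ']")
# A JavaScript click goes straight to the element, so an overlay cannot intercept it.
driver.execute_script("arguments[0].click();", next_page_link)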

Here is the full code for reference:

from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
#from selenium.webdriver.support.ui import WebDriverWait
#from selenium.webdriver.support import expected_conditions as EC
#from selenium.common.exceptions import NoSuchElementException, TimeoutException

import time

# ---

import selenium
print('Selenium:', selenium.__version__)

# ---

def scrape_page(driver, keyword):
    try:

        
        # try:
        #     print('Clicking cookie banner')            
        #     cookie_banner = driver.find_element(By.XPATH, "//*[@id='popup-buttons']/button[1]")
        #     cookie_banner.click()
        # except Exception as e:
        #     print('Exception:', e)

        
        elements_dt = driver.find_elements(By.XPATH, "//div[@class='row']")
        #elements_dd = driver.find_elements(By.XPATH, "//dl[@class='sample-list.results']/dd/a")
        
        print('[DEBUG] len(elements_dt):', len(elements_dt))
        # List to store the extracted data
        data = []

        # Iterate over the result elements
        #for index, element_dt, element_dd in enumerate(zip(elements_dt, elements_dd), 1):  # you can use `enumerate(..., 1)` to start `index` with `1`
        for index, element in enumerate(elements_dt, 1):  # you can use `enumerate(..., 1)` to start `index` with `1`
            
            try:
                article_url = element.find_element(By.XPATH, './/a[@class="search-item-link"]').get_attribute("href")  # `.//` keeps the search relative to `element`
                article_title = element.text
                
                # ... DON'T CLICK LINKS BECAUSE IT WILL REMOVE THE CURRENT PAGE FROM MEMORY
                # ... AND YOU WILL LOSE ACCESS TO THE OTHER `elements` ON THE CURRENT PAGE
                # ...
                # ... Get `href` and later (after loop) use `.get(href)` to access subpages. 
                
                data.append({
                    'keyword': keyword,
                    'Titolo': article_title, 
                    'URL': article_url, 
                    #'Data': article_date, 
                    #'Contenuto': article_content
                })
                
                print('[DEBUG] data:', data[-1])
                # Go back to the previous page
                #driver.back()
            except Exception as e:
                print("Errore durante il clic sull'elemento:", e)
                
        # work with subpages

        # for item in data:
        #     print('[DEBUG] subpage:', item['URL'])
        #     driver.get(item['URL'])
        #     #article_date = ...
        #     #article_content = ...
        #     #item['Data'] = article_date
        #     #item['Contenuto'] = article_content
             
    except Exception as e:
        print("Errore durante lo scraping della pagina:", e)
        return None

    return data

# --- main ---

driver = webdriver.Chrome()
driver.maximize_window()
driver.implicitly_wait(10)

# ---

start_url = "https://www.lavoro.gov.it/"

all_data = []

keywords = ['big data', 'data warehouse', 'data lake', 'data science', 'data mining', 'data privacy', 'open data',
            'data governance', 'data literacy', 'data', 'biometric', 'analisi biometrica',
            'machine learning', 'apprendimento automatico', 'algoritm', 'calcolo', 'punteggio', 'valutazione',
            'monitoraggi','predittiv', 'cloud', 'statistic', 'reti neurali', 'rete neurale', 'neural network',
            'apprendimento profondo', 'deep learning', 'ranking', 'classificazione', 'apprendimento supervisionato',
            'apprendimento non supervisionato', 'software', 'numeric', 'rango', 'ranghi', 'rank', "Elaborazione del linguaggio naturale",
            'natural language processing', 'NLP', 'graduazione', 'transformer', 'GPT', 'Bard', 'Copilot', 'Gemini', 'DALL-E',
            'automa', 'internet delle cose', 'intelligenza artificiale']

for word in keywords:

    print("Main Page:", start_url)

    # open main page 
    driver.get(start_url)

    # find searchbar
    #click_on_search = driver.find_element(By.ID, "search-button")
    #click_on_search.click()
    print('Search:', word)
    searchbar = driver.find_element(By.ID, "search-input")
    # put keyword in searchbar and press ENTER
    searchbar.send_keys(word)
    searchbar.send_keys(Keys.ENTER)
    
    time.sleep(5) # wait for results
    
    # get the current URL (because the site may load a different URL to show the results)
    search_results_url = driver.current_url
    
    # start scraping results (with pagination):
    #while True:  # try to get all pages
    for _ in range(999):  # safety cap on the number of pages instead of an endless `while True`
        print("Scraping:", search_results_url)
        
        page_data = scrape_page(driver, word)  # <--- scraping only, no `.get(url)`; `word` is passed just so it can be added to `data`
        
        if page_data:
            all_data.extend(page_data)

        driver.get(search_results_url)  # go back to the results page after visiting subpages - to get the link to the next page
        
        try:
            next_page_link = driver.find_element(By.XPATH, "//li[@class = 'page-item pager__item pager__item--next ']")
            search_results_url = next_page_link.get_attribute("href")
            driver.get(search_results_url)  # <--- open next page with results using URL
            #next_page_link.click()   # <--- or click link 
        except Exception as e:
            print('[DEBUG] Exception:', e)
            print('[DEBUG] break')
            #input('Press ENTER to continue')
            break  # exit loop
            
driver.quit()

import pandas as pd
df = pd.DataFrame(all_data)
print(df)

input("Press ENTER to close")

df.to_excel('lavoro_scrape_tot.xlsx')
1 Answer

The problem is that the link you are trying to click is actually hidden behind the cookie banner. There are a couple of options:

  1. Accept the cookies.
  2. Work around the banner (see the sketch below).
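
For option 2, a minimal sketch that hides the overlay with JavaScript instead, assuming the banner keeps the class names reported in the "element click intercepted" error message:

# Hide the cookie overlay so it can no longer intercept clicks.
# The selector is an assumption based on the class names in the error message.
banner = driver.find_element(By.CSS_SELECTOR, "div.eu-cookie-compliance-content")
driver.execute_script("arguments[0].style.display = 'none';", banner)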

The script below takes option 1: it accepts the cookies and navigates to the next page.

from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
from selenium.common.exceptions import NoSuchElementException

import time

driver = webdriver.Chrome()
driver.maximize_window()
driver.implicitly_wait(10)

start_url = "https://www.lavoro.gov.it/"

all_data = []

keywords = ['big data', 'data warehouse', 'data lake', 'data science', 'data mining', 'data privacy', 'open data',
            'data governance', 'data literacy', 'data', 'biometric', 'analisi biometrica',
            'machine learning', 'apprendimento automatico', 'algoritm', 'calcolo', 'punteggio', 'valutazione',
            'monitoraggi','predittiv', 'cloud', 'statistic', 'reti neurali', 'rete neurale', 'neural network',
            'apprendimento profondo', 'deep learning', 'ranking', 'classificazione', 'apprendimento supervisionato',
            'apprendimento non supervisionato', 'software', 'numeric', 'rango', 'ranghi', 'rank', "Elaborazione del linguaggio naturale",
            'natural language processing', 'NLP', 'graduazione', 'transformer', 'GPT', 'Bard', 'Copilot', 'Gemini', 'DALL-E',
            'automa', 'internet delle cose', 'intelligenza artificiale']

for word in keywords:
    driver.get(start_url)

    time.sleep(5)

    # Accept cookies.
    try:
        driver.find_element(By.CSS_SELECTOR, "button.agree-button").click()
    except NoSuchElementException:
        pass

    print(f"Search: {word}.")
    searchbar = driver.find_element(By.ID, "search-input")
    searchbar.send_keys(word)
    searchbar.send_keys(Keys.ENTER)
    
    while True:
        time.sleep(10)

        next_link = driver.find_element(By.CSS_SELECTOR, "a[rel='next']")
        # Check if the next button is disabled (no more data).
        if next_link.get_attribute("aria-hidden") is not None:
            print("No more data.")
            break
        next_url = next_link.get_attribute("href")
        print(f"Next page: {next_url}.")

        driver.get(next_url)
            
driver.quit()

The output should look like this:

Search: big data.
Next page: https://www.lavoro.gov.it/Pagine/Cerca-nel-sito?search=big%20data&page=1.
Next page: https://www.lavoro.gov.it/Pagine/Cerca-nel-sito?search=big%20data&page=2.
Next page: https://www.lavoro.gov.it/Pagine/Cerca-nel-sito?search=big%20data&page=3.
Next page: https://www.lavoro.gov.it/Pagine/Cerca-nel-sito?search=big%20data&page=4.
Next page: https://www.lavoro.gov.it/Pagine/Cerca-nel-sito?search=big%20data&page=5.
Next page: https://www.lavoro.gov.it/Pagine/Cerca-nel-sito?search=big%20data&page=6.
Next page: https://www.lavoro.gov.it/Pagine/Cerca-nel-sito?search=big%20data&page=7.
Next page: https://www.lavoro.gov.it/Pagine/Cerca-nel-sito?search=big%20data&page=8.
No more data.
Search: data warehouse.
Next page: https://www.lavoro.gov.it/Pagine/Cerca-nel-sito?search=data%20warehouse&page=1.
Next page: https://www.lavoro.gov.it/Pagine/Cerca-nel-sito?search=data%20warehouse&page=2.
No more data.
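
As a further refinement, the fixed time.sleep calls can be replaced with explicit waits, so each page is polled only for as long as it actually needs. A minimal sketch using the same a[rel='next'] selector:

from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

# Wait up to 10 seconds for the next-page link to appear instead of
# sleeping for a fixed interval on every page.
wait = WebDriverWait(driver, 10)
next_link = wait.until(EC.presence_of_element_located((By.CSS_SELECTOR, "a[rel='next']")))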