Scraping with pagination using Selenium and Python


I'm trying to practice web scraping and made a script that goes to my personal Coursera site (I didn't see anything against this in their robots.txt file, so I figured it was fine since it's my own information). The main drawback is that I still have to solve the CAPTCHAs when they pop up, but other than that it can scrape the text from the first page. In my case, however, I have multiple pages of text data to scrape, and I'm having trouble figuring out how to work pagination into the scenario. Does anyone have any suggestions? The problem method, I believe, is my get_pages(self) method. The print statement I put in it says it reaches 2, but it seems to terminate before actually running again...

The code is:


import time
import secret
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.common.exceptions import NoSuchElementException

'''
Scrape certificate names and credential IDs from my Coursera accomplishments
pages and write them to a text file.
'''

template = 'Certificate Name: {}\nCredential ID: {}\n'

class CourseraScraper:
    '''
    Log in to Coursera, open the Accomplishments page, and scrape each
    certificate card to a text file.
    '''
    def __init__(self, url: str, username: str, password: str):
        # options = webdriver.FirefoxOptions()
        # options.add_argument('--headless')
        self.url = url
        self.username = username
        self.password = password
        self.browser = webdriver.Firefox()
        self.page_counter = 1
        self.selector_counter = 2

    def login(self):
        '''
        Using the login info saved in secret.py, log in via CSS selectors
        '''
        self.browser.get(self.url)
        print('Accessing browser URL...')
        username_css_selector = self.browser.find_element(By.CSS_SELECTOR, '#email')
        username_css_selector.send_keys(self.username)
        print('Entering username...')
        
        pwd_css_selector = self.browser.find_element(By.CSS_SELECTOR, '#password')
        pwd_css_selector.send_keys(self.password)
        print('Entering password...')
        
        login_button = self.browser.find_element(By.CSS_SELECTOR, 'button[type="submit"]')
        login_button.click()
        print('Clicking login button...')
        print('Time delay for CAPTCHA...')
        time.sleep(60)


    def click_accomplishments(self):
        '''
        Once logged in, pause (there is a box element that sometimes obscures
        the drop-down, so the pause should allow the element to become visible),
        then click "Accomplishments" from the menu.
        '''
        print('Finding "Accomplishments" from drop-down...')
        select_dropdown = self.browser.find_element(By.CSS_SELECTOR, '.cds-Avatar-initial')
        select_dropdown.click()

        print('Clicking link...')
        accomplishments_link = self.browser.find_element(By.CSS_SELECTOR, 'li.dropdown-btn:nth-child(6) > a:nth-child(1)')
        accomplishments_link.click()
        
        time.sleep(5)


    def get_pages(self):
        '''
        This is the problem area - trying to select page, then scrape - but it does not proceed to the 2nd page.
        '''
        n = self.page_counter
        pages = self.browser.find_elements(By.XPATH, f'//*[@id="pagination_number_box_{n}"]')
        for page in pages:
            print(f'Starting on page {n}')
            self.scrape_page()
            n = self.counter_increment()
        print(f'Total pages scraped: {n}')
        print('Process complete!')


    def counter_increment(self):
        '''
        The page counter is initialized to 1; increment it to get the next page number
        '''
        self.page_counter += 1
        return self.page_counter


    def selector_increment(self):
        '''
        The card selector starts at n = 2; increment it for each card encountered
        '''
        self.selector_counter += 1
        return self.selector_counter


    def scrape_page(self):
        '''
        Scrape each accomplishment card via CSS selector and pass the formatted text to write_text_to_file() to be written to a .txt file.
        '''
        n = self.selector_counter
        
        while True:
            tag_css_selector = f'div.rc-AccomplishmentCard:nth-child({n}) > div:nth-child(1) > div:nth-child(2) > a:nth-child(1)'
            try:
                accomplishment = self.browser.find_element(By.CSS_SELECTOR, tag_css_selector)
                h3_text = accomplishment.find_element(By.TAG_NAME, 'h3').text
                to_field = accomplishment.get_attribute('to')
                to_field_text_split = to_field.rsplit('/', 1)[-1]

                formatted_text = template.format(h3_text, to_field_text_split)
                print(formatted_text)
                self.write_text_to_file(formatted_text)
                n = self.selector_increment()

            except NoSuchElementException:
                print('Reached end of elements.')
                break

            except Exception as e:
                print(f'An error occurred: {e}')
                break


    def write_text_to_file(self, write_text: str):
        '''
        Write values to file
        '''
        with open('div_elements.txt', 'a+') as f:
            f.write(write_text + '\n')


    def close(self):
        '''
        End the browser session. quit() also stops the driver process,
        whereas close() only closes the current window.
        '''
        self.browser.quit()
        print('All done!')



if __name__ == '__main__':
    
    # Build the scraper from the credentials stored in secret.py, then run each step
    url = secret.url
    username = secret.username
    password = secret.password

    browser_session = CourseraScraper(url, username, password)

    browser_session.login()

    browser_session.click_accomplishments()

    browser_session.get_pages()

    browser_session.close()


I've tried reworking the method using suggestions from ChatGPT, but so far nothing has worked.

My expectation/hope is for the method to advance to page 2 (or further, if there are multiple pages) and add the scraped data to the .txt file.

1 Answer

The general idea when scraping multiple pages with Selenium is to navigate to the next page with the .click() method until the next-page button is disabled. When the next-page button is disabled, you can use a loop with try and except statements to break out of the loop. Sample code:

from selenium import webdriver
from selenium.webdriver.support.ui import WebDriverWait
from selenium.common.exceptions import TimeoutException, WebDriverException
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from webdriver_manager.chrome import ChromeDriverManager

link = 'https://www.ebay.com/b/TV-Video-Home-Audio-Electronics/32852/bn_1648392?rt=nc&_pgn=1'
options = webdriver.ChromeOptions()
options.add_argument("--start-maximized")
driver = webdriver.Chrome(service=Service(ChromeDriverManager().install()), options=options)
driver.get(link)

WebDriverWait(driver, 20).until(EC.presence_of_element_located((By.XPATH, "//ol[@class='pagination__items']/li/a[@aria-current='page']")))

while True:
    try:
        # Searching for current page
        pno = driver.find_element(By.XPATH, "//ol[@class='pagination__items']/li/a[@aria-current='page']").get_attribute("textContent") 
        # Printing current page
        print("Page no: ", pno) 
        # Check if next page is not disabled
        driver.execute_script("return arguments[0].scrollIntoView(true);", WebDriverWait(driver, 20).until(EC.element_to_be_clickable((By.XPATH, "//a[@class='pagination__next icon-link']"))))
        # If next page not disabled, go to next page
        driver.find_element(By.XPATH, "//a[@class='pagination__next icon-link']").click()
    except (WebDriverException, TimeoutException) as err:
        print("End")
        break
driver.quit()
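
Applied to the question's CourseraScraper, the same pattern would replace get_pages(). The original for loop exits after one iteration because find_elements with the id pagination_number_box_1 matches at most one element, so looping on the next-button click sidesteps the page counter problem entirely. Below is a minimal sketch under two assumptions: the 'button[aria-label="Next Page"]' locator is a guess at Coursera's pagination markup (inspect the real pagination bar and substitute the actual selector), and selector_counter is reset on each page on the assumption that the card indices restart when a new page of cards loads.

# Imports to add at the top of the script:
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import TimeoutException, WebDriverException

# Drop-in replacement for CourseraScraper.get_pages():
def get_pages(self):
    '''
    Scrape the current page, then click the next-page control until it
    is no longer clickable (i.e., the last page has been reached).
    '''
    while True:
        print(f'Starting on page {self.page_counter}')
        self.selector_counter = 2  # assumed: card indices restart on each new page
        self.scrape_page()
        try:
            # Hypothetical locator for Coursera's next-page button; verify in dev tools
            next_button = WebDriverWait(self.browser, 10).until(
                EC.element_to_be_clickable((By.CSS_SELECTOR, 'button[aria-label="Next Page"]'))
            )
            next_button.click()
            self.page_counter += 1
        except (TimeoutException, WebDriverException):
            # No clickable next button, so this was the last page
            break
    print(f'Total pages scraped: {self.page_counter}')
    print('Process complete!')

With this shape the scrape stops naturally on the last page instead of depending on how many pagination_number_box elements the initial XPath happens to match.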