I'm trying to practice web scraping, and I've made a script that navigates to my personal Coursera site (I didn't see anything in their robots.txt that argues against this, so I figured it was fine since it's my own information). The main drawback is that I still have to solve the CAPTCHAs when they pop up, but other than that it can scrape the text from the first page. In my case, however, I have multiple pages of text data to scrape, and I'm having trouble figuring out how to bring pagination into the picture. Does anyone have any suggestions? I think the problem method is my
def get_pages(self):
method. The print statement I put in it says it reaches 2, but it seems to terminate before actually running again...
The code is:
import time
import secret
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.common.exceptions import NoSuchElementException
'''
Code text
'''
template = 'Certificate Name: {}\nCredential ID: {}\n'
class CourseraScraper:
    '''
    Class variables
    '''
    def __init__(self, url: str, username: str, password: str):
        # options = webdriver.FirefoxOptions()
        # options.add_argument('--headless')
        self.url = url
        self.username = username
        self.password = password
        self.browser = webdriver.Firefox()
        self.page_counter = 1
        self.selector_counter = 2
    def login(self):
        '''
        Using login info saved in secret.py, log in using CSS_SELECTOR.
        '''
        self.browser.get(self.url)
        print('Accessing browser URL...')
        username_css_selector = self.browser.find_element(By.CSS_SELECTOR, '#email')
        username_css_selector.send_keys(self.username)
        print('Entering username...')
        pwd_css_selector = self.browser.find_element(By.CSS_SELECTOR, '#password')
        pwd_css_selector.send_keys(self.password)
        print('Entering password...')
        login_button = self.browser.find_element(By.CSS_SELECTOR, 'button[type="submit"]')
        login_button.click()
        print('Clicking login button...')
        time.sleep(60)
        print('Time delay for CAPTCHA...')
    def click_accomplishments(self):
        '''
        Once logged in, pause (there is a box element that sometimes obscures
        the drop-down, so the pause should allow the element to become visible),
        then click "Accomplishments" from the menu.
        '''
        print('Finding "Accomplishments" from drop-down...')
        select_dropdown = self.browser.find_element(By.CSS_SELECTOR, '.cds-Avatar-initial')
        select_dropdown.click()
        print('Clicking link...')
        accomplishments_link = self.browser.find_element(By.CSS_SELECTOR, 'li.dropdown-btn:nth-child(6) > a:nth-child(1)')
        accomplishments_link.click()
        time.sleep(5)
    def get_pages(self):
        '''
        This is the problem area - trying to select the page, then scrape - but it does not proceed to the 2nd page.
        '''
        n = self.page_counter
        pages = self.browser.find_elements(By.XPATH, f'//*[@id="pagination_number_box_{n}"]')
        for page in pages:
            print(f'Starting on page {n}')
            self.scrape_page()
            n = self.counter_increment()
        print(f'Total pages scraped: {n}')
        print('Process complete!')
    def counter_increment(self):
        '''
        The page count is initialized to 1; increment n to the next page number.
        '''
        self.page_counter += 1
        return self.page_counter
    def selector_increment(self):
        '''
        Box selector starts at n = 2; increment for each one encountered.
        '''
        self.selector_counter += 1
        return self.selector_counter
    def scrape_page(self):
        '''
        Scrape specific element using CSS_SELECTOR and return the text to the
        function write_text_to_file() to be written to a .txt file.
        '''
        n = self.selector_counter
        while True:
            tag_css_selector = f'div.rc-AccomplishmentCard:nth-child({n}) > div:nth-child(1) > div:nth-child(2) > a:nth-child(1)'
            try:
                accomplishment = self.browser.find_element(By.CSS_SELECTOR, tag_css_selector)
                h3_text = accomplishment.find_element(By.TAG_NAME, 'h3').text
                to_field = accomplishment.get_attribute('to')
                to_field_text_split = to_field.rsplit('/', 1)[-1]
                formatted_text = template.format(h3_text, to_field_text_split)
                print(formatted_text)
                self.write_text_to_file(formatted_text)
                n = self.selector_increment()
            except NoSuchElementException:
                print('Reached end of elements.')
                break
            except Exception as e:
                print(f'An error occurred: {e}')
                break
    def write_text_to_file(self, write_text: str):
        '''
        Write values to file.
        '''
        with open('div_elements.txt', 'a+') as f:
            f.write(write_text + '\n')
    def close(self):
        '''
        Close browser session.
        '''
        self.browser.close()
        print('All done!')
if __name__ == '__main__':
    '''
    Class calls
    '''
    url = secret.url
    username = secret.username
    password = secret.password
    browser_session = CourseraScraper(url, username, password)
    browser_session.login()
    browser_session.click_accomplishments()
    browser_session.get_pages()
    browser_session.close()
I've tried reworking the method using ChatGPT's suggestions, but nothing has worked so far.
My expectation/hope is that the method advances to page 2 (or further, if there are multiple pages) and appends the scraped data to the .txt file.
The general idea when scraping multiple pages with Selenium is to navigate to the next page with the .click() method until the next-page button is disabled. You can use a loop with try and except statements to break out of the loop once the next-page button is disabled. Sample code:
from selenium import webdriver
from selenium.webdriver.support.ui import WebDriverWait
from selenium.common.exceptions import TimeoutException, WebDriverException
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from webdriver_manager.chrome import ChromeDriverManager
link = 'https://www.ebay.com/b/TV-Video-Home-Audio-Electronics/32852/bn_1648392?rt=nc&_pgn=1'
options = webdriver.ChromeOptions()
options.add_argument("--start-maximized")
driver = webdriver.Chrome(service=Service(ChromeDriverManager().install()), options=options)
driver.get(link)
WebDriverWait(driver, 20).until(EC.presence_of_element_located((By.XPATH, "//ol[@class='pagination__items']/li/a[@aria-current='page']")))
while True:
    try:
        # Find the currently selected page number
        pno = driver.find_element(By.XPATH, "//ol[@class='pagination__items']/li/a[@aria-current='page']").get_attribute("textContent")
        # Print the current page
        print("Page no: ", pno)
        # Wait for the next-page button to be clickable, then scroll it into view
        next_button = WebDriverWait(driver, 20).until(
            EC.element_to_be_clickable((By.XPATH, "//a[@class='pagination__next icon-link']")))
        driver.execute_script("return arguments[0].scrollIntoView(true);", next_button)
        # If the next-page button is not disabled, go to the next page
        next_button.click()
    except (WebDriverException, TimeoutException):
        print("End")
        break
driver.quit()
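Applied to the question's CourseraScraper, the same pattern could look roughly like the sketch below: a drop-in replacement for get_pages that clicks the next numbered pagination button instead of iterating over a find_elements list (which only ever holds the current page's button, so the for loop runs once and the method exits). It assumes the buttons keep the pagination_number_box_{n} id pattern from the question's XPath and that clicking one loads the next page of accomplishment cards; neither is verified against Coursera's current markup. It also resets selector_counter, which the original code never does, so that scrape_page starts from the first card on each new page:
import time
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import TimeoutException

def get_pages(self):
    '''
    Scrape the current page, then click the next numbered pagination
    button; stop once no further button can be found.
    '''
    while True:
        print(f'Starting on page {self.page_counter}')
        self.scrape_page()
        self.selector_counter = 2  # reset so scrape_page starts at the first card of the new page
        next_page = self.page_counter + 1
        try:
            # Hypothetical locator, based on the id pattern in the question's XPath
            button = WebDriverWait(self.browser, 10).until(
                EC.element_to_be_clickable((By.ID, f'pagination_number_box_{next_page}')))
            self.browser.execute_script('arguments[0].scrollIntoView(true);', button)
            button.click()
            time.sleep(5)  # crude wait for the new page of cards to render
            self.page_counter = next_page
        except TimeoutException:
            # No next-page button appeared within 10 s: assume this was the last page
            break
    print(f'Total pages scraped: {self.page_counter}')
    print('Process complete!')
The loop is now driven by whether a next-page button can actually be found and clicked, rather than by a counter over a single-element list, so it naturally stops on the last page.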