Selenium 抓取不断返回 ValueError:以 10 为基数的 int() 的无效文字:''

问题描述 投票:0回答:1

当尝试抓取网站时,他们无法在我的页面索引变量中找到文本。用于分页的 len() 显示正确的数字,所以我知道它找到了我想要的元素。只是不是其中的文本。如果有人可以帮助我,我将不胜感激。

from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.common.action_chains import ActionChains
from selenium.webdriver.chrome.options import Options # ALLOWS PROGRAM TO RUN W/O WINDOW OPENING
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC


import pandas as pd
import time
# THESE TURN ON HEADLESS MODE
options = Options()
options.add_argument("--headless=new")
options.add_argument("window-size=1920x1080")

# THESE ACCESS THE WEBSITE AND PULL DATA
web = "https://www.audible.com/adblbestsellers?page=1&ref_pageloadid=vungFE9JcaT4XEUq&ref=a_adblbests_c5_pageNum_0&pf_rd_p=3c1c017b-585f-4bde-98c6-e3cb784e4b8e&pf_rd_r=CTE7EBFZPGVW9C9MJYWQ&pageLoadId=ViHKKYBoSP03JQKv&creativeId=0bf0e03f-bb55-481b-b4fd-d67375977170"
driver = webdriver.Chrome(options= options)
driver.get(web)
# driver.maximize_window()

# PAGINATION
pagination = WebDriverWait(driver, 5).until(EC.presence_of_element_located((By.XPATH,'.//ul[contains(@class, "pagingElements")]')))
# pagination = driver.find_element(By.XPATH,'.//ul[contains(@class, "pagingElements")]')
pages = WebDriverWait(pagination, 20).until(EC.presence_of_all_elements_located((By.CSS_SELECTOR,'a.pageNumberElement')))
#pages = pagination.find_elements(By.XPATH,'.//ul[contains(@class,"pagingElements")]/li')
last_page = int(pages[1].text)

current_page = 1

while current_page <= last_page:
    
    # time.sleep(2)
    # FINDING ELEMENTS W/ WAITS AND CONDITIONS
    container = WebDriverWait(driver, 5).until(EC.presence_of_element_located((By.CLASS_NAME, 'adbl-impression-container')))
    products = WebDriverWait(container, 5).until(EC.presence_of_all_elements_located((By.XPATH, './/li[contains(@class, "productListItem")]')))
   

    book_title = []
    book_author = []
    book_length = []

    for product in products:

        book_title.append(product.find_element(By.XPATH,".//h3[contains(@class, 'bc-heading')]").text)
        book_author.append(product.find_element(By.XPATH,".//li[contains(@class, 'authorLabel')]").text)
        book_length.append(product.find_element(By.XPATH,".//li[contains(@class, 'runtimeLabel')]").text) 

    current_page = current_page + 1
    
    try:
        next_page = driver.find_element(By.XPATH, './/span[contains(@class, "nextButton")]')
        driver.implicitly_wait(10)
        ActionChains(driver).move_to_element(next_page).click(next_page).perform()
    except:
        print("Nope")
    
    
df_books = pd.DataFrame({'title': book_title, 'author': book_author, 'length': book_length})
df_books.to_csv('pagination.csv', index=False)

driver.quit()

回溯(最近一次调用最后一次): 文件“c:\用户 lucky\OneDrive\Desktop xtracted_content\my_apps ilter\program.py”,第 18 行,在 最后一页 = int(页[-2].text) ^^^^^^^^^^^^^^^^^^^^ ValueError:以 10 为基数的 int() 的文字无效:''

我尝试过通过 Xpath、类名、标记名和 css 选择器进行定位。我试图从索引中获取文本来指示 while 循环何时应该中断。

当我应用一些调试时,它向我显示了我正在寻找的文本,但否则我无法得到它。

from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

import pandas as pd

web = "https://www.audible.com/adblbestsellers?ref_pageloadid=not_applicable&ref=a_search_t1_navTop_pl0cg1c0r0&pf_rd_p=334a4a9c-12d2-4c3f-aee6-ae0cbc6a1eb0&pf_rd_r=XG7ESF49MWSMHKFW027S&pageLoadId=0IuRVXnX3DGs76JX&creativeId=7ba42fdf-1145-4990-b754-d2de428ba482"
driver = webdriver.Chrome()
driver.get(web)

pagination = WebDriverWait(driver, 5).until(EC.presence_of_element_located((By.XPATH, './/ul[contains(@class, "pagingElements")]')))

# Print the HTML content of the pagination element
print(pagination.get_attribute("outerHTML"))

pages = WebDriverWait(pagination, 20).until(EC.presence_of_all_elements_located((By.TAG_NAME, 'li')))

# Print the HTML content of each page element
for i, page in enumerate(pages):
    print(f"Page {i + 1} HTML={repr(page.get_attribute('outerHTML'))} text={repr(page.text)}")

# Extract the last page number
last_page = int(pages[-2].text) if pages and pages[-2].text.isdigit() else 1

print("Last Page:", last_page)
'''
<li class="bc-list-item">
    <a class="bc-link
    refinementFormLink pageNumberElement

    bc-color-link" tabindex="0" href="/adblbestsellers?page=5&amp;ref_pageloadid=0IuRVXnX3DGs76JX&amp;ref=a_adblbests_c5_pageNum_3&amp;pf_rd_p=3c1c017b-585f-4bde-98c6-e3cb784e4b8e&amp;pf_rd_r=RGEY6R11PSFH96HYBZZ1&amp;pageLoadId=7wau2qFrQbdrOLQh&amp;creativeId=0bf0e03f-bb55-481b-b4fd-d67375977170">5</a>
</li>

Page 1 HTML='<li class="bc-list-item">\n    \n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n \n \n\n\n  \n  \n  \n  \n  \n  \n\n\n\n  \n    \n  \n  \n\n\n\n  \n\n\n\n\n\n\n\n\n\n\n\n\n\n  \n    \n    <span class="bc-button\n  bc-button-secondary\n  previousButton  \n  \n  \n  bc-button-small \n  bc-button-inline \n  bc-button-disabled">\n  
    <a class="bc-button-text" aria-disabled="true" href="/adblbestsellers?page=0&amp;ref_pageloadid=0IuRVXnX3DGs76JX&amp;ref=a_adblbests_c5_pageBack&amp;pf_rd_p=3c1c017b-585f-4bde-98c6-e3cb784e4b8e&amp;pf_rd_r=RGEY6R11PSFH96HYBZZ1&amp;pageLoadId=7wau2qFrQbdrOLQh&amp;creativeId=0bf0e03f-bb55-481b-b4fd-d67375977170" tabindex="0" role="button">\n        <span class="bc-text\n    bc-button-text-inner \n    bc-size-action-small">\n    \n    \n\n\n\n\n\n\n\n\n\n\n\n\n\t\n\t\n\t\n\t\n\n\n\n\n\n\n\n\n<i 
aria-hidden="true" class="bc-icon\n\tbc-icon-fill-base\n\tbc-icon-chevron-left-s2\n\t\n\tbc-icon-chevron-left \n\t\n\tbc-color-base">\n</i>\n \n\n  \n<span class="bc-text\n    bc-pub-offscreen">Go back a page</span>\n  </span>\n      </a>\n    </span>\n  \n\n  \n\n  \n\n  \n\n\n</li>' text='Go back a page'
Page 2 HTML='<li class="bc-list-item">\n    \n \n\n  \n<span class="bc-text\n    pageNumberElement">1</span>\n</li>' text='1'
Page 3 HTML='<li class="bc-list-item">\n    <a class="bc-link\n    refinementFormLink pageNumberElement \n    \n    bc-color-link" tabindex="0" href="/adblbestsellers?page=2&amp;ref_pageloadid=0IuRVXnX3DGs76JX&amp;ref=a_adblbests_c5_pageNum_1&amp;pf_rd_p=3c1c017b-585f-4bde-98c6-e3cb784e4b8e&amp;pf_rd_r=RGEY6R11PSFH96HYBZZ1&amp;pageLoadId=7wau2qFrQbdrOLQh&amp;creativeId=0bf0e03f-bb55-481b-b4fd-d67375977170">2</a>\n</li>' text='2'
Page 4 HTML='<li class="bc-list-item">\n    \n \n\n  \n<span class="bc-text\n    pageNumberElement">...</span>\n</li>' text=''
Page 5 HTML='<li class="bc-list-item">\n    <a class="bc-link\n    refinementFormLink pageNumberElement \n    \n    bc-color-link" tabindex="0" href="/adblbestsellers?page=5&amp;ref_pageloadid=0IuRVXnX3DGs76JX&amp;ref=a_adblbests_c5_pageNum_3&amp;pf_rd_p=3c1c017b-585f-4bde-98c6-e3cb784e4b8e&amp;pf_rd_r=RGEY6R11PSFH96HYBZZ1&amp;pageLoadId=7wau2qFrQbdrOLQh&amp;creativeId=0bf0e03f-bb55-481b-b4fd-d67375977170">5</a>\n</li>' text=''
Page 6 HTML='<li class="bc-list-item">\n    \n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n \n \n\n\n  \n  \n  \n  \n  \n  \n\n\n\n  \n    \n  \n  \n\n\n\n  \n\n\n\n\n\n\n\n\n\n\n\n\n\n  \n    \n    <span class="bc-button\n  bc-button-secondary\n  nextButton refinementFormButton \n  \n  \n  bc-button-small \n  bc-button-inline">\n      <a 
class="bc-button-text" href="/adblbestsellers?page=2&amp;ref_pageloadid=0IuRVXnX3DGs76JX&amp;ref=a_adblbests_c5_pageNext&amp;pf_rd_p=3c1c017b-585f-4bde-98c6-e3cb784e4b8e&amp;pf_rd_r=RGEY6R11PSFH96HYBZZ1&amp;pageLoadId=7wau2qFrQbdrOLQh&amp;creativeId=0bf0e03f-bb55-481b-b4fd-d67375977170" tabindex="0" role="button">\n        <span class="bc-text\n    bc-button-text-inner \n    bc-size-action-small">\n    \n    \n\n\n\n\n\n\n\n\n\n\n\n\n\t\n\t\n\t\n\t\n\n\n\n\n\n\n\n\n<i aria-hidden="true" class="bc-icon\n\tbc-icon-fill-base\n\tbc-icon-chevron-right-s2\n\t\n\tbc-icon-chevron-right \n\t\n\tbc-color-base">\n</i>\n \n\n  \n<span class="bc-text\n    bc-pub-offscreen">Go forward a page</span>\n  </span>\n      </a>\n    </span>\n  \n\n  \n\n  \n\n  \n\n\n</li>' text=''
Last Page: 1
'''
I'm trying to pull the "5" that's on HTML Page 5
python selenium-webdriver web-scraping
1个回答
0
投票

正如您在调试输出中看到的,一些

text=
值不是整数:

 text='Go back a page'
 text=''
 text='1'
 text='2'

您可能想使用最后一个整数:

good_pages = []
for page in pages:
  try:
    good_pages.append(int(page.text))
  except ValueError:
    pass
last_page = good_pages[-1]
© www.soinside.com 2019 - 2024. All rights reserved.