我想使用 Selenium 和 Python 来抓取数据。我的 Chrome 版本是 123.0.6312.87,所用的 WebDriver(chromedriver)与该 Chrome 版本兼容。我想从页面“https://web.bcpa.net/BcpaClient/#/Record-Search”抓取数据。在此页面中,当我通过 Selenium 输入地址“2216 NW 6 PL FORT LAUDERDALE”时,它会显示该房产的详细信息。页面上有一个打印按钮,当我用 Selenium 单击它时,会被重定向到新页面“https://web.bcpa.net/BcpaClient/recinfoprint.html”。在这个 HTML 页面中,“md-select”类下有一个下拉选项,我想从中选择“另存为 PDF”,其值为“Save as PDF/local/”。但是这个 HTML 页面包含 shadow root,所以 Selenium 无法定位“md-select”类的元素。另外,我还想单击该页面中“action-button”类的“保存”按钮,但 shadow root 的存在造成了很大的麻烦。我尝试从位于 shadow root 之前的“print-preview-app”元素中提取信息,但也不起作用。
代码:
import datetime as dt
import os
import time
from datetime import datetime
import pandas as pd
import numpy as np
from datetime import timedelta
import sys
from selenium import webdriver
from selenium.common.exceptions import NoSuchElementException
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.edge.options import Options
import os.path
import json
import ssl
import io
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.support.ui import Select
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.by import By
from selenium.webdriver import ActionChains
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.common.action_chains import ActionChains
# Look up one address on the BCPA record search, click the page's print
# button, switch to the tab that opens, and try to read the destination
# <select> of the print preview through a shadow root.
driver = None
try:
    # Path to the Chrome WebDriver executable.
    webdriver_path = "D:/Grass_Image_Classification/chromedriver-win64/chromedriver.exe"
    service = Service(webdriver_path)
    driver = webdriver.Chrome(service=service)
    print('Successfully received the chromedriver path')
    driver.maximize_window()

    # implicitly_wait() sets the element-lookup timeout for the session;
    # it is not a pause, so it is configured once instead of after every step.
    driver.implicitly_wait(15)

    driver.get("https://web.bcpa.net/BcpaClient/#/Record-Search")
    driver.find_element(By.XPATH, '//input[@class="form-control"]').send_keys(
        "2216 NW 6 PL FORT LAUDERDALE, FL 33311")
    driver.find_element(
        By.XPATH,
        '//span[@class="input-group-addon"]/span[@class="glyphicon glyphicon-search"]',
    ).click()
    driver.find_element(By.XPATH, '//div[@class="col-sm-1 btn-printrecinfo"]').click()

    # The implicit wait does not cover window handles, so wait explicitly
    # for the print tab to exist before switching to it.
    WebDriverWait(driver, 15).until(lambda d: len(d.window_handles) > 1)
    handles = driver.window_handles
    print(handles)
    print(handles[-1])
    driver.switch_to.window(handles[-1])
    print(driver.current_url)

    # NOTE(review): this lookup is the step reported as failing. Presumably
    # the print preview is browser UI rather than part of the page DOM, in
    # which case no shadow-root lookup can reach it -- confirm. Locating the
    # host by id "sidebar" was also tried without success.
    shadow_root = driver.find_element(By.CSS_SELECTOR, "body > print-preview-app").shadow_root
    shadow_text = shadow_root.find_element(
        By.CSS_SELECTOR, "print-preview-settings-section > div > select").text
    print(shadow_text)
    time.sleep(10)
except Exception as e:
    # Top-level boundary of the script: report the failure and exit non-zero.
    print(e)
    sys.exit(1)
finally:
    # Always release the browser and chromedriver processes, including on
    # the sys.exit() path above.
    if driver is not None:
        driver.quit()
我想选择“md-select”类中的“另存为 PDF”选项(其值为“Save as PDF/local/”),然后单击该 HTML 页面中“action-button”类的“保存”按钮。
应用 webdriver 影子根方法后代码停止工作。
您不需要在新打开的选项卡中访问 shadow-root。通过 Chrome 驱动程序的选项来保存 PDF 文件要容易得多。
您只需将首选项传递给 chromedriver,它会在打印操作时自动将您的 pdf 文件保存到目录中。
import json
import sys
import time
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.action_chains import ActionChains
# Look up one address on the BCPA record search and click the page's print
# button, with Chrome preconfigured so that printing writes a PDF to disk
# instead of requiring interaction with the print-preview dialog.
driver = None
try:
    # Print-preview state that preselects the "Save as PDF" destination.
    print_settings = {
        "recentDestinations": [{
            "id": "Save as PDF",
            "origin": "local",
            "account": "",
        }],
        "selectedDestinationId": "Save as PDF",
        "version": 2,
        "isHeaderFooterEnabled": False,
        "isLandscapeEnabled": True,
    }
    prefs = {
        # Chrome expects this preference as a JSON string, not a nested dict.
        'printing.print_preview_sticky_settings.appState': json.dumps(print_settings),
        "download.prompt_for_download": False,
        "profile.default_content_setting_values.automatic_downloads": 1,
        "download.directory_upgrade": True,
        # Directory where the PDF should be saved.
        "savefile.default_directory": "/Users/a1/PycharmProjects/PythonProject",
        "safebrowsing.enabled": True,
    }
    options = webdriver.ChromeOptions()
    options.add_experimental_option('prefs', prefs)
    # Print straight to the preselected destination without showing the dialog.
    options.add_argument('--kiosk-printing')

    # Pass both objects by keyword: the positional order of Chrome()'s
    # parameters differs between Selenium releases.
    service = Service()
    driver = webdriver.Chrome(service=service, options=options)
    driver.maximize_window()
    wait = WebDriverWait(driver, 20)

    driver.get("https://web.bcpa.net/BcpaClient/#/Record-Search")
    wait.until(
        EC.visibility_of_element_located((By.XPATH, '//input[@class="form-control"]'))
    ).send_keys("2216 NW 6 PL FORT LAUDERDALE, FL 33311")
    wait.until(EC.element_to_be_clickable((
        By.XPATH,
        '//span[@class="input-group-addon"]/span[@class="glyphicon glyphicon-search"]',
    ))).click()
    wait.until(EC.element_to_be_clickable(
        (By.XPATH, '//div[@class="col-sm-1 btn-printrecinfo"]')
    )).click()

    # Give Chrome time to finish writing the PDF before the browser is closed.
    time.sleep(5)
except Exception as e:
    # Top-level boundary of the script: report the failure and exit non-zero.
    print(e)
    sys.exit(1)
finally:
    # Always release the browser and chromedriver processes, including on
    # the sys.exit() path above.
    if driver is not None:
        driver.quit()