我有这个网站(https://www.nfrc.co.uk/search-members),我想点击“立即搜索屋顶承包商”,然后点击“国内房产”,然后点击“搜索”。出现很多页面,我只需要隐藏在所有公司名称下的链接并浏览所有页面。我没有编码背景。如果有人向我提供 python 代码来抓取这些链接,我将不胜感激。
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from webdriver_manager.chrome import ChromeDriverManager
from bs4 import BeautifulSoup
import pandas as pd
import time
# Setup Chrome options
chrome_options = webdriver.ChromeOptions()
chrome_options.add_argument("--start-maximized") # Maximize the browser window
# Initialize Selenium WebDriver using ChromeDriverManager
# (ChromeDriverManager().install() downloads a chromedriver binary matching
# the locally installed Chrome and returns its path).
driver = webdriver.Chrome(service=webdriver.chrome.service.Service(ChromeDriverManager().install()), options=chrome_options)
# Initialize an empty list to store company data
# NOTE(review): this list is never appended to in the visible code — the
# scraped URLs are only printed; see the discussion at the end of the thread.
company_data = []
def wait_and_click(driver, by, value, retries=3):
    """Wait for the element located by ``(by, value)`` to become clickable,
    scroll it into view and click it.

    Makes up to ``retries`` attempts, waiting up to 20 seconds per attempt.
    Returns True as soon as a click succeeds, False if every attempt fails.
    """
    attempts_left = retries
    while attempts_left > 0:
        attempts_left -= 1
        try:
            target = WebDriverWait(driver, 20).until(
                EC.element_to_be_clickable((by, value))
            )
            # Bring the element into the viewport before clicking so the
            # click is not intercepted by an off-screen position.
            driver.execute_script("arguments[0].scrollIntoView(true);", target)
            target.click()
            return True
        except Exception as exc:
            print(f"Error clicking element {value}: {exc}")
            time.sleep(2)  # brief back-off before the next attempt
    return False
# Open the NFRC website
driver.get("https://www.nfrc.co.uk/search-members")
# Click on "Search for a roofing contractor now"
wait_and_click(driver, By.LINK_TEXT, "Search for a roofing contractor now")
# Click to open the dropdown for contractor type selection
wait_and_click(driver, By.CLASS_NAME, "sfDropdownList")
# Select "For a Domestic Property" from the dropdown.
# BUG FIX: the original XPath began with '//[@id=...]', which is invalid
# XPath (the node test after '//' was missing) -- it must be '//*[@id=...]'.
wait_and_click(driver, By.XPATH, '//*[@id="MainContent_C001_contractorType"]/option[@value="CONTRACTOR_ADVANCED_DOMESTIC_PROPERTY_VIEW"]')
# Select "Pitched Roof" (first radio-button label in the roof-type list)
wait_and_click(driver, By.XPATH, '//*[@id="MainContent_C001_rblRoofType"]/li[1]/label')
# Proceed to the next step
wait_and_click(driver, By.ID, "MainContent_C001_btnSearch")
# BUG FIX: this comment was split across two lines in the original, leaving
# the bare words "clicking search button" on their own line -- a SyntaxError.
# Add a delay to ensure the page loads completely after clicking the search
# button.
time.sleep(5)  # Adjust the time delay as needed
我已经尝试过这段代码,它一直有效,直到结果列表出现为止,之后我就不知道该如何提取每个公司名称下的链接并翻页了。
这应该可以帮助您完成大部分工作。
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import NoSuchElementException
import time
# BUG FIX: this snippet needs its own import of ChromeDriverManager -- the
# driver line below used it without importing it in this script.
from webdriver_manager.chrome import ChromeDriverManager

options = webdriver.ChromeOptions()
options.add_argument("--start-maximized")  # open the browser maximized
# BUG FIX: the original passed `options=chrome_options`, a variable defined
# only in the first script; the options object created here is `options`.
driver = webdriver.Chrome(
    service=webdriver.chrome.service.Service(ChromeDriverManager().install()),
    options=options,
)
def wait_and_click(driver, by, value, retries=3):
    """Locate the element identified by ``(by, value)``, scroll to it and
    click it, retrying on any failure.

    Up to ``retries`` attempts are made, each waiting at most 20 seconds
    for clickability; returns True on the first successful click and
    False once all attempts are exhausted.
    """
    for _attempt in range(retries):
        try:
            el = WebDriverWait(driver, 20).until(
                EC.element_to_be_clickable((by, value))
            )
            # Scroll first so the click lands on a visible element.
            driver.execute_script("arguments[0].scrollIntoView(true);", el)
            el.click()
        except Exception as err:
            print(f"Error clicking element {value}: {err}")
            time.sleep(2)
            continue
        return True
    return False
# Open the NFRC members-search page.
driver.get("https://www.nfrc.co.uk/search-members")
# Follow the entry link to the contractor search form.
wait_and_click(driver, By.LINK_TEXT, "Search for a roofing contractor now")
# Open the contractor-type dropdown.
wait_and_click(driver, By.CLASS_NAME, "sfDropdownList")
# Pick "For a Domestic Property" by its option value (no id prefix needed --
# the value attribute alone is unique on the page).
wait_and_click(driver, By.XPATH, '//option[@value="CONTRACTOR_ADVANCED_DOMESTIC_PROPERTY_VIEW"]')
# Submit the search form.
wait_and_click(driver, By.ID, "MainContent_C001_btnSearch")
# Block until the results container exists before scraping begins.
WebDriverWait(driver,20).until(EC.presence_of_element_located((By.CSS_SELECTOR, ".search_results")))
import csv

# Accumulate every contractor website URL across all result pages so they
# can be dumped to a CSV file at the end (instead of only being printed).
collected_urls = []

try:
    while True:
        index = 0
        # Can't just iterate over a list of elements: after driver.back()
        # the previously located elements are stale, so re-query the result
        # links on every pass and track our position with `index`.
        while True:
            results = driver.find_elements(By.CSS_SELECTOR, "h3 > a")
            try:
                results[index].click()
            except IndexError:
                # Exhausted all results on this page.
                break
            try:
                link = driver.find_element(
                    By.CSS_SELECTOR,
                    "#ctl00_MainContent_C004_radListView_ctrl0_tr4 td.details_item > a",
                )
                url = link.get_attribute("href")
                print(url)
                collected_urls.append(url)
            except NoSuchElementException:
                # No URL on this contractor's page.
                pass
            # Return to the search results.
            driver.back()
            index += 1

        # The pager anchor for the CURRENT page has onclick == "return false;";
        # the one right after it is the "next page" link.
        paginate = driver.find_elements(
            By.CSS_SELECTOR, "#ctl00_MainContent_C001_radListView_radDataPager1 a"
        )
        moved = False
        for pos, page in enumerate(paginate):
            if page.get_attribute("onclick") == "return false;":
                try:
                    # BUG FIX: on the last page there is no element after the
                    # current one, so the original paginate[pos + 1] raised an
                    # uncaught IndexError and crashed. Catch it and fall
                    # through to the clean exit below.
                    next_page = paginate[pos + 1]
                    print("⏩ Move to next page!")
                    next_page.click()
                    moved = True
                except IndexError:
                    pass
                break
        if not moved:
            # Clean exit: no further pages to visit.
            break
except Exception as e:
    # Any unexpected failure (timeout, stale pager, ...) ends pagination
    # gracefully instead of losing everything collected so far.
    print(f"Stopping pagination early: {e}")
finally:
    # quit() (rather than close()) shuts down the whole browser session.
    driver.quit()

# Dump everything we collected to a CSV file.
with open("contractor_links.csv", "w", newline="", encoding="utf-8") as f:
    writer = csv.writer(f)
    writer.writerow(["url"])
    writer.writerows([u] for u in collected_urls)
这是搜索结果前几页的输出:
http://www.1stchoiceroofingbristol.co.uk/
https://www.1stroofingltd.co.uk/
https://www.3aroofing.co.uk/
http://www.aeelkins.co.uk/
⏩ Move to next page!
https://www.armstrongroofing.expert/
https://www.acwhyte.co.uk/
https://www.adlangandsons.co.uk/
https://www.aehughes.com/
https://www.aj-developments.co.uk/
⏩ Move to next page!
https://www.alkingroofing.co.uk/
http://www.sneddonroofing.co.uk/
https://www.taylorroofing.co.uk/
https://www.awilsonroofing.co.uk/
https://www.aaharrisonandsons.co.uk/
⏩ Move to next page!
https://www.abaulroofing.com/
https://abcroofinglimited.co.uk/
http://www.absroofing.co.uk/
https://www.aesroofing.co.uk/
http://www.a1feltroofing.co.uk/
⏩ Move to next page!
https://www.a1roofspec.co.uk/
https://www.aaaroofing.co.uk/
https://www.abbeyroofing.co.uk/
https://www.abbeymillhomes.co.uk/
http://www.acroofingservices.co.uk/
⏩ Move to next page!
我看到您包含了 `pandas`,所以想必您会想要累积 URL,然后在最后创建一个数据框,并可能将其转储到 CSV 文件中。
此外,主分页循环仍然需要一个干净的退出点。我没有时间等待它遍历所有结果页面。您应该捕获出现的任何异常事件,然后从无限分页循环中干净地退出。