我想抓取隐藏在_doPostBack链接下的公司链接

问题描述 投票:0回答:1

我有这个网站(https://www.nfrc.co.uk/search-members),我想点击“立即搜索屋顶承包商”,然后点击“国内房产”,然后点击“搜索”。出现很多页面,我只需要隐藏在所有公司名称下的链接并浏览所有页面。我没有编码背景。如果有人向我提供 python 代码来抓取这些链接,我将不胜感激。

from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from webdriver_manager.chrome import ChromeDriverManager
from bs4 import BeautifulSoup
import pandas as pd
import time

# Configure Chrome so that the browser window opens maximized.
chrome_options = webdriver.ChromeOptions()
chrome_options.add_argument("--start-maximized")

# Start Chrome through Selenium, using the driver binary that
# webdriver_manager's ChromeDriverManager installs.
driver = webdriver.Chrome(
    service=webdriver.chrome.service.Service(ChromeDriverManager().install()),
    options=chrome_options,
)

# Empty list that will hold the scraped company data.
company_data = []

# Function to wait and click with retry
def wait_and_click(driver, by, value, retries=3):
    """Wait for an element to be clickable, scroll it into view and click it.

    Args:
        driver: Selenium WebDriver used to locate the element and run the
            scroll script.
        by: Locator strategy (for example ``By.ID``).
        value: Locator value paired with ``by``.
        retries: Maximum number of attempts before giving up.

    Returns:
        True as soon as one attempt clicks the element; False when every
        attempt raised, or when ``retries`` allows no attempt at all.
    """
    for _ in range(retries):
        try:
            # Wait (timeout argument: 20) until the element is clickable.
            element = WebDriverWait(driver, 20).until(EC.element_to_be_clickable((by, value)))
            driver.execute_script("arguments[0].scrollIntoView(true);", element)
            element.click()
            return True
        except Exception as e:
            # Best effort: report whatever went wrong and try again.
            print(f"Error clicking element {value}: {e}")
            time.sleep(2)  # Wait before retrying
    return False

    # Open the NFRC website
driver.get("https://www.nfrc.co.uk/search-members")

# Click on "Search for a roofing contractor now"
wait_and_click(driver, By.LINK_TEXT, "Search for a roofing contractor now")

# Click to open the dropdown for contractor type selection
dropdown_locator = (By.CLASS_NAME, "sfDropdownList")
wait_and_click(driver, *dropdown_locator)

# Select "For a Domestic Property" from the dropdown.
# An XPath step needs a node test before its predicate, hence "//*[...]";
# "//[...]" is not a valid expression.
wait_and_click(driver, By.XPATH, '//*[@id="MainContent_C001_contractorType"]/option[@value="CONTRACTOR_ADVANCED_DOMESTIC_PROPERTY_VIEW"]')

# Select "Pitched Roof"
wait_and_click(driver, By.XPATH, '//*[@id="MainContent_C001_rblRoofType"]/li[1]/label')

# Proceed to the next step
wait_and_click(driver, By.ID, "MainContent_C001_btnSearch")

# Add a delay to ensure the page loads completely after clicking search button
time.sleep(5)  # Adjust the time delay as needed

我已经尝试过这段代码,它在搜索结果列表出现之前都能正常运行,但之后我不知道该如何继续。

python web-scraping
1个回答
0
投票

这应该可以帮助您完成大部分工作。

import time

from selenium import webdriver
from selenium.common.exceptions import NoSuchElementException
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait
from webdriver_manager.chrome import ChromeDriverManager

# Configure Chrome so that the browser window opens maximized.
options = webdriver.ChromeOptions()
options.add_argument("--start-maximized")

# Pass the options object built above; the name `chrome_options` is not
# defined in this snippet.
driver = webdriver.Chrome(
    service=webdriver.chrome.service.Service(ChromeDriverManager().install()),
    options=options,
)

def wait_and_click(driver, by, value, retries=3):
    """Click the element found via ``(by, value)``, retrying on any error.

    Each attempt waits for the element to be clickable, scrolls it into
    view with a script and clicks it. A failed attempt prints the error
    and sleeps two seconds before the next one.

    Args:
        driver: Selenium WebDriver used for the wait and the scroll script.
        by: Locator strategy.
        value: Locator value paired with ``by``.
        retries: Maximum number of attempts.

    Returns:
        True once a click succeeds, False if no attempt succeeded.
    """
    locator = (by, value)
    for _attempt in range(retries):
        try:
            clickable = EC.element_to_be_clickable(locator)
            target = WebDriverWait(driver, 20).until(clickable)
            driver.execute_script("arguments[0].scrollIntoView(true);", target)
            target.click()
        except Exception as e:
            print(f"Error clicking element {value}: {e}")
            time.sleep(2)
        else:
            return True
    return False

# Contractor website URLs collected so far, in the order they were found.
company_links = []

try:
    driver.get("https://www.nfrc.co.uk/search-members")

    # Work through the search form, then submit it.
    wait_and_click(driver, By.LINK_TEXT, "Search for a roofing contractor now")
    wait_and_click(driver, By.CLASS_NAME, "sfDropdownList")
    wait_and_click(driver, By.XPATH, '//option[@value="CONTRACTOR_ADVANCED_DOMESTIC_PROPERTY_VIEW"]')
    wait_and_click(driver, By.ID, "MainContent_C001_btnSearch")

    WebDriverWait(driver, 20).until(EC.presence_of_element_located((By.CSS_SELECTOR, ".search_results")))

    while True:
        index = 0

        # Can't just iterate over one list of elements: after visiting a
        # contractor page and coming back, the previously located elements
        # are stale, so the list is re-queried on every iteration.
        while True:
            results = driver.find_elements(By.CSS_SELECTOR, "h3 > a")
            if index >= len(results):
                # Exhausted all results on page.
                break

            results[index].click()

            try:
                link = driver.find_element(By.CSS_SELECTOR, "#ctl00_MainContent_C004_radListView_ctrl0_tr4 td.details_item > a")
            except NoSuchElementException:
                # No URL on contractor page.
                pass
            else:
                url = link.get_attribute("href")
                company_links.append(url)
                print(url)

            # Return to search results.
            driver.back()

            index += 1

        paginate = driver.find_elements(By.CSS_SELECTOR, "#ctl00_MainContent_C001_radListView_radDataPager1 a")

        # The pager link whose onclick is "return false;" is treated as the
        # current page; the link right after it is clicked to advance.
        current = next(
            (
                position
                for position, page in enumerate(paginate)
                if page.get_attribute("onclick") == "return false;"
            ),
            None,
        )

        # Clean exit: stop when there is no current-page marker or no link
        # after it, instead of raising IndexError or looping forever.
        # NOTE(review): if the pager keeps a trailing "next" control on the
        # last page, an extra stop condition may be needed - confirm on site.
        if current is None or current + 1 >= len(paginate):
            break

        print("⏩ Move to next page!")
        paginate[current + 1].click()
finally:
    # Always end the browser session, even when scraping fails part-way.
    driver.quit()

这是搜索结果前几页的输出:

http://www.1stchoiceroofingbristol.co.uk/                                                                                                                                                                                               
https://www.1stroofingltd.co.uk/                                                                                                                                                                                                        
https://www.3aroofing.co.uk/                                                                                                                                                                                                            
http://www.aeelkins.co.uk/                                                                                                                                                                                                              
⏩ Move to next page!                                                                                                                                                                                                                   
https://www.armstrongroofing.expert/                                                                                                                                                                                                    
https://www.acwhyte.co.uk/                                                                                                                                                                                                              
https://www.adlangandsons.co.uk/                                                                                                                                                                                                        
https://www.aehughes.com/                                                                                                                                                                                                               
https://www.aj-developments.co.uk/                                                                                                                                                                                                      
⏩ Move to next page!                                                                                                                                                                                                                   
https://www.alkingroofing.co.uk/                                                                                                                                                                                                        
http://www.sneddonroofing.co.uk/                                                                                                                                                                                                        
https://www.taylorroofing.co.uk/                                                                                                                                                                                                        
https://www.awilsonroofing.co.uk/                                                                                                                                                                                                       
https://www.aaharrisonandsons.co.uk/                                                                                                                                                                                                    
⏩ Move to next page!                                                                                                                                                                                                                   
https://www.abaulroofing.com/                                                                                                                                                                                                           
https://abcroofinglimited.co.uk/                                                                                                                                                                                                        
http://www.absroofing.co.uk/                                                                                                                                                                                                            
https://www.aesroofing.co.uk/                                                                                                                                                                                                           
http://www.a1feltroofing.co.uk/                                                                                                                                                                                                         
⏩ Move to next page!                                                                                                                                                                                                                   
https://www.a1roofspec.co.uk/                                                                                                                                                                                                           
https://www.aaaroofing.co.uk/                                                                                                                                                                                                           
https://www.abbeyroofing.co.uk/                                                                                                                                                                                                         
https://www.abbeymillhomes.co.uk/                                                                                                                                                                                                       
http://www.acroofingservices.co.uk/                                                                                                                                                                                                     
⏩ Move to next page!

我看到您导入了 `pandas`,所以想必您会想要累积 URL,然后在最后创建一个数据框,并可能将其转储到 CSV 文件中。

此外,主分页循环仍然需要一个干净的退出点。我没有时间等待它遍历所有结果页面。您应该捕获出现的任何异常事件,然后从无限分页循环中干净地退出。

© www.soinside.com 2019 - 2024. All rights reserved.