我正在编写一个 Selenium 脚本来搜索 https://ssllc.com/ 。这段代码似乎不太可靠，只是有时才有效。
# Search ssllc.com by typing each query into the site's search box and
# printing the text of the rendered result list.
from selenium.webdriver import Chrome
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.chrome.options import Options as ChromeOptions  # Import ChromeOptions here

# Define the base URL of the website
base_url = 'https://www.ssllc.com'
# CSS selector of the search input on the landing page
search_bar = '#page-top > div > div > div > div:nth-child(1) > input[type=text]'
# CSS selector of the <ul> that holds the search hits (note the trailing space
# is part of the original selector string)
items = '#gatsby-focus-wrapper > div.root > div > div > div > div.medium-8.columns.main-content > div.ais-Hits > ul '

# Define the list of search queries
search_queries = [
    'Unused+Sartorius+1000+Liter+BIOSTAT+CultiBag+STR+Single+Use+Bioreactor',
    '3+x+V5/XCell+Repigen+Next+Gen+ATF+controllers',
    'InSite+Integrity+Tester',
]

# Configure ChromeOptions
options = ChromeOptions()
options.add_argument('--headless')  # Optional: Run Chrome in headless mode for faster execution

try:
    for search_query in search_queries:
        # Initialize Chrome WebDriver for each search; the context manager
        # quits the browser when the block exits.
        with Chrome(options=options) as driver:
            # Load the website
            driver.get(base_url)

            # Find the search input field and enter the query
            search_input = WebDriverWait(driver, 10).until(
                EC.visibility_of_element_located((By.CSS_SELECTOR, search_bar))
            )
            search_input.clear()
            search_input.send_keys(search_query)
            search_input.send_keys(Keys.RETURN)

            # Wait for the search results to load
            WebDriverWait(driver, 10).until(
                EC.presence_of_element_located((By.CSS_SELECTOR, items))
            )

            # Extract and print search results
            search_results = driver.find_elements(By.CSS_SELECTOR, items)
            if search_results:
                print(f"Search results for '{search_query}':")
                for result in search_results:
                    # Get the link to the search result
                    #result_link = result.find_element(By.TAG_NAME, 'a').get_attribute('href')
                    #print(f"{result.text.strip()} - {result_link}")
                    print(result.text.strip())
                print()
            else:
                print(f"No search results found for '{search_query}'")
except Exception as e:
    # Any failure (including a wait timeout) aborts the remaining queries.
    print("An error occurred:", e)
我尝试添加一个我确定网站上存在的新项目，但它没有出现。例如，生物反应器没有出现，而完整性测试仪却出现了，尽管两者都可以在网站上找到。此外，这段代码的运行结果也不稳定，有时根本不显示任何结果。我认为这是网站过载造成的。我的目标是能够搜索任意关键字，并列出所有相关结果。
您的结果可能“不可靠”,因为该网站正在限制您。
所有这些警告都表明由于 API 过载而未返回结果。
我注意到，实际上您可以通过在 URL 中使用 `query` 参数直接转到搜索结果。这似乎比使用页面上的搜索框更稳健。
我还引入了一些延迟,因为你想避免受到限制。
from selenium.webdriver import Chrome
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.chrome.options import Options as ChromeOptions
import traceback
import time
import logging
import json
# Run each search by loading the site's search URL directly, collect the text
# of every hit, and write all results to search-results.json.

# Log our own progress at DEBUG level, but only warnings and above from the
# Selenium and urllib3 loggers.
logging.basicConfig(
    level=logging.DEBUG,
    format='%(asctime)s [%(levelname)7s] %(message)s',
)
logging.getLogger("selenium").setLevel(logging.WARNING)
logging.getLogger("urllib3").setLevel(logging.WARNING)

BASE_URL = "https://www.ssllc.com"
SEARCH_QUERIES = [
    "Unused+Sartorius+1000+Liter+BIOSTAT+CultiBag+STR+Single+Use+Bioreactor",
    "3+x+V5/XCell+Repigen+Next+Gen+ATF+controllers",
    "InSite+Integrity+Tester",
]
# Seconds to wait for the hit list element to appear on the results page.
WAIT_TIMEOUT = 10
# Seconds to pause between consecutive searches, to avoid being throttled.
DELAY_BETWEEN_SEARCHES = 60
# One {"search": ..., "results": [...]} entry per completed query.
RESULTS = []

options = ChromeOptions()
# options.add_argument('--headless')

# Don't instantiate browser inside loop. Once is enough!
with Chrome(options=options) as driver:
    # Just open the base URL once too.
    driver.get(BASE_URL)
    try:
        for index, search_query in enumerate(SEARCH_QUERIES):
            # Pause before every query except the first, so no time is
            # wasted sleeping after the final search has completed.
            if index:
                time.sleep(DELAY_BETWEEN_SEARCHES)

            logging.info("🟦 Search term: %s", search_query)
            # The queries are already '+'-separated, so they are appended
            # to the URL as-is.
            query_url = BASE_URL + "/search/?query=" + search_query
            driver.get(query_url)

            logging.debug("- Wait for results to load.")
            WebDriverWait(driver, WAIT_TIMEOUT).until(
                EC.presence_of_element_located((
                    By.CSS_SELECTOR,
                    ".ais-Hits > ul.ais-Hits-list",
                ))
            )

            logging.debug("- Extract results.")
            search_results = driver.find_elements(
                By.CSS_SELECTOR,
                ".ais-Hits > ul.ais-Hits-list > li",
            )
            if search_results:
                logging.info("✅ Search results (%d items).", len(search_results))
            else:
                logging.warning("🚨 No search results found.")
            # An empty hit list yields an empty "results" list.
            results = [result.text.strip() for result in search_results]

            RESULTS.append({
                "search": search_query,
                "results": results,
            })
    except Exception as e:
        # Top-level boundary: log the message together with the traceback,
        # then fall through so results gathered so far are still saved.
        logging.exception("An error occurred:%s", e)

# Write whatever was collected, even if the loop stopped early.
with open("search-results.json", "wt", encoding="utf-8") as fid:
    json.dump(RESULTS, fid)
这是运行日志:
2024-02-24 08:06:47,910 [ INFO] 🟦 Search term: Unused+Sartorius+1000+Liter+BIOSTAT+CultiBag+STR+Single+Use+Bioreactor
2024-02-24 08:06:50,701 [ DEBUG] - Wait for results to load.
2024-02-24 08:06:50,721 [ DEBUG] - Extract results.
2024-02-24 08:06:50,740 [ INFO] ✅ Search results (4 items).
2024-02-24 08:08:10,910 [ INFO] 🟦 Search term: 3+x+V5/XCell+Repigen+Next+Gen+ATF+controllers
2024-02-24 08:08:12,979 [ DEBUG] - Wait for results to load.
2024-02-24 08:08:12,994 [ DEBUG] - Extract results.
2024-02-24 08:08:13,014 [WARNING] 🚨 No search results found.
2024-02-24 08:09:33,014 [ INFO] 🟦 Search term: InSite+Integrity+Tester
2024-02-24 08:09:35,355 [ DEBUG] - Wait for results to load.
2024-02-24 08:09:35,370 [ DEBUG] - Extract results.
2024-02-24 08:09:35,383 [ INFO] ✅ Search results (3 items).
我将结果写入 JSON 文件。
[
{
"search":
"Unused+Sartorius+1000+Liter+BIOSTAT+CultiBag+STR+Single+Use+Bioreactor",
"results": [
"Unused Sartorius 1000 Liter BIOSTAT CultiBag STR Single Use
Bioreactor\nManufacturer: Sartorius\nProduct Model #: 1000L Single-Use
Bioreactor\nProduct Code: 337327\nUnused Sartorius 1000 Liter BIOSTAT CultiBag
STR Single Use Bioreactor for sale. SSLLC offers a wide selection of used
Bioreactors / Fermenters for your Used Lab Equipment needs.\nVIEW DETAILS",
"Unused Sartorius 1000 Liter BIOSTAT CultiBag STR Single Use
Bioreactor\nManufacturer: Sartorius\nProduct Model #: 1000L Single-Use
Bioreactor\nProduct Code: 337326\nUnused Sartorius 1000 Liter BIOSTAT CultiBag
STR Single Use Bioreactor for sale. SSLLC offers a wide selection of used
Bioreactors / Fermenters for your Used Lab Equipment needs.\nVIEW DETAILS",
"Unused Sartorius 1000 Liter BIOSTAT CultiBag STR Single Use
Bioreactor\nManufacturer: Sartorius\nProduct Model #: 1000L Single-Use
Bioreactor\nProduct Code: 337325\nUnused Sartorius 1000 Liter BIOSTAT CultiBag
STR Single Use Bioreactor for sale. SSLLC offers a wide selection of used
Bioreactors / Fermenters for your Used Lab Equipment needs.\nVIEW DETAILS",
"Unused Sartorius 1000 Liter BIOSTAT CultiBag STR Single Use
Bioreactor\nManufacturer: Sartorius\nProduct Model #: 1000L Single-Use
Bioreactor\nProduct Code: 337329\nUnused Sartorius 1000 Liter BIOSTAT CultiBag
STR Single Use Bioreactor for sale. SSLLC offers a wide selection of used
Bioreactors / Fermenters for your Used Lab Equipment needs.\nVIEW DETAILS"
]
},
{
"search": "3+x+V5/XCell+Repigen+Next+Gen+ATF+controllers",
"results": []
},
{
"search": "InSite+Integrity+Tester",
"results": [
"Unused Thermo Scientific inSITE Integrity Tester\nManufacturer: Thermo
Fisher Scientific\nProduct Model #: 30-IN-1052 RG\nProduct Code: 332788\nUsed
Unused Thermo Scientific 30-IN-1001 Filter Integrity Tester for sale. SSLLC
offers a wide selection of used Analyzers for your Used Lab Equipment
needs.\nVIEW DETAILS",
"Unused Thermo Scientific inSITE Integrity Tester\nManufacturer: Thermo
Fisher Scientific\nProduct Model #: 30-IN-1052 RG\nProduct Code: 332787\nUsed
Unused Thermo Scientific 30-IN-1001 Filter Integrity Tester for sale. SSLLC
offers a wide selection of used Analyzers for your Used Lab Equipment
needs.\nVIEW DETAILS",
"Unused Thermo Scientific inSITE Integrity Tester\nManufacturer: Thermo
Fisher Scientific\nProduct Model #: 30-IN-1052 RG\nProduct Code: 332786\nUnused
Thermo Scientific 30-IN-1001 Filter Integrity Tester for sale. SSLLC offers a
wide selection of used Analyzers for your Used Lab Equipment needs.\nVIEW
DETAILS"
]
}
]