I'm building a web scraper that pulls all the data for the roughly 1,500 products on this page.
I'm extracting the product name, whether it's a bestseller, the type of clothing, how many colours it comes in, and the price, all from the same link with BeautifulSoup.
for item in items:
    name = item.find('a', class_='product-card__link-overlay').text.strip()
    try:
        special_tag = item.find('div', class_='product-card__messaging accent--color').text.strip()
    except AttributeError:  # tag is missing on cards without a bestseller label
        special_tag = '/'
    productclass = item.find('div', class_='product-card__subtitle').text.strip()
    colours = item.find('div', class_='product-card__product-count').text.strip()
    try:
        price = item.find('div', class_='product-price us__styling is--current-price css-11s12ax').text.strip()
    except AttributeError:  # fall back to the alternative price markup
        price = item.find('div', class_='product-price is--current-price css-1ydfahe').text.strip()
    product = {'name': name, 'special': special_tag, 'class': productclass, 'colours': colours, 'price': price}
    sh.append_row([str(product['name']), str(product['special']), str(product['class']), str(product['colours']), str(product['price'])])
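(As an aside, the repeated try/except blocks can be collapsed into a small helper. A minimal sketch; safe_text is a name I'm introducing here for illustration, not part of my actual script:)

def safe_text(parent, tag, cls, default='/'):
    # return the stripped text of the first matching tag, or the default if it's missing
    found = parent.find(tag, class_=cls)
    return found.text.strip() if found is not None else default

# e.g. special_tag = safe_text(item, 'div', 'product-card__messaging accent--color')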
To be able to extract everything, I use Selenium to scroll through the page and load it completely.
time.sleep(3)
previous_height = driver.execute_script('return document.body.scrollHeight')
while True:
    driver.execute_script('window.scrollTo(0, document.body.scrollHeight);')
    time.sleep(3)
    new_height = driver.execute_script('return document.body.scrollHeight')
    if new_height == previous_height:
        page_source = driver.page_source
        break
    previous_height = new_height
Something goes wrong once I grab the page source and feed it to BeautifulSoup. I've tried everything I could think of and researched it online, but it still doesn't work: the program extracts only about 65-70 products and then suddenly stops.
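To illustrate: a quick count of the parsed cards (a sketch, assuming the same page_source and class name as above) shows how few cards actually made it into the HTML:

from bs4 import BeautifulSoup

soup = BeautifulSoup(page_source, 'lxml')
print(len(soup.find_all('div', class_='product-card__body')))  # ~65-70, not ~1500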
The whole code, just for reference:
from bs4 import BeautifulSoup
import gspread
import time
from selenium import webdriver
from selenium.webdriver.chrome.service import Service as ChromeService
from webdriver_manager.chrome import ChromeDriverManager
driver = webdriver.Chrome(service=ChromeService(ChromeDriverManager().install()))
gc = gspread.service_account(filename='creds.json')
sh = gc.open('Nike catalog').sheet1
driver.get('https://www.nike.com/w/mens-clothing-6ymx6znik1')
#Scroll program
time.sleep(3)
previous_height = driver.execute_script('return document.body.scrollHeight')
while True:
    driver.execute_script('window.scrollTo(0, document.body.scrollHeight);')
    time.sleep(3)
    new_height = driver.execute_script('return document.body.scrollHeight')
    if new_height == previous_height:
        page_source = driver.page_source
        break
    previous_height = new_height
#Main program
baseurl = 'https://www.nike.com/w/mens-clothing-6ymx6znik1'  # note: baseurl and headers are unused below
headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/130.0.0.0 Safari/537.36'}
soup = BeautifulSoup(page_source, 'lxml')
items = soup.find_all('div', class_='product-card__body')
#HTML parser
for item in items:
    name = item.find('a', class_='product-card__link-overlay').text.strip()
    try:
        special_tag = item.find('div', class_='product-card__messaging accent--color').text.strip()
    except AttributeError:  # tag is missing on cards without a bestseller label
        special_tag = '/'
    productclass = item.find('div', class_='product-card__subtitle').text.strip()
    colours = item.find('div', class_='product-card__product-count').text.strip()
    try:
        price = item.find('div', class_='product-price us__styling is--current-price css-11s12ax').text.strip()
    except AttributeError:  # fall back to the alternative price markup
        price = item.find('div', class_='product-price is--current-price css-1ydfahe').text.strip()
    product = {'name': name, 'special': special_tag, 'class': productclass, 'colours': colours, 'price': price}
    sh.append_row([str(product['name']), str(product['special']), str(product['class']), str(product['colours']), str(product['price'])])
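(A side note, separate from the loading problem: each append_row call is its own Sheets API request, so ~1,500 of them is slow and can trip Google's rate limits. Batching the writes with gspread's append_rows avoids that. A sketch, assuming the product dicts are first collected into a list I'm calling products:)

products = []  # fill inside the scraping loop via products.append(product), instead of sh.append_row
rows = [[p['name'], p['special'], p['class'], p['colours'], p['price']] for p in products]
if rows:
    sh.append_rows(rows)  # a single batched API call instead of one request per product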
I went through the entire loaded page and made sure the HTML classes and markup are the same for every product. Maybe it's some kind of block Nike has built into their site; I'm not using any proxies. I also made sure the exceptions are handled, but that didn't help.
I have no idea how to fix this. Has anyone run into the same problem or had a similar experience? If you have, and you managed to solve it, I'd be grateful for any pointers. Thanks in advance.
While testing this with my own program, I observed that the loading of the "cards" stops once the card count reaches 48. That happens when the PAGE_DOWN action fires 10 times every 3 seconds. So loading appears to stop if we reach the bottom of the view before the new cards have loaded.
The timings set in the test program below work reliably. A full run takes about 10 minutes.
import lxml.html
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.action_chains import ActionChains
from selenium.webdriver.common.keys import Keys
from time import sleep
from datetime import datetime
start_time = datetime.now().strftime('%y-%m-%d %H:%M:%S')
driver = webdriver.Firefox()
#driver = webdriver.Chrome()
driver.implicitly_wait(5)
actions = ActionChains(driver)
url='https://www.nike.com/w/mens-clothing-6ymx6znik1'
driver.get(url)
# read the total product count shown in the page header, e.g. "(1500)"
card_num_elm = WebDriverWait(driver, 30).until(EC.presence_of_element_located((By.XPATH, "//div[@class='wall-header__content']/h1/span")))
card_num = int(card_num_elm.text.strip('()'))
print('\ntop expression: ', card_num, '\n')
# load more cards by scrolling down slowly enough for the lazy loader to keep up
pre_count = 0
same_count_times = 0
for i in range(150):
    for pd in range(4):
        actions.send_keys(Keys.PAGE_DOWN).perform()
        sleep(1)
    sleep(2)
    cards = WebDriverWait(driver, 20).until(EC.presence_of_all_elements_located((By.XPATH, "//div[@class='product-card__body']")))
    new_count = len(cards)
    print('len(cards): ', new_count)
    #if new_count >= card_num - 1:
    #    break
    # stop once the card count has stayed the same for several rounds in a row
    if new_count == pre_count:
        same_count_times += 1
    else:
        same_count_times = 0
    pre_count = new_count
    if same_count_times > 3:
        break
# get the whole HTML (via Selenium)
page = driver.page_source
# parse it with lxml
p_root = lxml.html.fromstring(page)
list_elements = p_root.xpath("//div[@class='product-card__body']")
new_count = len(list_elements)
print('\nnumber on the lxml: ', new_count)
print('\nstarted at : ', start_time)
print('ended at   : ', datetime.today().strftime('%y-%m-%d %H:%M:%S'), '\n')
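The same idea can also be folded back into your original scrollTo loop by scrolling in small increments instead of jumping straight to the bottom. A rough sketch, assuming your existing driver; the 800 px step and the sleep lengths are arbitrary guesses, not tuned against Nike's loader:

import time

position = 0
while True:
    height = driver.execute_script('return document.body.scrollHeight')
    if position >= height:
        # at the bottom: give the lazy loader one more chance before stopping
        time.sleep(3)
        if driver.execute_script('return document.body.scrollHeight') == height:
            break
    position += 800  # small steps, so new cards can load before we hit the bottom
    driver.execute_script(f'window.scrollTo(0, {position});')
    time.sleep(1)

page_source = driver.page_source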