这是用于测试的网址:https://stockx.com/puma?prices=300-400,200-300&size_types=men&years=2017
我能够提取所有产品详情页的 href 链接,但是最后只得到一个结果。
程序本应访问所有链接,并提取每个产品的名称和图片 URL。我在这里遗漏了什么?
当前输出的 JSON 结果如下:
[
{
"product_name": "Puma Clyde WWE Undertaker Black",
"imgurl": "https://stockx.imgix.net/Puma-Clyde-WWE-Undertaker-Black.png?fit=fill&bg=FFFFFF&w=700&h=500&auto=format,compress&q=90&dpr=2&trim=color&updated_at=1538080256"
}
]
这是我目前使用的代码:
import selenium
import json
import time
import re
import string
import requests
import bs4
from selenium import webdriver
from selenium.webdriver import Firefox
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.firefox.options import Options
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.wait import WebDriverWait
from selenium.webdriver.support.select import Select
# Listing page used as the scraping entry point. Per its query string it is the
# StockX Puma listing filtered by price ranges 300-400 and 200-300, men's sizes
# and year 2017.
domain = 'https://stockx.com/puma?prices=300-400,200-300&size_types=men&years=2017'
def prepare_driver(url):
    '''Start a Chrome driver, load ``url`` and wait for the page to render.

    The function blocks until an element with class ``title-container`` is
    present (at most 10 seconds after an initial 2 second pause).

    Args:
        url: Address of the page to open.

    Returns:
        The started WebDriver instance, positioned on ``url``. The caller is
        responsible for calling ``quit()`` on it.

    Raises:
        Whatever Selenium raises while loading or waiting (for example a
        timeout when ``title-container`` never appears). In that case the
        browser is shut down before the exception propagates.
    '''
    driver = webdriver.Chrome(executable_path='/Users/Documents/python/Selenium/bin/chromedriver')
    try:
        driver.get(url)
        time.sleep(2)
        WebDriverWait(driver, 10).until(EC.presence_of_element_located(
            (By.CLASS_NAME, 'title-container')))
    except BaseException:
        # The caller never receives the driver when we fail here, so nobody
        # else could quit it; close the browser ourselves, then re-raise.
        driver.quit()
        raise
    return driver
def fill_form(driver, search_argument):
    '''Block until the ``title-container`` elements are present on the page.

    The search-box interaction (typing ``search_argument`` into the ``q``
    field and clicking ``search__submit``) is currently disabled, so
    ``search_argument`` is accepted but not used.

    Args:
        driver: WebDriver whose current page is inspected.
        search_argument: Search text; unused at the moment.

    Returns:
        None.
    '''
    titles_present = EC.presence_of_all_elements_located(
        (By.CLASS_NAME, 'title-container'))
    # Polls for up to 10 seconds before Selenium gives up with a timeout.
    WebDriverWait(driver, timeout=10).until(titles_present)
def scrape_results(driver, n_results):
    '''Returns the data from n_results amount of results.

    Collects the detail-page link of every product tile on the page the
    driver is currently showing, prints the links, then visits the first
    ``n_results`` of them and scrapes each one.

    Args:
        driver: WebDriver already positioned on the listing page.
        n_results: Maximum number of product pages to visit. Values below
            zero are treated as zero.

    Returns:
        A list with one dict per visited product, as produced by
        ``scrape_product_data``. It holds fewer than ``n_results`` entries
        when the listing has fewer tiles.
    '''
    # Read every href up front: once the driver navigates to a product page
    # the tile elements of the listing page are no longer usable.
    product_urls = [
        tile.find_element_by_css_selector("a[href*='/']").get_attribute('href')
        for tile in driver.find_elements_by_css_selector("div[class*='tile browse-tile']")
    ]
    print(*product_urls, sep="\n")

    product_data = []
    # Slicing (instead of indexing by range) avoids an IndexError when fewer
    # links than requested were found.
    for product_url in product_urls[:max(n_results, 0)]:
        product_data.append(scrape_product_data(driver, product_url))
    # The return must sit after the loop; returning from inside it yields
    # only the first product.
    return product_data
def scrape_product_data(driver, product_url):
    '''Visits a product page and extracts the data.

    Args:
        driver: WebDriver to reuse, or ``None`` to start a temporary one via
            ``prepare_driver`` for this single call.
        product_url: Address of the product detail page.

    Returns:
        A dict with the keys ``product_name`` (text of the page heading) and
        ``imgurl`` (``src`` of the product image).

    Raises:
        Whatever Selenium raises when the page cannot be loaded or one of
        the two elements is missing.
    '''
    owns_driver = driver is None
    if owns_driver:
        # prepare_driver() already loads product_url, so no second get().
        driver = prepare_driver(product_url)
    else:
        driver.get(product_url)
    try:
        # Fixed pause kept from the original flow to let the page finish
        # rendering before the elements are looked up.
        time.sleep(12)
        product_fields = dict()
        # Get the product name
        product_fields['product_name'] = driver.find_element_by_xpath(
            '//div[@class="col-md-12"]/h1').text
        # Get the image url
        product_fields['imgurl'] = driver.find_element_by_xpath(
            '//img[@class="product-image"]').get_attribute('src')
        return product_fields
    finally:
        # A driver created here is invisible to the caller, so it has to be
        # closed here; a driver passed in stays open for further use.
        if owns_driver:
            driver.quit()
if __name__ == '__main__':
    # Start the driver before entering ``try``: if start-up fails there is no
    # ``driver`` name yet, and a ``finally`` that calls driver.quit() would
    # raise NameError and hide the real error.
    driver = prepare_driver(domain)
    try:
        # The search-form step (fill_form) is currently not used.
        product_data = scrape_results(driver, 4)
        # NOTE: json.dumps escapes non-ASCII characters by default; passing
        # ensure_ascii=False would write them unescaped instead.
        product_data = json.dumps(product_data, indent=4)
        with open('booking_data_stockx.json', 'w') as f:
            f.write(product_data)
    finally:
        # Always close the browser, whether scraping succeeded or failed.
        driver.quit()
我认为您可以只用 requests 完成整个过程。我从访问过的各个页面中随机选取了一些字段,以证明这些页面确实被访问过。
import requests
from bs4 import BeautifulSoup as bs
import pandas as pd
# Scrape the StockX listing with plain HTTP requests: read every product link
# from the listing page, visit each product page, and write the collected
# rows to a CSV file.
baseURL = 'https://stockx.com'
final = []
# Seconds to wait for a response; without a timeout a stalled connection
# would block the script indefinitely.
REQUEST_TIMEOUT = 30
with requests.Session() as s:
    res = s.get('https://stockx.com/puma?prices=300-400,200-300&size_types=men&years=2017',
                timeout=REQUEST_TIMEOUT)
    # Nothing can be scraped without the listing, so fail loudly on an HTTP
    # error instead of parsing an error page.
    res.raise_for_status()
    soup = bs(res.content, 'lxml')
    # Every element carrying an href inside the products container.
    items = soup.select('#products-container [href]')
    titles = [item['id'] for item in items]
    links = [baseURL + item['href'] for item in items]
    results = list(zip(titles, links))
    # Table of (id, link) pairs; not used further in the code shown here.
    df = pd.DataFrame(results)
    for title, link in results:
        res = s.get(link, timeout=REQUEST_TIMEOUT)
        soup = bs(res.content, 'lxml')
        # Text of every element with class "detail" on the product page.
        details = [item.text for item in soup.select('.detail')]
        final.append([title, link, details])
# One row per product: id, link, list of detail texts.
df2 = pd.DataFrame(final)
df2.to_csv(r'C:\Users\User\Desktop\data.csv', sep=',', encoding='utf-8',index = False )