我试图从网站上抓取产品(名称,价格,img,url),但因为价格值可能是字符串货币(例如:1.000.000đ)或“Giá Liên Hệ”,所以我想检查价格是否=“Giá Liên Hệ”然后价格 = 0 但它似乎返回重复的项目。
我的数据类(可能需要将 current_price 更改为 Decimal):
@dataclass
class Item:
url: str
name: str
current_price: str
place: str
img: str
date_add: datetime.datetime
我的功能:
def cellphones(query):
lists = []
search = query.replace(' ', '%20')
url = f"https://cellphones.com.vn/catalogsearch/result?q={search}"
driver.get(url)
content = driver.find_element(By.CSS_SELECTOR, "div[id*='search-catalog-page']")
items = content.find_elements(By.CSS_SELECTOR, "div[class*='product-info']")
for _ in items:
if _.find_element(By.CSS_SELECTOR, "p[class*='product__price--show']").text == "Giá Liên Hệ":
item = Item(
url=_.find_element(By.CSS_SELECTOR, "a").get_attribute('href'),
name=_.find_element(By.CSS_SELECTOR, "h3").text,
current_price=Decimal("0"),
place="Cellphones",
img=_.find_element(By.CSS_SELECTOR, "img").get_attribute('src'),
date_add=datetime.datetime.now()
)
else:
item = Item(
url=_.find_element(By.CSS_SELECTOR, "a").get_attribute('href'),
name=_.find_element(By.CSS_SELECTOR, "h3").text,
current_price=Decimal(_.find_element(By.CSS_SELECTOR, "p[class*='product__price--show']").text.replace("₫", "").replace(".","").replace(" ","")),
place="Cellphones",
img=_.find_element(By.CSS_SELECTOR, "img").get_attribute('src'),
date_add=datetime.datetime.now()
)
item, created = Product.objects.update_or_create(
name=item.name,
place=item.place,
defaults={
'current_price': item.current_price,
'url': item.url,
'img': item.img,
'date_add': item.date_add
}
)
lists.append(item)
return lists
我的模板:
{% block content %}
<p>You searched '<span>{{context.name}}</span>'</p>
<p>Return {{lists|length}} products.</p>
<ul class="d-flex flex-row flex-wrap align-content-center justify-content-around align-items-center list-unstyled">
{% for p in lists %}
<li>
<div class="card h-100 mt-2 border-dark mb-3" style="max-width: 18rem;">
<img src="{{p.img}}" class="card-img-top" style="height:17.813em;width:auto;" alt="product image">
<div class="card-body">
<h5 class="card-title">{{p.name}}</h5>
{% if p.current_price == 0 %}
<p class="card-text">Giá Liên Hệ</p>
{% else %}
<p class="card-text">{{p.current_price|intcomma}}</p>
{% endif %}
<p class="card-text">Store: {{p.place}}</p>
<p class="card-text">Add at {{p.date_add}}</p>
<a href="{{p.url}}" target="_blank" class="btn btn-primary">Go to store</a>
</div>
</div>
</li>
{% endfor %}
</ul>{% endblock %}
好吧,我放弃尝试为此寻找干净的解决方案,所以我只是检查 item.name 是否存在,如果不存在,则将其添加到列表中
def cellphones(query):
lists = []
search = query.replace(' ', '%20')
url = f"https://cellphones.com.vn/catalogsearch/result?q={search}"
driver.get(url)
content = driver.find_element(
By.CSS_SELECTOR, "div[id*='search-catalog-page']")
items = content.find_elements(
By.CSS_SELECTOR, "div[class*='product-info']")
print(items)
for _ in items:
try:
a = _.find_element(By.CSS_SELECTOR, "a").get_attribute('href')
b = _.find_element(By.CSS_SELECTOR, "h3").text
c = _.find_element(By.CSS_SELECTOR, "p[class*='product__price--show']").text
d = _.find_element(By.CSS_SELECTOR, "img").get_attribute('src')
e = datetime.datetime.now(tz=timezone.utc)
if c == "Giá Liên Hệ":
c = Decimal("0")
else:
c = Decimal(c.replace("₫", "").replace(".","").replace(" ",""))
newItem = Item(
url=a,
name=b,
current_price=c,
place="Cellphones",
img=d,
date_add=e
)
print(newItem)
item, created = Product.objects.update_or_create(
name=newItem.name,
place=newItem.place,
defaults={
'current_price': newItem.current_price,
'url': newItem.url,
'img': newItem.img,
'date_add': newItem.date_add
}
)
print(item)
print(created)
if any(obj.name == item.name for obj in lists):
pass
else:
lists.append(item)
except:
pass
return lists