I'm trying to write an async parser for a local store's website, but the results are unstable. It should fetch ~11k items, but sometimes it fetches a random number of them without raising any exception.
What could the problem be, and how can I catch/log it?
import asyncio
import aiohttp
from bs4 import BeautifulSoup

URL = 'https://shop.samberi.com'
HEADERS = {
    'Accept': '*/*',
    'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) '
                  'AppleWebKit/537.36 (KHTML, like Gecko) '
                  'Chrome/101.0.4951.54 Safari/537.36'
}

all_products = []


async def get_products(url):
    async with aiohttp.ClientSession() as session:
        res = await session.get(url=url, headers=HEADERS)
        bs = BeautifulSoup(await res.text(), 'lxml')
        cats = [URL + cat.get('href') + '?SHOWALL_1=1'
                for cat in bs.find('ul', id='vertical-multilevel-menu')
                             .find_all('a', class_='parent')] + [
            # Workaround: I can't get these links automatically :(
            'https://shop.samberi.com/catalog/aziya/?SHOWALL_1=1',
            'https://shop.samberi.com/catalog/sportivnye_tovary/?SHOWALL_1=1',
            'https://shop.samberi.com/catalog/upakovka/?SHOWALL_1=1'
        ]
        tasks = [asyncio.shield(parse_page(session, url, max_s)) for url in cats]
        await asyncio.gather(*tasks)


async def parse_page(session, cat_url, max_s):
    async with session.get(url=cat_url, headers=HEADERS) as res:
        res_text = await res.text()
        pagebs = BeautifulSoup(res_text, 'lxml')
        products_on_page = pagebs.find_all('div', class_='product-item')
        for product in products_on_page:
            name = product.find('div', class_='product-item-title').text.strip()
            price = product.find('span', class_='product-item-price-current')\
                .text.strip().strip('₽').strip()
            all_products.append([name, price])


def main():
    asyncio.set_event_loop_policy(asyncio.WindowsSelectorEventLoopPolicy())
    asyncio.run(get_products(URL))
You're probably getting kicked out by the server for making too many simultaneous requests. Try checking the HTTP responses to investigate further; a minimal sketch of such a check is below.
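For example, logging every non-200 status inside parse_page will show whether the server is refusing some requests. This is just a sketch of the check, assuming the rest of the function stays as in the question (the logging calls are my own addition, not part of the original code):

import logging

async def parse_page(session, cat_url, max_s):
    async with session.get(url=cat_url, headers=HEADERS) as res:
        # Anything other than 200 means the server refused or throttled
        # this request; the page then parses as empty and the product
        # count silently comes up short
        if res.status != 200:
            logging.warning("HTTP %s for %s", res.status, cat_url)
            return
        res_text = await res.text()
        # ... parse the page as before ...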
You can use aiohttp's TCPConnector to limit the number of simultaneous requests. This version of the program, with a limit of 8, consistently returns 10886 products:
import asyncio
import aiohttp
from bs4 import BeautifulSoup
import json

URL = "https://shop.samberi.com"
HEADERS = {
    "Accept": "*/*",
}


async def get_products(url: str) -> list:
    connector = aiohttp.TCPConnector(limit=8, limit_per_host=8)
    async with aiohttp.ClientSession(connector=connector) as session:
        res = await session.get(url=url, headers=HEADERS)
        bs = BeautifulSoup(await res.text(), "lxml")
        cats = [
            URL + cat.get("href") + "?SHOWALL_1=1"
            for cat in bs.find("ul", id="vertical-multilevel-menu").find_all(
                "a", class_="parent"
            )
        ] + [
            # Workaround: I can't get these links automatically :(
            "https://shop.samberi.com/catalog/aziya/?SHOWALL_1=1",
            "https://shop.samberi.com/catalog/sportivnye_tovary/?SHOWALL_1=1",
            "https://shop.samberi.com/catalog/upakovka/?SHOWALL_1=1",
        ]
        tasks = [parse_page(session, url) for url in cats]
        print(f"Fetching {len(tasks)} pages")
        results = await asyncio.gather(*tasks)
        return [product for products in results for product in products]


async def parse_page(session: aiohttp.ClientSession, cat_url: str) -> list:
    all_products = []
    async with session.get(url=cat_url, headers=HEADERS) as res:
        res_text = await res.text()
        pagebs = BeautifulSoup(res_text, "lxml")
        products_on_page = pagebs.find_all("div", class_="product-item")
        print(f"Fetching {len(products_on_page)} products")
        for product in products_on_page:
            name = product.find("div", class_="product-item-title").text.strip()
            price = (
                product.find("span", class_="product-item-price-current")
                .text.strip()
                .strip("₽")
                .strip()
            )
            all_products.append([name, price])
    return all_products


async def main():
    products = await get_products(URL)
    print(len(products))
    with open("products.json", "w") as f:
        json.dump({"products": products}, f)


if __name__ == "__main__":
    asyncio.run(main())
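As for the catch/log part of the question: calling res.raise_for_status() in parse_page turns 4xx/5xx responses into aiohttp.ClientResponseError exceptions, and passing return_exceptions=True to asyncio.gather collects those exceptions instead of letting one failed page abort (or silently shrink) the whole run. A minimal sketch of how the gathering step could log failures; the error-handling structure is my suggestion, not part of the program above:

import logging

# Inside get_products, after building `cats` and `tasks`
# (assumes parse_page calls res.raise_for_status() after the GET):
results = await asyncio.gather(*tasks, return_exceptions=True)
products = []
for cat_url, result in zip(cats, results):
    if isinstance(result, Exception):
        # e.g. aiohttp.ClientResponseError from raise_for_status()
        logging.error("Failed to fetch %s: %s", cat_url, result)
    else:
        products.extend(result)
return products

With this in place, a run that comes back short will at least tell you which category pages failed and with what error, so you can decide whether to lower the connector limit further or add retries.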