使用异步解析(Python)缺少数据

问题描述 投票:0回答:1

我正在尝试为一家本地商店编写异步解析器，但结果不稳定。它应该获得约 11k 个商品，但有时会在没有任何异常的情况下获得随机数量的商品。

可能是什么问题,我如何捕捉/记录它?

# Base URL of the shop being scraped.
URL = 'https://shop.samberi.com'

# Browser-like User-Agent so the request is not rejected as an obvious bot.
HEADERS = {
    'Accept': '*/*',
    'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) '
                  'AppleWebKit/537.36 (KHTML, like Gecko) '
                  'Chrome/101.0.4951.54 Safari/537.36'
}

# Shared accumulator mutated by parse_page from many concurrent tasks.
# NOTE(review): list.append is safe under asyncio (single-threaded), but a
# global like this makes missing results hard to diagnose.
all_products = []

async def get_products(url):
    """Fetch the shop front page, collect category links, and scrape them all.

    NOTE(review): `max_s` passed to parse_page below is never defined in this
    snippet — as shown, this raises NameError (presumably a semaphore in the
    full code; verify).
    """
    async with aiohttp.ClientSession() as session:
        # NOTE(review): response is never closed; prefer `async with session.get(...)`.
        res = await session.get(url=url, headers=HEADERS)
        bs = BeautifulSoup(await res.text(), 'lxml')
        # Category links from the side menu, with ?SHOWALL_1=1 so each
        # category renders all items on a single page.
        cats = [URL + cat.get('href') + '?SHOWALL_1=1'
                for cat in bs.find('ul', id='vertical-multilevel-menu')
                .find_all('a', class_='parent')] + [
            # Workaround: could not collect these links automatically.
            'https://shop.samberi.com/catalog/aziya/?SHOWALL_1=1',
            'https://shop.samberi.com/catalog/sportivnye_tovary/?SHOWALL_1=1',
            'https://shop.samberi.com/catalog/upakovka/?SHOWALL_1=1'
        ]
        # NOTE(review): asyncio.shield only protects against cancellation; it
        # does not help here. All ~N requests fire at once with no concurrency
        # limit — the likely cause of the randomly missing items.
        tasks = [asyncio.shield(parse_page(session, url, max_s)) for url in cats]

        await asyncio.gather(*tasks)

async def parse_page(session, cat_url, max_s):
    """Download one category page and append its products to the global list.

    NOTE(review): `max_s` is accepted but never used; the global
    `all_products` is mutated as a side effect instead of returning a value.
    """
    async with session.get(url=cat_url, headers=HEADERS) as res:
        res_text = await res.text()
        pagebs = BeautifulSoup(res_text, 'lxml')
        products_on_page = pagebs.find_all('div', class_='product-item')
        for product in products_on_page:
            name = product.find('div', class_='product-item-title').text.strip()
            # Strip whitespace and the ruble sign around the price text.
            price = product.find('span', class_='product-item-price-current')\
                .text.strip().strip('₽').strip()
            all_products.append([name, price])
def main():
    """Run the scraper.

    NOTE(review): WindowsSelectorEventLoopPolicy exists only on Windows;
    this line raises AttributeError on other platforms.
    """
    asyncio.set_event_loop_policy(asyncio.WindowsSelectorEventLoopPolicy())
    asyncio.run(get_products(URL))
python asynchronous python-asyncio
1个回答
0
投票

你可能会因为同时发出太多请求而被服务器踢出。尝试检查 HTTP 响应状态码，以进一步排查问题。

如果这确实是问题所在，您可以使用 TCPConnector 来限制同时请求的数量。下面这个将并发限制为 8 的程序始终返回 10886 个产品：

import asyncio
import aiohttp
from bs4 import BeautifulSoup
import json

URL = "https://shop.samberi.com"

HEADERS = {
    "Accept": "*/*",
}


async def get_products(url: str) -> list:
    """Fetch every category of the shop and return the combined product list.

    Concurrency is capped at 8 simultaneous connections via TCPConnector,
    which keeps the server from dropping requests (the cause of the
    "random number of items" symptom in the question).

    Args:
        url: Base URL of the shop; its front page holds the category menu.

    Returns:
        A flat list of [name, price] pairs from all categories.
    """
    connector = aiohttp.TCPConnector(limit=8, limit_per_host=8)
    async with aiohttp.ClientSession(connector=connector) as session:
        # Fix: use the response as a context manager so the connection is
        # released immediately instead of lingering until the session closes.
        async with session.get(url=url, headers=HEADERS) as res:
            bs = BeautifulSoup(await res.text(), "lxml")

        cats = [
            URL + cat.get("href") + "?SHOWALL_1=1"
            for cat in bs.find("ul", id="vertical-multilevel-menu").find_all(
                "a", class_="parent"
            )
        ] + [
            # Workaround: these links cannot be collected automatically.
            "https://shop.samberi.com/catalog/aziya/?SHOWALL_1=1",
            "https://shop.samberi.com/catalog/sportivnye_tovary/?SHOWALL_1=1",
            "https://shop.samberi.com/catalog/upakovka/?SHOWALL_1=1",
        ]

        tasks = [parse_page(session, url) for url in cats]
        print(f"Fetching {len(tasks)} pages")

        results = await asyncio.gather(*tasks)

        # Flatten the per-category lists into one list of products.
        return [product for products in results for product in products]


async def parse_page(session: aiohttp.ClientSession, cat_url: str) -> list:
    """Download one category page and scrape every product on it.

    Args:
        session: Shared aiohttp session (carries the connection limit).
        cat_url: Category URL with ?SHOWALL_1=1 so all items are on one page.

    Returns:
        List of [name, price] pairs found on the page.
    """
    all_products = []

    async with session.get(url=cat_url, headers=HEADERS) as res:
        res_text = await res.text()
        pagebs = BeautifulSoup(res_text, "lxml")
        products_on_page = pagebs.find_all("div", class_="product-item")

        # Fix: log message typo ("Fething" -> "Fetching").
        print(f"Fetching {len(products_on_page)} products")
        for product in products_on_page:
            name = product.find("div", class_="product-item-title").text.strip()
            # Strip surrounding whitespace and the ruble sign from the price.
            price = (
                product.find("span", class_="product-item-price-current")
                .text.strip()
                .strip("₽")
                .strip()
            )
            all_products.append([name, price])

    return all_products


async def main():
    """Entry point: scrape all products and persist them as JSON.

    Writes products.json in the current directory and prints the total count.
    """
    products = await get_products(URL)
    print(len(products))

    # Fix: explicit utf-8 encoding plus ensure_ascii=False keeps the Cyrillic
    # product names readable in the output file instead of \uXXXX escapes
    # (and avoids a platform-dependent default encoding on write).
    with open("products.json", "w", encoding="utf-8") as f:
        json.dump({"products": products}, f, ensure_ascii=False)


if __name__ == "__main__":
    asyncio.run(main())
© www.soinside.com 2019 - 2024. All rights reserved.