Python asynchronous behaviour in a crawler

Problem description

I have a crawler written with the Crawlee package: users submit their website links, and the crawler scrapes every link found in the provided input. The problem is that Crawlee mixes the crawled links between different users' inputs, so user data ends up being shared.

My code:


import asyncio

from crawlee import EnqueueStrategy
# Import paths match the Crawlee version used here; newer releases may expose
# these classes from crawlee.crawlers instead.
from crawlee.playwright_crawler import PlaywrightCrawler, PlaywrightCrawlingContext


async def run_crawler(username, urls) -> list:

    crawler = PlaywrightCrawler(
        max_requests_per_crawl=10,
        retry_on_blocked=True,
        # max_session_rotations=10,
        max_request_retries=5,
        # concurrency_settings=self.concurrency_settings,
    )

    crawled_data = []
    # Define the default request handler, which will be called for every request.
    @crawler.router.default_handler
    async def request_handler(context: PlaywrightCrawlingContext) -> None:
        context.log.info(f'Processing {context.request.url} ...')


        # Extract text from the page, falling back to the whole body if there is no <main> element.
        try:
            text_content = await context.page.evaluate("document.querySelector('main').innerText")
        except Exception:
            text_content = await context.page.evaluate("document.body.innerText")


        data = {
            'username': username,
            'url': context.request.url,
            'title': await context.page.title(),
            'context': text_content,
        }

        # Push the extracted data to the default dataset and keep a local copy.
        await context.push_data(data)
        crawled_data.append(data)

        # Enqueue every link on the page that stays on the same domain.
        await context.enqueue_links(
            strategy=EnqueueStrategy.SAME_DOMAIN,
        )

    # Run the crawler with the initial list of requests.
    await crawler.run(urls)
    await crawler.export_data(f'data/{username}_results.json')

    return crawled_data


async def run_multiple_crawlers(user_requests):
    tasks = []
    
    for username, urls in user_requests:
        tasks.append(run_crawler(username, urls))
    
    # Run all crawlers concurrently
    results = await asyncio.gather(*tasks)
    
    return results

# Example usage with a list of user requests
user_requests = [
    ('user1', ['https://github.com']),
    ('user2', ['https://stackoverflow.com']),
    # Add more users and URLs here
]

# Run the crawlers
asyncio.run(run_multiple_crawlers(user_requests))
Tags: python, asynchronous, crawlee
1 Answer

This should work:

async def main(domains):
    # `crawler` is assumed to be the PlaywrightCrawler instance built as in the question.
    for domain in domains:
        await crawler.run([domain])
        await crawler.export_data('results.json')

if __name__ == '__main__':
    asyncio.run(main(['https://example.com', 'https://google.com']))
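
Applied to the question's own code, the same idea is simply to await each user's crawl one after another instead of launching them all with asyncio.gather. The sketch below is a minimal, hypothetical adaptation: it assumes run_crawler is defined exactly as in the question (the function name run_crawlers_sequentially is mine), so each user's crawl finishes and exports to data/<username>_results.json before the next one starts.

import asyncio

async def run_crawlers_sequentially(user_requests):
    """Run one user's crawl at a time so their results never overlap."""
    results = {}
    for username, urls in user_requests:
        # run_crawler is the function from the question; it exports to
        # data/<username>_results.json before the next user's crawl begins.
        results[username] = await run_crawler(username, urls)
    return results

user_requests = [
    ('user1', ['https://github.com']),
    ('user2', ['https://stackoverflow.com']),
]

asyncio.run(run_crawlers_sequentially(user_requests))

Running the crawls concurrently with asyncio.gather means the crawler instances can end up sharing Crawlee's default storage (dataset and request queue), which is what mixes the users' links; running them sequentially sidesteps that. If you genuinely need parallel crawls, Crawlee also supports named storages (for example, a separate dataset per user), but check the exact API of the Crawlee version you are using.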