I have a crawler written with the Crawlee package: users upload their website links, and the crawler scrapes every link found in the provided input. The problem is that Crawlee mixes the crawled links from different users' inputs, so user data ends up being shared.
My code:
import asyncio

from crawlee import EnqueueStrategy
from crawlee.playwright_crawler import PlaywrightCrawler, PlaywrightCrawlingContext


async def run_crawler(username, urls):
    crawler = PlaywrightCrawler(
        max_requests_per_crawl=10,
        retry_on_blocked=True,
        # max_session_rotations=10,
        max_request_retries=5,
        # concurrency_settings=self.concurrency_settings,
    )
    crawled_data = []

    # Define the default request handler, which will be called for every request.
    @crawler.router.default_handler
    async def request_handler(context: PlaywrightCrawlingContext) -> None:
        context.log.info(f'Processing {context.request.url} ...')

        # Extract the page text, falling back to the whole body if there is no <main>.
        try:
            text_content = await context.page.evaluate("document.querySelector('main').innerText")
        except Exception:
            text_content = await context.page.evaluate("document.body.innerText")

        data = {
            'username': username,
            'url': context.request.url,
            'title': await context.page.title(),
            'context': text_content,
        }

        # Push the extracted data to the default dataset.
        await context.push_data(data)
        crawled_data.append(data)

        await context.enqueue_links(
            strategy=EnqueueStrategy.SAME_DOMAIN,
        )

    # Run the crawler with the initial list of requests.
    await crawler.run(urls)
    await crawler.export_data(f'data/{username}_results.json')
    return crawled_data


async def run_multiple_crawlers(user_requests):
    tasks = []
    for username, urls in user_requests:
        tasks.append(run_crawler(username, urls))
    # Run all crawlers concurrently.
    results = await asyncio.gather(*tasks)
    return results


# Example usage with a list of user requests.
user_requests = [
    ('user1', ['https://github.com']),
    ('user2', ['https://stackoverflow.com']),
    # Add more users and URLs here.
]

# Run the crawlers.
asyncio.run(run_multiple_crawlers(user_requests))
This should work: run the crawls one after another instead of concurrently, so only one crawler touches the shared default storage at a time:

async def main(domains):
    # `crawler` is constructed exactly as in the question.
    for domain in domains:
        await crawler.run([domain])  # run() expects a list of requests
    await crawler.export_data('results.json')

if __name__ == '__main__':
    asyncio.run(main(['https://example.com', 'https://google.com']))
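If the runs have to stay concurrent, the usual culprit is that every crawler in the same process shares Crawlee's default request queue and dataset, so links enqueued for one user get picked up by another user's crawler. Below is a minimal sketch of isolating each user with named storages. `RequestQueue.open(name=...)` and `Dataset.open(name=...)` are Crawlee's storages API; the constructor argument for handing the queue to the crawler is an assumption here (it is called `request_manager` in recent Crawlee releases and `request_provider` in older ones), so check the docs for your version:

import asyncio

from crawlee import EnqueueStrategy
from crawlee.playwright_crawler import PlaywrightCrawler, PlaywrightCrawlingContext
from crawlee.storages import Dataset, RequestQueue


async def run_isolated_crawler(username, urls):
    # Named storages are independent of the defaults, so concurrent
    # crawlers can no longer see each other's requests or items.
    queue = await RequestQueue.open(name=f'queue-{username}')
    dataset = await Dataset.open(name=f'dataset-{username}')

    crawler = PlaywrightCrawler(
        max_requests_per_crawl=10,
        request_manager=queue,  # assumption: `request_provider` on older Crawlee versions
    )

    @crawler.router.default_handler
    async def request_handler(context: PlaywrightCrawlingContext) -> None:
        # Write to this user's dataset instead of the shared default one.
        await dataset.push_data({
            'username': username,
            'url': context.request.url,
            'title': await context.page.title(),
        })
        # Discovered links land in this user's queue, not the default one.
        await context.enqueue_links(strategy=EnqueueStrategy.SAME_DOMAIN)

    await crawler.run(urls)


async def main():
    await asyncio.gather(
        run_isolated_crawler('user1', ['https://github.com']),
        run_isolated_crawler('user2', ['https://stackoverflow.com']),
    )

if __name__ == '__main__':
    asyncio.run(main())

With this layout each user also gets their own storage, so exporting per-user results no longer requires filtering a shared dataset.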