I'm trying to do some scraping where I basically have a list of URLs, get the HTML response for each, and then scrape it. Naturally, I tried making the requests to the URLs asynchronously, but it fails.
This is what I have so far:
import aiohttp
import asyncio

async def save_file(row, file_path):
    with open(file_path, 'w') as f:
        f.write(row)

async def download_html(url_idx, url):
    async with aiohttp.ClientSession() as session:
        async with session.get(url) as resp:
            # Requesting the page
            body = await resp.text()
            file_path = #path to output file
            #some scraping logic......
            await save_file(#result of the scraping, file_path)

async def main():
    urls = ['url1', 'url2', 'url3']
    tasks = []
    for url_idx, url in enumerate(urls):
        task = asyncio.create_task(download_html(url_idx, url))
        tasks.append(task)
    await asyncio.gather(*tasks)

if __name__ == "__main__":
    loop = asyncio.get_event_loop()
    loop.run_until_complete(main())
Running the code above gives me this error:

DeprecationWarning: There is no current event loop
  loop = asyncio.get_event_loop()

What is the correct way to do this?
I would also love to know how to display a progress bar with tqdm.
The error comes from the last two lines: since Python 3.10, calling asyncio.get_event_loop() when no event loop is running is deprecated, and asyncio.run() is the preferred entry point. It is also better to create a single ClientSession and share it across requests rather than opening one per URL. Here is an example of how to restructure the code and integrate a tqdm progress bar:
import asyncio
import aiohttp
from bs4 import BeautifulSoup
from tqdm import tqdm

async def download_html(session, url):
    async with session.get(url) as resp:
        # Requesting the page
        body = await resp.text()
        soup = BeautifulSoup(body, "html.parser")
        return url, soup.title.text

async def main():
    urls = [
        "http://www.google.com",
        "http://www.google.es",
        "http://www.google.de",
    ]

    async with aiohttp.ClientSession() as session:
        tasks = set()
        for url in urls:
            task = asyncio.create_task(download_html(session, url))
            tasks.add(task)

        results = []
        # as_completed yields the tasks in the order they finish,
        # so the bar advances as soon as each page is done.
        for t in tqdm(asyncio.as_completed(tasks), total=len(tasks)):
            url, title = await t
            results.append((url, title))

    print()
    print("Results:")
    print(*results, sep="\n")

    # save the results, etc.
    # ...

if __name__ == "__main__":
    asyncio.run(main())
Prints:
100%|███████████████████████████████████████████████████████| 3/3 [00:00<00:00, 24.20it/s]
Results:
('http://www.google.com', 'Google')
('http://www.google.de', 'Google')
('http://www.google.es', 'Google')
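If you don't need to process results in completion order, tqdm also ships an asyncio-aware variant, tqdm.asyncio.tqdm, whose gather() classmethod awaits the tasks like asyncio.gather while updating the bar, and returns results in the order the tasks were passed in. A minimal sketch of the same program rewritten that way (assuming a reasonably recent tqdm version):

import asyncio
import aiohttp
from bs4 import BeautifulSoup
from tqdm.asyncio import tqdm

async def download_html(session, url):
    async with session.get(url) as resp:
        body = await resp.text()
        soup = BeautifulSoup(body, "html.parser")
        return url, soup.title.text

async def main():
    urls = [
        "http://www.google.com",
        "http://www.google.es",
        "http://www.google.de",
    ]
    async with aiohttp.ClientSession() as session:
        tasks = [asyncio.create_task(download_html(session, url)) for url in urls]
        # tqdm.gather drives the tasks and advances the bar as each finishes;
        # unlike as_completed, results keep the order of `tasks`.
        results = await tqdm.gather(*tasks)
    print(*results, sep="\n")

if __name__ == "__main__":
    asyncio.run(main())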
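As for the save_file coroutine from the question: declaring it async does not make the blocking open()/f.write() calls non-blocking, so each write stalls the event loop. One way to keep the loop responsive is to push the write into a worker thread with asyncio.to_thread (Python 3.9+). A sketch; the file-naming scheme here is a hypothetical example, not something from the question:

import asyncio

def write_file(file_path, data):
    # Ordinary blocking I/O, executed in a worker thread.
    with open(file_path, "w", encoding="utf-8") as f:
        f.write(data)

async def save_file(data, file_path):
    # Hand the blocking write off to a thread so the event loop keeps running.
    await asyncio.to_thread(write_file, file_path, data)

# e.g. inside download_html, after scraping (hypothetical naming scheme):
#     await save_file(title, f"page_{url_idx}.html")

The third-party aiofiles package is another common way to get awaitable file I/O.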