我有一个简单的代理检查器,我想返回有效代理的列表。
import asyncio
from bs4 import BeautifulSoup
from requests_html import AsyncHTMLSession
async def fetch_proxies(url):
session = AsyncHTMLSession()
webpage = await session.get(url) # headers=headers
await session.close()
soup = BeautifulSoup(webpage.html.raw_html, 'lxml')
tag = soup.find('textarea', {'class':'form-control'})
return tag.text
async def check_valid_proxy(proxy):
try:
session = AsyncHTMLSession()
webpage = await session.get('https://ifconfig.me', proxies = {'https' : proxy})
await session.close()
return proxy
except Exception as err:
print(err)
async def main():
url = 'https://free-proxy-list.net'
proxy_fetch = await fetch_proxies(url)
proxy_list = proxy_fetch.split('\n')[3:-1]
tasks = []
for proxy in proxy_list:
task = asyncio.create_task(check_valid_proxy(proxy))
tasks.append(task)
tasks.append(await asyncio.sleep(.1))
valid_proxies = asyncio.gather(*tasks)
return valid_proxies
loop = asyncio.get_event_loop()
valid_proxy_list = loop.run_until_complete(main())
print(valid_proxy_list)
代码完全运行,但是当尝试打印结果列表时,我收到此错误:
Traceback (most recent call last):
File "/home/mu0/Documents/python_files/web_automation/mailSpider/proxy.py", line 36, in <module>
valid_proxy_list = loop.run_until_complete(main())
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/usr/lib/python3.11/asyncio/base_events.py", line 653, in run_until_complete
return future.result()
^^^^^^^^^^^^^^^
File "/home/mu0/Documents/python_files/web_automation/mailSpider/proxy.py", line 32, in main
valid_proxies = asyncio.gather(*tasks)
^^^^^^^^^^^^^^^^^^^^^^
File "/usr/lib/python3.11/asyncio/tasks.py", line 817, in gather
fut = _ensure_future(arg, loop=loop)
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/usr/lib/python3.11/asyncio/tasks.py", line 664, in _ensure_future
raise TypeError('An asyncio.Future, a coroutine or an awaitable '
TypeError: An asyncio.Future, a coroutine or an awaitable is required
循环一直运行,直到我键盘中断:
Traceback (most recent call last):
File "/usr/lib/python3.11/threading.py", line 1553, in _shutdown
atexit_call()
File "/usr/lib/python3.11/concurrent/futures/thread.py", line 31, in _python_exit
t.join()
File "/usr/lib/python3.11/threading.py", line 1112, in join
self._wait_for_tstate_lock()
File "/usr/lib/python3.11/threading.py", line 1132, in _wait_for_tstate_lock
if lock.acquire(block, timeout):
^^^^^^^^^^^^^^^^^^^^^^^^^^^^
KeyboardInterrupt:
我对我糟糕的编码感到抱歉,但我似乎无法弄清楚为什么会发生这种情况。另外,我使用 AsyncHTMLSession 因为它应该是使用此模块的更大脚本的一部分。
create_task 函数需要一个协程,但在您的情况下,您传递的结果是await asyncio.sleep(.1),它不是一个协程。相反,您需要确保将协程传递给 asyncio.create_task。
tasks = []
for proxy in proxy_list:
task = asyncio.create_task(check_valid_proxy(proxy))
tasks.append(task)
tasks.append(asyncio.sleep(.1))
valid_proxies = await asyncio.gather(*tasks)
return valid_proxies
这样,当您调用 asyncio.gather(*tasks) 时,它会正确收集协程列表。