我正在编写一个脚本,用来找出可用的代理服务器。使用同步(Synchronous)类时一切都运行顺利,但当我改用异步编程来做同样的事情时,得到的结果却大不相同。据我所知,除此之外实际上没有任何其他改动。
异步类:
class FreeProxyList(Proxy):
    """Asynchronous variant: scrape free-proxy-list.net and probe proxies concurrently.

    After ``await setup()`` with ``testEach`` truthy, ``proxies`` holds the
    scraped records whose probe returned a truthy value. With ``testEach``
    falsy nothing is stored and ``proxies`` keeps its previous value.
    """

    def __init__(self, testEach=False):
        self.session = None  # not assigned again by any method of this class
        self.proxies = []
        self.testEach = testEach

    async def setup(self):
        """Fetch the proxy table, keep rows whose 'Https' column is "yes", optionally probe them."""
        async with aiohttp.ClientSession() as session:
            async with session.get("https://free-proxy-list.net/") as response:
                html = await response.text()

            soup = BeautifulSoup(html, "html.parser")
            first_table = soup.find_all("table")[0]
            column_names = [cell.text for cell in first_table.find_all("th")]

            # One dict per data row, keyed by header text; the first <tr> is skipped.
            records = []
            for table_row in first_table.find_all("tr")[1:]:
                cells = table_row.find_all("td")
                if not cells:
                    continue
                records.append({name: cells[pos].text for pos, name in enumerate(column_names)})

            # Filter proxies
            candidates = [record for record in records if record['Https'] == "yes"]
            # A country filter on record['Code'] ("US", "CA", "GB", "AU", "NZ") is left disabled.

            if not self.testEach:
                return

            self.proxies = []
            probes = []
            for position, candidate in enumerate(candidates):
                probes.append(self.testOne(session, candidate, position, len(candidates)))
            timings = await asyncio.gather(*probes)

            # gather() keeps input order, so timings line up with candidates.
            working = []
            for candidate, elapsed in zip(candidates, timings):
                if elapsed:
                    working.append(candidate)
            self.proxies = working

    async def testOne(self, session, proxy, index, total):
        """Probe one proxy; return the elapsed seconds on HTTP 200, otherwise None.

        ``index`` and ``total`` are only used for the printed progress line.
        Every failure except KeyboardInterrupt is turned into None.
        """
        import time

        began = time.time()
        try:
            host, port = proxy['IP Address'], proxy['Port']
            print(f"{index + 1} / {total :<5} Testing {host}:{port}")
            async with session.get("https://reedgraff.com", proxy=f"http://{host}:{port}", timeout=5) as response:
                if response.status != 200:
                    return None
                return time.time() - began
        except KeyboardInterrupt:
            raise KeyboardInterrupt
        except BaseException:
            # Same reach as the bare ``except:`` this replaces.
            return None
同步类:
class FreeProxyList(Proxy):
    """Synchronous variant: scrape free-proxy-list.net and probe proxies one by one.

    After ``setup()`` with ``testEach`` truthy, ``proxies`` holds the scraped
    records whose probe returned a truthy value. With ``testEach`` falsy
    nothing is stored and ``proxies`` keeps its previous value.
    """

    def __init__(self, testEach=False):
        self.session = None  # not assigned again by any method of this class
        self.proxies = []
        self.testEach = testEach

    def setup(self):
        """Fetch the proxy table, keep rows whose 'Https' column is "yes", optionally probe them."""
        import bs4
        import requests

        page = requests.get("https://free-proxy-list.net/")
        soup = bs4.BeautifulSoup(page.text, "html.parser")
        first_table = soup.find_all("table")[0]
        column_names = [cell.text for cell in first_table.find_all("th")]

        # One dict per data row, keyed by header text; the first <tr> is skipped.
        records = []
        for table_row in first_table.find_all("tr")[1:]:
            cells = table_row.find_all("td")
            if not cells:
                continue
            records.append({name: cells[pos].text for pos, name in enumerate(column_names)})

        # Filter proxies
        candidates = [record for record in records if record['Https'] == "yes"]
        # A country filter on record['Code'] ("US", "CA", "GB", "AU", "NZ") is left disabled.

        if not self.testEach:
            return

        self.proxies = []
        for position, candidate in enumerate(candidates):
            print(f"{position + 1} / {len(candidates) :<5} Testing {candidate['IP Address']}:{candidate['Port']}")
            if self.testOne(candidate):
                self.proxies.append(candidate)

    def testOne(self, proxy):
        """Probe one proxy; return the elapsed seconds on HTTP 200, otherwise None.

        Every failure except KeyboardInterrupt is turned into None.
        """
        import requests
        import time

        began = time.time()
        try:
            response = requests.get(
                "https://reedgraff.com",
                proxies={
                    "https": f"{proxy['IP Address']}:{proxy['Port']}",
                    "http": f"{proxy['IP Address']}:{proxy['Port']}",
                },
                timeout=5,
            )
            if response.status_code != 200:
                return None
            return time.time() - began
        except KeyboardInterrupt:
            raise KeyboardInterrupt
        except BaseException:
            # Same reach as the bare ``except:`` this replaces.
            return None
运行同步程序时,几乎所有 IP 都可以工作;但运行异步程序时,却没有一个(或者只有一个)可以工作。
您的异步版本中确实有一些错误,这可以解释您得到的结果。您写的是……
souped = BeautifulSoup(html, "html.parser")
...当需要的是:
souped = bs4.BeautifulSoup(html, "html.parser")
此外,方法 testOne 需要一个 total 参数,但调用时并没有把它传递进去,所以它肯定会失败。
我还注意到,同步版本在连续几次运行之间,被测试的代理总数和可用代理的数量可能相差很大。在下面给出的同步和异步版本的运行结果中,两个版本测试的代理数量相同,但结果略有不同,这并不让我感到意外(两次测试的是同一批代理吗?)
这基本上是您的代码,仅进行了微小的修改:
import asyncio
class AsyncFreeProxyList:
    """Scrape HTTPS-capable proxies from free-proxy-list.net and probe them concurrently.

    Call ``await setup()`` to populate ``proxies``. Probing only happens when
    ``testEach`` is truthy; otherwise the scraped list is discarded and
    ``proxies`` keeps its previous value.
    """

    def __init__(self, testEach=False):
        # When truthy, setup() probes every scraped proxy and keeps those that answer.
        self.testEach = testEach
        # Proxy records (dicts keyed by the table's header texts) that passed the probe.
        self.proxies = []
        # Not assigned again by any method of this class.
        self.session = None

    async def setup(self):
        """Download the proxy table, filter it, and optionally probe every entry.

        Raises:
            IndexError: if the downloaded page contains no ``<table>``.
        """
        import aiohttp
        import bs4

        async with aiohttp.ClientSession() as session:
            async with session.get("https://free-proxy-list.net/") as response:
                html = await response.text()

            souped = bs4.BeautifulSoup(html, "html.parser")
            table = souped.find_all("table")[0]
            headers = [header.text for header in table.find_all("th")]

            proxies_list = []
            for row in table.find_all("tr")[1:]:
                columns = row.find_all("td")
                if columns:
                    # zip() pairs cells with headers without indexing past a short row.
                    proxies_list.append({name: cell.text for name, cell in zip(headers, columns)})

            # Keep only proxies advertised as HTTPS-capable; .get() tolerates a
            # row that lacks the column instead of raising KeyError.
            proxies_list = [info for info in proxies_list if info.get("Https") == "yes"]
            # To restrict by country, additionally filter on info["Code"]
            # (e.g. "US", "CA", "GB", "AU", "NZ").

            if self.testEach:
                self.proxies = []
                total = len(proxies_list)
                # The probes must run while the session is still open, hence
                # the gather() inside the ``async with`` block.
                results = await asyncio.gather(
                    *(self.testOne(session, proxy, index, total) for index, proxy in enumerate(proxies_list))
                )
                self.proxies = [proxy for proxy in results if proxy]

    async def testOne(self, session, proxy, index, total):
        """Probe a single proxy by requesting https://reedgraff.com through it.

        Args:
            session: object whose ``get(url, proxy=..., timeout=...)`` returns an
                async context manager yielding a response with a ``status``.
            proxy: record with at least the keys 'IP Address' and 'Port'.
            index: zero-based position, used only for the progress line.
            total: number of proxies being probed, used only for the progress line.

        Returns:
            ``proxy`` itself when the response status is 200, otherwise None.
            Ordinary errors (connection failures, timeouts, malformed records)
            also yield None.

        KeyboardInterrupt, SystemExit and (on Python 3.8+) asyncio.CancelledError
        are not subclasses of Exception, so they propagate unchanged instead of
        being reported as a dead proxy.
        """
        try:
            print(f"{index + 1} / {total :<5} Testing {proxy['IP Address']}:{proxy['Port']}")
            # NOTE(review): recent aiohttp releases prefer
            # aiohttp.ClientTimeout(total=5) over a bare number - confirm
            # against the installed version.
            async with session.get(
                "https://reedgraff.com",
                proxy=f"http://{proxy['IP Address']}:{proxy['Port']}",
                timeout=5,
            ) as response:
                return proxy if response.status == 200 else None
        except Exception:
            # Best-effort probe: any ordinary failure just means "not working".
            return None
#######################################
class FreeProxyList:
    """Scrape HTTPS-capable proxies from free-proxy-list.net and probe them one by one.

    Call ``setup()`` to populate ``proxies``. Probing only happens when
    ``testEach`` is truthy; otherwise the scraped list is discarded and
    ``proxies`` keeps its previous value.
    """

    def __init__(self, testEach=False):
        # When truthy, setup() probes every scraped proxy and keeps those that answer.
        self.testEach = testEach
        # Proxy records (dicts keyed by the table's header texts) that passed the probe.
        self.proxies = []
        # Not assigned again by any method of this class.
        self.session = None

    def setup(self):
        """Download the proxy table, filter it, and optionally probe every entry.

        Raises:
            IndexError: if the downloaded page contains no ``<table>``.
        """
        import bs4
        import requests

        html = requests.get("https://free-proxy-list.net/")
        souped = bs4.BeautifulSoup(html.text, "html.parser")
        table = souped.find_all("table")[0]
        headers = [header.text for header in table.find_all("th")]

        proxies_list = []
        for row in table.find_all("tr")[1:]:
            columns = row.find_all("td")
            if columns:
                # zip() pairs cells with headers without indexing past a short row.
                proxies_list.append({name: cell.text for name, cell in zip(headers, columns)})

        # Keep only proxies advertised as HTTPS-capable; .get() tolerates a
        # row that lacks the column instead of raising KeyError.
        proxies_list = [info for info in proxies_list if info.get("Https") == "yes"]
        # To restrict by country, additionally filter on info["Code"]
        # (e.g. "US", "CA", "GB", "AU", "NZ").

        if self.testEach:
            self.proxies = []
            total = len(proxies_list)
            for index, proxy in enumerate(proxies_list):
                print(f"{index + 1} / {total :<5} Testing {proxy['IP Address']}:{proxy['Port']}")
                if self.testOne(proxy):
                    self.proxies.append(proxy)

    def testOne(self, proxy):
        """Probe a single proxy by requesting https://reedgraff.com through it.

        Args:
            proxy: record with at least the keys 'IP Address' and 'Port'.

        Returns:
            True when the response status is 200, otherwise None. Ordinary
            errors (connection failures, timeouts, malformed records) also
            yield None.

        KeyboardInterrupt and SystemExit are not subclasses of Exception, so
        they propagate unchanged instead of being reported as a dead proxy.
        """
        import requests

        try:
            address = f"{proxy['IP Address']}:{proxy['Port']}"
            response = requests.get(
                "https://reedgraff.com",
                proxies={"https": address, "http": address},
                timeout=5,
            )
            return True if response.status_code == 200 else None
        except Exception:
            # Best-effort probe: any ordinary failure just means "not working".
            return None
if __name__ == '__main__':
    import time

    # Sequential run: every proxy is probed one after another.
    print('synchronous:')
    sync_started = time.monotonic()
    sync_checker = FreeProxyList(True)
    sync_checker.setup()
    sync_elapsed = time.monotonic() - sync_started
    for working_proxy in sync_checker.proxies:
        print(working_proxy)
    print('Elapsed time:', sync_elapsed)

    # Concurrent run: all probes are awaited together inside one event loop.
    print('\nasynchronous:')
    async_started = time.monotonic()
    async_checker = AsyncFreeProxyList(True)
    asyncio.run(async_checker.setup())
    async_elapsed = time.monotonic() - async_started
    for working_proxy in async_checker.proxies:
        print(working_proxy)
    print('Elapsed time:', async_elapsed)
打印:
synchronous:
1 / 52 Testing 85.210.84.11:8080
2 / 52 Testing 8.219.97.248:80
3 / 52 Testing 3.122.84.99:80
4 / 52 Testing 3.126.147.182:80
5 / 52 Testing 13.36.104.85:80
6 / 52 Testing 43.201.121.81:80
7 / 52 Testing 13.37.73.214:80
8 / 52 Testing 35.72.118.126:80
9 / 52 Testing 13.208.56.180:80
10 / 52 Testing 3.123.150.192:80
11 / 52 Testing 43.202.154.212:80
12 / 52 Testing 52.196.1.182:80
13 / 52 Testing 13.37.59.99:80
14 / 52 Testing 18.228.149.161:80
15 / 52 Testing 13.38.153.36:80
16 / 52 Testing 37.187.25.85:80
17 / 52 Testing 160.86.242.23:8080
18 / 52 Testing 67.43.228.253:12915
19 / 52 Testing 165.85.253.175:8080
20 / 52 Testing 72.10.160.172:9739
21 / 52 Testing 54.83.185.141:3128
22 / 52 Testing 43.131.45.21:8443
23 / 52 Testing 43.134.68.153:3128
24 / 52 Testing 116.105.18.72:10004
25 / 52 Testing 72.10.160.91:8167
26 / 52 Testing 158.160.63.194:8090
27 / 52 Testing 47.89.184.18:3128
28 / 52 Testing 178.48.68.61:18080
29 / 52 Testing 8.213.151.128:3128
30 / 52 Testing 47.243.92.199:3128
31 / 52 Testing 47.91.104.88:3128
32 / 52 Testing 8.213.34.58:3128
33 / 52 Testing 114.130.153.122:58080
34 / 52 Testing 35.220.254.137:8080
35 / 52 Testing 188.166.197.129:3128
36 / 52 Testing 72.10.160.94:8355
37 / 52 Testing 72.10.160.171:10095
38 / 52 Testing 67.43.227.227:11023
39 / 52 Testing 15.204.161.192:18080
40 / 52 Testing 4.159.61.189:8080
41 / 52 Testing 85.210.84.189:8080
42 / 52 Testing 51.222.161.115:80
43 / 52 Testing 62.106.70.185:8118
44 / 52 Testing 174.138.171.162:36247
45 / 52 Testing 174.138.167.250:38864
46 / 52 Testing 67.43.236.20:10145
47 / 52 Testing 157.230.89.122:18084
48 / 52 Testing 116.107.229.82:5004
49 / 52 Testing 209.121.164.50:31147
50 / 52 Testing 72.10.160.93:13931
51 / 52 Testing 72.10.160.170:2657
52 / 52 Testing 223.135.156.183:8080
{'IP Address': '160.86.242.23', 'Port': '8080', 'Code': 'JP', 'Country': 'Japan', 'Anonymity': 'elite proxy', 'Google': 'no', 'Https': 'yes', 'Last Checked': '1 min ago'}
{'IP Address': '165.85.253.175', 'Port': '8080', 'Code': 'US', 'Country': 'United States', 'Anonymity': 'anonymous', 'Google': 'no', 'Https': 'yes', 'Last Checked': '1 min ago'}
{'IP Address': '178.48.68.61', 'Port': '18080', 'Code': 'HU', 'Country': 'Hungary', 'Anonymity': 'anonymous', 'Google': 'no', 'Https': 'yes', 'Last Checked': '14 mins ago'}
Elapsed time: 156.14000000013039
asynchronous:
1 / 52 Testing 85.210.84.11:8080
2 / 52 Testing 8.219.97.248:80
3 / 52 Testing 3.122.84.99:80
4 / 52 Testing 3.126.147.182:80
5 / 52 Testing 13.36.104.85:80
6 / 52 Testing 43.201.121.81:80
7 / 52 Testing 13.37.73.214:80
8 / 52 Testing 35.72.118.126:80
9 / 52 Testing 13.208.56.180:80
10 / 52 Testing 3.123.150.192:80
11 / 52 Testing 43.202.154.212:80
12 / 52 Testing 52.196.1.182:80
13 / 52 Testing 13.37.59.99:80
14 / 52 Testing 18.228.149.161:80
15 / 52 Testing 13.38.153.36:80
16 / 52 Testing 37.187.25.85:80
17 / 52 Testing 160.86.242.23:8080
18 / 52 Testing 67.43.228.253:12915
19 / 52 Testing 165.85.253.175:8080
20 / 52 Testing 72.10.160.172:9739
21 / 52 Testing 54.83.185.141:3128
22 / 52 Testing 43.131.45.21:8443
23 / 52 Testing 43.134.68.153:3128
24 / 52 Testing 116.105.18.72:10004
25 / 52 Testing 72.10.160.91:8167
26 / 52 Testing 158.160.63.194:8090
27 / 52 Testing 47.89.184.18:3128
28 / 52 Testing 178.48.68.61:18080
29 / 52 Testing 8.213.151.128:3128
30 / 52 Testing 47.243.92.199:3128
31 / 52 Testing 47.91.104.88:3128
32 / 52 Testing 8.213.34.58:3128
33 / 52 Testing 114.130.153.122:58080
34 / 52 Testing 35.220.254.137:8080
35 / 52 Testing 188.166.197.129:3128
36 / 52 Testing 72.10.160.94:8355
37 / 52 Testing 72.10.160.171:10095
38 / 52 Testing 67.43.227.227:11023
39 / 52 Testing 15.204.161.192:18080
40 / 52 Testing 4.159.61.189:8080
41 / 52 Testing 85.210.84.189:8080
42 / 52 Testing 51.222.161.115:80
43 / 52 Testing 62.106.70.185:8118
44 / 52 Testing 174.138.171.162:36247
45 / 52 Testing 174.138.167.250:38864
46 / 52 Testing 67.43.236.20:10145
47 / 52 Testing 157.230.89.122:18084
48 / 52 Testing 116.107.229.82:5004
49 / 52 Testing 209.121.164.50:31147
50 / 52 Testing 72.10.160.93:13931
51 / 52 Testing 72.10.160.170:2657
52 / 52 Testing 223.135.156.183:8080
{'IP Address': '178.48.68.61', 'Port': '18080', 'Code': 'HU', 'Country': 'Hungary', 'Anonymity': 'anonymous', 'Google': 'no', 'Https': 'yes', 'Last Checked': '14 mins ago'}
{'IP Address': '8.213.34.58', 'Port': '3128', 'Code': 'SA', 'Country': 'Saudi Arabia', 'Anonymity': 'anonymous', 'Google': 'no', 'Https': 'yes', 'Last Checked': '14 mins ago'}
Elapsed time: 6.2649999998975545
异步版本快了大约 25 倍;两个版本返回的可用代理数量并不相同(不过这本来就无法保证,因为某一时刻可用的代理下一时刻可能就不可用了,或者两次测试的可能是不同的一组代理)。