I'm writing a simple program to download a list of URLs.
!curl -LO https://github.com/mozilla/cipherscan/raw/master/top1m/top-1m.csv
!head -5 top-1m.csv
1,google.com
2,facebook.com
3,youtube.com
4,yahoo.com
5,baidu.com
alexa_1m = []
with open('top-1m.csv') as csv:
    lines = csv.read().splitlines()
for line in lines:
    index, url = line.split(',')
    alexa_1m.append(url)
print(len(alexa_1m))
print(alexa_1m[:6])
1000000
['google.com', 'facebook.com', 'youtube.com', 'yahoo.com', 'baidu.com', 'wikipedia.org']
import asks
import trio
import time

async def fetch(s, url):
    try:
        response = await s.get('http://' + url, timeout=15, connection_timeout=15)
        html_str = response.text
        results[url] = len(html_str)  # save only html length for simplicity
    except BaseException as e:
        errors[url] = type(e).__name__

async def main(urls):
    s = asks.Session(connections=20)
    async with trio.open_nursery() as nursery:
        for url in urls:
            nursery.start_soon(fetch, s, url)
for n_urls in [10, 25, 50, 75, 100, 150, 200, 250, 300, 500, 600, 700, 800, 1000, 2000]:
    urls = alexa_1m[:n_urls]
    results = dict()
    errors = dict()
    start_time = time.time()
    trio.run(main, urls)
    total_time = time.time() - start_time
    assert len(results) + len(errors) == len(urls)
    print(f'time {total_time:>8.4f} | RPS {(n_urls / total_time):>8.4f} | URLS {len(urls):>6} | RESULTS {len(results):>6} | ERRORS {len(errors):>6}')
output for connections=10
time 16.8685 | RPS 0.5928 | URLS 10 | RESULTS 9 | ERRORS 1
time 10.9119 | RPS 2.2911 | URLS 25 | RESULTS 25 | ERRORS 0
time 17.8106 | RPS 2.8073 | URLS 50 | RESULTS 49 | ERRORS 1
time 19.0452 | RPS 3.9380 | URLS 75 | RESULTS 71 | ERRORS 4
time 19.1133 | RPS 5.2320 | URLS 100 | RESULTS 91 | ERRORS 9
time 18.1323 | RPS 8.2725 | URLS 150 | RESULTS 140 | ERRORS 10
time 27.2238 | RPS 7.3465 | URLS 200 | RESULTS 190 | ERRORS 10
time 33.8300 | RPS 7.3899 | URLS 250 | RESULTS 240 | ERRORS 10
time 40.0035 | RPS 7.4994 | URLS 300 | RESULTS 287 | ERRORS 13
time 67.6925 | RPS 7.3863 | URLS 500 | RESULTS 475 | ERRORS 25
time 78.3422 | RPS 7.6587 | URLS 600 | RESULTS 568 | ERRORS 32
time 98.5214 | RPS 7.1051 | URLS 700 | RESULTS 658 | ERRORS 42
time 118.0234 | RPS 6.7783 | URLS 800 | RESULTS 747 | ERRORS 53
time 154.7543 | RPS 6.4619 | URLS 1000 | RESULTS 928 | ERRORS 72
time 356.5015 | RPS 5.6101 | URLS 2000 | RESULTS 1813 | ERRORS 187
time 571.1392 | RPS 5.2527 | URLS 3000 | RESULTS 2699 | ERRORS 301
output for connections=20
time 3.1848 | RPS 3.1399 | URLS 10 | RESULTS 10 | ERRORS 0
time 22.6056 | RPS 1.1059 | URLS 25 | RESULTS 24 | ERRORS 1
time 15.0470 | RPS 3.3229 | URLS 50 | RESULTS 49 | ERRORS 1
time 16.8384 | RPS 4.4541 | URLS 75 | RESULTS 72 | ERRORS 3
time 18.2071 | RPS 5.4924 | URLS 100 | RESULTS 90 | ERRORS 10
time 20.6861 | RPS 7.2512 | URLS 150 | RESULTS 140 | ERRORS 10
time 21.7591 | RPS 9.1915 | URLS 200 | RESULTS 190 | ERRORS 10
time 25.2696 | RPS 9.8933 | URLS 250 | RESULTS 239 | ERRORS 11
time 31.1732 | RPS 9.6237 | URLS 300 | RESULTS 285 | ERRORS 15
time 51.9217 | RPS 9.6299 | URLS 500 | RESULTS 471 | ERRORS 29
time 53.7469 | RPS 11.1634 | URLS 600 | RESULTS 567 | ERRORS 33
time 58.8306 | RPS 11.8986 | URLS 700 | RESULTS 657 | ERRORS 43
time 65.4025 | RPS 12.2320 | URLS 800 | RESULTS 744 | ERRORS 56
time 96.4105 | RPS 10.3723 | URLS 1000 | RESULTS 926 | ERRORS 74
time 225.2461 | RPS 8.8792 | URLS 2000 | RESULTS 1813 | ERRORS 187
output for connections=100
time 4.1067 | RPS 2.4350 | URLS 10 | RESULTS 10 | ERRORS 0
time 16.7758 | RPS 1.4902 | URLS 25 | RESULTS 24 | ERRORS 1
time 17.3520 | RPS 2.8815 | URLS 50 | RESULTS 47 | ERRORS 3
time 16.0240 | RPS 4.6805 | URLS 75 | RESULTS 71 | ERRORS 4
time 16.0592 | RPS 6.2270 | URLS 100 | RESULTS 90 | ERRORS 10
time 19.6826 | RPS 7.6209 | URLS 150 | RESULTS 138 | ERRORS 12
time 20.9570 | RPS 9.5433 | URLS 200 | RESULTS 189 | ERRORS 11
time 22.2123 | RPS 11.2550 | URLS 250 | RESULTS 239 | ERRORS 11
time 28.9885 | RPS 10.3489 | URLS 300 | RESULTS 285 | ERRORS 15
time 45.7558 | RPS 10.9276 | URLS 500 | RESULTS 468 | ERRORS 32
time 40.8652 | RPS 14.6824 | URLS 600 | RESULTS 565 | ERRORS 35
time 48.3942 | RPS 14.4645 | URLS 700 | RESULTS 656 | ERRORS 44
time 56.0546 | RPS 14.2718 | URLS 800 | RESULTS 744 | ERRORS 56
time 67.9813 | RPS 14.7099 | URLS 1000 | RESULTS 924 | ERRORS 76
time 205.3066 | RPS 9.7415 | URLS 2000 | RESULTS 1727 | ERRORS 273
time 275.1011 | RPS 10.9051 | URLS 3000 | RESULTS 2572 | ERRORS 428
output for connections=1000
time 2.7290 | RPS 3.6643 | URLS 10 | RESULTS 10 | ERRORS 0
time 15.4174 | RPS 1.6215 | URLS 25 | RESULTS 24 | ERRORS 1
time 15.4496 | RPS 3.2363 | URLS 50 | RESULTS 48 | ERRORS 2
time 16.3329 | RPS 4.5920 | URLS 75 | RESULTS 70 | ERRORS 5
time 15.7269 | RPS 6.3585 | URLS 100 | RESULTS 90 | ERRORS 10
time 16.8205 | RPS 8.9177 | URLS 150 | RESULTS 139 | ERRORS 11
time 15.9112 | RPS 12.5697 | URLS 200 | RESULTS 190 | ERRORS 10
time 16.2899 | RPS 15.3469 | URLS 250 | RESULTS 240 | ERRORS 10
time 16.4773 | RPS 18.2069 | URLS 300 | RESULTS 286 | ERRORS 14
time 35.9516 | RPS 13.9076 | URLS 500 | RESULTS 133 | ERRORS 367
time 21.9307 | RPS 27.3589 | URLS 600 | RESULTS 226 | ERRORS 374
time 25.0500 | RPS 27.9441 | URLS 700 | RESULTS 396 | ERRORS 304
time 51.7007 | RPS 15.4737 | URLS 800 | RESULTS 93 | ERRORS 707
time 54.0819 | RPS 18.4905 | URLS 1000 | RESULTS 98 | ERRORS 902
time 171.3959 | RPS 11.6689 | URLS 2000 | RESULTS 206 | ERRORS 1794
output for connections=10000
time 11.5818 | RPS 0.8634 | URLS 10 | RESULTS 10 | ERRORS 0
time 9.3749 | RPS 2.6667 | URLS 25 | RESULTS 25 | ERRORS 0
time 15.1868 | RPS 3.2923 | URLS 50 | RESULTS 49 | ERRORS 1
time 15.1500 | RPS 4.9505 | URLS 75 | RESULTS 72 | ERRORS 3
time 15.7089 | RPS 6.3658 | URLS 100 | RESULTS 91 | ERRORS 9
time 19.6824 | RPS 7.6210 | URLS 150 | RESULTS 139 | ERRORS 11
time 16.5324 | RPS 12.0975 | URLS 200 | RESULTS 190 | ERRORS 10
time 19.9391 | RPS 12.5382 | URLS 250 | RESULTS 238 | ERRORS 12
time 20.4344 | RPS 14.6811 | URLS 300 | RESULTS 284 | ERRORS 16
time 32.1311 | RPS 15.5612 | URLS 500 | RESULTS 376 | ERRORS 124
time 26.8989 | RPS 22.3057 | URLS 600 | RESULTS 363 | ERRORS 237
time 41.2159 | RPS 16.9837 | URLS 700 | RESULTS 176 | ERRORS 524
time 48.1362 | RPS 16.6195 | URLS 800 | RESULTS 83 | ERRORS 717
time 55.6773 | RPS 17.9606 | URLS 1000 | RESULTS 114 | ERRORS 886
time 130.0663 | RPS 15.3768 | URLS 2000 | RESULTS 244 | ERRORS 1756
With many tasks and many connections, most of my requests fail with RequestTimeout and gaierror:
import itertools
list(itertools.islice(errors.items(), 100))
output
[
('ggpht.com', 'gaierror'),
('ilivid.com', 'gaierror'),
('lpmxbox600.com', 'gaierror'),
('matlabgah.com', 'gaierror'),
('palxxx.com', 'gaierror'),
('ytimg.com', 'gaierror'),
('paipai.com', 'gaierror'),
('hotspotshield.com', 'OSError'),
('icmwebserv.com', 'gaierror'),
('bgr.com', 'BadHttpResponse'),
('directrev.com', 'gaierror'),
('workercn.cn', 'gaierror'),
('axisbank.co.in', 'gaierror'),
('intentmedia.net', 'gaierror'),
('reliancebroadband.co.in', 'RequestTimeout'),
('v9.com', 'RequestTimeout'),
('tebyan.net', 'RequestTimeout'),
('asriran.com', 'RequestTimeout'),
('akairan.com', 'RequestTimeout'),
('tuolar.com', 'gaierror'),
('thomann.de', 'RequestTimeout'),
('unian.net', 'BadHttpResponse'),
('rr.com', 'RequestTimeout'),
('multitran.ru', 'BadHttpResponse'),
('chexun.com', 'OSError'),
('geocities.jp', 'gaierror'),
('plaintube.com', 'RequestTimeout'),
('rei.com', 'RequestTimeout'),
('ldblog.jp', 'gaierror'),
('dfiles.ru', 'SSLError'),
('shahrekhabar.com', 'RequestTimeout'),
('asos.com', 'RequestTimeout'),
('yjc.ir', 'RequestTimeout'),
('mihanblog.com', 'RequestTimeout'),
('sportsdirect.com', 'RequestTimeout'),
('mmgp.ru', 'RequestTimeout'),
('cloob.com', 'RequestTimeout'),
('alluc.to', 'OSError'),
('postimg.org', 'gaierror'),
('sockshare.com', 'RequestTimeout'),
('khabarpu.com', 'RequestTimeout'),
('mashreghnews.ir', 'RequestTimeout'),
('chinabroadcast.cn', 'RequestTimeout'),
('linksynergy.com', 'gaierror'),
('akamaihd.net', 'gaierror'),
('bmi.ir', 'RequestTimeout'),
('cartfill.in', 'gaierror'),
('cocolog-nifty.com', 'gaierror'),
('extra.com.br', 'RequestTimeout'),
('installerapplicationusa.com', 'gaierror'),
('chinanews.com', 'BadHttpResponse'),
('taobaocdn.com', 'RequestTimeout'),
('sweetim.com', 'gaierror'),
('timesjobs.com', 'RequestTimeout'),
('persianblog.ir', 'RequestTimeout'),
('haivl.com', 'RequestTimeout'),
('shaparak.ir', 'RequestTimeout'),
('rozblog.com', 'RequestTimeout'),
('statscrop.com', 'RequestTimeout'),
('pgmediaserve.com', 'gaierror'),
('xhamster.com/user/video', 'BadHttpResponse'),
('mysearchresults.com', 'RequestTimeout'),
('downloadquick.net', 'gaierror'),
('alimama.com', 'RequestTimeout'),
('bodybuilding.com', 'RequestTimeout'),
('sergey-mavrodi.com', 'RequestTimeout'),
('societe.com', 'RequestTimeout'),
('series.ly', 'RequestTimeout'),
('daum.net', 'RequestTimeout'),
('myfreshnet.com', 'gaierror'),
('archive.today', 'RequestTimeout'),
('sweet-page.com', 'SSLError'),
('shop.com', 'RequestTimeout'),
('nasdaq.com', 'RequestTimeout'),
('tvrain.ru', 'BadHttpResponse'),
('tsetmc.com', 'RequestTimeout'),
('delta-homes.com', 'RequestTimeout'),
('seemorgh.com', 'RequestTimeout'),
('inetglobal.com', 'RequestTimeout'),
('medu.ir', 'RequestTimeout'),
('readmanga.eu', 'RequestTimeout'),
('goo.ne.jp', 'RequestTimeout'),
('indiegogo.com', 'RequestTimeout'),
('lpcloudbox328.com', 'gaierror'),
('secureinternetbank.com', 'gaierror'),
('picofile.com', 'RequestTimeout'),
('styletv.com.cn', 'RequestTimeout'),
('tv.com', 'RequestTimeout'),
('extratorrent.cc', 'RequestTimeout'),
('a8.net', 'RequestTimeout'),
('livedoor.biz', 'gaierror'),
('adk2.com', 'RequestTimeout'),
('cmbchina.com', 'RequestTimeout'),
('gruposantander.es', 'gaierror'),
('beamtele.com', 'gaierror'),
('ppstream.com', 'RequestTimeout'),
('icicibank.co.in', 'RequestTimeout'),
('bartarinha.ir', 'RequestTimeout'),
('theblaze.com', 'RequestTimeout'),
('americanas.com.br', 'RequestTimeout')
]
Any advice on how to tune these parameters so that the URLs download faster with fewer errors?

- the number of tasks/URLs passed to trio.run(main, urls) (would it be better to break them into chunks?)
- the number of connections (is this limited only by my hardware? is 1000 too big? 10000?)
- timeout and connection_timeout

> the number of tasks/URLs passed to trio.run(main, urls) (would it be better to break them into chunks?)
I don't think there's any need to break it into chunks... Usually the simplest approach is to have a single trio.run at the start of your program that switches into "trio mode", and then stay in trio mode from then on.
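For illustration, a minimal sketch of that pattern, reusing main() and the module-level results/errors dicts from the question (the async_main name is just a placeholder, not anything from trio or asks):

async def async_main():
    # the whole program runs inside one trio.run; loop over batches here
    # instead of restarting trio for each batch
    global results, errors
    for n_urls in [10, 100, 1000]:
        results, errors = dict(), dict()
        await main(alexa_1m[:n_urls])  # reuse main() from the question
        print(n_urls, len(results), len(errors))

trio.run(async_main)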
> the number of connections (is this limited only by my hardware? is 1000 too big? 10000?)
This is impossible to predict; it will depend on your network connection and all sorts of other things. I think the best you can do is what you're already doing: try some different values and see.
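If you want to experiment with the concurrency cap independently of the asks session pool, one option (a sketch, assuming the fetch() function from the question) is trio's built-in CapacityLimiter:

limiter = trio.CapacityLimiter(100)  # 100 is an arbitrary value to tune

async def fetch_limited(s, url):
    # each task waits here until one of the 100 "slots" is free,
    # so at most 100 fetches are ever in flight at once
    async with limiter:
        await fetch(s, url)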
> timeout and connection_timeout
Again, you'll probably want to tune these empirically. Too long, and you waste ages waiting on connections that are never going to respond. Too short, and you cut off connections that would have worked.
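As a point of comparison while experimenting (my own sketch, not something from the answer): trio can also enforce one overall per-request deadline with a cancel scope, independent of asks' own timeout parameters:

async def fetch_with_deadline(s, url, seconds=15):
    # a cancel scope puts a single deadline over DNS lookup, connect,
    # and body download together
    with trio.move_on_after(seconds) as scope:
        response = await s.get('http://' + url)
        results[url] = len(response.text)
    if scope.cancelled_caught:
        errors[url] = 'Timeout'  # the deadline was hit and the request cancelled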
Some level of timeouts and DNS lookup failures is unavoidable when you're trying to saturate the network like this. I think the best approach is to add some retry logic: if the first attempt fails, try again up to three more times, or something like that.
The tenacity library is good for this. Here's an example of using tenacity and trio together in a real project: https://github.com/pypa/linehaul/blob/89ed128deb714827f732d0404d4d664ee4fc1634/linehaul/server.py
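For reference, a hand-rolled version of the retry idea (a minimal sketch that skips tenacity; the attempts parameter and backoff choice are arbitrary, and it assumes the same results/errors dicts as the question):

async def fetch_with_retries(s, url, attempts=4):
    for attempt in range(attempts):
        try:
            response = await s.get('http://' + url, timeout=15, connection_timeout=15)
            results[url] = len(response.text)
            return  # success, stop retrying
        # catch Exception rather than BaseException so trio's Cancelled
        # exception can still propagate
        except Exception as e:
            if attempt == attempts - 1:
                errors[url] = type(e).__name__  # out of attempts, record the error
            else:
                await trio.sleep(2 ** attempt)  # simple exponential backoff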