基本上我想这样做:
curl --retry 10 --retry-all-errors --remote-name-all --parallel --parallel-max 150 "https://api.pwnedpasswords.com/range/000{0,1,2,3}{0,1,2,3,4,5,6,7,8,9,A,B,C,D,E,F}" > curl.log 2>&1
(注:上述“疯狂并行”的 curl 命令已得到 Have I Been Pwned(HIBP)密码数据库提供方的正式认可,因此这并不构成 DoS 攻击!)
上面检索了 64 个文本文件,每个文件约 32kB。在具有 Gbit 互联网连接的廉价虚拟机上,这只需要大约 0.2 秒。太棒了。
我想做一些非常类似的事情,但使用
libcurl
以编程方式作为 C++ 应用程序的一部分。
我从官方网站上的这段示例代码入手。
下面是该页面代码的逐字副本,未作任何改动。
/***************************************************************************
* _ _ ____ _
* Project ___| | | | _ \| |
* / __| | | | |_) | |
* | (__| |_| | _ <| |___
* \___|\___/|_| \_\_____|
*
* Copyright (C) Daniel Stenberg, <[email protected]>, et al.
*
* This software is licensed as described in the file COPYING, which
* you should have received as part of this distribution. The terms
* are also available at https://curl.se/docs/copyright.html.
*
* You may opt to use, copy, modify, merge, publish, distribute and/or sell
* copies of the Software, and permit persons to whom the Software is
* furnished to do so, under the terms of the COPYING file.
*
* This software is distributed on an "AS IS" basis, WITHOUT WARRANTY OF ANY
* KIND, either express or implied.
*
* SPDX-License-Identifier: curl
*
***************************************************************************/
/* <DESC>
* multi_socket API using libevent
* </DESC>
*/
#include <stdio.h>
#include <stdlib.h>
#include <event2/event.h>
#include <curl/curl.h>
/* libevent event base: socket and timer events are created on it and main()
   dispatches it. */
struct event_base *base;
/* The single libcurl multi handle; every download is added to it. */
CURLM *curl_handle;
/* Timer event created in main(), (re)armed by start_timeout() and serviced by
   on_timeout(). */
struct event *timeout;
/* Per-socket bookkeeping. One instance is created by create_curl_context()
   for each socket and attached to it with curl_multi_assign(), so that later
   socket callbacks receive it back as `socketp`. */
typedef struct curl_context_s {
/* libevent event watching `sockfd`; its callback is curl_perform(). */
struct event *event;
/* The socket this context belongs to. */
curl_socket_t sockfd;
} curl_context_t;
/* Forward declaration: create_curl_context() registers curl_perform() as an
   event callback before the function is defined. */
static void curl_perform(int fd, short event, void *arg);
/*
 * Allocate the per-socket context for `sockfd` together with the libevent
 * event that watches that socket on the global `base`.
 *
 * The event is created with an empty event mask and is not added to the
 * loop here; handle_socket() sets the real mask with event_assign() and
 * then adds it.
 *
 * Returns the new context, or NULL if the context or its event could not be
 * allocated. The result is released with destroy_curl_context().
 */
static curl_context_t *create_curl_context(curl_socket_t sockfd)
{
  curl_context_t *context = malloc(sizeof(*context));

  if(!context)
    return NULL;

  context->sockfd = sockfd;
  context->event = event_new(base, sockfd, 0, curl_perform, context);
  if(!context->event) {
    /* Do not hand out a half-built context. */
    free(context);
    return NULL;
  }
  return context;
}
/*
 * Tear down a context obtained from create_curl_context(): take its event
 * out of the loop, free the event, then free the context itself.
 */
static void destroy_curl_context(curl_context_t *context)
{
  struct event *watcher = context->event;

  event_del(watcher);
  event_free(watcher);
  free(context);
}
static void add_download(const char *url, int num)
{
char filename[50];
FILE *file;
CURL *handle;
snprintf(filename, 50, "%d.download", num);
file = fopen(filename, "wb");
if(!file) {
fprintf(stderr, "Error opening %s\n", filename);
return;
}
handle = curl_easy_init();
curl_easy_setopt(handle, CURLOPT_WRITEDATA, file);
curl_easy_setopt(handle, CURLOPT_PRIVATE, file);
curl_easy_setopt(handle, CURLOPT_URL, url);
curl_multi_add_handle(curl_handle, handle);
fprintf(stderr, "Added download %s -> %s\n", url, filename);
}
static void check_multi_info(void)
{
char *done_url;
CURLMsg *message;
int pending;
CURL *easy_handle;
FILE *file;
while((message = curl_multi_info_read(curl_handle, &pending))) {
switch(message->msg) {
case CURLMSG_DONE:
/* Do not use message data after calling curl_multi_remove_handle() and
curl_easy_cleanup(). As per curl_multi_info_read() docs:
"WARNING: The data the returned pointer points to does not survive
calling curl_multi_cleanup, curl_multi_remove_handle or
curl_easy_cleanup." */
easy_handle = message->easy_handle;
curl_easy_getinfo(easy_handle, CURLINFO_EFFECTIVE_URL, &done_url);
curl_easy_getinfo(easy_handle, CURLINFO_PRIVATE, &file);
printf("%s DONE\n", done_url);
curl_multi_remove_handle(curl_handle, easy_handle);
curl_easy_cleanup(easy_handle);
if(file) {
fclose(file);
}
break;
default:
fprintf(stderr, "CURLMSG default\n");
break;
}
}
}
/*
 * libevent callback for a watched socket.
 *
 * Translates the libevent readiness bits in `event` into the matching
 * CURL_CSELECT_* bits, passes them to libcurl for the socket stored in the
 * context (`arg`), then processes any transfers that completed.
 */
static void curl_perform(int fd, short event, void *arg)
{
  curl_context_t *ctx = arg;
  int select_bits = ((event & EV_READ) ? CURL_CSELECT_IN : 0) |
                    ((event & EV_WRITE) ? CURL_CSELECT_OUT : 0);
  int still_running;

  curl_multi_socket_action(curl_handle, ctx->sockfd, select_bits,
                           &still_running);
  check_multi_info();
}
/*
 * libevent callback for the global `timeout` timer: tells libcurl that its
 * timeout expired (CURL_SOCKET_TIMEOUT, no select bits) and then processes
 * any transfers that completed as a result.
 */
static void on_timeout(evutil_socket_t fd, short events, void *arg)
{
  int still_running;

  /* The callback arguments carry nothing this handler needs. */
  (void)fd;
  (void)events;
  (void)arg;

  curl_multi_socket_action(curl_handle, CURL_SOCKET_TIMEOUT, 0,
                           &still_running);
  check_multi_info();
}
/*
 * Timer callback installed with CURLMOPT_TIMERFUNCTION.
 *
 * timeout_ms < 0   cancel the global `timeout` timer.
 * timeout_ms == 0  treated as 1 ms (0 means call socket_action asap).
 * timeout_ms > 0   re-arm the timer to fire after that many milliseconds.
 *
 * Always returns 0.
 */
static int start_timeout(CURLM *multi, long timeout_ms, void *userp)
{
  struct timeval delay;

  /* Whatever was scheduled before is cancelled in every case. */
  evtimer_del(timeout);

  if(timeout_ms < 0)
    return 0;

  if(timeout_ms == 0)
    timeout_ms = 1; /* 0 means call socket_action asap */

  delay.tv_sec = timeout_ms / 1000;
  delay.tv_usec = (timeout_ms % 1000) * 1000;
  evtimer_add(timeout, &delay);
  return 0;
}
/*
 * Socket callback installed with CURLMOPT_SOCKETFUNCTION.
 *
 * easy     Easy handle the socket belongs to (unused).
 * s        The socket libcurl wants watched or forgotten.
 * action   One of CURL_POLL_IN, CURL_POLL_OUT, CURL_POLL_INOUT or
 *          CURL_POLL_REMOVE; anything else aborts the program.
 * userp    Unused.
 * socketp  The context previously attached to `s` with curl_multi_assign(),
 *          or NULL if none is attached yet.
 *
 * Returns 0 on success, or -1 if a context for a new socket could not be
 * created.
 * NOTE(review): libcurl 7.81.0 and later are believed to treat -1 from this
 * callback as "abort all transfers" - confirm against the targeted version.
 */
static int handle_socket(CURL *easy, curl_socket_t s, int action, void *userp,
                         void *socketp)
{
  curl_context_t *curl_context;
  int events = 0;

  switch(action) {
  case CURL_POLL_IN:
  case CURL_POLL_OUT:
  case CURL_POLL_INOUT:
    /* Reuse the context already attached to this socket, if any. */
    curl_context = socketp ?
      (curl_context_t *) socketp : create_curl_context(s);
    if(!curl_context) {
      fprintf(stderr, "Could not allocate socket context\n");
      return -1;
    }

    curl_multi_assign(curl_handle, s, (void *) curl_context);

    if(action != CURL_POLL_IN)
      events |= EV_WRITE;
    if(action != CURL_POLL_OUT)
      events |= EV_READ;

    events |= EV_PERSIST;

    /* Remove the event before event_assign() gives it the new mask. */
    event_del(curl_context->event);
    event_assign(curl_context->event, base, curl_context->sockfd, events,
                 curl_perform, curl_context);
    event_add(curl_context->event, NULL);
    break;

  case CURL_POLL_REMOVE:
    if(socketp) {
      /* destroy_curl_context() removes the event from the loop itself, so no
         separate event_del() is needed here. */
      destroy_curl_context((curl_context_t *) socketp);
      curl_multi_assign(curl_handle, s, NULL);
    }
    break;

  default:
    abort();
  }

  return 0;
}
/*
 * Download every URL given on the command line into "<index>.download",
 * where <index> is the argument's position in argv, using libcurl's
 * multi_socket API with a libevent loop.
 *
 * Returns 0 when no URL was given or after the event loop has finished, and
 * 1 if libcurl or libevent could not be initialised.
 */
int main(int argc, char **argv)
{
  int rc = 1;

  if(argc <= 1)
    return 0;

  if(curl_global_init(CURL_GLOBAL_ALL)) {
    fprintf(stderr, "Could not init curl\n");
    return 1;
  }

  base = event_base_new();
  if(!base) {
    fprintf(stderr, "Could not create event base\n");
    goto out_global;
  }

  timeout = evtimer_new(base, on_timeout, NULL);
  if(!timeout) {
    fprintf(stderr, "Could not create timer event\n");
    goto out_base;
  }

  curl_handle = curl_multi_init();
  if(!curl_handle) {
    fprintf(stderr, "Could not create multi handle\n");
    goto out_timer;
  }

  curl_multi_setopt(curl_handle, CURLMOPT_SOCKETFUNCTION, handle_socket);
  curl_multi_setopt(curl_handle, CURLMOPT_TIMERFUNCTION, start_timeout);

  /* Walks argv from the last argument down to argv[1]; the argument index
     doubles as the download number. */
  while(argc-- > 1) {
    add_download(argv[argc], argc);
  }

  event_base_dispatch(base);
  rc = 0;

  curl_multi_cleanup(curl_handle);
out_timer:
  event_free(timeout);
out_base:
  event_base_free(base);
out_global:
  libevent_global_shutdown();
  curl_global_cleanup();

  return rc;
}
我将上面的代码编译如下:
gcc -O3 -Wall -Wextra -Wno-unused-parameter -std=c11 -o multi multi.c -lcurl -levent
我使用的是 ubuntu 24.04,使用来自官方仓库的
libcurl
和 libevent
。
如果我将与
argv
相同的 64 个 url 传递给该程序,如下所示:
./multi \
"https://api.pwnedpasswords.com/range/00000" \
"https://api.pwnedpasswords.com/range/00001" \
"https://api.pwnedpasswords.com/range/00002" \
"https://api.pwnedpasswords.com/range/00003" \
"https://api.pwnedpasswords.com/range/00004" \
"https://api.pwnedpasswords.com/range/00005" \
"https://api.pwnedpasswords.com/range/00006" \
"https://api.pwnedpasswords.com/range/00007" \
"https://api.pwnedpasswords.com/range/00008" \
"https://api.pwnedpasswords.com/range/00009" \
"https://api.pwnedpasswords.com/range/0000A" \
"https://api.pwnedpasswords.com/range/0000B" \
"https://api.pwnedpasswords.com/range/0000C" \
"https://api.pwnedpasswords.com/range/0000D" \
"https://api.pwnedpasswords.com/range/0000E" \
"https://api.pwnedpasswords.com/range/0000F" \
"https://api.pwnedpasswords.com/range/00010" \
"https://api.pwnedpasswords.com/range/00011" \
"https://api.pwnedpasswords.com/range/00012" \
"https://api.pwnedpasswords.com/range/00013" \
"https://api.pwnedpasswords.com/range/00014" \
"https://api.pwnedpasswords.com/range/00015" \
"https://api.pwnedpasswords.com/range/00016" \
"https://api.pwnedpasswords.com/range/00017" \
"https://api.pwnedpasswords.com/range/00018" \
"https://api.pwnedpasswords.com/range/00019" \
"https://api.pwnedpasswords.com/range/0001A" \
"https://api.pwnedpasswords.com/range/0001B" \
"https://api.pwnedpasswords.com/range/0001C" \
"https://api.pwnedpasswords.com/range/0001D" \
"https://api.pwnedpasswords.com/range/0001E" \
"https://api.pwnedpasswords.com/range/0001F" \
"https://api.pwnedpasswords.com/range/00020" \
"https://api.pwnedpasswords.com/range/00021" \
"https://api.pwnedpasswords.com/range/00022" \
"https://api.pwnedpasswords.com/range/00023" \
"https://api.pwnedpasswords.com/range/00024" \
"https://api.pwnedpasswords.com/range/00025" \
"https://api.pwnedpasswords.com/range/00026" \
"https://api.pwnedpasswords.com/range/00027" \
"https://api.pwnedpasswords.com/range/00028" \
"https://api.pwnedpasswords.com/range/00029" \
"https://api.pwnedpasswords.com/range/0002A" \
"https://api.pwnedpasswords.com/range/0002B" \
"https://api.pwnedpasswords.com/range/0002C" \
"https://api.pwnedpasswords.com/range/0002D" \
"https://api.pwnedpasswords.com/range/0002E" \
"https://api.pwnedpasswords.com/range/0002F" \
"https://api.pwnedpasswords.com/range/00030" \
"https://api.pwnedpasswords.com/range/00031" \
"https://api.pwnedpasswords.com/range/00032" \
"https://api.pwnedpasswords.com/range/00033" \
"https://api.pwnedpasswords.com/range/00034" \
"https://api.pwnedpasswords.com/range/00035" \
"https://api.pwnedpasswords.com/range/00036" \
"https://api.pwnedpasswords.com/range/00037" \
"https://api.pwnedpasswords.com/range/00038" \
"https://api.pwnedpasswords.com/range/00039" \
"https://api.pwnedpasswords.com/range/0003A" \
"https://api.pwnedpasswords.com/range/0003B" \
"https://api.pwnedpasswords.com/range/0003C" \
"https://api.pwnedpasswords.com/range/0003D" \
"https://api.pwnedpasswords.com/range/0003E" \
"https://api.pwnedpasswords.com/range/0003F" \
;
文件都能正常获取,但整个过程需要 3 秒。
top
显示 CPU 占用为 100%,也就是说程序受限于 CPU(CPU-bound)。
比上面的 curl 命令行慢了 15 倍。
这使得使用它变得不可行。我需要检索 100 万个这样的文件。
这篇文章建议基于事件的
curl_multi
是最快的。所以我选择这个使用libevent的例子。 libev 会更好。
我检查了
curl_multi
选项,以确认连接池确实生效(所有 URL 都位于同一域名下等),但没有发现任何迹象表明连接池没有生效。上述网址的服务器支持基于 TLS 1.3 的 HTTP/2。
curl --parallel --parallel-max 150
内部在做什么以及如何使用 libcurl 重现这种性能?
回答我自己的问题...
用
perf
做了一些分析后发现,这个占用 100% CPU 的进程几乎把所有时间都花在了协商加密连接上。
启用 CURLOPT_VERBOSE 显示它正在为每次下载打开一个单独的加密连接。
与
curl --parallel --verbose
的情况进行比较,发现 curl
仅协商单个连接,然后进行 HTTP2 多路复用流传输,正如我所怀疑的那样。
一些实验表明设置
CURLMOPT_MAX_CONCURRENT_STREAMS = 1
从而将curl_multi限制为单个加密连接,强制libcurl通过HTTP2复用流。
所以现在 libcurl 提供与
curl --parallel
非常相似的性能。
可能还需要进一步调整
CURLMOPT_MAX_CONCURRENT_STREAMS
的取值,使其与 --parallel-max=150 相匹配,以便优化完整规模的下载。