如何从 libcurl 中获得类似于带有 globbing 的 `curl --parallel` 的性能

问题描述 投票:0回答:1

基本上我想这样做:

curl --retry 10 --retry-all-errors --remote-name-all --parallel --parallel-max 150 "https://api.pwnedpasswords.com/range/000{0,1,2,3}{0,1,2,3,4,5,6,7,8,9,A,B,C,D,E,F}" > curl.log 2>&1                             

(注:上述“疯狂并行”

curl
命令已得到 Have I Been Pwned 密码数据库提供商的正式认可,因此这并不构成 DoS 攻击!)

上面检索了 64 个文本文件,每个文件约 32kB。在具有 Gbit 互联网连接的廉价虚拟机上,这只需要大约 0.2 秒。太棒了。

我想做一些非常类似的事情,但使用

libcurl
以编程方式作为 C++ 应用程序的一部分。

我从官方网站的此示例代码开始。

该页面的代码逐字副本。没有变化。

/***************************************************************************
 *                                  _   _ ____  _
 *  Project                     ___| | | |  _ \| |
 *                             / __| | | | |_) | |
 *                            | (__| |_| |  _ <| |___
 *                             \___|\___/|_| \_\_____|
 *
 * Copyright (C) Daniel Stenberg, <[email protected]>, et al.
 *
 * This software is licensed as described in the file COPYING, which
 * you should have received as part of this distribution. The terms
 * are also available at https://curl.se/docs/copyright.html.
 *
 * You may opt to use, copy, modify, merge, publish, distribute and/or sell
 * copies of the Software, and permit persons to whom the Software is
 * furnished to do so, under the terms of the COPYING file.
 *
 * This software is distributed on an "AS IS" basis, WITHOUT WARRANTY OF ANY
 * KIND, either express or implied.
 *
 * SPDX-License-Identifier: curl
 *
 ***************************************************************************/

/* <DESC>
 * multi_socket API using libevent
 * </DESC>
 */

#include <stdio.h>
#include <stdlib.h>
#include <event2/event.h>
#include <curl/curl.h>

/* libevent event base; every socket and timer event is registered on it and
   main() dispatches it */
struct event_base *base;
/* libcurl multi handle that all easy transfers are added to */
CURLM *curl_handle;
/* timer event: armed or disarmed by start_timeout(), fires on_timeout() */
struct event *timeout;

/* Per-socket bookkeeping: pairs a socket handed out by libcurl with the
   libevent event that watches it. Attached to the socket with
   curl_multi_assign() in handle_socket(). */
typedef struct curl_context_s {
  struct event *event;   /* libevent event registered for sockfd */
  curl_socket_t sockfd;  /* the socket this context belongs to */
} curl_context_t;

/* Forward declaration: create_curl_context() registers this callback before
   its definition appears. */
static void curl_perform(int fd, short event, void *arg);

/*
 * Allocate the bookkeeping record for `sockfd` and create the libevent event
 * that will watch it.
 *
 * The event is created with an empty event mask; handle_socket() re-assigns
 * it with the real mask before adding it.
 *
 * Never returns NULL: running out of memory here leaves the program unable to
 * drive the transfer, so the failure is reported and the process aborts
 * instead of crashing later on a NULL dereference.
 *
 * The returned context is owned by the caller and released with
 * destroy_curl_context().
 */
static curl_context_t *create_curl_context(curl_socket_t sockfd)
{
  curl_context_t *context;

  context = (curl_context_t *) malloc(sizeof(*context));
  if(!context) {
    fprintf(stderr, "Out of memory allocating curl context\n");
    abort();
  }

  context->sockfd = sockfd;

  context->event = event_new(base, sockfd, 0, curl_perform, context);
  if(!context->event) {
    fprintf(stderr, "Could not create event for socket\n");
    abort();
  }

  return context;
}

/*
 * Release a context created by create_curl_context(): unregister its event,
 * free the event, then free the context itself.
 */
static void destroy_curl_context(curl_context_t *context)
{
  struct event *ev = context->event;

  /* make the event non-pending before it is freed */
  event_del(ev);
  event_free(ev);

  free(context);
}

static void add_download(const char *url, int num)
{
  char filename[50];
  FILE *file;
  CURL *handle;

  snprintf(filename, 50, "%d.download", num);

  file = fopen(filename, "wb");
  if(!file) {
    fprintf(stderr, "Error opening %s\n", filename);
    return;
  }

  handle = curl_easy_init();
  curl_easy_setopt(handle, CURLOPT_WRITEDATA, file);
  curl_easy_setopt(handle, CURLOPT_PRIVATE, file);
  curl_easy_setopt(handle, CURLOPT_URL, url);
  curl_multi_add_handle(curl_handle, handle);
  fprintf(stderr, "Added download %s -> %s\n", url, filename);
}

static void check_multi_info(void)
{
  char *done_url;
  CURLMsg *message;
  int pending;
  CURL *easy_handle;
  FILE *file;

  while((message = curl_multi_info_read(curl_handle, &pending))) {
    switch(message->msg) {
    case CURLMSG_DONE:
      /* Do not use message data after calling curl_multi_remove_handle() and
         curl_easy_cleanup(). As per curl_multi_info_read() docs:
         "WARNING: The data the returned pointer points to does not survive
         calling curl_multi_cleanup, curl_multi_remove_handle or
         curl_easy_cleanup." */
      easy_handle = message->easy_handle;

      curl_easy_getinfo(easy_handle, CURLINFO_EFFECTIVE_URL, &done_url);
      curl_easy_getinfo(easy_handle, CURLINFO_PRIVATE, &file);
      printf("%s DONE\n", done_url);

      curl_multi_remove_handle(curl_handle, easy_handle);
      curl_easy_cleanup(easy_handle);
      if(file) {
        fclose(file);
      }
      break;

    default:
      fprintf(stderr, "CURLMSG default\n");
      break;
    }
  }
}

/*
 * libevent callback for a socket watched on behalf of libcurl.
 *
 * Translates the libevent readiness bits in `event` into libcurl's
 * CURL_CSELECT_* bits, lets libcurl act on the socket stored in the context
 * passed as `arg`, then processes any transfers that finished.
 */
static void curl_perform(int fd, short event, void *arg)
{
  curl_context_t *ctx = (curl_context_t *) arg;
  int still_running;
  int ev_bitmask = ((event & EV_READ) ? CURL_CSELECT_IN : 0) |
                   ((event & EV_WRITE) ? CURL_CSELECT_OUT : 0);

  curl_multi_socket_action(curl_handle, ctx->sockfd, ev_bitmask,
                           &still_running);

  check_multi_info();
}

/*
 * libevent callback for the global timer event: tells libcurl that its
 * timeout expired, then processes any transfers that finished.
 */
static void on_timeout(evutil_socket_t fd, short events, void *arg)
{
  int still_running;

  curl_multi_socket_action(curl_handle, CURL_SOCKET_TIMEOUT, 0,
                           &still_running);

  check_multi_info();
}

/*
 * Timer callback installed with CURLMOPT_TIMERFUNCTION.
 *
 * timeout_ms < 0  : just cancel the timer.
 * timeout_ms == 0 : libcurl wants to be called as soon as possible; a 1 ms
 *                   timer is armed for that.
 * timeout_ms > 0  : (re)arm the timer for that many milliseconds.
 *
 * Always returns 0.
 */
static int start_timeout(CURLM *multi, long timeout_ms, void *userp)
{
  /* in every case the previously armed timer is dropped first */
  evtimer_del(timeout);

  if(timeout_ms >= 0) {
    long delay_ms = timeout_ms ? timeout_ms : 1;
    struct timeval tv;

    tv.tv_sec = delay_ms / 1000;
    tv.tv_usec = (delay_ms % 1000) * 1000;
    evtimer_add(timeout, &tv);
  }

  return 0;
}

/*
 * Socket callback installed with CURLMOPT_SOCKETFUNCTION.
 *
 * s:       socket libcurl is reporting on.
 * action:  CURL_POLL_IN / CURL_POLL_OUT / CURL_POLL_INOUT to (re)register the
 *          socket for those directions, CURL_POLL_REMOVE to stop watching it.
 *          Any other value aborts the program.
 * socketp: context previously attached with curl_multi_assign(), or NULL.
 *
 * Always returns 0.
 */
static int handle_socket(CURL *easy, curl_socket_t s, int action, void *userp,
                  void *socketp)
{
  curl_context_t *ctx = (curl_context_t *) socketp;
  int events;

  if(action == CURL_POLL_REMOVE) {
    if(ctx) {
      event_del(ctx->event);
      destroy_curl_context(ctx);
      curl_multi_assign(curl_handle, s, NULL);
    }
    return 0;
  }

  /* map libcurl's requested direction to a persistent libevent mask */
  if(action == CURL_POLL_IN)
    events = EV_READ | EV_PERSIST;
  else if(action == CURL_POLL_OUT)
    events = EV_WRITE | EV_PERSIST;
  else if(action == CURL_POLL_INOUT)
    events = EV_READ | EV_WRITE | EV_PERSIST;
  else
    abort();

  /* first time this socket is seen: create its context */
  if(!ctx)
    ctx = create_curl_context(s);

  curl_multi_assign(curl_handle, s, (void *) ctx);

  /* re-register the event with the new mask */
  event_del(ctx->event);
  event_assign(ctx->event, base, ctx->sockfd, events,
    curl_perform, ctx);
  event_add(ctx->event, NULL);

  return 0;
}

/*
 * Download every URL given on the command line in parallel.
 *
 * Each argument argv[i] is fetched into "<i>.download". Returns 0 when no URL
 * was given or once the event loop has finished, and 1 when libcurl or
 * libevent could not be set up.
 */
int main(int argc, char **argv)
{
  int rc = 1;

  if(argc <= 1)
    return 0;

  if(curl_global_init(CURL_GLOBAL_ALL)) {
    fprintf(stderr, "Could not init curl\n");
    return 1;
  }

  base = event_base_new();
  if(!base) {
    fprintf(stderr, "Could not create event base\n");
    goto out_global;
  }

  timeout = evtimer_new(base, on_timeout, NULL);
  if(!timeout) {
    fprintf(stderr, "Could not create timer event\n");
    goto out_base;
  }

  curl_handle = curl_multi_init();
  if(!curl_handle) {
    fprintf(stderr, "Could not init curl multi handle\n");
    goto out_timer;
  }

  curl_multi_setopt(curl_handle, CURLMOPT_SOCKETFUNCTION, handle_socket);
  curl_multi_setopt(curl_handle, CURLMOPT_TIMERFUNCTION, start_timeout);

  while(argc-- > 1) {
    add_download(argv[argc], argc);
  }

  /* runs until no events remain registered on the base */
  event_base_dispatch(base);
  rc = 0;

  /* the multi handle goes first, while the timer event and the base that the
     callbacks use still exist */
  curl_multi_cleanup(curl_handle);
out_timer:
  event_free(timeout);
out_base:
  event_base_free(base);
out_global:
  libevent_global_shutdown();
  curl_global_cleanup();

  return rc;
}

我将上面的代码编译如下:

gcc -O3 -Wall -Wextra -Wno-unused-parameter -std=c11 -o multi multi.c -lcurl -levent

我使用的是 ubuntu 24.04,使用来自官方仓库的

libcurl
libevent

如果我将与

argv
相同的 64 个 url 传递给该程序,如下所示:

./multi \
     "https://api.pwnedpasswords.com/range/00000" \
     "https://api.pwnedpasswords.com/range/00001" \
     "https://api.pwnedpasswords.com/range/00002" \
     "https://api.pwnedpasswords.com/range/00003" \
     "https://api.pwnedpasswords.com/range/00004" \
     "https://api.pwnedpasswords.com/range/00005" \
     "https://api.pwnedpasswords.com/range/00006" \
     "https://api.pwnedpasswords.com/range/00007" \
     "https://api.pwnedpasswords.com/range/00008" \
     "https://api.pwnedpasswords.com/range/00009" \
     "https://api.pwnedpasswords.com/range/0000A" \
     "https://api.pwnedpasswords.com/range/0000B" \
     "https://api.pwnedpasswords.com/range/0000C" \
     "https://api.pwnedpasswords.com/range/0000D" \
     "https://api.pwnedpasswords.com/range/0000E" \
     "https://api.pwnedpasswords.com/range/0000F" \
     "https://api.pwnedpasswords.com/range/00010" \
     "https://api.pwnedpasswords.com/range/00011" \
     "https://api.pwnedpasswords.com/range/00012" \
     "https://api.pwnedpasswords.com/range/00013" \
     "https://api.pwnedpasswords.com/range/00014" \
     "https://api.pwnedpasswords.com/range/00015" \
     "https://api.pwnedpasswords.com/range/00016" \
     "https://api.pwnedpasswords.com/range/00017" \
     "https://api.pwnedpasswords.com/range/00018" \
     "https://api.pwnedpasswords.com/range/00019" \
     "https://api.pwnedpasswords.com/range/0001A" \
     "https://api.pwnedpasswords.com/range/0001B" \
     "https://api.pwnedpasswords.com/range/0001C" \
     "https://api.pwnedpasswords.com/range/0001D" \
     "https://api.pwnedpasswords.com/range/0001E" \
     "https://api.pwnedpasswords.com/range/0001F" \
     "https://api.pwnedpasswords.com/range/00020" \
     "https://api.pwnedpasswords.com/range/00021" \
     "https://api.pwnedpasswords.com/range/00022" \
     "https://api.pwnedpasswords.com/range/00023" \
     "https://api.pwnedpasswords.com/range/00024" \
     "https://api.pwnedpasswords.com/range/00025" \
     "https://api.pwnedpasswords.com/range/00026" \
     "https://api.pwnedpasswords.com/range/00027" \
     "https://api.pwnedpasswords.com/range/00028" \
     "https://api.pwnedpasswords.com/range/00029" \
     "https://api.pwnedpasswords.com/range/0002A" \
     "https://api.pwnedpasswords.com/range/0002B" \
     "https://api.pwnedpasswords.com/range/0002C" \
     "https://api.pwnedpasswords.com/range/0002D" \
     "https://api.pwnedpasswords.com/range/0002E" \
     "https://api.pwnedpasswords.com/range/0002F" \
     "https://api.pwnedpasswords.com/range/00030" \
     "https://api.pwnedpasswords.com/range/00031" \
     "https://api.pwnedpasswords.com/range/00032" \
     "https://api.pwnedpasswords.com/range/00033" \
     "https://api.pwnedpasswords.com/range/00034" \
     "https://api.pwnedpasswords.com/range/00035" \
     "https://api.pwnedpasswords.com/range/00036" \
     "https://api.pwnedpasswords.com/range/00037" \
     "https://api.pwnedpasswords.com/range/00038" \
     "https://api.pwnedpasswords.com/range/00039" \
     "https://api.pwnedpasswords.com/range/0003A" \
     "https://api.pwnedpasswords.com/range/0003B" \
     "https://api.pwnedpasswords.com/range/0003C" \
     "https://api.pwnedpasswords.com/range/0003D" \
     "https://api.pwnedpasswords.com/range/0003E" \
     "https://api.pwnedpasswords.com/range/0003F" \
     ;


文件获取正常,但需要 3 秒。

top
显示 CPU 占用率为 100%,也就是说程序受 CPU 限制(CPU-bound)。

慢 15 倍。

这使得使用它变得不可行。我需要检索 100 万个这样的文件。

这篇文章建议基于事件的

curl_multi
是最快的。所以我选择这个使用libevent的例子。 libev 会更好。

我检查了

curl_multi
选项,以确认连接池(连接复用)已经生效(所有请求都指向同一个域名等),但没有发现任何迹象表明连接池未生效。上述 URL 的服务器提供基于 TLS 1.3 的 HTTP/2。

curl --parallel --parallel-max 150
内部在做什么以及如何使用 libcurl 重现这种性能?

c curl libcurl
1个回答
0
投票

回答我自己的问题...

perf
的一些调查表明,这个占用 100% CPU 的进程几乎把所有时间都花在了协商加密连接上。

启用 CURLOPT_VERBOSE 显示它正在为每次下载打开一个单独的加密连接。

与 curl --parallel --verbose
的情况进行比较,发现
curl
仅协商单个连接,然后通过 HTTP/2 多路复用流进行传输,正如我所怀疑的那样。

一些实验表明,设置

CURLMOPT_MAX_HOST_CONNECTIONS = 1

可将 curl_multi 限制为单个加密连接,从而强制 libcurl 通过 HTTP/2 多路复用各个流。

所以现在 libcurl 提供与

curl --parallel
非常相似的性能。

可能还需要额外调整

CURLMOPT_MAX_CONCURRENT_STREAMS

使其与 --parallel-max=150 相匹配,以优化完整规模的下载。

© www.soinside.com 2019 - 2024. All rights reserved.