为什么memcpy从堆到共享内存的性能这么差？

Question

我测试了 `memcpy` 从堆到堆以及从堆到共享内存（通过 [shm_open](https://www.man7.org/linux/man-pages/man3/shm_open.3.html) 创建）的性能。测试代码如下：

// shm_msg.hpp
#ifndef _SHM_MSG_HPP_
#define _SHM_MSG_HPP_

#include <fcntl.h>
#include <semaphore.h>
#include <stdio.h>
#include <stdlib.h>
#include <sys/mman.h>
#include <sys/stat.h>
#include <unistd.h>
#include <array>
#include <stdint.h>

/* Size in bytes of the payload buffer (100 MiB). The expression is
   parenthesized so the macro expands as a single value inside larger
   expressions such as `x / BUF_SIZE` or `x % BUF_SIZE`. */
#define BUF_SIZE (100 * 1024 * 1024) /* Maximum size */

/* Object laid out at the start of the shared-memory mapping by the test
   programs: three scalar fields surrounding one BUF_SIZE-byte payload. */
typedef struct shmbuf
{
    uint32_t a;
    double b;
    std::array<uint8_t, BUF_SIZE> data; /* payload that the benchmarks memcpy into */
    uint64_t c;
} shmbuf;

#endif // !_SHM_MSG_HPP_

// shm_openWriter.cpp
#include "shm_msg.hpp"
#include <iostream>
#include <ctype.h>
#include <vector>
#include <cstring>
#include <chrono>

int main(int argc, char *argv[])
{

    /* Create shared memory object and set its size to the size
       of our structure. */

    int fd = shm_open("SHM_1", O_CREAT | O_EXCL | O_RDWR, 0600);
    if (fd == -1)
    {
        perror("shm_open");
    }
    size_t size = sizeof(shmbuf);
    if (ftruncate(fd, size) == -1)
    {
        perror("ftruncate");
    }

    /* Map the object into the caller's address space. */

    shmbuf *shmp = (shmbuf *)mmap(NULL, size, PROT_READ | PROT_WRITE,
                                  MAP_SHARED, fd, 0);
    if (shmp == MAP_FAILED)
    {
        perror("mmap");
    }

    /* Copy data into the shared memory object. */
    std::vector<uint8_t> vec(BUF_SIZE, 0x56);
    vec[100] = 0x89;
    shmp->a = 5;
    shmp->b = 2.5;
    shmp->c = 100L;
    std::chrono::high_resolution_clock::time_point tp1 = std::chrono::high_resolution_clock::now();
    std::memcpy(shmp->data.data(), vec.data(), BUF_SIZE);
    std::chrono::high_resolution_clock::time_point tp2 = std::chrono::high_resolution_clock::now();
    printf("data[100] = 0x%X\n", shmp->data[100]);
    std::cout << "a = " << shmp->a << std::endl;
    std::cout << "b = " << shmp->b << std::endl;
    std::cout << "c = " << shmp->c << std::endl;
    std::cout << "memcpy time from heap to SHM = " << std::chrono::duration_cast<std::chrono::nanoseconds>(tp2 - tp1).count() / 1e6 << " ms." << std::endl;

    std::vector<uint8_t> vec2(BUF_SIZE);
    std::chrono::high_resolution_clock::time_point tp3 = std::chrono::high_resolution_clock::now();
    std::memcpy(shmp->data.data(), vec.data(), BUF_SIZE);
    std::chrono::high_resolution_clock::time_point tp4 = std::chrono::high_resolution_clock::now();
    std::cout << "memcpy time from heap to heap = " << std::chrono::duration_cast<std::chrono::nanoseconds>(tp4 - tp3).count() / 1e6 << " ms." << std::endl;

    shm_unlink("SHM_1");
    exit(EXIT_SUCCESS);
}

编译并运行测试：

g++ shm_openWriter.cpp -o shm_openWriter
./shm_openWriter

输出：

data[100] = 0x89
a = 5
b = 2.5
c = 100
memcpy time from heap to SHM = 343.589 ms.
memcpy time from heap to heap = 27.0181 ms.

从输出中可以看出，从堆到共享内存的memcpy耗时约为从堆到堆的memcpy的12.7倍（343.589 ms 对 27.0181 ms）。

为什么memcpy从堆到共享内存这么慢？有没有办法提高memcpy的性能？

Answer 1

@PepijnKramer 你好。如何测试SHM中“memcpy”的性能？我的测试代码如下：

#include "shm_msg.hpp"
#include <iostream>
#include <ctype.h>
#include <vector>
#include <cstring>
#include <chrono>

/*
 * Benchmark driver. Creates the POSIX shared-memory object "SHM_1" large
 * enough for two shmbuf objects, then prints how long a BUF_SIZE-byte memcpy
 * takes
 *   1. between the two buffers inside the shared mapping, and
 *   2. between two heap vectors.
 *
 * Returns EXIT_SUCCESS when both measurements were taken, EXIT_FAILURE if
 * shm_open, ftruncate or mmap fails (the error is reported via perror).
 */
int main(int argc, char *argv[])
{
    (void)argc;
    (void)argv;

    typedef std::chrono::high_resolution_clock hr_clock;

    /* Two shmbuf objects placed back to back in the same mapping. */
    typedef struct shm_buf2
    {
        shmbuf buf1;
        shmbuf buf2;
    } shm_buf2;

    /* Create shared memory object and set its size to the size
       of our structure. O_EXCL makes this fail if "SHM_1" already exists. */
    const int fd = shm_open("SHM_1", O_CREAT | O_EXCL | O_RDWR, 0600);
    if (fd == -1)
    {
        perror("shm_open");
        return EXIT_FAILURE; /* nothing was created, so nothing to clean up */
    }

    const size_t size = sizeof(shm_buf2);
    if (ftruncate(fd, size) == -1)
    {
        perror("ftruncate");
        close(fd);
        shm_unlink("SHM_1");
        return EXIT_FAILURE;
    }

    /* Map the object into the caller's address space. */
    void *mapping = mmap(NULL, size, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0);
    if (mapping == MAP_FAILED)
    {
        perror("mmap");
        close(fd);
        shm_unlink("SHM_1");
        return EXIT_FAILURE;
    }
    shm_buf2 *shmp = static_cast<shm_buf2 *>(mapping);

    /* Copy data into the shared memory object (untimed): fills buf1 so the
       timed copy below has a populated source. */
    std::vector<uint8_t> vec(BUF_SIZE, 0x56);
    std::memcpy(shmp->buf1.data.data(), vec.data(), BUF_SIZE);

    /* NOTE(review): buf2 has not been written before this copy, while vec2
       below is value-initialized by its constructor before being timed. The
       first measurement therefore presumably includes the kernel's
       first-touch page allocation cost for buf2 - confirm before comparing. */
    const hr_clock::time_point tp1 = hr_clock::now();
    std::memcpy(shmp->buf2.data.data(), shmp->buf1.data.data(), BUF_SIZE);
    const hr_clock::time_point tp2 = hr_clock::now();
    std::cout << "memcpy time from SHM to SHM = " << std::chrono::duration_cast<std::chrono::nanoseconds>(tp2 - tp1).count() / 1e6 << " ms." << std::endl;

    /* Heap-to-heap reference measurement. */
    std::vector<uint8_t> vec2(BUF_SIZE);
    const hr_clock::time_point tp3 = hr_clock::now();
    std::memcpy(vec2.data(), vec.data(), BUF_SIZE);
    const hr_clock::time_point tp4 = hr_clock::now();
    std::cout << "memcpy time from heap to heap = " << std::chrono::duration_cast<std::chrono::nanoseconds>(tp4 - tp3).count() / 1e6 << " ms." << std::endl;

    /* Release the mapping and descriptor, then remove the object's name so
       a later run can create it again with O_EXCL. */
    munmap(mapping, size);
    close(fd);
    shm_unlink("SHM_1");
    return EXIT_SUCCESS;
}

编译并运行：

g++ shm_openWriter.cpp -o shm_openWriter -O2
./shm_openWriter

输出：

memcpy time from SHM to SHM = 266.59 ms.
memcpy time from heap to heap = 29.0502 ms.

看来从SHM到SHM的memcpy性能仍然比堆到堆差。

为什么memcpy从堆到共享内存的性能这么差？

问题描述投票：0回答：1

1个回答

最新问题

为什么memcpy从堆到共享内存的性能这么差？

问题描述 投票：0回答：1

1个回答

最新问题

问题描述投票：0回答：1