我测试了 memcpy 从堆到堆以及从堆到共享内存（通过 [shm_open](https://www.man7.org/linux/man-pages/man3/shm_open.3.html) 创建）的性能。测试代码如下:
// shm_msg.hpp
#ifndef _SHM_MSG_HPP_
#define _SHM_MSG_HPP_
#include <fcntl.h>
#include <semaphore.h>
#include <stdio.h>
#include <stdlib.h>
#include <sys/mman.h>
#include <sys/stat.h>
#include <unistd.h>
#include <array>
#include <stdint.h>
/* Payload size in bytes (100 MiB). The expansion is parenthesized so the
   macro behaves as a single value inside larger expressions such as
   `x % BUF_SIZE` or `n / BUF_SIZE`. */
#define BUF_SIZE (100 * 1024 * 1024)

/* Layout of the object placed in shared memory: a BUF_SIZE-byte payload
   surrounded by a few scalar fields. */
typedef struct shmbuf
{
    uint32_t a;
    double b;
    std::array<uint8_t, BUF_SIZE> data; /* bulk payload that the benchmark copies into */
    uint64_t c;
} shmbuf;
#endif // !_SHM_MSG_HPP_
// shm_openWriter.cpp
#include "shm_msg.hpp"
#include <iostream>
#include <ctype.h>
#include <vector>
#include <cstring>
#include <chrono>
int main(int argc, char *argv[])
{
/* Create shared memory object and set its size to the size
of our structure. */
int fd = shm_open("SHM_1", O_CREAT | O_EXCL | O_RDWR, 0600);
if (fd == -1)
{
perror("shm_open");
}
size_t size = sizeof(shmbuf);
if (ftruncate(fd, size) == -1)
{
perror("ftruncate");
}
/* Map the object into the caller's address space. */
shmbuf *shmp = (shmbuf *)mmap(NULL, size, PROT_READ | PROT_WRITE,
MAP_SHARED, fd, 0);
if (shmp == MAP_FAILED)
{
perror("mmap");
}
/* Copy data into the shared memory object. */
std::vector<uint8_t> vec(BUF_SIZE, 0x56);
vec[100] = 0x89;
shmp->a = 5;
shmp->b = 2.5;
shmp->c = 100L;
std::chrono::high_resolution_clock::time_point tp1 = std::chrono::high_resolution_clock::now();
std::memcpy(shmp->data.data(), vec.data(), BUF_SIZE);
std::chrono::high_resolution_clock::time_point tp2 = std::chrono::high_resolution_clock::now();
printf("data[100] = 0x%X\n", shmp->data[100]);
std::cout << "a = " << shmp->a << std::endl;
std::cout << "b = " << shmp->b << std::endl;
std::cout << "c = " << shmp->c << std::endl;
std::cout << "memcpy time from heap to SHM = " << std::chrono::duration_cast<std::chrono::nanoseconds>(tp2 - tp1).count() / 1e6 << " ms." << std::endl;
std::vector<uint8_t> vec2(BUF_SIZE);
std::chrono::high_resolution_clock::time_point tp3 = std::chrono::high_resolution_clock::now();
std::memcpy(shmp->data.data(), vec.data(), BUF_SIZE);
std::chrono::high_resolution_clock::time_point tp4 = std::chrono::high_resolution_clock::now();
std::cout << "memcpy time from heap to heap = " << std::chrono::duration_cast<std::chrono::nanoseconds>(tp4 - tp3).count() / 1e6 << " ms." << std::endl;
shm_unlink("SHM_1");
exit(EXIT_SUCCESS);
}
编译并运行测试:
g++ shm_openWriter.cpp -o shm_openWriter
./shm_openWriter
输出:
data[100] = 0x89
a = 5
b = 2.5
c = 100
memcpy time from heap to SHM = 343.589 ms.
memcpy time from heap to heap = 27.0181 ms.
从输出中可以看出,从堆到共享内存的memcpy比从堆到堆的memcpy慢12倍。
为什么memcpy从堆到共享内存这么慢?有没有办法提高memcpy的性能?
@PepijnKramer 你好。如何测试SHM中“memcpy”的性能? 我的测试代码如下:
#include "shm_msg.hpp"
#include <iostream>
#include <ctype.h>
#include <vector>
#include <cstring>
#include <chrono>
int main(int argc, char *argv[])
{
/* Create shared memory object and set its size to the size
of our structure. */
typedef struct shm_buf2
{
shmbuf buf1;
shmbuf buf2;
} shm_buf2;
int fd = shm_open("SHM_1", O_CREAT | O_EXCL | O_RDWR, 0600);
if (fd == -1)
{
perror("shm_open");
}
size_t size = sizeof(shm_buf2);
if (ftruncate(fd, size) == -1)
{
perror("ftruncate");
}
/* Map the object into the caller's address space. */
shm_buf2 *shmp = (shm_buf2 *)mmap(NULL, size, PROT_READ | PROT_WRITE,
MAP_SHARED, fd, 0);
if (shmp == MAP_FAILED)
{
perror("mmap");
}
/* Copy data into the shared memory object. */
std::vector<uint8_t> vec(BUF_SIZE, 0x56);
std::memcpy(shmp->buf1.data.data(), vec.data(), BUF_SIZE);
std::chrono::high_resolution_clock::time_point tp1 = std::chrono::high_resolution_clock::now();
std::memcpy(shmp->buf2.data.data(), shmp->buf1.data.data(), BUF_SIZE);
std::chrono::high_resolution_clock::time_point tp2 = std::chrono::high_resolution_clock::now();
std::cout << "memcpy time from SHM to SHM = " << std::chrono::duration_cast<std::chrono::nanoseconds>(tp2 - tp1).count() / 1e6 << " ms." << std::endl;
std::vector<uint8_t> vec2(BUF_SIZE);
std::chrono::high_resolution_clock::time_point tp3 = std::chrono::high_resolution_clock::now();
std::memcpy(vec2.data(), vec.data(), BUF_SIZE);
std::chrono::high_resolution_clock::time_point tp4 = std::chrono::high_resolution_clock::now();
std::cout << "memcpy time from heap to heap = " << std::chrono::duration_cast<std::chrono::nanoseconds>(tp4 - tp3).count() / 1e6 << " ms." << std::endl;
shm_unlink("SHM_1");
exit(EXIT_SUCCESS);
}
编译并运行:
g++ shm_openWriter.cpp -o shm_openWriter -O2
./shm_openWriter
输出:
memcpy time from SHM to SHM = 266.59 ms.
memcpy time from heap to heap = 29.0502 ms.
看来从SHM到SHM的memcpy性能仍然比堆到堆差。