我对检查内存块是否相等的最有效方法感兴趣。
目前,我使用
memcmp(...) == 0
来确定两个内存块是否相等。虽然这有效,但 memcmp
的设计不仅用于检查相等性,还用于确定两个内存块的顺序(大于、小于或等于)。此附加功能可能会带来不必要的开销。
任务:
我需要有效地检查两个内存块是否相等。
问题:
memcmp
提供排序信息这一事实意味着它可能比专门设计用于检查相等性的函数效率低。
memcmp
中的额外开销与需要确定块不相等时的确切顺序有关。例如,如果 memcmp
返回非零结果,则意味着每个内存块中某个索引 i
处有两个字节以某种方式不同,并且所有先前的字节都相等。为了实现这一点,memcmp
必须验证所有先前字节的相等性,这需要时间。相反,仅检查相等性的函数可以立即检查最后一个字节,并在发现差异时终止检查。这可能会导致时间复杂度的摊销改进。
简单来说:
memcmp
的低效率是因为它不仅搜索内存块之间的差异,还搜索FIRST差异。
我尝试过的:
我编写了自己的实现来演示这个想法并进行了一些测试。
#include <cstring>
#include <immintrin.h>
bool memcmp_memeq(const void* s1, const void* s2, size_t n) {
return memcmp(s1, s2, n) == 0;
}
bool my_memeq(const void* s1, const void* s2, size_t n) {
const static size_t simd_width = 16;
if (n <= simd_width) {
return memcmp(s1, s2, n) == 0;
}
const int8_t* end1 = (const int8_t*)s1 + n - simd_width;
const int8_t* end2 = (const int8_t*)s2 + n - simd_width;
const __m128i m1 = _mm_loadu_si128((__m128i*)end1);
const __m128i m2 = _mm_loadu_si128((__m128i*)end2);
const __m128i result = _mm_cmpeq_epi8(m1, m2);
if (_mm_movemask_epi8(result) != 0xFFFF) {
return false;
}
return memcmp(s1, s2, (const int8_t*)end1 - (const int8_t*)s1) == 0;
}
bool my_memeq_256(const void* s1, const void* s2, size_t n) {
const static size_t simd_width = 32;
if (n <= simd_width) {
return memcmp(s1, s2, n) == 0;
}
const int8_t* end1 = (const int8_t*)s1 + n - simd_width;
const int8_t* end2 = (const int8_t*)s2 + n - simd_width;
const __m256i m1 = _mm256_loadu_si256((__m256i*)end1);
const __m256i m2 = _mm256_loadu_si256((__m256i*)end2);
const __m256i result = _mm256_cmpeq_epi8(m1, m2);
if (static_cast<unsigned int>(_mm256_movemask_epi8(result)) != 0xFFFFFFFF) {
return false;
}
return memcmp(s1, s2, (const int8_t*)end1 - (const int8_t*)s1) == 0;
}
#include <cstring>
#include <sstream>
#include <fstream>
#include <iomanip>
#include <gtest/gtest.h>
#include <src/util/my_memeq.h>
template <typename Func>
std::pair<std::chrono::duration<double>, bool> measure_time(
Func func,
const std::vector<std::pair<std::string, std::string>>& test_cases,
int iterations = 100000) {
if (test_cases.empty()) {
return { std::chrono::duration<double>(0), true };
}
volatile bool result;
const auto start = std::chrono::high_resolution_clock::now();
for (int i = 0; i < iterations; ++i) {
for (const auto& test_case : test_cases) {
result = func(test_case.first.c_str(), test_case.second.c_str(), test_case.first.size());
}
}
const auto end = std::chrono::high_resolution_clock::now();
return { (end - start) / test_cases.size() / iterations, result };
}
void run_tests(std::ostream& out, size_t len, const std::vector<std::pair<std::string, std::string>>& test_cases) {
const auto memcmp_memeq_time_result = measure_time(memcmp_memeq, test_cases);
const auto my_memeq_time_result = measure_time(my_memeq, test_cases);
const auto my_memeq_256_time_result = measure_time(my_memeq_256, test_cases);
const auto memcmp_memeq_time = memcmp_memeq_time_result.first.count();
const auto my_memeq_time = my_memeq_time_result.first.count();
const auto my_memeq_256_time = my_memeq_256_time_result.first.count();
const auto memcmp_memeq_result = memcmp_memeq_time_result.second;
const auto my_memeq_result = my_memeq_time_result.second;
const auto my_memeq_256_result = my_memeq_256_time_result.second;
EXPECT_EQ(memcmp_memeq_result, my_memeq_result);
EXPECT_EQ(memcmp_memeq_result, my_memeq_256_result);
out << "RESULTS for len: " << len << " bytes\n";
out << std::setw(19) << "memcmp_memeq time: " << std::setw(9) << std::setprecision(4) << memcmp_memeq_time << " seconds\n";
out << std::setw(19) << "my_memeq time: " << std::setw(9) << std::setprecision(4) << my_memeq_time << " seconds\n";
out << std::setw(19) << "my_memeq_256 time: " << std::setw(9) << std::setprecision(4) << my_memeq_256_time << " seconds\n";
out << '\n';
}
std::vector<std::pair<std::string, std::string>>
generate_test_cases(
size_t len,
bool include_equal = true,
bool include_start = true,
bool include_middle = true,
bool include_end = true,
bool include_quarters = true) {
std::vector<std::pair<std::string, std::string>> test_cases;
std::string str1 (len, 'A');
std::string str2 (len, 'A');
if (include_equal) {
test_cases.emplace_back(str1, str2);
}
if (include_end) {
str2[len - 1] = 'B';
test_cases.emplace_back(str1, str2);
str2[len - 1] = 'A';
}
if (include_start) {
str2[0] = 'B';
test_cases.emplace_back(str1, str2);
str2[0] = 'A';
}
if (include_middle) {
if (len > 2) {
str2[len / 2] = 'B';
test_cases.emplace_back(str1, str2);
str2[len / 2] = 'A';
}
}
if (include_quarters) {
if (len > 4) {
str2[len / 4] = 'B';
test_cases.emplace_back(str1, str2);
str2[len / 4] = 'A';
str2[len / 4 * 3] = 'B';
test_cases.emplace_back(str1, str2);
str2[len / 4 * 3] = 'A';
}
}
return test_cases;
}
TEST(MemcmpTest, PerformanceComparison) {
const static size_t start_len = 4;
const static size_t max_bytes = 131072;
std::stringstream out;
out << "VARIOUS STRINGS\n\n";
for (size_t len = start_len; len <= max_bytes; len *= 2) {
run_tests(
out,
len,
generate_test_cases(
len,
true,
true,
true,
true,
true)
);
}
out << "DIFFERENCE AT THE START\n\n";
for (size_t len = start_len; len <= max_bytes; len *= 2) {
run_tests(
out,
len,
generate_test_cases(
len,
false,
true,
false,
false,
false)
);
}
out << "DIFFERENCE AT THE END\n\n";
for (size_t len = start_len; len <= max_bytes; len *= 2) {
run_tests(
out,
len,
generate_test_cases(
len,
false,
false,
false,
true,
false)
);
}
out << "DIFFERENCE AT THE MIDDLE\n\n";
for (size_t len = start_len; len <= max_bytes; len *= 2) {
run_tests(
out,
len,
generate_test_cases(
len,
false,
false,
true,
false,
false)
);
}
const auto output = out.str();
// std::cout << output;
{{ std::ofstream out_file ("../test/my_memeq_test_results.txt");
out_file << output;
}}
}
测试结果:
VARIOUS STRINGS
RESULTS for len: 4 bytes
memcmp_memeq time: 1.6e-08 seconds
my_memeq time: 1.7e-08 seconds
my_memeq_256 time: 1.6e-08 seconds
RESULTS for len: 8 bytes
memcmp_memeq time: 1.6e-08 seconds
my_memeq time: 1.6e-08 seconds
my_memeq_256 time: 1.6e-08 seconds
RESULTS for len: 16 bytes
memcmp_memeq time: 1.6e-08 seconds
my_memeq time: 1.6e-08 seconds
my_memeq_256 time: 1.6e-08 seconds
RESULTS for len: 32 bytes
memcmp_memeq time: 1.6e-08 seconds
my_memeq time: 1.7e-08 seconds
my_memeq_256 time: 1.6e-08 seconds
RESULTS for len: 64 bytes
memcmp_memeq time: 1.6e-08 seconds
my_memeq time: 1.7e-08 seconds
my_memeq_256 time: 1.8e-08 seconds
RESULTS for len: 128 bytes
memcmp_memeq time: 1.5e-08 seconds
my_memeq time: 1.8e-08 seconds
my_memeq_256 time: 1.9e-08 seconds
RESULTS for len: 256 bytes
memcmp_memeq time: 1.6e-08 seconds
my_memeq time: 1.8e-08 seconds
my_memeq_256 time: 1.9e-08 seconds
RESULTS for len: 512 bytes
memcmp_memeq time: 1.7e-08 seconds
my_memeq time: 1.9e-08 seconds
my_memeq_256 time: 2e-08 seconds
RESULTS for len: 1024 bytes
memcmp_memeq time: 1.9e-08 seconds
my_memeq time: 2.1e-08 seconds
my_memeq_256 time: 2.2e-08 seconds
RESULTS for len: 2048 bytes
memcmp_memeq time: 2.5e-08 seconds
my_memeq time: 2.5e-08 seconds
my_memeq_256 time: 2.6e-08 seconds
RESULTS for len: 4096 bytes
memcmp_memeq time: 4.1e-08 seconds
my_memeq time: 3.3e-08 seconds
my_memeq_256 time: 3.5e-08 seconds
RESULTS for len: 8192 bytes
memcmp_memeq time: 8e-08 seconds
my_memeq time: 6e-08 seconds
my_memeq_256 time: 6e-08 seconds
RESULTS for len: 16384 bytes
memcmp_memeq time: 1.57e-07 seconds
my_memeq time: 1.21e-07 seconds
my_memeq_256 time: 1.22e-07 seconds
RESULTS for len: 32768 bytes
memcmp_memeq time: 2.97e-07 seconds
my_memeq time: 2.19e-07 seconds
my_memeq_256 time: 2.2e-07 seconds
RESULTS for len: 65536 bytes
memcmp_memeq time: 6.34e-07 seconds
my_memeq time: 4.25e-07 seconds
my_memeq_256 time: 4.26e-07 seconds
RESULTS for len: 131072 bytes
memcmp_memeq time: 1.429e-06 seconds
my_memeq time: 9.97e-07 seconds
my_memeq_256 time: 9.8e-07 seconds
DIFFERENCE AT THE START
RESULTS for len: 4 bytes
memcmp_memeq time: 2.6e-08 seconds
my_memeq time: 2.6e-08 seconds
my_memeq_256 time: 2.6e-08 seconds
RESULTS for len: 8 bytes
memcmp_memeq time: 2.6e-08 seconds
my_memeq time: 2.6e-08 seconds
my_memeq_256 time: 2.6e-08 seconds
RESULTS for len: 16 bytes
memcmp_memeq time: 2.7e-08 seconds
my_memeq time: 2.6e-08 seconds
my_memeq_256 time: 2.6e-08 seconds
RESULTS for len: 32 bytes
memcmp_memeq time: 2.6e-08 seconds
my_memeq time: 2.8e-08 seconds
my_memeq_256 time: 2.6e-08 seconds
RESULTS for len: 64 bytes
memcmp_memeq time: 2.6e-08 seconds
my_memeq time: 2.7e-08 seconds
my_memeq_256 time: 2.9e-08 seconds
RESULTS for len: 128 bytes
memcmp_memeq time: 2.6e-08 seconds
my_memeq time: 2.7e-08 seconds
my_memeq_256 time: 2.9e-08 seconds
RESULTS for len: 256 bytes
memcmp_memeq time: 2.6e-08 seconds
my_memeq time: 2.8e-08 seconds
my_memeq_256 time: 2.8e-08 seconds
RESULTS for len: 512 bytes
memcmp_memeq time: 2.6e-08 seconds
my_memeq time: 2.8e-08 seconds
my_memeq_256 time: 2.9e-08 seconds
RESULTS for len: 1024 bytes
memcmp_memeq time: 2.6e-08 seconds
my_memeq time: 2.7e-08 seconds
my_memeq_256 time: 2.8e-08 seconds
RESULTS for len: 2048 bytes
memcmp_memeq time: 2.6e-08 seconds
my_memeq time: 2.7e-08 seconds
my_memeq_256 time: 2.8e-08 seconds
RESULTS for len: 4096 bytes
memcmp_memeq time: 2.6e-08 seconds
my_memeq time: 2.7e-08 seconds
my_memeq_256 time: 2.9e-08 seconds
RESULTS for len: 8192 bytes
memcmp_memeq time: 2.4e-08 seconds
my_memeq time: 2.7e-08 seconds
my_memeq_256 time: 2.9e-08 seconds
RESULTS for len: 16384 bytes
memcmp_memeq time: 2.6e-08 seconds
my_memeq time: 2.7e-08 seconds
my_memeq_256 time: 2.8e-08 seconds
RESULTS for len: 32768 bytes
memcmp_memeq time: 2.6e-08 seconds
my_memeq time: 2.7e-08 seconds
my_memeq_256 time: 2.8e-08 seconds
RESULTS for len: 65536 bytes
memcmp_memeq time: 2.6e-08 seconds
my_memeq time: 2.7e-08 seconds
my_memeq_256 time: 2.8e-08 seconds
RESULTS for len: 131072 bytes
memcmp_memeq time: 2.6e-08 seconds
my_memeq time: 2.7e-08 seconds
my_memeq_256 time: 2.9e-08 seconds
DIFFERENCE AT THE END
RESULTS for len: 4 bytes
memcmp_memeq time: 2.5e-08 seconds
my_memeq time: 2.6e-08 seconds
my_memeq_256 time: 2.6e-08 seconds
RESULTS for len: 8 bytes
memcmp_memeq time: 2.5e-08 seconds
my_memeq time: 2.6e-08 seconds
my_memeq_256 time: 2.6e-08 seconds
RESULTS for len: 16 bytes
memcmp_memeq time: 2.6e-08 seconds
my_memeq time: 2.7e-08 seconds
my_memeq_256 time: 2.6e-08 seconds
RESULTS for len: 32 bytes
memcmp_memeq time: 2.6e-08 seconds
my_memeq time: 2.7e-08 seconds
my_memeq_256 time: 2.6e-08 seconds
RESULTS for len: 64 bytes
memcmp_memeq time: 2.6e-08 seconds
my_memeq time: 2.7e-08 seconds
my_memeq_256 time: 2.8e-08 seconds
RESULTS for len: 128 bytes
memcmp_memeq time: 2.6e-08 seconds
my_memeq time: 2.7e-08 seconds
my_memeq_256 time: 2.9e-08 seconds
RESULTS for len: 256 bytes
memcmp_memeq time: 2.6e-08 seconds
my_memeq time: 2.7e-08 seconds
my_memeq_256 time: 2.9e-08 seconds
RESULTS for len: 512 bytes
memcmp_memeq time: 3e-08 seconds
my_memeq time: 2.7e-08 seconds
my_memeq_256 time: 2.9e-08 seconds
RESULTS for len: 1024 bytes
memcmp_memeq time: 3.5e-08 seconds
my_memeq time: 2.7e-08 seconds
my_memeq_256 time: 2.8e-08 seconds
RESULTS for len: 2048 bytes
memcmp_memeq time: 4.2e-08 seconds
my_memeq time: 2.7e-08 seconds
my_memeq_256 time: 2.9e-08 seconds
RESULTS for len: 4096 bytes
memcmp_memeq time: 6.1e-08 seconds
my_memeq time: 2.7e-08 seconds
my_memeq_256 time: 2.9e-08 seconds
RESULTS for len: 8192 bytes
memcmp_memeq time: 9.8e-08 seconds
my_memeq time: 2.7e-08 seconds
my_memeq_256 time: 2.9e-08 seconds
RESULTS for len: 16384 bytes
memcmp_memeq time: 1.95e-07 seconds
my_memeq time: 2.7e-08 seconds
my_memeq_256 time: 2.9e-08 seconds
RESULTS for len: 32768 bytes
memcmp_memeq time: 4.84e-07 seconds
my_memeq time: 2.7e-08 seconds
my_memeq_256 time: 2.9e-08 seconds
RESULTS for len: 65536 bytes
memcmp_memeq time: 9.42e-07 seconds
my_memeq time: 2.7e-08 seconds
my_memeq_256 time: 2.9e-08 seconds
RESULTS for len: 131072 bytes
memcmp_memeq time: 1.862e-06 seconds
my_memeq time: 2.7e-08 seconds
my_memeq_256 time: 2.9e-08 seconds
DIFFERENCE AT THE MIDDLE
RESULTS for len: 4 bytes
memcmp_memeq time: 2.7e-08 seconds
my_memeq time: 2.6e-08 seconds
my_memeq_256 time: 2.6e-08 seconds
RESULTS for len: 8 bytes
memcmp_memeq time: 2.6e-08 seconds
my_memeq time: 3.2e-08 seconds
my_memeq_256 time: 4.9e-08 seconds
RESULTS for len: 16 bytes
memcmp_memeq time: 2.7e-08 seconds
my_memeq time: 2.7e-08 seconds
my_memeq_256 time: 2.7e-08 seconds
RESULTS for len: 32 bytes
memcmp_memeq time: 2.8e-08 seconds
my_memeq time: 2.8e-08 seconds
my_memeq_256 time: 2.7e-08 seconds
RESULTS for len: 64 bytes
memcmp_memeq time: 2.7e-08 seconds
my_memeq time: 3e-08 seconds
my_memeq_256 time: 3e-08 seconds
RESULTS for len: 128 bytes
memcmp_memeq time: 2.8e-08 seconds
my_memeq time: 3e-08 seconds
my_memeq_256 time: 3.2e-08 seconds
RESULTS for len: 256 bytes
memcmp_memeq time: 2.8e-08 seconds
my_memeq time: 3.3e-08 seconds
my_memeq_256 time: 3.1e-08 seconds
RESULTS for len: 512 bytes
memcmp_memeq time: 3.1e-08 seconds
my_memeq time: 3.2e-08 seconds
my_memeq_256 time: 3.3e-08 seconds
RESULTS for len: 1024 bytes
memcmp_memeq time: 3.3e-08 seconds
my_memeq time: 3.4e-08 seconds
my_memeq_256 time: 3.6e-08 seconds
RESULTS for len: 2048 bytes
memcmp_memeq time: 3.7e-08 seconds
my_memeq time: 3.7e-08 seconds
my_memeq_256 time: 3.8e-08 seconds
RESULTS for len: 4096 bytes
memcmp_memeq time: 4.3e-08 seconds
my_memeq time: 4.7e-08 seconds
my_memeq_256 time: 4.8e-08 seconds
RESULTS for len: 8192 bytes
memcmp_memeq time: 5.8e-08 seconds
my_memeq time: 6e-08 seconds
my_memeq_256 time: 6.1e-08 seconds
RESULTS for len: 16384 bytes
memcmp_memeq time: 1.02e-07 seconds
my_memeq time: 1.06e-07 seconds
my_memeq_256 time: 1.07e-07 seconds
RESULTS for len: 32768 bytes
memcmp_memeq time: 2.31e-07 seconds
my_memeq time: 2.36e-07 seconds
my_memeq_256 time: 2.41e-07 seconds
RESULTS for len: 65536 bytes
memcmp_memeq time: 5.17e-07 seconds
my_memeq time: 5.2e-07 seconds
my_memeq_256 time: 5.23e-07 seconds
RESULTS for len: 131072 bytes
memcmp_memeq time: 9.9e-07 seconds
my_memeq time: 1.007e-06 seconds
my_memeq_256 time: 9.78e-07 seconds
另外,以下是最大内存块的结果:
VARIOUS STRINGS
RESULTS for len: 131072 bytes
memcmp_memeq time: 1.429e-06 seconds
my_memeq time: 9.97e-07 seconds
my_memeq_256 time: 9.8e-07 seconds
DIFFERENCE AT THE START
RESULTS for len: 131072 bytes
memcmp_memeq time: 2.6e-08 seconds
my_memeq time: 2.7e-08 seconds
my_memeq_256 time: 2.9e-08 seconds
DIFFERENCE AT THE END
RESULTS for len: 131072 bytes
memcmp_memeq time: 1.862e-06 seconds
my_memeq time: 2.7e-08 seconds
my_memeq_256 time: 2.9e-08 seconds
DIFFERENCE AT THE MIDDLE
RESULTS for len: 131072 bytes
memcmp_memeq time: 9.9e-07 seconds
my_memeq time: 1.007e-06 seconds
my_memeq_256 time: 9.78e-07 seconds
可以看出,我的实现在某些情况下可以提供显着的性能改进。
然而,对于较小的内存块,性能可能会略有下降。
我很想知道是否有专门针对检查内存块相等性进行优化的现有解决方案。
从根本上说,处理器在比较时设置一个条件代码,然后执行条件分支或跳转。 一些常见的运算包括减法、异或和比较指令。
(注意:如果内存块大小可以比较,先进行大小比较,最快的内存块比较不是比较。)
高效的相等函数比较只需要找到一个不同的单元格。 该单元格可以是第一个、“中间”或最后一个。 这是优化算法的刺。 最坏的情况是,所有细胞都需要进行测试。 每个分支都比比较慢。 这意味着处理器可以在分支指令的持续时间内执行多重比较(数据处理)指令。 目标是减少分支数量。
实验
bool result = result & (*p_block_1++ ^ p_block_2++);
在执行条件分支之前。 尽管该技术可以执行额外的比较,但是额外比较的持续时间可以忽略不计。 内存的获取应该限制在内存块内;不要在任何一个内存块之外获取。
总结
减少分支指令,例如循环分支。 (最大的速度瓶颈)