我不能使用 boost:hash 因为我必须坚持使用 C 而不能使用 C++。
但是,我需要对大量(10K 到 100k)令牌字符串(5 到 40 字节长度)进行哈希处理,以便在这些字符串中进行搜索速度最快。
MD5、SHA1 或任何长哈希函数对于一个简单的任务来说似乎太重了,我不做密码学。另外还有存储和计算成本。
因此我的问题:
最简单的哈希算法可能会在大多数实际情况下确保防止碰撞。
哈希值使用多少位?我正在为 32 位系统进行开发。 Perl/Python 中的哈希算法也使用 32 位哈希吗?还是必须跳到64?
关于常见脚本语言中哈希表的实现:实现是否检查冲突,或者我可以完全避免该部分吗?
您可以在 http://www.azillionmonkeys.com/qed/hash.html
找到一个好的(且快速)哈希函数和有趣的读物唯一不应该检查冲突的情况是,如果您使用完美哈希——一个很好的老式查找表,例如gperf。
这里是对最著名的已知哈希函数的很好的概述。
32 位应该可以正常工作。
你总是需要检查冲突,除非你想写一个有趣的哈希表:)
用于哈希表查找的通用哈希函数。它指定请勿用于加密目的,但既然您指定您无意这样做,那么您应该没问题。
其中包括可供尝试的哈希函数调查
xxhash 是非常快速且简单的选择。一个简单的代码将使用 XXH32
函数:
unsigned int XXH32 (const void* input, int len, unsigned int seed);
它是 32 位哈希。由于
len
是
int
,对于超过
2^31-1
字节的较大数据,请使用这些:
void* XXH32_init (unsigned int seed);
XXH_errorcode XXH32_update (void* state, const void* input, int len);
unsigned int XXH32_digest (void* state);
话虽这么说,一些最好的简单哈希函数是
FNV1a、MurmurOAAT、RSHash 和 SDBM。后两者是四个中最快的。然而,SDBM 也很容易记住,所以它可能是其中最著名的。
SDBM 是乘法哈希函数的一个示例,DJB2 和 K&Rv2 也属于其中。此类函数只需将当前哈希值乘以奇数并添加下一个字节:hash = hash * factor + data[i]
。 SDBM 使用数字 65599 作为因子,DJB2 – 33,K&Rv2 – 31。RSHash 由 Robert Sedgewick (1990) 提出,它通过使用生成的因子序列而不是仅使用一个常数因子,将乘法方法更进一步。这是我最喜欢的“最小”哈希函数。但 MurmurOAAT 具有更好的雪崩能力。
当您使用所有 64 位时,或者如果您正在针对 64 位架构进行开发,则使用 64 位哈希是有意义的。如果您使用散列表的散列,则使用 64 位散列没有什么意义,因为您的散列表很可能不会有数十亿个槽。但是,例如,如果您使用哈希值作为指纹(校验和),则 64 位哈希值比 32 位好得多。
以下是上述哈希函数之间的比较,以及与一些现代优化函数(如 Murmur3 和 FarmHash)的比较。还包括其他答案中提到的一些其他哈希函数。
(1) 碰撞和雪崩
| Collisions | Avalanching
Hash function | random 16B | words + names | words + names
=============================================================================
FNV1a | 0.20% | 45, 0.01% | 98.0
MurmurOAAT | 0.20% | 50, 0.01% | 100.0
RSHash | 0.20% | 51, 0.01% | 95.5
SDBM | 0.20% | 53, 0.01% | 93.1
Jenkins OAAT | 0.19% | 66, 0.01% | 100.0
K&R v2 | 0.20% | 767, 0.12% | 89.8
SuperFastHash | 0.21% | 808, 0.13% | 100.0
DJB2 | 0.20% | 856, 0.13% | 89.7
XOR_rotate | 0.19% | 2156, 0.34% | 89.5
DEKHash | 0.19% | 2312, 0.36% | 88.6
Fletcher32 | 0.19% | 5274, 0.82% | 90.9
Adler32 | 67.44% | 371448, 58.03% | 73.9
-----------------------------------------------------------------------------
MurmurHash3_x86_32 | 0.19% | 42, 0.01% | 100.0
MurmurHash3_x64_32 | 0.20% | 50, 0.01% | 100.0
farmhash_fingerprint32 | 0.20% | 42, 0.01% | 100.0
farmhash_fingerprint64 | 0.19% | 36, 0.01% | 100.0
CRC32 (bit-by-bit) | 0.20% | 45, 0.01% | 99.8
CRC32 (table-driven) | 0.20% | 45, 0.01% | 99.8
Adler32 是唯一一个性能较差并且对 16 字节随机缓冲区产生大量冲突的算法。这真是一个糟糕的哈希函数!对我来说,单词和名称的性能不佳表明哈希算法的质量较差。这就是为什么我不会推荐任何在此数据集上碰撞率超过 0.01% 的算法。
(2) 速度
| Time, ms
Hash function | random 16B | random 16MB | words + names
======================================================================
FNV1a | 132.3 | 339.3 | 12.9
MurmurOAAT | 170.3 | 500.7 | 11.9
RSHash | 122.2 | 250.5 | 10.9
SDBM | 116.1 | 250.8 | 11.3
Jenkins OAAT | 159.0 | 417.8 | 11.2
K&R v2 | 116.1 | 250.2 | 11.1
SuperFastHash | 71.0 | 127.0 | 9.8
DJB2 | 169.5 | 251.0 | 10.8
XOR_rotate | 119.8 | 167.6 | 10.9
DEKHash | 115.5 | 167.0 | 10.9
Fletcher32 | 79.0 | 52.2 | 18.5
Adler32 | 391.6 | 669.0 | 12.3
----------------------------------------------------------------------
MurmurHash3_x86_32 | 93.0 | 124.3 | 10.9
MurmurHash3_x64_32 | 96.3 | 53.6 | 13.5
farmhash_fingerprint32 | 87.2 | 52.2 | 5.4
farmhash_fingerprint64 | 48.8 | 17.3 | 5.7
CRC32 (bit-by-bit) | 1189.6 | 2091.1 | 23.0
CRC32 (table-driven) | 262.0 | 667.3 | 11.1
对于计时,我在 Apple M1 Pro 机器上编译并运行了我的代码。您可能会在 Intel x86-64 架构上获得不同的速度,特别是对于 FarmHash,它针对 Intel 指令集进行了深度优化。但总体情况可能会保持不变。(3) 代码
gcc -O2 minimal_hash.c -o minimal_hash
./minimal_hash words.txt
,其中words.txt 是所有英文单词和人名(640143 个唯一字符串)的组合。
#include <stdio.h>
#include <stdint.h>
#include <stdlib.h>
#include <string.h>
#include <sys/time.h>
#define SMALL 16
#define LARGE (1 << 24) // size of 16 MiB
#define SEED 0x12345678
uint8_t buffer[LARGE * SMALL];
uint32_t hashes[LARGE];
typedef uint32_t (*hash_function) (const uint8_t*, size_t);
uint32_t DJB2_hash(const uint8_t* buf, size_t size) {
uint32_t hash = 5381;
for (size_t i = 0; i < size; i++)
hash = ((hash << 5) + hash) + buf[i]; /* hash * 33 + byte */
return hash;
}
uint32_t SDBM_hash(const uint8_t* buf, size_t size) {
uint32_t hash = 0;
for (size_t i = 0; i < size; i++)
hash = (hash << 6) + (hash << 16) - hash + buf[i]; /* hash * 65599 + byte */
return hash;
}
uint32_t FNV1a(const uint8_t* data, size_t size) {
uint32_t h = 2166136261UL;
for (size_t i = 0; i < size; i++) {
h ^= data[i];
h *= 16777619;
}
return h;
}
uint64_t FNV1a_64(const uint8_t* data, size_t size) {
uint64_t h = 0xcbf29ce484222325UL;
for (size_t i = 0; i < size; i++) {
h ^= data[i];
h *= 0x00000100000001B3UL;
}
return h;
}
uint32_t FNV1a_64_32(const uint8_t* data, size_t size) {
uint64_t h = FNV1a_64(data, size);
return h ^ (h >> 32);
}
uint32_t MurmurOAAT_32(const uint8_t* data, size_t size) {
// One-byte-at-a-time hash based on Murmur's mix
uint32_t h = SEED;
for (size_t i = 0; i < size; i++) {
h ^= data[i];
h *= 0x5bd1e995;
h ^= h >> 15;
}
return h;
}
uint32_t KR_v2_hash(const uint8_t* data, size_t size) {
// a.k.a. Java String hashCode(), a.k.a. BKDR Hash Function
uint32_t hashval = 0;
for (size_t i = 0; i < size; i++)
hashval = 31*hashval + data[i];
return hashval;
}
uint32_t Jenkins_one_at_a_time_hash(const uint8_t* data, size_t size) {
// a.k.a. simply "One-at-a-Time Hash"
uint32_t hash = 0;
for (size_t i = 0; i < size; i++) {
hash += data[i];
hash += (hash << 10);
hash ^= (hash >> 6);
}
hash += (hash << 3);
hash ^= (hash >> 11);
hash += (hash << 15);
return hash;
}
#define POLY_CRC32C 0x82f63b78
#define POLY_CRC32 0xedb88320
uint32_t CRC32b(const uint8_t *data, size_t size) {
unsigned int crc = 0xFFFFFFFF, mask;
for (size_t i = 0; i < size; i++) {
crc ^= data[i];
for (int j = 7; j >= 0; j--) {
mask = -(crc & 1);
crc = (crc >> 1) ^ (POLY_CRC32 & mask);
}
}
return ~crc;
}
inline uint32_t _rotl32(uint32_t x, int32_t bits) {
return x<<bits | x>>(32-bits); // C idiom: will be optimized to a single CPU operation
}
uint32_t XOR_rotate(const uint8_t* data, size_t size) {
// Source: https://stackoverflow.com/a/7666668/5407270
uint32_t result = 0x55555555;
for (size_t i = 0; i < size; i++) {
result ^= data[i];
result = _rotl32(result, 5);
}
return result;
}
uint32_t Adler32_slow(const uint8_t* data, size_t len) {
// This is a slow implementation of Adler32, but it doesn't matter because
// it's a VERY BAD hash function anyway. Don't use it!
const uint32_t MOD_ADLER = 65521;
uint32_t a = 1, b = 0;
for (size_t i = 0; i < len; i++) {
a = (a + data[i]) % MOD_ADLER;
b = (b + a) % MOD_ADLER;
}
return (b << 16) | a;
}
uint32_t RSHash(const uint8_t* data, size_t size) {
// Introduced by Robert Sedgewick in his "Algorithms in C" (1990) book.
// Source: http://www.partow.net/programming/hashfunctions/index.html
// RSHash seems to outperforms JSHash, PJWHash (a.k.a. ELFHash) and APHash from the same source in all respects.
uint32_t a = 63689, b = 378551;
uint32_t hash = 0;
for (uint32_t i = 0; i < size; i++) {
hash = hash * a + data[i];
a *= b;
}
return hash;
}
uint32_t DEKHash(const uint8_t* str, size_t length) {
// Introduced by Donald E. Knuth in "The Art Of Computer Programming", Volume 3.
// Source: http://www.partow.net/programming/hashfunctions/index.html
unsigned int hash = length;
for (size_t i = 0; i < length; i++)
hash = ((hash << 5) ^ (hash >> 27)) ^ str[i];
return hash;
}
uint32_t Fletcher32(const uint8_t *data_, size_t len) {
// Note: when `len` (size in bytes) is odd, this function will read past the last byte.
// However, for C-strings, the next byte will be zero, which is sufficient for this test.
// In other cases, you have to do zero-padding.
const uint16_t* data = (const uint16_t*) data_;
uint32_t c0, c1;
len = (len + 1) & ~1; /* Round up len to words */
/* We similarly solve for n > 0 and n * (n+1) / 2 * (2^16-1) < (2^32-1) here. */
/* On modern computers, using a 64-bit c0/c1 could allow a group size of 23726746. */
for (c0 = c1 = 0; len > 0; ) {
size_t blocklen = len;
if (blocklen > 360*2) {
blocklen = 360*2;
}
len -= blocklen;
do {
c0 = c0 + *data++;
c1 = c1 + c0;
} while ((blocklen -= 2));
c0 = c0 % 65535;
c1 = c1 % 65535;
}
return (c1 << 16 | c0);
}
uint32_t Fletcher32_wrap(const uint8_t* data, size_t size) {
if (size & 1) {
// Add zero-padding for odd sizes. (This is, actually, unnecessary for
// C-strings, because they are terminated by a nul-byte by definition.)
uint8_t* buf = malloc(size + 1);
memcpy(buf, data, size);
buf[size + 1] = 0;
uint32_t res = Fletcher32(buf, size + 1);
free(buf);
return res;
}
return Fletcher32(data, size);
}
struct timeval stop, start;
void timing_start() {
gettimeofday(&start, NULL);
}
void timing_end() {
gettimeofday(&stop, NULL);
printf("took %lu us, ", (stop.tv_sec - start.tv_sec) * 1000000 + stop.tv_usec - start.tv_usec);
}
#define ALGO_COUNT 12
hash_function get_hash_function(int algo) {
hash_function res;
switch (algo) {
case 1: printf("FNV1a "); res = FNV1a; break;
case 2: printf("MurmurOAAT "); res = MurmurOAAT_32; break;
case 3: printf("RSHash "); res = RSHash; break;
case 4: printf("SDBM "); res = SDBM_hash; break;
case 5: printf("Jenkins OAAT "); res = Jenkins_one_at_a_time_hash; break;
case 6: printf("K&R v2 "); res = KR_v2_hash; break;
case 7: printf("DJB2 "); res = DJB2_hash; break;
case 8: printf("XOR_rotate "); res = XOR_rotate; break;
case 9: printf("DEKHash "); res = DEKHash; break;
case 10: printf("Fletcher32 "); res = Fletcher32_wrap; break;
case 11: printf("Adler32 "); res = Adler32_slow; break;
case 12: printf("CRC32 (bit-by-bit) "); res = CRC32b; break;
default: exit(-1);
}
printf("\t");
return res;
}
void fill_random(uint32_t* buffer, size_t count) {
for (size_t i = 0; i < count; i++)
buffer[i] = random();
}
void show_sequence() {
printf("Generated random sequence:");
for (int i = 0; i < SMALL; i++) {
printf(" %02x", buffer[i]);
}
printf("\n");
}
void count_collisions(const uint32_t* hashes, size_t count) {
size_t slots = count * 32;
uint32_t* cnt_table = calloc(slots, sizeof(uint32_t));
size_t collisions = 0;
for (size_t i = 0; i < count; i++) {
size_t j = hashes[i] % slots;
while (cnt_table[j] != 0 && cnt_table[j] != hashes[i]) {
j = (j + 1) % slots;
}
if (cnt_table[j] == 0) {
cnt_table[j] = hashes[i];
} else {
collisions++;
}
}
double share = 100.0*(double)collisions/count;
printf("%lu collisions out of %lu, %0.2f\n", collisions, count, share);
}
void evaluate_avalanching(const uint32_t* hashes, size_t count) {
// Looks at hashes of consecutive sorted strings and assesses their
// similarity score. The lower the total similarity, the worse avalanching.
uint32_t score = 0;
char last[9], current[9];
for (size_t i = 0; i < count; i++) {
sprintf(current, "%08X", hashes[i]);
if (i) {
for (int j = 0; j < 8; j++)
if (last[j] == current[j]) score++;
}
strcpy(last, current);
}
float similarity = 100.*score/count/8;
float avalanching = similarity <= 6.25 ? 100 : (100 - similarity) / (100 - 6.25) * 100;
printf("score: %u, similarity: %.2f, avalanching: %.1f\n", score, similarity, avalanching);
}
void calc_buffer_hashes(char* type, const uint8_t* buffer, size_t length, size_t count) {
printf("Calculating hashes of %s buffers...\n", type);
for (int algo = 1; algo <= ALGO_COUNT; algo++) {
hash_function func = get_hash_function(algo);
timing_start();
for (size_t i = 0; i < count; i++) {
hashes[i] = func(buffer + length*i, length);
}
timing_end();
count_collisions(hashes, count);
}
}
void calc_word_hashes(const char* fname, char* buffer) {
#define MAX_WORD_LENGTH 50
printf("Calculating hashes of English words... (please, provide name of the file with words as first argument)\n");
char line[MAX_WORD_LENGTH + 1];
size_t word_count = 0;
// Open file
FILE* f = fopen(fname, "r");
// Read file line by line
while (fgets(line, sizeof(line), f)) {
line[strcspn(line, "\n")] = '\0'; // strip newline
strcpy(buffer + word_count*MAX_WORD_LENGTH, line);
word_count++;
}
fclose(f);
// Calculate hashes
for (int algo = 1; algo <= ALGO_COUNT; algo++) {
hash_function func = get_hash_function(algo);
timing_start();
for (size_t i = 0; i < word_count; i++) {
char* word = buffer + MAX_WORD_LENGTH*i;
size_t len = strlen(word);
hashes[i] = func((uint8_t*)word, len);
}
timing_end();
count_collisions(hashes, word_count);
evaluate_avalanching(hashes, word_count);
}
}
int main(int argc, char* argv[]) {
srandom(SEED);
fill_random((uint32_t*) buffer, LARGE * SMALL / sizeof(uint32_t));
show_sequence();
calc_buffer_hashes("small", buffer, SMALL, LARGE);
calc_buffer_hashes("large", buffer, LARGE, SMALL);
calc_word_hashes(argv[1], (char*)buffer);
return 0;
}