我需要与此等效的最快(即无分支、最小化 uops)AVX2 代码:
prevlen = 0
for i=0..7:
len = matched_bytes(target[i], src)
if len > prevlen:
prevlen = len
index = i
其中 target[i] 和 src 是 4 字节字符串,matched_bytes 返回 0..4 - 相等的低字节数:
def matched_bytes(target, src):
return lzcnt(target ^ src) / 8
我的最佳版本需要 17 个命令。我可以接受没有最佳长度的情况,索引对于我的一些用例来说已经足够了。
可以用更少的命令来完成吗?我不太关心延迟或不公平的 ALU 使用,因为它是较大代码的一部分。
byte_eq = pmovmskb( pcmpeqb( broadcast(src), targets))
// bit 4*i set if byte1..4 is equal
byte_eq1 = flags
byte_eq2 = flags >> 1
byte_eq3 = flags >> 2
byte_eq4 = flags >> 3
// bit 4*i set if at least 1..4 bytes are equal
len1 = byte_eq1 & 0x11111111
len2 = len1 & byte_eq2
len3 = len2 & byte_eq3
len4 = len3 & byte_eq4
if(len2==0) len2 = len1
if(len3==0) len3 = len2
if(len4==0) len4 = len3
index = lzcnt(len4) / 4
// if len4==0 then no match was found
将 u8x4 比较掩码压缩为 16 位,然后使用
PHMINPOSUW
。
#include <smmintrin.h> // SSE4.1 intrinsics
#include <stdint.h>
#include <stdio.h>
void fn (uint32_t* arr, uint32_t val) {
const __m128i neg1 = _mm_set1_epi32(-1);
__m128i search_value = _mm_set1_epi32(val);
__m128i row0 = _mm_loadu_si128((__m128i*)&arr[0]);
__m128i row1 = _mm_loadu_si128((__m128i*)&arr[4]);
__m128i matched_bytes0 = _mm_cmpeq_epi8(row0, search_value);
__m128i matched_bytes1 = _mm_cmpeq_epi8(row1, search_value);
__m128i packed = _mm_packs_epi16(matched_bytes0, matched_bytes1);
__m128i t1mskc = _mm_or_si128(_mm_xor_si128(packed, neg1), _mm_sub_epi16(packed, neg1));
__m128i best_match = _mm_minpos_epu16(t1mskc);
uint32_t match_desc = (uint32_t)_mm_cvtsi128_si32(best_match);
uint32_t match_index = match_desc >> 16;
uint32_t match_length = __builtin_ctzl(match_desc | 0x10000) >> 2;
printf("index: %d, length: %d\n", match_index, match_length);
}