AVX2 代码用于在 8 个 4 字节目标中查找 4 字节字符串的最长匹配

问题描述 投票:0回答:1

我需要与此等效的最快(即无分支、最小化 uops)AVX2 代码:

prevlen = 0
for i=0..7:
  len = matched_bytes(target[i], src)
  if len > prevlen:
    prevlen = len
    index = i

其中 target[i] 和 src 是 4 字节字符串,matched_bytes 返回 0..4 - 相等的低字节数:

def matched_bytes(target, src):
  return lzcnt(target ^ src) / 8

我的最佳版本需要 17 个命令。我可以接受没有最佳长度的情况,索引对于我的一些用例来说已经足够了。

可以用更少的命令来完成吗?我不太关心延迟或不公平的 ALU 使用,因为它是较大代码的一部分。

byte_eq = pmovmskb( pcmpeqb( broadcast(src), targets))

// bit 4*i set if byte1..4 is equal
byte_eq1 = flags
byte_eq2 = flags >> 1
byte_eq3 = flags >> 2
byte_eq4 = flags >> 3

// bit 4*i set if at least 1..4 bytes are equal
len1 = byte_eq1 & 0x11111111
len2 = len1 & byte_eq2
len3 = len2 & byte_eq3
len4 = len3 & byte_eq4

if(len2==0) len2 = len1
if(len3==0) len3 = len2
if(len4==0) len4 = len3

index = lzcnt(len4) / 4
// if len4==0 then no match was found
intel simd avx avx2
1个回答
0
投票

Pack 4xu8 将掩码降低至 16 位,然后使用

PHMINPOSUW

#include <smmintrin.h> // SSE4.1 intrinsics
#include <stdint.h>
#include <stdio.h>

void fn (uint32_t* arr, uint32_t val) {
    const __m128i neg1 = _mm_set1_epi32(-1);
    __m128i search_value = _mm_set1_epi32(val);
    __m128i row0 = _mm_loadu_si128((__m128i*)&arr[0]);
    __m128i row1 = _mm_loadu_si128((__m128i*)&arr[4]);
    __m128i matched_bytes0 = _mm_cmpeq_epi8(row0, search_value);
    __m128i matched_bytes1 = _mm_cmpeq_epi8(row1, search_value);
    __m128i packed = _mm_packs_epi16(matched_bytes0, matched_bytes1);
    __m128i t1mskc = _mm_or_si128(_mm_xor_si128(packed, neg1), _mm_sub_epi16(packed, neg1));
    __m128i best_match = _mm_minpos_epu16(t1mskc);
    uint32_t match_desc = (uint32_t)_mm_cvtsi128_si32(best_match);
    uint32_t match_index = match_desc >> 16;
    uint32_t match_length = (__builtin_ctzl(match_desc | 0x10000) >> 2);

    printf("index: %d, length: %d\n", match_index, match_length);
}
© www.soinside.com 2019 - 2024. All rights reserved.