如果我有两个 32 位值 X 和 Y,如何有效地将它们的位按照 xyxyxyxy 的顺序交织成一个 64 位值 Z...(Z 是 Z 顺序曲线上的一个位置。)
我可以迭代 X 和 Y 中的每个位,同时设置 Z 中的位。这看起来效率很低。
是否有一种捷径可以将两个值中的位交错为一个大值,而这需要不到一百条CPU指令?
此 C++ 答案也适用于 C:https://stackoverflow.com/a/39490836/11993121
答案概括了原理,但没有写出完整的解决方案。工作实现如下:
#include <stdint.h>
uint64_t interleave(uint32_t x0, uint32_t y0)
{
static const uint64_t B[] = {0x0000FFFF0000FFFF, 0x00FF00FF00FF00FF, 0x0F0F0F0F0F0F0F0F, 0x3333333333333333, 0x5555555555555555};
static const unsigned S[] = {16, 8, 4, 2, 1};
uint64_t x = x0;
uint64_t y = y0;
for(unsigned i = 0; i < sizeof(B)/sizeof(B[0]); i++)
{
x = (x | (x << S[i])) & B[i];
y = (y | (y << S[i])) & B[i];
}
return x | (y << 1);
}
测试示例:
#include <stdio.h>
void printBinary64(uint64_t x)
{
uint64_t bit = ((uint64_t)1 << 63);
for(unsigned i = 0; i < 64; i++)
{
printf("%c", (x&bit) ? '1' : '0');
bit = bit >> 1;
}
}
void printBinary32(uint32_t x)
{
uint32_t bit = ((uint32_t)1 << 31);
for(unsigned i = 0; i < 32; i++)
{
printf("%c ", (x&bit) ? '1' : '0');
bit = bit >> 1;
}
}
int main(void)
{
uint32_t x = 0x01234567;
uint32_t y = 0xFEDCBA98;
printf(" ");
printBinary32(x);
printf("\n");
printBinary32(y);
printf("\n");
printBinary64(interleave(x,y));
printf("\n");
}
就其价值而言,这是一个手动矢量化版本,它使用 SSE 内在函数同时对 x 和 y 进行位运算。然而,gcc 编译器对我来说太聪明了,它以某种方式更有效地矢量化尼尔森的标量函数。我还没研究过为什么。不管怎样,这可行,但可能只对那些可能想要调整它以使用更宽的向量宽度来同时交错一对以上 uint32_ts 的人有用。
uint64_t interleavesse2(uint32_t x0, uint32_t y0) {
// Works but is almost 50% slower!! than scalar version when compiled with gcc -O3 -msse2
// Materially faster than when scalar version compiled with gcc -O2 -msse2
__m128i mask = _mm_setr_epi32(0xffff, 0xffff, 0xffff, 0xffff);
__m128i z = _mm_setr_epi32(x0, 0, y0, 0);
z = _mm_and_si128(_mm_or_si128(z, _mm_slli_epi64(z, 16)), mask);
mask = _mm_setr_epi32(0xff00ff, 0xff00ff, 0xff00ff, 0xff00ff);
z = _mm_and_si128(_mm_or_si128(z, _mm_slli_epi64(z, 8)), mask);
mask = _mm_setr_epi32(0xf0f0f0f, 0xf0f0f0f, 0xf0f0f0f, 0xf0f0f0f);
z = _mm_and_si128(_mm_or_si128(z, _mm_slli_epi64(z, 4)), mask);
mask = _mm_setr_epi32(0x33333333, 0x33333333, 0x33333333, 0x33333333);
z = _mm_and_si128(_mm_or_si128(z, _mm_slli_epi64(z, 2)), mask);
mask = _mm_setr_epi32(0x55555555, 0x55555555, 0x55555555, 0x55555555);
z = _mm_and_si128(_mm_or_si128(z, _mm_slli_epi64(z, 1)), mask);
uint64_t res[2];
memcpy(res, &z, 16);
return res[0] | (res[1] << 1);
}