我用 NEON 编写了一个方法来加速 Othello 游戏引擎的代码。
我原本期望 SIMD 向量化能带来加速,但速度没有任何提升,完全没有 :-(
我在 M3 上使用 Xcode 16(Clang/LLVM 16)编译并运行此程序,编译选项为:
-Ofast -fomit-frame-pointer -ffast-math
我用真实的对局示例来测试程序:每次运行大约半小时(±1/2h),该例程被调用超过十亿次。
这些都只是 64 位上的简单指令:`and`、`or`、`shift`……
我不明白......我应该如何编程这个方法来提高速度?
原始代码:
/// @brief Computes the bitboard of empty squares on which the player may move.
///
/// For every shift amount (1, 8, 7, 9) and both shift senses, the routine
/// grows a run of opponent bits that starts right next to a player bit and
/// then steps one square past the end of that run. The union of those
/// squares, with every occupied square removed, is returned.
///
/// @param p_discs 64-bit mask of the discs of the side to move.
/// @param o_discs 64-bit mask of the opponent's discs.
/// @return 64-bit mask of the legal move squares.
unsigned long long RXBitBoard::get_legal_moves(const unsigned long long p_discs, const unsigned long long o_discs) {
    typedef unsigned long long Bits;

    // Opponent discs with bits 0 and 7 of every byte cleared, so that shifts
    // by 1, 7 and 9 cannot carry a run from one byte (row) into the next.
    const Bits inner_o = o_discs & 0x7E7E7E7E7E7E7E7EULL;

    // Grows opponent runs with right shifts: two single steps, then two
    // double steps restricted to adjacent opponent pairs (1 + 1 + 2 + 2 = up
    // to six discs), and finally moves one more step to the candidate square.
    const auto scan_right_shift = [p_discs](const Bits mask, const unsigned step) -> Bits {
        const unsigned double_step = step + step;
        const Bits pairs = mask & (mask >> step);
        Bits run = (p_discs >> step) & mask;
        run |= (run >> step) & mask;
        run |= (run >> double_step) & pairs;
        run |= (run >> double_step) & pairs;
        return run >> step;
    };

    // Mirror image of scan_right_shift using left shifts.
    const auto scan_left_shift = [p_discs](const Bits mask, const unsigned step) -> Bits {
        const unsigned double_step = step + step;
        const Bits pairs = mask & (mask << step);
        Bits run = (p_discs << step) & mask;
        run |= (run << step) & mask;
        run |= (run << double_step) & pairs;
        run |= (run << double_step) & pairs;
        return run << step;
    };

    // Shift by 1, towards the low bits.
    Bits legals = scan_right_shift(inner_o, 1);

    // Shift by 1, towards the high bits, done with a single addition instead
    // of the shift-and-mask sequence: the carry ripples through a run of
    // opponent bits lying directly above a shifted player bit and sets the
    // first bit past that run. Stray opponent bits left in the sum are
    // cleared by the final occupancy mask.
    const Bits p_up_one = p_discs << 1;
    legals |= (p_up_one + inner_o) & ~p_up_one;

    // Shift by 8: whole-byte moves need no edge-column masking.
    legals |= scan_right_shift(o_discs, 8);
    legals |= scan_left_shift(o_discs, 8);

    // Shift by 7.
    legals |= scan_right_shift(inner_o, 7);
    legals |= scan_left_shift(inner_o, 7);

    // Shift by 9.
    legals |= scan_right_shift(inner_o, 9);
    legals |= scan_left_shift(inner_o, 9);

    // A move can only be played on an empty square.
    return legals & ~(p_discs | o_discs);
}
我的 NEON 代码
/// @brief NEON variant: computes the bitboard of empty squares on which the
///        player may move, handling two opposite shift senses per vector.
///
/// Each 128-bit vector holds the same 64-bit board in both lanes. Lane 0 is
/// shifted by a negative count and lane 1 by the matching positive count, so
/// one pass over a shift amount covers both senses. The two lanes are OR-ed
/// together at the end and occupied squares are removed.
///
/// @param p_discs 64-bit mask of the discs of the side to move.
/// @param o_discs 64-bit mask of the opponent's discs.
/// @return 64-bit mask of the legal move squares.
unsigned long long RXBitBoard::get_legal_moves(const unsigned long long p_discs, const unsigned long long o_discs ) {
    // Per-lane shift counts for vshlq_u64; a negative count shifts that lane
    // to the right.
    static const int64x2_t step_1  = {-1, 1};
    static const int64x2_t step_2  = {-2, 2};
    static const int64x2_t step_8  = {-8, 8};
    static const int64x2_t step_16 = {-16, 16};
    static const int64x2_t step_7  = {-7, 7};
    static const int64x2_t step_14 = {-14, 14};
    static const int64x2_t step_9  = {-9, 9};
    static const int64x2_t step_18 = {-18, 18};

    const uint64x2_t player = vdupq_n_u64(p_discs);
    const uint64x2_t opponent = vdupq_n_u64(o_discs);
    // Opponent discs with bits 0 and 7 of every byte cleared; used for the
    // shifts by 1, 7 and 9.
    const uint64x2_t inner_opponent = vdupq_n_u64(o_discs & 0x7E7E7E7E7E7E7E7EULL);

    // Grows runs of `mask` bits adjacent to a player bit (two single steps,
    // then two double steps limited to adjacent pairs) and returns the
    // squares one step beyond those runs, separately in each lane.
    const auto sweep = [player](const uint64x2_t mask,
                                const int64x2_t step,
                                const int64x2_t double_step) -> uint64x2_t {
        uint64x2_t run = vandq_u64(vshlq_u64(player, step), mask);
        run = vorrq_u64(run, vandq_u64(vshlq_u64(run, step), mask));
        const uint64x2_t pairs = vandq_u64(mask, vshlq_u64(mask, step));
        run = vorrq_u64(run, vandq_u64(vshlq_u64(run, double_step), pairs));
        run = vorrq_u64(run, vandq_u64(vshlq_u64(run, double_step), pairs));
        return vshlq_u64(run, step);
    };

    // Shifts by -1 / +1.
    uint64x2_t legals = sweep(inner_opponent, step_1, step_2);
    // Shifts by -8 / +8 (no edge-column masking needed).
    legals = vorrq_u64(legals, sweep(opponent, step_8, step_16));
    // Shifts by -7 / +7.
    legals = vorrq_u64(legals, sweep(inner_opponent, step_7, step_14));
    // Shifts by -9 / +9.
    legals = vorrq_u64(legals, sweep(inner_opponent, step_9, step_18));

    // Merge both lanes and keep only empty squares.
    const unsigned long long both_lanes = vgetq_lane_u64(legals, 0) | vgetq_lane_u64(legals, 1);
    return (both_lanes & ~(p_discs | o_discs));
}
我什至写了一个交错版本
../...
// Interleaved variant (excerpt): the same step is issued for all four shift
// pairs before moving on to the next step, instead of finishing one shift
// pair at a time. pp_discs, oo_discs, inner_oo_discs and the shift_* vectors
// come from the elided part of the function.

// Step 1: opponent discs directly next to a player disc, per shift pair.
uint64x2_t
flip_h = vandq_u64(vshlq_u64(pp_discs, shift_1), inner_oo_discs);
uint64x2_t
flip_v = vandq_u64(vshlq_u64(pp_discs, shift_8), oo_discs);
uint64x2_t
flip_d7 = vandq_u64(vshlq_u64(pp_discs, shift_7), inner_oo_discs);
uint64x2_t
flip_d9 = vandq_u64(vshlq_u64(pp_discs, shift_9), inner_oo_discs);
// Step 2: extend each run by one more opponent disc.
flip_h = vorrq_u64(flip_h, vandq_u64(vshlq_u64(flip_h, shift_1), inner_oo_discs));
flip_v = vorrq_u64(flip_v, vandq_u64(vshlq_u64(flip_v, shift_8), oo_discs));
.../...
我可以期待速度增益,不是吗?
使用 Clang 16 的 `-O3` 编译时,两个版本都生成了 84 条 ARM64 指令:https://godbolt.org/z/6rYaK7Pqe 。我没有仔细比较这两个版本的汇编有多相似,但两者都在向量上执行了大量的 `ushl` 和 `orr`,所以如果两者的汇编几乎等价,我不会感到惊讶。如果是这样,显然现代 Clang 已经足够好,手动向量化不会带来任何好处——至少按你现在的写法是这样;我没有研究这些内在函数(intrinsics)以及它们的编译结果,所以不确定是否还有改进空间。
某些指令的吞吐量成本与其他指令不同,并且它们之间的依赖链会影响性能。 一般来说,具有相同的指令数并不能保证相似的性能。 但是,由于两个函数对 128 位向量执行相同的工作,因此它们可能以非常相似的方式执行此操作,具有非常相似的指令组合和依赖链模式。
您可以通过将 `shift_1` 左移一位来得到 `shift_2`(而不是使用单独的 `static const`),从而节省一次加载(load)。或者,至少把这些常量放进一个 `int64x2_t` 数组中,这样就不必为每个变量分别执行一条 `adrp` 指令。