为什么 ARM NEON 内在函数在查找合法的奥赛罗棋步方面并不比普通 C++ 更快?

问题描述 投票:0回答:1

我用 NEON 编写了一个方法来加速 Othello 游戏引擎的代码。

我原以为使用 SIMD 向量会带来加速,但速度没有任何提升。完全没有 :-(
我在 M3 上使用 Xcode 16、Clang/LLVM 16 运行此程序,并使用

-Ofast -fomit-frame-pointer -ffast-math

我用真实对局的例子来测试我的程序:每次运行大约半小时(±1/2h),也就是该例程被调用超过十亿次。

这只是对 64 位整数执行的简单指令:`and`、`or`、`shift`……

我不明白......我应该如何编程这个方法来提高速度?

原始代码:

/**
 * Returns a bitboard of the empty squares on which the owner of p_discs can
 * play, given the opponent's discs in o_discs.
 *
 * For each of the eight shift directions the function collects the opponent
 * discs reachable in a contiguous line from a player disc (up to six of
 * them: 1 + 1 + 2 + 2 steps), then steps once more in that direction to reach
 * the candidate square. Squares already holding a disc are removed at the end.
 *
 * NOTE(review): the shift amounts (1, 7, 8, 9) and the per-byte 0x7E mask
 * suggest an 8x8 board stored one byte per row -- confirm against the board
 * layout used by the rest of the engine.
 */
unsigned long long RXBitBoard::get_legal_moves(const unsigned long long p_discs, const unsigned long long o_discs) {

    // Squares holding no disc at all; applied once, on the way out.
    const unsigned long long empties = ~(p_discs | o_discs);

    // Opponent discs with bits 0 and 7 of every byte cleared. Used for every
    // shift that is not a multiple of 8, so that a bit moved across a byte
    // boundary is masked away instead of extending a line.
    const unsigned long long o_inner = o_discs & 0x7E7E7E7E7E7E7E7EULL;

    unsigned long long run;    // opponent discs reached so far in the current direction
    unsigned long long pairs;  // opponent discs whose next neighbour is an opponent disc too
    unsigned long long moves;  // union of the candidate squares of all directions

    /* direction W */
    run    = o_inner & (p_discs >> 1);
    run   |= o_inner & (run >> 1);
    pairs  = o_inner & (o_inner >> 1);
    run   |= pairs & (run >> 2);
    run   |= pairs & (run >> 2);
    moves  = run >> 1;

    /* direction _E
     * Shortcut replacing the same four-step sweep done with left shifts:
     * adding the masked opponent discs to the shifted player discs lets the
     * carry travel through each run of opponent discs and set the bit just
     * past it. Bits that land on occupied squares are dropped by the final
     * mask with the empty squares. */
    const unsigned long long p_east = p_discs << 1;
    moves |= (p_east + o_inner) & ~p_east;

    /* direction S */
    run    = o_discs & (p_discs >> 8);
    run   |= o_discs & (run >> 8);
    pairs  = o_discs & (o_discs >> 8);
    run   |= pairs & (run >> 16);
    run   |= pairs & (run >> 16);
    moves |= run >> 8;

    /* direction N */
    run    = o_discs & (p_discs << 8);
    run   |= o_discs & (run << 8);
    pairs  = o_discs & (o_discs << 8);
    run   |= pairs & (run << 16);
    run   |= pairs & (run << 16);
    moves |= run << 8;

    /* direction NE */
    run    = o_inner & (p_discs >> 7);
    run   |= o_inner & (run >> 7);
    pairs  = o_inner & (o_inner >> 7);
    run   |= pairs & (run >> 14);
    run   |= pairs & (run >> 14);
    moves |= run >> 7;

    /* direction SW */
    run    = o_inner & (p_discs << 7);
    run   |= o_inner & (run << 7);
    pairs  = o_inner & (o_inner << 7);
    run   |= pairs & (run << 14);
    run   |= pairs & (run << 14);
    moves |= run << 7;

    /* direction NW */
    run    = o_inner & (p_discs >> 9);
    run   |= o_inner & (run >> 9);
    pairs  = o_inner & (o_inner >> 9);
    run   |= pairs & (run >> 18);
    run   |= pairs & (run >> 18);
    moves |= run >> 9;

    /* direction SE */
    run    = o_inner & (p_discs << 9);
    run   |= o_inner & (run << 9);
    pairs  = o_inner & (o_inner << 9);
    run   |= pairs & (run << 18);
    run   |= pairs & (run << 18);
    moves |= run << 9;

    // Only empty squares can be played.
    return moves & empties;

}

我的 NEON 代码

unsigned long long RXBitBoard::get_legal_moves(const unsigned long long p_discs, const unsigned long long o_discs ) {
    
    const uint64x2_t pp_discs = vdupq_n_u64(p_discs);
    const uint64x2_t oo_discs = vdupq_n_u64(o_discs);
    
    const uint64x2_t inner_oo_discs = vdupq_n_u64(o_discs & 0x7E7E7E7E7E7E7E7EULL);

    
    //horizontals directions -1, +1
    static const int64x2_t shift_1 = {-1, 1};
    static const int64x2_t shift_2 = {-2, 2};
    
    uint64x2_t
    flipped = vandq_u64(vshlq_u64(pp_discs, shift_1), inner_oo_discs);
    flipped = vorrq_u64(flipped, vandq_u64(vshlq_u64(flipped, shift_1), inner_oo_discs));

    uint64x2_t 
    adjacent_oo_discs = vandq_u64(inner_oo_discs, vshlq_u64(inner_oo_discs, shift_1));

    flipped = vorrq_u64(flipped, vandq_u64(vshlq_u64(flipped, shift_2), adjacent_oo_discs));
    flipped = vorrq_u64(flipped, vandq_u64(vshlq_u64(flipped, shift_2), adjacent_oo_discs));

    uint64x2_t legals = vshlq_u64(flipped, shift_1);

    //verticals directions -8 , +8
    static const int64x2_t shift_8  = {-8,   8};
    static const int64x2_t shift_16 = {-16, 16};

    flipped = vandq_u64(vshlq_u64(pp_discs, shift_8), oo_discs);
    flipped = vorrq_u64(flipped, vandq_u64(vshlq_u64(flipped, shift_8), oo_discs));
    
    adjacent_oo_discs = vandq_u64(oo_discs, vshlq_u64(oo_discs, shift_8));
    
    flipped = vorrq_u64(flipped, vandq_u64(vshlq_u64(flipped, shift_16), adjacent_oo_discs));
    flipped = vorrq_u64(flipped, vandq_u64(vshlq_u64(flipped, shift_16), adjacent_oo_discs));

    legals = vorrq_u64(legals, vshlq_u64(flipped, shift_8));

    //diagonals directions -7 , +7
    static const int64x2_t shift_7  = {-7,   7};
    static const int64x2_t shift_14 = {-14, 14};

    flipped = vandq_u64(vshlq_u64(pp_discs, shift_7), inner_oo_discs);
    flipped = vorrq_u64(flipped, vandq_u64(vshlq_u64(flipped, shift_7), inner_oo_discs));

    adjacent_oo_discs = vandq_u64(inner_oo_discs, vshlq_u64(inner_oo_discs, shift_7));

    flipped = vorrq_u64(flipped, vandq_u64(vshlq_u64(flipped, shift_14), adjacent_oo_discs));
    flipped = vorrq_u64(flipped, vandq_u64(vshlq_u64(flipped, shift_14), adjacent_oo_discs));

    legals = vorrq_u64(legals, vshlq_u64(flipped, shift_7));
    
    //diagonals directions -9 , +9
    static const int64x2_t shift_9  = {-9,   9};
    static const int64x2_t shift_18 = {-18, 18};

    flipped = vandq_u64(vshlq_u64(pp_discs, shift_9), inner_oo_discs);
    flipped = vorrq_u64(flipped, vandq_u64(vshlq_u64(flipped, shift_9), inner_oo_discs));

    adjacent_oo_discs = vandq_u64(inner_oo_discs, vshlq_u64(inner_oo_discs, shift_9));

    flipped = vorrq_u64(flipped, vandq_u64(vshlq_u64(flipped, shift_18), adjacent_oo_discs));
    flipped = vorrq_u64(flipped, vandq_u64(vshlq_u64(flipped, shift_18), adjacent_oo_discs));

    legals = vorrq_u64(legals, vshlq_u64(flipped, shift_9));
    
    
    return ((vgetq_lane_u64(legals, 0) | vgetq_lane_u64(legals, 1)) & ~(p_discs | o_discs));

}

我什至写了一个交错版本


../...
    // Excerpt of an interleaved variant: the first sweep step of all four
    // direction pairs is computed before any second step, so the four
    // accumulators do not depend on one another.
    // NOTE(review): pp_discs, oo_discs, inner_oo_discs and the shift_N
    // constants are not defined in this excerpt -- presumably the same ones as
    // in the NEON version; confirm.
    uint64x2_t
    flip_h = vandq_u64(vshlq_u64(pp_discs, shift_1), inner_oo_discs);
    uint64x2_t
    flip_v = vandq_u64(vshlq_u64(pp_discs, shift_8), oo_discs);
    uint64x2_t
    flip_d7 = vandq_u64(vshlq_u64(pp_discs, shift_7), inner_oo_discs);
    uint64x2_t
    flip_d9 = vandq_u64(vshlq_u64(pp_discs, shift_9), inner_oo_discs);

    // Second sweep step; only the horizontal and vertical accumulators are
    // shown before the excerpt is cut off.
    flip_h = vorrq_u64(flip_h, vandq_u64(vshlq_u64(flip_h, shift_1), inner_oo_discs));
    flip_v = vorrq_u64(flip_v, vandq_u64(vshlq_u64(flip_v, shift_8), oo_discs));

.../...

我可以期待速度增益,不是吗?

c++ clang arm64 neon othello
1个回答
0
投票

使用 Clang 16 和 `-O3` 编译时,两个版本都生成了 84 条 ARM64 指令:https://godbolt.org/z/6rYaK7Pqe
编译器自动向量化了标量版本。

我没有仔细比较这两个版本有多相似,但两者都对向量执行了大量的 `ushl` 和 `orr`,所以如果 asm 几乎等效,我不会感到惊讶。 如果是这样,显然现代 Clang 已经足够好了,手动矢量化没有任何好处。 至少按照你现在的做法是这样;我没有研究这些内在函数本身或它们的编译结果,因此不确定是否还有改进的空间。

某些指令的吞吐量成本与其他指令不同,并且它们之间的依赖链会影响性能。 一般来说,具有相同的指令数并不能保证相似的性能。 但是,由于两个函数对 128 位向量执行相同的工作,因此它们可能以非常相似的方式执行此操作,具有非常相似的指令组合和依赖链模式。


您可以通过将 `shift_1` 左移一位来得到 `shift_2`(而不是使用单独的 `static const`),从而节省一次加载(load)。 或者至少将这些常量放进一个 `int64x2_t` 数组中,这样就不需要为每个变量单独执行一条 `adrp` 指令。

最新问题
© www.soinside.com 2019 - 2025. All rights reserved.