我有一个看似无害的函数 `f`,它在一个紧密的循环中被调用,并构成了速度瓶颈。关于如何改进它,有什么见解吗?
#define N 48
// N = 47 is also relevant
/*
 * Reference implementation: over the upper triangle of the N x N pair
 * matrix (diagonal included, 0 <= r <= s < N), accumulate
 *   popcount(A[r]&A[s]) * popcount(B[r]&B[s]) * popcount(C[r]&C[s]).
 */
int f(
    const uint16_t * __restrict A,
    const uint16_t * __restrict B,
    const uint16_t * __restrict C) {
    int total = 0;
    for (int i = 0; i < N; ++i) {
        for (int j = i; j < N; ++j) {
            const int wa = __builtin_popcount(A[i] & A[j]);
            const int wb = __builtin_popcount(B[i] & B[j]);
            const int wc = __builtin_popcount(C[i] & C[j]);
            total += wa * wb * wc;
        }
    }
    return total;
}
我尝试过的:
(a) 在线性时间内预处理 A、B、C 数组,只让 pa*pb*pc 非零的那些三元组进入双重循环。然而,由于 A、B、C 中的比特分布是均匀的,这几乎没有过滤掉任何内容。
(b) 分块(blocking)以最小化缓存未命中。
(c) 将 A、B、C 重新打包进 uint64_t,并改写 popcount 逻辑,以便一次处理 4 个输入。
这些都没有帮助。看来迭代次数(约 1000 次)才是主要问题。我是否遗漏了什么?
编辑:可以假设目标处理器支持 AVX2。目前相关的编译器选项为 -O3、-mpopcnt、-funroll-loops、-mtune=native 和 -march=native。
使用下三角矩阵而不是上三角矩阵,并把对角线单独拆出来处理,这一优化使我这里的速度提高了 2.7 倍(Clang 14.0.3,-O3,Apple M1)。它允许编译器使用向量(NEON)指令并做一些循环展开:
/*
 * Same contract as f(): sum of
 * popcount(a[r]&a[s]) * popcount(b[r]&b[s]) * popcount(c[r]&c[s])
 * over the upper triangle including the diagonal.  Implemented as the
 * strict lower triangle (equal to the strict upper triangle by symmetry
 * of &) plus a separate diagonal pass, a shape the compiler vectorizes
 * more readily than the triangular upper loop.
 */
int f2(const uint16_t * __restrict a,
       const uint16_t * __restrict b,
       const uint16_t * __restrict c) {
    int acc = 0;
    /* Strict lower triangle: row > col. */
    for (int row = 1; row < N; ++row) {
        for (int col = 0; col < row; ++col) {
            const int na = __builtin_popcount(a[row] & a[col]);
            const int nb = __builtin_popcount(b[row] & b[col]);
            const int nc = __builtin_popcount(c[row] & c[col]);
            acc += na * nb * nc;
        }
    }
    /* Diagonal: x & x == x, so no AND is needed. */
    for (int row = 0; row < N; ++row) {
        acc += __builtin_popcount(a[row])
             * __builtin_popcount(b[row])
             * __builtin_popcount(c[row]);
    }
    return acc;
}
我还尝试过用查表代替 popcount 指令,但无论在 M1 还是 Intel i7(使用 -march=native)上速度都更慢。在 i7 上,Clang 11 编译出的这个版本也并不比原始版本好多少(仅提高了约 10%)。
OP 最初并没有说明可以假设哪些硬件支持,而在这类问题上硬件差异很大。为了支持较旧的硬件,这里给出一个只需要 SSE2、不需要硬件 popcount 支持的函数。该 SSE2 版本使用了我在《快速计算 __m128i 寄存器中置位比特的数量》(Fast counting the number of set bits in __m128i register)一帖中找到的 popcnt8() 函数。
/*
 * SSE2-only per-byte popcount: returns a vector whose i-th byte holds
 * the number of set bits in the i-th byte of x.  Needs neither hardware
 * POPCNT nor SSSE3.
 */
static inline __m128i popcnt8(__m128i x) {
const __m128i popcount_mask1 = _mm_set1_epi8(0x77);
const __m128i popcount_mask2 = _mm_set1_epi8(0x0F);
__m128i n;
// Count bits in each 4-bit field.
// Chained SWAR form of the per-nibble identity
//   pop4(v) = v - (v>>1) - (v>>2) - (v>>3):
// each round shifts the *previously masked* n by one more bit and
// re-applies the 0x77 mask, which keeps only the 3 low bits of each
// nibble of the shifted value.  The 64-bit shifts are safe because the
// mask discards any bit that crossed a nibble boundary.
n = _mm_srli_epi64(x, 1);
n = _mm_and_si128(popcount_mask1, n);
x = _mm_sub_epi8(x, n);
n = _mm_srli_epi64(n, 1);
n = _mm_and_si128(popcount_mask1, n);
x = _mm_sub_epi8(x, n);
n = _mm_srli_epi64(n, 1);
n = _mm_and_si128(popcount_mask1, n);
x = _mm_sub_epi8(x, n);
// Fold the two nibble counts of each byte into its low nibble
// (a byte count is at most 8, so 4 bits suffice).
x = _mm_add_epi8(x, _mm_srli_epi16(x, 4));
x = _mm_and_si128(popcount_mask2, x);
return x;
}
除非硬件 popcount 不可用,否则与 OP 的函数相比,下面这个函数相当慢。在我的 Nehalem i5 CPU 上,启用硬件 popcount 时,OP 的函数测得约 8000–9000 个 rdtsc“周期”,不启用时约 40000 个;而这个 SSE2(gcc -msse2)版本测得约 19000 个周期。它无需修改即可用于不同的 N 值。
/*
 * SSE2 version of f() that does not need hardware POPCNT; returns the
 * same value as f().  A[r], B[r], C[r] are packed into 16-bit lanes
 * 0, 1, 2 of a single vector, so one popcnt8() call produces bit
 * counts for all three inputs at once.
 */
int f6(
const uint16_t * __restrict A,
const uint16_t * __restrict B,
const uint16_t * __restrict C) {
int r, s, E = 0;
__m128i ABCr, ABCs, ABC[N];
// Type punning through a union is well-defined in C; used here to
// read individual bytes out of the vector.
union {
__m128i v;
uint8_t u[16];
} popcounts;
// Pre-pack: 16-bit lane layout of ABC[r] is [A[r], B[r], C[r], 0...0].
for (r = 0; r < N; ++r) {
ABC[r] = _mm_setr_epi16(A[r], B[r], C[r], 0, 0, 0, 0, 0);
}
for (r = 0; r < N; ++r) {
ABCr = ABC[r];
// Diagonal term (s == r): x & x == x, so no AND is needed.
ABCs = popcnt8(ABCr);
// popcnt8 gives per-byte counts.  Shifting the register left one
// byte and adding makes byte 2k+1 = count(low byte) + count(high
// byte) of 16-bit lane k, so u[1], u[3], u[5] are the popcounts of
// the A, B, C lanes respectively.
popcounts.v = _mm_bslli_si128(ABCs, 1);
popcounts.v = _mm_add_epi8(popcounts.v, ABCs);
E += (popcounts.u[1])*(popcounts.u[3])*(popcounts.u[5]);
// Strict upper triangle: s > r.
for (s = r+1; s < N; s++) {
ABCs = ABC[s];
ABCs = _mm_and_si128(ABCs, ABCr);
ABCs = popcnt8(ABCs);
popcounts.v = _mm_bslli_si128(ABCs, 1);
popcounts.v = _mm_add_epi8(popcounts.v, ABCs);
E += (popcounts.u[1])*(popcounts.u[3])*(popcounts.u[5]);
}
}
return E;
}
虽然这个函数的性能不会给任何人留下深刻的印象,但我发现编写和基准测试它很有趣,并且认为其他人可能也会觉得它很有趣。由于假设 SSE2 支持作为最低限度是很常见的,并且多种架构的编码可能非常复杂,因此我认为分享我所做的事情有一定的价值。如果 OP 要求广泛兼容的代码并假设不超过 SSE2 支持,那么我认为这可能是一个值得的改进。
编辑:
我通过重新排序计算,制作了该函数的更快的 SSE2 版本。它的运行速度比硬件 popcount 启用的函数稍快,约为 5900 个周期。我知道 OP 想要 AVX2,但我认为如果有人想在 AVX2 或 AVX512 中创建手动矢量化版本,这种方法很有趣。
SSE2 + AVX512:
_mm_popcnt_epi16 是一个 AVX-512(BITALG)内在函数;用它替换 X_mm_popcnt_epi16 应该会带来不错的性能增益——我估计可以把每次调用的向量指令数从 10000 多条减少到不到 3000 条——但我没有支持 AVX-512 的硬件来提供基准测试。
/*
 * Per-lane popcount of eight 16-bit lanes using only SSE2 — a drop-in
 * (but slower) substitute for the AVX-512 BITALG _mm_popcnt_epi16.
 */
static inline __m128i X_mm_popcnt_epi16(__m128i v) {
// Adapted from scalar 32 bit https://stackoverflow.com/questions/109023/count-the-number-of-set-bits-in-a-32-bit-integer
// 1) Per 2-bit field: v = number of set bits in the pair.
v = _mm_sub_epi16(v, _mm_and_si128(_mm_srli_epi16(v, 1), _mm_set1_epi16(0x5555)));
// 2) Sum adjacent 2-bit counts into 4-bit fields.
v = _mm_add_epi16(_mm_and_si128(v, _mm_set1_epi16(0x3333)), _mm_and_si128(_mm_srli_epi16(v, 2), _mm_set1_epi16(0x3333)));
// 3) Sum adjacent 4-bit counts into bytes (max 16, fits a byte).
v = _mm_and_si128(_mm_add_epi16(v, _mm_srli_epi16(v, 4)), _mm_set1_epi16(0x0f0f));
// 4) Multiply by 0x0101 to add both byte counts into the high byte of
//    each lane, then shift the total down into the low byte.
return _mm_srli_epi16(_mm_mullo_epi16(v, _mm_set1_epi16(0x101)), 8);
}
/*
 * Computes the sum of popcount products for one 8x8 tile of the pair
 * matrix: rows r..r+7 against columns s..s+7.  Returns four 32-bit
 * partial sums (via _mm_madd_epi16); the caller adds the four lanes.
 */
static inline __m128i getBrs(
const uint16_t * __restrict A,
const uint16_t * __restrict B,
const uint16_t * __restrict C, uint32_t r, uint32_t s) {
uint32_t i;
__m128i Evec, popA, popB, popC, temp, temp2, Ar, Br, Cr, As, Bs, Cs;
Evec = _mm_setzero_si128();
/* getBrs does sth like...
uint32_t EB = 0;
uint32_t j;
for (i = r; i<r+8; i++) {
for (j = s; j<s+8; j++) {
EB += __builtin_popcount(A[i] & A[j])*__builtin_popcount(B[i] & B[j])*__builtin_popcount(C[i] & C[j]);
}
}
*/
Ar = _mm_loadu_si128( (const __m128i*)&A[r]);
Br = _mm_loadu_si128( (const __m128i*)&B[r]);
Cr = _mm_loadu_si128( (const __m128i*)&C[r]);
As = _mm_loadu_si128( (const __m128i*)&A[s]);
Bs = _mm_loadu_si128( (const __m128i*)&B[s]);
Cs = _mm_loadu_si128( (const __m128i*)&C[s]);
for (i=0; i<8; i++) {
// Rotate the column vectors by one 16-bit lane per iteration: shift
// the register down 2 bytes, then re-insert element A[s]/B[s]/C[s]
// into the top lane (s advances at the bottom of the loop, so the
// re-inserted elements walk through s..s+7).  After 8 iterations
// every row lane has been paired with every column element of the
// tile exactly once.
As = _mm_bsrli_si128(As, 2);
As = _mm_insert_epi16(As, A[s], 7);
Bs = _mm_bsrli_si128(Bs, 2);
Bs = _mm_insert_epi16(Bs, B[s], 7);
Cs = _mm_bsrli_si128(Cs, 2);
Cs = _mm_insert_epi16(Cs, C[s], 7);
temp = _mm_and_si128(Ar, As);
popA = X_mm_popcnt_epi16(temp);
temp = _mm_and_si128(Br, Bs);
popB = X_mm_popcnt_epi16(temp);
temp = _mm_and_si128(Cr, Cs);
popC = X_mm_popcnt_epi16(temp);
// Per-lane product of the three popcounts; each factor is <= 16, so
// the product fits in a 16-bit lane (max 4096).
temp = _mm_mullo_epi16(popA, popB);
temp2 = _mm_mullo_epi16(temp, popC);
// NOTE(review): accumulating 8 products of up to 4096 can reach
// 32768, which wraps a signed 16-bit lane.  Only reachable when the
// operands are all 0xFFFF — confirm inputs cannot be all-ones.
Evec = _mm_add_epi16(Evec, temp2);
s++;
}
// Widen the eight 16-bit partial sums into four 32-bit lanes.
return _mm_madd_epi16(Evec, _mm_set1_epi16(1));
}
/*
 * Tiled SSE2 version of f(): processes the N x N pair matrix in 8x8
 * tiles via getBrs().
 *
 * Let F be the sum over the FULL matrix (all r, s) and D the sum over
 * the diagonal (r == s).  The upper-triangle-plus-diagonal sum that
 * f() computes equals (F + D) / 2.  The tile loop accumulates F by
 * counting each off-diagonal tile twice (the matrix is symmetric in
 * r and s) and each diagonal tile once.
 *
 * NOTE: requires N to be a multiple of 8 (8-wide tiles, unaligned
 * 128-bit loads of 8 consecutive uint16_t).
 */
int f8(
const uint16_t * __restrict A,
const uint16_t * __restrict B,
const uint16_t * __restrict C) {
    uint32_t r, i, j, Ediag = 0, E = 0;
    __m128i Evec, popA, popB, popC, temp, temp2, Ar, Br, Cr;
    Evec = _mm_setzero_si128();
    union {
        __m128i v;
        uint32_t u32[4];
    } popcounts;
    /* Diagonal pass, the vector equivalent of:
       for (i = 0; i < N; i++)
           Ediag += popcount(A[i]) * popcount(B[i]) * popcount(C[i]);
       (x & x == x, so no AND is needed on the diagonal). */
    for (r = 0; r < N; r += 8) {   /* was hard-coded 48; use the N macro */
        Ar = _mm_loadu_si128((const __m128i *)&A[r]);
        Br = _mm_loadu_si128((const __m128i *)&B[r]);
        Cr = _mm_loadu_si128((const __m128i *)&C[r]);
        popA = X_mm_popcnt_epi16(Ar);
        popB = X_mm_popcnt_epi16(Br);
        popC = X_mm_popcnt_epi16(Cr);
        temp = _mm_mullo_epi16(popA, popB);
        temp2 = _mm_mullo_epi16(temp, popC);
        Evec = _mm_add_epi16(Evec, temp2);
    }
    /* Widen the eight 16-bit partial sums to four 32-bit lanes, then
       reduce to a scalar. */
    popcounts.v = _mm_madd_epi16(Evec, _mm_set1_epi16(1));
    Ediag = popcounts.u32[0] + popcounts.u32[1] + popcounts.u32[2] + popcounts.u32[3];
    Evec = _mm_setzero_si128();
    for (i = 0; i < N; i += 8) {
        /* Diagonal tile: counted once. */
        Evec = _mm_add_epi32(Evec, getBrs(A, B, C, i, i));
        for (j = i + 8; j < N; j += 8) {
            /* Off-diagonal tile: tile (i,j) equals tile (j,i), so count
               it twice instead of computing both. */
            temp = getBrs(A, B, C, i, j);
            temp = _mm_add_epi32(temp, temp);
            Evec = _mm_add_epi32(Evec, temp);
        }
    }
    popcounts.v = Evec;
    E = popcounts.u32[0] + popcounts.u32[1] + popcounts.u32[2] + popcounts.u32[3];
    return (Ediag + E) / 2;
}