如何在使用输入数据时实现 FMA 的峰值触发器吞吐量（同时保持所需的屋顶线计算/负载比）？

Question

我尝试在加载输入数据时实现 SIMD FMA 计算的峰值浮点吞吐量。我加载相对计算/内存加载速度允许的尽可能多的数据。我还应用了缓冲来避免直接的数据依赖性，但这也没有帮助。不幸的是，吞吐量下降了两倍。

考虑以下 FMA 函数：

 std::pair<float, float> fmadd_256(const float* aptr, const float* bptr,
                                  uint64_t sz, uint32_t num_loads) {
    constexpr size_t num_lanes = 256 / (sizeof(float) * 8);
    __m256 c[8];
    __m256 a[2][num_loads];
    __m256 b[2][num_loads];
    load_buffer(aptr, bptr, a[0], b[0], 0);
    for (uint64_t i = 0; i < sz; i++) {
        uint64_t curr_buffer = i % 2;
        __m256* curra = a[curr_buffer];
        __m256* currb = b[curr_buffer];
        if (i < sz - 1) {
            uint64_t next_buffer = (i + 1) % 2;
            __m256* nexta = a[next_buffer];
            __m256* nextb = b[next_buffer];
            uint64_t load_position = (i + 1) * num_lanes * num_loads;
            load_buffer(aptr, bptr, nexta, nextb, load_position);
        }
        c[0] = _mm256_fmadd_ps(curra[0], currb[0], c[0]);
        c[1] = _mm256_fmadd_ps(curra[0], currb[0], c[1]);
        c[2] = _mm256_fmadd_ps(curra[0], currb[0], c[2]);
        c[3] = _mm256_fmadd_ps(curra[0], currb[0], c[3]);
        c[4] = _mm256_fmadd_ps(curra[1], currb[1], c[4]);
        c[5] = _mm256_fmadd_ps(curra[1], currb[1], c[5]);
        c[6] = _mm256_fmadd_ps(curra[1], currb[1], c[6]);
        c[7] = _mm256_fmadd_ps(curra[1], currb[1], c[7]);
    }
    constexpr size_t num_unrolls = 8;
    uint64_t flops{num_unrolls * num_lanes * 2 * sz};
    float res = epilogue(c);
    return {res, flops};
}

运行上面的代码时，我得到的 GFLOPS/秒比率约为 60，但我的计算机可以达到 130 GFLOPS/秒。当禁用内存加载时，代码达到 120 GFLOPS/秒。我使用 Alderlake 处理器。根据 Intel Advisor 的说法，考虑到 DRAM 速度时，实现峰值吞吐量所需的计算/内存负载比为 4.4。

我可以做些什么来提高程序的吞吐量？

有趣的是，缓冲并不能提高程序的性能。当将数据加载到使用的缓冲区（设置

uint64_t next_buffer = i % 2

）时，代码的运行速度与我在两个缓冲区上循环时的速度一样快。我还可以使用对齐向量而不是未对齐向量，或者重写代码以避免循环内的分支，但我怀疑这是程序相对较慢的根本原因。

代码使用 gcc 11.4.0 编译，标志为

-O2 -march=alderlake -g -ffast-math -Wall -Wextra

。我确保在一个核心上运行该程序。理论上，我计算的是

4.9 GHZ * 2 FMA * 2 UNITS * 8 Lanes ~ 156GFLOPS/sec

的最大 GFLOPS。英特尔顾问向我展示了大约

~130GFLOPS/sec

的最大计算吞吐量。

根据评论进行编辑

编译器的行为

正如下面的评论中所指出的，在检查 godbolt 上的代码时，编译器产生了很多工作。对我来说，高效代码的最大障碍是大量的内存操作和寄存器的多样性低。

我计算每个循环迭代有 12 次
```
vmovups
```
操作，但 4 次似乎就足够了。
编译器使用很少的寄存器，甚至问题更大，使用相同的寄存器来加载缓冲数据并保存 FMA 操作的源数据。

我猜编译器的主循环将类似于以下内容：

使用寄存器 0-7 作为 FMA 操作的目标寄存器
当 i % 2 == 0 时：
- 将数据预取到寄存器8-11中
- 使用寄存器 12-15 作为 FMA 操作的源寄存器
当 i % 2 == 1 时：
- 将数据预取到寄存器12-15中
- 使用寄存器 8-11 作为源寄存器

尝试过但没有成功

根据评论，我尝试了以下方法：

通过将循环迭代次数减少一次并提升循环前后的代码来删除循环内的分支条件
更积极的优化（O3）和不同的编译器（clang 17.0.1）

原程序中的Bug

在原来的程序中，我写了

num_lanes = 256 / sizeof(float);

，但应该是

num_lanes = 256 / (sizeof(float) * 8);

完整节目 完整的测试程序如下：

#include <immintrin.h>
#include <smmintrin.h>

#include <chrono>
#include <cstddef>
#include <cstdint>
#include <cstdlib>
#include <iostream>
#include <vector>

inline float hadd(__m256 v8) {
    __m128 v = _mm256_extractf128_ps(v8, 1);
    v = _mm_add_ps(v, _mm256_castps256_ps128(v8));
    v = _mm_add_ps(v, _mm_movehl_ps(v, v));
    v = _mm_add_ss(v, _mm_movehdup_ps(v));
    return _mm_cvtss_f32(v);
}

float epilogue(__m256 c[8]) {
    c[0] = _mm256_add_ps(c[0], c[1]);
    c[2] = _mm256_add_ps(c[2], c[3]);

    c[4] = _mm256_add_ps(c[4], c[5]);
    c[6] = _mm256_add_ps(c[6], c[7]);

    c[0] = _mm256_add_ps(c[0], c[3]);
    c[4] = _mm256_add_ps(c[4], c[6]);

    c[0] = _mm256_add_ps(c[0], c[4]);
    float res = hadd(c[0]);
    return res;
}

template <typename T>
void reporting(T duration, double flops, float res) {
    float gflops_sec = (flops * 1000.0) / duration;
    std::cout << "THe inner product is: " << res << std::endl;
    std::cout << "GFLOPS/sec: " << gflops_sec << std::endl;
    std::cout << "total gflops: " << flops << std::endl;
    std::cout << "Duration: " << duration << std::endl;
}

void load_buffer(const float* aptr, const float* bptr, __m256* a, __m256* b,
                 uint64_t idx) {
    a[0] = _mm256_loadu_ps(aptr + idx);
    a[1] = _mm256_loadu_ps(aptr + idx + 8);
    b[0] = _mm256_loadu_ps(bptr + idx);
    b[1] = _mm256_loadu_ps(bptr + idx + 8);
}
std::pair<float, float> fmadd_256(const float* aptr, const float* bptr,
                                  uint64_t sz, uint32_t num_loads) {
    constexpr size_t num_lanes = 256 / (sizeof(float) * 8);
    __m256 c[8];
    __m256 a[2][num_loads];
    __m256 b[2][num_loads];
    load_buffer(aptr, bptr, a[0], b[0], 0);
    for (uint64_t i = 0; i < sz; i++) {
        uint64_t curr_buffer = i % 2;
        __m256* curra = a[curr_buffer];
        __m256* currb = b[curr_buffer];
        if (i < sz - 1) {
            uint64_t next_buffer = (i) % 2;
            __m256* nexta = a[next_buffer];
            __m256* nextb = b[next_buffer];
            uint64_t load_position = (i + 0) * num_lanes * num_loads;
            load_buffer(aptr, bptr, nexta, nextb, load_position);
        }
        c[0] = _mm256_fmadd_ps(curra[0], currb[0], c[0]);
        c[1] = _mm256_fmadd_ps(curra[0], currb[0], c[1]);
        c[2] = _mm256_fmadd_ps(curra[0], currb[0], c[2]);
        c[3] = _mm256_fmadd_ps(curra[0], currb[0], c[3]);
        c[4] = _mm256_fmadd_ps(curra[1], currb[1], c[4]);
        c[5] = _mm256_fmadd_ps(curra[1], currb[1], c[5]);
        c[6] = _mm256_fmadd_ps(curra[1], currb[1], c[6]);
        c[7] = _mm256_fmadd_ps(curra[1], currb[1], c[7]);
    }
    constexpr size_t num_unrolls = 8;
    uint64_t flops{num_unrolls * num_lanes * 2 * sz};
    float res = epilogue(c);
    return {res, flops};
}

int main(int argc, char** argv) {
    constexpr uint64_t sz = {1000000};
    constexpr uint64_t num_loads = 2;
    constexpr uint64_t num_lanes = 256 / sizeof(float);
    constexpr uint64_t eles{num_loads * num_lanes * sz};
    std::vector<float> a(eles);
    std::vector<float> b(eles);
    double total_res{0};
    constexpr size_t RUNS{100};
    fmadd_256(a.data(), b.data(), sz, num_loads);  // test call
    uint64_t total_flops{0};
    auto begin = std::chrono::high_resolution_clock::now();
    for (size_t i = 0; i < RUNS; ++i) {
        auto [res, flops] = fmadd_256(a.data(), b.data(), sz, num_loads);
        total_flops += flops;
        total_res += res;
        std::swap(a, b);
    }
    double total_gflops = 1e-9 * (double)total_flops;
    auto end = std::chrono::high_resolution_clock::now();
    double duration =
        std::chrono::duration_cast<std::chrono::milliseconds>(end - begin)
            .count();
    reporting(duration, total_gflops, total_res);
}

Answer 1

我终于明白是怎么回事了：上面例子中我使用的数据大小比我电脑的缓存大。两个输入向量中的 kb 与 GLOPS/sec 之间的关系为：

| Operation   | GFLOPS/Sec       |
|-------------|------------------|
| FMADD/12    | 127.004/s        |
| FMADD/24    | 123.528/s        |
| FMADD/32    | 124.203/s        |
| FMADD/48    | 125.133/s        |
| FMADD/56    | 125.023/s        |
| FMADD/128   | 109.202/s        |
| FMADD/256   | 91.519/s         |
| FMADD/1024  | 86.1444/s        |
| FMADD/2048  | 85.9394/s        |
| FMADD/4096  | 78.2796/s        |
| FMADD/8192  | 53.0381/s        |
| FMADD/16384 | 39.7985/s        |
| FMADD/32768 | 34.2683/s        |

如何在使用输入数据时实现 FMA 的峰值触发器吞吐量（同时保持所需的屋顶线计算/负载比）？

问题描述投票：0回答：1

1个回答

最新问题

如何在使用输入数据时实现 FMA 的峰值触发器吞吐量（同时保持所需的屋顶线计算/负载比）？

问题描述 投票：0回答：1

1个回答

最新问题

问题描述投票：0回答：1