I have a loop that I am trying to parallelize efficiently with OpenMP. It accumulates the squared L2 norms of a stream of vectors, weighted by mass, in a reduction. Here is the loop:
struct vec3
{
    float data[3] = {};
};

float accumulate_eta_sq_t_mass(const vec3* etas, const float* masses, const std::size_t& n)
{
    auto a = 0.0;
    #pragma omp parallel for simd safelen(16) reduction(+:a)
    for (auto ii = std::size_t{}; ii < n; ++ii)
    {
        const auto& eta = etas[ii];
        const auto x = static_cast<double>(eta.data[0]);
        const auto y = static_cast<double>(eta.data[1]);
        const auto z = static_cast<double>(eta.data[2]);
        const auto m = static_cast<double>(masses[ii]);
        a += (x * x + y * y + z * z) * m;
    }
    return static_cast<float>(a);
}
I wrote a micro-benchmark program (code provided in the appendix) which shows that this simple loop does not scale well, as the figure below shows.
According to Amdahl's law, the serial fraction we are seeing is 15%. In your experience (or in theory), is it normal for an OpenMP loop with a reduction clause to scale this modestly? I can provide more information if needed. Thanks in advance.
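For reference, Amdahl's law relates the speedup $S(p)$ on $p$ threads to the serial fraction $s$:

$$S(p) = \frac{1}{s + (1 - s)/p} \quad\Longrightarrow\quad s = \frac{p/S(p) - 1}{p - 1},$$

so the 15% figure is the $s$ for which this curve best matches the measured speedups.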
Note: I use double precision in the loop to avoid large rounding errors in the accumulation. Using single precision did not lead to any improvement in scaling.
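As an aside, compensated (Kahan) summation would be a possible alternative to widening the accumulator to double. The sketch below is serial and hypothetical (the name accumulate_kahan is mine); it assumes plain -O3, since -ffast-math would let the compiler reassociate the compensation away, and combining it with the OpenMP loop would require a user-defined reduction.

#include <cstddef> // std::size_t

// Kahan-compensated accumulation in single precision (serial sketch).
float accumulate_kahan(const vec3* etas, const float* masses, std::size_t n)
{
    auto sum = 0.0f;
    auto c = 0.0f; // compensation: running estimate of the lost low-order bits
    for (auto ii = std::size_t{}; ii < n; ++ii)
    {
        const auto& eta = etas[ii];
        const auto term = (eta.data[0] * eta.data[0]
                         + eta.data[1] * eta.data[1]
                         + eta.data[2] * eta.data[2]) * masses[ii];
        const auto y = term - c;
        const auto t = sum + y;
        c = (t - sum) - y; // recovers what was rounded away when adding y
        sum = t;
    }
    return sum;
}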
Additional information
@paleonix This runs on an Intel(R) Core(TM) i5-8400 CPU @ 2.80GHz with 6 cores (no SMT), cache sizes of 192 KiB, 1.5 MiB, and 9 MiB, and AVX2 as the highest vector flag.
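A back-of-the-envelope arithmetic-intensity estimate from the data layout (my count: 4 multiplies and 3 additions per element, 16 streamed bytes):

$$B_{\text{iter}} = \underbrace{12\,\mathrm{B}}_{\texttt{vec3}} + \underbrace{4\,\mathrm{B}}_{\texttt{float}} = 16\,\mathrm{B}, \qquad I = \frac{7\,\text{flop}}{16\,\mathrm{B}} \approx 0.44\,\text{flop/B},$$

which is far below the compute-bound regime of an AVX2 core, so for working sets beyond the 9 MiB L3 the loop should be limited by memory bandwidth rather than by the core count.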
Appendix
Here is a minimal reproducer; it should compile with:
g++ -O3 -std=c++17 -fopenmp ./min_rep.cpp -o min_rep
/// \file This file contains a micro-benchmark program for the mass-weighted squared-norm reduction loop.
#include <string> // std::stoul
#include <cstdlib> // std::abort, std::free
#include <iostream> // std::cerr
#include <cstdint> // std::uint64_t
#include <algorithm> // std::min
#include <limits> // std::numeric_limits
#include <vector> // std::vector
#include <omp.h>
#include <stdlib.h> // posix_memalign
#include <unistd.h> // sysconf
struct vec3
{
    float data[3] = {};
};

float accumulate_eta_sq_t_mass(const vec3* etas, const float* masses, const std::size_t& n)
{
    auto a = 0.0;
    // reduction(+:a) gives every thread (and SIMD lane) a private partial sum
    // which OpenMP combines after the loop
    #pragma omp parallel for simd safelen(16) reduction(+:a)
    for (auto ii = std::size_t{}; ii < n; ++ii)
    {
        const auto& eta = etas[ii];
        const auto x = static_cast<double>(eta.data[0]);
        const auto y = static_cast<double>(eta.data[1]);
        const auto z = static_cast<double>(eta.data[2]);
        const auto m = static_cast<double>(masses[ii]);
        a += (x * x + y * y + z * z) * m;
    }
    return static_cast<float>(a);
}
int main(int argc, char** argv)
{
    // extract the problem size
    if (argc < 2)
    {
        std::cerr << "Please provide the problem size as command line argument." << std::endl;
        return 1;
    }
    const auto n = static_cast<std::size_t>(std::stoul(argv[1]));
    if (n == 0)
    {
        std::cerr << "Zero valued problem size provided. The program will now be aborted." << std::endl;
        return 1;
    }
    // footprint is sizeof(vec3) + sizeof(float) = 16 B per element
    if (n * (sizeof(vec3) + sizeof(float)) / (1024ull * 1024 * 1024) > 40) // let's assume there's only 64 GiB of RAM
    {
        std::cerr << "Problem size is too large. The program will now be aborted." << std::endl;
        return 1;
    }
    // report
    std::cout << "Starting runs with problem size n=" << n << ".\nThread count: " << omp_get_max_threads() << "."
              << std::endl;
    // details
    const auto page_size = sysconf(_SC_PAGESIZE);
    const auto page_size_vec3 = page_size / sizeof(vec3);
    const auto page_size_float = page_size / sizeof(float);
    // experiment loop
    const auto experiment_count = std::size_t{50};
    const auto warm_up_count = std::size_t{10};
    const auto run_count = experiment_count + warm_up_count;
    auto durations = std::vector(experiment_count, std::numeric_limits<std::uint64_t>::min());
    vec3* etas = nullptr;
    float* masses = nullptr;
    for (auto run_index = std::size_t{}; run_index < run_count; ++run_index)
    {
        // allocate page-aligned buffers
        const auto alloc_status0 = posix_memalign(reinterpret_cast<void**>(&etas), page_size, n * sizeof(vec3));
        const auto alloc_status1 = posix_memalign(reinterpret_cast<void**>(&masses), page_size, n * sizeof(float));
        if (alloc_status0 != 0 || alloc_status1 != 0 || etas == nullptr || masses == nullptr)
        {
            std::cerr << "Fatal error, failed to allocate memory." << std::endl;
            std::abort();
        }
        // "first touch": fault in roughly one element per page with the same
        // static schedule as the benchmark loop, so each page is mapped near
        // the thread that will later read it
        #pragma omp parallel for schedule(static)
        for (auto ii = std::size_t{}; ii < n; ii += page_size_vec3)
        {
            etas[ii].data[0] = 0.f;
        }
        #pragma omp parallel for schedule(static)
        for (auto ii = std::size_t{}; ii < n; ii += page_size_float)
        {
            masses[ii] = 0.f;
        }
        // run experiment; keep the result in a volatile sink so the optimizer
        // cannot drop the call as dead code
        const auto t1 = omp_get_wtime();
        volatile auto sink = accumulate_eta_sq_t_mass(etas, masses, n);
        static_cast<void>(sink);
        const auto t2 = omp_get_wtime();
        const auto duration_in_us = static_cast<std::int64_t>((t2 - t1) * 1E+6);
        if (duration_in_us <= 0)
        {
            std::cerr << "Fatal error, no time elapsed in the test function." << std::endl;
            std::abort();
        }
        if (run_index >= warm_up_count)
        {
            durations[run_index - warm_up_count] = static_cast<std::uint64_t>(duration_in_us);
        }
        // deallocate
        std::free(etas);
        std::free(masses);
        etas = nullptr;
        masses = nullptr;
    }
// statistics
auto min = std::numeric_limits<std::uint64_t>::max();
auto max = std::uint64_t{};
auto mean = std::uint64_t{};
for (const auto& duration : durations)
{
min = std::min(min, duration);
max = std::max(max, duration);
mean += duration;
}
mean /= experiment_count;
// report duration
std::cout << "Mean duration: " << mean << " us\n"
<< "Min. duration: " << min << " us\n"
<< "Max. duration: " << max << " us.\n";
    // compute effective B/W; per element we stream sizeof(vec3) + sizeof(float)
    // = 16 B, i.e. the equivalent of 4 floats
    const auto traffic = 4 * n * sizeof(float);
    constexpr auto inv_gibi = 1.0 / static_cast<double>(1024 * 1024 * 1024);
    const auto traffic_in_gib = static_cast<double>(traffic) * inv_gibi;
    std::cout << "Traffic per run: " << traffic << " B (" << traffic_in_gib << " GiB)\n"
              << "Mean effective B/W: " << traffic_in_gib / (static_cast<double>(mean) * 1E-6) << " GiB/s\n"
              << "Min. effective B/W: " << traffic_in_gib / (static_cast<double>(max) * 1E-6) << " GiB/s\n"
              << "Max. effective B/W: " << traffic_in_gib / (static_cast<double>(min) * 1E-6) << " GiB/s\n"
              << std::endl;
    return 0;
}
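For completeness, an example invocation (the argument is the element count; 2^27 elements correspond to 2 GiB of allocated data at 16 B per element):

OMP_NUM_THREADS=6 ./min_rep 134217728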