I am using the OpenMP task construct to run a binary-tree application in parallel. However, the measured runtimes show the single-threaded run outperforming the multi-threaded ones. How can I explain this result?
The implementation is as follows:
#include <cstddef>
#include <vector>

int main() {
  size_t N = 1 << 16;
  size_t *D = new size_t [N];
  size_t threads = 1; // 1, 2, 4, 8, 16

  std::vector<std::vector<int>> mat1(128); // a 2D matrix
  std::vector<std::vector<int>> mat2(128); // a 2D matrix
  for(size_t i = 0; i < mat1.size(); ++i) {
    mat1[i].resize(128);
    mat2[i].resize(128);
  }

  #pragma omp parallel num_threads(threads)
  {
    #pragma omp single
    {
      for(size_t i = 1; i < N; ++i) {
        size_t p = i / 2;
        size_t l = i * 2;
        size_t r = l + 1;
        if(l < N && r < N) {
          #pragma omp task firstprivate(i) depend(out:D[l], D[r]) depend(in:D[p])
          {
            // perform matrix multiplication
            for(size_t x = 0; x < 128; ++x) {
              for(size_t y = 0; y < 128; ++y) {
                size_t sum = 0;
                for(size_t z = 0; z < 128; ++z) {
                  //sum += mat1[x][z] * mat2[z][y];
                  sum += mat1[x][z] * mat2[y][z]; // mat2 is transposed for better memory access pattern
                }
              }
            }
          }
        }
        else {
          #pragma omp task firstprivate(i) depend(in:D[p])
          {
            // perform matrix multiplication
            for(size_t x = 0; x < 128; ++x) {
              for(size_t y = 0; y < 128; ++y) {
                size_t sum = 0;
                for(size_t z = 0; z < 128; ++z) {
                  //sum += mat1[x][z] * mat2[z][y];
                  sum += mat1[x][z] * mat2[y][z]; // mat2 is transposed for better memory access pattern
                }
              }
            }
          }
        }
      }
    }
  }
  delete [] D;
}
However, if I replace the matrix multiplication with a sleep, the runtime does improve as the number of threads grows. The figure below shows the runtime of the matrix-multiplication version.
The next figure shows the runtime of the sleep version.
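For reference, the sleep version replaces each task body with something like the following (a sketch; the 1 ms duration is illustrative, not necessarily the value used for the plot):

#include <chrono>
#include <thread>

#pragma omp task firstprivate(i) depend(out:D[l], D[r]) depend(in:D[p])
{
  // each task just sleeps for a fixed duration instead of computing
  std::this_thread::sleep_for(std::chrono::milliseconds(1));
}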
If I replace the matrix multiplication with a lightweight operation, as follows:
size_t sum = i * 1000; // i is the value firstprivate to each task
I observe the same non-scaling runtime as with the matrix multiplication. The runtime plot of the lightweight operation is shown below.
The implementation was compiled with gcc-12 at -O3, and ran on an Ubuntu 19.10 (Eoan Ermine) machine with 80 Intel Xeon Gold 6138 CPUs (2.00 GHz) and 256 GB RAM.
I expected the runtime-versus-thread-count curve to be convex, with its minimum at 4 or 8 threads.
I tested your code and have two remarks: the compiler (with g++ -O3 -fopenmp) simply skips the loops (i.e., it does not execute them at all) because their results are never used, so the timings are meaningless; and once the computation is actually performed, the code is much slower, but it scales. I added a few statements to avoid the dead-code elimination. Here is the modified code:
#include <cstdio>
#include <cstddef>
#include <vector>
#include <omp.h>

int main() {
  size_t N = 1 << 10; //16;
  size_t *D = new size_t [N];
  //size_t threads = 2; // now set via the environment variable OMP_NUM_THREADS
  size_t *sum = new size_t [N](); // global, zero-initialized sum array to avoid skipping the calculations
  //std::vector<std::vector<int>> mat1(128); // a 2D matrix
  //std::vector<std::vector<int>> mat2(128); // a 2D matrix
  // classical C arrays
  size_t mat1[128][128], mat2[128][128];
  for (int i = 0; i < 128; i++) {
    for (int j = 0; j < 128; j++) {
      mat1[i][j] = 0;
      mat2[i][j] = 0;
    }
  }

  double tic = omp_get_wtime();
  // number of threads set by the environment variable OMP_NUM_THREADS
  #pragma omp parallel
  {
    #pragma omp single
    {
      for(size_t i = 1; i < N; ++i) {
        size_t p = i / 2;
        size_t l = i * 2;
        size_t r = l + 1;
        if(l < N && r < N) {
          #pragma omp task firstprivate(i) depend(out:D[l], D[r]) depend(in:D[p])
          {
            // perform matrix multiplication
            //size_t sum = 0;
            for(size_t x = 0; x < 128; ++x) {
              for(size_t y = 0; y < 128; ++y) {
                for(size_t z = 0; z < 128; ++z) {
                  //sum += mat1[x][z] * mat2[z][y];
                  sum[i] += mat1[x][z] * mat2[y][z]; // mat2 is transposed for better memory access pattern
                }
              }
            }
          }
        }
        else {
          #pragma omp task firstprivate(i) depend(in:D[p])
          {
            // perform matrix multiplication
            //size_t sum = 0;
            for(size_t x = 0; x < 128; ++x) {
              for(size_t y = 0; y < 128; ++y) {
                for(size_t z = 0; z < 128; ++z) {
                  //sum += mat1[x][z] * mat2[z][y];
                  sum[i] += mat1[x][z] * mat2[y][z]; // mat2 is transposed for better memory access pattern
                }
              }
            }
          }
        }
      }
    }
  }
  double toc = omp_get_wtime();
  printf("%lf\n", toc - tic);

  delete [] D;
  delete [] sum;
}
As I said, once the loops really execute it is much slower, but it does scale (the machine has 10 cores):
% g++ -O3 -fopenmp mmtask.cpp
% setenv OMP_NUM_THREADS 1 && ./a.out
1.803429
% setenv OMP_NUM_THREADS 2 && ./a.out
1.011750
% setenv OMP_NUM_THREADS 4 && ./a.out
0.475696
% setenv OMP_NUM_THREADS 8 && ./a.out
0.301431
% setenv OMP_NUM_THREADS 16 && ./a.out
0.357968
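The loop elimination is easy to reproduce in isolation. As a sketch (the function names are mine, for illustration), compile the following with g++ -O3 -S and compare the assembly: dead() reduces to a bare return, while live() keeps the loop nest because its result escapes through a global:

#include <cstddef>

size_t g_sink; // global sink that forces the compiler to keep the computation

// The local accumulator is never used afterwards, so -O3 removes the loops entirely.
void dead(const size_t mat1[128][128], const size_t mat2[128][128]) {
  for (size_t x = 0; x < 128; ++x) {
    for (size_t y = 0; y < 128; ++y) {
      size_t sum = 0;
      for (size_t z = 0; z < 128; ++z) {
        sum += mat1[x][z] * mat2[y][z];
      }
    }
  }
}

// Accumulating into the global keeps the loops alive.
void live(const size_t mat1[128][128], const size_t mat2[128][128]) {
  for (size_t x = 0; x < 128; ++x) {
    for (size_t y = 0; y < 128; ++y) {
      for (size_t z = 0; z < 128; ++z) {
        g_sink += mat1[x][z] * mat2[y][z];
      }
    }
  }
}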
Based on the previous discussion and comments, I found out why the OpenMP tasks failed to scale in the two cases above. With a task-local accumulator, the compiler optimizes the whole loop nest away:
for(size_t x = 0; x < 128; ++x) {
  for(size_t y = 0; y < 128; ++y) {
    size_t sum = 0;
    for(size_t z = 0; z < 128; ++z) {
      sum += mat1[x][z] * mat2[y][z]; // sum is local and unused, so the compiler optimizes the loops out
    }
  }
}
To prevent this optimization, the results should go to global data storage, as PierU suggested:
size_t *sum = new size_t[N];
for(size_t x = 0; x < 128; ++x) {
  for(size_t y = 0; y < 128; ++y) {
    for(size_t z = 0; z < 128; ++z) {
      sum[i] += mat1[x][z] * mat2[y][z];
    }
  }
}
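If the per-task results are not actually needed, another option (a sketch assuming GCC or Clang; the helper name do_not_optimize is made up here, in the style of benchmark libraries) is an empty inline-asm barrier that makes the compiler treat a local value as used:

// GCC/Clang only: pretend the value is read, without storing it anywhere
template <typename T>
inline void do_not_optimize(T const& value) {
  asm volatile("" : : "r,m"(value) : "memory");
}

// inside the task body:
// size_t sum = 0;
// ... accumulate into sum ...
// do_not_optimize(sum); // the loops now survive -O3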
Below is the entire implementation I use to measure the runtime of the OpenMP task construct on the binary-tree structure, modified according to the previous comments. It was compiled with gcc-9 at -O3 and ran on an Ubuntu 19.10 (Eoan Ermine) machine with 80 Intel Xeon Gold 6138 CPUs (2.00 GHz) and 256 GB RAM.
#include <omp.h>
#include <iostream>
#include <chrono>
#include <vector>
#include <string>
#include <stdlib.h>

int main(int argc, char** argv) {
  size_t num_layers = 18;
  unsigned num_threads = strtol(argv[1], NULL, 10); // number of threads
  size_t msize = strtol(argv[2], NULL, 10);         // matrix size
  size_t N = 1 << num_layers;                       // number of tree nodes

  std::vector<std::vector<int>> mat1(msize);
  std::vector<std::vector<int>> mat2(msize);
  for(int i = 0; i < msize; ++i) {
    mat1[i].resize(msize);
    mat2[i].resize(msize);
  }
  for(int i = 0; i < msize; ++i) {
    for(int j = 0; j < msize; ++j) {
      mat1[i][j] = i + j;
      mat2[i][j] = i - j;
    }
  }

  auto beg = std::chrono::high_resolution_clock::now();
  size_t *D = new size_t [N];
  #pragma omp parallel num_threads(num_threads)
  {
    #pragma omp single
    {
      for(size_t i = 1; i < N; ++i) {
        size_t p = i / 2;    // parent index in the implicit binary tree
        size_t l = i * 2;    // left child
        size_t r = l + 1;    // right child
        if(l < N && r < N) { // internal node
          #pragma omp task firstprivate(i) depend(out:D[l], D[r]) depend(in:D[p])
          {
            // the result goes to the global array D, so it cannot be optimized out
            D[i] = 0;
            for (int x = 0; x < mat1.size(); x++) {
              for (int y = 0; y < mat1.size(); y++) {
                for (int z = 0; z < mat1.size(); z++) {
                  D[i] += mat1[x][z] * mat2[y][z];
                }
              }
            }
          }
        }
        else {               // leaf node
          #pragma omp task firstprivate(i) depend(in:D[p])
          {
            D[i] = 0;
            for (int x = 0; x < mat1.size(); x++) {
              for (int y = 0; y < mat1.size(); y++) {
                for (int z = 0; z < mat1.size(); z++) {
                  D[i] += mat1[x][z] * mat2[y][z];
                }
              }
            }
          }
        }
      }
    }
  }
  delete [] D;
  auto end = std::chrono::high_resolution_clock::now();
  std::cout << "Elapsed: " << std::chrono::duration_cast<std::chrono::microseconds>(end - beg).count() << " us\n";
  return 0;
}
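For completeness, the program takes the thread count and matrix size on the command line, so a run looks like this (the file name tree_task.cpp is just an example):

g++ -O3 -fopenmp tree_task.cpp -o tree_task
./tree_task 4 128    # 4 threads, 128x128 matrices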