在下面的 OpenMP C 代码中,我期望使用
tempfield
可以帮助在一个缓存行中写入 p1
,在我的例子中是 64 字节。与串行代码相比,使用两个线程时,我获得了几乎 2 倍的加速,但是当增加内核数量时,加速总是接近 2 倍。我尝试在不使用 mm
的情况下编写 #pragma omp for
循环,因为我认为这会被调用很多次。
可能出了什么问题?
关于变量的一些解释:
nz
=501, nx
=1001)CACHE_SZ
:16NDERIV
:3#pragma omp parallel default(none) \
shared(p1, p2, wavemovie, isrc, srcindex, ishot, nbz, idxs, deriv, ricker, \
dx, velextend, nbzcache, nbx, i0_extended_model, i0_model, \
gamma_x, gamma_z, nx, nxx, nzz, nz, nt, ndt, nshots, nthreads)
{
int iconv, ii, ix, it = 0;
int mm;
int imodel;
int iz, niz;
float gama, mgamma, invpgamma;
float laplacian;
float tempfield[CACHE_SZ];
nthreads = omp_get_num_threads();
#pragma omp single
{
printf("Number of threads used:%d\n", nthreads);
}
while (ishot < nshots) {
#pragma omp single
{
it = 0;
srcindex = isrc + idxs * ishot;
}
#pragma omp for
for (int iswap = 0; iswap < nxx * nzz; iswap++) {
p1[iswap] = 0.0;
p2[iswap] = 0.0;
}
// Forward field time update
for (it = 0; it < nt; it++) {
#pragma omp for schedule(static)
for (mm = 0; mm < nbzcache * nbx; mm++) {
ix = (int)mm / nbzcache;
niz = mm % nbzcache;
iz = niz * CACHE_SZ;
// Loop to write in cachelines
for (ii = 0; ii < CACHE_SZ; ii++) {
imodel = i0_extended_model + ix * nzz + iz + ii;
gama = gamma_x[ix + NDERIV - 1] +
gamma_z[iz + NDERIV - 1 + ii];
mgamma = -(1 - gama) / (1 + gama);
invpgamma = 0.5 * (1 - mgamma);
laplacian = 2 * deriv[0] * p2[imodel];
for (iconv = 1; iconv < NDERIV; iconv++) {
laplacian += deriv[iconv] * p2[imodel - iconv];
laplacian += deriv[iconv] * p2[imodel + iconv];
laplacian += deriv[iconv] * p2[imodel - nzz * iconv];
laplacian += deriv[iconv] * p2[imodel + nzz * iconv];
}
tempfield[ii] = invpgamma * velextend[imodel] * laplacian;
tempfield[ii] += invpgamma * 2.0 * p2[imodel];
tempfield[ii] += mgamma * p1[imodel];
}
for (ii = 0; ii < CACHE_SZ; ii++) {
p1[imodel - (CACHE_SZ - 1) + ii] = tempfield[ii];
}
}
#pragma omp single
{
p1[srcindex] +=
-pow(dx, 2.0) * ricker[it] * velextend[srcindex];
if (it % ndt == 0) {
for (ii = 0; ii < nx; ii++) {
fwrite(p1 + i0_model + ii * nzz, sizeof(float), nz,
wavemovie);
}
}
swap(&p1, &p2);
}
}
#pragma omp single
{
ishot++;
printf("ishot:%d\n", ishot);
printf("nshots:%d\n", nshots);
}
}
}
即使进行了建议的更改,我仍然没有获得太多的性能可扩展性。在 https://github.com/joanguastalla/OpenMP-Acoustic-Wave-Equation.git 中有运行测试所需的所有文件。唯一需要更改的是 makefile
LFLAGS
以包含 OpenMP 库的路径。
没有MRE,我无法测试它。我认为,以下内容应该允许 fwrite 在下一次迭代的计算期间执行 - 您将需要一个动态调度,因为 IO 线程将没有时间工作:
#pragma omp parallel default(none) firstprivate(ishot) shared(p1,p2,wavemovie,isrc,srcindex,nbz,idxs,deriv,ricker,dx,velextend,nbzcache,nbx,i0_extended_model,i0_model,gamma_x,gamma_z,nx,nxx,nzz,nz,nt,ndt,nshots,nthreads)
{
int iconv,ii,ix,it=0;
int mm;
int imodel;
int iz,niz;
float gama,mgamma,invpgamma;
float laplacian;
float tempfield[CACHE_SZ];
nthreads=omp_get_num_threads();
#pragma omp single
{
printf("Number of threads used:%d\n",nthreads);
}
for(;ishot<nshots;ishot++){
#pragma omp single
{
it=0;
srcindex=isrc + idxs*ishot;
}
#pragma omp for
for(int iswap=0;iswap<nxx*nzz;iswap++){
p1[iswap]=0.0;
p2[iswap]=0.0;
}
// Forward field time update
for(it=0;it<nt;it++){
#pragma omp for schedule(dynamic,CACHE_SZ)
for(mm=0;mm<nbzcache*nbx;mm++){
ix=(int) mm/nbzcache;
niz=mm%nbzcache;
iz=niz*CACHE_SZ;
// Loop to write in cachelines
for(ii=0;ii<CACHE_SZ;ii++){
imodel=i0_extended_model + ix*nzz +iz + ii;
gama=gamma_x[ix + NDERIV-1] + gamma_z[iz + NDERIV-1 +ii];
mgamma=-(1-gama)/(1+gama);
invpgamma=0.5*(1-mgamma);
laplacian=2*deriv[0]*p2[imodel];
for(iconv=1;iconv< NDERIV;iconv++){
laplacian+=deriv[iconv]*p2[imodel - iconv];
laplacian+=deriv[iconv]*p2[imodel + iconv];
laplacian+=deriv[iconv]*p2[imodel - nzz*iconv];
laplacian+=deriv[iconv]*p2[imodel + nzz*iconv];
}
tempfield[ii]=invpgamma*velextend[imodel]*laplacian;
tempfield[ii]+=invpgamma*2.0*p2[imodel];
tempfield[ii]+=mgamma*p1[imodel];
}
for(ii=0;ii<CACHE_SZ;ii++){
p1[imodel-(CACHE_SZ-1) + ii]=tempfield[ii];
}
} // barrier waits for IO thread from single nowait before reuse of p2
#pragma omp single
{
p1[srcindex]+= -pow(dx,2.0)*ricker[it]*velextend[srcindex];
swap(&p1,&p2);
}
#pragma omp single nowait
if(it%ndt==0){
for(ii=0;ii<nx;ii++){
fwrite(p2+i0_model + ii*nzz,sizeof(float),nz,wavemovie);
}
}
}
}
#pragma omp single
{
printf("ishot:%d\n",ishot);
printf("nshots:%d\n",nshots);
} // barrier will synchronize p2 writeout across ishot iterations
}
}
也许还值得研究异步 IO 以在后台执行写入。