在下面的 OpenMP C 代码中,我期望使用 tempfield 可以帮助在一个缓存行中写入 p1,在我的例子中是 64 字节。与串行代码相比,使用两个线程时,我获得了几乎 2 倍的加速,但是当增加内核数量时,加速总是接近 2 倍。我尝试在不使用“#pragma omp for”的情况下编写 mm 循环,因为我认为这会被调用很多次。
关于变量的一些解释:模型速度大小 -> (nz=501,nx=1001)
每边边界点数:800(夸张是为了测试代码效率)
CACHE_SZ:16
NDERIV:3
#pragma omp parallel default(none) shared(p1,p2,wavemovie,isrc,srcindex,ishot,nbz,idxs,deriv,ricker,dx,velextend,nbzcache,nbx,i0_extended_model,i0_model,gamma_x,gamma_z,nx,nxx,nzz,nz,nt,ndt,nshots,nthreads)
{
int iconv,ii,ix,it=0;
int mm;
int imodel;
int iz,niz;
float gama,mgamma,invpgamma;
float laplacian;
float tempfield[CACHE_SZ];
nthreads=omp_get_num_threads();
#pragma omp single
{
printf("Number of threads used:%d\n",nthreads);
}
while(ishot<nshots){
#pragma omp single
{
it=0;
srcindex=isrc + idxs*ishot;
}
#pragma omp for
for(int iswap=0;iswap<nxx*nzz;iswap++){
p1[iswap]=0.0;
p2[iswap]=0.0;
}
// Forward field time update
for(it=0;it<nt;it++){
#pragma omp for schedule(static)
for(mm=0;mm<nbzcache*nbx;mm++){
ix=(int) mm/nbzcache;
niz=mm%nbzcache;
iz=niz*CACHE_SZ;
// Loop to write in cachelines
for(ii=0;ii<CACHE_SZ;ii++){
imodel=i0_extended_model + ix*nzz +iz + ii;
gama=gamma_x[ix + NDERIV-1] + gamma_z[iz + NDERIV-1 +ii];
mgamma=-(1-gama)/(1+gama);
invpgamma=0.5*(1-mgamma);
laplacian=2*deriv[0]*p2[imodel];
for(iconv=1;iconv< NDERIV;iconv++){
laplacian+=deriv[iconv]*p2[imodel - iconv];
laplacian+=deriv[iconv]*p2[imodel + iconv];
laplacian+=deriv[iconv]*p2[imodel - nzz*iconv];
laplacian+=deriv[iconv]*p2[imodel + nzz*iconv];
}
tempfield[ii]=invpgamma*velextend[imodel]*laplacian;
tempfield[ii]+=invpgamma*2.0*p2[imodel];
tempfield[ii]+=mgamma*p1[imodel];
}
for(ii=0;ii<CACHE_SZ;ii++){
p1[imodel-(CACHE_SZ-1) + ii]=tempfield[ii];
}
}
#pragma omp single
{
p1[srcindex]+= -pow(dx,2.0)*ricker[it]*velextend[srcindex];
if(it%ndt==0){
for(ii=0;ii<nx;ii++){
fwrite(p1+i0_model + ii*nzz,sizeof(float),nz,wavemovie);
}
}
swap(&p1,&p2);
}
}
#pragma omp single
{
ishot++;
printf("ishot:%d\n",ishot);
printf("nshots:%d\n",nshots);
}
}
}
没有MRE,我无法测试它。我认为,以下内容应该允许 fwrite 在下一次迭代的计算期间执行 - 您将需要一个动态调度,因为 IO 线程将没有时间工作:
#pragma omp parallel default(none) shared(p1,p2,wavemovie,isrc,srcindex,ishot,nbz,idxs,deriv,ricker,dx,velextend,nbzcache,nbx,i0_extended_model,i0_model,gamma_x,gamma_z,nx,nxx,nzz,nz,nt,ndt,nshots,nthreads)
{
int iconv,ii,ix,it=0;
int mm;
int imodel;
int iz,niz;
float gama,mgamma,invpgamma;
float laplacian;
float tempfield[CACHE_SZ];
nthreads=omp_get_num_threads();
#pragma omp single
{
printf("Number of threads used:%d\n",nthreads);
}
while(ishot<nshots){
#pragma omp single
{
it=0;
srcindex=isrc + idxs*ishot;
}
#pragma omp for
for(int iswap=0;iswap<nxx*nzz;iswap++){
p1[iswap]=0.0;
p2[iswap]=0.0;
}
// Forward field time update
for(it=0;it<nt;it++){
#pragma omp for schedule(dynamic,CACHE_SZ)
for(mm=0;mm<nbzcache*nbx;mm++){
ix=(int) mm/nbzcache;
niz=mm%nbzcache;
iz=niz*CACHE_SZ;
// Loop to write in cachelines
for(ii=0;ii<CACHE_SZ;ii++){
imodel=i0_extended_model + ix*nzz +iz + ii;
gama=gamma_x[ix + NDERIV-1] + gamma_z[iz + NDERIV-1 +ii];
mgamma=-(1-gama)/(1+gama);
invpgamma=0.5*(1-mgamma);
laplacian=2*deriv[0]*p2[imodel];
for(iconv=1;iconv< NDERIV;iconv++){
laplacian+=deriv[iconv]*p2[imodel - iconv];
laplacian+=deriv[iconv]*p2[imodel + iconv];
laplacian+=deriv[iconv]*p2[imodel - nzz*iconv];
laplacian+=deriv[iconv]*p2[imodel + nzz*iconv];
}
tempfield[ii]=invpgamma*velextend[imodel]*laplacian;
tempfield[ii]+=invpgamma*2.0*p2[imodel];
tempfield[ii]+=mgamma*p1[imodel];
}
for(ii=0;ii<CACHE_SZ;ii++){
p1[imodel-(CACHE_SZ-1) + ii]=tempfield[ii];
}
}
#pragma omp single
{
p1[srcindex]+= -pow(dx,2.0)*ricker[it]*velextend[srcindex];
swap(&p1,&p2);
}
#pragma omp single nowait
if(it%ndt==0){
for(ii=0;ii<nx;ii++){
fwrite(p2+i0_model + ii*nzz,sizeof(float),nz,wavemovie);
}
}
}
}
#pragma omp single
{
ishot++;
printf("ishot:%d\n",ishot);
printf("nshots:%d\n",nshots);
}
}
}
也许还值得研究异步 IO 以在后台执行写入。