声波方程传播的实现在两个线程之外根本无法扩展

问题描述 投票:0回答:1

在下面的 OpenMP C 代码中,我期望使用

tempfield
可以帮助在一个缓存行中写入
p1
,在我的例子中是 64 字节。与串行代码相比,使用两个线程时,我获得了几乎 2 倍的加速,但是当增加内核数量时,加速总是接近 2 倍。我尝试在不使用
mm
的情况下编写
#pragma omp for
循环,因为我认为这会被调用很多次。

可能出了什么问题?

关于变量的一些解释:

  • 模型速度大小 -> (
    nz
    =501,
    nx
    =1001)
  • 每边边界点数:800(夸张是为了测试代码效率)
  • CACHE_SZ
    :16
  • NDERIV
    :3
#pragma omp parallel default(none)                                             \
    shared(p1, p2, wavemovie, isrc, srcindex, ishot, nbz, idxs, deriv, ricker, \
               dx, velextend, nbzcache, nbx, i0_extended_model, i0_model,      \
               gamma_x, gamma_z, nx, nxx, nzz, nz, nt, ndt, nshots, nthreads)
{
    int iconv, ii, ix, it = 0;
    int mm;
    int imodel;
    int iz, niz;
    float gama, mgamma, invpgamma;
    float laplacian;
    float tempfield[CACHE_SZ];
    nthreads = omp_get_num_threads();
#pragma omp single
    {
        printf("Number of threads used:%d\n", nthreads);
    }
    while (ishot < nshots) {
#pragma omp single
        {
            it = 0;
            srcindex = isrc + idxs * ishot;
        }
#pragma omp for
        for (int iswap = 0; iswap < nxx * nzz; iswap++) {
            p1[iswap] = 0.0;
            p2[iswap] = 0.0;
        }

        //  Forward field time update
        for (it = 0; it < nt; it++) {
#pragma omp for schedule(static)
            for (mm = 0; mm < nbzcache * nbx; mm++) {
                ix = (int)mm / nbzcache;
                niz = mm % nbzcache;
                iz = niz * CACHE_SZ;
                // Loop to write in cachelines
                for (ii = 0; ii < CACHE_SZ; ii++) {
                    imodel = i0_extended_model + ix * nzz + iz + ii;
                    gama = gamma_x[ix + NDERIV - 1] +
                           gamma_z[iz + NDERIV - 1 + ii];
                    mgamma = -(1 - gama) / (1 + gama);
                    invpgamma = 0.5 * (1 - mgamma);
                    laplacian = 2 * deriv[0] * p2[imodel];
                    for (iconv = 1; iconv < NDERIV; iconv++) {
                        laplacian += deriv[iconv] * p2[imodel - iconv];
                        laplacian += deriv[iconv] * p2[imodel + iconv];
                        laplacian += deriv[iconv] * p2[imodel - nzz * iconv];
                        laplacian += deriv[iconv] * p2[imodel + nzz * iconv];
                    }
                    tempfield[ii] = invpgamma * velextend[imodel] * laplacian;
                    tempfield[ii] += invpgamma * 2.0 * p2[imodel];
                    tempfield[ii] += mgamma * p1[imodel];
                }
                for (ii = 0; ii < CACHE_SZ; ii++) {
                    p1[imodel - (CACHE_SZ - 1) + ii] = tempfield[ii];
                }
            }
#pragma omp single
            {
                p1[srcindex] +=
                    -pow(dx, 2.0) * ricker[it] * velextend[srcindex];
                if (it % ndt == 0) {
                    for (ii = 0; ii < nx; ii++) {
                        fwrite(p1 + i0_model + ii * nzz, sizeof(float), nz,
                               wavemovie);
                    }
                }
                swap(&p1, &p2);
            }
        }
#pragma omp single
        {
            ishot++;
            printf("ishot:%d\n", ishot);
            printf("nshots:%d\n", nshots);
        }
    }
}

即使进行了建议的更改,我仍然没有获得太多的性能可扩展性。在 https://github.com/joanguastalla/OpenMP-Acoustic-Wave-Equation.git 中有运行测试所需的所有文件。唯一需要更改的是 makefile

LFLAGS
以包含 OpenMP 库的路径。

c openmp finite-difference
1个回答
2
投票

没有MRE,我无法测试它。我认为,以下内容应该允许 fwrite 在下一次迭代的计算期间执行 - 您将需要一个动态调度,因为 IO 线程将没有时间工作:

#pragma omp parallel  default(none) firstprivate(ishot) shared(p1,p2,wavemovie,isrc,srcindex,nbz,idxs,deriv,ricker,dx,velextend,nbzcache,nbx,i0_extended_model,i0_model,gamma_x,gamma_z,nx,nxx,nzz,nz,nt,ndt,nshots,nthreads)
    {
        int iconv,ii,ix,it=0;
        int mm;
        int imodel;
        int iz,niz;
        float gama,mgamma,invpgamma;
        float laplacian;
        float tempfield[CACHE_SZ];
        nthreads=omp_get_num_threads();
        #pragma omp single
        {   
            printf("Number of threads used:%d\n",nthreads);
        }
        for(;ishot<nshots;ishot++){
            #pragma omp single
            {
                it=0;
                srcindex=isrc + idxs*ishot;
            }
            #pragma omp for        
            for(int iswap=0;iswap<nxx*nzz;iswap++){
                p1[iswap]=0.0;
                p2[iswap]=0.0;
            }
        
            //  Forward field time update
            for(it=0;it<nt;it++){
                #pragma omp for schedule(dynamic,CACHE_SZ)
                for(mm=0;mm<nbzcache*nbx;mm++){
                        ix=(int) mm/nbzcache;
                        niz=mm%nbzcache;
                        iz=niz*CACHE_SZ; 
                    // Loop to write in cachelines
                    for(ii=0;ii<CACHE_SZ;ii++){
                        imodel=i0_extended_model + ix*nzz +iz + ii;
                        gama=gamma_x[ix + NDERIV-1] + gamma_z[iz + NDERIV-1 +ii];
                        mgamma=-(1-gama)/(1+gama);
                        invpgamma=0.5*(1-mgamma);
                        laplacian=2*deriv[0]*p2[imodel];
                        for(iconv=1;iconv< NDERIV;iconv++){
                            laplacian+=deriv[iconv]*p2[imodel - iconv];
                            laplacian+=deriv[iconv]*p2[imodel + iconv];
                            laplacian+=deriv[iconv]*p2[imodel - nzz*iconv];             
                            laplacian+=deriv[iconv]*p2[imodel + nzz*iconv];
                        }
                        tempfield[ii]=invpgamma*velextend[imodel]*laplacian;
                        tempfield[ii]+=invpgamma*2.0*p2[imodel];
                        tempfield[ii]+=mgamma*p1[imodel];
                    }
                    for(ii=0;ii<CACHE_SZ;ii++){
                        p1[imodel-(CACHE_SZ-1) + ii]=tempfield[ii];
                    }
                } // barrier waits for IO thread from single nowait before reuse of p2
                #pragma omp single 
                {
                    p1[srcindex]+= -pow(dx,2.0)*ricker[it]*velextend[srcindex];
                    swap(&p1,&p2);
                }
                #pragma omp single nowait
                    if(it%ndt==0){
                        for(ii=0;ii<nx;ii++){
                            fwrite(p2+i0_model + ii*nzz,sizeof(float),nz,wavemovie);
                        }
                    }
                }
            }
            #pragma omp single
            {
                printf("ishot:%d\n",ishot);
                printf("nshots:%d\n",nshots);
            } // barrier will synchronize p2 writeout across ishot iterations
      }
}

也许还值得研究异步 IO 以在后台执行写入。

© www.soinside.com 2019 - 2024. All rights reserved.