我的声波方程传播的 OpenMP 实现在两个线程之外根本无法扩展,这可能是什么问题?

问题描述 投票:0回答:1

在下面的 OpenMP C 代码中,我期望使用 tempfield 可以帮助在一个缓存行中写入 p1,在我的例子中是 64 字节。与串行代码相比,使用两个线程时,我获得了几乎 2 倍的加速,但是当增加内核数量时,加速总是接近 2 倍。我尝试在不使用“#pragma omp for”的情况下编写 mm 循环,因为我认为这会被调用很多次。

关于变量的一些解释:模型速度大小 -> (nz=501,nx=1001)

每边边界点数:800(夸张是为了测试代码效率)

CACHE_SZ:16

NDERIV:3

        #pragma omp parallel  default(none) shared(p1,p2,wavemovie,isrc,srcindex,ishot,nbz,idxs,deriv,ricker,dx,velextend,nbzcache,nbx,i0_extended_model,i0_model,gamma_x,gamma_z,nx,nxx,nzz,nz,nt,ndt,nshots,nthreads)
        {
            int iconv,ii,ix,it=0;
            int mm;
            int imodel;
            int iz,niz;
            float gama,mgamma,invpgamma;
            float laplacian;
            float tempfield[CACHE_SZ];
            nthreads=omp_get_num_threads();
            #pragma omp single
            {   
                printf("Number of threads used:%d\n",nthreads);
            }
            while(ishot<nshots){
                #pragma omp single
                {
                    it=0;
                    srcindex=isrc + idxs*ishot;
                }
                #pragma omp for        
                for(int iswap=0;iswap<nxx*nzz;iswap++){
                    p1[iswap]=0.0;
                    p2[iswap]=0.0;
                }
            
                //  Forward field time update
                for(it=0;it<nt;it++){
                    #pragma omp for schedule(static)
                    for(mm=0;mm<nbzcache*nbx;mm++){
                            ix=(int) mm/nbzcache;
                            niz=mm%nbzcache;
                            iz=niz*CACHE_SZ; 
                        // Loop to write in cachelines
                        for(ii=0;ii<CACHE_SZ;ii++){
                            imodel=i0_extended_model + ix*nzz +iz + ii;
                            gama=gamma_x[ix + NDERIV-1] + gamma_z[iz + NDERIV-1 +ii];
                            mgamma=-(1-gama)/(1+gama);
                            invpgamma=0.5*(1-mgamma);
                            laplacian=2*deriv[0]*p2[imodel];
                            for(iconv=1;iconv< NDERIV;iconv++){
                                laplacian+=deriv[iconv]*p2[imodel - iconv];
                                laplacian+=deriv[iconv]*p2[imodel + iconv];
                                laplacian+=deriv[iconv]*p2[imodel - nzz*iconv];             
                                laplacian+=deriv[iconv]*p2[imodel + nzz*iconv];
                            }
                            tempfield[ii]=invpgamma*velextend[imodel]*laplacian;
                            tempfield[ii]+=invpgamma*2.0*p2[imodel];
                            tempfield[ii]+=mgamma*p1[imodel];
                        }
                        for(ii=0;ii<CACHE_SZ;ii++){
                            p1[imodel-(CACHE_SZ-1) + ii]=tempfield[ii];
                        }
                    }
                    #pragma omp single 
                    {
                        p1[srcindex]+= -pow(dx,2.0)*ricker[it]*velextend[srcindex];
                        if(it%ndt==0){
                            for(ii=0;ii<nx;ii++){
                                fwrite(p1+i0_model + ii*nzz,sizeof(float),nz,wavemovie);
                            }
                        }
                        swap(&p1,&p2);
                    }
                }
                #pragma omp single
                {
                    ishot++;
                    printf("ishot:%d\n",ishot);
                    printf("nshots:%d\n",nshots);
                }
          }
    }
c openmp finite-difference
1个回答
0
投票

没有MRE,我无法测试它。我认为,以下内容应该允许 fwrite 在下一次迭代的计算期间执行 - 您将需要一个动态调度,因为 IO 线程将没有时间工作:

#pragma omp parallel  default(none) shared(p1,p2,wavemovie,isrc,srcindex,ishot,nbz,idxs,deriv,ricker,dx,velextend,nbzcache,nbx,i0_extended_model,i0_model,gamma_x,gamma_z,nx,nxx,nzz,nz,nt,ndt,nshots,nthreads)
    {
        int iconv,ii,ix,it=0;
        int mm;
        int imodel;
        int iz,niz;
        float gama,mgamma,invpgamma;
        float laplacian;
        float tempfield[CACHE_SZ];
        nthreads=omp_get_num_threads();
        #pragma omp single
        {   
            printf("Number of threads used:%d\n",nthreads);
        }
        while(ishot<nshots){
            #pragma omp single
            {
                it=0;
                srcindex=isrc + idxs*ishot;
            }
            #pragma omp for        
            for(int iswap=0;iswap<nxx*nzz;iswap++){
                p1[iswap]=0.0;
                p2[iswap]=0.0;
            }
        
            //  Forward field time update
            for(it=0;it<nt;it++){
                #pragma omp for schedule(dynamic,CACHE_SZ)
                for(mm=0;mm<nbzcache*nbx;mm++){
                        ix=(int) mm/nbzcache;
                        niz=mm%nbzcache;
                        iz=niz*CACHE_SZ; 
                    // Loop to write in cachelines
                    for(ii=0;ii<CACHE_SZ;ii++){
                        imodel=i0_extended_model + ix*nzz +iz + ii;
                        gama=gamma_x[ix + NDERIV-1] + gamma_z[iz + NDERIV-1 +ii];
                        mgamma=-(1-gama)/(1+gama);
                        invpgamma=0.5*(1-mgamma);
                        laplacian=2*deriv[0]*p2[imodel];
                        for(iconv=1;iconv< NDERIV;iconv++){
                            laplacian+=deriv[iconv]*p2[imodel - iconv];
                            laplacian+=deriv[iconv]*p2[imodel + iconv];
                            laplacian+=deriv[iconv]*p2[imodel - nzz*iconv];             
                            laplacian+=deriv[iconv]*p2[imodel + nzz*iconv];
                        }
                        tempfield[ii]=invpgamma*velextend[imodel]*laplacian;
                        tempfield[ii]+=invpgamma*2.0*p2[imodel];
                        tempfield[ii]+=mgamma*p1[imodel];
                    }
                    for(ii=0;ii<CACHE_SZ;ii++){
                        p1[imodel-(CACHE_SZ-1) + ii]=tempfield[ii];
                    }
                }
                #pragma omp single 
                {
                    p1[srcindex]+= -pow(dx,2.0)*ricker[it]*velextend[srcindex];
                    swap(&p1,&p2);
                }
                #pragma omp single nowait
                    if(it%ndt==0){
                        for(ii=0;ii<nx;ii++){
                            fwrite(p2+i0_model + ii*nzz,sizeof(float),nz,wavemovie);
                        }
                    }
                }
            }
            #pragma omp single
            {
                ishot++;
                printf("ishot:%d\n",ishot);
                printf("nshots:%d\n",nshots);
            }
      }
}

也许还值得研究异步 IO 以在后台执行写入。

© www.soinside.com 2019 - 2024. All rights reserved.