OpenMP 减慢运行时间

问题描述 投票:0回答:1

我需要帮助来并行化这段代码。我知道在循环内部使用 `!$omp parallel` 会减慢速度,所以必须把它放到循环外部,但这样做会得到错误的结果:

    ! Time loop of Nt steps executed inside one parallel region.  Each step forms
    ! the backward first difference df1 and the central second difference df2 of
    ! f0, combines them into F = -V*df1 + D*df2, and advances f0 by dt*F.
    ! NOTE(review): only the interior loop below is work-shared.  Every other
    ! statement in the region is executed by every thread of the team, which is
    ! the likely source of the wrong values reported in the surrounding text.
    !$omp parallel  
        do i=1, Nt

            ! Interior points 2..n-1, divided among the threads of the team.
            !$omp do private(i1)
            do i1=2, n-1
               df1(i1)=(f0(i1)-f0(i1-1))/dx
               df2(i1)=(f0(i1+1)-2*f0(i1)+f0(i1-1))/(dx**2)
               F(i1)=-V*df1(i1)+D*df2(i1)
            end do
            !$omp end do

        ! periodic boundary conditions
        ! NOTE(review): not enclosed in a single or worksharing construct, so
        ! all threads perform these writes to the shared arrays.
            df1(1)=df1(n-1)
            df1(n)=df1(2)
            df2(1)=df2(n-1)
            df2(n)=df2(2)
            F(1)=-V*df1(1)+D*df2(1)
            F(n)=-V*df1(n)+D*df2(n)
        ! time stepping loop, not parallelized
        ! NOTE(review): "not parallelized" here means every thread runs the
        ! whole loop, so with more than one thread f0(j) is incremented once per
        ! thread and concurrently.  There is also no barrier between this loop
        ! and the next step's work-shared loop, which reads f0.
            do j=1, n
                f0(j)=f0(j)+dt*F(j)
            end do

        end do
    !$omp end parallel

更新:使用下面的代码,结果是正确的,但未并行化的版本比 OpenMP 版本运行得更快


    ! Time loop of Nt steps executed inside one parallel region, with the loop
    ! indices listed as private.  Each step has three phases: a work-shared
    ! interior stencil, a boundary update done by one thread, and a work-shared
    ! update of f0.  None of the constructs carries nowait, so each phase ends
    ! with an implicit barrier.
    !$omp parallel private(i, i1, j) 
        do i=1, Nt

            ! Interior points 2..n-1; the schedule is taken from OMP_SCHEDULE.
            !$omp do schedule(runtime)
            do i1=2, n-1
                         df1(i1)=(f0(i1)-f0(i1-1))/dx
             df2(i1)=(f0(i1+1)-2*f0(i1)+f0(i1-1))/(dx**2)
             F(i1)=-V*df1(i1)+D*df2(i1)
                    end do
            !$omp end do

! Only one thread copies the boundary values; the others wait at end single.
!$omp single
        ! periodic boundary conditions
            df1(1)=df1(n-1)
            df1(n)=df1(2)
            df2(1)=df2(n-1)
            df2(n)=df2(2)
            F(1)=-V*df1(1)+D*df2(1)
            F(n)=-V*df1(n)+D*df2(n)
        ! end of the one-thread section; the time stepping loop below is work-shared
!$omp end single            
            !$omp do 
            do j=1, n
                f0(j)=f0(j)+dt*F(j)
            end do
            !$omp end do

        end do
    !$omp end parallel

我已经尝试过把并行区域保留在循环内部,也尝试过添加 `shared` 和 `private` 子句,但要么得到错误的 f0 结果,要么运行时间不断增加

fortran openmp
1个回答
0
投票

好的,既然你不打算提供可运行的示例,我就为上面的代码拼凑了一个简单的驱动程序,并且或多或少随意地为该方法选了一些参数,只是避开了那些会导致数值发散的取值——只要没有产生 NaN,计时结果就应该是合理的。这完全不是我的专业领域,所以我并不知道 V、D 和 dt 应该取什么值。

我还注意到你在主循环上使用了 schedule 子句,并将其设置为 `runtime`。在我看来这是不必要的复杂化,这里显然是适合简单静态调度的情形,因此我把环境变量 `OMP_SCHEDULE` 设置为 `static`。

最后,我使用 `default( none )` 显式指定了并行区域中所有变量的数据共享属性,我强烈建议你遵循这种做法——如果你是我的学生,我会因为你没有这样做而扣你的分。

无论如何,代码看起来像这样

Program omp

  ! Timing driver for an explicit finite-difference time loop.
  !
  ! Reads the number of grid points n from standard input, initialises f0 to
  ! one period of a sine wave on a grid of spacing dx = 1/n, and then performs
  ! Nt updates  f0 <- f0 + dt * ( -V * df1 + D * df2 ),  where df1 is the
  ! backward first difference and df2 the central second difference of f0.
  ! The elapsed wall-clock time of the time loop and Sum( Abs( f0 ) ) are
  ! printed so that serial and OpenMP builds can be compared.
  !
  ! Lines starting with the !$ sentinel are compiled only when OpenMP is
  ! enabled, so the same source builds as a serial reference.

  Use, Intrinsic :: iso_fortran_env, Only : wp => real64, li => int64

  !$ Use omp_lib, Only : omp_get_max_threads 
  
  Implicit None( Type, External )

  Real( wp ), Parameter :: pi = 4.0_wp * Atan( 1.0_wp )
  
  ! Number of time steps
  Integer, Parameter :: Nt = 100000

  ! Smallest grid for which the boundary copies below read values that the
  ! interior loop has actually defined
  Integer, Parameter :: n_min = 3

  Real( wp ), Dimension( : ), Allocatable :: f0
  Real( wp ), Dimension( : ), Allocatable :: F
  Real( wp ), Dimension( : ), Allocatable :: df1, df2

  ! Kept as variables rather than named constants so that they can be listed
  ! in the shared clause under default( none )
  Real( wp ) :: dx
  Real( wp ) :: dt = 0.00000001_wp
  Real( wp ) :: V = 0.1_wp
  Real( wp ) :: D = 0.001_wp

  Integer( li ) :: start, finish, rate
  
  Integer :: n
  Integer :: ios
  !$ Integer :: nth_tmp
  Integer :: i, i1, j

  Character( Len = 6 ) :: nth
  
!!$  Write( *, * ) 'n ?'
  Read ( *, *, iostat = ios ) n
  If( ios /= 0 ) Then
     Error Stop 'omp: could not read the number of grid points n from standard input'
  End If
  ! With n < n_min the interior loop 2..n-1 is empty or the boundary copies
  ! index outside the arrays, so refuse to run rather than time garbage.
  If( n < n_min ) Then
     Error Stop 'omp: the number of grid points n must be at least 3'
  End If

  dx = 1.0_wp / n

  Allocate( f0( 1:n ) )
  Allocate( F ( 1:n ) )

  Allocate( df1( 1:n ) )
  Allocate( df2( 1:n ) )
  
  f0 = [ ( Sin( 2.0_wp * pi * i * dx ),  i = 1, n ) ] 

  ! Label for the output line: "Serial" without OpenMP, otherwise the
  ! maximum number of threads available to the parallel region
  nth = "Serial"
  !$ nth_tmp = omp_get_max_threads()
  !$ Write( nth, '( i0 )' ) nth_tmp
  
  Call System_clock( start, rate )
  ! One parallel region encloses the whole time loop so that the thread team
  ! is created once rather than once per time step.
  !$omp parallel default( none ) &
  !$omp          shared ( n, dx, dt, V, D, f0, F, df1, df2 ) &
  !$omp          private( i, i1, j )  
  Do i=1, Nt

     ! Interior points, work-shared; the schedule comes from OMP_SCHEDULE.
     ! The implicit barrier at end do is required because the boundary update
     ! reads df1 and df2 at points 2 and n-1.
     !$omp do schedule(runtime)
     Do i1=2, n-1
        df1(i1)=(f0(i1)-f0(i1-1))/dx
        df2(i1)=(f0(i1+1)-2.0_wp*f0(i1)+f0(i1-1))/(dx**2)
        F(i1)=-V*df1(i1)+D*df2(i1)
     End Do
     !$omp end do

     ! Boundary values are written by one thread only; the implicit barrier
     ! at end single makes F(1) and F(n) visible before they are used below.
     !$omp single
     ! periodic boundary conditions
     df1(1)=df1(n-1)
     df1(n)=df1(2)
     df2(1)=df2(n-1)
     df2(n)=df2(2)
     F(1)=-V*df1(1)+D*df2(1)
     F(n)=-V*df1(n)+D*df2(n)
     !$omp end single            

     ! Time stepping update, work-shared.  The implicit barrier at end do
     ! ensures f0 is fully updated before the next step reads it.
     !$omp do 
     Do j=1, n
        f0(j)=f0(j)+dt*F(j)
     End Do
     !$omp end do

  End Do
  !$omp end parallel
  ! The clock rate was already obtained by the first call
  Call System_clock( finish )

  Write( *, * ) 'threads, time, Checksum: ', &
       nth, Real( finish - start, wp ) / rate, Sum( Abs( f0 ) )
  
End Program omp

并且使用 gfortran 我将其编译如下:

ijb@ijb-Latitude-5410:~/work/stack$ gfortran --version

GNU Fortran (GCC) 14.1.0
Copyright © 2024 Free Software Foundation, Inc.
This is free software; see the source for copying conditions.  There is NO
warranty; not even for MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.

ijb@ijb-Latitude-5410:~/work/stack$ gfortran  -Wall -Wextra -Ofast -g -std=f2018 speed.f90 -o serial
ijb@ijb-Latitude-5410:~/work/stack$ gfortran  -fopenmp -Wall -Wextra -Ofast -g -std=f2018 speed.f90 -o parallel

我用下面的脚本在我的 4 核笔记本电脑(并非完全空闲)上,以不同的线程数和问题规模运行该程序:

ijb@ijb-Latitude-5410:~/work/stack$ cat runit
#!/usr/bin/bash
# Run the serial build once and then the OpenMP build on 1 to 4 threads,
# passing the problem size given as the first argument to each on stdin.

# Without an argument the programs would be fed an empty line and fail
# while reading n, so stop early with a usage message instead.
if [ $# -lt 1 ]; then
    echo "usage: $0 n" >&2
    exit 1
fi

n=$1

./serial <<EOF
$n
EOF

for nth in 1 2 3 4
do
    export OMP_NUM_THREADS=$nth
    ./parallel <<EOF
$n
EOF
done

以及一些结果

ijb@ijb-Latitude-5410:~/work/stack$ ./runit 100
 threads, time, Checksum: Serial   3.7813894000000001E-002   63.638517731294677     
 threads, time, Checksum: 1        6.6722949000000004E-002   63.638517731294677     
 threads, time, Checksum: 2        9.4060747000000000E-002   63.638517731294677     
 threads, time, Checksum: 3       0.11382194399999999        63.638517731294677     
 threads, time, Checksum: 4       0.13759535100000000        63.638517731294677     
ijb@ijb-Latitude-5410:~/work/stack$ ./runit 1000
 threads, time, Checksum: Serial  0.13271654599999999        636.59242131285487     
 threads, time, Checksum: 1       0.16490009400000000        636.59242131285487     
 threads, time, Checksum: 2       0.15034453500000000        636.59242131285487     
 threads, time, Checksum: 3       0.15836609099999999        636.59242131285487     
 threads, time, Checksum: 4       0.15964677499999999        636.59242131285487     
ijb@ijb-Latitude-5410:~/work/stack$ ./runit 10000
 threads, time, Checksum: Serial   1.5402481520000000        6365.9449356557152     
 threads, time, Checksum: 1        1.4769430800000001        6365.9449356557152     
 threads, time, Checksum: 2       0.83346241799999998        6365.9449356557152     
 threads, time, Checksum: 3       0.64971254300000003        6365.9449356557152     
 threads, time, Checksum: 4       0.53164672300000004        6365.9449356557152     
ijb@ijb-Latitude-5410:~/work/stack$ ./runit 100000
 threads, time, Checksum: Serial   20.154403815999999        63659.462735156230     
 threads, time, Checksum: 1        24.292843486999999        63659.462735156252     
 threads, time, Checksum: 2        13.762875521000000        63659.462735156252     
 threads, time, Checksum: 3        9.9385426639999999        63659.462735156252     
 threads, time, Checksum: 4        8.0076960939999999        63659.462735156252     

可以看出,当 n=100 时,并行版本比串行版本慢;当 n=1000 时,两个线程相对单线程略有加速,但到 3 和 4 个线程时又变慢了;当 n=10000 时,一直到 4 个线程都在加速;当 n=100000 时效果更好。原因正如评论中所指出的:随着系统规模增大,并行引入的开销相对于随向量长度增长的计算量变得越来越不重要——在线程数固定时,时间循环每次迭代的开销大致恒定,为 3 个屏障(barrier),而计算时间随系统规模线性增长,因此随着 n 增大,开销所占的比例趋于可以忽略。

我还运行了英特尔编译器版本 - 它显示了几乎相同的行为:

ijb@ijb-Latitude-5410:~/work/stack$ export OMP_SCHEDULE=static
ijb@ijb-Latitude-5410:~/work/stack$ ifx -O3 speed.f90 -o iserial
ijb@ijb-Latitude-5410:~/work/stack$ ifx -qopenmp -O3 speed.f90 -o iparallel
ijb@ijb-Latitude-5410:~/work/stack$ cat irunit
#!/usr/bin/bash
# Run the serial ifx build once and then the OpenMP ifx build on 1 to 4
# threads, passing the problem size given as the first argument on stdin.

# Without an argument the programs would be fed an empty line and fail
# while reading n, so stop early with a usage message instead.
if [ $# -lt 1 ]; then
    echo "usage: $0 n" >&2
    exit 1
fi

n=$1

./iserial <<EOF
$n
EOF

for nth in 1 2 3 4
do
    export OMP_NUM_THREADS=$nth
    ./iparallel <<EOF
$n
EOF
done

    
ijb@ijb-Latitude-5410:~/work/stack$ ./irunit 100
 threads, time, Checksum: Serial  4.064300000000000E-002   63.6385177312947     
 threads, time, Checksum: 1       2.181600000000000E-002   63.6385177312947     
 threads, time, Checksum: 2       9.437800000000000E-002   63.6385177312947     
 threads, time, Checksum: 3       0.126435000000000        63.6385177312947     
 threads, time, Checksum: 4       0.153296000000000        63.6385177312947     
ijb@ijb-Latitude-5410:~/work/stack$ ./irunit 1000
 threads, time, Checksum: Serial  0.125659000000000        636.592421312855     
 threads, time, Checksum: 1       0.178205000000000        636.592421312855     
 threads, time, Checksum: 2       0.166180000000000        636.592421312855     
 threads, time, Checksum: 3       0.182477000000000        636.592421312855     
 threads, time, Checksum: 4       0.184732000000000        636.592421312855     
ijb@ijb-Latitude-5410:~/work/stack$ ./irunit 10000
 threads, time, Checksum: Serial   1.47372400000000        6365.94493565571     
 threads, time, Checksum: 1        1.70136900000000        6365.94493565571     
 threads, time, Checksum: 2       0.920539000000000        6365.94493565571     
 threads, time, Checksum: 3       0.713782000000000        6365.94493565571     
 threads, time, Checksum: 4       0.608429000000000        6365.94493565571     
ijb@ijb-Latitude-5410:~/work/stack$ ./irunit 100000
 threads, time, Checksum: Serial   35.9102110000000        63659.4627351538     
 threads, time, Checksum: 1        17.9405890000000        63659.4627351538     
 threads, time, Checksum: 2        9.67623400000000        63659.4627351538     
 threads, time, Checksum: 3        7.31490800000000        63659.4627351538     
 threads, time, Checksum: 4        5.74371900000000        63659.4627351538     
ijb@ijb-Latitude-5410:~/work/stack$ 

太长不看(TL;DR):

并行计算是为了解决大问题!

© www.soinside.com 2019 - 2024. All rights reserved.