I am working on optimizing array access patterns in Fortran.
Since Fortran stores multidimensional arrays in column-major order, I allocated and accessed my array with that in mind.
However, I observed that accessing the array in a column-major fashion is actually slower than accessing it in a row-major fashion. This is counter-intuitive, and I am trying to understand why it happens.
Here is the code I used to mimic my situation:
program test
   implicit none
   integer :: data_len, sub_len
   double precision, allocatable :: array_1d(:), array_2d(:,:)
   double precision, allocatable :: data1(:), data2(:), data3(:), data4(:), data5(:), temp(:)
   integer :: i, j, idx
   integer(kind=8) :: tic1, toc1, tic2, toc2, rate

   call system_clock(count_rate=rate)

   data_len = 1000000
   sub_len = 1000

   allocate(data1(data_len))
   allocate(data2(data_len))
   allocate(data3(data_len))
   allocate(data4(data_len))
   allocate(data5(data_len))
   allocate(temp(data_len))

   call random_number(data1)
   call random_number(data2)
   call random_number(data3)
   call random_number(data4)
   call random_number(data5)

   ! Case 1: 2D Array accessed in row-major sense
   allocate(array_2d(data_len,sub_len))
   call system_clock(tic1)
   do i=1,data_len
      array_2d(i,1) = data1(i)
      array_2d(i,2) = data2(i)
      array_2d(i,3) = data3(i)
      array_2d(i,4) = data4(i)
      array_2d(i,5) = data5(i)
   enddo
   call system_clock(toc1)
   call system_clock(tic2)
   do i=1,data_len
      temp(i) = 0.d0
      do j=1,5
         temp(i) = temp(i) + array_2d(i,j)
      enddo
   enddo
   call system_clock(toc2)
   write(*,'(A)') '--------------------------------------------------------------------------------'
   write(*,'(A)') '# 2D Array (row-major sense)'
   write(*,'(A,ES23.15)') 'Compiler optimization detour - ', sum(temp)
   write(*,'(A,ES23.15,A)') 'Data fetching : ', dble(toc1-tic1)/dble(rate), ' sec'
   write(*,'(A,ES23.15,A)') 'Summation : ', dble(toc2-tic2)/dble(rate), ' sec'
   write(*,'(A,ES23.15,A)') 'Total : ', dble(toc2-tic1)/dble(rate), ' sec'
   deallocate(array_2d)

   ! Case 2: 2D Array accessed in column-major sense
   allocate(array_2d(sub_len,data_len))
   call system_clock(tic1)
   do i=1,data_len
      array_2d(1,i) = data1(i)
      array_2d(2,i) = data2(i)
      array_2d(3,i) = data3(i)
      array_2d(4,i) = data4(i)
      array_2d(5,i) = data5(i)
   enddo
   call system_clock(toc1)
   call system_clock(tic2)
   do i=1,data_len
      temp(i) = 0.d0
      do j=1,5
         temp(i) = temp(i) + array_2d(j,i)
      enddo
   enddo
   call system_clock(toc2)
   write(*,'(A)') '--------------------------------------------------------------------------------'
   write(*,'(A)') '# 2D Array (column-major sense)'
   write(*,'(A,ES23.15)') 'Compiler optimization detour - ', sum(temp)
   write(*,'(A,ES23.15,A)') 'Data fetching : ', dble(toc1-tic1)/dble(rate), ' sec'
   write(*,'(A,ES23.15,A)') 'Summation : ', dble(toc2-tic2)/dble(rate), ' sec'
   write(*,'(A,ES23.15,A)') 'Total : ', dble(toc2-tic1)/dble(rate), ' sec'
   deallocate(array_2d)

   ! Case 3: 1D Array simulating row-major access
   allocate(array_1d(data_len*sub_len))
   call system_clock(tic1)
   do i=1,data_len
      idx = i+data_len*(1-1)
      array_1d(idx) = data1(i)
      idx = i+data_len*(2-1)
      array_1d(idx) = data2(i)
      idx = i+data_len*(3-1)
      array_1d(idx) = data3(i)
      idx = i+data_len*(4-1)
      array_1d(idx) = data4(i)
      idx = i+data_len*(5-1)
      array_1d(idx) = data5(i)
   enddo
   call system_clock(toc1)
   call system_clock(tic2)
   do i=1,data_len
      temp(i) = 0.d0
      do j=1,5
         idx = i+data_len*(j-1)
         temp(i) = temp(i) + array_1d(idx)
      enddo
   enddo
   call system_clock(toc2)
   write(*,'(A)') '--------------------------------------------------------------------------------'
   write(*,'(A)') '# 1D Array (row-major sense)'
   write(*,'(A,ES23.15)') 'Compiler optimization detour - ', sum(temp)
   write(*,'(A,ES23.15,A)') 'Data fetching : ', dble(toc1-tic1)/dble(rate), ' sec'
   write(*,'(A,ES23.15,A)') 'Summation : ', dble(toc2-tic2)/dble(rate), ' sec'
   write(*,'(A,ES23.15,A)') 'Total : ', dble(toc2-tic1)/dble(rate), ' sec'
   deallocate(array_1d)

   ! Case 4: 1D Array simulating column-major access
   allocate(array_1d(data_len*sub_len))
   call system_clock(tic1)
   do i=1,data_len
      idx = sub_len*(i-1)+1
      array_1d(idx) = data1(i)
      idx = sub_len*(i-1)+2
      array_1d(idx) = data2(i)
      idx = sub_len*(i-1)+3
      array_1d(idx) = data3(i)
      idx = sub_len*(i-1)+4
      array_1d(idx) = data4(i)
      idx = sub_len*(i-1)+5
      array_1d(idx) = data5(i)
   enddo
   call system_clock(toc1)
   call system_clock(tic2)
   do i=1,data_len
      temp(i) = 0.d0
      do j=1,5
         idx = sub_len*(i-1)+j
         temp(i) = temp(i) + array_1d(idx)
      enddo
   enddo
   call system_clock(toc2)
   write(*,'(A)') '--------------------------------------------------------------------------------'
   write(*,'(A)') '# 1D Array (column-major sense)'
   write(*,'(A,ES23.15)') 'Compiler optimization detour - ', sum(temp)
   write(*,'(A,ES23.15,A)') 'Data fetching : ', dble(toc1-tic1)/dble(rate), ' sec'
   write(*,'(A,ES23.15,A)') 'Summation : ', dble(toc2-tic2)/dble(rate), ' sec'
   write(*,'(A,ES23.15,A)') 'Total : ', dble(toc2-tic1)/dble(rate), ' sec'
   deallocate(array_1d)
end program test
To explain briefly: there are five 1D arrays of a fixed size data_len (data1(:) through data5(:)) that hold the data needed for the computation.
The five data sets are gathered into array_2d(:,:), which is then operated on.
array_2d is sized as array_2d(sub_len,data_len), where sub_len is greater than or equal to 5 so that the number of data sets can scale up later.
In my case sub_len << data_len, and both the data gathering and the operation run with the data-set index (1:5) as the inner loop, so I thought it would be ideal to put sub_len along the rows and data_len along the columns.
However, as sub_len increases (with the number of data sets fixed at 5), the results show the opposite: the row-major-sense layout (array_2d(data_len,sub_len)) performs better.
I thought this might be a limitation of 2D arrays, so I also implemented it by flattening to a 1D array (array_1d(sub_len*data_len)) and accessing it with each access pattern, but that produced the same result.
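For reference, the flattened index i + data_len*(j-1) used in Case 3 above is just Fortran's column-major address mapping; a tiny standalone check of that equivalence (not part of the benchmark, names are arbitrary) is:

program layout_check
   implicit none
   integer, parameter :: n1 = 4, n2 = 3
   double precision :: a2(n1,n2), a1(n1*n2)
   integer :: i, j
   ! Fill a 2D array and a manually indexed 1D array with the same values.
   do j = 1, n2
      do i = 1, n1
         a2(i,j)          = 10*i + j
         a1(i + n1*(j-1)) = 10*i + j   ! same index formula as Case 3
      enddo
   enddo
   ! reshape follows storage (column-major) order, so the two must agree.
   print *, all(reshape(a2, [n1*n2]) == a1)   ! prints T
end program layout_check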
Here are the results when data_len is fixed at 1,000,000 and sub_len is varied over 5, 10, 100, and 1,000:
# sub_len = 5
--------------------------------------------------------------------------------
# 2D Array (row-major sense)
Compiler optimization detour - 2.500099033103292E+06
Data fetching : 2.184600000000000E-02 sec
Summation : 6.276000000000000E-03 sec
Total : 2.812200000000000E-02 sec
--------------------------------------------------------------------------------
# 2D Array (column-major sense)
Compiler optimization detour - 2.500099033103292E+06
Data fetching : 1.804600000000000E-02 sec
Summation : 2.429000000000000E-03 sec
Total : 2.047500000000000E-02 sec
--------------------------------------------------------------------------------
# 1D Array (row-major sense)
Compiler optimization detour - 2.500099033103292E+06
Data fetching : 2.110700000000000E-02 sec
Summation : 3.313000000000000E-03 sec
Total : 2.442000000000000E-02 sec
--------------------------------------------------------------------------------
# 1D Array (column-major sense)
Compiler optimization detour - 2.500099033103292E+06
Data fetching : 1.812300000000000E-02 sec
Summation : 2.420000000000000E-03 sec
Total : 2.054300000000000E-02 sec
# sub_len = 10
--------------------------------------------------------------------------------
# 2D Array (row-major sense)
Compiler optimization detour - 2.500099033103292E+06
Data fetching : 2.166600000000000E-02 sec
Summation : 6.287000000000000E-03 sec
Total : 2.795400000000000E-02 sec
--------------------------------------------------------------------------------
# 2D Array (column-major sense)
Compiler optimization detour - 2.500099033103292E+06
Data fetching : 3.487700000000000E-02 sec
Summation : 3.780000000000000E-03 sec
Total : 3.865700000000000E-02 sec
--------------------------------------------------------------------------------
# 1D Array (row-major sense)
Compiler optimization detour - 2.500099033103292E+06
Data fetching : 2.131900000000000E-02 sec
Summation : 3.284000000000000E-03 sec
Total : 2.460300000000000E-02 sec
--------------------------------------------------------------------------------
# 1D Array (column-major sense)
Compiler optimization detour - 2.500099033103292E+06
Data fetching : 3.415100000000000E-02 sec
Summation : 3.813000000000000E-03 sec
Total : 3.796400000000000E-02 sec
# sub_len = 100
--------------------------------------------------------------------------------
# 2D Array (row-major sense)
Compiler optimization detour - 2.500099033103292E+06
Data fetching : 2.176100000000000E-02 sec
Summation : 6.263000000000000E-03 sec
Total : 2.802400000000000E-02 sec
--------------------------------------------------------------------------------
# 2D Array (column-major sense)
Compiler optimization detour - 2.500099033103292E+06
Data fetching : 3.237360000000000E-01 sec
Summation : 8.404999999999999E-03 sec
Total : 3.321410000000000E-01 sec
--------------------------------------------------------------------------------
# 1D Array (row-major sense)
Compiler optimization detour - 2.500099033103292E+06
Data fetching : 2.089700000000000E-02 sec
Summation : 3.283000000000000E-03 sec
Total : 2.418000000000000E-02 sec
--------------------------------------------------------------------------------
# 1D Array (column-major sense)
Compiler optimization detour - 2.500099033103292E+06
Data fetching : 3.255980000000000E-01 sec
Summation : 8.126000000000000E-03 sec
Total : 3.337240000000000E-01 sec
# sub_len = 1000
--------------------------------------------------------------------------------
# 2D Array (row-major sense)
Compiler optimization detour - 2.500099033103292E+06
Data fetching : 2.155700000000000E-02 sec
Summation : 6.220000000000000E-03 sec
Total : 2.777700000000000E-02 sec
--------------------------------------------------------------------------------
# 2D Array (column-major sense)
Compiler optimization detour - 2.500099033103292E+06
Data fetching : 1.634920000000000E+00 sec
Summation : 1.298100000000000E-02 sec
Total : 1.647901000000000E+00 sec
--------------------------------------------------------------------------------
# 1D Array (row-major sense)
Compiler optimization detour - 2.500099033103292E+06
Data fetching : 2.075800000000000E-02 sec
Summation : 3.214000000000000E-03 sec
Total : 2.397200000000000E-02 sec
--------------------------------------------------------------------------------
# 1D Array (column-major sense)
Compiler optimization detour - 2.500099033103292E+06
Data fetching : 1.645899000000000E+00 sec
Summation : 1.286800000000000E-02 sec
Total : 1.658767000000000E+00 sec
When the actual number of data sets matches sub_len (sub_len = 5), column-major is faster, as expected. However, as sub_len grows and the array becomes "sparse", the column-major timings increase dramatically while the row-major timings stay essentially constant.
The trend is the same regardless of the compiler (Intel Fortran, gfortran) and the optimization level (O0, O2, O3).
What could be the reason for this?
Edit 1) The same trend is observed when the number of data sets is set to sub_len instead of being fixed at 5. In that case the summation part is faster in column-major regardless of sub_len, but data fetching, as in the earlier tests, is faster in the row-major version. The slowdown of the data-fetching part is so large that, in terms of total time, row-major always wins once sub_len is large.
Edit 2) I found that calling allocate does not actually commit the whole array to physical memory right away. To account for this, I initialized array_2d = 1.d0 right after the allocate call (before the time measurement); see the excerpt after the results below. With that change, the results for sub_len = 1000 are:
# sub_len = 1000
--------------------------------------------------------------------------------
# 2D Array (row-major sense)
Compiler optimization detour - 2.500099033103292E+06
Data fetching : 3.348000000000000E-03 sec
Summation : 5.835000000000000E-03 sec
Total : 9.183000000000000E-03 sec
--------------------------------------------------------------------------------
# 2D Array (column-major sense)
Compiler optimization detour - 2.500099033103292E+06
Data fetching : 1.675400000000000E-02 sec
Summation : 1.279400000000000E-02 sec
Total : 2.954800000000000E-02 sec
--------------------------------------------------------------------------------
# 1D Array (row-major sense)
Compiler optimization detour - 2.500099033103292E+06
Data fetching : 3.185000000000000E-03 sec
Summation : 3.084000000000000E-03 sec
Total : 6.269000000000000E-03 sec
--------------------------------------------------------------------------------
# 1D Array (column-major sense)
Compiler optimization detour - 2.500099033103292E+06
Data fetching : 1.671800000000000E-02 sec
Summation : 1.249400000000000E-02 sec
Total : 2.921200000000000E-02 sec
Compared with the earlier measurements, column-major is certainly faster now, but it is still slower than row-major.
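For clarity, here is the Case 1 excerpt with the added initialization in place (the other cases are warmed up analogously before their timed regions):

   ! Case 1, with the pages touched before the timed region starts
   allocate(array_2d(data_len,sub_len))
   array_2d = 1.d0   ! commit the pages so the faults happen before timing
   call system_clock(tic1)
   do i=1,data_len
      array_2d(i,1) = data1(i)
      array_2d(i,2) = data2(i)
      array_2d(i,3) = data3(i)
      array_2d(i,4) = data4(i)
      array_2d(i,5) = data5(i)
   enddo
   call system_clock(toc1)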
Edit 3) The compiler versions and options are:
Intel
Version: ifort (IFORT) 2021.12.0 20240222
Compile options: only -O0, -O2, or -O3 (e.g. ifort -O3 test.f90)
GNU
Version: GNU Fortran (Ubuntu 11.4.0-1ubuntu1~22.04) 11.4.0
Compile options: only -O0, -O2, or -O3 (e.g. gfortran -O3 test.f90)
My system information is:
OS: Ubuntu 22.04.4 LTS
CPU: AMD Ryzen Threadripper 3970X 32-Core Processor
Memory: 128 GB
In Case 1 there are 5 "active" cache lines in the L1 cache (the fastest one, closest to the core), and they are fully used.
In Case 2 there is only 1 active cache line in the L1 cache, and it is not fully used.
A cache line is typically 64 bytes, so it can hold 8 double-precision reals.
Let's look at what happens in Case 1:
array(1:8,1), array(1:8,2), array(1:8,3), array(1:8,4), array(1:8,5)
array(9:16,1), array(9:16,2), array(9:16,3), array(9:16,4), array(9:16,5)
Now for Case 2:
array(1:8,1) ; only array(1:5,1) is used
array(1:8,2) ; only array(1:5,2) is used
So overall, Case 1 makes better use of the cache. In addition, when several cache lines are involved, I guess the hardware can do the computation with the cache lines that are already loaded while the others are still being fetched.
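To put rough numbers on the fill loops, here is a back-of-the-envelope sketch (my own illustration, assuming 64-byte cache lines, 8-byte doubles, a write-allocate cache, and ignoring alignment effects):

program traffic_estimate
   implicit none
   ! Same sizes as the benchmark; line_bytes and dp_bytes are assumed hardware values.
   integer, parameter :: data_len = 1000000, sub_len = 1000
   integer, parameter :: line_bytes = 64, dp_bytes = 8, nsets = 5
   double precision :: useful, fetched1, fetched2

   ! Useful payload of a fill loop: 5 doubles per i, for every i (40 MB).
   useful = dble(nsets) * dble(data_len) * dp_bytes

   ! Case 1 layout (data_len,sub_len): five contiguous streams, so every byte
   ! of every cache line that is brought in eventually gets written.
   fetched1 = useful

   ! Case 2 layout (sub_len,data_len) with sub_len = 1000: each i writes only
   ! 40 bytes (elements 1:5 of a column), but consecutive columns are
   ! sub_len*dp_bytes = 8000 bytes apart, so every i lands on a fresh
   ! 64-byte line and at least 24 bytes of each line are never used.
   fetched2 = dble(data_len) * line_bytes

   print '(A,F8.1,A)', 'Useful payload            :', useful   / 1.d6, ' MB'
   print '(A,F8.1,A)', 'Case 1 lines fetched (~)  :', fetched1 / 1.d6, ' MB'
   print '(A,F8.1,A)', 'Case 2 lines fetched (>=) :', fetched2 / 1.d6, ' MB'
end program traffic_estimate

Beyond the wasted bytes, the 8000-byte stride in Case 2 also sweeps far more pages per useful byte, which presumably adds TLB pressure on top of the cache-line waste.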