我试图通过
dgemm
将fortran
包裹在f2py
中并比较时间。看起来 dgemm
在小维度矩阵中比 numpy-einsum
慢得多(10 倍)。 dgemm
的定时器在fortran
代码里面。我认为纯fortran
dgemm
会快很多。为什么?我认为 numpy
包装了 C,所以 numpy
和 f2py
都有加载时间来编译语言。
蟒蛇部分
import numpy as np
import test_fortran
import time
dim_n = 10
A = np.random.random((dim_n, dim_n))
B = np.random.random((dim_n, dim_n))
C = np.zeros((dim_n, dim_n))
A = np.asfortranarray(A)
B = np.asfortranarray(B)
C = np.asfortranarray(C)
start = time.time()
test_fortran.wrap(A, B, C)
end = time.time()
print("Fortran time:", end - start)
#print(C)
# Cross-check with numpy.einsum
start = time.time()
C_check = np.einsum('ij,jk->ik', A, B)
end = time.time()
print("Numpy einsum time:", end - start)
#print(C_check)
# Compare the two results
print("Are the results equal?")
print(np.allclose(C, C_check))
Fortran
部分
module types
implicit none
integer, parameter :: sp = selected_real_kind(6,37) ! single precision
integer, parameter :: dp = selected_real_kind(15,307) ! double precision
!real(sp) :: r_sp = 1.0
!real(dp) :: r_dp = 1.0_dp
end module
subroutine wrap(A, B, C)
use types
Use, Intrinsic :: iso_fortran_env, Only : wp => real64, li => int64
real(dp), intent(in) :: A(:, :), B(:, :)
real(dp), intent(inout) :: C(:, :)
real(dp) :: alpha, beta
integer :: m, n, p !, start, finish, rate
Integer(li) :: start, finish, rate
alpha = 1.0_dp
beta = 0.0_dp
m = size(A, 1)
n = size(A, 2)
p = size(B, 2)
Call System_clock( start, rate )
Call dgemm( 'N', 'N', m, p, n, alpha, A , m, B , n, beta, C, m )
Call System_clock( finish, rate )
Write( *, * ) 'Time for dgemm', Real( finish - start, dp ) / rate
! C = matmul(A, B)
end subroutine wrap
使用
python -m numpy.f2py -c --f90flags='-O3 -lopenblas' -m test_fortran test_fortran.f90
。我有
Time for dgemm 9.6337000000000002E-003
Fortran time: 0.010259628295898438
Numpy einsum time: 0.00018858909606933594
Are the results equal?
True
更新: (1) 对于更大的维度,结果会改变。例如。,
n=100
,
Time for dgemm 1.1398200000000001E-002
Fortran time: 0.011998414993286133
Numpy einsum time: 0.0005433559417724609
Are the results equal?
True
n=1000
Time for dgemm 6.8635000000000002E-002
Fortran time: 0.06942296028137207
Numpy einsum time: 0.6464102268218994
Are the results equal?
True
(2) 并在
matmul
部分中将 dgemm
替换为 fortran
将大大加快较小尺寸的速度
n=10
Time for dgemm 1.7999999999999999E-006
Fortran time: 0.0011439323425292969
Numpy einsum time: 0.00012946128845214844
Are the results equal?
True
n=100
Time for dgemm 4.7909999999999999E-004
Fortran time: 0.0011742115020751953
Numpy einsum time: 0.000514984130859375
Are the results equal?
True
n=1000
Time for dgemm 0.12110920000000000
Fortran time: 0.12291121482849121
Numpy einsum time: 0.6383745670318604
Are the results equal?
True
(3) 然而,问题并不完全是调用
dgemm
的开销,除非有多个调用问题(f2py->fortran->blas)
比如下面的纯Fortran代码来做dgemm
Program test_dgemm
Use, Intrinsic :: iso_fortran_env, Only : wp => real64, li => int64
integer, parameter :: dp = selected_real_kind(15, 307)
! Implicit None
Real( dp ), Dimension( :, : ), Allocatable :: a
Real( dp ), Dimension( :, : ), Allocatable :: b
Real( dp ), Dimension( :, : ), Allocatable :: c
Integer :: na, nb, nc, m_iter
Integer( li ) :: start, finish, rate
real(dp) :: sum_time1, alpha, beta
Write( *, * ) 'na, nb, nc?'
Read( *, * ) na, nb, nc
Allocate( a ( 1:na, 1:nb ) )
Allocate( b ( 1:nb, 1:nc ) )
Allocate( c( 1:na, 1:nc ) )
Call Random_number( a )
Call Random_number( b )
c = 0.0_dp
alpha = 1.0_dp
beta = 0.0_dp
m_iter = 100
write (*,*) 'm_iter average', m_iter
sum_time1 = 0.0
do m = 1, m_iter
Call System_clock( start, rate )
Call dgemm( 'N', 'N', na, nc, nb, alpha, a , na, b , nb, beta, c, na )
Call System_clock( finish, rate )
sum_time1 = sum_time1 + Real( finish - start, dp ) / rate
end do
Write( *, * ) 'Time for dgemm', sum_time1 / m_iter
End
提供
na, nb, nc?
10 10 10
m_iter average 100
Time for dgemm 1.5866999999999999E-005
更新 2:正如评论中所建议的,如果我记得/只在纯 Fortran 中 iter 一次,我可以看到速度差异并且
scipy.linalg.blas.dgemm
更快(带有 f2py 的 fortran 部分添加了显式维度,不包括更新的维度,因为帖子可能太长了)
import numpy as np
import test_fortran
import time
import scipy.linalg #from . import linalg
dim_n = 10
A = np.random.random((dim_n, dim_n))
B = np.random.random((dim_n, dim_n))
C = np.zeros((dim_n, dim_n))
A = np.asfortranarray(A)
B = np.asfortranarray(B)
C = np.asfortranarray(C)
start = time.time()
test_fortran.wrap(A, B, C, dim_n, dim_n, dim_n)
end = time.time()
print("Fortran time:", end - start)
start = time.time()
test_fortran.wrap(A, B, C, dim_n, dim_n, dim_n)
end = time.time()
print("Fortran time:", end - start)
#print(C)
# Cross-check with numpy.einsum
start = time.time()
C_check = np.einsum('ij,jk->ik', A, B)
end = time.time()
print("Numpy einsum time:", end - start)
#print(C_check)
# Cross-check with numpy.einsum
start = time.time()
C_check = np.einsum('ij,jk->ik', A, B)
end = time.time()
print("Numpy einsum time:", end - start)
#print(C_check)
# Cross-check with numpy.einsum
start = time.time()
C2_check = scipy.linalg.blas.dgemm(1.0, A, B) #np.einsum('ij,jk->ik', A, B)
end = time.time()
print("Numpy scipy-einsum time:", end - start)
#print(C_check)
# Cross-check with numpy.einsum
start = time.time()
C2_check = scipy.linalg.blas.dgemm(1.0, A, B) #np.einsum('ij,jk->ik', A, B)
end = time.time()
print("Numpy scipy-dgemm time:", end - start)
#print(C_check)
# Compare the two results
print("Are the results equal?")
print(np.allclose(C, C_check))
print(np.allclose(C, C2_check))
得到
Time for dgemm 1.1372600000000000E-002
Fortran time: 0.012313604354858398
Time for dgemm 1.4800000000000001E-005
Fortran time: 0.0016214847564697266
Numpy einsum time: 0.00011920928955078125
Numpy einsum time: 2.7894973754882812e-05
Numpy scipy-einsum time: 7.319450378417969e-05
Numpy scipy-dgemm time: 1.5974044799804688e-05
Are the results equal?
True
True