DGEMM 与 f2py 的性能

问题描述 投票:0回答:0

我试图通过

dgemm
fortran
包裹在
f2py
中并比较时间。看起来
dgemm
在小维度矩阵中比
numpy-einsum
慢得多(10 倍)。
dgemm
的定时器在
fortran
代码里面。我认为纯
fortran
dgemm
会快很多。为什么?我认为
numpy
包装了 C,所以
numpy
f2py
都有加载时间来编译语言。

蟒蛇部分


import numpy as np
import test_fortran
import time

dim_n = 10
A = np.random.random((dim_n, dim_n))
B = np.random.random((dim_n, dim_n))
C = np.zeros((dim_n, dim_n))

A = np.asfortranarray(A)
B = np.asfortranarray(B)
C = np.asfortranarray(C)

start = time.time()
test_fortran.wrap(A, B, C)
end = time.time()
print("Fortran time:", end - start)
#print(C)

# Cross-check with numpy.einsum
start = time.time()
C_check = np.einsum('ij,jk->ik', A, B)
end = time.time()
print("Numpy einsum time:", end - start)
#print(C_check)

# Compare the two results
print("Are the results equal?")
print(np.allclose(C, C_check))

Fortran
部分

module types

    implicit none
    integer, parameter :: sp = selected_real_kind(6,37) ! single precision
    integer, parameter :: dp = selected_real_kind(15,307) ! double precision
    
    !real(sp) :: r_sp = 1.0
    !real(dp) :: r_dp = 1.0_dp
    
end module


subroutine wrap(A, B, C)

    use types
    Use, Intrinsic :: iso_fortran_env, Only :  wp => real64, li => int64

    real(dp), intent(in) :: A(:, :), B(:, :)
    real(dp), intent(inout) :: C(:, :)
    real(dp) :: alpha, beta
    integer :: m, n, p !, start, finish, rate 
    Integer(li) :: start, finish, rate

    alpha = 1.0_dp
    beta = 0.0_dp

    m = size(A, 1)
    n = size(A, 2)
    p = size(B, 2)

    Call System_clock( start, rate )
    Call dgemm( 'N', 'N', m, p, n, alpha, A , m, B , n, beta, C, m )

    Call System_clock( finish, rate )
    Write( *, * ) 'Time for dgemm', Real( finish - start, dp ) / rate  
   ! C = matmul(A, B)
end subroutine wrap

使用

python -m numpy.f2py -c  --f90flags='-O3 -lopenblas'  -m test_fortran test_fortran.f90
。我有

  Time for dgemm   9.6337000000000002E-003
Fortran time: 0.010259628295898438
Numpy einsum time: 0.00018858909606933594
Are the results equal?
True

更新: (1) 对于更大的维度,结果会改变。例如。,

n=100
,

 Time for dgemm   1.1398200000000001E-002
Fortran time: 0.011998414993286133
Numpy einsum time: 0.0005433559417724609
Are the results equal?
True

n=1000

 Time for dgemm   6.8635000000000002E-002
Fortran time: 0.06942296028137207
Numpy einsum time: 0.6464102268218994
Are the results equal?
True

(2) 并在

matmul
部分中将
dgemm
替换为
fortran
将大大加快较小尺寸的速度
n=10

 Time for dgemm   1.7999999999999999E-006
Fortran time: 0.0011439323425292969
Numpy einsum time: 0.00012946128845214844
Are the results equal?
True

n=100

 Time for dgemm   4.7909999999999999E-004
Fortran time: 0.0011742115020751953
Numpy einsum time: 0.000514984130859375
Are the results equal?
True

n=1000

 Time for dgemm  0.12110920000000000
Fortran time: 0.12291121482849121
Numpy einsum time: 0.6383745670318604
Are the results equal?
True

(3) 然而,问题并不完全是调用

dgemm
的开销,除非有多个调用问题(f2py->fortran->blas) 比如下面的纯Fortran代码来做
dgemm

Program test_dgemm

    Use, Intrinsic :: iso_fortran_env, Only :  wp => real64, li => int64
    integer, parameter :: dp = selected_real_kind(15, 307)
  
   ! Implicit None
  
    Real( dp ), Dimension( :, : ), Allocatable :: a
    Real( dp ), Dimension( :, : ), Allocatable :: b
    Real( dp ), Dimension( :, : ), Allocatable :: c
  
    Integer :: na, nb, nc, m_iter
    Integer( li ) :: start, finish, rate
    real(dp) :: sum_time1, alpha, beta


    
    Write( *, * ) 'na, nb, nc?'
    Read( *, * ) na, nb, nc
    Allocate( a ( 1:na, 1:nb ) ) 
    Allocate( b ( 1:nb, 1:nc ) )  
    Allocate( c( 1:na, 1:nc ) )
  
  
    Call Random_number( a )
    Call Random_number( b )
    c  = 0.0_dp
 
    alpha = 1.0_dp
    beta = 0.0_dp
    m_iter = 100
  
    write (*,*) 'm_iter average', m_iter 

    sum_time1 = 0.0  
    do m = 1, m_iter     
      Call System_clock( start, rate )
      Call dgemm( 'N', 'N', na, nc, nb, alpha, a , na,  b , nb, beta, c, na )
      Call System_clock( finish, rate )
      sum_time1 = sum_time1 + Real( finish - start, dp ) / rate  
    end do  
    Write( *, * ) 'Time for dgemm', sum_time1 / m_iter
  End

提供

 na, nb, nc?
10 10 10
 m_iter average         100
 Time for dgemm   1.5866999999999999E-005

更新 2:正如评论中所建议的,如果我记得/只在纯 Fortran 中 iter 一次,我可以看到速度差异并且

scipy.linalg.blas.dgemm
更快(带有 f2py 的 fortran 部分添加了显式维度,不包括更新的维度,因为帖子可能太长了)


import numpy as np
import test_fortran
import time
import scipy.linalg #from . import linalg

dim_n = 10
A = np.random.random((dim_n, dim_n))
B = np.random.random((dim_n, dim_n))
C = np.zeros((dim_n, dim_n))

A = np.asfortranarray(A)
B = np.asfortranarray(B)
C = np.asfortranarray(C)

start = time.time()
test_fortran.wrap(A, B, C, dim_n, dim_n, dim_n)
end = time.time()
print("Fortran time:", end - start)

start = time.time()
test_fortran.wrap(A, B, C, dim_n, dim_n, dim_n)
end = time.time()
print("Fortran time:", end - start)

#print(C)

# Cross-check with numpy.einsum
start = time.time()
C_check = np.einsum('ij,jk->ik', A, B)
end = time.time()
print("Numpy einsum time:", end - start)
#print(C_check)

# Cross-check with numpy.einsum
start = time.time()
C_check = np.einsum('ij,jk->ik', A, B)
end = time.time()
print("Numpy einsum time:", end - start)
#print(C_check)


# Cross-check with numpy.einsum
start = time.time()
C2_check = scipy.linalg.blas.dgemm(1.0, A, B) #np.einsum('ij,jk->ik', A, B)
end = time.time()
print("Numpy scipy-einsum time:", end - start)
#print(C_check)

# Cross-check with numpy.einsum
start = time.time()
C2_check = scipy.linalg.blas.dgemm(1.0, A, B) #np.einsum('ij,jk->ik', A, B)
end = time.time()
print("Numpy scipy-dgemm time:", end - start)
#print(C_check)

# Compare the two results
print("Are the results equal?")
print(np.allclose(C, C_check))
print(np.allclose(C, C2_check))

得到

 Time for dgemm   1.1372600000000000E-002
Fortran time: 0.012313604354858398
 Time for dgemm   1.4800000000000001E-005
Fortran time: 0.0016214847564697266
Numpy einsum time: 0.00011920928955078125
Numpy einsum time: 2.7894973754882812e-05
Numpy scipy-einsum time: 7.319450378417969e-05
Numpy scipy-dgemm time: 1.5974044799804688e-05
Are the results equal?
True
True
python numpy fortran blas f2py
© www.soinside.com 2019 - 2024. All rights reserved.