为什么这个用 numba 实现的对数伽玛（log gamma）函数在大数组上比 scipy 慢，而在单个值上却更快？

Question

我有一个用来计算对数伽玛函数（log gamma function）的函数，并用 numba.njit 对它进行了装饰。

import numpy as np
from numpy import log
from scipy.special import gammaln
from numba import njit

# Series coefficients read by gammaln_nr: entry j is divided by (z + j + 1)
# and accumulated into the series sum.
coefs = np.array([
    57.1562356658629235,
    -59.5979603554754912,
    14.1360979747417471,
    -0.491913816097620199,
    0.339946499848118887e-4,
    0.465236289270485756e-4,
    -0.983744753048795646e-4,
    0.158088703224912494e-3,
    -0.210264441724104883e-3,
    0.217439618115212643e-3,
    -0.164318106536763890e-3,
    0.844182239838527433e-4,
    -0.261908384015814087e-4,
    0.368991826595316234e-5,
])

@njit(fastmath=True)
def gammaln_nr(z):
    """Return the log-gamma approximation of ``z`` (Numerical Recipes 6.1).

    The whole computation is written with array-style expressions
    (``np.ones_like``, ``log``), so ``z`` is processed as a unit rather
    than element by element. The series coefficients are read from the
    module-level ``coefs`` array.
    """
    shifted = z + 5.24218750000000000
    prefactor = (z + 0.5) * log(shifted) - shifted

    # Accumulate series = c0 + sum_j coefs[j] / (z + j + 1).
    denom = z
    series = np.ones_like(denom) * 0.999999999999997092
    for k in range(coefs.shape[0]):
        denom = denom + 1
        series = series + coefs[k] / denom

    return prefactor + log(2.5066282746310005 * series / z)

当我把 gammaln_nr 用于一个大型数组（比如 np.linspace(0.001, 100, 10**7)）时，运行时间比 scipy 慢约 7 倍（参见下面附录中的代码）。但是，如果只对任意单个值运行，我的 numba 函数总是快 2 倍。这是怎么回事？

z = 11.67
%timeit gammaln_nr(z)
%timeit gammaln(z)
>>> 470 ns ± 29.1 ns per loop (mean ± std. dev. of 7 runs, 1000000 loops each)
>>> 1.22 µs ± 28.3 ns per loop (mean ± std. dev. of 7 runs, 1000000 loops each)

我的直觉是，如果我的函数对单个值更快，那么对一个数组也应该更快。当然，情况可能并非如此，因为我不知道 numba 是否使用了 SIMD 指令或其他形式的向量化，而 scipy 可能使用了。

Appendix

import time

import matplotlib.pyplot as plt
import seaborn as sns

# Benchmark gammaln_nr (numba) against scipy's gammaln on arrays of
# 10**0 .. 10**(n_trials - 1) points and plot both timing curves.
n_trials = 8
sizes = [10**i for i in range(n_trials)]  # shared by the loop and the plot
scipy_times = np.zeros(n_trials)
fastats_times = np.zeros(n_trials)

for i, size in enumerate(sizes):
    zs = np.linspace(0.001, 100, size)  # evaluate gammaln over this range

    # Untimed warm-up call: the first call is dominated by compilation,
    # so it must not be part of the measurement.
    gammaln_nr(zs)

    # perf_counter is the clock intended for measuring short durations.
    start = time.perf_counter()
    gammaln_nr(zs)
    fastats_times[i] = time.perf_counter() - start

    start = time.perf_counter()
    gammaln(zs)
    scipy_times[i] = time.perf_counter() - start

fig, ax = plt.subplots(figsize=(12, 8))
# x/y are passed by keyword: recent seaborn releases reject positional data.
sns.lineplot(x=sizes, y=fastats_times, label="numba", ax=ax)
sns.lineplot(x=sizes, y=scipy_times, label="scipy", ax=ax)
ax.set(xscale="log")
ax.set_xlabel("Array Size", fontsize=15)
ax.set_ylabel("Execution Time (s)", fontsize=15)
ax.set_title("Execution Time of Log Gamma")

Answer 1

在 Numba 中实现 gammaln

重新实现一些常用函数可能相当费力：不仅要达到同等的性能，还要保证明确的精度水平。因此，最直接的方式是简单地封装一个现成可用的实现（wrap a working implementation）。

对于 gammaln，scipy 调用的是该函数的 C 实现。因此，scipy 实现的速度还取决于编译 scipy 依赖项时使用的编译器和编译器标志。

一个值的性能结果与较大数组的结果相差很大也不足为奇。在第一种情况下，调用开销（包括类型转换，输入检查......）占主导地位，在第二种情况下，实现的性能变得越来越重要。

改善您的实施

编写显式循环。在 Numba 中，向量化操作会被展开为循环，然后 Numba 会尝试把这些循环合并。通常最好手动编写并合并这些循环。
考虑基本算术运算在实现上的差异。Python 总是检查除以 0 的情况并在此时抛出异常，这种检查的开销很大。Numba 默认也采用这种行为，但你可以切换为 NumPy 的错误处理方式（error_model='numpy'）。在这种情况下，除以 0 会得到 NaN。在后续计算中如何处理 NaN、Inf 以及 -0/+0，也会受到 fastmath 标志的影响。

代码

import numpy as np
from numpy import log
from scipy.special import gammaln
from numba import njit
import numba as nb

@njit(fastmath=True,error_model='numpy')
def gammaln_nr(z):
    """Log-gamma approximation with an explicit loop (Numerical Recipes 6.1).

    ``z`` is read element by element as ``z[i]`` for ``i`` in
    ``range(z.shape[0])``. The result is a freshly allocated array of
    length ``z.shape[0]`` holding one value per element.
    """
    # Keep the coefficients local instead of global: a global can only be
    # changed by recompiling the function.
    series_coefs = np.array([
        57.1562356658629235, -59.5979603554754912,
        14.1360979747417471, -0.491913816097620199,
        .339946499848118887e-4, .465236289270485756e-4,
        -.983744753048795646e-4, .158088703224912494e-3,
        -.210264441724104883e-3, .217439618115212643e-3,
        -.164318106536763890e-3, .844182239838527433e-4,
        -.261908384015814087e-4, .368991826595316234e-5])

    n_terms = series_coefs.shape[0]
    n_points = z.shape[0]
    result = np.empty(n_points)

    for idx in range(n_points):
        x = z[idx]
        shifted = x + 5.24218750000000000
        prefactor = (x + 0.5) * log(shifted) - shifted

        # series = c0 + sum_k series_coefs[k] / (x + k + 1)
        denom = x
        series = 0.999999999999997092
        for k in range(n_terms):
            denom = denom + 1.
            series = series + series_coefs[k] / denom

        result[idx] = prefactor + log(2.5066282746310005 * series / x)
    return result

@njit(fastmath=True,error_model='numpy',parallel=True)
def gammaln_nr_p(z):
    """Log-gamma approximation, outer loop over ``nb.prange`` (Numerical Recipes 6.1).

    Same per-element computation as the explicit-loop version, but the
    loop over ``range(z.shape[0])`` is expressed with ``nb.prange`` and the
    function is compiled with ``parallel=True``. Returns a freshly
    allocated array of length ``z.shape[0]``.
    """
    # Keep the coefficients local instead of global: a global can only be
    # changed by recompiling the function.
    series_coefs = np.array([
        57.1562356658629235, -59.5979603554754912,
        14.1360979747417471, -0.491913816097620199,
        .339946499848118887e-4, .465236289270485756e-4,
        -.983744753048795646e-4, .158088703224912494e-3,
        -.210264441724104883e-3, .217439618115212643e-3,
        -.164318106536763890e-3, .844182239838527433e-4,
        -.261908384015814087e-4, .368991826595316234e-5])

    n_terms = series_coefs.shape[0]
    n_points = z.shape[0]
    result = np.empty(n_points)

    # Each iteration writes only result[idx], so iterations are independent.
    for idx in nb.prange(n_points):
        x = z[idx]
        shifted = x + 5.24218750000000000
        prefactor = (x + 0.5) * log(shifted) - shifted

        # series = c0 + sum_k series_coefs[k] / (x + k + 1)
        denom = x
        series = 0.999999999999997092
        for k in range(n_terms):
            denom = denom + 1.
            series = series + series_coefs[k] / denom

        result[idx] = prefactor + log(2.5066282746310005 * series / x)
    return result


import matplotlib.pyplot as plt
import seaborn as sns
import time

# Benchmark the serial and parallel numba versions against scipy's gammaln
# on arrays of 10**0 .. 10**(n_trials - 1) points, check that the results
# agree, and plot the three timing curves.
n_trials = 8
sizes = [10**i for i in range(n_trials)]  # shared by the loop and the plot
scipy_times = np.zeros(n_trials)
fastats_times = np.zeros(n_trials)
fastats_times_p = np.zeros(n_trials)

for i, size in enumerate(sizes):
    zs = np.linspace(0.001, 100, size)  # evaluate gammaln over this range

    # Untimed warm-up calls: the first call is dominated by compilation,
    # so it must not be part of the measurement.
    gammaln_nr(zs)
    gammaln_nr_p(zs)

    # perf_counter is the clock intended for measuring short durations.
    start = time.perf_counter()
    arr_1 = gammaln_nr(zs)
    fastats_times[i] = time.perf_counter() - start

    start = time.perf_counter()
    arr_3 = gammaln_nr_p(zs)
    fastats_times_p[i] = time.perf_counter() - start

    start = time.perf_counter()
    arr_2 = gammaln(zs)
    scipy_times[i] = time.perf_counter() - start

    # Sanity check: numba vs scipy, then serial vs parallel numba.
    print(np.allclose(arr_1, arr_2))
    print(np.allclose(arr_1, arr_3))

fig, ax = plt.subplots(figsize=(12, 8))
# x/y are passed by keyword: recent seaborn releases reject positional data.
sns.lineplot(x=sizes, y=fastats_times, label="numba", ax=ax)
sns.lineplot(x=sizes, y=fastats_times_p, label="numba_parallel", ax=ax)
sns.lineplot(x=sizes, y=scipy_times, label="scipy", ax=ax)
ax.set(xscale="log")
ax.set_xlabel("Array Size", fontsize=15)
ax.set_ylabel("Execution Time (s)", fontsize=15)
ax.set_title("Execution Time of Log Gamma")
fig.show()

为什么这个用 numba 实现的对数伽玛（log gamma）函数在大数组上比 scipy 慢，而在单个值上却更快？

问题描述投票：2回答：1

Appendix

1个回答

在 Numba 中实现 gammaln

最新问题

为什么这个用 numba 实现的对数伽玛（log gamma）函数在大数组上比 scipy 慢，而在单个值上却更快？

问题描述 投票：2回答：1

Appendix

1个回答

在 Numba 中实现 gammaln

最新问题

问题描述投票：2回答：1