为什么我的 Cython 代码没有 NumPy 快，我该如何优化它？

Question

我正在尝试使用 Cython 来提高一些代码的性能。但是，我获得的性能与 NumPy 版本基本相同。当使用 Numba 测试相同的代码时，结果明显快于 Cython，所以可能我没有正确使用 Cython。

以下是我迄今为止所尝试的简要概述：

遵循 Cython 文档：我遵循了 Cython 文档中的“使用 NumPy”部分，但性能并未按预期提高。
尝试了 Stack Overflow 上的一些技巧，但没有哪个具体到值得一提。
性能分析：我尝试按照 Cython 文档运行性能分析（profiling），但输出对识别瓶颈没有帮助。总时间基本上都花在以下这一项上：
```
{built-in method builtins.exec}
```
（也许这很重要，但我不确定如何正确解释它）。

这是重现我的结果的代码：

Numpy 版本（与 Numba 中显示的代码相同，但没有 @njit 装饰器）：

import numpy as np

def numpy_optimize(psi: np.ndarray, y: np.ndarray) -> np.ndarray:
    """Estimate a parameter vector with a recursive least-squares update.

    Rows ``2 .. n_samples - 1`` of ``psi`` are processed one at a time. Each
    step computes a gain ``k`` from the matrix ``p``, corrects the current
    estimate by ``k`` times the prediction error, and then updates ``p``
    using the factor ``lam``.

    Args:
        psi: 2-D array whose row ``i`` is the regressor of sample ``i``.
            The built-in initial estimate has three entries, so ``psi``
            needs three columns whenever at least one row is processed.
        y: 2-D array of observations; only column 0 is read.

    Returns:
        1-D float64 array holding the final parameter estimate. When
        ``psi`` has two rows or fewer, the initial estimate is returned.
    """
    delta: float = 0.001
    lam: float = 0.01
    n_theta: int = psi.shape[1]
    n_samples: int = psi.shape[0]
    p: np.ndarray = np.eye(n_theta) / delta
    # Only the most recent estimate is ever read, so keep a single array
    # instead of appending a new Python list for every sample.
    theta: np.ndarray = np.array(
        [-0.0029283973561854606, 0.0032794468930772845, -0.003672579440695663]
    )
    for i in range(2, n_samples):
        psi_i = psi[i, :, None]  # row i as a column vector, shape (n_theta, 1)
        p_psi_i = p.dot(psi_i)
        psi_i_T_p = psi_i.T.dot(p)
        # (1, 1) quadratic form psi_i^T p psi_i; used by both the gain and
        # the update of p, so compute it once.
        quad = psi_i_T_p.dot(psi_i)
        k = (p_psi_i / lam) / (1 + quad / lam)
        # Prediction error of the current estimate for sample i.
        xi_i = y[i, 0] - psi_i.T.dot(theta)[0]
        theta = theta + k.flatten() * xi_i
        p -= p_psi_i.dot(psi_i_T_p) / (quad + lam)
        p /= lam

    return theta

Cython 版本：

%%cython

# Use the cell magic above to run in the notebook.
# The results are the same in the notebook as the one building cython code.
cimport cython
import numpy as np
cimport numpy as cnp
cnp.import_array()


def cython_optimize(cnp.ndarray[cnp.float64_t, ndim=2] psi, cnp.ndarray[cnp.float64_t, ndim=2] y):
    """Estimate a parameter vector with a recursive least-squares update.

    Rows ``2 .. n - 1`` of ``psi`` are processed one at a time. Each step
    computes a gain ``k`` from ``p``, corrects the current estimate by ``k``
    times the prediction error, and then updates ``p`` using ``lam``.

    Args:
        psi: 2-D float64 array; row ``i`` is the regressor of sample ``i``.
            The built-in initial estimate has three entries, so three
            columns are required whenever at least one row is processed.
        y: 2-D float64 array of observations; only column 0 is read.

    Returns:
        float64 array of shape ``(n_theta, 1)`` with the final estimate.
    """
    cdef int i
    cdef int n_theta = psi.shape[1]
    cdef int n = psi.shape[0]
    cdef double delta = 0.001
    cdef double lam = 0.01
    cdef double xi_i
    cdef cnp.ndarray[cnp.float64_t, ndim=2] psi_i
    cdef cnp.ndarray[cnp.float64_t, ndim=2] p_psi_i
    cdef cnp.ndarray[cnp.float64_t, ndim=2] psi_i_T_p
    cdef cnp.ndarray[cnp.float64_t, ndim=2] quad
    cdef cnp.ndarray[cnp.float64_t, ndim=2] k
    cdef cnp.ndarray[cnp.float64_t, ndim=2] p = np.eye(n_theta) / delta
    # Only the most recent estimate is ever read, so keep a single array
    # instead of appending a new Python list for every sample.
    cdef cnp.ndarray[cnp.float64_t, ndim=1] theta = np.array(
        [-0.0029283973561854606, 0.0032794468930772845, -0.003672579440695663]
    )

    for i in range(2, n):
        psi_i = psi[i, :, None]  # row i as a column vector, shape (n_theta, 1)
        p_psi_i = p.dot(psi_i)
        psi_i_T_p = psi_i.T.dot(p)
        # (1, 1) quadratic form psi_i^T p psi_i; used by both the gain and
        # the update of p, so compute it once.
        quad = psi_i_T_p.dot(psi_i)
        k = (p_psi_i / lam) / (1 + quad / lam)
        # Prediction error of the current estimate for sample i.
        xi_i = y[i, 0] - psi_i.T.dot(theta)[0]
        theta = theta + k.flatten() * xi_i
        p -= p_psi_i.dot(psi_i_T_p) / (quad + lam)
        p /= lam

    return theta.reshape(-1, 1)

Numba 版本:

import numpy as np
from numba import njit

@njit
def numba_optimize(psi: np.ndarray, y: np.ndarray) -> np.ndarray:
    """Run the same sample-by-sample update as ``numpy_optimize`` under ``@njit``.

    Rows ``2 .. n_samples - 1`` of ``psi`` are processed in order; rows 0 and
    1 are skipped because the loop starts at 2.

    Args:
        psi: 2-D array; row ``i`` is turned into a column vector each step.
        y: 2-D array; only ``y[i, 0]`` is read.

    Returns:
        Array built from the last entry of the ``theta`` history.
    """
    delta: float = 0.001
    lam: float = 0.01
    n_theta: int = psi.shape[1]
    n_samples: int = psi.shape[0]
    # p starts as the identity scaled by 1 / delta.
    p: np.ndarray = np.eye(n_theta) / delta
    # History of prediction errors; one entry is appended per processed row.
    xi: list = []
    # History of estimates, seeded with three copies of the same 3-element
    # start value; only theta[-1] is ever read.
    theta: list = 3*[[-0.0029283973561854606, 0.0032794468930772845, -0.003672579440695663]]
    for i in range(2, n_samples):
        psi_i = psi[i, :, None]  # row i as a column vector (alternative: .reshape(-1, 1))
        p_psi_i = p.dot(psi_i)
        psi_i_T_p = psi_i.T.dot(p)
        # Gain k = (p psi_i / lam) / (1 + psi_i^T p psi_i / lam).
        k_numerator = p_psi_i / lam
        k_denominator = 1 + psi_i_T_p.dot(psi_i) / lam
        k = k_numerator / k_denominator
        # Prediction error of the latest estimate for sample i.
        xi.append(y[i, 0] - psi_i.T.dot(np.array(theta[-1]))[0])
        # New estimate = latest estimate + gain * latest error.
        theta.append(list(np.array(theta[-1]) + k.flatten() * np.array(xi[-1])))
        # Same product as in k_denominator, recomputed for the update of p.
        p_update_factor = psi_i_T_p.dot(psi_i)
        p -= (p_psi_i.dot(psi_i_T_p)) / (p_update_factor + lam)
        p /= lam

    return np.array(theta[-1])

结果是：

# Create the random arrays used to time the three implementations.
import numpy as np

y = np.random.rand(1000000, 1)
psi = np.random.rand(1000000, 3)
# The timing calls pass `y_train`, which was never defined; alias it to `y`
# so they run as written.
y_train = y

%timeit cython_optimize(psi, y_train)

每个循环 16.3 秒 ± 164 毫秒（7 次运行的平均值 ± 标准差，每次 1 次循环）

%timeit numba_optimize(psi, y_train)

每个循环 1.7 秒 ± 18.8 毫秒（7 次运行的平均值 ± 标准差，每次 1 次循环）

%timeit numpy_optimize(psi, y_train)

每个循环 17.7 秒 ± 75.3 毫秒（7 次运行的平均值 ± 标准差，每次 1 次循环）

如果有人能在以下方面帮助我，我将不胜感激：

我可以采取哪些步骤来确保我的 Cython 代码得到正确优化？
如何有效地分析我的 Cython 代码以识别和解决性能瓶颈？
将 Cython 与 NumPy 结合使用时，有哪些常见错误或被忽视的细节可能导致性能不佳？

如有任何建议，我将不胜感激。谢谢！

注：使用的版本为 numpy==1.26.0、numba==0.59.1 和 Cython==3.0.10。

Answer 1

只需一行代码即可使代码速度提高 10,000 倍！！！

cimport cython
import cython
cimport numpy as np
import numpy as np


np.import_array()
from time import perf_counter
import cv2 
# Test image: 4525 x 6623 x 3 picture https://www.pexels.com/pt-br/foto/foto-da-raposa-sentada-no-chao-2295744/
# NOTE(review): hard-coded local Windows path - point it at your own copy of the image.
picpath = r"C:\Users\hansc\Downloads\pexels-alex-andrews-2295744.jpg"

# Load the image with OpenCV and pass it through np.ascontiguousarray before
# handing it to the averaging functions.
pic=np.ascontiguousarray(cv2.imread(picpath))

cpdef tuple average_rgb_bad(np.ndarray[np.uint8_t, ndim=3] pic_original): 
    # Deliberately slow counter-example: do NOT loop over a NumPy array like this.
    # It sums channels 0, 1 and 2 over the first two axes with chained indexing
    # (pic_original[i][j][c]), prints the loop time, and returns the three sums
    # floor-divided by the pixel count as (b, g, r).
    # The author's point: typing the argument as np.ndarray[np.uint8_t, ndim=3]
    # does not make this loop fast; the variants that copy the argument into a
    # typed memoryview are the ones reported as fast.
    # NOTE(review): the sample output recorded for this function shows negative
    # averages - presumably the accumulated sums overflowed; confirm.
    cdef:
        Py_ssize_t shape0_pic = pic_original.shape[0]
        Py_ssize_t shape1_pic = pic_original.shape[1]
        Py_ssize_t i,j # per the author, typing the indices does not help here
        # number of pixels, used as the divisor for the averages
        Py_ssize_t resultcounter=shape0_pic*shape1_pic
        Py_ssize_t r=0
        Py_ssize_t g=0
        Py_ssize_t b=0
    # `with nogil:` is not possible here because the indexing below calls
    # Python methods (__getitem__ etc.).
    start=perf_counter()
    for i in range(shape0_pic):
        for j in range(shape1_pic):
            b+=pic_original[i][j][0]
            g+=pic_original[i][j][1]
            r+=pic_original[i][j][2]
    print(perf_counter()-start)
    return b//resultcounter, g//resultcounter, r//resultcounter

cpdef tuple average_rgb_good(np.ndarray pic_original):
    """Return the per-channel mean of an image as ``(b, g, r)``.

    The argument is viewed through a typed memoryview, so the pixel loop
    runs without the GIL. The time spent in the loop is printed.

    Args:
        pic_original: 3-D unsigned 8-bit array; channels 0, 1 and 2 of the
            last axis are summed into ``b``, ``g`` and ``r``.

    Returns:
        Tuple ``(b, g, r)`` of the channel sums floor-divided by the number
        of pixels.

    Raises:
        ZeroDivisionError: if the image has no pixels.
    """
    cdef:
        # Typed memoryview over the caller's array: it shares the data, so
        # writes through `pic` would also change `pic_original`.
        cython.uchar[:,:,:] pic = pic_original
        Py_ssize_t shape0_pic = pic.shape[0]
        Py_ssize_t shape1_pic = pic.shape[1]
        Py_ssize_t i, j
        # number of pixels, used as the divisor for the averages
        Py_ssize_t resultcounter = shape0_pic * shape1_pic
        Py_ssize_t r = 0
        Py_ssize_t g = 0
        Py_ssize_t b = 0
    start = perf_counter()
    with nogil:
        for i in range(shape0_pic):
            for j in range(shape1_pic):
                # One tuple index per element instead of chained pic[i][j][c],
                # which builds an intermediate slice at every step.
                b += pic[i, j, 0]
                g += pic[i, j, 1]
                r += pic[i, j, 2]
    print(perf_counter() - start)
    if resultcounter == 0:
        # Raise explicitly so an empty image also fails cleanly when the
        # module is compiled with cdivision=True.
        raise ZeroDivisionError("cannot average an image with no pixels")
    return b // resultcounter, g // resultcounter, r // resultcounter

cpdef tuple average_rgb_very_good(np.ndarray pic_original):
    """Return the per-channel mean of a contiguous image as ``(b, g, r)``.

    Same computation as the plain memoryview variant, but the memoryview is
    declared with ``::1`` on the last axis. The author reports this as a
    little faster. The time spent in the loop is printed.

    Args:
        pic_original: 3-D unsigned 8-bit array. Only pass arrays that are
            guaranteed to be contiguous; OpenCV, for example, does not
            guarantee that for loaded images.

    Returns:
        Tuple ``(b, g, r)`` of the channel sums floor-divided by the number
        of pixels.

    Raises:
        ZeroDivisionError: if the image has no pixels.
    """
    cdef:
        # `::1` declares the last axis contiguous.
        cython.uchar[:,:,::1] pic = pic_original
        Py_ssize_t shape0_pic = pic.shape[0]
        Py_ssize_t shape1_pic = pic.shape[1]
        Py_ssize_t i, j
        # number of pixels, used as the divisor for the averages
        Py_ssize_t resultcounter = shape0_pic * shape1_pic
        Py_ssize_t r = 0
        Py_ssize_t g = 0
        Py_ssize_t b = 0
    start = perf_counter()
    with nogil:
        for i in range(shape0_pic):
            for j in range(shape1_pic):
                # One tuple index per element instead of chained pic[i][j][c],
                # which builds an intermediate slice at every step.
                b += pic[i, j, 0]
                g += pic[i, j, 1]
                r += pic[i, j, 2]
    print(perf_counter() - start)
    if resultcounter == 0:
        # Raise explicitly so an empty image also fails cleanly when the
        # module is compiled with cdivision=True.
        raise ZeroDivisionError("cannot average an image with no pixels")
    return b // resultcounter, g // resultcounter, r // resultcounter



# Run the three variants on the same image. Each call first prints its own
# loop time and then the returned (b, g, r) tuple is printed.
print(average_rgb_very_good(pic))
print(average_rgb_good(pic))
print(average_rgb_bad(pic))

# compiled with C division

# 0.015835800004424527
# (76, 83, 85)
# 0.01636139999027364
# (76, 83, 85)

# Now it becomes very clear that we haven't even left Python land 
# 123.76531680001062 # -> 10.000 times slower!
# cyimage2.pyx:37: RuntimeWarning: division with oppositely signed operands, C and Python semantics differ
  # r+=pic_original[i][j][2]
# cyimage2.pyx:37: RuntimeWarning: division with oppositely signed operands, C and Python semantics differ
  # r+=pic_original[i][j][2]
# cyimage2.pyx:37: RuntimeWarning: division with oppositely signed operands, C and Python semantics differ
  # r+=pic_original[i][j][2]
# (-66, -59, -58)

# The compiler directives matter as well. The setup script I usually build
# with is kept below as a plain string.
# NOTE(review): the variable name `comiler` looks like a typo for `compiler`;
# it is left unchanged here.

comiler='''import numpy as np
from Cython.Compiler import Options
from setuptools import Extension, setup
from Cython.Build import cythonize
import sys
import platform

iswindows = "win" in platform.platform().lower()
numpyincludefolder = np.get_include()
name = "cyimage2"

Options.docstrings = False
Options.embed_pos_in_docstring = False
Options.generate_cleanup_code = False
Options.clear_to_none = True
Options.annotate = True
Options.fast_fail = False
Options.warning_errors = False
Options.error_on_unknown_names = True
Options.error_on_uninitialized = True
Options.convert_range = True
Options.cache_builtins = True
Options.gcc_branch_hints = True
Options.lookup_module_cpdef = False
Options.embed = False
Options.cimport_from_pyx = False
Options.buffer_max_dims = 8


Options.closure_freelist_size = 8

configdict = {
    "py_limited_api": False,
    "name": name,
    "sources": [name + ".pyx"],
    "include_dirs": [
        numpyincludefolder,
    ],
    "define_macros": [
        ("NPY_NO_DEPRECATED_API", 1),
        ("NPY_1_7_API_VERSION", 1),
        ("CYTHON_USE_DICT_VERSIONS", 1),
        ("CYTHON_FAST_GIL", 1),
        ("CYTHON_USE_PYLIST_INTERNALS", 1),
        ("CYTHON_USE_UNICODE_INTERNALS", 1),
        ("CYTHON_ASSUME_SAFE_MACROS", 1),
        ("CYTHON_USE_TYPE_SLOTS", 1),
        ("CYTHON_USE_PYTYPE_LOOKUP", 1),
        ("CYTHON_USE_ASYNC_SLOTS", 1),
        ("CYTHON_USE_PYLONG_INTERNALS", 1),
        ("CYTHON_USE_UNICODE_WRITER", 1),
        ("CYTHON_UNPACK_METHODS", 1),
        ("CYTHON_USE_EXC_INFO_STACK", 1),
        ("CYTHON_ATOMICS", 1),
    ],
    "undef_macros": [],
    "library_dirs": [],
    "libraries": [],
    "runtime_library_dirs": [],
    "extra_objects": [],
    "extra_compile_args": [],
    "extra_link_args": [],
    "export_symbols": [],
    "swig_opts": [],
    "depends": [],
    "language": "c",
    "optional": None,
}
compiler_directives = {
    "binding": True,
    "boundscheck": False,
    "wraparound": False,
    "initializedcheck": False,
    "nonecheck": False,
    "overflowcheck": False,
    "overflowcheck.fold": False,
    "embedsignature": False,
    "embedsignature.format": "c",  # (c / python / clinic)
    "cdivision": True,
    "cdivision_warnings": True,
    "cpow": True,
    "always_allow_keywords": False,
    "c_api_binop_methods": False,
    "profile": False,
    "linetrace": False,
    "infer_types": True,
    "language_level": 3,  # (2/3/3str)
    "c_string_type": "bytes",  # (bytes / str / unicode)
    "c_string_encoding": "ascii",  # (ascii, default, utf-8, etc.)
    "type_version_tag": False,
    "unraisable_tracebacks": True,
    "iterable_coroutine": True,
    "annotation_typing": True,
    "emit_code_comments": True,
    "cpp_locals": False,
    "legacy_implicit_noexcept": False,
    "optimize.use_switch": True,
    "optimize.unpack_method_calls": True,
    "warn.undeclared": True,  # (default False)
    "warn.unreachable": True,  # (default True)
    "warn.maybe_uninitialized": True,  # (default False)
    "warn.unused": True,  # (default False)
    "warn.unused_arg": True,  # (default False)
    "warn.unused_result": True,  # (default False)
    "warn.multiple_declarators": True,  # (default True)
    "show_performance_hints": True,  # (default True)
}
compdi = configdict
clidict = compiler_directives

ext_modules = Extension(**configdict)

setup(
    name=name,
    ext_modules=cythonize(ext_modules, compiler_directives=compiler_directives),
)

'''

为什么我的 Cython 代码没有 NumPy 快，我该如何优化它？

问题描述投票：0回答：1

1个回答

只需一行代码即可使代码速度提高 10,000 倍！！！

最新问题

为什么我的 Cython 代码没有 NumPy 快，我该如何优化它？

问题描述 投票：0回答：1

1个回答

只需一行代码即可使代码速度提高 10,000 倍！！！

最新问题

问题描述投票：0回答：1