Numba cuda.jit and njit give different results

Problem description

In the example below, I have a simple CPU function:

import numpy as np
from numba import njit, cuda

@njit
def cpu_func(a, b, c, d):
    for i in range(len(a)):
        for l in range(d[i], 0, -1):
            for j in range(l):
                a[i, j] = (b[i] * a[i, j] + (1.0 - b[i]) * a[i, j + 1]) / c[i]

    return a[:, 0]

and an equivalent GPU implementation of the same function:

@cuda.jit
def _gpu_func(a, b, c, d, out):
    i = cuda.grid(1)
    if i < len(a):
        for l in range(d[i], 0, -1):
            for j in range(l):
                a[i, j] = (b[i] * a[i, j] + (1.0 - b[i]) * a[i, j + 1]) / c[i]
            out[i] = a[i, 0]

def gpu_func(a, b, c, d):
    d_a = cuda.to_device(a)
    d_b = cuda.to_device(b)
    d_c = cuda.to_device(c)
    d_d = cuda.to_device(d)
    d_out = cuda.device_array(len(a))

    threads_per_block = 64
    blocks_per_grid = 128
    _gpu_func[blocks_per_grid, threads_per_block](d_a, d_b, d_c, d_d, d_out)
    out = d_out.copy_to_host()

    return out

However, when I call the two functions with the same inputs, I get very different results:

a = np.array([[
    1.150962188573305234e+00, 1.135924188549360281e+00, 1.121074496043255930e+00, 1.106410753047196494e+00, 1.091930631080626046e+00,
    1.077631830820479752e+00, 1.063512081736074144e+00, 1.049569141728566635e+00, 1.035800796774924315e+00, 1.022204860576360286e+00,
    1.008779174211164253e+00, 9.955216057918859773e-01, 9.824300501268078412e-01, 9.695024283856580327e-01, 9.567366877695103744e-01,
    9.441308011848159598e-01, 9.316827669215179686e-01, 9.193906083351972569e-01, 9.072523735331967654e-01, 8.952661350646772265e-01,
    8.834299896145549891e-01, 8.717420577012704452e-01, 8.602004833783417626e-01, 8.488034339396569594e-01, 8.375490996284553624e-01,
    8.264356933499517055e-01, 8.154614503875622367e-01, 8.046246281226814290e-01, 7.939235057579688837e-01, 7.833563840441006842e-01,
    7.729215850099430130e-01, 7.626174516961046201e-01, 7.524423478918242925e-01, 7.423946578751554615e-01, 7.324727861564024334e-01,
    7.226751572247696043e-01, 7.130002152981841368e-01, 7.034464240762497989e-01, 6.940122664962961041e-01, 6.846962444924807878e-01,
    6.754968787579095357e-01, 6.664127085097342196e-01, 6.574422912571935562e-01, 6.485842025725561122e-01, 6.398370358649341227e-01,
    6.311994021569281577e-01, 6.226699298640693270e-01, 6.142472645770222783e-01, 6.059300688465173446e-01, 5.977170219709739829e-01,
    0.000000000000000000e+00
]])
b = np.array([1.533940813369776279e+00])
c = np.array([1.018336794718317062e+00])
d = np.array([50], dtype=np.int64)

cpu_func(a.copy(), b, c, d)  # Produces array([0.74204214])
gpu_func(a.copy(), b, c, d)  # Produces array([0.67937252])

I understand that numerical precision can be an issue (e.g., overflow/underflow) and that each compiler may optimize the code differently (e.g., fused multiply-adds). Setting that aside for the moment: is there some way to ensure/force the CPU and GPU code to produce the same output (to 4-5 decimal places)? Or is there perhaps some clever way to improve the precision?
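
For reference, the level of agreement I am after could be checked with something like this (a trivial sketch using NumPy's testing helper, not part of my original code):

# Raises unless the CPU and GPU results agree to within ~1e-5 (about 4-5 decimals)
np.testing.assert_allclose(
    cpu_func(a.copy(), b, c, d),
    gpu_func(a.copy(), b, c, d),
    atol=1e-5,
)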

Additional verification

For completeness, I tried performing the same computation with the mpmath package, which provides arbitrary-precision arithmetic with a configurable working precision:

import mpmath
mpmath.mp.dps = 100  # Precision

def mpmath_cpu_func(a, b, c, d):
    for i in range(a.rows):
        for l in range(d[i], 0, -1):
            for j in range(l):
                a[i, j] = (b[i] * a[i, j] + (1.0 - b[i]) * a[i, j + 1]) / c[i]
    return a[:, 0]

# Convert inputs to mpmath types with high precision
mp_a = mpmath.matrix(a.copy())
for i in range(a.shape[0]):
    for j in range(a.shape[1]):
        mp_a[i, j] = mpmath.mpf(a[i, j].astype(str))
mp_b = [mpmath.mpf(b[0].astype(str))]
mp_c = [mpmath.mpf(c[0].astype(str))]

mpmath_cpu_func(mp_a, mp_b, mp_c, d)  # Produces 0.6449015342958763156413719663014237700252472529553791866336058829452386808983049961922292904843391669

Note that this high-precision result also differs from both the CPU and GPU results above. Is there some clever way to improve the precision of the GPU/CPU implementations above? I would be happy with 4-5 decimal places of agreement between all three versions.
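
One way to test whether plain float64 is the limiting factor (a sketch only, plain Python without numba; note that np.longdouble is 80-bit extended precision on most x86 Linux builds but is just float64 on Windows) would be to run the same recurrence in extended precision and compare it against the mpmath value:

def longdouble_cpu_func(a, b, c, d):
    # Same recurrence as cpu_func above, but carried out in np.longdouble
    a = a.astype(np.longdouble)
    b = b.astype(np.longdouble)
    c = c.astype(np.longdouble)
    for i in range(len(a)):
        for l in range(d[i], 0, -1):
            for j in range(l):
                a[i, j] = (b[i] * a[i, j] + (1.0 - b[i]) * a[i, j + 1]) / c[i]
    return a[:, 0].astype(np.float64)

longdouble_cpu_func(a.copy(), b, c, d)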

Update


I was able to use the pyfma package in the CPU code (without numba njit!) to produce results that match the GPU code:

import numpy as np
import pyfma
import _pyfma
from numpy.typing import ArrayLike

def monkey_patch_fma(a: ArrayLike, b: ArrayLike, c: ArrayLike) -> np.ndarray:
    """
    This is a simple monkey patch to make `pyfma` compatible with NumPy v2.0

    Based on this `pyfma` PR - https://github.com/nschloe/pyfma/pull/17/files
    """
    a = np.asarray(a)
    b = np.asarray(b)
    c = np.asarray(c)
    # dtype = np.find_common_type([], [a.dtype, b.dtype, c.dtype])
    dtype = np.promote_types(np.promote_types(a.dtype, b.dtype), c.dtype)
    a = a.astype(dtype)
    b = b.astype(dtype)
    c = c.astype(dtype)

    if dtype == np.single:
        return _pyfma.fmaf(a, b, c)
    elif dtype == np.double:
        return _pyfma.fma(a, b, c)

    assert dtype == np.longdouble
    return _pyfma.fmal(a, b, c)

pyfma.fma = monkey_patch_fma

def fma_cpu_func(a, b, c, d):
    for i in range(len(a)):
        for l in range(d[i], 0, -1):
            for j in range(l):
                a[i, j] = pyfma.fma(b[i], a[i, j], (1.0 - b[i]) * a[i, j + 1]) / c[i]
    return a[:, 0]

# Using the same a, b, c, d inputs as defined above

fma_cpu_func(a.copy(), b, c, d)  # Produces array([0.67937252])

So the CPU and GPU outputs are now identical and "closer" to the higher-precision output (0.644901534295876315641), but still not accurate.
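
A rough back-of-the-envelope estimate (my own, not a rigorous bound) of why even the matching float64 results sit so far from the high-precision value: every one of the 50 updates of a[i, 0] mixes it with its neighbour using the coefficients b and (1 - b), so existing rounding error can grow by roughly (|b| + |1 - b|) / c per update, and over 50 updates that amplifies machine epsilon to order one:

growth = (abs(b[0]) + abs(1.0 - b[0])) / c[0]     # ~2.03 per update
print(growth ** 50 * np.finfo(np.float64).eps)    # ~0.5, so 4-5 decimals of true accuracy is out of reach in float64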

1 Answer

Based on your update, the difference appears to be due to the lack of fma optimization. (So my earlier comment was completely wrong; I apologize if it caused any confusion.)
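
To illustrate what contraction changes (my own sketch, emulating a single-rounded fma with mpmath rather than calling a real fma): a fused multiply-add computes b*x + t with one rounding, while plain njit code rounds the product b*x before adding.

import mpmath
mpmath.mp.dps = 40  # enough digits to hold the exact double-precision product

b0, c0 = 1.533940813369776279, 1.018336794718317062
x, y = 1.150962188573305234, 1.135924188549360281    # a[0, 0] and a[0, 1]

t = (1.0 - b0) * y                                   # addend, rounded to float64 in both cases
plain = (b0 * x + t) / c0                            # product and sum rounded separately
fused = float(mpmath.mpf(b0) * mpmath.mpf(x) + mpmath.mpf(t)) / c0   # single rounding, like an fma
print(plain - fused)  # at most ~1 ulp (possibly 0.0 for this particular step); the recurrence amplifies such differences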

You can also enable this in the jit-compiled function with the fastmath option:

@njit(fastmath={"contract"})
def cpu_func_fm(a, b, c, d):
    for i in range(len(a)):
        for l in range(d[i], 0, -1):
            for j in range(l):
                a[i, j] = (b[i] * a[i, j] + (1.0 - b[i]) * a[i, j + 1]) / c[i]

    return a[:, 0]

Results:

cpu_func    : 0.7420421437852579
gpu_func    : 0.6793725228584361
fma_cpu_func: 0.6793725228584361
cpu_func_fm : 0.6793725228584361