为什么我的 C++ 函数运行速度比 C# P/Invoke 调用快 10 倍？

Question

我有一个 C++ 函数，它使用 AVX2 内在函数来使图像变亮。当我直接在 C++ 中测量性能时，处理分辨率为 3840 x 240 的图像大约需要 500 微秒。但是，当我使用 P/Invoke 从 C# 调用相同的函数时，大约需要 4 毫秒，明显慢得多。

对于 3840 x 2160 图像，本机 C++ 大约需要 1.5 毫秒，而 P/Invoke 大约需要 4.5 毫秒。

对于 10000 x 4000 图像，本机 C++ 大约需要 3 毫秒，而 P/Invoke 大约需要 8 毫秒。

这是我的设置：

C++：使用 AVX2 处理图像数据的本机函数。 C#：使用P/Invoke调用该函数，直接通过引用传递图像数据。

C#主程序：

/// <summary>
/// Benchmark entry point: fills a width x height single-channel image with
/// random bytes and reports how long one call to
/// ProcessorCSharp.BrightenImage takes, in microseconds.
/// </summary>
static void Main(string[] args)
{
    int width = 3840;
    int height = 240;
    byte[] image = new byte[width * height];

    // Fill the image array with random brightness values between 0 and 255
    new Random().NextBytes(image);

    byte brightness = 30;

    // Warm-up call on a throw-away copy. The very first P/Invoke call pays
    // one-time costs (loading the native library, preparing the interop stub,
    // JIT-compiling BrightenImage) that have nothing to do with the
    // steady-state cost of the function and would otherwise dominate a
    // sub-millisecond measurement.
    ProcessorCSharp.BrightenImage((byte[])image.Clone(), brightness);

    // Measure C# processing time
    Stopwatch sw = Stopwatch.StartNew();
    ProcessorCSharp.BrightenImage(image, brightness);
    sw.Stop();

    Console.WriteLine("C# Time: {0} microseconds", sw.Elapsed.TotalMilliseconds * 1000);
}

C# 包装类：

/// <summary>
/// Managed wrapper around the native brightenImageSIMD export of
/// ImageProcessingLib.dll.
/// </summary>
public class ProcessorCSharp
{
    // The native signature is
    //   void brightenImageSIMD(uint8_t* image, size_t size, uint8_t brightness)
    // size_t is pointer-sized, so it is declared as UIntPtr here. Declaring it
    // as a 32-bit int does not match the native parameter width in a 64-bit
    // process.
    [DllImport("ImageProcessingLib.dll", CallingConvention = CallingConvention.Cdecl)]
    private static extern void brightenImageSIMD(IntPtr image, UIntPtr size, byte brightness);

    /// <summary>
    /// Brightens every byte of <paramref name="image"/> in place by
    /// <paramref name="brightness"/> using the native implementation.
    /// The array is pinned for the duration of the call, so no copy is made.
    /// </summary>
    /// <param name="image">Pixel buffer to modify in place.</param>
    /// <param name="brightness">Amount added to each byte.</param>
    /// <exception cref="ArgumentNullException"><paramref name="image"/> is null.</exception>
    public static unsafe void BrightenImage(byte[] image, byte brightness)
    {
        if (image == null)
        {
            throw new ArgumentNullException(nameof(image));
        }

        // Nothing to do; also avoids handing a null pointer to native code
        // (fixed yields null for a zero-length array).
        if (image.Length == 0)
        {
            return;
        }

        fixed (byte* p = image)
        {
            brightenImageSIMD((IntPtr)p, (UIntPtr)(uint)image.Length, brightness);
        }
    }
}

C++ 函数：

#include <immintrin.h>  // AVX2 intrinsics
#include <algorithm>    // For std::min
#include <chrono>       // For std::chrono clocks and durations
#include <cstddef>      // For size_t
#include <cstdint>      // For uint8_t
#include <iostream>     // For std::cout
#include <random>       // For std::random_device, std::mt19937
#include <utility>      // For std::forward
#include <vector>

// Export decoration for the shared-library build. (The asker's stand-alone
// C++ benchmark does not carry the export decoration.)
#if defined(_WIN32)
#  define BRIGHTEN_EXPORT __declspec(dllexport)
#else
#  define BRIGHTEN_EXPORT __attribute__((visibility("default")))
#endif

// GCC/Clang refuse to inline AVX2 intrinsics into a function that is not
// compiled for AVX2; enable it per function when the translation unit is not
// already built with -mavx2. MSVC needs no such annotation.
#if defined(__GNUC__) && !defined(__AVX2__)
#  define BRIGHTEN_AVX2 __attribute__((target("avx2")))
#else
#  define BRIGHTEN_AVX2
#endif

/// Adds `brightness` to every byte of `image` in place, saturating at 255.
///
/// @param image       Pointer to `size` writable bytes. No alignment is
///                    required (unaligned loads/stores are used).
/// @param size        Number of bytes in the buffer; 0 is a no-op.
/// @param brightness  Amount added to each byte.
///
/// The caller is responsible for running this only on CPUs with AVX2.
extern "C" BRIGHTEN_EXPORT BRIGHTEN_AVX2
void brightenImageSIMD(uint8_t* image, size_t size, uint8_t brightness) {
    constexpr size_t kBytesPerVector = 32;  // one 256-bit register
    const __m256i brightnessVector =
        _mm256_set1_epi8(static_cast<char>(brightness));

    size_t i = 0;

    // Vector body: 32 pixels per iteration. _mm256_adds_epu8 is an unsigned
    // saturating add, so the result is already clamped to 255 and no separate
    // min against 255 is needed.
    for (; i + kBytesPerVector <= size; i += kBytesPerVector) {
        __m256i* const block = reinterpret_cast<__m256i*>(image + i);
        const __m256i pixels = _mm256_loadu_si256(block);
        _mm256_storeu_si256(block, _mm256_adds_epu8(pixels, brightnessVector));
    }

    // Scalar tail for the last (size % 32) bytes; the sum is computed in int
    // and clamped before narrowing back to a byte.
    for (; i < size; ++i) {
        image[i] = static_cast<uint8_t>(std::min(image[i] + brightness, 255));
    }
}

// Helper function to measure execution time.
//
// Invokes func(args...) exactly once and returns the elapsed wall time in
// whole microseconds. std::chrono::steady_clock is used because it is
// guaranteed monotonic; high_resolution_clock may be an alias for the
// adjustable system clock, which can make interval measurements jump.
// Any value returned by func is discarded.
template <typename Func, typename... Args>
long long measureExecutionTime(Func func, Args&&... args) {
    using Clock = std::chrono::steady_clock;

    const auto start = Clock::now();
    func(std::forward<Args>(args)...);
    const auto elapsed = Clock::now() - start;

    return std::chrono::duration_cast<std::chrono::microseconds>(elapsed).count();
}

// Native benchmark: fills a width x height single-channel image with random
// bytes, brightens a copy with brightenImageSIMD and prints the elapsed time
// of that single call in microseconds.
int main() {
    const int width = 3840;
    const int height = 2160;
    const uint8_t brightnessIncrease = 30;

    // Compute the pixel count in size_t so larger resolutions cannot overflow int.
    std::vector<uint8_t> image(static_cast<size_t>(width) * height);

    // Set up random number generation
    std::random_device rd;  // Seed for the random number engine
    std::mt19937 gen(rd()); // Standard mersenne_twister_engine
    std::uniform_int_distribution<> dis(0, 255); // Range from 0 to 255 for 8-bit brightness levels

    // Fill the image with random values
    for (auto& pixel : image) {
        pixel = static_cast<uint8_t>(dis(gen));
    }

    // Measure performance of SIMD method on a copy so the source image stays intact.
    auto imageCopy3 = image;
    // brightenImageSIMD takes (pointer, byte count, brightness); the vector
    // itself cannot be passed, so hand over its buffer and size explicitly.
    const long long timeSIMD = measureExecutionTime(
        brightenImageSIMD, imageCopy3.data(), imageCopy3.size(), brightnessIncrease);

    std::cout << "SIMD (AVX2) method time: " << timeSIMD << " microseconds" << '\n';

    return 0;
}

问题：

我尝试在 C# 中调用 C++ 封装函数的主要原因是：我的实时图像处理应用程序需要比纯 C# 实现更好的性能。
例如，仅在 C++ 中，brightenImageSIMD 大约需要 500 微秒。但当我从 C# 调用它时，始终需要大约 4 毫秒。我尝试过使用带有固定指针的不安全代码来防止数组复制，但性能差异仍然存在。

问题：

为什么原生 C++ 执行和 C# P/Invoke 调用之间存在如此大的性能差距？我该如何使 C# 调用的版本更接近原生 C++ 的性能？像 OpenCvSharp 这样的库如何通过 P/Invoke 实现出色的性能？ OpenCvSharp 通过 P/Invoke 调用本机 OpenCV 函数，并且仍然保持非常高的性能，所以我很好奇该库中是否有一些我缺少的技术可以应用到这里。

Answer 1

为什么原生 C++ 执行和 C# P/Invoke 调用之间存在如此大的性能差距？

一般而言，在 SIMD 方面 C++ 的性能优于 C#，这自然可以解释这里的耗时差异。事实上，在处理相同的代码片段时，两种语言的编译器行为并不相同：

C# JIT ASM AVX：

xor         edx,edx                          ; initialise edx (loop counter i) to zero

; LOOP_START
mov         ecx,dword ptr [rsi+8]            ; load vx.Length into ecx
cmp         edx,ecx                          ; if i >= vx.Length
jae         000007FE95B958E7                 ; throw IndexOutOfRangeException
lea         r8d,[rdx+3]                      ; load i+3 into r8d
cmp         r8d,ecx                          ; if i+3 >= vx.Length
jae         000007FE95B958E7                 ; throw IndexOutOfRangeException
movups      xmm0,xmmword ptr [rsi+rdx*4+10h] ; load vx[i..i+3] into xmm0

mov         ecx,dword ptr [rdi+8]            ; load vy.Length into ecx
cmp         edx,ecx                          ; if i >= vy.Length
jae         000007FE95B958E7                 ; throw IndexOutOfRangeException
cmp         r8d,ecx                          ; if i+3 >= vy.Length
jae         000007FE95B958E7                 ; throw IndexOutOfRangeException
movups      xmm1,xmmword ptr [rdi+rdx*4+10h] ; load vy[i..i+3] into xmm1

paddd       xmm0,xmm1                        ; perform SIMD addition of xmm0 and xmm1
mov         ecx,dword ptr [rax+8]            ; load result.Length into ecx
cmp         edx,ecx                          ; if i >= result.Length
jae         000007FE95B958EC                 ; throw ArgumentException
cmp         r8d,ecx                          ; if i+3 >= result.Length
jae         000007FE95B958F1                 ; throw ArgumentException

movups      xmmword ptr [rax+rdx*4+10h],xmm0 ; move result out of xmm0 into the result array

add         edx,4                            ; increment loop counter, i, by 4
cmp         edx,3E8h                         ; if i < 1000 (0x3E8)
jl          000007FE95B9589A                 ; go back to LOOP_START

C++ MSVC2015 AVX2：

; array initialisation and loop setup omitted...

; SIMD_LOOP_START
vmovdqu     ymm1,ymmword ptr [rax-20h]           ; load 8 ints (256 bits) from x into 256-bit register ymm1
vpaddd      ymm1,ymm1,ymmword ptr [rcx+rax-20h]  ; add 8 ints from y to those in ymm1 and store result back in ymm1
vmovdqu     ymmword ptr [r8+rax-20h],ymm1        ; move result out of ymm1 into the result array
vmovdqu     ymm2,ymmword ptr [rax]               ; load the next 8 ints from x into ymm2
vpaddd      ymm1,ymm2,ymmword ptr [rcx+rax]      ; add the next 8 ints from y to those in ymm2 and store the result in ymm1
vmovdqu     ymmword ptr [r8+rax],ymm1            ; move the result out of ymm1 into the result array
lea         rax,[rax+40h]                        ; increment the array indexer by 16 ints (64 bytes)
sub         r9,1                                 ; decrement the loop counter
jne         main+120h                            ; if loop counter != 0 go back to SIMD_LOOP_START

; SIMPLE_LOOP_START
mov         ecx,dword ptr [rbx+rax]              ; load one int from x into ecx
add         ecx,dword ptr [rax]                  ; add one int from y to the value in ecx and store the result in ecx
mov         dword ptr [rdx+rax],ecx              ; move the result out of ecx into the result array
lea         rax,[rax+4]                          ; increment the array indexer by one int (4 bytes)
sub         rdi,1                                ; decrement the loop counter
jne         main+160h                            ; if loop counter != 0 go back to SIMPLE_LOOP_START

由此得出的结论是，C++ 编译器能够在必要时进行“自动向量化”，从而节省大量执行时间。

如何才能让C#调用的版本更接近原生C++的性能？

主要要注意的一点是：向量化总是比标量更快。使用向量化的字节结构，处理速度可以提升 1.9 到 3.5 倍。您在 C++ 中使用了它（std::vector<uint8_t> image(width * height)），而在 C# 中没有使用（byte[] image = new byte[width * height];），这可能会产生影响。向量化之所以更省时，是因为 AVX2 指令可以在一个时钟周期内操作 8 或 16 字节，因此可以并行处理；而对于标量容器，处理器按顺序对每个数据元素执行一条指令。

像 OpenCvSharp 这样的库如何通过 P/Invoke 实现出色的性能？

OpenCV 通常使用直接访问内存指针的 Mat 对象来避免使用 byte[]，从而最大限度地减少封送（marshalling）需求。

结论

我强烈建议使用向量而不是标量容器来节省时间。请注意，您也可以使用其他“原始”内存存储方式，包括内存池和原始指针；但为了保持简单（KISS 原则），使用向量即可。另外，C++ 始终比 C# 更快，但您可以让两者的性能接近。

为什么我的 C++ 函数运行速度比 C# P/Invoke 调用快 10 倍？

问题描述投票：0回答：1

C#主程序：

C# 包装类：

C++ 函数：

问题：

问题：

1个回答

最新问题

为什么我的 C++ 函数运行速度比 C# P/Invoke 调用快 10 倍？

问题描述 投票：0回答：1

C#主程序：

C# 包装类：

C++ 函数：

问题：

问题：

1个回答

最新问题

问题描述投票：0回答：1