C++ 到 C# 内存对齐问题

Question

在 C# 中，我将 RGB 图像数据存储在 byte[] 数组 ([r, g, b, r, g, b, ...]) 中，并尝试将其转换为灰度。我在 C#（使用指针）和 C++（使用 SIMD 指令和 P/Invoke）中实现此灰度转换，以比较在 C# 中使用 C++ 时的性能增益。

C# 代码工作正常，保存图像没有问题，但当我使用 C++ 版本时，保存的灰度图像显示为随机噪声。这是我的主要 C# 代码：

static void Main(string[] args)
{
    DllLoader.LoadLibrary("ImageProcessingLib.dll");
    double totalElapsedMicrosecondsCpp = 0;
    double totalElapsedMicrosecondsCS = 0;

    // Load your image
    Bitmap bitmap = new Bitmap("nature.jpeg");

    // Convert the image to byte array
    byte[] rgbBytes = ConvertBitmapToByteArray(bitmap);
    byte[] rgbBytesCpp = ConvertBitmapToByteArray(bitmap);

    int runs = 2;
    for (int i = 0; i < runs; i++)
    {
        Stopwatch sw = Stopwatch.StartNew();

        // Call the P/Invoke function for C++ implementation
        fixed (byte* ptr = rgbBytesCpp)
        {
            DllLoader.ConvertRgbToGrayscale(ptr, rgbBytesCpp.Length);
        }

        sw.Stop();
        totalElapsedMicrosecondsCpp += sw.Elapsed.TotalMilliseconds * 1000;
    }

    for (int i = 0; i < runs; i++)
    {
        Stopwatch sw = Stopwatch.StartNew();

        // C# grayscale function
        ConvertRgbToGrayscale(rgbBytes);

        sw.Stop();
        totalElapsedMicrosecondsCS += sw.Elapsed.TotalMilliseconds * 1000;
    }

    double averageElapsedMicrosecondsPInvoke = totalElapsedMicrosecondsCpp / runs;
    double averageElapsedMicrosecondsCSharp = totalElapsedMicrosecondsCS / runs;

    Console.WriteLine("Average P/Invoke Grayscale Time: {0} microseconds", averageElapsedMicrosecondsPInvoke);
    Console.WriteLine("Average Native C# Grayscale Time: {0} microseconds", averageElapsedMicrosecondsCSharp);

    SaveGrayscaleImage(rgbBytesCpp, bitmap.Width, bitmap.Height, "Cpp.jpg");
    SaveGrayscaleImage(rgbBytes, bitmap.Width, bitmap.Height, "C#.jpg");

    Console.ReadLine();
}

public unsafe class DllLoader
{
    // Static constructor to load the DLL without invoking any functions from it
    static DllLoader()
    {
        LoadLibrary("ImageProcessingLib.dll");
    }

    [DllImport("kernel32.dll", CharSet = CharSet.Auto)]
    public static extern IntPtr LoadLibrary(string lpFileName);

    // P/Invoke to call the C++ ConvertRgbToGrayscale function
    [DllImport("ImageProcessingLib.dll", CallingConvention = CallingConvention.Cdecl)]
    public static extern byte* ConvertRgbToGrayscale(byte* pImage, int length);

    
}

我在 C++ 函数中同时使用了 SIMD 和非 SIMD 方法，但 SIMD 方法会导致内存问题。这是 SIMD 代码：

#include <immintrin.h>
#include <cstdint>

extern "C" __declspec(dllexport) void ConvertRgbToGrayscaleSIMD(uint8_t* rgbArray, size_t length) {
    
    // Ensure the array is aligned to 32-byte boundary (for AVX)
    //__m256i* alignedArray = reinterpret_cast<__m256i*>(_aligned_malloc(length, 32));

    // Copy data to aligned memory
    //memcpy(alignedArray, rgbArray, length);

    
    // Grayscale coefficients approximated to integers: R = 0.3, G = 0.59, B = 0.11
    const uint8_t coeffR = 77;  // 0.3 * 256 ≈ 77
    const uint8_t coeffG = 150; // 0.59 * 256 ≈ 150
    const uint8_t coeffB = 29;  // 0.11 * 256 ≈ 29

    // Load the grayscale coefficients into AVX registers (broadcast to 8 elements)
    __m256i coeff_r = _mm256_set1_epi8(coeffR);
    __m256i coeff_g = _mm256_set1_epi8(coeffG);
    __m256i coeff_b = _mm256_set1_epi8(coeffB);

    size_t i = 0;
    // Process 8 pixels (24 bytes) at once
    for (; i + 23 < length; i += 24) {  // 8 pixels (24 bytes) per loop
        // Load 24 bytes (8 pixels, RGBRGBRGB...)
        __m256i rgb1 = _mm256_loadu_si256(reinterpret_cast<const __m256i*>(rgbArray + i));

        // Extract the R, G, B channels
        __m256i r = _mm256_and_si256(rgb1, _mm256_set1_epi8(0xFF));               // R channel (bytes 0, 3, 6, 9, 12, 15, 18, 21)
        __m256i g = _mm256_and_si256(_mm256_srli_epi32(rgb1, 8), _mm256_set1_epi8(0xFF)); // G channel (bytes 1, 4, 7, 10, 13, 16, 19, 22)
        __m256i b = _mm256_and_si256(_mm256_srli_epi32(rgb1, 16), _mm256_set1_epi8(0xFF)); // B channel (bytes 2, 5, 8, 11, 14, 17, 20, 23)

        // Calculate grayscale
        __m256i gray_r = _mm256_mullo_epi16(r, coeff_r); // R * coeffR
        __m256i gray_g = _mm256_mullo_epi16(g, coeff_g); // G * coeffG
        __m256i gray_b = _mm256_mullo_epi16(b, coeff_b); // B * coeffB

        // Add the values (R * coeffR + G * coeffG + B * coeffB)
        __m256i gray = _mm256_add_epi8(
            _mm256_add_epi8(gray_r, gray_g),
            gray_b
        );

        // Right shift by 8 to normalize the grayscale values
        gray = _mm256_srli_epi16(gray, 8);

        // Duplicate grayscale values to R, G, B channels
        __m256i gray_rgb = _mm256_packus_epi16(gray, gray);

        // Store the resulting grayscale values back into the rgbArray
        _mm256_storeu_si256(reinterpret_cast<__m256i*>(rgbArray + i), gray_rgb);
    }

    // Handle any leftover pixels that don't fit into full 8-pixel chunks
    for (; i + 2 < length; i += 3) {
        uint8_t r = rgbArray[i];
        uint8_t g = rgbArray[i + 1];
        uint8_t b = rgbArray[i + 2];

        uint8_t gray = static_cast<uint8_t>((coeffR * r + coeffG * g + coeffB * b) >> 8);

        rgbArray[i] = gray;
        rgbArray[i + 1] = gray;
        rgbArray[i + 2] = gray;
    }

    // Handle any leftover pixels that don't fit into full RGB triplets (i.e., length % 3 != 0)
    size_t remainder = length % 3;
    if (remainder > 0) {
        for (size_t j = length - remainder; j < length; ++j) {
            rgbArray[j] = rgbArray[j];  // No change
        }
    }

    //memcpy(rgbArray, alignedArray, length);

    //_aligned_free(alignedArray);



}

当我取消注释对齐的内存行（_aligned_malloc 和 memcpy）时，输出图像是正确的，但它会显着降低性能。我希望避免这种内存对齐，同时仍使用 SIMD 以获得更好的性能。

我使用的是.net Framework 4.8，我当前的性能结果：

4k 图像 RGB 到灰度转换

C#：18 毫秒（工作）

C++ P/调用非 SIMD：13 毫秒（工作）

C++ P/调用 SIMD：7 毫秒（随机噪声问题）

问题：有没有办法可以对这个byte[]进行SIMD灰度转换，而不需要对齐内存？或者，是否有另一种有效的方法来处理这个问题，在保持性能的同时避免噪音问题？

Answer 1

您的 C++ SIMD 实现完全错误。

高效处理 RGB24 像素相对困难，因为所有 CPU 寄存器都具有 2 字节大小的幂，即从内存加载和存储数据时，寄存器包含不完整的像素计数。
出于同样的原因，现代图形库和硬件 API 都不支持 3 字节/像素格式，而是将每个 RGB 像素用零填充为 4 字节。

无论如何，请尝试以下版本，它应该可以满足您的需求。它假设您使用 VC++ 构建 C++ 代码，其他编译器不提供

rep movsb

和

rep stosb

指令的内在函数。

#include <stdint.h>
#include <immintrin.h>
#include <intrin.h>

namespace
{
    static const __m128i s_unpackTriplets = _mm_setr_epi8(
        0, 1, 2, -1, 3, 4, 5, -1, 6, 7, 8, -1, 9, 10, 11, -1 );

    // Load 24 bytes from memory, zero extending triplets from RGB into RGBA
    // The alpha bytes will be zeros
    inline __m256i loadRgb8( const uint8_t* rsi )
    {
        // Load 24 bytes into 2 SSE vectors, 16 and 8 bytes respectively
        const __m128i low = _mm_loadu_si128( ( const __m128i* )rsi );
        __m128i high = _mm_loadu_si64( rsi + 16 );
        // Make the high vector contain exactly 4 triplets = 12 bytes
        high = _mm_alignr_epi8( high, low, 12 );
        // Combine into AVX2 vector
        __m256i res = _mm256_setr_m128i( low, high );
        // Hope the compiler inlines this function, and moves the vbroadcasti128 outside of the loop
        const __m256i perm = _mm256_broadcastsi128_si256( s_unpackTriplets );
        // Unpack RGB24 into RGB32
        return _mm256_shuffle_epi8( res, perm );
    }

    // Greyscale coefficients approximated to integers: R = 0.3, G = 0.59, B = 0.11
    constexpr uint8_t coeffR = 77;  // 0.3 * 256 ≈ 77
    constexpr uint8_t coeffG = 150; // 0.59 * 256 ≈ 150
    constexpr uint8_t coeffB = 29;  // 0.11 * 256 ≈ 29

    // Compute vector of int32 lanes with r*coeffR + g*coeffG + b*coeffB
    inline __m256i makeGreyscale( __m256i rgba )
    {
        const __m256i lowBytesMask = _mm256_set1_epi32( 0x00FF00FF );
        __m256i rb = _mm256_and_si256( rgba, lowBytesMask );
        __m256i g = _mm256_and_si256( _mm256_srli_epi16( rgba, 8 ), lowBytesMask );

        // Scale red and blue channels, then add pairwise into int32 lanes
        constexpr int mulRbScalar = ( ( (int)coeffB ) << 16 ) | coeffR;
        const __m256i mulRb = _mm256_set1_epi32( mulRbScalar );
        rb = _mm256_madd_epi16( rb, mulRb );

        // Scale green channel
        const __m256i mulGreen = _mm256_set1_epi32( coeffG );
        g = _mm256_mullo_epi16( g, mulGreen );

        // COmpute the result in 32-bit lanes;
        // We assume alpha bytes in the input vector were zeros
        return _mm256_add_epi32( rb, g );
    }

    static const __m256i s_packTriplets = _mm256_setr_epi8(
        // Low half of the vector: e0 e0 e0 e1 e1 e1 e2 e2 e2 e3 e3 e3 0 0 0 0 
        1, 1, 1, 5, 5, 5, 9, 9, 9, 13, 13, 13, -1, -1, -1, -1,
        // High half of the vector: e1 e1 e2 e2 e2 e3 e3 e3 0 0 0 0 e0 e0 e0 e1 
        5, 5, 9, 9, 9, 13, 13, 13, -1, -1, -1, -1, 1, 1, 1, 5 );

    // Extract second byte from each int32 lane, triplicate these bytes, and store 24 bytes to memory
    inline void storeRgb8( uint8_t* rdi, __m256i gs )
    {
        // Move bytes within 16 byte lanes
        gs = _mm256_shuffle_epi8( gs, s_packTriplets );

        // Split vector into halves
        __m128i low = _mm256_castsi256_si128( gs );
        const __m128i high = _mm256_extracti128_si256( gs, 1 );
        // Insert high 4 bytes from high into low
        low = _mm_blend_epi32( low, high, 0b1000 );

        // Store 24 RGB bytes
        _mm_storeu_si128( ( __m128i* )rdi, low );
        _mm_storeu_si64( rdi + 16, high );
    }

    inline void computeGreyscale8( uint8_t* ptr )
    {
        __m256i v = loadRgb8( ptr );
        v = makeGreyscale( v );
        storeRgb8( ptr, v );
    }
}

void ConvertRgbToGrayscaleSIMD( uint8_t* ptr, size_t length )
{
    const size_t rem = length % 24;

    uint8_t* const endAligned = ptr + ( length - rem );
    for( ; ptr < endAligned; ptr += 24 )
        computeGreyscale8( ptr );

    if( rem != 0 )
    {
        // An easy way to handle remainder is using a local buffer of 24 bytes, reusing the implementation
        // Unlike memcpy / memset which are function calls and are subject to ABI conventions,
        // __movsb / __stosb don't destroy data in vector registers
        uint8_t remSpan[ 24 ];
        __movsb( remSpan, ptr, rem );
        __stosb( &remSpan[ rem ], 0, 24 - rem );

        computeGreyscale8( remSpan );

        __movsb( ptr, remSpan, rem );
    }
}

C++ 到 C# 内存对齐问题

问题描述投票：0回答：1

1个回答

最新问题

C++ 到 C# 内存对齐问题

问题描述 投票：0回答：1

1个回答

最新问题

问题描述投票：0回答：1