在 C# 中,我将 RGB 图像数据存储在 byte[] 数组 ([r, g, b, r, g, b, ...]) 中,并尝试将其转换为灰度。我在 C#(使用指针)和 C++(使用 SIMD 指令和 P/Invoke)中实现此灰度转换,以比较在 C# 中使用 C++ 时的性能增益。
C# 代码工作正常,保存图像没有问题,但当我使用 C++ 版本时,保存的灰度图像显示为随机噪声。这是我的主要 C# 代码:
static void Main(string[] args)
{
DllLoader.LoadLibrary("ImageProcessingLib.dll");
double totalElapsedMicrosecondsCpp = 0;
double totalElapsedMicrosecondsCS = 0;
// Load your image
Bitmap bitmap = new Bitmap("nature.jpeg");
// Convert the image to byte array
byte[] rgbBytes = ConvertBitmapToByteArray(bitmap);
byte[] rgbBytesCpp = ConvertBitmapToByteArray(bitmap);
int runs = 2;
for (int i = 0; i < runs; i++)
{
Stopwatch sw = Stopwatch.StartNew();
// Call the P/Invoke function for C++ implementation
fixed (byte* ptr = rgbBytesCpp)
{
DllLoader.ConvertRgbToGrayscale(ptr, rgbBytesCpp.Length);
}
sw.Stop();
totalElapsedMicrosecondsCpp += sw.Elapsed.TotalMilliseconds * 1000;
}
for (int i = 0; i < runs; i++)
{
Stopwatch sw = Stopwatch.StartNew();
// C# grayscale function
ConvertRgbToGrayscale(rgbBytes);
sw.Stop();
totalElapsedMicrosecondsCS += sw.Elapsed.TotalMilliseconds * 1000;
}
double averageElapsedMicrosecondsPInvoke = totalElapsedMicrosecondsCpp / runs;
double averageElapsedMicrosecondsCSharp = totalElapsedMicrosecondsCS / runs;
Console.WriteLine("Average P/Invoke Grayscale Time: {0} microseconds", averageElapsedMicrosecondsPInvoke);
Console.WriteLine("Average Native C# Grayscale Time: {0} microseconds", averageElapsedMicrosecondsCSharp);
SaveGrayscaleImage(rgbBytesCpp, bitmap.Width, bitmap.Height, "Cpp.jpg");
SaveGrayscaleImage(rgbBytes, bitmap.Width, bitmap.Height, "C#.jpg");
Console.ReadLine();
}
public unsafe class DllLoader
{
// Static constructor to load the DLL without invoking any functions from it
static DllLoader()
{
LoadLibrary("ImageProcessingLib.dll");
}
[DllImport("kernel32.dll", CharSet = CharSet.Auto)]
public static extern IntPtr LoadLibrary(string lpFileName);
// P/Invoke to call the C++ ConvertRgbToGrayscale function
[DllImport("ImageProcessingLib.dll", CallingConvention = CallingConvention.Cdecl)]
public static extern byte* ConvertRgbToGrayscale(byte* pImage, int length);
}
我在 C++ 函数中同时使用了 SIMD 和非 SIMD 方法,但 SIMD 方法会导致内存问题。这是 SIMD 代码:
#include <immintrin.h>
#include <cstdint>
extern "C" __declspec(dllexport) void ConvertRgbToGrayscaleSIMD(uint8_t* rgbArray, size_t length) {
// Ensure the array is aligned to 32-byte boundary (for AVX)
//__m256i* alignedArray = reinterpret_cast<__m256i*>(_aligned_malloc(length, 32));
// Copy data to aligned memory
//memcpy(alignedArray, rgbArray, length);
// Grayscale coefficients approximated to integers: R = 0.3, G = 0.59, B = 0.11
const uint8_t coeffR = 77; // 0.3 * 256 ≈ 77
const uint8_t coeffG = 150; // 0.59 * 256 ≈ 150
const uint8_t coeffB = 29; // 0.11 * 256 ≈ 29
// Load the grayscale coefficients into AVX registers (broadcast to 8 elements)
__m256i coeff_r = _mm256_set1_epi8(coeffR);
__m256i coeff_g = _mm256_set1_epi8(coeffG);
__m256i coeff_b = _mm256_set1_epi8(coeffB);
size_t i = 0;
// Process 8 pixels (24 bytes) at once
for (; i + 23 < length; i += 24) { // 8 pixels (24 bytes) per loop
// Load 24 bytes (8 pixels, RGBRGBRGB...)
__m256i rgb1 = _mm256_loadu_si256(reinterpret_cast<const __m256i*>(rgbArray + i));
// Extract the R, G, B channels
__m256i r = _mm256_and_si256(rgb1, _mm256_set1_epi8(0xFF)); // R channel (bytes 0, 3, 6, 9, 12, 15, 18, 21)
__m256i g = _mm256_and_si256(_mm256_srli_epi32(rgb1, 8), _mm256_set1_epi8(0xFF)); // G channel (bytes 1, 4, 7, 10, 13, 16, 19, 22)
__m256i b = _mm256_and_si256(_mm256_srli_epi32(rgb1, 16), _mm256_set1_epi8(0xFF)); // B channel (bytes 2, 5, 8, 11, 14, 17, 20, 23)
// Calculate grayscale
__m256i gray_r = _mm256_mullo_epi16(r, coeff_r); // R * coeffR
__m256i gray_g = _mm256_mullo_epi16(g, coeff_g); // G * coeffG
__m256i gray_b = _mm256_mullo_epi16(b, coeff_b); // B * coeffB
// Add the values (R * coeffR + G * coeffG + B * coeffB)
__m256i gray = _mm256_add_epi8(
_mm256_add_epi8(gray_r, gray_g),
gray_b
);
// Right shift by 8 to normalize the grayscale values
gray = _mm256_srli_epi16(gray, 8);
// Duplicate grayscale values to R, G, B channels
__m256i gray_rgb = _mm256_packus_epi16(gray, gray);
// Store the resulting grayscale values back into the rgbArray
_mm256_storeu_si256(reinterpret_cast<__m256i*>(rgbArray + i), gray_rgb);
}
// Handle any leftover pixels that don't fit into full 8-pixel chunks
for (; i + 2 < length; i += 3) {
uint8_t r = rgbArray[i];
uint8_t g = rgbArray[i + 1];
uint8_t b = rgbArray[i + 2];
uint8_t gray = static_cast<uint8_t>((coeffR * r + coeffG * g + coeffB * b) >> 8);
rgbArray[i] = gray;
rgbArray[i + 1] = gray;
rgbArray[i + 2] = gray;
}
// Handle any leftover pixels that don't fit into full RGB triplets (i.e., length % 3 != 0)
size_t remainder = length % 3;
if (remainder > 0) {
for (size_t j = length - remainder; j < length; ++j) {
rgbArray[j] = rgbArray[j]; // No change
}
}
//memcpy(rgbArray, alignedArray, length);
//_aligned_free(alignedArray);
}
当我取消注释对齐的内存行(_aligned_malloc 和 memcpy)时,输出图像是正确的,但它会显着降低性能。我希望避免这种内存对齐,同时仍使用 SIMD 以获得更好的性能。
我使用的是.net Framework 4.8,我当前的性能结果:
4k 图像 RGB 到灰度转换
C#:18 毫秒(工作)
C++ P/调用非 SIMD:13 毫秒(工作)
C++ P/调用 SIMD:7 毫秒(随机噪声问题)
问题:有没有办法可以对这个byte[]进行SIMD灰度转换,而不需要对齐内存?或者,是否有另一种有效的方法来处理这个问题,在保持性能的同时避免噪音问题?
您的 C++ SIMD 实现完全错误。
高效处理 RGB24 像素相对困难,因为所有 CPU 寄存器都具有 2 字节大小的幂,即从内存加载和存储数据时,寄存器包含不完整的像素计数。
出于同样的原因,现代图形库和硬件 API 都不支持 3 字节/像素格式,而是将每个 RGB 像素用零填充为 4 字节。
无论如何,请尝试以下版本,它应该可以满足您的需求。它假设您使用 VC++ 构建 C++ 代码,其他编译器不提供
rep movsb
和 rep stosb
指令的内在函数。
#include <stdint.h>
#include <immintrin.h>
#include <intrin.h>
namespace
{
static const __m128i s_unpackTriplets = _mm_setr_epi8(
0, 1, 2, -1, 3, 4, 5, -1, 6, 7, 8, -1, 9, 10, 11, -1 );
// Load 24 bytes from memory, zero extending triplets from RGB into RGBA
// The alpha bytes will be zeros
inline __m256i loadRgb8( const uint8_t* rsi )
{
// Load 24 bytes into 2 SSE vectors, 16 and 8 bytes respectively
const __m128i low = _mm_loadu_si128( ( const __m128i* )rsi );
__m128i high = _mm_loadu_si64( rsi + 16 );
// Make the high vector contain exactly 4 triplets = 12 bytes
high = _mm_alignr_epi8( high, low, 12 );
// Combine into AVX2 vector
__m256i res = _mm256_setr_m128i( low, high );
// Hope the compiler inlines this function, and moves the vbroadcasti128 outside of the loop
const __m256i perm = _mm256_broadcastsi128_si256( s_unpackTriplets );
// Unpack RGB24 into RGB32
return _mm256_shuffle_epi8( res, perm );
}
// Greyscale coefficients approximated to integers: R = 0.3, G = 0.59, B = 0.11
constexpr uint8_t coeffR = 77; // 0.3 * 256 ≈ 77
constexpr uint8_t coeffG = 150; // 0.59 * 256 ≈ 150
constexpr uint8_t coeffB = 29; // 0.11 * 256 ≈ 29
// Compute vector of int32 lanes with r*coeffR + g*coeffG + b*coeffB
inline __m256i makeGreyscale( __m256i rgba )
{
const __m256i lowBytesMask = _mm256_set1_epi32( 0x00FF00FF );
__m256i rb = _mm256_and_si256( rgba, lowBytesMask );
__m256i g = _mm256_and_si256( _mm256_srli_epi16( rgba, 8 ), lowBytesMask );
// Scale red and blue channels, then add pairwise into int32 lanes
constexpr int mulRbScalar = ( ( (int)coeffB ) << 16 ) | coeffR;
const __m256i mulRb = _mm256_set1_epi32( mulRbScalar );
rb = _mm256_madd_epi16( rb, mulRb );
// Scale green channel
const __m256i mulGreen = _mm256_set1_epi32( coeffG );
g = _mm256_mullo_epi16( g, mulGreen );
// COmpute the result in 32-bit lanes;
// We assume alpha bytes in the input vector were zeros
return _mm256_add_epi32( rb, g );
}
static const __m256i s_packTriplets = _mm256_setr_epi8(
// Low half of the vector: e0 e0 e0 e1 e1 e1 e2 e2 e2 e3 e3 e3 0 0 0 0
1, 1, 1, 5, 5, 5, 9, 9, 9, 13, 13, 13, -1, -1, -1, -1,
// High half of the vector: e1 e1 e2 e2 e2 e3 e3 e3 0 0 0 0 e0 e0 e0 e1
5, 5, 9, 9, 9, 13, 13, 13, -1, -1, -1, -1, 1, 1, 1, 5 );
// Extract second byte from each int32 lane, triplicate these bytes, and store 24 bytes to memory
inline void storeRgb8( uint8_t* rdi, __m256i gs )
{
// Move bytes within 16 byte lanes
gs = _mm256_shuffle_epi8( gs, s_packTriplets );
// Split vector into halves
__m128i low = _mm256_castsi256_si128( gs );
const __m128i high = _mm256_extracti128_si256( gs, 1 );
// Insert high 4 bytes from high into low
low = _mm_blend_epi32( low, high, 0b1000 );
// Store 24 RGB bytes
_mm_storeu_si128( ( __m128i* )rdi, low );
_mm_storeu_si64( rdi + 16, high );
}
inline void computeGreyscale8( uint8_t* ptr )
{
__m256i v = loadRgb8( ptr );
v = makeGreyscale( v );
storeRgb8( ptr, v );
}
}
void ConvertRgbToGrayscaleSIMD( uint8_t* ptr, size_t length )
{
const size_t rem = length % 24;
uint8_t* const endAligned = ptr + ( length - rem );
for( ; ptr < endAligned; ptr += 24 )
computeGreyscale8( ptr );
if( rem != 0 )
{
// An easy way to handle remainder is using a local buffer of 24 bytes, reusing the implementation
// Unlike memcpy / memset which are function calls and are subject to ABI conventions,
// __movsb / __stosb don't destroy data in vector registers
uint8_t remSpan[ 24 ];
__movsb( remSpan, ptr, rem );
__stosb( &remSpan[ rem ], 0, 24 - rem );
computeGreyscale8( remSpan );
__movsb( ptr, remSpan, rem );
}
}