SEH 是否使存储/写入更昂贵？

Question

我正在比较两种不同的缓冲区检查方法。

第一种方法是检查每次迭代是否已到达缓冲区的末尾，第二种方法是使用保护页来检测末尾。

虽然理论上保护页方法应该更快，但事实并非如此。

对于存储（store，即写入内存）操作，两者之间的差异更为明显：保护页方法花费的时间约为缓冲区检查方法的 5 倍。

发生这种情况的原因是什么？

我机器上的基准测试结果（10 次试验的平均值）：

branch + load:
58947659.3
branch + store:
15234306.6
seh + load:
84706608.6
seh + store:
84822314.3

我的代码：

#include <Windows.h>
#include <stdio.h>

/* Size in bytes of the benchmark buffer (16 GiB). Parenthesized so the macro
   expands as a single value inside larger expressions such as x / BUFFER_SIZE. */
#define BUFFER_SIZE (16ull * 1024ull * 1024ull * 1024ull)

/* Defined: the benchmark loop loads each byte and sums it.
   Remove this definition to make the loop store (increment each byte) instead. */
#define LOAD

/* Defined: the benchmark loop checks the end pointer on every iteration.
   Remove this definition to detect the end with a guard page and SEH instead. */
#define USE_BRANCH

/*
 * Benchmark: touch every byte of a BUFFER_SIZE-byte committed region exactly
 * once and print the elapsed QueryPerformanceCounter ticks to the console.
 *
 * Build-time switches:
 *   LOAD        - if defined, each byte is read and added to `total`;
 *                 otherwise each byte is incremented in place (a store).
 *   USE_BRANCH  - if defined, the loop tests the end pointer every iteration;
 *                 otherwise the last page is marked PAGE_GUARD and the loop
 *                 runs unconditionally until the guard-page exception fires.
 *
 * Returns `total` narrowed to int (presumably so the compiler cannot discard
 * the loads), or 0 if the buffer could not be allocated or protected.
 */
int main(void)
{
    HANDLE consoleHandle = GetStdHandle(STD_OUTPUT_HANDLE);

    char* memory = VirtualAlloc(NULL, BUFFER_SIZE, MEM_COMMIT | MEM_RESERVE, PAGE_READWRITE);
    if (memory == NULL)
        return 0;

    unsigned long long total = 0;
    char* memoryStart = memory;
    LARGE_INTEGER perfcountBefore;
#ifdef USE_BRANCH

    QueryPerformanceCounter(&perfcountBefore);

    while (memory < memoryStart + BUFFER_SIZE)
    {
#ifdef LOAD
        total += *memory;
#else
        (*memory)++;
#endif
        memory++;
    }

#else
    SYSTEM_INFO si;
    GetSystemInfo(&si);
    DWORD garbage;
    /* Without the guard page the unconditional loop below would run off the
       end of the buffer, so a failed protection change aborts the run. */
    if (!VirtualProtect(memory + BUFFER_SIZE - si.dwPageSize, si.dwPageSize, PAGE_READWRITE | PAGE_GUARD, &garbage))
        return 0;

    QueryPerformanceCounter(&perfcountBefore);
    __try
    {
        while (1)
        {
#ifdef LOAD
            total += *memory;
#else
            (*memory)++;
#endif
            memory++;
        }
    }
    /* Handle only the guard-page hit; any other exception (for example a
       genuine access violation) is a real fault and must keep propagating. */
    __except (GetExceptionCode() == EXCEPTION_GUARD_PAGE ? EXCEPTION_EXECUTE_HANDLER : EXCEPTION_CONTINUE_SEARCH)
    {
        /* PAGE_GUARD is one-shot: the system clears it when the exception is
           raised, so the final page can now be processed with a checked loop,
           resuming at the byte whose access faulted. */
        while (memory < memoryStart + BUFFER_SIZE)
        {
#ifdef LOAD
            total += *memory;
#else
            (*memory)++;
#endif
            memory++;
        }
    }
#endif

    LARGE_INTEGER perfcountAfter;
    QueryPerformanceCounter(&perfcountAfter);

    /* "operation took " + up to 20 digits + '\n' + NUL fits comfortably. */
    char buffer[64];
    /* QuadPart is a 64-bit LONGLONG, so the conversion needs the ll length
       modifier; plain %i would read it as a 32-bit int. */
    int stringlength = _snprintf_s(buffer, sizeof buffer, _TRUNCATE, "operation took %lld\n", perfcountAfter.QuadPart - perfcountBefore.QuadPart);
    /* _snprintf_s returns a negative value on truncation or error; never pass
       that on as a character count. */
    if (stringlength > 0)
        WriteConsoleA(consoleHandle, buffer, (DWORD)stringlength, NULL, NULL);

    return (int)total;
}

Answer 1

与所有微优化问题一样，您需要查看编译器生成的代码。对于“正常”（带分支检查的）循环，您会得到：

; Branch-checked loop as emitted by the compiler: one byte load per iteration,
; with the pointer, accumulator, counter and limit all held in registers.
$LL2@loop:
    movsx   rdx, BYTE PTR [rcx]         ; load the byte at [rcx], sign-extended to 64 bits
    lea     rcx, QWORD PTR [rcx+1]      ; advance the pointer in rcx by one byte
    add     r9, rdx                     ; add the loaded byte to the running sum in r9
    inc     r8                          ; increment the loop counter
    cmp     r8, r10                     ; compare the counter with the limit in r10
    jb      SHORT $LL2@loop             ; repeat while counter < limit (unsigned compare)

对于您的 SEH 循环：

; SEH (guard page) loop as emitted by the compiler: no end check, but both
; locals are written back to their stack slots on every iteration.
$LL13@loop:
    movsx   rax, BYTE PTR [rcx]         ; load the byte at [rcx], sign-extended to 64 bits
    add     rdx, rax                    ; add the loaded byte to the running sum in rdx
    mov     QWORD PTR total$1[rsp], rdx ; extra store: spill the sum to total's stack slot
    inc     rcx                         ; advance the pointer by one byte
    mov     QWORD PTR memory$[rsp], rcx ; extra store: spill the pointer to memory's stack slot
    jmp     SHORT $LL13@loop            ; unconditional loop; only an exception leaves it

使用 `__try` 块会带来一个副作用：编译器会假定所有内存访问都有副作用，因此局部变量 `total` 和 `memory` 不会被优化到寄存器中，每次迭代会额外多出两次内存访问。这其实有些道理；如果编译器不假定存在副作用，它只会看到一个无限循环，并把所有内容都删除。

SEH 是否使存储/写入更昂贵？

问题描述投票：0回答：1

1个回答

最新问题

SEH 是否使存储/写入更昂贵？

问题描述 投票：0回答：1

1个回答

最新问题

问题描述投票：0回答：1