我正在尝试测量计算机执行不同操作所需的估计周期长度,因此我将同一操作执行 10 万次并计算平均值。为了更准确一些,我使用了循环展开(loop unwinding):在每次迭代中执行 10 次基本操作,并将索引增加 10,从而减少循环本身带来的开销。
不过这些对于我的问题来说都不重要。我的问题是:编译器是否有可能识别出我在多次执行相同的操作,从而只执行一次?这是我的循环:
/* Manually unrolled timing loop: every pass repeats the same store to
 * `result` ten times, then advances `i` by LOOP_FACTOR, so the loop
 * bookkeeping (increment, compare, branch) runs once per ten stores.
 * `i`, `iterations`, `LOOP_FACTOR` and `result` are declared outside this
 * fragment; LOOP_FACTOR is presumably 10 to match the ten stores - confirm. */
for (i=0; i<iterations; i+=LOOP_FACTOR)
{
/* Ten identical assignments; nothing reads `result` between them. */
result = -1;
result = -1;
result = -1;
result = -1;
result = -1;
result = -1;
result = -1;
result = -1;
result = -1;
result = -1;
}
另外,我不知道这是否重要 - 我正在使用 Eclipse。我认为这可能很重要,因为有不同的编译器。
在没有优化的 GCC 中,它是按原样编译的:
(gdb) disas main
Dump of assembler code for function main:
0x00000000004004e4 <+0>: push rbp
0x00000000004004e5 <+1>: mov rbp,rsp
0x00000000004004e8 <+4>: mov DWORD PTR [rip+0x200482],0x0 # 0x600974 <i>
0x00000000004004f2 <+14>: jmp 0x400567 <main+131>
0x00000000004004f4 <+16>: mov DWORD PTR [rip+0x200472],0xffffffff # 0x600970 <result>
0x00000000004004fe <+26>: mov DWORD PTR [rip+0x200468],0xffffffff # 0x600970 <result>
0x0000000000400508 <+36>: mov DWORD PTR [rip+0x20045e],0xffffffff # 0x600970 <result>
0x0000000000400512 <+46>: mov DWORD PTR [rip+0x200454],0xffffffff # 0x600970 <result>
0x000000000040051c <+56>: mov DWORD PTR [rip+0x20044a],0xffffffff # 0x600970 <result>
0x0000000000400526 <+66>: mov DWORD PTR [rip+0x200440],0xffffffff # 0x600970 <result>
0x0000000000400530 <+76>: mov DWORD PTR [rip+0x200436],0xffffffff # 0x600970 <result>
0x000000000040053a <+86>: mov DWORD PTR [rip+0x20042c],0xffffffff # 0x600970 <result>
0x0000000000400544 <+96>: mov DWORD PTR [rip+0x200422],0xffffffff # 0x600970 <result>
0x000000000040054e <+106>: mov DWORD PTR [rip+0x200418],0xffffffff # 0x600970 <result>
0x0000000000400558 <+116>: mov eax,DWORD PTR [rip+0x200416] # 0x600974 <i>
0x000000000040055e <+122>: add eax,0x1
0x0000000000400561 <+125>: mov DWORD PTR [rip+0x20040d],eax # 0x600974 <i>
0x0000000000400567 <+131>: mov eax,DWORD PTR [rip+0x200407] # 0x600974 <i>
0x000000000040056d <+137>: cmp eax,0x3e7
0x0000000000400572 <+142>: jle 0x4004f4 <main+16>
0x0000000000400574 <+144>: mov eax,DWORD PTR [rip+0x2003f6] # 0x600970 <result>
0x000000000040057a <+150>: mov esi,eax
0x000000000040057c <+152>: mov edi,0x40067c
0x0000000000400581 <+157>: mov eax,0x0
0x0000000000400586 <+162>: call 0x4003e0 <printf@plt>
0x000000000040058b <+167>: pop rbp
0x000000000040058c <+168>: ret
但是如果您使用基本优化(
gcc -O
)运行,那么它会缩短为一次写入:
Dump of assembler code for function main:
0x00000000004004e4 <+0>: sub rsp,0x8
0x00000000004004e8 <+4>: mov eax,0x3e8
0x00000000004004ed <+9>: sub eax,0x1
0x00000000004004f0 <+12>: jne 0x4004ed <main+9>
0x00000000004004f2 <+14>: mov DWORD PTR [rip+0x2003fc],0xffffffff # 0x6008f8 <result>
0x00000000004004fc <+24>: mov DWORD PTR [rip+0x2003f6],0x3e8 # 0x6008fc <i>
0x0000000000400506 <+34>: mov esi,0xffffffff
0x000000000040050b <+39>: mov edi,0x40060c
0x0000000000400510 <+44>: mov eax,0x0
0x0000000000400515 <+49>: call 0x4003e0 <printf@plt>
0x000000000040051a <+54>: add rsp,0x8
0x000000000040051e <+58>: ret
我的测试代码是:
/* Number of loop iterations used by the test program. */
#define TIMES 1000
/* File-scope (global) variables: the store target and the loop counter. */
int result, i;
/* Test program: stores -1 into the global `result` ten times per pass for
 * TIMES passes, then prints the final value of `result`.
 * NOTE(review): printf is called without a visible #include <stdio.h> in
 * this snippet - confirm the header is included where this is built. */
int main() {
/* Unlike the unrolled loop in the question, `i` advances by 1 here. */
for (i=0; i<TIMES; i++)
{
result = -1;
result = -1;
result = -1;
result = -1;
result = -1;
result = -1;
result = -1;
result = -1;
result = -1;
result = -1;
}
/* Reading `result` after the loop makes its final value observable. */
printf("%d", result);
}
编译器很可能会把这段代码优化掉。如果您想分析 `= -1` 这个赋值操作本身的耗时,就应该使用 -O0 编译运行。您可能还应该生成一些不带循环的代码,以便分析单条指令。
在没有优化的情况下分析代码没有多大意义。
相反,我建议将
result
声明为 volatile
。
现在,您的代码可能会优化为:
result = -1;
结果
以下两段代码均在完全优化的情况下编译:
00401000 mov ecx,3E8h
00401005 or eax,0FFFFFFFFh
00401008 jmp wmain+10h (401010h)
0040100A lea ebx,[ebx]
00401010 sub ecx,1
{
result = -1;
00401013 mov dword ptr [result (40301Ch)],eax
result = -1;
00401018 mov dword ptr [result (40301Ch)],eax
result = -1;
0040101D mov dword ptr [result (40301Ch)],eax
result = -1;
00401022 mov dword ptr [result (40301Ch)],eax
result = -1;
00401027 mov dword ptr [result (40301Ch)],eax
result = -1;
0040102C mov dword ptr [result (40301Ch)],eax
result = -1;
00401031 mov dword ptr [result (40301Ch)],eax
result = -1;
00401036 mov dword ptr [result (40301Ch)],eax
result = -1;
0040103B mov dword ptr [result (40301Ch)],eax
result = -1;
00401040 mov dword ptr [result (40301Ch)],eax
00401045 jne wmain+10h (401010h)
}
cout << result;
00401047 mov eax,dword ptr [result (40301Ch)]
0040104C mov ecx,dword ptr [__imp_std::cout (402038h)]
00401052 push eax
00401053 call dword ptr [__imp_std::basic_ostream<char,std::char_traits<char> >::operator<< (40203Ch)]
// Second variant in the comparison: 1000 passes, each repeating the same
// store to `result` ten times, with the loop counter declared in the
// for-statement itself. `result` is declared outside this fragment
// (presumably the non-volatile version, given the single store in the
// listing that follows - confirm).
for (int i=0; i< 1000 ; i += 1)
{
result = -1;
result = -1;
result = -1;
result = -1;
result = -1;
result = -1;
result = -1;
result = -1;
result = -1;
result = -1;
}
// Stream the final value of `result` to standard output.
cout << result;
00401000 mov ecx,dword ptr [__imp_std::cout (402038h)]
00401006 push 0FFFFFFFFh
00401008 mov dword ptr [result (40301Ch)],0FFFFFFFFh
00401012 call dword ptr [__imp_std::basic_ostream<char,std::char_traits<char> >::operator<< (40203Ch)]
这取决于您的编译器的优化级别。因此,可以通过几种方式对其进行优化:
如果将
result
设为 volatile
,情况会变得有点棘手:这会“阻止”编译器假设它的值不会在循环或表达式序列之外发生变化,甚至可能足以防止循环内部的 10 条语句被合并为 1 条。
测试这一点的最佳方法是使用
objdump
或调试器之类的工具检查编译器的输出。