我编写了下面两段代码来逐个读取字符串中的字符
第一种写法使用计数器(下标):
const char str[] = "Hello World!";
for (unsigned short i = 0; str[i]; i++) {
char c = str[i];
}
第二种写法使用指针:
const char str[] = "Hello World!";
for (const char* i = str; *i; i++) {
char c = *i;
}
那么,哪一种写法的性能更好?为什么?
没有通用的答案,因为这取决于多种因素,例如优化级别和所使用的编译器……不过,指针递增的循环似乎更高效。
用gcc编译的Counter版本的反汇编代码:
Dump of assembler code for function main:
0x0000000000001149 <+0>: endbr64
0x000000000000114d <+4>: push rbp
0x000000000000114e <+5>: mov rbp,rsp
0x0000000000001151 <+8>: sub rsp,0x20
0x0000000000001155 <+12>: mov rax,QWORD PTR fs:0x28
0x000000000000115e <+21>: mov QWORD PTR [rbp-0x8],rax
0x0000000000001162 <+25>: xor eax,eax
0x0000000000001164 <+27>: movabs rax,0x6f57206f6c6c6548
0x000000000000116e <+37>: mov QWORD PTR [rbp-0x15],rax
0x0000000000001172 <+41>: mov DWORD PTR [rbp-0xd],0x21646c72
0x0000000000001179 <+48>: mov BYTE PTR [rbp-0x9],0x0
0x000000000000117d <+52>: mov WORD PTR [rbp-0x18],0x0
0x0000000000001183 <+58>: jmp 0x119e <main+85>
0x0000000000001185 <+60>: movzx eax,WORD PTR [rbp-0x18]
0x0000000000001189 <+64>: cdqe
0x000000000000118b <+66>: movzx eax,BYTE PTR [rbp+rax*1-0x15]
0x0000000000001190 <+71>: mov BYTE PTR [rbp-0x19],al
0x0000000000001193 <+74>: movzx eax,WORD PTR [rbp-0x18]
0x0000000000001197 <+78>: add eax,0x1
0x000000000000119a <+81>: mov WORD PTR [rbp-0x18],ax
0x000000000000119e <+85>: movzx eax,WORD PTR [rbp-0x18]
0x00000000000011a2 <+89>: cdqe
0x00000000000011a4 <+91>: movzx eax,BYTE PTR [rbp+rax*1-0x15]
0x00000000000011a9 <+96>: test al,al
0x00000000000011ab <+98>: jne 0x1185 <main+60>
0x00000000000011ad <+100>: mov eax,0x0
0x00000000000011b2 <+105>: mov rdx,QWORD PTR [rbp-0x8]
0x00000000000011b6 <+109>: sub rdx,QWORD PTR fs:0x28
0x00000000000011bf <+118>: je 0x11c6 <main+125>
0x00000000000011c1 <+120>: call 0x1050 <__stack_chk_fail@plt>
0x00000000000011c6 <+125>: leave
0x00000000000011c7 <+126>: ret
End of assembler dump.
与 Pointer 版本(仍然是 gcc)相比:
Dump of assembler code for function main:
0x0000000000001149 <+0>: endbr64
0x000000000000114d <+4>: push rbp
0x000000000000114e <+5>: mov rbp,rsp
0x0000000000001151 <+8>: sub rsp,0x30
0x0000000000001155 <+12>: mov rax,QWORD PTR fs:0x28
0x000000000000115e <+21>: mov QWORD PTR [rbp-0x8],rax
0x0000000000001162 <+25>: xor eax,eax
0x0000000000001164 <+27>: movabs rax,0x6f57206f6c6c6548
0x000000000000116e <+37>: mov QWORD PTR [rbp-0x15],rax
0x0000000000001172 <+41>: mov DWORD PTR [rbp-0xd],0x21646c72
0x0000000000001179 <+48>: mov BYTE PTR [rbp-0x9],0x0
0x000000000000117d <+52>: lea rax,[rbp-0x15]
0x0000000000001181 <+56>: mov QWORD PTR [rbp-0x20],rax
0x0000000000001185 <+60>: jmp 0x1196 <main+77>
0x0000000000001187 <+62>: mov rax,QWORD PTR [rbp-0x20]
0x000000000000118b <+66>: movzx eax,BYTE PTR [rax]
0x000000000000118e <+69>: mov BYTE PTR [rbp-0x21],al
0x0000000000001191 <+72>: add QWORD PTR [rbp-0x20],0x1
0x0000000000001196 <+77>: mov rax,QWORD PTR [rbp-0x20]
0x000000000000119a <+81>: movzx eax,BYTE PTR [rax]
0x000000000000119d <+84>: test al,al
0x000000000000119f <+86>: jne 0x1187 <main+62>
0x00000000000011a1 <+88>: mov eax,0x0
0x00000000000011a6 <+93>: mov rdx,QWORD PTR [rbp-0x8]
0x00000000000011aa <+97>: sub rdx,QWORD PTR fs:0x28
0x00000000000011b3 <+106>: je 0x11ba <main+113>
0x00000000000011b5 <+108>: call 0x1050 <__stack_chk_fail@plt>
0x00000000000011ba <+113>: leave
0x00000000000011bb <+114>: ret
End of assembler dump.
可以看到,第二段代码中的 movzx 指令更少,并且没有 cdqe 指令;这意味着更少的内存访问,因为它直接递增指针,而不需要进行中间的读取和计算。
现在改用 clang,两段代码看起来更相似,但计数器版本每次都要重新计算内存地址(rbp+rax*1-0x11),仍然存在不少低效之处:
用 clang 编译的 Counter 版本:
Dump of assembler code for function main:
0x0000000000401110 <+0>: push rbp
0x0000000000401111 <+1>: mov rbp,rsp
0x0000000000401114 <+4>: mov DWORD PTR [rbp-0x4],0x0
0x000000000040111b <+11>: mov rax,QWORD PTR ds:0x402004
0x0000000000401123 <+19>: mov QWORD PTR [rbp-0x11],rax
0x0000000000401127 <+23>: mov eax,DWORD PTR ds:0x40200c
0x000000000040112e <+30>: mov DWORD PTR [rbp-0x9],eax
0x0000000000401131 <+33>: mov al,BYTE PTR ds:0x402010
0x0000000000401138 <+40>: mov BYTE PTR [rbp-0x5],al
0x000000000040113b <+43>: mov WORD PTR [rbp-0x14],0x0
0x0000000000401141 <+49>: movzx eax,WORD PTR [rbp-0x14]
0x0000000000401145 <+53>: cmp BYTE PTR [rbp+rax*1-0x11],0x0
0x000000000040114a <+58>: je 0x40116c <main+92>
0x0000000000401150 <+64>: movzx eax,WORD PTR [rbp-0x14]
0x0000000000401154 <+68>: mov al,BYTE PTR [rbp+rax*1-0x11]
0x0000000000401158 <+72>: mov BYTE PTR [rbp-0x15],al
0x000000000040115b <+75>: mov ax,WORD PTR [rbp-0x14]
0x000000000040115f <+79>: add ax,0x1
0x0000000000401163 <+83>: mov WORD PTR [rbp-0x14],ax
0x0000000000401167 <+87>: jmp 0x401141 <main+49>
0x000000000040116c <+92>: mov eax,DWORD PTR [rbp-0x4]
0x000000000040116f <+95>: pop rbp
0x0000000000401170 <+96>: ret
End of assembler dump.
用 clang 编译的 Pointer 版本:
Dump of assembler code for function main:
0x0000000000401110 <+0>: push rbp
0x0000000000401111 <+1>: mov rbp,rsp
0x0000000000401114 <+4>: mov DWORD PTR [rbp-0x4],0x0
0x000000000040111b <+11>: mov rax,QWORD PTR ds:0x402004
0x0000000000401123 <+19>: mov QWORD PTR [rbp-0x11],rax
0x0000000000401127 <+23>: mov eax,DWORD PTR ds:0x40200c
0x000000000040112e <+30>: mov DWORD PTR [rbp-0x9],eax
0x0000000000401131 <+33>: mov al,BYTE PTR ds:0x402010
0x0000000000401138 <+40>: mov BYTE PTR [rbp-0x5],al
0x000000000040113b <+43>: lea rax,[rbp-0x11]
0x000000000040113f <+47>: mov QWORD PTR [rbp-0x20],rax
0x0000000000401143 <+51>: mov rax,QWORD PTR [rbp-0x20]
0x0000000000401147 <+55>: cmp BYTE PTR [rax],0x0
0x000000000040114a <+58>: je 0x40116a <main+90>
0x0000000000401150 <+64>: mov rax,QWORD PTR [rbp-0x20]
0x0000000000401154 <+68>: mov al,BYTE PTR [rax]
0x0000000000401156 <+70>: mov BYTE PTR [rbp-0x21],al
0x0000000000401159 <+73>: mov rax,QWORD PTR [rbp-0x20]
0x000000000040115d <+77>: add rax,0x1
0x0000000000401161 <+81>: mov QWORD PTR [rbp-0x20],rax
0x0000000000401165 <+85>: jmp 0x401143 <main+51>
0x000000000040116a <+90>: mov eax,DWORD PTR [rbp-0x4]
0x000000000040116d <+93>: pop rbp
0x000000000040116e <+94>: ret
End of assembler dump.
在这种情况以及大多数类似情况下(未开启 O2 或 O3 优化时),基于指针的循环更高效。但当循环很短、编译器能够成功优化,或者处理的是多维数据结构时,两者的差异可能可以忽略不计。