我正在尝试创建数组对象的视图,以更好地利用 x86_64 平台上的 SIMD 向量。
主要思想如下:
type Char_Set_Index is range 0 .. 7;
type Char_Set_Element is mod 2 ** 32;
type Character_Set_Vector is array (Char_Set_Index) of Char_Set_Element
with Alignment => 32,Component_Size => 32, Object_Size => 256, Size => 256;
type Character_Set is array (Character) of Boolean
with Alignment => 32, Component_Size => 1, Object_Size => 256, Size => 256;
本质上,
Ada.Character.Maps
中的一些操作可以使用SIMD算法更好地处理。例如 "="
操作,可能编码为,
function "="
(Left, Right : in Character_Set)
return Boolean
is
(for all k in Character_Set'Range =>
(Left(k) = Right(k)));
.. 给我们以下输出
.LFB4:
.cfi_startproc
movq %rdi, %r8
movq %rsi, %rdi
xorl %esi, %esi
jmp .L6
.p2align 4,,10
.p2align 3
.L10:
addl $1, %esi
cmpl $256, %esi
je .L9
.L6:
movl %esi, %edx
movl %esi, %ecx
sarl $3, %edx
andl $7, %ecx
movslq %edx, %rdx
movzbl (%rdi,%rdx), %eax
xorb (%r8,%rdx), %al
shrb %cl, %al
testb $1, %al
je .L10
xorl %eax, %eax
ret
.L9:
movl $1, %eax
ret
.cfi_endproc
重要的是,它是在比较每一位,GCC 不会对其进行矢量化。但是,如果我们写,
function "="
(Left, Right : in Character_Set)
return Boolean
is
u : aliased constant Character_Set_Vector
with Import, Address => Left'Address;
v : aliased constant Character_Set_Vector
with Import, Address => Right'Address;
Temp : array (Char_Set_Index) of Integer;
Sum : Integer;
begin
for j in Temp'Range loop
pragma Loop_Optimize (Vector);
Temp(j) := (if u(j) = v(j) then 0 else 1);
end loop;
Sum := 0;
for j in Temp'Range loop
Sum := Sum + Temp(j);
end loop;
return Sum = 0;
end "=";
我们得到了我们所期望的无分支 SIMD 指令,
.cfi_startproc
vmovdqa (%rdi), %ymm1
vpcmpeqd (%rsi), %ymm1, %ymm1
vpandn .LC0(%rip), %ymm1, %ymm1
vextracti128 $0x1, %ymm1, %xmm0
vpaddd %xmm1, %xmm0, %xmm0
vpsrldq $8, %xmm0, %xmm1
vpaddd %xmm1, %xmm0, %xmm0
vpsrldq $4, %xmm0, %xmm1
vpaddd %xmm1, %xmm0, %xmm0
vmovd %xmm0, %eax
testl %eax, %eax
sete %al
vzeroupper
ret
.cfi_endproc
这一切都运行得很好。现在,手头的问题。如果您通过 SPARK Ada 推送此代码,则会出现许多关于对齐、别名和常量的抱怨,因此您必须最终编写,
function "="
(Left, Right : in Character_Set)
return Boolean
is
Left_Aligned : constant Character_Set := Left
with Alignment => 32;
Right_Aligned : constant Character_Set := Right
with Alignment => 32;
u : aliased constant Character_Set_Vector
with Import, Alignment => 32, Address => Left_Aligned'Address;
v : aliased constant Character_Set_Vector
with Import, Alignment => 32, Address => Right_Aligned'Address;
Temp : array (Char_Set_Index) of Integer;
Sum : Integer;
begin
for j in Temp'Range loop
pragma Loop_Optimize (Vector);
Temp(j) := (if u(j) = v(j) then 0 else 1);
end loop;
Sum := 0;
for j in Temp'Range loop
Sum := Sum + Temp(j);
end loop;
return Sum = 0;
end "=";
这给了我们大量的预复制,大概是为了确保一切都对齐 - 即使声明已经正确对齐,
.cfi_startproc
pushq %rbp
.cfi_def_cfa_offset 16
.cfi_offset 6, -16
movq %rsp, %rbp
.cfi_def_cfa_register 6
andq $-32, %rsp
vmovdqa (%rdi), %xmm2
vmovdqa 16(%rdi), %xmm3
vmovdqa (%rsi), %xmm4
vmovdqa 16(%rsi), %xmm5
vmovdqa %xmm2, -64(%rsp)
vmovdqa %xmm3, -48(%rsp)
vmovdqa -64(%rsp), %ymm6
vmovdqa %xmm4, -32(%rsp)
vmovdqa %xmm5, -16(%rsp)
vpcmpeqd -32(%rsp), %ymm6, %ymm1
vpandn .LC0(%rip), %ymm1, %ymm1
vextracti128 $0x1, %ymm1, %xmm0
vpaddd %xmm1, %xmm0, %xmm0
vpsrldq $8, %xmm0, %xmm1
vpaddd %xmm1, %xmm0, %xmm0
vpsrldq $4, %xmm0, %xmm1
vpaddd %xmm1, %xmm0, %xmm0
vmovd %xmm0, %eax
testl %eax, %eax
sete %al
vzeroupper
leave
.cfi_def_cfa 7, 8
ret
.cfi_endproc
显然,人们会为此烦恼的唯一原因是为了更好的性能,但是,SPARK Ada 规则在这种情况下似乎限制太多,损害了性能。所以,我的问题是,是否有更好的方法来做到这一点,并且不会导致过多的数据移动,据我所知,这是不需要的。
顺便说一句,
Ada.Unchecked_Conversion
同样在开始时也会移动大量数据。
另外,我意识到我可以证明 SPARK Ada 检查(误报)是合理的,这样我就可以使用 Ada 版本,但我希望我在这里遗漏了一些东西,并且有一种更简单的方法可以做到这一点。
也许有一种对布尔数组进行矢量化的方法?
编辑:我正在使用
编译它gnatmake -O3 -mavx2 -gnatn -gnatp -S name-of-package.adb
为什么
Left
和 Right
的对齐在函数体内是未知的,这个问题很有趣。您确实既不能断言对齐属性,也不能向函数添加前提条件,说明参数对齐的要求(至少对于GNATprove FSF 11.2.0)。不过,SPARK 源代码中对此问题有一些评论(请参阅 spark_definition.adb
中的第 3276 行)。
另一方面,似乎您可以通过在循环中应用转换来解决未经检查的转换的额外复制问题。以下是我使用 GNAT FSF 11.3.1 能够实现的目标:
character_sets.ads
package Character_Sets with SPARK_Mode is
type Character_Set is array (Character) of Boolean
with
Alignment => 32,
Component_Size => 1,
Object_Size => 256,
Size => 256;
function "=" (Left, Right : in Character_Set) return Boolean;
end Character_Sets;
character_sets.adb
with Ada.Unchecked_Conversion;
package body Character_Sets with SPARK_Mode is
type Char_Set_Index is range 0 .. 7;
type Char_Set_Element is mod 2 ** 32;
type Character_Set_Vector is array (Char_Set_Index) of aliased Char_Set_Element
with
Alignment => 32,
Component_Size => 32,
Object_Size => 256,
Size => 256;
function To_Vector is new Ada.Unchecked_Conversion
(Source => Character_Set,
Target => Character_Set_Vector);
---------
-- "=" --
---------
function "=" (Left, Right : in Character_Set) return Boolean is
Temp : array (Char_Set_Index) of Integer;
Sum : Integer;
begin
for J in Temp'Range loop
pragma Loop_Optimize (Vector);
Temp (J) := (if To_Vector (Left) (J) = To_Vector (Right) (J) then 0 else 1); -- !!!
end loop;
Sum := 0;
for J in Temp'Range loop
Sum := Sum + Temp (J);
end loop;
return Sum = 0;
end "=";
end Character_Sets;
默认.gpr
project Default is
for Source_Dirs use ("src");
for Object_Dir use "obj";
for Main use ();
package Compiler is
for Switches ("ada") use ("-O3", "-mavx2", "-gnatn", "-gnatp");
end Compiler;
end Default;
输出(objdump)
$ objdump -d -M intel ./obj/character_sets.o
./obj/character_sets.o: file format elf64-x86-64
Disassembly of section .text:
0000000000000000 <character_sets__Tcharacter_setBIP>:
0: c3 ret
1: 90 nop
2: 66 66 2e 0f 1f 84 00 data16 cs nop WORD PTR [rax+rax*1+0x0]
9: 00 00 00 00
d: 0f 1f 00 nop DWORD PTR [rax]
0000000000000010 <character_sets__Tcharacter_set_vectorBIP>:
10: c3 ret
11: 90 nop
12: 66 66 2e 0f 1f 84 00 data16 cs nop WORD PTR [rax+rax*1+0x0]
19: 00 00 00 00
1d: 0f 1f 00 nop DWORD PTR [rax]
0000000000000020 <character_sets__Oeq>:
20: c5 fd 6f 0f vmovdqa ymm1,YMMWORD PTR [rdi]
24: c5 f5 76 0e vpcmpeqd ymm1,ymm1,YMMWORD PTR [rsi]
28: c5 f5 df 0d 00 00 00 vpandn ymm1,ymm1,YMMWORD PTR [rip+0x0] # 30 <character_sets__Oeq+0x10>
2f: 00
30: c4 e3 7d 39 c8 01 vextracti128 xmm0,ymm1,0x1
36: c5 f9 fe c1 vpaddd xmm0,xmm0,xmm1
3a: c5 f1 73 d8 08 vpsrldq xmm1,xmm0,0x8
3f: c5 f9 fe c1 vpaddd xmm0,xmm0,xmm1
43: c5 f1 73 d8 04 vpsrldq xmm1,xmm0,0x4
48: c5 f9 fe c1 vpaddd xmm0,xmm0,xmm1
4c: c5 f9 7e c0 vmovd eax,xmm0
50: 85 c0 test eax,eax
52: 0f 94 c0 sete al
55: c5 f8 77 vzeroupper
58: c3 ret
输出(蚊虫证明)
$ gnatprove -P ./default.gpr -f
Phase 1 of 2: generation of Global contracts ...
Phase 2 of 2: flow analysis and proof ...
character_sets.adb:31:10: warning: pragma "Loop_Optimize" ignored (not yet supported)
31 | pragma Loop_Optimize (Vector);
| ^ here
Summary logged in /home/deedee/72423385-spark-ada-overlays-without-copying/obj/gnatprove/gnatprove.out
这是 DeeDee 解决方案后得到的(过度优化的)函数,
function "="
(Left, Right : in Character_Set)
return Boolean
is
Temp : array (Char_Set_Index) of Integer;
Sum : Integer;
begin
for j in Temp'Range loop
Temp(j) := (if To_Vector(Left)(j) = To_Vector(Right)(j) then -1 else 0);
end loop;
Sum := 0;
for j in Temp'Range loop
Sum := Sum + Temp(j);
end loop;
return Sum = -Temp'Length;
end "=";
注意 Temp 值的变化,以与 Intel 的文档相匹配,以正确匹配
vpcmpeqd
的结果。对于所有这些努力(和复杂性),您可以放弃一个 vpand
此外,将向量数组移动到正文中而不是在规范中私有之后,似乎可以允许您删除
pragma Loop_Optimize
确实,如果您没有可用的 SIMD,
.cfi_startproc
movl (%rsi), %eax
cmpl %eax, (%rdi)
sete %dl
movl 4(%rsi), %ecx
xorl %r9d, %r9d
movl 8(%rsi), %r10d
movzbl %dl, %r8d
movl 12(%rsi), %eax
negl %r8d
cmpl %ecx, 4(%rdi)
movl 16(%rsi), %ecx
sete %r9b
xorl %r11d, %r11d
subl %r9d, %r8d
cmpl %r10d, 8(%rdi)
movl 20(%rsi), %r10d
sete %r11b
xorl %edx, %edx
subl %r11d, %r8d
cmpl %eax, 12(%rdi)
movl 24(%rsi), %eax
sete %dl
xorl %r9d, %r9d
movl 28(%rsi), %esi
subl %edx, %r8d
cmpl %ecx, 16(%rdi)
sete %r9b
xorl %r11d, %r11d
subl %r9d, %r8d
cmpl %r10d, 20(%rdi)
sete %r11b
xorl %edx, %edx
subl %r11d, %r8d
cmpl %eax, 24(%rdi)
sete %dl
xorl %ecx, %ecx
subl %edx, %r8d
cmpl %esi, 28(%rdi)
sete %cl
subl %ecx, %r8d
cmpl $-8, %r8d
sete %al
ret
.cfi_endproc
与,
gnatmake -O2 -funroll-loops -gnatn -gnatp -S name-of-package.adb
如果你想避免分支,这似乎比天真的版本更好
看到这个我的第一个想法是,“你为什么要定义
"="
为 Character_Set
?”它带有预定义的 "="
。
让我们看看它做了什么:
package Packed_Vectorization is
type CS is array (Character) of Boolean with
Component_Size => 1, Size => 256;
type Character_Set is new CS with
Component_Size => 1, Size => 256;
function "=" (Left : in Character_Set; Right : in Character_Set) return Boolean is
(CS (Left) = CS (Right) );
end Packed_Vectorization;
类型派生就在那里,因此我们可以看到为预定义的
"="
生成了哪些代码。
编译
gnatmake -gnatnp -O3 -S packed_vectorization.ads
重要部分为
packed_vectorization__Oeq:
.LFB2:
.cfi_startproc
movq %rsi, %rdx
movl $256, %ecx
movl $256, %esi
jmp system__bit_ops__bit_eq@PLT
.cfi_endproc
编译器有一个专门用于比较位封装数组的特殊函数,大概是为了优化这个常见操作。你可以看看
System.Bit_Ops.Bit_Eq
的实现;重要的部分似乎是
if LeftB (1 .. BLen) /= RightB (1 .. BLen) then
其中
Leftb
和 Rightb
是两个数组作为字节打包数组的视图。这是字节数组类型的预定义 "/="
。我无法找到 System.Bit_Ops
的目标文件,但我猜 "/="
也已优化。
您的使用可以接受吗? (我想你需要优化你的
"="
才能满足你量化的时序要求,否则就没有理由担心这个。)如果是这样,那么很多努力都白费了。
“Ada 优于汇编:案例研究”,TRI-Ada '92 的论文集,报告了 Ada (83) 编译器生成的代码比专家团队手工优化的汇编器更快、更小。那是30年前的事了。从那时起,优化器技术无疑得到了改进。通常,编译器比我们任何人都更了解优化。
“过早优化是万恶之源……”——Donald Knuth