SPARK Ada:无需复制即可叠加

问题描述 投票:0回答:3

我正在尝试创建数组对象的视图,以更好地利用 x86_64 平台上的 SIMD 向量。

主要思想如下:

   type Char_Set_Index is range 0 .. 7;
   type Char_Set_Element is mod 2 ** 32;
   
   type Character_Set_Vector is array (Char_Set_Index) of Char_Set_Element
     with Alignment => 32,Component_Size => 32, Object_Size => 256, Size => 256;
   
   type Character_Set is array (Character) of Boolean
     with Alignment => 32, Component_Size => 1, Object_Size => 256, Size => 256;

本质上,

Ada.Character.Maps
中的一些操作可以使用SIMD算法更好地处理。例如
"="
操作,可能编码为,

   function "="
     (Left, Right : in Character_Set)
      return Boolean
   is
     (for all k in Character_Set'Range =>
         (Left(k) = Right(k)));

.. 给我们以下输出

.LFB4:
    .cfi_startproc
    movq    %rdi, %r8
    movq    %rsi, %rdi
    xorl    %esi, %esi
    jmp .L6
    .p2align 4,,10
    .p2align 3
.L10:
    addl    $1, %esi
    cmpl    $256, %esi
    je  .L9
.L6:
    movl    %esi, %edx
    movl    %esi, %ecx
    sarl    $3, %edx
    andl    $7, %ecx
    movslq  %edx, %rdx
    movzbl  (%rdi,%rdx), %eax
    xorb    (%r8,%rdx), %al
    shrb    %cl, %al
    testb   $1, %al
    je  .L10
    xorl    %eax, %eax
    ret
.L9:
    movl    $1, %eax
    ret
    .cfi_endproc

重要的是,它是在比较每一位,GCC 不会对其进行矢量化。但是,如果我们写,

   function "="
     (Left, Right : in Character_Set)
      return Boolean
   is
      
      u : aliased constant Character_Set_Vector
        with Import, Address => Left'Address;
      
      v : aliased constant Character_Set_Vector
        with Import, Address => Right'Address;
   
      Temp : array (Char_Set_Index) of Integer;
      Sum  : Integer;
   
   begin
   
      for j in Temp'Range loop
         pragma Loop_Optimize (Vector);
         Temp(j) := (if u(j) = v(j) then 0 else 1);
      end loop;
   
      Sum := 0;
      for j in Temp'Range loop
         Sum := Sum + Temp(j);
      end loop;
   
      return Sum = 0;
   
   end "=";

我们得到了我们所期望的无分支 SIMD 指令,

    .cfi_startproc
    vmovdqa (%rdi), %ymm1
    vpcmpeqd    (%rsi), %ymm1, %ymm1
    vpandn  .LC0(%rip), %ymm1, %ymm1
    vextracti128    $0x1, %ymm1, %xmm0
    vpaddd  %xmm1, %xmm0, %xmm0
    vpsrldq $8, %xmm0, %xmm1
    vpaddd  %xmm1, %xmm0, %xmm0
    vpsrldq $4, %xmm0, %xmm1
    vpaddd  %xmm1, %xmm0, %xmm0
    vmovd   %xmm0, %eax
    testl   %eax, %eax
    sete    %al
    vzeroupper
    ret
    .cfi_endproc

这一切都运行得很好。现在,手头的问题。如果您通过 SPARK Ada 推送此代码,则会出现许多关于对齐、别名和常量的抱怨,因此您必须最终编写,

   function "="
     (Left, Right : in Character_Set)
      return Boolean
   is
      
      Left_Aligned : constant Character_Set := Left
        with Alignment => 32;
      
      Right_Aligned : constant Character_Set := Right
        with Alignment => 32;
      
      u : aliased constant Character_Set_Vector
        with Import, Alignment => 32, Address => Left_Aligned'Address;
      
      v : aliased constant Character_Set_Vector
        with Import, Alignment => 32, Address => Right_Aligned'Address;
   
      Temp : array (Char_Set_Index) of Integer;
      Sum  : Integer;
   
   begin
   
      for j in Temp'Range loop
         pragma Loop_Optimize (Vector);
         Temp(j) := (if u(j) = v(j) then 0 else 1);
      end loop;
   
      Sum := 0;
      for j in Temp'Range loop
         Sum := Sum + Temp(j);
      end loop;
   
      return Sum = 0;
   
   end "=";

这给了我们大量的预复制,大概是为了确保一切都对齐 - 即使声明已经正确对齐,

    .cfi_startproc
    pushq   %rbp
    .cfi_def_cfa_offset 16
    .cfi_offset 6, -16
    movq    %rsp, %rbp
    .cfi_def_cfa_register 6
    andq    $-32, %rsp
    vmovdqa (%rdi), %xmm2
    vmovdqa 16(%rdi), %xmm3
    vmovdqa (%rsi), %xmm4
    vmovdqa 16(%rsi), %xmm5
    vmovdqa %xmm2, -64(%rsp)
    vmovdqa %xmm3, -48(%rsp)
    vmovdqa -64(%rsp), %ymm6
    vmovdqa %xmm4, -32(%rsp)
    vmovdqa %xmm5, -16(%rsp)
    vpcmpeqd    -32(%rsp), %ymm6, %ymm1
    vpandn  .LC0(%rip), %ymm1, %ymm1
    vextracti128    $0x1, %ymm1, %xmm0
    vpaddd  %xmm1, %xmm0, %xmm0
    vpsrldq $8, %xmm0, %xmm1
    vpaddd  %xmm1, %xmm0, %xmm0
    vpsrldq $4, %xmm0, %xmm1
    vpaddd  %xmm1, %xmm0, %xmm0
    vmovd   %xmm0, %eax
    testl   %eax, %eax
    sete    %al
    vzeroupper
    leave
    .cfi_def_cfa 7, 8
    ret
    .cfi_endproc

显然,人们会为此烦恼的唯一原因是为了更好的性能,但是,SPARK Ada 规则在这种情况下似乎限制太多,损害了性能。所以,我的问题是,是否有更好的方法来做到这一点,并且不会导致过多的数据移动,据我所知,这是不需要的。

顺便说一句,

 Ada.Unchecked_Conversion
同样在开始时也会移动大量数据。

另外,我意识到我可以证明 SPARK Ada 检查(误报)是合理的,这样我就可以使用 Ada 版本,但我希望我在这里遗漏了一些东西,并且有一种更简单的方法可以做到这一点。

也许有一种对布尔数组进行矢量化的方法?

编辑:我正在使用

编译它
gnatmake -O3 -mavx2 -gnatn -gnatp -S name-of-package.adb
ada spark-ada
3个回答
3
投票

为什么

Left
Right
的对齐在函数体内是未知的,这个问题很有趣。您确实既不能断言对齐属性,也不能向函数添加前提条件,说明参数对齐的要求(至少对于GNATprove FSF 11.2.0)。不过,SPARK 源代码中对此问题有一些评论(请参阅
spark_definition.adb
中的第 3276 行)。

另一方面,似乎您可以通过在循环中应用转换来解决未经检查的转换的额外复制问题。以下是我使用 GNAT FSF 11.3.1 能够实现的目标:

character_sets.ads

package Character_Sets with SPARK_Mode is
   
   type Character_Set is array (Character) of Boolean
     with 
       Alignment      => 32,
       Component_Size => 1, 
       Object_Size    => 256, 
       Size           => 256;
   
   function "=" (Left, Right : in Character_Set) return Boolean;

end Character_Sets;

character_sets.adb

with Ada.Unchecked_Conversion;

package body Character_Sets with SPARK_Mode is

   type Char_Set_Index is range 0 .. 7;
   type Char_Set_Element is mod 2 ** 32;
   
   type Character_Set_Vector is array (Char_Set_Index) of aliased Char_Set_Element
     with 
       Alignment      => 32,
       Component_Size => 32, 
       Object_Size    => 256, 
       Size           => 256;
   
   function To_Vector is new Ada.Unchecked_Conversion
     (Source => Character_Set,
      Target => Character_Set_Vector);
   
   ---------
   -- "=" --
   ---------

   function "=" (Left, Right : in Character_Set) return Boolean is
      
      Temp : array (Char_Set_Index) of Integer;
      Sum  : Integer;
   
   begin
     
      for J in Temp'Range loop
         pragma Loop_Optimize (Vector);
         Temp (J) := (if To_Vector (Left) (J) = To_Vector (Right) (J) then 0 else 1);    --  !!!
      end loop;
   
      Sum := 0;
      for J in Temp'Range loop
         Sum := Sum + Temp (J);
      end loop;
   
      return Sum = 0;
   
   end "=";
   
end Character_Sets;

默认.gpr

project Default is

   for Source_Dirs use ("src");
   for Object_Dir use "obj";
   for Main use ();

   package Compiler is
      for Switches ("ada") use ("-O3", "-mavx2", "-gnatn", "-gnatp");
   end Compiler;

end Default;

输出(objdump)

$ objdump -d -M intel ./obj/character_sets.o 

./obj/character_sets.o:     file format elf64-x86-64


Disassembly of section .text:

0000000000000000 <character_sets__Tcharacter_setBIP>:
   0:   c3                      ret    
   1:   90                      nop
   2:   66 66 2e 0f 1f 84 00    data16 cs nop WORD PTR [rax+rax*1+0x0]
   9:   00 00 00 00 
   d:   0f 1f 00                nop    DWORD PTR [rax]

0000000000000010 <character_sets__Tcharacter_set_vectorBIP>:
  10:   c3                      ret    
  11:   90                      nop
  12:   66 66 2e 0f 1f 84 00    data16 cs nop WORD PTR [rax+rax*1+0x0]
  19:   00 00 00 00 
  1d:   0f 1f 00                nop    DWORD PTR [rax]

0000000000000020 <character_sets__Oeq>:
  20:   c5 fd 6f 0f             vmovdqa ymm1,YMMWORD PTR [rdi]
  24:   c5 f5 76 0e             vpcmpeqd ymm1,ymm1,YMMWORD PTR [rsi]
  28:   c5 f5 df 0d 00 00 00    vpandn ymm1,ymm1,YMMWORD PTR [rip+0x0]        # 30 <character_sets__Oeq+0x10>
  2f:   00 
  30:   c4 e3 7d 39 c8 01       vextracti128 xmm0,ymm1,0x1
  36:   c5 f9 fe c1             vpaddd xmm0,xmm0,xmm1
  3a:   c5 f1 73 d8 08          vpsrldq xmm1,xmm0,0x8
  3f:   c5 f9 fe c1             vpaddd xmm0,xmm0,xmm1
  43:   c5 f1 73 d8 04          vpsrldq xmm1,xmm0,0x4
  48:   c5 f9 fe c1             vpaddd xmm0,xmm0,xmm1
  4c:   c5 f9 7e c0             vmovd  eax,xmm0
  50:   85 c0                   test   eax,eax
  52:   0f 94 c0                sete   al
  55:   c5 f8 77                vzeroupper 
  58:   c3                      ret

输出(蚊虫证明)

$ gnatprove -P ./default.gpr -f
Phase 1 of 2: generation of Global contracts ...
Phase 2 of 2: flow analysis and proof ...

character_sets.adb:31:10: warning: pragma "Loop_Optimize" ignored (not yet supported)
   31 |         pragma Loop_Optimize (Vector);
      |         ^ here
Summary logged in /home/deedee/72423385-spark-ada-overlays-without-copying/obj/gnatprove/gnatprove.out

2
投票

这是 DeeDee 解决方案后得到的(过度优化的)函数,

 function "="
 (Left, Right : in Character_Set)
  return Boolean
is
   Temp : array (Char_Set_Index) of Integer;
   Sum  : Integer;
begin

  for j in Temp'Range loop
     Temp(j) := (if To_Vector(Left)(j) = To_Vector(Right)(j) then -1 else 0);
  end loop;

  Sum := 0;
  for j in Temp'Range loop
     Sum := Sum + Temp(j);
  end loop;
  
  return Sum = -Temp'Length;

 end "=";

注意 Temp 值的变化,以与 Intel 的文档相匹配,以正确匹配

vpcmpeqd
的结果。对于所有这些努力(和复杂性),您可以放弃一个
vpand

此外,将向量数组移动到正文中而不是在规范中私有之后,似乎可以允许您删除

pragma Loop_Optimize

确实,如果您没有可用的 SIMD,

    .cfi_startproc
    movl    (%rsi), %eax
    cmpl    %eax, (%rdi)
    sete    %dl
    movl    4(%rsi), %ecx
    xorl    %r9d, %r9d
    movl    8(%rsi), %r10d
    movzbl  %dl, %r8d
    movl    12(%rsi), %eax
    negl    %r8d
    cmpl    %ecx, 4(%rdi)
    movl    16(%rsi), %ecx
    sete    %r9b
    xorl    %r11d, %r11d
    subl    %r9d, %r8d
    cmpl    %r10d, 8(%rdi)
    movl    20(%rsi), %r10d
    sete    %r11b
    xorl    %edx, %edx
    subl    %r11d, %r8d
    cmpl    %eax, 12(%rdi)
    movl    24(%rsi), %eax
    sete    %dl
    xorl    %r9d, %r9d
    movl    28(%rsi), %esi
    subl    %edx, %r8d
    cmpl    %ecx, 16(%rdi)
    sete    %r9b
    xorl    %r11d, %r11d
    subl    %r9d, %r8d
    cmpl    %r10d, 20(%rdi)
    sete    %r11b
    xorl    %edx, %edx
    subl    %r11d, %r8d
    cmpl    %eax, 24(%rdi)
    sete    %dl
    xorl    %ecx, %ecx
    subl    %edx, %r8d
    cmpl    %esi, 28(%rdi)
    sete    %cl
    subl    %ecx, %r8d
    cmpl    $-8, %r8d
    sete    %al
    ret
    .cfi_endproc

与,

gnatmake -O2 -funroll-loops -gnatn -gnatp -S name-of-package.adb

如果你想避免分支,这似乎比天真的版本更好


2
投票

看到这个我的第一个想法是,“你为什么要定义

"="
Character_Set
?”它带有预定义的
"="

让我们看看它做了什么:

package Packed_Vectorization is
   type CS is array (Character) of Boolean with
      Component_Size => 1, Size => 256;
  
   type Character_Set is new CS with
      Component_Size => 1, Size => 256;
  
   function "=" (Left : in Character_Set; Right : in Character_Set) return Boolean is
      (CS (Left) = CS (Right) );
end Packed_Vectorization;

类型派生就在那里,因此我们可以看到为预定义的

"="
生成了哪些代码。

编译

gnatmake -gnatnp -O3 -S packed_vectorization.ads

重要部分为

packed_vectorization__Oeq:
.LFB2:
    .cfi_startproc
    movq    %rsi, %rdx
    movl    $256, %ecx
    movl    $256, %esi
    jmp system__bit_ops__bit_eq@PLT
    .cfi_endproc

编译器有一个专门用于比较位封装数组的特殊函数,大概是为了优化这个常见操作。你可以看看

System.Bit_Ops.Bit_Eq
的实现;重要的部分似乎是

if LeftB (1 .. BLen) /= RightB (1 .. BLen) then

其中

Leftb
Rightb
是两个数组作为字节打包数组的视图。这是字节数组类型的预定义
"/="
。我无法找到
System.Bit_Ops
的目标文件,但我猜
"/="
也已优化。

您的使用可以接受吗? (我想你需要优化你的

"="
才能满足你量化的时序要求,否则就没有理由担心这个。)如果是这样,那么很多努力都白费了。

“Ada 优于汇编:案例研究”,TRI-Ada '92 的论文集,报告了 Ada (83) 编译器生成的代码比专家团队手工优化的汇编器更快、更小。那是30年前的事了。从那时起,优化器技术无疑得到了改进。通常,编译器比我们任何人都更了解优化。

“过早优化是万恶之源……”——Donald Knuth

© www.soinside.com 2019 - 2024. All rights reserved.