在 CUDA 中，如何进行合并排序将一个共享变量传递到另一个共享变量

Question

我写了一些 cuda 代码，从全局内存加载到寄存器中，对 16 个值进行排序。然后为了合并它们，我将它们写入共享数组：

__global__ void sort16(u32* arr, u32 n) {
__shared__ u32 tmp[16*32]; // 512 elements, 2kB shared memory, hopefully fits?
__shared__ u32 tmp2[16*32]; // 512 elements, 2kB shared memory, hopefully fits?

u32 tid = threadIdx.x;
u32 loc = blockIdx.x * 16;
u32 tmp_loc = tid * 16;
u32 a0 = arr[loc+0], a1 = arr[loc+1], a2 = arr[loc+2], a3 = arr[loc+3],
   a4 = arr[loc+4], a5 = arr[loc+5], a6 = arr[loc+6], a7 = arr[loc+7];
u32 a8 = arr[loc+8], a9 = arr[loc+9], a10 = arr[loc+10], a11 = arr[loc+11],
   a12 = arr[loc+12], a13 = arr[loc+13], a14 = arr[loc+14], a15 = arr[loc+15];
...
tmp[tmp_loc] = a0; tmp[tmp_loc+1] = a1; tmp[tmp_loc+2] = a2; tmp[tmp_loc+3] = a3;
tmp[tmp_loc+4] = a4; tmp[tmp_loc+5] = a5; tmp[tmp_loc+6] = a6; tmp[tmp_loc+7] = a7;
tmp[tmp_loc+8] = a8; tmp[tmp_loc+9] = a9; tmp[tmp_loc+10] = a10; tmp[tmp_loc+11] = a11;
tmp[tmp_loc+12] = a12; tmp[tmp_loc+13] = a13; tmp[tmp_loc+14] = a14; tmp[tmp_loc+15] = a15;
__syncthreads();

但是现在，我想将每个 16 块合并为 32 块，然后将 32 块合并为 64 块，依此类推，直到我的共享存储空间太小。目前，我分配了两个 512 个元素的数组。我想写入全局内存的最后一次传递。

我不明白如何将线程映射到此操作。我知道如何用循环顺序编写它。在 C 中我会写：

__global__ merge2(u32* tmp, u32*tmp2, u32 size) {
  for (u32 a = 0; a < 512; a += 2*size) {
    u32 b = a + size;
    u32 enda = a + size, endb = b + size;
    while (a < enda && b < endb) {
      if (tmp[a] < tmp[b])
        tmp2[c++] = tmp[a++];
      else
        tmp2[c++] = tmp[b++];
    }
    while (a < enda)
      tmp2[c++] = tmp[a++];
    while (b < endb)
      tmp2[c++] = tmp[b++];
  }
}

如何用cuda用线程来表达这个？这就是我设想的通话方式，虽然我不知道线程和块的参数应该是什么？

u32* t = tmp;
u32* t2 = tmp2;
for (u32 i = 16; i < 512; i *= 2) {
    merge2<<<??, ??>>>(t, t2, i);
    u32* ptrtmp = t;
    t = t2;
    t2 = ptrtmp;
}

一个问题是，随着块的增长，可用于合并每个块的线程数量会增加一倍。我也可以不使用它们。

Answer 1

GPU 上的合并排序通常是通过合并双调序列来完成的，而双调序列通常由双调排序器排序。
双调排序并不是最快的渐近排序，但它具有非常低的恒定开销，并且实际上是当前 GPU 硬件上最快的排序方法。

下面是示例代码：您可以在 Godbolt 上使用它：https://cuda.godbolt.org/z/3nM4Ma6ss

请注意，您需要 C++17 或更高版本才能运行代码。 nvcc 编译器将内联 GPU 上的所有调用并消除所有 constexpr 表达式。生成的程序集非常快速且紧凑。

NVidia 在以下位置展示了一个很好的双调排序演示：https://on-demand.gputechconf.com/gtc/2013/presentations/S3174-Kepler-Shuffle-Tips-Tricks

双调排序的工作方式是，首先获取任意输入数组并将其转换为双调序列（下面的步骤 1）。然后将该双调序列转换为排序序列。

双音序列是这样的

  /\
 /  \
/        5 elements, first increasing, then decreasing (or the other way 'round)

因为 32 = 2^5，您需要 2x 5 = 10 个步骤从随机序列创建双调序列，还需要 5 个步骤将双调序列转换为排序序列。
一旦你有 2 个排序序列（a，B），你就反转 B 的顺序并并排比较元素，所有大的值进入 B'，所有小的值进入 A'。
您现在有两个新的双调序列 A' 和 B'，其良好的属性是 A' 中的每个元素 <= to every element in B', also, any duplicate elements between A' and B', must be equal in max(A) and min(B), so these are easy to detect.

图形化：

     /      /       \                       \/                      \  /
    /      /         \                      /\ ->  /                 \
a: /   b: /   rev.b:  \  a' = min(a,rev.b) /  \   / \ b' = max(a,b):

经过 minmax 分区后，您只需 5 个步骤（每 32 个元素）即可轻松对 A' 和 B' 进行排序（即：log_2(n) 步骤），这是最佳的。

下面的代码演示了单个扭曲的双调合并排序（为了保持简单）。
如果你想对更大的数组进行排序，你首先让每个 warp 对其自己的 32 部分进行排序，写入结果（在共享内存中），然后让每个 warp 组合两个排序序列，如下所示。

如果你有大数组，小键，那么基数排序可能比双调合并排序更有效。

#include <stdio.h>
#include <cuda.h>

__device__ int lanemask_eq() {
    return 1 << (threadIdx.x & 31);
}

//Bittonic sort on a warp
//https://on-demand.gputechconf.com/gtc/2013/presentations/S3174-Kepler-Shuffle-Tips-Tricks.pdf
template <typename T>
__device__ float swap(const T x, const int mask, const bool dir) {
    const T y = __shfl_xor_sync(-1u, x, mask);
    const T result = (x < y) == dir ? y : x;
    return result;
}

//Bittonic sort on a warp
//on-demand.gputechconf.com/gtc/2013/presentations/S3174-Kepler-Shuffle-Tips-Tricks.pdf
template <typename T, bool reverse = false, int MaxLength = 32>
__device__ T WarpBitonicSortRegisters(T x) {
    constexpr auto rev = uint32_t(-reverse);
    const auto lanebit = lanemask_eq();
    static_assert(sizeof(T) == sizeof(int), "must be int sized");
    x = swap(x, 1, (0x66666666 & lanebit));


    //Part1: create a bitonic sequence
    //see: https://en.wikipedia.org/wiki/Bitonic_sorter
    if constexpr (MaxLength > 4) {
        x = swap(x, 2, (0x3C3C3C3C & lanebit));
        x = swap(x, 1, (0x5A5A5A5A & lanebit));

        if constexpr (MaxLength > 8) {
            x = swap(x, 4, (0x0FF00FF0 & lanebit));
            x = swap(x, 2, (0x33CC33CC & lanebit));
            x = swap(x, 1, (0x55AA55AA & lanebit));

            if constexpr (MaxLength > 16) {
                x = swap(x, 8, (0x00FFFF00 & lanebit));
                x = swap(x, 4, (0x0F0FF0F0 & lanebit));
                x = swap(x, 2, (0x3333CCCC & lanebit));
                x = swap(x, 1, (0x5555AAAA & lanebit));

             //Step 2: turn a bitonic sequence into a sorted one.   

                x = swap(x, 16, ((0xFFFF0000 ^ rev) & lanebit));
            }
            x = swap(x, 8, ((0xFF00FF00 ^ rev) & lanebit));
        }
        x = swap(x, 4, ((0xF0F0F0F0 ^ rev) & lanebit));
    }
    x = swap(x, 2, ((0xCCCCCCCC ^ rev) & lanebit));
    x = swap(x, 1, ((0xAAAAAAAA ^ rev) & lanebit));
    return x;
}

template <typename T, bool reverse = false, int MaxLength = 32>
__device__ T WarpBitonicSortRegisters2(T x) {
    constexpr auto rev = uint32_t(-reverse);
    const auto lanebit = lanemask_eq();
    static_assert(sizeof(T) == sizeof(int), "must be int sized");
    
    //Part1: create a bitonic sequence
    //see: https://en.wikipedia.org/wiki/Bitonic_sorter
    if constexpr (MaxLength > 4) {
        if constexpr (MaxLength > 8) {
            if constexpr (MaxLength > 16) {
                //Step 2: turn a bitonic sequence into a sorted one.   
                x = swap(x, 16, ((0xFFFF0000 ^ rev) & lanebit));
            }
            x = swap(x, 8, ((0xFF00FF00 ^ rev) & lanebit));
        }
        x = swap(x, 4, ((0xF0F0F0F0 ^ rev) & lanebit));
    }
    x = swap(x, 2, ((0xCCCCCCCC ^ rev) & lanebit));
    x = swap(x, 1, ((0xAAAAAAAA ^ rev) & lanebit));
    return x;
}


template <typename T>
__device__ void mergeSort(T& a, T& b) {
    static_assert(sizeof(int) == sizeof(T));
    assert(__shfl_down_sync(-1u, a, 1) >= a);
    assert(__shfl_down_sync(-1u, b, 1) >= b);
    b = __shfl_xor_sync(-1u, b, 31); //reverse the order of B: asc -> desc
    const auto dir = a < b;
    a = dir? a : b;
    b = dir? b : a;
    //every element in A will now be <= to every element in B
    //and both A and B will be bitonic sequences, 
    //meaning we can sort them in only 5 steps
    a = WarpBitonicSortRegisters2(a);
    b = WarpBitonicSortRegisters2(b);
}

__global__ void sortOneWarp(int* ddata) {
    const auto laneid = threadIdx.x & 31;
    const auto Old = ddata[laneid];
    auto A = WarpBitonicSortRegisters(Old);
    const auto Old2 = ddata[laneid + 32];
    auto B = WarpBitonicSortRegisters(Old2);
    mergeSort(A, B);
    printf("tid: %2i, new1: %i, old1: %i\n", threadIdx.x, A, Old);
    printf("tid: %2i, new2: %i, old2: %i\n", threadIdx.x+32, B, Old2);
}



int main() {
  int data[64];
  //for (auto start = 32; auto& i: data) { i = start--; }
  for (auto& i: data) { i = rand(); }
  int* ddata;
  cudaMalloc(&ddata, sizeof(data));
  cudaMemcpy(ddata, &data[0], sizeof(data), cudaMemcpyHostToDevice);
  sortOneWarp<<<1,32>>>(ddata);
  return cudaDeviceSynchronize();
}

在 CUDA 中，如何进行合并排序将一个共享变量传递到另一个共享变量

问题描述投票：0回答：1

1个回答

最新问题

在 CUDA 中，如何进行合并排序将一个共享变量传递到另一个共享变量

问题描述 投票：0回答：1

1个回答

最新问题

问题描述投票：0回答：1