我写了一些 cuda 代码,从全局内存加载到寄存器中,对 16 个值进行排序。 然后为了合并它们,我将它们写入共享数组:
// Loads 16 u32 values per thread from global memory into registers, sorts
// them (the per-thread sorting step is elided at "..."), then stages all
// runs in shared memory for the later merge passes.
// NOTE(review): incomplete question fragment — no closing brace shown, and
// parameter n is unused in the visible code.
__global__ void sort16(u32* arr, u32 n) {
// Two 512-element buffers so merge passes can ping-pong source -> dest.
// 2 x 2 kB = 4 kB of shared memory per block.
__shared__ u32 tmp[16*32]; // 512 elements, 2kB shared memory, hopefully fits?
__shared__ u32 tmp2[16*32]; // 512 elements, 2kB shared memory, hopefully fits?
u32 tid = threadIdx.x;
// NOTE(review): loc depends only on blockIdx.x, so every thread of this
// block reads the SAME 16 global elements, while tmp_loc below scatters
// them to distinct shared slots. Presumably this was meant to be
// (blockIdx.x * blockDim.x + tid) * 16 — confirm the intended layout.
u32 loc = blockIdx.x * 16;
// Each thread owns a private 16-slot window of the shared staging buffer.
u32 tmp_loc = tid * 16;
u32 a0 = arr[loc+0], a1 = arr[loc+1], a2 = arr[loc+2], a3 = arr[loc+3],
a4 = arr[loc+4], a5 = arr[loc+5], a6 = arr[loc+6], a7 = arr[loc+7];
u32 a8 = arr[loc+8], a9 = arr[loc+9], a10 = arr[loc+10], a11 = arr[loc+11],
a12 = arr[loc+12], a13 = arr[loc+13], a14 = arr[loc+14], a15 = arr[loc+15];
...
// Write the (sorted) 16 register values into this thread's shared slice.
tmp[tmp_loc] = a0; tmp[tmp_loc+1] = a1; tmp[tmp_loc+2] = a2; tmp[tmp_loc+3] = a3;
tmp[tmp_loc+4] = a4; tmp[tmp_loc+5] = a5; tmp[tmp_loc+6] = a6; tmp[tmp_loc+7] = a7;
tmp[tmp_loc+8] = a8; tmp[tmp_loc+9] = a9; tmp[tmp_loc+10] = a10; tmp[tmp_loc+11] = a11;
tmp[tmp_loc+12] = a12; tmp[tmp_loc+13] = a13; tmp[tmp_loc+14] = a14; tmp[tmp_loc+15] = a15;
// Make all 512 staged values visible to every thread before merging.
__syncthreads();
但是现在,我想将每个 16 块合并为 32 块,然后将 32 块合并为 64 块,依此类推,直到我的共享存储空间太小。目前,我分配了两个 512 个元素的数组。 我想写入全局内存的最后一次传递。
我不明白如何将线程映射到此操作。我知道如何用循环顺序编写它。在 C 中我会写:
// Sequential 2-way merge of adjacent sorted runs of `size` elements each:
// merges tmp[0..511] pairwise into tmp2, doubling the sorted-run length.
// Intended to be launched single-threaded for now (one thread walks all
// runs); parallelizing the per-run merge is the open question in the text.
// BUG FIXES vs. the question's sketch:
//  - the declaration was missing the `void` return type,
//  - the output cursor `c` was never declared,
//  - the for-loop variable `a` was also used as the left-run cursor, so
//    after the inner merge it had already advanced by `size` and the
//    `a += 2*size` step skipped every other pair of runs.
__global__ void merge2(u32* tmp, u32* tmp2, u32 size) {
    u32 c = 0; // output write cursor into tmp2
    for (u32 base = 0; base < 512; base += 2 * size) {
        u32 a = base;              // cursor into the left run
        u32 b = base + size;       // cursor into the right run
        const u32 enda = base + size, endb = b + size;
        // Standard merge: take the smaller head until one run is exhausted.
        while (a < enda && b < endb) {
            if (tmp[a] < tmp[b])
                tmp2[c++] = tmp[a++];
            else
                tmp2[c++] = tmp[b++];
        }
        // Drain whichever run still has elements left.
        while (a < enda)
            tmp2[c++] = tmp[a++];
        while (b < endb)
            tmp2[c++] = tmp[b++];
    }
}
如何用 CUDA 线程来表达这个?这就是我设想的调用方式,虽然我不知道线程和块的参数应该是什么?
// Host-side sketch of the merge cascade (not valid CUDA yet: the launch
// parameters <<<??, ??>>> are placeholders, and tmp/tmp2 here would need to
// be global-memory buffers — the kernel's __shared__ arrays are not
// addressable from the host).
u32* t = tmp;
u32* t2 = tmp2;
// Double the sorted-run length each pass: 16 -> 32 -> ... -> 512.
for (u32 i = 16; i < 512; i *= 2) {
merge2<<<??, ??>>>(t, t2, i);
// Ping-pong source/destination buffers between passes.
u32* ptrtmp = t;
t = t2;
t2 = ptrtmp;
}
一个问题是,随着块的增长,可用于合并每个块的线程数量会增加一倍。我也可以不使用它们。
GPU 上的合并排序通常是通过合并双调序列来完成的,而双调序列通常由双调排序器排序。
双调排序并不是最快的渐近排序,但它具有非常低的恒定开销,并且实际上是当前 GPU 硬件上最快的排序方法。
下面是示例代码:您可以在 Godbolt 上使用它:https://cuda.godbolt.org/z/3nM4Ma6ss
请注意,您需要 C++17 或更高版本才能运行代码。 nvcc 编译器将内联 GPU 上的所有调用并消除所有 constexpr 表达式。生成的程序集非常快速且紧凑。
NVidia 在以下位置展示了一个很好的双调排序演示:https://on-demand.gputechconf.com/gtc/2013/presentations/S3174-Kepler-Shuffle-Tips-Tricks
双调排序的工作方式是,首先获取任意输入数组并将其转换为双调序列(下面的步骤 1)。然后将该双调序列转换为排序序列。
双调序列是这样的:
/\
/ \
/ 5 elements, first increasing, then decreasing (or the other way 'round)
因为 32 = 2^5,您需要 2x 5 = 10 个步骤从随机序列创建双调序列,还需要 5 个步骤将双调序列转换为排序序列。
一旦你有 2 个已排序序列 (A, B),就把 B 的顺序反转,然后逐位并排比较元素:较大的值进入 B',较小的值进入 A'。
您现在有两个新的双调序列 A' 和 B',它们具有一个很好的性质:A' 中的每个元素都 <= B' 中的每个元素;此外,A' 与 B' 之间如有重复元素,它们必定等于 max(A') 和 min(B'),因此很容易检测。
图形化:
/ / \ \/ \ /
/ / \ /\ -> / \
a: / b: / rev.b: \ a' = min(a,rev.b) / \ / \ b' = max(a,b):
经过 minmax 分区后,您只需 5 个步骤(每 32 个元素)即可轻松对 A' 和 B' 进行排序(即:log_2(n) 步骤),这是最佳的。
下面的代码演示了单个 warp(线程束)上的双调合并排序(为了保持简单)。
如果你想对更大的数组进行排序,你首先让每个 warp 对其自己的 32 部分进行排序,写入结果(在共享内存中),然后让每个 warp 组合两个排序序列,如下所示。
如果你有大数组,小键,那么基数排序可能比双调合并排序更有效。
#include <stdio.h>
#include <cuda.h>
//Returns a mask with only this thread's lane bit set (bit index = lane id).
__device__ int lanemask_eq() {
    const int lane = threadIdx.x % 32;
    return 1 << lane;
}
//Bitonic compare-exchange step within a warp.
//https://on-demand.gputechconf.com/gtc/2013/presentations/S3174-Kepler-Shuffle-Tips-Tricks.pdf
//Each lane exchanges its value with the lane whose id differs in `mask`,
//then keeps either the smaller or larger of the pair depending on `dir`.
template <typename T>
__device__ T swap(const T x, const int mask, const bool dir) {
    //Fetch the partner lane's value (lane id XOR mask); full-warp mask.
    const T y = __shfl_xor_sync(-1u, x, mask);
    //BUG FIX: the return type was `float`, which silently round-tripped
    //integer values through float (losing precision above 2^24); the
    //result must be returned as T.
    const T result = (x < y) == dir ? y : x;
    return result;
}
//Bitonic sort of one value per lane across a full warp (up to 32 lanes).
//After the call, lane i holds the i-th smallest value (ascending), or the
//i-th largest when reverse = true. MaxLength limits the network depth for
//shorter sequences.
//on-demand.gputechconf.com/gtc/2013/presentations/S3174-Kepler-Shuffle-Tips-Tricks.pdf
template <typename T, bool reverse = false, int MaxLength = 32>
__device__ T WarpBitonicSortRegisters(T x) {
//All-ones when reverse, all-zeros otherwise; XORed into the direction
//masks of the final merge stages to flip the overall sort order.
constexpr auto rev = uint32_t(-reverse);
//Single bit identifying this thread's lane; ANDing it with a 32-bit
//direction constant selects this lane's compare direction per stage.
const auto lanebit = lanemask_eq();
static_assert(sizeof(T) == sizeof(int), "must be int sized");
//Part1: create a bitonic sequence from the arbitrary input.
//see: https://en.wikipedia.org/wiki/Bitonic_sorter
x = swap(x, 1, (0x66666666 & lanebit));
if constexpr (MaxLength > 4) {
x = swap(x, 2, (0x3C3C3C3C & lanebit));
x = swap(x, 1, (0x5A5A5A5A & lanebit));
if constexpr (MaxLength > 8) {
x = swap(x, 4, (0x0FF00FF0 & lanebit));
x = swap(x, 2, (0x33CC33CC & lanebit));
x = swap(x, 1, (0x55AA55AA & lanebit));
if constexpr (MaxLength > 16) {
x = swap(x, 8, (0x00FFFF00 & lanebit));
x = swap(x, 4, (0x0F0FF0F0 & lanebit));
x = swap(x, 2, (0x3333CCCC & lanebit));
x = swap(x, 1, (0x5555AAAA & lanebit));
//Step 2: turn the bitonic sequence into a sorted one (merge stages;
//note how the remaining swaps fall through the closing braces so the
//tail of the network is shared across all MaxLength configurations).
x = swap(x, 16, ((0xFFFF0000 ^ rev) & lanebit));
}
x = swap(x, 8, ((0xFF00FF00 ^ rev) & lanebit));
}
x = swap(x, 4, ((0xF0F0F0F0 ^ rev) & lanebit));
}
x = swap(x, 2, ((0xCCCCCCCC ^ rev) & lanebit));
x = swap(x, 1, ((0xAAAAAAAA ^ rev) & lanebit));
return x;
}
//Merge phase only: turns a value that is already part of a bitonic
//sequence across the warp into its sorted position, in log2(32) = 5
//compare-exchange steps. Same tail as WarpBitonicSortRegisters, with the
//bitonic-construction stages omitted — callers (see mergeSort) must
//guarantee the input sequence is bitonic.
template <typename T, bool reverse = false, int MaxLength = 32>
__device__ T WarpBitonicSortRegisters2(T x) {
//All-ones when reverse, all-zeros otherwise; flips the sort direction.
constexpr auto rev = uint32_t(-reverse);
//Single bit for this lane; selects this lane's compare direction.
const auto lanebit = lanemask_eq();
static_assert(sizeof(T) == sizeof(int), "must be int sized");
//Part1: create a bitonic sequence
//see: https://en.wikipedia.org/wiki/Bitonic_sorter
if constexpr (MaxLength > 4) {
if constexpr (MaxLength > 8) {
if constexpr (MaxLength > 16) {
//Step 2: turn a bitonic sequence into a sorted one.
x = swap(x, 16, ((0xFFFF0000 ^ rev) & lanebit));
}
x = swap(x, 8, ((0xFF00FF00 ^ rev) & lanebit));
}
x = swap(x, 4, ((0xF0F0F0F0 ^ rev) & lanebit));
}
x = swap(x, 2, ((0xCCCCCCCC ^ rev) & lanebit));
x = swap(x, 1, ((0xAAAAAAAA ^ rev) & lanebit));
return x;
}
//Merges two warp-sorted 32-element sequences (one value per lane in a and
//b, each ascending across the warp) into one sorted 64-element result:
//afterwards a holds the 32 smallest values and b the 32 largest, both
//sorted ascending.
template <typename T>
__device__ void mergeSort(T& a, T& b) {
    static_assert(sizeof(int) == sizeof(T));
    //Preconditions: both inputs sorted ascending across the warp.
    assert(__shfl_down_sync(-1u, a, 1) >= a);
    assert(__shfl_down_sync(-1u, b, 1) >= b);
    b = __shfl_xor_sync(-1u, b, 31); //reverse the order of B: asc -> desc
    //Elementwise min/max partition of (a, reversed b).
    //BUG FIX: the original wrote `a = dir? a : b; b = dir? b : a;` — the
    //second assignment read the already-updated `a`, so whenever dir was
    //false both a and b ended up holding the old b and the old a was lost.
    const auto dir = a < b;
    const T olda = a;
    a = dir ? a : b;    //a' = min(a, rev.b)
    b = dir ? b : olda; //b' = max(a, rev.b)
    //every element in A will now be <= to every element in B
    //and both A and B will be bitonic sequences,
    //meaning we can sort them in only 5 steps
    a = WarpBitonicSortRegisters2(a);
    b = WarpBitonicSortRegisters2(b);
}
//Demo kernel: one warp sorts 64 ints from ddata.
//Expected launch: <<<1, 32>>>; each lane handles elements lane and lane+32.
__global__ void sortOneWarp(int* ddata) {
    const auto lane = threadIdx.x & 31;
    //Sort the first 32 values within the warp.
    const auto origLow = ddata[lane];
    auto sortedLow = WarpBitonicSortRegisters(origLow);
    //Sort the second 32 values within the warp.
    const auto origHigh = ddata[lane + 32];
    auto sortedHigh = WarpBitonicSortRegisters(origHigh);
    //Combine the two sorted 32-runs into one sorted 64-run.
    mergeSort(sortedLow, sortedHigh);
    printf("tid: %2i, new1: %i, old1: %i\n", threadIdx.x, sortedLow, origLow);
    printf("tid: %2i, new2: %i, old2: %i\n", threadIdx.x+32, sortedHigh, origHigh);
}
//Host driver: fills 64 ints, uploads them, runs the one-warp sort demo,
//and returns non-zero on any CUDA failure.
int main() {
    int data[64];
    //for (auto start = 32; auto& i: data) { i = start--; }
    for (auto& i: data) { i = rand(); }
    int* ddata = nullptr;
    //sizeof(data) is the full array size (64 * sizeof(int)) here, since
    //data is an array, not a pointer.
    if (cudaMalloc(&ddata, sizeof(data)) != cudaSuccess) return 1;
    if (cudaMemcpy(ddata, &data[0], sizeof(data), cudaMemcpyHostToDevice) != cudaSuccess) return 1;
    sortOneWarp<<<1,32>>>(ddata);
    //Catch launch-configuration errors immediately; asynchronous
    //execution errors surface at the synchronize below.
    if (cudaGetLastError() != cudaSuccess) return 1;
    const auto err = cudaDeviceSynchronize();
    cudaFree(ddata); //was leaked in the original
    return err;
}