Shuffle instruction based warp reduction is expected to perform faster reduction than reduction using shared memory or global memory,如Faster Parallel Reductions on Kepler and CUDA Pro Tip: Do The Kepler Shuffle
在下面的代码中,我试图验证这一点:
#include "cuda_runtime.h"
#include "device_launch_parameters.h"
#include <cuda_profiler_api.h>
#include <stdio.h>
#include <stdio.h>
#include <stdlib.h>
#include <cuda_runtime.h>
__inline__ __device__
float warpReduceSum(float val) {
for (int offset = 16; offset > 0; offset /= 2)
val += __shfl_down(val, offset);
return val;
}
__inline__ __device__
float blockReduceSum(float val) {
static __shared__ int shared[32];
int lane = threadIdx.x%32;
int wid = threadIdx.x / 32;
val = warpReduceSum(val);
//write reduced value to shared memory
if (lane == 0) shared[wid] = val;
__syncthreads();
//ensure we only grab a value from shared memory if that warp existed
val = (threadIdx.x<blockDim.x / 32) ? shared[lane] : int(0);
if (wid == 0) val = warpReduceSum(val);
return val;
}
__global__ void device_reduce_stable_kernel(float *in, float* out, int N) {
float sum = int(0);
//printf("value = %d ", blockDim.x*gridDim.x);
for (int i = blockIdx.x*blockDim.x + threadIdx.x; i<N; i += blockDim.x*gridDim.x) {
sum += in[i];
}
sum = blockReduceSum(sum);
if (threadIdx.x == 0)
out[blockIdx.x] = sum;
}
void device_reduce_stable(float *in, float* out, int N) {
//int threads = 512;
//int blocks = min((N + threads - 1) / threads, 1024);
const int maxThreadsPerBlock = 1024;
int threads = maxThreadsPerBlock;
int blocks = N / maxThreadsPerBlock;
device_reduce_stable_kernel << <blocks, threads >> >(in, out, N);
cudaError_t err = cudaGetLastError();
if (err != cudaSuccess)
printf("Error: %s\n", cudaGetErrorString(err));
device_reduce_stable_kernel << <1, 1024 >> >(out, out, blocks);
//cudaError_t err = cudaGetLastError();
if (err != cudaSuccess)
printf("Error: %s\n", cudaGetErrorString(err));
}
__global__ void global_reduce_kernel(float * d_out, float * d_in)
{
int myId = threadIdx.x + blockDim.x * blockIdx.x;
int tid = threadIdx.x;
// do reduction in global mem
for (unsigned int s = blockDim.x / 2; s > 0; s >>= 1)
{
if (tid < s)
{
d_in[myId] += d_in[myId + s];
}
__syncthreads(); // make sure all adds at one stage are done!
}
// only thread 0 writes result for this block back to global mem
if (tid == 0)
{
d_out[blockIdx.x] = d_in[myId];
}
}
__global__ void shmem_reduce_kernel(float * d_out, const float * d_in)
{
// sdata is allocated in the kernel call: 3rd arg to <<<b, t, shmem>>>
extern __shared__ float sdata[];
int myId = threadIdx.x + blockDim.x * blockIdx.x;
int tid = threadIdx.x;
// load shared mem from global mem
sdata[tid] = d_in[myId];
__syncthreads(); // make sure entire block is loaded!
// do reduction in shared mem
for (unsigned int s = blockDim.x / 2; s > 0; s >>= 1)
{
if (tid < s)
{
sdata[tid] += sdata[tid + s];
}
__syncthreads(); // make sure all adds at one stage are done!
}
// only thread 0 writes result for this block back to global mem
if (tid == 0)
{
d_out[blockIdx.x] = sdata[0];
}
}
void reduce(float * d_out, float * d_intermediate, float * d_in,
int size, bool usesSharedMemory)
{
// assumes that size is not greater than maxThreadsPerBlock^2
// and that size is a multiple of maxThreadsPerBlock
const int maxThreadsPerBlock = 1024;
int threads = maxThreadsPerBlock;
int blocks = size / maxThreadsPerBlock;
if (usesSharedMemory)
{
shmem_reduce_kernel << <blocks, threads, threads * sizeof(float) >> >
(d_intermediate, d_in);
cudaError_t err = cudaGetLastError();
if (err != cudaSuccess)
printf("Error: %s\n", cudaGetErrorString(err));
}
else
{
global_reduce_kernel << <blocks, threads >> >
(d_intermediate, d_in);
cudaError_t err = cudaGetLastError();
if (err != cudaSuccess)
printf("Error: %s\n", cudaGetErrorString(err));
}
// now we're down to one block left, so reduce it
threads = blocks; // launch one thread for each block in prev step
blocks = 1;
if (usesSharedMemory)
{
shmem_reduce_kernel << <blocks, threads, threads * sizeof(float) >> >
(d_out, d_intermediate);
cudaError_t err = cudaGetLastError();
if (err != cudaSuccess)
printf("Error: %s\n", cudaGetErrorString(err));
}
else
{
global_reduce_kernel << <blocks, threads >> >
(d_out, d_intermediate);
cudaError_t err = cudaGetLastError();
if (err != cudaSuccess)
printf("Error: %s\n", cudaGetErrorString(err));
}
}
int main()
{
/*int deviceCount;
cudaGetDeviceCount(&deviceCount);
if (deviceCount == 0) {
fprintf(stderr, "error: no devices supporting CUDA.\n");
exit(EXIT_FAILURE);
}
int dev = 0;
cudaSetDevice(dev);
cudaDeviceProp devProps;
if (cudaGetDeviceProperties(&devProps, dev) == 0)
{
printf("Using device %d:\n", dev);
printf("%s; global mem: %dB; compute v%d.%d; clock: %d kHz\n",
devProps.name, (int)devProps.totalGlobalMem,
(int)devProps.major, (int)devProps.minor,
(int)devProps.clockRate);
}
*/
const int ARRAY_SIZE = 2048;
const int ARRAY_BYTES = ARRAY_SIZE * sizeof(float);
// generate the input array on the host
float h_in[ARRAY_SIZE];
float sum = 0.0f;
for (int i = 0; i < ARRAY_SIZE; i++) {
// generate random float in [-1.0f, 1.0f]
h_in[i] = i;
sum += h_in[i];
}
// declare GPU memory pointers
float * d_in, *d_intermediate, *d_out;
// allocate GPU memory
cudaMalloc((void **)&d_in, ARRAY_BYTES);
cudaMalloc((void **)&d_intermediate, ARRAY_BYTES); // overallocated
cudaMalloc((void **)&d_out, sizeof(float));
// transfer the input array to the GPU
cudaMemcpy(d_in, h_in, ARRAY_BYTES, cudaMemcpyHostToDevice);
int whichKernel = 2;
cudaEvent_t start, stop;
cudaEventCreate(&start);
cudaEventCreate(&stop);
// launch the kernel
cudaProfilerStart();
switch (whichKernel) {
case 0:
printf("Running global reduce\n");
cudaEventRecord(start, 0);
//for (int i = 0; i < 100; i++)
//{
reduce(d_out, d_intermediate, d_in, ARRAY_SIZE, false);
//}
cudaEventRecord(stop, 0);
break;
case 1:
printf("Running reduce with shared mem\n");
cudaEventRecord(start, 0);
//for (int i = 0; i < 100; i++)
//{
reduce(d_out, d_intermediate, d_in, ARRAY_SIZE, true);
//}
cudaEventRecord(stop, 0);
break;
case 2:
printf("Running reduce with shuffle instruction\n");
cudaEventRecord(start, 0);
/*for (int i = 0; i < 100; i++)
{*/
device_reduce_stable(d_in, d_out, ARRAY_SIZE);
//}
cudaEventRecord(stop, 0);
break;
default:
fprintf(stderr, "error: ran no kernel\n");
exit(EXIT_FAILURE);
}
cudaProfilerStop();
cudaEventSynchronize(stop);
float elapsedTime;
cudaEventElapsedTime(&elapsedTime, start, stop);
elapsedTime /= 100.0f; // 100 trials
// copy back the sum from GPU
float h_out;
cudaMemcpy(&h_out, d_out, sizeof(float), cudaMemcpyDeviceToHost);
printf("average time elapsed: %f\n", elapsedTime);
// free GPU memory allocation
cudaFree(d_in);
cudaFree(d_intermediate);
cudaFree(d_out);
return 0;
}
结果表明,基于 warp 的归约花费的时间几乎是基于共享内存的归约的两倍。这些结果与预期的行为相矛盾。
实验在计算能力高于 3.0 的 Tesla K40c 上进行。
我正在比较以下两个缩减内核,一个使用仅共享内存,而在最后一个 warp 缩减阶段不使用 warp shuffling(
version4
),一个使用共享内存和 warp shuffling 最后一个 warp reduction stage( version5
).
version4
template <class T>
__global__ void version4(T *g_idata, T *g_odata, unsigned int N)
{
extern __shared__ T sdata[];
unsigned int tid = threadIdx.x; // --- Local thread index
unsigned int i = blockIdx.x * (blockDim.x * 2) + threadIdx.x; // --- Global thread index - Fictitiously double the block dimension
// --- Performs the first level of reduction in registers when reading from global memory.
T mySum = (i < N) ? g_idata[i] : 0;
if (i + blockDim.x < N) mySum += g_idata[i + blockDim.x];
sdata[tid] = mySum;
// --- Before going further, we have to make sure that all the shared memory loads have been completed
__syncthreads();
// --- Reduction in shared memory. Only half of the threads contribute to reduction.
for (unsigned int s = blockDim.x / 2; s > 32; s >>= 1)
{
if (tid < s) { sdata[tid] = mySum = mySum + sdata[tid + s]; }
// --- At the end of each iteration loop, we have to make sure that all memory operations have been completed
__syncthreads();
}
// --- Single warp reduction by loop unrolling. Assuming blockDim.x >64
if (tid < 32) {
sdata[tid] = mySum = mySum + sdata[tid + 32]; __syncthreads();
sdata[tid] = mySum = mySum + sdata[tid + 16]; __syncthreads();
sdata[tid] = mySum = mySum + sdata[tid + 8]; __syncthreads();
sdata[tid] = mySum = mySum + sdata[tid + 4]; __syncthreads();
sdata[tid] = mySum = mySum + sdata[tid + 2]; __syncthreads();
sdata[tid] = mySum = mySum + sdata[tid + 1]; __syncthreads();
}
// --- Write result for this block to global memory. At the end of the kernel, global memory will contain the results for the summations of
// individual blocks
if (tid == 0) g_odata[blockIdx.x] = mySum;
}
version5
template <class T>
__global__ void version5(T *g_idata, T *g_odata, unsigned int N)
{
extern __shared__ T sdata[];
unsigned int tid = threadIdx.x; // --- Local thread index
unsigned int i = blockIdx.x * (blockDim.x * 2) + threadIdx.x; // --- Global thread index - Fictitiously double the block dimension
// --- Performs the first level of reduction in registers when reading from global memory.
T mySum = (i < N) ? g_idata[i] : 0;
if (i + blockDim.x < N) mySum += g_idata[i + blockDim.x];
sdata[tid] = mySum;
// --- Before going further, we have to make sure that all the shared memory loads have been completed
__syncthreads();
// --- Reduction in shared memory. Only half of the threads contribute to reduction.
for (unsigned int s = blockDim.x / 2; s > 32; s >>= 1)
{
if (tid < s) { sdata[tid] = mySum = mySum + sdata[tid + s]; }
// --- At the end of each iteration loop, we have to make sure that all memory operations have been completed
__syncthreads();
}
// --- Single warp reduction by shuffle operations
if (tid < 32)
{
// --- Last iteration removed from the for loop, but needed for shuffle reduction
mySum += sdata[tid + 32];
// --- Reduce final warp using shuffle
//for (int offset = warpSize / 2; offset > 0; offset /= 2) mySum += __shfl_down_sync(0xffffffff, mySum, offset);
for (int offset=1; offset < warpSize; offset *= 2) mySum += __shfl_xor_sync(0xffffffff, mySum, i);
}
// --- Write result for this block to global memory. At the end of the kernel, global memory will contain the results for the summations of
// individual blocks
if (tid == 0) g_odata[blockIdx.x] = mySum;
}
我确认两者之间没有敏感差异。在我的 GTX920M 卡上,时间如下:
N = 33554432
version4 = 27.5ms
version5 = 27.095ms
所以,我确认罗伯特上面的评论。