CUDA shuffle 指令减少比共享内存减少慢？

Question

Shuffle instruction based warp reduction is expected to perform faster reduction than reduction using shared memory or global memory,如Faster Parallel Reductions on Kepler and CUDA Pro Tip: Do The Kepler Shuffle

在下面的代码中，我试图验证这一点：

#include "cuda_runtime.h"
#include "device_launch_parameters.h"
#include <cuda_profiler_api.h>
#include <stdio.h>
#include <stdio.h>
#include <stdlib.h>
#include <cuda_runtime.h>


__inline__ __device__
float warpReduceSum(float val) {
    for (int offset = 16; offset > 0; offset /= 2)
        val += __shfl_down(val, offset);
    return val;
}

__inline__ __device__
float blockReduceSum(float val) {
    static __shared__ int shared[32];
    int lane = threadIdx.x%32;
    int wid = threadIdx.x / 32;
    val = warpReduceSum(val);

    //write reduced value to shared memory
    if (lane == 0) shared[wid] = val;
    __syncthreads();

    //ensure we only grab a value from shared memory if that warp existed
    val = (threadIdx.x<blockDim.x / 32) ? shared[lane] : int(0);
    if (wid == 0) val = warpReduceSum(val);

    return val;
}

__global__ void device_reduce_stable_kernel(float *in, float* out, int N) {
    float sum = int(0);
    //printf("value = %d ", blockDim.x*gridDim.x);
    for (int i = blockIdx.x*blockDim.x + threadIdx.x; i<N; i += blockDim.x*gridDim.x) {
        sum += in[i];
    }
    sum = blockReduceSum(sum);
    if (threadIdx.x == 0)
        out[blockIdx.x] = sum;
}

void device_reduce_stable(float *in, float* out, int N) {
    //int threads = 512;
    //int blocks = min((N + threads - 1) / threads, 1024);
    const int maxThreadsPerBlock = 1024;
    int threads = maxThreadsPerBlock;
    int blocks = N / maxThreadsPerBlock;
    device_reduce_stable_kernel << <blocks, threads >> >(in, out, N);
    cudaError_t err = cudaGetLastError();
    if (err != cudaSuccess)
        printf("Error: %s\n", cudaGetErrorString(err));

    device_reduce_stable_kernel << <1, 1024 >> >(out, out, blocks);
    //cudaError_t err = cudaGetLastError();
    if (err != cudaSuccess)
        printf("Error: %s\n", cudaGetErrorString(err));
}

__global__ void global_reduce_kernel(float * d_out, float * d_in)
{
    int myId = threadIdx.x + blockDim.x * blockIdx.x;
    int tid = threadIdx.x;

    // do reduction in global mem
    for (unsigned int s = blockDim.x / 2; s > 0; s >>= 1)
    {
        if (tid < s)
        {
            d_in[myId] += d_in[myId + s];
        }
        __syncthreads();        // make sure all adds at one stage are done!
    }

    // only thread 0 writes result for this block back to global mem
    if (tid == 0)
    {
        d_out[blockIdx.x] = d_in[myId];
    }
}

__global__ void shmem_reduce_kernel(float * d_out, const float * d_in)
{
    // sdata is allocated in the kernel call: 3rd arg to <<<b, t, shmem>>>
    extern __shared__ float sdata[];

    int myId = threadIdx.x + blockDim.x * blockIdx.x;
    int tid = threadIdx.x;

    // load shared mem from global mem
    sdata[tid] = d_in[myId];
    __syncthreads();            // make sure entire block is loaded!

    // do reduction in shared mem
    for (unsigned int s = blockDim.x / 2; s > 0; s >>= 1)
    {
        if (tid < s)
        {
            sdata[tid] += sdata[tid + s];
        }
        __syncthreads();        // make sure all adds at one stage are done!
    }

    // only thread 0 writes result for this block back to global mem
    if (tid == 0)
    {
        d_out[blockIdx.x] = sdata[0];
    }
}

void reduce(float * d_out, float * d_intermediate, float * d_in,
    int size, bool usesSharedMemory)
{
    // assumes that size is not greater than maxThreadsPerBlock^2
    // and that size is a multiple of maxThreadsPerBlock
    const int maxThreadsPerBlock = 1024;
    int threads = maxThreadsPerBlock;
    int blocks = size / maxThreadsPerBlock;
    if (usesSharedMemory)
    {
        shmem_reduce_kernel << <blocks, threads, threads * sizeof(float) >> >
            (d_intermediate, d_in);
        cudaError_t err = cudaGetLastError();
        if (err != cudaSuccess)
            printf("Error: %s\n", cudaGetErrorString(err));
    }
    else
    {
        global_reduce_kernel << <blocks, threads >> >
            (d_intermediate, d_in);
        cudaError_t err = cudaGetLastError();
        if (err != cudaSuccess)
            printf("Error: %s\n", cudaGetErrorString(err));
    }
    // now we're down to one block left, so reduce it
    threads = blocks; // launch one thread for each block in prev step
    blocks = 1;
    if (usesSharedMemory)
    {
        shmem_reduce_kernel << <blocks, threads, threads * sizeof(float) >> >
            (d_out, d_intermediate);
        cudaError_t err = cudaGetLastError();
        if (err != cudaSuccess)
            printf("Error: %s\n", cudaGetErrorString(err));
    }
    else
    {
        global_reduce_kernel << <blocks, threads >> >
            (d_out, d_intermediate);
        cudaError_t err = cudaGetLastError();
        if (err != cudaSuccess)
            printf("Error: %s\n", cudaGetErrorString(err));
    }
}

int main()
{
    /*int deviceCount;
    cudaGetDeviceCount(&deviceCount);
    if (deviceCount == 0) {
        fprintf(stderr, "error: no devices supporting CUDA.\n");
        exit(EXIT_FAILURE);
    }
    int dev = 0;
    cudaSetDevice(dev);

    cudaDeviceProp devProps;
    if (cudaGetDeviceProperties(&devProps, dev) == 0)
    {
        printf("Using device %d:\n", dev);
        printf("%s; global mem: %dB; compute v%d.%d; clock: %d kHz\n",
            devProps.name, (int)devProps.totalGlobalMem,
            (int)devProps.major, (int)devProps.minor,
            (int)devProps.clockRate);
    }
*/
    const int ARRAY_SIZE = 2048;
    const int ARRAY_BYTES = ARRAY_SIZE * sizeof(float);

    // generate the input array on the host
    float h_in[ARRAY_SIZE];
    float sum = 0.0f;
    for (int i = 0; i < ARRAY_SIZE; i++) {
        // generate random float in [-1.0f, 1.0f]
        h_in[i] = i;
        sum += h_in[i];
    }

    // declare GPU memory pointers
    float * d_in, *d_intermediate, *d_out;

    // allocate GPU memory
    cudaMalloc((void **)&d_in, ARRAY_BYTES);
    cudaMalloc((void **)&d_intermediate, ARRAY_BYTES); // overallocated
    cudaMalloc((void **)&d_out, sizeof(float));

    // transfer the input array to the GPU
    cudaMemcpy(d_in, h_in, ARRAY_BYTES, cudaMemcpyHostToDevice);

    int whichKernel = 2;
    
    cudaEvent_t start, stop;
    cudaEventCreate(&start);
    cudaEventCreate(&stop);
    // launch the kernel
    cudaProfilerStart();
    switch (whichKernel) {
    case 0:
        printf("Running global reduce\n");
        cudaEventRecord(start, 0);
        //for (int i = 0; i < 100; i++)
        //{
            reduce(d_out, d_intermediate, d_in, ARRAY_SIZE, false);
        //}
        cudaEventRecord(stop, 0);
        break;
    case 1:
        printf("Running reduce with shared mem\n");
        cudaEventRecord(start, 0);
        //for (int i = 0; i < 100; i++)
        //{
            reduce(d_out, d_intermediate, d_in, ARRAY_SIZE, true);
        //}
        cudaEventRecord(stop, 0);
        break;
    case 2:
        printf("Running reduce with shuffle instruction\n");
        cudaEventRecord(start, 0);
        /*for (int i = 0; i < 100; i++)
        {*/
            device_reduce_stable(d_in, d_out, ARRAY_SIZE);
        //}
        cudaEventRecord(stop, 0);
        break;
    default:
        fprintf(stderr, "error: ran no kernel\n");
        exit(EXIT_FAILURE);
    }
    cudaProfilerStop();
    cudaEventSynchronize(stop);
    float elapsedTime;
    cudaEventElapsedTime(&elapsedTime, start, stop);
    elapsedTime /= 100.0f;      // 100 trials

    // copy back the sum from GPU
    float h_out;
    cudaMemcpy(&h_out, d_out, sizeof(float), cudaMemcpyDeviceToHost);

    printf("average time elapsed: %f\n", elapsedTime);

    // free GPU memory allocation
    cudaFree(d_in);
    cudaFree(d_intermediate);
    cudaFree(d_out);

    return 0;
}

结果表明，基于 warp 的归约花费的时间几乎是基于共享内存的归约的两倍。这些结果与预期的行为相矛盾。
实验在计算能力高于 3.0 的 Tesla K40c 上进行。

Answer 1

我正在比较以下两个缩减内核，一个使用仅共享内存，而在最后一个 warp 缩减阶段不使用 warp shuffling（

version4

），一个使用共享内存和 warp shuffling 最后一个 warp reduction stage（

version5

).

version4

template <class T>
__global__ void version4(T *g_idata, T *g_odata, unsigned int N)
{
    extern __shared__ T sdata[];

    unsigned int tid = threadIdx.x;                                 // --- Local thread index
    unsigned int i = blockIdx.x * (blockDim.x * 2) + threadIdx.x;   // --- Global thread index - Fictitiously double the block dimension

    // --- Performs the first level of reduction in registers when reading from global memory. 
    T mySum = (i < N) ? g_idata[i] : 0;
    if (i + blockDim.x < N) mySum += g_idata[i + blockDim.x];
    sdata[tid] = mySum;

    // --- Before going further, we have to make sure that all the shared memory loads have been completed
    __syncthreads();

    // --- Reduction in shared memory. Only half of the threads contribute to reduction.
    for (unsigned int s = blockDim.x / 2; s > 32; s >>= 1)
    {
        if (tid < s) { sdata[tid] = mySum = mySum + sdata[tid + s]; }
        // --- At the end of each iteration loop, we have to make sure that all memory operations have been completed
        __syncthreads();
    }

    // --- Single warp reduction by loop unrolling. Assuming blockDim.x >64
    if (tid < 32) {
        sdata[tid] = mySum = mySum + sdata[tid + 32]; __syncthreads();
        sdata[tid] = mySum = mySum + sdata[tid + 16]; __syncthreads();
        sdata[tid] = mySum = mySum + sdata[tid + 8]; __syncthreads();
        sdata[tid] = mySum = mySum + sdata[tid + 4]; __syncthreads();
        sdata[tid] = mySum = mySum + sdata[tid + 2]; __syncthreads();
        sdata[tid] = mySum = mySum + sdata[tid + 1]; __syncthreads();
    }

    // --- Write result for this block to global memory. At the end of the kernel, global memory will contain the results for the summations of
    //     individual blocks
    if (tid == 0) g_odata[blockIdx.x] = mySum;
}

version5

template <class T>
__global__ void version5(T *g_idata, T *g_odata, unsigned int N)
{
    extern __shared__ T sdata[];

    unsigned int tid = threadIdx.x;                                 // --- Local thread index
    unsigned int i = blockIdx.x * (blockDim.x * 2) + threadIdx.x;   // --- Global thread index - Fictitiously double the block dimension

    // --- Performs the first level of reduction in registers when reading from global memory. 
    T mySum = (i < N) ? g_idata[i] : 0;
    if (i + blockDim.x < N) mySum += g_idata[i + blockDim.x];
    sdata[tid] = mySum;

    // --- Before going further, we have to make sure that all the shared memory loads have been completed
    __syncthreads();

    // --- Reduction in shared memory. Only half of the threads contribute to reduction.
    for (unsigned int s = blockDim.x / 2; s > 32; s >>= 1)
    {
        if (tid < s) { sdata[tid] = mySum = mySum + sdata[tid + s]; }
        // --- At the end of each iteration loop, we have to make sure that all memory operations have been completed
        __syncthreads();
    }

    // --- Single warp reduction by shuffle operations
    if (tid < 32)
    {
        // --- Last iteration removed from the for loop, but needed for shuffle reduction
        mySum += sdata[tid + 32];
        // --- Reduce final warp using shuffle
        //for (int offset = warpSize / 2; offset > 0; offset /= 2) mySum += __shfl_down_sync(0xffffffff, mySum, offset);
        for (int offset=1; offset < warpSize; offset *= 2) mySum += __shfl_xor_sync(0xffffffff, mySum, i);
    }

    // --- Write result for this block to global memory. At the end of the kernel, global memory will contain the results for the summations of
    //     individual blocks
    if (tid == 0) g_odata[blockIdx.x] = mySum;
}

我确认两者之间没有敏感差异。在我的 GTX920M 卡上，时间如下：

N = 33554432
version4 = 27.5ms
version5 = 27.095ms

所以，我确认罗伯特上面的评论。

CUDA shuffle 指令减少比共享内存减少慢？

问题描述投票：0回答：1

1个回答

最新问题

CUDA shuffle 指令减少比共享内存减少慢？

问题描述 投票：0回答：1

1个回答

最新问题

问题描述投票：0回答：1