基于共享存储器的1d模板CUDA实现中的负数组索引

Question

我目前正在使用CUDA编程，我正在尝试从我在网上找到的工作室学习幻灯片，可以找到here。我遇到的问题是幻灯片48。可以在那里找到以下代码：

__global__ void stencil_1d(int *in, int *out) {

    __shared__ int temp[BLOCK_SIZE + 2 * RADIUS];

    int gindex = threadIdx.x + blockIdx.x * blockDim.x;
    int lindex = threadIdx.x + RADIUS;

    // Read input elements into shared memory
    temp[lindex] = in[gindex];
    if (threadIdx.x < RADIUS) {
        temp[lindex - RADIUS] = in[gindex - RADIUS];
        temp[lindex + BLOCK_SIZE] = in[gindex + BLOCK_SIZE];
    }

....

添加一些上下文。我们有一个名为in的数组，长度如N。然后我们有另一个数组out，其长度为N+(2*RADIUS)，其中RADIUS的值为3，用于此特定示例。我们的想法是将数组in复制到数组out中，但是将数组in放置在数组3（即out）开头的out = [RADIUS][in][RADIUS]位置，请参阅幻灯片以进行图形表示。

混乱来自以下几行：

 temp[lindex - RADIUS] = in[gindex - RADIUS];

如果gindex是0那么我们有in[-3]。我们如何读取数组中的负数索引？真的很感激任何帮助。

Answer 1

pQB的答案是正确的。您应该通过RADIUS来偏移输入数组指针。

为了表明这一点，我在下面提供了一个完整的例子。希望它对其他用户有益。

（我会说在共享内存加载后你需要一个__syncthreads()。我在下面的例子中添加了它）。

#include <thrust/device_vector.h>

#define RADIUS      3
#define BLOCKSIZE   32

/*******************/
/* iDivUp FUNCTION */
/*******************/
int iDivUp(int a, int b){ return ((a % b) != 0) ? (a / b + 1) : (a / b); }

/********************/
/* CUDA ERROR CHECK */
/********************/
#define gpuErrchk(ans) { gpuAssert((ans), __FILE__, __LINE__); }
inline void gpuAssert(cudaError_t code, const char *file, int line, bool abort=true)
{
   if (code != cudaSuccess) 
   {
      fprintf(stderr,"GPUassert: %s %s %d\n", cudaGetErrorString(code), file, line);
      if (abort) exit(code);
   }
}

/**********/
/* KERNEL */
/**********/
__global__ void moving_average(unsigned int *in, unsigned int *out, unsigned int N) {

    __shared__ unsigned int temp[BLOCKSIZE + 2 * RADIUS];

    unsigned int gindexx = threadIdx.x + blockIdx.x * blockDim.x;

    unsigned int lindexx = threadIdx.x + RADIUS;

    // --- Read input elements into shared memory
    temp[lindexx] = (gindexx < N)? in[gindexx] : 0;
    if (threadIdx.x < RADIUS) {
        temp[threadIdx.x] = (((gindexx - RADIUS) >= 0)&&(gindexx <= N)) ? in[gindexx - RADIUS] : 0;
        temp[threadIdx.x + (RADIUS + BLOCKSIZE)] = ((gindexx + BLOCKSIZE) < N)? in[gindexx + BLOCKSIZE] : 0;
    }

    __syncthreads();

    // --- Apply the stencil
    unsigned int result = 0;
    for (int offset = -RADIUS ; offset <= RADIUS ; offset++) {
        result += temp[lindexx + offset];
    }

    // --- Store the result
    out[gindexx] = result;
}

/********/
/* MAIN */
/********/
int main() {

    const unsigned int N        = 55 + 2 * RADIUS;

    const unsigned int constant = 4;

    thrust::device_vector<unsigned int> d_in(N, constant);
    thrust::device_vector<unsigned int> d_out(N);

    moving_average<<<iDivUp(N, BLOCKSIZE), BLOCKSIZE>>>(thrust::raw_pointer_cast(d_in.data()), thrust::raw_pointer_cast(d_out.data()), N);
    gpuErrchk(cudaPeekAtLastError());
    gpuErrchk(cudaDeviceSynchronize());

    thrust::host_vector<unsigned int> h_out = d_out;

    for (int i=0; i<N; i++)
        printf("Element i = %i; h_out = %i\n", i, h_out[i]);

    return 0;

}

Answer 2

您假设in数组指向已为此数组分配的内存的第一个位置。但是，如果您看到幻灯片47，则in数组在数据之前和之后具有三个元素的晕圈（橙色框）（表示为绿色立方体）。

我的假设是（我还没有完成研讨会）输入数组首先用晕圈初始化，然后指针在内核调用中移动。就像是：

stencil_1d<<<dimGrid, dimBlock>>>(in + RADIUS, out);

因此，在内核中，执行in[-3]是安全的，因为指针不在数组的开头。

Answer 3

已经有了很好的答案，但要关注引起混淆的实际观点：

在C中（不仅在CUDA中，而且在C中），当您使用[括号]访问“数组”时，您实际上在进行指针算术。

例如，考虑这样的指针：

int* data= ... // Points to some memory

然后你写一个像这样的语句

data[3] = 42;

您只是访问一个“原始data指针后面的三个条目”的内存位置。所以你也可以写

int* data= ... // Points to some memory
int* dataWithOffset = data+3;
dataWithOffset[0] = 42; // This will write into data[3]

因此，

dataWithOffset[-3] = 123; // This will write into data[0]

事实上，你可以说data[i]与*(data+i)相同，*(i+data)与i[data]相同，后者又与相同，但你不应该在真正的程序中使用它...）

Answer 4

我可以编译@ JackOLantern的代码，但是有一个警告：“无符号整数与零的无意义比较”：

在运行时，它会像：#include <thrust/device_vector.h> #define RADIUS 3 #define BLOCKSIZE 32 /*******************/ /* iDivUp FUNCTION */ /*******************/ int iDivUp(int a, int b){ return ((a % b) != 0) ? (a / b + 1) : (a / b); } /********************/ /* CUDA ERROR CHECK */ /********************/ #define gpuErrchk(ans) { gpuAssert((ans), __FILE__, __LINE__); } inline void gpuAssert(cudaError_t code, const char *file, int line, bool abort=true) { if (code != cudaSuccess) { fprintf(stderr,"GPUassert: %s %s %d\n", cudaGetErrorString(code), file, line); if (abort) exit(code); } } /**********/ /* KERNEL */ /**********/ __global__ void moving_average(unsigned int *in, unsigned int *out, int N) { __shared__ unsigned int temp[BLOCKSIZE + 2 * RADIUS]; int gindexx = threadIdx.x + blockIdx.x * blockDim.x; int lindexx = threadIdx.x + RADIUS; // --- Read input elements into shared memory temp[lindexx] = (gindexx < N)? in[gindexx] : 0; if (threadIdx.x < RADIUS) { temp[threadIdx.x] = (((gindexx - RADIUS) >= 0)&&(gindexx <= N)) ? in[gindexx - RADIUS] : 0; temp[threadIdx.x + (RADIUS + BLOCKSIZE)] = ((gindexx + BLOCKSIZE) < N)? in[gindexx + BLOCKSIZE] : 0; } __syncthreads(); // --- Apply the stencil unsigned int result = 0; for (int offset = -RADIUS ; offset <= RADIUS ; offset++) { result += temp[lindexx + offset]; } // --- Store the result out[gindexx] = result; } /********/ /* MAIN */ /********/ int main() { const int N = 55 + 2 * RADIUS; const unsigned int constant = 4; thrust::device_vector<unsigned int> d_in(N, constant); thrust::device_vector<unsigned int> d_out(N); moving_average<<<iDivUp(N, BLOCKSIZE), BLOCKSIZE>>>(thrust::raw_pointer_cast(d_in.data()), thrust::raw_pointer_cast(d_out.data()), N); gpuErrchk(cudaPeekAtLastError()); gpuErrchk(cudaDeviceSynchronize()); thrust::host_vector<unsigned int> h_out = d_out; for (int i=0; i<N; i++) printf("Element i = %i; h_out = %i\n", i, h_out[i]); return 0; }一样中止

我已将代码修改为以下内容，警告消失，并且可以得到正确的结果：

结果是这样的：

qazxswpoi

基于共享存储器的1d模板CUDA实现中的负数组索引

问题描述投票：6回答：4

4个回答

最新问题

基于共享存储器的1d模板CUDA实现中的负数组索引

问题描述 投票：6回答：4

4个回答

最新问题

问题描述投票：6回答：4