CUDA C++ 错误:__device__ 函数调用无法配置("a __device__ function call cannot be configured")

问题描述 投票:0回答:1
我想为基本的前向传播(feed-forward)函数实现 CDP(CUDA 动态并行)。由于我会多次并发调用该前向函数(包括从其他 CUDA 核函数内部调用),所以想使用 CDP。

这里是我要运行的代码;

__device__ void NNFeedForwardNormalMultiple(double* __restrict__ values, double* __restrict__ weigths, double* result, int inputsize, int outputsize) { int idx = threadIdx.x + blockIdx.x * blockDim.x; int outputidx = idx / outputsize; int inputidx = idx % outputsize; if (outputidx >= outputsize || inputidx >= inputsize) { return; } atomicAdd(&result[outputidx], values[inputidx] * weigths[outputsize*outputidx + inputidx]); } __device__ void NNFeedForwardNormalActivate(double* __restrict__ biases, double* result, int size) { int idx = threadIdx.x + blockIdx.x * blockDim.x; if (idx >= size) { return; } result[idx] = 1.0 / (1.0 + exp(-(result[idx] + biases[idx]))); } __global__ void NNFeedForwardNormal(double* __restrict__ values, double* __restrict__ weigths, double* result, double* __restrict__ biases, int inputsize, int outputsize) { int blocksize = (inputsize * outputsize + THREADS_PER_BLOCK - 1)/THREADS_PER_BLOCK; NNFeedForwardNormalMultiple<<<blocksize, THREADS_PER_BLOCK>>>(values, weigths, result, inputsize, outputsize); cudaDeviceSynchronize(); NNFeedForwardNormalActivate<<<(outputsize + THREADS_PER_BLOCK - 1)/THREADS_PER_BLOCK, THREADS_PER_BLOCK>>>(biases, result, outputsize); }
我还尝试像下面这样从一个 __device__ 函数中启动这些核函数,但仍然得到同样的错误:

__device__ void NNFeedForwardNormalMultiple(double* __restrict__ values, double* __restrict__ weigths, double* result, int inputsize, int outputsize) { int idx = threadIdx.x + blockIdx.x * blockDim.x; int outputidx = idx / outputsize; int inputidx = idx % outputsize; if (outputidx >= outputsize || inputidx >= inputsize) { return; } atomicAdd(&result[outputidx], values[inputidx] * weigths[outputsize*outputidx + inputidx]); } __device__ void NNFeedForwardNormalActivate(double* __restrict__ biases, double* result, int size) { int idx = threadIdx.x + blockIdx.x * blockDim.x; if (idx >= size) { return; } result[idx] = 1.0 / (1.0 + exp(-(result[idx] + biases[idx]))); } __device__ void NNFeedForwardNormal(double* __restrict__ values, double* __restrict__ weigths, double* result, double* __restrict__ biases, int inputsize, int outputsize) { int blocksize = (inputsize * outputsize + THREADS_PER_BLOCK - 1) / THREADS_PER_BLOCK; NNFeedForwardNormalMultiple<<<blocksize, THREADS_PER_BLOCK>>>(values, weigths, result, inputsize, outputsize); NNFeedForwardNormalActivate<<<(outputsize + THREADS_PER_BLOCK - 1) / THREADS_PER_BLOCK, THREADS_PER_BLOCK>>>(biases, result, outputsize); } __global__ void NNFeedForwardNormalWrapper(double* __restrict__ values, double* __restrict__ weigths, double* result, double* __restrict__ biases, int inputsize, int outputsize) { NNFeedForwardNormal(values, weigths, result, biases, inputsize, outputsize); }
还尝试了

cudaLaunchKernel

函数,并尝试使用 __global__ 而不是 __device__,但它们也都没有起作用。我也使用了 -rdc=true 编译标志,我的目标架构是 sm_75,它应该支持 CDP。
thanks!

c++ parallel-processing cuda gpu gpgpu
1个回答
0
投票
调试 __device__ 函数调用时,感觉就像在捕获动态内容时对 HTML2CANVAS 进行故障排除。您是否检查过这些函数是否被正确标记,并且是从正确的上下文中调用的?

最新问题
© www.soinside.com 2019 - 2025. All rights reserved.