这里是我要运行的代码;
__device__ void NNFeedForwardNormalMultiple(double* __restrict__ values, double* __restrict__ weigths, double* result, int inputsize, int outputsize) {
int idx = threadIdx.x + blockIdx.x * blockDim.x;
int outputidx = idx / outputsize;
int inputidx = idx % outputsize;
if (outputidx >= outputsize || inputidx >= inputsize) {
return;
}
atomicAdd(&result[outputidx], values[inputidx] * weigths[outputsize*outputidx + inputidx]);
}
__device__ void NNFeedForwardNormalActivate(double* __restrict__ biases, double* result, int size) {
int idx = threadIdx.x + blockIdx.x * blockDim.x;
if (idx >= size) {
return;
}
result[idx] = 1.0 / (1.0 + exp(-(result[idx] + biases[idx])));
}
__global__ void NNFeedForwardNormal(double* __restrict__ values, double* __restrict__ weigths, double* result, double* __restrict__ biases, int inputsize, int outputsize) {
int blocksize = (inputsize * outputsize + THREADS_PER_BLOCK - 1)/THREADS_PER_BLOCK;
NNFeedForwardNormalMultiple<<<blocksize, THREADS_PER_BLOCK>>>(values, weigths, result, inputsize, outputsize);
cudaDeviceSynchronize();
NNFeedForwardNormalActivate<<<(outputsize + THREADS_PER_BLOCK - 1)/THREADS_PER_BLOCK, THREADS_PER_BLOCK>>>(biases, result, outputsize);
}
我还尝试从这样的设备函数运行该函数,但仍然给了我同样的错误;
__device__ void NNFeedForwardNormalMultiple(double* __restrict__ values, double* __restrict__ weigths, double* result, int inputsize, int outputsize) {
int idx = threadIdx.x + blockIdx.x * blockDim.x;
int outputidx = idx / outputsize;
int inputidx = idx % outputsize;
if (outputidx >= outputsize || inputidx >= inputsize) {
return;
}
atomicAdd(&result[outputidx], values[inputidx] * weigths[outputsize*outputidx + inputidx]);
}
__device__ void NNFeedForwardNormalActivate(double* __restrict__ biases, double* result, int size) {
int idx = threadIdx.x + blockIdx.x * blockDim.x;
if (idx >= size) {
return;
}
result[idx] = 1.0 / (1.0 + exp(-(result[idx] + biases[idx])));
}
__device__ void NNFeedForwardNormal(double* __restrict__ values, double* __restrict__ weigths, double* result, double* __restrict__ biases, int inputsize, int outputsize) {
int blocksize = (inputsize * outputsize + THREADS_PER_BLOCK - 1) / THREADS_PER_BLOCK;
NNFeedForwardNormalMultiple<<<blocksize, THREADS_PER_BLOCK>>>(values, weigths, result, inputsize, outputsize);
NNFeedForwardNormalActivate<<<(outputsize + THREADS_PER_BLOCK - 1) / THREADS_PER_BLOCK, THREADS_PER_BLOCK>>>(biases, result, outputsize);
}
__global__ void NNFeedForwardNormalWrapper(double* __restrict__ values, double* __restrict__ weigths, double* result, double* __restrict__ biases, int inputsize, int outputsize) {
NNFeedForwardNormal(values, weigths, result, biases, inputsize, outputsize);
}
还尝试了cudaLaunchKernel
函数,并使用全局而不是设备,但它们也没有起作用。我也使用-rdc = true标志,我的拱门是SM_75,它应该支持CDPthanks!