在 Windows 11(CUDA 12.5)上,nvcc 无法识别下面的内核(我可以在 PowerShell 中正常编译其他 *.cu 文件)
nvcc -arch=sm_89 .\simplest_kernel.cu
#include <cuda_runtime.h>
#include <iostream>
#include <vector>
// Kernel from the question. Each thread splits threadIdx.x into
// x = threadIdx.x / 4 and y = threadIdx.x % 4, then stores x into A and y into B
// at element index x * row + y. Only threadIdx.x is read; blockIdx is not used.
// NOTE(review): this snippet does not declare `uint`; the compiler output quoted
// below reports the identifier as undefined on the signature line.
__global__ void kernel(uint *A, uint *B, int row) {
auto x = threadIdx.x / 4;
auto y = threadIdx.x % 4;
A[x * row + y] = x;
B[x * row + y] = y;
}
// Host driver from the question: allocates two SIZE x SIZE buffers on the host
// and two on the device, launches `kernel` with a single block of 16 threads,
// copies both device buffers back and prints them as "[x|y]" pairs, one matrix
// row per output line. argc/argv are not used, and none of the CUDA runtime or
// malloc return values are inspected.
int main(int argc, char **argv) {
uint *Xs, *Ys;  // host-side result buffers
uint *Xs_d, *Ys_d;  // device-side result buffers
uint SIZE = 4;
Xs = (uint *)malloc(SIZE * SIZE * sizeof(uint));
Ys = (uint *)malloc(SIZE * SIZE * sizeof(uint));
cudaMalloc((void **)&Xs_d, SIZE * SIZE * sizeof(uint));
cudaMalloc((void **)&Ys_d, SIZE * SIZE * sizeof(uint));
dim3 grid_size(1, 1, 1);
dim3 block_size(4 * 4);  // 16 threads, matching the / 4 and % 4 split in the kernel
// The row stride is passed as the literal 4, which equals SIZE here.
kernel<<<grid_size, block_size>>>(Xs_d, Ys_d, 4);
cudaMemcpy(Xs, Xs_d, SIZE * SIZE * sizeof(uint), cudaMemcpyDeviceToHost);
cudaMemcpy(Ys, Ys_d, SIZE * SIZE * sizeof(uint), cudaMemcpyDeviceToHost);
cudaDeviceSynchronize();  // called after both copies, not between launch and copy
// Print the two matrices side by side, element by element.
for (int row = 0; row < SIZE; ++row) {
for (int col = 0; col < SIZE; ++col) {
std::cout << "[" << Xs[row * SIZE + col] << "|" << Ys[row * SIZE + col]
<< "] ";
}
std::cout << "\n";
}
cudaFree(Xs_d);
cudaFree(Ys_d);
free(Xs);
free(Ys);
}
结果
PS D:\samples\api\SGEMM_CUDA> nvcc -arch=sm_89 .\simplest_kernel.cu
simplest_kernel.cu
D:\samples\api\SGEMM_CUDA\simplest_kernel.cu(5): error: attribute "__global__" does not apply here
__declspec(__global__) void kernel(uint *A, uint *B, int row) {
^
D:\samples\api\SGEMM_CUDA\simplest_kernel.cu(5): error: incomplete type "void" is not allowed
__declspec(__global__) void kernel(uint *A, uint *B, int row) {
^
D:\samples\api\SGEMM_CUDA\simplest_kernel.cu(5): error: identifier "uint" is undefined
__declspec(__global__) void kernel(uint *A, uint *B, int row) {
^
D:\samples\api\SGEMM_CUDA\simplest_kernel.cu(5): error: identifier "A" is undefined
__declspec(__global__) void kernel(uint *A, uint *B, int row) {
^
D:\samples\api\SGEMM_CUDA\simplest_kernel.cu(5): error: identifier "B" is undefined
__declspec(__global__) void kernel(uint *A, uint *B, int row) {
^
D:\samples\api\SGEMM_CUDA\simplest_kernel.cu(5): error: type name is not allowed
__declspec(__global__) void kernel(uint *A, uint *B, int row) {
^
D:\samples\api\SGEMM_CUDA\simplest_kernel.cu(5): error: expected a ")"
__declspec(__global__) void kernel(uint *A, uint *B, int row) {
^
D:\samples\api\SGEMM_CUDA\simplest_kernel.cu(5): error: expected a ";"
__declspec(__global__) void kernel(uint *A, uint *B, int row) {
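从您的错误消息可以看出,您是在 Windows 上编译的。
`uint` 这个类型名只在 Linux 上有定义(由系统头文件提供),在 Windows 上则没有定义。由于参数列表中的 `uint` 无法被识别为类型,编译器无法把这一行解析为函数声明,后面关于 `__global__`、`void`、`A`、`B` 的错误都是由这一个未定义的标识符连带引发的。
您可以直接改用 `unsigned` 轻松解决此问题,或者自行定义 `uint`。以下修复可以让您的代码通过编译并正确运行。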
因此,请在文件开头添加下面的代码,按需定义 `uint`(请注意,`unsigned` 就是 `unsigned int`,在现代平台上它是 32 位的,与 `uint32_t` 宽度相同):
// Provide `uint` for toolchains (e.g. MSVC) whose headers do not declare it.
// The preprocessor test only detects a macro named `uint`; where the platform
// already supplies a typedef, this re-declares the same type, which C++ permits.
#if !defined(uint)
using uint = unsigned int;
#endif
完整代码(见下文)现在可以正确编译并运行,输出正确的结果。
#include <cuda_runtime.h>
#include <iostream>
#include <vector>
// Provide `uint` for toolchains (e.g. MSVC) whose headers do not declare it.
// The preprocessor test only detects a macro named `uint`; where the platform
// already supplies a typedef, this re-declares the same type, which C++ permits.
#if !defined(uint)
using uint = unsigned int;
#endif
/**
 * Writes each thread's 2D coordinates into two output matrices.
 *
 * The thread index within the block is split with a fixed width of 4:
 * x = threadIdx.x / 4 and y = threadIdx.x % 4. Thread (x, y) then stores x
 * into A and y into B at element index x * row + y.
 *
 * Only threadIdx.x is consulted, so the block index does not influence which
 * elements are written. There is no bounds check; the caller must size A and B
 * to cover every index produced by the launched block.
 *
 * @param A   output buffer that receives the x coordinate
 * @param B   output buffer that receives the y coordinate
 * @param row stride, in elements, between consecutive x values
 */
__global__ void kernel(uint* A, uint* B, int row) {
    constexpr unsigned int kWidth = 4;  // fixed split of the 1D thread index

    const unsigned int tid = threadIdx.x;
    const unsigned int x = tid / kWidth;
    const unsigned int y = tid % kWidth;

    // Same unsigned arithmetic as indexing with x * row + y directly.
    const unsigned int offset = x * row + y;
    A[offset] = x;
    B[offset] = y;
}
/**
 * Host driver for the 4x4 coordinate demo.
 *
 * Allocates two SIZE x SIZE device buffers, launches `kernel` with a single
 * block of SIZE * SIZE threads, copies both buffers back to the host and prints
 * them as "[x|y]" pairs, one matrix row per output line.
 *
 * Every CUDA runtime call is checked. On the first failure a message naming the
 * failing step is written to std::cerr, the remaining steps are skipped, the
 * device buffers are released and the process exits with a non-zero status.
 *
 * @param argc unused
 * @param argv unused
 * @return 0 on success, 1 if any CUDA runtime call failed
 */
int main(int argc, char** argv) {
    (void)argc;  // command-line arguments are not used
    (void)argv;

    constexpr uint SIZE = 4;
    // The kernel splits threadIdx.x with a hard-coded width of 4, so the host
    // side must not drift away from that value.
    static_assert(SIZE == 4, "kernel splits threadIdx.x with a fixed width of 4");
    constexpr std::size_t kCount = static_cast<std::size_t>(SIZE) * SIZE;
    constexpr std::size_t kBytes = kCount * sizeof(uint);

    // Reports a failed runtime call on std::cerr; returns true on success.
    auto ok = [](cudaError_t status, const char* what) -> bool {
        if (status == cudaSuccess) {
            return true;
        }
        std::cerr << what << " failed: " << cudaGetErrorString(status) << "\n";
        return false;
    };

    // Host buffers are owned by std::vector, so no manual free is needed on
    // any exit path.
    std::vector<uint> Xs(kCount);
    std::vector<uint> Ys(kCount);

    // Start from nullptr so the unconditional cudaFree calls below are safe
    // even when an allocation failed.
    uint* Xs_d = nullptr;
    uint* Ys_d = nullptr;

    bool success = ok(cudaMalloc((void**)&Xs_d, kBytes), "cudaMalloc(Xs_d)") &&
                   ok(cudaMalloc((void**)&Ys_d, kBytes), "cudaMalloc(Ys_d)");

    if (success) {
        const dim3 grid_size(1, 1, 1);
        const dim3 block_size(SIZE * SIZE);
        kernel<<<grid_size, block_size>>>(Xs_d, Ys_d, static_cast<int>(SIZE));

        // cudaGetLastError catches a bad launch configuration. The synchronize
        // sits before the copies so that a fault during kernel execution is
        // reported as such rather than by the first cudaMemcpy.
        success = ok(cudaGetLastError(), "kernel launch") &&
                  ok(cudaDeviceSynchronize(), "kernel execution") &&
                  ok(cudaMemcpy(Xs.data(), Xs_d, kBytes, cudaMemcpyDeviceToHost),
                     "cudaMemcpy(Xs)") &&
                  ok(cudaMemcpy(Ys.data(), Ys_d, kBytes, cudaMemcpyDeviceToHost),
                     "cudaMemcpy(Ys)");
    }

    if (success) {
        // Unsigned counters avoid the signed/unsigned comparison against SIZE.
        for (uint row = 0; row < SIZE; ++row) {
            for (uint col = 0; col < SIZE; ++col) {
                std::cout << "[" << Xs[row * SIZE + col] << "|" << Ys[row * SIZE + col]
                          << "] ";
            }
            std::cout << "\n";
        }
    }

    cudaFree(Xs_d);
    cudaFree(Ys_d);
    return success ? 0 : 1;
}