cuda 相关问题

CUDA是Nvidia GPU（图形处理单元）的并行计算平台和编程模型。 CUDA通过各种编程语言，库和API为Nvidia GPU提供了一个接口。

升级 cuda 后，cmake 无法检测到以下 cuda 项目的 GPU（升级前工作正常）： # CMakeLists.txt: cmake_minimum_required(VERSION 3.24) if(NOT CMAKE_CUDA_COM...

cmake cuda nvcc

回答 1 投票 0

我正在尝试在我的 conda 环境中安装 pytorch+cuda12.1。我已按照 https://pytorch.org/get-started/locally/ 上的说明操作并运行以下命令： conda install pytorch torchvision

python pytorch cuda conda

回答 1 投票 0

我无法使用cudaMalloc和cudaMemcpy

我不熟悉 C++ 和 CUDA，但正在尝试学习它们。我尝试使用 cudaMalloc 和 cudaMemcpy 但失败了。源代码是： #include #include #define...

c++ cuda gpu

回答 1 投票 0

朴素矩阵乘法的 CUDA 内存访问分析混乱

考虑以下矩阵乘法内核： __global__ void mmNaive(int numArows, int numBCols, int Width, float** A, float** B, float* C) { int tx = threadIdx.x; int ty = threadIdx.y; ...

cuda matrix-multiplication coalescing warp-scheduler

回答 1 投票 0

是否可以从cuda-gdb进行cudaDeviceSynchronize

我正在尝试单步执行在其过程中调用多个内核的主机代码。我想知道是否可以在主机代码中设置任意断点，并从 cuda-gdb 手动进行 cudaDeviceSynchronize...

cuda cuda-gdb

回答 1 投票 0

多次调用一个函数但只运行一次

在下面的代码中，我试图让 runSimulation() 函数为 rinoVals 中的每个值运行 1 次（因此总共运行 6 次）。但是，当我运行它时，runSimulation() 只执行一次并且不再执行...

c++ function cuda

回答 1 投票 0

如何查看矩阵乘法的进度？

我需要展示矩阵乘法的中间进度。 for(无符号整数 col=0;col 我需要展示矩阵乘法的中间进度。 for(unsigned int col=0; col<mtxSize; col++) { unsigned tmp = 0; for(unsigned int row=0; row<mtxSize; row++) { for(unsigned int idx=0; idx<mtxSize; idx++) { tmp += h_A[col*mtxSize+idx] * h_B[idx*mtxSize+row]; } h_Rs[col*mtxSize+row] = tmp; tmp = 0; int rate_tmp = (col*mtxSize + (row+1))*100; // Maybe like this... fprintf(stdout, "Progress : %d.%d %%\r", rate_tmp/actMtxSize, rate_tmp%actMtxSize); fflush(stdout); } } 对于主机代码（使用CPU）来说，这非常容易，因为它是按顺序处理的，因此我们可以轻松检查它。但是对于并行处理的GPU，我该怎么办？内核一旦运行，直到内核执行完成才会返回。因此，我无法在内核执行期间检查中间数据。我想我需要使用异步内核调用，但我不知道如何。即使使用异步内核调用，要通过处理器查看所有数据到多个块中，我是否必须使用atomicAdd()（换句话说，全局内存访问）函数，其中包含一些开销？我也想知道 CUDA 的情况如何。这里的代码演示了如何检查矩阵乘法内核的进度： #include <stdio.h> #include <stdlib.h> #include <time.h> #define TIME_INC 100000000 #define INCS 10 #define USE_PROGRESS 1 #define MAT_DIMX 4000 #define MAT_DIMY MAT_DIMX #define cudaCheckErrors(msg) \ do { \ cudaError_t __err = cudaGetLastError(); \ if (__err != cudaSuccess) { \ fprintf(stderr, "Fatal error: %s (%s at %s:%d)\n", \ msg, cudaGetErrorString(__err), \ __FILE__, __LINE__); \ fprintf(stderr, "*** FAILED - ABORTING\n"); \ exit(1); \ } \ } while (0) __global__ void mykernel(volatile int *data){ unsigned long time; for (int i = 0; i < INCS; i++){ atomicAdd((int *)data,1); __threadfence_system(); time = clock64(); while((clock64() - time)<TIME_INC) {}; } printf("progress check finished\n"); } __global__ void matmult(float *a, float *b, float *c, unsigned int rowA, unsigned int colA, unsigned int colB, volatile int *progress){ unsigned int row = threadIdx.x+blockDim.x*blockIdx.x; unsigned int col = threadIdx.y+blockDim.y*blockIdx.y; if ((row < rowA) && (col < colB)){ float temp = 0.0f; for (unsigned int k = 0; k < colA; k++) temp += a[(row*colA)+k] * b[(k*colB) + col]; c[(row*colB)+col] = temp; #if USE_PROGRESS if (!(threadIdx.x || threadIdx.y)){ atomicAdd((int *)progress, 1); __threadfence_system(); } #endif } } int main(){ // simple test to demonstrate reading 
progress data from kernel volatile int *d_data, *h_data; cudaSetDeviceFlags(cudaDeviceMapHost); cudaCheckErrors("cudaSetDeviceFlags error"); cudaHostAlloc((void **)&h_data, sizeof(int), cudaHostAllocMapped); cudaCheckErrors("cudaHostAlloc error"); cudaHostGetDevicePointer((int **)&d_data, (int *)h_data, 0); cudaCheckErrors("cudaHostGetDevicePointer error"); *h_data = 0; printf("kernel starting\n"); mykernel<<<1,1>>>(d_data); cudaCheckErrors("kernel fail"); int value = 0; do{ int value1 = *h_data; if (value1 > value){ printf("h_data = %d\n", value1); value = value1;}} while (value < (INCS-1)); cudaDeviceSynchronize(); cudaCheckErrors("kernel fail 2"); // now try matrix multiply with progress float *h_c, *d_a, *d_b, *d_c; h_c = (float *)malloc(MAT_DIMX*MAT_DIMY*sizeof(float)); if (h_c == NULL) {printf("malloc fail\n"); return 1;} cudaMalloc((void **)&d_a, MAT_DIMX*MAT_DIMY*sizeof(float)); cudaCheckErrors("cudaMalloc a fail"); cudaMalloc((void **)&d_b, MAT_DIMX*MAT_DIMY*sizeof(float)); cudaCheckErrors("cudaMalloc b fail"); cudaMalloc((void **)&d_c, MAT_DIMX*MAT_DIMY*sizeof(float)); cudaCheckErrors("cudaMalloc c fail"); for (int i = 0; i < MAT_DIMX*MAT_DIMY; i++) h_c[i] = rand()/(float)RAND_MAX; cudaMemcpy(d_a, h_c, MAT_DIMX*MAT_DIMY*sizeof(float), cudaMemcpyHostToDevice); cudaCheckErrors("cudaMemcpy a fail"); cudaMemcpy(d_b, h_c, MAT_DIMX*MAT_DIMY*sizeof(float), cudaMemcpyHostToDevice); cudaCheckErrors("cudaMemcpy b fail"); cudaEvent_t start, stop; cudaEventCreate(&start); cudaEventCreate(&stop); *h_data=0; dim3 block(16,16); dim3 grid(((MAT_DIMX+block.x-1)/block.x), ((MAT_DIMY+block.y-1)/block.y)); printf("matrix multiply kernel starting\n"); cudaEventRecord(start); matmult<<<grid,block>>>(d_a, d_b, d_c, MAT_DIMY, MAT_DIMX, MAT_DIMX, d_data); cudaEventRecord(stop); #if USE_PROGRESS unsigned int num_blocks = grid.x*grid.y; float my_progress = 0.0f; value = 0; printf("Progress:\n"); do{ cudaEventQuery(stop); // may help WDDM scenario int value1 = *h_data; float 
kern_progress = (float)value1/(float)num_blocks; if ((kern_progress - my_progress)> 0.1f) { printf("percent complete = %2.1f\n", (kern_progress*100)); my_progress = kern_progress;}} while (my_progress < 0.9f); printf("\n"); #endif cudaEventSynchronize(stop); cudaCheckErrors("event sync fail"); float et; cudaEventElapsedTime(&et, start, stop); cudaCheckErrors("event elapsed time fail"); cudaDeviceSynchronize(); cudaCheckErrors("mat mult kernel fail"); printf("matrix multiply finished. elapsed time = %f milliseconds\n", et); return 0; } 与第一个内核调用相关的代码只是为了演示让内核报告其进度的基本思想。代码的第二部分显示了 GPU 上的简单矩阵乘法示例，GPU 报告其进度。我已经包含了通过预处理器宏删除进度检查代码的功能，以及对矩阵乘法内核进行计时的功能。对于我这里的情况，有或没有进度代码的时间没有明显的差异。因此，虽然进度报告代码可能确实增加了一些开销，但与合理大小的矩阵乘法内核的范围相比，它并没有增加我所看到的大量时间。这里讨论了信令的其他一些用途

cuda

回答 1 投票 0

比较自定义 cuda 内核、cublas 和 cutensor 之间的性能

我进行了以下 CUDA 测试来比较（方）矩阵乘法的性能数据，在 Ubuntu 24.04 上运行，GPU 卡 Quadro T1000 Mobile 的计算能力（compute capability）为 7.5（...

cuda tensor cublas

回答 1 投票 0

如何使用 GPU 并行化在 3D 网格上高效计算函数？

我正在开发一个项目，需要计算 3D 网格中每个点的函数。最初，我使用嵌套循环来迭代所有点并执行计算。这个大概...

c++ multidimensional-array cuda

回答 1 投票 0

pytorch 将 nn.module 移动到 cuda，包括子模块

我是 pytorch 的新手。我想将我的模块移至 cuda。 class SubModel(nn.Module): def __init__(self): super(SubModel, self).__init__() self.conv1 = nn.Conv1d(in_ch...

python pytorch cuda

回答 1 投票 0

Cupy 将 numpy 数组复制到现有设备数组

我想将 numpy 数组复制到现有的、预先分配的 GPU 数组中。我已经看到 cupy 提供了 copy 和 copyto 函数，但是前者不允许指定目的地...

python cuda gpu cupy

回答 1 投票 0

如何在OpenCL/CUDA中使用128位浮点数和复数？

我需要在使用OpenCL或CUDA的并行GPU计算中使用128位浮点数和复数。有没有什么方法可以实现这一点而不需要自己实现？我看了

parallel-processing cuda opencl

回答 1 投票 0

链接器错误：/usr/bin/ld：尝试使用 clang 编译 CUDA 代码时找不到 -lcudart_static

我尝试按照官方文档中的指定编译 axpy.cu 文件： clang++ axpy.cu -o exec --cuda-gpu-arch=sm_60 -L/usr/local/cuda -lcudart_static -ldl -lrt -pthread 但这给了一个链接器......

c++ cuda clang gpu llvm

回答 2 投票 0

链接 CUDA 设备函数

我有这个 a.cu 文件： extern "C" __device__ int test42(); __global__ void get42(int* toStore){ *toStore = test42(); } int main() { int host{0}; int* device; cudaMalloc(&...

cuda nvcc fat-binaries fatbin

回答 1 投票 0

并行化 FFT（使用 CUDA）

在我的应用程序中，我需要转换图像的每一行，应用过滤器并将其转换回来。我希望能够使用 GPU 同时进行多个 FFT。更准确地说，我正在使用...

cuda fft

回答 1 投票 0

无法在Ubuntu 16.04上安装CUDA

多年来我一直以这种方式安装各种版本的 CUDA： sudo apt-get update sudo apt-get purge cuda --yes # 例如 9.0: wget http://developer.download.nvidia.com/compute/cuda/repos/

cuda nvidia

回答 2 投票 0

CUDA 内核参数导致“cudaMemcpy 遇到非法内存访问”

我想在代码执行时通过argv参数改变变量M，N。我的代码 tvm_test.cu 如下： #包括 #包括 #包括我想在代码执行时通过M参数更改变量N、argv。我的代码tvm_test.cu如下： #include <cuda_fp16.h> #include <iostream> #include <cuda_runtime.h> #include <chrono> #include <assert.h> #define CUDA_CHECK(status) \ { \ cudaError_t error = status; \ if (error != cudaSuccess) \ { \ std::cerr << "Got bad cuda status: " << cudaGetErrorString(error) \ << " at line: " << __LINE__ << std::endl; \ exit(EXIT_FAILURE); \ } \ } __device__ int M=8192,N=8192; const int tx = 1024, ty = 1, tz = 1; const int element_per_thread = 256; // const int M = 2048, N = 2048; __global__ void int2half_I2H(int8_t* __restrict__ input, half* __restrict__ output){ #pragma unroll for(int i = 0; i < 256; ++ i){ output[(blockIdx.x * 1024 + threadIdx.x) * 256 + i] = __int2half_rn(input[(blockIdx.x * 1024 + threadIdx.x) * 256 + i]); } } __global__ void int2half_I2H_tvm(int8_t* input, half* output, int M, int N){ #pragma unroll for(int i = 0; i < 256; ++ i){ output[blockIdx.x * 1024 + threadIdx.x + i * (M * N) / 256] = \ __int2half_rn(input[blockIdx.x * 1024 + threadIdx.x + i * (M * N) / 256]); } } __global__ void int2half_I2H_tvm(int8_t* input, half* output){ int rM = 8192; int rN = 8192; // #pragma unroll for(int i = 0; i < 256; ++ i){ output[blockIdx.x * 1024 + threadIdx.x + i * (rM * rN) / 256] = \ __int2half_rn(input[blockIdx.x * 1024 + threadIdx.x + i * (rM * rN) / 256]); } } int main(int argc, char *argv[]){ M=atoi(argv[1]); N=atoi(argv[2]); srand(time(NULL)); // for int2half int8_t *input = (int8_t *)malloc(M * N); half *output_int2half = (half *)malloc(M * N * 2); // for prmt, + 128 int8_t *input_preprocess = (int8_t *)malloc(M * N); half *output_prmt = (half *)malloc(M * N * 2); // result float *golden = (float *)malloc(M * N * 4); // rand input [-10, 10] for(int i = 0; i < M * N; ++ i){ input[i] = (int8_t)((rand() % 21) - 10); golden[i] = (float)input[i]; } // preprocess + 128 for(int i = 0; i < M * N; ++ i){ input_preprocess[i] = input[i] + 128; } int8_t *d_input; 
int8_t *d_input_preprocess; half *d_output_int2half; half *d_output_prmt; CUDA_CHECK(cudaMalloc(&d_input, M * N)); CUDA_CHECK(cudaMalloc(&d_input_preprocess, M * N)); CUDA_CHECK(cudaMalloc(&d_output_int2half, M * N * 2)); CUDA_CHECK(cudaMalloc(&d_output_prmt, M * N * 2)); CUDA_CHECK(cudaMemcpy(d_input, input, M * N, cudaMemcpyHostToDevice)); CUDA_CHECK(cudaMemcpy(d_input_preprocess, input_preprocess, M * N, cudaMemcpyHostToDevice)); dim3 dimBlock(1024); dim3 dimGrid(256); for(int i = 0; i < 1000; ++ i){ // printf("%d %d", M, N); //int2half_I2H_tvm<<<dimGrid, dimBlock>>> (d_input, d_output_int2half); int2half_I2H_tvm<<<dimGrid, dimBlock>>> (d_input, d_output_int2half,M,N); cudaDeviceSynchronize(); } CUDA_CHECK(cudaMemcpy(output_prmt, d_output_prmt, M * N * 2, cudaMemcpyDeviceToHost)); CUDA_CHECK(cudaMemcpy(output_int2half, d_output_int2half, M * N * 2, cudaMemcpyDeviceToHost)); } 但是在编译指令nvcc -o tvm_test tvm_test.cu之后，我们运行./tvm_test 8192 8192。错误：Got bad cuda status: an illegal memory access was encountered at line: 104。该行代码是CUDA_CHECK(cudaMemcpy(output_prmt, d_output_prmt, M * N * 2, cudaMemcpyDeviceToHost));。我们使用 int2half_I2H 函数而不是 int2half_I2H_tvm 函数。没有错误！这在 CUDA 中是不合法的： __device__ int M=8192,N=8192; ... 
M=atoi(argv[1]); N=atoi(argv[2]); 在 CUDA 中，主机代码无法直接访问设备变量（或设备内存）。有多种方法可以通过重构来解决这个问题；例如，您可以完全摆脱 __device__ 变量的使用，并通过内核参数传递这些参数。也许最简单的更改可能是将它们从 __device__ 转换为 __managed__。此计算可能会导致整数溢出： i * (M * N) 您的 M 和 N 为 8192。i 最多可取 255 的值。如果将这 3 个相乘，则最终会得到一个数字 (17,112,760,320)，该数字大于 int（约 2,000,000,000）或 unsigned（约 4,000,000,000）所能存储的最大正值。这是导致非法访问的最直接问题。我们可以通过将其中一个量提升为 long long 来强制计算以 64 位完成来解决此问题。当然，还有其他可能的修复方法。（您指出的其他内核变体可以正常运行，不受此特定溢出问题的影响。）如果是我，我不会使用与内核参数M同名的全局变量M（对于N同样如此）。我觉得很混乱。是的，这并不违法，但它立即让我挠头试图记住哪一个适用。当我解决这 3 个问题时，你的代码似乎运行时没有错误： # cat t216.cu #include <cuda_fp16.h> #include <iostream> #include <cuda_runtime.h> #include <chrono> #include <assert.h> #define CUDA_CHECK(status) \ { \ cudaError_t error = status; \ if (error != cudaSuccess) \ { \ std::cerr << "Got bad cuda status: " << cudaGetErrorString(error) \ << " at line: " << __LINE__ << std::endl; \ exit(EXIT_FAILURE); \ } \ } __managed__ int M=8192,N=8192; __global__ void int2half_I2H(int8_t* __restrict__ input, half* __restrict__ output){ #pragma unroll for(int i = 0; i < 256; ++ i){ output[(blockIdx.x * 1024 + threadIdx.x) * 256 + i] = __int2half_rn(input[(blockIdx.x * 1024 + threadIdx.x) * 256 + i]); } } __global__ void int2half_I2H_tvm(int8_t* input, half* output, long long kM, long long kN){ #pragma unroll for(int i = 0; i < 256; ++ i){ output[blockIdx.x * 1024 + threadIdx.x + i * (kM * kN) / 256] = \ __int2half_rn(input[blockIdx.x * 1024 + threadIdx.x + i * (kM * kN) / 256]); } } __global__ void int2half_I2H_tvm(int8_t* input, half* output){ int rM = 8192; int rN = 8192; // #pragma unroll for(int i = 0; i < 256; ++ i){ output[blockIdx.x * 1024 + threadIdx.x + i * (rM * rN) / 256] = \ __int2half_rn(input[blockIdx.x * 1024 + threadIdx.x + i * (rM * rN) / 256]); } } int main(int argc, char *argv[]){ M=atoi(argv[1]); N=atoi(argv[2]); srand(time(NULL)); // for int2half int8_t *input = (int8_t *)malloc(M * N); half *output_int2half = (half *)malloc(M * N * 2); // for prmt, + 128 int8_t *input_preprocess = (int8_t 
*)malloc(M * N); half *output_prmt = (half *)malloc(M * N * 2); // result float *golden = (float *)malloc(M * N * 4); // rand input [-10, 10] for(int i = 0; i < M * N; ++ i){ input[i] = (int8_t)((rand() % 21) - 10); golden[i] = (float)input[i]; } // preprocess + 128 for(int i = 0; i < M * N; ++ i){ input_preprocess[i] = input[i] + 128; } int8_t *d_input; int8_t *d_input_preprocess; half *d_output_int2half; half *d_output_prmt; CUDA_CHECK(cudaMalloc(&d_input, M * N)); CUDA_CHECK(cudaMalloc(&d_input_preprocess, M * N)); CUDA_CHECK(cudaMalloc(&d_output_int2half, M * N * 2)); CUDA_CHECK(cudaMalloc(&d_output_prmt, M * N * 2)); CUDA_CHECK(cudaMemcpy(d_input, input, M * N, cudaMemcpyHostToDevice)); CUDA_CHECK(cudaMemcpy(d_input_preprocess, input_preprocess, M * N, cudaMemcpyHostToDevice)); dim3 dimBlock(1024); dim3 dimGrid(256); for(int i = 0; i < 1000; ++ i){ // printf("%d %d", M, N); //int2half_I2H_tvm<<<dimGrid, dimBlock>>> (d_input, d_output_int2half); int2half_I2H_tvm<<<dimGrid, dimBlock>>> (d_input, d_output_int2half,M,N); cudaDeviceSynchronize(); } CUDA_CHECK(cudaMemcpy(output_prmt, d_output_prmt, M * N * 2, cudaMemcpyDeviceToHost)); CUDA_CHECK(cudaMemcpy(output_int2half, d_output_int2half, M * N * 2, cudaMemcpyDeviceToHost)); } # nvcc -o t216 t216.cu -lineinfo # compute-sanitizer ./t216 8192 8192 ========= COMPUTE-SANITIZER ========= ERROR SUMMARY: 0 errors # CUDA 代码很容易出现缓冲区溢出问题。您应该始终牢记这一点。在您的代码中，这是有问题的行：output[blockIdx.x * 1024 + threadIdx.x + i * (M * N) / 256] =... 问问自己索引表达式的最大值是多少？好像是 255*1024 + 1023 + M*N，大于分配的 M*N 个元素。

cuda cuda-streams

回答 2 投票 0

将结果写入CUDA/OptiX中的文本文件

我想知道是否可以把我的 CUDA/OptiX 程序中计算出的变量写入文本文件。该变量位于我的 .cu 文件中，因此 CPU 无法直接写入。

cuda optix

回答 6 投票 0

带有 CUDA 的 Arrayfire CPP“无法打开 libnvrtc-builtins.so.12.2”

由于某种原因，arrayfire 无法与我的 nvidia GPU 配合使用。我正在尝试用 cmake 来做到这一点。我已正确安装 cuda 工具包，并按照此处找到的 linux 安装说明进行操作

c++ cmake cuda runtime-error arrayfire

回答 1 投票 0

Arrayfire 构建在 OpenCL 关闭时出现问题

我在构建 arrayfire 时遇到错误。我希望将 Arrayfire 纯粹与 CUDA 结合使用，但 arrayfire 构建似乎需要 OpenCL，尽管我设置了不构建 OpenCL 的标志。构建