这是我的程序代码。而且这个错误甚至不限于此代码,无论我编写什么程序,它都不会输入大数据。并不是说我现在必须对这么大的数据做任何事情。但我只是想知道,为什么它会出现这样的错误。
#include "cuda_runtime.h"
#include "device_launch_parameters.h"
#include <time.h>
#include <stdio.h>
#include <math.h>
cudaError_t multiplyWithCuda(int* d_P, int* d_M, int* d_N, int size);
__global__ void multiplyKernel(int* d_P, int* d_M, int* d_N, int size)
{
int row = blockIdx.x*blockDim.x+ threadIdx.x ;
int column = blockIdx.y * blockDim.y + threadIdx.y;
if ((row < size) && (column <size) )
{
float Pval = 0;
for (int k = 0; k < size; k++)
{
Pval += d_M[row * size + k] * d_N[k*size+column];
}
d_P[row * size + column] = Pval;
}
}
int main()
{
const int max_n = 291;
int n ;
printf("Enter order of matrix: ");
scanf("%d", &n);
int a[max_n * max_n] = {0};
int b[max_n * max_n] = {0};
for (int ele1 = 0; ele1 < n*n; ele1++)
{
printf("Enter element %d for matrix1: ",(ele1+1));
scanf("%d", &a[ele1]);
}
for (int ele2 = 0; ele2 < n * n; ele2++)
{
printf("Enter element %d for matrix 2: ", (ele2 + 1));
scanf("%d", &b[ele2]);
}
int c[max_n* max_n] ;
// Add vectors in parallel.
clock_t t;
t = clock();
cudaError_t cudaStatus = multiplyWithCuda(c, a, b, n);
if (cudaStatus != cudaSuccess) {
fprintf(stderr, "addWithCuda failed!");
return 1;
}
for(int m=0; m < n; m++)
{
for (int l = 0; l < n; l++)
{
if (l == 0 && m > 0) { printf("\n%d ", c[l + n * m]); }
else { printf("%d ", c[l + n * m]); }
}
}
// cudaDeviceReset must be called before exiting in order for profiling and
// tracing tools such as Nsight and Visual Profiler to show complete traces.
cudaStatus = cudaDeviceReset();
if (cudaStatus != cudaSuccess) {
fprintf(stderr, "cudaDeviceReset failed!");
return 1;
}
t = clock() - t;
double time_taken = ((double)t) / CLOCKS_PER_SEC;
printf("\n %f", time_taken);
return 0;
}
// Helper function for using CUDA to add vectors in parallel.
cudaError_t multiplyWithCuda(int* c, int* a, int* b, int size)
{
int* dev_a = 0;
int* dev_b = 0;
int* dev_c = 0;
cudaError_t cudaStatus;
// Choose which GPU to run on, change this on a multi-GPU system.
cudaStatus = cudaSetDevice(0);
if (cudaStatus != cudaSuccess) {
fprintf(stderr, "cudaSetDevice failed! Do you have a CUDA-capable GPU installed?");
goto Error;
}
// Allocate GPU buffers for three vectors (two input, one output) .
cudaStatus = cudaMalloc((void**)&dev_c, size * size * sizeof(int));
if (cudaStatus != cudaSuccess) {
fprintf(stderr, "cudaMalloc failed!");
goto Error;
}
cudaStatus = cudaMalloc((void**)&dev_a, size * size * sizeof(int));
if (cudaStatus != cudaSuccess) {
fprintf(stderr, "cudaMalloc failed!");
goto Error;
}
cudaStatus = cudaMalloc((void**)&dev_b, size * size * sizeof(int));
if (cudaStatus != cudaSuccess) {
fprintf(stderr, "cudaMalloc failed!");
goto Error;
}
// Copy input vectors from host memory to GPU buffers.
cudaStatus = cudaMemcpy(dev_a, a, size * size * sizeof(int), cudaMemcpyHostToDevice);
if (cudaStatus != cudaSuccess) {
fprintf(stderr, "cudaMemcpy failed!");
goto Error;
}
cudaStatus = cudaMemcpy(dev_b, b, size * size * sizeof(int), cudaMemcpyHostToDevice);
if (cudaStatus != cudaSuccess) {
fprintf(stderr, "cudaMemcpy failed!");
goto Error;
}
// Launch a kernel on the GPU with one thread for each element.
dim3 gridSize(ceil(size/32.0),ceil(size/32.0));
dim3 blockSize(32,32);
multiplyKernel <<<gridSize,blockSize >>> (dev_c, dev_a, dev_b,size);
// Check for any errors launching the kernel
cudaStatus = cudaGetLastError();
if (cudaStatus != cudaSuccess) {
fprintf(stderr, "addKernel launch failed: %s\n", cudaGetErrorString(cudaStatus));
goto Error;
}
// cudaDeviceSynchronize waits for the kernel to finish, and returns
// any errors encountered during the launch.
cudaStatus = cudaDeviceSynchronize();
if (cudaStatus != cudaSuccess) {
fprintf(stderr, "cudaDeviceSynchronize returned error code %d after launching addKernel!\n", cudaStatus);
goto Error;
}
// Copy output vector from GPU buffer to host memory.
cudaStatus = cudaMemcpy(c, dev_c, size * size * sizeof(int), cudaMemcpyDeviceToHost);
if (cudaStatus != cudaSuccess) {
fprintf(stderr, "cudaMemcpy failed!");
goto Error;
}
Error:
cudaFree(dev_c);
cudaFree(dev_a);
cudaFree(dev_b);
return cudaStatus;
}
一旦将我的 max_n >292,它就开始显示错误。但它对于 291 和低阶矩阵非常有效。
我曾经遇到过这些问题,但对于一个非常不同且更大的值,我认为它大约是 1000ich 或其他东西。
我使用Heap的时候就解决了。尝试在这里使用 calloc() 看到您将数组初始化为 0。如果这没有帮助,请告诉我。
其余代码看起来非常好。