I'm writing a CUDA program for matrix multiplication and I'm hitting a "Segmentation fault (core dumped)" error. I've included the relevant part of the code below. The error occurs when I run the program with certain input sizes and block sizes.
What could be causing the segmentation fault in my CUDA matrix multiplication code, and how can I troubleshoot and fix it?
Any help would be appreciated. Thanks!
```
// matrix.cu
#include <stdio.h>
#include <stdlib.h>
#include <assert.h>
#define BLOCK_SIZE 16
__global__ void gpu_square_matrix_mult(int *d_a, int *d_b, int *d_result, int n) {
    __shared__ int tile_a[BLOCK_SIZE][BLOCK_SIZE];
    __shared__ int tile_b[BLOCK_SIZE][BLOCK_SIZE];
    int row = blockIdx.y * BLOCK_SIZE + threadIdx.y;
    int col = blockIdx.x * BLOCK_SIZE + threadIdx.x;
    int tmp = 0;
    int idx;
    for (int sub = 0; sub < gridDim.x; ++sub) {
        idx = row * n + sub * BLOCK_SIZE + threadIdx.x;
        if (idx >= n * n) {
            // n may not be divisible by BLOCK_SIZE
            tile_a[threadIdx.y][threadIdx.x] = 0;
        } else {
            tile_a[threadIdx.y][threadIdx.x] = d_a[idx];
        }
        idx = (sub * BLOCK_SIZE + threadIdx.y) * n + col;
        if (idx >= n * n) {
            tile_b[threadIdx.y][threadIdx.x] = 0;
        } else {
            tile_b[threadIdx.y][threadIdx.x] = d_b[idx];
        }
        __syncthreads();
        for (int k = 0; k < BLOCK_SIZE; ++k) {
            tmp += tile_a[threadIdx.y][k] * tile_b[k][threadIdx.x];
        }
        __syncthreads();
    }
    if (row < n && col < n) {
        d_result[row * n + col] = tmp;
    }
}
void cpu_matrix_mult(int *h_a, int *h_b, int *h_result, int m, int n, int k) {
    for (int i = 0; i < m; ++i) {
        for (int j = 0; j < k; ++j) {
            int tmp = 0;
            for (int h = 0; h < n; ++h) {
                tmp += h_a[i * n + h] * h_b[h * k + j];
            }
            h_result[i * k + j] = tmp;
        }
    }
}
int main(int argc, char const *argv[]) {
    int m, n, k;
    if (argc < 2) {
        printf("Usage: %s matrix_size block_size\n", argv[0]);
        exit(0);
    }
    m = atoi(argv[1]);
    n = m;
    k = m;
    int block_size = atoi(argv[2]);
    // allocate memory in host RAM, h_cc is used to store the CPU result
    int *h_a, *h_b, *h_c, *h_cc;
    cudaMallocHost((void **)&h_a, sizeof(int) * m * n);
    cudaMallocHost((void **)&h_b, sizeof(int) * n * k);
    cudaMallocHost((void **)&h_c, sizeof(int) * m * k);
    cudaMallocHost((void **)&h_cc, sizeof(int) * m * k);
    // randomly initialize matrix A
    for (int i = 0; i < m; ++i) {
        for (int j = 0; j < n; ++j) {
            h_a[i * n + j] = rand() % 1024;
        }
    }
    // randomly initialize matrix B
    for (int i = 0; i < n; ++i) {
        for (int j = 0; j < k; ++j) {
            h_b[i * k + j] = rand() % 1024;
        }
    }
    // Allocate memory space on the device
    int *d_a, *d_b, *d_c;
    cudaMalloc((void **)&d_a, sizeof(int) * m * n);
    cudaMalloc((void **)&d_b, sizeof(int) * n * k);
    cudaMalloc((void **)&d_c, sizeof(int) * m * k);
    // Check CUDA memory allocations
    if (d_a == NULL || d_b == NULL || d_c == NULL) {
        fprintf(stderr, "CUDA malloc failed!\n");
        return 1;
    }
    // copy matrices A and B from host to device memory
    cudaMemcpy(d_a, h_a, sizeof(int) * m * n, cudaMemcpyHostToDevice);
    cudaMemcpy(d_b, h_b, sizeof(int) * n * k, cudaMemcpyHostToDevice);
    // events to measure the execution time
    cudaEvent_t start, stop;
    cudaEventCreate(&start);
    cudaEventCreate(&stop);
    // start timing the GPU version
    cudaEventRecord(start, 0);
    unsigned int grid_rows = (m + block_size - 1) / block_size;
    unsigned int grid_cols = (k + block_size - 1) / block_size;
    dim3 dimGrid(grid_cols, grid_rows);
    dim3 dimBlock(block_size, block_size);
    // Launch kernel
    gpu_square_matrix_mult<<<dimGrid, dimBlock>>>(d_a, d_b, d_c, n);
    // Check for kernel launch errors
    cudaError_t kernelError = cudaGetLastError();
    if (kernelError != cudaSuccess) {
        fprintf(stderr, "CUDA kernel launch error: %s\n", cudaGetErrorString(kernelError));
        return 1;
    }
    // Transfer results from device to host
    cudaMemcpy(h_c, d_c, sizeof(int) * m * k, cudaMemcpyDeviceToHost);
    // stop timing
    cudaEventRecord(stop, 0);
    cudaEventSynchronize(stop);
    float gpu_elapsed_time_ms;
    // compute the elapsed time of the GPU computation
    cudaEventElapsedTime(&gpu_elapsed_time_ms, start, stop);
    printf("Time elapsed on matrix multiplication of %dx%d . %dx%d on GPU: %f ms.\n\n",
           m, n, n, k, gpu_elapsed_time_ms);
    // start the CPU version
    cudaEventRecord(start, 0);
    cpu_matrix_mult(h_a, h_b, h_cc, m, n, k);
    cudaEventRecord(stop, 0);
    cudaEventSynchronize(stop);
    float cpu_elapsed_time_ms;
    cudaEventElapsedTime(&cpu_elapsed_time_ms, start, stop);
    printf("Time elapsed on matrix multiplication of %dx%d . %dx%d on CPU: %f ms.\n\n",
           m, n, n, k, cpu_elapsed_time_ms);
    // validate results computed by the GPU
    int all_ok = 1;
    for (int i = 0; i < m; ++i) {
        for (int j = 0; j < k; ++j) {
            if (h_cc[i * k + j] != h_c[i * k + j]) {
                all_ok = 0;
            }
        }
    }
    if (all_ok) {
        printf("All results are correct!!!, speedup = %f\n", cpu_elapsed_time_ms / gpu_elapsed_time_ms);
    } else {
        printf("Incorrect results\n");
    }
    // free memory
    cudaFree(d_a);
    cudaFree(d_b);
    cudaFree(d_c);
    cudaFreeHost(h_a);
    cudaFreeHost(h_b);
    cudaFreeHost(h_c);
    cudaFreeHost(h_cc);
    return 0;
}
```
[Screenshot of the error output](https://i.stack.imgur.com/j4P20.png)
Try using the error-checking macro below to find the specific line that is failing:
```
#include <iostream>

#define CUDA_CHECK(status)                                                      \
    {                                                                           \
        cudaError_t error = status;                                             \
        if (error != cudaSuccess)                                               \
        {                                                                       \
            std::cerr << "Got bad cuda status: " << cudaGetErrorString(error)   \
                      << " at line: " << __LINE__ << std::endl;                 \
            exit(EXIT_FAILURE);                                                 \
        }                                                                       \
    }

CUDA_CHECK(cudaMalloc(&dA, total_size_dA_dB));
```
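For example, applied to the code you posted (a sketch, not a full rewrite; `CUDA_CHECK` is the macro above and the pointers and launch parameters are the ones already in your `main`), you would wrap each allocation, copy, and the kernel launch like this:

```
// Sketch: wrapping the existing calls from main() with CUDA_CHECK so a
// failing call reports its line number instead of silently leaving a
// pointer invalid.
CUDA_CHECK(cudaMallocHost((void **)&h_a, sizeof(int) * m * n));
CUDA_CHECK(cudaMalloc((void **)&d_a, sizeof(int) * m * n));
CUDA_CHECK(cudaMemcpy(d_a, h_a, sizeof(int) * m * n, cudaMemcpyHostToDevice));

gpu_square_matrix_mult<<<dimGrid, dimBlock>>>(d_a, d_b, d_c, n);
CUDA_CHECK(cudaGetLastError());       // reports launch-configuration errors
CUDA_CHECK(cudaDeviceSynchronize());  // reports errors raised while the kernel runs
```

If every call passes but the process still crashes, the fault is most likely in plain host code (for example writing to a null or too-small host allocation), which the macro cannot catch.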
Another idea: comment out lines until the error no longer appears. Then you know which line is at fault!
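One thing worth checking while you do that (an observation about the posted code, not necessarily the cause of your crash): the kernel's shared-memory tiles are sized with the compile-time constant `BLOCK_SIZE` (16), while the launch uses the runtime `block_size` from `argv[2]`. If the two differ, the kernel indexes the tiles and the matrices incorrectly. A minimal guard in `main`, as a sketch, could be:

```
// Sketch: refuse to launch when the runtime block size does not match the
// compile-time tile size the kernel was written for.
if (block_size != BLOCK_SIZE) {
    fprintf(stderr, "block_size (%d) must equal BLOCK_SIZE (%d)\n",
            block_size, BLOCK_SIZE);
    return 1;
}
```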