我正在开发一个用于矩阵乘法的 CUDA 程序,运行时遇到了“段错误(核心已转储)”(Segmentation fault (core dumped))。相关代码附在下面。当我使用特定的矩阵大小和块大小运行程序时,就会出现该错误。
什么可能导致我的 CUDA 矩阵乘法代码中的分段错误,以及如何排除故障并修复它?
// matrix.cu
#include <stdio.h>
#include <stdlib.h>
#include <assert.h>
#define BLOCK_SIZE 16
/*
 * Tiled integer matrix multiplication for square, row-major n x n matrices:
 * d_result = d_a * d_b.
 *
 * Launch requirements:
 *   - block dimensions must be exactly BLOCK_SIZE x BLOCK_SIZE, because the
 *     shared tiles are sized with BLOCK_SIZE and indexed with threadIdx;
 *   - the grid must cover the n x n output, i.e. at least
 *     ceil(n / BLOCK_SIZE) blocks in both x and y.
 * Shared memory: two static BLOCK_SIZE x BLOCK_SIZE int tiles per block.
 *
 * n does not have to be a multiple of BLOCK_SIZE: tile elements that fall
 * outside the matrices are loaded as 0 and out-of-range threads never store.
 */
__global__ void gpu_square_matrix_mult(int *d_a, int *d_b, int *d_result, int n) {
    __shared__ int tile_a[BLOCK_SIZE][BLOCK_SIZE];
    __shared__ int tile_b[BLOCK_SIZE][BLOCK_SIZE];

    // A larger block would index past the shared tiles. The condition is the
    // same for every thread of the block, so returning here cannot leave any
    // thread waiting at a barrier.
    assert(blockDim.x == BLOCK_SIZE && blockDim.y == BLOCK_SIZE);
    if (blockDim.x != BLOCK_SIZE || blockDim.y != BLOCK_SIZE) {
        return;
    }

    const int row = blockIdx.y * BLOCK_SIZE + threadIdx.y;
    const int col = blockIdx.x * BLOCK_SIZE + threadIdx.x;
    // Derive the tile count from n rather than from gridDim.x so the result
    // does not depend on how the grid was sized.
    const int num_tiles = (n + BLOCK_SIZE - 1) / BLOCK_SIZE;

    int tmp = 0;
    for (int sub = 0; sub < num_tiles; ++sub) {
        const int a_col = sub * BLOCK_SIZE + threadIdx.x;
        const int b_row = sub * BLOCK_SIZE + threadIdx.y;

        // Check row and column separately: comparing the flat index with n * n
        // lets an out-of-range column wrap into the next row of the matrix.
        tile_a[threadIdx.y][threadIdx.x] =
            (row < n && a_col < n) ? d_a[row * n + a_col] : 0;
        tile_b[threadIdx.y][threadIdx.x] =
            (b_row < n && col < n) ? d_b[b_row * n + col] : 0;

        // Both tiles must be fully written before any thread reads them.
        __syncthreads();

        for (int k = 0; k < BLOCK_SIZE; ++k) {
            tmp += tile_a[threadIdx.y][k] * tile_b[k][threadIdx.x];
        }

        // All reads must finish before the next iteration overwrites the tiles.
        __syncthreads();
    }

    if (row < n && col < n) {
        d_result[row * n + col] = tmp;
    }
}
/*
 * Reference CPU matrix multiplication on row-major int matrices.
 *
 * h_a      : m x n input matrix
 * h_b      : n x k input matrix
 * h_result : m x k output matrix, every element is overwritten
 */
void cpu_matrix_mult(int *h_a, int *h_b, int *h_result, int m, int n, int k) {
    for (int i = 0; i < m; ++i) {
        for (int j = 0; j < k; ++j) {
            int tmp = 0;
            for (int h = 0; h < n; ++h) {
                tmp += h_a[i * n + h] * h_b[h * k + j];
            }
            // Store once per output element, after the dot product is complete.
            h_result[i * k + j] = tmp;
        }
    }
}
/* Prints a diagnostic and terminates the process when a CUDA runtime call fails. */
static void check_cuda(cudaError_t status, const char *what) {
    if (status != cudaSuccess) {
        fprintf(stderr, "CUDA error in %s: %s\n", what, cudaGetErrorString(status));
        exit(EXIT_FAILURE);
    }
}

/*
 * Multiplies two random square matrices on the GPU and on the CPU, times both
 * runs with CUDA events and compares the results.
 *
 * Usage: <program> matrix_size block_size
 *   matrix_size  positive edge length of the square matrices
 *   block_size   must equal the compile-time BLOCK_SIZE the kernel is built with
 *
 * Returns 0 after a completed run and 1 on invalid arguments; any CUDA runtime
 * failure terminates the process through check_cuda().
 */
int main(int argc, char const *argv[]) {
    // Both argv[1] and argv[2] are read below, so two arguments are required.
    // Reading a missing argument hands a NULL pointer to atoi() and segfaults.
    if (argc < 3) {
        printf("Usage: %s matrix_size block_size\n", argc > 0 ? argv[0] : "matrix");
        return 1;
    }

    const int m = atoi(argv[1]);
    const int n = m;
    const int k = m;
    const int block_size = atoi(argv[2]);

    if (m <= 0) {
        fprintf(stderr, "matrix_size must be a positive integer\n");
        return 1;
    }
    // The kernel's shared tiles are sized at compile time; any other block
    // size would index outside them (and 0 would divide by zero below).
    if (block_size != BLOCK_SIZE) {
        fprintf(stderr, "block_size must be %d (the compile-time BLOCK_SIZE)\n", BLOCK_SIZE);
        return 1;
    }

    const size_t bytes_a = sizeof(int) * (size_t)m * (size_t)n;
    const size_t bytes_b = sizeof(int) * (size_t)n * (size_t)k;
    const size_t bytes_c = sizeof(int) * (size_t)m * (size_t)k;

    // allocate pinned memory in host RAM, h_cc is used to store CPU result
    int *h_a = NULL, *h_b = NULL, *h_c = NULL, *h_cc = NULL;
    check_cuda(cudaMallocHost((void **)&h_a, bytes_a), "cudaMallocHost(h_a)");
    check_cuda(cudaMallocHost((void **)&h_b, bytes_b), "cudaMallocHost(h_b)");
    check_cuda(cudaMallocHost((void **)&h_c, bytes_c), "cudaMallocHost(h_c)");
    check_cuda(cudaMallocHost((void **)&h_cc, bytes_c), "cudaMallocHost(h_cc)");

    // random initialize matrix A
    for (int i = 0; i < m; ++i) {
        for (int j = 0; j < n; ++j) {
            h_a[i * n + j] = rand() % 1024;
        }
    }
    // random initialize matrix B
    for (int i = 0; i < n; ++i) {
        for (int j = 0; j < k; ++j) {
            h_b[i * k + j] = rand() % 1024;
        }
    }

    // Allocate memory space on the device
    int *d_a = NULL, *d_b = NULL, *d_c = NULL;
    check_cuda(cudaMalloc((void **)&d_a, bytes_a), "cudaMalloc(d_a)");
    check_cuda(cudaMalloc((void **)&d_b, bytes_b), "cudaMalloc(d_b)");
    check_cuda(cudaMalloc((void **)&d_c, bytes_c), "cudaMalloc(d_c)");

    // copy matrix A and B from host to device memory
    check_cuda(cudaMemcpy(d_a, h_a, bytes_a, cudaMemcpyHostToDevice), "cudaMemcpy(d_a)");
    check_cuda(cudaMemcpy(d_b, h_b, bytes_b, cudaMemcpyHostToDevice), "cudaMemcpy(d_b)");

    // Events must be created before they can be recorded.
    cudaEvent_t start, stop;
    check_cuda(cudaEventCreate(&start), "cudaEventCreate(start)");
    check_cuda(cudaEventCreate(&stop), "cudaEventCreate(stop)");

    // start to count execution time of GPU version
    check_cuda(cudaEventRecord(start, 0), "cudaEventRecord(start)");

    const unsigned int grid_rows = (m + block_size - 1) / block_size;
    const unsigned int grid_cols = (k + block_size - 1) / block_size;
    dim3 dimGrid(grid_cols, grid_rows);
    dim3 dimBlock(block_size, block_size);

    // Launch kernel
    gpu_square_matrix_mult<<<dimGrid, dimBlock>>>(d_a, d_b, d_c, n);
    check_cuda(cudaGetLastError(), "kernel launch");

    // Transfer results from device to host; this blocking copy also surfaces
    // any error raised while the kernel was executing.
    check_cuda(cudaMemcpy(h_c, d_c, bytes_c, cudaMemcpyDeviceToHost), "cudaMemcpy(h_c)");

    // time counting terminate
    check_cuda(cudaEventRecord(stop, 0), "cudaEventRecord(stop)");
    // The stop event has to complete before the elapsed time can be queried.
    check_cuda(cudaEventSynchronize(stop), "cudaEventSynchronize(stop)");

    // compute time elapsed on GPU computing
    float gpu_elapsed_time_ms = 0.0f;
    check_cuda(cudaEventElapsedTime(&gpu_elapsed_time_ms, start, stop),
               "cudaEventElapsedTime(gpu)");
    printf("Time elapsed on matrix multiplication of %dx%d . %dx%d on GPU: %f ms.\n\n",
           m, n, n, k, gpu_elapsed_time_ms);

    // start the CPU version
    check_cuda(cudaEventRecord(start, 0), "cudaEventRecord(start)");
    cpu_matrix_mult(h_a, h_b, h_cc, m, n, k);
    check_cuda(cudaEventRecord(stop, 0), "cudaEventRecord(stop)");
    check_cuda(cudaEventSynchronize(stop), "cudaEventSynchronize(stop)");

    float cpu_elapsed_time_ms = 0.0f;
    check_cuda(cudaEventElapsedTime(&cpu_elapsed_time_ms, start, stop),
               "cudaEventElapsedTime(cpu)");
    printf("Time elapsed on matrix multiplication of %dx%d . %dx%d on CPU: %f ms.\n\n",
           m, n, n, k, cpu_elapsed_time_ms);

    // validate results computed by GPU
    int all_ok = 1;
    for (int i = 0; i < m && all_ok; ++i) {
        for (int j = 0; j < k; ++j) {
            if (h_cc[i * k + j] != h_c[i * k + j]) {
                all_ok = 0;
                break;
            }
        }
    }

    if (all_ok) {
        printf("All results are correct!!!, speedup = %f\n", cpu_elapsed_time_ms / gpu_elapsed_time_ms);
    } else {
        printf("Incorrect results\n");
    }

    // free memory
    check_cuda(cudaEventDestroy(start), "cudaEventDestroy(start)");
    check_cuda(cudaEventDestroy(stop), "cudaEventDestroy(stop)");
    check_cuda(cudaFree(d_a), "cudaFree(d_a)");
    check_cuda(cudaFree(d_b), "cudaFree(d_b)");
    check_cuda(cudaFree(d_c), "cudaFree(d_c)");
    check_cuda(cudaFreeHost(h_a), "cudaFreeHost(h_a)");
    check_cuda(cudaFreeHost(h_b), "cudaFreeHost(h_b)");
    check_cuda(cudaFreeHost(h_c), "cudaFreeHost(h_c)");
    check_cuda(cudaFreeHost(h_cc), "cudaFreeHost(h_cc)");
    return 0;
}
/*
 * Evaluates a CUDA runtime call exactly once and, if it did not return
 * cudaSuccess, reports the error string and the call site's line number on
 * stderr. Execution continues after the report; the macro does not abort.
 *
 * Wrapped in do { } while (0) so that `CUDA_CHECK(x);` behaves as a single
 * statement, including inside an unbraced if/else.
 */
#define CUDA_CHECK(status)                                              \
    do {                                                                \
        cudaError_t error = (status);                                   \
        if (error != cudaSuccess) {                                     \
            fprintf(stderr, "Got bad cuda status: %s at line: %d\n",    \
                    cudaGetErrorString(error), __LINE__);               \
        }                                                               \
    } while (0)
// Example call site for CUDA_CHECK wrapping a cudaMalloc call.
// NOTE(review): dA and total_size_dA_dB are not declared anywhere in the
// visible code, and this statement sits outside any function here — presumably
// it was copied from another program; confirm before relying on it.
CUDA_CHECK(cudaMalloc(&dA, total_size_dA_dB));