我一直在处理非常大的设备端数组。当规模超过一定限度后,我发现在内核内部(内核:dmemd)动态分配的数组会出现设备缓冲区空间不足的问题(用 nvprof 观察,大约在 g_size = 8 附近出现)。
于是我改为从主机端分配这些数组(内核:dmemh),这样确实可以创建更大的设备端数组;但是当规模超过一定限度后(用 nvprof 观察,大约在 g_size = 55k 附近),设备缓冲区空间不足的问题再次出现。而我需要把规模增加到 g_size = 120k。
关于算法:
关于尝试的代码架构:
也非常欢迎提出其他高效的并行化方案。
这是要使用的示例代码:
#include <stdio.h>
#include <stdlib.h>
#include <iostream>
#include <complex.h>
#include <cuComplex.h>
#include <math.h>
#include <string.h>
#include <time.h>
#include <ctime>
#include <stdint.h>
#include <cstring>
#include"cuda_runtime.h"
#include"device_launch_parameters.h"
// Sizes of the arrays used by the kernels and by main.
// g_size: number of blocks launched by main and number of entries in the
// per-block sum array (one entry per block index).
const uint32_t g_size = 55000U;
// g_det_size: number of elements in each block's "det" array.
const uint32_t g_det_size = 40000U;
// g_sum_size: number of leading "det" elements each block adds up. The kernels
// read det[0 .. g_sum_size - 1], so this must not exceed g_det_size.
const uint32_t g_sum_size = 30000U;
// Kernel: per-block scratch array allocated inside the kernel with `new`.
//
// Launch layout: the block index (blockIdx.x) selects the output slot and
// every launched thread runs the whole body, so it is meant to be launched
// with one thread per block (as the commented-out launch in main does).
//
// d_sum: device array with one element per launched block; element
//        blockIdx.x receives the sum of the first g_sum_size entries of this
//        block's scratch array.
//
// The scratch array (g_det_size elements) is taken from the device heap.
// That heap has a fixed size (see cudaLimitMallocHeapSize), so the allocation
// can fail when many blocks allocate at once. On failure the block stores NaN
// in both components of its d_sum slot instead of writing through a null
// pointer, which lets the host notice that the result is invalid.
__global__ void dmemd(cuDoubleComplex *d_sum) {
    const int gid = blockIdx.x;

    // Scratch array private to this block.
    cuDoubleComplex *d_det = new cuDoubleComplex[g_det_size];
    if (d_det == NULL) {
        // Device heap exhausted: flag the slot rather than faulting.
        d_sum[gid] = make_cuDoubleComplex(nan(""), nan(""));
        return;
    }

    // Fill the scratch array; real and imaginary parts hold the same value.
    for (uint32_t i = 0; i < g_det_size; i++) {
        const double v = (i * 1.0 / (gid + 1)) * (0.01 / g_det_size);
        d_det[i] = make_cuDoubleComplex(v, v);
    }

    // Add up the first g_sum_size entries.
    cuDoubleComplex dt_sum = make_cuDoubleComplex(0.0, 0.0);
    for (uint32_t i = 0; i < g_sum_size; i++) {
        dt_sum = cuCadd(dt_sum, d_det[i]);
    }

    // One result per block index.
    d_sum[gid] = dt_sum;
    delete[] d_det;
}
// Kernel: per-block scratch array carved out of one buffer allocated by the host.
//
// Launch layout: the block index (blockIdx.x) selects both the output slot
// and the scratch slice, and every launched thread runs the whole body; main
// launches it with one thread per block.
//
// d_sum: device array with one element per launched block; element
//        blockIdx.x receives the sum of the first g_sum_size entries of this
//        block's slice.
// d_det: device buffer holding g_det_size elements per launched block; block
//        b uses elements [b * g_det_size, (b + 1) * g_det_size).
__global__ void dmemh(cuDoubleComplex *d_sum, cuDoubleComplex *d_det) {
    const int gid = blockIdx.x;

    // The slice offset is computed in size_t: gid * g_det_size would wrap in
    // 32-bit arithmetic once the product exceeds 2^32 - 1 (for example with
    // 120000 blocks and g_det_size = 40000).
    cuDoubleComplex *slice = d_det + (size_t)gid * g_det_size;

    // Fill this block's slice; real and imaginary parts hold the same value.
    for (uint32_t i = 0; i < g_det_size; i++) {
        const double v = (i * 1.0 / (gid + 1)) * (0.01 / g_det_size);
        slice[i] = make_cuDoubleComplex(v, v);
    }

    // Add up the first g_sum_size entries of the slice.
    cuDoubleComplex dt_sum = make_cuDoubleComplex(0.0, 0.0);
    for (uint32_t i = 0; i < g_sum_size; i++) {
        dt_sum = cuCadd(dt_sum, slice[i]);
    }

    // One result per block index.
    d_sum[gid] = dt_sum;
}
// Aborts the program with a diagnostic when a CUDA runtime call fails.
static void checkCuda(cudaError_t status, const char *what) {
    if (status != cudaSuccess) {
        fprintf(stderr, "CUDA error in %s: %s\n", what, cudaGetErrorString(status));
        exit(EXIT_FAILURE);
    }
}

// Runs dmemh with one block per output entry, copies the per-block sums back
// to the host, and prints their total.
//
// Returns 0 on success; exits with EXIT_FAILURE if a host or device
// allocation, the kernel launch, or a copy fails.
int main() {
    const size_t sum_bytes = sizeof(cuDoubleComplex) * g_size;
    // Scratch space for every block at once. With the current constants this
    // is about 35.2 GB (assuming a 16-byte cuDoubleComplex), so the allocation
    // below is the first thing to fail on a device without that much memory.
    const size_t det_bytes = sizeof(cuDoubleComplex) * g_det_size * g_size;

    // Host copy of the per-block sums. The host uses cuDoubleComplex so that
    // it has exactly the element type that is copied from the device.
    cuDoubleComplex *sum = (cuDoubleComplex *)malloc(sum_bytes);
    if (sum == NULL) {
        fprintf(stderr, "Host allocation of %zu bytes failed\n", sum_bytes);
        return EXIT_FAILURE;
    }

    cuDoubleComplex *d_sum = NULL;
    checkCuda(cudaMalloc((void **)&d_sum, sum_bytes), "cudaMalloc(d_sum)");
    // Pass the device pointer itself, not the address of the host variable.
    checkCuda(cudaMemset(d_sum, 0, sum_bytes), "cudaMemset(d_sum)");

    dim3 block(1);
    dim3 grid(g_size);

    // kernel: dynamical array inside device
    //dmemd << < grid, block >> > (d_sum);

    // kernel: dynamical device array from host
    cuDoubleComplex *d_det = NULL;
    checkCuda(cudaMalloc((void **)&d_det, det_bytes), "cudaMalloc(d_det)");
    checkCuda(cudaMemset(d_det, 0, det_bytes), "cudaMemset(d_det)");

    dmemh<<<grid, block>>>(d_sum, d_det);
    // Launch-configuration errors show up here, execution errors at the sync.
    checkCuda(cudaGetLastError(), "dmemh launch");
    checkCuda(cudaDeviceSynchronize(), "dmemh execution");

    // memory copy device to host
    checkCuda(cudaMemcpy(sum, d_sum, sum_bytes, cudaMemcpyDeviceToHost),
              "cudaMemcpy(sum <- d_sum)");

    // Accumulate the per-block sums on the host.
    cuDoubleComplex tsum = make_cuDoubleComplex(0.0, 0.0);
    for (uint32_t i = 0; i < g_size; i++) {
        tsum = cuCadd(tsum, sum[i]);
    }
    printf("Total sum: %.16E,%.16E \n", cuCreal(tsum), cuCimag(tsum));

    checkCuda(cudaFree(d_det), "cudaFree(d_det)");
    checkCuda(cudaFree(d_sum), "cudaFree(d_sum)");
    free(sum);
    cudaDeviceReset();
    return 0;
}