我正在使用 2D 卷积并将滤波器 (3 x 3) 应用于图像 (2048 x 2048)。我编写了两个版本:一个使用全局内存访问,另一个使用过滤器的常量内存。当我对代码进行基准测试(在我的 RTX 3090 上)时,我发现使用恒定内存没有任何改进。
内核使用全局内存
__global__ void gpu_conv2d_kernel(float *d_N_ptr, float *d_F_ptr, float *d_P_ptr, int n_rows, int n_cols)
{
// Which output element this thread works on
int out_col = blockIdx.x*blockDim.x + threadIdx.x;
int out_row = blockIdx.y*blockDim.y + threadIdx.y;
// Check if output element is valid
if (out_row < n_rows && out_col < n_cols)
{
// Result (in thread register)
float p_val = 0.0f;
// Loop over elements of the filter array
for (int f_row = 0; f_row < 2*FILTER_RADIUS+1; f_row++)
{
for (int f_col = 0; f_col < 2*FILTER_RADIUS+1; f_col++)
{
// Input element to filter element mapping
int in_row = out_row + (f_row - FILTER_RADIUS);
int in_col = out_col + (f_col - FILTER_RADIUS);
// Boundary check
if (in_row >= 0 && in_row < n_rows && in_col >= 0 && in_col < n_cols)
p_val += d_F_ptr[f_row*(2*FILTER_RADIUS+1) + f_col] * d_N_ptr[in_row*n_cols + in_col];
}
}
d_P_ptr[out_row*n_cols + out_col] = p_val;
}
}
使用常量内存的内核
#define FILTER_RADIUS 1
extern __constant__ float d_F[(2*FILTER_RADIUS+1)*(2*FILTER_RADIUS+1)];
__global__ void gpu_conv2d_constMem_kernel(float *d_N_ptr, float *d_P_ptr, int n_rows, int n_cols)
{
// Which output element this thread works on
int out_col = blockIdx.x*blockDim.x + threadIdx.x;
int out_row = blockIdx.y*blockDim.y + threadIdx.y;
// Check if output element is valid
if (out_row < n_rows && out_col < n_cols)
{
// Result (in thread register)
float p_val = 0.0f;
// Loop over elements of the filter array
for (int f_row = 0; f_row < 2*FILTER_RADIUS+1; f_row++)
{
for (int f_col = 0; f_col < 2*FILTER_RADIUS+1; f_col++)
{
// Input element to filter element mapping
int in_row = out_row + (f_row - FILTER_RADIUS);
int in_col = out_col + (f_col - FILTER_RADIUS);
// Boundary check
if (in_row >= 0 && in_row < n_rows && in_col >= 0 && in_col < n_cols)
p_val += d_F[f_row*(2*FILTER_RADIUS+1)+f_col] * d_N_ptr[in_row*n_cols + in_col];
}
}
d_P_ptr[out_row*n_cols + out_col] = p_val;
}
}
我编写了一个主函数,它分别对代码的各个部分(内存分配、数据传输、内核执行等)执行基准测试。
// All essential includes
// .
// .
// .
#define FILTER_RADIUS 1
__constant__ float d_F[(2*FILTER_RADIUS+1)*(2*FILTER_RADIUS+1)];
// CUDA Error Checking
#define cuda_check(err) { \
if (err != cudaSuccess) { \
std::cout << cudaGetErrorString(err) << " in " << __FILE__ << " at line " << __LINE__ << "\n"; \
exit(EXIT_FAILURE); \
} \
}
int main(int argc, char const *argv[])
{
// Benchmarking variables
float elapsed_time_mem_alloc,
elapsed_time_mem_t_in, elapsed_time_mem_t_f, elapsed_time_mem_t_out,
elapsed_time_kernel;
cudaEvent_t beg, end;
cudaEventCreate(&beg);
cudaEventCreate(&end);
// ---------------------------------------------------------- //
// ------------------ Load image in memory ------------------ //
// ---------------------------------------------------------- //
//.
//.
//.
// ---------------------------------------------------------- //
// ----------------- GPU memory allocation ------------------ //
// ---------------------------------------------------------- //
cudaError_t err;
std::cout << "Allocating GPU memory... \n";
cudaEventRecord(beg);
float* d_N;
err = cudaMalloc((void**) &d_N, new_size*new_size*sizeof(float));
cuda_check(err);
float *d_P;
err = cudaMalloc((void**) &d_P, new_size*new_size*sizeof(float));
cuda_check(err);
cudaEventRecord(end);
cudaEventSynchronize(beg);
cudaEventSynchronize(end);
cudaEventElapsedTime(&elapsed_time_mem_alloc, beg, end);
elapsed_time_mem_alloc /= 1000.;
std::cout << "Time for GPU memory allocation (seconds): " << elapsed_time_mem_alloc << "\n";
std::cout << "\n";
// ---------------------------------------------------------- //
// ------------------- Move input to GPU -------------------- //
// ---------------------------------------------------------- //
std::cout << "Moving input to GPU memory... \n";
cudaEventRecord(beg);
err = cudaMemcpy(d_N, N, new_size*new_size*sizeof(float), cudaMemcpyHostToDevice);
cuda_check(err);
cudaEventRecord(end);
cudaEventSynchronize(beg);
cudaEventSynchronize(end);
cudaEventElapsedTime(&elapsed_time_mem_t_in, beg, end);
elapsed_time_mem_t_in /= 1000.;
std::cout << "Time for input data transfer (seconds): " << elapsed_time_mem_t_in << "\n";
std::cout << "\n";
// ------------------------------------------------------------------------- //
// ----------------------- Initialize filter ------------------------------- //
// ------------------------------------------------------------------------- //
std::string filter_type;
float *F = new float[(2*FILTER_RADIUS+1)*(2*FILTER_RADIUS+1)];
int iter = 0;
while (true)
{
// ------------------------------------------------------------------------- //
// Which filter; Options: Sharpen, High-pass, Low-pass, Gaussian, d_Gaussian //
// ------------------------------------------------------------------------- //
std::cout << "Filter options: Sharpen, High-pass, Low-pass, Gaussian, d_Gaussian \n";
std::cout << "Enter filter (press 'q' to exit): ";
std::cin >> filter_type;
// ---------------------------------------------------------- //
// ---------------- Defining filter matrix ------------------ //
// ---------------------------------------------------------- //
if (filter_type == "Sharpen")
{
float alpha = 0.8f;
std::cout << "Enter alpha between 0 and 1 (default: 0.8): ";
std::cin >> alpha;
std::cout << "\n";
F[0] = -alpha/(9-9*alpha);
F[1] = -alpha/(9-9*alpha);
F[2] = -alpha/(9-9*alpha);
F[3] = -alpha/(9-9*alpha);
F[4] = (9-alpha)/(9-9*alpha);
F[5] = -alpha/(9-9*alpha);
F[6] = -alpha/(9-9*alpha);
F[7] = -alpha/(9-9*alpha);
F[8] = -alpha/(9-9*alpha);
}
else if (filter_type == "High-pass")
{
std::cout << "\n";
F[0] = -1;
F[1] = -1;
F[2] = -1;
F[3] = -1;
F[4] = 8;
F[5] = -1;
F[6] = -1;
F[7] = -1;
F[8] = -1;
}
else if (filter_type == "Low-pass")
{
float alpha = 9.0f;
std::cout << "Enter alpha (default: 9.0): ";
std::cin >> alpha;
std::cout << "\n";
F[0] = 1/alpha;
F[1] = 1/alpha;
F[2] = 1/alpha;
F[3] = 1/alpha;
F[4] = 1/alpha;
F[5] = 1/alpha;
F[6] = 1/alpha;
F[7] = 1/alpha;
F[8] = 1/alpha;
}
else if (filter_type == "Gaussian")
{
float alpha = 16.0f;
std::cout << "Enter alpha (default: 16.0): ";
std::cin >> alpha;
std::cout << "\n";
F[0] = 1/alpha;
F[1] = 2/alpha;
F[2] = 1/alpha;
F[3] = 2/alpha;
F[4] = 3/alpha;
F[5] = 4/alpha;
F[6] = 1/alpha;
F[7] = 2/alpha;
F[8] = 1/alpha;
}
else if (filter_type == "d_Gaussian")
{
std::cout << "\n";
F[0] = -2;
F[1] = 1;
F[2] = -2;
F[3] = 1;
F[4] = 4;
F[5] = 1;
F[6] = -2;
F[7] = 1;
F[8] = -2;
}
else if (filter_type == "q")
{
break;
}
else
{
std::cout << "Filter not supported!" << "\n";
std::terminate();
}
// ---------------------------------------------------------- //
// ------------------ Move filter to GPU -------------------- //
// ---------------------------------------------------------- //
std::cout << "Moving filter to GPU constant memory... \n";
cudaEventRecord(beg);
err = cudaMemcpyToSymbol(d_F, F, (2*FILTER_RADIUS+1)*(2*FILTER_RADIUS+1)*sizeof(float));
cuda_check(err);
cudaDeviceSynchronize();
cudaEventRecord(end);
cudaEventSynchronize(beg);
cudaEventSynchronize(end);
cudaEventElapsedTime(&elapsed_time_mem_t_f, beg, end);
elapsed_time_mem_t_f /= 1000.;
std::cout << "Time for filter data transfer (seconds): " << elapsed_time_mem_t_f << "\n";
std::cout << "\n";
// ---------------------------------------------------------- //
// --------------------- 2D Convolution --------------------- //
// ---------------------------------------------------------- //
// Applying filters frame by frame
std::cout << "Applying filter... \n";
// Kernel execution
cudaEventRecord(beg);
dim3 dim_block(32, 32, 1);
dim3 dim_grid(ceil(new_size/(float)(32)), ceil(new_size/(float)(32)), 1);
gpu_conv2d_constMem_kernel<<<dim_grid, dim_block>>>(d_N, d_P, new_size, new_size);
cudaDeviceSynchronize();
cudaEventRecord(end);
cudaEventSynchronize(beg);
cudaEventSynchronize(end);
cudaEventElapsedTime(&elapsed_time_kernel, beg, end);
elapsed_time_kernel /= 1000.;
std::cout << "Time for kernel execution (seconds): " << elapsed_time_kernel << "\n";
std::cout << "\n";
// ---------------------------------------------------------- //
// ---------- Copying result back to host memory -------------//
// ---------------------------------------------------------- //
std::cout << "Moving result to CPU memory... \n";
cudaEventRecord(beg);
err = cudaMemcpy(P, d_P, new_size*new_size*sizeof(float), cudaMemcpyDeviceToHost);
cuda_check(err);
cudaEventRecord(end);
cudaEventSynchronize(beg);
cudaEventSynchronize(end);
cudaEventElapsedTime(&elapsed_time_mem_t_out, beg, end);
elapsed_time_mem_t_out /= 1000.;
std::cout << "Time for output data transfer (seconds): " << elapsed_time_mem_t_out << "\n";
std::cout << "\n";
// ---------------------------------------------------------- //
// --------------------- Benchmarking ------------------------//
// ---------------------------------------------------------- //
std::cout << "--------------------- \n";
std::cout << "Benchmarking details: \n";
std::cout << "--------------------- \n";
if (iter == 0)
{
std::cout << "Time (total): " << elapsed_time_kernel + elapsed_time_mem_alloc +
elapsed_time_mem_t_in + elapsed_time_mem_t_f + elapsed_time_mem_t_out << "\n";
std::cout << "FPS (total): " << 1 / (elapsed_time_kernel + elapsed_time_mem_alloc +
elapsed_time_mem_t_in + elapsed_time_mem_t_f + elapsed_time_mem_t_out) << "\n";
std::cout << "\n";
}
else
{
std::cout << "Time (total): " << elapsed_time_kernel + elapsed_time_mem_t_f + elapsed_time_mem_t_out << "\n";
std::cout << "FPS (total): " << 1 / (elapsed_time_kernel + elapsed_time_mem_t_f+ elapsed_time_mem_t_out) << "\n";
std::cout << "\n";
}
std::cout << "Time (kernel): " << elapsed_time_kernel << "\n";
std::cout << "FPS (kernel): " << 1 / (elapsed_time_kernel) << "\n";
std::cout << "GFLOPS (kernel): " << 2*new_size*new_size*(2*FILTER_RADIUS+1)*(2*FILTER_RADIUS+1) * 1e-9 / elapsed_time_kernel << "\n";
std::cout << "------------------------------------ \n";
std::cout << "\n";
// ----------------------------------------------------------------- //
// -------------------- Saving output as jpg ----------------------- //
// ----------------------------------------------------------------- //
//.
//.
//.
iter += 1;
}
delete[] N;
delete[] F;
delete[] P;
cudaFree(d_N);
cudaFree(d_P);
return 0;
}
您可以在这里查看代码存储库。我不确定实施/基准测试是否存在问题,或者问题可能不够复杂(尽管我尝试了大输入图像尺寸)。
每次访问过滤器元素有 17 种运算,包括按位、整数乘法和加法、浮点乘法和加法。所有这些的总延迟可能大于缓存的全局内存缓冲区的访问延迟。
您可以减少每次过滤器访问的操作数量:
也许可以测量恒定记忆的差异,而无需隐藏在其他人后面。