我有一个 16 倍粗化(1×16 平铺)的内核。为了减少 STG(全局存储)指令,我通过 uchar4 实现了向量化内存访问。查看内存图表时,我看到了如下结果:
矢量化内存访问
__global__ void k_1D_gf_3x3_vectorized16_global(unsigned char* input, unsigned char* output, int rows, int cols)
{
    // 3x3 convolution, 16x coarsened: each thread produces 16 horizontally
    // adjacent output pixels (a 1x16 tile) and writes them back with four
    // uchar4 stores, cutting the STG instruction count 4x vs. scalar stores.
    // The >> 4 divides by 16, presumably the sum of the 3x3 kernel weights
    // (classic 3x3 Gaussian) — confirm against global_conv_kernel3x3.
    int ty = (blockIdx.x * blockDim.x + threadIdx.x) * 16; // first column of this thread's tile
    int tx = blockIdx.y * blockDim.y + threadIdx.y;        // row of this thread's tile
    int vals[16] = { 0 };        // per-pixel results; border/out-of-range pixels stay 0
    unsigned char frame[3][3];   // sliding 3x3 input window
    if ((tx > 0 && tx < rows - 1) && (ty > 0 && ty < cols - 1)) {
        // Prime the full 3x3 window centered on (tx, ty).
        frame[0][0] = input[(tx - 1) * cols + ty - 1];
        frame[0][1] = input[(tx - 1) * cols + ty];
        frame[0][2] = input[(tx - 1) * cols + ty + 1];
        frame[1][0] = input[tx * cols + ty - 1];
        frame[1][1] = input[tx * cols + ty];
        frame[1][2] = input[tx * cols + ty + 1];
        frame[2][0] = input[(tx + 1) * cols + ty - 1];
        frame[2][1] = input[(tx + 1) * cols + ty];
        frame[2][2] = input[(tx + 1) * cols + ty + 1];
        vals[0] = (global_conv_kernel3x3[0][0] * frame[0][0]
            + global_conv_kernel3x3[0][1] * frame[0][1]
            + global_conv_kernel3x3[0][2] * frame[0][2]
            + global_conv_kernel3x3[1][0] * frame[1][0]
            + global_conv_kernel3x3[1][1] * frame[1][1]
            + global_conv_kernel3x3[1][2] * frame[1][2]
            + global_conv_kernel3x3[2][0] * frame[2][0]
            + global_conv_kernel3x3[2][1] * frame[2][1]
            + global_conv_kernel3x3[2][2] * frame[2][2]) >> 4;
        for (int i = 1; i < 16; i++) {
            int _ty = ty + i;
            // Slide the window one column right; only the new rightmost
            // column has to be reloaded from global memory.
            shift_left(frame);
            // tx bounds and _ty > 0 already hold inside the outer guard;
            // only the right image border can invalidate this pixel.
            if (_ty < cols - 1) {
                frame[0][2] = input[(tx - 1) * cols + _ty + 1];
                frame[1][2] = input[tx * cols + _ty + 1];
                frame[2][2] = input[(tx + 1) * cols + _ty + 1];
                vals[i] = (global_conv_kernel3x3[0][0] * frame[0][0]
                    + global_conv_kernel3x3[0][1] * frame[0][1]
                    + global_conv_kernel3x3[0][2] * frame[0][2]
                    + global_conv_kernel3x3[1][0] * frame[1][0]
                    + global_conv_kernel3x3[1][1] * frame[1][1]
                    + global_conv_kernel3x3[1][2] * frame[1][2]
                    + global_conv_kernel3x3[2][0] * frame[2][0]
                    + global_conv_kernel3x3[2][1] * frame[2][1]
                    + global_conv_kernel3x3[2][2] * frame[2][2]) >> 4;
            }
        }
        // BUG FIX: the old code unconditionally issued four uchar4 stores
        // covering columns ty..ty+15 whenever ty < cols-1. A tile crossing
        // the right image border therefore wrote zeros over the start of the
        // next row — and past the end of the buffer on the last row. Use the
        // vectorized path only for full tiles; partial tail tiles fall back
        // to scalar stores.
        // NOTE(review): the uchar4 stores also require &output[tx*cols+ty]
        // to be 4-byte aligned; ty is a multiple of 16, so this needs cols
        // to be a multiple of 4 — confirm for your image sizes.
        if (ty + 16 <= cols) {
            reinterpret_cast<uchar4*>(&output[(tx * cols + ty)])[0] = make_uchar4(vals[0], vals[1], vals[2], vals[3]);
            reinterpret_cast<uchar4*>(&output[(tx * cols + ty + 4)])[0] = make_uchar4(vals[4], vals[5], vals[6], vals[7]);
            reinterpret_cast<uchar4*>(&output[(tx * cols + ty + 8)])[0] = make_uchar4(vals[8], vals[9], vals[10], vals[11]);
            reinterpret_cast<uchar4*>(&output[(tx * cols + ty + 12)])[0] = make_uchar4(vals[12], vals[13], vals[14], vals[15]);
        } else {
            for (int i = 0; ty + i < cols; i++) {
                output[tx * cols + ty + i] = (unsigned char)vals[i];
            }
        }
    }
}
__global__ void k_1D_gf_3x3_load_balance16_global(unsigned char* input, unsigned char* output, int rows, int cols)
{
    // 3x3 convolution, 16x coarsened (1x16 tile) with scalar stores: each
    // thread walks 16 horizontally adjacent pixels and writes one byte per
    // pixel. The >> 4 divides by 16, the total of the kernel weights.
    const int ty = (blockIdx.x * blockDim.x + threadIdx.x) * 16; // first column of the tile
    const int tx = blockIdx.y * blockDim.y + threadIdx.y;        // row of the tile

    // Guard clause: threads anchored on (or outside) the image border do nothing.
    if (tx <= 0 || tx >= rows - 1 || ty <= 0 || ty >= cols - 1) {
        return;
    }

    unsigned char frame[3][3]; // sliding 3x3 input window

    // Weighted sum of the current window, normalized by the weight total.
    auto convolve = [&frame]() {
        int acc = 0;
        for (int r = 0; r < 3; ++r) {
            for (int c = 0; c < 3; ++c) {
                acc += global_conv_kernel3x3[r][c] * frame[r][c];
            }
        }
        return (unsigned char)(acc >> 4);
    };

    // Prime the full 3x3 window centered on (tx, ty) and emit the first pixel.
    for (int r = 0; r < 3; ++r) {
        for (int c = 0; c < 3; ++c) {
            frame[r][c] = input[(tx - 1 + r) * cols + (ty - 1 + c)];
        }
    }
    output[tx * cols + ty] = convolve();

    // Walk the remaining 15 columns, reloading only the new rightmost column.
    for (int step = 1; step < 16; ++step) {
        const int col = ty + step;
        shift_left(frame);
        // tx bounds and col > 0 already hold after the guard clause above;
        // only the right image border can invalidate this pixel.
        if (col < cols - 1) {
            frame[0][2] = input[(tx - 1) * cols + col + 1];
            frame[1][2] = input[tx * cols + col + 1];
            frame[2][2] = input[(tx + 1) * cols + col + 1];
            output[tx * cols + col] = convolve();
        }
    }
}
如您所见,唯一的区别是存储输出元素。正如我预期的那样,用于存储元素 (STG) 的全局内存访问量减少了 75%。
全局加载次数减少,因为在第一个版本中,编译器无法确认 global_conv_kernel3x3 不与 output 重叠,因此在每次写入 output 之后,都必须重新加载该值,以确保它没有被改变。
在常规内核的 PTX 源代码中,您可以看到 2x 重复的全局加载:
ld.global.u64 %rd7, [global_conv_kernel3x3];
...
ld.global.u64 %rd15, [global_conv_kernel3x3];
为了解决这个问题,您可以将
global_conv_kernel3x3
标记为 __constant__
以告诉编译器值永远不会改变。我已在本地确认它确实解决了您的问题:
$ head -n3 test.cu
__constant__ char** global_conv_kernel3x3;
__global__ void k_1D_gf_3x3_load_balance16_global(unsigned char* input, unsigned char* output, int rows, int cols)
$ nvcc-preview -ptx test.cu
$ cat test.ptx | grep global_conv
.const .align 8 .u64 global_conv_kernel3x3;
ld.const.u64 %rd8, [global_conv_kernel3x3];