Vectorized memory stores reduce load instructions


I have a kernel with a coarsening factor of 16 (1x16 tiling). To reduce the number of STG (store global) instructions, in my case I implemented vectorized memory access via uchar4. When I look at the memory chart in Nsight Compute, I see this:

Scalar memory access: (memory chart screenshot)

Vectorized memory access: (memory chart screenshots)

How is this possible? The number of global load instructions also decreased. These are the kernels:

__global__ void k_1D_gf_3x3_vectorized16_global(unsigned char* input, unsigned char* output, int rows, int cols)
{
    // Each thread computes 16 consecutive output pixels of one row
    // (16x coarsening, 1x16 tiling); ty is the starting column, tx the row.
    int ty = (blockIdx.x * blockDim.x + threadIdx.x) * 16;
    int tx = blockIdx.y * blockDim.y + threadIdx.y;

    int vals[16] = { 0 };      // buffered results for the 16 pixels
    unsigned char frame[3][3]; // sliding 3x3 input window

    if ((tx > 0 && tx < rows - 1) && (ty > 0 && ty  < cols - 1)) {
        frame[0][0] = input[(tx - 1) * cols + ty - 1];
        frame[0][1] = input[(tx - 1) * cols + ty];
        frame[0][2] = input[(tx - 1) * cols + ty + 1];
        frame[1][0] = input[tx * cols + ty - 1];
        frame[1][1] = input[tx * cols + ty];
        frame[1][2] = input[tx * cols + ty + 1];
        frame[2][0] = input[(tx + 1) * cols + ty - 1];
        frame[2][1] = input[(tx + 1) * cols + ty];
        frame[2][2] = input[(tx + 1) * cols + ty + 1];

        vals[0] = (global_conv_kernel3x3[0][0] * frame[0][0]
            + global_conv_kernel3x3[0][1] * frame[0][1]
            + global_conv_kernel3x3[0][2] * frame[0][2]
            + global_conv_kernel3x3[1][0] * frame[1][0]
            + global_conv_kernel3x3[1][1] * frame[1][1]
            + global_conv_kernel3x3[1][2] * frame[1][2]
            + global_conv_kernel3x3[2][0] * frame[2][0]
            + global_conv_kernel3x3[2][1] * frame[2][1]
            + global_conv_kernel3x3[2][2] * frame[2][2]) >> 4;

        for (int i = 1; i < 16; i++) {
            int _ty = ty + i;
            shift_left(frame); // slide the window; only the new right column is loaded below
            if ((tx > 0 && tx < rows - 1) && (_ty > 0 && _ty < cols - 1)) {
                frame[0][2] = input[(tx - 1) * cols + _ty + 1];
                frame[1][2] = input[tx * cols + _ty + 1];
                frame[2][2] = input[(tx + 1) * cols + _ty + 1];

                vals[i] = (global_conv_kernel3x3[0][0] * frame[0][0]
                    + global_conv_kernel3x3[0][1] * frame[0][1]
                    + global_conv_kernel3x3[0][2] * frame[0][2]
                    + global_conv_kernel3x3[1][0] * frame[1][0]
                    + global_conv_kernel3x3[1][1] * frame[1][1]
                    + global_conv_kernel3x3[1][2] * frame[1][2]
                    + global_conv_kernel3x3[2][0] * frame[2][0]
                    + global_conv_kernel3x3[2][1] * frame[2][1]
                    + global_conv_kernel3x3[2][2] * frame[2][2]) >> 4;
            }
        }
        // Four 4-byte uchar4 stores replace 16 single-byte stores
        // (this assumes output + tx * cols + ty is 4-byte aligned).
        reinterpret_cast<uchar4*>(&output[(tx * cols + ty)])[0] = make_uchar4(vals[0], vals[1], vals[2], vals[3]);
        reinterpret_cast<uchar4*>(&output[(tx * cols + ty + 4)])[0] = make_uchar4(vals[4], vals[5], vals[6], vals[7]);
        reinterpret_cast<uchar4*>(&output[(tx * cols + ty + 8)])[0] = make_uchar4(vals[8], vals[9], vals[10], vals[11]);
        reinterpret_cast<uchar4*>(&output[(tx * cols + ty + 12)])[0] = make_uchar4(vals[12], vals[13], vals[14], vals[15]);
    }
}

__global__ void k_1D_gf_3x3_load_balance16_global(unsigned char* input, unsigned char* output, int rows, int cols)
{
    // Same 1x16 convolution; each result is written immediately with a
    // single-byte store instead of being buffered for vectorized stores.
    int ty = (blockIdx.x * blockDim.x + threadIdx.x) * 16;
    int tx = blockIdx.y * blockDim.y + threadIdx.y;

    unsigned char frame[3][3];

    if ((tx > 0 && tx < rows - 1) && (ty > 0 && ty < cols - 1)) {
        frame[0][0] = input[(tx - 1) * cols + ty - 1];
        frame[0][1] = input[(tx - 1) * cols + ty];
        frame[0][2] = input[(tx - 1) * cols + ty + 1];
        frame[1][0] = input[tx * cols + ty - 1];
        frame[1][1] = input[tx * cols + ty];
        frame[1][2] = input[tx * cols + ty + 1];
        frame[2][0] = input[(tx + 1) * cols + ty - 1];
        frame[2][1] = input[(tx + 1) * cols + ty];
        frame[2][2] = input[(tx + 1) * cols + ty + 1];

        // Each of the 16 results gets its own single-byte store.
        output[(tx * cols + ty)] = (global_conv_kernel3x3[0][0] * frame[0][0]
        + global_conv_kernel3x3[0][1] * frame[0][1]
        + global_conv_kernel3x3[0][2] * frame[0][2]
        + global_conv_kernel3x3[1][0] * frame[1][0]
        + global_conv_kernel3x3[1][1] * frame[1][1]
        + global_conv_kernel3x3[1][2] * frame[1][2]
        + global_conv_kernel3x3[2][0] * frame[2][0]
        + global_conv_kernel3x3[2][1] * frame[2][1]
        + global_conv_kernel3x3[2][2] * frame[2][2]) >> 4;

        for (int i = 1; i < 16; i++) {
            int _ty = ty + i;
            shift_left(frame);
            if ((tx > 0 && tx < rows - 1) && (_ty > 0 && _ty < cols - 1)) {
                frame[0][2] = input[(tx - 1) * cols + _ty + 1];
                frame[1][2] = input[tx * cols + _ty + 1];
                frame[2][2] = input[(tx + 1) * cols + _ty + 1];

                // Store interleaved between this iteration's loads and the next's.
                output[(tx * cols + _ty)] = (global_conv_kernel3x3[0][0] * frame[0][0]
                + global_conv_kernel3x3[0][1] * frame[0][1]
                + global_conv_kernel3x3[0][2] * frame[0][2]
                + global_conv_kernel3x3[1][0] * frame[1][0]
                + global_conv_kernel3x3[1][1] * frame[1][1]
                + global_conv_kernel3x3[1][2] * frame[1][2]
                + global_conv_kernel3x3[2][0] * frame[2][0]
                + global_conv_kernel3x3[2][1] * frame[2][1]
                + global_conv_kernel3x3[2][2] * frame[2][2]) >> 4;
            }
        }
    }
}
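
Both kernels call a shift_left helper that the question does not show. Judging from the surrounding code, it presumably shifts the buffered window one column to the left so that column 2 can receive the next input pixel; a minimal sketch under that assumption:

    __device__ void shift_left(unsigned char frame[3][3])
    {
        // Discard column 0 and move columns 1 and 2 one step left;
        // the caller refills column 2 from global memory afterwards.
        for (int r = 0; r < 3; r++) {
            frame[r][0] = frame[r][1];
            frame[r][1] = frame[r][2];
        }
    }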

As you can see, the only difference is how the output elements are stored. As I expected, global memory accesses for storing elements (STG) dropped by 75%: the 16 single-byte stores per thread become four 4-byte uchar4 stores.

memory cuda nsight-compute

1 Answer

The number of global loads goes down because in the scalar-store version the compiler cannot confirm that global_conv_kernel3x3 does not overlap with output, so after each store it has to load the coefficients again to make sure they did not change. The vectorized version buffers all 16 results in vals and issues its stores only after the last coefficient load, so the coefficients can stay in registers.
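
To see the mechanism in isolation, here is a minimal hypothetical sketch (coef stands in for global_conv_kernel3x3; none of these names are from the poster's code):

    // After the first store through out, the compiler must assume coef
    // may have changed, because out could point at coef; it therefore
    // loads coef from global memory a second time.
    __device__ int coef;

    __global__ void demo(unsigned char* out)
    {
        out[0] = (unsigned char)coef; // load coef, store through out
        out[1] = (unsigned char)coef; // coef is reloaded here
    }

Marking the output parameter __restrict__ might also let the compiler rule out the overlap, though the fix below takes the __constant__ route instead.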

In the PTX source of the regular kernel you can see the global load issued twice:

    ld.global.u64  %rd7, [global_conv_kernel3x3];
    ...
    ld.global.u64  %rd15, [global_conv_kernel3x3];

To fix this, you can mark global_conv_kernel3x3 as __constant__ to tell the compiler that the values never change. I have confirmed locally that this resolves your problem:

$ head -n3 test.cu
__constant__ char** global_conv_kernel3x3;

__global__ void k_1D_gf_3x3_load_balance16_global(unsigned char* input, unsigned char* output, int rows, int cols)
$ nvcc -ptx test.cu
$ cat test.ptx | grep global_conv
.const .align 8 .u64 global_conv_kernel3x3;
 ld.const.u64  %rd8, [global_conv_kernel3x3];
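
Applied to the poster's actual coefficients, the declaration would look roughly like the sketch below. The concrete weights are an assumption (a Gaussian-style 3x3 kernel whose weights sum to 16, matching the >> 4 normalization in the kernels):

    // 3x3 coefficients in constant memory; the values are assumed, not
    // the poster's (they sum to 16, consistent with the >> 4 above).
    __constant__ int global_conv_kernel3x3[3][3];

    void upload_coefficients()
    {
        const int h_kernel[3][3] = { { 1, 2, 1 },
                                     { 2, 4, 2 },
                                     { 1, 2, 1 } };
        // Copy the host values into device constant memory once at startup.
        cudaMemcpyToSymbol(global_conv_kernel3x3, h_kernel, sizeof(h_kernel));
    }

The coefficient reads then go through the constant cache (the ld.const in the PTX above) and can no longer alias stores through output.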