我有一个 16 倍粗化(1×16 平铺)的内核。为了减少 STG(全局存储)指令,我通过 uchar4 实现了向量化内存访问。查看内存图表时,我看到了如下结果:
矢量化内存访问
__global__ void k_1D_gf_3x3_vectorized16_global(unsigned char* input, unsigned char* output, int rows, int cols)
{
    // 3x3 convolution, 16x coarsened: each thread produces 16 horizontally
    // adjacent output pixels (a 1x16 tile) and writes them back with four
    // uchar4 stores, cutting the STG instruction count 4x vs. scalar stores.
    // The >> 4 divides by 16, presumably the sum of the 3x3 kernel weights
    // (classic 3x3 Gaussian) — confirm against global_conv_kernel3x3.
    int ty = (blockIdx.x * blockDim.x + threadIdx.x) * 16; // first column of this thread's tile
    int tx = blockIdx.y * blockDim.y + threadIdx.y;        // row of this thread's tile
    int vals[16] = { 0 };        // per-pixel results; border/out-of-range pixels stay 0
    unsigned char frame[3][3];   // sliding 3x3 input window
    if ((tx > 0 && tx < rows - 1) && (ty > 0 && ty < cols - 1)) {
        // Prime the full 3x3 window centered on (tx, ty).
        frame[0][0] = input[(tx - 1) * cols + ty - 1];
        frame[0][1] = input[(tx - 1) * cols + ty];
        frame[0][2] = input[(tx - 1) * cols + ty + 1];
        frame[1][0] = input[tx * cols + ty - 1];
        frame[1][1] = input[tx * cols + ty];
        frame[1][2] = input[tx * cols + ty + 1];
        frame[2][0] = input[(tx + 1) * cols + ty - 1];
        frame[2][1] = input[(tx + 1) * cols + ty];
        frame[2][2] = input[(tx + 1) * cols + ty + 1];
        vals[0] = (global_conv_kernel3x3[0][0] * frame[0][0]
            + global_conv_kernel3x3[0][1] * frame[0][1]
            + global_conv_kernel3x3[0][2] * frame[0][2]
            + global_conv_kernel3x3[1][0] * frame[1][0]
            + global_conv_kernel3x3[1][1] * frame[1][1]
            + global_conv_kernel3x3[1][2] * frame[1][2]
            + global_conv_kernel3x3[2][0] * frame[2][0]
            + global_conv_kernel3x3[2][1] * frame[2][1]
            + global_conv_kernel3x3[2][2] * frame[2][2]) >> 4;
        for (int i = 1; i < 16; i++) {
            int _ty = ty + i;
            // Slide the window one column right; only the new rightmost
            // column has to be reloaded from global memory.
            shift_left(frame);
            // tx bounds and _ty > 0 already hold inside the outer guard;
            // only the right image border can invalidate this pixel.
            if (_ty < cols - 1) {
                frame[0][2] = input[(tx - 1) * cols + _ty + 1];
                frame[1][2] = input[tx * cols + _ty + 1];
                frame[2][2] = input[(tx + 1) * cols + _ty + 1];
                vals[i] = (global_conv_kernel3x3[0][0] * frame[0][0]
                    + global_conv_kernel3x3[0][1] * frame[0][1]
                    + global_conv_kernel3x3[0][2] * frame[0][2]
                    + global_conv_kernel3x3[1][0] * frame[1][0]
                    + global_conv_kernel3x3[1][1] * frame[1][1]
                    + global_conv_kernel3x3[1][2] * frame[1][2]
                    + global_conv_kernel3x3[2][0] * frame[2][0]
                    + global_conv_kernel3x3[2][1] * frame[2][1]
                    + global_conv_kernel3x3[2][2] * frame[2][2]) >> 4;
            }
        }
        // BUG FIX: the old code unconditionally issued four uchar4 stores
        // covering columns ty..ty+15 whenever ty < cols-1. A tile crossing
        // the right image border therefore wrote zeros over the start of the
        // next row — and past the end of the buffer on the last row. Use the
        // vectorized path only for full tiles; partial tail tiles fall back
        // to scalar stores.
        // NOTE(review): the uchar4 stores also require &output[tx*cols+ty]
        // to be 4-byte aligned; ty is a multiple of 16, so this needs cols
        // to be a multiple of 4 — confirm for your image sizes.
        if (ty + 16 <= cols) {
            reinterpret_cast<uchar4*>(&output[(tx * cols + ty)])[0] = make_uchar4(vals[0], vals[1], vals[2], vals[3]);
            reinterpret_cast<uchar4*>(&output[(tx * cols + ty + 4)])[0] = make_uchar4(vals[4], vals[5], vals[6], vals[7]);
            reinterpret_cast<uchar4*>(&output[(tx * cols + ty + 8)])[0] = make_uchar4(vals[8], vals[9], vals[10], vals[11]);
            reinterpret_cast<uchar4*>(&output[(tx * cols + ty + 12)])[0] = make_uchar4(vals[12], vals[13], vals[14], vals[15]);
        } else {
            for (int i = 0; ty + i < cols; i++) {
                output[tx * cols + ty + i] = (unsigned char)vals[i];
            }
        }
    }
}
__global__ void k_1D_gf_3x3_load_balance16_global(unsigned char* input, unsigned char* output, int rows, int cols)
{
    // 3x3 convolution, 16x coarsened (1x16 tile) with scalar stores: each
    // thread walks 16 horizontally adjacent pixels and writes one byte per
    // pixel. The >> 4 divides by 16, the total of the kernel weights.
    const int ty = (blockIdx.x * blockDim.x + threadIdx.x) * 16; // first column of the tile
    const int tx = blockIdx.y * blockDim.y + threadIdx.y;        // row of the tile

    // Guard clause: threads anchored on (or outside) the image border do nothing.
    if (tx <= 0 || tx >= rows - 1 || ty <= 0 || ty >= cols - 1) {
        return;
    }

    unsigned char frame[3][3]; // sliding 3x3 input window

    // Weighted sum of the current window, normalized by the weight total.
    auto convolve = [&frame]() {
        int acc = 0;
        for (int r = 0; r < 3; ++r) {
            for (int c = 0; c < 3; ++c) {
                acc += global_conv_kernel3x3[r][c] * frame[r][c];
            }
        }
        return (unsigned char)(acc >> 4);
    };

    // Prime the full 3x3 window centered on (tx, ty) and emit the first pixel.
    for (int r = 0; r < 3; ++r) {
        for (int c = 0; c < 3; ++c) {
            frame[r][c] = input[(tx - 1 + r) * cols + (ty - 1 + c)];
        }
    }
    output[tx * cols + ty] = convolve();

    // Walk the remaining 15 columns, reloading only the new rightmost column.
    for (int step = 1; step < 16; ++step) {
        const int col = ty + step;
        shift_left(frame);
        // tx bounds and col > 0 already hold after the guard clause above;
        // only the right image border can invalidate this pixel.
        if (col < cols - 1) {
            frame[0][2] = input[(tx - 1) * cols + col + 1];
            frame[1][2] = input[tx * cols + col + 1];
            frame[2][2] = input[(tx + 1) * cols + col + 1];
            output[tx * cols + col] = convolve();
        }
    }
}
如您所见,唯一的区别是存储输出元素。正如我预期的那样,用于存储元素 (STG) 的全局内存访问量减少了 75%。
全局加载次数减少,因为在第一个版本中,编译器无法确认 global_conv_kernel3x3 不与 output 重叠,因此在每次写入 output 之后,都必须重新加载该值,以确保它没有被改变。
在常规内核的 PTX 源代码中,您可以看到 2x 重复的全局加载:
ld.global.u64 %rd7, [global_conv_kernel3x3];
...
ld.global.u64 %rd15, [global_conv_kernel3x3];
为了解决这个问题,您可以将
global_conv_kernel3x3
标记为 __constant__
以告诉编译器值永远不会改变。我已在本地确认它确实解决了您的问题:
$ head -n3 test.cu
__constant__ char** global_conv_kernel3x3;
__global__ void k_1D_gf_3x3_load_balance16_global(unsigned char* input, unsigned char* output, int rows, int cols)
$ nvcc-preview -ptx test.cu
$ cat test.ptx | grep global_conv
.const .align 8 .u64 global_conv_kernel3x3;
ld.const.u64 %rd8, [global_conv_kernel3x3];