我在尝试为该函数获得最佳加速时遇到了一个大问题,但我无法编写击败自动矢量化器的有效 SIMD 代码。我需要写一些 SIMD 来击败它,但我现在完全陷入困境:
static int c_coef_4tap[4][4] = {
0, 64, 0, 0,
-4, 54, 16, -2,
-4, 36, 36, -4,
-2, 16, 54, -4,
};
static void s_read_qpel_ref(int list, int ref_pos_x, int ref_pos_y, int blk_size, int ref_buf[])
{
int int_buf[35 * 35]; // 1+32+2 = 35
int int_x, int_y, frac_x, frac_y;
int pixel[4][4], hor[4], ver;
int x, y, h, v;
int_x = ref_pos_x >> 2;
int_y = ref_pos_y >> 2;
frac_x = ref_pos_x & 3;
frac_y = ref_pos_y & 3;
//int_buf fill-up function is here. it's too long but is not the bottleneck
for (y=0; y<blk_size; y++) {
for (x=0; x<blk_size; x++) {
for (v=0; v<4; v++) {
for (h=0; h<4; h++) {
pixel[v][h] = int_buf[(y + v) * (blk_size+3) + (x + h)];
}
}
// 4x4 filtering
for (v=0; v<4; v++) {
hor[v] = (pixel[v][0] * c_coef_4tap[frac_x][0] +
pixel[v][1] * c_coef_4tap[frac_x][1] +
pixel[v][2] * c_coef_4tap[frac_x][2] +
pixel[v][3] * c_coef_4tap[frac_x][3] + 32) >> 6;
hor[v] = I_CLIP3(0, 255, hor[v]);
}
ver = (hor[0] * c_coef_4tap[frac_y][0] +
hor[1] * c_coef_4tap[frac_y][1] +
hor[2] * c_coef_4tap[frac_y][2] +
hor[3] * c_coef_4tap[frac_y][3] + 32) >> 6;
ref_buf[y * blk_size + x] = I_CLIP3(0, 255, ver);
}
}
}
我当前使用的优化标志是:
-O3 -march=native -funroll-loops -flto
这些给了我最好的加速,没问题。但我必须让它更快。
目前,我已经分析了代码并发现“hor[v]”计算花费的时间最多。我尝试过水平添加,但它让它变得更慢。 我还查看了自动矢量化器转储,它显示 x 循环正在被矢量化。我的 SIMD 内部代码根本没有对 x 循环进行矢量化。事实上,当我引入内在函数时,它会停止对 x 循环进行矢量化。有人可以告诉我在这里做什么吗?
我会尝试转置和使用点积,但是 x 循环再次没有被矢量化。我认为,对 int_buf[] 的跨步访问来填充像素数组也会导致巨大的问题。
目前,我已经分析了代码并发现“hor[v]”计算花费的时间最多。我尝试过水平添加,但它让它变得更慢。 我还查看了自动矢量化器转储,它显示 x 循环正在被矢量化。我的 SIMD 内部代码根本没有对 x 循环进行矢量化。事实上,当我引入内在函数时,它会停止对 x 循环进行矢量化。有人可以告诉我在这里做什么吗?
我会尝试转置和使用点积,但是 x 循环再次没有被矢量化。我认为,对 int_buf[] 的跨步访问来填充像素数组也导致了巨大的问题。
即使这些是整数。像素数组不超过 10 位整数。思考使用 16 位乘法是否会有帮助。
以热门评论为序...
好的,我综合了一个基准程序。
将
pixel
和 hor
更改为标量的一个原因是它使转换为 openmp
更容易。
第一次天真的尝试使用
-fopenmp
导致原始函数结果不正确。我的“清理”功能更容易转换为 OMP。
详细基准如下,但是...
OMP 带来 2 倍的加速。
使用
-O2
,我的版本速度更快(17%)。
对于-O3
,版本几乎相同。
这是重构后的代码:
// bench.c -- benchmark program
#include <stdio.h>
#include <stdlib.h>
#include <time.h>
#define inline_always static inline __attribute__((always_inline))
double
tscgetf(void)
{
struct timespec ts;
double sec;
clock_gettime(CLOCK_MONOTONIC,&ts);
sec = ts.tv_nsec;
sec /= 1e9;
sec += ts.tv_sec;
return sec;
}
double t_beg;
double t_end;
#define BEG \
t_beg = tscgetf()
#define END \
t_end = tscgetf()
#if 0
static int c_coef_4tap[4][4] = {
0, 64, 0, 0,
-4, 54, 16, -2,
-4, 36, 36, -4,
-2, 16, 54, -4,
};
#else
const int c_coef_4tap[4][4] = {
{ 0, 64, 0, 0 },
{ -4, 54, 16, -2 },
{ -4, 36, 36, -4 },
{ -2, 16, 54, -4 },
};
#endif
typedef void
(*fncptr)(int list, int ref_pos_x, int ref_pos_y, int blk_size, int ref_buf[]);
struct fncdef {
fncptr fnc_ptr;
const char *fnc_sym;
const char *fnc_reason;
int *fnc_ref;
double fnc_elap;
};
inline_always int
I_CLIP3(int lo,int hi,int val)
{
do {
if (val < lo) {
val = lo;
break;
}
if (val > hi) {
val = hi;
break;
}
} while (0);
return val;
}
void
int_fill(int *int_buf)
{
for (int idx = 0; idx < (35 * 35); ++idx)
int_buf[idx] = idx;
}
void
s_orig(int list, int ref_pos_x, int ref_pos_y, int blk_size,
int ref_buf[])
{
int int_buf[35 * 35]; // 1+32+2 = 35
#if 0
int int_x, int_y;
#endif
int frac_x, frac_y;
int pixel[4][4];
int hor[4];
int ver;
int x, y, h, v;
#if 0
int_x = ref_pos_x >> 2;
int_y = ref_pos_y >> 2;
#endif
frac_x = ref_pos_x & 3;
frac_y = ref_pos_y & 3;
// int_buf fill-up function is here. it's too long but is not the bottleneck
int_fill(int_buf);
BEG;
for (y = 0; y < blk_size; y++) {
for (x = 0; x < blk_size; x++) {
for (v = 0; v < 4; v++) {
for (h = 0; h < 4; h++) {
pixel[v][h] = int_buf[(y + v) * (blk_size + 3) + (x + h)];
}
}
// 4x4 filtering
for (v = 0; v < 4; v++) {
hor[v] = (pixel[v][0] * c_coef_4tap[frac_x][0] +
pixel[v][1] * c_coef_4tap[frac_x][1] +
pixel[v][2] * c_coef_4tap[frac_x][2] +
pixel[v][3] * c_coef_4tap[frac_x][3] + 32) >> 6;
hor[v] = I_CLIP3(0, 255, hor[v]);
}
ver = (hor[0] * c_coef_4tap[frac_y][0] +
hor[1] * c_coef_4tap[frac_y][1] +
hor[2] * c_coef_4tap[frac_y][2] +
hor[3] * c_coef_4tap[frac_y][3] + 32) >> 6;
ref_buf[y * blk_size + x] = I_CLIP3(0, 255, ver);
}
}
END;
}
void
s_orig_omp1(int list, int ref_pos_x, int ref_pos_y, int blk_size,
int ref_buf[])
{
int int_buf[35 * 35]; // 1+32+2 = 35
#if 0
int int_x, int_y;
#endif
int frac_x, frac_y;
int pixel[4][4];
int hor[4];
int ver;
int x, y, h, v;
#if 0
int_x = ref_pos_x >> 2;
int_y = ref_pos_y >> 2;
#endif
frac_x = ref_pos_x & 3;
frac_y = ref_pos_y & 3;
// int_buf fill-up function is here. it's too long but is not the bottleneck
int_fill(int_buf);
BEG;
#pragma omp parallel for
for (y = 0; y < blk_size; y++) {
for (x = 0; x < blk_size; x++) {
for (v = 0; v < 4; v++) {
for (h = 0; h < 4; h++) {
pixel[v][h] = int_buf[(y + v) * (blk_size + 3) + (x + h)];
}
}
// 4x4 filtering
for (v = 0; v < 4; v++) {
hor[v] = (pixel[v][0] * c_coef_4tap[frac_x][0] +
pixel[v][1] * c_coef_4tap[frac_x][1] +
pixel[v][2] * c_coef_4tap[frac_x][2] +
pixel[v][3] * c_coef_4tap[frac_x][3] + 32) >> 6;
hor[v] = I_CLIP3(0, 255, hor[v]);
}
ver = (hor[0] * c_coef_4tap[frac_y][0] +
hor[1] * c_coef_4tap[frac_y][1] +
hor[2] * c_coef_4tap[frac_y][2] +
hor[3] * c_coef_4tap[frac_y][3] + 32) >> 6;
ref_buf[y * blk_size + x] = I_CLIP3(0, 255, ver);
}
}
END;
}
void
s_orig_omp2(int list, int ref_pos_x, int ref_pos_y, int blk_size,
int ref_buf[])
{
int int_buf[35 * 35]; // 1+32+2 = 35
#if 0
int int_x, int_y;
#endif
int frac_x, frac_y;
int pixel[4][4];
int hor[4];
#if 0
int_x = ref_pos_x >> 2;
int_y = ref_pos_y >> 2;
#endif
frac_x = ref_pos_x & 3;
frac_y = ref_pos_y & 3;
// int_buf fill-up function is here. it's too long but is not the bottleneck
int_fill(int_buf);
BEG;
#pragma omp parallel for private(pixel,hor) shared(int_buf,frac_x,frac_y)
for (int y = 0; y < blk_size; y++) {
for (int x = 0; x < blk_size; x++) {
for (int v = 0; v < 4; v++) {
for (int h = 0; h < 4; h++) {
pixel[v][h] = int_buf[(y + v) * (blk_size + 3) + (x + h)];
}
}
// 4x4 filtering
for (int v = 0; v < 4; v++) {
hor[v] = (pixel[v][0] * c_coef_4tap[frac_x][0] +
pixel[v][1] * c_coef_4tap[frac_x][1] +
pixel[v][2] * c_coef_4tap[frac_x][2] +
pixel[v][3] * c_coef_4tap[frac_x][3] + 32) >> 6;
hor[v] = I_CLIP3(0, 255, hor[v]);
}
int ver = (hor[0] * c_coef_4tap[frac_y][0] +
hor[1] * c_coef_4tap[frac_y][1] +
hor[2] * c_coef_4tap[frac_y][2] +
hor[3] * c_coef_4tap[frac_y][3] + 32) >> 6;
ref_buf[y * blk_size + x] = I_CLIP3(0, 255, ver);
}
}
END;
}
void
s_fix1(int list, int ref_pos_x, int ref_pos_y, int blk_size,
int ref_buf[])
{
int int_buf[35 * 35]; // 1+32+2 = 35
#if 0
int int_x, int_y;
#endif
int frac_x, frac_y;
int x, y, h, v;
#if 0
int_x = ref_pos_x >> 2;
int_y = ref_pos_y >> 2;
#endif
frac_x = ref_pos_x & 3;
frac_y = ref_pos_y & 3;
// int_buf fill-up function is here. it's too long but is not the bottleneck
int_fill(int_buf);
BEG;
for (y = 0; y < blk_size; y++) {
for (x = 0; x < blk_size; x++) {
int ver = 0;
for (v = 0; v < 4; v++) {
int hor_h = 0;
for (h = 0; h < 4; h++) {
int pixel = int_buf[(y + v) * (blk_size + 3) + (x + h)];
hor_h += pixel * c_coef_4tap[frac_x][h];
}
hor_h += 32;
hor_h >>= 6;
hor_h = I_CLIP3(0, 255, hor_h);
ver += hor_h * c_coef_4tap[frac_y][v];
}
ver += 32;
ver >>= 6;
ref_buf[y * blk_size + x] = I_CLIP3(0, 255, ver);
}
}
END;
}
void
s_fix1_omp(int list, int ref_pos_x, int ref_pos_y, int blk_size,
int ref_buf[])
{
int int_buf[35 * 35]; // 1+32+2 = 35
#if 0
int int_x, int_y;
#endif
int frac_x, frac_y;
#if 0
int x, y, h, v;
#endif
#if 0
int_x = ref_pos_x >> 2;
int_y = ref_pos_y >> 2;
#endif
frac_x = ref_pos_x & 3;
frac_y = ref_pos_y & 3;
// int_buf fill-up function is here. it's too long but is not the bottleneck
int_fill(int_buf);
BEG;
#pragma omp parallel for shared(int_buf,frac_x,frac_y)
for (int y = 0; y < blk_size; y++) {
for (int x = 0; x < blk_size; x++) {
int ver = 0;
for (int v = 0; v < 4; v++) {
int hor_h = 0;
for (int h = 0; h < 4; h++) {
int pixel = int_buf[(y + v) * (blk_size + 3) + (x + h)];
hor_h += pixel * c_coef_4tap[frac_x][h];
}
hor_h += 32;
hor_h >>= 6;
hor_h = I_CLIP3(0, 255, hor_h);
ver += hor_h * c_coef_4tap[frac_y][v];
}
ver += 32;
ver >>= 6;
ref_buf[y * blk_size + x] = I_CLIP3(0, 255, ver);
}
}
END;
}
#define FNCDEF(_sym,_reason) \
.fnc_ptr = _sym, .fnc_sym = #_sym, .fnc_reason = _reason
struct fncdef fnclist[] = {
{ FNCDEF(s_orig,"OP's original code") },
#ifdef _OPENMP
{ FNCDEF(s_orig_omp1,"OP's -fopenmp code [BROKEN]") },
{ FNCDEF(s_orig_omp2,"OP's -fopenmp code (with fixes)") },
#endif
{ FNCDEF(s_fix1,"Craig's cleanup") },
#ifdef _OPENMP
{ FNCDEF(s_fix1_omp,"Craig's -fopenmp") },
#endif
{ .fnc_ptr = NULL }
};
#define FNCALL \
struct fncdef *fnc = fnclist; fnc->fnc_ptr != NULL; ++fnc
void
dofnc(struct fncdef *fnc,
int list, int ref_pos_x, int ref_pos_y, int blk_size)
{
double t_dif;
fnc->fnc_elap = 1e9;
for (int iter = 1; iter <= 5; ++iter) {
fnc->fnc_ptr(list,ref_pos_x,ref_pos_y,blk_size,fnc->fnc_ref);
t_dif = t_end - t_beg;
if (t_dif < fnc->fnc_elap)
fnc->fnc_elap = t_dif;
}
printf("ELAPSED: %.9f %s (%s)\n",
fnc->fnc_elap,fnc->fnc_sym,fnc->fnc_reason);
}
void
dotest(int list,int ref_pos_x,int ref_pos_y,int blk_size)
{
for (FNCALL) {
fnc->fnc_ref = malloc(sizeof(int) * blk_size * blk_size);
dofnc(fnc,list,ref_pos_x,ref_pos_y,blk_size);
}
struct fncdef *ref = NULL;
for (FNCALL) {
if (ref == NULL) {
ref = fnc;
continue;
}
const int *ref_orig = ref->fnc_ref;
const int *ref_fast = fnc->fnc_ref;
printf("dotest: COMPARE %s\n",fnc->fnc_sym);
for (int y = 0; y < blk_size; ++y) {
for (int x = 0; x < blk_size; ++x) {
int orig = ref_orig[(y * blk_size) + x];
int fast = ref_fast[(y * blk_size) + x];
if (fast != orig) {
printf("dotest: [%3d][%3d] %4d %4d\n",y,x,orig,fast);
//exit(1);
}
}
}
}
for (FNCALL)
free(fnc->fnc_ref);
}
int
main(void)
{
//dotest(0,35,0,16);
dotest(0,35,0,32);
return 0;
}
在上面的代码中,我使用
cpp
条件来表示旧代码与新代码:
#if 0
// old code
#else
// new code
#endif
#if 1
// new code
#endif
注意:这可以通过运行文件来清理
unifdef -k
以下是我使用的命令:
cc -o sO0 bench.c -O0 -Wall -Werror -Wno-unknown-pragmas
./sO0
ELAPSED: 0.000141978 s_orig (OP's original code)
ELAPSED: 0.000132095 s_fix1 (Craig's cleanup)
dotest: COMPARE s_fix1
cc -o sO2 bench.c -O2 -Wall -Werror -Wno-unknown-pragmas
./sO2
ELAPSED: 0.000027120 s_orig (OP's original code)
ELAPSED: 0.000023231 s_fix1 (Craig's cleanup)
dotest: COMPARE s_fix1
cc -o std bench.c -O3 -Wall -Werror -Wno-unknown-pragmas
./std
ELAPSED: 0.000010765 s_orig (OP's original code)
ELAPSED: 0.000010280 s_fix1 (Craig's cleanup)
dotest: COMPARE s_fix1
cc -o omp bench.c -O3 -Wall -Werror -fopenmp
./omp
ELAPSED: 0.000010361 s_orig (OP's original code)
ELAPSED: 0.000022512 s_orig_omp1 (OP's -fopenmp code [BROKEN])
ELAPSED: 0.000004398 s_orig_omp2 (OP's -fopenmp code (with fixes))
ELAPSED: 0.000015385 s_fix1 (Craig's cleanup)
ELAPSED: 0.000004432 s_fix1_omp (Craig's -fopenmp)
dotest: COMPARE s_orig_omp1
dotest: [ 0][ 14] 51 255
dotest: [ 3][ 17] 159 255
dotest: [ 3][ 23] 165 255
dotest: [ 5][ 0] 212 255
dotest: [ 8][ 18] 255 190
dotest: [ 12][ 0] 255 181
dotest: [ 16][ 0] 255 73
dotest: [ 16][ 13] 255 241
dotest: [ 20][ 8] 255 38
dotest: [ 22][ 0] 255 100
dotest: [ 24][ 12] 255 44
dotest: [ 25][ 21] 255 241
dotest: [ 25][ 22] 255 247
dotest: [ 28][ 21] 255 66
dotest: [ 28][ 22] 255 219
dotest: [ 28][ 23] 255 221
dotest: [ 28][ 25] 255 228
dotest: [ 29][ 0] 255 89
dotest: [ 30][ 0] 255 125
dotest: [ 31][ 1] 255 169
dotest: COMPARE s_orig_omp2
dotest: COMPARE s_fix1
dotest: COMPARE s_fix1_omp