我在C++中有这个函数
void routine2(float alpha, float beta) {
unsigned int i, j;
for (i = 0; i < N; i++)
for (j = 0; j < N; j++)
w[i] = w[i] - beta + alpha * A[i][j] * x[j];
}
这是我写的SSE(向量化)版本
void routine2_vec(float alpha, float beta) {
__m128 alpha_2 = _mm_set1_ps(alpha);
__m128 beta_2 = _mm_set1_ps(beta);
__m128 w2,num1,A2,X2;
__m128 multiple1, multiple2,sub;
unsigned int i, j;
for ( i = 0; i < N; ++i) {
w2 = _mm_setzero_ps();
for ( j = 0; j < (N/4)*4; j += 4) {
num1 = _mm_loadu_ps(&w[i]);
A2 = _mm_loadu_ps(&A[i][j]);
X2 = _mm_loadu_ps(&x[j]);
multiple1 = _mm_mul_ps(A2, X2);
multiple2 = _mm_mul_ps(alpha_2, multiple1);
sub = _mm_sub_ps(num1, beta_2);
w2 = _mm_add_ps(sub, multiple2);
_mm_store_ss(&w[i], w2);
}
}
}
两个函数的结果应该是相同的,所以我编写了另一个函数来运行并比较两个函数的结果。
测试和比较功能
int routine2_test(float alpha, float beta) {
unsigned int i, j;
for (i = 0; i < N; i++) {
for (j = 0; j < N; j++) {
test2[i] = test2[i] - beta + alpha * A[i][j] * x[j];
}
}
// compare
for (j = 0; j < N; j++) {
if (equal(w[j], test2[j]) == 1) {
printf("\n The result of w[%d] is not equal to test2[%d] \n", j, j);
return 1;
}
}
return 0;
}
unsigned short int equal(float a, float b) {
float temp = a - b;
//printf("\n %f %f", a, b);
if ((fabs(temp) / fabs(b)) < EPSILON)
return 0; //success
else
return 1; //wrong result
}
测试函数运行主代码并将结果存储在
test2
数组中,routine2_vec
函数与 SSE 执行相同的操作并将结果存储在 w
数组中,然后我在 routine2_test
中比较两个结果
具有 equal
功能。
已编辑
这是剩下的部分,包括 main 和 init 函数。
#include <stdio.h>
#include <time.h>
#include <pmmintrin.h>
#include <process.h>
#include <chrono>
#include <iostream>
#include <immintrin.h>
#include <omp.h>
#define M 256*128
#define ARITHMETIC_OPERATIONS1 3*M
#define TIMES1 1
#define N 2048
#define ARITHMETIC_OPERATIONS2 4*N*N
#define TIMES2 1
constexpr auto EPSILON =0.0001;
//function declaration
void initialize();
void routine2(float alpha, float beta);
unsigned short int equal(float a, float b);
int routine2_test(float, float);
void routine2_vec(float alpha, float beta);
__declspec(align(64)) float test2[N];
__declspec(align(64)) float A[N][N], x[N], w[N];
int main() {
float alpha = 0.023f, beta = 0.045f;
double run_time, start_time;
unsigned int t;
initialize();
printf("\nRoutine2:");
start_time = omp_get_wtime(); //start timer
for (t = 0; t < TIMES2; t++){
// routine2(alpha, beta);
routine2_vec(alpha, beta);
routine2_test(alpha, beta);
}
run_time = omp_get_wtime() - start_time; //end timer
printf("\n Time elapsed is %f secs \n %e FLOPs achieved\n", run_time, (double)(ARITHMETIC_OPERATIONS2) / ((double)run_time / TIMES2));
return 0;
}
void initialize() {
unsigned int i, j;
//initialize routine2 arrays
for (i = 0; i < N; i++)
for (j = 0; j < N; j++) {
A[i][j] = (i % 99) + (j % 14) + 0.013f;
}
//initialize routine2 arrays
for (i = 0; i < N; i++) {
x[i] = (i % 19) - 0.01f;
w[i] = (i % 5) - 0.002f;
test2[i]= (i % 5) - 0.002f;
}
}
我们可以从优化标量版本开始。
void routine2(float alpha, float beta) {
for (unsigned i = 0; i < N; i++) {
float dot = 0.f;
for (unsigned j = 0; j < N; j++)
dot += A[i][j] * x[j];
w[i] += alpha * dot - N * beta;
}
}
beta
的使用方式看起来很可疑,但这由你决定。
现在我们进行矢量化。您没有编写正确的归约循环。它应该是这样的:
void routine2_vec(float alpha, float beta) {
assert(N % 4 == 0);
for (unsigned i = 0; i < N; i++) {
__m128 dot4 = _mm_setzero_ps();
for (unsigned j = 0; j < N; j += 4) {
__m128 aij = _mm_loadu_ps(&A[i][j]);
__m128 xj = _mm_loadu_ps(&x[j]);
__m128 mres = _mm_mul_ps(aij, xj);
dot4 = _mm_add_ps(dot4, mres);
}
// Reduce to two
__m128 high = _mm_shuffle_ps(dot4, dot4, _MM_SHUFFLE(3, 3, 1, 1));
dot4 = _mm_add_ps(dot4, high);
// reduce to one
high = _mm_movehl_ps(dot4, dot4);
dot4 = _mm_add_ss(dot4, high);
/*
* There is no point in doing the last part with intrinsics,
* just let the compiler handle it
*/
float dot = _mm_cvtss_f32(dot4);
w[i] += alpha * dot - N * beta;
}
}
有关进行归约步骤的最佳方法的讨论可以在这里找到:进行水平 SSE 向量和(或其他归约)的最快方法
由于您本质上是在进行矩阵向量乘积,因此您可以沿着外循环计算四个值。这可以避免通过累加器对循环携带的依赖链造成阻塞,如下所述:为什么 mulss 在 Haswell 上只需要 3 个周期,与 Agner 的指令表不同? (使用多个累加器展开 FP 循环)
void routine2_vec(float alpha, float beta) {
__m128 alpha2 = _mm_set1_ps(alpha);
__m128 beta_n = _mm_set1_ps(N * beta);
assert(N % 4 == 0);
for (unsigned i = 0; i < N; i += 4) {
__m128 dot4_1 = _mm_setzero_ps();
__m128 dot4_2 = dot4_1, dot4_3 = dot4_1, dot4_4 = dot4_1;
for (unsigned j = 0; j < N; j += 4) {
__m128 aij = _mm_loadu_ps(&A[i][j]);
__m128 xj = _mm_loadu_ps(&x[j]);
__m128 mres = _mm_mul_ps(aij, xj);
dot4_1 = _mm_add_ps(dot4_1, mres);
aij = _mm_loadu_ps(&A[i + 1][j]);
mres = _mm_mul_ps(aij, xj);
dot4_2 = _mm_add_ps(dot4_2, mres);
aij = _mm_loadu_ps(&A[i + 2][j]);
mres = _mm_mul_ps(aij, xj);
dot4_3 = _mm_add_ps(dot4_3, mres);
aij = _mm_loadu_ps(&A[i + 3][j]);
mres = _mm_mul_ps(aij, xj);
dot4_4 = _mm_add_ps(dot4_4, mres);
}
/* Reduce to one vector for 4 i-values */
_MM_TRANSPOSE4_PS(dot4_1, dot4_2, dot4_3, dot4_4);
dot4_1 = _mm_add_ps(dot4_1, dot4_2);
dot4_3 = _mm_add_ps(dot4_3, dot4_4);
dot4_1 = _mm_add_ps(dot4_1, dot4_3);
dot4_1 = _mm_mul_ps(dot4_1, alpha2);
dot4_1 = _mm_sub_ps(dot4_1, beta_n);
__m128 wi = _mm_loadu_ps(&w[i]);
wi = _mm_add_ps(wi, dot4_1);
_mm_storeu_ps(&w[i], wi);
}
}
可以在此处找到进一步的改进,例如微优化循环条件或对齐输入之一。它还包括在数组大小不是 4 的倍数时处理数组尾部的代码。