为什么使用 SSE 对向量矩阵乘积进行向量化结果不正确?

问题描述 投票:0回答:1

我在C++中有这个函数

void routine2(float alpha, float beta) {

    unsigned int i, j;

    for (i = 0; i < N; i++)
        for (j = 0; j < N; j++)
            w[i] = w[i] - beta + alpha * A[i][j] * x[j];

}

这是我写的SSE(向量化)版本

void routine2_vec(float alpha, float beta) {
    __m128 alpha_2 = _mm_set1_ps(alpha);
    __m128 beta_2 = _mm_set1_ps(beta);

    __m128 w2,num1,A2,X2;
    __m128 multiple1, multiple2,sub;

    unsigned int i, j;
    

    for ( i = 0; i < N; ++i) {
        w2 = _mm_setzero_ps();
        for ( j = 0; j < (N/4)*4; j += 4) {
            num1 = _mm_loadu_ps(&w[i]);

            A2 = _mm_loadu_ps(&A[i][j]);
            X2 = _mm_loadu_ps(&x[j]);

            multiple1 = _mm_mul_ps(A2, X2);
            multiple2 = _mm_mul_ps(alpha_2, multiple1);
            sub = _mm_sub_ps(num1, beta_2);

            w2 = _mm_add_ps(sub, multiple2);
            _mm_store_ss(&w[i], w2);
        }

    }
}

两个函数的结果应该是相同的,所以我编写了另一个函数来运行并比较两个函数的结果。

测试和比较功能

int routine2_test(float alpha, float beta) {
    unsigned int i, j;

    for (i = 0; i < N; i++) {
        for (j = 0; j < N; j++) {
            test2[i] = test2[i] - beta + alpha * A[i][j] * x[j];

        }
    }
  
    // compare
    for (j = 0; j < N; j++) {
        if (equal(w[j], test2[j]) == 1) {
            printf("\n The result of w[%d] is not equal to test2[%d]  \n", j, j);
            return 1;
        }
    }

    return 0;
}


unsigned short int equal(float a, float b) {
    float temp = a - b;
    //printf("\n %f  %f", a, b);
    if ((fabs(temp) / fabs(b)) < EPSILON)
        return 0; //success
    else
        return 1; //wrong result
}

测试函数运行主代码并将结果存储在

test2
数组中,
routine2_vec
函数与 SSE 执行相同的操作并将结果存储在
w
数组中,然后我在
routine2_test 中比较两个结果
具有
equal
功能。

已编辑

这是剩下的部分,包括 main 和 init 函数。


#include <stdio.h>
#include <time.h>
#include <pmmintrin.h>
#include <process.h>
#include <chrono>
#include <iostream>
#include <immintrin.h>
#include <omp.h>

#define M 256*128
#define ARITHMETIC_OPERATIONS1 3*M
#define TIMES1 1

#define N  2048
#define ARITHMETIC_OPERATIONS2 4*N*N
#define TIMES2 1


constexpr auto EPSILON =0.0001;

//function declaration
void initialize();
void routine2(float alpha, float beta);

unsigned short int equal(float a, float b); 
int routine2_test(float, float);

void routine2_vec(float alpha, float beta);

__declspec(align(64)) float  test2[N];
__declspec(align(64)) float A[N][N], x[N], w[N];


int main() {

    float alpha = 0.023f, beta = 0.045f;
    double run_time, start_time;
    unsigned int t;

    initialize();

    printf("\nRoutine2:");
    start_time = omp_get_wtime(); //start timer

    for (t = 0; t < TIMES2; t++){
        // routine2(alpha, beta);
        routine2_vec(alpha, beta);
        routine2_test(alpha, beta);
}

    run_time = omp_get_wtime() - start_time; //end timer
    printf("\n Time elapsed is %f secs \n %e FLOPs achieved\n", run_time, (double)(ARITHMETIC_OPERATIONS2) / ((double)run_time / TIMES2));

    return 0;
}


void initialize() {

    unsigned int i, j;

    //initialize routine2 arrays
    for (i = 0; i < N; i++)
        for (j = 0; j < N; j++) {
            A[i][j] = (i % 99) + (j % 14) + 0.013f;
        }

    //initialize routine2 arrays
    for (i = 0; i < N; i++) {
        x[i] = (i % 19) - 0.01f;
        w[i] = (i % 5) - 0.002f;
        test2[i]= (i % 5) - 0.002f;
    }
}

c++ matrix-multiplication sse dot-product
1个回答
1
投票

我们可以从优化标量版本开始。

  1. 避免多余的装载和存储。
  2. 将 alpha 和 beta 从最内层循环中提升出来
void routine2(float alpha, float beta) {
    for (unsigned i = 0; i < N; i++) {
        float dot = 0.f;
        for (unsigned j = 0; j < N; j++)
            dot += A[i][j] * x[j];
        w[i] += alpha * dot - N * beta;
    }
}

beta
的使用方式看起来很可疑,但这由你决定。

现在我们进行矢量化。您没有编写正确的归约循环。它应该是这样的:

void routine2_vec(float alpha, float beta) {
    assert(N % 4 == 0);
    for (unsigned i = 0; i < N; i++) {
        __m128 dot4 = _mm_setzero_ps();
        for (unsigned j = 0; j < N; j += 4) {
            __m128 aij = _mm_loadu_ps(&A[i][j]);
            __m128 xj = _mm_loadu_ps(&x[j]);
            __m128 mres = _mm_mul_ps(aij, xj);
            dot4 = _mm_add_ps(dot4, mres);
        }
        // Reduce to two
        __m128 high = _mm_shuffle_ps(dot4, dot4, _MM_SHUFFLE(3, 3, 1, 1));
        dot4 = _mm_add_ps(dot4, high);
        // reduce to one
        high = _mm_movehl_ps(dot4, dot4);
        dot4 = _mm_add_ss(dot4, high);
        /*
         * There is no point in doing the last part with intrinsics,
         * just let the compiler handle it
         */
        float dot = _mm_cvtss_f32(dot4);
        w[i] += alpha * dot - N * beta;
    }
}

有关进行归约步骤的最佳方法的讨论可以在这里找到:进行水平 SSE 向量和(或其他归约)的最快方法

由于您本质上是在进行矩阵向量乘积,因此您可以沿着外循环计算四个值。这可以避免通过累加器对循环携带的依赖链造成阻塞,如下所述:为什么 mulss 在 Haswell 上只需要 3 个周期,与 Agner 的指令表不同? (使用多个累加器展开 FP 循环)

void routine2_vec(float alpha, float beta) {
    __m128 alpha2 = _mm_set1_ps(alpha);
    __m128 beta_n = _mm_set1_ps(N * beta);
    assert(N % 4 == 0);
    for (unsigned i = 0; i < N; i += 4) {
        __m128 dot4_1 = _mm_setzero_ps();
        __m128 dot4_2 = dot4_1, dot4_3 = dot4_1, dot4_4 = dot4_1;
        for (unsigned j = 0; j < N; j += 4) {
            __m128 aij = _mm_loadu_ps(&A[i][j]);
            __m128 xj = _mm_loadu_ps(&x[j]);
            __m128 mres = _mm_mul_ps(aij, xj);
            dot4_1 = _mm_add_ps(dot4_1, mres);

            aij = _mm_loadu_ps(&A[i + 1][j]);
            mres = _mm_mul_ps(aij, xj);
            dot4_2 = _mm_add_ps(dot4_2, mres);

            aij = _mm_loadu_ps(&A[i + 2][j]);
            mres = _mm_mul_ps(aij, xj);
            dot4_3 = _mm_add_ps(dot4_3, mres);

            aij = _mm_loadu_ps(&A[i + 3][j]);
            mres = _mm_mul_ps(aij, xj);
            dot4_4 = _mm_add_ps(dot4_4, mres);
        }
        /* Reduce to one vector for 4 i-values */
        _MM_TRANSPOSE4_PS(dot4_1, dot4_2, dot4_3, dot4_4);
        dot4_1 = _mm_add_ps(dot4_1, dot4_2);
        dot4_3 = _mm_add_ps(dot4_3, dot4_4);
        dot4_1 = _mm_add_ps(dot4_1, dot4_3);

        dot4_1 = _mm_mul_ps(dot4_1, alpha2);
        dot4_1 = _mm_sub_ps(dot4_1, beta_n);
        __m128 wi = _mm_loadu_ps(&w[i]);
        wi = _mm_add_ps(wi, dot4_1);
        _mm_storeu_ps(&w[i], wi);
    }
}

可以在此处找到进一步的改进,例如微优化循环条件或对齐输入之一。它还包括在数组大小不是 4 的倍数时处理数组尾部的代码。

© www.soinside.com 2019 - 2024. All rights reserved.