使用 SIMD 并行化 4x4 行主矩阵的矩阵乘法

问题描述 投票:0回答:1

我目前在尝试并行化 4x4 矩阵乘法算法时遇到了极大的困难。我正在尝试创建一个库,以便在学校的最小光线追踪器项目中使用,因此我希望它的约定尽可能对用户友好,这就是为什么我选择以行主序存储矩阵——因为对于我询问过的大多数人来说,这样似乎更直观。然而,这带来了一个问题,因为许多并行化策略需要提取列(用于点积归约),而我甚至很难用并行的方式去思考……下面是我当前的矩阵乘法代码(有一些缺失的部分),用于计算两个 4x4 行主矩阵的乘积。

/// @brief Gathers column `c` of the row-major 4x4 matrix `in` into an
///        __m128 (lowest element = row 0). `_mm_setr_ps` takes its
///        operands in low-to-high order, so the rows appear in order.
static inline __m128    _extract_column_ps4(const t_mat4s *in, int c)
{
    return (_mm_setr_ps(in->a[0][c], in->a[1][c], in->a[2][c], in->a[3][c]));
}

/// @brief Computes row `r` of the matrix product `in1 * in2`.
///        `in1` is the array of four row registers of the left matrix;
///        `in2` is the right matrix (row-major).
///
///        Fix: the original returned an uninitialized `res` (the
///        horizontal-reduction steps were missing).  The reduction here
///        uses only SSE1 shuffles (unpack/movelh/movehl) — no SSE3
///        `hadd` — to turn the four element-wise products into the four
///        dot products [row·col0, row·col1, row·col2, row·col3].
static inline __m128    compute_row(const __m128 *in1, const t_mat4s *in2,
                            int r)
{
    __m128  col[4];
    __m128  mul[4];
    __m128  lo;
    __m128  hi;

    col[0] = _extract_column_ps4(in2, 0);
    col[1] = _extract_column_ps4(in2, 1);
    col[2] = _extract_column_ps4(in2, 2);
    col[3] = _extract_column_ps4(in2, 3);
    mul[0] = _mm_mul_ps(in1[r], col[0]);
    mul[1] = _mm_mul_ps(in1[r], col[1]);
    mul[2] = _mm_mul_ps(in1[r], col[2]);
    mul[3] = _mm_mul_ps(in1[r], col[3]);
    /* lo = [m0_0+m0_2, m1_0+m1_2, m0_1+m0_3, m1_1+m1_3] */
    lo = _mm_add_ps(_mm_unpacklo_ps(mul[0], mul[1]),
            _mm_unpackhi_ps(mul[0], mul[1]));
    /* hi = same pairwise sums for mul[2], mul[3] */
    hi = _mm_add_ps(_mm_unpacklo_ps(mul[2], mul[3]),
            _mm_unpackhi_ps(mul[2], mul[3]));
    /* movelh/movehl regroup the partial sums so the final vertical
       add yields [dot0, dot1, dot2, dot3] */
    return (_mm_add_ps(_mm_movelh_ps(lo, hi), _mm_movehl_ps(hi, lo)));
}

/// @brief Computes the matrix product of `in1` with `in2` (in that
///        order) and stores the result in the `t_mat4s` pointed to by
///        `out`.  (This library names matrix multiplication "cross".)
static inline void  lag_mat4s_cross_mat4s(const t_mat4s in1,
                            const t_mat4s in2, t_mat4s *out)
{
    int r;

    r = 0;
    while (r < 4)
    {
        out->simd[r] = compute_row(in1.simd, &in2, r);
        ++r;
    }
}

如你所见,我没有使用任何 `hadd`;事实上,我在刻意避免它们,因为它们相当昂贵。另一种方法是简单地执行以下操作:

// Code for computing the result of multiplying two 4x4 row-major matrix of double-precision floats

/// @brief Fills `out` with the matrix product `in * M`, where `col[j]`
///        already holds column j of the right-hand matrix M as a packed
///        AVX vector of 4 doubles.  Fully unrolled: one dot product per
///        output element, out[r][c] = dot(row r of `in`, col[c]).
///        NOTE(review): assumes `lag_dot_product_avx` returns the
///        horizontal sum of the 4 packed doubles — confirm in its
///        definition (not visible here).
static inline void  unrolled_cross(const t_mat4d *in, const __m256d col[4],
                        t_mat4d *out)
{
    out->r1.x = lag_dot_product_avx(in->r1.simd, col[0]);
    out->r1.y = lag_dot_product_avx(in->r1.simd, col[1]);
    out->r1.z = lag_dot_product_avx(in->r1.simd, col[2]);
    out->r1.w = lag_dot_product_avx(in->r1.simd, col[3]);
    out->r2.x = lag_dot_product_avx(in->r2.simd, col[0]);
    out->r2.y = lag_dot_product_avx(in->r2.simd, col[1]);
    out->r2.z = lag_dot_product_avx(in->r2.simd, col[2]);
    out->r2.w = lag_dot_product_avx(in->r2.simd, col[3]);
    out->r3.x = lag_dot_product_avx(in->r3.simd, col[0]);
    out->r3.y = lag_dot_product_avx(in->r3.simd, col[1]);
    out->r3.z = lag_dot_product_avx(in->r3.simd, col[2]);
    out->r3.w = lag_dot_product_avx(in->r3.simd, col[3]);
    out->r4.x = lag_dot_product_avx(in->r4.simd, col[0]);
    out->r4.y = lag_dot_product_avx(in->r4.simd, col[1]);
    out->r4.z = lag_dot_product_avx(in->r4.simd, col[2]);
    out->r4.w = lag_dot_product_avx(in->r4.simd, col[3]);
}

/// @brief Computes the matrix product `in * in2` (double-precision,
///        row-major) into `out`: extracts the four columns of `in2`
///        once, then delegates the 16 dot products to unrolled_cross.
void    lag_mat4_cross(const t_mat4d *in, const t_mat4d *in2, t_mat4d *out)
{
    __m256d col[4];
    int     c;

    for (c = 0; c < 4; ++c)
        lag_extract_column4_avx(in2, c, &col[c]);
    unrolled_cross(in, col, out);
}

公平地说,这工作得很好,但我觉得我在这里失去了很多并行化......

我最初尝试了洗牌(shuffle),但很快意识到行不通,因为我的列在内存中不连续。我还尝试过一系列 `_mm_mul_ps` 后跟两个 `_mm_add_ps`,这显然也不行,因为那需要对元素做水平相加,而不是按分量并行相加。我正在考虑用 AVX-256 寄存器一次存放两组 4 个打包的单精度浮点数(packed singles),但这也变得相当繁重,我在尝试构思时脑子都乱了。有什么想法/建议吗?我认为此时改用列主序对我来说不是一个选择……另外,能否就矩阵存储顺序(列主序与行主序)对性能的影响给出建议?这会如何影响性能?某种方式是否总是更好?还是视具体情况而定?为什么?

编辑:我可能应该提到我的结构/联合看起来像这样:

/* 4-component single-precision vector.  All three members alias the
** same 16 bytes: scalar array access, one SSE register, or named
** x/y/z/w components (a[0] == x, ..., a[3] == w).
** NOTE(review): `__attribute` is accepted by GCC/Clang but the
** canonical spelling is `__attribute__` — confirm portability. */
typedef union u_vec4s
{
    float       a[4];
    __m128      simd;
    struct
    {
        float   x;
        float   y;
        float   z;
        float   w;
    };
}__attribute((aligned(16))) t_vec4s;

/* 4x4 single-precision matrix, row-major.  All members alias the same
** 64 bytes: 2-D scalar array, four SSE row registers (simd[r] = row r),
** or four named row vectors r1..r4.
** NOTE(review): later code in this file reads an `_ymm` member
** (two __m256 halves) that is not declared here — confirm the union
** used at build time actually includes it. */
typedef union u_mat4s
{
    float   a[4][4];
    __m128  simd[4];
    struct
    {
        t_vec4s r1;
        t_vec4s r2;
        t_vec4s r3;
        t_vec4s r4;
    };
}__attribute((aligned(16))) t_mat4s;

编辑#2: 这是我修改后的代码:

/// @brief Gathers column `c` of the row-major matrix `in` into an
///        __m128 with element i = in->a[i][c].  `_mm_set_ps` takes its
///        arguments high-to-low, hence the reversed row order.
static inline __m128    _extract_column_ps4(const t_mat4s *in, int c)
{
    return (_mm_set_ps(in->a[3][c], in->a[2][c], in->a[1][c], in->a[0][c]));
}

/// @brief Returns the product of the 4x4 matrix `m` with the vector `v`
///        (in that order): each output component is one row of `m`
///        combined with `v`.  ("cross" in this library's naming means
///        matrix product, not the 3-D vector cross product.)
///        NOTE(review): assumes `lag_vec4s_dot_ret` returns the scalar
///        dot product of its two arguments — confirm in its definition.
static inline t_vec4s   lag_mat4s_cross_vec4s(const t_mat4s m,
                            const t_vec4s v)
{
    t_vec4s ret;

    ret.x = lag_vec4s_dot_ret(m.r1, v);
    ret.y = lag_vec4s_dot_ret(m.r2, v);
    ret.z = lag_vec4s_dot_ret(m.r3, v);
    ret.w = lag_vec4s_dot_ret(m.r4, v);
    return (ret);
}

/// @brief Computes row `r` of `in1 * in2` by transposing `in2` (its
///        columns become the rows of `cols`) and taking a
///        matrix-vector product against row `r` of `in1`.
static inline __m128    compute_row(const __m128 *in1, const t_mat4s *in2,
                            int r)
{
    t_mat4s cols;
    t_vec4s row;
    int     c;

    for (c = 0; c < 4; ++c)
        cols.simd[c] = _extract_column_ps4(in2, c);
    row.simd = in1[r];
    return (lag_mat4s_cross_vec4s(cols, row).simd);
}

/// @brief Computes the matrix product of `in1` with `in2` (in that
///        order) and stores the result in the `t_mat4s` pointed to by
///        `out`, one output row per iteration.
static inline void  lag_mat4s_cross_mat4s(const t_mat4s in1,
                            const t_mat4s in2, t_mat4s *out)
{
    int r;

    for (r = 0; r < 4; ++r)
        out->simd[r] = compute_row(in1.simd, &in2, r);
}

现在我采用了不同的策略:把每一行当作一个独立的向量,与同一个转置后的矩阵(其列变成了行)做矩阵-向量乘积。这可行,但有更好的方法吗?

c matrix-multiplication intrinsics avx
1个回答
0
投票

好吧。我得出的结论是,对 4x4 行主矩阵来说,使用 AVX 寄存器是最好的方法(虽然没有基准测试支持我的说法),仅仅因为广播加载显然更高效。我还决定避免在我的 `lag_mat4s_cross_vec4s` 函数中滥用 `_mm_dp_ps`。这是修改后的代码:

/// @brief Gathers column `c` of the row-major matrix `in` into an
///        __m128 with element i = in->a[i][c].  `_mm_set_ps` takes its
///        arguments high-to-low, hence the reversed row order.
static inline __m128    _extract_column_ps4(const t_mat4s *in, int c)
{
    return (_mm_set_ps(in->a[3][c], in->a[2][c], in->a[1][c], in->a[0][c]));
}

/// @brief Returns the product of the 4x4 matrix `m` with the vector
///        `v` (in that order).  Multiplies each row of `m` by `v`
///        element-wise, then folds the four products into the four dot
///        products with a two-level horizontal-add tree.
static inline t_vec4s   lag_mat4s_cross_vec4s(const t_mat4s m,
                            const t_vec4s v)
{
    t_vec4s out;
    __m128  prod[4];
    __m128  pair01;
    __m128  pair23;
    int     i;

    for (i = 0; i < 4; ++i)
        prod[i] = _mm_mul_ps(m.simd[i], v.simd);
    /* pair01 = [d0a, d0b, d1a, d1b]; second hadd completes each dot */
    pair01 = _mm_hadd_ps(prod[0], prod[1]);
    pair23 = _mm_hadd_ps(prod[2], prod[3]);
    out.simd = _mm_hadd_ps(pair01, pair23);
    return (out);
}

//static inline void    lag_mat4s_cross_mat4s(const t_mat4s in1,
//                          const t_mat4s in2, t_mat4s *out)
//{
//  t_mat4s cols;

//  cols.simd[0] = _extract_column_ps4(&in2, 0);
//  cols.simd[1] = _extract_column_ps4(&in2, 1);
//  cols.simd[2] = _extract_column_ps4(&in2, 2);
//  cols.simd[3] = _extract_column_ps4(&in2, 3);
//  out->r1 = lag_mat4s_cross_vec4s(cols, in1.r1);
//  out->r2 = lag_mat4s_cross_vec4s(cols, in1.r2);
//  out->r3 = lag_mat4s_cross_vec4s(cols, in1.r3);
//  out->r4 = lag_mat4s_cross_vec4s(cols, in1.r4);
//}

/// @brief computes the matrix product of `in1` with `in2`
///        (in that order), and stores the result in the `t_mat4s`
///        pointed-to by `out`.
///
///        Two output rows are produced per 256-bit register: for each k
///        in 0..3, element k of every row of `in1` is broadcast within
///        its 128-bit lane and multiplied by row k of `in2` duplicated
///        across both lanes; the four partial products are then summed
///        vertically (out[i] = sum_k in1[i][k] * in2_row_k).
///        NOTE(review): relies on a `_ymm` member of t_mat4s that is
///        not present in the union definition shown earlier in this
///        file — confirm it exists and overlays `a`/`simd`.
static inline void  lag_mat4s_cross_mat4s(const t_mat4s in1,
                        const t_mat4s in2, t_mat4s *out)
{
    __m256  a[2];   /* element k of in1's rows, broadcast per 128-bit lane */
    __m256  b[2];   /* row k of in2, duplicated across both lanes */
    __m256  c[8];   /* partial products */
    __m256  t[2];   /* in1: t[0] = rows 0-1, t[1] = rows 2-3 */
    __m256  u[2];   /* in2: u[0] = rows 0-1, u[1] = rows 2-3 */

    t[0] = in1._ymm[0];
    t[1] = in1._ymm[1];
    u[0] = in2._ymm[0];
    u[1] = in2._ymm[1];
    /* k = 0: shuffle(x,x,0b00000000) broadcasts element 0 in each lane;
       permute2f128 0x00 copies the low 128 bits (row 0) to both lanes */
    a[0] = _mm256_shuffle_ps(t[0], t[0], _MM_SHUFFLE(0, 0, 0, 0));
    a[1] = _mm256_shuffle_ps(t[1], t[1], _MM_SHUFFLE(0, 0, 0, 0));
    b[0] = _mm256_permute2f128_ps(u[0], u[0], 0x00);
    c[0] = _mm256_mul_ps(a[0], b[0]);
    c[1] = _mm256_mul_ps(a[1], b[0]);
    /* k = 1: 0x11 copies the high 128 bits (row 1) to both lanes */
    a[0] = _mm256_shuffle_ps(t[0], t[0], _MM_SHUFFLE(1, 1, 1, 1));
    a[1] = _mm256_shuffle_ps(t[1], t[1], _MM_SHUFFLE(1, 1, 1, 1));
    b[0] = _mm256_permute2f128_ps(u[0], u[0], 0x11);
    c[2] = _mm256_mul_ps(a[0], b[0]);
    c[3] = _mm256_mul_ps(a[1], b[0]);
    /* k = 2: row 2 lives in the low half of u[1] */
    a[0] = _mm256_shuffle_ps(t[0], t[0], _MM_SHUFFLE(2, 2, 2, 2));
    a[1] = _mm256_shuffle_ps(t[1], t[1], _MM_SHUFFLE(2, 2, 2, 2));
    b[1] = _mm256_permute2f128_ps(u[1], u[1], 0x00);
    c[4] = _mm256_mul_ps(a[0], b[1]);
    c[5] = _mm256_mul_ps(a[1], b[1]);
    /* k = 3: row 3 lives in the high half of u[1] */
    a[0] = _mm256_shuffle_ps(t[0], t[0], _MM_SHUFFLE(3, 3, 3, 3));
    a[1] = _mm256_shuffle_ps(t[1], t[1], _MM_SHUFFLE(3, 3, 3, 3));
    b[1] = _mm256_permute2f128_ps(u[1], u[1], 0x11);
    c[6] = _mm256_mul_ps(a[0], b[1]);
    c[7] = _mm256_mul_ps(a[1], b[1]);
    /* vertical reduction: out rows 0-1 = c[0]+c[2]+c[4]+c[6],
       out rows 2-3 = c[1]+c[3]+c[5]+c[7] */
    c[0] = _mm256_add_ps(c[0], c[2]);
    c[4] = _mm256_add_ps(c[4], c[6]);
    c[1] = _mm256_add_ps(c[1], c[3]);
    c[5] = _mm256_add_ps(c[5], c[7]);
    out->_ymm[0] = _mm256_add_ps(c[0], c[4]);
    out->_ymm[1] = _mm256_add_ps(c[1], c[5]);
}

我从 Peter 好心提供的一篇文章中借用了一个实现(我不是简单地复制粘贴,我理解其中的原理)。不过我想知道,如果把结构体对齐到 32 字节边界而不是 16 字节边界,性能是否会有所提升。这目前不是我关心的重点,但欢迎随意评论;欢迎任何信息和批评!

© www.soinside.com 2019 - 2024. All rights reserved.