我一直在尝试使用 AVX 指令来加速一些神经网络计算。但是,我不断遇到以下错误“[...] 处未处理的异常:读取位置 [...] 访问冲突”。
我试图隔离问题,但由于内存似乎在某处损坏,错误不会每次都出现在同一位置,我怀疑它具有误导性。有谁知道可能导致问题的原因是什么?
这是一些可重现的代码:
#include <immintrin.h>
#include <cmath>
#include <iostream>
#include <array>
#include <vector>
inline const int num_avx_registers = 16;
inline const int floats_per_reg = 4;
inline const int HKP_size = 100;
inline constexpr int acc_size = 256;
class NNLayer {
public:
alignas(32) float* weight;
alignas(32) float* bias;
NNLayer(){
weight = new float[HKP_size * acc_size]; // flattened 2D array.
bias = new float[acc_size];
// initialize the weights and bias with test values
for (uint32_t i=0; i<HKP_size * acc_size; i++){
weight[i] = 1.F;
}
for (int i=0; i<acc_size; i++){
bias[i] = static_cast<float>(i);
}
}
~NNLayer(){
delete[] weight;
delete[] bias;
}
};
class Accumulator {
public:
alignas(32) std::array<float, acc_size> accumulator_w;
alignas(32) std::array<float, acc_size> accumulator_b;
std::array<float, acc_size>& Accumulator::operator[](bool color){
return color ? accumulator_w : accumulator_b;
}
};
class NNUE {
public:
Accumulator accumulator;
NNLayer first_layer = NNLayer();
void compute_accumulator(const std::vector<int> active_features, bool color){
// we have 256 floats to process.
// there are 16 avx registers, and each can hold 4 floats.
// therefore we need to do 256/64 = 4 passes to the registers.
constexpr int c_size = num_avx_registers * floats_per_reg; //chunk size
constexpr int num_chunks = acc_size / c_size;
static_assert(acc_size % c_size == 0);
__m256 avx_regs[num_avx_registers];
// we process 1/4th of the whole data at each loop.
// we add c_idx to the indexes pick up where we left off at the last chunk.
for (int c_idx = 0; c_idx < num_chunks*c_size; c_idx += c_size){ // chunk index
// load the bias from memory
for (int i = 0; i < num_avx_registers; i++){
avx_regs[i] = _mm256_load_ps(&first_layer.bias[c_idx + i*floats_per_reg]);
}
// add the active weights
for (const int &a: active_features){
for (int i = 0; i < num_avx_registers; i++){
// a*acc_size is to get the a-th row of the flattened 2D array.
avx_regs[i] = _mm256_add_ps(
avx_regs[i],
_mm256_load_ps(&first_layer.weight[a*acc_size + c_idx + i*floats_per_reg])
);
}
}
//store the result in the accumulator
for (int i = 0; i < num_avx_registers; i++){
_mm256_store_ps(&accumulator[color][c_idx + i*floats_per_reg], avx_regs[i]);
}
}
}
};
int main(){
NNUE nnue;
std::vector<int> act_f = {2, 1, 620, 62};
nnue.compute_accumulator(act_f, true);
std::cout << "still alive\n";
return 0;
}
alignas(32) float* weight;
仅对齐指针存储的地址,而不对齐指向的内存。如果你想在 C++17 中使用 new
创建对齐内存,你可以写:
weight = new (std::align_val_t(32)) float[HKP_size * acc_size];
在
_mm256_add_ps
行出现访问冲突(在评论中提到,应添加到问题中)可能意味着:
_mm256_add_ps
和 _mm256_load_ps
融合为带有内存操作数的 vaddps
,并对其进行访问冲突。在这种情况下,对齐不能成为原因,大多数 AVX 指令并不关心它。最有可能的是索引超出了数组范围_mm256_add_ps
和 _mm256_load_ps
。在这种情况下,预计会发出 vmovaps
对齐指令,并且 AV 很可能存在。在这种情况下,未对齐可能是一个原因,但索引超出数组范围也可能是一个原因