我有一个 armv8 64 架构的设备,我想使用它的指令来加速 AES。我在 Github 中找到了这段实现 AES 加密的代码: https://github.com/noloader/AES-Intrinsics/blob/master/aes-arm.c 但是,它缺少解密。我自己编码:
#if defined(__arm__) || defined(__aarch32__) || defined(__arm64__) || defined(__aarch64__) || defined(_M_ARM)
# if defined(__GNUC__)
# include <stdint.h>
# endif
# if defined(__ARM_NEON) || defined(_MSC_VER)
# include <arm_neon.h>
# endif
/* GCC and LLVM Clang, but not Apple Clang */
# if defined(__GNUC__) && !defined(__apple_build_version__)
# if defined(__ARM_ACLE) || defined(__ARM_FEATURE_CRYPTO)
# include <arm_acle.h>
#include <string.h>
# endif
# endif
#endif /* ARM Headers */
void aes_process_arm(const uint8_t key[], const uint8_t subkeys[], uint32_t rounds,
const uint8_t input[], uint8_t output[], uint32_t length)
{
while (length >= 16)
{
uint8x16_t block = vld1q_u8(input);
// AES single round encryption
block = vaeseq_u8(block, vld1q_u8(key));
// AES mix columns
block = vaesmcq_u8(block);
// AES single round encryption
block = vaeseq_u8(block, vld1q_u8(subkeys));
// AES mix columns
block = vaesmcq_u8(block);
for (unsigned int i=1; i<rounds-2; ++i)
{
// AES single round encryption
block = vaeseq_u8(block, vld1q_u8(subkeys+i*16));
// AES mix columns
block = vaesmcq_u8(block);
}
// AES single round encryption
block = vaeseq_u8(block, vld1q_u8(subkeys+(rounds-2)*16));
// Final Add (bitwise Xor)
block = veorq_u8(block, vld1q_u8(subkeys+(rounds-1)*16));
vst1q_u8(output, block);
input += 16; output += 16;
length -= 16;
}
}
void aes_decrypt_arm(const uint8_t key[], const uint8_t subkeys[], uint32_t rounds,
const uint8_t input[], uint8_t output[], uint32_t length)
{
// Reverse the order of the subkeys
uint8_t reversed_subkeys[rounds*16];
for (unsigned int i=0; i<rounds; ++i)
{
memcpy(reversed_subkeys+i*16, subkeys+(rounds-i-1)*16, 16);
}
while (length >= 16)
{
uint8x16_t block = vld1q_u8(input);
// AES Final Add (bitwise Xor)
block = veorq_u8(block, vld1q_u8(reversed_subkeys));
for (unsigned int i=rounds-1; i>0; --i)
{
// AES single round decryption
block = vaesdq_u8(block, vld1q_u8(reversed_subkeys+i*16));
// AES inverse mix columns
block = vaesimcq_u8(block);
}
// AES single round decryption
block = vaesdq_u8(block, vld1q_u8(reversed_subkeys));
// AES single round decryption
block = vaesdq_u8(block, vld1q_u8(reversed_subkeys+(rounds-1)*16));
vst1q_u8(output, block);
input += 16; output += 16;
length -= 16;
}
}
#include <stdio.h>
#include <string.h>
int main(int argc, char* argv[])
{
/* FIPS 197, Appendix B input */
const uint8_t input[16] = { /* user input, unaligned buffer */
0x32, 0x43, 0xf6, 0xa8, 0x88, 0x5a, 0x30, 0x8d, 0x31, 0x31, 0x98, 0xa2, 0xe0, 0x37, 0x07, 0x34
};
/* FIPS 197, Appendix B key */
const uint8_t key[16] = { /* user input, unaligned buffer */
0x2b, 0x7e, 0x15, 0x16, 0x28, 0xae, 0xd2, 0xa6, 0xab, 0xf7, 0x15, 0x88, 0x9 , 0xcf, 0x4f, 0x3c
};
/* FIPS 197, Appendix B expanded subkeys */
__attribute__((aligned(4)))
const uint8_t subkeys[10][16] = { /* library controlled, aligned buffer */
{0xA0, 0xFA, 0xFE, 0x17, 0x88, 0x54, 0x2c, 0xb1, 0x23, 0xa3, 0x39, 0x39, 0x2a, 0x6c, 0x76, 0x05},
{0xF2, 0xC2, 0x95, 0xF2, 0x7a, 0x96, 0xb9, 0x43, 0x59, 0x35, 0x80, 0x7a, 0x73, 0x59, 0xf6, 0x7f},
{0x3D, 0x80, 0x47, 0x7D, 0x47, 0x16, 0xFE, 0x3E, 0x1E, 0x23, 0x7E, 0x44, 0x6D, 0x7A, 0x88, 0x3B},
{0xEF, 0x44, 0xA5, 0x41, 0xA8, 0x52, 0x5B, 0x7F, 0xB6, 0x71, 0x25, 0x3B, 0xDB, 0x0B, 0xAD, 0x00},
{0xD4, 0xD1, 0xC6, 0xF8, 0x7C, 0x83, 0x9D, 0x87, 0xCA, 0xF2, 0xB8, 0xBC, 0x11, 0xF9, 0x15, 0xBC},
{0x6D, 0x88, 0xA3, 0x7A, 0x11, 0x0B, 0x3E, 0xFD, 0xDB, 0xF9, 0x86, 0x41, 0xCA, 0x00, 0x93, 0xFD},
{0x4E, 0x54, 0xF7, 0x0E, 0x5F, 0x5F, 0xC9, 0xF3, 0x84, 0xA6, 0x4F, 0xB2, 0x4E, 0xA6, 0xDC, 0x4F},
{0xEA, 0xD2, 0x73, 0x21, 0xB5, 0x8D, 0xBA, 0xD2, 0x31, 0x2B, 0xF5, 0x60, 0x7F, 0x8D, 0x29, 0x2F},
{0xAC, 0x77, 0x66, 0xF3, 0x19, 0xFA, 0xDC, 0x21, 0x28, 0xD1, 0x29, 0x41, 0x57, 0x5c, 0x00, 0x6E},
{0xD0, 0x14, 0xF9, 0xA8, 0xC9, 0xEE, 0x25, 0x89, 0xE1, 0x3F, 0x0c, 0xC8, 0xB6, 0x63, 0x0C, 0xA6}
};
/* Result */
uint8_t ciphertext[19] = { 0 };
uint8_t decryption[19]= { 0 };
aes_process_arm((const uint8_t*)key, (const uint8_t*)subkeys, 10, input, ciphertext+3, 16);
printf("Input: ");
for (unsigned int i=0; i<16; ++i)
printf("%02X ", input[i]);
printf("\n");
printf("Key: ");
for (unsigned int i=0; i<16; ++i)
printf("%02X ", key[i]);
printf("\n");
printf("Output: ");
for (unsigned int i=3; i<19; ++i)
printf("%02X ",ciphertext[i]);
printf("\n");
aes_decrypt_arm((const uint8_t*)key, (const uint8_t*)subkeys, 10, ciphertext+3, decryption+3, 16);
printf("Decipher: ");
for (unsigned int i=3; i<19; ++i)
printf("%02X ", decryption[i]);
printf("\n");
/* FIPS 197, Appendix B output */
const uint8_t exp[16] = {
0x39, 0x25, 0x84, 0x1D, 0x02, 0xDC, 0x09, 0xFB, 0xDC, 0x11, 0x85, 0x97, 0x19, 0x6A, 0x0B, 0x32
};
if (0 == memcmp(ciphertext+3, exp, 16))
printf("SUCCESS!!!\n");
else
printf("FAILURE!!!\n");
if (0 == memcmp(decryption+3, input, 16))
printf("Encryption SUCCESS!!!\n");
else
printf("Encryption FAILURE!!!\n");
return 0;
}
但是解密和输入信息不匹配。我错过了什么?
在花了几天时间找出在 ARM 处理器上实现 AES 加密和解密的正确方法后,这就是我的发现。
与您的解密实现相比,您只是简单地反转了不正确的子密钥数组。
当使用AES-NI或ARM NEON AES intrinsic时,原始加密轮密钥必须首先应用InverseMixColumns,在ARM中我们需要使用
vaesimcq_u8
intrinsic来应用InverseMixColumn操作。
size_t aes_rounds = Nr + 1;
round_key = new uint8x16_t[aes_rounds];
dec_round_key = new uint8x16_t[aes_rounds];
// NOTE: here the original key was placed in the first index
// so I did not apply inverse mix columns to it
// in your implementation you might need to apply it in the
// first index because you separated the original key with the subkeys.
round_key[0] = vld1q_u8(&subKeys[0]);
dec_round_key[0] = vld1q_u8(&subKeys[0]);
for (size_t i = 1; i < aes_rounds; ++i) {
round_key[i] = vld1q_u8(&subKeys[i * 4 * Nb]);
dec_round_key[i] = vaesimcq_u8(round_key[i]);
}
round_key[Nr] = vld1q_u8(&subKeys[Nr * 4 * Nb]);
dec_round_key[Nr] = round_key[Nr];
关于实现解密,模式和加密完全一样。
这里是 AES 块加密和解密的示例代码,您可以使用它来检查和比较您的解密实现。
// AES block encryption using 128-bit key
void neon_aes128_encrypt(const uint8_t *input, uint8_t *output, const uint8x16_t *round_key) {
uint8x16_t state = vld1q_u8(input);
// Initial round
state = vaesmcq_u8(vaeseq_u8(state, round_key[0]));
// 8 main rounds
for (int i = 1; i < 9; i++) {
state = vaesmcq_u8(vaeseq_u8(state, round_key[i]));
}
// last 2 final round
state = vaeseq_u8(state, round_key[9]);
state = veorq_u8(state, round_key[10]);
// store the result to output
vst1q_u8(output, state);
}
// AES block decryption using 128-bit key
void neon_aes128_decrypt(const uint8_t *input, uint8_t *output, const uint8x16_t *dec_round_key) {
uint8x16_t state = vld1q_u8(input);
// Initial round
state = vaesimcq_u8(vaesdq_u8(state, dec_round_key[10]));
// 8 main rounds
// note here that I did not reverse the dec_round_key
// so I'm doing it from the last index to the starting index.
for (int i = 9; i > 1; i--) {
state = vaesimcq_u8(vaesdq_u8(state, dec_round_key[i]));
}
// final 2 rounds
state = vaesdq_u8(state, dec_round_key[1]);
state = veorq_u8(state, dec_round_key[0]);
// store the result to output
vst1q_u8(output, state);
}
如果您将该模式与我上面给出的示例进行比较,请注意,在您的解密实现中,您正在立即执行 XOR,并且看起来您颠倒了加密模式,这是不正确的。
解密数据的正确方法是先应用解密,然后应用 InverseMixColumns(与通常的回合一样),然后当您接近倒数第二轮时,应用最终解密。最后,对于最后一轮,您应用异或运算。