我的机器:AMD x86_64 上的 ArchWSL2
gcc 版本:14.2.1
clang版本:18.1.8
我在gcc/clang编译选项中添加了-flto,这使得我的程序在0.000000秒内完成运行。当我删除这个参数时,我的程序似乎正常。图片显示了我的结果。我想知道为什么 -flto 会造成这样的后果。
意想不到:
**gcc with -flto:**
> Performing performance test...
BLOCK_CIPHER_THROUGHPUT: DES encryption
Execute time: 0.059215 s
Throughpt: 103.073688 Mbps
BLOCK_CIPHER_THROUGHPUT: DES decryption
Execute time: 0.000000 s
Throughpt: 22837.028650 Gbps
**clang with -flto:**
> Performing performance test...
BLOCK_CIPHER_THROUGHPUT: DES encryption
Execute time: 0.000000 s
Throughpt: 20482.695799 Gbps
BLOCK_CIPHER_THROUGHPUT: DES decryption
Execute time: 0.000000 s
Throughpt: 23746.870428 Gbps
正常:
**gcc without -flto:**
> Performing performance test...
BLOCK_CIPHER_THROUGHPUT: DES encryption
Execute time: 0.055438 s
Throughpt: 110.097214 Mbps
BLOCK_CIPHER_THROUGHPUT: DES decryption
Execute time: 0.054398 s
Throughpt: 112.200099 Mbps
**clang without -flto**
> Performing performance test...
BLOCK_CIPHER_THROUGHPUT: DES encryption
Execute time: 0.050459 s
Throughpt: 120.959877 Mbps
BLOCK_CIPHER_THROUGHPUT: DES decryption
Execute time: 0.049857 s
Throughpt: 122.419760 Mbps
我读过gcc关于-flto的选项手册,但这对我来说太难了。所以我问ChatGPT4o为什么我的des算法中添加了-flto编译选项后我的程序执行时间变成了0.00000s?它告诉我
事实上,我不相信AI的答案。这只是一个参考。我想知道为什么-flto会造成这样的效果,或者真的像AI所说的那样吗? 受到AI提到的第三点的启发,我给出了基准测试的不完整实现。
#ifndef BENCHMARK_H
#define BENCHMARK_H
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <time.h>
#define NSPERS 1000000000
#ifdef __linux__
#elif defined(_WIN32) || defined(_WIN64)
#include <windows.h>
#else
#endif
/**
 * BPS_BENCH_START - opens a throughput benchmark (results reported in bps).
 *
 * Opens a brace scope, declares the per-benchmark sample array and counters,
 * prints the benchmark label and begins the outer `for` loop over the
 * benchmark index `_b`. The expansion is intentionally left unbalanced: the
 * loop and the scope are closed by BPS_BENCH_FINAL.
 *
 * Note: the sample array is named `time_t`, which hides the <time.h> typedef
 * of the same name for the remainder of the scope.
 *
 * @param[in] _LABEL  - the label for this benchmark; it is pasted between two
 *                      string literals, so it must itself be a string literal.
 * @param[in] _BENCHS - number of times each benchmark is run; also the length
 *                      of the sample array.
 */
#define BPS_BENCH_START(_LABEL, _BENCHS) \
{ \
uint64_t time_t[_BENCHS]; \
int benchs_ = _BENCHS; \
int retrys; \
printf("BLOCK_CIPHER_THROUGHPUT: " _LABEL "\n"); \
for (int _b = 0; _b < benchs_; _b++){
/**
 * BPS_BENCH_ITEM - times _ROUNDS evaluations of _FUNCTION.
 *
 * Stores _ROUNDS in `retrys`, evaluates _FUNCTION once before the timer is
 * started, then calls time_bench_before(), evaluates _FUNCTION `retrys`
 * times and calls time_bench_after(time_t, _b) to record the sample.
 * It uses the names `retrys`, `time_t` and `_b` from the enclosing scope.
 *
 * NOTE(review): nothing in this macro reads the result of _FUNCTION. If the
 * optimizer can see the callee's body (e.g. with LTO) and the outputs are not
 * used afterwards, it may remove the timed loop entirely - confirm that the
 * caller consumes the results.
 *
 * @param[in] _FUNCTION - the expression (function call) to be executed.
 * @param[in] _ROUNDS   - number of timed evaluations per sample.
 */
#define BPS_BENCH_ITEM(_FUNCTION, _ROUNDS) \
retrys = _ROUNDS; \
_FUNCTION; \
time_bench_before(); \
for (int _r = 0; _r < retrys; _r++) { _FUNCTION; } \
time_bench_after(time_t, _b);
/**
 * BPS_BENCH_FINAL - closes the benchmark and prints its throughput.
 *
 * Closes the outer `for` loop, passes the collected samples to
 * print_sc_bps(time_t, benchs_, retrys, _DATASIZE) and then closes the
 * brace scope. It uses the names `time_t`, `benchs_` and `retrys` from the
 * enclosing scope.
 *
 * @param[in] _DATASIZE - bit length of the data input to hash functions, or
 *                        the block size (in bits) of block ciphers.
 */
#define BPS_BENCH_FINAL(_DATASIZE) \
} \
print_sc_bps(time_t, benchs_, retrys, (_DATASIZE)); \
}
#ifdef __cplusplus
extern "C" { /* start of __cplusplus */
#endif
/** Shorthand for the POSIX timestamp structure used by the timer code. */
typedef struct timespec time_s;
/**
 * Records the timestamp taken before a benchmark is executed.
 */
void time_bench_before(void);
/**
 * Records the timestamp taken after a benchmark and stores the elapsed time.
 *
 * @param[out] t - sample array; the elapsed time in nanoseconds is written
 *                 to t[i].
 * @param[in]  i - index of the sample to write.
 */
void time_bench_after(uint64_t *t, int i);
/**
 * Prints the total execution time and the throughput of the last benchmark,
 * scaled to bps, Kbps, Mbps or Gbps.
 *
 * @param[in] t          - array of `benches` samples in nanoseconds.
 * @param[in] benches    - number of samples; fewer than two is reported as an
 *                         error on stderr and nothing else is printed.
 * @param[in] rounds     - number of timed evaluations per sample.
 * @param[in] block_size - number of bits processed per evaluation.
 */
void print_sc_bps(const uint64_t *t, int benches, int rounds, int block_size);
#ifdef __cplusplus
} /* end of __cplusplus */
#endif
#endif /* !BENCHMARK_H */
#include "benchmark.h"
/*============================================================================*/
/* Time Bench */
/*============================================================================*/
#ifdef __linux__
/** File-local timer state written by time_bench_before()/time_bench_after(). */
static struct
{
/** Stores the time measured before the execution of the benchmark. */
time_s before;
/** Stores the time measured after the execution of the benchmark. */
time_s after;
} g_bench;
/**
 * Computes the interval `end - start` as a time_s value.
 *
 * @param end   - timestamp taken at the end of the execution
 * @param start - timestamp taken at the start of the execution
 * @return the difference; when the nanosecond difference is negative, one
 *         second is borrowed so that tv_nsec is not negative
 */
static time_s time_sub(time_s *end, time_s *start)
{
    time_s diff;

    diff.tv_sec = end->tv_sec - start->tv_sec;
    diff.tv_nsec = end->tv_nsec - start->tv_nsec;
    if (diff.tv_nsec < 0)
    {
        /* Borrow one second to bring the nanosecond field back above zero. */
        diff.tv_sec -= 1;
        diff.tv_nsec += NSPERS;
    }
    return diff;
}
/**
 * Records the current CLOCK_MONOTONIC time as the benchmark start timestamp.
 * Declared with an explicit (void) parameter list to match the prototype in
 * the header.
 */
void time_bench_before(void) { clock_gettime(CLOCK_MONOTONIC, &g_bench.before); }
/**
 * Records the current CLOCK_MONOTONIC time as the benchmark end timestamp and
 * stores the time elapsed since time_bench_before() in t[i].
 *
 * @param[out] t - sample array; t[i] receives the elapsed time in nanoseconds.
 * @param[in]  i - index of the sample to write.
 */
void time_bench_after(uint64_t *t, int i)
{
    clock_gettime(CLOCK_MONOTONIC, &g_bench.after);
    time_s elapsed = time_sub(&g_bench.after, &g_bench.before);
    /*
     * Widen to 64 bits before multiplying: tv_sec * NSPERS is otherwise
     * evaluated as time_t * int and overflows where time_t is 32 bits wide.
     */
    t[i] = (uint64_t)elapsed.tv_sec * NSPERS + (uint64_t)elapsed.tv_nsec;
}
#elif defined(_WIN32) || defined(_WIN64)
/** File-local timer state written by time_bench_before()/time_bench_after(). */
static struct
{
/** Stores the time measured before the execution of the benchmark. */
LARGE_INTEGER before;
/** Stores the time measured after the execution of the benchmark. */
LARGE_INTEGER after;
} g_bench;
/**
 * Records the current performance-counter value as the benchmark start
 * timestamp. Declared with an explicit (void) parameter list to match the
 * prototype in the header.
 */
void time_bench_before(void) { QueryPerformanceCounter(&g_bench.before); }
/**
 * Records the current performance-counter value as the benchmark end
 * timestamp and stores the time elapsed since time_bench_before() in t[i].
 *
 * @param[out] t - sample array; t[i] receives the elapsed time in nanoseconds.
 * @param[in]  i - index of the sample to write.
 */
void time_bench_after(uint64_t *t, int i)
{
    QueryPerformanceCounter(&g_bench.after);
    LARGE_INTEGER Frequency;
    QueryPerformanceFrequency(&Frequency);
    int64_t ticks = g_bench.after.QuadPart - g_bench.before.QuadPart;
    /*
     * Convert whole seconds and the sub-second remainder separately:
     * NSPERS * ticks in one step overflows int64_t for long intervals.
     * The result equals (NSPERS * ticks) / Frequency whenever that fits.
     */
    int64_t whole_seconds = ticks / Frequency.QuadPart;
    int64_t remainder = ticks % Frequency.QuadPart;
    t[i] = (uint64_t)(whole_seconds * NSPERS
                      + (remainder * NSPERS) / Frequency.QuadPart);
}
#else
#endif
/**
 * Prints the total execution time and the throughput of a benchmark.
 *
 * The samples are summed, the number of processed bits is computed as
 * benches * rounds * block_size, and the throughput is printed in the first
 * unit (bps, Kbps, Mbps, Gbps) whose value is below 1000. The K/M/G values
 * are scaled by powers of 1024.
 *
 * @param[in] t          - array of `benches` samples in nanoseconds.
 * @param[in] benches    - number of samples; fewer than two is reported as an
 *                         error on stderr and nothing else is printed.
 * @param[in] rounds     - number of timed evaluations per sample.
 * @param[in] block_size - number of bits processed per evaluation.
 */
void print_sc_bps(const uint64_t *t, int benches, int rounds, int block_size)
{
    if (benches < 2)
    {
        fprintf(stderr, "ERROR: Need a least two bench counts!\n");
        return;
    }

    uint64_t acc = 0;
    for (int i = 0; i < benches; i++)
    {
        acc += t[i];
    }

    /*
     * Widen each factor before multiplying: the product of three ints is
     * otherwise computed in int and can overflow before it is stored.
     */
    uint64_t bits = (uint64_t)benches * (uint64_t)rounds * (uint64_t)block_size;
    double kbits = (double)bits / (1 << 10);
    double mbits = (double)bits / (1 << 20);
    double gbits = (double)bits / (1 << 30);
    double secend = (double)acc / NSPERS;
    double throughpt_bits_s = (double)bits / secend;  /* bits/s */
    double throughpt_kbits_s = kbits / secend;        /* kbits/s */
    double throughpt_mbits_s = mbits / secend;        /* mbits/s */
    double throughpt_gbits_s = gbits / secend;        /* gbits/s */

    printf("Execute time: %f s\n", secend);
    if (throughpt_bits_s < 1000)
    {
        printf("Throughpt: %f bps\n", throughpt_bits_s);
    }
    else if (throughpt_kbits_s < 1000)
    {
        printf("Throughpt: %f Kbps\n", throughpt_kbits_s);
    }
    else if (throughpt_mbits_s < 1000)
    {
        printf("Throughpt: %f Mbps\n", throughpt_mbits_s);
    }
    else
    {
        printf("Throughpt: %f Gbps\n", throughpt_gbits_s);
    }
    printf("\n");
}
// Performance test function.
// Generates the DES subkeys from a fixed key, then benchmarks
// des_encrypt_block() and des_decrypt_block() on a single fixed block using
// the BPS_BENCH_* macros; prints an error and returns early if subkey
// generation fails.
//
// NOTE(review): `decrypted` is never read after the benchmark, and
// `ciphertext` is only read by the decryption call. If the optimizer can see
// the bodies of the DES functions (e.g. with -flto), it may be able to treat
// the timed calls as dead code and remove them - confirm, and consume the
// outputs if the measured time collapses to zero.
void test_des_performance()
{
// Fixed example plaintext 4e45565251554954
unsigned char plaintext[DES_BLOCK_SIZE] = { 0x4e,0x45,0x56,0x52,0x51,0x55,0x49,0x54 };
// Fixed example key 4b41534849534142
unsigned char key[DES_KEY_SIZE] = { 0x4b,0x41,0x53,0x48,0x49,0x53,0x41,0x42 };
// Buffers for the subkeys, the encryption output and the decryption output
unsigned char subKeys[16][6];
unsigned char ciphertext[DES_BLOCK_SIZE];
unsigned char decrypted[DES_BLOCK_SIZE];
// Generate subkeys
if (des_make_subkeys(key, subKeys) != 0)
{
printf("Failed to generate subkeys.\n");
return;
}
// Perform performance test: encryption first, so that `ciphertext` is
// filled before the decryption benchmark reads it
BPS_BENCH_START("DES encryption", BENCHS);
BPS_BENCH_ITEM(des_encrypt_block(plaintext, subKeys, ciphertext), ROUNDS);
BPS_BENCH_FINAL(DES_BLOCK_BITS);
BPS_BENCH_START("DES decryption", BENCHS);
BPS_BENCH_ITEM(des_decrypt_block(ciphertext, subKeys, decrypted), ROUNDS);
BPS_BENCH_FINAL(DES_BLOCK_BITS);
}
至少存在这个问题:溢出。
`temp.tv_sec * NSPERS` 存在溢出风险(`time_t * int`)。最好确保至少使用 64 位运算:
t[i] = temp.tv_sec * (int64_t) NSPERS + temp.tv_nsec;