我想测量访问缓存行的一个元素的延迟。 我有一个带有下一个索引和填充的结构,其长度为缓存行大小(我的拱门中为 64 字节)。 然后,随机排序 N 个结构体的数组。
这是代码(基于薯片和奶酪代码):
#include <stdio.h>
#include <stdlib.h>
#include <iostream>
#include <time.h>
#include <sys/time.h>
#define L1_SIZE 32 * 1024
#define L2_SIZE 256 * 1024
#define L3_SIZE 12288 * 1024
#define CACHE_LINE_SIZE_BYTES 64
#define U64_PER_CACHE_LINE CACHE_LINE_SIZE_BYTES / 8
int default_test_sizes[37] = { 2, 4, 8, 12, 16, 24, 32, 48, 64, 96, 128, 192, 256, 384, 512, 600, 768, 1024, 1536, 2048,
3072, 4096, 5120, 6144, 8192, 10240, 12288, 16384, 24567, 32768, 65536, 98304,
131072, 262144, 393216, 524288, 1048576 };
typedef struct cache_line {
uint64_t next;
uint64_t padding[ (CACHE_LINE_SIZE_BYTES - sizeof(cache_line*)) / sizeof(uint64_t) ];
} cache_line;
double runTest(uint64_t size_lines, uint32_t iterations) {
uint64_t size = (size_lines * CACHE_LINE_SIZE_BYTES) / sizeof(cache_line);
cache_line *cache_lines = (cache_line*) malloc(size * sizeof(cache_line));
if (!cache_lines) {
std::cout << "Error on malloc" << std::endl;
return -1;
}
for (uint64_t i = 0; i < size; i++) {
cache_lines[i].next = i;
}
uint32_t iter = size;
while(iter > 1) {
iter -= 1;
int j = iter - 1 == 0 ? 0 : rand() % (iter - 1);
uint64_t tmp = cache_lines[iter].next;
cache_lines[iter].next = cache_lines[j].next;
cache_lines[j].next = tmp;
}
struct timeval startTv, endTv;
uint64_t scaled_iterations = iterations;
gettimeofday(&startTv, NULL);
cache_line current = cache_lines[0];
uint32_t total = current.next;
for(uint32_t n = 0; n < scaled_iterations; ++n) {
current = cache_lines[current.next];
total += current.next;
}
gettimeofday(&endTv, NULL);
time_t time_diff_ms = 1e6 * (endTv.tv_sec - startTv.tv_sec) + (endTv.tv_usec - startTv.tv_usec);
double latency = 1e3 * (double) time_diff_ms / (double) scaled_iterations;
if (total == 0) printf("sum == 0 (?)\n");
free(cache_lines);
return latency;
}
#define ITERATIONS 100000000
int main() {
std::cout << "CACHE_LINE_SIZE_BYTES: " << CACHE_LINE_SIZE_BYTES << std::endl;
std::cout << "U64_PER_CACHE_LINE: " << U64_PER_CACHE_LINE << std::endl;
std::cout << "Size of pointer: " << sizeof(cache_line*) << std::endl;
std::cout << "Size of uint64_t: " << sizeof(uint64_t) << std::endl;
std::cout << "Size of cache_line struct: " << sizeof(cache_line) << std::endl;
printf("Num. Cache lines,Latency (ns)\n");
for (long unsigned int i = 0; i < sizeof(default_test_sizes) / sizeof(int); i++) {
double result = runTest(default_test_sizes[i], ITERATIONS);
printf("%d,%.5g\n", default_test_sizes[i], result);
}
}
在我的 CPU 上具有以下缓存:
L1 size: 32kB,
L2 size: 256kB,
L3 size: 12MB.
我得到以下信息:
CACHE_LINE_SIZE_BYTES: 64
U64_PER_CACHE_LINE: 8
Size of pointer: 8
Size of uint64_t: 8
Size of cache_line struct: 64
Num. Cache lines,Latency (ns)
2,3.3074
4,2.3374
8,2.3873
12,2.2797
16,2.2301
24,2.1864
32,2.2436
48,2.3232
64,2.2363
96,2.3279
128,2.0391
192,2.3194
256,2.1941
384,2.1234
512,2.1142
600,4.8127
768,4.2977
1024,4.7297
1536,4.4902
2048,4.558
3072,7.8019
4096,8.0671
5120,9.5193
6144,11.242
8192,13.565
10240,14.654
12288,14.989
16384,16.542
24567,16.561
32768,16.72
65536,19.067
98304,25.08
131072,50.754
196608,91.816
262144,111.06
393216,122.16
524288,120.96
1048576,127.99
就每个缓存可以容纳的缓存行而言,我们看到一些“分区”大小:
但是,我有几个问题:
for(uint32_t n = 0; n < scaled_iterations; ++n) {
current = cache_lines[current.next];
total += current.padding[0];
}
惩罚只是增加一个 L1 访问权限,对吧?