/* * lib_mem.c - library of routines used to analyze the memory hierarchy * * %W% %@% * * Copyright (c) 2000 Carl Staelin. * Copyright (c) 1994 Larry McVoy. * Distributed under the FSF GPL with * additional restriction that results may published only if * (1) the benchmark is unmodified, and * (2) the version in the sccsid below is included in the report. * Support for this development by Sun Microsystems is gratefully acknowledged. */ #include "bench.h" #define FIVE(m) m m m m m #define TEN(m) FIVE(m) FIVE(m) #define FIFTY(m) TEN(m) TEN(m) TEN(m) TEN(m) TEN(m) #define HUNDRED(m) FIFTY(m) FIFTY(m) #define DEREF(N) p##N = (char**)*p##N; #define DECLARE(N) static char **sp##N; register char **p##N; #define INIT(N) p##N = (mem_benchmark_rerun && addr_save==state->addr) ? sp##N : (char**)state->p[N]; #define SAVE(N) sp##N = p##N; #define MEM_BENCHMARK_F(N) mem_benchmark_##N, benchmp_f mem_benchmarks[] = {REPEAT_15(MEM_BENCHMARK_F)}; static int mem_benchmark_rerun = 0; #define MEM_BENCHMARK_DEF(N,repeat,body) \ void \ mem_benchmark_##N(iter_t iterations, void *cookie) \ { \ struct mem_state* state = (struct mem_state*)cookie; \ static char *addr_save = NULL; \ repeat(DECLARE); \ \ repeat(INIT); \ while (iterations-- > 0) { \ HUNDRED(repeat(body)); \ } \ \ repeat(SAVE); \ addr_save = state->addr; \ mem_benchmark_rerun = 1; \ } MEM_BENCHMARK_DEF(0, REPEAT_0, DEREF) MEM_BENCHMARK_DEF(1, REPEAT_1, DEREF) MEM_BENCHMARK_DEF(2, REPEAT_2, DEREF) MEM_BENCHMARK_DEF(3, REPEAT_3, DEREF) MEM_BENCHMARK_DEF(4, REPEAT_4, DEREF) MEM_BENCHMARK_DEF(5, REPEAT_5, DEREF) MEM_BENCHMARK_DEF(6, REPEAT_6, DEREF) MEM_BENCHMARK_DEF(7, REPEAT_7, DEREF) MEM_BENCHMARK_DEF(8, REPEAT_8, DEREF) MEM_BENCHMARK_DEF(9, REPEAT_9, DEREF) MEM_BENCHMARK_DEF(10, REPEAT_10, DEREF) MEM_BENCHMARK_DEF(11, REPEAT_11, DEREF) MEM_BENCHMARK_DEF(12, REPEAT_12, DEREF) MEM_BENCHMARK_DEF(13, REPEAT_13, DEREF) MEM_BENCHMARK_DEF(14, REPEAT_14, DEREF) MEM_BENCHMARK_DEF(15, REPEAT_15, DEREF) size_t* words_initialize(size_t max, int scale); void mem_reset() { mem_benchmark_rerun = 0; } void mem_cleanup(iter_t iterations, void* cookie) { struct mem_state* state = (struct mem_state*)cookie; if (iterations) return; if (state->addr) { free(state->addr); state->addr = NULL; } if (state->lines) { free(state->lines); state->lines = NULL; } if (state->pages) { free(state->pages); state->pages = NULL; } if (state->words) { free(state->words); state->words = NULL; } } void tlb_cleanup(iter_t iterations, void* cookie) { size_t i; struct mem_state* state = (struct mem_state*)cookie; char **addr = (char**)state->addr; if (iterations) return; if (addr) { for (i = 0; i < state->npages; ++i) { if (addr[i]) free(addr[i]); } free(addr); state->addr = NULL; } if (state->pages) { free(state->pages); state->pages = NULL; } if (state->lines) { free(state->lines); state->lines = NULL; } } void base_initialize(iter_t iterations, void* cookie) { size_t nwords, nlines, nbytes, npages, nmpages; size_t *pages; size_t *lines; size_t *words; struct mem_state* state = (struct mem_state*)cookie; register char *p = 0 /* lint */; if (iterations) return; state->initialized = 0; nbytes = state->len; nwords = state->line / sizeof(char*); nlines = state->pagesize / state->line; npages = (nbytes + state->pagesize - 1) / state->pagesize; nmpages= (state->maxlen + state->pagesize - 1) / state->pagesize; srand(getpid()); words = NULL; lines = NULL; pages = permutation(nmpages, state->pagesize); p = state->addr = (char*)malloc(state->maxlen + 2 * state->pagesize); if (!p) { perror("base_initialize: malloc"); exit(1); } state->nwords = nwords; state->nlines = nlines; state->npages = npages; state->lines = lines; state->pages = pages; state->words = words; if (state->addr == NULL || pages == NULL) return; if ((unsigned long)p % state->pagesize) { p += state->pagesize - (unsigned long)p % state->pagesize; } state->base = p; state->initialized = 1; mem_reset(); } /* * Create a circular list of pointers using a simple striding * algorithm. * * This access pattern corresponds to many array/matrix * algorithms. It should be easily and correctly predicted * by any decent hardware prefetch algorithm. */ void stride_initialize(iter_t iterations, void* cookie) { struct mem_state* state = (struct mem_state*)cookie; size_t i; size_t range = state->len; size_t stride = state->line; char* addr; base_initialize(iterations, cookie); if (!state->initialized) return; addr = state->base; for (i = stride; i < range; i += stride) { *(char **)&addr[i - stride] = (char*)&addr[i]; } *(char **)&addr[i - stride] = (char*)&addr[0]; state->p[0] = addr; mem_reset(); } void thrash_initialize(iter_t iterations, void* cookie) { struct mem_state* state = (struct mem_state*)cookie; size_t i; size_t j; size_t cur; size_t next; size_t cpage; size_t npage; char* addr; base_initialize(iterations, cookie); if (!state->initialized) return; addr = state->base; /* * Create a circular list of pointers with a random access * pattern. * * This stream corresponds more closely to linked list * memory access patterns. For large data structures each * access will likely cause both a cache miss and a TLB miss. * * Access a different page each time. This will eventually * cause a tlb miss each page. It will also cause maximal * thrashing in the cache between the user data stream and * the page table entries. */ if (state->len % state->pagesize) { state->nwords = state->len / state->line; state->words = words_initialize(state->nwords, state->line); if (!state->words) { perror("thrash_initialize: malloc"); exit(1); } for (i = 0; i < state->nwords - 1; ++i) { *(char **)&addr[state->words[i]] = (char*)&addr[state->words[i+1]]; } *(char **)&addr[state->words[i]] = addr; state->p[0] = addr; } else { state->nwords = state->pagesize / state->line; state->words = words_initialize(state->nwords, state->line); if (!state->words) { perror("thrash_initialize: malloc"); exit(2); } for (i = 0; i < state->npages - 1; ++i) { cpage = state->pages[i]; npage = state->pages[i + 1]; for (j = 0; j < state->nwords; ++j) { cur = cpage + state->words[(i + j) % state->nwords]; next = npage + state->words[(i + j + 1) % state->nwords]; *(char **)&addr[cur] = (char*)&addr[next]; } } cpage = state->pages[i]; npage = state->pages[0]; for (j = 0; j < state->nwords; ++j) { cur = cpage + state->words[(i + j) % state->nwords]; next = npage + state->words[(j + 1) % state->nwords]; *(char **)&addr[cur] = (char*)&addr[next]; } state->p[0] = (char*)&addr[state->pages[0]]; } mem_reset(); } /* * mem_initialize * * Create a circular pointer chain that runs through memory. * * The chain threads through each cache line on a page before * moving to the next page. The order of cache line accesses * is randomized to defeat cache prefetching algorithms. In * addition, the order of page accesses is randomized. Finally, * to reduce the impact of incorrect line-size estimates on * machines with direct-mapped caches, we randomize which * word in the cache line is used to hold the pointer. * * It initializes state->width pointers to elements evenly * spaced through the chain. */ void mem_initialize(iter_t iterations, void* cookie) { int i, j, k, l, nw, nwords, nlines, npages, npointers; size_t *pages; size_t *lines; size_t *words; struct mem_state* state = (struct mem_state*)cookie; register char *p = 0 /* lint */; if (iterations) return; base_initialize(iterations, cookie); if (!state->initialized) return; state->initialized = 0; npointers = state->len / state->line; nwords = state->nwords; nlines = state->nlines; npages = state->npages; words = state->words = words_initialize(nwords, sizeof(char*)); lines = state->lines = words_initialize(nlines, state->line); pages = state->pages; p = state->base; if (state->addr == NULL \ || pages == NULL || lines == NULL || words == NULL) { perror("mem_initialize: malloc"); exit(1); } /* setup the run through the pages */ l = 0; j = nlines - 1; for (i = 0; i < npages; ++i) { for (j = 0; j < nlines - 1 && l < npointers - 1; ++j, ++l) { for (k = 0; k < state->line; k += sizeof(char*)) { *(char**)(p + pages[i] + lines[j] + k) = p + pages[i] + lines[j+1] + k; } if (l % (npointers/state->width) == 0 && l / (npointers/state->width) < MAX_MEM_PARALLELISM) { k = l / (npointers/state->width); state->p[k] = p + pages[i] + lines[j] + words[k % nwords]; } } if (i < npages - 1) { for (k = 0; k < nwords; ++k) *(char**)(p + pages[i] + lines[j] + words[k]) = p + pages[i+1] + lines[0] + words[k]; } } for (k = 0; k < nwords; ++k) { nw = (k == nwords - 1) ? 0 : k + 1; *(char**)(p + pages[npages-1] + lines[j] + words[k]) = p + pages[0] + lines[0] + words[nw]; } /* now, run through the chain once to clear the cache */ mem_reset(); (*mem_benchmarks[state->width-1])((nwords * npointers + 100) / 100, state); state->initialized = 1; } /* * line_initialize * * This is very similar to mem_initialize, except that we always use * the first element of the cache line to hold the pointer. * */ void line_initialize(iter_t iterations, void* cookie) { int i, j, nlines, npages; size_t *pages; size_t *lines; struct mem_state* state = (struct mem_state*)cookie; register char *p = 0 /* lint */; if (iterations) return; base_initialize(iterations, cookie); if (!state->initialized) return; state->initialized = 0; nlines = state->nlines; npages = state->npages; lines = state->lines = words_initialize(nlines, state->line); pages = state->pages; p = state->base; state->width = 1; if (state->addr == NULL || lines == NULL || pages == NULL) return; /* new setup runs through the lines */ for (i = 0; i < npages; ++i) { /* sequence through the first word of each line */ for (j = 0; j < nlines - 1; ++j) { *(char**)(p + pages[i] + lines[j]) = p + pages[i] + lines[j+1]; } /* jump to the fist word of the first line on next page */ *(char**)(p + pages[i] + lines[j]) = p + pages[(i < npages-1) ? i+1 : 0] + lines[0]; } state->p[0] = p + pages[0] + lines[0]; /* now, run through the chain once to clear the cache */ mem_reset(); mem_benchmark_0((nlines * npages + 100) / 100, state); state->initialized = 1; } /* * tlb_initialize * * Build a pointer chain which accesses one word per page, for a total * of (line * pages) bytes of data loaded into cache. * * If the number of elements in the chain (== #pages) is larger than the * number of pages addressed by the TLB, then each access should cause * a TLB miss (certainly as the number of pages becomes much larger than * the TLB-addressed space). * * In addition, if we arrange the chain properly, each word we access * will be in the cache. * * This means that the average access time for each pointer dereference * should be a cache hit plus a TLB miss. * */ void tlb_initialize(iter_t iterations, void* cookie) { int i, nlines, npages, pagesize; unsigned int r; char **pages = NULL; char **addr = NULL; size_t *lines = NULL; struct mem_state* state = (struct mem_state*)cookie; register char *p = 0 /* lint */; if (iterations) return; state->initialized = 0; pagesize = state->pagesize; nlines = pagesize / sizeof(char*); npages = state->len / pagesize; srand(getpid() ^ (getppid()<<7)); lines = words_initialize(nlines, sizeof(char*)); pages = (char**)malloc(npages * sizeof(char**)); addr = (char**)malloc(npages * sizeof(char**)); if (!lines || !pages || !addr) { perror("tlb_initialize: malloc"); exit(1); } state->nwords = 1; state->nlines = nlines; state->npages = npages; state->words = NULL; state->lines = lines; state->pages = (size_t*)pages; state->addr = (char*)addr; if (addr) bzero(addr, npages * sizeof(char**)); if (pages) bzero(pages, npages * sizeof(char**)); if (addr == NULL || pages == NULL || lines == NULL) { return; } /* first, layout the sequence of page accesses */ for (i = 0; i < npages; ++i) { p = addr[i] = (char*)valloc(pagesize); if (!p) { perror("tlb_initialize: valloc"); exit(4); } if ((unsigned long)p % pagesize) { free(p); p = addr[i] = (char*)valloc(2 * pagesize); if (!p) { perror("tlb_initialize: valloc"); exit(5); } p += pagesize - (unsigned long)p % pagesize; } pages[i] = (char*)p; } /* randomize the page sequences (except for zeroth page) */ r = (rand() << 15) ^ rand(); for (i = npages - 2; i > 0; --i) { char* l; r = (r << 1) ^ (rand() >> 4); l = pages[(r % i) + 1]; pages[(r % i) + 1] = pages[i + 1]; pages[i + 1] = l; } /* now setup run through the pages */ for (i = 0; i < npages - 1; ++i) { *(char**)(pages[i] + lines[i%nlines]) = pages[i+1] + lines[(i+1)%nlines]; } *(char**)(pages[i] + lines[i%nlines]) = pages[0] + lines[0]; state->p[0] = pages[0] + lines[0]; /* run through the chain once to clear the cache */ mem_reset(); mem_benchmark_0((npages + 100) / 100, state); state->initialized = 1; } /* * words_initialize * * This is supposed to create the order in which the words in a * "cache line" are used. Since we rarely know the cache line * size with any real reliability, we need to jump around so * as to maximize the number of potential cache misses, and to * minimize the possibility of re-using a cache line. */ size_t* words_initialize(size_t max, int scale) { size_t i, j, nbits; size_t* words = (size_t*)malloc(max * sizeof(size_t)); if (!words) return NULL; bzero(words, max * sizeof(size_t)); for (i = max>>1, nbits = 0; i != 0; i >>= 1, nbits++) ; for (i = 0; i < max; ++i) { /* now reverse the bits */ for (j = 0; j < nbits; j++) { if (i & (1<width = 1; state->line = sizeof(char*); for (state->addr = NULL; !state->addr && len; ) { state->len = state->maxlen = len; line_initialize(0, state); if (state->addr == NULL) len >>= 1; } if (state->addr == NULL) return -1; for (i = sizeof(char*); i <= maxline; i<<=1) { t = line_test(i, warmup, repetitions, state); if (t == 0.0) break; if (i > sizeof(char*)) { if (t > 1.3 * baseline) { big_jump = 1; } else if (big_jump && t < 1.15 * baseline) { line = (i>>1); break; } } baseline = t; } mem_cleanup(0, state); /* fprintf(stderr, "line_find(%lu, ...): returning %lu\n", (unsigned long)len, (unsigned long)line); /**/ return line; } double line_test(size_t line, int warmup, int repetitions, struct mem_state* state) { size_t i; size_t npages = state->npages; size_t nlines = state->pagesize / line; double t; char* p = state->base; char* first = p + state->pages[0] + state->lines[0]; result_t *r, *r_save; if (repetitions < 0) repetitions = TRIES; /* only visit a subset of the lines in each page */ if (nlines < state->nlines) { p = state->base; for (i = 0; i < npages - 1; ++i) { *(char**)(p + state->pages[i] + state->lines[nlines-1]) = p + state->pages[i+1] + state->lines[0]; } *(char**)(p + state->pages[npages-1] + state->lines[nlines-1]) = p + state->pages[0] + state->lines[0]; } r_save = get_results(); r = (result_t*)malloc(sizeof_result(repetitions)); if (!r) { perror("line_test: malloc"); exit(1); } insertinit(r); p = first; for (i = 0; i < repetitions; ++i) { BENCH1(HUNDRED(p = *(char**)p;),0); /* fprintf(stderr, "%d\t%d\t%d\n", line, (int)gettime(), (int)get_n()); /**/ insertsort(gettime(), get_n(), r); } use_pointer(p); set_results(r); t = 10. * (double)gettime() / (double)get_n(); set_results(r_save); free(r); /* fprintf(stderr, "%d\t%.5f\t%d\n", line, t, state->len); /**/ /* fixup full path again */ if (nlines < state->nlines) { p = state->base; for (i = 0; i < npages - 1; ++i) { *(char**)(p + state->pages[i] + state->lines[nlines-1]) = p + state->pages[i] + state->lines[nlines]; } *(char**)(p + state->pages[npages-1] + state->lines[nlines-1]) = p + state->pages[npages-1] + state->lines[nlines]; } return (t); } double par_mem(size_t len, int warmup, int repetitions, struct mem_state* state) { int i, j; iter_t __n = 1; double baseline, max_par, par; state->width = 1; max_par = 1.; if (repetitions < 0) repetitions = TRIES; for (state->addr = NULL; !state->addr && len; ) { state->len = state->maxlen = len; mem_initialize(0, state); if (state->addr == NULL) len >>= 1; } if (state->addr == NULL) return -1.; for (i = 0; i < MAX_MEM_PARALLELISM; ++i) { for (j = 0; j <= i; j++) { size_t nlines = len / state->line; size_t lines_per_chunk = nlines / (i + 1); size_t lines_per_page = state->pagesize / state->line; size_t line = j * lines_per_chunk; size_t word = (j * state->nwords) / (i + 1); /* if (state->len == 32768 && i == 7) { fprintf(stderr, "\tj=%d, line=%d, word=%d, page=%d, _line=%d, _word=%d\n", j, line, word, line / lines_per_page, line % lines_per_page, word % state->nwords); } /**/ state->p[j] = state->base + state->pages[line / lines_per_page] + state->lines[line % lines_per_page] + state->words[word % state->nwords]; } mem_reset(); (*mem_benchmarks[i])((len / sizeof(char*) + 100) / 100, state); BENCH((*mem_benchmarks[i])(__n, state); __n = 1;, 0); if (i == 0) { baseline = (double)gettime() / (double)get_n(); } else if (gettime() > 0) { par = baseline; par /= (double)gettime() / (double)((i + 1) * get_n()); /* fprintf(stderr, "par_mem(%d): i=%d, p=%5.2f, l=%d, lpp=%d, lpc=%d, nl=%d, wpc=%d\n", len, i, par, state->line, state->pagesize / state->line, (len / state->line) / (i + 1), len / state->line, state->nwords / (i + 1)); /**/ if (par > max_par) { max_par = par; } if (4.0 * max_par < i) break; } } mem_cleanup(0, state); return max_par; }