1 /*--------------------------------------------------------------------*/
2 /*--- Cachegrind: cache configuration. cg-arch.c ---*/
3 /*--------------------------------------------------------------------*/
4
5 /*
6 This file is part of Cachegrind, a Valgrind tool for cache
7 profiling programs.
8
9 Copyright (C) 2011-2015 Nicholas Nethercote
10 njn@valgrind.org
11
12 This program is free software; you can redistribute it and/or
13 modify it under the terms of the GNU General Public License as
14 published by the Free Software Foundation; either version 2 of the
15 License, or (at your option) any later version.
16
17 This program is distributed in the hope that it will be useful, but
18 WITHOUT ANY WARRANTY; without even the implied warranty of
19 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
20 General Public License for more details.
21
22 You should have received a copy of the GNU General Public License
23 along with this program; if not, write to the Free Software
24 Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA
25 02111-1307, USA.
26
27 The GNU General Public License is contained in the file COPYING.
28 */
29
30 #include "pub_tool_basics.h"
31 #include "pub_tool_libcassert.h"
32 #include "pub_tool_libcbase.h"
33 #include "pub_tool_libcprint.h"
34 #include "pub_tool_options.h"
35 #include "pub_tool_machine.h"
36
37 #include "cg_arch.h"
38
39 static void configure_caches(cache_t* I1c, cache_t* D1c, cache_t* LLc,
40 Bool all_caches_clo_defined);
41
42 // Checks cache config is ok. Returns NULL if ok, or a pointer to an error
43 // string otherwise.
check_cache(cache_t * cache)44 static const HChar* check_cache(cache_t* cache)
45 {
46 // Simulator requires set count to be a power of two.
47 if ((cache->size % (cache->line_size * cache->assoc) != 0) ||
48 (-1 == VG_(log2)(cache->size/cache->line_size/cache->assoc)))
49 {
50 return "Cache set count is not a power of two.\n";
51 }
52
53 // Simulator requires line size to be a power of two.
54 if (-1 == VG_(log2)(cache->line_size)) {
55 return "Cache line size is not a power of two.\n";
56 }
57
58 // Then check line size >= 16 -- any smaller and a single instruction could
59 // straddle three cache lines, which breaks a simulation assertion and is
60 // stupid anyway.
61 if (cache->line_size < MIN_LINE_SIZE) {
62 return "Cache line size is too small.\n";
63 }
64
65 /* Then check cache size > line size (causes seg faults if not). */
66 if (cache->size <= cache->line_size) {
67 return "Cache size <= line size.\n";
68 }
69
70 /* Then check assoc <= (size / line size) (seg faults otherwise). */
71 if (cache->assoc > (cache->size / cache->line_size)) {
72 return "Cache associativity > (size / line size).\n";
73 }
74
75 return NULL;
76 }
77
78
// Parses the argument of a cache option into |cache| and validates it.
//
//   cache:  receives the size, associativity and line size, in that order.
//   opt:    the option text; used only when reporting an error.
//   optval: the option's argument, expected to look like "65536,2,64".
//
// Returns normally only when all three numbers were parsed, fit into the
// fields of |cache| and passed check_cache().  Every failure is reported
// through VG_(fmsg_bad_option).
static void parse_cache_opt ( cache_t* cache, const HChar* opt,
                              const HChar* optval )
{
   Long         i1, i2, i3;
   const HChar* start;
   HChar*       endptr;
   const HChar* checkRes;

   // Option argument looks like "65536,2,64".  Extract them.  A field is
   // rejected when parsing it consumed nothing (endptr == start), so that
   // an empty field, as in "65536,,64", is not silently accepted as 0.
   // NOTE(review): this relies on VG_(strtoll10) leaving endptr at the
   // start of its input when it converts no digits -- confirm.
   start = optval;
   i1 = VG_(strtoll10)(start, &endptr);
   if (endptr == start || *endptr != ',')  goto bad;

   start = endptr + 1;
   i2 = VG_(strtoll10)(start, &endptr);
   if (endptr == start || *endptr != ',')  goto bad;

   start = endptr + 1;
   i3 = VG_(strtoll10)(start, &endptr);
   if (endptr == start || *endptr != '\0') goto bad;

   // Check for overflow: each value must survive the narrowing to Int.
   cache->size      = (Int)i1;
   cache->assoc     = (Int)i2;
   cache->line_size = (Int)i3;
   if (cache->size      != i1) goto overflow;
   if (cache->assoc     != i2) goto overflow;
   if (cache->line_size != i3) goto overflow;

   checkRes = check_cache(cache);
   if (checkRes) {
      VG_(fmsg)("%s", checkRes);
      goto bad;
   }

   return;

  bad:
   VG_(fmsg_bad_option)(opt, "Bad argument '%s'\n", optval);
   // NOTE(review): VG_(fmsg_bad_option) is presumably declared noreturn --
   // confirm.  The explicit return keeps this path from running into the
   // overflow report below even if it is not.
   return;

  overflow:
   VG_(fmsg_bad_option)(opt,
      "One of the cache parameters was too large and overflowed.\n");
}
114
115
VG_(str_clo_cache_opt)116 Bool VG_(str_clo_cache_opt)(const HChar *arg,
117 cache_t* clo_I1c,
118 cache_t* clo_D1c,
119 cache_t* clo_LLc)
120 {
121 const HChar* tmp_str;
122
123 if VG_STR_CLO(arg, "--I1", tmp_str) {
124 parse_cache_opt(clo_I1c, arg, tmp_str);
125 return True;
126 } else if VG_STR_CLO(arg, "--D1", tmp_str) {
127 parse_cache_opt(clo_D1c, arg, tmp_str);
128 return True;
129 } else if (VG_STR_CLO(arg, "--L2", tmp_str) || // for backwards compatibility
130 VG_STR_CLO(arg, "--LL", tmp_str)) {
131 parse_cache_opt(clo_LLc, arg, tmp_str);
132 return True;
133 } else
134 return False;
135 }
136
// Emits one user message line describing cache |c|: its size, its
// associativity and its line size, labelled with |desc|.
static void umsg_cache_img(const HChar* desc, cache_t* c)
{
   VG_(umsg)(" %s: %'d B, %d-way, %d B lines\n",
             desc,
             c->size,
             c->assoc,
             c->line_size);
}
142
143 // Verifies if c is a valid cache.
144 // An invalid value causes an assert, unless clo_redefined is True.
check_cache_or_override(const HChar * desc,cache_t * c,Bool clo_redefined)145 static void check_cache_or_override(const HChar* desc, cache_t* c, Bool clo_redefined)
146 {
147 const HChar* checkRes;
148
149 checkRes = check_cache(c);
150 if (checkRes) {
151 VG_(umsg)("Auto-detected %s cache configuration not supported: %s",
152 desc, checkRes);
153 umsg_cache_img(desc, c);
154 if (!clo_redefined) {
155 VG_(umsg)("As it probably should be supported, please report a bug!\n");
156 VG_(umsg)("Bypass this message by using option --%s=...\n", desc);
157 tl_assert(0);
158 }
159 }
160 }
161
162
163 /* If the LL cache config isn't something the simulation functions
164 can handle, try to adjust it so it is. Caches are characterised
165 by (total size T, line size L, associativity A), and then we
166 have
167
168 number of sets S = T / (L * A)
169
170 The required constraints are:
171
172 * L must be a power of 2, but it always is in practice, so
173 no problem there
174
175 * A can be any value >= 1
176
177 * T can be any value, but ..
178
179 * S must be a power of 2.
180
181 That sometimes gives a problem. For example, some Core iX based
182 Intel CPUs have T = 12MB, A = 16, L = 64, which gives 12288
183 sets. Some AMD cpus have T = 5MB, A = 48, L = 64, which gives
184 1706.667 sets (!).
185
186 The "fix" is to force S down to the nearest power of two below its
187 original value, and increase A proportionately, so as to keep the
188 total cache size the same. In fact to be safe we recalculate the
189 cache size afterwards anyway, to guarantee that it divides exactly
190 between the new number of sets.
191
192 The "fix" is "justified" (cough, cough) by alleging that
193 increases of associativity above about 4 have very little effect
194 on the actual miss rate. It would be far more inaccurate to
195 fudge this by changing the size of the simulated cache --
196 changing the associativity is a much better option.
197 */
198
199 /* (Helper function) Returns the largest power of 2 that is <= |x|.
200 Even works when |x| == 0. */
floor_power_of_2(UInt x)201 static UInt floor_power_of_2 ( UInt x )
202 {
203 x = x | (x >> 1);
204 x = x | (x >> 2);
205 x = x | (x >> 4);
206 x = x | (x >> 8);
207 x = x | (x >> 16);
208 return x - (x >> 1);
209 }
210
211 static void
maybe_tweak_LLc(cache_t * LLc)212 maybe_tweak_LLc(cache_t *LLc)
213 {
214 if (LLc->size == 0 || LLc->assoc == 0 || LLc->line_size == 0)
215 return;
216
217 tl_assert(LLc->size > 0 && LLc->assoc > 0 && LLc->line_size > 0);
218
219 UInt old_size = (UInt)LLc->size;
220 UInt old_assoc = (UInt)LLc->assoc;
221 UInt old_line_size = (UInt)LLc->line_size;
222
223 UInt new_size = old_size;
224 UInt new_assoc = old_assoc;
225 UInt new_line_size = old_line_size;
226
227 UInt old_nSets = old_size / (old_assoc * old_line_size);
228 if (old_nSets == 0) {
229 /* This surely can't happen; but would cause chaos with the maths
230 * below if it did. Just give up if it does. */
231 return;
232 }
233
234 if (-1 != VG_(log2_64)(old_nSets)) {
235 /* The number of sets is already a power of 2. Make sure that
236 the size divides exactly between the sets. Almost all of the
237 time this will have no effect. */
238 new_size = old_line_size * old_assoc * old_nSets;
239 } else {
240 /* The number of sets isn't a power of two. Calculate some
241 scale-down factor which causes the number of sets to become a
242 power of two. Then, increase the associativity by that
243 factor. Finally, re-calculate the total size so as to make
244 sure it divides exactly between the sets. */
245 tl_assert(old_nSets >= 0);
246 UInt new_nSets = floor_power_of_2 ( old_nSets );
247 tl_assert(new_nSets > 0 && new_nSets < old_nSets);
248 Double factor = (Double)old_nSets / (Double)new_nSets;
249 tl_assert(factor >= 1.0);
250
251 new_assoc = (UInt)(0.5 + factor * (Double)old_assoc);
252 tl_assert(new_assoc >= old_assoc);
253
254 new_size = old_line_size * new_assoc * new_nSets;
255 }
256
257 tl_assert(new_line_size == old_line_size); /* we never change this */
258 if (new_size == old_size && new_assoc == old_assoc)
259 return;
260
261 VG_(dmsg)("warning: "
262 "specified LL cache: line_size %u assoc %u total_size %'u\n",
263 old_line_size, old_assoc, old_size);
264 VG_(dmsg)("warning: "
265 "simulated LL cache: line_size %u assoc %u total_size %'u\n",\
266 new_line_size, new_assoc, new_size);
267
268 LLc->size = new_size;
269 LLc->assoc = new_assoc;
270 LLc->line_size = new_line_size;
271 }
272
VG_(post_clo_init_configure_caches)273 void VG_(post_clo_init_configure_caches)(cache_t* I1c,
274 cache_t* D1c,
275 cache_t* LLc,
276 cache_t* clo_I1c,
277 cache_t* clo_D1c,
278 cache_t* clo_LLc)
279 {
280 #define DEFINED(L) (-1 != L->size || -1 != L->assoc || -1 != L->line_size)
281
282 // Count how many were defined on the command line.
283 Bool all_caches_clo_defined =
284 (DEFINED(clo_I1c) &&
285 DEFINED(clo_D1c) &&
286 DEFINED(clo_LLc));
287
288 // Set the cache config (using auto-detection, if supported by the
289 // architecture).
290 configure_caches( I1c, D1c, LLc, all_caches_clo_defined );
291
292 maybe_tweak_LLc( LLc );
293
294 // Check the default/auto-detected values.
295 // Allow the user to override invalid auto-detected caches
296 // with command line.
297 check_cache_or_override ("I1", I1c, DEFINED(clo_I1c));
298 check_cache_or_override ("D1", D1c, DEFINED(clo_D1c));
299 check_cache_or_override ("LL", LLc, DEFINED(clo_LLc));
300
301 // Then replace with any defined on the command line. (Already checked in
302 // VG(parse_clo_cache_opt)().)
303 if (DEFINED(clo_I1c)) { *I1c = *clo_I1c; }
304 if (DEFINED(clo_D1c)) { *D1c = *clo_D1c; }
305 if (DEFINED(clo_LLc)) { *LLc = *clo_LLc; }
306
307 if (VG_(clo_verbosity) >= 2) {
308 VG_(umsg)("Cache configuration used:\n");
309 umsg_cache_img ("I1", I1c);
310 umsg_cache_img ("D1", D1c);
311 umsg_cache_img ("LL", LLc);
312 }
313 #undef DEFINED
314 }
315
VG_(print_cache_clo_opts)316 void VG_(print_cache_clo_opts)()
317 {
318 VG_(printf)(
319 " --I1=<size>,<assoc>,<line_size> set I1 cache manually\n"
320 " --D1=<size>,<assoc>,<line_size> set D1 cache manually\n"
321 " --LL=<size>,<assoc>,<line_size> set LL cache manually\n"
322 );
323 }
324
325
326 // Traverse the cache info and return a cache of the given kind and level.
327 // Return NULL if no such cache exists.
328 static const VexCache *
locate_cache(const VexCacheInfo * ci,VexCacheKind kind,UInt level)329 locate_cache(const VexCacheInfo *ci, VexCacheKind kind, UInt level)
330 {
331 const VexCache *c;
332
333 for (c = ci->caches; c != ci->caches + ci->num_caches; ++c) {
334 if (c->level == level && c->kind == kind) {
335 return c;
336 }
337 }
338 return NULL; // not found
339 }
340
341
342 // Gives the auto-detected configuration of I1, D1 and LL caches. They get
343 // overridden by any cache configurations specified on the command line.
344 static void
configure_caches(cache_t * I1c,cache_t * D1c,cache_t * LLc,Bool all_caches_clo_defined)345 configure_caches(cache_t *I1c, cache_t *D1c, cache_t *LLc,
346 Bool all_caches_clo_defined)
347 {
348 VexArchInfo vai;
349 const VexCacheInfo *ci;
350 const VexCache *i1, *d1, *ll;
351
352 VG_(machine_get_VexArchInfo)(NULL, &vai);
353 ci = &vai.hwcache_info;
354
355 // Extract what we need
356 i1 = locate_cache(ci, INSN_CACHE, 1);
357 d1 = locate_cache(ci, DATA_CACHE, 1);
358 ll = locate_cache(ci, UNIFIED_CACHE, ci->num_levels);
359
360 if (ci->num_caches > 0 && ll == NULL) {
361 VG_(dmsg)("warning: L2 cache not installed, ignore LL results.\n");
362 }
363
364 if (ll && ci->num_levels > 2) {
365 VG_(dmsg)("warning: L%u cache found, using its data for the "
366 "LL simulation.\n", ci->num_levels);
367 }
368
369 if (i1 && d1 && ll) {
370 if (i1->is_trace_cache) {
371 /* HACK ALERT: Instruction trace cache -- capacity is micro-ops based.
372 * conversion to byte size is a total guess; treat the 12K and 16K
373 * cases the same since the cache byte size must be a power of two for
374 * everything to work!. Also guessing 32 bytes for the line size...
375 */
376 UInt adjusted_size, guessed_line_size = 32;
377
378 if (i1->sizeB == 12 * 1024 || i1->sizeB == 16 * 1024) {
379 adjusted_size = 16 * 1024;
380 } else {
381 adjusted_size = 32 * 1024;
382 }
383 VG_(dmsg)("warning: Pentium 4 with %u KB micro-op instruction trace cache\n",
384 i1->sizeB / 1024);
385 VG_(dmsg)(" Simulating a %u KB I-cache with %u B lines\n",
386 adjusted_size / 1024, guessed_line_size);
387
388 *I1c = (cache_t) { adjusted_size, i1->assoc, guessed_line_size };
389 } else {
390 *I1c = (cache_t) { i1->sizeB, i1->assoc, i1->line_sizeB };
391 }
392 *D1c = (cache_t) { d1->sizeB, d1->assoc, d1->line_sizeB };
393 *LLc = (cache_t) { ll->sizeB, ll->assoc, ll->line_sizeB };
394
395 return;
396 }
397
398 // Cache information could not be queried; choose some default
399 // architecture specific default setting.
400
401 #if defined(VGA_ppc32)
402
403 // Default cache configuration
404 *I1c = (cache_t) { 65536, 2, 64 };
405 *D1c = (cache_t) { 65536, 2, 64 };
406 *LLc = (cache_t) { 262144, 8, 64 };
407
408 #elif defined(VGA_ppc64be) || defined(VGA_ppc64le)
409
410 // Default cache configuration
411 *I1c = (cache_t) { 65536, 2, 64 };
412 *D1c = (cache_t) { 65536, 2, 64 };
413 *LLc = (cache_t) { 262144, 8, 64 };
414
415 #elif defined(VGA_arm)
416
417 // Set caches to default (for Cortex-A8 ?)
418 *I1c = (cache_t) { 16384, 4, 64 };
419 *D1c = (cache_t) { 16384, 4, 64 };
420 *LLc = (cache_t) { 262144, 8, 64 };
421
422 #elif defined(VGA_arm64)
423
424 // Copy the 32-bit ARM version until such time as we have
425 // some real hardware to run on
426 *I1c = (cache_t) { 16384, 4, 64 };
427 *D1c = (cache_t) { 16384, 4, 64 };
428 *LLc = (cache_t) { 262144, 8, 64 };
429
430 #elif defined(VGA_s390x)
431 //
432 // Here is the cache data from older machine models:
433 //
434 // I1 D1 I/D L2
435 // z900 256k/256/4 256k/256/4 16MB
436 // z800 256k/256/4 256k/256/4 8MB
437 // z990 256k/256/4 256k/256/4 32MB
438 // z890 256k/256/4 256k/256/4 32MB
439 // z9 256k/256/4 256k/256/4 40MB
440 //
441 // Sources:
442 // (1) IBM System z9 109 Technical Introduction
443 // www.redbooks.ibm.com/redbooks/pdfs/sg246669.pdf
444 // (2) The microarchitecture of the IBM eServer z900 processor
445 // IBM Journal of Research and Development
446 // Volume 46, Number 4/5, pp 381-395, July/September 2002
447 // (3) The IBM eServer z990 microprocessor
448 // IBM Journal of Research and Development
449 // Volume 48, Number 3/4, pp 295-309, May/July 2004
450 // (4) Charles Webb, IBM
451 //
452 // L2 data is unfortunately incomplete. Otherwise, we could support
453 // machines without the ECAG insn by looking at VEX_S390X_MODEL(hwcaps).
454
455 // Default cache configuration is z10-EC (Source: ECAG insn)
456 *I1c = (cache_t) { 65536, 4, 256 };
457 *D1c = (cache_t) { 131072, 8, 256 };
458 *LLc = (cache_t) { 50331648, 24, 256 };
459
460 #elif defined(VGA_mips32)
461
462 // Set caches to default (for MIPS32-r2(mips 74kc))
463 *I1c = (cache_t) { 32768, 4, 32 };
464 *D1c = (cache_t) { 32768, 4, 32 };
465 *LLc = (cache_t) { 524288, 8, 32 };
466
467 #elif defined(VGA_mips64)
468
469 // Set caches to default (for MIPS64 - 5kc)
470 *I1c = (cache_t) { 32768, 4, 32 };
471 *D1c = (cache_t) { 32768, 4, 32 };
472 *LLc = (cache_t) { 524288, 8, 32 };
473
474 #elif defined(VGA_x86) || defined(VGA_amd64)
475
476 *I1c = (cache_t) { 65536, 2, 64 };
477 *D1c = (cache_t) { 65536, 2, 64 };
478 *LLc = (cache_t) { 262144, 8, 64 };
479
480 #elif defined(VGA_tilegx)
481
482 // Set caches to default for Tilegx.
483 *I1c = (cache_t) { 0x8000, 2, 64 };
484 *D1c = (cache_t) { 0x8000, 2, 64 };
485 *LLc = (cache_t) { 0x40000, 8, 64 };
486
487 #else
488
489 #error "Unknown arch"
490
491 #endif
492
493 if (!all_caches_clo_defined) {
494 const HChar warning[] =
495 "Warning: Cannot auto-detect cache config, using defaults.\n"
496 " Run with -v to see.\n";
497 VG_(dmsg)("%s", warning);
498 }
499 }
500
501 /*--------------------------------------------------------------------*/
502 /*--- end ---*/
503 /*--------------------------------------------------------------------*/
504