/*--------------------------------------------------------------------*/
/*--- Cache simulation.                                            ---*/
/*---                                                        sim.c ---*/
/*--------------------------------------------------------------------*/

/*
   This file is part of Callgrind, a Valgrind tool for call graph
   profiling of programs.

   Copyright (C) 2003-2013, Josef Weidendorfer (Josef.Weidendorfer@gmx.de)

   This tool is derived from and contains code from Cachegrind
   Copyright (C) 2002-2013 Nicholas Nethercote (njn@valgrind.org)

   This program is free software; you can redistribute it and/or
   modify it under the terms of the GNU General Public License as
   published by the Free Software Foundation; either version 2 of the
   License, or (at your option) any later version.

   This program is distributed in the hope that it will be useful, but
   WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   General Public License for more details.

   You should have received a copy of the GNU General Public License
   along with this program; if not, write to the Free Software
   Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA
   02111-1307, USA.

   The GNU General Public License is contained in the file COPYING.
*/

#include "global.h"


/* Notes:
  - simulates a write-allocate cache
  - (block --> set) hash function uses simple bit selection
  - handling of references straddling two cache blocks:
      - counts as only one cache access (not two)
      - both blocks hit                  --> one hit
      - one block hits, the other misses --> one miss
      - both blocks miss                 --> one miss (not two)
*/
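
/* Worked example (added for clarity; addresses are illustrative): with
 * 64-byte lines, a 4-byte access at 0x103e touches bytes 0x103e-0x1041
 * and thus straddles the lines at 0x1000 and 0x1040. It is counted as
 * one access; if either (or both) of the two lines miss, the whole
 * access counts as exactly one miss. */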

/* Cache configuration */
#include "cg_arch.c"

/* Additional structures for cache use info, separated
 * according to usage frequency:
 * - line_loaded : pointer to the cost center of the instruction
 *                 which loaded the line into the cache.
 *                 Needed to increment counters when the line is evicted.
 * - line_use    : updated on every access
 */
typedef struct {
  UInt count;
  UInt mask; /* e.g. for a 64-byte line size: 1 bit / 2 bytes */
} line_use;

typedef struct {
  Addr memline, iaddr;
  line_use* dep_use; /* points to the higher-level cache block for this memline */
  ULong* use_base;
} line_loaded;

/* Cache state */
typedef struct {
   const HChar* name;
   int          size;                   /* bytes */
   int          assoc;
   int          line_size;              /* bytes */
   Bool         sectored;  /* prefetch nearside cacheline on read */
   int          sets;
   int          sets_min_1;
   int          line_size_bits;
   int          tag_shift;
   UWord        tag_mask;
   HChar        desc_line[128];
   UWord*       tags;

  /* for cache use */
   int          line_size_mask;
   int*         line_start_mask;
   int*         line_end_mask;
   line_loaded* loaded;
   line_use*    use;
} cache_t2;

/*
 * States of the flat caches in our model.
 * We use a 2-level hierarchy.
 */
static cache_t2 I1, D1, LL;

/* Lower bits of cache tags are used as flags for a cache line */
#define CACHELINE_FLAGMASK (MIN_LINE_SIZE-1)
#define CACHELINE_DIRTY    1
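
/* Note (added): in the write-back model, stored tags are line-aligned,
 * so the low log2(MIN_LINE_SIZE) bits of a tag entry are zero and free
 * to hold flags: "tag | CACHELINE_DIRTY" marks a line dirty, and
 * "set[i] & ~CACHELINE_DIRTY" recovers the tag for comparison (see the
 * write-back simulator below). */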


/* Cache simulator options */
static Bool clo_simulate_writeback = False;
static Bool clo_simulate_hwpref = False;
static Bool clo_simulate_sectors = False;
static Bool clo_collect_cacheuse = False;

/* The following global vars are set up beforehand by setup_bbcc():
 *
 * - Addr   CLG_(bb_base)     (instruction start address of original BB)
 * - ULong* CLG_(cost_base)   (start of cost array for BB)
 */

Addr   CLG_(bb_base);
ULong* CLG_(cost_base);

static InstrInfo* current_ii;

/* Cache use offsets */
/* The offsets are only correct because all per-instruction event sets
 * get the "Use" set added first!
 */
static Int off_I1_AcCost  = 0;
static Int off_I1_SpLoss  = 1;
static Int off_D1_AcCost  = 0;
static Int off_D1_SpLoss  = 1;
static Int off_LL_AcCost  = 2;
static Int off_LL_SpLoss  = 3;

/* Cache access types */
typedef enum { Read = 0, Write = CACHELINE_DIRTY } RefType;

/* Result of a reference into a flat cache */
typedef enum { Hit  = 0, Miss, MissDirty } CacheResult;

/* Result of a reference into a hierarchical cache model */
typedef enum {
    L1_Hit,
    LL_Hit,
    MemAccess,
    WriteBackMemAccess } CacheModelResult;

typedef CacheModelResult (*simcall_type)(Addr, UChar);

static struct {
    simcall_type I1_Read;
    simcall_type D1_Read;
    simcall_type D1_Write;
} simulator;

/*------------------------------------------------------------*/
/*--- Cache Simulator Initialization                       ---*/
/*------------------------------------------------------------*/

static void cachesim_clearcache(cache_t2* c)
{
  Int i;

  for (i = 0; i < c->sets * c->assoc; i++)
    c->tags[i] = 0;
  if (c->use) {
    for (i = 0; i < c->sets * c->assoc; i++) {
      c->loaded[i].memline  = 0;
      c->loaded[i].use_base = 0;
      c->loaded[i].dep_use = 0;
      c->loaded[i].iaddr = 0;
      c->use[i].mask    = 0;
      c->use[i].count   = 0;
      c->tags[i] = i % c->assoc; /* init lower bits as pointer */
    }
  }
}

static void cacheuse_initcache(cache_t2* c);

/* By this point, size/assoc/line_size have been checked. */
static void cachesim_initcache(cache_t config, cache_t2* c)
{
   c->size      = config.size;
   c->assoc     = config.assoc;
   c->line_size = config.line_size;
   c->sectored  = False; // FIXME

   c->sets           = (c->size / c->line_size) / c->assoc;
   c->sets_min_1     = c->sets - 1;
   c->line_size_bits = VG_(log2)(c->line_size);
   c->tag_shift      = c->line_size_bits + VG_(log2)(c->sets);
   c->tag_mask       = ~((1<<c->tag_shift)-1);

   /* Can bits in tag entries be used for flags?
    * This should always be true, as MIN_LINE_SIZE >= 16. */
   CLG_ASSERT( (c->tag_mask & CACHELINE_FLAGMASK) == 0);

   if (c->assoc == 1) {
      VG_(sprintf)(c->desc_line, "%d B, %d B, direct-mapped%s",
                   c->size, c->line_size,
                   c->sectored ? ", sectored":"");
   } else {
      VG_(sprintf)(c->desc_line, "%d B, %d B, %d-way associative%s",
                   c->size, c->line_size, c->assoc,
                   c->sectored ? ", sectored":"");
   }

   c->tags = (UWord*) CLG_MALLOC("cl.sim.cs_ic.1",
                                 sizeof(UWord) * c->sets * c->assoc);
   if (clo_collect_cacheuse)
       cacheuse_initcache(c);
   else
       c->use = 0;
   cachesim_clearcache(c);
}
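
/* Worked example (added; numbers are illustrative): for a 65536 B,
 * 2-way cache with 64 B lines this computes sets = (65536/64)/2 = 512,
 * line_size_bits = 6, tag_shift = 6 + log2(512) = 15, and
 * tag_mask = ~0x7fff. */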


#if 0
static void print_cache(cache_t2* c)
{
   UInt set, way, i;

   /* Note initialisation and update of 'i'. */
   for (i = 0, set = 0; set < c->sets; set++) {
      for (way = 0; way < c->assoc; way++, i++) {
         VG_(printf)("%8x ", c->tags[i]);
      }
      VG_(printf)("\n");
   }
}
#endif


/*------------------------------------------------------------*/
/*--- Simple Cache Simulation                              ---*/
/*------------------------------------------------------------*/

/*
 * Model: single inclusive, 2-level cache hierarchy (L1/LL)
 *        with write-allocate
 *
 * For simple cache hit/miss counts, we do not have to
 * maintain the dirty state of lines (no need to distinguish
 * read/write references), and the resulting counts are the
 * same for write-through and write-back caches.
 *
 * Simulator functions:
 *  CacheModelResult cachesim_I1_ref(Addr a, UChar size)
 *  CacheModelResult cachesim_D1_ref(Addr a, UChar size)
 */
__attribute__((always_inline))
static __inline__
CacheResult cachesim_setref(cache_t2* c, UInt set_no, UWord tag)
{
    int i, j;
    UWord *set;

    set = &(c->tags[set_no * c->assoc]);

    /* This loop is unrolled for just the first case, which is the most */
    /* common.  We can't unroll any further because it would screw up   */
    /* if we have a direct-mapped (1-way) cache.                        */
    if (tag == set[0])
        return Hit;

    /* If the tag is one other than the MRU, move it into the MRU spot  */
    /* and shuffle the rest down.                                       */
    for (i = 1; i < c->assoc; i++) {
        if (tag == set[i]) {
            for (j = i; j > 0; j--) {
                set[j] = set[j - 1];
            }
            set[0] = tag;
            return Hit;
        }
    }

    /* A miss;  install this tag as MRU, shuffle rest down. */
    for (j = c->assoc - 1; j > 0; j--) {
        set[j] = set[j - 1];
    }
    set[0] = tag;

    return Miss;
}
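
/* Design note (added): each set keeps its tags in MRU-to-LRU order, so
 * LRU replacement needs no timestamps: a hit is a move-to-front, and a
 * miss installs the new tag at the front, dropping the last entry. A
 * lookup costs at most 'assoc' tag compares. */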

__attribute__((always_inline))
static __inline__
CacheResult cachesim_ref(cache_t2* c, Addr a, UChar size)
{
    UWord block1 =  a         >> c->line_size_bits;
    UWord block2 = (a+size-1) >> c->line_size_bits;
    UInt  set1   = block1 & c->sets_min_1;
    /* the tag does not need to include bits specifying the set,
     * but it can, and this saves instructions */
    UWord tag1   = block1;

    /* Access entirely within line. */
    if (block1 == block2)
        return cachesim_setref(c, set1, tag1);

    /* Access straddles two lines. */
    else if (block1 + 1 == block2) {
        UInt  set2 = block2 & c->sets_min_1;
        UWord tag2 = block2;

        /* the call updates cache structures as side effect */
        CacheResult res1 = cachesim_setref(c, set1, tag1);
        CacheResult res2 = cachesim_setref(c, set2, tag2);
        return ((res1 == Miss) || (res2 == Miss)) ? Miss : Hit;

   } else {
       VG_(printf)("addr: %lx  size: %u  blocks: %ld %ld",
                   a, size, block1, block2);
       VG_(tool_panic)("item straddles more than two cache sets");
   }
   return Hit;
}

static
CacheModelResult cachesim_I1_ref(Addr a, UChar size)
{
    if ( cachesim_ref( &I1, a, size) == Hit ) return L1_Hit;
    if ( cachesim_ref( &LL, a, size) == Hit ) return LL_Hit;
    return MemAccess;
}

static
CacheModelResult cachesim_D1_ref(Addr a, UChar size)
{
    if ( cachesim_ref( &D1, a, size) == Hit ) return L1_Hit;
    if ( cachesim_ref( &LL, a, size) == Hit ) return LL_Hit;
    return MemAccess;
}


/*------------------------------------------------------------*/
/*--- Write-Back Cache Simulation                          ---*/
/*------------------------------------------------------------*/

/*
 * More complex model: write-through L1, write-back LL.
 * This needs to distinguish between read and write references.
 *
 * Simulator functions:
 *  CacheModelResult cachesim_I1_Read(Addr a, UChar size)
 *  CacheModelResult cachesim_D1_Read(Addr a, UChar size)
 *  CacheModelResult cachesim_D1_Write(Addr a, UChar size)
 */

/*
 * With write-back, the result can be a miss that evicts a dirty line.
 * The dirty state of a cache line is stored in bit 0 of the tag for
 * this cache line (CACHELINE_DIRTY = 1). By OR'ing in the reference
 * type (Read/Write), the line becomes dirty on a write.
 */
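
/* Example (added): since Read == 0 and Write == CACHELINE_DIRTY, the
 * "set[0] |= ref" below is a no-op for reads but sets the dirty bit for
 * writes; evicting an entry with bit 0 set yields MissDirty. */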
__attribute__((always_inline))
static __inline__
CacheResult cachesim_setref_wb(cache_t2* c, RefType ref, UInt set_no, UWord tag)
{
    int i, j;
    UWord *set, tmp_tag;

    set = &(c->tags[set_no * c->assoc]);

    /* This loop is unrolled for just the first case, which is the most */
    /* common.  We can't unroll any further because it would screw up   */
    /* if we have a direct-mapped (1-way) cache.                        */
    if (tag == (set[0] & ~CACHELINE_DIRTY)) {
        set[0] |= ref;
        return Hit;
    }
    /* If the tag is one other than the MRU, move it into the MRU spot  */
    /* and shuffle the rest down.                                       */
    for (i = 1; i < c->assoc; i++) {
        if (tag == (set[i] & ~CACHELINE_DIRTY)) {
            tmp_tag = set[i] | ref; // update dirty flag
            for (j = i; j > 0; j--) {
                set[j] = set[j - 1];
            }
            set[0] = tmp_tag;
            return Hit;
        }
    }

    /* A miss;  install this tag as MRU, shuffle rest down. */
    tmp_tag = set[c->assoc - 1];
    for (j = c->assoc - 1; j > 0; j--) {
        set[j] = set[j - 1];
    }
    set[0] = tag | ref;

    return (tmp_tag & CACHELINE_DIRTY) ? MissDirty : Miss;
}

__attribute__((always_inline))
static __inline__
CacheResult cachesim_ref_wb(cache_t2* c, RefType ref, Addr a, UChar size)
{
    UInt set1 = ( a         >> c->line_size_bits) & (c->sets_min_1);
    UInt set2 = ((a+size-1) >> c->line_size_bits) & (c->sets_min_1);
    UWord tag = a & c->tag_mask;

    /* Access entirely within line. */
    if (set1 == set2)
        return cachesim_setref_wb(c, ref, set1, tag);

    /* Access straddles two lines. */
    /* Nb: this is a fast way of doing ((set1+1) % c->sets) */
    else if (((set1 + 1) & (c->sets_min_1)) == set2) {
        UWord tag2  = (a+size-1) & c->tag_mask;

        /* the call updates cache structures as side effect */
        CacheResult res1 = cachesim_setref_wb(c, ref, set1, tag);
        CacheResult res2 = cachesim_setref_wb(c, ref, set2, tag2);

        if ((res1 == MissDirty) || (res2 == MissDirty)) return MissDirty;
        return ((res1 == Miss) || (res2 == Miss)) ? Miss : Hit;

   } else {
       VG_(printf)("addr: %lx  size: %u  sets: %d %d", a, size, set1, set2);
       VG_(tool_panic)("item straddles more than two cache sets");
   }
   return Hit;
}


static
CacheModelResult cachesim_I1_Read(Addr a, UChar size)
{
    if ( cachesim_ref( &I1, a, size) == Hit ) return L1_Hit;
    switch( cachesim_ref_wb( &LL, Read, a, size) ) {
        case Hit: return LL_Hit;
        case Miss: return MemAccess;
        default: break;
    }
    return WriteBackMemAccess;
}

static
CacheModelResult cachesim_D1_Read(Addr a, UChar size)
{
    if ( cachesim_ref( &D1, a, size) == Hit ) return L1_Hit;
    switch( cachesim_ref_wb( &LL, Read, a, size) ) {
        case Hit: return LL_Hit;
        case Miss: return MemAccess;
        default: break;
    }
    return WriteBackMemAccess;
}

static
CacheModelResult cachesim_D1_Write(Addr a, UChar size)
{
    if ( cachesim_ref( &D1, a, size) == Hit ) {
        /* Even for an L1 hit, the write-through L1 passes
         * the write on to the LL to make the LL line dirty.
         * But this causes no latency, so return the hit.
         */
        cachesim_ref_wb( &LL, Write, a, size);
        return L1_Hit;
    }
    switch( cachesim_ref_wb( &LL, Write, a, size) ) {
        case Hit: return LL_Hit;
        case Miss: return MemAccess;
        default: break;
    }
    return WriteBackMemAccess;
}


/*------------------------------------------------------------*/
/*--- Hardware Prefetch Simulation                         ---*/
/*------------------------------------------------------------*/

static ULong prefetch_up = 0;
static ULong prefetch_down = 0;

#define PF_STREAMS  8
#define PF_PAGEBITS 12

static UInt pf_lastblock[PF_STREAMS];
static Int  pf_seqblocks[PF_STREAMS];

static
void prefetch_clear(void)
{
  int i;
  for(i=0;i<PF_STREAMS;i++)
    pf_lastblock[i] = pf_seqblocks[i] = 0;
}

/*
 * HW prefetch emulation:
 * start prefetching when sequential access to 3 memory blocks
 * is detected. One stream can be detected per 4k page.
 */
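
/* Worked example (added; addresses are illustrative): with 64 B LL
 * lines, reads at 0x1000, 0x1040 and 0x1080 fall into one 4 KiB page,
 * i.e. one stream. The second access raises pf_seqblocks to 1, the
 * third to 2, which triggers a simulated prefetch of the line 5 lines
 * ahead (0x1080 + 5*64 = 0x11c0). */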
static __inline__
void prefetch_LL_doref(Addr a)
{
  UInt stream = (a >> PF_PAGEBITS) % PF_STREAMS;
  UInt block = ( a >> LL.line_size_bits);

  if (block != pf_lastblock[stream]) {
    if (pf_seqblocks[stream] == 0) {
      if (pf_lastblock[stream] +1 == block) pf_seqblocks[stream]++;
      else if (pf_lastblock[stream] -1 == block) pf_seqblocks[stream]--;
    }
    else if (pf_seqblocks[stream] >0) {
      if (pf_lastblock[stream] +1 == block) {
        pf_seqblocks[stream]++;
        if (pf_seqblocks[stream] >= 2) {
          prefetch_up++;
          cachesim_ref(&LL, a + 5 * LL.line_size,1);
        }
      }
      else pf_seqblocks[stream] = 0;
    }
    else if (pf_seqblocks[stream] <0) {
      if (pf_lastblock[stream] -1 == block) {
        pf_seqblocks[stream]--;
        if (pf_seqblocks[stream] <= -2) {
          prefetch_down++;
          cachesim_ref(&LL, a - 5 * LL.line_size,1);
        }
      }
      else pf_seqblocks[stream] = 0;
    }
    pf_lastblock[stream] = block;
  }
}

/* simple model with hardware prefetch */

static
CacheModelResult prefetch_I1_ref(Addr a, UChar size)
{
    if ( cachesim_ref( &I1, a, size) == Hit ) return L1_Hit;
    prefetch_LL_doref(a);
    if ( cachesim_ref( &LL, a, size) == Hit ) return LL_Hit;
    return MemAccess;
}

static
CacheModelResult prefetch_D1_ref(Addr a, UChar size)
{
    if ( cachesim_ref( &D1, a, size) == Hit ) return L1_Hit;
    prefetch_LL_doref(a);
    if ( cachesim_ref( &LL, a, size) == Hit ) return LL_Hit;
    return MemAccess;
}


/* complex model with hardware prefetch */

static
CacheModelResult prefetch_I1_Read(Addr a, UChar size)
{
    if ( cachesim_ref( &I1, a, size) == Hit ) return L1_Hit;
    prefetch_LL_doref(a);
    switch( cachesim_ref_wb( &LL, Read, a, size) ) {
        case Hit: return LL_Hit;
        case Miss: return MemAccess;
        default: break;
    }
    return WriteBackMemAccess;
}

static
CacheModelResult prefetch_D1_Read(Addr a, UChar size)
{
    if ( cachesim_ref( &D1, a, size) == Hit ) return L1_Hit;
    prefetch_LL_doref(a);
    switch( cachesim_ref_wb( &LL, Read, a, size) ) {
        case Hit: return LL_Hit;
        case Miss: return MemAccess;
        default: break;
    }
    return WriteBackMemAccess;
}

static
CacheModelResult prefetch_D1_Write(Addr a, UChar size)
{
    prefetch_LL_doref(a);
    if ( cachesim_ref( &D1, a, size) == Hit ) {
        /* Even for an L1 hit, the write-through L1 passes
         * the write on to the LL to make the LL line dirty.
         * But this causes no latency, so return the hit.
         */
        cachesim_ref_wb( &LL, Write, a, size);
        return L1_Hit;
    }
    switch( cachesim_ref_wb( &LL, Write, a, size) ) {
        case Hit: return LL_Hit;
        case Miss: return MemAccess;
        default: break;
    }
    return WriteBackMemAccess;
}


/*------------------------------------------------------------*/
/*--- Cache Simulation with use metric collection          ---*/
/*------------------------------------------------------------*/

/* cannot be combined with write-back or prefetch */

static
void cacheuse_initcache(cache_t2* c)
{
    int i;
    unsigned int start_mask, start_val;
    unsigned int end_mask, end_val;

    c->use    = CLG_MALLOC("cl.sim.cu_ic.1",
                           sizeof(line_use) * c->sets * c->assoc);
    c->loaded = CLG_MALLOC("cl.sim.cu_ic.2",
                           sizeof(line_loaded) * c->sets * c->assoc);
    c->line_start_mask = CLG_MALLOC("cl.sim.cu_ic.3",
                                    sizeof(int) * c->line_size);
    c->line_end_mask = CLG_MALLOC("cl.sim.cu_ic.4",
                                  sizeof(int) * c->line_size);

    c->line_size_mask = c->line_size-1;

    /* Meaning of line_start_mask/line_end_mask
     * Example: for a given cache line, you get an access starting at
     * byte offset 5 with length 4, i.e. bytes 5 - 8 are touched. For a
     * cache line size of 32, you have 1 bit per byte in the mask:
     *
     *   bit31   bit8 bit5  bit 0
     *       |      |  |    |
     *       11..111111100000   line_start_mask[5]
     *       00..000111111111   line_end_mask[(5+4)-1]
     *
     *  use_mask |= line_start_mask[5] & line_end_mask[8]
     *
     */
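
    /* Example (added) for line_size >= 32, e.g. 64-byte lines with one
     * mask bit per two bytes: an access to bytes 5..8 yields
     * line_start_mask[5] & line_end_mask[8] = bits 2..4, i.e. bytes
     * 4..9 -- the per-2-byte granularity slightly over-approximates the
     * touched range. */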
    start_val = end_val = ~0;
    if (c->line_size < 32) {
        int bits_per_byte = 32/c->line_size;
        start_mask = (1<<bits_per_byte)-1;
        end_mask   = start_mask << (32-bits_per_byte);
        for(i=0;i<c->line_size;i++) {
            c->line_start_mask[i] = start_val;
            start_val  = start_val & ~start_mask;
            start_mask = start_mask << bits_per_byte;

            c->line_end_mask[c->line_size-i-1] = end_val;
            end_val  = end_val & ~end_mask;
            end_mask = end_mask >> bits_per_byte;
        }
    }
    else {
        int bytes_per_bit = c->line_size/32;
        start_mask = 1;
        end_mask   = 1 << 31;
        for(i=0;i<c->line_size;i++) {
            c->line_start_mask[i] = start_val;
            c->line_end_mask[c->line_size-i-1] = end_val;
            if ( ((i+1)%bytes_per_bit) == 0) {
                start_val   &= ~start_mask;
                end_val     &= ~end_mask;
                start_mask <<= 1;
                end_mask   >>= 1;
            }
        }
    }

    CLG_DEBUG(6, "Config %s:\n", c->desc_line);
    for(i=0;i<c->line_size;i++) {
        CLG_DEBUG(6, " [%2d]: start mask %8x, end mask %8x\n",
                  i, c->line_start_mask[i], c->line_end_mask[i]);
    }

    /* We use lower tag bits as offset pointers to cache use info,
     * i.e. some cache configurations are not supported.
     */
    if ( (1<<c->tag_shift) < c->assoc) {
        VG_(message)(Vg_DebugMsg,
                     "error: Use associativity < %d for cache use statistics!\n",
                     (1<<c->tag_shift) );
        VG_(tool_panic)("Unsupported cache configuration");
    }
}


/* for I1/D1 caches */
#define CACHEUSE(L)                                                         \
                                                                            \
static CacheModelResult cacheuse##_##L##_doRead(Addr a, UChar size)         \
{                                                                           \
   UInt set1 = ( a         >> L.line_size_bits) & (L.sets_min_1);           \
   UInt set2 = ((a+size-1) >> L.line_size_bits) & (L.sets_min_1);           \
   UWord tag  = a & L.tag_mask;                                             \
   UWord tag2;                                                              \
   int i, j, idx;                                                           \
   UWord *set, tmp_tag;                                                     \
   UInt use_mask;                                                           \
                                                                            \
   CLG_DEBUG(6,"%s.Acc(Addr %#lx, size %d): Sets [%d/%d]\n",                \
            L.name, a, size, set1, set2);                                   \
                                                                            \
   /* First case: word entirely within line. */                             \
   if (set1 == set2) {                                                      \
                                                                            \
      set = &(L.tags[set1 * L.assoc]);                                      \
      use_mask = L.line_start_mask[a & L.line_size_mask] &                  \
                 L.line_end_mask[(a+size-1) & L.line_size_mask];            \
                                                                            \
      /* This loop is unrolled for just the first case, which is the most */\
      /* common.  We can't unroll any further because it would screw up   */\
      /* if we have a direct-mapped (1-way) cache.                        */\
      if (tag == (set[0] & L.tag_mask)) {                                   \
        idx = (set1 * L.assoc) + (set[0] & ~L.tag_mask);                    \
        L.use[idx].count ++;                                                \
        L.use[idx].mask |= use_mask;                                        \
        CLG_DEBUG(6," Hit0 [idx %d] (line %#lx from %#lx): %x => %08x, count %d\n",\
                 idx, L.loaded[idx].memline,  L.loaded[idx].iaddr,          \
                 use_mask, L.use[idx].mask, L.use[idx].count);              \
        return L1_Hit;                                                      \
      }                                                                     \
      /* If the tag is one other than the MRU, move it into the MRU spot  */\
      /* and shuffle the rest down.                                       */\
      for (i = 1; i < L.assoc; i++) {                                       \
         if (tag == (set[i] & L.tag_mask)) {                                \
            tmp_tag = set[i];                                               \
            for (j = i; j > 0; j--) {                                       \
               set[j] = set[j - 1];                                         \
            }                                                               \
            set[0] = tmp_tag;                                               \
            idx = (set1 * L.assoc) + (tmp_tag & ~L.tag_mask);               \
            L.use[idx].count ++;                                            \
            L.use[idx].mask |= use_mask;                                    \
        CLG_DEBUG(6," Hit%d [idx %d] (line %#lx from %#lx): %x => %08x, count %d\n",\
                 i, idx, L.loaded[idx].memline,  L.loaded[idx].iaddr,       \
                 use_mask, L.use[idx].mask, L.use[idx].count);              \
            return L1_Hit;                                                  \
         }                                                                  \
      }                                                                     \
                                                                            \
      /* A miss;  install this tag as MRU, shuffle rest down. */            \
      tmp_tag = set[L.assoc - 1] & ~L.tag_mask;                             \
      for (j = L.assoc - 1; j > 0; j--) {                                   \
         set[j] = set[j - 1];                                               \
      }                                                                     \
      set[0] = tag | tmp_tag;                                               \
      idx = (set1 * L.assoc) + tmp_tag;                                     \
      return update_##L##_use(&L, idx,                                      \
                       use_mask, a &~ L.line_size_mask);                    \
                                                                            \
   /* Second case: word straddles two lines. */                             \
   /* Nb: this is a fast way of doing ((set1+1) % L.sets) */                \
   } else if (((set1 + 1) & (L.sets_min_1)) == set2) {                      \
      Int miss1=0, miss2=0; /* 0: L1 hit, 1: L1 miss, 2: LL miss */         \
      set = &(L.tags[set1 * L.assoc]);                                      \
      use_mask = L.line_start_mask[a & L.line_size_mask];                   \
      if (tag == (set[0] & L.tag_mask)) {                                   \
         idx = (set1 * L.assoc) + (set[0] & ~L.tag_mask);                   \
         L.use[idx].count ++;                                               \
         L.use[idx].mask |= use_mask;                                       \
        CLG_DEBUG(6," Hit0 [idx %d] (line %#lx from %#lx): %x => %08x, count %d\n",\
                 idx, L.loaded[idx].memline,  L.loaded[idx].iaddr,          \
                 use_mask, L.use[idx].mask, L.use[idx].count);              \
         goto block2;                                                       \
      }                                                                     \
      for (i = 1; i < L.assoc; i++) {                                       \
         if (tag == (set[i] & L.tag_mask)) {                                \
            tmp_tag = set[i];                                               \
            for (j = i; j > 0; j--) {                                       \
               set[j] = set[j - 1];                                         \
            }                                                               \
            set[0] = tmp_tag;                                               \
            idx = (set1 * L.assoc) + (tmp_tag & ~L.tag_mask);               \
            L.use[idx].count ++;                                            \
            L.use[idx].mask |= use_mask;                                    \
        CLG_DEBUG(6," Hit%d [idx %d] (line %#lx from %#lx): %x => %08x, count %d\n",\
                 i, idx, L.loaded[idx].memline,  L.loaded[idx].iaddr,       \
                 use_mask, L.use[idx].mask, L.use[idx].count);              \
            goto block2;                                                    \
         }                                                                  \
      }                                                                     \
      tmp_tag = set[L.assoc - 1] & ~L.tag_mask;                             \
      for (j = L.assoc - 1; j > 0; j--) {                                   \
         set[j] = set[j - 1];                                               \
      }                                                                     \
      set[0] = tag | tmp_tag;                                               \
      idx = (set1 * L.assoc) + tmp_tag;                                     \
      miss1 = update_##L##_use(&L, idx,                                     \
                       use_mask, a &~ L.line_size_mask);                    \
block2:                                                                     \
      set = &(L.tags[set2 * L.assoc]);                                      \
      use_mask = L.line_end_mask[(a+size-1) & L.line_size_mask];            \
      tag2  = (a+size-1) & L.tag_mask;                                      \
      if (tag2 == (set[0] & L.tag_mask)) {                                  \
         idx = (set2 * L.assoc) + (set[0] & ~L.tag_mask);                   \
         L.use[idx].count ++;                                               \
         L.use[idx].mask |= use_mask;                                       \
        CLG_DEBUG(6," Hit0 [idx %d] (line %#lx from %#lx): %x => %08x, count %d\n",\
                 idx, L.loaded[idx].memline,  L.loaded[idx].iaddr,          \
                 use_mask, L.use[idx].mask, L.use[idx].count);              \
         return miss1;                                                      \
      }                                                                     \
      for (i = 1; i < L.assoc; i++) {                                       \
         if (tag2 == (set[i] & L.tag_mask)) {                               \
            tmp_tag = set[i];                                               \
            for (j = i; j > 0; j--) {                                       \
               set[j] = set[j - 1];                                         \
            }                                                               \
            set[0] = tmp_tag;                                               \
            idx = (set2 * L.assoc) + (tmp_tag & ~L.tag_mask);               \
            L.use[idx].count ++;                                            \
            L.use[idx].mask |= use_mask;                                    \
        CLG_DEBUG(6," Hit%d [idx %d] (line %#lx from %#lx): %x => %08x, count %d\n",\
                 i, idx, L.loaded[idx].memline,  L.loaded[idx].iaddr,       \
                 use_mask, L.use[idx].mask, L.use[idx].count);              \
            return miss1;                                                   \
         }                                                                  \
      }                                                                     \
      tmp_tag = set[L.assoc - 1] & ~L.tag_mask;                             \
      for (j = L.assoc - 1; j > 0; j--) {                                   \
         set[j] = set[j - 1];                                               \
      }                                                                     \
      set[0] = tag2 | tmp_tag;                                              \
      idx = (set2 * L.assoc) + tmp_tag;                                     \
      miss2 = update_##L##_use(&L, idx,                                     \
                       use_mask, (a+size-1) &~ L.line_size_mask);           \
      return (miss1==MemAccess || miss2==MemAccess) ? MemAccess:LL_Hit;     \
                                                                            \
   } else {                                                                 \
       VG_(printf)("addr: %#lx  size: %u  sets: %d %d", a, size, set1, set2); \
       VG_(tool_panic)("item straddles more than two cache sets");          \
   }                                                                        \
   return 0;                                                                \
}


/* logarithmic bitcounting algorithm, see
 * http://graphics.stanford.edu/~seander/bithacks.html
 */
static __inline__ unsigned int countBits(unsigned int bits)
{
  unsigned int c; // store the total here
  const int S[] = {1, 2, 4, 8, 16}; // Magic Binary Numbers
  const int B[] = {0x55555555, 0x33333333, 0x0F0F0F0F, 0x00FF00FF, 0x0000FFFF};

  c = bits;
  c = ((c >> S[0]) & B[0]) + (c & B[0]);
  c = ((c >> S[1]) & B[1]) + (c & B[1]);
  c = ((c >> S[2]) & B[2]) + (c & B[2]);
  c = ((c >> S[3]) & B[3]) + (c & B[3]);
  c = ((c >> S[4]) & B[4]) + (c & B[4]);
  return c;
}
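
/* Worked example (added): countBits(0xF0): step 1 sums adjacent bits,
 * 0xF0 -> 0xA0 (bit pairs 10,10,00,00 = counts 2,2,0,0); step 2 sums
 * pairs into nibbles, 0xA0 -> 0x40 (count 4); the remaining steps move
 * the total, 4, into the low byte. */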

static void update_LL_use(int idx, Addr memline)
{
  line_loaded* loaded = &(LL.loaded[idx]);
  line_use* use = &(LL.use[idx]);
  int i = ((32 - countBits(use->mask)) * LL.line_size)>>5;

  CLG_DEBUG(2, " LL.miss [%d]: at %#lx accessing memline %#lx\n",
           idx, CLG_(bb_base) + current_ii->instr_offset, memline);
  if (use->count>0) {
    CLG_DEBUG(2, "   old: used %d, loss bits %d (%08x) [line %#lx from %#lx]\n",
             use->count, i, use->mask, loaded->memline, loaded->iaddr);
    CLG_DEBUG(2, "   collect: %d, use_base %p\n",
             CLG_(current_state).collect, loaded->use_base);

    if (CLG_(current_state).collect && loaded->use_base) {
      (loaded->use_base)[off_LL_AcCost] += 1000 / use->count;
      (loaded->use_base)[off_LL_SpLoss] += i;
    }
  }

  use->count = 0;
  use->mask  = 0;

  loaded->memline = memline;
  loaded->iaddr   = CLG_(bb_base) + current_ii->instr_offset;
  loaded->use_base = (CLG_(current_state).nonskipped) ?
    CLG_(current_state).nonskipped->skipped :
    CLG_(cost_base) + current_ii->cost_offset;
}
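
/* Example (added): "i" above converts unset use-mask bits into unused
 * bytes. With 64-byte LL lines (one mask bit per two bytes, 32 bits
 * total), a line evicted with 16 mask bits set gives
 * i = ((32-16)*64)>>5 = 32 bytes that were loaded but never accessed,
 * charged as spatial loss to the instruction that loaded the line. */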

static
CacheModelResult cacheuse_LL_access(Addr memline, line_loaded* l1_loaded)
{
   UInt setNo = (memline >> LL.line_size_bits) & (LL.sets_min_1);
   UWord* set = &(LL.tags[setNo * LL.assoc]);
   UWord tag  = memline & LL.tag_mask;

   int i, j, idx;
   UWord tmp_tag;

   CLG_DEBUG(6,"LL.Acc(Memline %#lx): Set %d\n", memline, setNo);

   if (tag == (set[0] & LL.tag_mask)) {
     idx = (setNo * LL.assoc) + (set[0] & ~LL.tag_mask);
     l1_loaded->dep_use = &(LL.use[idx]);

     CLG_DEBUG(6," Hit0 [idx %d] (line %#lx from %#lx): => %08x, count %d\n",
                 idx, LL.loaded[idx].memline,  LL.loaded[idx].iaddr,
                 LL.use[idx].mask, LL.use[idx].count);
     return LL_Hit;
   }
   for (i = 1; i < LL.assoc; i++) {
     if (tag == (set[i] & LL.tag_mask)) {
       tmp_tag = set[i];
       for (j = i; j > 0; j--) {
         set[j] = set[j - 1];
       }
       set[0] = tmp_tag;
       idx = (setNo * LL.assoc) + (tmp_tag & ~LL.tag_mask);
       l1_loaded->dep_use = &(LL.use[idx]);

       CLG_DEBUG(6," Hit%d [idx %d] (line %#lx from %#lx): => %08x, count %d\n",
                 i, idx, LL.loaded[idx].memline,  LL.loaded[idx].iaddr,
                 LL.use[idx].mask, LL.use[idx].count);
       return LL_Hit;
     }
   }

   /* A miss;  install this tag as MRU, shuffle rest down. */
   tmp_tag = set[LL.assoc - 1] & ~LL.tag_mask;
   for (j = LL.assoc - 1; j > 0; j--) {
     set[j] = set[j - 1];
   }
   set[0] = tag | tmp_tag;
   idx = (setNo * LL.assoc) + tmp_tag;
   l1_loaded->dep_use = &(LL.use[idx]);

   update_LL_use(idx, memline);

   return MemAccess;
}




#define UPDATE_USE(L)                                                \
                                                                     \
static CacheModelResult update##_##L##_use(cache_t2* cache, int idx, \
                               UInt mask, Addr memline)              \
{                                                                    \
  line_loaded* loaded = &(cache->loaded[idx]);                       \
  line_use* use = &(cache->use[idx]);                                \
  int c = ((32 - countBits(use->mask)) * cache->line_size)>>5;       \
                                                                     \
  CLG_DEBUG(2, " %s.miss [%d]: at %#lx accessing memline %#lx (mask %08x)\n", \
           cache->name, idx, CLG_(bb_base) + current_ii->instr_offset, memline, mask); \
  if (use->count>0) {                                                \
    CLG_DEBUG(2, "   old: used %d, loss bits %d (%08x) [line %#lx from %#lx]\n",\
             use->count, c, use->mask, loaded->memline, loaded->iaddr); \
    CLG_DEBUG(2, "   collect: %d, use_base %p\n",                    \
             CLG_(current_state).collect, loaded->use_base);         \
                                                                     \
    if (CLG_(current_state).collect && loaded->use_base) {           \
      (loaded->use_base)[off_##L##_AcCost] += 1000 / use->count;     \
      (loaded->use_base)[off_##L##_SpLoss] += c;                     \
                                                                     \
      /* FIXME (?): L1/LL line sizes must be equal! */               \
      loaded->dep_use->mask |= use->mask;                            \
      loaded->dep_use->count += use->count;                          \
    }                                                                \
  }                                                                  \
                                                                     \
  use->count = 1;                                                    \
  use->mask  = mask;                                                 \
  loaded->memline = memline;                                         \
  loaded->iaddr   = CLG_(bb_base) + current_ii->instr_offset;        \
  loaded->use_base = (CLG_(current_state).nonskipped) ?              \
    CLG_(current_state).nonskipped->skipped :                        \
    CLG_(cost_base) + current_ii->cost_offset;                       \
                                                                     \
  if (memline == 0) return LL_Hit;                                   \
  return cacheuse_LL_access(memline, loaded);                        \
}

UPDATE_USE(I1);
UPDATE_USE(D1);

CACHEUSE(I1);
CACHEUSE(D1);
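
/* Note (added): the macro invocations above expand to update_I1_use /
 * update_D1_use and cacheuse_I1_doRead / cacheuse_D1_doRead; the latter
 * two are installed as the simulator handlers when --cacheuse=yes is
 * given (see cachesim_post_clo_init below). */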


static
void cacheuse_finish(void)
{
  int i;
  InstrInfo ii = { 0,0,0,0 };

  if (!CLG_(current_state).collect) return;

  CLG_(bb_base) = 0;
  current_ii = &ii; /* needs to be set for update_XX_use */
  CLG_(cost_base) = 0;

  /* update usage counters */
  if (I1.use)
    for (i = 0; i < I1.sets * I1.assoc; i++)
      if (I1.loaded[i].use_base)
        update_I1_use( &I1, i, 0,0);

  if (D1.use)
    for (i = 0; i < D1.sets * D1.assoc; i++)
      if (D1.loaded[i].use_base)
        update_D1_use( &D1, i, 0,0);

  if (LL.use)
    for (i = 0; i < LL.sets * LL.assoc; i++)
      if (LL.loaded[i].use_base)
        update_LL_use(i, 0);

  current_ii = 0;
}



/*------------------------------------------------------------*/
/*--- Helper functions called by instrumented code         ---*/
/*------------------------------------------------------------*/


static __inline__
void inc_costs(CacheModelResult r, ULong* c1, ULong* c2)
{
    switch(r) {
        case WriteBackMemAccess:
            if (clo_simulate_writeback) {
                c1[3]++;
                c2[3]++;
            }
            // fall through

        case MemAccess:
            c1[2]++;
            c2[2]++;
            // fall through

        case LL_Hit:
            c1[1]++;
            c2[1]++;
            // fall through

        default:
            c1[0]++;
            c2[0]++;
    }
}
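
/* Example (added): the deliberate fall-through above accumulates counts
 * hierarchically: an LL miss (MemAccess) increments c[2] (LL misses),
 * c[1] (L1 misses) and c[0] (accesses) in both cost arrays, while an L1
 * hit increments only c[0]. */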

static
const HChar* cacheRes(CacheModelResult r)
{
    switch(r) {
    case L1_Hit:    return "L1 Hit ";
    case LL_Hit:    return "LL Hit ";
    case MemAccess: return "LL Miss";
    case WriteBackMemAccess: return "LL Miss (dirty)";
    default:
        tl_assert(0);
    }
    return "??";
}

VG_REGPARM(1)
static void log_1I0D(InstrInfo* ii)
{
    CacheModelResult IrRes;

    current_ii = ii;
    IrRes = (*simulator.I1_Read)(CLG_(bb_base) + ii->instr_offset, ii->instr_size);

    CLG_DEBUG(6, "log_1I0D:  Ir  %#lx/%u => %s\n",
              CLG_(bb_base) + ii->instr_offset, ii->instr_size, cacheRes(IrRes));

    if (CLG_(current_state).collect) {
        ULong* cost_Ir;

        if (CLG_(current_state).nonskipped)
            cost_Ir = CLG_(current_state).nonskipped->skipped + fullOffset(EG_IR);
        else
            cost_Ir = CLG_(cost_base) + ii->cost_offset + ii->eventset->offset[EG_IR];

        inc_costs(IrRes, cost_Ir,
                  CLG_(current_state).cost + fullOffset(EG_IR) );
    }
}

VG_REGPARM(2)
static void log_2I0D(InstrInfo* ii1, InstrInfo* ii2)
{
    CacheModelResult Ir1Res, Ir2Res;
    ULong *global_cost_Ir;

    current_ii = ii1;
    Ir1Res = (*simulator.I1_Read)(CLG_(bb_base) + ii1->instr_offset, ii1->instr_size);
    current_ii = ii2;
    Ir2Res = (*simulator.I1_Read)(CLG_(bb_base) + ii2->instr_offset, ii2->instr_size);

    CLG_DEBUG(6, "log_2I0D:  Ir1 %#lx/%u => %s, Ir2 %#lx/%u => %s\n",
              CLG_(bb_base) + ii1->instr_offset, ii1->instr_size, cacheRes(Ir1Res),
              CLG_(bb_base) + ii2->instr_offset, ii2->instr_size, cacheRes(Ir2Res) );

    if (!CLG_(current_state).collect) return;

    global_cost_Ir = CLG_(current_state).cost + fullOffset(EG_IR);
    if (CLG_(current_state).nonskipped) {
        ULong* skipped_cost_Ir =
            CLG_(current_state).nonskipped->skipped + fullOffset(EG_IR);

        inc_costs(Ir1Res, global_cost_Ir, skipped_cost_Ir);
        inc_costs(Ir2Res, global_cost_Ir, skipped_cost_Ir);
        return;
    }

    inc_costs(Ir1Res, global_cost_Ir,
              CLG_(cost_base) + ii1->cost_offset + ii1->eventset->offset[EG_IR]);
    inc_costs(Ir2Res, global_cost_Ir,
              CLG_(cost_base) + ii2->cost_offset + ii2->eventset->offset[EG_IR]);
}

VG_REGPARM(3)
static void log_3I0D(InstrInfo* ii1, InstrInfo* ii2, InstrInfo* ii3)
{
    CacheModelResult Ir1Res, Ir2Res, Ir3Res;
    ULong *global_cost_Ir;

    current_ii = ii1;
    Ir1Res = (*simulator.I1_Read)(CLG_(bb_base) + ii1->instr_offset, ii1->instr_size);
    current_ii = ii2;
    Ir2Res = (*simulator.I1_Read)(CLG_(bb_base) + ii2->instr_offset, ii2->instr_size);
    current_ii = ii3;
    Ir3Res = (*simulator.I1_Read)(CLG_(bb_base) + ii3->instr_offset, ii3->instr_size);

    CLG_DEBUG(6, "log_3I0D:  Ir1 %#lx/%u => %s, Ir2 %#lx/%u => %s, Ir3 %#lx/%u => %s\n",
              CLG_(bb_base) + ii1->instr_offset, ii1->instr_size, cacheRes(Ir1Res),
              CLG_(bb_base) + ii2->instr_offset, ii2->instr_size, cacheRes(Ir2Res),
              CLG_(bb_base) + ii3->instr_offset, ii3->instr_size, cacheRes(Ir3Res) );

    if (!CLG_(current_state).collect) return;

    global_cost_Ir = CLG_(current_state).cost + fullOffset(EG_IR);
    if (CLG_(current_state).nonskipped) {
        ULong* skipped_cost_Ir =
            CLG_(current_state).nonskipped->skipped + fullOffset(EG_IR);
        inc_costs(Ir1Res, global_cost_Ir, skipped_cost_Ir);
        inc_costs(Ir2Res, global_cost_Ir, skipped_cost_Ir);
        inc_costs(Ir3Res, global_cost_Ir, skipped_cost_Ir);
        return;
    }

    inc_costs(Ir1Res, global_cost_Ir,
              CLG_(cost_base) + ii1->cost_offset + ii1->eventset->offset[EG_IR]);
    inc_costs(Ir2Res, global_cost_Ir,
              CLG_(cost_base) + ii2->cost_offset + ii2->eventset->offset[EG_IR]);
    inc_costs(Ir3Res, global_cost_Ir,
              CLG_(cost_base) + ii3->cost_offset + ii3->eventset->offset[EG_IR]);
}

/* Instruction doing a read access */

VG_REGPARM(3)
static void log_1I1Dr(InstrInfo* ii, Addr data_addr, Word data_size)
{
    CacheModelResult IrRes, DrRes;

    current_ii = ii;
    IrRes = (*simulator.I1_Read)(CLG_(bb_base) + ii->instr_offset, ii->instr_size);
    DrRes = (*simulator.D1_Read)(data_addr, data_size);

    CLG_DEBUG(6, "log_1I1Dr: Ir  %#lx/%u => %s, Dr  %#lx/%lu => %s\n",
              CLG_(bb_base) + ii->instr_offset, ii->instr_size, cacheRes(IrRes),
              data_addr, data_size, cacheRes(DrRes));

    if (CLG_(current_state).collect) {
        ULong *cost_Ir, *cost_Dr;

        if (CLG_(current_state).nonskipped) {
            cost_Ir = CLG_(current_state).nonskipped->skipped + fullOffset(EG_IR);
            cost_Dr = CLG_(current_state).nonskipped->skipped + fullOffset(EG_DR);
        }
        else {
            cost_Ir = CLG_(cost_base) + ii->cost_offset + ii->eventset->offset[EG_IR];
            cost_Dr = CLG_(cost_base) + ii->cost_offset + ii->eventset->offset[EG_DR];
        }

        inc_costs(IrRes, cost_Ir,
                  CLG_(current_state).cost + fullOffset(EG_IR) );
        inc_costs(DrRes, cost_Dr,
                  CLG_(current_state).cost + fullOffset(EG_DR) );
    }
}


/* Note that addEvent_D_guarded assumes that log_0I1Dr and log_0I1Dw
   have exactly the same prototype.  If you change them, you must
   change addEvent_D_guarded too. */
VG_REGPARM(3)
static void log_0I1Dr(InstrInfo* ii, Addr data_addr, Word data_size)
{
    CacheModelResult DrRes;

    current_ii = ii;
    DrRes = (*simulator.D1_Read)(data_addr, data_size);

    CLG_DEBUG(6, "log_0I1Dr: Dr  %#lx/%lu => %s\n",
              data_addr, data_size, cacheRes(DrRes));

    if (CLG_(current_state).collect) {
        ULong *cost_Dr;

        if (CLG_(current_state).nonskipped)
            cost_Dr = CLG_(current_state).nonskipped->skipped + fullOffset(EG_DR);
        else
            cost_Dr = CLG_(cost_base) + ii->cost_offset + ii->eventset->offset[EG_DR];

        inc_costs(DrRes, cost_Dr,
                  CLG_(current_state).cost + fullOffset(EG_DR) );
    }
}


/* Instruction doing a write access */

VG_REGPARM(3)
static void log_1I1Dw(InstrInfo* ii, Addr data_addr, Word data_size)
{
    CacheModelResult IrRes, DwRes;

    current_ii = ii;
    IrRes = (*simulator.I1_Read)(CLG_(bb_base) + ii->instr_offset, ii->instr_size);
    DwRes = (*simulator.D1_Write)(data_addr, data_size);

    CLG_DEBUG(6, "log_1I1Dw: Ir  %#lx/%u => %s, Dw  %#lx/%lu => %s\n",
              CLG_(bb_base) + ii->instr_offset, ii->instr_size, cacheRes(IrRes),
              data_addr, data_size, cacheRes(DwRes));

    if (CLG_(current_state).collect) {
        ULong *cost_Ir, *cost_Dw;

        if (CLG_(current_state).nonskipped) {
            cost_Ir = CLG_(current_state).nonskipped->skipped + fullOffset(EG_IR);
            cost_Dw = CLG_(current_state).nonskipped->skipped + fullOffset(EG_DW);
        }
        else {
            cost_Ir = CLG_(cost_base) + ii->cost_offset + ii->eventset->offset[EG_IR];
            cost_Dw = CLG_(cost_base) + ii->cost_offset + ii->eventset->offset[EG_DW];
        }

        inc_costs(IrRes, cost_Ir,
                  CLG_(current_state).cost + fullOffset(EG_IR) );
        inc_costs(DwRes, cost_Dw,
                  CLG_(current_state).cost + fullOffset(EG_DW) );
    }
}

/* See comment on log_0I1Dr. */
VG_REGPARM(3)
static void log_0I1Dw(InstrInfo* ii, Addr data_addr, Word data_size)
{
    CacheModelResult DwRes;

    current_ii = ii;
    DwRes = (*simulator.D1_Write)(data_addr, data_size);

    CLG_DEBUG(6, "log_0I1Dw: Dw  %#lx/%lu => %s\n",
              data_addr, data_size, cacheRes(DwRes));

    if (CLG_(current_state).collect) {
        ULong *cost_Dw;

        if (CLG_(current_state).nonskipped)
            cost_Dw = CLG_(current_state).nonskipped->skipped + fullOffset(EG_DW);
        else
            cost_Dw = CLG_(cost_base) + ii->cost_offset + ii->eventset->offset[EG_DW];

        inc_costs(DwRes, cost_Dw,
                  CLG_(current_state).cost + fullOffset(EG_DW) );
    }
}



/*------------------------------------------------------------*/
/*--- Cache configuration                                  ---*/
/*------------------------------------------------------------*/

static cache_t clo_I1_cache = UNDEFINED_CACHE;
static cache_t clo_D1_cache = UNDEFINED_CACHE;
static cache_t clo_LL_cache = UNDEFINED_CACHE;

/* Initialize and clear simulator state */
static void cachesim_post_clo_init(void)
{
  /* Cache configurations. */
  cache_t  I1c, D1c, LLc;

  /* Initialize access handlers */
  if (!CLG_(clo).simulate_cache) {
    CLG_(cachesim).log_1I0D  = 0;
    CLG_(cachesim).log_1I0D_name = "(no function)";
    CLG_(cachesim).log_2I0D  = 0;
    CLG_(cachesim).log_2I0D_name = "(no function)";
    CLG_(cachesim).log_3I0D  = 0;
    CLG_(cachesim).log_3I0D_name = "(no function)";

    CLG_(cachesim).log_1I1Dr = 0;
    CLG_(cachesim).log_1I1Dr_name = "(no function)";
    CLG_(cachesim).log_1I1Dw = 0;
    CLG_(cachesim).log_1I1Dw_name = "(no function)";

    CLG_(cachesim).log_0I1Dr = 0;
    CLG_(cachesim).log_0I1Dr_name = "(no function)";
    CLG_(cachesim).log_0I1Dw = 0;
    CLG_(cachesim).log_0I1Dw_name = "(no function)";
    return;
  }

  /* Configuration of caches is only needed with real cache simulation */
  VG_(post_clo_init_configure_caches)(&I1c, &D1c, &LLc,
                                      &clo_I1_cache,
                                      &clo_D1_cache,
                                      &clo_LL_cache);

  I1.name = "I1";
  D1.name = "D1";
  LL.name = "LL";

  // min_line_size is used to make sure that we never feed
  // accesses to the simulator straddling more than two
  // cache lines at any cache level
  CLG_(min_line_size) = (I1c.line_size < D1c.line_size)
                           ? I1c.line_size : D1c.line_size;
  CLG_(min_line_size) = (LLc.line_size < CLG_(min_line_size))
                           ? LLc.line_size : CLG_(min_line_size);

  Int largest_load_or_store_size
     = VG_(machine_get_size_of_largest_guest_register)();
  if (CLG_(min_line_size) < largest_load_or_store_size) {
     /* We can't continue, because the cache simulation might
        straddle more than 2 lines, and it will assert.  So let's
        just stop before we start. */
     VG_(umsg)("Callgrind: cannot continue: the minimum line size (%d)\n",
               (Int)CLG_(min_line_size));
     VG_(umsg)("  must be equal to or larger than the maximum register size (%d)\n",
               largest_load_or_store_size );
     VG_(umsg)("  but it is not.  Exiting now.\n");
     VG_(exit)(1);
  }

  cachesim_initcache(I1c, &I1);
  cachesim_initcache(D1c, &D1);
  cachesim_initcache(LLc, &LL);

  /* All cache simulator variants use the standard log helpers,
   * dispatching via the simulator struct */

  CLG_(cachesim).log_1I0D  = log_1I0D;
  CLG_(cachesim).log_1I0D_name  = "log_1I0D";
  CLG_(cachesim).log_2I0D  = log_2I0D;
  CLG_(cachesim).log_2I0D_name  = "log_2I0D";
  CLG_(cachesim).log_3I0D  = log_3I0D;
  CLG_(cachesim).log_3I0D_name  = "log_3I0D";

  CLG_(cachesim).log_1I1Dr = log_1I1Dr;
  CLG_(cachesim).log_1I1Dw = log_1I1Dw;
  CLG_(cachesim).log_1I1Dr_name = "log_1I1Dr";
  CLG_(cachesim).log_1I1Dw_name = "log_1I1Dw";

  CLG_(cachesim).log_0I1Dr = log_0I1Dr;
  CLG_(cachesim).log_0I1Dw = log_0I1Dw;
  CLG_(cachesim).log_0I1Dr_name = "log_0I1Dr";
  CLG_(cachesim).log_0I1Dw_name = "log_0I1Dw";

  if (clo_collect_cacheuse) {

      /* Warn about unsupported option combinations */
      if (clo_simulate_hwpref) {
          VG_(message)(Vg_DebugMsg,
                       "warning: prefetch simulation cannot be "
                       "used with cache usage\n");
          clo_simulate_hwpref = False;
      }

      if (clo_simulate_writeback) {
          VG_(message)(Vg_DebugMsg,
                       "warning: write-back simulation cannot be "
                       "used with cache usage\n");
          clo_simulate_writeback = False;
      }

      simulator.I1_Read  = cacheuse_I1_doRead;
      simulator.D1_Read  = cacheuse_D1_doRead;
      simulator.D1_Write = cacheuse_D1_doRead;
      return;
  }

  if (clo_simulate_hwpref) {
    prefetch_clear();

    if (clo_simulate_writeback) {
      simulator.I1_Read  = prefetch_I1_Read;
      simulator.D1_Read  = prefetch_D1_Read;
      simulator.D1_Write = prefetch_D1_Write;
    }
    else {
      simulator.I1_Read  = prefetch_I1_ref;
      simulator.D1_Read  = prefetch_D1_ref;
      simulator.D1_Write = prefetch_D1_ref;
    }

    return;
  }

  if (clo_simulate_writeback) {
      simulator.I1_Read  = cachesim_I1_Read;
      simulator.D1_Read  = cachesim_D1_Read;
      simulator.D1_Write = cachesim_D1_Write;
  }
  else {
      simulator.I1_Read  = cachesim_I1_ref;
      simulator.D1_Read  = cachesim_D1_ref;
      simulator.D1_Write = cachesim_D1_ref;
  }
}
1423 
1424 
1425 /* Clear simulator state. The simulator must have been initialized before. */
1426 static
1427 void cachesim_clear(void)
1428 {
1429   cachesim_clearcache(&I1);
1430   cachesim_clearcache(&D1);
1431   cachesim_clearcache(&LL);
1432 
1433   prefetch_clear();
1434 }
1435 
1436 
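/* Formats a three-line description of the cache configuration into buf.
 * The caller is assumed to supply a buffer large enough for all three
 * "desc:" lines (a sketch of the contract, not stated explicitly here). */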
1437 static void cachesim_getdesc(HChar* buf)
1438 {
1439   Int p;
1440   p = VG_(sprintf)(buf, "\ndesc: I1 cache: %s\n", I1.desc_line);
1441   p += VG_(sprintf)(buf+p, "desc: D1 cache: %s\n", D1.desc_line);
1442   VG_(sprintf)(buf+p, "desc: LL cache: %s\n", LL.desc_line);
1443 }
1444 
1445 static
1446 void cachesim_print_opts(void)
1447 {
1448   VG_(printf)(
1449 "\n   cache simulator options (does cache simulation if used):\n"
1450 "    --simulate-wb=no|yes      Count write-back events [no]\n"
1451 "    --simulate-hwpref=no|yes  Simulate hardware prefetch [no]\n"
1452 #if CLG_EXPERIMENTAL
1453 "    --simulate-sectors=no|yes Simulate sectored behaviour [no]\n"
1454 #endif
1455 "    --cacheuse=no|yes         Collect cache block use [no]\n");
1456   VG_(print_cache_clo_opts)();
1457 }
1458 
1459 /* Check for a command line option for cache configuration.
1460  * Returns False if the option is unknown and was not handled.
1461  *
1462  * Called from CLG_(process_cmd_line_option)() in clo.c
1463  */
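/* Illustrative invocations (the "--D1" geometry spelling follows the
 * usual Valgrind size,assoc,line_size convention and is assumed here,
 * not spelled out in this file):
 *   --simulate-wb=yes   count write-back events
 *   --cacheuse=yes      collect use counts, forces instruction dumping
 *   --D1=32768,8,64     cache geometry, handled by VG_(str_clo_cache_opt) */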
1464 static Bool cachesim_parse_opt(const HChar* arg)
1465 {
1466    if      VG_BOOL_CLO(arg, "--simulate-wb",      clo_simulate_writeback) {}
1467    else if VG_BOOL_CLO(arg, "--simulate-hwpref",  clo_simulate_hwpref)    {}
1468    else if VG_BOOL_CLO(arg, "--simulate-sectors", clo_simulate_sectors)   {}
1469 
1470    else if VG_BOOL_CLO(arg, "--cacheuse", clo_collect_cacheuse) {
1471       if (clo_collect_cacheuse) {
1472          /* Use counters only make sense with instruction-level dumping */
1473          CLG_(clo).dump_instr = True;
1474       }
1475    }
1476 
1477    else if (VG_(str_clo_cache_opt)(arg,
1478                                    &clo_I1_cache,
1479                                    &clo_D1_cache,
1480                                    &clo_LL_cache)) {}
1481 
1482    else
1483      return False;
1484 
1485   return True;
1486 }
1487 
1488 /* Adds thousands-separating commas to a ULong, right-justified in a field
1489  * field_width characters wide; the resulting string is returned in buf. */
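/* Worked example: commify(1234567, 12, buf) stores "   1,234,567":
 * len is 7, n_commas is 2, new_len is 9, so 3 leading spaces pad the
 * result to the requested 12-character field. */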
1490 static
1491 Int commify(ULong n, int field_width, HChar* buf)
1492 {
1493    int len, n_commas, i, j, new_len, space;
1494 
1495    VG_(sprintf)(buf, "%llu", n);
1496    len = VG_(strlen)(buf);
1497    n_commas = (len - 1) / 3;
1498    new_len = len + n_commas;
1499    space = field_width - new_len;
1500 
1501    /* Allow for printing a number in a field_width smaller than its size */
1502    if (space < 0) space = 0;
1503 
1504    /* Make j = -1 because we copy the '\0' before doing the numbers in groups
1505     * of three. */
1506    for (j = -1, i = len ; i >= 0; i--) {
1507       buf[i + n_commas + space] = buf[i];
1508 
1509       if ((i>0) && (3 == ++j)) {
1510          j = 0;
1511          n_commas--;
1512          buf[i + n_commas + space] = ',';
1513       }
1514    }
1515    /* Right justify in field. */
1516    for (i = 0; i < space; i++)  buf[i] = ' ';
1517    return new_len;
1518 }
1519 
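/* Prints a percentage in fixed-point form: n is a percentage already
 * scaled by ex (callers pass n = count * 100 * p / total with ex = p),
 * so n/ex is the integer part and n%ex the fraction.  For example,
 * percentify(234, 10, 8, buf) yields "   23.4%".  Caveat: with ex=100,
 * a fractional part below 10 loses its leading zero (305 -> "3.5%"
 * rather than "3.05%"). */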
1520 static
1521 void percentify(Int n, Int ex, Int field_width, HChar buf[])
1522 {
1523    int i, len, space;
1524 
1525    VG_(sprintf)(buf, "%d.%d%%", n / ex, n % ex);
1526    len = VG_(strlen)(buf);
1527    space = field_width - len;
1528    if (space < 0) space = 0;     /* Allow for v. small field_width */
1529    i = len;
1530 
1531    /* Right justify in field */
1532    for (     ; i >= 0;    i--)  buf[i + space] = buf[i];
1533    for (i = 0; i < space; i++)  buf[i] = ' ';
1534 }
1535 
1536 static
1537 void cachesim_printstat(Int l1, Int l2, Int l3)
1538 {
1539   FullCost total = CLG_(total_cost), D_total = 0;
1540   ULong LL_total_m, LL_total_mr, LL_total_mw,
1541     LL_total, LL_total_r, LL_total_w;
1542   HChar buf1[RESULTS_BUF_LEN],
1543     buf2[RESULTS_BUF_LEN],
1544     buf3[RESULTS_BUF_LEN];
1545   Int p;
1546 
1547   if ((VG_(clo_verbosity) > 1) && clo_simulate_hwpref) {
1548     VG_(message)(Vg_DebugMsg, "Prefetch Up:       %llu\n",
1549 		 prefetch_up);
1550     VG_(message)(Vg_DebugMsg, "Prefetch Down:     %llu\n",
1551 		 prefetch_down);
1552     VG_(message)(Vg_DebugMsg, "\n");
1553   }
1554 
1555   commify(total[fullOffset(EG_IR) +1], l1, buf1);
1556   VG_(message)(Vg_UserMsg, "I1  misses:    %s\n", buf1);
1557 
1558   commify(total[fullOffset(EG_IR) +2], l1, buf1);
1559   VG_(message)(Vg_UserMsg, "LLi misses:    %s\n", buf1);
1560 
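  /* p is the fixed-point scale fed to percentify: 100 gives two
   * fractional digits for the instruction miss rates; it is reset to 10
   * (one fractional digit) for the data and LL rates further down. */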
1561   p = 100;
1562 
1563   if (0 == total[fullOffset(EG_IR)])
1564     total[fullOffset(EG_IR)] = 1;
1565 
1566   percentify(total[fullOffset(EG_IR)+1] * 100 * p /
1567 	     total[fullOffset(EG_IR)], p, l1+1, buf1);
1568   VG_(message)(Vg_UserMsg, "I1  miss rate: %s\n", buf1);
1569 
1570   percentify(total[fullOffset(EG_IR)+2] * 100 * p /
1571 	     total[fullOffset(EG_IR)], p, l1+1, buf1);
1572   VG_(message)(Vg_UserMsg, "LLi miss rate: %s\n", buf1);
1573   VG_(message)(Vg_UserMsg, "\n");
1574 
1575   /* D cache results.
1576    * Use the D_refs.rd and D_refs.wr values to determine the
1577    * width of columns 2 & 3. */
1578 
1579   D_total = CLG_(get_eventset_cost)( CLG_(sets).full );
1580   CLG_(init_cost)( CLG_(sets).full, D_total);
1581   // we only use the first 3 values of D_total, adding up Dr and Dw costs
1582   CLG_(copy_cost)( CLG_(get_event_set)(EG_DR), D_total, total + fullOffset(EG_DR) );
1583   CLG_(add_cost) ( CLG_(get_event_set)(EG_DW), D_total, total + fullOffset(EG_DW) );
1584 
1585   commify( D_total[0], l1, buf1);
1586   commify(total[fullOffset(EG_DR)], l2,  buf2);
1587   commify(total[fullOffset(EG_DW)], l3,  buf3);
1588   VG_(message)(Vg_UserMsg, "D   refs:      %s  (%s rd + %s wr)\n",
1589 	       buf1,  buf2,  buf3);
1590 
1591   commify( D_total[1], l1, buf1);
1592   commify(total[fullOffset(EG_DR)+1], l2, buf2);
1593   commify(total[fullOffset(EG_DW)+1], l3, buf3);
1594   VG_(message)(Vg_UserMsg, "D1  misses:    %s  (%s rd + %s wr)\n",
1595 	       buf1, buf2, buf3);
1596 
1597   commify( D_total[2], l1, buf1);
1598   commify(total[fullOffset(EG_DR)+2], l2, buf2);
1599   commify(total[fullOffset(EG_DW)+2], l3, buf3);
1600   VG_(message)(Vg_UserMsg, "LLd misses:    %s  (%s rd + %s wr)\n",
1601 	       buf1, buf2, buf3);
1602 
1603   p = 10;
1604 
1605   if (0 == D_total[0])   D_total[0] = 1;
1606   if (0 == total[fullOffset(EG_DR)]) total[fullOffset(EG_DR)] = 1;
1607   if (0 == total[fullOffset(EG_DW)]) total[fullOffset(EG_DW)] = 1;
1608 
1609   percentify( D_total[1] * 100 * p / D_total[0],  p, l1+1, buf1);
1610   percentify(total[fullOffset(EG_DR)+1] * 100 * p /
1611 	     total[fullOffset(EG_DR)], p, l2+1, buf2);
1612   percentify(total[fullOffset(EG_DW)+1] * 100 * p /
1613 	     total[fullOffset(EG_DW)], p, l3+1, buf3);
1614   VG_(message)(Vg_UserMsg, "D1  miss rate: %s (%s   + %s  )\n",
1615                buf1, buf2, buf3);
1616 
1617   percentify( D_total[2] * 100 * p / D_total[0],  p, l1+1, buf1);
1618   percentify(total[fullOffset(EG_DR)+2] * 100 * p /
1619 	     total[fullOffset(EG_DR)], p, l2+1, buf2);
1620   percentify(total[fullOffset(EG_DW)+2] * 100 * p /
1621 	     total[fullOffset(EG_DW)], p, l3+1, buf3);
1622   VG_(message)(Vg_UserMsg, "LLd miss rate: %s (%s   + %s  )\n",
1623                buf1, buf2, buf3);
1624   VG_(message)(Vg_UserMsg, "\n");
1625 
1626 
1627 
1628   /* LL overall results */
1629 
1630   LL_total   =
1631     total[fullOffset(EG_DR) +1] +
1632     total[fullOffset(EG_DW) +1] +
1633     total[fullOffset(EG_IR) +1];
1634   LL_total_r =
1635     total[fullOffset(EG_DR) +1] +
1636     total[fullOffset(EG_IR) +1];
1637   LL_total_w = total[fullOffset(EG_DW) +1];
1638   commify(LL_total,   l1, buf1);
1639   commify(LL_total_r, l2, buf2);
1640   commify(LL_total_w, l3, buf3);
1641   VG_(message)(Vg_UserMsg, "LL refs:       %s  (%s rd + %s wr)\n",
1642 	       buf1, buf2, buf3);
1643 
1644   LL_total_m  =
1645     total[fullOffset(EG_DR) +2] +
1646     total[fullOffset(EG_DW) +2] +
1647     total[fullOffset(EG_IR) +2];
1648   LL_total_mr =
1649     total[fullOffset(EG_DR) +2] +
1650     total[fullOffset(EG_IR) +2];
1651   LL_total_mw = total[fullOffset(EG_DW) +2];
1652   commify(LL_total_m,  l1, buf1);
1653   commify(LL_total_mr, l2, buf2);
1654   commify(LL_total_mw, l3, buf3);
1655   VG_(message)(Vg_UserMsg, "LL misses:     %s  (%s rd + %s wr)\n",
1656 	       buf1, buf2, buf3);
1657 
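  /* The LL miss rates below are computed relative to all references
   * (instruction fetches plus data accesses), not relative to the
   * number of LL accesses -- the same convention Cachegrind uses. */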
1658   percentify(LL_total_m  * 100 * p /
1659 	     (total[fullOffset(EG_IR)] + D_total[0]),  p, l1+1, buf1);
1660   percentify(LL_total_mr * 100 * p /
1661 	     (total[fullOffset(EG_IR)] + total[fullOffset(EG_DR)]),
1662 	     p, l2+1, buf2);
1663   percentify(LL_total_mw * 100 * p /
1664 	     total[fullOffset(EG_DW)], p, l3+1, buf3);
1665   VG_(message)(Vg_UserMsg, "LL miss rate:  %s (%s   + %s  )\n",
1666 	       buf1, buf2, buf3);
1667 }
1668 
1669 
1670 /*------------------------------------------------------------*/
1671 /*--- Setup for Event set.                                 ---*/
1672 /*------------------------------------------------------------*/
1673 
1674 struct event_sets CLG_(sets);
1675 
1676 void CLG_(init_eventsets)()
1677 {
1678     // Event groups from which the event sets are composed
1679     // the "Use" group is used only with "cacheuse" simulation
1680     if (clo_collect_cacheuse)
1681 	CLG_(register_event_group4)(EG_USE,
1682 				    "AcCost1", "SpLoss1", "AcCost2", "SpLoss2");
1683 
1684     if (!CLG_(clo).simulate_cache)
1685 	CLG_(register_event_group)(EG_IR, "Ir");
1686     else if (!clo_simulate_writeback) {
1687 	CLG_(register_event_group3)(EG_IR, "Ir", "I1mr", "ILmr");
1688 	CLG_(register_event_group3)(EG_DR, "Dr", "D1mr", "DLmr");
1689 	CLG_(register_event_group3)(EG_DW, "Dw", "D1mw", "DLmw");
1690     }
1691     else { // clo_simulate_writeback
1692 	CLG_(register_event_group4)(EG_IR, "Ir", "I1mr", "ILmr", "ILdmr");
1693         CLG_(register_event_group4)(EG_DR, "Dr", "D1mr", "DLmr", "DLdmr");
1694         CLG_(register_event_group4)(EG_DW, "Dw", "D1mw", "DLmw", "DLdmw");
1695     }
1696 
1697     if (CLG_(clo).simulate_branch) {
1698         CLG_(register_event_group2)(EG_BC, "Bc", "Bcm");
1699         CLG_(register_event_group2)(EG_BI, "Bi", "Bim");
1700     }
1701 
1702     if (CLG_(clo).collect_bus)
1703 	CLG_(register_event_group)(EG_BUS, "Ge");
1704 
1705     if (CLG_(clo).collect_alloc)
1706 	CLG_(register_event_group2)(EG_ALLOC, "allocCount", "allocSize");
1707 
1708     if (CLG_(clo).collect_systime)
1709 	CLG_(register_event_group2)(EG_SYS, "sysCount", "sysTime");
1710 
1711     // event set used as base for instruction self cost
1712     CLG_(sets).base = CLG_(get_event_set2)(EG_USE, EG_IR);
1713 
1714     // event set comprising all event groups, used for inclusive cost
1715     CLG_(sets).full = CLG_(add_event_group2)(CLG_(sets).base, EG_DR, EG_DW);
1716     CLG_(sets).full = CLG_(add_event_group2)(CLG_(sets).full, EG_BC, EG_BI);
1717     CLG_(sets).full = CLG_(add_event_group) (CLG_(sets).full, EG_BUS);
1718     CLG_(sets).full = CLG_(add_event_group2)(CLG_(sets).full, EG_ALLOC, EG_SYS);
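    // As an illustration: with plain cache simulation (no write-back)
    // and no optional groups enabled, the full set built above contains
    //   Ir I1mr ILmr  Dr D1mr DLmr  Dw D1mw DLmw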
1719 
1720     CLG_DEBUGIF(1) {
1721 	CLG_DEBUG(1, "EventSets:\n");
1722 	CLG_(print_eventset)(-2, CLG_(sets).base);
1723 	CLG_(print_eventset)(-2, CLG_(sets).full);
1724     }
1725 
1726     /* Not-existing events are silently ignored */
1727     CLG_(dumpmap) = CLG_(get_eventmapping)(CLG_(sets).full);
1728     CLG_(append_event)(CLG_(dumpmap), "Ir");
1729     CLG_(append_event)(CLG_(dumpmap), "Dr");
1730     CLG_(append_event)(CLG_(dumpmap), "Dw");
1731     CLG_(append_event)(CLG_(dumpmap), "I1mr");
1732     CLG_(append_event)(CLG_(dumpmap), "D1mr");
1733     CLG_(append_event)(CLG_(dumpmap), "D1mw");
1734     CLG_(append_event)(CLG_(dumpmap), "ILmr");
1735     CLG_(append_event)(CLG_(dumpmap), "DLmr");
1736     CLG_(append_event)(CLG_(dumpmap), "DLmw");
1737     CLG_(append_event)(CLG_(dumpmap), "ILdmr");
1738     CLG_(append_event)(CLG_(dumpmap), "DLdmr");
1739     CLG_(append_event)(CLG_(dumpmap), "DLdmw");
1740     CLG_(append_event)(CLG_(dumpmap), "Bc");
1741     CLG_(append_event)(CLG_(dumpmap), "Bcm");
1742     CLG_(append_event)(CLG_(dumpmap), "Bi");
1743     CLG_(append_event)(CLG_(dumpmap), "Bim");
1744     CLG_(append_event)(CLG_(dumpmap), "AcCost1");
1745     CLG_(append_event)(CLG_(dumpmap), "SpLoss1");
1746     CLG_(append_event)(CLG_(dumpmap), "AcCost2");
1747     CLG_(append_event)(CLG_(dumpmap), "SpLoss2");
1748     CLG_(append_event)(CLG_(dumpmap), "Ge");
1749     CLG_(append_event)(CLG_(dumpmap), "allocCount");
1750     CLG_(append_event)(CLG_(dumpmap), "allocSize");
1751     CLG_(append_event)(CLG_(dumpmap), "sysCount");
1752     CLG_(append_event)(CLG_(dumpmap), "sysTime");
1753 }
1754 
1755 
1756 /* Called at dump time for every executed instruction: accumulates the instruction's event costs into "cost", zeroing the source counters. */
1757 static void cachesim_add_icost(SimCost cost, BBCC* bbcc,
1758 			       InstrInfo* ii, ULong exe_count)
1759 {
1760     if (!CLG_(clo).simulate_cache)
1761 	cost[ fullOffset(EG_IR) ] += exe_count;
1762 
1763     if (ii->eventset)
1764 	CLG_(add_and_zero_cost2)( CLG_(sets).full, cost,
1765 				  ii->eventset, bbcc->cost + ii->cost_offset);
1766 }
1767 
1768 static
1769 void cachesim_finish(void)
1770 {
1771   if (clo_collect_cacheuse)
1772     cacheuse_finish();
1773 }
1774 
1775 /*------------------------------------------------------------*/
1776 /*--- The simulator defined in this file                   ---*/
1777 /*------------------------------------------------------------*/
1778 
1779 struct cachesim_if CLG_(cachesim) = {
1780   .print_opts    = cachesim_print_opts,
1781   .parse_opt     = cachesim_parse_opt,
1782   .post_clo_init = cachesim_post_clo_init,
1783   .clear         = cachesim_clear,
1784   .getdesc       = cachesim_getdesc,
1785   .printstat     = cachesim_printstat,
1786   .add_icost     = cachesim_add_icost,
1787   .finish        = cachesim_finish,
1788 
1789   /* these will be set by cachesim_post_clo_init */
1790   .log_1I0D        = 0,
1791   .log_2I0D        = 0,
1792   .log_3I0D        = 0,
1793 
1794   .log_1I1Dr       = 0,
1795   .log_1I1Dw       = 0,
1796 
1797   .log_0I1Dr       = 0,
1798   .log_0I1Dw       = 0,
1799 
1800   .log_1I0D_name = "(no function)",
1801   .log_2I0D_name = "(no function)",
1802   .log_3I0D_name = "(no function)",
1803 
1804   .log_1I1Dr_name = "(no function)",
1805   .log_1I1Dw_name = "(no function)",
1806 
1807   .log_0I1Dr_name = "(no function)",
1808   .log_0I1Dw_name = "(no function)",
1809 };
1810 
1811 
1812 /*--------------------------------------------------------------------*/
1813 /*--- end                                                    sim.c ---*/
1814 /*--------------------------------------------------------------------*/
1815