/*--------------------------------------------------------------------*/
/*--- Cache simulation.                                            ---*/
/*---                                                        sim.c ---*/
/*--------------------------------------------------------------------*/

/*
   This file is part of Callgrind, a Valgrind tool for call graph
   profiling programs.

   Copyright (C) 2003-2011, Josef Weidendorfer (Josef.Weidendorfer@gmx.de)

   This tool is derived from and contains code from Cachegrind
   Copyright (C) 2002-2011 Nicholas Nethercote (njn@valgrind.org)

   This program is free software; you can redistribute it and/or
   modify it under the terms of the GNU General Public License as
   published by the Free Software Foundation; either version 2 of the
   License, or (at your option) any later version.

   This program is distributed in the hope that it will be useful, but
   WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   General Public License for more details.

   You should have received a copy of the GNU General Public License
   along with this program; if not, write to the Free Software
   Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA
   02111-1307, USA.

   The GNU General Public License is contained in the file COPYING.
*/

#include "global.h"


/* Notes:
  - simulates a write-allocate cache
  - (block --> set) hash function uses simple bit selection
  - handling of references straddling two cache blocks:
      - counts as only one cache access (not two)
      - both blocks hit                  --> one hit
      - one block hits, the other misses --> one miss
      - both blocks miss                 --> one miss (not two)
*/

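/* Worked example (not part of the simulator) of the bit-selection
 * hash above, with made-up geometry: 64 B lines and 128 sets. The
 * real parameters come from the cache_t2 configuration below.
 */
#if 0
static void bit_selection_example(void)
{
   Addr  a         = 0x12345678;
   int   line_bits = 6;                  /* log2(64 B line size) */
   int   set_bits  = 7;                  /* log2(128 sets)       */
   UInt  set = (a >> line_bits) & ((1 << set_bits) - 1);
   UWord tag = a >> (line_bits + set_bits);
   /* here: set = 0x59, tag = 0x91A2 */
   VG_(printf)("set %x, tag %lx\n", set, tag);
}
#endif
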
/* Cache configuration */
#include "cg_arch.h"

/* additional structures for cache use info, separated
 * according to usage frequency:
 * - line_loaded : pointer to cost center of instruction
 *                 which loaded the line into cache.
 *                 Needed to increment counters when line is evicted.
 * - line_use    : updated on every access
 */
typedef struct {
  UInt count;
  UInt mask; /* e.g. for a 64 byte line size: 1 bit per 2 bytes */
} line_use;

typedef struct {
  Addr memline, iaddr;
  line_use* dep_use; /* points to higher-level cacheblock for this memline */
  ULong* use_base;
} line_loaded;

/* Cache state */
typedef struct {
   char*        name;
   int          size;                   /* bytes */
   int          assoc;
   int          line_size;              /* bytes */
   Bool         sectored;  /* prefetch nearside cacheline on read */
   int          sets;
   int          sets_min_1;
   int          line_size_bits;
   int          tag_shift;
   UWord        tag_mask;
   char         desc_line[128];
   UWord*       tags;

  /* for cache use */
   int          line_size_mask;
   int*         line_start_mask;
   int*         line_end_mask;
   line_loaded* loaded;
   line_use*    use;
} cache_t2;

/*
 * States of the flat caches in our model.
 * We use a 2-level hierarchy.
 */
static cache_t2 I1, D1, LL;

/* Lower bits of cache tags are used as flags for a cache line */
#define CACHELINE_FLAGMASK (MIN_LINE_SIZE-1)
#define CACHELINE_DIRTY    1


/* Cache simulator options */
static Bool clo_simulate_writeback = False;
static Bool clo_simulate_hwpref = False;
static Bool clo_simulate_sectors = False;
static Bool clo_collect_cacheuse = False;

/* The following global vars are set up beforehand by setup_bbcc():
 *
 * - Addr   CLG_(bb_base)     (instruction start address of original BB)
 * - ULong* CLG_(cost_base)   (start of cost array for BB)
 */

Addr   CLG_(bb_base);
ULong* CLG_(cost_base);

static InstrInfo* current_ii;

/* Cache use offsets */
/* The offsets are only correct because all per-instruction event sets
 * get the "Use" set added first!
 */
static Int off_I1_AcCost  = 0;
static Int off_I1_SpLoss  = 1;
static Int off_D1_AcCost  = 0;
static Int off_D1_SpLoss  = 1;
static Int off_LL_AcCost  = 2;
static Int off_LL_SpLoss  = 3;

/* Cache access types */
typedef enum { Read = 0, Write = CACHELINE_DIRTY } RefType;

/* Result of a reference into a flat cache */
typedef enum { Hit  = 0, Miss, MissDirty } CacheResult;

/* Result of a reference into a hierarchical cache model */
typedef enum {
    L1_Hit,
    LL_Hit,
    MemAccess,
    WriteBackMemAccess } CacheModelResult;

typedef CacheModelResult (*simcall_type)(Addr, UChar);

static struct {
    simcall_type I1_Read;
    simcall_type D1_Read;
    simcall_type D1_Write;
} simulator;

/*------------------------------------------------------------*/
/*--- Cache Simulator Initialization                       ---*/
/*------------------------------------------------------------*/

static void cachesim_clearcache(cache_t2* c)
{
  Int i;

  for (i = 0; i < c->sets * c->assoc; i++)
    c->tags[i] = 0;
  if (c->use) {
    for (i = 0; i < c->sets * c->assoc; i++) {
      c->loaded[i].memline  = 0;
      c->loaded[i].use_base = 0;
      c->loaded[i].dep_use = 0;
      c->loaded[i].iaddr = 0;
      c->use[i].mask    = 0;
      c->use[i].count   = 0;
      c->tags[i] = i % c->assoc; /* init lower bits as pointer */
    }
  }
}

static void cacheuse_initcache(cache_t2* c);

/* By this point, the size/assoc/line_size has been checked. */
static void cachesim_initcache(cache_t config, cache_t2* c)
{
   c->size      = config.size;
   c->assoc     = config.assoc;
   c->line_size = config.line_size;
   c->sectored  = False; // FIXME

   c->sets           = (c->size / c->line_size) / c->assoc;
   c->sets_min_1     = c->sets - 1;
   c->line_size_bits = VG_(log2)(c->line_size);
   c->tag_shift      = c->line_size_bits + VG_(log2)(c->sets);
   c->tag_mask       = ~((1<<c->tag_shift)-1);

   /* Can bits in tag entries be used for flags?
    * Should always be true, as MIN_LINE_SIZE >= 16 */
   CLG_ASSERT( (c->tag_mask & CACHELINE_FLAGMASK) == 0);

   if (c->assoc == 1) {
      VG_(sprintf)(c->desc_line, "%d B, %d B, direct-mapped%s",
		   c->size, c->line_size,
		   c->sectored ? ", sectored":"");
   } else {
      VG_(sprintf)(c->desc_line, "%d B, %d B, %d-way associative%s",
		   c->size, c->line_size, c->assoc,
		   c->sectored ? ", sectored":"");
   }

   c->tags = (UWord*) CLG_MALLOC("cl.sim.cs_ic.1",
                                 sizeof(UWord) * c->sets * c->assoc);
   if (clo_collect_cacheuse)
       cacheuse_initcache(c);
   else
     c->use = 0;
   cachesim_clearcache(c);
}


#if 0
static void print_cache(cache_t2* c)
{
   UInt set, way, i;

   /* Note initialisation and update of 'i'. */
   for (i = 0, set = 0; set < c->sets; set++) {
      for (way = 0; way < c->assoc; way++, i++) {
         VG_(printf)("%8x ", c->tags[i]);
      }
      VG_(printf)("\n");
   }
}
#endif


/*------------------------------------------------------------*/
/*--- Write Through Cache Simulation                       ---*/
/*------------------------------------------------------------*/

/*
 * Simple model: L1 & LL Write Through
 * Does not distinguish between read and write references
 *
 * Simulator functions:
 *  CacheModelResult cachesim_I1_ref(Addr a, UChar size)
 *  CacheModelResult cachesim_D1_ref(Addr a, UChar size)
 */

static __inline__
CacheResult cachesim_setref(cache_t2* c, UInt set_no, UWord tag)
{
    int i, j;
    UWord *set;

    set = &(c->tags[set_no * c->assoc]);

    /* This loop is unrolled for just the first case, which is the most */
    /* common.  We can't unroll any further because it would screw up   */
    /* if we have a direct-mapped (1-way) cache.                        */
    if (tag == set[0])
        return Hit;

    /* If the tag is one other than the MRU, move it into the MRU spot  */
    /* and shuffle the rest down.                                       */
    for (i = 1; i < c->assoc; i++) {
        if (tag == set[i]) {
            for (j = i; j > 0; j--) {
                set[j] = set[j - 1];
            }
            set[0] = tag;
            return Hit;
        }
    }

    /* A miss;  install this tag as MRU, shuffle rest down. */
    for (j = c->assoc - 1; j > 0; j--) {
        set[j] = set[j - 1];
    }
    set[0] = tag;

    return Miss;
}

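/* Illustration of the move-to-front LRU above (hypothetical tags,
 * one 4-way set; leftmost entry is MRU):
 *
 *   set = [A B C D], ref C  -->  hit,  set becomes [C A B D]
 *   set = [C A B D], ref E  -->  miss, D evicted:  [E C A B]
 */
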
static CacheResult cachesim_ref(cache_t2* c, Addr a, UChar size)
{
    UInt  set1 = ( a         >> c->line_size_bits) & (c->sets_min_1);
    UInt  set2 = ((a+size-1) >> c->line_size_bits) & (c->sets_min_1);
    UWord tag  = a >> c->tag_shift;

    /* Access entirely within line. */
    if (set1 == set2)
	return cachesim_setref(c, set1, tag);

    /* Access straddles two lines. */
    /* Nb: this is a fast way of doing ((set1+1) % c->sets) */
    else if (((set1 + 1) & (c->sets_min_1)) == set2) {
	UWord tag2  = (a+size-1) >> c->tag_shift;

	/* the call updates cache structures as side effect */
	CacheResult res1 =  cachesim_setref(c, set1, tag);
	CacheResult res2 =  cachesim_setref(c, set2, tag2);
	return ((res1 == Miss) || (res2 == Miss)) ? Miss : Hit;

   } else {
       VG_(printf)("addr: %lx  size: %u  sets: %d %d", a, size, set1, set2);
       VG_(tool_panic)("item straddles more than two cache sets");
   }
   return Hit;
}

static
CacheModelResult cachesim_I1_ref(Addr a, UChar size)
{
    if ( cachesim_ref( &I1, a, size) == Hit ) return L1_Hit;
    if ( cachesim_ref( &LL, a, size) == Hit ) return LL_Hit;
    return MemAccess;
}

static
CacheModelResult cachesim_D1_ref(Addr a, UChar size)
{
    if ( cachesim_ref( &D1, a, size) == Hit ) return L1_Hit;
    if ( cachesim_ref( &LL, a, size) == Hit ) return LL_Hit;
    return MemAccess;
}


/*------------------------------------------------------------*/
/*--- Write Back Cache Simulation                          ---*/
/*------------------------------------------------------------*/

/*
 * More complex model: L1 Write-through, LL Write-back
 * This needs to distinguish between read and write references.
 *
 * Simulator functions:
 *  CacheModelResult cachesim_I1_Read(Addr a, UChar size)
 *  CacheModelResult cachesim_D1_Read(Addr a, UChar size)
 *  CacheModelResult cachesim_D1_Write(Addr a, UChar size)
 */

/*
 * With write-back, a miss may evict a dirty line, which then has to
 * be written back to memory. The dirty state of a cache line is
 * stored in bit 0 of the tag for this cache line (CACHELINE_DIRTY = 1).
 * By OR'ing the reference type (Read/Write) into the tag, the line
 * gets dirty on a write.
 */
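/* Example of this encoding (tag value made up; the low tag bits are
 * free because MIN_LINE_SIZE >= 16):
 *   read hit  : set[i] stays 0x91A0                     (clean)
 *   write hit : set[i] |= Write  -->  0x91A1            (dirty)
 *   compare   : tag == (set[i] & ~CACHELINE_DIRTY) ignores the flag
 */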
static __inline__
CacheResult cachesim_setref_wb(cache_t2* c, RefType ref, UInt set_no, UWord tag)
{
    int i, j;
    UWord *set, tmp_tag;

    set = &(c->tags[set_no * c->assoc]);

    /* This loop is unrolled for just the first case, which is the most */
    /* common.  We can't unroll any further because it would screw up   */
    /* if we have a direct-mapped (1-way) cache.                        */
    if (tag == (set[0] & ~CACHELINE_DIRTY)) {
	set[0] |= ref;
        return Hit;
    }
    /* If the tag is one other than the MRU, move it into the MRU spot  */
    /* and shuffle the rest down.                                       */
    for (i = 1; i < c->assoc; i++) {
	if (tag == (set[i] & ~CACHELINE_DIRTY)) {
	    tmp_tag = set[i] | ref; // update dirty flag
            for (j = i; j > 0; j--) {
                set[j] = set[j - 1];
            }
            set[0] = tmp_tag;
            return Hit;
        }
    }

    /* A miss;  install this tag as MRU, shuffle rest down. */
    tmp_tag = set[c->assoc - 1];
    for (j = c->assoc - 1; j > 0; j--) {
        set[j] = set[j - 1];
    }
    set[0] = tag | ref;

    return (tmp_tag & CACHELINE_DIRTY) ? MissDirty : Miss;
}


static __inline__
CacheResult cachesim_ref_wb(cache_t2* c, RefType ref, Addr a, UChar size)
{
    UInt set1 = ( a         >> c->line_size_bits) & (c->sets_min_1);
    UInt set2 = ((a+size-1) >> c->line_size_bits) & (c->sets_min_1);
    UWord tag = a & c->tag_mask;

    /* Access entirely within line. */
    if (set1 == set2)
	return cachesim_setref_wb(c, ref, set1, tag);

    /* Access straddles two lines. */
    /* Nb: this is a fast way of doing ((set1+1) % c->sets) */
    else if (((set1 + 1) & (c->sets_min_1)) == set2) {
	UWord tag2  = (a+size-1) & c->tag_mask;

	/* the call updates cache structures as side effect */
	CacheResult res1 =  cachesim_setref_wb(c, ref, set1, tag);
	CacheResult res2 =  cachesim_setref_wb(c, ref, set2, tag2);

	if ((res1 == MissDirty) || (res2 == MissDirty)) return MissDirty;
	return ((res1 == Miss) || (res2 == Miss)) ? Miss : Hit;

   } else {
       VG_(printf)("addr: %lx  size: %u  sets: %d %d", a, size, set1, set2);
       VG_(tool_panic)("item straddles more than two cache sets");
   }
   return Hit;
}


static
CacheModelResult cachesim_I1_Read(Addr a, UChar size)
{
    if ( cachesim_ref( &I1, a, size) == Hit ) return L1_Hit;
    switch( cachesim_ref_wb( &LL, Read, a, size) ) {
	case Hit: return LL_Hit;
	case Miss: return MemAccess;
	default: break;
    }
    return WriteBackMemAccess;
}

static
CacheModelResult cachesim_D1_Read(Addr a, UChar size)
{
    if ( cachesim_ref( &D1, a, size) == Hit ) return L1_Hit;
    switch( cachesim_ref_wb( &LL, Read, a, size) ) {
	case Hit: return LL_Hit;
	case Miss: return MemAccess;
	default: break;
    }
    return WriteBackMemAccess;
}

static
CacheModelResult cachesim_D1_Write(Addr a, UChar size)
{
    if ( cachesim_ref( &D1, a, size) == Hit ) {
	/* Even for a L1 hit, the write-through L1 passes
	 * the write on to the LL to make the LL line dirty.
	 * But this causes no latency, so return the hit.
	 */
	cachesim_ref_wb( &LL, Write, a, size);
	return L1_Hit;
    }
    switch( cachesim_ref_wb( &LL, Write, a, size) ) {
	case Hit: return LL_Hit;
	case Miss: return MemAccess;
	default: break;
    }
    return WriteBackMemAccess;
}


/*------------------------------------------------------------*/
/*--- Hardware Prefetch Simulation                         ---*/
/*------------------------------------------------------------*/

static ULong prefetch_up = 0;
static ULong prefetch_down = 0;

#define PF_STREAMS  8
#define PF_PAGEBITS 12

static UInt pf_lastblock[PF_STREAMS];
static Int  pf_seqblocks[PF_STREAMS];

static
void prefetch_clear(void)
{
  int i;
  for(i=0;i<PF_STREAMS;i++)
    pf_lastblock[i] = pf_seqblocks[i] = 0;
}

/*
 * HW Prefetch emulation
 * Start prefetching when detecting sequential access to 3 memory blocks.
 * One stream can be detected per 4k page.
 */
static __inline__
void prefetch_LL_doref(Addr a)
{
  UInt stream = (a >> PF_PAGEBITS) % PF_STREAMS;
  UInt block = ( a >> LL.line_size_bits);

  if (block != pf_lastblock[stream]) {
    if (pf_seqblocks[stream] == 0) {
      if (pf_lastblock[stream] +1 == block) pf_seqblocks[stream]++;
      else if (pf_lastblock[stream] -1 == block) pf_seqblocks[stream]--;
    }
    else if (pf_seqblocks[stream] >0) {
      if (pf_lastblock[stream] +1 == block) {
	pf_seqblocks[stream]++;
	if (pf_seqblocks[stream] >= 2) {
	  prefetch_up++;
	  cachesim_ref(&LL, a + 5 * LL.line_size,1);
	}
      }
      else pf_seqblocks[stream] = 0;
    }
    else if (pf_seqblocks[stream] <0) {
      if (pf_lastblock[stream] -1 == block) {
	pf_seqblocks[stream]--;
	if (pf_seqblocks[stream] <= -2) {
	  prefetch_down++;
	  cachesim_ref(&LL, a - 5 * LL.line_size,1);
	}
      }
      else pf_seqblocks[stream] = 0;
    }
    pf_lastblock[stream] = block;
  }
}

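/* Example (block numbers made up): upward accesses to LL blocks
 * 100, 101, 102 within one 4 KB page raise pf_seqblocks to 1, then 2.
 * At 2 the stream counts as detected: prefetch_up is incremented and
 * the line 5 blocks ahead (block 107) is touched in the LL cache.
 */
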
/* simple model with hardware prefetch */

static
CacheModelResult prefetch_I1_ref(Addr a, UChar size)
{
    if ( cachesim_ref( &I1, a, size) == Hit ) return L1_Hit;
    prefetch_LL_doref(a);
    if ( cachesim_ref( &LL, a, size) == Hit ) return LL_Hit;
    return MemAccess;
}

static
CacheModelResult prefetch_D1_ref(Addr a, UChar size)
{
    if ( cachesim_ref( &D1, a, size) == Hit ) return L1_Hit;
    prefetch_LL_doref(a);
    if ( cachesim_ref( &LL, a, size) == Hit ) return LL_Hit;
    return MemAccess;
}


/* complex model with hardware prefetch */

static
CacheModelResult prefetch_I1_Read(Addr a, UChar size)
{
    if ( cachesim_ref( &I1, a, size) == Hit ) return L1_Hit;
    prefetch_LL_doref(a);
    switch( cachesim_ref_wb( &LL, Read, a, size) ) {
	case Hit: return LL_Hit;
	case Miss: return MemAccess;
	default: break;
    }
    return WriteBackMemAccess;
}

static
CacheModelResult prefetch_D1_Read(Addr a, UChar size)
{
    if ( cachesim_ref( &D1, a, size) == Hit ) return L1_Hit;
    prefetch_LL_doref(a);
    switch( cachesim_ref_wb( &LL, Read, a, size) ) {
	case Hit: return LL_Hit;
	case Miss: return MemAccess;
	default: break;
    }
    return WriteBackMemAccess;
}

static
CacheModelResult prefetch_D1_Write(Addr a, UChar size)
{
    prefetch_LL_doref(a);
    if ( cachesim_ref( &D1, a, size) == Hit ) {
	/* Even for a L1 hit, the write-through L1 passes
	 * the write on to the LL to make the LL line dirty.
	 * But this causes no latency, so return the hit.
	 */
	cachesim_ref_wb( &LL, Write, a, size);
	return L1_Hit;
    }
    switch( cachesim_ref_wb( &LL, Write, a, size) ) {
	case Hit: return LL_Hit;
	case Miss: return MemAccess;
	default: break;
    }
    return WriteBackMemAccess;
}


/*------------------------------------------------------------*/
/*--- Cache Simulation with use metric collection          ---*/
/*------------------------------------------------------------*/

/* cannot be combined with write-back or prefetch */

static
void cacheuse_initcache(cache_t2* c)
{
    int i;
    unsigned int start_mask, start_val;
    unsigned int end_mask, end_val;

    c->use    = CLG_MALLOC("cl.sim.cu_ic.1",
                           sizeof(line_use) * c->sets * c->assoc);
    c->loaded = CLG_MALLOC("cl.sim.cu_ic.2",
                           sizeof(line_loaded) * c->sets * c->assoc);
    c->line_start_mask = CLG_MALLOC("cl.sim.cu_ic.3",
                                    sizeof(int) * c->line_size);
    c->line_end_mask = CLG_MALLOC("cl.sim.cu_ic.4",
                                  sizeof(int) * c->line_size);

    c->line_size_mask = c->line_size-1;

    /* Meaning of line_start_mask/line_end_mask
     * Example: for a given cache line, you get an access starting at
     * byte offset 5 with length 4, so bytes 5 - 8 are touched. For a
     * cache line size of 32, you have 1 bit per byte in the mask:
     *
     *   bit31   bit8 bit5  bit 0
     *       |      |  |    |
     *       11..111111100000   line_start_mask[5]
     *       00..000111111111   line_end_mask[(5+4)-1]
     *
     *  use_mask |= line_start_mask[5] & line_end_mask[8]
     *
     */
    start_val = end_val = ~0;
    if (c->line_size < 32) {
	int bits_per_byte = 32/c->line_size;
	start_mask = (1<<bits_per_byte)-1;
	end_mask   = start_mask << (32-bits_per_byte);
	for(i=0;i<c->line_size;i++) {
	    c->line_start_mask[i] = start_val;
	    start_val  = start_val & ~start_mask;
	    start_mask = start_mask << bits_per_byte;

	    c->line_end_mask[c->line_size-i-1] = end_val;
	    end_val  = end_val & ~end_mask;
	    end_mask = end_mask >> bits_per_byte;
	}
    }
    else {
	int bytes_per_bit = c->line_size/32;
	start_mask = 1;
	end_mask   = 1 << 31;
	for(i=0;i<c->line_size;i++) {
	    c->line_start_mask[i] = start_val;
	    c->line_end_mask[c->line_size-i-1] = end_val;
	    if ( ((i+1)%bytes_per_bit) == 0) {
		start_val   &= ~start_mask;
		end_val     &= ~end_mask;
		start_mask <<= 1;
		end_mask   >>= 1;
	    }
	}
    }

    CLG_DEBUG(6, "Config %s:\n", c->desc_line);
    for(i=0;i<c->line_size;i++) {
	CLG_DEBUG(6, " [%2d]: start mask %8x, end mask %8x\n",
		  i, c->line_start_mask[i], c->line_end_mask[i]);
    }

    /* We use lower tag bits as offset pointers into the cache use info.
     * I.e. some cache parameters don't work.
     */
    if ( (1<<c->tag_shift) < c->assoc) {
	VG_(message)(Vg_DebugMsg,
		     "error: Use associativity < %d for cache use statistics!\n",
		     (1<<c->tag_shift) );
	VG_(tool_panic)("Unsupported cache configuration");
    }
}


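/* Example for a 64 B line (the ">= 32" branch above): bytes_per_bit
 * is 2, so use-mask bit k covers bytes 2k..2k+1. For an access to
 * bytes 5..8 of a line this gives
 *   use_mask |= line_start_mask[5] & line_end_mask[8]
 *            =  0xFFFFFFFC & 0x0000001F = 0x0000001C,
 * i.e. bits 2..4 (covering bytes 4..9) are marked as touched.
 */
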
/* for I1/D1 caches */
#define CACHEUSE(L)                                                         \
                                                                            \
static CacheModelResult cacheuse##_##L##_doRead(Addr a, UChar size)         \
{                                                                           \
   UInt set1 = ( a         >> L.line_size_bits) & (L.sets_min_1);           \
   UInt set2 = ((a+size-1) >> L.line_size_bits) & (L.sets_min_1);           \
   UWord tag  = a & L.tag_mask;                                             \
   UWord tag2;                                                              \
   int i, j, idx;                                                           \
   UWord *set, tmp_tag;                                                     \
   UInt use_mask;                                                           \
                                                                            \
   CLG_DEBUG(6,"%s.Acc(Addr %#lx, size %d): Sets [%d/%d]\n",                \
	    L.name, a, size, set1, set2);                                   \
                                                                            \
   /* First case: word entirely within line. */                             \
   if (set1 == set2) {                                                      \
                                                                            \
      set = &(L.tags[set1 * L.assoc]);                                      \
      use_mask = L.line_start_mask[a & L.line_size_mask] &                  \
	         L.line_end_mask[(a+size-1) & L.line_size_mask];            \
                                                                            \
      /* This loop is unrolled for just the first case, which is the most */\
      /* common.  We can't unroll any further because it would screw up   */\
      /* if we have a direct-mapped (1-way) cache.                        */\
      if (tag == (set[0] & L.tag_mask)) {                                   \
        idx = (set1 * L.assoc) + (set[0] & ~L.tag_mask);                    \
        L.use[idx].count ++;                                                \
        L.use[idx].mask |= use_mask;                                        \
	CLG_DEBUG(6," Hit0 [idx %d] (line %#lx from %#lx): %x => %08x, count %d\n",\
		 idx, L.loaded[idx].memline,  L.loaded[idx].iaddr,          \
		 use_mask, L.use[idx].mask, L.use[idx].count);              \
	return L1_Hit;                                                      \
      }                                                                     \
      /* If the tag is one other than the MRU, move it into the MRU spot  */\
      /* and shuffle the rest down.                                       */\
      for (i = 1; i < L.assoc; i++) {                                       \
	 if (tag == (set[i] & L.tag_mask)) {                                \
  	    tmp_tag = set[i];                                               \
            for (j = i; j > 0; j--) {                                       \
               set[j] = set[j - 1];                                         \
            }                                                               \
            set[0] = tmp_tag;                                               \
            idx = (set1 * L.assoc) + (tmp_tag & ~L.tag_mask);               \
            L.use[idx].count ++;                                            \
            L.use[idx].mask |= use_mask;                                    \
	CLG_DEBUG(6," Hit%d [idx %d] (line %#lx from %#lx): %x => %08x, count %d\n",\
		 i, idx, L.loaded[idx].memline,  L.loaded[idx].iaddr,       \
		 use_mask, L.use[idx].mask, L.use[idx].count);              \
            return L1_Hit;                                                  \
         }                                                                  \
      }                                                                     \
                                                                            \
      /* A miss;  install this tag as MRU, shuffle rest down. */            \
      tmp_tag = set[L.assoc - 1] & ~L.tag_mask;                             \
      for (j = L.assoc - 1; j > 0; j--) {                                   \
         set[j] = set[j - 1];                                               \
      }                                                                     \
      set[0] = tag | tmp_tag;                                               \
      idx = (set1 * L.assoc) + tmp_tag;                                     \
      return update_##L##_use(&L, idx,                                      \
		       use_mask, a &~ L.line_size_mask);                    \
                                                                            \
   /* Second case: word straddles two lines. */                             \
   /* Nb: this is a fast way of doing ((set1+1) % L.sets) */                \
   } else if (((set1 + 1) & (L.sets_min_1)) == set2) {                      \
      Int miss1=0, miss2=0; /* 0: L1 hit, 1:L1 miss, 2:LL miss */           \
      set = &(L.tags[set1 * L.assoc]);                                      \
      use_mask = L.line_start_mask[a & L.line_size_mask];                   \
      if (tag == (set[0] & L.tag_mask)) {                                   \
         idx = (set1 * L.assoc) + (set[0] & ~L.tag_mask);                   \
         L.use[idx].count ++;                                               \
         L.use[idx].mask |= use_mask;                                       \
	CLG_DEBUG(6," Hit0 [idx %d] (line %#lx from %#lx): %x => %08x, count %d\n",\
		 idx, L.loaded[idx].memline,  L.loaded[idx].iaddr,          \
		 use_mask, L.use[idx].mask, L.use[idx].count);              \
         goto block2;                                                       \
      }                                                                     \
      for (i = 1; i < L.assoc; i++) {                                       \
	 if (tag == (set[i] & L.tag_mask)) {                                \
  	    tmp_tag = set[i];                                               \
            for (j = i; j > 0; j--) {                                       \
               set[j] = set[j - 1];                                         \
            }                                                               \
            set[0] = tmp_tag;                                               \
            idx = (set1 * L.assoc) + (tmp_tag & ~L.tag_mask);               \
            L.use[idx].count ++;                                            \
            L.use[idx].mask |= use_mask;                                    \
	CLG_DEBUG(6," Hit%d [idx %d] (line %#lx from %#lx): %x => %08x, count %d\n",\
		 i, idx, L.loaded[idx].memline,  L.loaded[idx].iaddr,       \
		 use_mask, L.use[idx].mask, L.use[idx].count);              \
            goto block2;                                                    \
         }                                                                  \
      }                                                                     \
      tmp_tag = set[L.assoc - 1] & ~L.tag_mask;                             \
      for (j = L.assoc - 1; j > 0; j--) {                                   \
         set[j] = set[j - 1];                                               \
      }                                                                     \
      set[0] = tag | tmp_tag;                                               \
      idx = (set1 * L.assoc) + tmp_tag;                                     \
      miss1 = update_##L##_use(&L, idx,                                     \
		       use_mask, a &~ L.line_size_mask);                    \
block2:                                                                     \
      set = &(L.tags[set2 * L.assoc]);                                      \
      use_mask = L.line_end_mask[(a+size-1) & L.line_size_mask];            \
      tag2  = (a+size-1) & L.tag_mask;                                      \
      if (tag2 == (set[0] & L.tag_mask)) {                                  \
         idx = (set2 * L.assoc) + (set[0] & ~L.tag_mask);                   \
         L.use[idx].count ++;                                               \
         L.use[idx].mask |= use_mask;                                       \
	CLG_DEBUG(6," Hit0 [idx %d] (line %#lx from %#lx): %x => %08x, count %d\n",\
		 idx, L.loaded[idx].memline,  L.loaded[idx].iaddr,          \
		 use_mask, L.use[idx].mask, L.use[idx].count);              \
         return miss1;                                                      \
      }                                                                     \
      for (i = 1; i < L.assoc; i++) {                                       \
	 if (tag2 == (set[i] & L.tag_mask)) {                               \
  	    tmp_tag = set[i];                                               \
            for (j = i; j > 0; j--) {                                       \
               set[j] = set[j - 1];                                         \
            }                                                               \
            set[0] = tmp_tag;                                               \
            idx = (set2 * L.assoc) + (tmp_tag & ~L.tag_mask);               \
            L.use[idx].count ++;                                            \
            L.use[idx].mask |= use_mask;                                    \
	CLG_DEBUG(6," Hit%d [idx %d] (line %#lx from %#lx): %x => %08x, count %d\n",\
		 i, idx, L.loaded[idx].memline,  L.loaded[idx].iaddr,       \
		 use_mask, L.use[idx].mask, L.use[idx].count);              \
            return miss1;                                                   \
         }                                                                  \
      }                                                                     \
      tmp_tag = set[L.assoc - 1] & ~L.tag_mask;                             \
      for (j = L.assoc - 1; j > 0; j--) {                                   \
         set[j] = set[j - 1];                                               \
      }                                                                     \
      set[0] = tag2 | tmp_tag;                                              \
      idx = (set2 * L.assoc) + tmp_tag;                                     \
      miss2 = update_##L##_use(&L, idx,                                     \
		       use_mask, (a+size-1) &~ L.line_size_mask);           \
      return (miss1==MemAccess || miss2==MemAccess) ? MemAccess:LL_Hit;     \
                                                                            \
   } else {                                                                 \
       VG_(printf)("addr: %#lx  size: %u  sets: %d %d", a, size, set1, set2); \
       VG_(tool_panic)("item straddles more than two cache sets");          \
   }                                                                        \
   return 0;                                                                \
}


/* logarithmic bitcounting algorithm, see
 * http://graphics.stanford.edu/~seander/bithacks.html
 */
static __inline__ unsigned int countBits(unsigned int bits)
{
  unsigned int c; // store the total here
  const int S[] = {1, 2, 4, 8, 16}; // Magic Binary Numbers
  const int B[] = {0x55555555, 0x33333333, 0x0F0F0F0F, 0x00FF00FF, 0x0000FFFF};

  c = bits;
  c = ((c >> S[0]) & B[0]) + (c & B[0]);
  c = ((c >> S[1]) & B[1]) + (c & B[1]);
  c = ((c >> S[2]) & B[2]) + (c & B[2]);
  c = ((c >> S[3]) & B[3]) + (c & B[3]);
  c = ((c >> S[4]) & B[4]) + (c & B[4]);
  return c;
}

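/* Worked example: countBits(0xF5). 0xF5 = 11110101b.
 *   pair sums   : 11|11|01|01 -> 10|10|01|01  (2,2,1,1)
 *   nibble sums : 2+2 = 4, 1+1 = 2
 *   byte sum    : 4+2 = 6  --> six bits set.
 */
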
static void update_LL_use(int idx, Addr memline)
{
  line_loaded* loaded = &(LL.loaded[idx]);
  line_use* use = &(LL.use[idx]);
  int i = ((32 - countBits(use->mask)) * LL.line_size)>>5;

  CLG_DEBUG(2, " LL.miss [%d]: at %#lx accessing memline %#lx\n",
           idx, CLG_(bb_base) + current_ii->instr_offset, memline);
  if (use->count>0) {
    CLG_DEBUG(2, "   old: used %d, loss bits %d (%08x) [line %#lx from %#lx]\n",
	     use->count, i, use->mask, loaded->memline, loaded->iaddr);
    CLG_DEBUG(2, "   collect: %d, use_base %p\n",
	     CLG_(current_state).collect, loaded->use_base);

    if (CLG_(current_state).collect && loaded->use_base) {
      (loaded->use_base)[off_LL_AcCost] += 1000 / use->count;
      (loaded->use_base)[off_LL_SpLoss] += i;
    }
  }

  use->count = 0;
  use->mask  = 0;

  loaded->memline = memline;
  loaded->iaddr   = CLG_(bb_base) + current_ii->instr_offset;
  loaded->use_base = (CLG_(current_state).nonskipped) ?
    CLG_(current_state).nonskipped->skipped :
    CLG_(cost_base) + current_ii->cost_offset;
}

static
CacheModelResult cacheuse_LL_access(Addr memline, line_loaded* l1_loaded)
{
   UInt setNo = (memline >> LL.line_size_bits) & (LL.sets_min_1);
   UWord* set = &(LL.tags[setNo * LL.assoc]);
   UWord tag  = memline & LL.tag_mask;

   int i, j, idx;
   UWord tmp_tag;

   CLG_DEBUG(6,"LL.Acc(Memline %#lx): Set %d\n", memline, setNo);

   if (tag == (set[0] & LL.tag_mask)) {
     idx = (setNo * LL.assoc) + (set[0] & ~LL.tag_mask);
     l1_loaded->dep_use = &(LL.use[idx]);

     CLG_DEBUG(6," Hit0 [idx %d] (line %#lx from %#lx): => %08x, count %d\n",
		 idx, LL.loaded[idx].memline,  LL.loaded[idx].iaddr,
		 LL.use[idx].mask, LL.use[idx].count);
     return LL_Hit;
   }
   for (i = 1; i < LL.assoc; i++) {
     if (tag == (set[i] & LL.tag_mask)) {
       tmp_tag = set[i];
       for (j = i; j > 0; j--) {
	 set[j] = set[j - 1];
       }
       set[0] = tmp_tag;
       idx = (setNo * LL.assoc) + (tmp_tag & ~LL.tag_mask);
       l1_loaded->dep_use = &(LL.use[idx]);

       CLG_DEBUG(6," Hit%d [idx %d] (line %#lx from %#lx): => %08x, count %d\n",
		 i, idx, LL.loaded[idx].memline,  LL.loaded[idx].iaddr,
		 LL.use[idx].mask, LL.use[idx].count);
       return LL_Hit;
     }
   }

   /* A miss;  install this tag as MRU, shuffle rest down. */
   tmp_tag = set[LL.assoc - 1] & ~LL.tag_mask;
   for (j = LL.assoc - 1; j > 0; j--) {
     set[j] = set[j - 1];
   }
   set[0] = tag | tmp_tag;
   idx = (setNo * LL.assoc) + tmp_tag;
   l1_loaded->dep_use = &(LL.use[idx]);

   update_LL_use(idx, memline);

   return MemAccess;
}




#define UPDATE_USE(L)                                                \
                                                                     \
static CacheModelResult update##_##L##_use(cache_t2* cache, int idx, \
			       UInt mask, Addr memline)              \
{                                                                    \
  line_loaded* loaded = &(cache->loaded[idx]);                       \
  line_use* use = &(cache->use[idx]);                                \
  int c = ((32 - countBits(use->mask)) * cache->line_size)>>5;       \
                                                                     \
  CLG_DEBUG(2, " %s.miss [%d]: at %#lx accessing memline %#lx (mask %08x)\n", \
           cache->name, idx, CLG_(bb_base) + current_ii->instr_offset, memline, mask); \
  if (use->count>0) {                                                \
    CLG_DEBUG(2, "   old: used %d, loss bits %d (%08x) [line %#lx from %#lx]\n",\
	     use->count, c, use->mask, loaded->memline, loaded->iaddr);	\
    CLG_DEBUG(2, "   collect: %d, use_base %p\n", \
	     CLG_(current_state).collect, loaded->use_base);         \
                                                                     \
    if (CLG_(current_state).collect && loaded->use_base) {           \
      (loaded->use_base)[off_##L##_AcCost] += 1000 / use->count;     \
      (loaded->use_base)[off_##L##_SpLoss] += c;                     \
                                                                     \
      /* FIXME (?): L1/LL line sizes must be equal ! */              \
      loaded->dep_use->mask |= use->mask;                            \
      loaded->dep_use->count += use->count;                          \
    }                                                                \
  }                                                                  \
                                                                     \
  use->count = 1;                                                    \
  use->mask  = mask;                                                 \
  loaded->memline = memline;                                         \
  loaded->iaddr   = CLG_(bb_base) + current_ii->instr_offset;        \
  loaded->use_base = (CLG_(current_state).nonskipped) ?              \
    CLG_(current_state).nonskipped->skipped :                        \
    CLG_(cost_base) + current_ii->cost_offset;                       \
                                                                     \
  if (memline == 0) return LL_Hit;                                   \
  return cacheuse_LL_access(memline, loaded);                        \
}

UPDATE_USE(I1);
UPDATE_USE(D1);

CACHEUSE(I1);
CACHEUSE(D1);


static
void cacheuse_finish(void)
{
  int i;
  InstrInfo ii = { 0,0,0,0 };

  if (!CLG_(current_state).collect) return;

  CLG_(bb_base) = 0;
  current_ii = &ii; /* needs to be set for update_XX_use */
  CLG_(cost_base) = 0;

  /* update usage counters */
  if (I1.use)
    for (i = 0; i < I1.sets * I1.assoc; i++)
      if (I1.loaded[i].use_base)
	update_I1_use( &I1, i, 0,0);

  if (D1.use)
    for (i = 0; i < D1.sets * D1.assoc; i++)
      if (D1.loaded[i].use_base)
	update_D1_use( &D1, i, 0,0);

  if (LL.use)
    for (i = 0; i < LL.sets * LL.assoc; i++)
      if (LL.loaded[i].use_base)
	update_LL_use(i, 0);

  current_ii = 0;
}



/*------------------------------------------------------------*/
/*--- Helper functions called by instrumented code         ---*/
/*------------------------------------------------------------*/


static __inline__
void inc_costs(CacheModelResult r, ULong* c1, ULong* c2)
{
    switch(r) {
	case WriteBackMemAccess:
	    if (clo_simulate_writeback) {
		c1[3]++;
		c2[3]++;
	    }
	    // fall through

	case MemAccess:
	    c1[2]++;
	    c2[2]++;
	    // fall through

	case LL_Hit:
	    c1[1]++;
	    c2[1]++;
	    // fall through

	default:
	    c1[0]++;
	    c2[0]++;
    }
}

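/* The cost arrays written above are laid out as
 *   c[0]: accesses,   c[1]: L1 misses,   c[2]: LL misses,
 *   c[3]: LL misses evicting a dirty line (only with --simulate-wb=yes);
 * the fall-throughs increment every counter up to the severity of r.
 */
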
static
Char* cacheRes(CacheModelResult r)
{
    switch(r) {
    case L1_Hit:    return "L1 Hit ";
    case LL_Hit:    return "LL Hit ";
    case MemAccess: return "LL Miss";
    case WriteBackMemAccess: return "LL Miss (dirty)";
    default:
	tl_assert(0);
    }
    return "??";
}

VG_REGPARM(1)
static void log_1I0D(InstrInfo* ii)
{
    CacheModelResult IrRes;

    current_ii = ii;
    IrRes = (*simulator.I1_Read)(CLG_(bb_base) + ii->instr_offset, ii->instr_size);

    CLG_DEBUG(6, "log_1I0D:  Ir  %#lx/%u => %s\n",
              CLG_(bb_base) + ii->instr_offset, ii->instr_size, cacheRes(IrRes));

    if (CLG_(current_state).collect) {
	ULong* cost_Ir;

	if (CLG_(current_state).nonskipped)
	    cost_Ir = CLG_(current_state).nonskipped->skipped + fullOffset(EG_IR);
	else
            cost_Ir = CLG_(cost_base) + ii->cost_offset + ii->eventset->offset[EG_IR];

	inc_costs(IrRes, cost_Ir,
		  CLG_(current_state).cost + fullOffset(EG_IR) );
    }
}

VG_REGPARM(2)
static void log_2I0D(InstrInfo* ii1, InstrInfo* ii2)
{
    CacheModelResult Ir1Res, Ir2Res;
    ULong *global_cost_Ir;

    current_ii = ii1;
    Ir1Res = (*simulator.I1_Read)(CLG_(bb_base) + ii1->instr_offset, ii1->instr_size);
    current_ii = ii2;
    Ir2Res = (*simulator.I1_Read)(CLG_(bb_base) + ii2->instr_offset, ii2->instr_size);

    CLG_DEBUG(6, "log_2I0D:  Ir1 %#lx/%u => %s, Ir2 %#lx/%u => %s\n",
              CLG_(bb_base) + ii1->instr_offset, ii1->instr_size, cacheRes(Ir1Res),
              CLG_(bb_base) + ii2->instr_offset, ii2->instr_size, cacheRes(Ir2Res) );

    if (!CLG_(current_state).collect) return;

    global_cost_Ir = CLG_(current_state).cost + fullOffset(EG_IR);
    if (CLG_(current_state).nonskipped) {
	ULong* skipped_cost_Ir =
	    CLG_(current_state).nonskipped->skipped + fullOffset(EG_IR);

	inc_costs(Ir1Res, global_cost_Ir, skipped_cost_Ir);
	inc_costs(Ir2Res, global_cost_Ir, skipped_cost_Ir);
	return;
    }

    inc_costs(Ir1Res, global_cost_Ir,
              CLG_(cost_base) + ii1->cost_offset + ii1->eventset->offset[EG_IR]);
    inc_costs(Ir2Res, global_cost_Ir,
              CLG_(cost_base) + ii2->cost_offset + ii2->eventset->offset[EG_IR]);
}

VG_REGPARM(3)
static void log_3I0D(InstrInfo* ii1, InstrInfo* ii2, InstrInfo* ii3)
{
    CacheModelResult Ir1Res, Ir2Res, Ir3Res;
    ULong *global_cost_Ir;

    current_ii = ii1;
    Ir1Res = (*simulator.I1_Read)(CLG_(bb_base) + ii1->instr_offset, ii1->instr_size);
    current_ii = ii2;
    Ir2Res = (*simulator.I1_Read)(CLG_(bb_base) + ii2->instr_offset, ii2->instr_size);
    current_ii = ii3;
    Ir3Res = (*simulator.I1_Read)(CLG_(bb_base) + ii3->instr_offset, ii3->instr_size);

    CLG_DEBUG(6, "log_3I0D:  Ir1 %#lx/%u => %s, Ir2 %#lx/%u => %s, Ir3 %#lx/%u => %s\n",
              CLG_(bb_base) + ii1->instr_offset, ii1->instr_size, cacheRes(Ir1Res),
              CLG_(bb_base) + ii2->instr_offset, ii2->instr_size, cacheRes(Ir2Res),
              CLG_(bb_base) + ii3->instr_offset, ii3->instr_size, cacheRes(Ir3Res) );

    if (!CLG_(current_state).collect) return;

    global_cost_Ir = CLG_(current_state).cost + fullOffset(EG_IR);
    if (CLG_(current_state).nonskipped) {
	ULong* skipped_cost_Ir =
	    CLG_(current_state).nonskipped->skipped + fullOffset(EG_IR);
	inc_costs(Ir1Res, global_cost_Ir, skipped_cost_Ir);
	inc_costs(Ir2Res, global_cost_Ir, skipped_cost_Ir);
	inc_costs(Ir3Res, global_cost_Ir, skipped_cost_Ir);
	return;
    }

    inc_costs(Ir1Res, global_cost_Ir,
              CLG_(cost_base) + ii1->cost_offset + ii1->eventset->offset[EG_IR]);
    inc_costs(Ir2Res, global_cost_Ir,
              CLG_(cost_base) + ii2->cost_offset + ii2->eventset->offset[EG_IR]);
    inc_costs(Ir3Res, global_cost_Ir,
              CLG_(cost_base) + ii3->cost_offset + ii3->eventset->offset[EG_IR]);
}

/* Instruction doing a read access */

VG_REGPARM(3)
static void log_1I1Dr(InstrInfo* ii, Addr data_addr, Word data_size)
{
    CacheModelResult IrRes, DrRes;

    current_ii = ii;
    IrRes = (*simulator.I1_Read)(CLG_(bb_base) + ii->instr_offset, ii->instr_size);
    DrRes = (*simulator.D1_Read)(data_addr, data_size);

    CLG_DEBUG(6, "log_1I1Dr: Ir  %#lx/%u => %s, Dr  %#lx/%lu => %s\n",
              CLG_(bb_base) + ii->instr_offset, ii->instr_size, cacheRes(IrRes),
	      data_addr, data_size, cacheRes(DrRes));

    if (CLG_(current_state).collect) {
	ULong *cost_Ir, *cost_Dr;

	if (CLG_(current_state).nonskipped) {
	    cost_Ir = CLG_(current_state).nonskipped->skipped + fullOffset(EG_IR);
	    cost_Dr = CLG_(current_state).nonskipped->skipped + fullOffset(EG_DR);
	}
	else {
            cost_Ir = CLG_(cost_base) + ii->cost_offset + ii->eventset->offset[EG_IR];
            cost_Dr = CLG_(cost_base) + ii->cost_offset + ii->eventset->offset[EG_DR];
	}

	inc_costs(IrRes, cost_Ir,
		  CLG_(current_state).cost + fullOffset(EG_IR) );
	inc_costs(DrRes, cost_Dr,
		  CLG_(current_state).cost + fullOffset(EG_DR) );
    }
}


VG_REGPARM(3)
static void log_0I1Dr(InstrInfo* ii, Addr data_addr, Word data_size)
{
    CacheModelResult DrRes;

    current_ii = ii;
    DrRes = (*simulator.D1_Read)(data_addr, data_size);

    CLG_DEBUG(6, "log_0I1Dr: Dr  %#lx/%lu => %s\n",
	      data_addr, data_size, cacheRes(DrRes));

    if (CLG_(current_state).collect) {
	ULong *cost_Dr;

	if (CLG_(current_state).nonskipped)
	    cost_Dr = CLG_(current_state).nonskipped->skipped + fullOffset(EG_DR);
	else
            cost_Dr = CLG_(cost_base) + ii->cost_offset + ii->eventset->offset[EG_DR];

	inc_costs(DrRes, cost_Dr,
		  CLG_(current_state).cost + fullOffset(EG_DR) );
    }
}


/* Instruction doing a write access */

VG_REGPARM(3)
static void log_1I1Dw(InstrInfo* ii, Addr data_addr, Word data_size)
{
    CacheModelResult IrRes, DwRes;

    current_ii = ii;
    IrRes = (*simulator.I1_Read)(CLG_(bb_base) + ii->instr_offset, ii->instr_size);
    DwRes = (*simulator.D1_Write)(data_addr, data_size);

    CLG_DEBUG(6, "log_1I1Dw: Ir  %#lx/%u => %s, Dw  %#lx/%lu => %s\n",
              CLG_(bb_base) + ii->instr_offset, ii->instr_size, cacheRes(IrRes),
	      data_addr, data_size, cacheRes(DwRes));

    if (CLG_(current_state).collect) {
	ULong *cost_Ir, *cost_Dw;

	if (CLG_(current_state).nonskipped) {
	    cost_Ir = CLG_(current_state).nonskipped->skipped + fullOffset(EG_IR);
	    cost_Dw = CLG_(current_state).nonskipped->skipped + fullOffset(EG_DW);
	}
	else {
            cost_Ir = CLG_(cost_base) + ii->cost_offset + ii->eventset->offset[EG_IR];
            cost_Dw = CLG_(cost_base) + ii->cost_offset + ii->eventset->offset[EG_DW];
	}

	inc_costs(IrRes, cost_Ir,
		  CLG_(current_state).cost + fullOffset(EG_IR) );
	inc_costs(DwRes, cost_Dw,
		  CLG_(current_state).cost + fullOffset(EG_DW) );
    }
}

VG_REGPARM(3)
static void log_0I1Dw(InstrInfo* ii, Addr data_addr, Word data_size)
{
    CacheModelResult DwRes;

    current_ii = ii;
    DwRes = (*simulator.D1_Write)(data_addr, data_size);

    CLG_DEBUG(6, "log_0I1Dw: Dw  %#lx/%lu => %s\n",
	      data_addr, data_size, cacheRes(DwRes));

    if (CLG_(current_state).collect) {
	ULong *cost_Dw;

	if (CLG_(current_state).nonskipped)
	    cost_Dw = CLG_(current_state).nonskipped->skipped + fullOffset(EG_DW);
	else
            cost_Dw = CLG_(cost_base) + ii->cost_offset + ii->eventset->offset[EG_DW];

	inc_costs(DwRes, cost_Dw,
		  CLG_(current_state).cost + fullOffset(EG_DW) );
    }
}



/*------------------------------------------------------------*/
/*--- Cache configuration                                  ---*/
/*------------------------------------------------------------*/

static cache_t clo_I1_cache = UNDEFINED_CACHE;
static cache_t clo_D1_cache = UNDEFINED_CACHE;
static cache_t clo_LL_cache = UNDEFINED_CACHE;

/* Initialize and clear simulator state */
static void cachesim_post_clo_init(void)
{
  /* Cache configurations. */
  cache_t  I1c, D1c, LLc;

  /* Initialize access handlers */
  if (!CLG_(clo).simulate_cache) {
    CLG_(cachesim).log_1I0D  = 0;
    CLG_(cachesim).log_1I0D_name = "(no function)";
    CLG_(cachesim).log_2I0D  = 0;
    CLG_(cachesim).log_2I0D_name = "(no function)";
    CLG_(cachesim).log_3I0D  = 0;
    CLG_(cachesim).log_3I0D_name = "(no function)";

    CLG_(cachesim).log_1I1Dr = 0;
    CLG_(cachesim).log_1I1Dr_name = "(no function)";
    CLG_(cachesim).log_1I1Dw = 0;
    CLG_(cachesim).log_1I1Dw_name = "(no function)";

    CLG_(cachesim).log_0I1Dr = 0;
    CLG_(cachesim).log_0I1Dr_name = "(no function)";
    CLG_(cachesim).log_0I1Dw = 0;
    CLG_(cachesim).log_0I1Dw_name = "(no function)";
    return;
  }

  /* Configuration of caches only needed with real cache simulation */
  VG_(post_clo_init_configure_caches)(&I1c, &D1c, &LLc,
                                      &clo_I1_cache,
                                      &clo_D1_cache,
                                      &clo_LL_cache);

  I1.name = "I1";
  D1.name = "D1";
  LL.name = "LL";

  cachesim_initcache(I1c, &I1);
  cachesim_initcache(D1c, &D1);
  cachesim_initcache(LLc, &LL);

  /* the other cache simulators use the standard helpers
   * with dispatching via simulator struct */

  CLG_(cachesim).log_1I0D  = log_1I0D;
  CLG_(cachesim).log_1I0D_name  = "log_1I0D";
  CLG_(cachesim).log_2I0D  = log_2I0D;
  CLG_(cachesim).log_2I0D_name  = "log_2I0D";
  CLG_(cachesim).log_3I0D  = log_3I0D;
  CLG_(cachesim).log_3I0D_name  = "log_3I0D";

  CLG_(cachesim).log_1I1Dr = log_1I1Dr;
  CLG_(cachesim).log_1I1Dw = log_1I1Dw;
  CLG_(cachesim).log_1I1Dr_name = "log_1I1Dr";
  CLG_(cachesim).log_1I1Dw_name = "log_1I1Dw";

  CLG_(cachesim).log_0I1Dr = log_0I1Dr;
  CLG_(cachesim).log_0I1Dw = log_0I1Dw;
  CLG_(cachesim).log_0I1Dr_name = "log_0I1Dr";
  CLG_(cachesim).log_0I1Dw_name = "log_0I1Dw";

  if (clo_collect_cacheuse) {

      /* Output warning for not supported option combinations */
      if (clo_simulate_hwpref) {
	  VG_(message)(Vg_DebugMsg,
		       "warning: prefetch simulation can not be "
                       "used with cache usage\n");
	  clo_simulate_hwpref = False;
      }

      if (clo_simulate_writeback) {
	  VG_(message)(Vg_DebugMsg,
		       "warning: write-back simulation can not be "
                       "used with cache usage\n");
	  clo_simulate_writeback = False;
      }

      simulator.I1_Read  = cacheuse_I1_doRead;
      simulator.D1_Read  = cacheuse_D1_doRead;
      simulator.D1_Write = cacheuse_D1_doRead;
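      /* The cache use model treats reads and writes alike, so the
       * D1 read handler also serves D1 writes. */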
      return;
  }

  if (clo_simulate_hwpref) {
    prefetch_clear();

    if (clo_simulate_writeback) {
      simulator.I1_Read  = prefetch_I1_Read;
      simulator.D1_Read  = prefetch_D1_Read;
      simulator.D1_Write = prefetch_D1_Write;
    }
    else {
      simulator.I1_Read  = prefetch_I1_ref;
      simulator.D1_Read  = prefetch_D1_ref;
      simulator.D1_Write = prefetch_D1_ref;
    }

    return;
  }

  if (clo_simulate_writeback) {
      simulator.I1_Read  = cachesim_I1_Read;
      simulator.D1_Read  = cachesim_D1_Read;
      simulator.D1_Write = cachesim_D1_Write;
  }
  else {
      simulator.I1_Read  = cachesim_I1_ref;
      simulator.D1_Read  = cachesim_D1_ref;
      simulator.D1_Write = cachesim_D1_ref;
  }
}


/* Clear simulator state. Has to be initialized before */
static
void cachesim_clear(void)
{
  cachesim_clearcache(&I1);
  cachesim_clearcache(&D1);
  cachesim_clearcache(&LL);

  prefetch_clear();
}


static void cachesim_getdesc(Char* buf)
{
  Int p;
  p = VG_(sprintf)(buf, "\ndesc: I1 cache: %s\n", I1.desc_line);
  p += VG_(sprintf)(buf+p, "desc: D1 cache: %s\n", D1.desc_line);
  VG_(sprintf)(buf+p, "desc: LL cache: %s\n", LL.desc_line);
}

static
void cachesim_print_opts(void)
{
  VG_(printf)(
"\n   cache simulator options (does cache simulation if used):\n"
"    --simulate-wb=no|yes      Count write-back events [no]\n"
"    --simulate-hwpref=no|yes  Simulate hardware prefetch [no]\n"
#if CLG_EXPERIMENTAL
"    --simulate-sectors=no|yes Simulate sectored behaviour [no]\n"
#endif
"    --cacheuse=no|yes         Collect cache block use [no]\n");
  VG_(print_cache_clo_opts)();
}

1421 /* Check for command line option for cache configuration.
1422  * Return False if unknown and not handled.
1423  *
1424  * Called from CLG_(process_cmd_line_option)() in clo.c
1425  */
cachesim_parse_opt(Char * arg)1426 static Bool cachesim_parse_opt(Char* arg)
1427 {
1428    if      VG_BOOL_CLO(arg, "--simulate-wb",      clo_simulate_writeback) {}
1429    else if VG_BOOL_CLO(arg, "--simulate-hwpref",  clo_simulate_hwpref)    {}
1430    else if VG_BOOL_CLO(arg, "--simulate-sectors", clo_simulate_sectors)   {}
1431 
1432    else if VG_BOOL_CLO(arg, "--cacheuse", clo_collect_cacheuse) {
1433       if (clo_collect_cacheuse) {
1434          /* Use counters only make sense with fine dumping */
1435          CLG_(clo).dump_instr = True;
1436       }
1437    }
1438 
1439    else if (VG_(str_clo_cache_opt)(arg,
1440                                    &clo_I1_cache,
1441                                    &clo_D1_cache,
1442                                    &clo_LL_cache)) {}
1443 
1444    else
1445      return False;
1446 
1447   return True;
1448 }
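
/* Example (illustrative, not from the original source): invocations that
 * reach this parser might look like
 *
 *   valgrind --tool=callgrind --simulate-wb=yes --simulate-hwpref=yes ...
 *   valgrind --tool=callgrind --cacheuse=yes ...
 *
 * where --cacheuse=yes additionally switches on instruction-level dumping
 * (CLG_(clo).dump_instr), as done above; anything unrecognized falls
 * through and the function returns False. */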

/* Adds commas to ULong, right justifying in a field field_width wide, returns
 * the string in buf. */
static
Int commify(ULong n, int field_width, char* buf)
{
   int len, n_commas, i, j, new_len, space;

   VG_(sprintf)(buf, "%llu", n);
   len = VG_(strlen)(buf);
   n_commas = (len - 1) / 3;
   new_len = len + n_commas;
   space = field_width - new_len;

   /* Allow for printing a number in a field_width smaller than its size */
   if (space < 0) space = 0;

   /* Make j = -1 because we copy the '\0' before doing the numbers in groups
    * of three. */
   for (j = -1, i = len ; i >= 0; i--) {
      buf[i + n_commas + space] = buf[i];

      if ((i>0) && (3 == ++j)) {
         j = 0;
         n_commas--;
         buf[i + n_commas + space] = ',';
      }
   }
   /* Right justify in field. */
   for (i = 0; i < space; i++)  buf[i] = ' ';
   return new_len;
}
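
/* Worked example (descriptive note, not in the original source):
 * commify(1234567, 12, buf) gives len = 7, n_commas = 2, new_len = 9 and
 * space = 3, so buf ends up as "   1,234,567"; the return value is 9,
 * i.e. the width of the commified number without the padding. */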

static
void percentify(Int n, Int ex, Int field_width, char buf[])
{
   int i, len, space;

   VG_(sprintf)(buf, "%d.%d%%", n / ex, n % ex);
   len = VG_(strlen)(buf);
   space = field_width - len;
   if (space < 0) space = 0;     /* Allow for v. small field_width */
   i = len;

   /* Right justify in field */
   for (     ; i >= 0;    i--)  buf[i + space] = buf[i];
   for (i = 0; i < space; i++)  buf[i] = ' ';
}
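
/* Worked example (descriptive note, not in the original source): following
 * the scaled-integer convention of the callers below,
 * percentify(2345, 100, 8, buf) produces "  23.45%". Caveat: the "%d.%d"
 * format does not zero-pad the remainder, so n = 2305 prints as "23.5%"
 * rather than "23.05%". */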

static
void cachesim_printstat(Int l1, Int l2, Int l3)
{
  FullCost total = CLG_(total_cost), D_total = 0;
  ULong LL_total_m, LL_total_mr, LL_total_mw,
    LL_total, LL_total_r, LL_total_w;
  char buf1[RESULTS_BUF_LEN],
    buf2[RESULTS_BUF_LEN],
    buf3[RESULTS_BUF_LEN];
  Int p;

  if ((VG_(clo_verbosity) > 1) && clo_simulate_hwpref) {
    VG_(message)(Vg_DebugMsg, "Prefetch Up:       %llu\n",
                 prefetch_up);
    VG_(message)(Vg_DebugMsg, "Prefetch Down:     %llu\n",
                 prefetch_down);
    VG_(message)(Vg_DebugMsg, "\n");
  }

  commify(total[fullOffset(EG_IR) +1], l1, buf1);
  VG_(message)(Vg_UserMsg, "I1  misses:    %s\n", buf1);

  commify(total[fullOffset(EG_IR) +2], l1, buf1);
  VG_(message)(Vg_UserMsg, "LLi misses:    %s\n", buf1);

  p = 100;

  if (0 == total[fullOffset(EG_IR)])
    total[fullOffset(EG_IR)] = 1;

  percentify(total[fullOffset(EG_IR)+1] * 100 * p /
             total[fullOffset(EG_IR)], p, l1+1, buf1);
  VG_(message)(Vg_UserMsg, "I1  miss rate: %s\n", buf1);

  percentify(total[fullOffset(EG_IR)+2] * 100 * p /
             total[fullOffset(EG_IR)], p, l1+1, buf1);
  VG_(message)(Vg_UserMsg, "LLi miss rate: %s\n", buf1);
  VG_(message)(Vg_UserMsg, "\n");

  /* D cache results. Use the D_refs.rd and D_refs.wr values to determine
   * the width of columns 2 & 3. */

  D_total = CLG_(get_eventset_cost)( CLG_(sets).full );
  CLG_(init_cost)( CLG_(sets).full, D_total);
  // we only use the first 3 values of D_total, adding up Dr and Dw costs
  CLG_(copy_cost)( CLG_(get_event_set)(EG_DR), D_total, total + fullOffset(EG_DR) );
  CLG_(add_cost) ( CLG_(get_event_set)(EG_DW), D_total, total + fullOffset(EG_DW) );

  commify( D_total[0], l1, buf1);
  commify(total[fullOffset(EG_DR)], l2,  buf2);
  commify(total[fullOffset(EG_DW)], l3,  buf3);
  VG_(message)(Vg_UserMsg, "D   refs:      %s  (%s rd + %s wr)\n",
               buf1, buf2, buf3);

  commify( D_total[1], l1, buf1);
  commify(total[fullOffset(EG_DR)+1], l2, buf2);
  commify(total[fullOffset(EG_DW)+1], l3, buf3);
  VG_(message)(Vg_UserMsg, "D1  misses:    %s  (%s rd + %s wr)\n",
               buf1, buf2, buf3);

  commify( D_total[2], l1, buf1);
  commify(total[fullOffset(EG_DR)+2], l2, buf2);
  commify(total[fullOffset(EG_DW)+2], l3, buf3);
  VG_(message)(Vg_UserMsg, "LLd misses:    %s  (%s rd + %s wr)\n",
               buf1, buf2, buf3);

  p = 10;

  if (0 == D_total[0])   D_total[0] = 1;
  if (0 == total[fullOffset(EG_DR)]) total[fullOffset(EG_DR)] = 1;
  if (0 == total[fullOffset(EG_DW)]) total[fullOffset(EG_DW)] = 1;

  percentify( D_total[1] * 100 * p / D_total[0],  p, l1+1, buf1);
  percentify(total[fullOffset(EG_DR)+1] * 100 * p /
             total[fullOffset(EG_DR)], p, l2+1, buf2);
  percentify(total[fullOffset(EG_DW)+1] * 100 * p /
             total[fullOffset(EG_DW)], p, l3+1, buf3);
  VG_(message)(Vg_UserMsg, "D1  miss rate: %s (%s   + %s  )\n",
               buf1, buf2, buf3);

  percentify( D_total[2] * 100 * p / D_total[0],  p, l1+1, buf1);
  percentify(total[fullOffset(EG_DR)+2] * 100 * p /
             total[fullOffset(EG_DR)], p, l2+1, buf2);
  percentify(total[fullOffset(EG_DW)+2] * 100 * p /
             total[fullOffset(EG_DW)], p, l3+1, buf3);
  VG_(message)(Vg_UserMsg, "LLd miss rate: %s (%s   + %s  )\n",
               buf1, buf2, buf3);
  VG_(message)(Vg_UserMsg, "\n");

  /* LL overall results */

  LL_total   =
    total[fullOffset(EG_DR) +1] +
    total[fullOffset(EG_DW) +1] +
    total[fullOffset(EG_IR) +1];
  LL_total_r =
    total[fullOffset(EG_DR) +1] +
    total[fullOffset(EG_IR) +1];
  LL_total_w = total[fullOffset(EG_DW) +1];
  commify(LL_total,   l1, buf1);
  commify(LL_total_r, l2, buf2);
  commify(LL_total_w, l3, buf3);
  VG_(message)(Vg_UserMsg, "LL refs:       %s  (%s rd + %s wr)\n",
               buf1, buf2, buf3);

  LL_total_m  =
    total[fullOffset(EG_DR) +2] +
    total[fullOffset(EG_DW) +2] +
    total[fullOffset(EG_IR) +2];
  LL_total_mr =
    total[fullOffset(EG_DR) +2] +
    total[fullOffset(EG_IR) +2];
  LL_total_mw = total[fullOffset(EG_DW) +2];
  commify(LL_total_m,  l1, buf1);
  commify(LL_total_mr, l2, buf2);
  commify(LL_total_mw, l3, buf3);
  VG_(message)(Vg_UserMsg, "LL misses:     %s  (%s rd + %s wr)\n",
               buf1, buf2, buf3);

  percentify(LL_total_m  * 100 * p /
             (total[fullOffset(EG_IR)] + D_total[0]),  p, l1+1, buf1);
  percentify(LL_total_mr * 100 * p /
             (total[fullOffset(EG_IR)] + total[fullOffset(EG_DR)]),
             p, l2+1, buf2);
  percentify(LL_total_mw * 100 * p /
             total[fullOffset(EG_DW)], p, l3+1, buf3);
  VG_(message)(Vg_UserMsg, "LL miss rate:  %s (%s   + %s  )\n",
               buf1, buf2, buf3);
}
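
/* Shape of the report printed above (illustrative sketch; column widths
 * come from the l1/l2/l3 arguments):
 *   I1  misses:    <n>
 *   LLi misses:    <n>
 *   I1  miss rate: <p>%
 *   LLi miss rate: <p>%
 *   D   refs:      <n>  (<n> rd + <n> wr)
 *   D1  misses:    <n>  (<n> rd + <n> wr)
 *   LLd misses:    <n>  (<n> rd + <n> wr)
 *   ...
 *   LL miss rate:  <p>% (<p>%  + <p>% ) */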


/*------------------------------------------------------------*/
/*--- Setup for Event sets.                                ---*/
/*------------------------------------------------------------*/

struct event_sets CLG_(sets);

void CLG_(init_eventsets)()
{
    // Event groups from which the event sets are composed.
    // The "Use" group is only used with "cacheuse" simulation.
    if (clo_collect_cacheuse)
        CLG_(register_event_group4)(EG_USE,
                                    "AcCost1", "SpLoss1", "AcCost2", "SpLoss2");

    if (!CLG_(clo).simulate_cache)
        CLG_(register_event_group)(EG_IR, "Ir");
    else if (!clo_simulate_writeback) {
        CLG_(register_event_group3)(EG_IR, "Ir", "I1mr", "ILmr");
        CLG_(register_event_group3)(EG_DR, "Dr", "D1mr", "DLmr");
        CLG_(register_event_group3)(EG_DW, "Dw", "D1mw", "DLmw");
    }
    else { // clo_simulate_writeback
        CLG_(register_event_group4)(EG_IR, "Ir", "I1mr", "ILmr", "ILdmr");
        CLG_(register_event_group4)(EG_DR, "Dr", "D1mr", "DLmr", "DLdmr");
        CLG_(register_event_group4)(EG_DW, "Dw", "D1mw", "DLmw", "DLdmw");
    }

    if (CLG_(clo).simulate_branch) {
        CLG_(register_event_group2)(EG_BC, "Bc", "Bcm");
        CLG_(register_event_group2)(EG_BI, "Bi", "Bim");
    }

    if (CLG_(clo).collect_bus)
        CLG_(register_event_group)(EG_BUS, "Ge");

    if (CLG_(clo).collect_alloc)
        CLG_(register_event_group2)(EG_ALLOC, "allocCount", "allocSize");

    if (CLG_(clo).collect_systime)
        CLG_(register_event_group2)(EG_SYS, "sysCount", "sysTime");

    // event set used as base for instruction self cost
    CLG_(sets).base = CLG_(get_event_set2)(EG_USE, EG_IR);

    // event set comprising all event groups, used for inclusive cost
    CLG_(sets).full = CLG_(add_event_group2)(CLG_(sets).base, EG_DR, EG_DW);
    CLG_(sets).full = CLG_(add_event_group2)(CLG_(sets).full, EG_BC, EG_BI);
    CLG_(sets).full = CLG_(add_event_group) (CLG_(sets).full, EG_BUS);
    CLG_(sets).full = CLG_(add_event_group2)(CLG_(sets).full, EG_ALLOC, EG_SYS);

    CLG_DEBUGIF(1) {
        CLG_DEBUG(1, "EventSets:\n");
        CLG_(print_eventset)(-2, CLG_(sets).base);
        CLG_(print_eventset)(-2, CLG_(sets).full);
    }

    /* Nonexistent events are silently ignored */
    CLG_(dumpmap) = CLG_(get_eventmapping)(CLG_(sets).full);
    CLG_(append_event)(CLG_(dumpmap), "Ir");
    CLG_(append_event)(CLG_(dumpmap), "Dr");
    CLG_(append_event)(CLG_(dumpmap), "Dw");
    CLG_(append_event)(CLG_(dumpmap), "I1mr");
    CLG_(append_event)(CLG_(dumpmap), "D1mr");
    CLG_(append_event)(CLG_(dumpmap), "D1mw");
    CLG_(append_event)(CLG_(dumpmap), "ILmr");
    CLG_(append_event)(CLG_(dumpmap), "DLmr");
    CLG_(append_event)(CLG_(dumpmap), "DLmw");
    CLG_(append_event)(CLG_(dumpmap), "ILdmr");
    CLG_(append_event)(CLG_(dumpmap), "DLdmr");
    CLG_(append_event)(CLG_(dumpmap), "DLdmw");
    CLG_(append_event)(CLG_(dumpmap), "Bc");
    CLG_(append_event)(CLG_(dumpmap), "Bcm");
    CLG_(append_event)(CLG_(dumpmap), "Bi");
    CLG_(append_event)(CLG_(dumpmap), "Bim");
    CLG_(append_event)(CLG_(dumpmap), "AcCost1");
    CLG_(append_event)(CLG_(dumpmap), "SpLoss1");
    CLG_(append_event)(CLG_(dumpmap), "AcCost2");
    CLG_(append_event)(CLG_(dumpmap), "SpLoss2");
    CLG_(append_event)(CLG_(dumpmap), "Ge");
    CLG_(append_event)(CLG_(dumpmap), "allocCount");
    CLG_(append_event)(CLG_(dumpmap), "allocSize");
    CLG_(append_event)(CLG_(dumpmap), "sysCount");
    CLG_(append_event)(CLG_(dumpmap), "sysTime");
}
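
/* Resulting composition (descriptive note, not in the original source):
 *   base = EG_USE + EG_IR            (instruction self cost)
 *   full = base + EG_DR + EG_DW + EG_BC + EG_BI + EG_BUS + EG_ALLOC + EG_SYS
 * Groups that were not registered above contribute nothing, and events
 * appended to CLG_(dumpmap) that do not exist in the full set are silently
 * ignored, so one append list covers all option combinations. */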


/* this is called at dump time for every instruction executed */
static void cachesim_add_icost(SimCost cost, BBCC* bbcc,
                               InstrInfo* ii, ULong exe_count)
{
    if (!CLG_(clo).simulate_cache)
        cost[ fullOffset(EG_IR) ] += exe_count;

    if (ii->eventset)
        CLG_(add_and_zero_cost2)( CLG_(sets).full, cost,
                                  ii->eventset, bbcc->cost + ii->cost_offset);
}
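
/* Descriptive note (not in the original source): as the helper name
 * suggests, CLG_(add_and_zero_cost2) adds the event counts accumulated at
 * bbcc->cost + ii->cost_offset into the dump-time cost vector and resets
 * the source counters, so each dump only sees events since the last one. */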

static
void cachesim_finish(void)
{
  if (clo_collect_cacheuse)
    cacheuse_finish();
}

/*------------------------------------------------------------*/
/*--- The simulator defined in this file                   ---*/
/*------------------------------------------------------------*/

struct cachesim_if CLG_(cachesim) = {
  .print_opts    = cachesim_print_opts,
  .parse_opt     = cachesim_parse_opt,
  .post_clo_init = cachesim_post_clo_init,
  .clear         = cachesim_clear,
  .getdesc       = cachesim_getdesc,
  .printstat     = cachesim_printstat,
  .add_icost     = cachesim_add_icost,
  .finish        = cachesim_finish,

  /* these will be set by cachesim_post_clo_init */
  .log_1I0D        = 0,
  .log_2I0D        = 0,
  .log_3I0D        = 0,

  .log_1I1Dr       = 0,
  .log_1I1Dw       = 0,

  .log_0I1Dr       = 0,
  .log_0I1Dw       = 0,

  .log_1I0D_name = "(no function)",
  .log_2I0D_name = "(no function)",
  .log_3I0D_name = "(no function)",

  .log_1I1Dr_name = "(no function)",
  .log_1I1Dw_name = "(no function)",

  .log_0I1Dr_name = "(no function)",
  .log_0I1Dw_name = "(no function)",
};


/*--------------------------------------------------------------------*/
/*--- end                                                    sim.c ---*/
/*--------------------------------------------------------------------*/