/*
 Copyright (C) Intel Corp.  2006.  All Rights Reserved.
 Intel funded Tungsten Graphics to
 develop this 3D driver.

 Permission is hereby granted, free of charge, to any person obtaining
 a copy of this software and associated documentation files (the
 "Software"), to deal in the Software without restriction, including
 without limitation the rights to use, copy, modify, merge, publish,
 distribute, sublicense, and/or sell copies of the Software, and to
 permit persons to whom the Software is furnished to do so, subject to
 the following conditions:

 The above copyright notice and this permission notice (including the
 next paragraph) shall be included in all copies or substantial
 portions of the Software.

 THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
 EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
 MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
 IN NO EVENT SHALL THE COPYRIGHT OWNER(S) AND/OR ITS SUPPLIERS BE
 LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
 OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
 WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.

 **********************************************************************/
/*
 * Authors:
 *   Keith Whitwell <keithw@vmware.com>
 */

/** @file brw_program_cache.c
 *
 * This file implements a simple program cache for 965.  The consumers can
 * query the hash table of programs using a cache_id and program key, and
 * receive the corresponding program buffer object (plus associated auxiliary
 * data) in return.  Objects in the cache may not have relocations
 * (pointers to other BOs) in them.
 *
 * The inner workings are a simple hash table based on an XXH32 hash of the
 * key data.
 *
 * Replacement is not implemented.  Instead, when the cache gets too big,
 * we throw out all of the cache data and let it get regenerated.
 */

#include "main/streaming-load-memcpy.h"
#include "x86/common_x86_asm.h"
#include "intel_batchbuffer.h"
#include "brw_state.h"
#include "brw_wm.h"
#include "brw_gs.h"
#include "brw_cs.h"
#include "brw_program.h"
#include "compiler/brw_eu.h"
#include "util/u_memory.h"
#define XXH_INLINE_ALL
#include "util/xxhash.h"

#define FILE_DEBUG_FLAG DEBUG_STATE

struct brw_cache_item {
   /**
    * Effectively part of the key, cache_id identifies what kind of state
    * buffer is involved, and also which dirty flag should be set.
    */
   enum brw_cache_id cache_id;

   /** 32-bit hash of the key data */
   GLuint hash;

   /** for variable-sized keys */
   GLuint key_size;
   GLuint prog_data_size;
   const struct brw_base_prog_key *key;

   uint32_t offset;
   uint32_t size;

   struct brw_cache_item *next;
};

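/**
 * Maps a shader stage to the cache_id under which its compiled programs
 * are stored.
 */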
enum brw_cache_id
brw_stage_cache_id(gl_shader_stage stage)
{
   static const enum brw_cache_id stage_ids[] = {
      BRW_CACHE_VS_PROG,
      BRW_CACHE_TCS_PROG,
      BRW_CACHE_TES_PROG,
      BRW_CACHE_GS_PROG,
      BRW_CACHE_FS_PROG,
      BRW_CACHE_CS_PROG,
   };
   assert((int)stage >= 0 && stage < ARRAY_SIZE(stage_ids));
   return stage_ids[stage];
}

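/**
 * Hashes the cache_id and the variable-sized key data with XXH32, chaining
 * the two hashes so that items with identical keys but different cache_ids
 * still hash differently.
 */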
static GLuint
hash_key(struct brw_cache_item *item)
{
   uint32_t hash = 0;
   hash = XXH32(&item->cache_id, sizeof(item->cache_id), hash);
   hash = XXH32(item->key, item->key_size, hash);

   return hash;
}

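/**
 * Items are equal when cache_id, hash, key size, and key contents all
 * match; the hash comparison is a cheap early-out before the memcmp.
 */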
static int
brw_cache_item_equals(const struct brw_cache_item *a,
                      const struct brw_cache_item *b)
{
   return a->cache_id == b->cache_id &&
          a->hash == b->hash &&
          a->key_size == b->key_size &&
          (memcmp(a->key, b->key, a->key_size) == 0);
}

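/**
 * Walks the bucket for \p hash looking for an item equal to \p lookup.
 * The disabled block below can be enabled to print per-bucket occupancy
 * when tuning the table size.
 */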
static struct brw_cache_item *
search_cache(struct brw_cache *cache, GLuint hash,
             struct brw_cache_item *lookup)
{
   struct brw_cache_item *c;

#if 0
   int bucketcount = 0;

   for (c = cache->items[hash % cache->size]; c; c = c->next)
      bucketcount++;

   fprintf(stderr, "bucket %d/%d = %d/%d items\n", hash % cache->size,
           cache->size, bucketcount, cache->n_items);
#endif

   for (c = cache->items[hash % cache->size]; c; c = c->next) {
      if (brw_cache_item_equals(lookup, c))
         return c;
   }

   return NULL;
}


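/**
 * Triples the number of hash buckets and redistributes the existing items.
 * Items are relinked in place; no keys or program data are copied.
 */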
static void
rehash(struct brw_cache *cache)
{
   struct brw_cache_item **items;
   struct brw_cache_item *c, *next;
   GLuint size, i;

   size = cache->size * 3;
   items = calloc(size, sizeof(*items));

   for (i = 0; i < cache->size; i++)
      for (c = cache->items[i]; c; c = next) {
         next = c->next;
         c->next = items[c->hash % size];
         items[c->hash % size] = c;
      }

   free(cache->items);
   cache->items = items;
   cache->size = size;
}


/**
 * Looks up the cache entry matching cache_id and key.  On a hit, fills in
 * *inout_offset and *inout_prog_data (flagging the cache_id's dirty bit if
 * they changed and flag_state is set) and returns true; otherwise returns
 * false.
 */
bool
brw_search_cache(struct brw_cache *cache, enum brw_cache_id cache_id,
                 const void *key, GLuint key_size, uint32_t *inout_offset,
                 void *inout_prog_data, bool flag_state)
{
   struct brw_cache_item *item;
   struct brw_cache_item lookup;
   GLuint hash;

   lookup.cache_id = cache_id;
   lookup.key = key;
   lookup.key_size = key_size;
   hash = hash_key(&lookup);
   lookup.hash = hash;

   item = search_cache(cache, hash, &lookup);

   if (item == NULL)
      return false;

   void *prog_data = ((char *) item->key) + item->key_size;

   if (item->offset != *inout_offset ||
       prog_data != *((void **) inout_prog_data)) {
      if (likely(flag_state))
         cache->brw->ctx.NewDriverState |= (1 << cache_id);
      *inout_offset = item->offset;
      *((void **) inout_prog_data) = prog_data;
   }

   return true;
}

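/**
 * Replaces the cache BO with a larger allocation, copying over any
 * programs already uploaded and flagging the state that depends on the
 * program cache's base address.
 */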
static void
brw_cache_new_bo(struct brw_cache *cache, uint32_t new_size)
{
   struct brw_context *brw = cache->brw;
   struct brw_bo *new_bo;

   perf_debug("Copying to larger program cache: %u kB -> %u kB\n",
              (unsigned) cache->bo->size / 1024, new_size / 1024);

   new_bo = brw_bo_alloc(brw->bufmgr, "program cache", new_size,
                         BRW_MEMZONE_SHADER);
   if (can_do_exec_capture(brw->screen))
      new_bo->kflags |= EXEC_OBJECT_CAPTURE;

   void *map = brw_bo_map(brw, new_bo, MAP_READ | MAP_WRITE |
                                       MAP_ASYNC | MAP_PERSISTENT);

   /* Copy any existing data that needs to be saved. */
   if (cache->next_offset != 0) {
#ifdef USE_SSE41
      if (!cache->bo->cache_coherent && cpu_has_sse4_1)
         _mesa_streaming_load_memcpy(map, cache->map, cache->next_offset);
      else
#endif
         memcpy(map, cache->map, cache->next_offset);
   }

   brw_bo_unmap(cache->bo);
   brw_bo_unreference(cache->bo);
   cache->bo = new_bo;
   cache->map = map;

   /* Since we have a new BO in place, we need to signal the units
    * that depend on it (state base address on gen5+, or unit state before).
    */
   brw->ctx.NewDriverState |= BRW_NEW_PROGRAM_CACHE;
   brw->batch.state_base_address_emitted = false;
}

/**
 * Attempts to find an item in the cache with identical data.
 */
static const struct brw_cache_item *
brw_lookup_prog(const struct brw_cache *cache,
                enum brw_cache_id cache_id,
                const void *data, unsigned data_size)
{
   unsigned i;
   const struct brw_cache_item *item;

   for (i = 0; i < cache->size; i++) {
      for (item = cache->items[i]; item; item = item->next) {
         if (item->cache_id != cache_id || item->size != data_size ||
             memcmp(cache->map + item->offset, data, item->size) != 0)
            continue;

         return item;
      }
   }

   return NULL;
}

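/**
 * Reserves \p size bytes in the cache BO, doubling the BO as needed, and
 * returns the offset of the allocation.  Program starts stay 64-byte
 * aligned.
 */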
static uint32_t
brw_alloc_item_data(struct brw_cache *cache, uint32_t size)
{
   uint32_t offset;

   /* Allocate space in the cache BO for our new program. */
   if (cache->next_offset + size > cache->bo->size) {
      uint32_t new_size = cache->bo->size * 2;

      while (cache->next_offset + size > new_size)
         new_size *= 2;

      brw_cache_new_bo(cache, new_size);
   }

   offset = cache->next_offset;

   /* Programs are always 64-byte aligned, so set up the next one now */
   cache->next_offset = ALIGN(offset + size, 64);

   return offset;
}

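/**
 * Searches the cache for any previously compiled variant of the program
 * identified by \p program_string_id, returning its key (or NULL).
 */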
const void *
brw_find_previous_compile(struct brw_cache *cache,
                          enum brw_cache_id cache_id,
                          unsigned program_string_id)
{
   for (unsigned i = 0; i < cache->size; i++) {
      for (struct brw_cache_item *c = cache->items[i]; c; c = c->next) {
         if (c->cache_id == cache_id &&
             c->key->program_string_id == program_string_id) {
            return c->key;
         }
      }
   }

   return NULL;
}

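/**
 * Uploads a new program to the cache: copies the assembly into the cache
 * BO (unless an identical program is already there), stores the key and
 * prog_data alongside the item, and links the item into the hash table.
 */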
void
brw_upload_cache(struct brw_cache *cache,
                 enum brw_cache_id cache_id,
                 const void *key,
                 GLuint key_size,
                 const void *data,
                 GLuint data_size,
                 const void *prog_data,
                 GLuint prog_data_size,
                 uint32_t *out_offset,
                 void *out_prog_data)
{
   struct brw_cache_item *item = CALLOC_STRUCT(brw_cache_item);
   const struct brw_cache_item *matching_data =
      brw_lookup_prog(cache, cache_id, data, data_size);
   GLuint hash;
   void *tmp;

   item->cache_id = cache_id;
   item->size = data_size;
   item->key = key;
   item->key_size = key_size;
   item->prog_data_size = prog_data_size;
   hash = hash_key(item);
   item->hash = hash;

   /* If we can find a matching prog in the cache already, then reuse the
    * existing entry without creating a new copy of the data in the
    * underlying buffer object.  This is notably useful for programs
    * generating shaders at runtime, where multiple shaders may compile to
    * the same thing in our backend.
    */
   if (matching_data) {
      item->offset = matching_data->offset;
   } else {
      item->offset = brw_alloc_item_data(cache, data_size);

      /* Copy data to the buffer */
      memcpy(cache->map + item->offset, data, data_size);
   }

   /* Set up the memory containing the key and prog_data */
   tmp = malloc(key_size + prog_data_size);

   memcpy(tmp, key, key_size);
   memcpy(tmp + key_size, prog_data, prog_data_size);

   item->key = tmp;

   if (cache->n_items > cache->size * 1.5f)
      rehash(cache);

   hash %= cache->size;
   item->next = cache->items[hash];
   cache->items[hash] = item;
   cache->n_items++;

   *out_offset = item->offset;
   *(void **)out_prog_data = (void *)((char *)item->key + item->key_size);
   cache->brw->ctx.NewDriverState |= 1 << cache_id;
}

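/**
 * One-time initialization of the cache: an empty hash table and an
 * initial 16 kB program BO, kept persistently mapped.
 */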
void
brw_init_caches(struct brw_context *brw)
{
   struct brw_cache *cache = &brw->cache;

   cache->brw = brw;

   cache->size = 7;
   cache->n_items = 0;
   cache->items =
      calloc(cache->size, sizeof(struct brw_cache_item *));

   cache->bo = brw_bo_alloc(brw->bufmgr, "program cache", 16384,
                            BRW_MEMZONE_SHADER);
   if (can_do_exec_capture(brw->screen))
      cache->bo->kflags |= EXEC_OBJECT_CAPTURE;

   cache->map = brw_bo_map(brw, cache->bo, MAP_READ | MAP_WRITE |
                                           MAP_ASYNC | MAP_PERSISTENT);
}

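/**
 * Throws out all the cache contents and resets the BO to empty, forcing
 * every program to be regenerated on its next use.
 */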
static void
brw_clear_cache(struct brw_context *brw, struct brw_cache *cache)
{
   struct brw_cache_item *c, *next;
   GLuint i;

   DBG("%s\n", __func__);

   for (i = 0; i < cache->size; i++) {
      for (c = cache->items[i]; c; c = next) {
         next = c->next;
         if (c->cache_id == BRW_CACHE_VS_PROG ||
             c->cache_id == BRW_CACHE_TCS_PROG ||
             c->cache_id == BRW_CACHE_TES_PROG ||
             c->cache_id == BRW_CACHE_GS_PROG ||
             c->cache_id == BRW_CACHE_FS_PROG ||
             c->cache_id == BRW_CACHE_CS_PROG) {
            const void *item_prog_data = ((char *)c->key) + c->key_size;
            brw_stage_prog_data_free(item_prog_data);
         }
         free((void *)c->key);
         free(c);
      }
      cache->items[i] = NULL;
   }

   cache->n_items = 0;

   /* Start putting programs into the start of the BO again, since
    * we'll never find the old results.
    */
   cache->next_offset = 0;

   /* We need to make sure that the programs get regenerated, since
    * any offsets leftover in brw_context will no longer be valid.
    */
   brw->NewGLState = ~0;
   brw->ctx.NewDriverState = ~0ull;
   brw->state.pipelines[BRW_RENDER_PIPELINE].mesa = ~0;
   brw->state.pipelines[BRW_RENDER_PIPELINE].brw = ~0ull;
   brw->state.pipelines[BRW_COMPUTE_PIPELINE].mesa = ~0;
   brw->state.pipelines[BRW_COMPUTE_PIPELINE].brw = ~0ull;

   /* Also, NULL out any stale program pointers. */
   brw->vs.base.prog_data = NULL;
   brw->tcs.base.prog_data = NULL;
   brw->tes.base.prog_data = NULL;
   brw->gs.base.prog_data = NULL;
   brw->wm.base.prog_data = NULL;
   brw->cs.base.prog_data = NULL;

   intel_batchbuffer_flush(brw);
}

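/**
 * Implements the replacement policy described at the top of the file:
 * once the number of cached programs crosses a limit, throw everything
 * out and let it be regenerated.
 */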
void
brw_program_cache_check_size(struct brw_context *brw)
{
   /* An un-tuned guess.  Each object is generally a page, so 2000 of them
    * is 8 MB of state cache.
    */
   if (brw->cache.n_items > 2000) {
      perf_debug("Exceeded state cache size limit.  Clearing the set "
                 "of compiled programs, which will trigger recompiles\n");
      brw_clear_cache(brw, &brw->cache);
      brw_cache_new_bo(&brw->cache, brw->cache.bo->size);
   }
}


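/**
 * Frees the cache BO, all items, and the hash table itself.
 */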
static void
brw_destroy_cache(struct brw_context *brw, struct brw_cache *cache)
{
   DBG("%s\n", __func__);

   /* This can be NULL if context creation failed early on */
   if (cache->bo) {
      brw_bo_unmap(cache->bo);
      brw_bo_unreference(cache->bo);
      cache->bo = NULL;
      cache->map = NULL;
   }
   brw_clear_cache(brw, cache);
   free(cache->items);
   cache->items = NULL;
   cache->size = 0;
}


void
brw_destroy_caches(struct brw_context *brw)
{
   brw_destroy_cache(brw, &brw->cache);
}

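/* Human-readable cache_id names for debug output. */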
static const char *
cache_name(enum brw_cache_id cache_id)
{
   switch (cache_id) {
   case BRW_CACHE_VS_PROG:
      return "VS kernel";
   case BRW_CACHE_TCS_PROG:
      return "TCS kernel";
   case BRW_CACHE_TES_PROG:
      return "TES kernel";
   case BRW_CACHE_FF_GS_PROG:
      return "Fixed-function GS kernel";
   case BRW_CACHE_GS_PROG:
      return "GS kernel";
   case BRW_CACHE_CLIP_PROG:
      return "CLIP kernel";
   case BRW_CACHE_SF_PROG:
      return "SF kernel";
   case BRW_CACHE_FS_PROG:
      return "FS kernel";
   case BRW_CACHE_CS_PROG:
      return "CS kernel";
   default:
      return "unknown";
   }
}

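/**
 * Dumps a disassembly of every program in the cache to stderr.
 */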
void
brw_print_program_cache(struct brw_context *brw)
{
   const struct brw_cache *cache = &brw->cache;
   struct brw_cache_item *item;

   for (unsigned i = 0; i < cache->size; i++) {
      for (item = cache->items[i]; item; item = item->next) {
         /* i is the hash bucket index; name the item by its cache_id. */
         fprintf(stderr, "%s:\n", cache_name(item->cache_id));
         brw_disassemble_with_labels(&brw->screen->devinfo, cache->map,
                                     item->offset, item->size, stderr);
      }
   }
}