• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 /*
2  Copyright (C) Intel Corp.  2006.  All Rights Reserved.
3  Intel funded Tungsten Graphics to
4  develop this 3D driver.
5 
6  Permission is hereby granted, free of charge, to any person obtaining
7  a copy of this software and associated documentation files (the
8  "Software"), to deal in the Software without restriction, including
9  without limitation the rights to use, copy, modify, merge, publish,
10  distribute, sublicense, and/or sell copies of the Software, and to
11  permit persons to whom the Software is furnished to do so, subject to
12  the following conditions:
13 
14  The above copyright notice and this permission notice (including the
15  next paragraph) shall be included in all copies or substantial
16  portions of the Software.
17 
18  THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
19  EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
20  MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
21  IN NO EVENT SHALL THE COPYRIGHT OWNER(S) AND/OR ITS SUPPLIERS BE
22  LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
23  OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
24  WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
25 
26  **********************************************************************/
27  /*
28   * Authors:
29   *   Keith Whitwell <keithw@vmware.com>
30   */
31 
32 /** @file brw_program_cache.c
33  *
34  * This file implements a simple program cache for 965.  The consumers can
35  * query the hash table of programs using a cache_id and program key, and
36  * receive the corresponding program buffer object (plus associated auxiliary
37  * data) in return.  Objects in the cache may not have relocations
38  * (pointers to other BOs) in them.
39  *
40  * The inner workings are a simple hash table based on a CRC of the
41  * key data.
42  *
43  * Replacement is not implemented.  Instead, when the cache gets too
44  * big we throw out all of the cache data and let it get regenerated.
45  */
46 
47 #include "main/imports.h"
48 #include "intel_batchbuffer.h"
49 #include "brw_state.h"
50 #include "brw_vs.h"
51 #include "brw_wm.h"
52 #include "brw_gs.h"
53 #include "brw_cs.h"
54 #include "brw_program.h"
55 
56 #define FILE_DEBUG_FLAG DEBUG_STATE
57 
struct brw_cache_item {
   /**
    * Effectively part of the key, cache_id identifies what kind of state
    * buffer is involved, and also which dirty flag should be set.
    */
   enum brw_cache_id cache_id;

   /** 32-bit hash of the key data */
   GLuint hash;

   /** for variable-sized keys */
   GLuint key_size;
   GLuint aux_size;
   /* Owned by the cache: points at malloc'd storage holding the key data
    * immediately followed by the aux data (freed in brw_clear_cache). */
   const void *key;

   /* Byte offset of the program data within the cache BO, and its size. */
   uint32_t offset;
   uint32_t size;

   /* Next item in the same hash bucket (separate chaining). */
   struct brw_cache_item *next;
};
78 
79 static unsigned
get_program_string_id(enum brw_cache_id cache_id,const void * key)80 get_program_string_id(enum brw_cache_id cache_id, const void *key)
81 {
82    switch (cache_id) {
83    case BRW_CACHE_VS_PROG:
84       return ((struct brw_vs_prog_key *) key)->program_string_id;
85    case BRW_CACHE_TCS_PROG:
86       return ((struct brw_tcs_prog_key *) key)->program_string_id;
87    case BRW_CACHE_TES_PROG:
88       return ((struct brw_tes_prog_key *) key)->program_string_id;
89    case BRW_CACHE_GS_PROG:
90       return ((struct brw_gs_prog_key *) key)->program_string_id;
91    case BRW_CACHE_CS_PROG:
92       return ((struct brw_cs_prog_key *) key)->program_string_id;
93    case BRW_CACHE_FS_PROG:
94       return ((struct brw_wm_prog_key *) key)->program_string_id;
95    default:
96       unreachable("no program string id for this kind of program");
97    }
98 }
99 
100 static GLuint
hash_key(struct brw_cache_item * item)101 hash_key(struct brw_cache_item *item)
102 {
103    GLuint *ikey = (GLuint *)item->key;
104    GLuint hash = item->cache_id, i;
105 
106    assert(item->key_size % 4 == 0);
107 
108    /* I'm sure this can be improved on:
109     */
110    for (i = 0; i < item->key_size/4; i++) {
111       hash ^= ikey[i];
112       hash = (hash << 5) | (hash >> 27);
113    }
114 
115    return hash;
116 }
117 
118 static int
brw_cache_item_equals(const struct brw_cache_item * a,const struct brw_cache_item * b)119 brw_cache_item_equals(const struct brw_cache_item *a,
120                       const struct brw_cache_item *b)
121 {
122    return a->cache_id == b->cache_id &&
123       a->hash == b->hash &&
124       a->key_size == b->key_size &&
125       (memcmp(a->key, b->key, a->key_size) == 0);
126 }
127 
128 static struct brw_cache_item *
search_cache(struct brw_cache * cache,GLuint hash,struct brw_cache_item * lookup)129 search_cache(struct brw_cache *cache, GLuint hash,
130              struct brw_cache_item *lookup)
131 {
132    struct brw_cache_item *c;
133 
134 #if 0
135    int bucketcount = 0;
136 
137    for (c = cache->items[hash % cache->size]; c; c = c->next)
138       bucketcount++;
139 
140    fprintf(stderr, "bucket %d/%d = %d/%d items\n", hash % cache->size,
141            cache->size, bucketcount, cache->n_items);
142 #endif
143 
144    for (c = cache->items[hash % cache->size]; c; c = c->next) {
145       if (brw_cache_item_equals(lookup, c))
146          return c;
147    }
148 
149    return NULL;
150 }
151 
152 
153 static void
rehash(struct brw_cache * cache)154 rehash(struct brw_cache *cache)
155 {
156    struct brw_cache_item **items;
157    struct brw_cache_item *c, *next;
158    GLuint size, i;
159 
160    size = cache->size * 3;
161    items = calloc(size, sizeof(*items));
162 
163    for (i = 0; i < cache->size; i++)
164       for (c = cache->items[i]; c; c = next) {
165          next = c->next;
166          c->next = items[c->hash % size];
167          items[c->hash % size] = c;
168       }
169 
170    free(cache->items);
171    cache->items = items;
172    cache->size = size;
173 }
174 
175 
176 /**
177  * Returns the buffer object matching cache_id and key, or NULL.
178  */
179 bool
brw_search_cache(struct brw_cache * cache,enum brw_cache_id cache_id,const void * key,GLuint key_size,uint32_t * inout_offset,void * inout_aux)180 brw_search_cache(struct brw_cache *cache,
181                  enum brw_cache_id cache_id,
182                  const void *key, GLuint key_size,
183                  uint32_t *inout_offset, void *inout_aux)
184 {
185    struct brw_context *brw = cache->brw;
186    struct brw_cache_item *item;
187    struct brw_cache_item lookup;
188    GLuint hash;
189 
190    lookup.cache_id = cache_id;
191    lookup.key = key;
192    lookup.key_size = key_size;
193    hash = hash_key(&lookup);
194    lookup.hash = hash;
195 
196    item = search_cache(cache, hash, &lookup);
197 
198    if (item == NULL)
199       return false;
200 
201    void *aux = ((char *) item->key) + item->key_size;
202 
203    if (item->offset != *inout_offset || aux != *((void **) inout_aux)) {
204       brw->ctx.NewDriverState |= (1 << cache_id);
205       *inout_offset = item->offset;
206       *((void **) inout_aux) = aux;
207    }
208 
209    return true;
210 }
211 
/**
 * Replace the cache's program BO with a new one of new_size bytes,
 * copying over all bytes currently in use ([0, next_offset)).
 *
 * Item offsets stay valid since data keeps its position in the new BO.
 * Dirties BRW_NEW_PROGRAM_CACHE so state base address / unit state is
 * re-emitted against the new buffer.
 */
static void
brw_cache_new_bo(struct brw_cache *cache, uint32_t new_size)
{
   struct brw_context *brw = cache->brw;
   drm_intel_bo *new_bo;

   new_bo = drm_intel_bo_alloc(brw->bufmgr, "program cache", new_size, 64);
   /* On LLC the cache BO is kept persistently mapped (see brw_init_caches). */
   if (brw->has_llc)
      drm_intel_gem_bo_map_unsynchronized(new_bo);

   /* Copy any existing data that needs to be saved. */
   if (cache->next_offset != 0) {
      if (brw->has_llc) {
         /* Both BOs are CPU-mapped; a plain memcpy suffices. */
         memcpy(new_bo->virtual, cache->bo->virtual, cache->next_offset);
      } else {
         /* Map the old BO read-only and push its contents into the new
          * BO via subdata, so new_bo never needs a CPU mapping here. */
         drm_intel_bo_map(cache->bo, false);
         drm_intel_bo_subdata(new_bo, 0, cache->next_offset,
                              cache->bo->virtual);
         drm_intel_bo_unmap(cache->bo);
      }
   }

   /* Drop the persistent mapping of the old BO before releasing it. */
   if (brw->has_llc)
      drm_intel_bo_unmap(cache->bo);
   drm_intel_bo_unreference(cache->bo);
   cache->bo = new_bo;
   cache->bo_used_by_gpu = false;

   /* Since we have a new BO in place, we need to signal the units
    * that depend on it (state base address on gen5+, or unit state before).
    */
   brw->ctx.NewDriverState |= BRW_NEW_PROGRAM_CACHE;
   brw->batch.state_base_address_emitted = false;
}
246 
247 /**
248  * Attempts to find an item in the cache with identical data.
249  */
250 static const struct brw_cache_item *
brw_lookup_prog(const struct brw_cache * cache,enum brw_cache_id cache_id,const void * data,unsigned data_size)251 brw_lookup_prog(const struct brw_cache *cache,
252                 enum brw_cache_id cache_id,
253                 const void *data, unsigned data_size)
254 {
255    const struct brw_context *brw = cache->brw;
256    unsigned i;
257    const struct brw_cache_item *item;
258 
259    for (i = 0; i < cache->size; i++) {
260       for (item = cache->items[i]; item; item = item->next) {
261          int ret;
262 
263          if (item->cache_id != cache_id || item->size != data_size)
264             continue;
265 
266          if (!brw->has_llc)
267             drm_intel_bo_map(cache->bo, false);
268          ret = memcmp(cache->bo->virtual + item->offset, data, item->size);
269          if (!brw->has_llc)
270             drm_intel_bo_unmap(cache->bo);
271          if (ret)
272             continue;
273 
274          return item;
275       }
276    }
277 
278    return NULL;
279 }
280 
/**
 * Reserve 'size' bytes of space in the cache BO and return its offset.
 *
 * Grows the BO (doubling) when full, and recreates it when a non-LLC
 * write would stall on the GPU.  The returned offset is always 64-byte
 * aligned (alignment applied when computing the *next* offset).
 */
static uint32_t
brw_alloc_item_data(struct brw_cache *cache, uint32_t size)
{
   uint32_t offset;
   struct brw_context *brw = cache->brw;

   /* Allocate space in the cache BO for our new program. */
   if (cache->next_offset + size > cache->bo->size) {
      uint32_t new_size = cache->bo->size * 2;

      /* Keep doubling until the request fits. */
      while (cache->next_offset + size > new_size)
         new_size *= 2;

      brw_cache_new_bo(cache, new_size);
   }

   /* If we would block on writing to an in-use program BO, just
    * recreate it.
    */
   /* Note: if we grew above, brw_cache_new_bo already cleared
    * bo_used_by_gpu, so this second copy is skipped in that case. */
   if (!brw->has_llc && cache->bo_used_by_gpu) {
      perf_debug("Copying busy program cache buffer.\n");
      brw_cache_new_bo(cache, cache->bo->size);
   }

   offset = cache->next_offset;

   /* Programs are always 64-byte aligned, so set up the next one now */
   cache->next_offset = ALIGN(offset + size, 64);

   return offset;
}
312 
313 const void *
brw_find_previous_compile(struct brw_cache * cache,enum brw_cache_id cache_id,unsigned program_string_id)314 brw_find_previous_compile(struct brw_cache *cache,
315                           enum brw_cache_id cache_id,
316                           unsigned program_string_id)
317 {
318    for (unsigned i = 0; i < cache->size; i++) {
319       for (struct brw_cache_item *c = cache->items[i]; c; c = c->next) {
320          if (c->cache_id == cache_id &&
321              get_program_string_id(cache_id, c->key) == program_string_id) {
322             return c->key;
323          }
324       }
325    }
326 
327    return NULL;
328 }
329 
/**
 * Insert a newly-compiled program into the cache.
 *
 * Stores the program binary in the cache BO (sharing an existing copy
 * when brw_lookup_prog finds byte-identical data), makes a heap copy of
 * key + aux, and links a new item into the hash table.  Returns the BO
 * offset via *out_offset and a pointer to the stored aux data via
 * *out_aux, and dirties the state flag for cache_id.
 */
void
brw_upload_cache(struct brw_cache *cache,
                 enum brw_cache_id cache_id,
                 const void *key,
                 GLuint key_size,
                 const void *data,
                 GLuint data_size,
                 const void *aux,
                 GLuint aux_size,
                 uint32_t *out_offset,
                 void *out_aux)
{
   struct brw_context *brw = cache->brw;
   /* NOTE(review): CALLOC_STRUCT and the malloc below are not
    * NULL-checked; an OOM here would crash — confirm this matches the
    * driver's allocation policy. */
   struct brw_cache_item *item = CALLOC_STRUCT(brw_cache_item);
   const struct brw_cache_item *matching_data =
      brw_lookup_prog(cache, cache_id, data, data_size);
   GLuint hash;
   void *tmp;

   /* item->key temporarily aliases the caller's key so hash_key() can see
    * it; it is replaced further down with a heap copy the cache owns. */
   item->cache_id = cache_id;
   item->size = data_size;
   item->key = key;
   item->key_size = key_size;
   item->aux_size = aux_size;
   hash = hash_key(item);
   item->hash = hash;

   /* If we can find a matching prog in the cache already, then reuse the
    * existing stuff without creating new copy into the underlying buffer
    * object. This is notably useful for programs generating shaders at
    * runtime, where multiple shaders may compile to the same thing in our
    * backend.
    */
   if (matching_data) {
      item->offset = matching_data->offset;
   } else {
      item->offset = brw_alloc_item_data(cache, data_size);

      /* Copy data to the buffer */
      if (brw->has_llc) {
         memcpy((char *)cache->bo->virtual + item->offset, data, data_size);
      } else {
         drm_intel_bo_subdata(cache->bo, item->offset, data_size, data);
      }
   }

   /* Set up the memory containing the key and aux_data */
   tmp = malloc(key_size + aux_size);

   memcpy(tmp, key, key_size);
   memcpy(tmp + key_size, aux, aux_size);

   item->key = tmp;

   /* Grow the table once the load factor exceeds 1.5. */
   if (cache->n_items > cache->size * 1.5f)
      rehash(cache);

   /* Reduce the hash only after the potential rehash, so the bucket index
    * is computed against the current table size. */
   hash %= cache->size;
   item->next = cache->items[hash];
   cache->items[hash] = item;
   cache->n_items++;

   *out_offset = item->offset;
   *(void **)out_aux = (void *)((char *)item->key + item->key_size);
   cache->brw->ctx.NewDriverState |= 1 << cache_id;
}
396 
397 void
brw_init_caches(struct brw_context * brw)398 brw_init_caches(struct brw_context *brw)
399 {
400    struct brw_cache *cache = &brw->cache;
401 
402    cache->brw = brw;
403 
404    cache->size = 7;
405    cache->n_items = 0;
406    cache->items =
407       calloc(cache->size, sizeof(struct brw_cache_item *));
408 
409    cache->bo = drm_intel_bo_alloc(brw->bufmgr, "program cache",  4096, 64);
410    if (brw->has_llc)
411       drm_intel_gem_bo_map_unsynchronized(cache->bo);
412 }
413 
/**
 * Throw away every cached program and reset the BO allocator.
 *
 * All driver state is flagged dirty and stale prog_data pointers are
 * NULLed so everything gets recompiled/re-uploaded; the batch is flushed
 * since queued commands may reference old cache offsets.
 */
static void
brw_clear_cache(struct brw_context *brw, struct brw_cache *cache)
{
   struct brw_cache_item *c, *next;
   GLuint i;

   DBG("%s\n", __func__);

   for (i = 0; i < cache->size; i++) {
      for (c = cache->items[i]; c; c = next) {
         next = c->next;
         /* Shader-stage items keep a brw_stage_prog_data in their aux
          * block (stored right after the key), which owns further
          * allocations and needs a deep free. */
         if (c->cache_id == BRW_CACHE_VS_PROG ||
             c->cache_id == BRW_CACHE_TCS_PROG ||
             c->cache_id == BRW_CACHE_TES_PROG ||
             c->cache_id == BRW_CACHE_GS_PROG ||
             c->cache_id == BRW_CACHE_FS_PROG ||
             c->cache_id == BRW_CACHE_CS_PROG) {
            const void *item_aux = c->key + c->key_size;
            brw_stage_prog_data_free(item_aux);
         }
         free((void *)c->key);
         free(c);
      }
      cache->items[i] = NULL;
   }

   cache->n_items = 0;

   /* Start putting programs into the start of the BO again, since
    * we'll never find the old results.
    */
   cache->next_offset = 0;

   /* We need to make sure that the programs get regenerated, since
    * any offsets leftover in brw_context will no longer be valid.
    */
   brw->NewGLState = ~0;
   brw->ctx.NewDriverState = ~0ull;
   brw->state.pipelines[BRW_RENDER_PIPELINE].mesa = ~0;
   brw->state.pipelines[BRW_RENDER_PIPELINE].brw = ~0ull;
   brw->state.pipelines[BRW_COMPUTE_PIPELINE].mesa = ~0;
   brw->state.pipelines[BRW_COMPUTE_PIPELINE].brw = ~0ull;

   /* Also, NULL out any stale program pointers. */
   brw->vs.base.prog_data = NULL;
   brw->tcs.base.prog_data = NULL;
   brw->tes.base.prog_data = NULL;
   brw->gs.base.prog_data = NULL;
   brw->wm.base.prog_data = NULL;
   brw->cs.base.prog_data = NULL;

   intel_batchbuffer_flush(brw);
}
467 
468 void
brw_program_cache_check_size(struct brw_context * brw)469 brw_program_cache_check_size(struct brw_context *brw)
470 {
471    /* un-tuned guess.  Each object is generally a page, so 2000 of them is 8 MB of
472     * state cache.
473     */
474    if (brw->cache.n_items > 2000) {
475       perf_debug("Exceeded state cache size limit.  Clearing the set "
476                  "of compiled programs, which will trigger recompiles\n");
477       brw_clear_cache(brw, &brw->cache);
478    }
479 }
480 
481 
/**
 * Tear down one program cache: release the BO (dropping the persistent
 * LLC mapping first), free all items via brw_clear_cache, then free the
 * hash table itself.
 */
static void
brw_destroy_cache(struct brw_context *brw, struct brw_cache *cache)
{

   DBG("%s\n", __func__);

   if (brw->has_llc)
      drm_intel_bo_unmap(cache->bo);
   drm_intel_bo_unreference(cache->bo);
   cache->bo = NULL;
   /* Frees every item; also dirties all state and flushes the batch. */
   brw_clear_cache(brw, cache);
   free(cache->items);
   cache->items = NULL;
   cache->size = 0;
}
497 
498 
/** Destroy the context's (single) program cache at teardown. */
void
brw_destroy_caches(struct brw_context *brw)
{
   brw_destroy_cache(brw, &brw->cache);
}
504 
505 static const char *
cache_name(enum brw_cache_id cache_id)506 cache_name(enum brw_cache_id cache_id)
507 {
508    switch (cache_id) {
509    case BRW_CACHE_VS_PROG:
510       return "VS kernel";
511    case BRW_CACHE_TCS_PROG:
512       return "TCS kernel";
513    case BRW_CACHE_TES_PROG:
514       return "TES kernel";
515    case BRW_CACHE_FF_GS_PROG:
516       return "Fixed-function GS kernel";
517    case BRW_CACHE_GS_PROG:
518       return "GS kernel";
519    case BRW_CACHE_CLIP_PROG:
520       return "CLIP kernel";
521    case BRW_CACHE_SF_PROG:
522       return "SF kernel";
523    case BRW_CACHE_FS_PROG:
524       return "FS kernel";
525    case BRW_CACHE_CS_PROG:
526       return "CS kernel";
527    default:
528       return "unknown";
529    }
530 }
531 
532 void
brw_print_program_cache(struct brw_context * brw)533 brw_print_program_cache(struct brw_context *brw)
534 {
535    const struct brw_cache *cache = &brw->cache;
536    struct brw_cache_item *item;
537 
538    if (!brw->has_llc)
539       drm_intel_bo_map(cache->bo, false);
540 
541    for (unsigned i = 0; i < cache->size; i++) {
542       for (item = cache->items[i]; item; item = item->next) {
543          fprintf(stderr, "%s:\n", cache_name(i));
544          brw_disassemble(&brw->screen->devinfo, cache->bo->virtual,
545                          item->offset, item->size, stderr);
546       }
547    }
548 
549    if (!brw->has_llc)
550       drm_intel_bo_unmap(cache->bo);
551 }
552