/*
 * Copyright © 2015 Intel Corporation
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
 * IN THE SOFTWARE.
 */

#include "brw_eu.h"
#include "brw_nir.h"
#include "compiler/nir/nir.h"
#include "util/u_dynarray.h"

/**
 * \file brw_nir_analyze_ubo_ranges.c
 *
 * This pass decides which portions of UBOs to upload as push constants,
 * so shaders can access them as part of the thread payload, rather than
 * having to issue expensive memory reads to pull the data.
 *
 * The 3DSTATE_CONSTANT_* mechanism can push data from up to 4 different
 * buffers, in GRF-sized units.  A GRF was historically 256 bits (32 bytes);
 * starting with Xe2, it is 512 bits (64 bytes).
 *
 * To do this, we examine NIR load_ubo intrinsics, recording the number of
 * loads at each offset.  We track offsets at sizeof(GRF) granularity, so even
 * fields with a bit of padding between them tend to fall into contiguous
 * ranges.  We build a list of these ranges, tracking their "cost" (number
 * of registers required) and "benefit" (number of pull loads eliminated
 * by pushing the range).  We then sort the list to obtain the four best
 * ranges (most benefit for the least cost).
 */
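/*
 * Illustrative example (hypothetical access pattern, pre-Xe2 GRF size of
 * 32 bytes): if a shader loads a 32-bit scalar from UBO binding 0 three
 * times at byte offset 16 and once at byte offset 48, chunks 0 and 1 are
 * both marked interesting, producing a single candidate range with
 * start 0, length 2, benefit 4, and score 2 * 4 - 2 = 6 (see score()).
 */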

struct ubo_range_entry
{
   struct brw_ubo_range range;
   int benefit;
};

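/*
 * Heuristic ranking for candidate ranges: each eliminated pull load
 * (benefit) is weighted twice as heavily as each register of push space
 * the range costs (length).  The 2x weight is a tuning choice of this
 * pass, not something mandated by the hardware.
 */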
static int
score(const struct ubo_range_entry *entry)
{
   return 2 * entry->benefit - entry->range.length;
}

/**
 * Compares score for two UBO range entries.
 *
 * For a descending qsort().
 */
static int
cmp_ubo_range_entry(const void *va, const void *vb)
{
   const struct ubo_range_entry *a = va;
   const struct ubo_range_entry *b = vb;

   /* Rank based on scores, descending order */
   int delta = score(b) - score(a);

   /* Then use the UBO block index as a tie-breaker, descending order */
   if (delta == 0)
      delta = b->range.block - a->range.block;

   /* Finally use the start offset as a second tie-breaker, ascending order */
   if (delta == 0)
      delta = a->range.start - b->range.start;

   return delta;
}

struct ubo_block_info
{
   /* Each bit in the offsets bitfield represents one sizeof(GRF) chunk of
    * data (32 bytes, or 64 bytes on Xe2+).  If it's set to one, there is
    * interesting UBO data at that offset.  If not, there's a "hole" -
    * padding between data - or just nothing at all.
    */
   uint64_t offsets;
   uint8_t uses[64];
};

struct ubo_analysis_state
{
   struct hash_table *blocks;
   const struct intel_device_info *devinfo;
};

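/*
 * Return the ubo_block_info for the given UBO binding, creating a
 * zero-initialized entry on first use.  The key is block + 1, presumably
 * so that block 0 never becomes a NULL pointer key in the hash table.
 */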
static struct ubo_block_info *
get_block_info(struct ubo_analysis_state *state, int block)
{
   uint32_t hash = block + 1;
   void *key = (void *) (uintptr_t) hash;

   struct hash_entry *entry =
      _mesa_hash_table_search_pre_hashed(state->blocks, hash, key);

   if (entry)
      return (struct ubo_block_info *) entry->data;

   struct ubo_block_info *info =
      rzalloc(state->blocks, struct ubo_block_info);
   _mesa_hash_table_insert_pre_hashed(state->blocks, hash, key, info);

   return info;
}

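/*
 * Scan a NIR block for load_ubo intrinsics with a pushable surface index
 * and a constant offset, recording which sizeof(GRF) chunks of each UBO
 * they touch and how many loads begin at each chunk.
 */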
static void
analyze_ubos_block(struct ubo_analysis_state *state, nir_block *block)
{
   nir_foreach_instr(instr, block) {
      if (instr->type != nir_instr_type_intrinsic)
         continue;

      nir_intrinsic_instr *intrin = nir_instr_as_intrinsic(instr);
      if (intrin->intrinsic != nir_intrinsic_load_ubo)
         continue;

      if (brw_nir_ubo_surface_index_is_pushable(intrin->src[0]) &&
          nir_src_is_const(intrin->src[1])) {
         const int block = brw_nir_ubo_surface_index_get_push_block(intrin->src[0]);
         const unsigned byte_offset = nir_src_as_uint(intrin->src[1]);
         const unsigned sizeof_GRF = REG_SIZE * reg_unit(state->devinfo);
         const int offset = byte_offset / sizeof_GRF;

         /* Avoid shifting by larger than the width of our bitfield, as this
          * is undefined in C.  Even if we require multiple bits to represent
          * the entire value, it's OK to record a partial value - the backend
          * is capable of falling back to pull loads for later components of
          * vectors, as it has to shrink ranges for other reasons anyway.
          */
         if (offset >= 64)
            continue;

         /* The value might span multiple sizeof(GRF) chunks.  For example, a
          * 16-byte vec4 load at byte offset 24 on pre-Xe2 (sizeof_GRF = 32)
          * covers bytes 24..39, so start = 0, end = 64, and chunks = 2.
          */
         const unsigned num_components =
            nir_def_last_component_read(&intrin->def) + 1;
         const int bytes = num_components * (intrin->def.bit_size / 8);
         const int start = ROUND_DOWN_TO(byte_offset, sizeof_GRF);
         const int end = ALIGN(byte_offset + bytes, sizeof_GRF);
         const int chunks = (end - start) / sizeof_GRF;

         /* TODO: should we count uses in loops as higher benefit? */

         struct ubo_block_info *info = get_block_info(state, block);
         info->offsets |= ((1ull << chunks) - 1) << offset;
         info->uses[offset]++;
      }
   }
}

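/*
 * Debug helper: dump one candidate range.  The printed "cost" is simply
 * the range length in registers.  See the if (0) block in
 * brw_nir_analyze_ubo_ranges() to enable this output.
 */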
static void
print_ubo_entry(FILE *file,
                const struct ubo_range_entry *entry,
                struct ubo_analysis_state *state)
{
   struct ubo_block_info *info = get_block_info(state, entry->range.block);

   fprintf(file,
           "block %2d, start %2d, length %2d, bits = %"PRIx64", "
           "benefit %2d, cost %2d, score = %2d\n",
           entry->range.block, entry->range.start, entry->range.length,
           info->offsets, entry->benefit, entry->range.length, score(entry));
}

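/*
 * Entry point for the pass: analyze the shader's UBO loads, pick up to
 * four ranges to push, and write them to out_ranges (unused slots are
 * zeroed).  Ranges are reported in units of 256-bit registers, regardless
 * of the actual hardware GRF size (see the scaling at the end).
 */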
void
brw_nir_analyze_ubo_ranges(const struct brw_compiler *compiler,
                           nir_shader *nir,
                           struct brw_ubo_range out_ranges[4])
{
   void *mem_ctx = ralloc_context(NULL);

   struct ubo_analysis_state state = {
      .blocks =
         _mesa_hash_table_create(mem_ctx, NULL, _mesa_key_pointer_equal),
      .devinfo = compiler->devinfo,
   };

   /* Walk the IR, recording how many times each UBO block/offset is used. */
   nir_foreach_function_impl(impl, nir) {
      nir_foreach_block(block, impl) {
         analyze_ubos_block(&state, block);
      }
   }

   /* Find ranges: a block, starting register-size aligned byte offset, and
    * length.
    */
   struct util_dynarray ranges;
   util_dynarray_init(&ranges, mem_ctx);

   hash_table_foreach(state.blocks, entry) {
      const int b = entry->hash - 1;
      const struct ubo_block_info *info = entry->data;
      uint64_t offsets = info->offsets;

      /* Walk through the offsets bitfield, finding contiguous regions of
       * set bits:
       *
       *   0000000001111111111111000000000000111111111111110000000011111100
       *            ^^^^^^^^^^^^^            ^^^^^^^^^^^^^^        ^^^^^^
       *
       * Each of these will become a UBO range.
       */
      while (offsets != 0) {
         /* Find the first 1 in the offsets bitfield.  This represents the
          * start of a range of interesting UBO data.  Make it zero-indexed.
          */
         int first_bit = ffsll(offsets) - 1;

         /* Find the first 0 bit in offsets beyond first_bit.  To find the
          * first zero bit, we find the first 1 bit in the complement.  In
          * order to ignore bits before first_bit, we mask off those bits.
          */
         int first_hole = ffsll(~offsets & ~((1ull << first_bit) - 1)) - 1;

         if (first_hole == -1) {
            /* If we didn't find a hole, then set it to the end of the
             * bitfield.  There are no more ranges to process.
             */
            first_hole = 64;
            offsets = 0;
         } else {
            /* We've processed all bits before first_hole.  Mask them off. */
            offsets &= ~((1ull << first_hole) - 1);
         }

         struct ubo_range_entry *entry =
            util_dynarray_grow(&ranges, struct ubo_range_entry, 1);

         entry->range.block = b;
         entry->range.start = first_bit;
         /* first_hole is one beyond the end, so we don't need to add 1 */
         entry->range.length = first_hole - first_bit;
         entry->benefit = 0;

         for (int i = 0; i < entry->range.length; i++)
            entry->benefit += info->uses[first_bit + i];
      }
   }

   int nr_entries = ranges.size / sizeof(struct ubo_range_entry);

   if (0) {
      util_dynarray_foreach(&ranges, struct ubo_range_entry, entry) {
         print_ubo_entry(stderr, entry, &state);
      }
   }

   /* TODO: Consider combining ranges.
    *
    * We can only push 4 ranges via 3DSTATE_CONSTANT_XS.  If there are
    * more ranges, and two are close by with only a small hole, it may be
    * worth combining them.  The holes will waste register space, but the
    * benefit of removing pulls may outweigh that cost.
    */

   /* Sort the list so the most beneficial ranges are at the front. */
   if (nr_entries > 0) {
      qsort(ranges.data, nr_entries, sizeof(struct ubo_range_entry),
            cmp_ubo_range_entry);
   }

   struct ubo_range_entry *entries = ranges.data;

   /* Return the top 4, limited to the maximum number of push registers.
    *
    * The Vulkan driver sets up additional non-UBO push constants, so it may
    * need to shrink these ranges further (see anv_nir_compute_push_layout.c).
    * The OpenGL driver treats legacy uniforms as a UBO, so this is enough.
    *
    * To limit further, simply drop the tail of the list, as that's the least
    * valuable portion.
    */
   const int max_ubos = 4;
   nr_entries = MIN2(nr_entries, max_ubos);

   const unsigned max_push_regs = 64 / reg_unit(compiler->devinfo);
   unsigned total_push_regs = 0;

   /* Trim range lengths so the running total never exceeds max_push_regs;
    * once the budget is exhausted, any remaining ranges get length 0.
    */
   for (unsigned i = 0; i < nr_entries; i++) {
      if (total_push_regs + entries[i].range.length > max_push_regs)
         entries[i].range.length = max_push_regs - total_push_regs;
      total_push_regs += entries[i].range.length;
   }

   for (int i = 0; i < nr_entries; i++) {
      out_ranges[i] = entries[i].range;

      /* To this point, various values have been tracked in terms of the real
       * hardware register sizes.  However, the rest of the compiler expects
       * values in terms of pre-Xe2 256-bit registers.  Scale start and length
       * to account for this.
       */
      out_ranges[i].start *= reg_unit(compiler->devinfo);
      out_ranges[i].length *= reg_unit(compiler->devinfo);
   }
   for (int i = nr_entries; i < 4; i++) {
      out_ranges[i].block = 0;
      out_ranges[i].start = 0;
      out_ranges[i].length = 0;
   }

   ralloc_free(ranges.mem_ctx);
}