/*
 * Copyright (C) 2019 Collabora, Ltd.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 *
 * Authors (Collabora):
 *   Alyssa Rosenzweig <alyssa.rosenzweig@collabora.com>
 */

#include "compiler.h"
#include "util/u_math.h"
#include "util/u_memory.h"

/* This pass promotes reads from UBOs to register-mapped uniforms. Promotion
 * saves instructions and reduces work register pressure, but because promoted
 * uniforms occupy space in the register file, it also shrinks the pool of
 * work registers available, so a balance must be struck.
 *
 * We use a heuristic, implemented by mir_work_heuristic, to determine the
 * ideal number of work registers.
 */

static bool
mir_is_ubo(midgard_instruction *ins)
{
        return (ins->type == TAG_LOAD_STORE_4) &&
                (OP_IS_UBO_READ(ins->op));
}

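/* A promotable UBO read must be "direct and aligned": the byte offset is a
 * compile-time constant (held in constants.u32[0]) that is 16-byte aligned,
 * and neither source slot supplies an indirect offset or index. */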
static bool
mir_is_direct_aligned_ubo(midgard_instruction *ins)
{
        return mir_is_ubo(ins) &&
                !(ins->constants.u32[0] & 0xF) &&
                (ins->src[1] == ~0) &&
                (ins->src[2] == ~0);
}

/* Represents use data for a single UBO */

#define MAX_UBO_QWORDS (65536 / 16)
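/* = 4096 vec4 slots, i.e. 64 KiB of UBO space tracked at 16-byte granularity */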

struct mir_ubo_block {
        BITSET_DECLARE(uses, MAX_UBO_QWORDS);
        BITSET_DECLARE(pushed, MAX_UBO_QWORDS);
};

struct mir_ubo_analysis {
        /* Per block analysis */
        unsigned nr_blocks;
        struct mir_ubo_block *blocks;
};

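/* Walk the shader and record, per UBO, which vec4 slots are read by direct
 * aligned loads. The extra block past num_ubos covers the sysval UBO appended
 * after the NIR-visible UBOs (hence the sysval priority noted at mir_pick_ubo
 * below). */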
static struct mir_ubo_analysis
mir_analyze_ranges(compiler_context *ctx)
{
        struct mir_ubo_analysis res = {
                .nr_blocks = ctx->nir->info.num_ubos + 1,
        };

        res.blocks = calloc(res.nr_blocks, sizeof(struct mir_ubo_block));

        mir_foreach_instr_global(ctx, ins) {
                if (!mir_is_direct_aligned_ubo(ins)) continue;

                unsigned ubo = midgard_unpack_ubo_index_imm(ins->load_store);
                unsigned offset = ins->constants.u32[0] / 16;

                assert(ubo < res.nr_blocks);

                if (offset < MAX_UBO_QWORDS)
                        BITSET_SET(res.blocks[ubo].uses, offset);
        }

        return res;
}

/* Select UBO words to push. A sophisticated implementation would consider the
 * number of uses and perhaps the control flow to estimate benefit. This is not
 * sophisticated. Select from the last UBO first to prioritize sysvals. */

static void
mir_pick_ubo(struct panfrost_ubo_push *push, struct mir_ubo_analysis *analysis,
             unsigned max_qwords)
{
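        /* Each pushed vec4 occupies four 32-bit words, so clamp the budget by
         * both the caller's qword limit and the absolute PAN_MAX_PUSH limit,
         * measured in words */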
        unsigned max_words = MIN2(PAN_MAX_PUSH, max_qwords * 4);

        for (signed ubo = analysis->nr_blocks - 1; ubo >= 0; --ubo) {
                struct mir_ubo_block *block = &analysis->blocks[ubo];

                unsigned vec4;
                BITSET_FOREACH_SET(vec4, block->uses, MAX_UBO_QWORDS) {
                        /* Don't push more than possible */
                        if (push->count > max_words - 4)
                                return;

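                        /* Push all four words of the vec4; slot N maps to
                         * byte offsets 16N .. 16N+12, e.g. slot 2 yields
                         * offsets 32, 36, 40 and 44 */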
                        for (unsigned offs = 0; offs < 4; ++offs) {
                                struct panfrost_ubo_word word = {
                                        .ubo = ubo,
                                        .offset = (vec4 * 16) + (offs * 4)
                                };

                                push->words[push->count++] = word;
                        }

                        /* Mark it as pushed so we can rewrite */
                        BITSET_SET(block->pushed, vec4);
                }
        }
}

#if 0
static void
mir_dump_ubo_analysis(struct mir_ubo_analysis *res)
{
        printf("%u blocks\n", res->nr_blocks);

        for (unsigned i = 0; i < res->nr_blocks; ++i) {
                BITSET_WORD *uses = res->blocks[i].uses;
                BITSET_WORD *push = res->blocks[i].pushed;

                unsigned last = BITSET_LAST_BIT_SIZED(uses, BITSET_WORDS(MAX_UBO_QWORDS));

                printf("\t");

                for (unsigned j = 0; j < last; ++j) {
                        bool used = BITSET_TEST(uses, j);
                        bool pushed = BITSET_TEST(push, j);
                        assert(used || !pushed);

                        putchar(pushed ? '*' : used ? '-' : '_');
                }

                printf("\n");
        }
}
#endif

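/* Count every vec4 slot marked as used across all blocks; this is the number
 * of promotion candidates, which drives the work register heuristic */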
static unsigned
mir_promoteable_uniform_count(struct mir_ubo_analysis *analysis)
{
        unsigned count = 0;

        for (unsigned i = 0; i < analysis->nr_blocks; ++i) {
                BITSET_WORD *uses = analysis->blocks[i].uses;

                for (unsigned w = 0; w < BITSET_WORDS(MAX_UBO_QWORDS); ++w)
                        count += util_bitcount(uses[w]);
        }

        return count;
}

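/* Liveness is tracked as a 16-bit byte mask per temporary (one bit per byte
 * of a 128-bit vec4), so the popcount over all temporaries is the number of
 * live bytes */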
static unsigned
mir_count_live(uint16_t *live, unsigned temp_count)
{
        unsigned count = 0;

        for (unsigned i = 0; i < temp_count; ++i)
                count += util_bitcount(live[i]);

        return count;
}

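/* Estimate register pressure as the peak number of live bytes at any point in
 * the program, walking each block backwards from its live-out set. Dividing
 * by 16 converts live bytes into an estimate in vec4 work registers. */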
static unsigned
mir_estimate_pressure(compiler_context *ctx)
{
        mir_invalidate_liveness(ctx);
        mir_compute_liveness(ctx);

        unsigned max_live = 0;

        mir_foreach_block(ctx, _block) {
                midgard_block *block = (midgard_block *) _block;
                uint16_t *live = mem_dup(block->base.live_out, ctx->temp_count * sizeof(uint16_t));

                mir_foreach_instr_in_block_rev(block, ins) {
                        unsigned count = mir_count_live(live, ctx->temp_count);
                        max_live = MAX2(max_live, count);
                        mir_liveness_ins_update(live, ins, ctx->temp_count);
                }

                free(live);
        }

        return DIV_ROUND_UP(max_live, 16);
}

static unsigned
mir_work_heuristic(compiler_context *ctx, struct mir_ubo_analysis *analysis)
{
        unsigned uniform_count = mir_promoteable_uniform_count(analysis);

        /* If there are 8 or fewer uniforms, it doesn't matter what we do, so
         * allow as many work registers as needed */

        if (uniform_count <= 8)
                return 16;

        /* Otherwise, estimate the register pressure */

        unsigned pressure = mir_estimate_pressure(ctx);

        /* Prioritize not spilling above all else. The relation between the
         * pressure estimate and the actual register pressure is a little
         * murkier than we might like (due to scheduling, pipeline registers,
         * failure to pack vector registers, load/store registers, texture
         * registers...), which is why this is a heuristic parameter. */

        if (pressure > 6)
                return 16;

        /* If there's no chance of spilling, prioritize UBOs and thread count */

        return 8;
}

/* Bitset of indices that will be used as a special register -- inputs to a
 * non-ALU op. We precompute this set so that testing is efficient; otherwise
 * we would end up with O(mn) behaviour for n instructions and m uniform
 * reads. */

static BITSET_WORD *
mir_special_indices(compiler_context *ctx)
{
        mir_compute_temp_count(ctx);
        BITSET_WORD *bset = calloc(BITSET_WORDS(ctx->temp_count), sizeof(BITSET_WORD));

        mir_foreach_instr_global(ctx, ins) {
                /* Look for special instructions */
                bool is_ldst = ins->type == TAG_LOAD_STORE_4;
                bool is_tex = ins->type == TAG_TEXTURE_4;
                bool is_writeout = ins->compact_branch && ins->writeout;

                if (!(is_ldst || is_tex || is_writeout))
                        continue;

                /* Anything read by a special instruction is itself special */
                mir_foreach_src(ins, i) {
                        unsigned idx = ins->src[i];

                        if (idx < ctx->temp_count)
                                BITSET_SET(bset, idx);
                }
        }

        return bset;
}

void
midgard_promote_uniforms(compiler_context *ctx)
{
        if (ctx->inputs->no_ubo_to_push) {
                /* If nothing is pushed, all UBOs need to be uploaded
                 * conventionally */
                ctx->ubo_mask = ~0;
                return;
        }

        struct mir_ubo_analysis analysis = mir_analyze_ranges(ctx);

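        /* The register file exposes 24 addressable vec4 registers shared
         * between work registers (allocated upward from r0) and register-
         * mapped uniforms (mapped downward from r23), so whatever the
         * heuristic leaves free of work registers is our push budget */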
        unsigned work_count = mir_work_heuristic(ctx, &analysis);
        unsigned promoted_count = 24 - work_count;

        /* Ensure we are 16-byte aligned to avoid underallocations */
        mir_pick_ubo(&ctx->info->push, &analysis, promoted_count);
        ctx->info->push.count = ALIGN_POT(ctx->info->push.count, 4);

        /* First, figure out special indices a priori so we don't recompute
         * them repeatedly */
        BITSET_WORD *special = mir_special_indices(ctx);

        ctx->ubo_mask = 0;

        mir_foreach_instr_global_safe(ctx, ins) {
                if (!mir_is_ubo(ins)) continue;

                unsigned ubo = midgard_unpack_ubo_index_imm(ins->load_store);
                unsigned qword = ins->constants.u32[0] / 16;

                if (!mir_is_direct_aligned_ubo(ins)) {
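                        /* The read stays a real UBO access. If the UBO index
                         * is a known immediate, only that UBO needs a
                         * conventional upload; an indirect index could read
                         * any UBO, so conservatively upload them all. */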
                        if (ins->src[1] == ~0)
                                ctx->ubo_mask |= BITSET_BIT(ubo);
                        else
                                ctx->ubo_mask = ~0;

                        continue;
                }

                /* Check if we decided to push this */
                assert(ubo < analysis.nr_blocks);
                if (!BITSET_TEST(analysis.blocks[ubo].pushed, qword)) {
                        ctx->ubo_mask |= BITSET_BIT(ubo);
                        continue;
                }

                /* Find where we pushed to. TODO: pack unaligned pushes */
                unsigned base = pan_lookup_pushed_ubo(&ctx->info->push, ubo, qword * 16);
                assert((base & 0x3) == 0);

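                /* Pushed uniforms are register-mapped from the top of the
                 * register file downward: vec4 slot 0 lands in r23, slot 1
                 * in r22, and so on */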
                unsigned address = base / 4;
                unsigned uniform_reg = 23 - address;

                /* Should have been taken into account when pushing */
                assert(address < promoted_count);
                unsigned promoted = SSA_FIXED_REGISTER(uniform_reg);

                /* We do need a move for safety when the destination is a
                 * non-SSA register, or when the value feeds a special class
                 * of instruction */

                bool needs_move = ins->dest & PAN_IS_REG || ins->dest == ctx->blend_src1;

                if (ins->dest < ctx->temp_count)
                        needs_move |= BITSET_TEST(special, ins->dest);

                if (needs_move) {
                        unsigned type_size = nir_alu_type_get_type_size(ins->dest_type);
                        midgard_instruction mov = v_mov(promoted, ins->dest);
                        mov.dest_type = nir_type_uint | type_size;
                        mov.src_types[1] = mov.dest_type;

                        uint16_t rounded = mir_round_bytemask_up(mir_bytemask(ins), type_size);
                        mir_set_bytemask(&mov, rounded);
                        mir_insert_instruction_before(ctx, ins, mov);
                } else {
                        mir_rewrite_index_src(ctx, ins->dest, promoted);
                }

                mir_remove_instruction(ins);
        }

        free(special);
        free(analysis.blocks);
}