/*
 * Copyright © 2021 Valve Corporation
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
 * IN THE SOFTWARE.
 */

#include "nir.h"
#include "nir_builder.h"

/* This pass provides a way to move computations that are always the same for
 * an entire draw/compute dispatch into a "preamble" that runs before the main
 * entrypoint.
 *
 * We also expose a separate API to get or construct the preamble of a shader
 * in case backends want to insert their own code.
 */
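
/* A backend might drive this pass roughly as follows (a sketch only; the
 * callback names, cb_data, and the storage size are backend-specific
 * assumptions, not part of this pass):
 *
 *    unsigned preamble_size = 0;
 *    const nir_opt_preamble_options opts = {
 *       .drawid_uniform = true,
 *       .subgroup_size_uniform = true,
 *       .def_size = my_def_size_cb,
 *       .preamble_storage_size = 128,
 *       .instr_cost_cb = my_instr_cost_cb,
 *       .avoid_instr_cb = my_avoid_instr_cb,
 *       .rewrite_cost_cb = my_rewrite_cost_cb,
 *       .cb_data = my_data,
 *    };
 *    nir_opt_preamble(shader, &opts, &preamble_size);
 *
 * Hoisted definitions are written in the preamble with store_preamble at the
 * offsets assigned below and read back in the main shader with load_preamble;
 * the backend is expected to lower both intrinsics to whatever storage it
 * uses (e.g. registers or a driver-managed buffer) after running the pass.
 */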

nir_function_impl *
nir_shader_get_preamble(nir_shader *shader)
{
   nir_function_impl *entrypoint = nir_shader_get_entrypoint(shader);
   if (entrypoint->preamble) {
      return entrypoint->preamble->impl;
   } else {
      nir_function *preamble = nir_function_create(shader, "@preamble");
      preamble->is_preamble = true;
      nir_function_impl *impl = nir_function_impl_create(preamble);
      entrypoint->preamble = preamble;
      return impl;
   }
}

typedef struct {
   bool can_move;
   bool candidate;
   bool must_stay;
   bool replace;

   unsigned can_move_users;

   unsigned size, align;

   unsigned offset;

   /* Average the cost of a value among its users, to try to account for
    * values that have multiple can_move uses.
    */
   float value;

   /* Overall benefit, i.e. the value minus any cost of inserting
    * load_preamble.
    */
   float benefit;
} def_state;

typedef struct {
   /* Per-definition array of states */
   def_state *states;

   /* The definition currently being processed, set before walking its
    * sources in update_src_value().
    */
   nir_ssa_def *def;

   const nir_opt_preamble_options *options;
} opt_preamble_ctx;

static float
get_instr_cost(nir_instr *instr, const nir_opt_preamble_options *options)
{
   /* No backend will want to hoist load_const or undef by itself, so handle
    * this for them.
    */
   if (instr->type == nir_instr_type_load_const ||
       instr->type == nir_instr_type_ssa_undef)
      return 0;

   return options->instr_cost_cb(instr, options->cb_data);
}

static bool
can_move_src(nir_src *src, void *state)
{
   opt_preamble_ctx *ctx = state;

   assert(src->is_ssa);
   return ctx->states[src->ssa->index].can_move;
}

static bool
can_move_srcs(nir_instr *instr, opt_preamble_ctx *ctx)
{
   return nir_foreach_src(instr, can_move_src, ctx);
}

static bool
can_move_intrinsic(nir_intrinsic_instr *instr, opt_preamble_ctx *ctx)
{
   switch (instr->intrinsic) {
   /* Intrinsics which can always be moved */
   case nir_intrinsic_load_push_constant:
   case nir_intrinsic_load_work_dim:
   case nir_intrinsic_load_num_workgroups:
   case nir_intrinsic_load_workgroup_size:
   case nir_intrinsic_load_ray_launch_size:
   case nir_intrinsic_load_ray_launch_size_addr_amd:
   case nir_intrinsic_load_sbt_base_amd:
   case nir_intrinsic_load_is_indexed_draw:
   case nir_intrinsic_load_viewport_scale:
   case nir_intrinsic_load_user_clip_plane:
   case nir_intrinsic_load_viewport_x_scale:
   case nir_intrinsic_load_viewport_y_scale:
   case nir_intrinsic_load_viewport_z_scale:
   case nir_intrinsic_load_viewport_offset:
   case nir_intrinsic_load_viewport_x_offset:
   case nir_intrinsic_load_viewport_y_offset:
   case nir_intrinsic_load_viewport_z_offset:
   case nir_intrinsic_load_blend_const_color_a_float:
   case nir_intrinsic_load_blend_const_color_b_float:
   case nir_intrinsic_load_blend_const_color_g_float:
   case nir_intrinsic_load_blend_const_color_r_float:
   case nir_intrinsic_load_blend_const_color_rgba:
   case nir_intrinsic_load_blend_const_color_aaaa8888_unorm:
   case nir_intrinsic_load_blend_const_color_rgba8888_unorm:
   case nir_intrinsic_load_line_width:
   case nir_intrinsic_load_aa_line_width:
   case nir_intrinsic_load_fb_layers_v3d:
   case nir_intrinsic_load_tcs_num_patches_amd:
   case nir_intrinsic_load_sample_positions_pan:
   case nir_intrinsic_load_shader_query_enabled_amd:
   case nir_intrinsic_load_cull_front_face_enabled_amd:
   case nir_intrinsic_load_cull_back_face_enabled_amd:
   case nir_intrinsic_load_cull_ccw_amd:
   case nir_intrinsic_load_cull_small_primitives_enabled_amd:
   case nir_intrinsic_load_cull_any_enabled_amd:
   case nir_intrinsic_load_cull_small_prim_precision_amd:
      return true;

   /* Intrinsics which can be moved depending on hardware */
   case nir_intrinsic_load_base_instance:
   case nir_intrinsic_load_base_vertex:
   case nir_intrinsic_load_first_vertex:
   case nir_intrinsic_load_draw_id:
      return ctx->options->drawid_uniform;

   case nir_intrinsic_load_subgroup_size:
   case nir_intrinsic_load_num_subgroups:
      return ctx->options->subgroup_size_uniform;

   /* Intrinsics which can be moved if the sources can */
   case nir_intrinsic_load_ubo:
   case nir_intrinsic_load_ubo_vec4:
   case nir_intrinsic_get_ubo_size:
   case nir_intrinsic_get_ssbo_size:
   case nir_intrinsic_ballot_bitfield_extract:
   case nir_intrinsic_ballot_find_lsb:
   case nir_intrinsic_ballot_find_msb:
   case nir_intrinsic_ballot_bit_count_reduce:
   case nir_intrinsic_load_deref:
   case nir_intrinsic_load_global_constant:
   case nir_intrinsic_load_uniform:
   case nir_intrinsic_load_constant:
   case nir_intrinsic_load_sample_pos_from_id:
   case nir_intrinsic_load_kernel_input:
   case nir_intrinsic_load_buffer_amd:
   case nir_intrinsic_image_samples:
   case nir_intrinsic_image_deref_samples:
   case nir_intrinsic_bindless_image_samples:
   case nir_intrinsic_image_size:
   case nir_intrinsic_image_deref_size:
   case nir_intrinsic_bindless_image_size:
   case nir_intrinsic_vulkan_resource_index:
   case nir_intrinsic_vulkan_resource_reindex:
   case nir_intrinsic_load_vulkan_descriptor:
   case nir_intrinsic_quad_swizzle_amd:
   case nir_intrinsic_masked_swizzle_amd:
   case nir_intrinsic_load_ssbo_address:
   case nir_intrinsic_bindless_resource_ir3:
      return can_move_srcs(&instr->instr, ctx);

   /* Image/SSBO loads can be moved if they are CAN_REORDER and their
    * sources can be moved.
    */
   case nir_intrinsic_image_load:
   case nir_intrinsic_bindless_image_load:
   case nir_intrinsic_load_ssbo:
   case nir_intrinsic_load_ssbo_ir3:
      return (nir_intrinsic_access(instr) & ACCESS_CAN_REORDER) &&
             can_move_srcs(&instr->instr, ctx);

   default:
      return false;
   }
}

static bool
can_move_instr(nir_instr *instr, opt_preamble_ctx *ctx)
{
   switch (instr->type) {
   case nir_instr_type_tex: {
      nir_tex_instr *tex = nir_instr_as_tex(instr);
      /* See note below about derivatives. We have special code to convert tex
       * to txd, though, because it's a common case.
       */
      if (nir_tex_instr_has_implicit_derivative(tex) &&
          tex->op != nir_texop_tex) {
         return false;
      }
      return can_move_srcs(instr, ctx);
   }
   case nir_instr_type_alu: {
      /* The preamble is presumably run with only one thread, so we can't run
       * derivatives in it.
       * TODO: Replace derivatives with 0 instead, if real apps hit this.
       */
      nir_alu_instr *alu = nir_instr_as_alu(instr);
      switch (alu->op) {
      case nir_op_fddx:
      case nir_op_fddy:
      case nir_op_fddx_fine:
      case nir_op_fddy_fine:
      case nir_op_fddx_coarse:
      case nir_op_fddy_coarse:
         return false;
      default:
         return can_move_srcs(instr, ctx);
      }
   }
   case nir_instr_type_intrinsic:
      return can_move_intrinsic(nir_instr_as_intrinsic(instr), ctx);

   case nir_instr_type_load_const:
   case nir_instr_type_ssa_undef:
      return true;

   case nir_instr_type_deref: {
      nir_deref_instr *deref = nir_instr_as_deref(instr);
      if (deref->deref_type == nir_deref_type_var) {
         switch (deref->modes) {
         case nir_var_uniform:
         case nir_var_mem_ubo:
            return true;
         default:
            return false;
         }
      } else {
         return can_move_srcs(instr, ctx);
      }
   }

   case nir_instr_type_phi:
      /* TODO: we could move an if-statement if everything inside it is
       * moveable.
       */
      return false;

   default:
      return false;
   }
}

/* True if we should avoid making this a candidate. This is only called on
 * instructions we already determined we can move; it just makes it so that
 * uses of this instruction cannot be rewritten. Typically this happens
 * because of static constraints on the IR, for example some deref chains
 * cannot be broken.
 */
static bool
avoid_instr(nir_instr *instr, const nir_opt_preamble_options *options)
{
   if (instr->type == nir_instr_type_deref)
      return true;

   return options->avoid_instr_cb(instr, options->cb_data);
}

static bool
update_src_value(nir_src *src, void *data)
{
   opt_preamble_ctx *ctx = data;

   def_state *state = &ctx->states[ctx->def->index];
   def_state *src_state = &ctx->states[src->ssa->index];

   assert(src_state->can_move);

   /* If an instruction has can_move and non-can_move users, it becomes a
    * candidate and its value shouldn't propagate downwards. For example,
    * imagine a chain like this:
    *
    *         -- F (cannot move)
    *        /
    *  A <-- B <-- C <-- D <-- E (cannot move)
    *
    * B and D are marked candidates. Picking B removes A and B, picking D
    * removes C and D, and picking both removes all 4. Therefore B and D are
    * independent and B's value shouldn't flow into D.
    *
    * A similar argument holds for must_stay values.
    */
   if (!src_state->must_stay && !src_state->candidate)
      state->value += src_state->value;
   return true;
}

static int
candidate_sort(const void *data1, const void *data2)
{
   const def_state *state1 = *(def_state **)data1;
   const def_state *state2 = *(def_state **)data2;

   float value1 = state1->value / state1->size;
   float value2 = state2->value / state2->size;
   if (value1 < value2)
      return 1;
   else if (value1 > value2)
      return -1;
   else
      return 0;
}

bool
nir_opt_preamble(nir_shader *shader, const nir_opt_preamble_options *options,
                 unsigned *size)
{
   opt_preamble_ctx ctx = {
      .options = options,
   };

   nir_function_impl *impl = nir_shader_get_entrypoint(shader);
   ctx.states = calloc(impl->ssa_alloc, sizeof(*ctx.states));

   /* Step 1: Calculate can_move */
   nir_foreach_block (block, impl) {
      nir_foreach_instr (instr, block) {
         nir_ssa_def *def = nir_instr_ssa_def(instr);
         if (!def)
            continue;

         def_state *state = &ctx.states[def->index];

         state->can_move = can_move_instr(instr, &ctx);
      }
   }

   /* Step 2: Calculate is_candidate. This is complicated by the presence of
    * non-candidate instructions like derefs whose users cannot be rewritten.
    * If a deref chain is used at all by a non-can_move thing, then any offset
    * sources anywhere along the chain should be considered candidates because
    * the entire deref chain will never be deleted, but if it's only used by
    * can_move things then it becomes subsumed by its users and none of the
    * offset sources should be considered candidates as they will be removed
    * when the users of the deref chain are moved. We need to replace "are
    * there any non-can_move users" with "are there any non-can_move users,
    * *recursing through non-candidate users*". We do this by walking backward
    * and marking when a non-candidate instruction must stay in the final
    * program because it has a non-can_move user, including recursively.
    */
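   /* As a sketch of the idea: if a deref chain is used only by a can_move
    * load, moving the load pulls the whole chain into the preamble, so the
    * chain's offset sources are not worth making candidates. If some other
    * non-can_move instruction also uses the chain, the derefs are marked
    * must_stay (propagating backward through the chain), and the offset
    * sources become candidates instead.
    */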
   unsigned num_candidates = 0;
   nir_foreach_block_reverse (block, impl) {
      nir_foreach_instr_reverse (instr, block) {
         nir_ssa_def *def = nir_instr_ssa_def(instr);
         if (!def)
            continue;

         def_state *state = &ctx.states[def->index];
         if (!state->can_move)
            continue;

         state->value = get_instr_cost(instr, options);
         bool is_candidate = !avoid_instr(instr, options);
         state->candidate = false;
         state->must_stay = false;
         nir_foreach_use (use, def) {
            nir_ssa_def *use_def = nir_instr_ssa_def(use->parent_instr);
            if (!use_def || !ctx.states[use_def->index].can_move ||
                ctx.states[use_def->index].must_stay) {
               if (is_candidate)
                  state->candidate = true;
               else
                  state->must_stay = true;
            } else {
               state->can_move_users++;
            }
         }

         nir_foreach_if_use (use, def) {
            if (is_candidate)
               state->candidate = true;
            else
               state->must_stay = true;
            break;
         }

         if (state->candidate)
            num_candidates++;
      }
   }

   if (num_candidates == 0) {
      *size = 0;
      free(ctx.states);
      return false;
   }

   def_state **candidates = malloc(sizeof(*candidates) * num_candidates);
   unsigned candidate_idx = 0;
   unsigned total_size = 0;

   /* Step 3: Calculate value of candidates by propagating downwards. We try
    * to share the value amongst can_move uses, in case there are multiple.
    * This won't always find the optimal solution, but is hopefully a good
    * heuristic.
    *
    * Note that we use the can_move adjusted in the last pass, because if a
    * can_move instruction cannot be moved because it's not a candidate and it
    * has a non-can_move source then we don't want to count it as a use.
    *
    * While we're here, also collect an array of candidates.
    */
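   /* For instance (illustrative numbers only): an instruction with value 4
    * and two can_move users contributes 2 to the value of each user rather
    * than 4 to both, so a shared subexpression isn't double-counted.
    */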
   nir_foreach_block (block, impl) {
      nir_foreach_instr (instr, block) {
         nir_ssa_def *def = nir_instr_ssa_def(instr);
         if (!def)
            continue;

         def_state *state = &ctx.states[def->index];
         if (!state->can_move || state->must_stay)
            continue;

         ctx.def = def;
         nir_foreach_src(instr, update_src_value, &ctx);

         /* If this instruction is a candidate, its value shouldn't be
          * propagated so we skip dividing it.
          *
          * Note: if it's can_move but not a candidate, then all its users
          * must be can_move, so if there are no users then it must be dead.
          */
         if (!state->candidate && !state->must_stay) {
            if (state->can_move_users > 0)
               state->value /= state->can_move_users;
            else
               state->value = 0;
         }

         if (state->candidate) {
            state->benefit = state->value -
               options->rewrite_cost_cb(def, options->cb_data);

            if (state->benefit > 0) {
               options->def_size(def, &state->size, &state->align);
               total_size = ALIGN_POT(total_size, state->align);
               total_size += state->size;
               candidates[candidate_idx++] = state;
            }
         }
      }
   }

   assert(candidate_idx <= num_candidates);
   num_candidates = candidate_idx;

   if (num_candidates == 0) {
      *size = 0;
      free(ctx.states);
      free(candidates);
      return false;
   }

   /* Step 4: Figure out which candidates we're going to replace and assign an
    * offset. Assuming there is no expression sharing, this is similar to the
    * 0-1 knapsack problem, except when there is a gap introduced by
    * alignment. We use a well-known greedy approximation, sorting by value
    * divided by size.
    */
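   /* For example (illustrative numbers only): with 12 units of storage and
    * candidates of (size, value) = (4, 8), (8, 9), and (4, 2), sorting by
    * value/size gives 2.0, 1.125, and 0.5, so the first two are placed and
    * the third no longer fits.
    */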

   if (total_size > options->preamble_storage_size) {
      qsort(candidates, num_candidates, sizeof(*candidates), candidate_sort);
   }

   unsigned offset = 0;
   for (unsigned i = 0; i < num_candidates; i++) {
      def_state *state = candidates[i];
      offset = ALIGN_POT(offset, state->align);

      if (offset + state->size > options->preamble_storage_size)
         break;

      state->replace = true;
      state->offset = offset;

      offset += state->size;
   }

   *size = offset;

   free(candidates);

   /* Step 5: Actually do the replacement. */
   struct hash_table *remap_table = _mesa_pointer_hash_table_create(NULL);
   nir_function_impl *preamble =
      nir_shader_get_preamble(impl->function->shader);
   nir_builder _b;
   nir_builder *b = &_b;
   nir_builder_init(b, preamble);
   b->cursor = nir_before_cf_list(&preamble->body);

   nir_foreach_block (block, impl) {
      nir_foreach_instr (instr, block) {
         nir_ssa_def *def = nir_instr_ssa_def(instr);
         if (!def)
            continue;

         def_state *state = &ctx.states[def->index];
         if (!state->can_move)
            continue;

         nir_instr *clone = nir_instr_clone_deep(impl->function->shader,
                                                 instr, remap_table);

         nir_builder_instr_insert(b, clone);

         if (clone->type == nir_instr_type_tex) {
            nir_tex_instr *tex = nir_instr_as_tex(clone);
            if (tex->op == nir_texop_tex) {
               /* For maximum compatibility, replace normal textures with
                * textureGrad with a gradient of 0.
                * TODO: Handle txb somehow.
                */
               b->cursor = nir_before_instr(clone);

               nir_ssa_def *zero =
                  nir_imm_zero(b, tex->coord_components - tex->is_array, 32);
               nir_tex_instr_add_src(tex, nir_tex_src_ddx, nir_src_for_ssa(zero));
               nir_tex_instr_add_src(tex, nir_tex_src_ddy, nir_src_for_ssa(zero));
               tex->op = nir_texop_txd;

               b->cursor = nir_after_instr(clone);
            }
         }

         if (state->replace) {
            nir_ssa_def *clone_def = nir_instr_ssa_def(clone);
            nir_store_preamble(b, clone_def, .base = state->offset);
         }
      }
   }

   nir_builder_init(b, impl);

   nir_foreach_block (block, impl) {
      nir_foreach_instr_safe (instr, block) {
         nir_ssa_def *def = nir_instr_ssa_def(instr);
         if (!def)
            continue;

         def_state *state = &ctx.states[def->index];
         if (!state->replace)
            continue;

         b->cursor = nir_before_instr(instr);

         nir_ssa_def *new_def =
            nir_load_preamble(b, def->num_components, def->bit_size,
                              .base = state->offset);

         nir_ssa_def_rewrite_uses(def, new_def);
         nir_instr_free_and_dce(instr);
      }
   }

   nir_metadata_preserve(impl,
                         nir_metadata_block_index |
                         nir_metadata_dominance);

   ralloc_free(remap_table);
   free(ctx.states);
   return true;
}