/*
 * Copyright © 2021 Advanced Micro Devices, Inc.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
 * IN THE SOFTWARE.
 */

/* This is a new block-level load instruction scheduler where loads are
 * grouped according to their indirection level within a basic block. An
 * indirection occurs when the result of one load is used as a source of
 * another load. The result is that disjoint ALU opcode groups and load
 * (texture) opcode groups are created, where each successive load group is
 * the next level of indirection. This is done by finding the first and last
 * load with the same indirection level and moving all unrelated instructions
 * between them after the last load, except for load sources, which are moved
 * before the first load. It naturally suits hardware that has limits on
 * texture indirections, but other hardware can benefit too. Only texture,
 * image, and SSBO load and atomic instructions are grouped.
 *
 * There is an option to group only those loads that use the same resource
 * variable. This increases the chance of getting more cache hits than if the
 * loads were spread out.
 *
 * The increased register usage is offset by the increase in observed memory
 * bandwidth due to more cache hits (dependent on hw behavior), which shortens
 * the subgroup lifetime and thus allows registers to be deallocated and
 * reused sooner. In some bandwidth-bound cases, low register usage doesn't
 * help at all; doubling the register usage and using those registers to
 * amplify observed bandwidth can improve performance a lot.
 *
 * It's recommended to run a hw-specific instruction scheduler after this to
 * prevent spilling.
 */
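
/* For illustration only (schematic instruction names, not real NIR output):
 * with nir_group_all and a sufficiently large max_distance, two loads at the
 * same indirection level that are separated by unrelated ALU work
 *
 *    a = txl ...          <-- indirection level 0
 *    b = fmul a.x, c
 *    d = txl ...          <-- indirection level 0
 *    e = fadd b, d.x
 *
 * are grouped by moving the unrelated ALU instruction after the last load:
 *
 *    a = txl ...          <-- indirection level 0
 *    d = txl ...          <-- indirection level 0
 *    b = fmul a.x, c
 *    e = fadd b, d.x
 *
 * With nir_group_same_resource_only, this would additionally require both
 * txl instructions to read from the same resource.
 */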

#include "nir.h"

static bool
is_memory_load(nir_instr *instr)
{
   /* Count texture_size too because it has the same latency as cache hits. */
   if (instr->type == nir_instr_type_tex)
      return true;

   if (instr->type == nir_instr_type_intrinsic) {
      nir_intrinsic_instr *intr = nir_instr_as_intrinsic(instr);
      const char *name = nir_intrinsic_infos[intr->intrinsic].name;

      /* TODO: nir_intrinsics.py could do this */
      /* load_ubo is ignored because it's usually cheap. */
      if (!nir_intrinsic_writes_external_memory(intr) &&
          !strstr(name, "shared") &&
          (strstr(name, "ssbo") || strstr(name, "image")))
         return true;
   }

   return false;
}

static nir_instr *
get_intrinsic_resource(nir_intrinsic_instr *intr)
{
   /* This is also the list of intrinsics that are grouped. */
   /* load_ubo is ignored because it's usually cheap. */
   switch (intr->intrinsic) {
   case nir_intrinsic_image_load:
   case nir_intrinsic_image_deref_load:
   case nir_intrinsic_image_sparse_load:
   case nir_intrinsic_image_deref_sparse_load:
   /* Group image_size too because it has the same latency as cache hits. */
   case nir_intrinsic_image_samples_identical:
   case nir_intrinsic_image_deref_samples_identical:
   case nir_intrinsic_bindless_image_samples_identical:
   case nir_intrinsic_image_size:
   case nir_intrinsic_image_deref_size:
   case nir_intrinsic_bindless_image_load:
   case nir_intrinsic_bindless_image_sparse_load:
   case nir_intrinsic_load_ssbo:
   case nir_intrinsic_image_fragment_mask_load_amd:
   case nir_intrinsic_image_deref_fragment_mask_load_amd:
   case nir_intrinsic_bindless_image_fragment_mask_load_amd:
      return intr->src[0].ssa->parent_instr;
   default:
      return NULL;
   }
}

/* Track only those that we want to group. */
static bool
is_grouped_load(nir_instr *instr)
{
   /* Count texture_size too because it has the same latency as cache hits. */
   if (instr->type == nir_instr_type_tex)
      return true;

   if (instr->type == nir_instr_type_intrinsic)
      return get_intrinsic_resource(nir_instr_as_intrinsic(instr)) != NULL;

   return false;
}

static bool
can_move(nir_instr *instr, uint8_t current_indirection_level)
{
   /* Grouping is done by moving everything else out of the first/last
    * instruction range of the indirection level.
    */
   if (is_grouped_load(instr) && instr->pass_flags == current_indirection_level)
      return false;

   if (instr->type == nir_instr_type_alu ||
       instr->type == nir_instr_type_deref ||
       instr->type == nir_instr_type_tex ||
       instr->type == nir_instr_type_load_const ||
       instr->type == nir_instr_type_undef)
      return true;

   if (instr->type == nir_instr_type_intrinsic &&
       nir_intrinsic_can_reorder(nir_instr_as_intrinsic(instr)))
      return true;

   return false;
}

static nir_instr *
get_uniform_inst_resource(nir_instr *instr)
{
   if (instr->type == nir_instr_type_tex) {
      nir_tex_instr *tex = nir_instr_as_tex(instr);

      if (tex->texture_non_uniform)
         return NULL;

      for (unsigned i = 0; i < tex->num_srcs; i++) {
         switch (tex->src[i].src_type) {
         case nir_tex_src_texture_deref:
         case nir_tex_src_texture_handle:
            return tex->src[i].src.ssa->parent_instr;
         default:
            break;
         }
      }
      return NULL;
   }

   if (instr->type == nir_instr_type_intrinsic)
      return get_intrinsic_resource(nir_instr_as_intrinsic(instr));

   return NULL;
}

struct check_sources_state {
   nir_block *block;
   uint32_t first_index;
};

static bool
has_only_sources_less_than(nir_src *src, void *data)
{
   struct check_sources_state *state = (struct check_sources_state *)data;

   /* true if nir_foreach_src should keep going */
   return state->block != src->ssa->parent_instr->block ||
          src->ssa->parent_instr->index < state->first_index;
}

static void
group_loads(nir_instr *first, nir_instr *last)
{
   /* Walk the instruction range between the first and last backward, and
    * move those that have no uses within the range after the last one.
    */
   for (nir_instr *instr = exec_node_data_backward(nir_instr,
                                                   last->node.prev, node);
        instr != first;
        instr = exec_node_data_backward(nir_instr, instr->node.prev, node)) {
      /* Only move instructions without side effects. */
      if (!can_move(instr, first->pass_flags))
         continue;

      nir_def *def = nir_instr_def(instr);
      if (def) {
         bool all_uses_after_last = true;

         nir_foreach_use(use, def) {
            if (nir_src_parent_instr(use)->block == instr->block &&
                nir_src_parent_instr(use)->index <= last->index) {
               all_uses_after_last = false;
               break;
            }
         }

         if (all_uses_after_last) {
            nir_instr *move_instr = instr;
            /* Save the next node as the iterator position because we're
             * about to move the current instruction.
             */
            instr = exec_node_data_forward(nir_instr, instr->node.next, node);

            /* Move the instruction after the last and update its index
             * to indicate that it's after it.
             */
            nir_instr_move(nir_after_instr(last), move_instr);
            move_instr->index = last->index + 1;
         }
      }
   }

   struct check_sources_state state;
   state.block = first->block;
   state.first_index = first->index;

   /* Walk the instruction range between the first and last forward, and move
    * those that have no sources within the range before the first one.
    */
   for (nir_instr *instr = exec_node_data_forward(nir_instr,
                                                  first->node.next, node);
        instr != last;
        instr = exec_node_data_forward(nir_instr, instr->node.next, node)) {
      /* Only move instructions without side effects. */
      if (!can_move(instr, first->pass_flags))
         continue;

      if (nir_foreach_src(instr, has_only_sources_less_than, &state)) {
         nir_instr *move_instr = instr;
         /* Save the previous node as the iterator position because we're
          * about to move the current instruction.
          */
         instr = exec_node_data_backward(nir_instr, instr->node.prev, node);

         /* Move the instruction before the first and update its index
          * to indicate that it's before it.
          */
         nir_instr_move(nir_before_instr(first), move_instr);
         move_instr->index = first->index - 1;
      }
   }
}

static bool
is_pseudo_inst(nir_instr *instr)
{
   /* Other instructions do not usually contribute to the shader binary size. */
   return instr->type != nir_instr_type_alu &&
          instr->type != nir_instr_type_call &&
          instr->type != nir_instr_type_tex &&
          instr->type != nir_instr_type_intrinsic;
}

static void
set_instr_indices(nir_block *block)
{
   /* Start with 1 because we'll move instructions before the first one
    * and will want to label them 0.
    */
   unsigned counter = 1;
   nir_instr *last = NULL;

   nir_foreach_instr(instr, block) {
      /* Make sure grouped instructions don't have the same index as pseudo
       * instructions.
       */
      if (last && is_pseudo_inst(last) && is_grouped_load(instr))
         counter++;

      /* Set each instruction's index within the block. */
      instr->index = counter;

      /* Only count non-pseudo instructions. */
      if (!is_pseudo_inst(instr))
         counter++;

      last = instr;
   }
}
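
/* A schematic example of the resulting numbering (hypothetical block, not
 * real NIR output):
 *
 *    deref_var   -> index 1   (pseudo: doesn't advance the counter)
 *    load_ssbo   -> index 2   (bumped so it never shares the deref's index)
 *    fadd        -> index 3
 *
 * Pseudo instructions share the index of the following non-pseudo
 * instruction, except that a grouped load is always given its own index.
 */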

static void
handle_load_range(nir_instr **first, nir_instr **last,
                  nir_instr *current, unsigned max_distance)
{
   assert(!current || !*first || current->index >= (*first)->index);
   if (*first && *last &&
       (!current || current->index - (*first)->index > max_distance)) {
      assert(*first != *last);
      group_loads(*first, *last);
      set_instr_indices((*first)->block);
      *first = NULL;
      *last = NULL;
   }
}

static bool
is_barrier(nir_instr *instr)
{
   if (instr->type == nir_instr_type_intrinsic) {
      nir_intrinsic_instr *intr = nir_instr_as_intrinsic(instr);
      const char *name = nir_intrinsic_infos[intr->intrinsic].name;

      if (intr->intrinsic == nir_intrinsic_terminate ||
          intr->intrinsic == nir_intrinsic_terminate_if ||
          /* TODO: nir_intrinsics.py could do this */
          strstr(name, "barrier"))
         return true;
   }

   return false;
}

struct indirection_state {
   nir_block *block;
   unsigned indirections;
};

static unsigned
get_num_indirections(nir_instr *instr);

static bool
gather_indirections(nir_src *src, void *data)
{
   struct indirection_state *state = (struct indirection_state *)data;
   nir_instr *instr = src->ssa->parent_instr;

   /* We only count indirections within the same block. */
   if (instr->block == state->block) {
      unsigned indirections = get_num_indirections(src->ssa->parent_instr);

      if (instr->type == nir_instr_type_tex || is_memory_load(instr))
         indirections++;

      state->indirections = MAX2(state->indirections, indirections);
   }

   return true; /* whether nir_foreach_src should keep going */
}

/* Return the number of load indirections within the block. */
static unsigned
get_num_indirections(nir_instr *instr)
{
   /* Don't traverse phis because we could end up in an infinite recursion
    * if the phi points to the current block (such as a loop body).
    */
   if (instr->type == nir_instr_type_phi)
      return 0;

   if (instr->index != UINT32_MAX)
      return instr->index; /* we've visited this instruction before */

   struct indirection_state state;
   state.block = instr->block;
   state.indirections = 0;

   nir_foreach_src(instr, gather_indirections, &state);

   instr->index = state.indirections;
   return state.indirections;
}
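
/* A schematic example (hypothetical load chain, not real NIR output):
 *
 *    a = load_ssbo(0, offset)   -> level 0 (no in-block load sources)
 *    b = load_ssbo(0, a.x)      -> level 1 (depends on load a)
 *    c = load_ssbo(0, b.x)      -> level 2 (depends on load b)
 *
 * The computed level is cached in instr->index, so each instruction is
 * only traversed once per block.
 */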

static void
process_block(nir_block *block, nir_load_grouping grouping,
              unsigned max_distance)
{
   int max_indirection = -1;
   unsigned num_inst_per_level[256] = { 0 };

   /* UINT32_MAX means the instruction has not been visited. Once
    * an instruction has been visited and its indirection level has been
    * determined, we store the indirection level in the index, so that
    * later visits use the stored level instead of recomputing it, which
    * would result in an exponential time complexity.
    */
   nir_foreach_instr(instr, block) {
      instr->index = UINT32_MAX; /* unknown */
   }

   /* Count the number of load indirections for each load instruction
    * within this block. Store it in pass_flags.
    */
   nir_foreach_instr(instr, block) {
      if (is_grouped_load(instr)) {
         unsigned indirections = get_num_indirections(instr);

         /* pass_flags has only 8 bits */
         indirections = MIN2(indirections, 255);
         num_inst_per_level[indirections]++;
         instr->pass_flags = indirections;

         max_indirection = MAX2(max_indirection, (int)indirections);
      }
   }

   /* 255 contains all indirection levels >= 255, so ignore them. */
   max_indirection = MIN2(max_indirection, 254);

   /* Each indirection level is grouped. */
   for (int level = 0; level <= max_indirection; level++) {
      if (num_inst_per_level[level] <= 1)
         continue;

      set_instr_indices(block);

      nir_instr *resource = NULL;
      nir_instr *first_load = NULL, *last_load = NULL;

      /* Find the first and last instruction that use the same
       * resource and are within a certain distance of each other.
       * If found, group them by moving all movable instructions
       * between them out.
       */
      nir_foreach_instr(current, block) {
         /* Don't group across barriers. */
         if (is_barrier(current)) {
            /* Group unconditionally. */
            handle_load_range(&first_load, &last_load, NULL, 0);
            first_load = NULL;
            last_load = NULL;
            continue;
         }

         /* Only group load instructions with the same indirection level. */
         if (is_grouped_load(current) && current->pass_flags == level) {
            nir_instr *current_resource;

            switch (grouping) {
            case nir_group_all:
               if (!first_load)
                  first_load = current;
               else
                  last_load = current;
               break;

            case nir_group_same_resource_only:
               current_resource = get_uniform_inst_resource(current);

               if (current_resource) {
                  if (!first_load) {
                     first_load = current;
                     resource = current_resource;
                  } else if (current_resource == resource) {
                     last_load = current;
                  }
               }
            }
         }

         /* Group only if we exceeded the maximum distance. */
         handle_load_range(&first_load, &last_load, current, max_distance);
      }

      /* Group unconditionally. */
      handle_load_range(&first_load, &last_load, NULL, 0);
   }
}

/* max_distance is the maximum distance between the first and last instruction
 * in a group.
 */
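/* Example driver usage (illustrative sketch; assumes the caller uses Mesa's
 * NIR_PASS helper, and the distance of 32 is an arbitrary sample value, not
 * a tuned default):
 *
 *    NIR_PASS(_, nir, nir_group_loads, nir_group_same_resource_only, 32);
 *
 * A backend instruction scheduler should still run afterwards to keep
 * register pressure under control.
 */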
void
nir_group_loads(nir_shader *shader, nir_load_grouping grouping,
                unsigned max_distance)
{
   nir_foreach_function_impl(impl, shader) {
      nir_foreach_block(block, impl) {
         process_block(block, grouping, max_distance);
      }

      nir_metadata_preserve(impl, nir_metadata_control_flow |
                                  nir_metadata_loop_analysis);
   }
}