/*
 * Copyright © 2021 Advanced Micro Devices, Inc.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
 * IN THE SOFTWARE.
 */

/* This is a block-level load instruction scheduler where loads are grouped
 * according to their indirection level within a basic block. An indirection
 * occurs when the result of one load is used as a source of another load.
 * The result is that disjoint ALU opcode groups and load (texture) opcode
 * groups are created, where each successive load group is the next level of
 * indirection. It's done by finding the first and last load with the same
 * indirection level, and moving all unrelated instructions between them
 * after the last load, except for load sources, which are moved before the
 * first load. It naturally suits hardware that has limits on texture
 * indirections, but other hardware can benefit too. Only texture, image, and
 * SSBO load and atomic instructions are grouped.
 *
 * There is an option to group only those loads that use the same resource
 * variable. This increases the chance to get more cache hits than if the
 * loads were spread out.
 *
 * The increased register usage is offset by the increase in observed memory
 * bandwidth due to more cache hits (dependent on hw behavior), which in turn
 * decreases the subgroup lifetime and allows registers to be deallocated and
 * reused sooner. In some bandwidth-bound cases, low register usage doesn't
 * help at all. Doubling the register usage and using those registers to
 * amplify observed bandwidth can improve performance a lot.
 *
 * It's recommended to run a hw-specific instruction scheduler after this to
 * prevent spilling.
 */
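
/* An illustrative sketch (NIR-like pseudocode, not actual pass output) of
 * what grouping the level-0 loads with nir_group_all does to a block: the
 * unrelated ALU between the two loads is moved out of the range, so the
 * loads end up back to back and their latencies can overlap:
 *
 *    Before:                             After:
 *       %a = load_ssbo %buf0, %off0         %a = load_ssbo %buf0, %off0
 *       %x = fadd %a, %c                    %b = load_ssbo %buf1, %off1
 *       %b = load_ssbo %buf1, %off1         %x = fadd %a, %c
 *       %y = fmul %x, %b                    %y = fmul %x, %b
 *
 * A load consuming %a or %b would be at indirection level 1 and would be
 * grouped separately.
 */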

#include "nir.h"

static bool
is_memory_load(nir_instr *instr)
{
   /* Count texture_size too because it has the same latency as cache hits. */
   if (instr->type == nir_instr_type_tex)
      return true;

   if (instr->type == nir_instr_type_intrinsic) {
      nir_intrinsic_instr *intr = nir_instr_as_intrinsic(instr);
      const char *name = nir_intrinsic_infos[intr->intrinsic].name;

      /* TODO: nir_intrinsics.py could do this */
      /* load_ubo is ignored because it's usually cheap. */
      if (!nir_intrinsic_writes_external_memory(intr) &&
          !strstr(name, "shared") &&
          (strstr(name, "ssbo") || strstr(name, "image")))
         return true;
   }

   return false;
}

static nir_instr *
get_intrinsic_resource(nir_intrinsic_instr *intr)
{
   /* This is also the list of intrinsics that are grouped. */
   /* load_ubo is ignored because it's usually cheap. */
   switch (intr->intrinsic) {
   case nir_intrinsic_image_load:
   case nir_intrinsic_image_deref_load:
   case nir_intrinsic_image_sparse_load:
   case nir_intrinsic_image_deref_sparse_load:
   case nir_intrinsic_image_samples_identical:
   case nir_intrinsic_image_deref_samples_identical:
   case nir_intrinsic_bindless_image_samples_identical:
   /* Group image_size too because it has the same latency as cache hits. */
   case nir_intrinsic_image_size:
   case nir_intrinsic_image_deref_size:
   case nir_intrinsic_bindless_image_load:
   case nir_intrinsic_bindless_image_sparse_load:
   case nir_intrinsic_load_ssbo:
   case nir_intrinsic_image_fragment_mask_load_amd:
   case nir_intrinsic_image_deref_fragment_mask_load_amd:
   case nir_intrinsic_bindless_image_fragment_mask_load_amd:
      return intr->src[0].ssa->parent_instr;
   default:
      return NULL;
   }
}

/* Track only those that we want to group. */
static bool
is_grouped_load(nir_instr *instr)
{
   /* Count texture_size too because it has the same latency as cache hits. */
   if (instr->type == nir_instr_type_tex)
      return true;

   if (instr->type == nir_instr_type_intrinsic)
      return get_intrinsic_resource(nir_instr_as_intrinsic(instr)) != NULL;

   return false;
}

static bool
can_move(nir_instr *instr, uint8_t current_indirection_level)
{
   /* Grouping is done by moving everything else out of the first/last
    * instruction range of the indirection level.
    */
   if (is_grouped_load(instr) && instr->pass_flags == current_indirection_level)
      return false;

   if (instr->type == nir_instr_type_alu ||
       instr->type == nir_instr_type_deref ||
       instr->type == nir_instr_type_tex ||
       instr->type == nir_instr_type_load_const ||
       instr->type == nir_instr_type_undef)
      return true;

   if (instr->type == nir_instr_type_intrinsic &&
       nir_intrinsic_can_reorder(nir_instr_as_intrinsic(instr)))
      return true;

   return false;
}

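/* Return the instruction producing the resource (texture deref/handle, image,
 * or SSBO index) that a load uses, or NULL if it can't be determined (e.g.
 * non-uniform textures). nir_group_same_resource_only uses this to group only
 * loads that read the same resource.
 */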
static nir_instr *
get_uniform_inst_resource(nir_instr *instr)
{
   if (instr->type == nir_instr_type_tex) {
      nir_tex_instr *tex = nir_instr_as_tex(instr);

      if (tex->texture_non_uniform)
         return NULL;

      for (unsigned i = 0; i < tex->num_srcs; i++) {
         switch (tex->src[i].src_type) {
         case nir_tex_src_texture_deref:
         case nir_tex_src_texture_handle:
            return tex->src[i].src.ssa->parent_instr;
         default:
            break;
         }
      }
      return NULL;
   }

   if (instr->type == nir_instr_type_intrinsic)
      return get_intrinsic_resource(nir_instr_as_intrinsic(instr));

   return NULL;
}

struct check_sources_state {
   nir_block *block;
   uint32_t first_index;
};

static bool
has_only_sources_less_than(nir_src *src, void *data)
{
   struct check_sources_state *state = (struct check_sources_state *)data;

   /* true if nir_foreach_src should keep going */
   return state->block != src->ssa->parent_instr->block ||
          src->ssa->parent_instr->index < state->first_index;
}

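/* Group the loads of one indirection level delimited by "first" and "last":
 * movable instructions in between whose results are only used after "last"
 * (or in another block) are moved after "last", and those whose sources are
 * all defined before "first" are moved before "first", leaving the loads
 * packed together.
 */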
static void
group_loads(nir_instr *first, nir_instr *last)
{
   /* Walk the instruction range between the first and last backward, and
    * move those that have no uses within the range after the last one.
    */
   for (nir_instr *instr = exec_node_data_backward(nir_instr,
                                                   last->node.prev, node);
        instr != first;
        instr = exec_node_data_backward(nir_instr, instr->node.prev, node)) {
      /* Only move instructions without side effects. */
      if (!can_move(instr, first->pass_flags))
         continue;

      nir_def *def = nir_instr_def(instr);
      if (def) {
         bool all_uses_after_last = true;

         nir_foreach_use(use, def) {
            if (nir_src_parent_instr(use)->block == instr->block &&
                nir_src_parent_instr(use)->index <= last->index) {
               all_uses_after_last = false;
               break;
            }
         }

         if (all_uses_after_last) {
            nir_instr *move_instr = instr;
            /* Step to the already-visited neighbor first because moving the
             * current instruction unlinks it from its position in the list.
             */
            instr = exec_node_data_forward(nir_instr, instr->node.next, node);

            /* Move the instruction after the last load and update its index
             * to indicate that it's after it.
             */
            nir_instr_move(nir_after_instr(last), move_instr);
            move_instr->index = last->index + 1;
         }
      }
   }

   struct check_sources_state state;
   state.block = first->block;
   state.first_index = first->index;

   /* Walk the instruction range between the first and last forward, and move
    * those that have no sources within the range before the first one.
    */
   for (nir_instr *instr = exec_node_data_forward(nir_instr,
                                                  first->node.next, node);
        instr != last;
        instr = exec_node_data_forward(nir_instr, instr->node.next, node)) {
      /* Only move instructions without side effects. */
      if (!can_move(instr, first->pass_flags))
         continue;

      if (nir_foreach_src(instr, has_only_sources_less_than, &state)) {
         nir_instr *move_instr = instr;
         /* Step to the already-visited neighbor first because moving the
          * current instruction unlinks it from its position in the list.
          */
         instr = exec_node_data_backward(nir_instr, instr->node.prev, node);

         /* Move the instruction before the first load and update its index
          * to indicate that it's before it.
          */
         nir_instr_move(nir_before_instr(first), move_instr);
         move_instr->index = first->index - 1;
      }
   }
}

static bool
is_pseudo_inst(nir_instr *instr)
{
   /* Instructions other than ALU, call, tex, and intrinsic usually don't
    * contribute to the shader binary size.
    */
   return instr->type != nir_instr_type_alu &&
          instr->type != nir_instr_type_call &&
          instr->type != nir_instr_type_tex &&
          instr->type != nir_instr_type_intrinsic;
}

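/* A sketch of the numbering below (derived from the current code, purely for
 * illustration): a block containing
 *
 *    deref, tex, alu, deref, tex
 *
 * gets the indices 1, 2, 3, 4, 5. Without the extra bump before a grouped
 * load, the pseudo deref and the tex consuming it would share an index
 * (1, 1, 2, 3, 3), and the index comparisons in group_loads and
 * handle_load_range couldn't tell them apart.
 */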
static void
set_instr_indices(nir_block *block)
{
   /* Start with 1 because we'll move instructions before the first one
    * and will want to label them 0.
    */
   unsigned counter = 1;
   nir_instr *last = NULL;

   nir_foreach_instr(instr, block) {
      /* Make sure grouped instructions don't have the same index as pseudo
       * instructions.
       */
      if (last && is_pseudo_inst(last) && is_grouped_load(instr))
         counter++;

      /* Set each instruction's index within the block. */
      instr->index = counter;

      /* Only count non-pseudo instructions. */
      if (!is_pseudo_inst(instr))
         counter++;

      last = instr;
   }
}

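/* Flush the current group if there is one and it's complete: either the
 * caller passed current == NULL (end of block or barrier), or the next
 * instruction is more than max_distance away from the first load. Grouping
 * re-indexes the block and resets the range.
 */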
static void
handle_load_range(nir_instr **first, nir_instr **last,
                  nir_instr *current, unsigned max_distance)
{
   if (*first && *last &&
       (!current || current->index > (*first)->index + max_distance)) {
      assert(*first != *last);
      group_loads(*first, *last);
      set_instr_indices((*first)->block);
      *first = NULL;
      *last = NULL;
   }
}

static bool
is_barrier(nir_instr *instr)
{
   if (instr->type == nir_instr_type_intrinsic) {
      nir_intrinsic_instr *intr = nir_instr_as_intrinsic(instr);
      const char *name = nir_intrinsic_infos[intr->intrinsic].name;

      if (intr->intrinsic == nir_intrinsic_discard ||
          intr->intrinsic == nir_intrinsic_discard_if ||
          intr->intrinsic == nir_intrinsic_terminate ||
          intr->intrinsic == nir_intrinsic_terminate_if ||
          /* TODO: nir_intrinsics.py could do this */
          strstr(name, "barrier"))
         return true;
   }

   return false;
}

struct indirection_state {
   nir_block *block;
   unsigned indirections;
};

static unsigned
get_num_indirections(nir_instr *instr);

static bool
gather_indirections(nir_src *src, void *data)
{
   struct indirection_state *state = (struct indirection_state *)data;
   nir_instr *instr = src->ssa->parent_instr;

   /* We only count indirections within the same block. */
   if (instr->block == state->block) {
      unsigned indirections = get_num_indirections(src->ssa->parent_instr);

      if (instr->type == nir_instr_type_tex || is_memory_load(instr))
         indirections++;

      state->indirections = MAX2(state->indirections, indirections);
   }

   return true; /* whether nir_foreach_src should keep going */
}

/* Return the number of load indirections within the block. */
static unsigned
get_num_indirections(nir_instr *instr)
{
   /* Don't traverse phis because we could end up in an infinite recursion
    * if the phi points to the current block (such as a loop body).
    */
   if (instr->type == nir_instr_type_phi)
      return 0;

   if (instr->index != UINT32_MAX)
      return instr->index; /* we've visited this instruction before */

   struct indirection_state state;
   state.block = instr->block;
   state.indirections = 0;

   nir_foreach_src(instr, gather_indirections, &state);

   instr->index = state.indirections;
   return state.indirections;
}

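/* Group the loads of one block, one indirection level at a time. Level 0
 * holds loads whose sources don't depend on any other load in this block;
 * a load that consumes the result of a level-N load is at level N + 1.
 */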
static void
process_block(nir_block *block, nir_load_grouping grouping,
              unsigned max_distance)
{
   int max_indirection = -1;
   unsigned num_inst_per_level[256] = { 0 };

   /* UINT32_MAX means the instruction has not been visited. Once
    * an instruction has been visited and its indirection level has been
    * determined, we store the indirection level in the index. Any later
    * visit uses the stored index instead of recomputing the indirection
    * level, which would otherwise result in exponential time complexity.
    */
   nir_foreach_instr(instr, block) {
      instr->index = UINT32_MAX; /* unknown */
   }

   /* Count the number of load indirections for each load instruction
    * within this block. Store it in pass_flags.
    */
   nir_foreach_instr(instr, block) {
      if (is_grouped_load(instr)) {
         unsigned indirections = get_num_indirections(instr);

         /* pass_flags has only 8 bits */
         indirections = MIN2(indirections, 255);
         num_inst_per_level[indirections]++;
         instr->pass_flags = indirections;

         max_indirection = MAX2(max_indirection, (int)indirections);
      }
   }

   /* Level 255 contains all indirection levels >= 255, so ignore it. */
   max_indirection = MIN2(max_indirection, 254);

   /* Each indirection level is grouped. */
   for (int level = 0; level <= max_indirection; level++) {
      if (num_inst_per_level[level] <= 1)
         continue;

      set_instr_indices(block);

      nir_instr *resource = NULL;
      nir_instr *first_load = NULL, *last_load = NULL;

      /* Find the first and last instruction that use the same
       * resource and are within a certain distance of each other.
       * If found, group them by moving all movable instructions
       * between them out.
       */
      nir_foreach_instr(current, block) {
         /* Don't group across barriers. */
         if (is_barrier(current)) {
            /* Group unconditionally. */
            handle_load_range(&first_load, &last_load, NULL, 0);
            first_load = NULL;
            last_load = NULL;
            continue;
         }

         /* Only group load instructions with the same indirection level. */
         if (is_grouped_load(current) && current->pass_flags == level) {
            nir_instr *current_resource;

            switch (grouping) {
            case nir_group_all:
               if (!first_load)
                  first_load = current;
               else
                  last_load = current;
               break;

            case nir_group_same_resource_only:
               current_resource = get_uniform_inst_resource(current);

               if (current_resource) {
                  if (!first_load) {
                     first_load = current;
                     resource = current_resource;
                  } else if (current_resource == resource) {
                     last_load = current;
                  }
               }
            }
         }

         /* Group only if we exceeded the maximum distance. */
         handle_load_range(&first_load, &last_load, current, max_distance);
      }

      /* Group unconditionally. */
      handle_load_range(&first_load, &last_load, NULL, 0);
   }
}

/* max_distance is the maximum distance between the first and last instruction
 * in a group.
 */
void
nir_group_loads(nir_shader *shader, nir_load_grouping grouping,
                unsigned max_distance)
{
   nir_foreach_function_impl(impl, shader) {
      nir_foreach_block(block, impl) {
         process_block(block, grouping, max_distance);
      }

      nir_metadata_preserve(impl, nir_metadata_block_index |
                                  nir_metadata_dominance |
                                  nir_metadata_loop_analysis);
   }
}
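
/* Example usage (a hypothetical driver sequence, not taken from any driver):
 * run the grouping late in the NIR pipeline and follow it with the backend's
 * own scheduler so register pressure can still be limited, e.g.:
 *
 *    nir_group_loads(nir, nir_group_same_resource_only, 32);
 *
 * where 32 is an arbitrary example value for max_distance.
 */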