1 /*
2  * Copyright © 2018 Valve Corporation
3  *
4  * Permission is hereby granted, free of charge, to any person obtaining a
5  * copy of this software and associated documentation files (the "Software"),
6  * to deal in the Software without restriction, including without limitation
7  * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8  * and/or sell copies of the Software, and to permit persons to whom the
9  * Software is furnished to do so, subject to the following conditions:
10  *
11  * The above copyright notice and this permission notice (including the next
12  * paragraph) shall be included in all copies or substantial portions of the
13  * Software.
14  *
15  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
18  * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20  * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
21  * IN THE SOFTWARE.
22  *
23  */
24 
25 #include "aco_builder.h"
26 #include "aco_ir.h"
27 
28 #include "common/amdgfxregs.h"
29 
30 #include <algorithm>
31 #include <unordered_set>
32 #include <vector>
33 
34 #define SMEM_WINDOW_SIZE    (350 - ctx.num_waves * 35)
35 #define VMEM_WINDOW_SIZE    (1024 - ctx.num_waves * 64)
36 #define POS_EXP_WINDOW_SIZE 512
37 #define SMEM_MAX_MOVES      (64 - ctx.num_waves * 4)
38 #define VMEM_MAX_MOVES      (256 - ctx.num_waves * 16)
39 /* creating clauses decreases def-use distances, so be less aggressive when num_waves is low */
40 #define VMEM_CLAUSE_MAX_GRAB_DIST (ctx.num_waves * 2)
41 #define POS_EXP_MAX_MOVES         512
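/* For example, with ctx.num_waves == 8 the limits above evaluate to a SMEM window of
 * 350 - 8*35 = 70 instructions with at most 64 - 8*4 = 32 moves, a VMEM window of
 * 1024 - 8*64 = 512 instructions with at most 256 - 8*16 = 128 moves, and a clause grab
 * distance of 8*2 = 16: the more waves are available to hide latency, the less work the
 * scheduler does. */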
42 
43 namespace aco {
44 
45 enum MoveResult {
46    move_success,
47    move_fail_ssa,
48    move_fail_rar,
49    move_fail_pressure,
50 };
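/* move_success: the candidate was moved. move_fail_ssa: the move would break a def-use
 * (SSA) dependency. move_fail_rar: a read-after-read conflict on a killed operand would
 * change live ranges. move_fail_pressure: the move would exceed the register limit. */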
51 
52 /**
53  * Cursor for downwards moves, where a single instruction is moved towards
54  * or below a group of instructions that the hardware can execute as a clause.
55  */
56 struct DownwardsCursor {
57    int source_idx; /* Current instruction to consider for moving */
58 
59    int insert_idx_clause; /* First clause instruction */
60    int insert_idx;        /* First instruction *after* the clause */
61 
62    /* Maximum demand of all clause instructions,
63     * i.e. from insert_idx_clause (inclusive) to insert_idx (exclusive) */
64    RegisterDemand clause_demand;
65    /* Maximum demand of instructions from source_idx to insert_idx_clause (both exclusive) */
66    RegisterDemand total_demand;
67 
68    DownwardsCursor(int current_idx, RegisterDemand initial_clause_demand)
69        : source_idx(current_idx - 1), insert_idx_clause(current_idx), insert_idx(current_idx + 1),
70          clause_demand(initial_clause_demand)
71    {}
72 
73    void verify_invariants(const RegisterDemand* register_demand);
74 };
75 
76 /**
77  * Cursor for upwards moves, where a single instruction is moved below
78  * another instruction.
79  */
80 struct UpwardsCursor {
81    int source_idx; /* Current instruction to consider for moving */
82    int insert_idx; /* Instruction to move in front of */
83 
84    /* Maximum demand of instructions from insert_idx (inclusive) to source_idx (exclusive) */
85    RegisterDemand total_demand;
86 
87    UpwardsCursor(int source_idx_) : source_idx(source_idx_)
88    {
89       insert_idx = -1; /* to be initialized later */
90    }
91 
92    bool has_insert_idx() const { return insert_idx != -1; }
93    void verify_invariants(const RegisterDemand* register_demand);
94 };
95 
96 struct MoveState {
97    RegisterDemand max_registers;
98 
99    Block* block;
100    Instruction* current;
101    RegisterDemand* register_demand; /* demand per instruction */
102    bool improved_rar;
103 
104    std::vector<bool> depends_on;
105    /* Two are needed because, for downwards VMEM scheduling, one needs to
106     * exclude the instructions in the clause, since new instructions in the
107     * clause are not moved past any other instructions in the clause. */
108    std::vector<bool> RAR_dependencies;
109    std::vector<bool> RAR_dependencies_clause;
110 
111    /* for moving instructions before the current instruction to after it */
112    DownwardsCursor downwards_init(int current_idx, bool improved_rar, bool may_form_clauses);
113    MoveResult downwards_move(DownwardsCursor&, bool clause);
114    void downwards_skip(DownwardsCursor&);
115 
116    /* for moving instructions after the first use of the current instruction upwards */
117    UpwardsCursor upwards_init(int source_idx, bool improved_rar);
118    bool upwards_check_deps(UpwardsCursor&);
119    void upwards_update_insert_idx(UpwardsCursor&);
120    MoveResult upwards_move(UpwardsCursor&);
121    void upwards_skip(UpwardsCursor&);
122 };
123 
124 struct sched_ctx {
125    amd_gfx_level gfx_level;
126    int16_t num_waves;
127    int16_t last_SMEM_stall;
128    int last_SMEM_dep_idx;
129    MoveState mv;
130    bool schedule_pos_exports = true;
131    unsigned schedule_pos_export_div = 1;
132 };
133 
134 /* This scheduler is a simple bottom-up pass based on ideas from
135  * "A Novel Lightweight Instruction Scheduling Algorithm for Just-In-Time Compiler"
136  * by Xiaohua Shi and Peng Guo.
137  * The basic approach is to iterate over all instructions. When a memory instruction
138  * is encountered, it tries to move independent instructions from above and below
139  * between the memory instruction and its first user.
140  * The novelty is that this scheduler takes the current register pressure into account:
141  * instructions are only moved if the register pressure stays within a certain bound.
142  */
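/* For example, when a VMEM load is visited, schedule_VMEM() first tries to move independent
 * instructions that precede the load to a point below it (downwards pass), and then tries to
 * move independent instructions that follow the load's first user to a point above that user
 * (upwards pass). Both passes increase the number of instructions between the load and its
 * first user, so more of the memory latency can be hidden. */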
143 
144 template <typename T>
145 void
146 move_element(T begin_it, size_t idx, size_t before)
147 {
148    if (idx < before) {
149       auto begin = std::next(begin_it, idx);
150       auto end = std::next(begin_it, before);
151       std::rotate(begin, begin + 1, end);
152    } else if (idx > before) {
153       auto begin = std::next(begin_it, before);
154       auto end = std::next(begin_it, idx + 1);
155       std::rotate(begin, end - 1, end);
156    }
157 }
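/* For example, given a sequence {a, b, c, d}, move_element(v.begin(), 0, 3) yields
 * {b, c, a, d} and move_element(v.begin(), 3, 1) yields {a, d, b, c}: the element at idx
 * is re-inserted immediately before the element that was at index `before`, while all
 * other elements keep their relative order. */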
158 
159 void
160 DownwardsCursor::verify_invariants(const RegisterDemand* register_demand)
161 {
162    assert(source_idx < insert_idx_clause);
163    assert(insert_idx_clause < insert_idx);
164 
165 #ifndef NDEBUG
166    RegisterDemand reference_demand;
167    for (int i = source_idx + 1; i < insert_idx_clause; ++i) {
168       reference_demand.update(register_demand[i]);
169    }
170    assert(total_demand == reference_demand);
171 
172    reference_demand = {};
173    for (int i = insert_idx_clause; i < insert_idx; ++i) {
174       reference_demand.update(register_demand[i]);
175    }
176    assert(clause_demand == reference_demand);
177 #endif
178 }
179 
180 DownwardsCursor
181 MoveState::downwards_init(int current_idx, bool improved_rar_, bool may_form_clauses)
182 {
183    improved_rar = improved_rar_;
184 
185    std::fill(depends_on.begin(), depends_on.end(), false);
186    if (improved_rar) {
187       std::fill(RAR_dependencies.begin(), RAR_dependencies.end(), false);
188       if (may_form_clauses)
189          std::fill(RAR_dependencies_clause.begin(), RAR_dependencies_clause.end(), false);
190    }
191 
192    for (const Operand& op : current->operands) {
193       if (op.isTemp()) {
194          depends_on[op.tempId()] = true;
195          if (improved_rar && op.isFirstKill())
196             RAR_dependencies[op.tempId()] = true;
197       }
198    }
199 
200    DownwardsCursor cursor(current_idx, register_demand[current_idx]);
201    cursor.verify_invariants(register_demand);
202    return cursor;
203 }
204 
205 /* If add_to_clause is true, the current clause is extended by moving the
206  * instruction at source_idx in front of the clause. Otherwise, the instruction
207  * is moved past the end of the clause without extending it */
208 MoveResult
209 MoveState::downwards_move(DownwardsCursor& cursor, bool add_to_clause)
210 {
211    aco_ptr<Instruction>& instr = block->instructions[cursor.source_idx];
212 
213    for (const Definition& def : instr->definitions)
214       if (def.isTemp() && depends_on[def.tempId()])
215          return move_fail_ssa;
216 
217    /* check if one of the candidate's operands is killed by a depending instruction */
218    std::vector<bool>& RAR_deps =
219       improved_rar ? (add_to_clause ? RAR_dependencies_clause : RAR_dependencies) : depends_on;
220    for (const Operand& op : instr->operands) {
221       if (op.isTemp() && RAR_deps[op.tempId()]) {
222          // FIXME: account for difference in register pressure
223          return move_fail_rar;
224       }
225    }
226 
227    if (add_to_clause) {
228       for (const Operand& op : instr->operands) {
229          if (op.isTemp()) {
230             depends_on[op.tempId()] = true;
231             if (op.isFirstKill())
232                RAR_dependencies[op.tempId()] = true;
233          }
234       }
235    }
236 
237    const int dest_insert_idx = add_to_clause ? cursor.insert_idx_clause : cursor.insert_idx;
238    RegisterDemand register_pressure = cursor.total_demand;
239    if (!add_to_clause) {
240       register_pressure.update(cursor.clause_demand);
241    }
242 
243    /* Check the new demand of the instructions being moved over */
244    const RegisterDemand candidate_diff = get_live_changes(instr);
245    if (RegisterDemand(register_pressure - candidate_diff).exceeds(max_registers))
246       return move_fail_pressure;
247 
248    /* New demand for the moved instruction */
249    const RegisterDemand temp = get_temp_registers(instr);
250    const RegisterDemand temp2 = get_temp_registers(block->instructions[dest_insert_idx - 1]);
251    const RegisterDemand new_demand = register_demand[dest_insert_idx - 1] - temp2 + temp;
252    if (new_demand.exceeds(max_registers))
253       return move_fail_pressure;
254 
255    /* move the candidate below the memory load */
256    move_element(block->instructions.begin(), cursor.source_idx, dest_insert_idx);
257 
258    /* update register pressure */
259    move_element(register_demand, cursor.source_idx, dest_insert_idx);
260    for (int i = cursor.source_idx; i < dest_insert_idx - 1; i++)
261       register_demand[i] -= candidate_diff;
262    register_demand[dest_insert_idx - 1] = new_demand;
263    cursor.insert_idx_clause--;
264    if (cursor.source_idx != cursor.insert_idx_clause) {
265       /* Update demand if we moved over any instructions before the clause */
266       cursor.total_demand -= candidate_diff;
267    } else {
268       assert(cursor.total_demand == RegisterDemand{});
269    }
270    if (add_to_clause) {
271       cursor.clause_demand.update(new_demand);
272    } else {
273       cursor.clause_demand -= candidate_diff;
274       cursor.insert_idx--;
275    }
276 
277    cursor.source_idx--;
278    cursor.verify_invariants(register_demand);
279    return move_success;
280 }
281 
282 void
283 MoveState::downwards_skip(DownwardsCursor& cursor)
284 {
285    aco_ptr<Instruction>& instr = block->instructions[cursor.source_idx];
286 
287    for (const Operand& op : instr->operands) {
288       if (op.isTemp()) {
289          depends_on[op.tempId()] = true;
290          if (improved_rar && op.isFirstKill()) {
291             RAR_dependencies[op.tempId()] = true;
292             RAR_dependencies_clause[op.tempId()] = true;
293          }
294       }
295    }
296    cursor.total_demand.update(register_demand[cursor.source_idx]);
297    cursor.source_idx--;
298    cursor.verify_invariants(register_demand);
299 }
300 
301 void
302 UpwardsCursor::verify_invariants(const RegisterDemand* register_demand)
303 {
304 #ifndef NDEBUG
305    if (!has_insert_idx()) {
306       return;
307    }
308 
309    assert(insert_idx < source_idx);
310 
311    RegisterDemand reference_demand;
312    for (int i = insert_idx; i < source_idx; ++i) {
313       reference_demand.update(register_demand[i]);
314    }
315    assert(total_demand == reference_demand);
316 #endif
317 }
318 
319 UpwardsCursor
320 MoveState::upwards_init(int source_idx, bool improved_rar_)
321 {
322    improved_rar = improved_rar_;
323 
324    std::fill(depends_on.begin(), depends_on.end(), false);
325    std::fill(RAR_dependencies.begin(), RAR_dependencies.end(), false);
326 
327    for (const Definition& def : current->definitions) {
328       if (def.isTemp())
329          depends_on[def.tempId()] = true;
330    }
331 
332    return UpwardsCursor(source_idx);
333 }
334 
335 bool
336 MoveState::upwards_check_deps(UpwardsCursor& cursor)
337 {
338    aco_ptr<Instruction>& instr = block->instructions[cursor.source_idx];
339    for (const Operand& op : instr->operands) {
340       if (op.isTemp() && depends_on[op.tempId()])
341          return false;
342    }
343    return true;
344 }
345 
346 void
347 MoveState::upwards_update_insert_idx(UpwardsCursor& cursor)
348 {
349    cursor.insert_idx = cursor.source_idx;
350    cursor.total_demand = register_demand[cursor.insert_idx];
351 }
352 
353 MoveResult
354 MoveState::upwards_move(UpwardsCursor& cursor)
355 {
356    assert(cursor.has_insert_idx());
357 
358    aco_ptr<Instruction>& instr = block->instructions[cursor.source_idx];
359    for (const Operand& op : instr->operands) {
360       if (op.isTemp() && depends_on[op.tempId()])
361          return move_fail_ssa;
362    }
363 
364    /* check if candidate uses/kills an operand which is used by a dependency */
365    for (const Operand& op : instr->operands) {
366       if (op.isTemp() && (!improved_rar || op.isFirstKill()) && RAR_dependencies[op.tempId()])
367          return move_fail_rar;
368    }
369 
370    /* check if register pressure is low enough: the diff is negative if register pressure is
371     * decreased */
372    const RegisterDemand candidate_diff = get_live_changes(instr);
373    const RegisterDemand temp = get_temp_registers(instr);
374    if (RegisterDemand(cursor.total_demand + candidate_diff).exceeds(max_registers))
375       return move_fail_pressure;
376    const RegisterDemand temp2 = get_temp_registers(block->instructions[cursor.insert_idx - 1]);
377    const RegisterDemand new_demand =
378       register_demand[cursor.insert_idx - 1] - temp2 + candidate_diff + temp;
379    if (new_demand.exceeds(max_registers))
380       return move_fail_pressure;
381 
382    /* move the candidate above the insert_idx */
383    move_element(block->instructions.begin(), cursor.source_idx, cursor.insert_idx);
384 
385    /* update register pressure */
386    move_element(register_demand, cursor.source_idx, cursor.insert_idx);
387    register_demand[cursor.insert_idx] = new_demand;
388    for (int i = cursor.insert_idx + 1; i <= cursor.source_idx; i++)
389       register_demand[i] += candidate_diff;
390    cursor.total_demand += candidate_diff;
391 
392    cursor.total_demand.update(register_demand[cursor.source_idx]);
393 
394    cursor.insert_idx++;
395    cursor.source_idx++;
396 
397    cursor.verify_invariants(register_demand);
398 
399    return move_success;
400 }
401 
402 void
403 MoveState::upwards_skip(UpwardsCursor& cursor)
404 {
405    if (cursor.has_insert_idx()) {
406       aco_ptr<Instruction>& instr = block->instructions[cursor.source_idx];
407       for (const Definition& def : instr->definitions) {
408          if (def.isTemp())
409             depends_on[def.tempId()] = true;
410       }
411       for (const Operand& op : instr->operands) {
412          if (op.isTemp())
413             RAR_dependencies[op.tempId()] = true;
414       }
415       cursor.total_demand.update(register_demand[cursor.source_idx]);
416    }
417 
418    cursor.source_idx++;
419 
420    cursor.verify_invariants(register_demand);
421 }
422 
423 bool
424 is_done_sendmsg(amd_gfx_level gfx_level, const Instruction* instr)
425 {
426    if (gfx_level <= GFX10_3 && instr->opcode == aco_opcode::s_sendmsg)
427       return (instr->sopp().imm & sendmsg_id_mask) == sendmsg_gs_done;
428    return false;
429 }
430 
431 bool
432 is_pos_prim_export(amd_gfx_level gfx_level, const Instruction* instr)
433 {
434    /* Because of NO_PC_EXPORT=1, a done=1 position or primitive export can launch PS waves before
435     * the NGG/VS wave finishes if there are no parameter exports.
436     */
437    return instr->opcode == aco_opcode::exp && instr->exp().dest >= V_008DFC_SQ_EXP_POS &&
438           instr->exp().dest <= V_008DFC_SQ_EXP_PRIM && gfx_level >= GFX10;
439 }
440 
441 memory_sync_info
442 get_sync_info_with_hack(const Instruction* instr)
443 {
444    memory_sync_info sync = get_sync_info(instr);
445    if (instr->isSMEM() && !instr->operands.empty() && instr->operands[0].bytes() == 16) {
446       // FIXME: currently, it doesn't seem beneficial to omit this due to how our scheduler works
447       sync.storage = (storage_class)(sync.storage | storage_buffer);
448       sync.semantics =
449          (memory_semantics)((sync.semantics | semantic_private) & ~semantic_can_reorder);
450    }
451    return sync;
452 }
453 
454 struct memory_event_set {
455    bool has_control_barrier;
456 
457    unsigned bar_acquire;
458    unsigned bar_release;
459    unsigned bar_classes;
460 
461    unsigned access_acquire;
462    unsigned access_release;
463    unsigned access_relaxed;
464    unsigned access_atomic;
465 };
466 
467 struct hazard_query {
468    amd_gfx_level gfx_level;
469    bool contains_spill;
470    bool contains_sendmsg;
471    bool uses_exec;
472    bool writes_exec;
473    memory_event_set mem_events;
474    unsigned aliasing_storage;      /* storage classes which are accessed (non-SMEM) */
475    unsigned aliasing_storage_smem; /* storage classes which are accessed (SMEM) */
476 };
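/* Typical usage: init_hazard_query() once per scan, add_to_hazard_query() for the scheduled
 * instruction and for every skipped instruction that later candidates have to be moved
 * across, and perform_hazard_query() to test whether a candidate can be reordered with
 * respect to everything added so far. */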
477 
478 void
479 init_hazard_query(const sched_ctx& ctx, hazard_query* query)
480 {
481    query->gfx_level = ctx.gfx_level;
482    query->contains_spill = false;
483    query->contains_sendmsg = false;
484    query->uses_exec = false;
485    query->writes_exec = false;
486    memset(&query->mem_events, 0, sizeof(query->mem_events));
487    query->aliasing_storage = 0;
488    query->aliasing_storage_smem = 0;
489 }
490 
491 void
492 add_memory_event(amd_gfx_level gfx_level, memory_event_set* set, Instruction* instr,
493                  memory_sync_info* sync)
494 {
495    set->has_control_barrier |= is_done_sendmsg(gfx_level, instr);
496    set->has_control_barrier |= is_pos_prim_export(gfx_level, instr);
497    if (instr->opcode == aco_opcode::p_barrier) {
498       Pseudo_barrier_instruction& bar = instr->barrier();
499       if (bar.sync.semantics & semantic_acquire)
500          set->bar_acquire |= bar.sync.storage;
501       if (bar.sync.semantics & semantic_release)
502          set->bar_release |= bar.sync.storage;
503       set->bar_classes |= bar.sync.storage;
504 
505       set->has_control_barrier |= bar.exec_scope > scope_invocation;
506    }
507 
508    if (!sync->storage)
509       return;
510 
511    if (sync->semantics & semantic_acquire)
512       set->access_acquire |= sync->storage;
513    if (sync->semantics & semantic_release)
514       set->access_release |= sync->storage;
515 
516    if (!(sync->semantics & semantic_private)) {
517       if (sync->semantics & semantic_atomic)
518          set->access_atomic |= sync->storage;
519       else
520          set->access_relaxed |= sync->storage;
521    }
522 }
523 
524 void
525 add_to_hazard_query(hazard_query* query, Instruction* instr)
526 {
527    if (instr->opcode == aco_opcode::p_spill || instr->opcode == aco_opcode::p_reload)
528       query->contains_spill = true;
529    query->contains_sendmsg |= instr->opcode == aco_opcode::s_sendmsg;
530    query->uses_exec |= needs_exec_mask(instr);
531    for (const Definition& def : instr->definitions) {
532       if (def.isFixed() && def.physReg() == exec)
533          query->writes_exec = true;
534    }
535 
536    memory_sync_info sync = get_sync_info_with_hack(instr);
537 
538    add_memory_event(query->gfx_level, &query->mem_events, instr, &sync);
539 
540    if (!(sync.semantics & semantic_can_reorder)) {
541       unsigned storage = sync.storage;
542       /* images and buffer/global memory can alias */ // TODO: more precisely, buffer images and
543                                                       // buffer/global memory can alias
544       if (storage & (storage_buffer | storage_image))
545          storage |= storage_buffer | storage_image;
546       if (instr->isSMEM())
547          query->aliasing_storage_smem |= storage;
548       else
549          query->aliasing_storage |= storage;
550    }
551 }
552 
553 enum HazardResult {
554    hazard_success,
555    hazard_fail_reorder_vmem_smem,
556    hazard_fail_reorder_ds,
557    hazard_fail_reorder_sendmsg,
558    hazard_fail_spill,
559    hazard_fail_export,
560    hazard_fail_barrier,
561    /* Must stop at these failures. The hazard query code doesn't account for them
562     * when instructions are added to the query. */
563    hazard_fail_exec,
564    hazard_fail_unreorderable,
565 };
566 
567 HazardResult
568 perform_hazard_query(hazard_query* query, Instruction* instr, bool upwards)
569 {
570    /* don't schedule discards downwards */
571    if (!upwards && instr->opcode == aco_opcode::p_exit_early_if)
572       return hazard_fail_unreorderable;
573 
574    /* In Primitive Ordered Pixel Shading, await overlapped waves as late as possible, and notify
575     * overlapping waves that they can continue execution as early as possible.
576     */
577    if (upwards) {
578       if (instr->opcode == aco_opcode::p_pops_gfx9_add_exiting_wave_id ||
579           (instr->opcode == aco_opcode::s_wait_event &&
580            !(instr->sopp().imm & wait_event_imm_dont_wait_export_ready))) {
581          return hazard_fail_unreorderable;
582       }
583    } else {
584       if (instr->opcode == aco_opcode::p_pops_gfx9_ordered_section_done) {
585          return hazard_fail_unreorderable;
586       }
587    }
588 
589    if (query->uses_exec || query->writes_exec) {
590       for (const Definition& def : instr->definitions) {
591          if (def.isFixed() && def.physReg() == exec)
592             return hazard_fail_exec;
593       }
594    }
595    if (query->writes_exec && needs_exec_mask(instr))
596       return hazard_fail_exec;
597 
598    /* Don't move exports so that they stay closer together.
599     * Since GFX11, export order matters. MRTZ must come first,
600     * then color exports sorted from first to last.
601     * Also, with Primitive Ordered Pixel Shading on GFX11+, the `done` export must not be moved
602     * above the memory accesses before the queue family scope (more precisely, fragment interlock
603     * scope, but it's not available in ACO) release barrier that is expected to be inserted before
604     * the export, as well as before any `s_wait_event export_ready` which enters the ordered
605     * section, because the `done` export exits the ordered section.
606     */
607    if (instr->isEXP() || instr->opcode == aco_opcode::p_dual_src_export_gfx11)
608       return hazard_fail_export;
609 
610    /* don't move non-reorderable instructions */
611    if (instr->opcode == aco_opcode::s_memtime || instr->opcode == aco_opcode::s_memrealtime ||
612        instr->opcode == aco_opcode::s_setprio || instr->opcode == aco_opcode::s_getreg_b32 ||
613        instr->opcode == aco_opcode::p_init_scratch ||
614        instr->opcode == aco_opcode::p_jump_to_epilog ||
615        instr->opcode == aco_opcode::s_sendmsg_rtn_b32 ||
616        instr->opcode == aco_opcode::s_sendmsg_rtn_b64 ||
617        instr->opcode == aco_opcode::p_end_with_regs)
618       return hazard_fail_unreorderable;
619 
620    memory_event_set instr_set;
621    memset(&instr_set, 0, sizeof(instr_set));
622    memory_sync_info sync = get_sync_info_with_hack(instr);
623    add_memory_event(query->gfx_level, &instr_set, instr, &sync);
624 
625    memory_event_set* first = &instr_set;
626    memory_event_set* second = &query->mem_events;
627    if (upwards)
628       std::swap(first, second);
629 
630    /* everything after barrier(acquire) happens after the atomics/control_barriers before it;
631     * everything after load(acquire) happens after the load
632     */
633    if ((first->has_control_barrier || first->access_atomic) && second->bar_acquire)
634       return hazard_fail_barrier;
635    if (((first->access_acquire || first->bar_acquire) && second->bar_classes) ||
636        ((first->access_acquire | first->bar_acquire) &
637         (second->access_relaxed | second->access_atomic)))
638       return hazard_fail_barrier;
639 
640    /* everything before barrier(release) happens before the atomics/control_barriers after it;
641     * everything before store(release) happens before the store
642     */
643    if (first->bar_release && (second->has_control_barrier || second->access_atomic))
644       return hazard_fail_barrier;
645    if ((first->bar_classes && (second->bar_release || second->access_release)) ||
646        ((first->access_relaxed | first->access_atomic) &
647         (second->bar_release | second->access_release)))
648       return hazard_fail_barrier;
649 
650    /* don't move memory barriers around other memory barriers */
651    if (first->bar_classes && second->bar_classes)
652       return hazard_fail_barrier;
653 
654    /* Don't move memory accesses to before control barriers. I don't think
655     * this is necessary for the Vulkan memory model, but it might be for GLSL450. */
656    unsigned control_classes =
657       storage_buffer | storage_image | storage_shared | storage_task_payload;
658    if (first->has_control_barrier &&
659        ((second->access_atomic | second->access_relaxed) & control_classes))
660       return hazard_fail_barrier;
661 
662    /* don't move memory loads/stores past potentially aliasing loads/stores */
663    unsigned aliasing_storage =
664       instr->isSMEM() ? query->aliasing_storage_smem : query->aliasing_storage;
665    if ((sync.storage & aliasing_storage) && !(sync.semantics & semantic_can_reorder)) {
666       unsigned intersect = sync.storage & aliasing_storage;
667       if (intersect & storage_shared)
668          return hazard_fail_reorder_ds;
669       return hazard_fail_reorder_vmem_smem;
670    }
671 
672    if ((instr->opcode == aco_opcode::p_spill || instr->opcode == aco_opcode::p_reload) &&
673        query->contains_spill)
674       return hazard_fail_spill;
675 
676    if (instr->opcode == aco_opcode::s_sendmsg && query->contains_sendmsg)
677       return hazard_fail_reorder_sendmsg;
678 
679    return hazard_success;
680 }
681 
682 void
683 schedule_SMEM(sched_ctx& ctx, Block* block, std::vector<RegisterDemand>& register_demand,
684               Instruction* current, int idx)
685 {
686    assert(idx != 0);
687    int window_size = SMEM_WINDOW_SIZE;
688    int max_moves = SMEM_MAX_MOVES;
689    int16_t k = 0;
690 
691    /* don't move s_memtime/s_memrealtime/s_sendmsg_rtn_b32/b64 */
692    if (current->opcode == aco_opcode::s_memtime || current->opcode == aco_opcode::s_memrealtime ||
693        current->opcode == aco_opcode::s_sendmsg_rtn_b32 ||
694        current->opcode == aco_opcode::s_sendmsg_rtn_b64)
695       return;
696 
697    /* first, check if we have instructions before current to move down */
698    hazard_query hq;
699    init_hazard_query(ctx, &hq);
700    add_to_hazard_query(&hq, current);
701 
702    DownwardsCursor cursor = ctx.mv.downwards_init(idx, false, false);
703 
704    for (int candidate_idx = idx - 1; k < max_moves && candidate_idx > (int)idx - window_size;
705         candidate_idx--) {
706       assert(candidate_idx >= 0);
707       assert(candidate_idx == cursor.source_idx);
708       aco_ptr<Instruction>& candidate = block->instructions[candidate_idx];
709 
710       /* break if we'd make the previous SMEM instruction stall */
711       bool can_stall_prev_smem =
712          idx <= ctx.last_SMEM_dep_idx && candidate_idx < ctx.last_SMEM_dep_idx;
713       if (can_stall_prev_smem && ctx.last_SMEM_stall >= 0)
714          break;
715 
716       /* break when encountering another MEM instruction, logical_start or barriers */
717       if (candidate->opcode == aco_opcode::p_logical_start)
718          break;
719       /* only move VMEM instructions below descriptor loads. be more aggressive at higher num_waves
720        * to help create more vmem clauses */
721       if ((candidate->isVMEM() || candidate->isFlatLike()) &&
722           (cursor.insert_idx - cursor.source_idx > (ctx.num_waves * 4) ||
723            current->operands[0].size() == 4))
724          break;
725       /* don't move descriptor loads below buffer loads */
726       if (candidate->isSMEM() && !candidate->operands.empty() && current->operands[0].size() == 4 &&
727           candidate->operands[0].size() == 2)
728          break;
729 
730       bool can_move_down = true;
731 
732       HazardResult haz = perform_hazard_query(&hq, candidate.get(), false);
733       if (haz == hazard_fail_reorder_ds || haz == hazard_fail_spill ||
734           haz == hazard_fail_reorder_sendmsg || haz == hazard_fail_barrier ||
735           haz == hazard_fail_export)
736          can_move_down = false;
737       else if (haz != hazard_success)
738          break;
739 
740       /* don't use LDS/GDS instructions to hide latency since it can
741        * significantly worsen LDS scheduling */
742       if (candidate->isDS() || !can_move_down) {
743          add_to_hazard_query(&hq, candidate.get());
744          ctx.mv.downwards_skip(cursor);
745          continue;
746       }
747 
748       MoveResult res = ctx.mv.downwards_move(cursor, false);
749       if (res == move_fail_ssa || res == move_fail_rar) {
750          add_to_hazard_query(&hq, candidate.get());
751          ctx.mv.downwards_skip(cursor);
752          continue;
753       } else if (res == move_fail_pressure) {
754          break;
755       }
756 
757       if (candidate_idx < ctx.last_SMEM_dep_idx)
758          ctx.last_SMEM_stall++;
759       k++;
760    }
761 
762    /* find the first instruction depending on current or find another MEM */
763    UpwardsCursor up_cursor = ctx.mv.upwards_init(idx + 1, false);
764 
765    bool found_dependency = false;
766    /* second, check if we have instructions after current to move up */
767    for (int candidate_idx = idx + 1; k < max_moves && candidate_idx < (int)idx + window_size;
768         candidate_idx++) {
769       assert(candidate_idx == up_cursor.source_idx);
770       assert(candidate_idx < (int)block->instructions.size());
771       aco_ptr<Instruction>& candidate = block->instructions[candidate_idx];
772 
773       if (candidate->opcode == aco_opcode::p_logical_end)
774          break;
775 
776       /* check if candidate depends on current */
777       bool is_dependency = !found_dependency && !ctx.mv.upwards_check_deps(up_cursor);
778       /* no need to steal from following VMEM instructions */
779       if (is_dependency && (candidate->isVMEM() || candidate->isFlatLike()))
780          break;
781 
782       if (found_dependency) {
783          HazardResult haz = perform_hazard_query(&hq, candidate.get(), true);
784          if (haz == hazard_fail_reorder_ds || haz == hazard_fail_spill ||
785              haz == hazard_fail_reorder_sendmsg || haz == hazard_fail_barrier ||
786              haz == hazard_fail_export)
787             is_dependency = true;
788          else if (haz != hazard_success)
789             break;
790       }
791 
792       if (is_dependency) {
793          if (!found_dependency) {
794             ctx.mv.upwards_update_insert_idx(up_cursor);
795             init_hazard_query(ctx, &hq);
796             found_dependency = true;
797          }
798       }
799 
800       if (is_dependency || !found_dependency) {
801          if (found_dependency)
802             add_to_hazard_query(&hq, candidate.get());
803          else
804             k++;
805          ctx.mv.upwards_skip(up_cursor);
806          continue;
807       }
808 
809       MoveResult res = ctx.mv.upwards_move(up_cursor);
810       if (res == move_fail_ssa || res == move_fail_rar) {
811          /* no need to steal from following VMEM instructions */
812          if (res == move_fail_ssa && (candidate->isVMEM() || candidate->isFlatLike()))
813             break;
814          add_to_hazard_query(&hq, candidate.get());
815          ctx.mv.upwards_skip(up_cursor);
816          continue;
817       } else if (res == move_fail_pressure) {
818          break;
819       }
820       k++;
821    }
822 
823    ctx.last_SMEM_dep_idx = found_dependency ? up_cursor.insert_idx : 0;
824    ctx.last_SMEM_stall = 10 - ctx.num_waves - k;
825 }
826 
827 void
828 schedule_VMEM(sched_ctx& ctx, Block* block, std::vector<RegisterDemand>& register_demand,
829               Instruction* current, int idx)
830 {
831    assert(idx != 0);
832    int window_size = VMEM_WINDOW_SIZE;
833    int max_moves = VMEM_MAX_MOVES;
834    int clause_max_grab_dist = VMEM_CLAUSE_MAX_GRAB_DIST;
835    bool only_clauses = false;
836    int16_t k = 0;
837 
838    /* first, check if we have instructions before current to move down */
839    hazard_query indep_hq;
840    hazard_query clause_hq;
841    init_hazard_query(ctx, &indep_hq);
842    init_hazard_query(ctx, &clause_hq);
843    add_to_hazard_query(&indep_hq, current);
844 
845    DownwardsCursor cursor = ctx.mv.downwards_init(idx, true, true);
846 
847    for (int candidate_idx = idx - 1; k < max_moves && candidate_idx > (int)idx - window_size;
848         candidate_idx--) {
849       assert(candidate_idx == cursor.source_idx);
850       assert(candidate_idx >= 0);
851       aco_ptr<Instruction>& candidate = block->instructions[candidate_idx];
852       bool is_vmem = candidate->isVMEM() || candidate->isFlatLike();
853 
854       /* break when encountering another VMEM instruction, logical_start or barriers */
855       if (candidate->opcode == aco_opcode::p_logical_start)
856          break;
857 
858       /* break if we'd make the previous SMEM instruction stall */
859       bool can_stall_prev_smem =
860          idx <= ctx.last_SMEM_dep_idx && candidate_idx < ctx.last_SMEM_dep_idx;
861       if (can_stall_prev_smem && ctx.last_SMEM_stall >= 0)
862          break;
863 
864       bool part_of_clause = false;
865       if (current->isVMEM() == candidate->isVMEM()) {
866          int grab_dist = cursor.insert_idx_clause - candidate_idx;
867          /* We can't easily tell how much this will decrease the def-to-use
868           * distances, so just use how far it will be moved as a heuristic. */
869          part_of_clause =
870             grab_dist < clause_max_grab_dist + k && should_form_clause(current, candidate.get());
871       }
872 
873       /* if current depends on candidate, add additional dependencies and continue */
874       bool can_move_down = !is_vmem || part_of_clause || candidate->definitions.empty();
875       if (only_clauses) {
876          /* In case of high register pressure, only try to form clauses,
877           * and only if the previous clause is not larger
878           * than the current one will be.
879           */
880          if (part_of_clause) {
881             int clause_size = cursor.insert_idx - cursor.insert_idx_clause;
882             int prev_clause_size = 1;
883             while (should_form_clause(current,
884                                       block->instructions[candidate_idx - prev_clause_size].get()))
885                prev_clause_size++;
886             if (prev_clause_size > clause_size + 1)
887                break;
888          } else {
889             can_move_down = false;
890          }
891       }
892       HazardResult haz =
893          perform_hazard_query(part_of_clause ? &clause_hq : &indep_hq, candidate.get(), false);
894       if (haz == hazard_fail_reorder_ds || haz == hazard_fail_spill ||
895           haz == hazard_fail_reorder_sendmsg || haz == hazard_fail_barrier ||
896           haz == hazard_fail_export)
897          can_move_down = false;
898       else if (haz != hazard_success)
899          break;
900 
901       if (!can_move_down) {
902          if (part_of_clause)
903             break;
904          add_to_hazard_query(&indep_hq, candidate.get());
905          add_to_hazard_query(&clause_hq, candidate.get());
906          ctx.mv.downwards_skip(cursor);
907          continue;
908       }
909 
910       Instruction* candidate_ptr = candidate.get();
911       MoveResult res = ctx.mv.downwards_move(cursor, part_of_clause);
912       if (res == move_fail_ssa || res == move_fail_rar) {
913          if (part_of_clause)
914             break;
915          add_to_hazard_query(&indep_hq, candidate.get());
916          add_to_hazard_query(&clause_hq, candidate.get());
917          ctx.mv.downwards_skip(cursor);
918          continue;
919       } else if (res == move_fail_pressure) {
920          only_clauses = true;
921          if (part_of_clause)
922             break;
923          add_to_hazard_query(&indep_hq, candidate.get());
924          add_to_hazard_query(&clause_hq, candidate.get());
925          ctx.mv.downwards_skip(cursor);
926          continue;
927       }
928       if (part_of_clause)
929          add_to_hazard_query(&indep_hq, candidate_ptr);
930       else
931          k++;
932       if (candidate_idx < ctx.last_SMEM_dep_idx)
933          ctx.last_SMEM_stall++;
934    }
935 
936    /* find the first instruction depending on current or find another VMEM */
937    UpwardsCursor up_cursor = ctx.mv.upwards_init(idx + 1, true);
938 
939    bool found_dependency = false;
940    /* second, check if we have instructions after current to move up */
941    for (int candidate_idx = idx + 1; k < max_moves && candidate_idx < (int)idx + window_size;
942         candidate_idx++) {
943       assert(candidate_idx == up_cursor.source_idx);
944       assert(candidate_idx < (int)block->instructions.size());
945       aco_ptr<Instruction>& candidate = block->instructions[candidate_idx];
946       bool is_vmem = candidate->isVMEM() || candidate->isFlatLike();
947 
948       if (candidate->opcode == aco_opcode::p_logical_end)
949          break;
950 
951       /* check if candidate depends on current */
952       bool is_dependency = false;
953       if (found_dependency) {
954          HazardResult haz = perform_hazard_query(&indep_hq, candidate.get(), true);
955          if (haz == hazard_fail_reorder_ds || haz == hazard_fail_spill ||
956              haz == hazard_fail_reorder_vmem_smem || haz == hazard_fail_reorder_sendmsg ||
957              haz == hazard_fail_barrier || haz == hazard_fail_export)
958             is_dependency = true;
959          else if (haz != hazard_success)
960             break;
961       }
962 
963       is_dependency |= !found_dependency && !ctx.mv.upwards_check_deps(up_cursor);
964       if (is_dependency) {
965          if (!found_dependency) {
966             ctx.mv.upwards_update_insert_idx(up_cursor);
967             init_hazard_query(ctx, &indep_hq);
968             found_dependency = true;
969          }
970       } else if (is_vmem) {
971          /* don't move up dependencies of other VMEM instructions */
972          for (const Definition& def : candidate->definitions) {
973             if (def.isTemp())
974                ctx.mv.depends_on[def.tempId()] = true;
975          }
976       }
977 
978       if (is_dependency || !found_dependency) {
979          if (found_dependency)
980             add_to_hazard_query(&indep_hq, candidate.get());
981          else
982             k++;
983          ctx.mv.upwards_skip(up_cursor);
984          continue;
985       }
986 
987       MoveResult res = ctx.mv.upwards_move(up_cursor);
988       if (res == move_fail_ssa || res == move_fail_rar) {
989          add_to_hazard_query(&indep_hq, candidate.get());
990          ctx.mv.upwards_skip(up_cursor);
991          continue;
992       } else if (res == move_fail_pressure) {
993          break;
994       }
995       k++;
996    }
997 }
998 
999 void
1000 schedule_position_export(sched_ctx& ctx, Block* block, std::vector<RegisterDemand>& register_demand,
1001                          Instruction* current, int idx)
1002 {
1003    assert(idx != 0);
1004    int window_size = POS_EXP_WINDOW_SIZE / ctx.schedule_pos_export_div;
1005    int max_moves = POS_EXP_MAX_MOVES / ctx.schedule_pos_export_div;
1006    int16_t k = 0;
1007 
1008    DownwardsCursor cursor = ctx.mv.downwards_init(idx, true, false);
1009 
1010    hazard_query hq;
1011    init_hazard_query(ctx, &hq);
1012    add_to_hazard_query(&hq, current);
1013 
1014    for (int candidate_idx = idx - 1; k < max_moves && candidate_idx > (int)idx - window_size;
1015         candidate_idx--) {
1016       assert(candidate_idx >= 0);
1017       aco_ptr<Instruction>& candidate = block->instructions[candidate_idx];
1018 
1019       if (candidate->opcode == aco_opcode::p_logical_start)
1020          break;
1021       if (candidate->isVMEM() || candidate->isSMEM() || candidate->isFlatLike())
1022          break;
1023 
1024       HazardResult haz = perform_hazard_query(&hq, candidate.get(), false);
1025       if (haz == hazard_fail_exec || haz == hazard_fail_unreorderable)
1026          break;
1027 
1028       if (haz != hazard_success) {
1029          add_to_hazard_query(&hq, candidate.get());
1030          ctx.mv.downwards_skip(cursor);
1031          continue;
1032       }
1033 
1034       MoveResult res = ctx.mv.downwards_move(cursor, false);
1035       if (res == move_fail_ssa || res == move_fail_rar) {
1036          add_to_hazard_query(&hq, candidate.get());
1037          ctx.mv.downwards_skip(cursor);
1038          continue;
1039       } else if (res == move_fail_pressure) {
1040          break;
1041       }
1042       k++;
1043    }
1044 }
1045 
1046 unsigned
1047 schedule_VMEM_store(sched_ctx& ctx, Block* block, std::vector<RegisterDemand>& register_demand,
1048                     Instruction* current, int idx)
1049 {
1050    hazard_query hq;
1051    init_hazard_query(ctx, &hq);
1052 
1053    DownwardsCursor cursor = ctx.mv.downwards_init(idx, true, true);
1054    unsigned skip = 0;
1055 
1056    for (int i = 0; i < VMEM_CLAUSE_MAX_GRAB_DIST; i++) {
1057       aco_ptr<Instruction>& candidate = block->instructions[cursor.source_idx];
1058       if (candidate->opcode == aco_opcode::p_logical_start)
1059          break;
1060 
1061       if (!should_form_clause(current, candidate.get())) {
1062          add_to_hazard_query(&hq, candidate.get());
1063          ctx.mv.downwards_skip(cursor);
1064          continue;
1065       }
1066 
1067       if (perform_hazard_query(&hq, candidate.get(), false) != hazard_success ||
1068           ctx.mv.downwards_move(cursor, true) != move_success)
1069          break;
1070 
1071       skip++;
1072    }
1073 
1074    return skip;
1075 }
1076 
1077 void
1078 schedule_block(sched_ctx& ctx, Program* program, Block* block, live& live_vars)
1079 {
1080    ctx.last_SMEM_dep_idx = 0;
1081    ctx.last_SMEM_stall = INT16_MIN;
1082    ctx.mv.block = block;
1083    ctx.mv.register_demand = live_vars.register_demand[block->index].data();
1084 
1085    /* go through all instructions and find memory loads */
1086    unsigned num_stores = 0;
1087    for (unsigned idx = 0; idx < block->instructions.size(); idx++) {
1088       Instruction* current = block->instructions[idx].get();
1089 
1090       if (current->opcode == aco_opcode::p_logical_end)
1091          break;
1092 
1093       if (block->kind & block_kind_export_end && current->isEXP() && ctx.schedule_pos_exports) {
1094          unsigned target = current->exp().dest;
1095          if (target >= V_008DFC_SQ_EXP_POS && target < V_008DFC_SQ_EXP_PRIM) {
1096             ctx.mv.current = current;
1097             schedule_position_export(ctx, block, live_vars.register_demand[block->index], current,
1098                                      idx);
1099          }
1100       }
1101 
1102       if (current->definitions.empty()) {
1103          num_stores += current->isVMEM() || current->isFlatLike() ? 1 : 0;
1104          continue;
1105       }
1106 
1107       if (current->isVMEM() || current->isFlatLike()) {
1108          ctx.mv.current = current;
1109          schedule_VMEM(ctx, block, live_vars.register_demand[block->index], current, idx);
1110       }
1111 
1112       if (current->isSMEM()) {
1113          ctx.mv.current = current;
1114          schedule_SMEM(ctx, block, live_vars.register_demand[block->index], current, idx);
1115       }
1116    }
1117 
1118    /* GFX11 benefits from creating VMEM store clauses. */
1119    if (num_stores > 1 && program->gfx_level >= GFX11) {
1120       for (int idx = block->instructions.size() - 1; idx >= 0; idx--) {
1121          Instruction* current = block->instructions[idx].get();
1122          if (!current->definitions.empty() || !(current->isVMEM() || current->isFlatLike()))
1123             continue;
1124 
1125          ctx.mv.current = current;
1126          idx -=
1127             schedule_VMEM_store(ctx, block, live_vars.register_demand[block->index], current, idx);
1128       }
1129    }
1130 
1131    /* resummarize the block's register demand */
1132    block->register_demand = RegisterDemand();
1133    for (unsigned idx = 0; idx < block->instructions.size(); idx++) {
1134       block->register_demand.update(live_vars.register_demand[block->index][idx]);
1135    }
1136 }
1137 
1138 void
1139 schedule_program(Program* program, live& live_vars)
1140 {
1141    /* don't use program->max_reg_demand because that is affected by max_waves_per_simd */
1142    RegisterDemand demand;
1143    for (Block& block : program->blocks)
1144       demand.update(block.register_demand);
1145    demand.vgpr += program->config->num_shared_vgprs / 2;
1146 
1147    sched_ctx ctx;
1148    ctx.gfx_level = program->gfx_level;
1149    ctx.mv.depends_on.resize(program->peekAllocationId());
1150    ctx.mv.RAR_dependencies.resize(program->peekAllocationId());
1151    ctx.mv.RAR_dependencies_clause.resize(program->peekAllocationId());
1152    /* Allowing the scheduler to reduce the number of waves to as low as 5
1153     * improves performance of Thrones of Britannia significantly and doesn't
1154     * seem to hurt anything else. */
1155    // TODO: account for possible uneven num_waves on GFX10+
1156    unsigned wave_fac = program->dev.physical_vgprs / 256;
1157    if (program->num_waves <= 5 * wave_fac)
1158       ctx.num_waves = program->num_waves;
1159    else if (demand.vgpr >= 29)
1160       ctx.num_waves = 5 * wave_fac;
1161    else if (demand.vgpr >= 25)
1162       ctx.num_waves = 6 * wave_fac;
1163    else
1164       ctx.num_waves = 7 * wave_fac;
1165    ctx.num_waves = std::max<uint16_t>(ctx.num_waves, program->min_waves);
1166    ctx.num_waves = std::min<uint16_t>(ctx.num_waves, program->num_waves);
1167    ctx.num_waves = max_suitable_waves(program, ctx.num_waves);
1168 
1169    /* VMEM_MAX_MOVES and such assume pre-GFX10 wave count */
1170    ctx.num_waves = std::max<uint16_t>(ctx.num_waves / wave_fac, 1);
1171 
1172    assert(ctx.num_waves > 0);
1173    ctx.mv.max_registers = {int16_t(get_addr_vgpr_from_waves(program, ctx.num_waves * wave_fac) - 2),
1174                            int16_t(get_addr_sgpr_from_waves(program, ctx.num_waves * wave_fac))};
1175 
1176    /* NGG culling shaders are very sensitive to position export scheduling.
1177     * Schedule less aggressively when early primitive export is used, and
1178     * keep the position export at the very bottom when late primitive export is used.
1179     */
1180    if (program->info.has_ngg_culling && program->stage.num_sw_stages() == 1) {
1181       if (!program->info.has_ngg_early_prim_export)
1182          ctx.schedule_pos_exports = false;
1183       else
1184          ctx.schedule_pos_export_div = 4;
1185    }
1186 
1187    for (Block& block : program->blocks)
1188       schedule_block(ctx, program, &block, live_vars);
1189 
1190    /* update max_reg_demand and num_waves */
1191    RegisterDemand new_demand;
1192    for (Block& block : program->blocks) {
1193       new_demand.update(block.register_demand);
1194    }
1195    update_vgpr_sgpr_demand(program, new_demand);
1196 
1197 /* if enabled, this code asserts that register_demand is updated correctly */
1198 #if 0
1199    int prev_num_waves = program->num_waves;
1200    const RegisterDemand prev_max_demand = program->max_reg_demand;
1201 
1202    std::vector<RegisterDemand> demands(program->blocks.size());
1203    for (unsigned j = 0; j < program->blocks.size(); j++) {
1204       demands[j] = program->blocks[j].register_demand;
1205    }
1206 
1207    live live_vars2 = aco::live_var_analysis(program);
1208 
1209    for (unsigned j = 0; j < program->blocks.size(); j++) {
1210       Block &b = program->blocks[j];
1211       for (unsigned i = 0; i < b.instructions.size(); i++)
1212          assert(live_vars.register_demand[b.index][i] == live_vars2.register_demand[b.index][i]);
1213       assert(b.register_demand == demands[j]);
1214    }
1215 
1216    assert(program->max_reg_demand == prev_max_demand);
1217    assert(program->num_waves == prev_num_waves);
1218 #endif
1219 }
1220 
1221 } // namespace aco
1222