/*
 * Copyright © 2018 Valve Corporation
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
 * IN THE SOFTWARE.
 *
 */

#include "aco_ir.h"
#include "aco_builder.h"
#include <unordered_set>
#include <algorithm>

#include "amdgfxregs.h"

#define SMEM_WINDOW_SIZE (350 - ctx.num_waves * 35)
#define VMEM_WINDOW_SIZE (1024 - ctx.num_waves * 64)
#define POS_EXP_WINDOW_SIZE 512
#define SMEM_MAX_MOVES (64 - ctx.num_waves * 4)
#define VMEM_MAX_MOVES (128 - ctx.num_waves * 8)
/* creating clauses decreases def-use distances, so make it less aggressive the lower num_waves is */
#define VMEM_CLAUSE_MAX_GRAB_DIST ((ctx.num_waves - 1) * 8)
#define POS_EXP_MAX_MOVES 512
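/* For illustration only (these are just the macros above evaluated, not extra
 * tuning constants): at ctx.num_waves == 7, the SMEM window spans
 * 350 - 7*35 = 105 instructions with at most 64 - 7*4 = 36 moves, and the VMEM
 * window spans 1024 - 7*64 = 576 instructions with at most 128 - 7*8 = 72 moves
 * and a clause grab distance of (7 - 1) * 8 = 48. */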

namespace aco {

enum MoveResult {
   move_success,
   move_fail_ssa,
   move_fail_rar,
   move_fail_pressure,
};

struct MoveState {
   RegisterDemand max_registers;

   Block *block;
   Instruction *current;
   RegisterDemand *register_demand;
   bool improved_rar;

   std::vector<bool> depends_on;
   /* Two are needed because, for downwards VMEM scheduling, one needs to
    * exclude the instructions in the clause, since new instructions in the
    * clause are not moved past any other instructions in the clause. */
   std::vector<bool> RAR_dependencies;
   std::vector<bool> RAR_dependencies_clause;

   int source_idx;
   int insert_idx, insert_idx_clause;
   RegisterDemand total_demand, total_demand_clause;

   /* for moving instructions before the current instruction to after it */
   void downwards_init(int current_idx, bool improved_rar, bool may_form_clauses);
   MoveResult downwards_move(bool clause);
   void downwards_skip();

   /* for moving instructions after the first use of the current instruction upwards */
   void upwards_init(int source_idx, bool improved_rar);
   bool upwards_check_deps();
   void upwards_set_insert_idx(int before);
   MoveResult upwards_move();
   void upwards_skip();

private:
   void downwards_advance_helper();
};
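
/* Rough usage sketch of MoveState, mirroring how schedule_SMEM()/schedule_VMEM()
 * below drive it (window bounds and hazard queries omitted for brevity):
 *
 *    mv.downwards_init(idx, improved_rar, may_form_clauses);
 *    for (each candidate above the memory instruction, walking upwards)
 *       if (candidate must stay put or downwards_move() fails with ssa/rar)
 *          mv.downwards_skip();
 *
 *    mv.upwards_init(idx + 1, improved_rar);
 *    for (each candidate below the memory instruction, walking downwards) {
 *       if (first dependency found)
 *          mv.upwards_set_insert_idx(candidate_idx);
 *       if (no insert point yet or upwards_move() fails with ssa/rar)
 *          mv.upwards_skip();
 *    }
 */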

struct sched_ctx {
   int16_t num_waves;
   int16_t last_SMEM_stall;
   int last_SMEM_dep_idx;
   MoveState mv;
};

/* This scheduler is a simple bottom-up pass based on ideas from
 * "A Novel Lightweight Instruction Scheduling Algorithm for Just-In-Time Compiler"
 * by Xiaohua Shi and Peng Guo.
 * The basic approach is to iterate over all instructions. When a memory instruction
 * is encountered, the pass tries to move independent instructions from above and below
 * into the gap between the memory instruction and its first user.
 * The novelty is that this scheduler takes the current register pressure into account:
 * instructions are only moved if the register pressure stays within a certain bound.
 */
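
/* Hypothetical illustration (the opcodes below are made up for this comment and
 * not taken from real output): given
 *
 *    v_add  %a, ...          ; independent of the load
 *    %l = buffer_load ...    ; the memory instruction being scheduled
 *    v_mul  %b, ...          ; independent of the load
 *    v_sub  %c, %l, ...      ; first user of the load
 *
 * the pass tries to end up with
 *
 *    %l = buffer_load ...
 *    v_add  %a, ...
 *    v_mul  %b, ...
 *    v_sub  %c, %l, ...
 *
 * so that more independent work sits between the load and its first use. */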

template <typename T>
void move_element(T begin_it, size_t idx, size_t before) {
    if (idx < before) {
        auto begin = std::next(begin_it, idx);
        auto end = std::next(begin_it, before);
        std::rotate(begin, begin + 1, end);
    } else if (idx > before) {
        auto begin = std::next(begin_it, before);
        auto end = std::next(begin_it, idx + 1);
        std::rotate(begin, end - 1, end);
    }
}
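
/* For example (purely illustrative values): with elements {a, b, c, d},
 * move_element(begin, 0, 3) rotates a down to just before index 3, giving
 * {b, c, a, d}, while move_element(begin, 3, 1) rotates d up to index 1,
 * giving {a, d, b, c}; equal indices leave the range untouched. */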

void MoveState::downwards_advance_helper()
{
   source_idx--;
   total_demand.update(register_demand[source_idx]);
}

void MoveState::downwards_init(int current_idx, bool improved_rar_, bool may_form_clauses)
{
   improved_rar = improved_rar_;
   source_idx = current_idx;

   insert_idx = current_idx + 1;
   insert_idx_clause = current_idx;

   total_demand = total_demand_clause = register_demand[current_idx];

   std::fill(depends_on.begin(), depends_on.end(), false);
   if (improved_rar) {
      std::fill(RAR_dependencies.begin(), RAR_dependencies.end(), false);
      if (may_form_clauses)
         std::fill(RAR_dependencies_clause.begin(), RAR_dependencies_clause.end(), false);
   }

   for (const Operand& op : current->operands) {
      if (op.isTemp()) {
         depends_on[op.tempId()] = true;
         if (improved_rar && op.isFirstKill())
            RAR_dependencies[op.tempId()] = true;
      }
   }

   /* update total_demand/source_idx */
   downwards_advance_helper();
}
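
/* After downwards_init(idx, ...), source_idx points at the instruction directly
 * above the current one, insert_idx at the slot directly below it, and
 * insert_idx_clause at the current instruction itself. The loops in
 * schedule_SMEM()/schedule_VMEM() then either move the candidate at source_idx
 * down past the current instruction (or, when forming a clause, to directly
 * above it) via downwards_move(), or leave it in place via downwards_skip(),
 * before advancing source_idx further upwards. */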

MoveResult MoveState::downwards_move(bool clause)
{
   aco_ptr<Instruction>& instr = block->instructions[source_idx];

   for (const Definition& def : instr->definitions)
      if (def.isTemp() && depends_on[def.tempId()])
         return move_fail_ssa;

   /* check if one of the candidate's operands is killed by a depending instruction */
   std::vector<bool>& RAR_deps = improved_rar ? (clause ? RAR_dependencies_clause : RAR_dependencies) : depends_on;
   for (const Operand& op : instr->operands) {
      if (op.isTemp() && RAR_deps[op.tempId()]) {
         // FIXME: account for difference in register pressure
         return move_fail_rar;
      }
   }

   if (clause) {
      for (const Operand& op : instr->operands) {
         if (op.isTemp()) {
            depends_on[op.tempId()] = true;
            if (op.isFirstKill())
               RAR_dependencies[op.tempId()] = true;
         }
      }
   }

   int dest_insert_idx = clause ? insert_idx_clause : insert_idx;
   RegisterDemand register_pressure = clause ? total_demand_clause : total_demand;

   const RegisterDemand candidate_diff = get_live_changes(instr);
   const RegisterDemand temp = get_temp_registers(instr);
   if (RegisterDemand(register_pressure - candidate_diff).exceeds(max_registers))
      return move_fail_pressure;
   const RegisterDemand temp2 = get_temp_registers(block->instructions[dest_insert_idx - 1]);
   const RegisterDemand new_demand = register_demand[dest_insert_idx - 1] - temp2 + temp;
   if (new_demand.exceeds(max_registers))
      return move_fail_pressure;

   /* move the candidate below the memory load */
   move_element(block->instructions.begin(), source_idx, dest_insert_idx);

   /* update register pressure */
   move_element(register_demand, source_idx, dest_insert_idx);
   for (int i = source_idx; i < dest_insert_idx - 1; i++)
      register_demand[i] -= candidate_diff;
   register_demand[dest_insert_idx - 1] = new_demand;
   total_demand_clause -= candidate_diff;
   insert_idx_clause--;
   if (!clause) {
      total_demand -= candidate_diff;
      insert_idx--;
   }

   downwards_advance_helper();
   return move_success;
}

void MoveState::downwards_skip()
{
   aco_ptr<Instruction>& instr = block->instructions[source_idx];

   for (const Operand& op : instr->operands) {
      if (op.isTemp()) {
         depends_on[op.tempId()] = true;
         if (improved_rar && op.isFirstKill()) {
            RAR_dependencies[op.tempId()] = true;
            RAR_dependencies_clause[op.tempId()] = true;
         }
      }
   }
   total_demand_clause.update(register_demand[source_idx]);

   downwards_advance_helper();
}

void MoveState::upwards_init(int source_idx_, bool improved_rar_)
{
   source_idx = source_idx_;
   improved_rar = improved_rar_;

   insert_idx = -1;

   std::fill(depends_on.begin(), depends_on.end(), false);
   std::fill(RAR_dependencies.begin(), RAR_dependencies.end(), false);

   for (const Definition& def : current->definitions) {
      if (def.isTemp())
         depends_on[def.tempId()] = true;
   }
}

bool MoveState::upwards_check_deps()
{
   aco_ptr<Instruction>& instr = block->instructions[source_idx];
   for (const Operand& op : instr->operands) {
      if (op.isTemp() && depends_on[op.tempId()])
         return false;
   }
   return true;
}

void MoveState::upwards_set_insert_idx(int before)
{
   insert_idx = before;
   total_demand = register_demand[before - 1];
}

MoveResult MoveState::upwards_move()
{
   assert(insert_idx >= 0);

   aco_ptr<Instruction>& instr = block->instructions[source_idx];
   for (const Operand& op : instr->operands) {
      if (op.isTemp() && depends_on[op.tempId()])
         return move_fail_ssa;
   }

   /* check if candidate uses/kills an operand which is used by a dependency */
   for (const Operand& op : instr->operands) {
      if (op.isTemp() && (!improved_rar || op.isFirstKill()) && RAR_dependencies[op.tempId()])
         return move_fail_rar;
   }

   /* check if register pressure is low enough: the diff is negative if register pressure is decreased */
   const RegisterDemand candidate_diff = get_live_changes(instr);
   const RegisterDemand temp = get_temp_registers(instr);
   if (RegisterDemand(total_demand + candidate_diff).exceeds(max_registers))
      return move_fail_pressure;
   const RegisterDemand temp2 = get_temp_registers(block->instructions[insert_idx - 1]);
   const RegisterDemand new_demand = register_demand[insert_idx - 1] - temp2 + candidate_diff + temp;
   if (new_demand.exceeds(max_registers))
      return move_fail_pressure;

   /* move the candidate above the insert_idx */
   move_element(block->instructions.begin(), source_idx, insert_idx);

   /* update register pressure */
   move_element(register_demand, source_idx, insert_idx);
   for (int i = insert_idx + 1; i <= source_idx; i++)
      register_demand[i] += candidate_diff;
   register_demand[insert_idx] = new_demand;
   total_demand += candidate_diff;

   insert_idx++;

   total_demand.update(register_demand[source_idx]);
   source_idx++;

   return move_success;
}

void MoveState::upwards_skip()
{
   if (insert_idx >= 0) {
      aco_ptr<Instruction>& instr = block->instructions[source_idx];
      for (const Definition& def : instr->definitions) {
         if (def.isTemp())
            depends_on[def.tempId()] = true;
      }
      for (const Operand& op : instr->operands) {
         if (op.isTemp())
            RAR_dependencies[op.tempId()] = true;
      }
      total_demand.update(register_demand[source_idx]);
   }

   source_idx++;
}

bool is_gs_or_done_sendmsg(const Instruction *instr)
{
   if (instr->opcode == aco_opcode::s_sendmsg) {
      uint16_t imm = static_cast<const SOPP_instruction*>(instr)->imm;
      return (imm & sendmsg_id_mask) == _sendmsg_gs ||
             (imm & sendmsg_id_mask) == _sendmsg_gs_done;
   }
   return false;
}

bool is_done_sendmsg(const Instruction *instr)
{
   if (instr->opcode == aco_opcode::s_sendmsg) {
      uint16_t imm = static_cast<const SOPP_instruction*>(instr)->imm;
      return (imm & sendmsg_id_mask) == _sendmsg_gs_done;
   }
   return false;
}

memory_sync_info get_sync_info_with_hack(const Instruction* instr)
{
   memory_sync_info sync = get_sync_info(instr);
   if (instr->format == Format::SMEM && !instr->operands.empty() && instr->operands[0].bytes() == 16) {
      // FIXME: currently, it doesn't seem beneficial to omit this due to how our scheduler works
      sync.storage = (storage_class)(sync.storage | storage_buffer);
      sync.semantics = (memory_semantics)(sync.semantics | semantic_private);
   }
   return sync;
}

struct memory_event_set {
   bool has_control_barrier;

   unsigned bar_acquire;
   unsigned bar_release;
   unsigned bar_classes;

   unsigned access_acquire;
   unsigned access_release;
   unsigned access_relaxed;
   unsigned access_atomic;
};

struct hazard_query {
   bool contains_spill;
   bool contains_sendmsg;
   memory_event_set mem_events;
   unsigned aliasing_storage; /* storage classes which are accessed (non-SMEM) */
   unsigned aliasing_storage_smem; /* storage classes which are accessed (SMEM) */
};

void init_hazard_query(hazard_query *query) {
   query->contains_spill = false;
   query->contains_sendmsg = false;
   memset(&query->mem_events, 0, sizeof(query->mem_events));
   query->aliasing_storage = 0;
   query->aliasing_storage_smem = 0;
}

void add_memory_event(memory_event_set *set, Instruction *instr, memory_sync_info *sync)
{
   set->has_control_barrier |= is_done_sendmsg(instr);
   if (instr->opcode == aco_opcode::p_barrier) {
      Pseudo_barrier_instruction *bar = static_cast<Pseudo_barrier_instruction*>(instr);
      if (bar->sync.semantics & semantic_acquire)
         set->bar_acquire |= bar->sync.storage;
      if (bar->sync.semantics & semantic_release)
         set->bar_release |= bar->sync.storage;
      set->bar_classes |= bar->sync.storage;

      set->has_control_barrier |= bar->exec_scope > scope_invocation;
   }

   if (!sync->storage)
      return;

   if (sync->semantics & semantic_acquire)
      set->access_acquire |= sync->storage;
   if (sync->semantics & semantic_release)
      set->access_release |= sync->storage;

   if (!(sync->semantics & semantic_private)) {
      if (sync->semantics & semantic_atomic)
         set->access_atomic |= sync->storage;
      else
         set->access_relaxed |= sync->storage;
   }
}

void add_to_hazard_query(hazard_query *query, Instruction *instr)
{
   if (instr->opcode == aco_opcode::p_spill || instr->opcode == aco_opcode::p_reload)
      query->contains_spill = true;
   query->contains_sendmsg |= instr->opcode == aco_opcode::s_sendmsg;

   memory_sync_info sync = get_sync_info_with_hack(instr);

   add_memory_event(&query->mem_events, instr, &sync);

   if (!(sync.semantics & semantic_can_reorder)) {
      unsigned storage = sync.storage;
      /* images and buffer/global memory can alias */ //TODO: more precisely, buffer images and buffer/global memory can alias
      if (storage & (storage_buffer | storage_image))
         storage |= storage_buffer | storage_image;
      if (instr->format == Format::SMEM)
         query->aliasing_storage_smem |= storage;
      else
         query->aliasing_storage |= storage;
   }
}

enum HazardResult {
   hazard_success,
   hazard_fail_reorder_vmem_smem,
   hazard_fail_reorder_ds,
   hazard_fail_reorder_sendmsg,
   hazard_fail_spill,
   hazard_fail_export,
   hazard_fail_barrier,
   /* Must stop at these failures. The hazard query code doesn't consider them
    * when instructions are added to the query. */
   hazard_fail_exec,
   hazard_fail_unreorderable,
};

HazardResult perform_hazard_query(hazard_query *query, Instruction *instr, bool upwards)
{
   if (instr->opcode == aco_opcode::p_exit_early_if)
      return hazard_fail_exec;
   for (const Definition& def : instr->definitions) {
      if (def.isFixed() && def.physReg() == exec)
         return hazard_fail_exec;
   }

   /* don't move exports so that they stay closer together */
   if (instr->format == Format::EXP)
      return hazard_fail_export;

   /* don't move non-reorderable instructions */
   if (instr->opcode == aco_opcode::s_memtime ||
       instr->opcode == aco_opcode::s_memrealtime ||
       instr->opcode == aco_opcode::s_setprio ||
       instr->opcode == aco_opcode::s_getreg_b32)
      return hazard_fail_unreorderable;

   memory_event_set instr_set;
   memset(&instr_set, 0, sizeof(instr_set));
   memory_sync_info sync = get_sync_info_with_hack(instr);
   add_memory_event(&instr_set, instr, &sync);

   memory_event_set *first = &instr_set;
   memory_event_set *second = &query->mem_events;
   if (upwards)
      std::swap(first, second);

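   /* With the swap above, "first" should always refer to the event set that
    * comes earlier in program order and "second" to the one that comes later,
    * so the checks below can be written once for both move directions. */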
   /* everything after a barrier(acquire) happens after the atomics/control_barriers before it,
    * everything after a load(acquire) happens after the load
    */
   if ((first->has_control_barrier || first->access_atomic) && second->bar_acquire)
      return hazard_fail_barrier;
   if (((first->access_acquire || first->bar_acquire) && second->bar_classes) ||
       ((first->access_acquire | first->bar_acquire) & (second->access_relaxed | second->access_atomic)))
      return hazard_fail_barrier;

   /* everything before a barrier(release) happens before the atomics/control_barriers after it,
    * everything before a store(release) happens before the store
    */
   if (first->bar_release && (second->has_control_barrier || second->access_atomic))
      return hazard_fail_barrier;
   if ((first->bar_classes && (second->bar_release || second->access_release)) ||
       ((first->access_relaxed | first->access_atomic) & (second->bar_release | second->access_release)))
      return hazard_fail_barrier;

   /* don't move memory barriers around other memory barriers */
   if (first->bar_classes && second->bar_classes)
      return hazard_fail_barrier;

   /* Don't move memory accesses to before control barriers. I don't think
    * this is necessary for the Vulkan memory model, but it might be for GLSL450. */
   unsigned control_classes = storage_buffer | storage_atomic_counter | storage_image | storage_shared;
   if (first->has_control_barrier && ((second->access_atomic | second->access_relaxed) & control_classes))
      return hazard_fail_barrier;

   /* don't move memory loads/stores past potentially aliasing loads/stores */
   unsigned aliasing_storage = instr->format == Format::SMEM ?
                               query->aliasing_storage_smem :
                               query->aliasing_storage;
   if ((sync.storage & aliasing_storage) && !(sync.semantics & semantic_can_reorder)) {
      unsigned intersect = sync.storage & aliasing_storage;
      if (intersect & storage_shared)
         return hazard_fail_reorder_ds;
      return hazard_fail_reorder_vmem_smem;
   }

   if ((instr->opcode == aco_opcode::p_spill || instr->opcode == aco_opcode::p_reload) &&
       query->contains_spill)
      return hazard_fail_spill;

   if (instr->opcode == aco_opcode::s_sendmsg && query->contains_sendmsg)
      return hazard_fail_reorder_sendmsg;

   return hazard_success;
}

void schedule_SMEM(sched_ctx& ctx, Block* block,
                   std::vector<RegisterDemand>& register_demand,
                   Instruction* current, int idx)
{
   assert(idx != 0);
   int window_size = SMEM_WINDOW_SIZE;
   int max_moves = SMEM_MAX_MOVES;
   int16_t k = 0;

   /* don't move s_memtime/s_memrealtime */
   if (current->opcode == aco_opcode::s_memtime || current->opcode == aco_opcode::s_memrealtime)
      return;

   /* first, check if we have instructions before current to move down */
   hazard_query hq;
   init_hazard_query(&hq);
   add_to_hazard_query(&hq, current);

   ctx.mv.downwards_init(idx, false, false);

   for (int candidate_idx = idx - 1; k < max_moves && candidate_idx > (int) idx - window_size; candidate_idx--) {
      assert(candidate_idx >= 0);
      assert(candidate_idx == ctx.mv.source_idx);
      aco_ptr<Instruction>& candidate = block->instructions[candidate_idx];

      /* break if we'd make the previous SMEM instruction stall */
      bool can_stall_prev_smem = idx <= ctx.last_SMEM_dep_idx && candidate_idx < ctx.last_SMEM_dep_idx;
      if (can_stall_prev_smem && ctx.last_SMEM_stall >= 0)
         break;

      /* break when encountering another MEM instruction, logical_start or barriers */
      if (candidate->opcode == aco_opcode::p_logical_start)
         break;
      if (candidate->isVMEM())
         break;

      bool can_move_down = true;

      HazardResult haz = perform_hazard_query(&hq, candidate.get(), false);
      if (haz == hazard_fail_reorder_ds || haz == hazard_fail_spill || haz == hazard_fail_reorder_sendmsg || haz == hazard_fail_barrier || haz == hazard_fail_export)
         can_move_down = false;
      else if (haz != hazard_success)
         break;

      /* don't use LDS/GDS instructions to hide latency since it can
       * significantly worsen LDS scheduling */
      if (candidate->format == Format::DS || !can_move_down) {
         add_to_hazard_query(&hq, candidate.get());
         ctx.mv.downwards_skip();
         continue;
      }

      MoveResult res = ctx.mv.downwards_move(false);
      if (res == move_fail_ssa || res == move_fail_rar) {
         add_to_hazard_query(&hq, candidate.get());
         ctx.mv.downwards_skip();
         continue;
      } else if (res == move_fail_pressure) {
         break;
      }

      if (candidate_idx < ctx.last_SMEM_dep_idx)
         ctx.last_SMEM_stall++;
      k++;
   }

   /* find the first instruction depending on current or find another MEM */
   ctx.mv.upwards_init(idx + 1, false);

   bool found_dependency = false;
   /* second, check if we have instructions after current to move up */
   for (int candidate_idx = idx + 1; k < max_moves && candidate_idx < (int) idx + window_size; candidate_idx++) {
      assert(candidate_idx == ctx.mv.source_idx);
      assert(candidate_idx < (int) block->instructions.size());
      aco_ptr<Instruction>& candidate = block->instructions[candidate_idx];

      if (candidate->opcode == aco_opcode::p_logical_end)
         break;

      /* check if candidate depends on current */
      bool is_dependency = !found_dependency && !ctx.mv.upwards_check_deps();
      /* no need to steal from following VMEM instructions */
      if (is_dependency && candidate->isVMEM())
         break;

      if (found_dependency) {
         HazardResult haz = perform_hazard_query(&hq, candidate.get(), true);
         if (haz == hazard_fail_reorder_ds || haz == hazard_fail_spill ||
             haz == hazard_fail_reorder_sendmsg || haz == hazard_fail_barrier ||
             haz == hazard_fail_export)
            is_dependency = true;
         else if (haz != hazard_success)
            break;
      }

      if (is_dependency) {
         if (!found_dependency) {
            ctx.mv.upwards_set_insert_idx(candidate_idx);
            init_hazard_query(&hq);
            found_dependency = true;
         }
      }

      if (is_dependency || !found_dependency) {
         if (found_dependency)
            add_to_hazard_query(&hq, candidate.get());
         else
            k++;
         ctx.mv.upwards_skip();
         continue;
      }

      MoveResult res = ctx.mv.upwards_move();
      if (res == move_fail_ssa || res == move_fail_rar) {
         /* no need to steal from following VMEM instructions */
         if (res == move_fail_ssa && candidate->isVMEM())
            break;
         add_to_hazard_query(&hq, candidate.get());
         ctx.mv.upwards_skip();
         continue;
      } else if (res == move_fail_pressure) {
         break;
      }
      k++;
   }

   ctx.last_SMEM_dep_idx = found_dependency ? ctx.mv.insert_idx : 0;
   ctx.last_SMEM_stall = 10 - ctx.num_waves - k;
}

void schedule_VMEM(sched_ctx& ctx, Block* block,
                   std::vector<RegisterDemand>& register_demand,
                   Instruction* current, int idx)
{
   assert(idx != 0);
   int window_size = VMEM_WINDOW_SIZE;
   int max_moves = VMEM_MAX_MOVES;
   int clause_max_grab_dist = VMEM_CLAUSE_MAX_GRAB_DIST;
   int16_t k = 0;

   /* first, check if we have instructions before current to move down */
   hazard_query indep_hq;
   hazard_query clause_hq;
   init_hazard_query(&indep_hq);
   init_hazard_query(&clause_hq);
   add_to_hazard_query(&indep_hq, current);

   ctx.mv.downwards_init(idx, true, true);

   for (int candidate_idx = idx - 1; k < max_moves && candidate_idx > (int) idx - window_size; candidate_idx--) {
      assert(candidate_idx == ctx.mv.source_idx);
      assert(candidate_idx >= 0);
      aco_ptr<Instruction>& candidate = block->instructions[candidate_idx];
      bool is_vmem = candidate->isVMEM() || candidate->isFlatOrGlobal();

      /* break when encountering another VMEM instruction, logical_start or barriers */
      if (candidate->opcode == aco_opcode::p_logical_start)
         break;

      /* break if we'd make the previous SMEM instruction stall */
      bool can_stall_prev_smem = idx <= ctx.last_SMEM_dep_idx && candidate_idx < ctx.last_SMEM_dep_idx;
      if (can_stall_prev_smem && ctx.last_SMEM_stall >= 0)
         break;

      bool part_of_clause = false;
      if (current->isVMEM() == candidate->isVMEM()) {
         bool same_resource = true;
         if (current->isVMEM())
            same_resource = candidate->operands[0].tempId() == current->operands[0].tempId();
         int grab_dist = ctx.mv.insert_idx_clause - candidate_idx;
         /* We can't easily tell how much this will decrease the def-to-use
          * distances, so just use how far it will be moved as a heuristic. */
         part_of_clause = same_resource && grab_dist < clause_max_grab_dist;
      }

      /* if current depends on candidate, add additional dependencies and continue */
      bool can_move_down = !is_vmem || part_of_clause;

      HazardResult haz = perform_hazard_query(part_of_clause ? &clause_hq : &indep_hq, candidate.get(), false);
      if (haz == hazard_fail_reorder_ds || haz == hazard_fail_spill ||
          haz == hazard_fail_reorder_sendmsg || haz == hazard_fail_barrier ||
          haz == hazard_fail_export)
         can_move_down = false;
      else if (haz != hazard_success)
         break;

      if (!can_move_down) {
         add_to_hazard_query(&indep_hq, candidate.get());
         add_to_hazard_query(&clause_hq, candidate.get());
         ctx.mv.downwards_skip();
         continue;
      }

      Instruction *candidate_ptr = candidate.get();
      MoveResult res = ctx.mv.downwards_move(part_of_clause);
      if (res == move_fail_ssa || res == move_fail_rar) {
         add_to_hazard_query(&indep_hq, candidate.get());
         add_to_hazard_query(&clause_hq, candidate.get());
         ctx.mv.downwards_skip();
         continue;
      } else if (res == move_fail_pressure) {
         break;
      }
      if (part_of_clause)
         add_to_hazard_query(&indep_hq, candidate_ptr);
      k++;
      if (candidate_idx < ctx.last_SMEM_dep_idx)
         ctx.last_SMEM_stall++;
   }

   /* find the first instruction depending on current or find another VMEM */
   ctx.mv.upwards_init(idx + 1, true);

   bool found_dependency = false;
   /* second, check if we have instructions after current to move up */
   for (int candidate_idx = idx + 1; k < max_moves && candidate_idx < (int) idx + window_size; candidate_idx++) {
      assert(candidate_idx == ctx.mv.source_idx);
      assert(candidate_idx < (int) block->instructions.size());
      aco_ptr<Instruction>& candidate = block->instructions[candidate_idx];
      bool is_vmem = candidate->isVMEM() || candidate->isFlatOrGlobal();

      if (candidate->opcode == aco_opcode::p_logical_end)
         break;

      /* check if candidate depends on current */
      bool is_dependency = false;
      if (found_dependency) {
         HazardResult haz = perform_hazard_query(&indep_hq, candidate.get(), true);
         if (haz == hazard_fail_reorder_ds || haz == hazard_fail_spill ||
             haz == hazard_fail_reorder_vmem_smem || haz == hazard_fail_reorder_sendmsg ||
             haz == hazard_fail_barrier || haz == hazard_fail_export)
            is_dependency = true;
         else if (haz != hazard_success)
            break;
      }

      is_dependency |= !found_dependency && !ctx.mv.upwards_check_deps();
      if (is_dependency) {
         if (!found_dependency) {
            ctx.mv.upwards_set_insert_idx(candidate_idx);
            init_hazard_query(&indep_hq);
            found_dependency = true;
         }
      } else if (is_vmem) {
         /* don't move up dependencies of other VMEM instructions */
         for (const Definition& def : candidate->definitions) {
            if (def.isTemp())
               ctx.mv.depends_on[def.tempId()] = true;
         }
      }

      if (is_dependency || !found_dependency) {
         if (found_dependency)
            add_to_hazard_query(&indep_hq, candidate.get());
         ctx.mv.upwards_skip();
         continue;
      }

      MoveResult res = ctx.mv.upwards_move();
      if (res == move_fail_ssa || res == move_fail_rar) {
         add_to_hazard_query(&indep_hq, candidate.get());
         ctx.mv.upwards_skip();
         continue;
      } else if (res == move_fail_pressure) {
         break;
      }
      k++;
   }
}

void schedule_position_export(sched_ctx& ctx, Block* block,
                              std::vector<RegisterDemand>& register_demand,
                              Instruction* current, int idx)
{
   assert(idx != 0);
   int window_size = POS_EXP_WINDOW_SIZE;
   int max_moves = POS_EXP_MAX_MOVES;
   int16_t k = 0;

   ctx.mv.downwards_init(idx, true, false);

   hazard_query hq;
   init_hazard_query(&hq);
   add_to_hazard_query(&hq, current);

   for (int candidate_idx = idx - 1; k < max_moves && candidate_idx > (int) idx - window_size; candidate_idx--) {
      assert(candidate_idx >= 0);
      aco_ptr<Instruction>& candidate = block->instructions[candidate_idx];

      if (candidate->opcode == aco_opcode::p_logical_start)
         break;
      if (candidate->isVMEM() || candidate->format == Format::SMEM || candidate->isFlatOrGlobal())
         break;

      HazardResult haz = perform_hazard_query(&hq, candidate.get(), false);
      if (haz == hazard_fail_exec || haz == hazard_fail_unreorderable)
         break;

      if (haz != hazard_success) {
         add_to_hazard_query(&hq, candidate.get());
         ctx.mv.downwards_skip();
         continue;
      }

      MoveResult res = ctx.mv.downwards_move(false);
      if (res == move_fail_ssa || res == move_fail_rar) {
         add_to_hazard_query(&hq, candidate.get());
         ctx.mv.downwards_skip();
         continue;
      } else if (res == move_fail_pressure) {
         break;
      }
      k++;
   }
}

void schedule_block(sched_ctx& ctx, Program *program, Block* block, live& live_vars)
{
   ctx.last_SMEM_dep_idx = 0;
   ctx.last_SMEM_stall = INT16_MIN;
   ctx.mv.block = block;
   ctx.mv.register_demand = live_vars.register_demand[block->index].data();

   /* go through all instructions and find memory loads */
   for (unsigned idx = 0; idx < block->instructions.size(); idx++) {
      Instruction* current = block->instructions[idx].get();

      if (current->definitions.empty())
         continue;

      if (current->isVMEM() || current->isFlatOrGlobal()) {
         ctx.mv.current = current;
         schedule_VMEM(ctx, block, live_vars.register_demand[block->index], current, idx);
      }

      if (current->format == Format::SMEM) {
         ctx.mv.current = current;
         schedule_SMEM(ctx, block, live_vars.register_demand[block->index], current, idx);
      }
   }

   if ((program->stage.hw == HWStage::VS || program->stage.hw == HWStage::NGG) &&
       (block->kind & block_kind_export_end)) {
      /* Try to move position exports as far up as possible, to reduce register
       * usage and because ISA reference guides say so. */
      for (unsigned idx = 0; idx < block->instructions.size(); idx++) {
         Instruction* current = block->instructions[idx].get();

         if (current->format == Format::EXP) {
            unsigned target = static_cast<Export_instruction*>(current)->dest;
            if (target >= V_008DFC_SQ_EXP_POS && target < V_008DFC_SQ_EXP_PARAM) {
               ctx.mv.current = current;
               schedule_position_export(ctx, block, live_vars.register_demand[block->index], current, idx);
            }
         }
      }
   }

   /* resummarize the block's register demand */
   block->register_demand = RegisterDemand();
   for (unsigned idx = 0; idx < block->instructions.size(); idx++) {
      block->register_demand.update(live_vars.register_demand[block->index][idx]);
   }
}


void schedule_program(Program *program, live& live_vars)
{
   /* don't use program->max_reg_demand because that is affected by max_waves_per_simd */
   RegisterDemand demand;
   for (Block& block : program->blocks)
      demand.update(block.register_demand);

   sched_ctx ctx;
   ctx.mv.depends_on.resize(program->peekAllocationId());
   ctx.mv.RAR_dependencies.resize(program->peekAllocationId());
   ctx.mv.RAR_dependencies_clause.resize(program->peekAllocationId());
   /* Allowing the scheduler to reduce the number of waves to as low as 5
    * improves performance of Thrones of Britannia significantly and doesn't
    * seem to hurt anything else. */
   if (program->num_waves <= 5)
      ctx.num_waves = program->num_waves;
   else if (demand.vgpr >= 29)
      ctx.num_waves = 5;
   else if (demand.vgpr >= 25)
      ctx.num_waves = 6;
   else
      ctx.num_waves = 7;
   ctx.num_waves = std::max<uint16_t>(ctx.num_waves, program->min_waves);
   ctx.num_waves = std::min<uint16_t>(ctx.num_waves, program->num_waves);

   assert(ctx.num_waves > 0);
   ctx.mv.max_registers = { int16_t(get_addr_vgpr_from_waves(program, ctx.num_waves) - 2),
                            int16_t(get_addr_sgpr_from_waves(program, ctx.num_waves))};

   for (Block& block : program->blocks)
      schedule_block(ctx, program, &block, live_vars);

   /* update max_reg_demand and num_waves */
   RegisterDemand new_demand;
   for (Block& block : program->blocks) {
      new_demand.update(block.register_demand);
   }
   update_vgpr_sgpr_demand(program, new_demand);

   /* if enabled, this code asserts that register_demand is updated correctly */
   #if 0
   int prev_num_waves = program->num_waves;
   const RegisterDemand prev_max_demand = program->max_reg_demand;

   std::vector<RegisterDemand> demands(program->blocks.size());
   for (unsigned j = 0; j < program->blocks.size(); j++) {
      demands[j] = program->blocks[j].register_demand;
   }

   live live_vars2 = aco::live_var_analysis(program);

   for (unsigned j = 0; j < program->blocks.size(); j++) {
      Block &b = program->blocks[j];
      for (unsigned i = 0; i < b.instructions.size(); i++)
         assert(live_vars.register_demand[b.index][i] == live_vars2.register_demand[b.index][i]);
      assert(b.register_demand == demands[j]);
   }

   assert(program->max_reg_demand == prev_max_demand);
   assert(program->num_waves == prev_num_waves);
   #endif
}

}