/*
 * Copyright © 2018 Valve Corporation
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
 * IN THE SOFTWARE.
 *
 */

#include "aco_ir.h"
#include "aco_builder.h"
#include <unordered_set>
#include <algorithm>

#include "amdgfxregs.h"

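/* The scheduling windows and move limits below scale with the number of waves:
 * the fewer waves there are available to hide latency, the further the
 * scheduler is allowed to search and the more instructions it may move. */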
#define SMEM_WINDOW_SIZE (350 - ctx.num_waves * 35)
#define VMEM_WINDOW_SIZE (1024 - ctx.num_waves * 64)
#define POS_EXP_WINDOW_SIZE 512
#define SMEM_MAX_MOVES (64 - ctx.num_waves * 4)
#define VMEM_MAX_MOVES (128 - ctx.num_waves * 8)
/* creating clauses decreases def-use distances, so make clause creation less aggressive the lower num_waves is */
#define VMEM_CLAUSE_MAX_GRAB_DIST ((ctx.num_waves - 1) * 8)
#define POS_EXP_MAX_MOVES 512

namespace aco {

enum MoveResult {
   move_success,
   move_fail_ssa,
   move_fail_rar,
   move_fail_pressure,
};

struct MoveState {
   RegisterDemand max_registers;

   Block *block;
   Instruction *current;
   RegisterDemand *register_demand;
   bool improved_rar;

   std::vector<bool> depends_on;
   /* Two are needed because, for downwards VMEM scheduling, one needs to
    * exclude the instructions in the clause, since new instructions in the
    * clause are not moved past any other instructions in the clause. */
   std::vector<bool> RAR_dependencies;
   std::vector<bool> RAR_dependencies_clause;

   int source_idx;
   int insert_idx, insert_idx_clause;
   RegisterDemand total_demand, total_demand_clause;

   /* for moving instructions before the current instruction to after it */
   void downwards_init(int current_idx, bool improved_rar, bool may_form_clauses);
   MoveResult downwards_move(bool clause);
   void downwards_skip();

   /* for moving instructions after the first use of the current instruction upwards */
   void upwards_init(int source_idx, bool improved_rar);
   bool upwards_check_deps();
   void upwards_set_insert_idx(int before);
   MoveResult upwards_move();
   void upwards_skip();

private:
   void downwards_advance_helper();
};

struct sched_ctx {
   int16_t num_waves;
   int16_t last_SMEM_stall;
   int last_SMEM_dep_idx;
   MoveState mv;
};

/* This scheduler is a simple bottom-up pass based on ideas from
 * "A Novel Lightweight Instruction Scheduling Algorithm for Just-In-Time Compiler"
 * by Xiaohua Shi and Peng Guo.
 * The basic approach is to iterate over all instructions. When a memory instruction
 * is encountered, the pass tries to move independent instructions from above and
 * below in between the memory instruction and its first user.
 * The novelty is that this scheduler takes the current register pressure into account:
 * instructions are only moved if the register pressure stays within a certain bound.
 */

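/* Moves the element at index idx so that it ends up right before the element
 * that was originally at index before, preserving the relative order of all
 * other elements. E.g. for {a, b, c, d}, move_element(v.begin(), 3, 1) gives
 * {a, d, b, c} and move_element(v.begin(), 0, 3) gives {b, c, a, d}. */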
template <typename T>
void move_element(T begin_it, size_t idx, size_t before) {
   if (idx < before) {
      auto begin = std::next(begin_it, idx);
      auto end = std::next(begin_it, before);
      std::rotate(begin, begin + 1, end);
   } else if (idx > before) {
      auto begin = std::next(begin_it, before);
      auto end = std::next(begin_it, idx + 1);
      std::rotate(begin, end - 1, end);
   }
}

void MoveState::downwards_advance_helper()
{
   source_idx--;
   total_demand.update(register_demand[source_idx]);
}

void MoveState::downwards_init(int current_idx, bool improved_rar_, bool may_form_clauses)
{
   improved_rar = improved_rar_;
   source_idx = current_idx;

   insert_idx = current_idx + 1;
   insert_idx_clause = current_idx;

   total_demand = total_demand_clause = register_demand[current_idx];

   std::fill(depends_on.begin(), depends_on.end(), false);
   if (improved_rar) {
      std::fill(RAR_dependencies.begin(), RAR_dependencies.end(), false);
      if (may_form_clauses)
         std::fill(RAR_dependencies_clause.begin(), RAR_dependencies_clause.end(), false);
   }

   for (const Operand& op : current->operands) {
      if (op.isTemp()) {
         depends_on[op.tempId()] = true;
         if (improved_rar && op.isFirstKill())
            RAR_dependencies[op.tempId()] = true;
      }
   }

   /* update total_demand/source_idx */
   downwards_advance_helper();
}

MoveResult MoveState::downwards_move(bool clause)
{
   aco_ptr<Instruction>& instr = block->instructions[source_idx];

   for (const Definition& def : instr->definitions)
      if (def.isTemp() && depends_on[def.tempId()])
         return move_fail_ssa;

   /* check if one of the candidate's operands is killed by a depending instruction */
   std::vector<bool>& RAR_deps = improved_rar ? (clause ? RAR_dependencies_clause : RAR_dependencies) : depends_on;
   for (const Operand& op : instr->operands) {
      if (op.isTemp() && RAR_deps[op.tempId()]) {
         // FIXME: account for difference in register pressure
         return move_fail_rar;
      }
   }

   if (clause) {
      for (const Operand& op : instr->operands) {
         if (op.isTemp()) {
            depends_on[op.tempId()] = true;
            if (op.isFirstKill())
               RAR_dependencies[op.tempId()] = true;
         }
      }
   }

   int dest_insert_idx = clause ? insert_idx_clause : insert_idx;
   RegisterDemand register_pressure = clause ? total_demand_clause : total_demand;

   const RegisterDemand candidate_diff = get_live_changes(instr);
   const RegisterDemand temp = get_temp_registers(instr);
   if (RegisterDemand(register_pressure - candidate_diff).exceeds(max_registers))
      return move_fail_pressure;
   const RegisterDemand temp2 = get_temp_registers(block->instructions[dest_insert_idx - 1]);
   const RegisterDemand new_demand = register_demand[dest_insert_idx - 1] - temp2 + temp;
   if (new_demand.exceeds(max_registers))
      return move_fail_pressure;

   /* move the candidate below the memory load */
   move_element(block->instructions.begin(), source_idx, dest_insert_idx);

   /* update register pressure */
   move_element(register_demand, source_idx, dest_insert_idx);
   for (int i = source_idx; i < dest_insert_idx - 1; i++)
      register_demand[i] -= candidate_diff;
   register_demand[dest_insert_idx - 1] = new_demand;
   total_demand_clause -= candidate_diff;
   insert_idx_clause--;
   if (!clause) {
      total_demand -= candidate_diff;
      insert_idx--;
   }

   downwards_advance_helper();
   return move_success;
}

void MoveState::downwards_skip()
{
   aco_ptr<Instruction>& instr = block->instructions[source_idx];

   for (const Operand& op : instr->operands) {
      if (op.isTemp()) {
         depends_on[op.tempId()] = true;
         if (improved_rar && op.isFirstKill()) {
            RAR_dependencies[op.tempId()] = true;
            RAR_dependencies_clause[op.tempId()] = true;
         }
      }
   }
   total_demand_clause.update(register_demand[source_idx]);

   downwards_advance_helper();
}

void MoveState::upwards_init(int source_idx_, bool improved_rar_)
{
   source_idx = source_idx_;
   improved_rar = improved_rar_;

   insert_idx = -1;

   std::fill(depends_on.begin(), depends_on.end(), false);
   std::fill(RAR_dependencies.begin(), RAR_dependencies.end(), false);

   for (const Definition& def : current->definitions) {
      if (def.isTemp())
         depends_on[def.tempId()] = true;
   }
}

bool MoveState::upwards_check_deps()
{
   aco_ptr<Instruction>& instr = block->instructions[source_idx];
   for (const Operand& op : instr->operands) {
      if (op.isTemp() && depends_on[op.tempId()])
         return false;
   }
   return true;
}

void MoveState::upwards_set_insert_idx(int before)
{
   insert_idx = before;
   total_demand = register_demand[before - 1];
}

MoveResult MoveState::upwards_move()
{
   assert(insert_idx >= 0);

   aco_ptr<Instruction>& instr = block->instructions[source_idx];
   for (const Operand& op : instr->operands) {
      if (op.isTemp() && depends_on[op.tempId()])
         return move_fail_ssa;
   }

   /* check if candidate uses/kills an operand which is used by a dependency */
   for (const Operand& op : instr->operands) {
      if (op.isTemp() && (!improved_rar || op.isFirstKill()) && RAR_dependencies[op.tempId()])
         return move_fail_rar;
   }

   /* check if register pressure is low enough: the diff is negative if register pressure is decreased */
   const RegisterDemand candidate_diff = get_live_changes(instr);
   const RegisterDemand temp = get_temp_registers(instr);
   if (RegisterDemand(total_demand + candidate_diff).exceeds(max_registers))
      return move_fail_pressure;
   const RegisterDemand temp2 = get_temp_registers(block->instructions[insert_idx - 1]);
   const RegisterDemand new_demand = register_demand[insert_idx - 1] - temp2 + candidate_diff + temp;
   if (new_demand.exceeds(max_registers))
      return move_fail_pressure;

   /* move the candidate above the insert_idx */
   move_element(block->instructions.begin(), source_idx, insert_idx);

   /* update register pressure */
   move_element(register_demand, source_idx, insert_idx);
   for (int i = insert_idx + 1; i <= source_idx; i++)
      register_demand[i] += candidate_diff;
   register_demand[insert_idx] = new_demand;
   total_demand += candidate_diff;

   insert_idx++;

   total_demand.update(register_demand[source_idx]);
   source_idx++;

   return move_success;
}

void MoveState::upwards_skip()
{
   if (insert_idx >= 0) {
      aco_ptr<Instruction>& instr = block->instructions[source_idx];
      for (const Definition& def : instr->definitions) {
         if (def.isTemp())
            depends_on[def.tempId()] = true;
      }
      for (const Operand& op : instr->operands) {
         if (op.isTemp())
            RAR_dependencies[op.tempId()] = true;
      }
      total_demand.update(register_demand[source_idx]);
   }

   source_idx++;
}

bool is_gs_or_done_sendmsg(const Instruction *instr)
{
   if (instr->opcode == aco_opcode::s_sendmsg) {
      uint16_t imm = static_cast<const SOPP_instruction*>(instr)->imm;
      return (imm & sendmsg_id_mask) == _sendmsg_gs ||
             (imm & sendmsg_id_mask) == _sendmsg_gs_done;
   }
   return false;
}

bool is_done_sendmsg(const Instruction *instr)
{
   if (instr->opcode == aco_opcode::s_sendmsg) {
      uint16_t imm = static_cast<const SOPP_instruction*>(instr)->imm;
      return (imm & sendmsg_id_mask) == _sendmsg_gs_done;
   }
   return false;
}

memory_sync_info get_sync_info_with_hack(const Instruction* instr)
{
   memory_sync_info sync = get_sync_info(instr);
   if (instr->format == Format::SMEM && !instr->operands.empty() && instr->operands[0].bytes() == 16) {
      // FIXME: currently, it doesn't seem beneficial to omit this due to how our scheduler works
      sync.storage = (storage_class)(sync.storage | storage_buffer);
      sync.semantics = (memory_semantics)(sync.semantics | semantic_private);
   }
   return sync;
}

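/* Summary of the synchronization-relevant events of one or more instructions:
 * control barriers, acquire/release barriers and the storage classes accessed
 * with acquire/release/relaxed/atomic semantics. perform_hazard_query() compares
 * two of these sets to decide whether reordering is safe. */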
struct memory_event_set {
   bool has_control_barrier;

   unsigned bar_acquire;
   unsigned bar_release;
   unsigned bar_classes;

   unsigned access_acquire;
   unsigned access_release;
   unsigned access_relaxed;
   unsigned access_atomic;
};

struct hazard_query {
   bool contains_spill;
   bool contains_sendmsg;
   memory_event_set mem_events;
   unsigned aliasing_storage; /* storage classes which are accessed (non-SMEM) */
   unsigned aliasing_storage_smem; /* storage classes which are accessed (SMEM) */
};

void init_hazard_query(hazard_query *query) {
   query->contains_spill = false;
   query->contains_sendmsg = false;
   memset(&query->mem_events, 0, sizeof(query->mem_events));
   query->aliasing_storage = 0;
   query->aliasing_storage_smem = 0;
}

void add_memory_event(memory_event_set *set, Instruction *instr, memory_sync_info *sync)
{
   set->has_control_barrier |= is_done_sendmsg(instr);
   if (instr->opcode == aco_opcode::p_barrier) {
      Pseudo_barrier_instruction *bar = static_cast<Pseudo_barrier_instruction*>(instr);
      if (bar->sync.semantics & semantic_acquire)
         set->bar_acquire |= bar->sync.storage;
      if (bar->sync.semantics & semantic_release)
         set->bar_release |= bar->sync.storage;
      set->bar_classes |= bar->sync.storage;

      set->has_control_barrier |= bar->exec_scope > scope_invocation;
   }

   if (!sync->storage)
      return;

   if (sync->semantics & semantic_acquire)
      set->access_acquire |= sync->storage;
   if (sync->semantics & semantic_release)
      set->access_release |= sync->storage;

   if (!(sync->semantics & semantic_private)) {
      if (sync->semantics & semantic_atomic)
         set->access_atomic |= sync->storage;
      else
         set->access_relaxed |= sync->storage;
   }
}

void add_to_hazard_query(hazard_query *query, Instruction *instr)
{
   if (instr->opcode == aco_opcode::p_spill || instr->opcode == aco_opcode::p_reload)
      query->contains_spill = true;
   query->contains_sendmsg |= instr->opcode == aco_opcode::s_sendmsg;

   memory_sync_info sync = get_sync_info_with_hack(instr);

   add_memory_event(&query->mem_events, instr, &sync);

   if (!(sync.semantics & semantic_can_reorder)) {
      unsigned storage = sync.storage;
      /* images and buffer/global memory can alias */ //TODO: more precisely, buffer images and buffer/global memory can alias
      if (storage & (storage_buffer | storage_image))
         storage |= storage_buffer | storage_image;
      if (instr->format == Format::SMEM)
         query->aliasing_storage_smem |= storage;
      else
         query->aliasing_storage |= storage;
   }
}

enum HazardResult {
   hazard_success,
   hazard_fail_reorder_vmem_smem,
   hazard_fail_reorder_ds,
   hazard_fail_reorder_sendmsg,
   hazard_fail_spill,
   hazard_fail_export,
   hazard_fail_barrier,
   /* The search must stop at these failures. The hazard query code doesn't
    * consider them when an instruction is added to a query. */
   hazard_fail_exec,
   hazard_fail_unreorderable,
};

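/* Checks whether instr can be moved past all instructions previously added to
 * the query; upwards gives the direction of the move. Any result other than
 * hazard_success means the candidate must not be moved; hazard_fail_exec and
 * hazard_fail_unreorderable must additionally stop the search. */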
HazardResult perform_hazard_query(hazard_query *query, Instruction *instr, bool upwards)
{
   if (instr->opcode == aco_opcode::p_exit_early_if)
      return hazard_fail_exec;
   for (const Definition& def : instr->definitions) {
      if (def.isFixed() && def.physReg() == exec)
         return hazard_fail_exec;
   }

   /* don't move exports so that they stay closer together */
   if (instr->format == Format::EXP)
      return hazard_fail_export;

   /* don't move non-reorderable instructions */
   if (instr->opcode == aco_opcode::s_memtime ||
       instr->opcode == aco_opcode::s_memrealtime ||
       instr->opcode == aco_opcode::s_setprio ||
       instr->opcode == aco_opcode::s_getreg_b32)
      return hazard_fail_unreorderable;

   memory_event_set instr_set;
   memset(&instr_set, 0, sizeof(instr_set));
   memory_sync_info sync = get_sync_info_with_hack(instr);
   add_memory_event(&instr_set, instr, &sync);

   memory_event_set *first = &instr_set;
   memory_event_set *second = &query->mem_events;
   if (upwards)
      std::swap(first, second);

   /* everything after barrier(acquire) happens after the atomics/control_barriers before it;
    * everything after load(acquire) happens after the load
    */
   if ((first->has_control_barrier || first->access_atomic) && second->bar_acquire)
      return hazard_fail_barrier;
   if (((first->access_acquire || first->bar_acquire) && second->bar_classes) ||
       ((first->access_acquire | first->bar_acquire) & (second->access_relaxed | second->access_atomic)))
      return hazard_fail_barrier;

   /* everything before barrier(release) happens before the atomics/control_barriers after it;
    * everything before store(release) happens before the store
    */
   if (first->bar_release && (second->has_control_barrier || second->access_atomic))
      return hazard_fail_barrier;
   if ((first->bar_classes && (second->bar_release || second->access_release)) ||
       ((first->access_relaxed | first->access_atomic) & (second->bar_release | second->access_release)))
      return hazard_fail_barrier;

   /* don't move memory barriers around other memory barriers */
   if (first->bar_classes && second->bar_classes)
      return hazard_fail_barrier;

   /* Don't move memory accesses to before control barriers. I don't think
    * this is necessary for the Vulkan memory model, but it might be for GLSL450. */
   unsigned control_classes = storage_buffer | storage_atomic_counter | storage_image | storage_shared;
   if (first->has_control_barrier && ((second->access_atomic | second->access_relaxed) & control_classes))
      return hazard_fail_barrier;

   /* don't move memory loads/stores past potentially aliasing loads/stores */
   unsigned aliasing_storage = instr->format == Format::SMEM ?
                               query->aliasing_storage_smem :
                               query->aliasing_storage;
   if ((sync.storage & aliasing_storage) && !(sync.semantics & semantic_can_reorder)) {
      unsigned intersect = sync.storage & aliasing_storage;
      if (intersect & storage_shared)
         return hazard_fail_reorder_ds;
      return hazard_fail_reorder_vmem_smem;
   }

   if ((instr->opcode == aco_opcode::p_spill || instr->opcode == aco_opcode::p_reload) &&
       query->contains_spill)
      return hazard_fail_spill;

   if (instr->opcode == aco_opcode::s_sendmsg && query->contains_sendmsg)
      return hazard_fail_reorder_sendmsg;

   return hazard_success;
}

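/* Scheduling of a single SMEM load: independent instructions above the load are
 * first moved down past it, then independent instructions below it are moved up
 * to just before its first user, increasing the distance between the load and
 * its first use so that the latency can be hidden. */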
void schedule_SMEM(sched_ctx& ctx, Block* block,
                   std::vector<RegisterDemand>& register_demand,
                   Instruction* current, int idx)
{
   assert(idx != 0);
   int window_size = SMEM_WINDOW_SIZE;
   int max_moves = SMEM_MAX_MOVES;
   int16_t k = 0;

   /* don't move s_memtime/s_memrealtime */
   if (current->opcode == aco_opcode::s_memtime || current->opcode == aco_opcode::s_memrealtime)
      return;

   /* first, check if we have instructions before current to move down */
   hazard_query hq;
   init_hazard_query(&hq);
   add_to_hazard_query(&hq, current);

   ctx.mv.downwards_init(idx, false, false);

   for (int candidate_idx = idx - 1; k < max_moves && candidate_idx > (int) idx - window_size; candidate_idx--) {
      assert(candidate_idx >= 0);
      assert(candidate_idx == ctx.mv.source_idx);
      aco_ptr<Instruction>& candidate = block->instructions[candidate_idx];

      /* break if we'd make the previous SMEM instruction stall */
      bool can_stall_prev_smem = idx <= ctx.last_SMEM_dep_idx && candidate_idx < ctx.last_SMEM_dep_idx;
      if (can_stall_prev_smem && ctx.last_SMEM_stall >= 0)
         break;

      /* break when encountering another MEM instruction, logical_start or barriers */
      if (candidate->opcode == aco_opcode::p_logical_start)
         break;
      if (candidate->isVMEM())
         break;

      bool can_move_down = true;

      HazardResult haz = perform_hazard_query(&hq, candidate.get(), false);
      if (haz == hazard_fail_reorder_ds || haz == hazard_fail_spill || haz == hazard_fail_reorder_sendmsg || haz == hazard_fail_barrier || haz == hazard_fail_export)
         can_move_down = false;
      else if (haz != hazard_success)
         break;

      /* don't use LDS/GDS instructions to hide latency since it can
       * significantly worsen LDS scheduling */
      if (candidate->format == Format::DS || !can_move_down) {
         add_to_hazard_query(&hq, candidate.get());
         ctx.mv.downwards_skip();
         continue;
      }

      MoveResult res = ctx.mv.downwards_move(false);
      if (res == move_fail_ssa || res == move_fail_rar) {
         add_to_hazard_query(&hq, candidate.get());
         ctx.mv.downwards_skip();
         continue;
      } else if (res == move_fail_pressure) {
         break;
      }

      if (candidate_idx < ctx.last_SMEM_dep_idx)
         ctx.last_SMEM_stall++;
      k++;
   }

   /* find the first instruction depending on current or find another MEM */
   ctx.mv.upwards_init(idx + 1, false);

   bool found_dependency = false;
   /* second, check if we have instructions after current to move up */
   for (int candidate_idx = idx + 1; k < max_moves && candidate_idx < (int) idx + window_size; candidate_idx++) {
      assert(candidate_idx == ctx.mv.source_idx);
      assert(candidate_idx < (int) block->instructions.size());
      aco_ptr<Instruction>& candidate = block->instructions[candidate_idx];

      if (candidate->opcode == aco_opcode::p_logical_end)
         break;

      /* check if candidate depends on current */
      bool is_dependency = !found_dependency && !ctx.mv.upwards_check_deps();
      /* no need to steal from following VMEM instructions */
      if (is_dependency && candidate->isVMEM())
         break;

      if (found_dependency) {
         HazardResult haz = perform_hazard_query(&hq, candidate.get(), true);
         if (haz == hazard_fail_reorder_ds || haz == hazard_fail_spill ||
             haz == hazard_fail_reorder_sendmsg || haz == hazard_fail_barrier ||
             haz == hazard_fail_export)
            is_dependency = true;
         else if (haz != hazard_success)
            break;
      }

      if (is_dependency) {
         if (!found_dependency) {
            ctx.mv.upwards_set_insert_idx(candidate_idx);
            init_hazard_query(&hq);
            found_dependency = true;
         }
      }

      if (is_dependency || !found_dependency) {
         if (found_dependency)
            add_to_hazard_query(&hq, candidate.get());
         else
            k++;
         ctx.mv.upwards_skip();
         continue;
      }

      MoveResult res = ctx.mv.upwards_move();
      if (res == move_fail_ssa || res == move_fail_rar) {
         /* no need to steal from following VMEM instructions */
         if (res == move_fail_ssa && candidate->isVMEM())
            break;
         add_to_hazard_query(&hq, candidate.get());
         ctx.mv.upwards_skip();
         continue;
      } else if (res == move_fail_pressure) {
         break;
      }
      k++;
   }

   ctx.last_SMEM_dep_idx = found_dependency ? ctx.mv.insert_idx : 0;
   ctx.last_SMEM_stall = 10 - ctx.num_waves - k;
}

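/* Like schedule_SMEM(), but for VMEM/FLAT/GLOBAL instructions. Besides hiding
 * latency, the downwards pass also tries to group VMEM instructions that load
 * from the same resource into a clause, using how far a candidate would be
 * moved as the heuristic (see VMEM_CLAUSE_MAX_GRAB_DIST). */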
void schedule_VMEM(sched_ctx& ctx, Block* block,
                   std::vector<RegisterDemand>& register_demand,
                   Instruction* current, int idx)
{
   assert(idx != 0);
   int window_size = VMEM_WINDOW_SIZE;
   int max_moves = VMEM_MAX_MOVES;
   int clause_max_grab_dist = VMEM_CLAUSE_MAX_GRAB_DIST;
   int16_t k = 0;

   /* first, check if we have instructions before current to move down */
   hazard_query indep_hq;
   hazard_query clause_hq;
   init_hazard_query(&indep_hq);
   init_hazard_query(&clause_hq);
   add_to_hazard_query(&indep_hq, current);

   ctx.mv.downwards_init(idx, true, true);

   for (int candidate_idx = idx - 1; k < max_moves && candidate_idx > (int) idx - window_size; candidate_idx--) {
      assert(candidate_idx == ctx.mv.source_idx);
      assert(candidate_idx >= 0);
      aco_ptr<Instruction>& candidate = block->instructions[candidate_idx];
      bool is_vmem = candidate->isVMEM() || candidate->isFlatOrGlobal();

      /* break when encountering another VMEM instruction, logical_start or barriers */
      if (candidate->opcode == aco_opcode::p_logical_start)
         break;

      /* break if we'd make the previous SMEM instruction stall */
      bool can_stall_prev_smem = idx <= ctx.last_SMEM_dep_idx && candidate_idx < ctx.last_SMEM_dep_idx;
      if (can_stall_prev_smem && ctx.last_SMEM_stall >= 0)
         break;

      bool part_of_clause = false;
      if (current->isVMEM() == candidate->isVMEM()) {
         bool same_resource = true;
         if (current->isVMEM())
            same_resource = candidate->operands[0].tempId() == current->operands[0].tempId();
         int grab_dist = ctx.mv.insert_idx_clause - candidate_idx;
         /* We can't easily tell how much this will decrease the def-to-use
          * distances, so just use how far it will be moved as a heuristic. */
         part_of_clause = same_resource && grab_dist < clause_max_grab_dist;
      }

      /* if current depends on candidate, add additional dependencies and continue */
      bool can_move_down = !is_vmem || part_of_clause;

      HazardResult haz = perform_hazard_query(part_of_clause ? &clause_hq : &indep_hq, candidate.get(), false);
      if (haz == hazard_fail_reorder_ds || haz == hazard_fail_spill ||
          haz == hazard_fail_reorder_sendmsg || haz == hazard_fail_barrier ||
          haz == hazard_fail_export)
         can_move_down = false;
      else if (haz != hazard_success)
         break;

      if (!can_move_down) {
         add_to_hazard_query(&indep_hq, candidate.get());
         add_to_hazard_query(&clause_hq, candidate.get());
         ctx.mv.downwards_skip();
         continue;
      }

      Instruction *candidate_ptr = candidate.get();
      MoveResult res = ctx.mv.downwards_move(part_of_clause);
      if (res == move_fail_ssa || res == move_fail_rar) {
         add_to_hazard_query(&indep_hq, candidate.get());
         add_to_hazard_query(&clause_hq, candidate.get());
         ctx.mv.downwards_skip();
         continue;
      } else if (res == move_fail_pressure) {
         break;
      }
      if (part_of_clause)
         add_to_hazard_query(&indep_hq, candidate_ptr);
      k++;
      if (candidate_idx < ctx.last_SMEM_dep_idx)
         ctx.last_SMEM_stall++;
   }

   /* find the first instruction depending on current or find another VMEM */
   ctx.mv.upwards_init(idx + 1, true);

   bool found_dependency = false;
   /* second, check if we have instructions after current to move up */
   for (int candidate_idx = idx + 1; k < max_moves && candidate_idx < (int) idx + window_size; candidate_idx++) {
      assert(candidate_idx == ctx.mv.source_idx);
      assert(candidate_idx < (int) block->instructions.size());
      aco_ptr<Instruction>& candidate = block->instructions[candidate_idx];
      bool is_vmem = candidate->isVMEM() || candidate->isFlatOrGlobal();

      if (candidate->opcode == aco_opcode::p_logical_end)
         break;

      /* check if candidate depends on current */
      bool is_dependency = false;
      if (found_dependency) {
         HazardResult haz = perform_hazard_query(&indep_hq, candidate.get(), true);
         if (haz == hazard_fail_reorder_ds || haz == hazard_fail_spill ||
             haz == hazard_fail_reorder_vmem_smem || haz == hazard_fail_reorder_sendmsg ||
             haz == hazard_fail_barrier || haz == hazard_fail_export)
            is_dependency = true;
         else if (haz != hazard_success)
            break;
      }

      is_dependency |= !found_dependency && !ctx.mv.upwards_check_deps();
      if (is_dependency) {
         if (!found_dependency) {
            ctx.mv.upwards_set_insert_idx(candidate_idx);
            init_hazard_query(&indep_hq);
            found_dependency = true;
         }
      } else if (is_vmem) {
         /* don't move up dependencies of other VMEM instructions */
         for (const Definition& def : candidate->definitions) {
            if (def.isTemp())
               ctx.mv.depends_on[def.tempId()] = true;
         }
      }

      if (is_dependency || !found_dependency) {
         if (found_dependency)
            add_to_hazard_query(&indep_hq, candidate.get());
         ctx.mv.upwards_skip();
         continue;
      }

      MoveResult res = ctx.mv.upwards_move();
      if (res == move_fail_ssa || res == move_fail_rar) {
         add_to_hazard_query(&indep_hq, candidate.get());
         ctx.mv.upwards_skip();
         continue;
      } else if (res == move_fail_pressure) {
         break;
      }
      k++;
   }
}

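/* Moves independent instructions from above a position export to below it,
 * which effectively lets the export execute earlier. */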
void schedule_position_export(sched_ctx& ctx, Block* block,
                              std::vector<RegisterDemand>& register_demand,
                              Instruction* current, int idx)
{
   assert(idx != 0);
   int window_size = POS_EXP_WINDOW_SIZE;
   int max_moves = POS_EXP_MAX_MOVES;
   int16_t k = 0;

   ctx.mv.downwards_init(idx, true, false);

   hazard_query hq;
   init_hazard_query(&hq);
   add_to_hazard_query(&hq, current);

   for (int candidate_idx = idx - 1; k < max_moves && candidate_idx > (int) idx - window_size; candidate_idx--) {
      assert(candidate_idx >= 0);
      aco_ptr<Instruction>& candidate = block->instructions[candidate_idx];

      if (candidate->opcode == aco_opcode::p_logical_start)
         break;
      if (candidate->isVMEM() || candidate->format == Format::SMEM || candidate->isFlatOrGlobal())
         break;

      HazardResult haz = perform_hazard_query(&hq, candidate.get(), false);
      if (haz == hazard_fail_exec || haz == hazard_fail_unreorderable)
         break;

      if (haz != hazard_success) {
         add_to_hazard_query(&hq, candidate.get());
         ctx.mv.downwards_skip();
         continue;
      }

      MoveResult res = ctx.mv.downwards_move(false);
      if (res == move_fail_ssa || res == move_fail_rar) {
         add_to_hazard_query(&hq, candidate.get());
         ctx.mv.downwards_skip();
         continue;
      } else if (res == move_fail_pressure) {
         break;
      }
      k++;
   }
}

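/* Schedules all VMEM/FLAT/GLOBAL and SMEM instructions in a block and, for
 * export-end blocks of VS/NGG stages, the position exports. Afterwards the
 * block's register demand summary is recomputed from the per-instruction
 * live-register information. */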
void schedule_block(sched_ctx& ctx, Program *program, Block* block, live& live_vars)
{
   ctx.last_SMEM_dep_idx = 0;
   ctx.last_SMEM_stall = INT16_MIN;
   ctx.mv.block = block;
   ctx.mv.register_demand = live_vars.register_demand[block->index].data();

   /* go through all instructions and find memory loads */
   for (unsigned idx = 0; idx < block->instructions.size(); idx++) {
      Instruction* current = block->instructions[idx].get();

      if (current->definitions.empty())
         continue;

      if (current->isVMEM() || current->isFlatOrGlobal()) {
         ctx.mv.current = current;
         schedule_VMEM(ctx, block, live_vars.register_demand[block->index], current, idx);
      }

      if (current->format == Format::SMEM) {
         ctx.mv.current = current;
         schedule_SMEM(ctx, block, live_vars.register_demand[block->index], current, idx);
      }
   }

   if ((program->stage.hw == HWStage::VS || program->stage.hw == HWStage::NGG) &&
       (block->kind & block_kind_export_end)) {
      /* Try to move position exports as far up as possible, to reduce register
       * usage and because ISA reference guides say so. */
      for (unsigned idx = 0; idx < block->instructions.size(); idx++) {
         Instruction* current = block->instructions[idx].get();

         if (current->format == Format::EXP) {
            unsigned target = static_cast<Export_instruction*>(current)->dest;
            if (target >= V_008DFC_SQ_EXP_POS && target < V_008DFC_SQ_EXP_PARAM) {
               ctx.mv.current = current;
               schedule_position_export(ctx, block, live_vars.register_demand[block->index], current, idx);
            }
         }
      }
   }

   /* resummarize the block's register demand */
   block->register_demand = RegisterDemand();
   for (unsigned idx = 0; idx < block->instructions.size(); idx++) {
      block->register_demand.update(live_vars.register_demand[block->index][idx]);
   }
}

void schedule_program(Program *program, live& live_vars)
{
   /* don't use program->max_reg_demand because that is affected by max_waves_per_simd */
   RegisterDemand demand;
   for (Block& block : program->blocks)
      demand.update(block.register_demand);

   sched_ctx ctx;
   ctx.mv.depends_on.resize(program->peekAllocationId());
   ctx.mv.RAR_dependencies.resize(program->peekAllocationId());
   ctx.mv.RAR_dependencies_clause.resize(program->peekAllocationId());
   /* Allowing the scheduler to reduce the number of waves to as low as 5
    * improves performance of Thrones of Britannia significantly and doesn't
    * seem to hurt anything else. */
   if (program->num_waves <= 5)
      ctx.num_waves = program->num_waves;
   else if (demand.vgpr >= 29)
      ctx.num_waves = 5;
   else if (demand.vgpr >= 25)
      ctx.num_waves = 6;
   else
      ctx.num_waves = 7;
   ctx.num_waves = std::max<uint16_t>(ctx.num_waves, program->min_waves);
   ctx.num_waves = std::min<uint16_t>(ctx.num_waves, program->num_waves);

   assert(ctx.num_waves > 0);
   ctx.mv.max_registers = { int16_t(get_addr_vgpr_from_waves(program, ctx.num_waves) - 2),
                            int16_t(get_addr_sgpr_from_waves(program, ctx.num_waves))};

   for (Block& block : program->blocks)
      schedule_block(ctx, program, &block, live_vars);

   /* update max_reg_demand and num_waves */
   RegisterDemand new_demand;
   for (Block& block : program->blocks) {
      new_demand.update(block.register_demand);
   }
   update_vgpr_sgpr_demand(program, new_demand);

   /* if enabled, this code asserts that register_demand is updated correctly */
   #if 0
   int prev_num_waves = program->num_waves;
   const RegisterDemand prev_max_demand = program->max_reg_demand;

   std::vector<RegisterDemand> demands(program->blocks.size());
   for (unsigned j = 0; j < program->blocks.size(); j++) {
      demands[j] = program->blocks[j].register_demand;
   }

   live live_vars2 = aco::live_var_analysis(program);

   for (unsigned j = 0; j < program->blocks.size(); j++) {
      Block &b = program->blocks[j];
      for (unsigned i = 0; i < b.instructions.size(); i++)
         assert(live_vars.register_demand[b.index][i] == live_vars2.register_demand[b.index][i]);
      assert(b.register_demand == demands[j]);
   }

   assert(program->max_reg_demand == prev_max_demand);
   assert(program->num_waves == prev_num_waves);
   #endif
}

}