/* -*- mesa-c++ -*-
 *
 * Copyright (c) 2022 Collabora LTD
 *
 * Author: Gert Wollny <gert.wollny@collabora.com>
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * on the rights to use, copy, modify, merge, publish, distribute, sub
 * license, and/or sell copies of the Software, and to permit persons to whom
 * the Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHOR(S) AND/OR THEIR SUPPLIERS BE LIABLE FOR ANY CLAIM,
 * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR
 * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE
 * USE OR OTHER DEALINGS IN THE SOFTWARE.
 */

#include "sfn_scheduler.h"

#include "../r600_isa.h"

#include "amd_family.h"
#include "sfn_alu_defines.h"
#include "sfn_debug.h"
#include "sfn_instr_alugroup.h"
#include "sfn_instr_controlflow.h"
#include "sfn_instr_export.h"
#include "sfn_instr_fetch.h"
#include "sfn_instr_lds.h"
#include "sfn_instr_mem.h"
#include "sfn_instr_tex.h"

#include <algorithm>
#include <sstream>

namespace r600 {

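/* Visitor that sorts the instructions of the incoming block into per-type
 * work lists (vector ALU, trans ALU, pre-split ALU groups, TEX, fetches,
 * exports, memory writes, ...) that the block scheduler below draws from.
 * LDS read and atomic instructions are split into their ALU components
 * right here. */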
class CollectInstructions : public InstrVisitor {

public:
   CollectInstructions(ValueFactory& vf):
       m_value_factory(vf)
   {
   }

   void visit(AluInstr *instr) override
   {
      if (instr->has_alu_flag(alu_is_trans))
         alu_trans.push_back(instr);
      else {
         if (instr->alu_slots() == 1)
            alu_vec.push_back(instr);
         else
            alu_groups.push_back(instr->split(m_value_factory));
      }
   }
   void visit(AluGroup *instr) override { alu_groups.push_back(instr); }
   void visit(TexInstr *instr) override { tex.push_back(instr); }
   void visit(ExportInstr *instr) override { exports.push_back(instr); }
   void visit(FetchInstr *instr) override { fetches.push_back(instr); }
   void visit(Block *instr) override
   {
      for (auto& i : *instr)
         i->accept(*this);
   }

   void visit(ControlFlowInstr *instr) override
   {
      assert(!m_cf_instr);
      m_cf_instr = instr;
   }

   void visit(IfInstr *instr) override
   {
      assert(!m_cf_instr);
      m_cf_instr = instr;
   }

   void visit(EmitVertexInstr *instr) override
   {
      assert(!m_cf_instr);
      m_cf_instr = instr;
   }

   void visit(ScratchIOInstr *instr) override { mem_write_instr.push_back(instr); }

   void visit(StreamOutInstr *instr) override { mem_write_instr.push_back(instr); }

   void visit(MemRingOutInstr *instr) override { mem_ring_writes.push_back(instr); }

   void visit(GDSInstr *instr) override { gds_op.push_back(instr); }

   void visit(WriteTFInstr *instr) override { write_tf.push_back(instr); }

   void visit(LDSReadInstr *instr) override
   {
      std::vector<AluInstr *> buffer;
      m_last_lds_instr = instr->split(buffer, m_last_lds_instr);
      for (auto& i : buffer) {
         i->accept(*this);
      }
   }

   void visit(LDSAtomicInstr *instr) override
   {
      std::vector<AluInstr *> buffer;
      m_last_lds_instr = instr->split(buffer, m_last_lds_instr);
      for (auto& i : buffer) {
         i->accept(*this);
      }
   }

   void visit(RatInstr *instr) override { rat_instr.push_back(instr); }

   std::list<AluInstr *> alu_trans;
   std::list<AluInstr *> alu_vec;
   std::list<TexInstr *> tex;
   std::list<AluGroup *> alu_groups;
   std::list<ExportInstr *> exports;
   std::list<FetchInstr *> fetches;
   std::list<WriteOutInstr *> mem_write_instr;
   std::list<MemRingOutInstr *> mem_ring_writes;
   std::list<GDSInstr *> gds_op;
   std::list<WriteTFInstr *> write_tf;
   std::list<RatInstr *> rat_instr;

   Instr *m_cf_instr{nullptr};
   ValueFactory& m_value_factory;

   AluInstr *m_last_lds_instr{nullptr};
};

struct ArrayChanHash
{
   std::size_t operator()(std::pair<int, int> const& s) const noexcept
   {
      return std::hash<size_t>{}((size_t(s.first) << 3) | s.second);
   }
};

using ArrayCheckSet = std::unordered_set<std::pair<int, int>, ArrayChanHash>;

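/* Schedules the instructions of one input block into a sequence of output
 * blocks that each hold only one kind of clause (ALU, TEX, VTX, GDS, CF),
 * picking from the ready lists that CollectInstructions filled in. */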
class BlockScheduler {
public:
   BlockScheduler(r600_chip_class chip_class,
                  radeon_family family);

   void run(Shader *shader);

   void finalize();

private:
   void
   schedule_block(Block& in_block, Shader::ShaderBlocks& out_blocks, ValueFactory& vf);

   bool collect_ready(CollectInstructions& available);

   template <typename T>
   bool collect_ready_type(std::list<T *>& ready, std::list<T *>& orig);

   bool collect_ready_alu_vec(std::list<AluInstr *>& ready,
                              std::list<AluInstr *>& available);

   bool schedule_tex(Shader::ShaderBlocks& out_blocks);
   bool schedule_vtx(Shader::ShaderBlocks& out_blocks);

   template <typename I>
   bool schedule_gds(Shader::ShaderBlocks& out_blocks, std::list<I *>& ready_list);

   template <typename I>
   bool schedule_cf(Shader::ShaderBlocks& out_blocks, std::list<I *>& ready_list);

   bool schedule_alu(Shader::ShaderBlocks& out_blocks);
   void start_new_block(Shader::ShaderBlocks& out_blocks, Block::Type type);

   bool schedule_alu_to_group_vec(AluGroup *group);
   bool schedule_alu_to_group_trans(AluGroup *group, std::list<AluInstr *>& readylist);

   bool schedule_exports(Shader::ShaderBlocks& out_blocks,
                         std::list<ExportInstr *>& ready_list);

   void maybe_split_alu_block(Shader::ShaderBlocks& out_blocks);

   template <typename I> bool schedule(std::list<I *>& ready_list);

   template <typename I> bool schedule_block(std::list<I *>& ready_list);

   void update_array_writes(const AluGroup& group);
   bool check_array_reads(const AluInstr& instr);
   bool check_array_reads(const AluGroup& group);

   std::list<AluInstr *> alu_vec_ready;
   std::list<AluInstr *> alu_trans_ready;
   std::list<AluGroup *> alu_groups_ready;
   std::list<TexInstr *> tex_ready;
   std::list<ExportInstr *> exports_ready;
   std::list<FetchInstr *> fetches_ready;
   std::list<WriteOutInstr *> memops_ready;
   std::list<MemRingOutInstr *> mem_ring_writes_ready;
   std::list<GDSInstr *> gds_ready;
   std::list<WriteTFInstr *> write_tf_ready;
   std::list<RatInstr *> rat_instr_ready;

   enum {
      sched_alu,
      sched_tex,
      sched_fetch,
      sched_free,
      sched_mem_ring,
      sched_gds,
      sched_write_tf,
      sched_rat,
   } current_shed;

   ExportInstr *m_last_pos;
   ExportInstr *m_last_pixel;
   ExportInstr *m_last_param;

   Block *m_current_block;

   int m_lds_addr_count{0};
   int m_alu_groups_scheduled{0};
   r600_chip_class m_chip_class;
   radeon_family m_chip_family;
   bool m_idx0_loading{false};
   bool m_idx1_loading{false};
   bool m_idx0_pending{false};
   bool m_idx1_pending{false};

   bool m_nop_after_rel_dest{false};
   bool m_nop_befor_rel_src{false};
   uint32_t m_next_block_id{1};

   ArrayCheckSet m_last_indirect_array_write;
   ArrayCheckSet m_last_direct_array_write;
};

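/* Scheduler entry point: run the block scheduler over all blocks of the
 * shader and finally mark the last exports. The shader is currently
 * rescheduled in place (see the TODO about cloning below). */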
Shader *
schedule(Shader *original)
{
   Block::set_chipclass(original->chip_class());
   AluGroup::set_chipclass(original->chip_class());

   sfn_log << SfnLog::schedule << "Original shader\n";
   if (sfn_log.has_debug_flag(SfnLog::schedule)) {
      std::stringstream ss;
      original->print(ss);
      sfn_log << ss.str() << "\n\n";
   }

   // TODO later it might be necessary to clone the shader
   // to be able to re-start scheduling

   auto scheduled_shader = original;

   BlockScheduler s(original->chip_class(), original->chip_family());

   s.run(scheduled_shader);
   s.finalize();

   sfn_log << SfnLog::schedule << "Scheduled shader\n";
   if (sfn_log.has_debug_flag(SfnLog::schedule)) {
      std::stringstream ss;
      scheduled_shader->print(ss);
      sfn_log << ss.str() << "\n\n";
   }

   return scheduled_shader;
}

BlockScheduler::BlockScheduler(r600_chip_class chip_class,
                               radeon_family chip_family):
    current_shed(sched_alu),
    m_last_pos(nullptr),
    m_last_pixel(nullptr),
    m_last_param(nullptr),
    m_current_block(nullptr),
    m_chip_class(chip_class),
    m_chip_family(chip_family)
{
   m_nop_after_rel_dest = chip_family == CHIP_RV770;

   m_nop_befor_rel_src = m_chip_class == ISA_CC_R600 &&
                         chip_family != CHIP_RV670 &&
                         chip_family != CHIP_RS780 &&
                         chip_family != CHIP_RS880;
}

void
BlockScheduler::run(Shader *shader)
{
   Shader::ShaderBlocks scheduled_blocks;

   for (auto& block : shader->func()) {
      sfn_log << SfnLog::schedule << "Process block " << block->id() << "\n";
      if (sfn_log.has_debug_flag(SfnLog::schedule)) {
         std::stringstream ss;
         block->print(ss);
         sfn_log << ss.str() << "\n";
      }
      schedule_block(*block, scheduled_blocks, shader->value_factory());
   }

   shader->reset_function(scheduled_blocks);
}

void
BlockScheduler::schedule_block(Block& in_block,
                               Shader::ShaderBlocks& out_blocks,
                               ValueFactory& vf)
{

   assert(in_block.id() >= 0);

   current_shed = sched_fetch;
   auto last_shed = sched_fetch;

   CollectInstructions cir(vf);
   in_block.accept(cir);

   bool have_instr = collect_ready(cir);

   m_current_block = new Block(in_block.nesting_depth(), m_next_block_id++);
   m_current_block->set_instr_flag(Instr::force_cf);
   assert(m_current_block->id() >= 0);

   while (have_instr) {

      sfn_log << SfnLog::schedule << "Have ready instructions\n";

      if (alu_vec_ready.size())
         sfn_log << SfnLog::schedule << " ALU V:" << alu_vec_ready.size() << "\n";

      if (alu_trans_ready.size())
         sfn_log << SfnLog::schedule << " ALU T:" << alu_trans_ready.size() << "\n";

      if (alu_groups_ready.size())
         sfn_log << SfnLog::schedule << " ALU G:" << alu_groups_ready.size() << "\n";

      if (exports_ready.size())
         sfn_log << SfnLog::schedule << " EXP:" << exports_ready.size() << "\n";
      if (tex_ready.size())
         sfn_log << SfnLog::schedule << " TEX:" << tex_ready.size() << "\n";
      if (fetches_ready.size())
         sfn_log << SfnLog::schedule << " FETCH:" << fetches_ready.size() << "\n";
      if (mem_ring_writes_ready.size())
         sfn_log << SfnLog::schedule << " MEM_RING:" << mem_ring_writes_ready.size()
                 << "\n";
      if (memops_ready.size())
         sfn_log << SfnLog::schedule << " MEM_OPS:" << memops_ready.size()
                 << "\n";

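      /* Outside of an active LDS group, and with no pending AR uses, prefer
       * switching to another clause type once enough instructions of that
       * kind are ready, so that fetch/mem/tex clauses can be filled in one
       * go instead of being broken up by ALU work. */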
      if (!m_current_block->lds_group_active() &&
          m_current_block->expected_ar_uses() == 0) {
         if (last_shed != sched_free && memops_ready.size() > 8)
            current_shed = sched_free;
         else if (mem_ring_writes_ready.size() > 15)
            current_shed = sched_mem_ring;
         else if (rat_instr_ready.size() > 3)
            current_shed = sched_rat;
         else if (tex_ready.size() > (m_chip_class >= ISA_CC_EVERGREEN ? 15 : 7))
            current_shed = sched_tex;
      }

      switch (current_shed) {
      case sched_alu:
         if (!schedule_alu(out_blocks)) {
            assert(!m_current_block->lds_group_active());
            current_shed = sched_tex;
            continue;
         }
         last_shed = current_shed;
         break;
      case sched_tex:
         if (tex_ready.empty() || !schedule_tex(out_blocks)) {
            current_shed = sched_fetch;
            continue;
         }
         last_shed = current_shed;
         break;
      case sched_fetch:
         if (!fetches_ready.empty()) {
            schedule_vtx(out_blocks);
            last_shed = current_shed;
         }
         current_shed = sched_gds;
         continue;
      case sched_gds:
         if (!gds_ready.empty()) {
            schedule_gds(out_blocks, gds_ready);
            last_shed = current_shed;
         }
         current_shed = sched_mem_ring;
         continue;
      case sched_mem_ring:
         if (mem_ring_writes_ready.empty() ||
             !schedule_cf(out_blocks, mem_ring_writes_ready)) {
            current_shed = sched_write_tf;
            continue;
         }
         last_shed = current_shed;
         break;
      case sched_write_tf:
         if (write_tf_ready.empty() || !schedule_gds(out_blocks, write_tf_ready)) {
            current_shed = sched_rat;
            continue;
         }
         last_shed = current_shed;
         break;
      case sched_rat:
         if (rat_instr_ready.empty() || !schedule_cf(out_blocks, rat_instr_ready)) {
            current_shed = sched_free;
            continue;
         }
         last_shed = current_shed;
         break;
      case sched_free:
         if (memops_ready.empty() || !schedule_cf(out_blocks, memops_ready)) {
            current_shed = sched_alu;
            break;
         }
         last_shed = current_shed;
      }

      have_instr = collect_ready(cir);
   }

   /* Emit exports always at end of a block */
   while (collect_ready_type(exports_ready, cir.exports))
      schedule_exports(out_blocks, exports_ready);

   ASSERTED bool fail = false;

   if (!cir.alu_groups.empty()) {
      std::cerr << "Unscheduled ALU groups:\n";
      for (auto& a : cir.alu_groups) {
         std::cerr << " " << *a << "\n";
      }
      fail = true;
   }

   if (!cir.alu_vec.empty()) {
      std::cerr << "Unscheduled ALU vec ops:\n";
      for (auto& a : cir.alu_vec) {
         std::cerr << " [" << a->block_id() << ":"
                   << a->index() << "]:" << *a << "\n";
         for (auto& d : a->required_instr())
            std::cerr << " R[" << d->block_id() << ":" << d->index() << "]:"
                      << *d << "\n";
      }
      fail = true;
   }

   if (!cir.alu_trans.empty()) {
      std::cerr << "Unscheduled ALU trans ops:\n";
      for (auto& a : cir.alu_trans) {
         std::cerr << " " << " [" << a->block_id() << ":"
                   << a->index() << "]:" << *a << "\n";
         for (auto& d : a->required_instr())
            std::cerr << " R:" << *d << "\n";
      }
      fail = true;
   }
   if (!cir.mem_write_instr.empty()) {
      std::cerr << "Unscheduled MEM ops:\n";
      for (auto& a : cir.mem_write_instr) {
         std::cerr << " " << *a << "\n";
      }
      fail = true;
   }

   if (!cir.fetches.empty()) {
      std::cerr << "Unscheduled Fetch ops:\n";
      for (auto& a : cir.fetches) {
         std::cerr << " " << *a << "\n";
      }
      fail = true;
   }

   if (!cir.tex.empty()) {
      std::cerr << "Unscheduled Tex ops:\n";
      for (auto& a : cir.tex) {
         std::cerr << " " << *a << "\n";
      }
      fail = true;
   }

   if (fail) {
      std::cerr << "Failing block:\n";
      for (auto& i : in_block)
         std::cerr << "[" << i->block_id() << ":" << i->index() << "] "
                   << (i->is_scheduled() ? "S " : "")
                   << *i << "\n";
      std::cerr << "\nSo far scheduled: ";

      for (auto i : *m_current_block)
         std::cerr << "[" << i->block_id() << ":" << i->index() << "] " << *i << "\n";
      std::cerr << "\n\n: ";
   }

   assert(cir.tex.empty());
   assert(cir.exports.empty());
   assert(cir.fetches.empty());
   assert(cir.alu_vec.empty());
   assert(cir.mem_write_instr.empty());
   assert(cir.mem_ring_writes.empty());

   assert(!fail);

   if (cir.m_cf_instr) {
      // Assert that the if condition is ready
      if (m_current_block->type() != Block::alu) {
         start_new_block(out_blocks, Block::alu);
      }
      m_current_block->push_back(cir.m_cf_instr);
      cir.m_cf_instr->set_scheduled();
   }

   if (m_current_block->type() == Block::alu)
      maybe_split_alu_block(out_blocks);
   else
      out_blocks.push_back(m_current_block);
}

void
BlockScheduler::finalize()
{
   if (m_last_pos)
      m_last_pos->set_is_last_export(true);
   if (m_last_pixel)
      m_last_pixel->set_is_last_export(true);
   if (m_last_param)
      m_last_param->set_is_last_export(true);
}

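/* Try to schedule ALU work: either emit an already formed ALU group or
 * assemble a new one from the ready vector and trans instruction lists,
 * honoring kcache limits, pending AR/IDX loads and relative array access
 * hazards. */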
bool
BlockScheduler::schedule_alu(Shader::ShaderBlocks& out_blocks)
{
   bool success = false;
   AluGroup *group = nullptr;

   sfn_log << SfnLog::schedule << "Schedule alu with "
           << m_current_block->expected_ar_uses()
           << " pending AR loads\n";

   bool has_alu_ready = !alu_vec_ready.empty() || !alu_trans_ready.empty();

   bool has_lds_ready =
      !alu_vec_ready.empty() && (*alu_vec_ready.begin())->has_lds_access();

   bool has_ar_read_ready = !alu_vec_ready.empty() &&
                            std::get<0>((*alu_vec_ready.begin())->indirect_addr());

   /* If we have ready ALU instructions we have to start a new ALU block */
   if (has_alu_ready || !alu_groups_ready.empty()) {
      if (m_current_block->type() != Block::alu) {
         start_new_block(out_blocks, Block::alu);
         m_alu_groups_scheduled = 0;
      }
   }

   /* Schedule groups first, unless we have a pending LDS instruction.
    * We don't want the LDS instructions to be too far apart because the
    * fetch + read from queue has to be in the same ALU CF block. */
   if (!alu_groups_ready.empty() && !has_lds_ready && !has_ar_read_ready) {
      group = *alu_groups_ready.begin();

      if (!check_array_reads(*group)) {

         sfn_log << SfnLog::schedule << "try schedule " << *group << "\n";

         /* Only start a new CF if we have no pending AR reads */
         if (m_current_block->try_reserve_kcache(*group)) {
            alu_groups_ready.erase(alu_groups_ready.begin());
            success = true;
         } else {
            if (m_current_block->expected_ar_uses() == 0) {
               start_new_block(out_blocks, Block::alu);

               if (!m_current_block->try_reserve_kcache(*group))
                  unreachable("Scheduling a group in a new block should always succeed");
               alu_groups_ready.erase(alu_groups_ready.begin());
               sfn_log << SfnLog::schedule << "Schedule ALU group\n";
               success = true;
            } else {
               sfn_log << SfnLog::schedule << "Don't add group because of "
                       << m_current_block->expected_ar_uses()
                       << " pending AR loads\n";
               group = nullptr;
            }
         }
      }
   }

   if (!group && has_alu_ready) {
      group = new AluGroup();
      sfn_log << SfnLog::schedule << "START new ALU group\n";
   } else if (!success) {
      return false;
   }

   assert(group);

   int free_slots = group->free_slots();

   while (free_slots && has_alu_ready) {
      if (!alu_vec_ready.empty())
         success |= schedule_alu_to_group_vec(group);

      /* Apparently one can't schedule a t-slot if there is already
       * an LDS instruction scheduled.
       * TODO: check whether this is only relevant for actual LDS instructions
       * or also for instructions that read from the LDS return value queue */

      if (free_slots & 0x10 && !has_lds_ready) {
         sfn_log << SfnLog::schedule << "Try schedule TRANS channel\n";
         if (!alu_trans_ready.empty())
            success |= schedule_alu_to_group_trans(group, alu_trans_ready);
         if (!alu_vec_ready.empty())
            success |= schedule_alu_to_group_trans(group, alu_vec_ready);
      }

      if (success) {
         ++m_alu_groups_scheduled;
         break;
      } else if (m_current_block->kcache_reservation_failed()) {
         // LDS read groups should not lead to impossible
         // kcache constellations
         assert(!m_current_block->lds_group_active());

         // AR is loaded but not all uses are done, we don't want
         // to start a new CF here
         assert(m_current_block->expected_ar_uses() == 0);

         // kcache reservation failed, so we have to start a new CF
         start_new_block(out_blocks, Block::alu);
      } else {
         // Ready is not empty, but we didn't schedule anything; this
         // means we had an indirect array read or write conflict that we
         // can resolve with an extra group that has a NOP instruction
         if (!alu_trans_ready.empty() || !alu_vec_ready.empty()) {
            group->add_vec_instructions(new AluInstr(op0_nop, 0));
            break;
         } else {
            return false;
         }
      }
   }

   sfn_log << SfnLog::schedule << "Finalize ALU group\n";
   group->set_scheduled();
   group->fix_last_flag();
   group->set_nesting_depth(m_current_block->nesting_depth());

   auto [addr, is_index] = group->addr();
   if (is_index) {
      if (addr->sel() == AddressRegister::idx0 && m_idx0_pending) {
         assert(!group->has_lds_group_start());
         assert(m_current_block->expected_ar_uses() == 0);
         start_new_block(out_blocks, Block::alu);
         m_current_block->try_reserve_kcache(*group);
      }
      if (addr->sel() == AddressRegister::idx1 && m_idx1_pending) {
         assert(!group->has_lds_group_start());
         assert(m_current_block->expected_ar_uses() == 0);
         start_new_block(out_blocks, Block::alu);
         m_current_block->try_reserve_kcache(*group);
      }
   }

   m_current_block->push_back(group);

   update_array_writes(*group);

   m_idx0_pending |= m_idx0_loading;
   m_idx0_loading = false;

   m_idx1_pending |= m_idx1_loading;
   m_idx1_loading = false;

   if (!m_current_block->lds_group_active() &&
       m_current_block->expected_ar_uses() == 0 &&
       (!addr || is_index)) {
      group->set_instr_flag(Instr::no_lds_or_addr_group);
   }

   if (group->has_lds_group_start())
      m_current_block->lds_group_start(*group->begin());

   if (group->has_lds_group_end())
      m_current_block->lds_group_end();

   if (group->has_kill_op()) {
      assert(!group->has_lds_group_start());
      assert(m_current_block->expected_ar_uses() == 0);
      start_new_block(out_blocks, Block::alu);
   }

   return success;
}

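/* Schedule one TEX instruction together with the ALU instructions that
 * prepare its arguments; a new TEX clause is started when the current
 * block has a different type or runs out of slots. */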
bool
BlockScheduler::schedule_tex(Shader::ShaderBlocks& out_blocks)
{
   if (m_current_block->type() != Block::tex || m_current_block->remaining_slots() == 0) {
      start_new_block(out_blocks, Block::tex);
      m_current_block->set_instr_flag(Instr::force_cf);
   }

   if (!tex_ready.empty() && m_current_block->remaining_slots() > 0) {
      auto ii = tex_ready.begin();
      sfn_log << SfnLog::schedule << "Schedule: " << **ii << "\n";

      if ((unsigned)m_current_block->remaining_slots() < 1 + (*ii)->prepare_instr().size())
         start_new_block(out_blocks, Block::tex);

      for (auto prep : (*ii)->prepare_instr()) {
         prep->set_scheduled();
         m_current_block->push_back(prep);
      }

      (*ii)->set_scheduled();
      m_current_block->push_back(*ii);
      tex_ready.erase(ii);
      return true;
   }
   return false;
}

bool
BlockScheduler::schedule_vtx(Shader::ShaderBlocks& out_blocks)
{
   if (m_current_block->type() != Block::vtx || m_current_block->remaining_slots() == 0) {
      start_new_block(out_blocks, Block::vtx);
      m_current_block->set_instr_flag(Instr::force_cf);
   }
   return schedule_block(fetches_ready);
}

template <typename I>
bool
BlockScheduler::schedule_gds(Shader::ShaderBlocks& out_blocks, std::list<I *>& ready_list)
{
   bool was_full = m_current_block->remaining_slots() == 0;
   if (m_current_block->type() != Block::gds || was_full) {
      start_new_block(out_blocks, Block::gds);
      if (was_full)
         m_current_block->set_instr_flag(Instr::force_cf);
   }
   return schedule_block(ready_list);
}

void
BlockScheduler::start_new_block(Shader::ShaderBlocks& out_blocks, Block::Type type)
{
   if (!m_current_block->empty()) {
      sfn_log << SfnLog::schedule << "Start new block\n";
      assert(!m_current_block->lds_group_active());

      if (m_current_block->type() != Block::alu)
         out_blocks.push_back(m_current_block);
      else
         maybe_split_alu_block(out_blocks);
      m_current_block = new Block(m_current_block->nesting_depth(), m_next_block_id++);
      m_current_block->set_instr_flag(Instr::force_cf);
      m_idx0_pending = m_idx1_pending = false;
   }
   m_current_block->set_type(type, m_chip_class);
}

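/* An ALU clause can only hold a limited number of slots (the check below
 * uses 128), so a fully used ALU block may have to be split: first mark
 * the groups where a new clause may start, then emit sub-blocks at these
 * marks, making sure an LDS read group never crosses a split. */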
void BlockScheduler::maybe_split_alu_block(Shader::ShaderBlocks& out_blocks)
{
   // TODO: needs fixing
   if (m_current_block->remaining_slots() > 0) {
      out_blocks.push_back(m_current_block);
      return;
   }

   int used_slots = 0;
   int pending_slots = 0;

   Instr *next_block_start = nullptr;
   for (auto cur_group : *m_current_block) {
      /* This limit is a bit fishy, it should be 128 */
      if (used_slots + pending_slots + cur_group->slots() < 128) {
         if (cur_group->can_start_alu_block()) {
            next_block_start = cur_group;
            used_slots += pending_slots;
            pending_slots = cur_group->slots();
         } else {
            pending_slots += cur_group->slots();
         }
      } else {
         assert(next_block_start);
         next_block_start->set_instr_flag(Instr::force_cf);
         used_slots = pending_slots;
         pending_slots = cur_group->slots();
      }
   }

   Block *sub_block = new Block(m_current_block->nesting_depth(),
                                m_next_block_id++);
   sub_block->set_type(Block::alu, m_chip_class);
   sub_block->set_instr_flag(Instr::force_cf);

   for (auto instr : *m_current_block) {
      auto group = instr->as_alu_group();
      if (!group) {
         sub_block->push_back(instr);
         continue;
      }

      if (group->group_force_alu_cf()) {
         assert(!sub_block->lds_group_active());
         out_blocks.push_back(sub_block);
         sub_block = new Block(m_current_block->nesting_depth(),
                               m_next_block_id++);
         sub_block->set_type(Block::alu, m_chip_class);
         sub_block->set_instr_flag(Instr::force_cf);
      }
      sub_block->push_back(group);
      if (group->has_lds_group_start())
         sub_block->lds_group_start(*group->begin());

      if (group->has_lds_group_end())
         sub_block->lds_group_end();
   }
   if (!sub_block->empty())
      out_blocks.push_back(sub_block);
}

template <typename I>
bool
BlockScheduler::schedule_cf(Shader::ShaderBlocks& out_blocks, std::list<I *>& ready_list)
{
   if (ready_list.empty())
      return false;
   if (m_current_block->type() != Block::cf)
      start_new_block(out_blocks, Block::cf);
   return schedule(ready_list);
}

bool
BlockScheduler::schedule_alu_to_group_vec(AluGroup *group)
{
   assert(group);
   assert(!alu_vec_ready.empty());

   bool success = false;
   auto i = alu_vec_ready.begin();
   auto e = alu_vec_ready.end();
   while (i != e) {
      sfn_log << SfnLog::schedule << "Try schedule to vec " << **i;

      if (check_array_reads(**i)) {
         ++i;
         continue;
      }

      // precaution: don't kill while we have LDS queue reads in the pipeline
      if ((*i)->is_kill() && m_current_block->lds_group_active()) {
         ++i;
         continue;
      }

      if (!m_current_block->try_reserve_kcache(**i)) {
         sfn_log << SfnLog::schedule << " failed (kcache)\n";
         ++i;
         continue;
      }

      if (group->add_vec_instructions(*i)) {
         auto old_i = i;
         ++i;
         if ((*old_i)->has_alu_flag(alu_is_lds)) {
            --m_lds_addr_count;
         }

         if ((*old_i)->num_ar_uses())
            m_current_block->set_expected_ar_uses((*old_i)->num_ar_uses());
         auto addr = std::get<0>((*old_i)->indirect_addr());
         bool has_indirect_reg_load = addr != nullptr && addr->has_flag(Register::addr_or_idx);

         bool is_idx_load_on_eg = false;
         if (!(*old_i)->has_alu_flag(alu_is_lds)) {
            bool load_idx0_eg = (*old_i)->opcode() == op1_set_cf_idx0;
            bool load_idx0_ca = ((*old_i)->opcode() == op1_mova_int &&
                                 (*old_i)->dest()->sel() == AddressRegister::idx0);

            bool load_idx1_eg = (*old_i)->opcode() == op1_set_cf_idx1;
            bool load_idx1_ca = ((*old_i)->opcode() == op1_mova_int &&
                                 (*old_i)->dest()->sel() == AddressRegister::idx1);

            is_idx_load_on_eg = load_idx0_eg || load_idx1_eg;

            bool load_idx0 = load_idx0_eg || load_idx0_ca;
            bool load_idx1 = load_idx1_eg || load_idx1_ca;

            assert(!m_idx0_pending || !load_idx0);
            assert(!m_idx1_pending || !load_idx1);

            m_idx0_loading |= load_idx0;
            m_idx1_loading |= load_idx1;
         }

         if (has_indirect_reg_load || is_idx_load_on_eg)
            m_current_block->dec_expected_ar_uses();

         alu_vec_ready.erase(old_i);
         success = true;
         sfn_log << SfnLog::schedule << " success\n";
      } else {
         ++i;
         sfn_log << SfnLog::schedule << " failed\n";
      }
   }
   return success;
}

bool
BlockScheduler::schedule_alu_to_group_trans(AluGroup *group,
                                            std::list<AluInstr *>& readylist)
{
   assert(group);

   bool success = false;
   auto i = readylist.begin();
   auto e = readylist.end();
   while (i != e) {

      if (check_array_reads(**i)) {
         ++i;
         continue;
      }

      sfn_log << SfnLog::schedule << "Try schedule to trans " << **i;
      if (!m_current_block->try_reserve_kcache(**i)) {
         sfn_log << SfnLog::schedule << " failed (kcache)\n";
         ++i;
         continue;
      }

      if (group->add_trans_instructions(*i)) {
         auto old_i = i;
         ++i;
         auto addr = std::get<0>((*old_i)->indirect_addr());
         if (addr && addr->has_flag(Register::addr_or_idx))
            m_current_block->dec_expected_ar_uses();

         readylist.erase(old_i);
         success = true;
         sfn_log << SfnLog::schedule << " success\n";
         break;
      } else {
         ++i;
         sfn_log << SfnLog::schedule << " failed\n";
      }
   }
   return success;
}

template <typename I>
bool
BlockScheduler::schedule(std::list<I *>& ready_list)
{
   if (!ready_list.empty() && m_current_block->remaining_slots() > 0) {
      auto ii = ready_list.begin();
      sfn_log << SfnLog::schedule << "Schedule: " << **ii << "\n";
      (*ii)->set_scheduled();
      m_current_block->push_back(*ii);
      ready_list.erase(ii);
      return true;
   }
   return false;
}

template <typename I>
bool
BlockScheduler::schedule_block(std::list<I *>& ready_list)
{
   bool success = false;
   while (!ready_list.empty() && m_current_block->remaining_slots() > 0) {
      auto ii = ready_list.begin();
      sfn_log << SfnLog::schedule << "Schedule: " << **ii << " "
              << m_current_block->remaining_slots() << "\n";
      (*ii)->set_scheduled();
      m_current_block->push_back(*ii);
      ready_list.erase(ii);
      success = true;
   }
   return success;
}

bool
BlockScheduler::schedule_exports(Shader::ShaderBlocks& out_blocks,
                                 std::list<ExportInstr *>& ready_list)
{
   if (m_current_block->type() != Block::cf)
      start_new_block(out_blocks, Block::cf);

   if (!ready_list.empty()) {
      auto ii = ready_list.begin();
      sfn_log << SfnLog::schedule << "Schedule: " << **ii << "\n";
      (*ii)->set_scheduled();
      m_current_block->push_back(*ii);
      switch ((*ii)->export_type()) {
      case ExportInstr::pos:
         m_last_pos = *ii;
         break;
      case ExportInstr::param:
         m_last_param = *ii;
         break;
      case ExportInstr::pixel:
         m_last_pixel = *ii;
         break;
      }
      (*ii)->set_is_last_export(false);
      ready_list.erase(ii);
      return true;
   }
   return false;
}

bool
BlockScheduler::collect_ready(CollectInstructions& available)
{
   sfn_log << SfnLog::schedule << "Ready instructions\n";
   bool result = false;
   result |= collect_ready_alu_vec(alu_vec_ready, available.alu_vec);
   result |= collect_ready_type(alu_trans_ready, available.alu_trans);
   result |= collect_ready_type(alu_groups_ready, available.alu_groups);
   result |= collect_ready_type(gds_ready, available.gds_op);
   result |= collect_ready_type(tex_ready, available.tex);
   result |= collect_ready_type(fetches_ready, available.fetches);
   result |= collect_ready_type(memops_ready, available.mem_write_instr);
   result |= collect_ready_type(mem_ring_writes_ready, available.mem_ring_writes);
   result |= collect_ready_type(write_tf_ready, available.write_tf);
   result |= collect_ready_type(rat_instr_ready, available.rat_instr);

   sfn_log << SfnLog::schedule << "\n";
   return result;
}

bool
BlockScheduler::collect_ready_alu_vec(std::list<AluInstr *>& ready,
                                      std::list<AluInstr *>& available)
{
   auto i = available.begin();
   auto e = available.end();

   for (auto alu : ready) {
      alu->add_priority(100 * alu->register_priority());
   }

   int max_check = 0;
   while (i != e && max_check++ < 64) {
      if (ready.size() < 64 && (*i)->ready()) {

         int priority = 0;
         /* LDS fetches that use static offsets are usually ready very fast,
          * so they would get scheduled early, and this leaves the
          * problem that we allocate too many registers with just constant
          * values, and this will cause problems with RA. So limit the number
          * of LDS address registers.
          */
         if ((*i)->has_alu_flag(alu_lds_address)) {
            if (m_lds_addr_count > 64) {
               ++i;
               continue;
            } else {
               ++m_lds_addr_count;
            }
         }

         /* LDS instructions are scheduled with high priority.
          * Instructions that can go into the t slot and don't have
          * indirect access are put in last, so that they don't block
          * vec-only instructions when scheduling to the vector slots.
          * For everything else we look at the register use. */

         auto [addr, dummy1, dummy2] = (*i)->indirect_addr();

         if ((*i)->has_lds_access()) {
            priority = 100000;
            if ((*i)->has_alu_flag(alu_is_lds))
               priority += 100000;
         } else if (addr) {
            priority = 10000;
         } else if (AluGroup::has_t()) {
            auto opinfo = alu_ops.find((*i)->opcode());
            assert(opinfo != alu_ops.end());
            if (opinfo->second.can_channel(AluOp::t, m_chip_class))
               priority = -1;
         }

         priority += 100 * (*i)->register_priority();

         (*i)->add_priority(priority);
         ready.push_back(*i);

         auto old_i = i;
         ++i;
         available.erase(old_i);
      } else
         ++i;
   }

   for (auto& i : ready)
      sfn_log << SfnLog::schedule << "V: " << *i << "\n";

   ready.sort([](const AluInstr *lhs, const AluInstr *rhs) {
      return lhs->priority() > rhs->priority();
   });

   for (auto& i : ready)
      sfn_log << SfnLog::schedule << "V (S): " << i->priority() << " " << *i << "\n";

   return !ready.empty();
}

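/* One-character tags used to identify the instruction type in the debug
 * output of collect_ready_type(). */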
template <typename T> struct type_char {
};

template <> struct type_char<AluInstr> {
   static char value() { return 'A'; }
};

template <> struct type_char<AluGroup> {
   static char value() { return 'G'; }
};

template <> struct type_char<ExportInstr> {
   static char value() { return 'E'; }
};

template <> struct type_char<TexInstr> {
   static char value() { return 'T'; }
};

template <> struct type_char<FetchInstr> {
   static char value() { return 'F'; }
};

template <> struct type_char<WriteOutInstr> {
   static char value() { return 'M'; }
};

template <> struct type_char<MemRingOutInstr> {
   static char value() { return 'R'; }
};

template <> struct type_char<WriteTFInstr> {
   static char value() { return 'X'; }
};

template <> struct type_char<GDSInstr> {
   static char value() { return 'S'; }
};

template <> struct type_char<RatInstr> {
   static char value() { return 'I'; }
};

template <typename T>
bool
BlockScheduler::collect_ready_type(std::list<T *>& ready, std::list<T *>& available)
{
   auto i = available.begin();
   auto e = available.end();

   int lookahead = 16;
   while (i != e && ready.size() < 16 && lookahead-- > 0) {
      if ((*i)->ready()) {
         ready.push_back(*i);
         auto old_i = i;
         ++i;
         available.erase(old_i);
      } else
         ++i;
   }

   for (auto& i : ready)
      sfn_log << SfnLog::schedule << type_char<T>::value() << "; " << *i << "\n";

   return !ready.empty();
}

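/* Register visitors used to track writes to and reads from array (indexed)
 * registers. On chips that need a NOP between a relative access and a
 * following conflicting access (m_nop_after_rel_dest / m_nop_befor_rel_src)
 * the scheduler uses them to decide when an extra NOP group is required. */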
class CheckArrayAccessVisitor : public ConstRegisterVisitor {
public:
   using ConstRegisterVisitor::visit;
   void visit(const Register& value) override { (void)value; }
   void visit(const LocalArray& value) override { (void)value; }
   void visit(const UniformValue& value) override { (void)value; }
   void visit(const LiteralConstant& value) override { (void)value; }
   void visit(const InlineConstant& value) override { (void)value; }
};

class UpdateArrayWrite : public CheckArrayAccessVisitor {
public:
   UpdateArrayWrite(ArrayCheckSet& indirect_arrays,
                    ArrayCheckSet& direct_arrays,
                    bool tdw):
       last_indirect_array_write(indirect_arrays),
       last_direct_array_write(direct_arrays),
       track_direct_writes(tdw)
   {
   }

   void visit(const LocalArrayValue& value) override {
      int array_base = value.array().base_sel();
      auto entry = std::make_pair(array_base, value.chan());
      if (value.addr())
         last_indirect_array_write.insert(entry);
      else if (track_direct_writes)
         last_direct_array_write.insert(entry);
   }

private:
   ArrayCheckSet& last_indirect_array_write;
   ArrayCheckSet& last_direct_array_write;
   bool track_direct_writes {false};
};

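/* Record which array elements the group writes; the sets are only
 * maintained when one of the NOP workarounds is enabled. */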
void BlockScheduler::update_array_writes(const AluGroup& group)
{
   if (m_nop_after_rel_dest || m_nop_befor_rel_src) {
      m_last_direct_array_write.clear();
      m_last_indirect_array_write.clear();

      UpdateArrayWrite visitor(m_last_indirect_array_write,
                               m_last_direct_array_write,
                               m_nop_befor_rel_src);

      for (auto alu : group) {
         if (alu && alu->dest())
            alu->dest()->accept(visitor);
      }
   }
}

class CheckArrayRead : public CheckArrayAccessVisitor {
public:
   CheckArrayRead(const ArrayCheckSet& indirect_arrays,
                  const ArrayCheckSet& direct_arrays):
       last_indirect_array_write(indirect_arrays),
       last_direct_array_write(direct_arrays)
   {
   }

   void visit(const LocalArrayValue& value) override {
      int array_base = value.array().base_sel();
      auto entry = std::make_pair(array_base, value.chan());

      if (last_indirect_array_write.find(entry) !=
          last_indirect_array_write.end())
         need_extra_group = true;

      if (value.addr() && last_direct_array_write.find(entry) !=
          last_direct_array_write.end()) {
         need_extra_group = true;
      }
   }

   const ArrayCheckSet& last_indirect_array_write;
   const ArrayCheckSet& last_direct_array_write;
   bool need_extra_group {false};
};

bool BlockScheduler::check_array_reads(const AluInstr& instr)
{
   if (m_nop_after_rel_dest || m_nop_befor_rel_src) {

      CheckArrayRead visitor(m_last_indirect_array_write,
                             m_last_direct_array_write);

      for (auto& s : instr.sources()) {
         s->accept(visitor);
      }
      return visitor.need_extra_group;
   }
   return false;
}

bool BlockScheduler::check_array_reads(const AluGroup& group)
{
   if (m_nop_after_rel_dest || m_nop_befor_rel_src) {

      CheckArrayRead visitor(m_last_indirect_array_write,
                             m_last_direct_array_write);

      for (auto alu : group) {
         if (!alu)
            continue;
         for (auto& s : alu->sources()) {
            s->accept(visitor);
         }
      }
      return visitor.need_extra_group;
   }
   return false;
}

} // namespace r600