/* -*- mesa-c++  -*-
 *
 * Copyright (c) 2022 Collabora LTD
 *
 * Author: Gert Wollny <gert.wollny@collabora.com>
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * on the rights to use, copy, modify, merge, publish, distribute, sub
 * license, and/or sell copies of the Software, and to permit persons to whom
 * the Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHOR(S) AND/OR THEIR SUPPLIERS BE LIABLE FOR ANY CLAIM,
 * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR
 * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE
 * USE OR OTHER DEALINGS IN THE SOFTWARE.
 */

#include "sfn_scheduler.h"

#include "../r600_isa.h"

#include "amd_family.h"
#include "sfn_alu_defines.h"
#include "sfn_debug.h"
#include "sfn_instr_alugroup.h"
#include "sfn_instr_controlflow.h"
#include "sfn_instr_export.h"
#include "sfn_instr_fetch.h"
#include "sfn_instr_lds.h"
#include "sfn_instr_mem.h"
#include "sfn_instr_tex.h"

#include <algorithm>
#include <list>
#include <sstream>
#include <unordered_set>
#include <utility>
#include <vector>

namespace r600 {

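/* Visitor that sorts the instructions of a block into per-type work lists
 * for the scheduler. Multi-slot ALU instructions are split into groups right
 * away; LDS reads and atomics are split into the LDS operation itself plus
 * the ALU instructions that fetch the results from the LDS return queue. */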
class CollectInstructions : public InstrVisitor {

public:
   CollectInstructions(ValueFactory& vf):
       m_value_factory(vf)
   {
   }

   void visit(AluInstr *instr) override
   {
      if (instr->has_alu_flag(alu_is_trans))
         alu_trans.push_back(instr);
      else {
         if (instr->alu_slots() == 1)
            alu_vec.push_back(instr);
         else
            alu_groups.push_back(instr->split(m_value_factory));
      }
   }
   void visit(AluGroup *instr) override { alu_groups.push_back(instr); }
   void visit(TexInstr *instr) override { tex.push_back(instr); }
   void visit(ExportInstr *instr) override { exports.push_back(instr); }
   void visit(FetchInstr *instr) override { fetches.push_back(instr); }
   void visit(Block *instr) override
   {
      for (auto& i : *instr)
         i->accept(*this);
   }

   void visit(ControlFlowInstr *instr) override
   {
      assert(!m_cf_instr);
      m_cf_instr = instr;
   }

   void visit(IfInstr *instr) override
   {
      assert(!m_cf_instr);
      m_cf_instr = instr;
   }

   void visit(EmitVertexInstr *instr) override
   {
      assert(!m_cf_instr);
      m_cf_instr = instr;
   }

   void visit(ScratchIOInstr *instr) override { mem_write_instr.push_back(instr); }

   void visit(StreamOutInstr *instr) override { mem_write_instr.push_back(instr); }

   void visit(MemRingOutInstr *instr) override { mem_ring_writes.push_back(instr); }

   void visit(GDSInstr *instr) override { gds_op.push_back(instr); }

   void visit(WriteTFInstr *instr) override { write_tf.push_back(instr); }

   void visit(LDSReadInstr *instr) override
   {
      std::vector<AluInstr *> buffer;
      m_last_lds_instr = instr->split(buffer, m_last_lds_instr);
      for (auto& i : buffer) {
         i->accept(*this);
      }
   }

   void visit(LDSAtomicInstr *instr) override
   {
      std::vector<AluInstr *> buffer;
      m_last_lds_instr = instr->split(buffer, m_last_lds_instr);
      for (auto& i : buffer) {
         i->accept(*this);
      }
   }

   void visit(RatInstr *instr) override { rat_instr.push_back(instr); }

   std::list<AluInstr *> alu_trans;
   std::list<AluInstr *> alu_vec;
   std::list<TexInstr *> tex;
   std::list<AluGroup *> alu_groups;
   std::list<ExportInstr *> exports;
   std::list<FetchInstr *> fetches;
   std::list<WriteOutInstr *> mem_write_instr;
   std::list<MemRingOutInstr *> mem_ring_writes;
   std::list<GDSInstr *> gds_op;
   std::list<WriteTFInstr *> write_tf;
   std::list<RatInstr *> rat_instr;

   Instr *m_cf_instr{nullptr};
   ValueFactory& m_value_factory;

   AluInstr *m_last_lds_instr{nullptr};
};

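/* Hash for the (array base sel, channel) pairs used to track array accesses.
 * Shifting the base selector left by three leaves the low bits free for the
 * channel, so the two fields cannot collide. */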
struct ArrayChanHash
{
    std::size_t operator()(std::pair<int, int> const& s) const noexcept
    {
       return std::hash<size_t>{}((size_t(s.first) << 3) | s.second);
    }
};

using ArrayCheckSet = std::unordered_set<std::pair<int, int>, ArrayChanHash>;

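/* Schedules the instructions of one block into a chain of CF blocks
 * (ALU, TEX, VTX, GDS, ...). Besides filling and draining the per-type
 * ready lists, this tracks kcache reservations, active LDS groups, pending
 * AR and IDX register loads, and the chip-specific NOP requirements around
 * indirect array access. */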
class BlockScheduler {
public:
   BlockScheduler(r600_chip_class chip_class,
                  radeon_family family);

   void run(Shader *shader);

   void finalize();

private:
   void
   schedule_block(Block& in_block, Shader::ShaderBlocks& out_blocks, ValueFactory& vf);

   bool collect_ready(CollectInstructions& available);

   template <typename T>
   bool collect_ready_type(std::list<T *>& ready, std::list<T *>& orig);

   bool collect_ready_alu_vec(std::list<AluInstr *>& ready,
                              std::list<AluInstr *>& available);

   bool schedule_tex(Shader::ShaderBlocks& out_blocks);
   bool schedule_vtx(Shader::ShaderBlocks& out_blocks);

   template <typename I>
   bool schedule_gds(Shader::ShaderBlocks& out_blocks, std::list<I *>& ready_list);

   template <typename I>
   bool schedule_cf(Shader::ShaderBlocks& out_blocks, std::list<I *>& ready_list);

   bool schedule_alu(Shader::ShaderBlocks& out_blocks);
   void start_new_block(Shader::ShaderBlocks& out_blocks, Block::Type type);

   bool schedule_alu_to_group_vec(AluGroup *group);
   bool schedule_alu_to_group_trans(AluGroup *group, std::list<AluInstr *>& readylist);

   bool schedule_exports(Shader::ShaderBlocks& out_blocks,
                         std::list<ExportInstr *>& ready_list);

   void maybe_split_alu_block(Shader::ShaderBlocks& out_blocks);

   template <typename I> bool schedule(std::list<I *>& ready_list);

   template <typename I> bool schedule_block(std::list<I *>& ready_list);

   void update_array_writes(const AluGroup& group);
   bool check_array_reads(const AluInstr& instr);
   bool check_array_reads(const AluGroup& group);

   std::list<AluInstr *> alu_vec_ready;
   std::list<AluInstr *> alu_trans_ready;
   std::list<AluGroup *> alu_groups_ready;
   std::list<TexInstr *> tex_ready;
   std::list<ExportInstr *> exports_ready;
   std::list<FetchInstr *> fetches_ready;
   std::list<WriteOutInstr *> memops_ready;
   std::list<MemRingOutInstr *> mem_ring_writes_ready;
   std::list<GDSInstr *> gds_ready;
   std::list<WriteTFInstr *> write_tf_ready;
   std::list<RatInstr *> rat_instr_ready;

   enum {
      sched_alu,
      sched_tex,
      sched_fetch,
      sched_free,
      sched_mem_ring,
      sched_gds,
      sched_write_tf,
      sched_rat,
   } current_shed;

   ExportInstr *m_last_pos;
   ExportInstr *m_last_pixel;
   ExportInstr *m_last_param;

   Block *m_current_block;

   int m_lds_addr_count{0};
   int m_alu_groups_scheduled{0};
   r600_chip_class m_chip_class;
   radeon_family m_chip_family;
   bool m_idx0_loading{false};
   bool m_idx1_loading{false};
   bool m_idx0_pending{false};
   bool m_idx1_pending{false};

   bool m_nop_after_rel_dest{false};
   bool m_nop_befor_rel_src{false};
   uint32_t m_next_block_id{1};

   ArrayCheckSet m_last_indirect_array_write;
   ArrayCheckSet m_last_direct_array_write;
};

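/* Scheduler entry point: runs the block scheduler over all blocks of the
 * shader and returns the scheduled shader (currently modified in place). */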
Shader *
schedule(Shader *original)
{
   Block::set_chipclass(original->chip_class());
   AluGroup::set_chipclass(original->chip_class());

   sfn_log << SfnLog::schedule << "Original shader\n";
   if (sfn_log.has_debug_flag(SfnLog::schedule)) {
      std::stringstream ss;
      original->print(ss);
      sfn_log << ss.str() << "\n\n";
   }

   // TODO later it might be necessary to clone the shader
   // to be able to re-start scheduling

   auto scheduled_shader = original;

   BlockScheduler s(original->chip_class(), original->chip_family());

   s.run(scheduled_shader);
   s.finalize();

   sfn_log << SfnLog::schedule << "Scheduled shader\n";
   if (sfn_log.has_debug_flag(SfnLog::schedule)) {
      std::stringstream ss;
      scheduled_shader->print(ss);
      sfn_log << ss.str() << "\n\n";
   }

   return scheduled_shader;
}

BlockScheduler::BlockScheduler(r600_chip_class chip_class,
                               radeon_family chip_family):
    current_shed(sched_alu),
    m_last_pos(nullptr),
    m_last_pixel(nullptr),
    m_last_param(nullptr),
    m_current_block(nullptr),
    m_chip_class(chip_class),
    m_chip_family(chip_family)
{
   m_nop_after_rel_dest = chip_family == CHIP_RV770;

   m_nop_befor_rel_src = m_chip_class == ISA_CC_R600 &&
                         chip_family != CHIP_RV670 &&
                         chip_family != CHIP_RS780 &&
                         chip_family != CHIP_RS880;
}
298 
299 void
run(Shader * shader)300 BlockScheduler::run(Shader *shader)
301 {
302    Shader::ShaderBlocks scheduled_blocks;
303 
304    for (auto& block : shader->func()) {
305       sfn_log << SfnLog::schedule << "Process block " << block->id() << "\n";
306       if (sfn_log.has_debug_flag(SfnLog::schedule)) {
307          std::stringstream ss;
308          block->print(ss);
309          sfn_log << ss.str() << "\n";
310       }
311       schedule_block(*block, scheduled_blocks, shader->value_factory());
312    }
313 
314    shader->reset_function(scheduled_blocks);
315 }
316 

void
BlockScheduler::schedule_block(Block& in_block,
                               Shader::ShaderBlocks& out_blocks,
                               ValueFactory& vf)
{
   assert(in_block.id() >= 0);

   current_shed = sched_fetch;
   auto last_shed = sched_fetch;

   CollectInstructions cir(vf);
   in_block.accept(cir);

   bool have_instr = collect_ready(cir);

   m_current_block = new Block(in_block.nesting_depth(), m_next_block_id++);
   m_current_block->set_instr_flag(Instr::force_cf);
   assert(m_current_block->id() >= 0);

   while (have_instr) {

      sfn_log << SfnLog::schedule << "Have ready instructions\n";

      if (alu_vec_ready.size())
         sfn_log << SfnLog::schedule << "  ALU V:" << alu_vec_ready.size() << "\n";

      if (alu_trans_ready.size())
         sfn_log << SfnLog::schedule << "  ALU T:" << alu_trans_ready.size() << "\n";

      if (alu_groups_ready.size())
         sfn_log << SfnLog::schedule << "  ALU G:" << alu_groups_ready.size() << "\n";

      if (exports_ready.size())
         sfn_log << SfnLog::schedule << "  EXP:" << exports_ready.size() << "\n";
      if (tex_ready.size())
         sfn_log << SfnLog::schedule << "  TEX:" << tex_ready.size() << "\n";
      if (fetches_ready.size())
         sfn_log << SfnLog::schedule << "  FETCH:" << fetches_ready.size() << "\n";
      if (mem_ring_writes_ready.size())
         sfn_log << SfnLog::schedule << "  MEM_RING:" << mem_ring_writes_ready.size()
                 << "\n";
      if (memops_ready.size())
         sfn_log << SfnLog::schedule << "  MEM_OPS:" << memops_ready.size()
                 << "\n";

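      /* Heuristic block-type switch: once enough memory, ring, RAT, or TEX
       * requests have piled up, flush them first so the corresponding clause
       * is reasonably full; otherwise stay with the current scheduling
       * target. The thresholds below are tuning values, not hard hardware
       * limits, though the TEX threshold likely reflects the smaller fetch
       * clause on pre-Evergreen chips. */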
      if (!m_current_block->lds_group_active() &&
          m_current_block->expected_ar_uses() == 0) {
         if (last_shed != sched_free && memops_ready.size() > 8)
            current_shed = sched_free;
         else if (mem_ring_writes_ready.size() > 15)
            current_shed = sched_mem_ring;
         else if (rat_instr_ready.size() > 3)
            current_shed = sched_rat;
         else if (tex_ready.size() > (m_chip_class >= ISA_CC_EVERGREEN ? 15 : 7))
            current_shed = sched_tex;
      }

      switch (current_shed) {
      case sched_alu:
         if (!schedule_alu(out_blocks)) {
            assert(!m_current_block->lds_group_active());
            current_shed = sched_tex;
            continue;
         }
         last_shed = current_shed;
         break;
      case sched_tex:
         if (tex_ready.empty() || !schedule_tex(out_blocks)) {
            current_shed = sched_fetch;
            continue;
         }
         last_shed = current_shed;
         break;
      case sched_fetch:
         if (!fetches_ready.empty()) {
            schedule_vtx(out_blocks);
            last_shed = current_shed;
         }
         current_shed = sched_gds;
         continue;
      case sched_gds:
         if (!gds_ready.empty()) {
            schedule_gds(out_blocks, gds_ready);
            last_shed = current_shed;
         }
         current_shed = sched_mem_ring;
         continue;
      case sched_mem_ring:
         if (mem_ring_writes_ready.empty() ||
             !schedule_cf(out_blocks, mem_ring_writes_ready)) {
            current_shed = sched_write_tf;
            continue;
         }
         last_shed = current_shed;
         break;
      case sched_write_tf:
         if (write_tf_ready.empty() || !schedule_gds(out_blocks, write_tf_ready)) {
            current_shed = sched_rat;
            continue;
         }
         last_shed = current_shed;
         break;
      case sched_rat:
         if (rat_instr_ready.empty() || !schedule_cf(out_blocks, rat_instr_ready)) {
            current_shed = sched_free;
            continue;
         }
         last_shed = current_shed;
         break;
      case sched_free:
         if (memops_ready.empty() || !schedule_cf(out_blocks, memops_ready)) {
            current_shed = sched_alu;
            break;
         }
         last_shed = current_shed;
      }

      have_instr = collect_ready(cir);
   }
   /* Always emit exports at the end of a block */
   while (collect_ready_type(exports_ready, cir.exports))
      schedule_exports(out_blocks, exports_ready);

   ASSERTED bool fail = false;

   if (!cir.alu_groups.empty()) {
      std::cerr << "Unscheduled ALU groups:\n";
      for (auto& a : cir.alu_groups) {
         std::cerr << "   " << *a << "\n";
      }
      fail = true;
   }

   if (!cir.alu_vec.empty()) {
      std::cerr << "Unscheduled ALU vec ops:\n";
      for (auto& a : cir.alu_vec) {
         std::cerr << "   [" << a->block_id() << ":"
                   << a->index() << "]:" << *a << "\n";
         for (auto& d : a->required_instr())
            std::cerr << "      R[" << d->block_id() << ":" << d->index() << "]:"
                      << *d << "\n";
      }
      fail = true;
   }

   if (!cir.alu_trans.empty()) {
      std::cerr << "Unscheduled ALU trans ops:\n";
      for (auto& a : cir.alu_trans) {
         std::cerr << "   [" << a->block_id() << ":"
                   << a->index() << "]:" << *a << "\n";
         for (auto& d : a->required_instr())
            std::cerr << "      R:" << *d << "\n";
      }
      fail = true;
   }

   if (!cir.mem_write_instr.empty()) {
      std::cerr << "Unscheduled MEM ops:\n";
      for (auto& a : cir.mem_write_instr) {
         std::cerr << "   " << *a << "\n";
      }
      fail = true;
   }

   if (!cir.fetches.empty()) {
      std::cerr << "Unscheduled Fetch ops:\n";
      for (auto& a : cir.fetches) {
         std::cerr << "   " << *a << "\n";
      }
      fail = true;
   }

   if (!cir.tex.empty()) {
      std::cerr << "Unscheduled Tex ops:\n";
      for (auto& a : cir.tex) {
         std::cerr << "   " << *a << "\n";
      }
      fail = true;
   }

   if (fail) {
      std::cerr << "Failing block:\n";
      for (auto& i : in_block)
         std::cerr << "[" << i->block_id() << ":" << i->index() << "] "
                   << (i->is_scheduled() ? "S " : "")
                   << *i << "\n";
      std::cerr << "\nSo far scheduled:\n";

      for (auto i : *m_current_block)
         std::cerr << "[" << i->block_id() << ":" << i->index() << "] " << *i << "\n";
      std::cerr << "\n\n";
   }

   assert(cir.tex.empty());
   assert(cir.exports.empty());
   assert(cir.fetches.empty());
   assert(cir.alu_vec.empty());
   assert(cir.mem_write_instr.empty());
   assert(cir.mem_ring_writes.empty());

   assert(!fail);

   if (cir.m_cf_instr) {
      // The asserts above ensure that the condition of the CF instruction is ready
      if (m_current_block->type() != Block::alu) {
         start_new_block(out_blocks, Block::alu);
      }
      m_current_block->push_back(cir.m_cf_instr);
      cir.m_cf_instr->set_scheduled();
   }

   if (m_current_block->type() == Block::alu)
      maybe_split_alu_block(out_blocks);
   else
      out_blocks.push_back(m_current_block);
}

void
BlockScheduler::finalize()
{
   if (m_last_pos)
      m_last_pos->set_is_last_export(true);
   if (m_last_pixel)
      m_last_pixel->set_is_last_export(true);
   if (m_last_param)
      m_last_param->set_is_last_export(true);
}

bool
BlockScheduler::schedule_alu(Shader::ShaderBlocks& out_blocks)
{
   bool success = false;
   AluGroup *group = nullptr;

   sfn_log << SfnLog::schedule << "Schedule alu with "
           << m_current_block->expected_ar_uses()
           << " pending AR loads\n";

   bool has_alu_ready = !alu_vec_ready.empty() || !alu_trans_ready.empty();

   bool has_lds_ready =
      !alu_vec_ready.empty() && (*alu_vec_ready.begin())->has_lds_access();

   bool has_ar_read_ready = !alu_vec_ready.empty() &&
                            std::get<0>((*alu_vec_ready.begin())->indirect_addr());

   /* If we have ready ALU instructions we have to start a new ALU block */
   if (has_alu_ready || !alu_groups_ready.empty()) {
      if (m_current_block->type() != Block::alu) {
         start_new_block(out_blocks, Block::alu);
         m_alu_groups_scheduled = 0;
      }
   }

   /* Schedule groups first, unless we have a pending LDS instruction.
    * We don't want the LDS instructions to be too far apart because the
    * fetch + read from queue has to be in the same ALU CF block. */
   if (!alu_groups_ready.empty() && !has_lds_ready && !has_ar_read_ready) {
      group = *alu_groups_ready.begin();

      if (!check_array_reads(*group)) {

         sfn_log << SfnLog::schedule << "try schedule " << *group << "\n";

         /* Only start a new CF if we have no pending AR reads */
         if (m_current_block->try_reserve_kcache(*group)) {
            alu_groups_ready.erase(alu_groups_ready.begin());
            success = true;
         } else {
            if (m_current_block->expected_ar_uses() == 0) {
               start_new_block(out_blocks, Block::alu);

               if (!m_current_block->try_reserve_kcache(*group))
                  unreachable("Scheduling a group in a new block should always succeed");
               alu_groups_ready.erase(alu_groups_ready.begin());
               sfn_log << SfnLog::schedule << "Schedule ALU group\n";
               success = true;
            } else {
               sfn_log << SfnLog::schedule << "Don't add group because of "
                       << m_current_block->expected_ar_uses()
                       << " pending AR loads\n";
               group = nullptr;
            }
         }
      }
   }

   if (!group && has_alu_ready) {
      group = new AluGroup();
      sfn_log << SfnLog::schedule << "START new ALU group\n";
   } else if (!success) {
      return false;
   }

   assert(group);

   int free_slots = group->free_slots();

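   /* free_slots is a bitmask over the ALU slots: the low bits stand for the
    * x, y, z, w vector lanes, and bit 4 (0x10, tested below) for the trans
    * slot. Keep packing ready instructions into the group until nothing
    * fits anymore. */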
   while (free_slots && has_alu_ready) {
      if (!alu_vec_ready.empty())
         success |= schedule_alu_to_group_vec(group);

      /* Apparently one can't schedule a t-slot if there is already
       * an LDS instruction scheduled.
       * TODO: check whether this is only relevant for actual LDS instructions
       * or also for instructions that read from the LDS return value queue */

      if (free_slots & 0x10 && !has_lds_ready) {
         sfn_log << SfnLog::schedule << "Try schedule TRANS channel\n";
         if (!alu_trans_ready.empty())
            success |= schedule_alu_to_group_trans(group, alu_trans_ready);
         if (!alu_vec_ready.empty())
            success |= schedule_alu_to_group_trans(group, alu_vec_ready);
      }

      if (success) {
         ++m_alu_groups_scheduled;
         break;
      } else if (m_current_block->kcache_reservation_failed()) {
         // LDS read groups should not lead to impossible
         // kcache constellations
         assert(!m_current_block->lds_group_active());

         // AR is loaded but not all uses are done, we don't want
         // to start a new CF here
         assert(m_current_block->expected_ar_uses() == 0);

         // kcache reservation failed, so we have to start a new CF
         start_new_block(out_blocks, Block::alu);
      } else {
         // The ready list is not empty, but we didn't schedule anything; this
         // means we had an indirect array read or write conflict that we
         // can resolve with an extra group that has a NOP instruction
         if (!alu_trans_ready.empty() || !alu_vec_ready.empty()) {
            group->add_vec_instructions(new AluInstr(op0_nop, 0));
            break;
         } else {
            return false;
         }
      }
   }

   sfn_log << SfnLog::schedule << "Finalize ALU group\n";
   group->set_scheduled();
   group->fix_last_flag();
   group->set_nesting_depth(m_current_block->nesting_depth());

   auto [addr, is_index] = group->addr();
   if (is_index) {
      if (addr->sel() == AddressRegister::idx0 && m_idx0_pending) {
         assert(!group->has_lds_group_start());
         assert(m_current_block->expected_ar_uses() == 0);
         start_new_block(out_blocks, Block::alu);
         m_current_block->try_reserve_kcache(*group);
      }
      if (addr->sel() == AddressRegister::idx1 && m_idx1_pending) {
         assert(!group->has_lds_group_start());
         assert(m_current_block->expected_ar_uses() == 0);
         start_new_block(out_blocks, Block::alu);
         m_current_block->try_reserve_kcache(*group);
      }
   }

   m_current_block->push_back(group);

   update_array_writes(*group);

   m_idx0_pending |= m_idx0_loading;
   m_idx0_loading = false;

   m_idx1_pending |= m_idx1_loading;
   m_idx1_loading = false;

   if (!m_current_block->lds_group_active() &&
       m_current_block->expected_ar_uses() == 0 &&
       (!addr || is_index)) {
      group->set_instr_flag(Instr::no_lds_or_addr_group);
   }

   if (group->has_lds_group_start())
      m_current_block->lds_group_start(*group->begin());

   if (group->has_lds_group_end())
      m_current_block->lds_group_end();

   if (group->has_kill_op()) {
      assert(!group->has_lds_group_start());
      assert(m_current_block->expected_ar_uses() == 0);
      start_new_block(out_blocks, Block::alu);
   }

   return success;
}

bool
BlockScheduler::schedule_tex(Shader::ShaderBlocks& out_blocks)
{
   if (m_current_block->type() != Block::tex || m_current_block->remaining_slots() == 0) {
      start_new_block(out_blocks, Block::tex);
      m_current_block->set_instr_flag(Instr::force_cf);
   }

   if (!tex_ready.empty() && m_current_block->remaining_slots() > 0) {
      auto ii = tex_ready.begin();
      sfn_log << SfnLog::schedule << "Schedule: " << **ii << "\n";

      if ((unsigned)m_current_block->remaining_slots() < 1 + (*ii)->prepare_instr().size())
         start_new_block(out_blocks, Block::tex);

      for (auto prep : (*ii)->prepare_instr()) {
         prep->set_scheduled();
         m_current_block->push_back(prep);
      }

      (*ii)->set_scheduled();
      m_current_block->push_back(*ii);
      tex_ready.erase(ii);
      return true;
   }
   return false;
}

bool
BlockScheduler::schedule_vtx(Shader::ShaderBlocks& out_blocks)
{
   if (m_current_block->type() != Block::vtx || m_current_block->remaining_slots() == 0) {
      start_new_block(out_blocks, Block::vtx);
      m_current_block->set_instr_flag(Instr::force_cf);
   }
   return schedule_block(fetches_ready);
}

template <typename I>
bool
BlockScheduler::schedule_gds(Shader::ShaderBlocks& out_blocks, std::list<I *>& ready_list)
{
   bool was_full = m_current_block->remaining_slots() == 0;
   if (m_current_block->type() != Block::gds || was_full) {
      start_new_block(out_blocks, Block::gds);
      if (was_full)
         m_current_block->set_instr_flag(Instr::force_cf);
   }
   return schedule_block(ready_list);
}

void
BlockScheduler::start_new_block(Shader::ShaderBlocks& out_blocks, Block::Type type)
{
   if (!m_current_block->empty()) {
      sfn_log << SfnLog::schedule << "Start new block\n";
      assert(!m_current_block->lds_group_active());

      if (m_current_block->type() != Block::alu)
         out_blocks.push_back(m_current_block);
      else
         maybe_split_alu_block(out_blocks);
      m_current_block = new Block(m_current_block->nesting_depth(), m_next_block_id++);
      m_current_block->set_instr_flag(Instr::force_cf);
      m_idx0_pending = m_idx1_pending = false;
   }
   m_current_block->set_type(type, m_chip_class);
}

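/* An ALU clause can only hold a limited number of instruction slots. When
 * the current ALU block has overflowed (remaining_slots() == 0), split it
 * into sub-blocks at group boundaries that are allowed to start a new CF
 * clause. */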
void BlockScheduler::maybe_split_alu_block(Shader::ShaderBlocks& out_blocks)
{
   // TODO: needs fixing
   if (m_current_block->remaining_slots() > 0) {
      out_blocks.push_back(m_current_block);
      return;
   }

   int used_slots = 0;
   int pending_slots = 0;

   Instr *next_block_start = nullptr;
   for (auto cur_group : *m_current_block) {
      /* This limit is a bit fishy, it should be 128 */
      if (used_slots + pending_slots + cur_group->slots() < 128) {
         if (cur_group->can_start_alu_block()) {
            next_block_start = cur_group;
            used_slots += pending_slots;
            pending_slots = cur_group->slots();
         } else {
            pending_slots += cur_group->slots();
         }
      } else {
         assert(next_block_start);
         next_block_start->set_instr_flag(Instr::force_cf);
         used_slots = pending_slots;
         pending_slots = cur_group->slots();
      }
   }

   Block *sub_block = new Block(m_current_block->nesting_depth(),
                                m_next_block_id++);
   sub_block->set_type(Block::alu, m_chip_class);
   sub_block->set_instr_flag(Instr::force_cf);

   for (auto instr : *m_current_block) {
      auto group = instr->as_alu_group();
      if (!group) {
         sub_block->push_back(instr);
         continue;
      }

      if (group->group_force_alu_cf()) {
         assert(!sub_block->lds_group_active());
         out_blocks.push_back(sub_block);
         sub_block = new Block(m_current_block->nesting_depth(),
                               m_next_block_id++);
         sub_block->set_type(Block::alu, m_chip_class);
         sub_block->set_instr_flag(Instr::force_cf);
      }
      sub_block->push_back(group);
      if (group->has_lds_group_start())
         sub_block->lds_group_start(*group->begin());

      if (group->has_lds_group_end())
         sub_block->lds_group_end();
   }
   if (!sub_block->empty())
      out_blocks.push_back(sub_block);
}

template <typename I>
bool
BlockScheduler::schedule_cf(Shader::ShaderBlocks& out_blocks, std::list<I *>& ready_list)
{
   if (ready_list.empty())
      return false;
   if (m_current_block->type() != Block::cf)
      start_new_block(out_blocks, Block::cf);
   return schedule(ready_list);
}

bool
BlockScheduler::schedule_alu_to_group_vec(AluGroup *group)
{
   assert(group);
   assert(!alu_vec_ready.empty());

   bool success = false;
   auto i = alu_vec_ready.begin();
   auto e = alu_vec_ready.end();
   while (i != e) {
      sfn_log << SfnLog::schedule << "Try schedule to vec " << **i;

      if (check_array_reads(**i)) {
         ++i;
         continue;
      }

      // Precaution: don't kill while we have LDS queue reads in the pipeline
      if ((*i)->is_kill() && m_current_block->lds_group_active()) {
         ++i;
         continue;
      }

      if (!m_current_block->try_reserve_kcache(**i)) {
         sfn_log << SfnLog::schedule << " failed (kcache)\n";
         ++i;
         continue;
      }

      if (group->add_vec_instructions(*i)) {
         auto old_i = i;
         ++i;
         if ((*old_i)->has_alu_flag(alu_is_lds)) {
            --m_lds_addr_count;
         }

         if ((*old_i)->num_ar_uses())
            m_current_block->set_expected_ar_uses((*old_i)->num_ar_uses());
         auto addr = std::get<0>((*old_i)->indirect_addr());
         bool has_indirect_reg_load = addr != nullptr && addr->has_flag(Register::addr_or_idx);

         bool is_idx_load_on_eg = false;
         if (!(*old_i)->has_alu_flag(alu_is_lds)) {
            bool load_idx0_eg = (*old_i)->opcode() == op1_set_cf_idx0;
            bool load_idx0_ca = ((*old_i)->opcode() == op1_mova_int &&
                                 (*old_i)->dest()->sel() == AddressRegister::idx0);

            bool load_idx1_eg = (*old_i)->opcode() == op1_set_cf_idx1;
            bool load_idx1_ca = ((*old_i)->opcode() == op1_mova_int &&
                                 (*old_i)->dest()->sel() == AddressRegister::idx1);

            is_idx_load_on_eg = load_idx0_eg || load_idx1_eg;

            bool load_idx0 = load_idx0_eg || load_idx0_ca;
            bool load_idx1 = load_idx1_eg || load_idx1_ca;

            assert(!m_idx0_pending || !load_idx0);
            assert(!m_idx1_pending || !load_idx1);

            m_idx0_loading |= load_idx0;
            m_idx1_loading |= load_idx1;
         }

         if (has_indirect_reg_load || is_idx_load_on_eg)
            m_current_block->dec_expected_ar_uses();

         alu_vec_ready.erase(old_i);
         success = true;
         sfn_log << SfnLog::schedule << " success\n";
      } else {
         ++i;
         sfn_log << SfnLog::schedule << " failed\n";
      }
   }
   return success;
}

bool
BlockScheduler::schedule_alu_to_group_trans(AluGroup *group,
                                            std::list<AluInstr *>& readylist)
{
   assert(group);

   bool success = false;
   auto i = readylist.begin();
   auto e = readylist.end();
   while (i != e) {

      if (check_array_reads(**i)) {
         ++i;
         continue;
      }

      sfn_log << SfnLog::schedule << "Try schedule to trans " << **i;
      if (!m_current_block->try_reserve_kcache(**i)) {
         sfn_log << SfnLog::schedule << " failed (kcache)\n";
         ++i;
         continue;
      }

      if (group->add_trans_instructions(*i)) {
         auto old_i = i;
         ++i;
         auto addr = std::get<0>((*old_i)->indirect_addr());
         if (addr && addr->has_flag(Register::addr_or_idx))
            m_current_block->dec_expected_ar_uses();

         readylist.erase(old_i);
         success = true;
         sfn_log << SfnLog::schedule << " success\n";
         break;
      } else {
         ++i;
         sfn_log << SfnLog::schedule << " failed\n";
      }
   }
   return success;
}

template <typename I>
bool
BlockScheduler::schedule(std::list<I *>& ready_list)
{
   if (!ready_list.empty() && m_current_block->remaining_slots() > 0) {
      auto ii = ready_list.begin();
      sfn_log << SfnLog::schedule << "Schedule: " << **ii << "\n";
      (*ii)->set_scheduled();
      m_current_block->push_back(*ii);
      ready_list.erase(ii);
      return true;
   }
   return false;
}

template <typename I>
bool
BlockScheduler::schedule_block(std::list<I *>& ready_list)
{
   bool success = false;
   while (!ready_list.empty() && m_current_block->remaining_slots() > 0) {
      auto ii = ready_list.begin();
      sfn_log << SfnLog::schedule << "Schedule: " << **ii << " "
              << m_current_block->remaining_slots() << "\n";
      (*ii)->set_scheduled();
      m_current_block->push_back(*ii);
      ready_list.erase(ii);
      success = true;
   }
   return success;
}

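/* Exports are scheduled with the "last export" bit cleared; finalize()
 * later sets it on the last position, pixel, and param export, which this
 * function records in m_last_pos / m_last_pixel / m_last_param. */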
bool
BlockScheduler::schedule_exports(Shader::ShaderBlocks& out_blocks,
                                 std::list<ExportInstr *>& ready_list)
{
   if (m_current_block->type() != Block::cf)
      start_new_block(out_blocks, Block::cf);

   if (!ready_list.empty()) {
      auto ii = ready_list.begin();
      sfn_log << SfnLog::schedule << "Schedule: " << **ii << "\n";
      (*ii)->set_scheduled();
      m_current_block->push_back(*ii);
      switch ((*ii)->export_type()) {
      case ExportInstr::pos:
         m_last_pos = *ii;
         break;
      case ExportInstr::param:
         m_last_param = *ii;
         break;
      case ExportInstr::pixel:
         m_last_pixel = *ii;
         break;
      }
      (*ii)->set_is_last_export(false);
      ready_list.erase(ii);
      return true;
   }
   return false;
}

bool
BlockScheduler::collect_ready(CollectInstructions& available)
{
   sfn_log << SfnLog::schedule << "Ready instructions\n";
   bool result = false;
   result |= collect_ready_alu_vec(alu_vec_ready, available.alu_vec);
   result |= collect_ready_type(alu_trans_ready, available.alu_trans);
   result |= collect_ready_type(alu_groups_ready, available.alu_groups);
   result |= collect_ready_type(gds_ready, available.gds_op);
   result |= collect_ready_type(tex_ready, available.tex);
   result |= collect_ready_type(fetches_ready, available.fetches);
   result |= collect_ready_type(memops_ready, available.mem_write_instr);
   result |= collect_ready_type(mem_ring_writes_ready, available.mem_ring_writes);
   result |= collect_ready_type(write_tf_ready, available.write_tf);
   result |= collect_ready_type(rat_instr_ready, available.rat_instr);

   sfn_log << SfnLog::schedule << "\n";
   return result;
}

bool
BlockScheduler::collect_ready_alu_vec(std::list<AluInstr *>& ready,
                                      std::list<AluInstr *>& available)
{
   auto i = available.begin();
   auto e = available.end();

   for (auto alu : ready) {
      alu->add_priority(100 * alu->register_priority());
   }

   int max_check = 0;
   while (i != e && max_check++ < 64) {
      if (ready.size() < 64 && (*i)->ready()) {

         int priority = 0;
         /* LDS fetches that use static offsets are usually ready very fast,
          * so they would get scheduled early, and this leaves the problem
          * that we allocate too many registers with just constant values,
          * which causes problems with RA. So limit the number of LDS
          * address registers.
          */
         if ((*i)->has_alu_flag(alu_lds_address)) {
            if (m_lds_addr_count > 64) {
               ++i;
               continue;
            } else {
               ++m_lds_addr_count;
            }
         }

         /* LDS instructions are scheduled with high priority.
          * Instructions that can go into the t slot and don't have
          * indirect access are put in last, so that they don't block
          * vec-only instructions when scheduling to the vector slots.
          * For everything else we look at the register use. */

         auto [addr, dummy1, dummy2] = (*i)->indirect_addr();

         if ((*i)->has_lds_access()) {
            priority = 100000;
            if ((*i)->has_alu_flag(alu_is_lds))
               priority += 100000;
         } else if (addr) {
            priority = 10000;
         } else if (AluGroup::has_t()) {
            auto opinfo = alu_ops.find((*i)->opcode());
            assert(opinfo != alu_ops.end());
            if (opinfo->second.can_channel(AluOp::t, m_chip_class))
               priority = -1;
         }

         priority += 100 * (*i)->register_priority();

         (*i)->add_priority(priority);
         ready.push_back(*i);

         auto old_i = i;
         ++i;
         available.erase(old_i);
      } else
         ++i;
   }

   for (auto& i : ready)
      sfn_log << SfnLog::schedule << "V:  " << *i << "\n";

   ready.sort([](const AluInstr *lhs, const AluInstr *rhs) {
      return lhs->priority() > rhs->priority();
   });

   for (auto& i : ready)
      sfn_log << SfnLog::schedule << "V (S):  " << i->priority() << " " << *i << "\n";

   return !ready.empty();
}

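/* Single-character tags used to label the per-type ready lists in the
 * debug output of collect_ready_type(). */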
template <typename T> struct type_char {
};

template <> struct type_char<AluInstr> {
   static char value() { return 'A'; }
};

template <> struct type_char<AluGroup> {
   static char value() { return 'G'; }
};

template <> struct type_char<ExportInstr> {
   static char value() { return 'E'; }
};

template <> struct type_char<TexInstr> {
   static char value() { return 'T'; }
};

template <> struct type_char<FetchInstr> {
   static char value() { return 'F'; }
};

template <> struct type_char<WriteOutInstr> {
   static char value() { return 'M'; }
};

template <> struct type_char<MemRingOutInstr> {
   static char value() { return 'R'; }
};

template <> struct type_char<WriteTFInstr> {
   static char value() { return 'X'; }
};

template <> struct type_char<GDSInstr> {
   static char value() { return 'S'; }
};

template <> struct type_char<RatInstr> {
   static char value() { return 'I'; }
};

template <typename T>
bool
BlockScheduler::collect_ready_type(std::list<T *>& ready, std::list<T *>& available)
{
   auto i = available.begin();
   auto e = available.end();

   int lookahead = 16;
   while (i != e && ready.size() < 16 && lookahead-- > 0) {
      if ((*i)->ready()) {
         ready.push_back(*i);
         auto old_i = i;
         ++i;
         available.erase(old_i);
      } else
         ++i;
   }

   for (auto& i : ready)
      sfn_log << SfnLog::schedule << type_char<T>::value() << ";  " << *i << "\n";

   return !ready.empty();
}

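/* Base visitor that ignores every register type except LocalArrayValue;
 * the two visitors below only need to override that one case. */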
class CheckArrayAccessVisitor : public ConstRegisterVisitor {
public:
   using ConstRegisterVisitor::visit;
   void visit(const Register& value) override { (void)value; }
   void visit(const LocalArray& value) override { (void)value; }
   void visit(const UniformValue& value) override { (void)value; }
   void visit(const LiteralConstant& value) override { (void)value; }
   void visit(const InlineConstant& value) override { (void)value; }
};

class UpdateArrayWrite : public CheckArrayAccessVisitor {
public:
   UpdateArrayWrite(ArrayCheckSet& indirect_arrays,
                    ArrayCheckSet& direct_arrays,
                    bool tdw):
      last_indirect_array_write(indirect_arrays),
      last_direct_array_write(direct_arrays),
      track_direct_writes(tdw)
   {
   }

   void visit(const LocalArrayValue& value) override {
      int array_base = value.array().base_sel();
      auto entry = std::make_pair(array_base, value.chan());
      if (value.addr())
         last_indirect_array_write.insert(entry);
      else if (track_direct_writes)
         last_direct_array_write.insert(entry);
   }
private:
   ArrayCheckSet& last_indirect_array_write;
   ArrayCheckSet& last_direct_array_write;
   bool track_direct_writes {false};
};

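/* On chips that need a NOP after an indirectly addressed array write
 * (m_nop_after_rel_dest, RV770) or before an indirectly addressed array
 * read (m_nop_befor_rel_src, early R600 class), record which
 * (array, channel) pairs the group just wrote, so that a conflicting read
 * in a following group can be pushed into an extra NOP group (see
 * check_array_reads). */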
void BlockScheduler::update_array_writes(const AluGroup& group)
{
   if (m_nop_after_rel_dest || m_nop_befor_rel_src) {
      m_last_direct_array_write.clear();
      m_last_indirect_array_write.clear();

      UpdateArrayWrite visitor(m_last_indirect_array_write,
                               m_last_direct_array_write,
                               m_nop_befor_rel_src);

      for (auto alu : group) {
         if (alu && alu->dest())
            alu->dest()->accept(visitor);
      }
   }
}

class CheckArrayRead : public CheckArrayAccessVisitor {
public:
   CheckArrayRead(const ArrayCheckSet& indirect_arrays,
                  const ArrayCheckSet& direct_arrays):
      last_indirect_array_write(indirect_arrays),
      last_direct_array_write(direct_arrays)
   {
   }

   void visit(const LocalArrayValue& value) override {
      int array_base = value.array().base_sel();
      auto entry = std::make_pair(array_base, value.chan());

      if (last_indirect_array_write.find(entry) !=
          last_indirect_array_write.end())
         need_extra_group = true;

      if (value.addr() && last_direct_array_write.find(entry) !=
          last_direct_array_write.end()) {
         need_extra_group = true;
      }
   }

   const ArrayCheckSet& last_indirect_array_write;
   const ArrayCheckSet& last_direct_array_write;
   bool need_extra_group {false};
};

bool BlockScheduler::check_array_reads(const AluInstr& instr)
{
   if (m_nop_after_rel_dest || m_nop_befor_rel_src) {

      CheckArrayRead visitor(m_last_indirect_array_write,
                             m_last_direct_array_write);

      for (auto& s : instr.sources()) {
         s->accept(visitor);
      }
      return visitor.need_extra_group;
   }
   return false;
}

bool BlockScheduler::check_array_reads(const AluGroup& group)
{
   if (m_nop_after_rel_dest || m_nop_befor_rel_src) {

      CheckArrayRead visitor(m_last_indirect_array_write,
                             m_last_direct_array_write);

      for (auto alu : group) {
         if (!alu)
            continue;
         for (auto& s : alu->sources()) {
            s->accept(visitor);
         }
      }
      return visitor.need_extra_group;
   }
   return false;
}

} // namespace r600