• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 /*
2  * Copyright © 2018 Valve Corporation
3  *
4  * SPDX-License-Identifier: MIT
5  */
6 
7 #ifndef ACO_IR_H
8 #define ACO_IR_H
9 
10 #include "aco_opcodes.h"
11 #include "aco_shader_info.h"
12 #include "aco_util.h"
13 
14 #include "util/compiler.h"
15 
16 #include "ac_binary.h"
17 #include "ac_hw_stage.h"
18 #include "ac_shader_debug_info.h"
19 #include "ac_shader_util.h"
20 #include "amd_family.h"
21 #include <algorithm>
22 #include <bitset>
23 #include <memory>
24 #include <vector>
25 
26 typedef struct nir_shader nir_shader;
27 
28 namespace aco {
29 
/* Global bitmask of DEBUG_* flags controlling extra validation, debug output
 * and pass disabling. NOTE(review): defined elsewhere; presumably initialized
 * from an environment variable - confirm at the definition site. */
extern uint64_t debug_flags;

/* Bit values for debug_flags. VALIDATE_*/PERF_INFO/LIVE_INFO enable extra
 * checking or output; NO_*/FORCE_* disable or force specific passes. */
enum {
   DEBUG_VALIDATE_IR = 0x1,
   DEBUG_VALIDATE_RA = 0x2,
   DEBUG_VALIDATE_LIVE_VARS = 0x4,
   DEBUG_FORCE_WAITCNT = 0x8,
   DEBUG_NO_VN = 0x10,
   DEBUG_NO_OPT = 0x20,
   DEBUG_NO_SCHED = 0x40,
   DEBUG_PERF_INFO = 0x80,
   DEBUG_LIVE_INFO = 0x100,
   DEBUG_FORCE_WAITDEPS = 0x200,
   DEBUG_NO_VALIDATE_IR = 0x400,
   DEBUG_NO_SCHED_ILP = 0x800,
   DEBUG_NO_SCHED_VOPD = 0x1000,
};
47 
/* Kinds of memory an instruction may access. Used as a bitmask (e.g. in
 * memory_sync_info::storage), hence the power-of-two values. */
enum storage_class : uint8_t {
   storage_none = 0x0,   /* no synchronization and can be reordered around aliasing stores */
   storage_buffer = 0x1, /* SSBOs and global memory */
   storage_gds = 0x2,
   storage_image = 0x4,
   storage_shared = 0x8,        /* or TCS output */
   storage_vmem_output = 0x10,  /* GS or TCS output stores using VMEM */
   storage_task_payload = 0x20, /* Task-Mesh payload */
   storage_scratch = 0x40,
   storage_vgpr_spill = 0x80,
   storage_count = 8, /* number of distinct storage bits, not counting storage_none */
};
60 
/* Ordering/visibility semantics attached to memory operations and barriers.
 * Used as a bitmask in memory_sync_info::semantics. */
enum memory_semantics : uint8_t {
   semantic_none = 0x0,
   /* for loads: don't move any access after this load to before this load (even other loads)
    * for barriers: don't move any access after the barrier to before any
    * atomics/control_barriers/sendmsg_gs_done/position-primitive-export before the barrier */
   semantic_acquire = 0x1,
   /* for stores: don't move any access before this store to after this store
    * for barriers: don't move any access before the barrier to after any
    * atomics/control_barriers/sendmsg_gs_done/position-primitive-export after the barrier */
   semantic_release = 0x2,

   /* the rest are for load/stores/atomics only */
   /* cannot be DCE'd or CSE'd */
   semantic_volatile = 0x4,
   /* does not interact with barriers and assumes this lane is the only lane
    * accessing this memory */
   semantic_private = 0x8,
   /* this operation can be reordered around operations of the same storage.
    * says nothing about barriers */
   semantic_can_reorder = 0x10,
   /* this is an atomic instruction (may only read or write memory) */
   semantic_atomic = 0x20,
   /* this instruction both reads and writes memory */
   semantic_rmw = 0x40,

   /* convenience combinations */
   semantic_acqrel = semantic_acquire | semantic_release,
   semantic_atomicrmw = semantic_volatile | semantic_atomic | semantic_rmw,
};
89 
/* Synchronization scopes, ordered from narrowest (a single invocation) to
 * widest (the whole device). */
enum sync_scope : uint8_t {
   scope_invocation = 0,
   scope_subgroup = 1,
   scope_workgroup = 2,
   scope_queuefamily = 3,
   scope_device = 4,
};
97 
98 struct memory_sync_info {
memory_sync_infomemory_sync_info99    memory_sync_info() : storage(storage_none), semantics(semantic_none), scope(scope_invocation) {}
100    memory_sync_info(int storage_, int semantics_ = 0, sync_scope scope_ = scope_invocation)
101        : storage((storage_class)storage_), semantics((memory_semantics)semantics_), scope(scope_)
102    {}
103 
104    storage_class storage : 8;
105    memory_semantics semantics : 8;
106    sync_scope scope : 8;
107 
108    bool operator==(const memory_sync_info& rhs) const
109    {
110       return storage == rhs.storage && semantics == rhs.semantics && scope == rhs.scope;
111    }
112 
can_reordermemory_sync_info113    bool can_reorder() const
114    {
115       if (semantics & semantic_acqrel)
116          return false;
117       /* Also check storage so that zero-initialized memory_sync_info can be
118        * reordered. */
119       return (!storage || (semantics & semantic_can_reorder)) && !(semantics & semantic_volatile);
120    }
121 };
122 static_assert(sizeof(memory_sync_info) == 3, "Unexpected padding");
123 
/* Float rounding modes as encoded in the hardware MODE register.
 * NOTE(review): suffixes presumably mean nearest-even (ne), toward +inf (pi),
 * toward -inf (ni) and toward zero (tz) - confirm against the ISA docs. */
enum fp_round {
   fp_round_ne = 0,
   fp_round_pi = 1,
   fp_round_ni = 2,
   fp_round_tz = 3,
};
130 
/* Denormal handling modes as encoded in the hardware MODE register.
 * keep_in/keep_out form a 2-bit mask with fp_denorm_keep = both
 * (presumably input vs. output denormals - confirm against the ISA docs). */
enum fp_denorm {
   /* Note that v_rcp_f32, v_exp_f32, v_log_f32, v_sqrt_f32, v_rsq_f32 and
    * v_mad_f32/v_madak_f32/v_madmk_f32/v_mac_f32 always flush denormals. */
   fp_denorm_flush = 0x0,
   fp_denorm_keep_in = 0x1,
   fp_denorm_keep_out = 0x2,
   fp_denorm_keep = 0x3,
};
139 
140 struct float_mode {
141    /* matches encoding of the MODE register */
142    union {
143       struct {
144          fp_round round32 : 2;
145          fp_round round16_64 : 2;
146          unsigned denorm32 : 2;
147          unsigned denorm16_64 : 2;
148       };
149       struct {
150          uint8_t round : 4;
151          uint8_t denorm : 4;
152       };
153       uint8_t val = 0;
154    };
155    /* if false, optimizations which may remove denormal flushing can be done */
156    bool must_flush_denorms32 : 1;
157    bool must_flush_denorms16_64 : 1;
158    bool care_about_round32 : 1;
159    bool care_about_round16_64 : 1;
160 
161    /* Returns true if instructions using the mode "other" can safely use the
162     * current one instead. */
canReplacefloat_mode163    bool canReplace(float_mode other) const noexcept
164    {
165       return val == other.val && (must_flush_denorms32 || !other.must_flush_denorms32) &&
166              (must_flush_denorms16_64 || !other.must_flush_denorms16_64) &&
167              (care_about_round32 || !other.care_about_round32) &&
168              (care_about_round16_64 || !other.care_about_round16_64);
169    }
170 };
171 
/* Hardware wait counter kinds. The numeric values double as byte indices
 * into wait_imm (enforced by the static_asserts after wait_imm). */
enum wait_type {
   wait_type_exp = 0,
   wait_type_lgkm = 1,
   wait_type_vm = 2,
   /* GFX10+ */
   wait_type_vs = 3,
   /* GFX12+ */
   wait_type_sample = 4,
   wait_type_bvh = 5,
   wait_type_km = 6,
   wait_type_num = 7,
};
184 
struct Instruction;
class Builder;

/* A set of wait-counter values, one byte per wait_type. A value of
 * unset_counter (0xff) means "no wait" for that counter. */
struct wait_imm {
   static const uint8_t unset_counter = 0xff;

   /* Member order must match the wait_type enum values: operator[] below
    * indexes these bytes directly (checked by the static_asserts following
    * this struct). */
   uint8_t exp;
   uint8_t lgkm;
   uint8_t vm;
   uint8_t vs;
   uint8_t sample;
   uint8_t bvh;
   uint8_t km;

   wait_imm();
   wait_imm(uint16_t vm_, uint16_t exp_, uint16_t lgkm_, uint16_t vs_);

   /* Encode the counters into the s_waitcnt-style immediate for "chip";
    * defined out-of-line. */
   uint16_t pack(enum amd_gfx_level chip) const;

   /* NOTE(review): presumably returns the maximum representable counter
    * values for the given gfx level - confirm at the definition. */
   static wait_imm max(enum amd_gfx_level gfx_level);

   /* Decode counters from a wait instruction; defined out-of-line. */
   bool unpack(enum amd_gfx_level gfx_level, const Instruction* instr);

   /* NOTE(review): presumably merges "other" into this - confirm at the
    * definition. */
   bool combine(const wait_imm& other);

   bool empty() const;

   void print(FILE* output) const;

   void build_waitcnt(Builder& bld);

   /* Index the counter bytes by wait_type; relies on the member layout
    * above matching the enum. */
   uint8_t& operator[](size_t i)
   {
      assert(i < wait_type_num);
      return *((uint8_t*)this + i);
   }

   const uint8_t& operator[](size_t i) const
   {
      assert(i < wait_type_num);
      return *((uint8_t*)this + i);
   }
};
static_assert(offsetof(wait_imm, exp) == wait_type_exp);
static_assert(offsetof(wait_imm, lgkm) == wait_type_lgkm);
static_assert(offsetof(wait_imm, vm) == wait_type_vm);
static_assert(offsetof(wait_imm, vs) == wait_type_vs);
static_assert(offsetof(wait_imm, sample) == wait_type_sample);
static_assert(offsetof(wait_imm, bvh) == wait_type_bvh);
static_assert(offsetof(wait_imm, km) == wait_type_km);
235 
236 /* s_wait_event immediate bits. */
/* s_wait_event immediate bits. Note the polarity differs between GFX11
 * (bit set = do NOT wait) and GFX12 (bit set = wait). */
enum wait_event_imm : uint16_t {
   /* If this bit is 0, await that the export buffer space has been allocated.
    * In Primitive Ordered Pixel Shading, export ready means that the overlapped waves have exited
    * their ordered sections (by performing the `done` export), and that the current wave may enter
    * its ordered section.
    */
   wait_event_imm_dont_wait_export_ready_gfx11 = 0x1,
   wait_event_imm_wait_export_ready_gfx12 = 0x2,
};
246 
247 constexpr Format
asVOP3(Format format)248 asVOP3(Format format)
249 {
250    return (Format)((uint32_t)Format::VOP3 | (uint32_t)format);
251 };
252 
253 constexpr Format
asSDWA(Format format)254 asSDWA(Format format)
255 {
256    assert(format == Format::VOP1 || format == Format::VOP2 || format == Format::VOPC);
257    return (Format)((uint32_t)Format::SDWA | (uint32_t)format);
258 }
259 
260 constexpr Format
withoutDPP(Format format)261 withoutDPP(Format format)
262 {
263    return (Format)((uint32_t)format & ~((uint32_t)Format::DPP16 | (uint32_t)Format::DPP8));
264 }
265 
266 constexpr Format
withoutVOP3(Format format)267 withoutVOP3(Format format)
268 {
269    return (Format)((uint32_t)format & ~((uint32_t)Format::VOP3));
270 }
271 
/* Register file a temporary lives in: scalar (SGPR) or vector (VGPR). */
enum class RegType {
   sgpr,
   vgpr,
};

/* Register class: encodes type, size and special properties in one byte:
 *   bits 0-4: size (in dwords, or in bytes for sub-dword classes)
 *   bit 5:    vgpr
 *   bit 6:    linear vgpr (used for WWM and spills to vgpr)
 *   bit 7:    sub-dword (size field is in bytes)
 */
struct RegClass {

   enum RC : uint8_t {
      s1 = 1,
      s2 = 2,
      s3 = 3,
      s4 = 4,
      s6 = 6,
      s8 = 8,
      s16 = 16,
      v1 = s1 | (1 << 5),
      v2 = s2 | (1 << 5),
      v3 = s3 | (1 << 5),
      v4 = s4 | (1 << 5),
      v5 = 5 | (1 << 5),
      v6 = 6 | (1 << 5),
      v7 = 7 | (1 << 5),
      v8 = 8 | (1 << 5),
      /* byte-sized register class */
      v1b = v1 | (1 << 7),
      v2b = v2 | (1 << 7),
      v3b = v3 | (1 << 7),
      v4b = v4 | (1 << 7),
      v6b = v6 | (1 << 7),
      v8b = v8 | (1 << 7),
      /* these are used for WWM and spills to vgpr */
      v1_linear = v1 | (1 << 6),
      v2_linear = v2 | (1 << 6),
   };

   RegClass() = default;
   constexpr RegClass(RC rc_) : rc(rc_) {}
   /* Build a class from type and size in dwords. */
   constexpr RegClass(RegType type, unsigned size)
       : rc((RC)((type == RegType::vgpr ? 1 << 5 : 0) | size))
   {}

   constexpr operator RC() const { return rc; }
   explicit operator bool() = delete;

   constexpr RegType type() const { return rc <= RC::s16 ? RegType::sgpr : RegType::vgpr; }
   constexpr bool is_linear_vgpr() const { return rc & (1 << 6); }
   constexpr bool is_subdword() const { return rc & (1 << 7); }
   /* Size in bytes: sub-dword classes store bytes directly, others dwords. */
   constexpr unsigned bytes() const { return ((unsigned)rc & 0x1F) * (is_subdword() ? 1 : 4); }
   // TODO: use size() less in favor of bytes()
   /* Size in dwords, rounding sub-dword classes up. */
   constexpr unsigned size() const { return (bytes() + 3) >> 2; }
   constexpr bool is_linear() const { return rc <= RC::s16 || is_linear_vgpr(); }
   constexpr RegClass as_linear() const { return RegClass((RC)(rc | (1 << 6))); }
   constexpr RegClass as_subdword() const { return RegClass((RC)(rc | 1 << 7)); }

   /* Smallest register class of "type" covering "bytes" bytes. */
   static constexpr RegClass get(RegType type, unsigned bytes)
   {
      if (type == RegType::sgpr) {
         /* SGPR classes are always dword-sized: round up (same rounding
          * expression as size() above). */
         return RegClass(type, (bytes + 3u) / 4u);
      } else {
         return bytes % 4u ? RegClass(type, bytes).as_subdword() : RegClass(type, bytes / 4u);
      }
   }

   /* This class resized to "bytes", preserving type and linearity. */
   constexpr RegClass resize(unsigned bytes) const
   {
      if (is_linear_vgpr()) {
         assert(bytes % 4u == 0);
         return get(RegType::vgpr, bytes).as_linear();
      }
      return get(type(), bytes);
   }

private:
   RC rc;
};
347 
/* transitional helper expressions */
/* Shorthand constants so code can write e.g. "v1" instead of
 * "RegClass(RegClass::v1)". */
static constexpr RegClass s1{RegClass::s1};
static constexpr RegClass s2{RegClass::s2};
static constexpr RegClass s3{RegClass::s3};
static constexpr RegClass s4{RegClass::s4};
static constexpr RegClass s8{RegClass::s8};
static constexpr RegClass s16{RegClass::s16};
static constexpr RegClass v1{RegClass::v1};
static constexpr RegClass v2{RegClass::v2};
static constexpr RegClass v3{RegClass::v3};
static constexpr RegClass v4{RegClass::v4};
static constexpr RegClass v5{RegClass::v5};
static constexpr RegClass v6{RegClass::v6};
static constexpr RegClass v7{RegClass::v7};
static constexpr RegClass v8{RegClass::v8};
static constexpr RegClass v1b{RegClass::v1b};
static constexpr RegClass v2b{RegClass::v2b};
static constexpr RegClass v3b{RegClass::v3b};
static constexpr RegClass v4b{RegClass::v4b};
static constexpr RegClass v6b{RegClass::v6b};
static constexpr RegClass v8b{RegClass::v8b};
369 
370 /**
371  * Temp Class
372  * Each temporary virtual register has a
373  * register class (i.e. size and type)
374  * and SSA id.
375  */
376 struct Temp {
TempTemp377    Temp() noexcept : id_(0), reg_class(0) {}
TempTemp378    constexpr Temp(uint32_t id, RegClass cls) noexcept : id_(id), reg_class(uint8_t(cls)) {}
379 
idTemp380    constexpr uint32_t id() const noexcept { return id_; }
regClassTemp381    constexpr RegClass regClass() const noexcept { return (RegClass::RC)reg_class; }
382 
bytesTemp383    constexpr unsigned bytes() const noexcept { return regClass().bytes(); }
sizeTemp384    constexpr unsigned size() const noexcept { return regClass().size(); }
typeTemp385    constexpr RegType type() const noexcept { return regClass().type(); }
is_linearTemp386    constexpr bool is_linear() const noexcept { return regClass().is_linear(); }
387 
388    constexpr bool operator<(Temp other) const noexcept { return id() < other.id(); }
389    constexpr bool operator==(Temp other) const noexcept { return id() == other.id(); }
390    constexpr bool operator!=(Temp other) const noexcept { return id() != other.id(); }
391 
392 private:
393    uint32_t id_ : 24;
394    uint32_t reg_class : 8;
395 };
396 
/**
 * PhysReg
 * A physical register location with byte granularity: reg_b stores
 * (register index * 4 + byte offset) so sub-dword locations can be
 * represented. Comparisons are on the byte-granular value.
 */
struct PhysReg {
   constexpr PhysReg() = default;
   explicit constexpr PhysReg(unsigned r) : reg_b(r << 2) {}
   /* Register index (dword granularity). */
   constexpr unsigned reg() const { return reg_b / 4u; }
   /* Byte offset within the register. */
   constexpr unsigned byte() const { return reg_b % 4u; }
   constexpr operator unsigned() const { return reg(); }
   constexpr bool operator==(PhysReg other) const { return reg_b == other.reg_b; }
   constexpr bool operator!=(PhysReg other) const { return !(*this == other); }
   constexpr bool operator<(PhysReg other) const { return reg_b < other.reg_b; }
   /* Returns a copy advanced by "bytes" bytes; may change both reg() and
    * byte(). */
   constexpr PhysReg advance(int bytes) const
   {
      PhysReg copy = *this;
      copy.reg_b += bytes;
      return copy;
   }

   uint16_t reg_b = 0;
};
420 
/* helper expressions for special registers */
/* The values are the SGPR/operand encoding indices of the corresponding
 * hardware registers. */
static constexpr PhysReg m0{124};
static constexpr PhysReg flat_scr_lo{102}; /* GFX8-GFX9, encoded differently on GFX6-7 */
static constexpr PhysReg flat_scr_hi{103}; /* GFX8-GFX9, encoded differently on GFX6-7 */
static constexpr PhysReg vcc{106};
static constexpr PhysReg vcc_hi{107};
static constexpr PhysReg tba_lo{108}; /* GFX6-GFX8 */
static constexpr PhysReg tba_hi{109}; /* GFX6-GFX8 */
static constexpr PhysReg tma_lo{110}; /* GFX6-GFX8 */
static constexpr PhysReg tma_hi{111}; /* GFX6-GFX8 */
static constexpr PhysReg ttmp0{112};
static constexpr PhysReg ttmp1{113};
static constexpr PhysReg ttmp2{114};
static constexpr PhysReg ttmp3{115};
static constexpr PhysReg ttmp4{116};
static constexpr PhysReg ttmp5{117};
static constexpr PhysReg ttmp6{118};
static constexpr PhysReg ttmp7{119};
static constexpr PhysReg ttmp8{120};
static constexpr PhysReg ttmp9{121};
static constexpr PhysReg ttmp10{122};
static constexpr PhysReg ttmp11{123};
static constexpr PhysReg sgpr_null{125}; /* GFX10+ */
static constexpr PhysReg exec{126};
static constexpr PhysReg exec_lo{126};
static constexpr PhysReg exec_hi{127};
static constexpr PhysReg pops_exiting_wave_id{239}; /* GFX9-GFX10.3 */
static constexpr PhysReg scc{253};
449 
450 /**
451  * Operand Class
452  * Initially, each Operand refers to either
453  * a temporary virtual register
454  * or to a constant value
455  * Temporary registers get mapped to physical register during RA
456  * Constant values are inlined into the instruction sequence.
457  */
458 class Operand final {
459 public:
Operand()460    constexpr Operand()
461        : reg_(PhysReg{128}), isTemp_(false), isFixed_(true), isPrecolored_(false),
462          isConstant_(false), isKill_(false), isUndef_(true), isFirstKill_(false),
463          isLateKill_(false), isClobbered_(false), isCopyKill_(false), is16bit_(false),
464          is24bit_(false), signext(false), constSize(0)
465    {}
466 
Operand(Temp r)467    explicit Operand(Temp r) noexcept
468    {
469       data_.temp = r;
470       if (r.id()) {
471          isTemp_ = true;
472       } else {
473          isUndef_ = true;
474          setFixed(PhysReg{128});
475       }
476    };
Operand(Temp r,PhysReg reg)477    explicit Operand(Temp r, PhysReg reg) noexcept
478    {
479       assert(r.id()); /* Don't allow fixing an undef to a register */
480       data_.temp = r;
481       isTemp_ = true;
482       setPrecolored(reg);
483    };
484 
485    /* 8-bit constant */
c8(uint8_t v)486    static Operand c8(uint8_t v) noexcept
487    {
488       /* 8-bit constants are only used for copies and copies from any 8-bit
489        * constant can be implemented with a SDWA v_mul_u32_u24. So consider all
490        * to be inline constants. */
491       Operand op;
492       op.control_ = 0;
493       op.data_.i = v;
494       op.isConstant_ = true;
495       op.constSize = 0;
496       op.setFixed(PhysReg{0u});
497       return op;
498    };
499 
500    /* 16-bit constant */
c16(uint16_t v)501    static Operand c16(uint16_t v) noexcept
502    {
503       Operand op;
504       op.control_ = 0;
505       op.data_.i = v;
506       op.isConstant_ = true;
507       op.constSize = 1;
508       if (v <= 64)
509          op.setFixed(PhysReg{128u + v});
510       else if (v >= 0xFFF0) /* [-16 .. -1] */
511          op.setFixed(PhysReg{(unsigned)(192 - (int16_t)v)});
512       else if (v == 0x3800) /* 0.5 */
513          op.setFixed(PhysReg{240});
514       else if (v == 0xB800) /* -0.5 */
515          op.setFixed(PhysReg{241});
516       else if (v == 0x3C00) /* 1.0 */
517          op.setFixed(PhysReg{242});
518       else if (v == 0xBC00) /* -1.0 */
519          op.setFixed(PhysReg{243});
520       else if (v == 0x4000) /* 2.0 */
521          op.setFixed(PhysReg{244});
522       else if (v == 0xC000) /* -2.0 */
523          op.setFixed(PhysReg{245});
524       else if (v == 0x4400) /* 4.0 */
525          op.setFixed(PhysReg{246});
526       else if (v == 0xC400) /* -4.0 */
527          op.setFixed(PhysReg{247});
528       else if (v == 0x3118) /* 1/2 PI */
529          op.setFixed(PhysReg{248});
530       else /* Literal Constant */
531          op.setFixed(PhysReg{255});
532       return op;
533    }
534 
535    /* 32-bit constant */
c32(uint32_t v)536    static Operand c32(uint32_t v) noexcept { return c32_or_c64(v, false); }
537 
538    /* 64-bit constant */
c64(uint64_t v)539    static Operand c64(uint64_t v) noexcept
540    {
541       Operand op;
542       op.control_ = 0;
543       op.isConstant_ = true;
544       op.constSize = 3;
545       if (v <= 64) {
546          op.data_.i = (uint32_t)v;
547          op.setFixed(PhysReg{128 + (uint32_t)v});
548       } else if (v >= 0xFFFFFFFFFFFFFFF0) { /* [-16 .. -1] */
549          op.data_.i = (uint32_t)v;
550          op.setFixed(PhysReg{192 - (uint32_t)v});
551       } else if (v == 0x3FE0000000000000) { /* 0.5 */
552          op.data_.i = 0x3f000000;
553          op.setFixed(PhysReg{240});
554       } else if (v == 0xBFE0000000000000) { /* -0.5 */
555          op.data_.i = 0xbf000000;
556          op.setFixed(PhysReg{241});
557       } else if (v == 0x3FF0000000000000) { /* 1.0 */
558          op.data_.i = 0x3f800000;
559          op.setFixed(PhysReg{242});
560       } else if (v == 0xBFF0000000000000) { /* -1.0 */
561          op.data_.i = 0xbf800000;
562          op.setFixed(PhysReg{243});
563       } else if (v == 0x4000000000000000) { /* 2.0 */
564          op.data_.i = 0x40000000;
565          op.setFixed(PhysReg{244});
566       } else if (v == 0xC000000000000000) { /* -2.0 */
567          op.data_.i = 0xc0000000;
568          op.setFixed(PhysReg{245});
569       } else if (v == 0x4010000000000000) { /* 4.0 */
570          op.data_.i = 0x40800000;
571          op.setFixed(PhysReg{246});
572       } else if (v == 0xC010000000000000) { /* -4.0 */
573          op.data_.i = 0xc0800000;
574          op.setFixed(PhysReg{247});
575       } else { /* Literal Constant: we don't know if it is a long or double.*/
576          op.signext = v >> 63;
577          op.data_.i = v & 0xffffffffu;
578          op.setFixed(PhysReg{255});
579          assert(op.constantValue64() == v &&
580                 "attempt to create a unrepresentable 64-bit literal constant");
581       }
582       return op;
583    }
584 
585    /* 32-bit constant stored as a 32-bit or 64-bit operand */
c32_or_c64(uint32_t v,bool is64bit)586    static Operand c32_or_c64(uint32_t v, bool is64bit) noexcept
587    {
588       Operand op;
589       op.control_ = 0;
590       op.data_.i = v;
591       op.isConstant_ = true;
592       op.constSize = is64bit ? 3 : 2;
593       if (v <= 64)
594          op.setFixed(PhysReg{128 + v});
595       else if (v >= 0xFFFFFFF0) /* [-16 .. -1] */
596          op.setFixed(PhysReg{192 - v});
597       else if (v == 0x3f000000) /* 0.5 */
598          op.setFixed(PhysReg{240});
599       else if (v == 0xbf000000) /* -0.5 */
600          op.setFixed(PhysReg{241});
601       else if (v == 0x3f800000) /* 1.0 */
602          op.setFixed(PhysReg{242});
603       else if (v == 0xbf800000) /* -1.0 */
604          op.setFixed(PhysReg{243});
605       else if (v == 0x40000000) /* 2.0 */
606          op.setFixed(PhysReg{244});
607       else if (v == 0xc0000000) /* -2.0 */
608          op.setFixed(PhysReg{245});
609       else if (v == 0x40800000) /* 4.0 */
610          op.setFixed(PhysReg{246});
611       else if (v == 0xc0800000) /* -4.0 */
612          op.setFixed(PhysReg{247});
613       else { /* Literal Constant */
614          assert(!is64bit && "attempt to create a 64-bit literal constant");
615          op.setFixed(PhysReg{255});
616       }
617       return op;
618    }
619 
literal32(uint32_t v)620    static Operand literal32(uint32_t v) noexcept
621    {
622       Operand op;
623       op.control_ = 0;
624       op.data_.i = v;
625       op.isConstant_ = true;
626       op.constSize = 2;
627       op.setFixed(PhysReg{255});
628       return op;
629    }
630 
Operand(RegClass type)631    explicit Operand(RegClass type) noexcept
632    {
633       isUndef_ = true;
634       data_.temp = Temp(0, type);
635       setFixed(PhysReg{128});
636    };
Operand(PhysReg reg,RegClass type)637    explicit Operand(PhysReg reg, RegClass type) noexcept
638    {
639       data_.temp = Temp(0, type);
640       setFixed(reg);
641    }
642 
643    static Operand zero(unsigned bytes = 4) noexcept
644    {
645       if (bytes == 8)
646          return Operand::c64(0);
647       else if (bytes == 4)
648          return Operand::c32(0);
649       else if (bytes == 2)
650          return Operand::c16(0);
651       assert(bytes == 1);
652       return Operand::c8(0);
653    }
654 
655    /* This is useful over the constructors when you want to take a gfx level
656     * for 1/2 PI or an unknown operand size.
657     */
get_const(enum amd_gfx_level chip,uint64_t val,unsigned bytes)658    static Operand get_const(enum amd_gfx_level chip, uint64_t val, unsigned bytes)
659    {
660       if (val == 0x3e22f983 && bytes == 4 && chip >= GFX8) {
661          /* 1/2 PI can be an inline constant on GFX8+ */
662          Operand op = Operand::c32(val);
663          op.setFixed(PhysReg{248});
664          return op;
665       }
666 
667       if (bytes == 8)
668          return Operand::c64(val);
669       else if (bytes == 4)
670          return Operand::c32(val);
671       else if (bytes == 2)
672          return Operand::c16(val);
673       assert(bytes == 1);
674       return Operand::c8(val);
675    }
676 
677    static bool is_constant_representable(uint64_t val, unsigned bytes, bool zext = false,
678                                          bool sext = false)
679    {
680       if (bytes <= 4)
681          return true;
682 
683       if (zext && (val & 0xFFFFFFFF00000000) == 0x0000000000000000)
684          return true;
685       uint64_t upper33 = val & 0xFFFFFFFF80000000;
686       if (sext && (upper33 == 0xFFFFFFFF80000000 || upper33 == 0))
687          return true;
688 
689       return val >= 0xFFFFFFFFFFFFFFF0 || val <= 64 || /* [-16 .. 64] */
690              val == 0x3FE0000000000000 ||              /* 0.5 */
691              val == 0xBFE0000000000000 ||              /* -0.5 */
692              val == 0x3FF0000000000000 ||              /* 1.0 */
693              val == 0xBFF0000000000000 ||              /* -1.0 */
694              val == 0x4000000000000000 ||              /* 2.0 */
695              val == 0xC000000000000000 ||              /* -2.0 */
696              val == 0x4010000000000000 ||              /* 4.0 */
697              val == 0xC010000000000000;                /* -4.0 */
698    }
699 
isTemp()700    constexpr bool isTemp() const noexcept { return isTemp_; }
701 
setTemp(Temp t)702    constexpr void setTemp(Temp t) noexcept
703    {
704       assert(!isConstant_);
705       if (t.id() != 0)
706          isTemp_ = true;
707       data_.temp = t;
708    }
709 
getTemp()710    constexpr Temp getTemp() const noexcept { return data_.temp; }
711 
tempId()712    constexpr uint32_t tempId() const noexcept { return data_.temp.id(); }
713 
hasRegClass()714    constexpr bool hasRegClass() const noexcept { return !isConstant(); }
715 
regClass()716    constexpr RegClass regClass() const noexcept { return data_.temp.regClass(); }
717 
bytes()718    constexpr unsigned bytes() const noexcept
719    {
720       if (isConstant())
721          return 1 << constSize;
722       else
723          return data_.temp.bytes();
724    }
725 
size()726    constexpr unsigned size() const noexcept
727    {
728       if (isConstant())
729          return constSize > 2 ? 2 : 1;
730       else
731          return data_.temp.size();
732    }
733 
isFixed()734    constexpr bool isFixed() const noexcept { return isFixed_; }
735 
physReg()736    constexpr PhysReg physReg() const noexcept { return reg_; }
737 
setFixed(PhysReg reg)738    constexpr void setFixed(PhysReg reg) noexcept
739    {
740       isFixed_ = reg != unsigned(-1);
741       reg_ = reg;
742    }
743 
isPrecolored()744    constexpr bool isPrecolored() const noexcept { return isPrecolored_; }
setPrecolored(PhysReg reg)745    constexpr void setPrecolored(PhysReg reg) noexcept
746    {
747       setFixed(reg);
748       isPrecolored_ = isFixed_;
749    }
750 
isConstant()751    constexpr bool isConstant() const noexcept { return isConstant_; }
752 
isLiteral()753    constexpr bool isLiteral() const noexcept { return isConstant() && reg_ == 255; }
754 
isUndefined()755    constexpr bool isUndefined() const noexcept { return isUndef_; }
756 
constantValue()757    constexpr uint32_t constantValue() const noexcept { return data_.i; }
758 
constantEquals(uint32_t cmp)759    constexpr bool constantEquals(uint32_t cmp) const noexcept
760    {
761       return isConstant() && constantValue() == cmp;
762    }
763 
constantValue64()764    constexpr uint64_t constantValue64() const noexcept
765    {
766       if (constSize == 3) {
767          if (reg_ <= 192)
768             return reg_ - 128;
769          else if (reg_ <= 208)
770             return 0xFFFFFFFFFFFFFFFF - (reg_ - 193);
771 
772          switch (reg_) {
773          case 240: return 0x3FE0000000000000;
774          case 241: return 0xBFE0000000000000;
775          case 242: return 0x3FF0000000000000;
776          case 243: return 0xBFF0000000000000;
777          case 244: return 0x4000000000000000;
778          case 245: return 0xC000000000000000;
779          case 246: return 0x4010000000000000;
780          case 247: return 0xC010000000000000;
781          case 255:
782             return (signext && (data_.i & 0x80000000u) ? 0xffffffff00000000ull : 0ull) | data_.i;
783          }
784          unreachable("invalid register for 64-bit constant");
785       } else {
786          return data_.i;
787       }
788    }
789 
790    /* Value if this were used with vop3/opsel or vop3p. */
constantValue16(bool opsel)791    constexpr uint16_t constantValue16(bool opsel) const noexcept
792    {
793       assert(bytes() == 2 || bytes() == 4);
794       if (opsel) {
795          if (bytes() == 2 && int16_t(data_.i) >= -16 && int16_t(data_.i) <= 64 && !isLiteral())
796             return int16_t(data_.i) >>
797                    16; /* 16-bit inline integers are sign-extended, even with fp16 instrs */
798          else
799             return data_.i >> 16;
800       }
801       return data_.i;
802    }
803 
isOfType(RegType type)804    constexpr bool isOfType(RegType type) const noexcept
805    {
806       return hasRegClass() && regClass().type() == type;
807    }
808 
809    /* Indicates that the killed operand's live range intersects with the
810     * instruction's definitions. Unlike isKill() and isFirstKill(), this is
811     * not set by liveness analysis. */
setLateKill(bool flag)812    constexpr void setLateKill(bool flag) noexcept { isLateKill_ = flag; }
813 
isLateKill()814    constexpr bool isLateKill() const noexcept { return isLateKill_; }
815 
816    /* Indicates that the Operand's register gets clobbered by the instruction. */
setClobbered(bool flag)817    constexpr void setClobbered(bool flag) noexcept { isClobbered_ = flag; }
isClobbered()818    constexpr bool isClobbered() const noexcept { return isClobbered_; }
819 
820    /* Indicates that the Operand must be copied in order to satisfy register
821     * constraints. The copy is immediately killed by the instruction.
822     */
setCopyKill(bool flag)823    constexpr void setCopyKill(bool flag) noexcept
824    {
825       isCopyKill_ = flag;
826       if (flag)
827          setKill(flag);
828    }
isCopyKill()829    constexpr bool isCopyKill() const noexcept { return isCopyKill_; }
830 
setKill(bool flag)831    constexpr void setKill(bool flag) noexcept
832    {
833       isKill_ = flag;
834       if (!flag) {
835          setFirstKill(false);
836          setCopyKill(false);
837       }
838    }
839 
isKill()840    constexpr bool isKill() const noexcept { return isKill_ || isFirstKill(); }
841 
setFirstKill(bool flag)842    constexpr void setFirstKill(bool flag) noexcept
843    {
844       isFirstKill_ = flag;
845       if (flag)
846          setKill(flag);
847    }
848 
849    /* When there are multiple operands killing the same temporary,
850     * isFirstKill() is only returns true for the first one. */
isFirstKill()851    constexpr bool isFirstKill() const noexcept { return isFirstKill_; }
852 
isKillBeforeDef()853    constexpr bool isKillBeforeDef() const noexcept { return isKill() && !isLateKill(); }
854 
isFirstKillBeforeDef()855    constexpr bool isFirstKillBeforeDef() const noexcept { return isFirstKill() && !isLateKill(); }
856 
857    constexpr bool operator==(Operand other) const noexcept
858    {
859       if (other.bytes() != bytes())
860          return false;
861       if (isFixed() != other.isFixed() || isKillBeforeDef() != other.isKillBeforeDef())
862          return false;
863       if (isFixed() && physReg() != other.physReg())
864          return false;
865       if (hasRegClass() && (!other.hasRegClass() || other.regClass() != regClass()))
866          return false;
867 
868       if (isConstant())
869          return other.isConstant() && other.constantValue64() == constantValue64();
870       else if (isUndefined())
871          return other.isUndefined();
872       else if (isTemp())
873          return other.isTemp() && other.getTemp() == getTemp();
874       else
875          return true;
876    }
877 
878    constexpr bool operator!=(Operand other) const noexcept { return !operator==(other); }
879 
set16bit(bool flag)880    constexpr void set16bit(bool flag) noexcept { is16bit_ = flag; }
881 
is16bit()882    constexpr bool is16bit() const noexcept { return is16bit_; }
883 
set24bit(bool flag)884    constexpr void set24bit(bool flag) noexcept { is24bit_ = flag; }
885 
is24bit()886    constexpr bool is24bit() const noexcept { return is24bit_; }
887 
private:
   /* Value payload: either a virtual register, or the raw bits of an inline
    * constant viewable as integer or float. */
   union {
      Temp temp;
      uint32_t i;
      float f;
   } data_ = {Temp(0, s1)};
   PhysReg reg_;
   /* Flag bits, overlaid by control_ so the whole set can be zero-initialized. */
   union {
      struct {
         uint8_t isTemp_ : 1;
         uint8_t isFixed_ : 1;
         uint8_t isPrecolored_ : 1;
         uint8_t isConstant_ : 1;
         uint8_t isKill_ : 1;
         uint8_t isUndef_ : 1;
         uint8_t isFirstKill_ : 1;
         uint8_t isLateKill_ : 1;
         uint8_t isClobbered_ : 1;
         uint8_t isCopyKill_ : 1;
         uint8_t is16bit_ : 1;
         uint8_t is24bit_ : 1;
         uint8_t signext : 1;
         uint8_t constSize : 2;
      };
      /* can't initialize bit-fields in c++11, so work around using a union */
      uint16_t control_ = 0;
   };
};
916 
/**
 * Definition Class
 * Definitions are the results of Instructions
 * and refer to temporary virtual registers
 * which are later mapped to physical registers
 */
class Definition final {
public:
   constexpr Definition()
       : temp(Temp(0, s1)), reg_(0), isFixed_(0), isPrecolored_(0), isKill_(0), isPrecise_(0),
         isInfPreserve_(0), isNaNPreserve_(0), isSZPreserve_(0), isNUW_(0), isNoCSE_(0)
   {}
   /* Result written to the virtual register `tmp`. */
   explicit Definition(Temp tmp) noexcept : temp(tmp) {}
   /* Result fixed to a physical register without a virtual register (id 0). */
   explicit Definition(PhysReg reg, RegClass type) noexcept : temp(Temp(0, type)) { setFixed(reg); }
   /* Virtual register that is pre-colored to a fixed physical register. */
   explicit Definition(Temp tmp, PhysReg reg) noexcept : temp(tmp) { setPrecolored(reg); }

   /* True iff this Definition writes an actual temporary (id 0 means none). */
   constexpr bool isTemp() const noexcept { return tempId() > 0; }

   constexpr Temp getTemp() const noexcept { return temp; }

   constexpr uint32_t tempId() const noexcept { return temp.id(); }

   constexpr void setTemp(Temp t) noexcept { temp = t; }

   void swapTemp(Definition& other) noexcept { std::swap(temp, other.temp); }

   constexpr RegClass regClass() const noexcept { return temp.regClass(); }

   /* Size of the written value, in bytes resp. dwords (delegates to temp). */
   constexpr unsigned bytes() const noexcept { return temp.bytes(); }

   constexpr unsigned size() const noexcept { return temp.size(); }

   /* Whether a physical register has been assigned. */
   constexpr bool isFixed() const noexcept { return isFixed_; }

   constexpr PhysReg physReg() const noexcept { return reg_; }

   constexpr void setFixed(PhysReg reg) noexcept
   {
      isFixed_ = 1;
      reg_ = reg;
   }

   /* Precolored definitions are fixed (setPrecolored implies setFixed). */
   constexpr bool isPrecolored() const noexcept { return isPrecolored_; }
   constexpr void setPrecolored(PhysReg reg) noexcept
   {
      setFixed(reg);
      isPrecolored_ = isFixed_;
   }

   constexpr void setKill(bool flag) noexcept { isKill_ = flag; }

   constexpr bool isKill() const noexcept { return isKill_; }

   /* FP result attributes: precise and Inf/NaN/signed-zero preservation.
    * NOTE(review): semantics are consumed elsewhere in the compiler —
    * see the producers/users of these flags for the exact contract. */
   constexpr void setPrecise(bool precise) noexcept { isPrecise_ = precise; }

   constexpr bool isPrecise() const noexcept { return isPrecise_; }

   constexpr void setInfPreserve(bool inf_preserve) noexcept { isInfPreserve_ = inf_preserve; }

   constexpr bool isInfPreserve() const noexcept { return isInfPreserve_; }

   constexpr void setNaNPreserve(bool nan_preserve) noexcept { isNaNPreserve_ = nan_preserve; }

   constexpr bool isNaNPreserve() const noexcept { return isNaNPreserve_; }

   constexpr void setSZPreserve(bool sz_preserve) noexcept { isSZPreserve_ = sz_preserve; }

   constexpr bool isSZPreserve() const noexcept { return isSZPreserve_; }

   /* No Unsigned Wrap */
   constexpr void setNUW(bool nuw) noexcept { isNUW_ = nuw; }

   constexpr bool isNUW() const noexcept { return isNUW_; }

   /* NOTE(review): presumably excludes this result from value numbering/CSE
    * — confirm against the value-numbering pass. */
   constexpr void setNoCSE(bool noCSE) noexcept { isNoCSE_ = noCSE; }

   constexpr bool isNoCSE() const noexcept { return isNoCSE_; }

private:
   Temp temp = Temp(0, s1);
   PhysReg reg_;
   /* Flag bits, overlaid by control_ so they can be zero-initialized. */
   union {
      struct {
         uint8_t isFixed_ : 1;
         uint8_t isPrecolored_ : 1;
         uint8_t isKill_ : 1;
         uint8_t isPrecise_ : 1;
         uint8_t isInfPreserve_ : 1;
         uint8_t isNaNPreserve_ : 1;
         uint8_t isSZPreserve_ : 1;
         uint8_t isNUW_ : 1;
         uint8_t isNoCSE_ : 1;
      };
      /* can't initialize bit-fields in c++11, so work around using a union */
      uint16_t control_ = 0;
   };
};
1014 
1015 struct RegisterDemand {
1016    constexpr RegisterDemand() = default;
RegisterDemandRegisterDemand1017    constexpr RegisterDemand(const int16_t v, const int16_t s) noexcept : vgpr{v}, sgpr{s} {}
1018    int16_t vgpr = 0;
1019    int16_t sgpr = 0;
1020 
1021    constexpr friend bool operator==(const RegisterDemand a, const RegisterDemand b) noexcept
1022    {
1023       return a.vgpr == b.vgpr && a.sgpr == b.sgpr;
1024    }
1025 
exceedsRegisterDemand1026    constexpr bool exceeds(const RegisterDemand other) const noexcept
1027    {
1028       return vgpr > other.vgpr || sgpr > other.sgpr;
1029    }
1030 
1031    constexpr RegisterDemand operator+(const Temp t) const noexcept
1032    {
1033       if (t.type() == RegType::sgpr)
1034          return RegisterDemand(vgpr, sgpr + t.size());
1035       else
1036          return RegisterDemand(vgpr + t.size(), sgpr);
1037    }
1038 
1039    constexpr RegisterDemand operator+(const RegisterDemand other) const noexcept
1040    {
1041       return RegisterDemand(vgpr + other.vgpr, sgpr + other.sgpr);
1042    }
1043 
1044    constexpr RegisterDemand operator-(const RegisterDemand other) const noexcept
1045    {
1046       return RegisterDemand(vgpr - other.vgpr, sgpr - other.sgpr);
1047    }
1048 
1049    constexpr RegisterDemand& operator+=(const RegisterDemand other) noexcept
1050    {
1051       vgpr += other.vgpr;
1052       sgpr += other.sgpr;
1053       return *this;
1054    }
1055 
1056    constexpr RegisterDemand& operator-=(const RegisterDemand other) noexcept
1057    {
1058       vgpr -= other.vgpr;
1059       sgpr -= other.sgpr;
1060       return *this;
1061    }
1062 
1063    constexpr RegisterDemand& operator+=(const Temp t) noexcept
1064    {
1065       if (t.type() == RegType::sgpr)
1066          sgpr += t.size();
1067       else
1068          vgpr += t.size();
1069       return *this;
1070    }
1071 
1072    constexpr RegisterDemand& operator-=(const Temp t) noexcept
1073    {
1074       if (t.type() == RegType::sgpr)
1075          sgpr -= t.size();
1076       else
1077          vgpr -= t.size();
1078       return *this;
1079    }
1080 
updateRegisterDemand1081    constexpr void update(const RegisterDemand other) noexcept
1082    {
1083       vgpr = std::max(vgpr, other.vgpr);
1084       sgpr = std::max(sgpr, other.sgpr);
1085    }
1086 };
1087 
1088 struct Block;
1089 struct Instruction;
1090 struct Pseudo_instruction;
1091 struct SALU_instruction;
1092 struct SMEM_instruction;
1093 struct DS_instruction;
1094 struct LDSDIR_instruction;
1095 struct MTBUF_instruction;
1096 struct MUBUF_instruction;
1097 struct MIMG_instruction;
1098 struct Export_instruction;
1099 struct FLAT_instruction;
1100 struct Pseudo_branch_instruction;
1101 struct Pseudo_barrier_instruction;
1102 struct Pseudo_reduction_instruction;
1103 struct VALU_instruction;
1104 struct VINTERP_inreg_instruction;
1105 struct VINTRP_instruction;
1106 struct VOPD_instruction;
1107 struct DPP16_instruction;
1108 struct DPP8_instruction;
1109 struct SDWA_instruction;
1110 
/**
 * Common base of all ACO IR instructions.
 *
 * `format` selects which of the derived structs below this Instruction
 * actually is: each typed accessor (pseudo(), smem(), valu(), ...) asserts
 * the expected format and then casts `this` to the derived type.
 */
struct Instruction {
   aco_opcode opcode;
   Format format;
   /* Per-pass scratch data; only one union member is meaningful at a time. */
   union {
      uint32_t pass_flags;
      RegisterDemand register_demand;
   };

   aco::span<Operand> operands;
   aco::span<Definition> definitions;

   constexpr bool usesModifiers() const noexcept; /* defined out of line */

   /* Whether any operand is fixed to one half of the exec mask. */
   constexpr bool reads_exec() const noexcept
   {
      for (const Operand& op : operands) {
         if (op.isFixed() && (op.physReg() == exec_lo || op.physReg() == exec_hi))
            return true;
      }
      return false;
   }

   /* Whether any definition is fixed to one half of the exec mask. */
   constexpr bool writes_exec() const noexcept
   {
      for (const Definition& def : definitions) {
         if (def.isFixed() && (def.physReg() == exec_lo || def.physReg() == exec_hi))
            return true;
      }
      return false;
   }

   Pseudo_instruction& pseudo() noexcept
   {
      assert(isPseudo());
      return *(Pseudo_instruction*)this;
   }
   const Pseudo_instruction& pseudo() const noexcept
   {
      assert(isPseudo());
      return *(Pseudo_instruction*)this;
   }
   constexpr bool isPseudo() const noexcept { return format == Format::PSEUDO; }

   constexpr bool isSOP1() const noexcept { return format == Format::SOP1; }
   constexpr bool isSOP2() const noexcept { return format == Format::SOP2; }
   constexpr bool isSOPK() const noexcept { return format == Format::SOPK; }
   constexpr bool isSOPP() const noexcept { return format == Format::SOPP; }
   constexpr bool isSOPC() const noexcept { return format == Format::SOPC; }

   SMEM_instruction& smem() noexcept
   {
      assert(isSMEM());
      return *(SMEM_instruction*)this;
   }
   const SMEM_instruction& smem() const noexcept
   {
      assert(isSMEM());
      return *(SMEM_instruction*)this;
   }
   constexpr bool isSMEM() const noexcept { return format == Format::SMEM; }
   DS_instruction& ds() noexcept
   {
      assert(isDS());
      return *(DS_instruction*)this;
   }
   const DS_instruction& ds() const noexcept
   {
      assert(isDS());
      return *(DS_instruction*)this;
   }
   constexpr bool isDS() const noexcept { return format == Format::DS; }
   LDSDIR_instruction& ldsdir() noexcept
   {
      assert(isLDSDIR());
      return *(LDSDIR_instruction*)this;
   }
   const LDSDIR_instruction& ldsdir() const noexcept
   {
      assert(isLDSDIR());
      return *(LDSDIR_instruction*)this;
   }
   constexpr bool isLDSDIR() const noexcept { return format == Format::LDSDIR; }
   MTBUF_instruction& mtbuf() noexcept
   {
      assert(isMTBUF());
      return *(MTBUF_instruction*)this;
   }
   const MTBUF_instruction& mtbuf() const noexcept
   {
      assert(isMTBUF());
      return *(MTBUF_instruction*)this;
   }
   constexpr bool isMTBUF() const noexcept { return format == Format::MTBUF; }
   MUBUF_instruction& mubuf() noexcept
   {
      assert(isMUBUF());
      return *(MUBUF_instruction*)this;
   }
   const MUBUF_instruction& mubuf() const noexcept
   {
      assert(isMUBUF());
      return *(MUBUF_instruction*)this;
   }
   constexpr bool isMUBUF() const noexcept { return format == Format::MUBUF; }
   MIMG_instruction& mimg() noexcept
   {
      assert(isMIMG());
      return *(MIMG_instruction*)this;
   }
   const MIMG_instruction& mimg() const noexcept
   {
      assert(isMIMG());
      return *(MIMG_instruction*)this;
   }
   constexpr bool isMIMG() const noexcept { return format == Format::MIMG; }
   Export_instruction& exp() noexcept
   {
      assert(isEXP());
      return *(Export_instruction*)this;
   }
   const Export_instruction& exp() const noexcept
   {
      assert(isEXP());
      return *(Export_instruction*)this;
   }
   constexpr bool isEXP() const noexcept { return format == Format::EXP; }
   /* FLAT, GLOBAL and SCRATCH share the FLAT_instruction layout. */
   FLAT_instruction& flat() noexcept
   {
      assert(isFlat());
      return *(FLAT_instruction*)this;
   }
   const FLAT_instruction& flat() const noexcept
   {
      assert(isFlat());
      return *(FLAT_instruction*)this;
   }
   constexpr bool isFlat() const noexcept { return format == Format::FLAT; }
   FLAT_instruction& global() noexcept
   {
      assert(isGlobal());
      return *(FLAT_instruction*)this;
   }
   const FLAT_instruction& global() const noexcept
   {
      assert(isGlobal());
      return *(FLAT_instruction*)this;
   }
   constexpr bool isGlobal() const noexcept { return format == Format::GLOBAL; }
   FLAT_instruction& scratch() noexcept
   {
      assert(isScratch());
      return *(FLAT_instruction*)this;
   }
   const FLAT_instruction& scratch() const noexcept
   {
      assert(isScratch());
      return *(FLAT_instruction*)this;
   }
   constexpr bool isScratch() const noexcept { return format == Format::SCRATCH; }
   Pseudo_branch_instruction& branch() noexcept
   {
      assert(isBranch());
      return *(Pseudo_branch_instruction*)this;
   }
   const Pseudo_branch_instruction& branch() const noexcept
   {
      assert(isBranch());
      return *(Pseudo_branch_instruction*)this;
   }
   constexpr bool isBranch() const noexcept { return format == Format::PSEUDO_BRANCH; }
   Pseudo_barrier_instruction& barrier() noexcept
   {
      assert(isBarrier());
      return *(Pseudo_barrier_instruction*)this;
   }
   const Pseudo_barrier_instruction& barrier() const noexcept
   {
      assert(isBarrier());
      return *(Pseudo_barrier_instruction*)this;
   }
   constexpr bool isBarrier() const noexcept { return format == Format::PSEUDO_BARRIER; }
   Pseudo_reduction_instruction& reduction() noexcept
   {
      assert(isReduction());
      return *(Pseudo_reduction_instruction*)this;
   }
   const Pseudo_reduction_instruction& reduction() const noexcept
   {
      assert(isReduction());
      return *(Pseudo_reduction_instruction*)this;
   }
   constexpr bool isReduction() const noexcept { return format == Format::PSEUDO_REDUCTION; }
   /* VOP3/VOP3P/VOP1/VOP2/VOPC/DPP/SDWA act as bits that can be combined
    * with a base format, hence the bit-test instead of an equality check. */
   constexpr bool isVOP3P() const noexcept { return (uint16_t)format & (uint16_t)Format::VOP3P; }
   VINTERP_inreg_instruction& vinterp_inreg() noexcept
   {
      assert(isVINTERP_INREG());
      return *(VINTERP_inreg_instruction*)this;
   }
   const VINTERP_inreg_instruction& vinterp_inreg() const noexcept
   {
      assert(isVINTERP_INREG());
      return *(VINTERP_inreg_instruction*)this;
   }
   constexpr bool isVINTERP_INREG() const noexcept { return format == Format::VINTERP_INREG; }
   VOPD_instruction& vopd() noexcept
   {
      assert(isVOPD());
      return *(VOPD_instruction*)this;
   }
   const VOPD_instruction& vopd() const noexcept
   {
      assert(isVOPD());
      return *(VOPD_instruction*)this;
   }
   constexpr bool isVOPD() const noexcept { return format == Format::VOPD; }
   constexpr bool isVOP1() const noexcept { return (uint16_t)format & (uint16_t)Format::VOP1; }
   constexpr bool isVOP2() const noexcept { return (uint16_t)format & (uint16_t)Format::VOP2; }
   constexpr bool isVOPC() const noexcept { return (uint16_t)format & (uint16_t)Format::VOPC; }
   constexpr bool isVOP3() const noexcept { return (uint16_t)format & (uint16_t)Format::VOP3; }
   VINTRP_instruction& vintrp() noexcept
   {
      assert(isVINTRP());
      return *(VINTRP_instruction*)this;
   }
   const VINTRP_instruction& vintrp() const noexcept
   {
      assert(isVINTRP());
      return *(VINTRP_instruction*)this;
   }
   constexpr bool isVINTRP() const noexcept { return format == Format::VINTRP; }
   DPP16_instruction& dpp16() noexcept
   {
      assert(isDPP16());
      return *(DPP16_instruction*)this;
   }
   const DPP16_instruction& dpp16() const noexcept
   {
      assert(isDPP16());
      return *(DPP16_instruction*)this;
   }
   constexpr bool isDPP16() const noexcept { return (uint16_t)format & (uint16_t)Format::DPP16; }
   DPP8_instruction& dpp8() noexcept
   {
      assert(isDPP8());
      return *(DPP8_instruction*)this;
   }
   const DPP8_instruction& dpp8() const noexcept
   {
      assert(isDPP8());
      return *(DPP8_instruction*)this;
   }
   constexpr bool isDPP8() const noexcept { return (uint16_t)format & (uint16_t)Format::DPP8; }
   constexpr bool isDPP() const noexcept { return isDPP16() || isDPP8(); }
   SDWA_instruction& sdwa() noexcept
   {
      assert(isSDWA());
      return *(SDWA_instruction*)this;
   }
   const SDWA_instruction& sdwa() const noexcept
   {
      assert(isSDWA());
      return *(SDWA_instruction*)this;
   }
   constexpr bool isSDWA() const noexcept { return (uint16_t)format & (uint16_t)Format::SDWA; }

   /* Unchecked cast: callers must already know this is FLAT/GLOBAL/SCRATCH. */
   FLAT_instruction& flatlike() { return *(FLAT_instruction*)this; }

   const FLAT_instruction& flatlike() const { return *(FLAT_instruction*)this; }

   constexpr bool isFlatLike() const noexcept { return isFlat() || isGlobal() || isScratch(); }

   VALU_instruction& valu() noexcept
   {
      assert(isVALU());
      return *(VALU_instruction*)this;
   }
   const VALU_instruction& valu() const noexcept
   {
      assert(isVALU());
      return *(VALU_instruction*)this;
   }
   /* Any vector-ALU encoding. */
   constexpr bool isVALU() const noexcept
   {
      return isVOP1() || isVOP2() || isVOPC() || isVOP3() || isVOP3P() || isVINTERP_INREG() ||
             isVOPD();
   }

   SALU_instruction& salu() noexcept
   {
      assert(isSALU());
      return *(SALU_instruction*)this;
   }
   const SALU_instruction& salu() const noexcept
   {
      assert(isSALU());
      return *(SALU_instruction*)this;
   }
   /* Any scalar-ALU encoding. */
   constexpr bool isSALU() const noexcept
   {
      return isSOP1() || isSOP2() || isSOPC() || isSOPK() || isSOPP();
   }

   /* Buffer/image vector-memory encodings (FLAT-like are separate). */
   constexpr bool isVMEM() const noexcept { return isMTBUF() || isMUBUF() || isMIMG(); }

   /* Defined out of line. */
   bool accessesLDS() const noexcept;
   bool isTrans() const noexcept;
};
static_assert(sizeof(Instruction) == 16, "Unexpected padding");
1419 
/* Common struct for all scalar-ALU formats (SOP1/SOP2/SOPK/SOPP/SOPC). */
struct SALU_instruction : public Instruction {
   /* In case of SOPP branch instructions, contains the Block index,
    * and otherwise, for SOPP and SOPK the 16-bit signed immediate.
    */
   uint32_t imm;
};
static_assert(sizeof(SALU_instruction) == sizeof(Instruction) + 4, "Unexpected padding");
1427 
/**
 * Scalar Memory Format:
 * For s_(buffer_)load_dword*:
 * Operand(0): SBASE - SGPR-pair which provides base address
 * Operand(1): Offset - immediate (un)signed offset or SGPR
 * Operand(2) / Definition(0): SDATA - SGPR for read / write result
 * Operand(n-1): SOffset - SGPR offset (Vega only)
 *
 * Having no operands is also valid for instructions such as s_dcache_inv.
 *
 */
struct SMEM_instruction : public Instruction {
   memory_sync_info sync;   /* memory synchronization info for this access */
   ac_hw_cache_flags cache; /* hardware cache policy */
};
static_assert(sizeof(SMEM_instruction) == sizeof(Instruction) + 4, "Unexpected padding");
1444 
/* Common struct for all vector-ALU encodings. All input/output modifiers are
 * packed into one 32-bit word; union members with the same bit range are
 * encoding-dependent aliases (e.g. neg/neg_lo, abs/neg_hi). */
struct VALU_instruction : public Instruction {
   union {
      bitfield_array8<uint32_t, 0, 3> neg;    /* VOP3, SDWA, DPP16, v_fma_mix, VINTERP_inreg */
      bitfield_array8<uint32_t, 0, 3> neg_lo; /* VOP3P */

      bitfield_array8<uint32_t, 3, 3> abs;    /* VOP3, SDWA, DPP16, v_fma_mix */
      bitfield_array8<uint32_t, 3, 3> neg_hi; /* VOP3P */

      bitfield_array8<uint32_t, 6, 4> opsel;     /* VOP3, VOPC12(GFX11+), VINTERP_inreg */
      bitfield_uint8<uint32_t, 10, 2> omod;      /* VOP3, SDWA(GFX9+) */
      bitfield_array8<uint32_t, 12, 3> opsel_lo; /* VOP3P */
      bitfield_array8<uint32_t, 15, 3> opsel_hi; /* VOP3P */
      bitfield_bool<uint32_t, 18> clamp;         /* VOP3, VOP3P, SDWA, VINTERP_inreg */
   };

   /* Defined out of line; NOTE(review): presumably also swaps the
    * per-operand modifier bits above — confirm in the implementation. */
   void swapOperands(unsigned idx0, unsigned idx1);
};
static_assert(sizeof(VALU_instruction) == sizeof(Instruction) + 4, "Unexpected padding");
1463 
/* GFX11+ VINTERP (new-style interpolation) instruction. */
struct VINTERP_inreg_instruction : public VALU_instruction {
   uint8_t wait_exp : 3; /* NOTE(review): presumably the WAIT_EXP field of the encoding - confirm */
   uint8_t padding3 : 5;
   uint8_t padding4;
   uint8_t padding5;
   uint8_t padding6;
};
static_assert(sizeof(VINTERP_inreg_instruction) == sizeof(VALU_instruction) + 4,
              "Unexpected padding");
1473 
/* Dual-issue VALU instruction pair; `opcode` holds the X opcode. */
struct VOPD_instruction : public VALU_instruction {
   aco_opcode opy; /* opcode of the second (Y) instruction of the pair */
   uint16_t padding;
};
static_assert(sizeof(VOPD_instruction) == sizeof(VALU_instruction) + 4, "Unexpected padding");
1479 
/**
 * Data Parallel Primitives Format:
 * This format can be used for VOP1, VOP2 or VOPC instructions.
 * The swizzle applies to the src0 operand.
 *
 */
struct DPP16_instruction : public VALU_instruction {
   uint16_t dpp_ctrl;          /* raw DPP control field selecting the swizzle */
   uint8_t row_mask : 4;
   uint8_t bank_mask : 4;
   bool bound_ctrl : 1;
   uint8_t fetch_inactive : 1; /* NOTE(review): presumably reads inactive lanes - confirm */
   uint8_t padding3 : 6;
};
static_assert(sizeof(DPP16_instruction) == sizeof(VALU_instruction) + 4, "Unexpected padding");
1495 
/* DPP8: swizzle within a group of 8 lanes (applies to src0). */
struct DPP8_instruction : public VALU_instruction {
   uint32_t lane_sel : 24; /* eight 3-bit source-lane selectors */
   uint32_t fetch_inactive : 1;
   uint32_t padding : 7;
};
static_assert(sizeof(DPP8_instruction) == sizeof(VALU_instruction) + 4, "Unexpected padding");
1502 
/* Compact description of a sub-dword byte selection: a size (1, 2 or 4
 * bytes), a byte offset within the dword, and an optional sign extension.
 * Encoded in one byte: bit 5 = sign-extend, bits 2-4 = size, bits 0-1 = offset. */
struct SubdwordSel {
   enum sdwa_sel : uint8_t {
      ubyte = 0x4,
      uword = 0x8,
      dword = 0x10,
      sext = 0x20,
      sbyte = ubyte | sext,
      sword = uword | sext,

      ubyte0 = ubyte,
      ubyte1 = ubyte | 1,
      ubyte2 = ubyte | 2,
      ubyte3 = ubyte | 3,
      sbyte0 = sbyte,
      sbyte1 = sbyte | 1,
      sbyte2 = sbyte | 2,
      sbyte3 = sbyte | 3,
      uword0 = uword,
      uword1 = uword | 2,
      sword0 = sword,
      sword1 = sword | 2,
   };

   SubdwordSel() : sel((sdwa_sel)0) {}
   constexpr SubdwordSel(sdwa_sel sel_) : sel(sel_) {}
   constexpr SubdwordSel(unsigned size, unsigned offset, bool sign_extend)
       : sel((sdwa_sel)((size << 2) | offset | (sign_extend ? sext : 0)))
   {}
   constexpr operator sdwa_sel() const { return sel; }
   /* A default-constructed selection (sel == 0) is "no selection". */
   explicit operator bool() const { return sel != 0; }

   /* Selection size in bytes. */
   constexpr unsigned size() const { return (sel & 0x1c) >> 2; }
   /* Byte offset within the dword. */
   constexpr unsigned offset() const { return sel & 0x3; }
   constexpr bool sign_extend() const { return (sel & sext) != 0; }
   /* Translate to the hardware SDWA sel encoding for a register starting at
    * `reg_byte_offset`: 0-3 select a byte, 4-5 a word, 6 the full dword. */
   constexpr unsigned to_sdwa_sel(unsigned reg_byte_offset) const
   {
      const unsigned byte = reg_byte_offset + offset();
      switch (size()) {
      case 1: return byte;
      case 2: return 4 + (byte >> 1);
      default: return 6;
      }
   }

private:
   sdwa_sel sel;
};
1551 
/**
 * Sub-Dword Addressing Format:
 * This format can be used for VOP1, VOP2 or VOPC instructions.
 *
 * omod and SGPR/constant operands are only available on GFX9+. For VOPC,
 * the definition doesn't have to be VCC on GFX9+.
 *
 */
struct SDWA_instruction : public VALU_instruction {
   /* these destination modifiers aren't available with VOPC except for
    * clamp on GFX8 */
   SubdwordSel sel[2];  /* per-source sub-dword selection (src0, src1) */
   SubdwordSel dst_sel; /* destination sub-dword selection */
   uint8_t padding3;
};
static_assert(sizeof(SDWA_instruction) == sizeof(VALU_instruction) + 4, "Unexpected padding");
1568 
/* Legacy (pre-GFX11) interpolation instruction. */
struct VINTRP_instruction : public Instruction {
   uint8_t attribute;
   uint8_t component;
   bool high_16bits; /* NOTE(review): presumably selects the high half of a 16-bit dest - confirm */
   uint8_t padding;
};
static_assert(sizeof(VINTRP_instruction) == sizeof(Instruction) + 4, "Unexpected padding");
1576 
/**
 * Local and Global Data Sharing instructions
 * Operand(0): ADDR - VGPR which supplies the address.
 * Operand(1): DATA0 - First data VGPR.
 * Operand(2): DATA1 - Second data VGPR.
 * Operand(n-1): M0 - LDS size.
 * Definition(0): VDST - Destination VGPR when results returned to VGPRs.
 *
 */
struct DS_instruction : public Instruction {
   memory_sync_info sync; /* memory synchronization info for this access */
   bool gds;              /* operate on GDS instead of LDS */
   uint16_t offset0;      /* immediate offset; NOTE(review): offset0/offset1 split is
                           * presumably for the two-address *2* opcodes - confirm */
   uint8_t offset1;
   uint8_t padding;
};
static_assert(sizeof(DS_instruction) == sizeof(Instruction) + 8, "Unexpected padding");
1594 
/**
 * LDS Direct instructions
 * Operand(0): M0
 * Definition(0): VDST - Destination VGPR
 */
struct LDSDIR_instruction : public Instruction {
   memory_sync_info sync;  /* memory synchronization info for this access */
   uint8_t attr : 6;       /* attribute index */
   uint8_t attr_chan : 2;  /* attribute channel */
   uint32_t wait_vdst : 4; /* NOTE(review): presumably the encoding's waitVDST counter - confirm */
   uint32_t wait_vsrc : 1; /* NOTE(review): presumably the encoding's waitVSRC bit - confirm */
   uint32_t padding : 27;
};
static_assert(sizeof(LDSDIR_instruction) == sizeof(Instruction) + 8, "Unexpected padding");
1609 
/**
 * Vector Memory Untyped-buffer Instructions
 * Operand(0): SRSRC - Specifies which SGPR supplies T# (resource constant)
 * Operand(1): VADDR - Address source. Can carry an index and/or offset
 * Operand(2): SOFFSET - SGPR to supply unsigned byte offset. (SGPR, M0, or inline constant)
 * Operand(3) / Definition(0): VDATA - Vector GPR for write result / read data
 *
 */
struct MUBUF_instruction : public Instruction {
   memory_sync_info sync;    /* memory synchronization info for this access */
   ac_hw_cache_flags cache;  /* hardware cache policy */
   bool offen : 1;           /* Supply an offset from VGPR (VADDR) */
   bool idxen : 1;           /* Supply an index from VGPR (VADDR) */
   bool addr64 : 1;          /* SI, CIK: Address size is 64-bit */
   bool tfe : 1;             /* texture fail enable */
   bool lds : 1;             /* Return read-data to LDS instead of VGPRs */
   bool disable_wqm : 1;     /* Require an exec mask without helper invocations */
   uint8_t padding0 : 2;
   uint8_t padding1;
   uint16_t offset; /* Unsigned byte offset - 12 bit */
};
static_assert(sizeof(MUBUF_instruction) == sizeof(Instruction) + 8, "Unexpected padding");
1632 
/**
 * Vector Memory Typed-buffer Instructions
 * Operand(0): SRSRC - Specifies which SGPR supplies T# (resource constant)
 * Operand(1): VADDR - Address source. Can carry an index and/or offset
 * Operand(2): SOFFSET - SGPR to supply unsigned byte offset. (SGPR, M0, or inline constant)
 * Operand(3) / Definition(0): VDATA - Vector GPR for write result / read data
 *
 */
struct MTBUF_instruction : public Instruction {
   memory_sync_info sync;    /* memory synchronization info for this access */
   ac_hw_cache_flags cache;  /* hardware cache policy */
   uint8_t dfmt : 4;         /* Data Format of data in memory buffer */
   uint8_t nfmt : 3;         /* Numeric format of data in memory */
   bool offen : 1;           /* Supply an offset from VGPR (VADDR) */
   bool idxen : 1;           /* Supply an index from VGPR (VADDR) */
   bool tfe : 1;             /* texture fail enable */
   bool disable_wqm : 1;     /* Require an exec mask without helper invocations */
   uint8_t padding : 5;
   uint16_t offset; /* Unsigned byte offset - 12 bit */
};
static_assert(sizeof(MTBUF_instruction) == sizeof(Instruction) + 8, "Unexpected padding");
1654 
/**
 * Vector Memory Image Instructions
 * Operand(0) SRSRC - Scalar GPR that specifies the resource constant.
 * Operand(1): SSAMP - Scalar GPR that specifies sampler constant.
 * Operand(2): VDATA - Vector GPR for write data or zero if TFE/LWE=1.
 * Operand(3): VADDR - Address source. Can carry an offset or an index.
 * Definition(0): VDATA - Vector GPR for read result.
 *
 */
struct MIMG_instruction : public Instruction {
   memory_sync_info sync;   /* memory synchronization info for this access */
   ac_hw_cache_flags cache; /* hardware cache policy */
   uint8_t dmask;        /* Data VGPR enable mask */
   uint8_t dim : 3;      /* NAVI: dimensionality */
   bool unrm : 1;        /* Force address to be un-normalized */
   bool tfe : 1;         /* texture fail enable */
   bool da : 1;          /* declare an array */
   bool lwe : 1;         /* LOD warning enable */
   bool r128 : 1;        /* NAVI: Texture resource size */
   bool a16 : 1;         /* VEGA, NAVI: Address components are 16-bits */
   bool d16 : 1;         /* Convert 32-bit data to 16-bit data */
   bool disable_wqm : 1; /* Require an exec mask without helper invocations */
   bool strict_wqm : 1;  /* VADDR is a linear VGPR and additional VGPRs may be copied into it */
   uint8_t padding0 : 4;
   uint8_t padding1;
};
static_assert(sizeof(MIMG_instruction) == sizeof(Instruction) + 8, "Unexpected padding");
1682 
/**
 * Flat/Scratch/Global Instructions
 * Operand(0): ADDR
 * Operand(1): SADDR
 * Operand(2) / Definition(0): DATA/VDST
 *
 */
struct FLAT_instruction : public Instruction {
   memory_sync_info sync;    /* memory synchronization info for this access */
   ac_hw_cache_flags cache;  /* hardware cache policy flags */
   bool lds : 1;             /* NOTE(review): presumably the hardware LDS bit (transfer to LDS) — confirm */
   bool nv : 1;              /* NOTE(review): hardware "NV" encoding bit — confirm semantics */
   bool disable_wqm : 1;     /* Require an exec mask without helper invocations */
   uint8_t padding0 : 5;     /* unused bits */
   uint8_t padding1;         /* explicit pad byte */
   int16_t offset; /* Vega/Navi only */
};
static_assert(sizeof(FLAT_instruction) == sizeof(Instruction) + 8, "Unexpected padding");
1701 
/* Export (EXP) instruction: writes shader results (positions, parameters,
 * color/MRT) to the fixed-function export targets. */
struct Export_instruction : public Instruction {
   uint8_t enabled_mask;  /* which components of the export are enabled */
   uint8_t dest;          /* export target */
   bool compressed : 1;   /* data is compressed (16-bit components) */
   bool done : 1;         /* this is the final export of its kind */
   bool valid_mask : 1;   /* hardware VM bit */
   bool row_en : 1;       /* hardware row-enable bit */
   uint8_t padding0 : 4;  /* unused bits */
   uint8_t padding1;      /* explicit pad byte */
};
static_assert(sizeof(Export_instruction) == sizeof(Instruction) + 4, "Unexpected padding");
1713 
/* Generic IR-only pseudo instruction; lowered to real hardware instructions
 * by lower_to_hw_instr(). */
struct Pseudo_instruction : public Instruction {
   PhysReg scratch_sgpr;   /* might not be valid if it's not needed */
   bool needs_scratch_reg; /* if scratch_sgpr/scc can be written, initialized by RA. */
};
static_assert(sizeof(Pseudo_instruction) == sizeof(Instruction) + 4, "Unexpected padding");
1719 
/* IR-only branch; lowered to real jumps by lower_branches(). */
struct Pseudo_branch_instruction : public Instruction {
   /* target[0] is the block index of the branch target.
    * For conditional branches, target[1] contains the fall-through alternative.
    * A value of 0 means the target has not been initialized (BB0 cannot be a branch target).
    */
   uint32_t target[2];

   /* Indicates that this rarely or never jumps to target[0]. */
   bool rarely_taken;
   bool never_taken;

   uint16_t padding; /* explicit pad, checked by the static_assert below */
};
static_assert(sizeof(Pseudo_branch_instruction) == sizeof(Instruction) + 12, "Unexpected padding");
1734 
/* IR-only barrier carrying both the memory semantics and the scope at which
 * execution must synchronize. */
struct Pseudo_barrier_instruction : public Instruction {
   memory_sync_info sync;  /* storage classes / semantics the barrier orders */
   sync_scope exec_scope;  /* scope of the execution barrier */
};
static_assert(sizeof(Pseudo_barrier_instruction) == sizeof(Instruction) + 4, "Unexpected padding");
1740 
/* Reduction operations for Pseudo_reduction_instruction.
 * Naming scheme: <operation><bit width>, e.g. iadd32 = 32-bit integer add.
 * Float ops exist only for 16/32/64 bits. */
enum ReduceOp : uint16_t {
   // clang-format off
   iadd8, iadd16, iadd32, iadd64,
   imul8, imul16, imul32, imul64,
          fadd16, fadd32, fadd64,
          fmul16, fmul32, fmul64,
   imin8, imin16, imin32, imin64,
   imax8, imax16, imax32, imax64,
   umin8, umin16, umin32, umin64,
   umax8, umax16, umax32, umax64,
          fmin16, fmin32, fmin64,
          fmax16, fmax32, fmax64,
   iand8, iand16, iand32, iand64,
   ior8, ior16, ior32, ior64,
   ixor8, ixor16, ixor32, ixor64,
   num_reduce_ops,
   // clang-format on
};
1759 
/**
 * Subgroup Reduction Instructions, everything except for the data to be
 * reduced and the result as inserted by setup_reduce_temp().
 * Operand(0): data to be reduced
 * Operand(1): reduce temporary
 * Operand(2): vector temporary
 * Definition(0): result
 * Definition(1): scalar temporary
 * Definition(2): scalar identity temporary (not used to store identity on GFX10)
 * Definition(3): scc clobber
 * Definition(4): vcc clobber
 *
 */
struct Pseudo_reduction_instruction : public Instruction {
   ReduceOp reduce_op;
   uint16_t cluster_size; // must be 0 for scans
};
static_assert(sizeof(Pseudo_reduction_instruction) == sizeof(Instruction) + 4,
              "Unexpected padding");
1779 
1780 inline bool
accessesLDS()1781 Instruction::accessesLDS() const noexcept
1782 {
1783    return (isDS() && !ds().gds) || isLDSDIR() || isVINTRP();
1784 }
1785 
/* Swap two source operands together with all per-operand VALU modifiers
 * (neg/abs/opsel/opsel_lo/opsel_hi) so the instruction's meaning is unchanged. */
inline void
VALU_instruction::swapOperands(unsigned idx0, unsigned idx1)
{
   /* SDWA stores per-operand selections separately; only the first two
    * operands can carry them, so swapping sel[0]/sel[1] is equivalent to
    * swapping sel[idx0]/sel[idx1] here. */
   if (this->isSDWA() && idx0 != idx1) {
      assert(idx0 < 2 && idx1 < 2);
      std::swap(this->sdwa().sel[0], this->sdwa().sel[1]);
   }
   assert(idx0 < 3 && idx1 < 3);
   std::swap(this->operands[idx0], this->operands[idx1]);
   /* These are bitfield arrays; .swap() exchanges the two bits in place. */
   this->neg[idx0].swap(this->neg[idx1]);
   this->abs[idx0].swap(this->abs[idx1]);
   this->opsel[idx0].swap(this->opsel[idx1]);
   this->opsel_lo[idx0].swap(this->opsel_lo[idx1]);
   this->opsel_hi[idx0].swap(this->opsel_hi[idx1]);
}
1801 
struct instr_deleter_functor {
   /* Deliberate no-op deleter: instructions live in a bulk arena and are
    * de-allocated all at once after compilation finishes.
    */
   void operator()(void* /* instr */) {}
};
1808 
1809 template <typename T> using aco_ptr = std::unique_ptr<T, instr_deleter_functor>;
1810 
1811 size_t get_instr_data_size(Format format);
1812 
1813 Instruction* create_instruction(aco_opcode opcode, Format format, uint32_t num_operands,
1814                                 uint32_t num_definitions);
1815 
/* Whether this instruction uses any VALU modifiers (DPP/SDWA, clamp, omod,
 * neg/abs, opsel). Such instructions often can't be freely rewritten by
 * optimizations that assume plain operands. */
constexpr bool
Instruction::usesModifiers() const noexcept
{
   if (isDPP() || isSDWA())
      return true;

   if (isVOP3P()) {
      const VALU_instruction& vop3p = this->valu();
      /* opsel_hi must be 1 to not be considered a modifier - even for constants */
      return vop3p.opsel_lo || vop3p.clamp || vop3p.neg_lo || vop3p.neg_hi ||
             (vop3p.opsel_hi & BITFIELD_MASK(operands.size())) != BITFIELD_MASK(operands.size());
   } else if (isVALU()) {
      const VALU_instruction& vop3 = this->valu();
      return vop3.opsel || vop3.clamp || vop3.omod || vop3.abs || vop3.neg;
   }
   return false;
}
1833 
1834 constexpr bool
is_phi(Instruction * instr)1835 is_phi(Instruction* instr)
1836 {
1837    return instr->opcode == aco_opcode::p_phi || instr->opcode == aco_opcode::p_linear_phi;
1838 }
1839 
1840 static inline bool
is_phi(aco_ptr<Instruction> & instr)1841 is_phi(aco_ptr<Instruction>& instr)
1842 {
1843    return is_phi(instr.get());
1844 }
1845 
1846 bool is_wait_export_ready(amd_gfx_level gfx_level, const Instruction* instr);
1847 memory_sync_info get_sync_info(const Instruction* instr);
1848 
1849 inline bool
is_dead(const std::vector<uint16_t> & uses,const Instruction * instr)1850 is_dead(const std::vector<uint16_t>& uses, const Instruction* instr)
1851 {
1852    if (instr->definitions.empty() || instr->isBranch() || instr->opcode == aco_opcode::p_startpgm ||
1853        instr->opcode == aco_opcode::p_init_scratch ||
1854        instr->opcode == aco_opcode::p_dual_src_export_gfx11)
1855       return false;
1856 
1857    if (std::any_of(instr->definitions.begin(), instr->definitions.end(),
1858                    [&uses](const Definition& def) { return !def.isTemp() || uses[def.tempId()]; }))
1859       return false;
1860 
1861    return !(get_sync_info(instr).semantics & (semantic_volatile | semantic_acqrel));
1862 }
1863 
1864 bool can_use_input_modifiers(amd_gfx_level gfx_level, aco_opcode op, int idx);
1865 bool can_use_opsel(amd_gfx_level gfx_level, aco_opcode op, int idx);
1866 bool instr_is_16bit(amd_gfx_level gfx_level, aco_opcode op);
1867 uint8_t get_gfx11_true16_mask(aco_opcode op);
1868 bool can_use_SDWA(amd_gfx_level gfx_level, const aco_ptr<Instruction>& instr, bool pre_ra);
1869 bool can_use_DPP(amd_gfx_level gfx_level, const aco_ptr<Instruction>& instr, bool dpp8);
1870 bool can_write_m0(const aco_ptr<Instruction>& instr);
1871 /* updates "instr" and returns the old instruction (or NULL if no update was needed) */
1872 aco_ptr<Instruction> convert_to_SDWA(amd_gfx_level gfx_level, aco_ptr<Instruction>& instr);
1873 aco_ptr<Instruction> convert_to_DPP(amd_gfx_level gfx_level, aco_ptr<Instruction>& instr,
1874                                     bool dpp8);
1875 bool needs_exec_mask(const Instruction* instr);
1876 
1877 aco_opcode get_vcmp_inverse(aco_opcode op);
1878 aco_opcode get_vcmp_swapped(aco_opcode op);
1879 aco_opcode get_vcmpx(aco_opcode op);
1880 bool is_cmpx(aco_opcode op);
1881 
1882 bool can_swap_operands(aco_ptr<Instruction>& instr, aco_opcode* new_op, unsigned idx0 = 0,
1883                        unsigned idx1 = 1);
1884 
1885 uint32_t get_reduction_identity(ReduceOp op, unsigned idx);
1886 
1887 unsigned get_mimg_nsa_dwords(const Instruction* instr);
1888 
1889 unsigned get_vopd_opy_start(const Instruction* instr);
1890 
1891 unsigned get_operand_size(aco_ptr<Instruction>& instr, unsigned index);
1892 
1893 bool should_form_clause(const Instruction* a, const Instruction* b);
1894 
/* Classification of VMEM instructions, used as a bitmask. */
enum vmem_type : uint8_t {
   vmem_nosampler = 1 << 0, /* VMEM access without a sampler */
   vmem_sampler = 1 << 1,   /* VMEM access through a sampler */
   vmem_bvh = 1 << 2,       /* ray-tracing BVH intersection */
};
1900 
1901 /* VMEM instructions of the same type return in-order. For GFX12+, this determines which counter
1902  * is used.
1903  */
1904 uint8_t get_vmem_type(enum amd_gfx_level gfx_level, Instruction* instr);
1905 
/* For all of the counters, the maximum value means no wait.
 * Some of the counters are larger than their bit field,
 * but there is no wait mechanism that allows waiting only for higher values.
 */
struct depctr_wait {
   union {
      struct {
         /* VALU completion, apparently even used for VALU without vgpr writes. */
         unsigned va_vdst : 4;
         /* VALU sgpr write (not including vcc/vcc_hi). */
         unsigned va_sdst : 3;
         /* VALU sgpr read. */
         unsigned va_ssrc : 1;
         /* unknown. */
         unsigned hold_cnt : 1;
         /* VMEM/DS vgpr read. */
         unsigned vm_vsrc : 3;
         /* VALU vcc/vcc_hi write. */
         unsigned va_vcc : 1;
         /* SALU sgpr, vcc/vcc_hi or scc write. */
         unsigned sa_sdst : 1;
         /* VALU exec/exec_hi write. */
         unsigned va_exec : 1;
         /* SALU exec/exec_hi write. */
         unsigned sa_exec : 1;
      };
      /* All counters at their maximum (-1 = all bits set), i.e. wait on nothing. */
      unsigned packed = -1;
   };
};
1935 
1936 depctr_wait parse_depctr_wait(const Instruction* instr);
1937 
/* Bitmask flags describing a basic block's role in the CFG (stored in Block::kind). */
enum block_kind {
   /* uniform indicates that leaving this block,
    * all active lanes stay active */
   block_kind_uniform = 1 << 0,
   block_kind_top_level = 1 << 1,
   block_kind_loop_preheader = 1 << 2,
   block_kind_loop_header = 1 << 3,
   block_kind_loop_exit = 1 << 4,
   block_kind_continue = 1 << 5,
   block_kind_break = 1 << 6,
   block_kind_continue_or_break = 1 << 7,
   block_kind_branch = 1 << 8,
   block_kind_merge = 1 << 9,
   block_kind_invert = 1 << 10,
   block_kind_discard_early_exit = 1 << 11,
   block_kind_uses_discard = 1 << 12,
   block_kind_resume = 1 << 13,
   block_kind_export_end = 1 << 14,
   block_kind_end_with_regs = 1 << 15,
};
1958 
/* CFG basic block. Predecessor/successor edges are kept separately for the
 * logical (divergence-aware) and linear (physical control flow) CFGs. */
struct Block {
   using edge_vec = small_vec<uint32_t, 2>;

   float_mode fp_mode;
   unsigned index;      /* position of this block within Program::blocks (set by insert_block) */
   unsigned offset = 0; /* NOTE(review): presumably the block's offset in the emitted code — confirm */
   std::vector<aco_ptr<Instruction>> instructions;
   edge_vec logical_preds;
   edge_vec linear_preds;
   edge_vec logical_succs;
   edge_vec linear_succs;
   RegisterDemand register_demand = RegisterDemand();
   RegisterDemand live_in_demand = RegisterDemand();
   uint32_t kind = 0;         /* bitmask of block_kind flags */
   int32_t logical_idom = -1; /* logical immediate dominator; -1 if not computed */
   int32_t linear_idom = -1;  /* linear immediate dominator; -1 if not computed */

   /* Preorder and postorder traversal indices of the dominance tree. Because a program can have
    * several dominance trees (because of block_kind_resume), these start at the block index of the
    * root node. */
   uint32_t logical_dom_pre_index = 0;
   uint32_t logical_dom_post_index = 0;
   uint32_t linear_dom_pre_index = 0;
   uint32_t linear_dom_post_index = 0;

   uint16_t loop_nest_depth = 0;
   uint16_t divergent_if_logical_depth = 0;
   uint16_t uniform_if_depth = 0;

   Block() : index(0) {}
};
1991 
/*
 * Shader stages as provided in Vulkan by the application. Contrast this to HWStage.
 * Used as a bitmask so that merged shaders can combine several stages.
 */
enum class SWStage : uint16_t {
   None = 0,
   VS = 1 << 0,  /* Vertex Shader */
   GS = 1 << 1,  /* Geometry Shader */
   TCS = 1 << 2, /* Tessellation Control aka Hull Shader */
   TES = 1 << 3, /* Tessellation Evaluation aka Domain Shader */
   FS = 1 << 4,  /* Fragment aka Pixel Shader */
   CS = 1 << 5,  /* Compute Shader */
   TS = 1 << 6,  /* Task Shader */
   MS = 1 << 7,  /* Mesh Shader */
   RT = 1 << 8,  /* Raytracing Shader */

   /* Stage combinations merged to run on a single HWStage */
   VS_GS = VS | GS,
   VS_TCS = VS | TCS,
   TES_GS = TES | GS,
};

/* Combine two stage masks into one. */
constexpr SWStage
operator|(SWStage a, SWStage b)
{
   const uint16_t combined = static_cast<uint16_t>(a) | static_cast<uint16_t>(b);
   return static_cast<SWStage>(combined);
}
2018 
/*
 * Set of SWStages to be merged into a single shader paired with the
 * HWStage it will run on.
 */
struct Stage {
   constexpr Stage() = default;

   explicit constexpr Stage(ac_hw_stage hw_, SWStage sw_) : sw(sw_), hw(hw_) {}

   /* Check if the given SWStage is included */
   constexpr bool has(SWStage stage) const
   {
      return (static_cast<uint16_t>(sw) & static_cast<uint16_t>(stage));
   }

   /* Number of software stages merged into this shader (population count of sw). */
   unsigned num_sw_stages() const { return util_bitcount(static_cast<uint16_t>(sw)); }

   constexpr bool operator==(const Stage& other) const { return sw == other.sw && hw == other.hw; }

   constexpr bool operator!=(const Stage& other) const { return sw != other.sw || hw != other.hw; }

   /* Mask of merged software stages */
   SWStage sw = SWStage::None;

   /* Active hardware stage */
   ac_hw_stage hw{};
};
2046 
/* possible settings of Program::stage — each pairs the set of merged SW stages
 * with the HW stage they execute on */
static constexpr Stage vertex_vs(AC_HW_VERTEX_SHADER, SWStage::VS);
static constexpr Stage fragment_fs(AC_HW_PIXEL_SHADER, SWStage::FS);
static constexpr Stage compute_cs(AC_HW_COMPUTE_SHADER, SWStage::CS);
static constexpr Stage tess_eval_vs(AC_HW_VERTEX_SHADER, SWStage::TES);
/* Mesh shading pipeline */
static constexpr Stage task_cs(AC_HW_COMPUTE_SHADER, SWStage::TS);
static constexpr Stage mesh_ngg(AC_HW_NEXT_GEN_GEOMETRY_SHADER, SWStage::MS);
/* GFX10/NGG */
static constexpr Stage vertex_ngg(AC_HW_NEXT_GEN_GEOMETRY_SHADER, SWStage::VS);
static constexpr Stage vertex_geometry_ngg(AC_HW_NEXT_GEN_GEOMETRY_SHADER, SWStage::VS_GS);
static constexpr Stage tess_eval_ngg(AC_HW_NEXT_GEN_GEOMETRY_SHADER, SWStage::TES);
static constexpr Stage tess_eval_geometry_ngg(AC_HW_NEXT_GEN_GEOMETRY_SHADER, SWStage::TES_GS);
/* GFX9 (and GFX10 if NGG isn't used) */
static constexpr Stage vertex_geometry_gs(AC_HW_LEGACY_GEOMETRY_SHADER, SWStage::VS_GS);
static constexpr Stage vertex_tess_control_hs(AC_HW_HULL_SHADER, SWStage::VS_TCS);
static constexpr Stage tess_eval_geometry_gs(AC_HW_LEGACY_GEOMETRY_SHADER, SWStage::TES_GS);
/* pre-GFX9 */
static constexpr Stage vertex_ls(AC_HW_LOCAL_SHADER,
                                 SWStage::VS); /* vertex before tessellation control */
static constexpr Stage vertex_es(AC_HW_EXPORT_SHADER, SWStage::VS); /* vertex before geometry */
static constexpr Stage tess_control_hs(AC_HW_HULL_SHADER, SWStage::TCS);
static constexpr Stage tess_eval_es(AC_HW_EXPORT_SHADER,
                                    SWStage::TES); /* tessellation evaluation before geometry */
static constexpr Stage geometry_gs(AC_HW_LEGACY_GEOMETRY_SHADER, SWStage::GS);
/* Raytracing */
static constexpr Stage raytracing_cs(AC_HW_COMPUTE_SHADER, SWStage::RT);
2074 
/* Per-GPU hardware parameters and capabilities the compiler needs. */
struct DeviceInfo {
   uint16_t lds_encoding_granule; /* granularity of the LDS size field in the binary */
   uint16_t lds_alloc_granule;    /* granularity of actual LDS allocation */
   uint32_t lds_limit; /* in bytes */
   bool has_16bank_lds;
   uint16_t physical_sgprs;  /* total SGPRs physically present per SIMD */
   uint16_t physical_vgprs;  /* total VGPRs physically present per SIMD */
   uint16_t vgpr_limit;      /* max addressable VGPRs per wave */
   uint16_t sgpr_limit;      /* max addressable SGPRs per wave */
   uint16_t sgpr_alloc_granule; /* SGPR allocation granularity (see get_sgpr_alloc) */
   uint16_t vgpr_alloc_granule; /* VGPR allocation granularity (see get_vgpr_alloc) */
   unsigned scratch_alloc_granule;
   uint16_t max_waves_per_simd;
   unsigned simd_per_cu;
   bool has_fast_fma32 = false;
   bool has_mac_legacy32 = false;
   bool has_fmac_legacy32 = false;
   bool fused_mad_mix = false;
   bool xnack_enabled = false;
   bool sram_ecc_enabled = false;

   /* valid immediate offset range for scratch/global memory instructions */
   int16_t scratch_global_offset_min;
   int16_t scratch_global_offset_max;
   unsigned max_nsa_vgprs; /* max VGPRs usable in MIMG NSA (non-sequential address) encoding */
};
2100 
/* Coarse checkpoints of the compilation pipeline, in pass order. */
enum class CompilationProgress {
   after_isel,        /* after instruction selection from NIR */
   after_spilling,    /* after the spiller ran */
   after_ra,          /* after register allocation */
   after_lower_to_hw, /* after lowering pseudo instructions to HW instructions */
};
2107 
/* Top-level container for one shader compilation: the CFG, per-temp register
 * classes, target/device information and pass-to-pass state. */
class Program final {
public:
   aco::monotonic_buffer_resource m{65536}; /* arena backing IR allocations */
   std::vector<Block> blocks;
   std::vector<RegClass> temp_rc = {s1}; /* register class per temp id; id 0 is pre-allocated */
   RegisterDemand max_reg_demand = RegisterDemand();
   ac_shader_config* config;
   struct aco_shader_info info;
   enum amd_gfx_level gfx_level;
   enum radeon_family family;
   DeviceInfo dev;
   unsigned wave_size;
   RegClass lane_mask; /* register class of exec-sized masks (s1 or s2 depending on wave size) */
   Stage stage;
   bool needs_exact = false; /* there exists an instruction with disable_wqm = true */
   bool needs_wqm = false;   /* there exists a p_wqm instruction */
   bool has_smem_buffer_or_global_loads = false;
   bool has_pops_overlapped_waves_wait = false;
   bool has_color_exports = false;
   bool is_prolog = false;
   bool is_epilog = false;

   std::vector<ac_shader_debug_info> debug_info;

   std::vector<uint8_t> constant_data;
   Temp private_segment_buffer;
   Temp scratch_offset;

   uint16_t num_waves = 0; /* max waves per SIMD given the register/LDS demand */
   uint16_t min_waves = 0; /* lower bound of waves per SIMD */
   unsigned workgroup_size; /* if known; otherwise UINT_MAX */
   bool wgp_mode;

   bool needs_vcc = false;

   CompilationProgress progress;

   bool collect_statistics = false;
   uint32_t statistics[aco_num_statistics];

   /* Defaults applied to blocks created from here on (see insert_block). */
   float_mode next_fp_mode;
   unsigned next_loop_depth = 0;
   unsigned next_divergent_if_logical_depth = 0;
   unsigned next_uniform_if_depth = 0;

   std::vector<Definition> args_pending_vmem;

   /* For shader part with previous shader part that has lds access. */
   bool pending_lds_access = false;

   bool should_repair_ssa = false;

   /* Results of live_var_analysis(). */
   struct {
      monotonic_buffer_resource memory;
      /* live-in temps per block */
      std::vector<IDSet> live_in;
   } live;

   /* Destination and callback for compiler diagnostics. */
   struct {
      FILE* output = stderr;
      bool shorten_messages = false;
      void (*func)(void* private_data, enum aco_compiler_debug_level level, const char* message);
      void* private_data;
   } debug;

   /* Reserve "amount" fresh temp ids at once. Temp ids are limited to 24 bits
    * (16777216 = 2^24), matching the assert in allocateId(). */
   void allocateRange(unsigned amount)
   {
      assert(temp_rc.size() + amount <= 16777216);
      temp_rc.resize(temp_rc.size() + amount);
   }

   /* Create a new temporary of register class "rc" with a fresh id. */
   Temp allocateTmp(RegClass rc) { return Temp(allocateId(rc), rc); }

   /* The id the next allocateTmp() call would return. */
   uint32_t peekAllocationId() { return temp_rc.size(); }

   /* Append a default-constructed block to the program. */
   Block* create_and_insert_block()
   {
      Block block;
      return insert_block(std::move(block));
   }

   /* Append "block", assigning its index and the current next_* defaults. */
   Block* insert_block(Block&& block)
   {
      block.index = blocks.size();
      block.fp_mode = next_fp_mode;
      block.loop_nest_depth = next_loop_depth;
      block.divergent_if_logical_depth = next_divergent_if_logical_depth;
      block.uniform_if_depth = next_uniform_if_depth;
      blocks.emplace_back(std::move(block));
      return &blocks.back();
   }

private:
   /* Register a new temp of class "rc" and return its id (24-bit limit). */
   uint32_t allocateId(RegClass rc)
   {
      assert(temp_rc.size() <= 16777215);
      temp_rc.push_back(rc);
      return temp_rc.size() - 1;
   }
};
2208 
/* Knobs for exercising register-allocation corner cases in tests. */
struct ra_test_policy {
   /* Force RA to always use its pessimistic fallback algorithm */
   bool skip_optimistic_path = false;
};
2213 
2214 void init();
2215 
2216 void init_program(Program* program, Stage stage, const struct aco_shader_info* info,
2217                   enum amd_gfx_level gfx_level, enum radeon_family family, bool wgp_mode,
2218                   ac_shader_config* config);
2219 
2220 void select_program(Program* program, unsigned shader_count, struct nir_shader* const* shaders,
2221                     ac_shader_config* config, const struct aco_compiler_options* options,
2222                     const struct aco_shader_info* info, const struct ac_shader_args* args);
2223 void select_trap_handler_shader(Program* program, ac_shader_config* config,
2224                                 const struct aco_compiler_options* options,
2225                                 const struct aco_shader_info* info,
2226                                 const struct ac_shader_args* args);
2227 void select_rt_prolog(Program* program, ac_shader_config* config,
2228                       const struct aco_compiler_options* options,
2229                       const struct aco_shader_info* info, const struct ac_shader_args* in_args,
2230                       const struct ac_shader_args* out_args);
2231 void select_vs_prolog(Program* program, const struct aco_vs_prolog_info* pinfo,
2232                       ac_shader_config* config, const struct aco_compiler_options* options,
2233                       const struct aco_shader_info* info, const struct ac_shader_args* args);
2234 
2235 void select_ps_epilog(Program* program, void* pinfo, ac_shader_config* config,
2236                       const struct aco_compiler_options* options,
2237                       const struct aco_shader_info* info, const struct ac_shader_args* args);
2238 
2239 void select_ps_prolog(Program* program, void* pinfo, ac_shader_config* config,
2240                       const struct aco_compiler_options* options,
2241                       const struct aco_shader_info* info, const struct ac_shader_args* args);
2242 
2243 bool repair_ssa(Program* program);
2244 void lower_phis(Program* program);
2245 void lower_subdword(Program* program);
2246 void calc_min_waves(Program* program);
2247 void update_vgpr_sgpr_demand(Program* program, const RegisterDemand new_demand);
2248 void live_var_analysis(Program* program);
2249 std::vector<uint16_t> dead_code_analysis(Program* program);
2250 void dominator_tree(Program* program);
2251 void insert_exec_mask(Program* program);
2252 void value_numbering(Program* program);
2253 void optimize(Program* program);
2254 void optimize_postRA(Program* program);
2255 void lower_branches(Program* program);
2256 void setup_reduce_temp(Program* program);
2257 void lower_to_cssa(Program* program);
2258 void register_allocation(Program* program, ra_test_policy = {});
2259 void reindex_ssa(Program* program);
2260 void ssa_elimination(Program* program);
2261 void jump_threading(Program* program);
2262 void lower_to_hw_instr(Program* program);
2263 void schedule_program(Program* program);
2264 void schedule_ilp(Program* program);
2265 void schedule_vopd(Program* program);
2266 void spill(Program* program);
2267 void insert_waitcnt(Program* program);
2268 void insert_delay_alu(Program* program);
2269 void combine_delay_alu(Program* program);
2270 bool dealloc_vgprs(Program* program);
2271 void insert_NOPs(Program* program);
2272 void form_hard_clauses(Program* program);
2273 unsigned emit_program(Program* program, std::vector<uint32_t>& code,
2274                       std::vector<struct aco_symbol>* symbols = NULL, bool append_endpgm = true);
2275 /**
2276  * Returns true if print_asm can disassemble the given program for the current build/runtime
2277  * configuration
2278  */
2279 bool check_print_asm_support(Program* program);
2280 bool print_asm(Program* program, std::vector<uint32_t>& binary, unsigned exec_size, FILE* output);
2281 bool validate_ir(Program* program);
2282 bool validate_cfg(Program* program);
2283 bool validate_ra(Program* program);
2284 bool validate_live_vars(Program* program);
2285 
2286 void collect_presched_stats(Program* program);
2287 void collect_preasm_stats(Program* program);
2288 void collect_postasm_stats(Program* program, const std::vector<uint32_t>& code);
2289 
/* Timing estimate for a single instruction, returned by get_cycle_info(). */
struct Instruction_cycle_info {
   /* Latency until the result is ready (if not needing a waitcnt) */
   unsigned latency;

   /* How many cycles issuing this instruction takes (i.e. cycles till the next instruction can be
    * issued)*/
   unsigned issue_cycles;
};
2298 
2299 Instruction_cycle_info get_cycle_info(const Program& program, const Instruction& instr);
2300 
/* Bitmask flags for the aco_print_* functions. */
enum print_flags {
   print_no_ssa = 0x1,    /* omit SSA temp ids */
   print_perf_info = 0x2, /* include performance info */
   print_kill = 0x4,      /* mark killed (last-use) operands */
   print_live_vars = 0x8, /* include live-variable information */
};
2307 
2308 void aco_print_operand(const Operand* operand, FILE* output, unsigned flags = 0);
2309 void aco_print_instr(enum amd_gfx_level gfx_level, const Instruction* instr, FILE* output,
2310                      unsigned flags = 0);
2311 void aco_print_program(const Program* program, FILE* output, unsigned flags = 0);
2312 
2313 void _aco_err(Program* program, const char* file, unsigned line, const char* fmt, ...);
2314 
2315 #define aco_err(program, ...)      _aco_err(program, __FILE__, __LINE__, __VA_ARGS__)
2316 
2317 int get_op_fixed_to_def(Instruction* instr);
2318 
2319 /* utilities for dealing with register demand */
2320 RegisterDemand get_live_changes(Instruction* instr);
2321 RegisterDemand get_temp_registers(Instruction* instr);
2322 
/* number of sgprs that need to be allocated but might not be addressable as s0-s105 */
2324 uint16_t get_extra_sgprs(Program* program);
2325 
2326 /* adjust num_waves for workgroup size and LDS limits */
2327 uint16_t max_suitable_waves(Program* program, uint16_t waves);
2328 
2329 /* get number of sgprs/vgprs allocated required to address a number of sgprs/vgprs */
2330 uint16_t get_sgpr_alloc(Program* program, uint16_t addressable_sgprs);
2331 uint16_t get_vgpr_alloc(Program* program, uint16_t addressable_vgprs);
2332 
2333 /* return number of addressable sgprs/vgprs for max_waves */
2334 uint16_t get_addr_sgpr_from_waves(Program* program, uint16_t max_waves);
2335 uint16_t get_addr_vgpr_from_waves(Program* program, uint16_t max_waves);
2336 
2337 bool uses_scratch(Program* program);
2338 
2339 inline bool
dominates_logical(const Block & parent,const Block & child)2340 dominates_logical(const Block& parent, const Block& child)
2341 {
2342    return child.logical_dom_pre_index >= parent.logical_dom_pre_index &&
2343           child.logical_dom_post_index <= parent.logical_dom_post_index;
2344 }
2345 
2346 inline bool
dominates_linear(const Block & parent,const Block & child)2347 dominates_linear(const Block& parent, const Block& child)
2348 {
2349    return child.linear_dom_pre_index >= parent.linear_dom_pre_index &&
2350           child.linear_dom_post_index <= parent.linear_dom_post_index;
2351 }
2352 
/* Static per-opcode tables, all indexed by aco_opcode (see the extern
 * instr_info below; generated elsewhere in the build). */
typedef struct {
   /* hardware encodings per gfx generation; negative means unsupported there */
   const int16_t opcode_gfx7[static_cast<int>(aco_opcode::num_opcodes)];
   const int16_t opcode_gfx9[static_cast<int>(aco_opcode::num_opcodes)];
   const int16_t opcode_gfx10[static_cast<int>(aco_opcode::num_opcodes)];
   const int16_t opcode_gfx11[static_cast<int>(aco_opcode::num_opcodes)];
   const int16_t opcode_gfx12[static_cast<int>(aco_opcode::num_opcodes)];
   const std::bitset<static_cast<int>(aco_opcode::num_opcodes)> can_use_input_modifiers;
   const std::bitset<static_cast<int>(aco_opcode::num_opcodes)> can_use_output_modifiers;
   const std::bitset<static_cast<int>(aco_opcode::num_opcodes)> is_atomic;
   const char* name[static_cast<int>(aco_opcode::num_opcodes)];
   const aco::Format format[static_cast<int>(aco_opcode::num_opcodes)];
   /* sizes used for input/output modifiers and constants */
   const unsigned operand_size[static_cast<int>(aco_opcode::num_opcodes)];
   const instr_class classes[static_cast<int>(aco_opcode::num_opcodes)];
   const uint32_t definitions[static_cast<int>(aco_opcode::num_opcodes)];
   const uint32_t operands[static_cast<int>(aco_opcode::num_opcodes)];
} Info;
2370 
2371 extern const Info instr_info;
2372 
2373 } // namespace aco
2374 
2375 #endif /* ACO_IR_H */
2376