• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 /*
2  * Copyright © 2018 Valve Corporation
3  *
4  * Permission is hereby granted, free of charge, to any person obtaining a
5  * copy of this software and associated documentation files (the "Software"),
6  * to deal in the Software without restriction, including without limitation
7  * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8  * and/or sell copies of the Software, and to permit persons to whom the
9  * Software is furnished to do so, subject to the following conditions:
10  *
11  * The above copyright notice and this permission notice (including the next
12  * paragraph) shall be included in all copies or substantial portions of the
13  * Software.
14  *
15  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
18  * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20  * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
21  * IN THE SOFTWARE.
22  *
23  */
24 
25 #ifndef ACO_IR_H
26 #define ACO_IR_H
27 
28 #include "aco_opcodes.h"
29 #include "aco_shader_info.h"
30 #include "aco_util.h"
31 
32 #include "util/compiler.h"
33 
34 #include "ac_binary.h"
35 #include "ac_hw_stage.h"
36 #include "amd_family.h"
37 #include <algorithm>
38 #include <bitset>
39 #include <memory>
40 #include <vector>
41 
42 typedef struct nir_shader nir_shader;
43 
44 namespace aco {
45 
46 extern uint64_t debug_flags;
47 
/* Bit flags stored in the global debug_flags, controlling validation,
 * optimization passes and debug output of the compiler. */
enum {
   DEBUG_VALIDATE_IR = 0x1,
   DEBUG_VALIDATE_RA = 0x2,
   DEBUG_PERFWARN = 0x4,
   DEBUG_FORCE_WAITCNT = 0x8,
   DEBUG_NO_VN = 0x10,
   DEBUG_NO_OPT = 0x20,
   DEBUG_NO_SCHED = 0x40,
   DEBUG_PERF_INFO = 0x80,
   DEBUG_LIVE_INFO = 0x100,
   DEBUG_FORCE_WAITDEPS = 0x200,
   DEBUG_NO_VALIDATE_IR = 0x400,
   DEBUG_NO_SCHED_ILP = 0x800,
   DEBUG_NO_SCHED_VOPD = 0x1000,
};
63 
/* Bitmask of memory storage classes an access may touch; used by
 * memory_sync_info to decide which accesses may alias/reorder. */
enum storage_class : uint8_t {
   storage_none = 0x0,   /* no synchronization and can be reordered around aliasing stores */
   storage_buffer = 0x1, /* SSBOs and global memory */
   storage_gds = 0x2,
   storage_image = 0x4,
   storage_shared = 0x8,        /* or TCS output */
   storage_vmem_output = 0x10,  /* GS or TCS output stores using VMEM */
   storage_task_payload = 0x20, /* Task-Mesh payload */
   storage_scratch = 0x40,
   storage_vgpr_spill = 0x80,
   storage_count = 8, /* not counting storage_none */
};
76 
/* Ordering/visibility semantics of a memory access or barrier. */
enum memory_semantics : uint8_t {
   semantic_none = 0x0,
   /* for loads: don't move any access after this load to before this load (even other loads)
    * for barriers: don't move any access after the barrier to before any
    * atomics/control_barriers/sendmsg_gs_done/position-primitive-export before the barrier */
   semantic_acquire = 0x1,
   /* for stores: don't move any access before this store to after this store
    * for barriers: don't move any access before the barrier to after any
    * atomics/control_barriers/sendmsg_gs_done/position-primitive-export after the barrier */
   semantic_release = 0x2,

   /* the rest are for load/stores/atomics only */
   /* cannot be DCE'd or CSE'd */
   semantic_volatile = 0x4,
   /* does not interact with barriers and assumes this lane is the only lane
    * accessing this memory */
   semantic_private = 0x8,
   /* this operation can be reordered around operations of the same storage.
    * says nothing about barriers */
   semantic_can_reorder = 0x10,
   /* this is an atomic instruction (may only read or write memory) */
   semantic_atomic = 0x20,
   /* this instruction both reads and writes memory */
   semantic_rmw = 0x40,

   semantic_acqrel = semantic_acquire | semantic_release,
   semantic_atomicrmw = semantic_volatile | semantic_atomic | semantic_rmw,
};
105 
/* Synchronization scope, ordered from narrowest (a single invocation) to
 * widest (the whole device). */
enum sync_scope : uint8_t {
   scope_invocation = 0,
   scope_subgroup = 1,
   scope_workgroup = 2,
   scope_queuefamily = 3,
   scope_device = 4,
};
113 
/* Per-access synchronization info: which storage classes are touched, the
 * ordering semantics, and the scope. Packed into 3 bytes (see static_assert). */
struct memory_sync_info {
   memory_sync_info() : storage(storage_none), semantics(semantic_none), scope(scope_invocation) {}
   memory_sync_info(int storage_, int semantics_ = 0, sync_scope scope_ = scope_invocation)
       : storage((storage_class)storage_), semantics((memory_semantics)semantics_), scope(scope_)
   {}

   storage_class storage : 8;
   memory_semantics semantics : 8;
   sync_scope scope : 8;

   bool operator==(const memory_sync_info& rhs) const
   {
      return storage == rhs.storage && semantics == rhs.semantics && scope == rhs.scope;
   }

   /* Whether this access may be freely reordered around other accesses:
    * requires no acquire/release, no volatile, and either no storage class
    * or the explicit can_reorder flag. */
   bool can_reorder() const
   {
      if (semantics & semantic_acqrel)
         return false;
      /* Also check storage so that zero-initialized memory_sync_info can be
       * reordered. */
      return (!storage || (semantics & semantic_can_reorder)) && !(semantics & semantic_volatile);
   }
};
static_assert(sizeof(memory_sync_info) == 3, "Unexpected padding");
139 
/* Floating-point rounding modes (ne = nearest even, pi = toward +inf,
 * ni = toward -inf, tz = toward zero), matching the MODE register encoding. */
enum fp_round {
   fp_round_ne = 0,
   fp_round_pi = 1,
   fp_round_ni = 2,
   fp_round_tz = 3,
};
146 
/* Denormal handling modes: keep_in preserves denormal inputs, keep_out
 * preserves denormal results, keep preserves both. */
enum fp_denorm {
   /* Note that v_rcp_f32, v_exp_f32, v_log_f32, v_sqrt_f32, v_rsq_f32 and
    * v_mad_f32/v_madak_f32/v_madmk_f32/v_mac_f32 always flush denormals. */
   fp_denorm_flush = 0x0,
   fp_denorm_keep_in = 0x1,
   fp_denorm_keep_out = 0x2,
   fp_denorm_keep = 0x3,
};
155 
/* Floating-point mode state: the low byte mirrors the hardware MODE register
 * (round + denorm settings for 32-bit and 16/64-bit ops), plus flags that
 * constrain which float optimizations are legal. */
struct float_mode {
   /* matches encoding of the MODE register */
   union {
      struct {
         fp_round round32 : 2;
         fp_round round16_64 : 2;
         unsigned denorm32 : 2;
         unsigned denorm16_64 : 2;
      };
      struct {
         uint8_t round : 4;
         uint8_t denorm : 4;
      };
      uint8_t val = 0;
   };
   /* if false, optimizations which may remove infs/nan/-0.0 can be done */
   bool preserve_signed_zero_inf_nan32 : 1;
   bool preserve_signed_zero_inf_nan16_64 : 1;
   /* if false, optimizations which may remove denormal flushing can be done */
   bool must_flush_denorms32 : 1;
   bool must_flush_denorms16_64 : 1;
   bool care_about_round32 : 1;
   bool care_about_round16_64 : 1;

   /* Returns true if instructions using the mode "other" can safely use the
    * current one instead: the MODE bits must match exactly and this mode must
    * be at least as strict in every preserve/flush/round flag. */
   bool canReplace(float_mode other) const noexcept
   {
      return val == other.val &&
             (preserve_signed_zero_inf_nan32 || !other.preserve_signed_zero_inf_nan32) &&
             (preserve_signed_zero_inf_nan16_64 || !other.preserve_signed_zero_inf_nan16_64) &&
             (must_flush_denorms32 || !other.must_flush_denorms32) &&
             (must_flush_denorms16_64 || !other.must_flush_denorms16_64) &&
             (care_about_round32 || !other.care_about_round32) &&
             (care_about_round16_64 || !other.care_about_round16_64);
   }
};
193 
/* Immediate operand of a wait instruction: one value per hardware wait
 * counter. unset_counter (0xff) means "don't wait on this counter".
 * Member functions are defined out of line. */
struct wait_imm {
   static const uint8_t unset_counter = 0xff;

   uint8_t vm;   /* vmcnt */
   uint8_t exp;  /* expcnt */
   uint8_t lgkm; /* lgkmcnt */
   uint8_t vs;   /* NOTE(review): presumably vscnt (separate VMEM store counter) — confirm */

   wait_imm();
   wait_imm(uint16_t vm_, uint16_t exp_, uint16_t lgkm_, uint16_t vs_);
   /* Decode a hardware-encoded immediate for the given gfx level. */
   wait_imm(enum amd_gfx_level chip, uint16_t packed);

   /* Encode into the hardware immediate format for the given gfx level. */
   uint16_t pack(enum amd_gfx_level chip) const;

   /* Merge another wait_imm into this one; returns whether this changed
    * (see out-of-line definition for exact semantics). */
   bool combine(const wait_imm& other);

   bool empty() const;

   void print(FILE* output) const;
};
214 
/* s_wait_event immediate bits. */
enum wait_event_imm : uint16_t {
   /* If this bit is 0, await that the export buffer space has been allocated.
    * In Primitive Ordered Pixel Shading, export ready means that the overlapped waves have exited
    * their ordered sections (by performing the `done` export), and that the current wave may enter
    * its ordered section.
    */
   wait_event_imm_dont_wait_export_ready = 0x1,
};
224 
225 constexpr Format
asVOP3(Format format)226 asVOP3(Format format)
227 {
228    return (Format)((uint32_t)Format::VOP3 | (uint32_t)format);
229 };
230 
231 constexpr Format
asSDWA(Format format)232 asSDWA(Format format)
233 {
234    assert(format == Format::VOP1 || format == Format::VOP2 || format == Format::VOPC);
235    return (Format)((uint32_t)Format::SDWA | (uint32_t)format);
236 }
237 
238 constexpr Format
withoutDPP(Format format)239 withoutDPP(Format format)
240 {
241    return (Format)((uint32_t)format & ~((uint32_t)Format::DPP16 | (uint32_t)Format::DPP8));
242 }
243 
244 constexpr Format
withoutVOP3(Format format)245 withoutVOP3(Format format)
246 {
247    return (Format)((uint32_t)format & ~((uint32_t)Format::VOP3));
248 }
249 
/* The two register files: scalar (SGPR, wave-uniform) and vector (VGPR,
 * per-lane) registers. */
enum class RegType {
   sgpr,
   vgpr,
};
254 
/* Register class: type and size of a register, packed into one byte.
 * Encoding (derived from the accessors below):
 *   bits 0-4: size, in dwords — or in bytes when bit 7 (subdword) is set
 *   bit 5:    VGPR (otherwise SGPR)
 *   bit 6:    linear VGPR (used for WWM and spills to vgpr)
 *   bit 7:    subdword (byte-sized) register class */
struct RegClass {

   enum RC : uint8_t {
      s1 = 1,
      s2 = 2,
      s3 = 3,
      s4 = 4,
      s6 = 6,
      s8 = 8,
      s16 = 16,
      v1 = s1 | (1 << 5),
      v2 = s2 | (1 << 5),
      v3 = s3 | (1 << 5),
      v4 = s4 | (1 << 5),
      v5 = 5 | (1 << 5),
      v6 = 6 | (1 << 5),
      v7 = 7 | (1 << 5),
      v8 = 8 | (1 << 5),
      /* byte-sized register class */
      v1b = v1 | (1 << 7),
      v2b = v2 | (1 << 7),
      v3b = v3 | (1 << 7),
      v4b = v4 | (1 << 7),
      v6b = v6 | (1 << 7),
      v8b = v8 | (1 << 7),
      /* these are used for WWM and spills to vgpr */
      v1_linear = v1 | (1 << 6),
      v2_linear = v2 | (1 << 6),
   };

   RegClass() = default;
   constexpr RegClass(RC rc_) : rc(rc_) {}
   constexpr RegClass(RegType type, unsigned size)
       : rc((RC)((type == RegType::vgpr ? 1 << 5 : 0) | size))
   {}

   constexpr operator RC() const { return rc; }
   explicit operator bool() = delete;

   constexpr RegType type() const { return rc <= RC::s16 ? RegType::sgpr : RegType::vgpr; }
   constexpr bool is_linear_vgpr() const { return rc & (1 << 6); };
   constexpr bool is_subdword() const { return rc & (1 << 7); }
   /* Size in bytes: the size field is in bytes for subdword classes, dwords otherwise. */
   constexpr unsigned bytes() const { return ((unsigned)rc & 0x1F) * (is_subdword() ? 1 : 4); }
   // TODO: use size() less in favor of bytes()
   /* Size in dwords, rounding subdword classes up. */
   constexpr unsigned size() const { return (bytes() + 3) >> 2; }
   constexpr bool is_linear() const { return rc <= RC::s16 || is_linear_vgpr(); }
   constexpr RegClass as_linear() const { return RegClass((RC)(rc | (1 << 6))); }
   constexpr RegClass as_subdword() const { return RegClass((RC)(rc | 1 << 7)); }

   /* Build the smallest register class of the given type covering "bytes":
    * SGPRs round up to whole dwords; VGPRs use a subdword class when the
    * size is not dword-aligned. */
   static constexpr RegClass get(RegType type, unsigned bytes)
   {
      if (type == RegType::sgpr) {
         return RegClass(type, DIV_ROUND_UP(bytes, 4u));
      } else {
         return bytes % 4u ? RegClass(type, bytes).as_subdword() : RegClass(type, bytes / 4u);
      }
   }

   /* Same class kind but with a different size; linearity is preserved. */
   constexpr RegClass resize(unsigned bytes) const
   {
      if (is_linear_vgpr()) {
         assert(bytes % 4u == 0);
         return get(RegType::vgpr, bytes).as_linear();
      }
      return get(type(), bytes);
   }

private:
   RC rc;
};
325 
/* transitional helper expressions */
/* Shorthand constants for the register classes above (sN = N dwords of SGPR,
 * vN = N dwords of VGPR, vNb = N bytes of subdword VGPR). */
static constexpr RegClass s1{RegClass::s1};
static constexpr RegClass s2{RegClass::s2};
static constexpr RegClass s3{RegClass::s3};
static constexpr RegClass s4{RegClass::s4};
static constexpr RegClass s8{RegClass::s8};
static constexpr RegClass s16{RegClass::s16};
static constexpr RegClass v1{RegClass::v1};
static constexpr RegClass v2{RegClass::v2};
static constexpr RegClass v3{RegClass::v3};
static constexpr RegClass v4{RegClass::v4};
static constexpr RegClass v5{RegClass::v5};
static constexpr RegClass v6{RegClass::v6};
static constexpr RegClass v7{RegClass::v7};
static constexpr RegClass v8{RegClass::v8};
static constexpr RegClass v1b{RegClass::v1b};
static constexpr RegClass v2b{RegClass::v2b};
static constexpr RegClass v3b{RegClass::v3b};
static constexpr RegClass v4b{RegClass::v4b};
static constexpr RegClass v6b{RegClass::v6b};
static constexpr RegClass v8b{RegClass::v8b};
347 
/**
 * Temp Class
 * Each temporary virtual register has a
 * register class (i.e. size and type)
 * and SSA id. Packed into 4 bytes: 24-bit id, 8-bit register class.
 * id 0 is reserved as "no temporary" (see Operand/Definition).
 */
struct Temp {
   Temp() noexcept : id_(0), reg_class(0) {}
   constexpr Temp(uint32_t id, RegClass cls) noexcept : id_(id), reg_class(uint8_t(cls)) {}

   constexpr uint32_t id() const noexcept { return id_; }
   constexpr RegClass regClass() const noexcept { return (RegClass::RC)reg_class; }

   constexpr unsigned bytes() const noexcept { return regClass().bytes(); }
   constexpr unsigned size() const noexcept { return regClass().size(); }
   constexpr RegType type() const noexcept { return regClass().type(); }
   constexpr bool is_linear() const noexcept { return regClass().is_linear(); }

   /* Comparisons use the SSA id only; the register class is ignored. */
   constexpr bool operator<(Temp other) const noexcept { return id() < other.id(); }
   constexpr bool operator==(Temp other) const noexcept { return id() == other.id(); }
   constexpr bool operator!=(Temp other) const noexcept { return id() != other.id(); }

private:
   uint32_t id_ : 24;
   uint32_t reg_class : 8;
};
374 
/**
 * PhysReg
 * Represents the physical register for each
 * Operand and Definition.
 * Stored as a byte address (register index * 4 + byte offset) so that
 * subdword locations can be represented.
 */
struct PhysReg {
   constexpr PhysReg() = default;
   explicit constexpr PhysReg(unsigned r) : reg_b(r << 2) {}
   constexpr unsigned reg() const { return reg_b >> 2; }
   constexpr unsigned byte() const { return reg_b & 0x3; }
   constexpr operator unsigned() const { return reg(); }
   constexpr bool operator==(PhysReg other) const { return reg_b == other.reg_b; }
   constexpr bool operator!=(PhysReg other) const { return reg_b != other.reg_b; }
   constexpr bool operator<(PhysReg other) const { return reg_b < other.reg_b; }
   /* Returns a copy advanced by "bytes" bytes (may cross a register boundary). */
   constexpr PhysReg advance(int bytes) const
   {
      PhysReg res = *this;
      res.reg_b += bytes;
      return res;
   }

   uint16_t reg_b = 0;
};
398 
/* helper expressions for special registers */
static constexpr PhysReg m0{124};
static constexpr PhysReg flat_scr_lo{102}; /* GFX8-GFX9, encoded differently on GFX6-7 */
static constexpr PhysReg flat_scr_hi{103}; /* GFX8-GFX9, encoded differently on GFX6-7 */
static constexpr PhysReg vcc{106};
static constexpr PhysReg vcc_hi{107};
static constexpr PhysReg tba{108}; /* GFX6-GFX8 */
static constexpr PhysReg tma{110}; /* GFX6-GFX8 */
static constexpr PhysReg ttmp0{112};
static constexpr PhysReg ttmp1{113};
static constexpr PhysReg ttmp2{114};
static constexpr PhysReg ttmp3{115};
static constexpr PhysReg ttmp4{116};
static constexpr PhysReg ttmp5{117};
static constexpr PhysReg ttmp6{118};
static constexpr PhysReg ttmp7{119};
static constexpr PhysReg ttmp8{120};
static constexpr PhysReg ttmp9{121};
static constexpr PhysReg ttmp10{122};
static constexpr PhysReg ttmp11{123};
static constexpr PhysReg sgpr_null{125}; /* GFX10+ */
static constexpr PhysReg exec{126};      /* alias of exec_lo */
static constexpr PhysReg exec_lo{126};
static constexpr PhysReg exec_hi{127};
static constexpr PhysReg pops_exiting_wave_id{239}; /* GFX9-GFX10.3 */
static constexpr PhysReg scc{253};
425 
/**
 * Operand Class
 * Initially, each Operand refers to either
 * a temporary virtual register
 * or to a constant value
 * Temporary registers get mapped to physical register during RA
 * Constant values are inlined into the instruction sequence.
 *
 * Constant operands encode the inline-constant register in reg_:
 * 128..192 = integers 0..64, 193..208 = -1..-16, 240..248 = fp constants,
 * 255 = literal (stored in data_.i).
 */
class Operand final {
public:
   /* Default: an undefined operand, fixed to register 128. */
   constexpr Operand()
       : reg_(PhysReg{128}), isTemp_(false), isFixed_(true), isConstant_(false), isKill_(false),
         isUndef_(true), isFirstKill_(false), constSize(0), isLateKill_(false), is16bit_(false),
         is24bit_(false), signext(false)
   {}

   explicit Operand(Temp r) noexcept
   {
      data_.temp = r;
      if (r.id()) {
         isTemp_ = true;
      } else {
         /* id 0 means "no temporary": treat as undefined */
         isUndef_ = true;
         setFixed(PhysReg{128});
      }
   };
   explicit Operand(Temp r, PhysReg reg) noexcept
   {
      assert(r.id()); /* Don't allow fixing an undef to a register */
      data_.temp = r;
      isTemp_ = true;
      setFixed(reg);
   };

   /* 8-bit constant */
   static Operand c8(uint8_t v) noexcept
   {
      /* 8-bit constants are only used for copies and copies from any 8-bit
       * constant can be implemented with a SDWA v_mul_u32_u24. So consider all
       * to be inline constants. */
      Operand op;
      op.control_ = 0;
      op.data_.i = v;
      op.isConstant_ = true;
      op.constSize = 0;
      op.setFixed(PhysReg{0u});
      return op;
   };

   /* 16-bit constant */
   static Operand c16(uint16_t v) noexcept
   {
      Operand op;
      op.control_ = 0;
      op.data_.i = v;
      op.isConstant_ = true;
      op.constSize = 1;
      if (v <= 64)
         op.setFixed(PhysReg{128u + v});
      else if (v >= 0xFFF0) /* [-16 .. -1] */
         op.setFixed(PhysReg{(unsigned)(192 - (int16_t)v)});
      else if (v == 0x3800) /* 0.5 */
         op.setFixed(PhysReg{240});
      else if (v == 0xB800) /* -0.5 */
         op.setFixed(PhysReg{241});
      else if (v == 0x3C00) /* 1.0 */
         op.setFixed(PhysReg{242});
      else if (v == 0xBC00) /* -1.0 */
         op.setFixed(PhysReg{243});
      else if (v == 0x4000) /* 2.0 */
         op.setFixed(PhysReg{244});
      else if (v == 0xC000) /* -2.0 */
         op.setFixed(PhysReg{245});
      else if (v == 0x4400) /* 4.0 */
         op.setFixed(PhysReg{246});
      else if (v == 0xC400) /* -4.0 */
         op.setFixed(PhysReg{247});
      else if (v == 0x3118) /* 1/2 PI */
         op.setFixed(PhysReg{248});
      else /* Literal Constant */
         op.setFixed(PhysReg{255});
      return op;
   }

   /* 32-bit constant */
   static Operand c32(uint32_t v) noexcept { return c32_or_c64(v, false); }

   /* 64-bit constant */
   static Operand c64(uint64_t v) noexcept
   {
      Operand op;
      op.control_ = 0;
      op.isConstant_ = true;
      op.constSize = 3;
      if (v <= 64) {
         op.data_.i = (uint32_t)v;
         op.setFixed(PhysReg{128 + (uint32_t)v});
      } else if (v >= 0xFFFFFFFFFFFFFFF0) { /* [-16 .. -1] */
         op.data_.i = (uint32_t)v;
         op.setFixed(PhysReg{192 - (uint32_t)v});
      } else if (v == 0x3FE0000000000000) { /* 0.5 */
         op.data_.i = 0x3f000000;
         op.setFixed(PhysReg{240});
      } else if (v == 0xBFE0000000000000) { /* -0.5 */
         op.data_.i = 0xbf000000;
         op.setFixed(PhysReg{241});
      } else if (v == 0x3FF0000000000000) { /* 1.0 */
         op.data_.i = 0x3f800000;
         op.setFixed(PhysReg{242});
      } else if (v == 0xBFF0000000000000) { /* -1.0 */
         op.data_.i = 0xbf800000;
         op.setFixed(PhysReg{243});
      } else if (v == 0x4000000000000000) { /* 2.0 */
         op.data_.i = 0x40000000;
         op.setFixed(PhysReg{244});
      } else if (v == 0xC000000000000000) { /* -2.0 */
         op.data_.i = 0xc0000000;
         op.setFixed(PhysReg{245});
      } else if (v == 0x4010000000000000) { /* 4.0 */
         op.data_.i = 0x40800000;
         op.setFixed(PhysReg{246});
      } else if (v == 0xC010000000000000) { /* -4.0 */
         op.data_.i = 0xc0800000;
         op.setFixed(PhysReg{247});
      } else { /* Literal Constant: we don't know if it is a long or double.*/
         op.signext = v >> 63;
         op.data_.i = v & 0xffffffffu;
         op.setFixed(PhysReg{255});
         assert(op.constantValue64() == v &&
                "attempt to create a unrepresentable 64-bit literal constant");
      }
      return op;
   }

   /* 32-bit constant stored as a 32-bit or 64-bit operand */
   static Operand c32_or_c64(uint32_t v, bool is64bit) noexcept
   {
      Operand op;
      op.control_ = 0;
      op.data_.i = v;
      op.isConstant_ = true;
      op.constSize = is64bit ? 3 : 2;
      if (v <= 64)
         op.setFixed(PhysReg{128 + v});
      else if (v >= 0xFFFFFFF0) /* [-16 .. -1] */
         op.setFixed(PhysReg{192 - v});
      else if (v == 0x3f000000) /* 0.5 */
         op.setFixed(PhysReg{240});
      else if (v == 0xbf000000) /* -0.5 */
         op.setFixed(PhysReg{241});
      else if (v == 0x3f800000) /* 1.0 */
         op.setFixed(PhysReg{242});
      else if (v == 0xbf800000) /* -1.0 */
         op.setFixed(PhysReg{243});
      else if (v == 0x40000000) /* 2.0 */
         op.setFixed(PhysReg{244});
      else if (v == 0xc0000000) /* -2.0 */
         op.setFixed(PhysReg{245});
      else if (v == 0x40800000) /* 4.0 */
         op.setFixed(PhysReg{246});
      else if (v == 0xc0800000) /* -4.0 */
         op.setFixed(PhysReg{247});
      else { /* Literal Constant */
         assert(!is64bit && "attempt to create a 64-bit literal constant");
         op.setFixed(PhysReg{255});
      }
      return op;
   }

   /* 32-bit constant forced to be a literal (reg 255), even if it would fit
    * an inline constant. */
   static Operand literal32(uint32_t v) noexcept
   {
      Operand op;
      op.control_ = 0;
      op.data_.i = v;
      op.isConstant_ = true;
      op.constSize = 2;
      op.setFixed(PhysReg{255});
      return op;
   }

   /* Undefined operand of the given register class. */
   explicit Operand(RegClass type) noexcept
   {
      isUndef_ = true;
      data_.temp = Temp(0, type);
      setFixed(PhysReg{128});
   };
   /* Operand referring to a fixed physical register (no temporary). */
   explicit Operand(PhysReg reg, RegClass type) noexcept
   {
      data_.temp = Temp(0, type);
      setFixed(reg);
   }

   static Operand zero(unsigned bytes = 4) noexcept
   {
      if (bytes == 8)
         return Operand::c64(0);
      else if (bytes == 4)
         return Operand::c32(0);
      else if (bytes == 2)
         return Operand::c16(0);
      assert(bytes == 1);
      return Operand::c8(0);
   }

   /* This is useful over the constructors when you want to take a gfx level
    * for 1/2 PI or an unknown operand size.
    */
   static Operand get_const(enum amd_gfx_level chip, uint64_t val, unsigned bytes)
   {
      if (val == 0x3e22f983 && bytes == 4 && chip >= GFX8) {
         /* 1/2 PI can be an inline constant on GFX8+ */
         Operand op = Operand::c32(val);
         op.setFixed(PhysReg{248});
         return op;
      }

      if (bytes == 8)
         return Operand::c64(val);
      else if (bytes == 4)
         return Operand::c32(val);
      else if (bytes == 2)
         return Operand::c16(val);
      assert(bytes == 1);
      return Operand::c8(val);
   }

   /* Whether val can be represented as a constant operand of the given size.
    * zext/sext allow a 64-bit value that is a zero-/sign-extension of a
    * 32-bit constant. */
   static bool is_constant_representable(uint64_t val, unsigned bytes, bool zext = false,
                                         bool sext = false)
   {
      if (bytes <= 4)
         return true;

      if (zext && (val & 0xFFFFFFFF00000000) == 0x0000000000000000)
         return true;
      uint64_t upper33 = val & 0xFFFFFFFF80000000;
      if (sext && (upper33 == 0xFFFFFFFF80000000 || upper33 == 0))
         return true;

      return val >= 0xFFFFFFFFFFFFFFF0 || val <= 64 || /* [-16 .. 64] */
             val == 0x3FE0000000000000 ||              /* 0.5 */
             val == 0xBFE0000000000000 ||              /* -0.5 */
             val == 0x3FF0000000000000 ||              /* 1.0 */
             val == 0xBFF0000000000000 ||              /* -1.0 */
             val == 0x4000000000000000 ||              /* 2.0 */
             val == 0xC000000000000000 ||              /* -2.0 */
             val == 0x4010000000000000 ||              /* 4.0 */
             val == 0xC010000000000000;                /* -4.0 */
   }

   constexpr bool isTemp() const noexcept { return isTemp_; }

   constexpr void setTemp(Temp t) noexcept
   {
      assert(!isConstant_);
      isTemp_ = true;
      data_.temp = t;
   }

   constexpr Temp getTemp() const noexcept { return data_.temp; }

   constexpr uint32_t tempId() const noexcept { return data_.temp.id(); }

   constexpr bool hasRegClass() const noexcept { return !isConstant(); }

   constexpr RegClass regClass() const noexcept { return data_.temp.regClass(); }

   /* Size in bytes: 1 << constSize for constants, the temp's size otherwise. */
   constexpr unsigned bytes() const noexcept
   {
      if (isConstant())
         return 1 << constSize;
      else
         return data_.temp.bytes();
   }

   /* Size in dwords (constants smaller than a dword count as one). */
   constexpr unsigned size() const noexcept
   {
      if (isConstant())
         return constSize > 2 ? 2 : 1;
      else
         return data_.temp.size();
   }

   constexpr bool isFixed() const noexcept { return isFixed_; }

   constexpr PhysReg physReg() const noexcept { return reg_; }

   constexpr void setFixed(PhysReg reg) noexcept
   {
      isFixed_ = reg != unsigned(-1);
      reg_ = reg;
   }

   constexpr bool isConstant() const noexcept { return isConstant_; }

   /* A constant that does not fit an inline-constant register (reg 255). */
   constexpr bool isLiteral() const noexcept { return isConstant() && reg_ == 255; }

   constexpr bool isUndefined() const noexcept { return isUndef_; }

   constexpr uint32_t constantValue() const noexcept { return data_.i; }

   constexpr bool constantEquals(uint32_t cmp) const noexcept
   {
      return isConstant() && constantValue() == cmp;
   }

   /* Full 64-bit value of the constant; for 64-bit operands the value is
    * reconstructed from the inline-constant register (or sign-extended from
    * data_.i for literals). */
   constexpr uint64_t constantValue64() const noexcept
   {
      if (constSize == 3) {
         if (reg_ <= 192)
            return reg_ - 128;
         else if (reg_ <= 208)
            return 0xFFFFFFFFFFFFFFFF - (reg_ - 193);

         switch (reg_) {
         case 240: return 0x3FE0000000000000;
         case 241: return 0xBFE0000000000000;
         case 242: return 0x3FF0000000000000;
         case 243: return 0xBFF0000000000000;
         case 244: return 0x4000000000000000;
         case 245: return 0xC000000000000000;
         case 246: return 0x4010000000000000;
         case 247: return 0xC010000000000000;
         case 255:
            return (signext && (data_.i & 0x80000000u) ? 0xffffffff00000000ull : 0ull) | data_.i;
         }
         unreachable("invalid register for 64-bit constant");
      } else {
         return data_.i;
      }
   }

   /* Value if this were used with vop3/opsel or vop3p. */
   constexpr uint16_t constantValue16(bool opsel) const noexcept
   {
      assert(bytes() == 2 || bytes() == 4);
      if (opsel) {
         if (bytes() == 2 && int16_t(data_.i) >= -16 && int16_t(data_.i) <= 64 && !isLiteral())
            return int16_t(data_.i) >>
                   16; /* 16-bit inline integers are sign-extended, even with fp16 instrs */
         else
            return data_.i >> 16;
      }
      return data_.i;
   }

   constexpr bool isOfType(RegType type) const noexcept
   {
      return hasRegClass() && regClass().type() == type;
   }

   /* Indicates that the killed operand's live range intersects with the
    * instruction's definitions. Unlike isKill() and isFirstKill(), this is
    * not set by liveness analysis. */
   constexpr void setLateKill(bool flag) noexcept { isLateKill_ = flag; }

   constexpr bool isLateKill() const noexcept { return isLateKill_; }

   /* Clearing the kill flag also clears first-kill. */
   constexpr void setKill(bool flag) noexcept
   {
      isKill_ = flag;
      if (!flag)
         setFirstKill(false);
   }

   constexpr bool isKill() const noexcept { return isKill_ || isFirstKill(); }

   /* Setting first-kill implies kill. */
   constexpr void setFirstKill(bool flag) noexcept
   {
      isFirstKill_ = flag;
      if (flag)
         setKill(flag);
   }

   /* When there are multiple operands killing the same temporary,
    * isFirstKill() is only returns true for the first one. */
   constexpr bool isFirstKill() const noexcept { return isFirstKill_; }

   constexpr bool isKillBeforeDef() const noexcept { return isKill() && !isLateKill(); }

   constexpr bool isFirstKillBeforeDef() const noexcept { return isFirstKill() && !isLateKill(); }

   constexpr bool operator==(Operand other) const noexcept
   {
      if (other.size() != size())
         return false;
      if (isFixed() != other.isFixed() || isKillBeforeDef() != other.isKillBeforeDef())
         return false;
      if (isFixed() && other.isFixed() && physReg() != other.physReg())
         return false;
      if (isLiteral())
         return other.isLiteral() && other.constantValue() == constantValue();
      else if (isConstant())
         return other.isConstant() && other.physReg() == physReg();
      else if (isUndefined())
         return other.isUndefined() && other.regClass() == regClass();
      else
         return other.isTemp() && other.getTemp() == getTemp();
   }

   constexpr bool operator!=(Operand other) const noexcept { return !operator==(other); }

   constexpr void set16bit(bool flag) noexcept { is16bit_ = flag; }

   constexpr bool is16bit() const noexcept { return is16bit_; }

   constexpr void set24bit(bool flag) noexcept { is24bit_ = flag; }

   constexpr bool is24bit() const noexcept { return is24bit_; }

private:
   union {
      Temp temp;
      uint32_t i;
      float f;
   } data_ = {Temp(0, s1)};
   PhysReg reg_;
   union {
      struct {
         uint8_t isTemp_ : 1;
         uint8_t isFixed_ : 1;
         uint8_t isConstant_ : 1;
         uint8_t isKill_ : 1;
         uint8_t isUndef_ : 1;
         uint8_t isFirstKill_ : 1;
         uint8_t constSize : 2;
         uint8_t isLateKill_ : 1;
         uint8_t is16bit_ : 1;
         uint8_t is24bit_ : 1;
         uint8_t signext : 1;
      };
      /* can't initialize bit-fields in c++11, so work around using a union */
      uint16_t control_ = 0;
   };
};
860 
/**
 * Definition Class
 * Definitions are the results of Instructions
 * and refer to temporary virtual registers
 * which are later mapped to physical registers
 */
class Definition final {
public:
   constexpr Definition()
       : temp(Temp(0, s1)), reg_(0), isFixed_(0), isKill_(0), isPrecise_(0), isNUW_(0), isNoCSE_(0)
   {}
   /* Definition of the temporary with the given id and register class. */
   Definition(uint32_t index, RegClass type) noexcept : temp(index, type) {}
   explicit Definition(Temp tmp) noexcept : temp(tmp) {}
   /* Definition fixed to a physical register, without a temporary id. */
   Definition(PhysReg reg, RegClass type) noexcept : temp(Temp(0, type)) { setFixed(reg); }
   /* Definition of a temporary that is also fixed to a physical register. */
   Definition(uint32_t tmpId, PhysReg reg, RegClass type) noexcept : temp(Temp(tmpId, type))
   {
      setFixed(reg);
   }

   /* A temporary id of 0 means "no temporary". */
   constexpr bool isTemp() const noexcept { return tempId() > 0; }

   constexpr Temp getTemp() const noexcept { return temp; }

   constexpr uint32_t tempId() const noexcept { return temp.id(); }

   constexpr void setTemp(Temp t) noexcept { temp = t; }

   void swapTemp(Definition& other) noexcept { std::swap(temp, other.temp); }

   constexpr RegClass regClass() const noexcept { return temp.regClass(); }

   constexpr unsigned bytes() const noexcept { return temp.bytes(); }

   constexpr unsigned size() const noexcept { return temp.size(); }

   /* Whether this definition has been assigned a specific physical register. */
   constexpr bool isFixed() const noexcept { return isFixed_; }

   constexpr PhysReg physReg() const noexcept { return reg_; }

   constexpr void setFixed(PhysReg reg) noexcept
   {
      isFixed_ = 1;
      reg_ = reg;
   }

   constexpr void setKill(bool flag) noexcept { isKill_ = flag; }

   constexpr bool isKill() const noexcept { return isKill_; }

   /* "precise" result — presumably forbids value-changing optimizations
    * (e.g. fast-math transforms); confirm against the optimizer passes. */
   constexpr void setPrecise(bool precise) noexcept { isPrecise_ = precise; }

   constexpr bool isPrecise() const noexcept { return isPrecise_; }

   /* No Unsigned Wrap */
   constexpr void setNUW(bool nuw) noexcept { isNUW_ = nuw; }

   constexpr bool isNUW() const noexcept { return isNUW_; }

   /* When set, this definition should not be deduplicated — inferred from
    * the name; confirm against the value-numbering/CSE pass. */
   constexpr void setNoCSE(bool noCSE) noexcept { isNoCSE_ = noCSE; }

   constexpr bool isNoCSE() const noexcept { return isNoCSE_; }

private:
   Temp temp = Temp(0, s1);
   PhysReg reg_; /* only written by setFixed() */
   union {
      struct {
         uint8_t isFixed_ : 1;
         uint8_t isKill_ : 1;
         uint8_t isPrecise_ : 1;
         uint8_t isNUW_ : 1;
         uint8_t isNoCSE_ : 1;
      };
      /* can't initialize bit-fields in c++11, so work around using a union */
      uint8_t control_ = 0;
   };
};
938 
/* Forward declarations; the concrete instruction formats are defined further
 * below and Instruction's typed accessors cast to them. */
struct Block;
struct Instruction;
struct Pseudo_instruction;
struct SOP1_instruction;
struct SOP2_instruction;
struct SOPK_instruction;
struct SOPP_instruction;
struct SOPC_instruction;
struct SMEM_instruction;
struct DS_instruction;
struct LDSDIR_instruction;
struct MTBUF_instruction;
struct MUBUF_instruction;
struct MIMG_instruction;
struct Export_instruction;
struct FLAT_instruction;
struct Pseudo_branch_instruction;
struct Pseudo_barrier_instruction;
struct Pseudo_reduction_instruction;
struct VALU_instruction;
struct VINTERP_inreg_instruction;
struct VINTRP_instruction;
struct VOPD_instruction;
struct DPP16_instruction;
struct DPP8_instruction;
struct SDWA_instruction;
965 
/* Common base of all instructions. 'format' both identifies the encoding and
 * determines which *_instruction subclass this object really is; the typed
 * accessors below assert on the format before down-casting. */
struct Instruction {
   aco_opcode opcode;
   Format format;
   uint32_t pass_flags; /* scratch storage; meaning is defined per pass — confirm in each pass */

   aco::span<Operand> operands;
   aco::span<Definition> definitions;

   /* Declared here, defined out of line. */
   constexpr bool usesModifiers() const noexcept;

   /* Whether any operand is fixed to one half of the exec mask. */
   constexpr bool reads_exec() const noexcept
   {
      for (const Operand& op : operands) {
         if (op.isFixed() && (op.physReg() == exec_lo || op.physReg() == exec_hi))
            return true;
      }
      return false;
   }

   /* Whether any definition is fixed to one half of the exec mask. */
   constexpr bool writes_exec() const noexcept
   {
      for (const Definition& def : definitions) {
         if (def.isFixed() && (def.physReg() == exec_lo || def.physReg() == exec_hi))
            return true;
      }
      return false;
   }

   /* Typed down-casts, one const/non-const pair per format, each guarded by
    * an assertion. Predicates that test with '&' treat the format value as a
    * bit flag (so it can be combined with a base encoding); the others
    * compare for equality. */
   Pseudo_instruction& pseudo() noexcept
   {
      assert(isPseudo());
      return *(Pseudo_instruction*)this;
   }
   const Pseudo_instruction& pseudo() const noexcept
   {
      assert(isPseudo());
      return *(Pseudo_instruction*)this;
   }
   constexpr bool isPseudo() const noexcept { return format == Format::PSEUDO; }
   SOP1_instruction& sop1() noexcept
   {
      assert(isSOP1());
      return *(SOP1_instruction*)this;
   }
   const SOP1_instruction& sop1() const noexcept
   {
      assert(isSOP1());
      return *(SOP1_instruction*)this;
   }
   constexpr bool isSOP1() const noexcept { return format == Format::SOP1; }
   SOP2_instruction& sop2() noexcept
   {
      assert(isSOP2());
      return *(SOP2_instruction*)this;
   }
   const SOP2_instruction& sop2() const noexcept
   {
      assert(isSOP2());
      return *(SOP2_instruction*)this;
   }
   constexpr bool isSOP2() const noexcept { return format == Format::SOP2; }
   SOPK_instruction& sopk() noexcept
   {
      assert(isSOPK());
      return *(SOPK_instruction*)this;
   }
   const SOPK_instruction& sopk() const noexcept
   {
      assert(isSOPK());
      return *(SOPK_instruction*)this;
   }
   constexpr bool isSOPK() const noexcept { return format == Format::SOPK; }
   SOPP_instruction& sopp() noexcept
   {
      assert(isSOPP());
      return *(SOPP_instruction*)this;
   }
   const SOPP_instruction& sopp() const noexcept
   {
      assert(isSOPP());
      return *(SOPP_instruction*)this;
   }
   constexpr bool isSOPP() const noexcept { return format == Format::SOPP; }
   SOPC_instruction& sopc() noexcept
   {
      assert(isSOPC());
      return *(SOPC_instruction*)this;
   }
   const SOPC_instruction& sopc() const noexcept
   {
      assert(isSOPC());
      return *(SOPC_instruction*)this;
   }
   constexpr bool isSOPC() const noexcept { return format == Format::SOPC; }
   SMEM_instruction& smem() noexcept
   {
      assert(isSMEM());
      return *(SMEM_instruction*)this;
   }
   const SMEM_instruction& smem() const noexcept
   {
      assert(isSMEM());
      return *(SMEM_instruction*)this;
   }
   constexpr bool isSMEM() const noexcept { return format == Format::SMEM; }
   DS_instruction& ds() noexcept
   {
      assert(isDS());
      return *(DS_instruction*)this;
   }
   const DS_instruction& ds() const noexcept
   {
      assert(isDS());
      return *(DS_instruction*)this;
   }
   constexpr bool isDS() const noexcept { return format == Format::DS; }
   LDSDIR_instruction& ldsdir() noexcept
   {
      assert(isLDSDIR());
      return *(LDSDIR_instruction*)this;
   }
   const LDSDIR_instruction& ldsdir() const noexcept
   {
      assert(isLDSDIR());
      return *(LDSDIR_instruction*)this;
   }
   constexpr bool isLDSDIR() const noexcept { return format == Format::LDSDIR; }
   MTBUF_instruction& mtbuf() noexcept
   {
      assert(isMTBUF());
      return *(MTBUF_instruction*)this;
   }
   const MTBUF_instruction& mtbuf() const noexcept
   {
      assert(isMTBUF());
      return *(MTBUF_instruction*)this;
   }
   constexpr bool isMTBUF() const noexcept { return format == Format::MTBUF; }
   MUBUF_instruction& mubuf() noexcept
   {
      assert(isMUBUF());
      return *(MUBUF_instruction*)this;
   }
   const MUBUF_instruction& mubuf() const noexcept
   {
      assert(isMUBUF());
      return *(MUBUF_instruction*)this;
   }
   constexpr bool isMUBUF() const noexcept { return format == Format::MUBUF; }
   MIMG_instruction& mimg() noexcept
   {
      assert(isMIMG());
      return *(MIMG_instruction*)this;
   }
   const MIMG_instruction& mimg() const noexcept
   {
      assert(isMIMG());
      return *(MIMG_instruction*)this;
   }
   constexpr bool isMIMG() const noexcept { return format == Format::MIMG; }
   Export_instruction& exp() noexcept
   {
      assert(isEXP());
      return *(Export_instruction*)this;
   }
   const Export_instruction& exp() const noexcept
   {
      assert(isEXP());
      return *(Export_instruction*)this;
   }
   constexpr bool isEXP() const noexcept { return format == Format::EXP; }
   /* FLAT, GLOBAL and SCRATCH all share the FLAT_instruction layout. */
   FLAT_instruction& flat() noexcept
   {
      assert(isFlat());
      return *(FLAT_instruction*)this;
   }
   const FLAT_instruction& flat() const noexcept
   {
      assert(isFlat());
      return *(FLAT_instruction*)this;
   }
   constexpr bool isFlat() const noexcept { return format == Format::FLAT; }
   FLAT_instruction& global() noexcept
   {
      assert(isGlobal());
      return *(FLAT_instruction*)this;
   }
   const FLAT_instruction& global() const noexcept
   {
      assert(isGlobal());
      return *(FLAT_instruction*)this;
   }
   constexpr bool isGlobal() const noexcept { return format == Format::GLOBAL; }
   FLAT_instruction& scratch() noexcept
   {
      assert(isScratch());
      return *(FLAT_instruction*)this;
   }
   const FLAT_instruction& scratch() const noexcept
   {
      assert(isScratch());
      return *(FLAT_instruction*)this;
   }
   constexpr bool isScratch() const noexcept { return format == Format::SCRATCH; }
   Pseudo_branch_instruction& branch() noexcept
   {
      assert(isBranch());
      return *(Pseudo_branch_instruction*)this;
   }
   const Pseudo_branch_instruction& branch() const noexcept
   {
      assert(isBranch());
      return *(Pseudo_branch_instruction*)this;
   }
   constexpr bool isBranch() const noexcept { return format == Format::PSEUDO_BRANCH; }
   Pseudo_barrier_instruction& barrier() noexcept
   {
      assert(isBarrier());
      return *(Pseudo_barrier_instruction*)this;
   }
   const Pseudo_barrier_instruction& barrier() const noexcept
   {
      assert(isBarrier());
      return *(Pseudo_barrier_instruction*)this;
   }
   constexpr bool isBarrier() const noexcept { return format == Format::PSEUDO_BARRIER; }
   Pseudo_reduction_instruction& reduction() noexcept
   {
      assert(isReduction());
      return *(Pseudo_reduction_instruction*)this;
   }
   const Pseudo_reduction_instruction& reduction() const noexcept
   {
      assert(isReduction());
      return *(Pseudo_reduction_instruction*)this;
   }
   constexpr bool isReduction() const noexcept { return format == Format::PSEUDO_REDUCTION; }
   constexpr bool isVOP3P() const noexcept { return (uint16_t)format & (uint16_t)Format::VOP3P; }
   VINTERP_inreg_instruction& vinterp_inreg() noexcept
   {
      assert(isVINTERP_INREG());
      return *(VINTERP_inreg_instruction*)this;
   }
   const VINTERP_inreg_instruction& vinterp_inreg() const noexcept
   {
      assert(isVINTERP_INREG());
      return *(VINTERP_inreg_instruction*)this;
   }
   constexpr bool isVINTERP_INREG() const noexcept { return format == Format::VINTERP_INREG; }
   VOPD_instruction& vopd() noexcept
   {
      assert(isVOPD());
      return *(VOPD_instruction*)this;
   }
   const VOPD_instruction& vopd() const noexcept
   {
      assert(isVOPD());
      return *(VOPD_instruction*)this;
   }
   constexpr bool isVOPD() const noexcept { return format == Format::VOPD; }
   constexpr bool isVOP1() const noexcept { return (uint16_t)format & (uint16_t)Format::VOP1; }
   constexpr bool isVOP2() const noexcept { return (uint16_t)format & (uint16_t)Format::VOP2; }
   constexpr bool isVOPC() const noexcept { return (uint16_t)format & (uint16_t)Format::VOPC; }
   constexpr bool isVOP3() const noexcept { return (uint16_t)format & (uint16_t)Format::VOP3; }
   VINTRP_instruction& vintrp() noexcept
   {
      assert(isVINTRP());
      return *(VINTRP_instruction*)this;
   }
   const VINTRP_instruction& vintrp() const noexcept
   {
      assert(isVINTRP());
      return *(VINTRP_instruction*)this;
   }
   constexpr bool isVINTRP() const noexcept { return format == Format::VINTRP; }
   DPP16_instruction& dpp16() noexcept
   {
      assert(isDPP16());
      return *(DPP16_instruction*)this;
   }
   const DPP16_instruction& dpp16() const noexcept
   {
      assert(isDPP16());
      return *(DPP16_instruction*)this;
   }
   constexpr bool isDPP16() const noexcept { return (uint16_t)format & (uint16_t)Format::DPP16; }
   DPP8_instruction& dpp8() noexcept
   {
      assert(isDPP8());
      return *(DPP8_instruction*)this;
   }
   const DPP8_instruction& dpp8() const noexcept
   {
      assert(isDPP8());
      return *(DPP8_instruction*)this;
   }
   constexpr bool isDPP8() const noexcept { return (uint16_t)format & (uint16_t)Format::DPP8; }
   constexpr bool isDPP() const noexcept { return isDPP16() || isDPP8(); }
   SDWA_instruction& sdwa() noexcept
   {
      assert(isSDWA());
      return *(SDWA_instruction*)this;
   }
   const SDWA_instruction& sdwa() const noexcept
   {
      assert(isSDWA());
      return *(SDWA_instruction*)this;
   }
   constexpr bool isSDWA() const noexcept { return (uint16_t)format & (uint16_t)Format::SDWA; }

   /* Unchecked cast: like flat()/global()/scratch() but without asserting a
    * specific format — only meaningful when isFlatLike() holds. */
   FLAT_instruction& flatlike() { return *(FLAT_instruction*)this; }

   const FLAT_instruction& flatlike() const { return *(FLAT_instruction*)this; }

   constexpr bool isFlatLike() const noexcept { return isFlat() || isGlobal() || isScratch(); }

   VALU_instruction& valu() noexcept
   {
      assert(isVALU());
      return *(VALU_instruction*)this;
   }
   const VALU_instruction& valu() const noexcept
   {
      assert(isVALU());
      return *(VALU_instruction*)this;
   }
   /* Any vector-ALU encoding. */
   constexpr bool isVALU() const noexcept
   {
      return isVOP1() || isVOP2() || isVOPC() || isVOP3() || isVOP3P() || isVINTERP_INREG() ||
             isVOPD();
   }

   /* Any scalar-ALU encoding. */
   constexpr bool isSALU() const noexcept
   {
      return isSOP1() || isSOP2() || isSOPC() || isSOPK() || isSOPP();
   }

   /* Any buffer/image vector-memory encoding (FLAT-like formats excluded). */
   constexpr bool isVMEM() const noexcept { return isMTBUF() || isMUBUF() || isMIMG(); }

   /* Defined out of line. */
   bool accessesLDS() const noexcept;
   bool isTrans() const noexcept;
};
static_assert(sizeof(Instruction) == 16, "Unexpected padding");
1309 
struct SOPK_instruction : public Instruction {
   uint16_t imm; /* 16-bit immediate encoded in the instruction word */
   uint16_t padding;
};
static_assert(sizeof(SOPK_instruction) == sizeof(Instruction) + 4, "Unexpected padding");
1315 
struct SOPP_instruction : public Instruction {
   uint32_t imm; /* immediate payload; meaning depends on the opcode */
   int block;    /* branch target block index — NOTE(review): confirm the "no target" sentinel */
};
static_assert(sizeof(SOPP_instruction) == sizeof(Instruction) + 8, "Unexpected padding");
1321 
/* No format-specific fields; padding only keeps the size as asserted. */
struct SOPC_instruction : public Instruction {
   uint32_t padding;
};
static_assert(sizeof(SOPC_instruction) == sizeof(Instruction) + 4, "Unexpected padding");
1326 
/* No format-specific fields. */
struct SOP1_instruction : public Instruction {};
static_assert(sizeof(SOP1_instruction) == sizeof(Instruction) + 0, "Unexpected padding");
1329 
/* No format-specific fields; padding only keeps the size as asserted. */
struct SOP2_instruction : public Instruction {
   uint32_t padding;
};
static_assert(sizeof(SOP2_instruction) == sizeof(Instruction) + 4, "Unexpected padding");
1334 
/**
 * Scalar Memory Format:
 * For s_(buffer_)load_dword*:
 * Operand(0): SBASE - SGPR-pair which provides base address
 * Operand(1): Offset - immediate (un)signed offset or SGPR
 * Operand(2) / Definition(0): SDATA - SGPR for read / write result
 * Operand(n-1): SOffset - SGPR offset (Vega only)
 *
 * Having no operands is also valid for instructions such as s_dcache_inv.
 *
 */
struct SMEM_instruction : public Instruction {
   memory_sync_info sync;
   bool glc : 1; /* VI+: globally coherent */
   bool dlc : 1; /* NAVI: device level coherent */
   bool nv : 1;  /* VEGA only: Non-volatile */
   bool disable_wqm : 1; /* Require an exec mask without helper invocations */
   uint8_t padding : 4;
};
static_assert(sizeof(SMEM_instruction) == sizeof(Instruction) + 4, "Unexpected padding");
1355 
struct VALU_instruction : public Instruction {
   /* All members below are views of one shared 32-bit word; the template
    * parameters are <storage type, bit offset, bit count>. Which views are
    * meaningful depends on the encoding, noted per member. */
   union {
      bitfield_array8<uint32_t, 0, 3> neg;    /* VOP3, SDWA, DPP16, v_fma_mix, VINTERP_inreg */
      bitfield_array8<uint32_t, 0, 3> neg_lo; /* VOP3P */

      bitfield_array8<uint32_t, 3, 3> abs;    /* VOP3, SDWA, DPP16, v_fma_mix */
      bitfield_array8<uint32_t, 3, 3> neg_hi; /* VOP3P */

      bitfield_array8<uint32_t, 6, 4> opsel;     /* VOP3, VOPC12(GFX11+), VINTERP_inreg */
      bitfield_uint8<uint32_t, 10, 2> omod;      /* VOP3, SDWA(GFX9+) */
      bitfield_array8<uint32_t, 12, 3> opsel_lo; /* VOP3P */
      bitfield_array8<uint32_t, 15, 3> opsel_hi; /* VOP3P */
      bitfield_bool<uint32_t, 18> clamp;         /* VOP3, VOP3P, SDWA, VINTERP_inreg */
   };

   /* Defined out of line; presumably also swaps the per-operand modifier
    * bits above — confirm in the implementation. */
   void swapOperands(unsigned idx0, unsigned idx1);
};
static_assert(sizeof(VALU_instruction) == sizeof(Instruction) + 4, "Unexpected padding");
1374 
struct VINTERP_inreg_instruction : public VALU_instruction {
   uint8_t wait_exp : 3; /* wait count — NOTE(review): confirm against the GFX11 VINTERP encoding */
   uint8_t padding3 : 5;
   uint8_t padding4;
   uint8_t padding5;
   uint8_t padding6;
};
static_assert(sizeof(VINTERP_inreg_instruction) == sizeof(VALU_instruction) + 4,
              "Unexpected padding");
1384 
struct VOPD_instruction : public VALU_instruction {
   aco_opcode opy; /* opcode of the second (Y) half of the dual-issue pair — inferred from the name */
   uint16_t padding;
};
static_assert(sizeof(VOPD_instruction) == sizeof(VALU_instruction) + 4, "Unexpected padding");
1390 
/**
 * Data Parallel Primitives Format:
 * This format can be used for VOP1, VOP2 or VOPC instructions.
 * The swizzle applies to the src0 operand.
 *
 */
struct DPP16_instruction : public VALU_instruction {
   uint16_t dpp_ctrl; /* hardware DPP control code selecting the lane swizzle — see the ISA */
   uint8_t row_mask : 4;
   uint8_t bank_mask : 4;
   bool bound_ctrl : 1;
   uint8_t fetch_inactive : 1;
   uint8_t padding3 : 6;
};
static_assert(sizeof(DPP16_instruction) == sizeof(VALU_instruction) + 4, "Unexpected padding");
1406 
struct DPP8_instruction : public VALU_instruction {
   uint32_t lane_sel : 24; /* presumably eight 3-bit source-lane selectors — confirm against the ISA */
   uint32_t fetch_inactive : 1;
   uint32_t padding : 7;
};
static_assert(sizeof(DPP8_instruction) == sizeof(VALU_instruction) + 4, "Unexpected padding");
1413 
/* Describes which bytes/words of a dword register an access reads or writes.
 * Encoding: bits [1:0] hold the byte offset, bits [4:2] the size in bytes,
 * and bit 5 whether the value is sign-extended. */
struct SubdwordSel {
   enum sdwa_sel : uint8_t {
      ubyte = 0x4,
      uword = 0x8,
      dword = 0x10,
      sext = 0x20,
      sbyte = ubyte | sext,
      sword = uword | sext,

      ubyte0 = ubyte,
      ubyte1 = ubyte | 1,
      ubyte2 = ubyte | 2,
      ubyte3 = ubyte | 3,
      sbyte0 = sbyte,
      sbyte1 = sbyte | 1,
      sbyte2 = sbyte | 2,
      sbyte3 = sbyte | 3,
      uword0 = uword,
      uword1 = uword | 2,
      sword0 = sword,
      sword1 = sword | 2,
   };

   /* Default-constructed selection is "none" (all bits clear). */
   SubdwordSel() : sel((sdwa_sel)0) {}
   constexpr SubdwordSel(sdwa_sel v) : sel(v) {}
   /* Build a selection from its components. */
   constexpr SubdwordSel(unsigned size, unsigned offset, bool sign_extend)
       : sel((sdwa_sel)((sign_extend ? sext : 0) | size << 2 | offset))
   {}
   constexpr operator sdwa_sel() const { return sel; }
   /* True when any selection is set. */
   explicit operator bool() const { return sel != 0; }

   constexpr unsigned size() const { return (sel >> 2) & 0x7; }
   constexpr unsigned offset() const { return sel & 0x3; }
   constexpr bool sign_extend() const { return (sel & sext) != 0; }
   /* Translate to the packed SDWA 'sel' field for a register starting at
    * reg_byte_offset: values 0-3 select a byte, 4-5 a word, 6 the dword. */
   constexpr unsigned to_sdwa_sel(unsigned reg_byte_offset) const
   {
      const unsigned byte = reg_byte_offset + offset();
      switch (size()) {
      case 1: return byte;
      case 2: return 4 + (byte >> 1);
      default: return 6;
      }
   }

private:
   sdwa_sel sel;
};
1462 
/**
 * Sub-Dword Addressing Format:
 * This format can be used for VOP1, VOP2 or VOPC instructions.
 *
 * omod and SGPR/constant operands are only available on GFX9+. For VOPC,
 * the definition doesn't have to be VCC on GFX9+.
 *
 */
struct SDWA_instruction : public VALU_instruction {
   /* these destination modifiers aren't available with VOPC except for
    * clamp on GFX8 */
   SubdwordSel sel[2];      /* sub-dword selection for the first two source operands */
   SubdwordSel dst_sel;     /* sub-dword selection for the destination */
   uint8_t padding3;
};
static_assert(sizeof(SDWA_instruction) == sizeof(VALU_instruction) + 4, "Unexpected padding");
1479 
struct VINTRP_instruction : public Instruction {
   uint8_t attribute; /* attribute index — inferred from the name */
   uint8_t component; /* attribute channel — inferred from the name */
   uint16_t padding;
};
static_assert(sizeof(VINTRP_instruction) == sizeof(Instruction) + 4, "Unexpected padding");
1486 
/**
 * Local and Global Data Sharing instructions
 * Operand(0): ADDR - VGPR which supplies the address.
 * Operand(1): DATA0 - First data VGPR.
 * Operand(2): DATA1 - Second data VGPR.
 * Operand(n-1): M0 - LDS size.
 * Definition(0): VDST - Destination VGPR when results returned to VGPRs.
 *
 */
struct DS_instruction : public Instruction {
   memory_sync_info sync;
   bool gds;         /* access GDS instead of LDS — inferred from the name */
   uint16_t offset0; /* immediate offset; NOTE(review): confirm split 8-bit usage per opcode */
   uint8_t offset1;
   uint8_t padding;
};
static_assert(sizeof(DS_instruction) == sizeof(Instruction) + 8, "Unexpected padding");
1504 
/**
 * LDS Direct instructions
 * Operand(0): M0
 * Definition(0): VDST - Destination VGPR
 */
struct LDSDIR_instruction : public Instruction {
   memory_sync_info sync;
   uint8_t attr : 6;      /* attribute index — inferred from the name */
   uint8_t attr_chan : 2; /* attribute channel — inferred from the name */
   uint32_t wait_vdst : 4; /* VDST wait count — NOTE(review): confirm against the GFX11 encoding */
   uint32_t padding : 28;
};
static_assert(sizeof(LDSDIR_instruction) == sizeof(Instruction) + 8, "Unexpected padding");
1518 
/**
 * Vector Memory Untyped-buffer Instructions
 * Operand(0): SRSRC - Specifies which SGPR supplies T# (resource constant)
 * Operand(1): VADDR - Address source. Can carry an index and/or offset
 * Operand(2): SOFFSET - SGPR to supply unsigned byte offset. (SGPR, M0, or inline constant)
 * Operand(3) / Definition(0): VDATA - Vector GPR for write result / read data
 *
 */
struct MUBUF_instruction : public Instruction {
   memory_sync_info sync;
   bool offen : 1;           /* Supply an offset from VGPR (VADDR) */
   bool idxen : 1;           /* Supply an index from VGPR (VADDR) */
   bool addr64 : 1;          /* SI, CIK: Address size is 64-bit */
   bool glc : 1;             /* globally coherent */
   bool dlc : 1;             /* NAVI: device level coherent */
   bool slc : 1;             /* system level coherent */
   bool tfe : 1;             /* texture fail enable */
   bool lds : 1;             /* Return read-data to LDS instead of VGPRs */
   uint16_t disable_wqm : 1; /* Require an exec mask without helper invocations */
   uint16_t offset : 12;     /* Unsigned byte offset - 12 bit */
   uint16_t swizzled : 1;    /* swizzled buffer access — NOTE(review): inferred from the name */
   uint16_t padding0 : 2;
   uint16_t padding1;
};
static_assert(sizeof(MUBUF_instruction) == sizeof(Instruction) + 8, "Unexpected padding");
1544 
/**
 * Vector Memory Typed-buffer Instructions
 * Operand(0): SRSRC - Specifies which SGPR supplies T# (resource constant)
 * Operand(1): VADDR - Address source. Can carry an index and/or offset
 * Operand(2): SOFFSET - SGPR to supply unsigned byte offset. (SGPR, M0, or inline constant)
 * Operand(3) / Definition(0): VDATA - Vector GPR for write result / read data
 *
 * (The flag fields mirror MUBUF_instruction; see there for their meaning.)
 */
struct MTBUF_instruction : public Instruction {
   memory_sync_info sync;
   uint8_t dfmt : 4;         /* Data Format of data in memory buffer */
   uint8_t nfmt : 3;         /* Numeric format of data in memory */
   bool offen : 1;           /* Supply an offset from VGPR (VADDR) */
   uint16_t idxen : 1;       /* Supply an index from VGPR (VADDR) */
   uint16_t glc : 1;         /* globally coherent */
   uint16_t dlc : 1;         /* NAVI: device level coherent */
   uint16_t slc : 1;         /* system level coherent */
   uint16_t tfe : 1;         /* texture fail enable */
   uint16_t disable_wqm : 1; /* Require an exec mask without helper invocations */
   uint16_t padding : 10;
   uint16_t offset; /* Unsigned byte offset - 12 bit */
};
static_assert(sizeof(MTBUF_instruction) == sizeof(Instruction) + 8, "Unexpected padding");
1568 
/**
 * Vector Memory Image Instructions
 * Operand(0) SRSRC - Scalar GPR that specifies the resource constant.
 * Operand(1): SSAMP - Scalar GPR that specifies sampler constant.
 * Operand(2): VDATA - Vector GPR for write data or zero if TFE/LWE=1.
 * Operand(3): VADDR - Address source. Can carry an offset or an index.
 * Definition(0): VDATA - Vector GPR for read result.
 *
 */
struct MIMG_instruction : public Instruction {
   memory_sync_info sync;
   uint8_t dmask;        /* Data VGPR enable mask */
   uint8_t dim : 3;      /* NAVI: dimensionality */
   bool unrm : 1;        /* Force address to be un-normalized */
   bool dlc : 1;         /* NAVI: device level coherent */
   bool glc : 1;         /* globally coherent */
   bool slc : 1;         /* system level coherent */
   bool tfe : 1;         /* texture fail enable */
   bool da : 1;          /* declare an array */
   bool lwe : 1;         /* LOD warning enable */
   bool r128 : 1;        /* NAVI: Texture resource size */
   bool a16 : 1;         /* VEGA, NAVI: Address components are 16-bits */
   bool d16 : 1;         /* Convert 32-bit data to 16-bit data */
   bool disable_wqm : 1; /* Require an exec mask without helper invocations */
   bool strict_wqm : 1;  /* VADDR is a linear VGPR and additional VGPRs may be copied into it */
   uint8_t padding0 : 1; /* unused; keeps sizeof as asserted below */
   uint8_t padding1;
   uint8_t padding2;
};
static_assert(sizeof(MIMG_instruction) == sizeof(Instruction) + 8, "Unexpected padding");
1599 
/**
 * Flat/Scratch/Global Instructions
 * Operand(0): ADDR
 * Operand(1): SADDR
 * Operand(2) / Definition(0): DATA/VDST
 *
 */
struct FLAT_instruction : public Instruction {
   memory_sync_info sync;
   bool slc : 1; /* system level coherent */
   bool glc : 1; /* globally coherent */
   bool dlc : 1; /* NAVI: device level coherent */
   bool lds : 1; /* return read-data to LDS instead of VGPRs (cf. MUBUF lds) — confirm */
   bool nv : 1;  /* non-volatile (cf. SMEM nv) — confirm */
   bool disable_wqm : 1; /* Require an exec mask without helper invocations */
   uint8_t padding0 : 2;
   int16_t offset; /* Vega/Navi only */
   uint16_t padding1;
};
static_assert(sizeof(FLAT_instruction) == sizeof(Instruction) + 8, "Unexpected padding");
1620 
struct Export_instruction : public Instruction {
   uint8_t enabled_mask; /* per-component write-enable mask — inferred from the name */
   uint8_t dest;         /* export target — NOTE(review): confirm encoding against the ISA */
   bool compressed : 1;
   bool done : 1;
   bool valid_mask : 1;
   bool row_en : 1;
   uint8_t padding0 : 4;
   uint8_t padding1;
};
static_assert(sizeof(Export_instruction) == sizeof(Instruction) + 4, "Unexpected padding");
1632 
struct Pseudo_instruction : public Instruction {
   PhysReg scratch_sgpr; /* might not be valid if it's not needed */
   bool tmp_in_scc;      /* NOTE(review): presumably whether lowering uses scc as a temporary — confirm */
   uint8_t padding;
};
static_assert(sizeof(Pseudo_instruction) == sizeof(Instruction) + 4, "Unexpected padding");
1639 
/* Branch pseudo-instruction; targets are block indices. */
struct Pseudo_branch_instruction : public Instruction {
   /* target[0] is the block index of the branch target.
    * For conditional branches, target[1] contains the fall-through alternative.
    * A value of 0 means the target has not been initialized (BB0 cannot be a branch target).
    */
   uint32_t target[2];

   /* Indicates that selection control prefers to remove this instruction if possible.
    * This is set when the branch is divergent and always taken, or flattened.
    */
   bool selection_control_remove;
};
static_assert(sizeof(Pseudo_branch_instruction) == sizeof(Instruction) + 12, "Unexpected padding");
1653 
/* Barrier pseudo instruction: memory synchronization info plus an execution scope. */
struct Pseudo_barrier_instruction : public Instruction {
   memory_sync_info sync;
   sync_scope exec_scope;
};
static_assert(sizeof(Pseudo_barrier_instruction) == sizeof(Instruction) + 4, "Unexpected padding");
1659 
/* Subgroup reduction operations, named <operation><bitsize>.
 * Not every combination exists (e.g. float ops have no 8-bit variants).
 */
enum ReduceOp : uint16_t {
   // clang-format off
   iadd8, iadd16, iadd32, iadd64,
   imul8, imul16, imul32, imul64,
          fadd16, fadd32, fadd64,
          fmul16, fmul32, fmul64,
   imin8, imin16, imin32, imin64,
   imax8, imax16, imax32, imax64,
   umin8, umin16, umin32, umin64,
   umax8, umax16, umax32, umax64,
          fmin16, fmin32, fmin64,
          fmax16, fmax32, fmax64,
   iand8, iand16, iand32, iand64,
   ior8, ior16, ior32, ior64,
   ixor8, ixor16, ixor32, ixor64,
   num_reduce_ops,
   // clang-format on
};
1678 
/**
 * Subgroup Reduction Instructions, everything except for the data to be
 * reduced and the result as inserted by setup_reduce_temp().
 * Operand(0): data to be reduced
 * Operand(1): reduce temporary
 * Operand(2): vector temporary
 * Definition(0): result
 * Definition(1): scalar temporary
 * Definition(2): scalar identity temporary (not used to store identity on GFX10)
 * Definition(3): scc clobber
 * Definition(4): vcc clobber
 *
 */
struct Pseudo_reduction_instruction : public Instruction {
   ReduceOp reduce_op;    /* which reduction to perform (see ReduceOp) */
   uint16_t cluster_size; // must be 0 for scans
};
static_assert(sizeof(Pseudo_reduction_instruction) == sizeof(Instruction) + 4,
              "Unexpected padding");
1698 
1699 inline bool
accessesLDS()1700 Instruction::accessesLDS() const noexcept
1701 {
1702    return (isDS() && !ds().gds) || isLDSDIR() || isVINTRP();
1703 }
1704 
/* Swap operands idx0 and idx1 together with all per-operand modifier bits
 * (neg/abs/opsel/opsel_lo/opsel_hi) so the instruction's meaning is preserved.
 */
inline void
VALU_instruction::swapOperands(unsigned idx0, unsigned idx1)
{
   if (this->isSDWA() && idx0 != idx1) {
      /* SDWA only stores selections for the first two operands. */
      assert(idx0 < 2 && idx1 < 2);
      std::swap(this->sdwa().sel[0], this->sdwa().sel[1]);
   }
   assert(idx0 < 3 && idx1 < 3);
   std::swap(this->operands[idx0], this->operands[idx1]);
   this->neg[idx0].swap(this->neg[idx1]);
   this->abs[idx0].swap(this->abs[idx1]);
   this->opsel[idx0].swap(this->opsel[idx1]);
   this->opsel_lo[idx0].swap(this->opsel_lo[idx1]);
   this->opsel_hi[idx0].swap(this->opsel_hi[idx1]);
}
1720 
/* Per-thread arena that create_instruction() allocates from. */
extern thread_local aco::monotonic_buffer_resource* instruction_buffer;
1722 
struct instr_deleter_functor {
   /* Deliberately a no-op: instructions live in a monotonic buffer that is
    * released wholesale once compilation has finished.
    */
   void operator()(void*) {}
};

template <typename T> using aco_ptr = std::unique_ptr<T, instr_deleter_functor>;
1731 
/* Allocate and zero-initialize an instruction of type T from the thread-local
 * instruction_buffer, with storage for num_operands operands and
 * num_definitions definitions placed directly behind the struct. The
 * operand/definition spans store offsets relative to their own member
 * addresses (see aco::span).
 */
template <typename T>
T*
create_instruction(aco_opcode opcode, Format format, uint32_t num_operands,
                   uint32_t num_definitions)
{
   /* One allocation covers the instruction plus its trailing operand and
    * definition arrays. */
   std::size_t size =
      sizeof(T) + num_operands * sizeof(Operand) + num_definitions * sizeof(Definition);
   void* data = instruction_buffer->allocate(size, alignof(uint32_t));
   memset(data, 0, size);
   T* inst = (T*)data;

   inst->opcode = opcode;
   inst->format = format;

   /* Offset from the operands member to the end of T, where the operand
    * array begins; definitions follow immediately after the operands. */
   uint16_t operands_offset = sizeof(T) - offsetof(Instruction, operands);
   inst->operands = aco::span<Operand>(operands_offset, num_operands);
   uint16_t definitions_offset = (char*)inst->operands.end() - (char*)&inst->definitions;
   inst->definitions = aco::span<Definition>(definitions_offset, num_definitions);

   return inst;
}
1753 
1754 constexpr bool
usesModifiers()1755 Instruction::usesModifiers() const noexcept
1756 {
1757    if (isDPP() || isSDWA())
1758       return true;
1759 
1760    if (isVOP3P()) {
1761       const VALU_instruction& vop3p = this->valu();
1762       /* opsel_hi must be 1 to not be considered a modifier - even for constants */
1763       return vop3p.opsel_lo || vop3p.clamp || vop3p.neg_lo || vop3p.neg_hi ||
1764              (vop3p.opsel_hi & BITFIELD_MASK(operands.size())) != BITFIELD_MASK(operands.size());
1765    } else if (isVALU()) {
1766       const VALU_instruction& vop3 = this->valu();
1767       return vop3.opsel || vop3.clamp || vop3.omod || vop3.abs || vop3.neg;
1768    }
1769    return false;
1770 }
1771 
1772 constexpr bool
is_phi(Instruction * instr)1773 is_phi(Instruction* instr)
1774 {
1775    return instr->opcode == aco_opcode::p_phi || instr->opcode == aco_opcode::p_linear_phi;
1776 }
1777 
1778 static inline bool
is_phi(aco_ptr<Instruction> & instr)1779 is_phi(aco_ptr<Instruction>& instr)
1780 {
1781    return is_phi(instr.get());
1782 }
1783 
/* Returns the memory_sync_info of "instr", regardless of instruction format. */
memory_sync_info get_sync_info(const Instruction* instr);
1785 
1786 inline bool
is_dead(const std::vector<uint16_t> & uses,const Instruction * instr)1787 is_dead(const std::vector<uint16_t>& uses, const Instruction* instr)
1788 {
1789    if (instr->definitions.empty() || instr->isBranch() || instr->opcode == aco_opcode::p_startpgm ||
1790        instr->opcode == aco_opcode::p_init_scratch ||
1791        instr->opcode == aco_opcode::p_dual_src_export_gfx11)
1792       return false;
1793 
1794    if (std::any_of(instr->definitions.begin(), instr->definitions.end(),
1795                    [&uses](const Definition& def) { return !def.isTemp() || uses[def.tempId()]; }))
1796       return false;
1797 
1798    return !(get_sync_info(instr).semantics & (semantic_volatile | semantic_acqrel));
1799 }
1800 
/* Queries about which modifiers/encodings an opcode supports on a given
 * hardware generation. */
bool can_use_input_modifiers(amd_gfx_level gfx_level, aco_opcode op, int idx);
bool can_use_opsel(amd_gfx_level gfx_level, aco_opcode op, int idx);
bool instr_is_16bit(amd_gfx_level gfx_level, aco_opcode op);
uint8_t get_gfx11_true16_mask(aco_opcode op);
bool can_use_SDWA(amd_gfx_level gfx_level, const aco_ptr<Instruction>& instr, bool pre_ra);
bool can_use_DPP(amd_gfx_level gfx_level, const aco_ptr<Instruction>& instr, bool dpp8);
bool can_write_m0(const aco_ptr<Instruction>& instr);
/* updates "instr" and returns the old instruction (or NULL if no update was needed) */
aco_ptr<Instruction> convert_to_SDWA(amd_gfx_level gfx_level, aco_ptr<Instruction>& instr);
aco_ptr<Instruction> convert_to_DPP(amd_gfx_level gfx_level, aco_ptr<Instruction>& instr,
                                    bool dpp8);
bool needs_exec_mask(const Instruction* instr);

/* Opcode transformations, mainly for comparison instructions. */
aco_opcode get_ordered(aco_opcode op);
aco_opcode get_unordered(aco_opcode op);
aco_opcode get_inverse(aco_opcode op);
aco_opcode get_swapped(aco_opcode op);
aco_opcode get_f32_cmp(aco_opcode op);
aco_opcode get_vcmpx(aco_opcode op);
unsigned get_cmp_bitsize(aco_opcode op);
bool is_fp_cmp(aco_opcode op);
bool is_cmpx(aco_opcode op);

bool can_swap_operands(aco_ptr<Instruction>& instr, aco_opcode* new_op, unsigned idx0 = 0,
                       unsigned idx1 = 1);

uint32_t get_reduction_identity(ReduceOp op, unsigned idx);

unsigned get_mimg_nsa_dwords(const Instruction* instr);

unsigned get_vopd_opy_start(const Instruction* instr);

unsigned get_operand_size(aco_ptr<Instruction>& instr, unsigned index);

bool should_form_clause(const Instruction* a, const Instruction* b);
1836 
/* Bit flags classifying a basic block (presumably stored in Block::kind);
 * several flags can be set on the same block. */
enum block_kind {
   /* uniform indicates that leaving this block,
    * all actives lanes stay active */
   block_kind_uniform = 1 << 0,
   block_kind_top_level = 1 << 1,
   block_kind_loop_preheader = 1 << 2,
   block_kind_loop_header = 1 << 3,
   block_kind_loop_exit = 1 << 4,
   block_kind_continue = 1 << 5,
   block_kind_break = 1 << 6,
   block_kind_continue_or_break = 1 << 7,
   block_kind_branch = 1 << 8,
   block_kind_merge = 1 << 9,
   block_kind_invert = 1 << 10,
   block_kind_discard_early_exit = 1 << 11,
   block_kind_uses_discard = 1 << 12,
   block_kind_resume = 1 << 13,
   block_kind_export_end = 1 << 14,
   block_kind_end_with_regs = 1 << 15,
};
1857 
1858 struct RegisterDemand {
1859    constexpr RegisterDemand() = default;
RegisterDemandRegisterDemand1860    constexpr RegisterDemand(const int16_t v, const int16_t s) noexcept : vgpr{v}, sgpr{s} {}
1861    int16_t vgpr = 0;
1862    int16_t sgpr = 0;
1863 
1864    constexpr friend bool operator==(const RegisterDemand a, const RegisterDemand b) noexcept
1865    {
1866       return a.vgpr == b.vgpr && a.sgpr == b.sgpr;
1867    }
1868 
exceedsRegisterDemand1869    constexpr bool exceeds(const RegisterDemand other) const noexcept
1870    {
1871       return vgpr > other.vgpr || sgpr > other.sgpr;
1872    }
1873 
1874    constexpr RegisterDemand operator+(const Temp t) const noexcept
1875    {
1876       if (t.type() == RegType::sgpr)
1877          return RegisterDemand(vgpr, sgpr + t.size());
1878       else
1879          return RegisterDemand(vgpr + t.size(), sgpr);
1880    }
1881 
1882    constexpr RegisterDemand operator+(const RegisterDemand other) const noexcept
1883    {
1884       return RegisterDemand(vgpr + other.vgpr, sgpr + other.sgpr);
1885    }
1886 
1887    constexpr RegisterDemand operator-(const RegisterDemand other) const noexcept
1888    {
1889       return RegisterDemand(vgpr - other.vgpr, sgpr - other.sgpr);
1890    }
1891 
1892    constexpr RegisterDemand& operator+=(const RegisterDemand other) noexcept
1893    {
1894       vgpr += other.vgpr;
1895       sgpr += other.sgpr;
1896       return *this;
1897    }
1898 
1899    constexpr RegisterDemand& operator-=(const RegisterDemand other) noexcept
1900    {
1901       vgpr -= other.vgpr;
1902       sgpr -= other.sgpr;
1903       return *this;
1904    }
1905 
1906    constexpr RegisterDemand& operator+=(const Temp t) noexcept
1907    {
1908       if (t.type() == RegType::sgpr)
1909          sgpr += t.size();
1910       else
1911          vgpr += t.size();
1912       return *this;
1913    }
1914 
1915    constexpr RegisterDemand& operator-=(const Temp t) noexcept
1916    {
1917       if (t.type() == RegType::sgpr)
1918          sgpr -= t.size();
1919       else
1920          vgpr -= t.size();
1921       return *this;
1922    }
1923 
updateRegisterDemand1924    constexpr void update(const RegisterDemand other) noexcept
1925    {
1926       vgpr = std::max(vgpr, other.vgpr);
1927       sgpr = std::max(sgpr, other.sgpr);
1928    }
1929 };
1930 
1931 /* CFG */
/* CFG */
struct Block {
   float_mode fp_mode;
   unsigned index;      /* position of this block in Program::blocks */
   unsigned offset = 0;
   std::vector<aco_ptr<Instruction>> instructions;
   /* Predecessor/successor block indices; kept separately for the logical
    * and linear CFG. NOTE(review): the exact logical-vs-linear distinction
    * is not visible here — confirm against CFG construction. */
   std::vector<unsigned> logical_preds;
   std::vector<unsigned> linear_preds;
   std::vector<unsigned> logical_succs;
   std::vector<unsigned> linear_succs;
   RegisterDemand register_demand = RegisterDemand();
   uint32_t kind = 0;            /* mask of block_kind flags */
   int32_t logical_idom = -1;    /* immediate dominator index; -1 = not computed */
   int32_t linear_idom = -1;     /* immediate dominator index; -1 = not computed */
   uint16_t loop_nest_depth = 0;
   uint16_t divergent_if_logical_depth = 0;
   uint16_t uniform_if_depth = 0;

   /* this information is needed for predecessors to blocks with phis when
    * moving out of ssa */
   bool scc_live_out = false;

   Block() : index(0) {}
};
1955 
/*
 * Shader stages as provided in Vulkan by the application. Contrast this to HWStage.
 * Values are single bits so that merged stages can be expressed as a mask.
 */
enum class SWStage : uint16_t {
   None = 0,
   VS = 1 << 0,  /* Vertex Shader */
   GS = 1 << 1,  /* Geometry Shader */
   TCS = 1 << 2, /* Tessellation Control aka Hull Shader */
   TES = 1 << 3, /* Tessellation Evaluation aka Domain Shader */
   FS = 1 << 4,  /* Fragment aka Pixel Shader */
   CS = 1 << 5,  /* Compute Shader */
   TS = 1 << 6,  /* Task Shader */
   MS = 1 << 7,  /* Mesh Shader */
   RT = 1 << 8,  /* Raytracing Shader */

   /* Stage combinations merged to run on a single HWStage */
   VS_GS = VS | GS,
   VS_TCS = VS | TCS,
   TES_GS = TES | GS,
};
1976 
1977 constexpr SWStage
1978 operator|(SWStage a, SWStage b)
1979 {
1980    return static_cast<SWStage>(static_cast<uint16_t>(a) | static_cast<uint16_t>(b));
1981 }
1982 
1983 /*
1984  * Set of SWStages to be merged into a single shader paired with the
1985  * HWStage it will run on.
1986  */
1987 struct Stage {
1988    constexpr Stage() = default;
1989 
StageStage1990    explicit constexpr Stage(ac_hw_stage hw_, SWStage sw_) : sw(sw_), hw(hw_) {}
1991 
1992    /* Check if the given SWStage is included */
hasStage1993    constexpr bool has(SWStage stage) const
1994    {
1995       return (static_cast<uint16_t>(sw) & static_cast<uint16_t>(stage));
1996    }
1997 
num_sw_stagesStage1998    unsigned num_sw_stages() const { return util_bitcount(static_cast<uint16_t>(sw)); }
1999 
2000    constexpr bool operator==(const Stage& other) const { return sw == other.sw && hw == other.hw; }
2001 
2002    constexpr bool operator!=(const Stage& other) const { return sw != other.sw || hw != other.hw; }
2003 
2004    /* Mask of merged software stages */
2005    SWStage sw = SWStage::None;
2006 
2007    /* Active hardware stage */
2008    ac_hw_stage hw{};
2009 };
2010 
/* possible settings of Program::stage
 * Naming convention: <software stage(s)>_<hardware stage suffix>. */
static constexpr Stage vertex_vs(AC_HW_VERTEX_SHADER, SWStage::VS);
static constexpr Stage fragment_fs(AC_HW_PIXEL_SHADER, SWStage::FS);
static constexpr Stage compute_cs(AC_HW_COMPUTE_SHADER, SWStage::CS);
static constexpr Stage tess_eval_vs(AC_HW_VERTEX_SHADER, SWStage::TES);
/* Mesh shading pipeline */
static constexpr Stage task_cs(AC_HW_COMPUTE_SHADER, SWStage::TS);
static constexpr Stage mesh_ngg(AC_HW_NEXT_GEN_GEOMETRY_SHADER, SWStage::MS);
/* GFX10/NGG */
static constexpr Stage vertex_ngg(AC_HW_NEXT_GEN_GEOMETRY_SHADER, SWStage::VS);
static constexpr Stage vertex_geometry_ngg(AC_HW_NEXT_GEN_GEOMETRY_SHADER, SWStage::VS_GS);
static constexpr Stage tess_eval_ngg(AC_HW_NEXT_GEN_GEOMETRY_SHADER, SWStage::TES);
static constexpr Stage tess_eval_geometry_ngg(AC_HW_NEXT_GEN_GEOMETRY_SHADER, SWStage::TES_GS);
/* GFX9 (and GFX10 if NGG isn't used) */
static constexpr Stage vertex_geometry_gs(AC_HW_LEGACY_GEOMETRY_SHADER, SWStage::VS_GS);
static constexpr Stage vertex_tess_control_hs(AC_HW_HULL_SHADER, SWStage::VS_TCS);
static constexpr Stage tess_eval_geometry_gs(AC_HW_LEGACY_GEOMETRY_SHADER, SWStage::TES_GS);
/* pre-GFX9 */
static constexpr Stage vertex_ls(AC_HW_LOCAL_SHADER,
                                 SWStage::VS); /* vertex before tessellation control */
static constexpr Stage vertex_es(AC_HW_EXPORT_SHADER, SWStage::VS); /* vertex before geometry */
static constexpr Stage tess_control_hs(AC_HW_HULL_SHADER, SWStage::TCS);
static constexpr Stage tess_eval_es(AC_HW_EXPORT_SHADER,
                                    SWStage::TES); /* tessellation evaluation before geometry */
static constexpr Stage geometry_gs(AC_HW_LEGACY_GEOMETRY_SHADER, SWStage::GS);
/* Raytracing */
static constexpr Stage raytracing_cs(AC_HW_COMPUTE_SHADER, SWStage::RT);
2038 
/* Hardware limits and feature flags of the target device. */
struct DeviceInfo {
   uint16_t lds_encoding_granule;
   uint16_t lds_alloc_granule;
   uint32_t lds_limit; /* in bytes */
   bool has_16bank_lds;
   uint16_t physical_sgprs;
   uint16_t physical_vgprs;
   uint16_t vgpr_limit;
   uint16_t sgpr_limit;
   uint16_t sgpr_alloc_granule;
   uint16_t vgpr_alloc_granule;
   unsigned scratch_alloc_granule;
   uint16_t max_waves_per_simd;
   unsigned simd_per_cu;
   bool has_fast_fma32 = false;
   bool has_mac_legacy32 = false;
   bool fused_mad_mix = false;
   bool xnack_enabled = false;
   bool sram_ecc_enabled = false;

   /* NOTE(review): presumably the encodable immediate offset range for
    * scratch/global instructions — confirm against the ISA. */
   int16_t scratch_global_offset_min;
   int16_t scratch_global_offset_max;
   unsigned max_nsa_vgprs;
};
2063 
/* Marks which major compilation phase has last completed. */
enum class CompilationProgress {
   after_isel,
   after_spilling,
   after_ra,
};
2069 
/* All state of a single shader compilation: the CFG, per-temp register
 * classes, target/device information and pass bookkeeping. */
class Program final {
public:
   aco::monotonic_buffer_resource m{65536};
   std::vector<Block> blocks;
   /* Register class of each temporary, indexed by temp id. */
   std::vector<RegClass> temp_rc = {s1};
   RegisterDemand max_reg_demand = RegisterDemand();
   ac_shader_config* config;
   struct aco_shader_info info;
   enum amd_gfx_level gfx_level;
   enum radeon_family family;
   DeviceInfo dev;
   unsigned wave_size;
   RegClass lane_mask;
   Stage stage;
   bool needs_exact = false; /* there exists an instruction with disable_wqm = true */
   bool needs_wqm = false;   /* there exists a p_wqm instruction */
   bool has_smem_buffer_or_global_loads = false;
   bool has_pops_overlapped_waves_wait = false;
   bool has_color_exports = false;
   bool is_prolog = false;

   std::vector<uint8_t> constant_data;
   Temp private_segment_buffer;
   Temp scratch_offset;

   uint16_t num_waves = 0;
   uint16_t min_waves = 0;
   unsigned workgroup_size; /* if known; otherwise UINT_MAX */
   bool wgp_mode;

   bool needs_vcc = false;

   CompilationProgress progress;

   bool collect_statistics = false;
   uint32_t statistics[aco_num_statistics];

   /* State applied to newly inserted blocks (see insert_block()). */
   float_mode next_fp_mode;
   unsigned next_loop_depth = 0;
   unsigned next_divergent_if_logical_depth = 0;
   unsigned next_uniform_if_depth = 0;

   std::vector<Definition> args_pending_vmem;

   /* For shader part with previous shader part that has lds access. */
   bool pending_lds_access = false;

   /* Callback and destination for compiler diagnostics. */
   struct {
      FILE* output = stderr;
      bool shorten_messages = false;
      void (*func)(void* private_data, enum aco_compiler_debug_level level, const char* message);
      void* private_data;
   } debug;

   /* Allocate a fresh temp id and record its register class.
    * Ids must fit in 24 bits (see the asserts below). */
   uint32_t allocateId(RegClass rc)
   {
      assert(allocationID <= 16777215);
      temp_rc.push_back(rc);
      return allocationID++;
   }

   /* Reserve "amount" consecutive temp ids (register classes left default). */
   void allocateRange(unsigned amount)
   {
      assert(allocationID + amount <= 16777216);
      temp_rc.resize(temp_rc.size() + amount);
      allocationID += amount;
   }

   Temp allocateTmp(RegClass rc) { return Temp(allocateId(rc), rc); }

   /* Next id that would be handed out, without allocating it. */
   uint32_t peekAllocationId() { return allocationID; }

   friend void reindex_ssa(Program* program);
   friend void reindex_ssa(Program* program, std::vector<IDSet>& live_out);

   Block* create_and_insert_block()
   {
      Block block;
      return insert_block(std::move(block));
   }

   /* Append "block" to the CFG, stamping it with the current next_* state. */
   Block* insert_block(Block&& block)
   {
      block.index = blocks.size();
      block.fp_mode = next_fp_mode;
      block.loop_nest_depth = next_loop_depth;
      block.divergent_if_logical_depth = next_divergent_if_logical_depth;
      block.uniform_if_depth = next_uniform_if_depth;
      blocks.emplace_back(std::move(block));
      return &blocks.back();
   }

private:
   uint32_t allocationID = 1;
};
2165 
/* Result of live-variable analysis (see live_var_analysis()). */
struct live {
   /* live temps out per block */
   std::vector<IDSet> live_out;
   /* register demand (sgpr/vgpr) per instruction per block */
   std::vector<std::vector<RegisterDemand>> register_demand;
};

/* Knobs for exercising the register allocator in tests. */
struct ra_test_policy {
   /* Force RA to always use its pessimistic fallback algorithm */
   bool skip_optimistic_path = false;
};
2177 
void init();

void init_program(Program* program, Stage stage, const struct aco_shader_info* info,
                  enum amd_gfx_level gfx_level, enum radeon_family family, bool wgp_mode,
                  ac_shader_config* config);

/* Instruction-selection entry points for the different shader/prolog/epilog kinds. */
void select_program(Program* program, unsigned shader_count, struct nir_shader* const* shaders,
                    ac_shader_config* config, const struct aco_compiler_options* options,
                    const struct aco_shader_info* info, const struct ac_shader_args* args);
void select_trap_handler_shader(Program* program, struct nir_shader* shader,
                                ac_shader_config* config,
                                const struct aco_compiler_options* options,
                                const struct aco_shader_info* info,
                                const struct ac_shader_args* args);
void select_rt_prolog(Program* program, ac_shader_config* config,
                      const struct aco_compiler_options* options,
                      const struct aco_shader_info* info, const struct ac_shader_args* in_args,
                      const struct ac_shader_args* out_args);
void select_vs_prolog(Program* program, const struct aco_vs_prolog_info* pinfo,
                      ac_shader_config* config, const struct aco_compiler_options* options,
                      const struct aco_shader_info* info, const struct ac_shader_args* args);

void select_ps_epilog(Program* program, void* pinfo, ac_shader_config* config,
                      const struct aco_compiler_options* options,
                      const struct aco_shader_info* info, const struct ac_shader_args* args);

void select_tcs_epilog(Program* program, void* pinfo, ac_shader_config* config,
                       const struct aco_compiler_options* options,
                       const struct aco_shader_info* info, const struct ac_shader_args* args);

void select_ps_prolog(Program* program, void* pinfo, ac_shader_config* config,
                      const struct aco_compiler_options* options,
                      const struct aco_shader_info* info, const struct ac_shader_args* args);

/* Analysis, optimization and lowering passes. */
void lower_phis(Program* program);
void calc_min_waves(Program* program);
void update_vgpr_sgpr_demand(Program* program, const RegisterDemand new_demand);
live live_var_analysis(Program* program);
std::vector<uint16_t> dead_code_analysis(Program* program);
void dominator_tree(Program* program);
void insert_exec_mask(Program* program);
void value_numbering(Program* program);
void optimize(Program* program);
void optimize_postRA(Program* program);
void setup_reduce_temp(Program* program);
void lower_to_cssa(Program* program, live& live_vars);
void register_allocation(Program* program, std::vector<IDSet>& live_out_per_block,
                         ra_test_policy = {});
void ssa_elimination(Program* program);
void lower_to_hw_instr(Program* program);
void schedule_program(Program* program, live& live_vars);
void schedule_ilp(Program* program);
void schedule_vopd(Program* program);
void spill(Program* program, live& live_vars);
void insert_wait_states(Program* program);
bool dealloc_vgprs(Program* program);
void insert_NOPs(Program* program);
void form_hard_clauses(Program* program);
unsigned emit_program(Program* program, std::vector<uint32_t>& code,
                      std::vector<struct aco_symbol>* symbols = NULL, bool append_endpgm = true);
/**
 * Returns true if print_asm can disassemble the given program for the current build/runtime
 * configuration
 */
bool check_print_asm_support(Program* program);
bool print_asm(Program* program, std::vector<uint32_t>& binary, unsigned exec_size, FILE* output);
bool validate_ir(Program* program);
bool validate_cfg(Program* program);
bool validate_ra(Program* program);
#ifndef NDEBUG
void perfwarn(Program* program, bool cond, const char* msg, Instruction* instr = NULL);
#else
#define perfwarn(program, cond, msg, ...)                                                          \
   do {                                                                                            \
   } while (0)
#endif

/* Statistics collection hooks for the various compilation stages. */
void collect_presched_stats(Program* program);
void collect_preasm_stats(Program* program);
void collect_postasm_stats(Program* program, const std::vector<uint32_t>& code);
2258 
/* Timing model of a single instruction. */
struct Instruction_cycle_info {
   /* Latency until the result is ready (if not needing a waitcnt) */
   unsigned latency;

   /* How many cycles issuing this instruction takes (i.e. cycles till the next instruction can be
    * issued)*/
   unsigned issue_cycles;
};

/* Look up latency/issue cycles of "instr" for the program's target hardware. */
Instruction_cycle_info get_cycle_info(const Program& program, const Instruction& instr);
2269 
/* Options for the IR pretty-printers below. */
enum print_flags {
   print_no_ssa = 0x1,
   print_perf_info = 0x2,
   print_kill = 0x4,
   print_live_vars = 0x8,
};

void aco_print_operand(const Operand* operand, FILE* output, unsigned flags = 0);
void aco_print_instr(enum amd_gfx_level gfx_level, const Instruction* instr, FILE* output,
                     unsigned flags = 0);
void aco_print_program(const Program* program, FILE* output, unsigned flags = 0);
void aco_print_program(const Program* program, FILE* output, const live& live_vars,
                       unsigned flags = 0);

/* Implementation helpers for the aco_perfwarn()/aco_err() macros below. */
void _aco_perfwarn(Program* program, const char* file, unsigned line, const char* fmt, ...);
void _aco_err(Program* program, const char* file, unsigned line, const char* fmt, ...);

#define aco_perfwarn(program, ...) _aco_perfwarn(program, __FILE__, __LINE__, __VA_ARGS__)
#define aco_err(program, ...)      _aco_err(program, __FILE__, __LINE__, __VA_ARGS__)

int get_op_fixed_to_def(Instruction* instr);

/* utilities for dealing with register demand */
RegisterDemand get_live_changes(aco_ptr<Instruction>& instr);
RegisterDemand get_temp_registers(aco_ptr<Instruction>& instr);
RegisterDemand get_demand_before(RegisterDemand demand, aco_ptr<Instruction>& instr,
                                 aco_ptr<Instruction>& instr_before);

/* number of sgprs that need to be allocated but might not be addressable as s0-s105 */
uint16_t get_extra_sgprs(Program* program);

/* adjust num_waves for workgroup size and LDS limits */
uint16_t max_suitable_waves(Program* program, uint16_t waves);

/* get number of sgprs/vgprs allocated required to address a number of sgprs/vgprs */
uint16_t get_sgpr_alloc(Program* program, uint16_t addressable_sgprs);
uint16_t get_vgpr_alloc(Program* program, uint16_t addressable_vgprs);

/* return number of addressable sgprs/vgprs for max_waves */
uint16_t get_addr_sgpr_from_waves(Program* program, uint16_t max_waves);
uint16_t get_addr_vgpr_from_waves(Program* program, uint16_t max_waves);

bool uses_scratch(Program* program);
2313 
/* Static per-opcode information tables, each indexed by aco_opcode. */
typedef struct {
   /* Hardware opcode encodings per gfx generation.
    * NOTE(review): presumably -1 means the opcode is unavailable on that
    * generation — confirm against the table generator. */
   const int16_t opcode_gfx7[static_cast<int>(aco_opcode::num_opcodes)];
   const int16_t opcode_gfx9[static_cast<int>(aco_opcode::num_opcodes)];
   const int16_t opcode_gfx10[static_cast<int>(aco_opcode::num_opcodes)];
   const int16_t opcode_gfx11[static_cast<int>(aco_opcode::num_opcodes)];
   const std::bitset<static_cast<int>(aco_opcode::num_opcodes)> can_use_input_modifiers;
   const std::bitset<static_cast<int>(aco_opcode::num_opcodes)> can_use_output_modifiers;
   const std::bitset<static_cast<int>(aco_opcode::num_opcodes)> is_atomic;
   const char* name[static_cast<int>(aco_opcode::num_opcodes)];
   const aco::Format format[static_cast<int>(aco_opcode::num_opcodes)];
   /* sizes used for input/output modifiers and constants */
   const unsigned operand_size[static_cast<int>(aco_opcode::num_opcodes)];
   const instr_class classes[static_cast<int>(aco_opcode::num_opcodes)];
   const uint32_t definitions[static_cast<int>(aco_opcode::num_opcodes)];
   const uint32_t operands[static_cast<int>(aco_opcode::num_opcodes)];
} Info;

extern const Info instr_info;
2332 
2333 } // namespace aco
2334 
2335 #endif /* ACO_IR_H */
2336