• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 /*
2  * Copyright © 2018 Valve Corporation
3  *
4  * Permission is hereby granted, free of charge, to any person obtaining a
5  * copy of this software and associated documentation files (the "Software"),
6  * to deal in the Software without restriction, including without limitation
7  * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8  * and/or sell copies of the Software, and to permit persons to whom the
9  * Software is furnished to do so, subject to the following conditions:
10  *
11  * The above copyright notice and this permission notice (including the next
12  * paragraph) shall be included in all copies or substantial portions of the
13  * Software.
14  *
15  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
18  * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20  * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
21  * IN THE SOFTWARE.
22  *
23  */
24 
25 #ifndef ACO_IR_H
26 #define ACO_IR_H
27 
28 #include "aco_opcodes.h"
29 #include "aco_util.h"
30 #include "aco_interface.h"
31 #include "aco_shader_info.h"
32 #include "vulkan/radv_shader.h"
33 
34 #include "nir.h"
35 
36 #include <bitset>
37 #include <memory>
38 #include <vector>
39 
40 struct radv_shader_args;
41 
42 namespace aco {
43 
/* Bit-mask of enabled debug options, set at compiler initialization. */
extern uint64_t debug_flags;

/* Individual debug options selectable via the debug flag bit-mask. */
enum {
   DEBUG_VALIDATE_IR = 0x1,
   DEBUG_VALIDATE_RA = 0x2,
   DEBUG_PERFWARN = 0x4,
   DEBUG_FORCE_WAITCNT = 0x8,
   DEBUG_NO_VN = 0x10,
   DEBUG_NO_OPT = 0x20,
   DEBUG_NO_SCHED = 0x40,
   DEBUG_PERF_INFO = 0x80,
   DEBUG_LIVE_INFO = 0x100,
};
57 
/**
 * Representation of the instruction's microcode encoding format
 *
 * The Vector ALU formats VOP1/VOP2/VOPC occupy dedicated bits so they can be
 * combined with a modifier encoding:
 * - VOP2* | VOP3 is a VOP2 instruction in VOP3 encoding
 * - VOP2* | DPP  is a VOP2 instruction with data parallel primitive.
 * - VOP2* | SDWA is a VOP2 instruction with sub-dword addressing.
 *
 * (*) Likewise for VOP1 and VOPC instructions.
 */
enum class Format : std::uint16_t {
   /* Pseudo Instruction Format */
   PSEUDO = 0,
   /* Scalar ALU & Control Formats */
   SOP1 = 1,
   SOP2 = 2,
   SOPK = 3,
   SOPP = 4,
   SOPC = 5,
   /* Scalar Memory Format */
   SMEM = 6,
   /* LDS/GDS Format */
   DS = 8,
   /* Vector Memory Buffer Formats */
   MTBUF = 9,
   MUBUF = 10,
   /* Vector Memory Image Format */
   MIMG = 11,
   /* Export Format */
   EXP = 12,
   /* Flat Formats */
   FLAT = 13,
   GLOBAL = 14,
   SCRATCH = 15,

   PSEUDO_BRANCH = 16,
   PSEUDO_BARRIER = 17,
   PSEUDO_REDUCTION = 18,

   /* Vector ALU Formats: single-bit values so they can be OR'ed together. */
   VOP3P = 19,
   VOP1 = 1 << 8,
   VOP2 = 1 << 9,
   VOPC = 1 << 10,
   VOP3 = 1 << 11,
   /* Vector Parameter Interpolation Format */
   VINTRP = 1 << 12,
   DPP16 = 1 << 13,
   SDWA = 1 << 14,
   DPP8 = 1 << 15,
};
108 
/* Coarse classification of instructions, e.g. for scheduling/performance modeling. */
enum class instr_class : uint8_t {
   valu32 = 0,
   valu_convert32 = 1,
   valu64 = 2,
   valu_quarter_rate32 = 3,
   valu_fma = 4,
   valu_transcendental32 = 5,
   valu_double = 6,
   valu_double_add = 7,
   valu_double_convert = 8,
   valu_double_transcendental = 9,
   salu = 10,
   smem = 11,
   barrier = 12,
   branch = 13,
   sendmsg = 14,
   ds = 15,
   exp = 16,
   vmem = 17,
   waitcnt = 18,
   other = 19,
   count, /* number of classes; keep last */
};
132 
/* Bit-mask describing which kind of memory an access can alias. */
enum storage_class : uint8_t {
   storage_none = 0x0,   /* no synchronization and can be reordered around aliasing stores */
   storage_buffer = 0x1, /* SSBOs and global memory */
   storage_atomic_counter = 0x2, /* not used for Vulkan */
   storage_image = 0x4,
   storage_shared = 0x8,       /* or TCS output */
   storage_vmem_output = 0x10, /* GS or TCS output stores using VMEM */
   storage_task_payload = 0x20,/* Task-Mesh payload */
   storage_scratch = 0x40,
   storage_vgpr_spill = 0x80,
   storage_count = 8, /* number of storage bits, not counting storage_none */
};
145 
/* Bit-mask of ordering/visibility guarantees attached to a memory access or barrier. */
enum memory_semantics : uint8_t {
   semantic_none = 0x0,
   /* for loads: don't move any access after this load to before this load (even other loads)
    * for barriers: don't move any access after the barrier to before any
    * atomics/control_barriers/sendmsg_gs_done before the barrier */
   semantic_acquire = 0x1,
   /* for stores: don't move any access before this store to after this store
    * for barriers: don't move any access before the barrier to after any
    * atomics/control_barriers/sendmsg_gs_done after the barrier */
   semantic_release = 0x2,

   /* the rest are for load/stores/atomics only */
   /* cannot be DCE'd or CSE'd */
   semantic_volatile = 0x4,
   /* does not interact with barriers and assumes this lane is the only lane
    * accessing this memory */
   semantic_private = 0x8,
   /* this operation can be reordered around operations of the same storage.
    * says nothing about barriers */
   semantic_can_reorder = 0x10,
   /* this is a atomic instruction (may only read or write memory) */
   semantic_atomic = 0x20,
   /* this is instruction both reads and writes memory */
   semantic_rmw = 0x40,

   /* common combinations */
   semantic_acqrel = semantic_acquire | semantic_release,
   semantic_atomicrmw = semantic_volatile | semantic_atomic | semantic_rmw,
};
174 
/* Scope over which a synchronization operation applies (ordered, widest last). */
enum sync_scope : uint8_t {
   scope_invocation = 0,
   scope_subgroup = 1,
   scope_workgroup = 2,
   scope_queuefamily = 3,
   scope_device = 4,
};
182 
183 struct memory_sync_info {
memory_sync_infomemory_sync_info184    memory_sync_info() : storage(storage_none), semantics(semantic_none), scope(scope_invocation) {}
185    memory_sync_info(int storage_, int semantics_ = 0, sync_scope scope_ = scope_invocation)
186        : storage((storage_class)storage_), semantics((memory_semantics)semantics_), scope(scope_)
187    {}
188 
189    storage_class storage : 8;
190    memory_semantics semantics : 8;
191    sync_scope scope : 8;
192 
193    bool operator==(const memory_sync_info& rhs) const
194    {
195       return storage == rhs.storage && semantics == rhs.semantics && scope == rhs.scope;
196    }
197 
can_reordermemory_sync_info198    bool can_reorder() const
199    {
200       if (semantics & semantic_acqrel)
201          return false;
202       /* Also check storage so that zero-initialized memory_sync_info can be
203        * reordered. */
204       return (!storage || (semantics & semantic_can_reorder)) && !(semantics & semantic_volatile);
205    }
206 };
207 static_assert(sizeof(memory_sync_info) == 3, "Unexpected padding");
208 
/* Floating-point rounding modes (nearest-even, +inf, -inf, toward zero). */
enum fp_round {
   fp_round_ne = 0,
   fp_round_pi = 1,
   fp_round_ni = 2,
   fp_round_tz = 3,
};
215 
/* Denormal handling modes; input/output bits can be combined. */
enum fp_denorm {
   /* Note that v_rcp_f32, v_exp_f32, v_log_f32, v_sqrt_f32, v_rsq_f32 and
    * v_mad_f32/v_madak_f32/v_madmk_f32/v_mac_f32 always flush denormals. */
   fp_denorm_flush = 0x0,
   fp_denorm_keep_in = 0x1,
   fp_denorm_keep_out = 0x2,
   fp_denorm_keep = 0x3,
};
224 
225 struct float_mode {
226    /* matches encoding of the MODE register */
227    union {
228       struct {
229          fp_round round32 : 2;
230          fp_round round16_64 : 2;
231          unsigned denorm32 : 2;
232          unsigned denorm16_64 : 2;
233       };
234       struct {
235          uint8_t round : 4;
236          uint8_t denorm : 4;
237       };
238       uint8_t val = 0;
239    };
240    /* if false, optimizations which may remove infs/nan/-0.0 can be done */
241    bool preserve_signed_zero_inf_nan32 : 1;
242    bool preserve_signed_zero_inf_nan16_64 : 1;
243    /* if false, optimizations which may remove denormal flushing can be done */
244    bool must_flush_denorms32 : 1;
245    bool must_flush_denorms16_64 : 1;
246    bool care_about_round32 : 1;
247    bool care_about_round16_64 : 1;
248 
249    /* Returns true if instructions using the mode "other" can safely use the
250     * current one instead. */
canReplacefloat_mode251    bool canReplace(float_mode other) const noexcept
252    {
253       return val == other.val &&
254              (preserve_signed_zero_inf_nan32 || !other.preserve_signed_zero_inf_nan32) &&
255              (preserve_signed_zero_inf_nan16_64 || !other.preserve_signed_zero_inf_nan16_64) &&
256              (must_flush_denorms32 || !other.must_flush_denorms32) &&
257              (must_flush_denorms16_64 || !other.must_flush_denorms16_64) &&
258              (care_about_round32 || !other.care_about_round32) &&
259              (care_about_round16_64 || !other.care_about_round16_64);
260    }
261 };
262 
263 struct wait_imm {
264    static const uint8_t unset_counter = 0xff;
265 
266    uint8_t vm;
267    uint8_t exp;
268    uint8_t lgkm;
269    uint8_t vs;
270 
271    wait_imm();
272    wait_imm(uint16_t vm_, uint16_t exp_, uint16_t lgkm_, uint16_t vs_);
273    wait_imm(enum amd_gfx_level chip, uint16_t packed);
274 
275    uint16_t pack(enum amd_gfx_level chip) const;
276 
277    bool combine(const wait_imm& other);
278 
279    bool empty() const;
280 };
281 
282 constexpr Format
asVOP3(Format format)283 asVOP3(Format format)
284 {
285    return (Format)((uint32_t)Format::VOP3 | (uint32_t)format);
286 };
287 
288 constexpr Format
asSDWA(Format format)289 asSDWA(Format format)
290 {
291    assert(format == Format::VOP1 || format == Format::VOP2 || format == Format::VOPC);
292    return (Format)((uint32_t)Format::SDWA | (uint32_t)format);
293 }
294 
295 constexpr Format
withoutDPP(Format format)296 withoutDPP(Format format)
297 {
298    return (Format)((uint32_t)format & ~((uint32_t)Format::DPP16 | (uint32_t)Format::DPP8));
299 }
300 
/* The kind of register file a value lives in. */
enum class RegType {
   none = 0,
   sgpr,
   vgpr,
   linear_vgpr,
};
307 
308 struct RegClass {
309 
310    enum RC : uint8_t {
311       s1 = 1,
312       s2 = 2,
313       s3 = 3,
314       s4 = 4,
315       s6 = 6,
316       s8 = 8,
317       s16 = 16,
318       v1 = s1 | (1 << 5),
319       v2 = s2 | (1 << 5),
320       v3 = s3 | (1 << 5),
321       v4 = s4 | (1 << 5),
322       v5 = 5 | (1 << 5),
323       v6 = 6 | (1 << 5),
324       v7 = 7 | (1 << 5),
325       v8 = 8 | (1 << 5),
326       /* byte-sized register class */
327       v1b = v1 | (1 << 7),
328       v2b = v2 | (1 << 7),
329       v3b = v3 | (1 << 7),
330       v4b = v4 | (1 << 7),
331       v6b = v6 | (1 << 7),
332       v8b = v8 | (1 << 7),
333       /* these are used for WWM and spills to vgpr */
334       v1_linear = v1 | (1 << 6),
335       v2_linear = v2 | (1 << 6),
336    };
337 
338    RegClass() = default;
RegClassRegClass339    constexpr RegClass(RC rc_) : rc(rc_) {}
RegClassRegClass340    constexpr RegClass(RegType type, unsigned size)
341        : rc((RC)((type == RegType::vgpr ? 1 << 5 : 0) | size))
342    {}
343 
RCRegClass344    constexpr operator RC() const { return rc; }
345    explicit operator bool() = delete;
346 
typeRegClass347    constexpr RegType type() const { return rc <= RC::s16 ? RegType::sgpr : RegType::vgpr; }
is_linear_vgprRegClass348    constexpr bool is_linear_vgpr() const { return rc & (1 << 6); };
is_subdwordRegClass349    constexpr bool is_subdword() const { return rc & (1 << 7); }
bytesRegClass350    constexpr unsigned bytes() const { return ((unsigned)rc & 0x1F) * (is_subdword() ? 1 : 4); }
351    // TODO: use size() less in favor of bytes()
sizeRegClass352    constexpr unsigned size() const { return (bytes() + 3) >> 2; }
is_linearRegClass353    constexpr bool is_linear() const { return rc <= RC::s16 || is_linear_vgpr(); }
as_linearRegClass354    constexpr RegClass as_linear() const { return RegClass((RC)(rc | (1 << 6))); }
as_subdwordRegClass355    constexpr RegClass as_subdword() const { return RegClass((RC)(rc | 1 << 7)); }
356 
getRegClass357    static constexpr RegClass get(RegType type, unsigned bytes)
358    {
359       if (type == RegType::sgpr) {
360          return RegClass(type, DIV_ROUND_UP(bytes, 4u));
361       } else {
362          return bytes % 4u ? RegClass(type, bytes).as_subdword() : RegClass(type, bytes / 4u);
363       }
364    }
365 
resizeRegClass366    constexpr RegClass resize(unsigned bytes) const
367    {
368       if (is_linear_vgpr()) {
369          assert(bytes % 4u == 0);
370          return get(RegType::vgpr, bytes).as_linear();
371       }
372       return get(type(), bytes);
373    }
374 
375 private:
376    RC rc;
377 };
378 
379 /* transitional helper expressions */
380 static constexpr RegClass s1{RegClass::s1};
381 static constexpr RegClass s2{RegClass::s2};
382 static constexpr RegClass s3{RegClass::s3};
383 static constexpr RegClass s4{RegClass::s4};
384 static constexpr RegClass s8{RegClass::s8};
385 static constexpr RegClass s16{RegClass::s16};
386 static constexpr RegClass v1{RegClass::v1};
387 static constexpr RegClass v2{RegClass::v2};
388 static constexpr RegClass v3{RegClass::v3};
389 static constexpr RegClass v4{RegClass::v4};
390 static constexpr RegClass v5{RegClass::v5};
391 static constexpr RegClass v6{RegClass::v6};
392 static constexpr RegClass v7{RegClass::v7};
393 static constexpr RegClass v8{RegClass::v8};
394 static constexpr RegClass v1b{RegClass::v1b};
395 static constexpr RegClass v2b{RegClass::v2b};
396 static constexpr RegClass v3b{RegClass::v3b};
397 static constexpr RegClass v4b{RegClass::v4b};
398 static constexpr RegClass v6b{RegClass::v6b};
399 static constexpr RegClass v8b{RegClass::v8b};
400 
401 /**
402  * Temp Class
403  * Each temporary virtual register has a
404  * register class (i.e. size and type)
405  * and SSA id.
406  */
407 struct Temp {
TempTemp408    Temp() noexcept : id_(0), reg_class(0) {}
TempTemp409    constexpr Temp(uint32_t id, RegClass cls) noexcept : id_(id), reg_class(uint8_t(cls)) {}
410 
idTemp411    constexpr uint32_t id() const noexcept { return id_; }
regClassTemp412    constexpr RegClass regClass() const noexcept { return (RegClass::RC)reg_class; }
413 
bytesTemp414    constexpr unsigned bytes() const noexcept { return regClass().bytes(); }
sizeTemp415    constexpr unsigned size() const noexcept { return regClass().size(); }
typeTemp416    constexpr RegType type() const noexcept { return regClass().type(); }
is_linearTemp417    constexpr bool is_linear() const noexcept { return regClass().is_linear(); }
418 
419    constexpr bool operator<(Temp other) const noexcept { return id() < other.id(); }
420    constexpr bool operator==(Temp other) const noexcept { return id() == other.id(); }
421    constexpr bool operator!=(Temp other) const noexcept { return id() != other.id(); }
422 
423 private:
424    uint32_t id_ : 24;
425    uint32_t reg_class : 8;
426 };
427 
/**
 * PhysReg
 * Represents the physical register for each
 * Operand and Definition. Stored with byte granularity: the low two bits of
 * reg_b are the byte offset within the register.
 */
struct PhysReg {
   constexpr PhysReg() = default;
   explicit constexpr PhysReg(unsigned r) : reg_b(r << 2) {}
   /* register index */
   constexpr unsigned reg() const { return reg_b >> 2; }
   /* byte offset within the register */
   constexpr unsigned byte() const { return reg_b & 0x3; }
   constexpr operator unsigned() const { return reg(); }
   constexpr bool operator==(PhysReg other) const { return reg_b == other.reg_b; }
   constexpr bool operator!=(PhysReg other) const { return reg_b != other.reg_b; }
   constexpr bool operator<(PhysReg other) const { return reg_b < other.reg_b; }
   /* register advanced by the given (possibly negative) number of bytes */
   constexpr PhysReg advance(int bytes) const
   {
      PhysReg next = *this;
      next.reg_b += bytes;
      return next;
   }

   uint16_t reg_b = 0;
};
451 
452 /* helper expressions for special registers */
453 static constexpr PhysReg m0{124};
454 static constexpr PhysReg flat_scr_lo{102}; /* GFX8-GFX9, encoded differently on GFX6-7 */
455 static constexpr PhysReg flat_scr_hi{103}; /* GFX8-GFX9, encoded differently on GFX6-7 */
456 static constexpr PhysReg vcc{106};
457 static constexpr PhysReg vcc_hi{107};
458 static constexpr PhysReg tba{108}; /* GFX6-GFX8 */
459 static constexpr PhysReg tma{110}; /* GFX6-GFX8 */
460 static constexpr PhysReg ttmp0{112};
461 static constexpr PhysReg ttmp1{113};
462 static constexpr PhysReg ttmp2{114};
463 static constexpr PhysReg ttmp3{115};
464 static constexpr PhysReg ttmp4{116};
465 static constexpr PhysReg ttmp5{117};
466 static constexpr PhysReg ttmp6{118};
467 static constexpr PhysReg ttmp7{119};
468 static constexpr PhysReg ttmp8{120};
469 static constexpr PhysReg ttmp9{121};
470 static constexpr PhysReg ttmp10{122};
471 static constexpr PhysReg ttmp11{123};
472 static constexpr PhysReg sgpr_null{125}; /* GFX10+ */
473 static constexpr PhysReg exec{126};
474 static constexpr PhysReg exec_lo{126};
475 static constexpr PhysReg exec_hi{127};
476 static constexpr PhysReg vccz{251};
477 static constexpr PhysReg execz{252};
478 static constexpr PhysReg scc{253};
479 
480 /**
481  * Operand Class
482  * Initially, each Operand refers to either
483  * a temporary virtual register
484  * or to a constant value
485  * Temporary registers get mapped to physical register during RA
486  * Constant values are inlined into the instruction sequence.
487  */
488 class Operand final {
489 public:
Operand()490    constexpr Operand()
491        : reg_(PhysReg{128}), isTemp_(false), isFixed_(true), isConstant_(false), isKill_(false),
492          isUndef_(true), isFirstKill_(false), constSize(0), isLateKill_(false), is16bit_(false),
493          is24bit_(false), signext(false)
494    {}
495 
Operand(Temp r)496    explicit Operand(Temp r) noexcept
497    {
498       data_.temp = r;
499       if (r.id()) {
500          isTemp_ = true;
501       } else {
502          isUndef_ = true;
503          setFixed(PhysReg{128});
504       }
505    };
Operand(Temp r,PhysReg reg)506    explicit Operand(Temp r, PhysReg reg) noexcept
507    {
508       assert(r.id()); /* Don't allow fixing an undef to a register */
509       data_.temp = r;
510       isTemp_ = true;
511       setFixed(reg);
512    };
513 
514    /* 8-bit constant */
c8(uint8_t v)515    static Operand c8(uint8_t v) noexcept
516    {
517       /* 8-bit constants are only used for copies and copies from any 8-bit
518        * constant can be implemented with a SDWA v_mul_u32_u24. So consider all
519        * to be inline constants. */
520       Operand op;
521       op.control_ = 0;
522       op.data_.i = v;
523       op.isConstant_ = true;
524       op.constSize = 0;
525       op.setFixed(PhysReg{0u});
526       return op;
527    };
528 
529    /* 16-bit constant */
c16(uint16_t v)530    static Operand c16(uint16_t v) noexcept
531    {
532       Operand op;
533       op.control_ = 0;
534       op.data_.i = v;
535       op.isConstant_ = true;
536       op.constSize = 1;
537       if (v <= 64)
538          op.setFixed(PhysReg{128u + v});
539       else if (v >= 0xFFF0) /* [-16 .. -1] */
540          op.setFixed(PhysReg{(unsigned)(192 - (int16_t)v)});
541       else if (v == 0x3800) /* 0.5 */
542          op.setFixed(PhysReg{240});
543       else if (v == 0xB800) /* -0.5 */
544          op.setFixed(PhysReg{241});
545       else if (v == 0x3C00) /* 1.0 */
546          op.setFixed(PhysReg{242});
547       else if (v == 0xBC00) /* -1.0 */
548          op.setFixed(PhysReg{243});
549       else if (v == 0x4000) /* 2.0 */
550          op.setFixed(PhysReg{244});
551       else if (v == 0xC000) /* -2.0 */
552          op.setFixed(PhysReg{245});
553       else if (v == 0x4400) /* 4.0 */
554          op.setFixed(PhysReg{246});
555       else if (v == 0xC400) /* -4.0 */
556          op.setFixed(PhysReg{247});
557       else if (v == 0x3118) /* 1/2 PI */
558          op.setFixed(PhysReg{248});
559       else /* Literal Constant */
560          op.setFixed(PhysReg{255});
561       return op;
562    }
563 
564    /* 32-bit constant */
c32(uint32_t v)565    static Operand c32(uint32_t v) noexcept { return c32_or_c64(v, false); }
566 
567    /* 64-bit constant */
c64(uint64_t v)568    static Operand c64(uint64_t v) noexcept
569    {
570       Operand op;
571       op.control_ = 0;
572       op.isConstant_ = true;
573       op.constSize = 3;
574       if (v <= 64) {
575          op.data_.i = (uint32_t)v;
576          op.setFixed(PhysReg{128 + (uint32_t)v});
577       } else if (v >= 0xFFFFFFFFFFFFFFF0) { /* [-16 .. -1] */
578          op.data_.i = (uint32_t)v;
579          op.setFixed(PhysReg{192 - (uint32_t)v});
580       } else if (v == 0x3FE0000000000000) { /* 0.5 */
581          op.data_.i = 0x3f000000;
582          op.setFixed(PhysReg{240});
583       } else if (v == 0xBFE0000000000000) { /* -0.5 */
584          op.data_.i = 0xbf000000;
585          op.setFixed(PhysReg{241});
586       } else if (v == 0x3FF0000000000000) { /* 1.0 */
587          op.data_.i = 0x3f800000;
588          op.setFixed(PhysReg{242});
589       } else if (v == 0xBFF0000000000000) { /* -1.0 */
590          op.data_.i = 0xbf800000;
591          op.setFixed(PhysReg{243});
592       } else if (v == 0x4000000000000000) { /* 2.0 */
593          op.data_.i = 0x40000000;
594          op.setFixed(PhysReg{244});
595       } else if (v == 0xC000000000000000) { /* -2.0 */
596          op.data_.i = 0xc0000000;
597          op.setFixed(PhysReg{245});
598       } else if (v == 0x4010000000000000) { /* 4.0 */
599          op.data_.i = 0x40800000;
600          op.setFixed(PhysReg{246});
601       } else if (v == 0xC010000000000000) { /* -4.0 */
602          op.data_.i = 0xc0800000;
603          op.setFixed(PhysReg{247});
604       } else { /* Literal Constant: we don't know if it is a long or double.*/
605          op.signext = v >> 63;
606          op.data_.i = v & 0xffffffffu;
607          op.setFixed(PhysReg{255});
608          assert(op.constantValue64() == v &&
609                 "attempt to create a unrepresentable 64-bit literal constant");
610       }
611       return op;
612    }
613 
614    /* 32-bit constant stored as a 32-bit or 64-bit operand */
c32_or_c64(uint32_t v,bool is64bit)615    static Operand c32_or_c64(uint32_t v, bool is64bit) noexcept
616    {
617       Operand op;
618       op.control_ = 0;
619       op.data_.i = v;
620       op.isConstant_ = true;
621       op.constSize = is64bit ? 3 : 2;
622       if (v <= 64)
623          op.setFixed(PhysReg{128 + v});
624       else if (v >= 0xFFFFFFF0) /* [-16 .. -1] */
625          op.setFixed(PhysReg{192 - v});
626       else if (v == 0x3f000000) /* 0.5 */
627          op.setFixed(PhysReg{240});
628       else if (v == 0xbf000000) /* -0.5 */
629          op.setFixed(PhysReg{241});
630       else if (v == 0x3f800000) /* 1.0 */
631          op.setFixed(PhysReg{242});
632       else if (v == 0xbf800000) /* -1.0 */
633          op.setFixed(PhysReg{243});
634       else if (v == 0x40000000) /* 2.0 */
635          op.setFixed(PhysReg{244});
636       else if (v == 0xc0000000) /* -2.0 */
637          op.setFixed(PhysReg{245});
638       else if (v == 0x40800000) /* 4.0 */
639          op.setFixed(PhysReg{246});
640       else if (v == 0xc0800000) /* -4.0 */
641          op.setFixed(PhysReg{247});
642       else { /* Literal Constant */
643          assert(!is64bit && "attempt to create a 64-bit literal constant");
644          op.setFixed(PhysReg{255});
645       }
646       return op;
647    }
648 
literal32(uint32_t v)649    static Operand literal32(uint32_t v) noexcept
650    {
651       Operand op;
652       op.control_ = 0;
653       op.data_.i = v;
654       op.isConstant_ = true;
655       op.constSize = 2;
656       op.setFixed(PhysReg{255});
657       return op;
658    }
659 
Operand(RegClass type)660    explicit Operand(RegClass type) noexcept
661    {
662       isUndef_ = true;
663       data_.temp = Temp(0, type);
664       setFixed(PhysReg{128});
665    };
Operand(PhysReg reg,RegClass type)666    explicit Operand(PhysReg reg, RegClass type) noexcept
667    {
668       data_.temp = Temp(0, type);
669       setFixed(reg);
670    }
671 
672    static Operand zero(unsigned bytes = 4) noexcept
673    {
674       if (bytes == 8)
675          return Operand::c64(0);
676       else if (bytes == 4)
677          return Operand::c32(0);
678       else if (bytes == 2)
679          return Operand::c16(0);
680       assert(bytes == 1);
681       return Operand::c8(0);
682    }
683 
684    /* This is useful over the constructors when you want to take a gfx level
685     * for 1/2 PI or an unknown operand size.
686     */
get_const(enum amd_gfx_level chip,uint64_t val,unsigned bytes)687    static Operand get_const(enum amd_gfx_level chip, uint64_t val, unsigned bytes)
688    {
689       if (val == 0x3e22f983 && bytes == 4 && chip >= GFX8) {
690          /* 1/2 PI can be an inline constant on GFX8+ */
691          Operand op = Operand::c32(val);
692          op.setFixed(PhysReg{248});
693          return op;
694       }
695 
696       if (bytes == 8)
697          return Operand::c64(val);
698       else if (bytes == 4)
699          return Operand::c32(val);
700       else if (bytes == 2)
701          return Operand::c16(val);
702       assert(bytes == 1);
703       return Operand::c8(val);
704    }
705 
706    static bool is_constant_representable(uint64_t val, unsigned bytes, bool zext = false,
707                                          bool sext = false)
708    {
709       if (bytes <= 4)
710          return true;
711 
712       if (zext && (val & 0xFFFFFFFF00000000) == 0x0000000000000000)
713          return true;
714       uint64_t upper33 = val & 0xFFFFFFFF80000000;
715       if (sext && (upper33 == 0xFFFFFFFF80000000 || upper33 == 0))
716          return true;
717 
718       return val >= 0xFFFFFFFFFFFFFFF0 || val <= 64 || /* [-16 .. 64] */
719              val == 0x3FE0000000000000 ||              /* 0.5 */
720              val == 0xBFE0000000000000 ||              /* -0.5 */
721              val == 0x3FF0000000000000 ||              /* 1.0 */
722              val == 0xBFF0000000000000 ||              /* -1.0 */
723              val == 0x4000000000000000 ||              /* 2.0 */
724              val == 0xC000000000000000 ||              /* -2.0 */
725              val == 0x4010000000000000 ||              /* 4.0 */
726              val == 0xC010000000000000;                /* -4.0 */
727    }
728 
isTemp()729    constexpr bool isTemp() const noexcept { return isTemp_; }
730 
setTemp(Temp t)731    constexpr void setTemp(Temp t) noexcept
732    {
733       assert(!isConstant_);
734       isTemp_ = true;
735       data_.temp = t;
736    }
737 
getTemp()738    constexpr Temp getTemp() const noexcept { return data_.temp; }
739 
tempId()740    constexpr uint32_t tempId() const noexcept { return data_.temp.id(); }
741 
hasRegClass()742    constexpr bool hasRegClass() const noexcept { return isTemp() || isUndefined(); }
743 
regClass()744    constexpr RegClass regClass() const noexcept { return data_.temp.regClass(); }
745 
bytes()746    constexpr unsigned bytes() const noexcept
747    {
748       if (isConstant())
749          return 1 << constSize;
750       else
751          return data_.temp.bytes();
752    }
753 
size()754    constexpr unsigned size() const noexcept
755    {
756       if (isConstant())
757          return constSize > 2 ? 2 : 1;
758       else
759          return data_.temp.size();
760    }
761 
isFixed()762    constexpr bool isFixed() const noexcept { return isFixed_; }
763 
physReg()764    constexpr PhysReg physReg() const noexcept { return reg_; }
765 
setFixed(PhysReg reg)766    constexpr void setFixed(PhysReg reg) noexcept
767    {
768       isFixed_ = reg != unsigned(-1);
769       reg_ = reg;
770    }
771 
isConstant()772    constexpr bool isConstant() const noexcept { return isConstant_; }
773 
isLiteral()774    constexpr bool isLiteral() const noexcept { return isConstant() && reg_ == 255; }
775 
isUndefined()776    constexpr bool isUndefined() const noexcept { return isUndef_; }
777 
constantValue()778    constexpr uint32_t constantValue() const noexcept { return data_.i; }
779 
constantEquals(uint32_t cmp)780    constexpr bool constantEquals(uint32_t cmp) const noexcept
781    {
782       return isConstant() && constantValue() == cmp;
783    }
784 
constantValue64()785    constexpr uint64_t constantValue64() const noexcept
786    {
787       if (constSize == 3) {
788          if (reg_ <= 192)
789             return reg_ - 128;
790          else if (reg_ <= 208)
791             return 0xFFFFFFFFFFFFFFFF - (reg_ - 193);
792 
793          switch (reg_) {
794          case 240: return 0x3FE0000000000000;
795          case 241: return 0xBFE0000000000000;
796          case 242: return 0x3FF0000000000000;
797          case 243: return 0xBFF0000000000000;
798          case 244: return 0x4000000000000000;
799          case 245: return 0xC000000000000000;
800          case 246: return 0x4010000000000000;
801          case 247: return 0xC010000000000000;
802          case 255:
803             return (signext && (data_.i & 0x80000000u) ? 0xffffffff00000000ull : 0ull) | data_.i;
804          }
805          unreachable("invalid register for 64-bit constant");
806       } else {
807          return data_.i;
808       }
809    }
810 
811    /* Value if this were used with vop3/opsel or vop3p. */
constantValue16(bool opsel)812    constexpr uint16_t constantValue16(bool opsel) const noexcept
813    {
814       assert(bytes() == 2 || bytes() == 4);
815       if (opsel) {
816          if (bytes() == 2 && int16_t(data_.i) >= -16 && int16_t(data_.i) <= 64 && !isLiteral())
817             return int16_t(data_.i) >> 16; /* 16-bit inline integers are sign-extended, even with fp16 instrs */
818          else
819             return data_.i >> 16;
820       }
821       return data_.i;
822    }
823 
isOfType(RegType type)824    constexpr bool isOfType(RegType type) const noexcept
825    {
826       return hasRegClass() && regClass().type() == type;
827    }
828 
829    /* Indicates that the killed operand's live range intersects with the
830     * instruction's definitions. Unlike isKill() and isFirstKill(), this is
831     * not set by liveness analysis. */
setLateKill(bool flag)832    constexpr void setLateKill(bool flag) noexcept { isLateKill_ = flag; }
833 
isLateKill()834    constexpr bool isLateKill() const noexcept { return isLateKill_; }
835 
setKill(bool flag)836    constexpr void setKill(bool flag) noexcept
837    {
838       isKill_ = flag;
839       if (!flag)
840          setFirstKill(false);
841    }
842 
isKill()843    constexpr bool isKill() const noexcept { return isKill_ || isFirstKill(); }
844 
setFirstKill(bool flag)845    constexpr void setFirstKill(bool flag) noexcept
846    {
847       isFirstKill_ = flag;
848       if (flag)
849          setKill(flag);
850    }
851 
852    /* When there are multiple operands killing the same temporary,
853     * isFirstKill() is only returns true for the first one. */
isFirstKill()854    constexpr bool isFirstKill() const noexcept { return isFirstKill_; }
855 
isKillBeforeDef()856    constexpr bool isKillBeforeDef() const noexcept { return isKill() && !isLateKill(); }
857 
isFirstKillBeforeDef()858    constexpr bool isFirstKillBeforeDef() const noexcept { return isFirstKill() && !isLateKill(); }
859 
   /* Structural equality: same size, same fixed-register/kill-before-def
    * state, and the same literal value, inline constant, undefined register
    * class or temporary. */
   constexpr bool operator==(Operand other) const noexcept
   {
      if (other.size() != size())
         return false;
      if (isFixed() != other.isFixed() || isKillBeforeDef() != other.isKillBeforeDef())
         return false;
      if (isFixed() && other.isFixed() && physReg() != other.physReg())
         return false;
      if (isLiteral())
         return other.isLiteral() && other.constantValue() == constantValue();
      else if (isConstant())
         /* inline constants appear to be identified by the fixed register that
          * encodes them, so physReg() comparison suffices here —
          * NOTE(review): confirm against Operand's constant constructors */
         return other.isConstant() && other.physReg() == physReg();
      else if (isUndefined())
         return other.isUndefined() && other.regClass() == regClass();
      else
         return other.isTemp() && other.getTemp() == getTemp();
   }
877 
   constexpr bool operator!=(Operand other) const noexcept { return !operator==(other); }

   /* Flag presumably marking that only the low 16 bits of this operand are
    * used — TODO confirm with the passes that set/read it. */
   constexpr void set16bit(bool flag) noexcept { is16bit_ = flag; }

   constexpr bool is16bit() const noexcept { return is16bit_; }

   /* Flag presumably marking that only the low 24 bits of this operand are
    * used — TODO confirm with the passes that set/read it. */
   constexpr void set24bit(bool flag) noexcept { is24bit_ = flag; }

   constexpr bool is24bit() const noexcept { return is24bit_; }
887 
private:
   /* Payload: either the temporary, or the raw constant bits viewed as
    * integer/float. */
   union {
      Temp temp;
      uint32_t i;
      float f;
   } data_ = {Temp(0, s1)};
   /* Physical register; meaningful when the isFixed_ flag below is set. */
   PhysReg reg_;
   /* State flags, packed into 16 bits. */
   union {
      struct {
         uint8_t isTemp_ : 1;
         uint8_t isFixed_ : 1;
         uint8_t isConstant_ : 1;
         uint8_t isKill_ : 1;
         uint8_t isUndef_ : 1;
         uint8_t isFirstKill_ : 1;
         uint8_t constSize : 2;
         uint8_t isLateKill_ : 1;
         uint8_t is16bit_ : 1;
         uint8_t is24bit_ : 1;
         uint8_t signext : 1;
      };
      /* can't initialize bit-fields in c++11, so work around using a union */
      uint16_t control_ = 0;
   };
};
913 
914 /**
915  * Definition Class
916  * Definitions are the results of Instructions
917  * and refer to temporary virtual registers
918  * which are later mapped to physical registers
919  */
920 class Definition final {
921 public:
Definition()922    constexpr Definition()
923        : temp(Temp(0, s1)), reg_(0), isFixed_(0), isKill_(0), isPrecise_(0), isNUW_(0), isNoCSE_(0)
924    {}
Definition(uint32_t index,RegClass type)925    Definition(uint32_t index, RegClass type) noexcept : temp(index, type) {}
Definition(Temp tmp)926    explicit Definition(Temp tmp) noexcept : temp(tmp) {}
Definition(PhysReg reg,RegClass type)927    Definition(PhysReg reg, RegClass type) noexcept : temp(Temp(0, type)) { setFixed(reg); }
Definition(uint32_t tmpId,PhysReg reg,RegClass type)928    Definition(uint32_t tmpId, PhysReg reg, RegClass type) noexcept : temp(Temp(tmpId, type))
929    {
930       setFixed(reg);
931    }
932 
isTemp()933    constexpr bool isTemp() const noexcept { return tempId() > 0; }
934 
getTemp()935    constexpr Temp getTemp() const noexcept { return temp; }
936 
tempId()937    constexpr uint32_t tempId() const noexcept { return temp.id(); }
938 
setTemp(Temp t)939    constexpr void setTemp(Temp t) noexcept { temp = t; }
940 
swapTemp(Definition & other)941    void swapTemp(Definition& other) noexcept { std::swap(temp, other.temp); }
942 
regClass()943    constexpr RegClass regClass() const noexcept { return temp.regClass(); }
944 
bytes()945    constexpr unsigned bytes() const noexcept { return temp.bytes(); }
946 
size()947    constexpr unsigned size() const noexcept { return temp.size(); }
948 
isFixed()949    constexpr bool isFixed() const noexcept { return isFixed_; }
950 
physReg()951    constexpr PhysReg physReg() const noexcept { return reg_; }
952 
setFixed(PhysReg reg)953    constexpr void setFixed(PhysReg reg) noexcept
954    {
955       isFixed_ = 1;
956       reg_ = reg;
957    }
958 
setKill(bool flag)959    constexpr void setKill(bool flag) noexcept { isKill_ = flag; }
960 
isKill()961    constexpr bool isKill() const noexcept { return isKill_; }
962 
setPrecise(bool precise)963    constexpr void setPrecise(bool precise) noexcept { isPrecise_ = precise; }
964 
isPrecise()965    constexpr bool isPrecise() const noexcept { return isPrecise_; }
966 
967    /* No Unsigned Wrap */
setNUW(bool nuw)968    constexpr void setNUW(bool nuw) noexcept { isNUW_ = nuw; }
969 
isNUW()970    constexpr bool isNUW() const noexcept { return isNUW_; }
971 
setNoCSE(bool noCSE)972    constexpr void setNoCSE(bool noCSE) noexcept { isNoCSE_ = noCSE; }
973 
isNoCSE()974    constexpr bool isNoCSE() const noexcept { return isNoCSE_; }
975 
976 private:
977    Temp temp = Temp(0, s1);
978    PhysReg reg_;
979    union {
980       struct {
981          uint8_t isFixed_ : 1;
982          uint8_t isKill_ : 1;
983          uint8_t isPrecise_ : 1;
984          uint8_t isNUW_ : 1;
985          uint8_t isNoCSE_ : 1;
986       };
987       /* can't initialize bit-fields in c++11, so work around using a union */
988       uint8_t control_ = 0;
989    };
990 };
991 
/* Forward declarations: struct Instruction below provides checked downcast
 * accessors to each of these per-encoding subclasses, which are defined
 * further down in this file. */
struct Block;
struct Instruction;
struct Pseudo_instruction;
struct SOP1_instruction;
struct SOP2_instruction;
struct SOPK_instruction;
struct SOPP_instruction;
struct SOPC_instruction;
struct SMEM_instruction;
struct DS_instruction;
struct MTBUF_instruction;
struct MUBUF_instruction;
struct MIMG_instruction;
struct Export_instruction;
struct FLAT_instruction;
struct Pseudo_branch_instruction;
struct Pseudo_barrier_instruction;
struct Pseudo_reduction_instruction;
struct VOP3P_instruction;
struct VOP1_instruction;
struct VOP2_instruction;
struct VOPC_instruction;
struct VOP3_instruction;
struct Interp_instruction;
struct DPP16_instruction;
struct DPP8_instruction;
struct SDWA_instruction;
1019 
/**
 * Common base of all IR instructions: the opcode, the microcode encoding
 * format, and the spans of operands and definitions. Format-specific payload
 * lives in the subclasses below and is reached via the checked downcast
 * accessors (pseudo(), sop1(), ...).
 */
struct Instruction {
   aco_opcode opcode;
   Format format;
   /* scratch field — presumably free for use by individual compiler passes;
    * TODO confirm ownership conventions */
   uint32_t pass_flags;

   aco::span<Operand> operands;
   aco::span<Definition> definitions;

   /* Defined out of line. */
   constexpr bool usesModifiers() const noexcept;

   /* Whether any operand is fixed to the exec register. */
   constexpr bool reads_exec() const noexcept
   {
      for (const Operand& op : operands) {
         if (op.isFixed() && op.physReg() == exec)
            return true;
      }
      return false;
   }

   /* Checked downcasts: each accessor asserts its format predicate before
    * casting to the corresponding subclass. */
   Pseudo_instruction& pseudo() noexcept
   {
      assert(isPseudo());
      return *(Pseudo_instruction*)this;
   }
   const Pseudo_instruction& pseudo() const noexcept
   {
      assert(isPseudo());
      return *(Pseudo_instruction*)this;
   }
   constexpr bool isPseudo() const noexcept { return format == Format::PSEUDO; }
   SOP1_instruction& sop1() noexcept
   {
      assert(isSOP1());
      return *(SOP1_instruction*)this;
   }
   const SOP1_instruction& sop1() const noexcept
   {
      assert(isSOP1());
      return *(SOP1_instruction*)this;
   }
   constexpr bool isSOP1() const noexcept { return format == Format::SOP1; }
   SOP2_instruction& sop2() noexcept
   {
      assert(isSOP2());
      return *(SOP2_instruction*)this;
   }
   const SOP2_instruction& sop2() const noexcept
   {
      assert(isSOP2());
      return *(SOP2_instruction*)this;
   }
   constexpr bool isSOP2() const noexcept { return format == Format::SOP2; }
   SOPK_instruction& sopk() noexcept
   {
      assert(isSOPK());
      return *(SOPK_instruction*)this;
   }
   const SOPK_instruction& sopk() const noexcept
   {
      assert(isSOPK());
      return *(SOPK_instruction*)this;
   }
   constexpr bool isSOPK() const noexcept { return format == Format::SOPK; }
   SOPP_instruction& sopp() noexcept
   {
      assert(isSOPP());
      return *(SOPP_instruction*)this;
   }
   const SOPP_instruction& sopp() const noexcept
   {
      assert(isSOPP());
      return *(SOPP_instruction*)this;
   }
   constexpr bool isSOPP() const noexcept { return format == Format::SOPP; }
   SOPC_instruction& sopc() noexcept
   {
      assert(isSOPC());
      return *(SOPC_instruction*)this;
   }
   const SOPC_instruction& sopc() const noexcept
   {
      assert(isSOPC());
      return *(SOPC_instruction*)this;
   }
   constexpr bool isSOPC() const noexcept { return format == Format::SOPC; }
   SMEM_instruction& smem() noexcept
   {
      assert(isSMEM());
      return *(SMEM_instruction*)this;
   }
   const SMEM_instruction& smem() const noexcept
   {
      assert(isSMEM());
      return *(SMEM_instruction*)this;
   }
   constexpr bool isSMEM() const noexcept { return format == Format::SMEM; }
   DS_instruction& ds() noexcept
   {
      assert(isDS());
      return *(DS_instruction*)this;
   }
   const DS_instruction& ds() const noexcept
   {
      assert(isDS());
      return *(DS_instruction*)this;
   }
   constexpr bool isDS() const noexcept { return format == Format::DS; }
   MTBUF_instruction& mtbuf() noexcept
   {
      assert(isMTBUF());
      return *(MTBUF_instruction*)this;
   }
   const MTBUF_instruction& mtbuf() const noexcept
   {
      assert(isMTBUF());
      return *(MTBUF_instruction*)this;
   }
   constexpr bool isMTBUF() const noexcept { return format == Format::MTBUF; }
   MUBUF_instruction& mubuf() noexcept
   {
      assert(isMUBUF());
      return *(MUBUF_instruction*)this;
   }
   const MUBUF_instruction& mubuf() const noexcept
   {
      assert(isMUBUF());
      return *(MUBUF_instruction*)this;
   }
   constexpr bool isMUBUF() const noexcept { return format == Format::MUBUF; }
   MIMG_instruction& mimg() noexcept
   {
      assert(isMIMG());
      return *(MIMG_instruction*)this;
   }
   const MIMG_instruction& mimg() const noexcept
   {
      assert(isMIMG());
      return *(MIMG_instruction*)this;
   }
   constexpr bool isMIMG() const noexcept { return format == Format::MIMG; }
   Export_instruction& exp() noexcept
   {
      assert(isEXP());
      return *(Export_instruction*)this;
   }
   const Export_instruction& exp() const noexcept
   {
      assert(isEXP());
      return *(Export_instruction*)this;
   }
   constexpr bool isEXP() const noexcept { return format == Format::EXP; }
   FLAT_instruction& flat() noexcept
   {
      assert(isFlat());
      return *(FLAT_instruction*)this;
   }
   const FLAT_instruction& flat() const noexcept
   {
      assert(isFlat());
      return *(FLAT_instruction*)this;
   }
   constexpr bool isFlat() const noexcept { return format == Format::FLAT; }
   /* GLOBAL and SCRATCH share the FLAT payload struct. */
   FLAT_instruction& global() noexcept
   {
      assert(isGlobal());
      return *(FLAT_instruction*)this;
   }
   const FLAT_instruction& global() const noexcept
   {
      assert(isGlobal());
      return *(FLAT_instruction*)this;
   }
   constexpr bool isGlobal() const noexcept { return format == Format::GLOBAL; }
   FLAT_instruction& scratch() noexcept
   {
      assert(isScratch());
      return *(FLAT_instruction*)this;
   }
   const FLAT_instruction& scratch() const noexcept
   {
      assert(isScratch());
      return *(FLAT_instruction*)this;
   }
   constexpr bool isScratch() const noexcept { return format == Format::SCRATCH; }
   Pseudo_branch_instruction& branch() noexcept
   {
      assert(isBranch());
      return *(Pseudo_branch_instruction*)this;
   }
   const Pseudo_branch_instruction& branch() const noexcept
   {
      assert(isBranch());
      return *(Pseudo_branch_instruction*)this;
   }
   constexpr bool isBranch() const noexcept { return format == Format::PSEUDO_BRANCH; }
   Pseudo_barrier_instruction& barrier() noexcept
   {
      assert(isBarrier());
      return *(Pseudo_barrier_instruction*)this;
   }
   const Pseudo_barrier_instruction& barrier() const noexcept
   {
      assert(isBarrier());
      return *(Pseudo_barrier_instruction*)this;
   }
   constexpr bool isBarrier() const noexcept { return format == Format::PSEUDO_BARRIER; }
   Pseudo_reduction_instruction& reduction() noexcept
   {
      assert(isReduction());
      return *(Pseudo_reduction_instruction*)this;
   }
   const Pseudo_reduction_instruction& reduction() const noexcept
   {
      assert(isReduction());
      return *(Pseudo_reduction_instruction*)this;
   }
   constexpr bool isReduction() const noexcept { return format == Format::PSEUDO_REDUCTION; }
   VOP3P_instruction& vop3p() noexcept
   {
      assert(isVOP3P());
      return *(VOP3P_instruction*)this;
   }
   const VOP3P_instruction& vop3p() const noexcept
   {
      assert(isVOP3P());
      return *(VOP3P_instruction*)this;
   }
   constexpr bool isVOP3P() const noexcept { return format == Format::VOP3P; }
   /* Unlike the equality tests above, the VALU/DPP/SDWA predicates below test
    * individual format bits: judging from these tests, a Format value can
    * combine a base VALU encoding with DPP16/DPP8/SDWA bits (e.g. VOP2+SDWA) —
    * NOTE(review): confirm against the Format enum definition. */
   VOP1_instruction& vop1() noexcept
   {
      assert(isVOP1());
      return *(VOP1_instruction*)this;
   }
   const VOP1_instruction& vop1() const noexcept
   {
      assert(isVOP1());
      return *(VOP1_instruction*)this;
   }
   constexpr bool isVOP1() const noexcept { return (uint16_t)format & (uint16_t)Format::VOP1; }
   VOP2_instruction& vop2() noexcept
   {
      assert(isVOP2());
      return *(VOP2_instruction*)this;
   }
   const VOP2_instruction& vop2() const noexcept
   {
      assert(isVOP2());
      return *(VOP2_instruction*)this;
   }
   constexpr bool isVOP2() const noexcept { return (uint16_t)format & (uint16_t)Format::VOP2; }
   VOPC_instruction& vopc() noexcept
   {
      assert(isVOPC());
      return *(VOPC_instruction*)this;
   }
   const VOPC_instruction& vopc() const noexcept
   {
      assert(isVOPC());
      return *(VOPC_instruction*)this;
   }
   constexpr bool isVOPC() const noexcept { return (uint16_t)format & (uint16_t)Format::VOPC; }
   VOP3_instruction& vop3() noexcept
   {
      assert(isVOP3());
      return *(VOP3_instruction*)this;
   }
   const VOP3_instruction& vop3() const noexcept
   {
      assert(isVOP3());
      return *(VOP3_instruction*)this;
   }
   constexpr bool isVOP3() const noexcept { return (uint16_t)format & (uint16_t)Format::VOP3; }
   Interp_instruction& vintrp() noexcept
   {
      assert(isVINTRP());
      return *(Interp_instruction*)this;
   }
   const Interp_instruction& vintrp() const noexcept
   {
      assert(isVINTRP());
      return *(Interp_instruction*)this;
   }
   constexpr bool isVINTRP() const noexcept { return (uint16_t)format & (uint16_t)Format::VINTRP; }
   DPP16_instruction& dpp16() noexcept
   {
      assert(isDPP16());
      return *(DPP16_instruction*)this;
   }
   const DPP16_instruction& dpp16() const noexcept
   {
      assert(isDPP16());
      return *(DPP16_instruction*)this;
   }
   constexpr bool isDPP16() const noexcept { return (uint16_t)format & (uint16_t)Format::DPP16; }
   DPP8_instruction& dpp8() noexcept
   {
      assert(isDPP8());
      return *(DPP8_instruction*)this;
   }
   const DPP8_instruction& dpp8() const noexcept
   {
      assert(isDPP8());
      return *(DPP8_instruction*)this;
   }
   constexpr bool isDPP8() const noexcept { return (uint16_t)format & (uint16_t)Format::DPP8; }
   constexpr bool isDPP() const noexcept { return isDPP16() || isDPP8(); }
   SDWA_instruction& sdwa() noexcept
   {
      assert(isSDWA());
      return *(SDWA_instruction*)this;
   }
   const SDWA_instruction& sdwa() const noexcept
   {
      assert(isSDWA());
      return *(SDWA_instruction*)this;
   }
   constexpr bool isSDWA() const noexcept { return (uint16_t)format & (uint16_t)Format::SDWA; }

   /* NOTE: unlike the accessors above, flatlike() performs no format assert;
    * callers must check isFlatLike() themselves. */
   FLAT_instruction& flatlike() { return *(FLAT_instruction*)this; }

   const FLAT_instruction& flatlike() const { return *(FLAT_instruction*)this; }

   constexpr bool isFlatLike() const noexcept { return isFlat() || isGlobal() || isScratch(); }

   /* Any vector-ALU encoding. */
   constexpr bool isVALU() const noexcept
   {
      return isVOP1() || isVOP2() || isVOPC() || isVOP3() || isVOP3P();
   }

   /* Any scalar-ALU encoding. */
   constexpr bool isSALU() const noexcept
   {
      return isSOP1() || isSOP2() || isSOPC() || isSOPK() || isSOPP();
   }

   /* Any buffer/image vector-memory encoding (FLAT-like not included). */
   constexpr bool isVMEM() const noexcept { return isMTBUF() || isMUBUF() || isMIMG(); }
};
static_assert(sizeof(Instruction) == 16, "Unexpected padding");
1357 
/* SOPK: scalar ALU operation with a 16-bit inline constant. */
struct SOPK_instruction : public Instruction {
   uint16_t imm; /* 16-bit immediate encoded in the instruction */
   uint16_t padding;
};
static_assert(sizeof(SOPK_instruction) == sizeof(Instruction) + 4, "Unexpected padding");

/* SOPP: scalar operation with a 32-bit immediate (program control). */
struct SOPP_instruction : public Instruction {
   uint32_t imm;
   int block; /* presumably the targeted block index for branch-like SOPPs — TODO confirm */
};
static_assert(sizeof(SOPP_instruction) == sizeof(Instruction) + 8, "Unexpected padding");
1369 
/* SOPC/SOP1/SOP2 carry no extra payload beyond explicit padding; the
 * static_asserts pin the allocation sizes. */
struct SOPC_instruction : public Instruction {
   uint32_t padding;
};
static_assert(sizeof(SOPC_instruction) == sizeof(Instruction) + 4, "Unexpected padding");

struct SOP1_instruction : public Instruction {};
static_assert(sizeof(SOP1_instruction) == sizeof(Instruction) + 0, "Unexpected padding");

struct SOP2_instruction : public Instruction {
   uint32_t padding;
};
static_assert(sizeof(SOP2_instruction) == sizeof(Instruction) + 4, "Unexpected padding");
1382 
1383 /**
1384  * Scalar Memory Format:
1385  * For s_(buffer_)load_dword*:
1386  * Operand(0): SBASE - SGPR-pair which provides base address
1387  * Operand(1): Offset - immediate (un)signed offset or SGPR
1388  * Operand(2) / Definition(0): SDATA - SGPR for read / write result
1389  * Operand(n-1): SOffset - SGPR offset (Vega only)
1390  *
1391  * Having no operands is also valid for instructions such as s_dcache_inv.
1392  *
1393  */
struct SMEM_instruction : public Instruction {
   memory_sync_info sync;
   bool glc : 1; /* VI+: globally coherent */
   bool dlc : 1; /* NAVI: device level coherent */
   bool nv : 1;  /* VEGA only: Non-volatile */
   bool disable_wqm : 1; /* Require an exec mask without helper invocations */
   bool prevent_overflow : 1; /* avoid overflow when combining additions */
   uint8_t padding : 3;
};
static_assert(sizeof(SMEM_instruction) == sizeof(Instruction) + 4, "Unexpected padding");
1404 
/* Plain VALU encodings without modifiers carry no extra payload. */
struct VOP1_instruction : public Instruction {};
static_assert(sizeof(VOP1_instruction) == sizeof(Instruction) + 0, "Unexpected padding");

struct VOP2_instruction : public Instruction {};
static_assert(sizeof(VOP2_instruction) == sizeof(Instruction) + 0, "Unexpected padding");

struct VOPC_instruction : public Instruction {};
static_assert(sizeof(VOPC_instruction) == sizeof(Instruction) + 0, "Unexpected padding");
1413 
/* VOP3: VALU encoding with full source/destination modifiers. */
struct VOP3_instruction : public Instruction {
   bool abs[3]; /* absolute-value modifier per source */
   bool neg[3]; /* negate modifier per source */
   uint8_t opsel : 4; /* operand select bits — presumably choose 16-bit halves; TODO confirm */
   uint8_t omod : 2;  /* output modifier */
   bool clamp : 1;
   uint8_t padding0 : 1;
   uint8_t padding1;
};
static_assert(sizeof(VOP3_instruction) == sizeof(Instruction) + 8, "Unexpected padding");
1424 
/* VOP3P: packed-math VALU encoding with per-half modifiers. */
struct VOP3P_instruction : public Instruction {
   bool neg_lo[3]; /* negate modifier for the low half of each source */
   bool neg_hi[3]; /* abs modifier, for v_mad_mix/v_fma_mix */
   uint8_t opsel_lo : 3;
   uint8_t opsel_hi : 3;
   bool clamp : 1;
   uint8_t padding0 : 1;
   uint8_t padding1;
};
static_assert(sizeof(VOP3P_instruction) == sizeof(Instruction) + 8, "Unexpected padding");
1435 
1436 /**
1437  * Data Parallel Primitives Format:
1438  * This format can be used for VOP1, VOP2 or VOPC instructions.
1439  * The swizzle applies to the src0 operand.
1440  *
1441  */
struct DPP16_instruction : public Instruction {
   bool abs[2]; /* absolute-value modifier per source */
   bool neg[2]; /* negate modifier per source */
   uint16_t dpp_ctrl; /* lane-swizzle control pattern */
   uint8_t row_mask : 4;
   uint8_t bank_mask : 4;
   bool bound_ctrl : 1;
   uint8_t padding : 7;
};
static_assert(sizeof(DPP16_instruction) == sizeof(Instruction) + 8, "Unexpected padding");
1452 
/* DPP8 swizzle: presumably selects the source lane for each lane within a
 * group of 8 — TODO confirm against the ISA encoding. */
struct DPP8_instruction : public Instruction {
   uint8_t lane_sel[8];
};
static_assert(sizeof(DPP8_instruction) == sizeof(Instruction) + 8, "Unexpected padding");
1457 
/* Describes which bytes of a dword register an SDWA-style operation reads or
 * writes: an access size (1/2/4 bytes), a byte offset within the dword, and
 * an optional sign-extension flag, all packed into a single byte. */
struct SubdwordSel {
   enum sdwa_sel : uint8_t {
      ubyte = 0x4,
      uword = 0x8,
      dword = 0x10,
      sext = 0x20,
      sbyte = ubyte | sext,
      sword = uword | sext,

      ubyte0 = ubyte,
      ubyte1 = ubyte | 1,
      ubyte2 = ubyte | 2,
      ubyte3 = ubyte | 3,
      sbyte0 = sbyte,
      sbyte1 = sbyte | 1,
      sbyte2 = sbyte | 2,
      sbyte3 = sbyte | 3,
      uword0 = uword,
      uword1 = uword | 2,
      sword0 = sword,
      sword1 = sword | 2,
   };

   SubdwordSel() : sel(sdwa_sel(0)) {}
   constexpr SubdwordSel(sdwa_sel sel_) : sel(sel_) {}
   /* Build a selection from its components: size in bytes (1/2/4), byte
    * offset within the dword, and whether the value is sign-extended. */
   constexpr SubdwordSel(unsigned size, unsigned offset, bool sign_extend)
       : sel(sdwa_sel((sign_extend ? sext : 0) | (size << 2) | offset))
   {}
   constexpr operator sdwa_sel() const { return sel; }
   /* A default-constructed (zero) selection is "no selection". */
   explicit operator bool() const { return sel != 0; }

   /* Access size in bytes (1, 2 or 4). */
   constexpr unsigned size() const { return (sel & 0x1c) >> 2; }
   /* Byte offset within the dword (0-3). */
   constexpr unsigned offset() const { return sel & 0x3; }
   constexpr bool sign_extend() const { return (sel & sext) != 0; }
   /* Translate to the hardware SDWA sel field value, given the sub-dword
    * byte offset of the register itself. */
   constexpr unsigned to_sdwa_sel(unsigned reg_byte_offset) const
   {
      const unsigned byte = reg_byte_offset + offset();
      switch (size()) {
      case 1: return byte;           /* BYTE_0..BYTE_3 */
      case 2: return 4 + (byte / 2); /* WORD_0..WORD_1 */
      default: return 6;             /* DWORD */
      }
   }

private:
   sdwa_sel sel;
};
1506 
1507 /**
1508  * Sub-Dword Addressing Format:
1509  * This format can be used for VOP1, VOP2 or VOPC instructions.
1510  *
1511  * omod and SGPR/constant operands are only available on GFX9+. For VOPC,
1512  * the definition doesn't have to be VCC on GFX9+.
1513  *
1514  */
struct SDWA_instruction : public Instruction {
   /* these destination modifiers aren't available with VOPC except for
    * clamp on GFX8 */
   SubdwordSel sel[2]; /* sub-dword selection for src0/src1 */
   SubdwordSel dst_sel; /* sub-dword selection for the destination */
   bool neg[2]; /* negate modifier per source */
   bool abs[2]; /* absolute-value modifier per source */
   bool clamp : 1;
   uint8_t omod : 2; /* GFX9+ */
   uint8_t padding : 5;
};
static_assert(sizeof(SDWA_instruction) == sizeof(Instruction) + 8, "Unexpected padding");
1527 
/* VINTRP: fragment-shader attribute interpolation. */
struct Interp_instruction : public Instruction {
   uint8_t attribute; /* index of the attribute to interpolate */
   uint8_t component; /* component within the attribute — presumably x/y/z/w; TODO confirm */
   uint16_t padding;
};
static_assert(sizeof(Interp_instruction) == sizeof(Instruction) + 4, "Unexpected padding");
1534 
1535 /**
1536  * Local and Global Data Sharing instructions
1537  * Operand(0): ADDR - VGPR which supplies the address.
1538  * Operand(1): DATA0 - First data VGPR.
1539  * Operand(2): DATA1 - Second data VGPR.
1540  * Operand(n-1): M0 - LDS size.
1541  * Definition(0): VDST - Destination VGPR when results returned to VGPRs.
1542  *
1543  */
struct DS_instruction : public Instruction {
   memory_sync_info sync;
   bool gds; /* operate on GDS (global data share) instead of LDS */
   /* immediate offsets; some DS opcodes take two (offset0/offset1) —
    * NOTE(review): confirm exact per-opcode encoding */
   uint16_t offset0;
   uint8_t offset1;
   uint8_t padding;
};
static_assert(sizeof(DS_instruction) == sizeof(Instruction) + 8, "Unexpected padding");
1552 
1553 /**
1554  * Vector Memory Untyped-buffer Instructions
1555  * Operand(0): SRSRC - Specifies which SGPR supplies T# (resource constant)
1556  * Operand(1): VADDR - Address source. Can carry an index and/or offset
1557  * Operand(2): SOFFSET - SGPR to supply unsigned byte offset. (SGPR, M0, or inline constant)
1558  * Operand(3) / Definition(0): VDATA - Vector GPR for write result / read data
1559  *
1560  */
struct MUBUF_instruction : public Instruction {
   memory_sync_info sync;
   bool offen : 1;           /* Supply an offset from VGPR (VADDR) */
   bool idxen : 1;           /* Supply an index from VGPR (VADDR) */
   bool addr64 : 1;          /* SI, CIK: Address size is 64-bit */
   bool glc : 1;             /* globally coherent */
   bool dlc : 1;             /* NAVI: device level coherent */
   bool slc : 1;             /* system level coherent */
   bool tfe : 1;             /* texture fail enable */
   bool lds : 1;             /* Return read-data to LDS instead of VGPRs */
   uint16_t disable_wqm : 1; /* Require an exec mask without helper invocations */
   uint16_t offset : 12;     /* Unsigned byte offset - 12 bit */
   uint16_t swizzled : 1;    /* buffer access is swizzled — TODO confirm exact semantics */
   uint16_t padding0 : 2;
   uint16_t vtx_binding : 6; /* 0 if this is not a vertex attribute load */
   uint16_t padding1 : 10;
};
static_assert(sizeof(MUBUF_instruction) == sizeof(Instruction) + 8, "Unexpected padding");
1579 
1580 /**
1581  * Vector Memory Typed-buffer Instructions
1582  * Operand(0): SRSRC - Specifies which SGPR supplies T# (resource constant)
1583  * Operand(1): VADDR - Address source. Can carry an index and/or offset
1584  * Operand(2): SOFFSET - SGPR to supply unsigned byte offset. (SGPR, M0, or inline constant)
1585  * Operand(3) / Definition(0): VDATA - Vector GPR for write result / read data
1586  *
1587  */
/* Bit-field layout is size-checked below; keep member types as-is. */
struct MTBUF_instruction : public Instruction {
   memory_sync_info sync;
   uint8_t dfmt : 4;         /* Data Format of data in memory buffer */
   uint8_t nfmt : 3;         /* Numeric format of data in memory */
   bool offen : 1;           /* Supply an offset from VGPR (VADDR) */
   uint16_t idxen : 1;       /* Supply an index from VGPR (VADDR) */
   uint16_t glc : 1;         /* globally coherent */
   uint16_t dlc : 1;         /* NAVI: device level coherent */
   uint16_t slc : 1;         /* system level coherent */
   uint16_t tfe : 1;         /* texture fail enable */
   uint16_t disable_wqm : 1; /* Require an exec mask without helper invocations */
   uint16_t vtx_binding : 6; /* 0 if this is not a vertex attribute load */
   uint16_t padding : 4;
   uint16_t offset; /* Unsigned byte offset - 12 bit */
};
static_assert(sizeof(MTBUF_instruction) == sizeof(Instruction) + 8, "Unexpected padding");
1604 
1605 /**
1606  * Vector Memory Image Instructions
1607  * Operand(0) SRSRC - Scalar GPR that specifies the resource constant.
1608  * Operand(1): SSAMP - Scalar GPR that specifies sampler constant.
1609  * Operand(2): VDATA - Vector GPR for write data or zero if TFE/LWE=1.
1610  * Operand(3): VADDR - Address source. Can carry an offset or an index.
1611  * Definition(0): VDATA - Vector GPR for read result.
1612  *
1613  */
/* Bit-field layout is size-checked below; keep member types as-is. */
struct MIMG_instruction : public Instruction {
   memory_sync_info sync;
   uint8_t dmask;        /* Data VGPR enable mask */
   uint8_t dim : 3;      /* NAVI: dimensionality */
   bool unrm : 1;        /* Force address to be un-normalized */
   bool dlc : 1;         /* NAVI: device level coherent */
   bool glc : 1;         /* globally coherent */
   bool slc : 1;         /* system level coherent */
   bool tfe : 1;         /* texture fail enable */
   bool da : 1;          /* declare an array */
   bool lwe : 1;         /* LOD warning enable */
   bool r128 : 1;        /* NAVI: Texture resource size */
   bool a16 : 1;         /* VEGA, NAVI: Address components are 16-bits */
   bool d16 : 1;         /* Convert 32-bit data to 16-bit data */
   bool disable_wqm : 1; /* Require an exec mask without helper invocations */
   uint8_t padding0 : 2;
   uint8_t padding1;
   uint8_t padding2;
};
static_assert(sizeof(MIMG_instruction) == sizeof(Instruction) + 8, "Unexpected padding");
1634 
1635 /**
1636  * Flat/Scratch/Global Instructions
1637  * Operand(0): ADDR
1638  * Operand(1): SADDR
1639  * Operand(2) / Definition(0): DATA/VDST
1640  *
1641  */
/* Shared by FLAT, GLOBAL and SCRATCH formats (see Instruction::flatlike()). */
struct FLAT_instruction : public Instruction {
   memory_sync_info sync;
   bool slc : 1; /* system level coherent */
   bool glc : 1; /* globally coherent */
   bool dlc : 1; /* NAVI: device level coherent */
   bool lds : 1; /* Return read-data to LDS instead of VGPRs */
   bool nv : 1;  /* Non-volatile */
   bool disable_wqm : 1; /* Require an exec mask without helper invocations */
   uint8_t padding0 : 2;
   int16_t offset; /* Vega/Navi only */
   uint16_t padding1;
};
static_assert(sizeof(FLAT_instruction) == sizeof(Instruction) + 8, "Unexpected padding");
1655 
/* EXP: export shader results (color/position/parameters). */
struct Export_instruction : public Instruction {
   uint8_t enabled_mask; /* per-component write-enable mask — TODO confirm bit order */
   uint8_t dest;         /* export target — presumably MRT/pos/param index; TODO confirm */
   bool compressed : 1;
   bool done : 1;
   bool valid_mask : 1;
   uint8_t padding0 : 5;
   uint8_t padding1;
};
static_assert(sizeof(Export_instruction) == sizeof(Instruction) + 4, "Unexpected padding");
1666 
struct Pseudo_instruction : public Instruction {
   PhysReg scratch_sgpr; /* might not be valid if it's not needed */
   bool tmp_in_scc; /* presumably: a temporary lives in scc and must be preserved — TODO confirm */
   uint8_t padding;
};
static_assert(sizeof(Pseudo_instruction) == sizeof(Instruction) + 4, "Unexpected padding");
1673 
/* Pseudo-instruction representing a branch between blocks. */
struct Pseudo_branch_instruction : public Instruction {
   /* target[0] is the block index of the branch target.
    * For conditional branches, target[1] contains the fall-through alternative.
    * A value of 0 means the target has not been initialized (BB0 cannot be a branch target).
    */
   uint32_t target[2];
};
static_assert(sizeof(Pseudo_branch_instruction) == sizeof(Instruction) + 8, "Unexpected padding");
1682 
/* Pseudo barrier: carries memory-synchronization info and the scope whose
 * execution must converge at the barrier. */
struct Pseudo_barrier_instruction : public Instruction {
   memory_sync_info sync;
   sync_scope exec_scope;
};
static_assert(sizeof(Pseudo_barrier_instruction) == sizeof(Instruction) + 4, "Unexpected padding");
1688 
/* Subgroup reduction operations: operation name followed by the operand bit
 * size (e.g. iadd32 = 32-bit integer add). Enumerator order/values are
 * significant — do not reorder. */
enum ReduceOp : uint16_t {
   // clang-format off
   iadd8, iadd16, iadd32, iadd64,
   imul8, imul16, imul32, imul64,
          fadd16, fadd32, fadd64,
          fmul16, fmul32, fmul64,
   imin8, imin16, imin32, imin64,
   imax8, imax16, imax32, imax64,
   umin8, umin16, umin32, umin64,
   umax8, umax16, umax32, umax64,
          fmin16, fmin32, fmin64,
          fmax16, fmax32, fmax64,
   iand8, iand16, iand32, iand64,
   ior8, ior16, ior32, ior64,
   ixor8, ixor16, ixor32, ixor64,
   num_reduce_ops,
   // clang-format on
};
1707 
/**
 * Subgroup Reduction Instructions, everything except for the data to be
 * reduced and the result as inserted by setup_reduce_temp().
 * Operand(0): data to be reduced
 * Operand(1): reduce temporary
 * Operand(2): vector temporary
 * Definition(0): result
 * Definition(1): scalar temporary
 * Definition(2): scalar identity temporary (not used to store identity on GFX10)
 * Definition(3): scc clobber
 * Definition(4): vcc clobber
 *
 */
struct Pseudo_reduction_instruction : public Instruction {
   ReduceOp reduce_op;
   uint16_t cluster_size; // must be 0 for scans
};
static_assert(sizeof(Pseudo_reduction_instruction) == sizeof(Instruction) + 4,
              "Unexpected padding");
1727 
/* Deleter for instructions: they are allocated with calloc() (see
 * create_instruction()), so they must be released with free() rather than
 * operator delete. */
struct instr_deleter_functor {
   void operator()(void* ptr) { free(ptr); }
};

/* Owning pointer type for instructions, pairing std::unique_ptr with the
 * free()-based deleter above. */
template <typename T> using aco_ptr = std::unique_ptr<T, instr_deleter_functor>;
1733 
/* Allocates and zero-initializes an instruction of type T with its operand and
 * definition arrays placed in the same allocation, immediately after the
 * instruction itself. The returned pointer must be released with free()
 * (see instr_deleter_functor / aco_ptr). */
template <typename T>
T*
create_instruction(aco_opcode opcode, Format format, uint32_t num_operands,
                   uint32_t num_definitions)
{
   /* One allocation: [T][Operand x num_operands][Definition x num_definitions]. */
   std::size_t size =
      sizeof(T) + num_operands * sizeof(Operand) + num_definitions * sizeof(Definition);
   /* NOTE(review): calloc result is not checked for NULL — allocation failure
    * would crash on the writes below. */
   char* data = (char*)calloc(1, size);
   T* inst = (T*)data;

   inst->opcode = opcode;
   inst->format = format;

   /* aco::span is constructed from a 16-bit offset relative to the span member
    * itself (see aco_util.h), so compute byte offsets from the trailing arrays
    * back to the operands/definitions members. */
   uint16_t operands_offset = data + sizeof(T) - (char*)&inst->operands;
   inst->operands = aco::span<Operand>(operands_offset, num_operands);
   uint16_t definitions_offset = (char*)inst->operands.end() - (char*)&inst->definitions;
   inst->definitions = aco::span<Definition>(definitions_offset, num_definitions);

   return inst;
}
1754 
/* Returns true if the instruction uses any instruction modifier:
 * DPP/SDWA encodings always count; VOP3P counts neg_lo/neg_hi/opsel_lo/clamp
 * and any opsel_hi bit that is zero; VOP3 counts abs/neg/opsel/clamp/omod. */
constexpr bool
Instruction::usesModifiers() const noexcept
{
   /* DPP and SDWA encodings are themselves modifiers. */
   if (isDPP() || isSDWA())
      return true;

   if (isVOP3P()) {
      const VOP3P_instruction& vop3p = this->vop3p();
      for (unsigned i = 0; i < operands.size(); i++) {
         if (vop3p.neg_lo[i] || vop3p.neg_hi[i])
            return true;

         /* opsel_hi must be 1 to not be considered a modifier - even for constants */
         if (!(vop3p.opsel_hi & (1 << i)))
            return true;
      }
      return vop3p.opsel_lo || vop3p.clamp;
   } else if (isVOP3()) {
      const VOP3_instruction& vop3 = this->vop3();
      for (unsigned i = 0; i < operands.size(); i++) {
         if (vop3.abs[i] || vop3.neg[i])
            return true;
      }
      return vop3.opsel || vop3.clamp || vop3.omod;
   }
   return false;
}
1782 
1783 constexpr bool
is_phi(Instruction * instr)1784 is_phi(Instruction* instr)
1785 {
1786    return instr->opcode == aco_opcode::p_phi || instr->opcode == aco_opcode::p_linear_phi;
1787 }
1788 
1789 static inline bool
is_phi(aco_ptr<Instruction> & instr)1790 is_phi(aco_ptr<Instruction>& instr)
1791 {
1792    return is_phi(instr.get());
1793 }
1794 
/* Returns the memory synchronization info for the given instruction. */
memory_sync_info get_sync_info(const Instruction* instr);

/* Whether the instruction can be removed given per-temp use counts in "uses". */
bool is_dead(const std::vector<uint16_t>& uses, Instruction* instr);

/* Encoding capability queries for the given hardware generation. */
bool can_use_opsel(amd_gfx_level gfx_level, aco_opcode op, int idx);
bool instr_is_16bit(amd_gfx_level gfx_level, aco_opcode op);
bool can_use_SDWA(amd_gfx_level gfx_level, const aco_ptr<Instruction>& instr, bool pre_ra);
bool can_use_DPP(const aco_ptr<Instruction>& instr, bool pre_ra, bool dpp8);
/* updates "instr" and returns the old instruction (or NULL if no update was needed) */
aco_ptr<Instruction> convert_to_SDWA(amd_gfx_level gfx_level, aco_ptr<Instruction>& instr);
aco_ptr<Instruction> convert_to_DPP(aco_ptr<Instruction>& instr, bool dpp8);
bool needs_exec_mask(const Instruction* instr);

/* Comparison-opcode transformations (ordered/unordered/inverted/... variants). */
aco_opcode get_ordered(aco_opcode op);
aco_opcode get_unordered(aco_opcode op);
aco_opcode get_inverse(aco_opcode op);
aco_opcode get_f32_cmp(aco_opcode op);
aco_opcode get_vcmpx(aco_opcode op);
unsigned get_cmp_bitsize(aco_opcode op);
bool is_cmp(aco_opcode op);

/* Whether the instruction's first two operands can be swapped; may set *new_op
 * to the opcode needed after the swap. */
bool can_swap_operands(aco_ptr<Instruction>& instr, aco_opcode* new_op);

/* Identity element of the given reduction op (dword index "idx" for 64-bit ops). */
uint32_t get_reduction_identity(ReduceOp op, unsigned idx);

unsigned get_mimg_nsa_dwords(const Instruction* instr);

/* Whether two memory instructions should be grouped into a clause. */
bool should_form_clause(const Instruction* a, const Instruction* b);
1823 
/* Bitmask of properties of a basic block. */
enum block_kind {
   /* uniform indicates that leaving this block,
    * all active lanes stay active */
   block_kind_uniform = 1 << 0,
   block_kind_top_level = 1 << 1,
   block_kind_loop_preheader = 1 << 2,
   block_kind_loop_header = 1 << 3,
   block_kind_loop_exit = 1 << 4,
   block_kind_continue = 1 << 5,
   block_kind_break = 1 << 6,
   block_kind_continue_or_break = 1 << 7,
   block_kind_branch = 1 << 8,
   block_kind_merge = 1 << 9,
   block_kind_invert = 1 << 10,
   /* NOTE(review): bits 1 << 11 and 1 << 14 are unused — presumably left by
    * removed kinds; confirm before reusing them. */
   block_kind_uses_discard = 1 << 12,
   block_kind_needs_lowering = 1 << 13,
   block_kind_export_end = 1 << 15,
};
1842 
1843 struct RegisterDemand {
1844    constexpr RegisterDemand() = default;
RegisterDemandRegisterDemand1845    constexpr RegisterDemand(const int16_t v, const int16_t s) noexcept : vgpr{v}, sgpr{s} {}
1846    int16_t vgpr = 0;
1847    int16_t sgpr = 0;
1848 
1849    constexpr friend bool operator==(const RegisterDemand a, const RegisterDemand b) noexcept
1850    {
1851       return a.vgpr == b.vgpr && a.sgpr == b.sgpr;
1852    }
1853 
exceedsRegisterDemand1854    constexpr bool exceeds(const RegisterDemand other) const noexcept
1855    {
1856       return vgpr > other.vgpr || sgpr > other.sgpr;
1857    }
1858 
1859    constexpr RegisterDemand operator+(const Temp t) const noexcept
1860    {
1861       if (t.type() == RegType::sgpr)
1862          return RegisterDemand(vgpr, sgpr + t.size());
1863       else
1864          return RegisterDemand(vgpr + t.size(), sgpr);
1865    }
1866 
1867    constexpr RegisterDemand operator+(const RegisterDemand other) const noexcept
1868    {
1869       return RegisterDemand(vgpr + other.vgpr, sgpr + other.sgpr);
1870    }
1871 
1872    constexpr RegisterDemand operator-(const RegisterDemand other) const noexcept
1873    {
1874       return RegisterDemand(vgpr - other.vgpr, sgpr - other.sgpr);
1875    }
1876 
1877    constexpr RegisterDemand& operator+=(const RegisterDemand other) noexcept
1878    {
1879       vgpr += other.vgpr;
1880       sgpr += other.sgpr;
1881       return *this;
1882    }
1883 
1884    constexpr RegisterDemand& operator-=(const RegisterDemand other) noexcept
1885    {
1886       vgpr -= other.vgpr;
1887       sgpr -= other.sgpr;
1888       return *this;
1889    }
1890 
1891    constexpr RegisterDemand& operator+=(const Temp t) noexcept
1892    {
1893       if (t.type() == RegType::sgpr)
1894          sgpr += t.size();
1895       else
1896          vgpr += t.size();
1897       return *this;
1898    }
1899 
1900    constexpr RegisterDemand& operator-=(const Temp t) noexcept
1901    {
1902       if (t.type() == RegType::sgpr)
1903          sgpr -= t.size();
1904       else
1905          vgpr -= t.size();
1906       return *this;
1907    }
1908 
updateRegisterDemand1909    constexpr void update(const RegisterDemand other) noexcept
1910    {
1911       vgpr = std::max(vgpr, other.vgpr);
1912       sgpr = std::max(sgpr, other.sgpr);
1913    }
1914 };
1915 
/* CFG */
struct Block {
   float_mode fp_mode;                /* floating-point mode active in this block */
   unsigned index;                    /* position of this block in Program::blocks */
   unsigned offset = 0;
   std::vector<aco_ptr<Instruction>> instructions;
   /* separate logical (divergent control flow) and linear CFG edges */
   std::vector<unsigned> logical_preds;
   std::vector<unsigned> linear_preds;
   std::vector<unsigned> logical_succs;
   std::vector<unsigned> linear_succs;
   RegisterDemand register_demand = RegisterDemand();
   uint16_t loop_nest_depth = 0;
   uint16_t divergent_if_logical_depth = 0;
   uint16_t uniform_if_depth = 0;
   uint16_t kind = 0;                 /* bitmask of block_kind flags */
   int logical_idom = -1;             /* immediate dominator indices; -1 if unset */
   int linear_idom = -1;

   /* this information is needed for predecessors to blocks with phis when
    * moving out of ssa */
   bool scc_live_out = false;

   Block() : index(0) {}
};
1940 
/*
 * Shader stages as provided in Vulkan by the application. Contrast this to HWStage.
 */
enum class SWStage : uint16_t {
   None = 0,
   VS = 1 << 0,     /* Vertex Shader */
   GS = 1 << 1,     /* Geometry Shader */
   TCS = 1 << 2,    /* Tessellation Control aka Hull Shader */
   TES = 1 << 3,    /* Tessellation Evaluation aka Domain Shader */
   FS = 1 << 4,     /* Fragment aka Pixel Shader */
   CS = 1 << 5,     /* Compute Shader */
   TS = 1 << 6,     /* Task Shader */
   MS = 1 << 7,     /* Mesh Shader */
   GSCopy = 1 << 8, /* GS Copy Shader (internal) */

   /* Stage combinations merged to run on a single HWStage */
   VS_GS = VS | GS,
   VS_TCS = VS | TCS,
   TES_GS = TES | GS,
};

/* Combine two stage masks into one (SWStage values are bit flags). */
constexpr SWStage
operator|(SWStage a, SWStage b)
{
   const auto a_bits = static_cast<uint16_t>(a);
   const auto b_bits = static_cast<uint16_t>(b);
   return static_cast<SWStage>(a_bits | b_bits);
}
1967 
/*
 * Shader stages as running on the AMD GPU.
 *
 * The relation between HWStages and SWStages is not a one-to-one mapping:
 * Some SWStages are merged by ACO to run on a single HWStage.
 * See README.md for details.
 */
enum class HWStage : uint8_t {
   VS,
   ES, /* Export shader: pre-GS (VS or TES) on GFX6-8. Combined into GS on GFX9 (and GFX10/legacy). */
   GS,  /* Geometry shader on GFX10/legacy and GFX6-9. */
   NGG, /* Primitive shader, used to implement VS, TES, GS. */
   LS,  /* Local shader: pre-TCS (VS) on GFX6-8. Combined into HS on GFX9 (and GFX10/legacy). */
   HS,  /* Hull shader: TCS on GFX6-8. Merged VS and TCS on GFX9-10. */
   FS,
   CS,
};
1985 
1986 /*
1987  * Set of SWStages to be merged into a single shader paired with the
1988  * HWStage it will run on.
1989  */
1990 struct Stage {
1991    constexpr Stage() = default;
1992 
StageStage1993    explicit constexpr Stage(HWStage hw_, SWStage sw_) : sw(sw_), hw(hw_) {}
1994 
1995    /* Check if the given SWStage is included */
hasStage1996    constexpr bool has(SWStage stage) const
1997    {
1998       return (static_cast<uint16_t>(sw) & static_cast<uint16_t>(stage));
1999    }
2000 
num_sw_stagesStage2001    unsigned num_sw_stages() const { return util_bitcount(static_cast<uint16_t>(sw)); }
2002 
2003    constexpr bool operator==(const Stage& other) const { return sw == other.sw && hw == other.hw; }
2004 
2005    constexpr bool operator!=(const Stage& other) const { return sw != other.sw || hw != other.hw; }
2006 
2007    /* Mask of merged software stages */
2008    SWStage sw = SWStage::None;
2009 
2010    /* Active hardware stage */
2011    HWStage hw{};
2012 };
2013 
/* possible settings of Program::stage */
static constexpr Stage vertex_vs(HWStage::VS, SWStage::VS);
static constexpr Stage fragment_fs(HWStage::FS, SWStage::FS);
static constexpr Stage compute_cs(HWStage::CS, SWStage::CS);
static constexpr Stage tess_eval_vs(HWStage::VS, SWStage::TES);
static constexpr Stage gs_copy_vs(HWStage::VS, SWStage::GSCopy);
/* Mesh shading pipeline */
static constexpr Stage task_cs(HWStage::CS, SWStage::TS);
static constexpr Stage mesh_ngg(HWStage::NGG, SWStage::MS);
/* GFX10/NGG */
static constexpr Stage vertex_ngg(HWStage::NGG, SWStage::VS);
static constexpr Stage vertex_geometry_ngg(HWStage::NGG, SWStage::VS_GS);
static constexpr Stage tess_eval_ngg(HWStage::NGG, SWStage::TES);
static constexpr Stage tess_eval_geometry_ngg(HWStage::NGG, SWStage::TES_GS);
/* GFX9 (and GFX10 if NGG isn't used) */
static constexpr Stage vertex_geometry_gs(HWStage::GS, SWStage::VS_GS);
static constexpr Stage vertex_tess_control_hs(HWStage::HS, SWStage::VS_TCS);
static constexpr Stage tess_eval_geometry_gs(HWStage::GS, SWStage::TES_GS);
/* pre-GFX9 */
static constexpr Stage vertex_ls(HWStage::LS, SWStage::VS); /* vertex before tessellation control */
static constexpr Stage vertex_es(HWStage::ES, SWStage::VS); /* vertex before geometry */
static constexpr Stage tess_control_hs(HWStage::HS, SWStage::TCS);
static constexpr Stage tess_eval_es(HWStage::ES,
                                    SWStage::TES); /* tessellation evaluation before geometry */
static constexpr Stage geometry_gs(HWStage::GS, SWStage::GS);
2039 
/* Indices into Program::statistics, collected when collect_statistics is set. */
enum statistic {
   statistic_hash,
   statistic_instructions,
   statistic_copies,
   statistic_branches,
   statistic_latency,
   statistic_inv_throughput,
   statistic_vmem_clauses,
   statistic_smem_clauses,
   statistic_sgpr_presched,
   statistic_vgpr_presched,
   num_statistics
};
2053 
/* Hardware limits and feature flags of the target device. */
struct DeviceInfo {
   uint16_t lds_encoding_granule;
   uint16_t lds_alloc_granule;
   uint32_t lds_limit; /* in bytes */
   bool has_16bank_lds;
   uint16_t physical_sgprs;
   uint16_t physical_vgprs;
   uint16_t vgpr_limit;             /* addressable VGPRs */
   uint16_t sgpr_limit;             /* addressable SGPRs */
   uint16_t sgpr_alloc_granule;
   uint16_t vgpr_alloc_granule; /* must be power of two */
   unsigned max_wave64_per_simd;
   unsigned simd_per_cu;
   bool has_fast_fma32 = false;
   bool has_mac_legacy32 = false;
   bool fused_mad_mix = false;
   bool xnack_enabled = false;
   bool sram_ecc_enabled = false;

   /* valid range of the offset field for scratch/global memory instructions */
   int16_t scratch_global_offset_min;
   int16_t scratch_global_offset_max;
};
2076 
/* How far compilation has progressed; used e.g. for validation and error reporting. */
enum class CompilationProgress {
   after_isel,
   after_spilling,
   after_ra,
};
2082 
/* Top-level container for a shader being compiled: the CFG, per-temp register
 * classes, target/device information and compilation state. */
class Program final {
public:
   std::vector<Block> blocks;
   /* register class per temporary, indexed by temp id; id 0 is pre-populated */
   std::vector<RegClass> temp_rc = {s1};
   RegisterDemand max_reg_demand = RegisterDemand();
   ac_shader_config* config;
   struct aco_shader_info info;
   enum amd_gfx_level gfx_level;
   enum radeon_family family;
   DeviceInfo dev;
   unsigned wave_size;
   RegClass lane_mask;              /* s1 or s2 depending on wave size */
   Stage stage;
   bool needs_exact = false; /* there exists an instruction with disable_wqm = true */
   bool needs_wqm = false;   /* there exists a p_wqm instruction */

   std::vector<uint8_t> constant_data;
   Temp private_segment_buffer;
   Temp scratch_offset;

   uint16_t num_waves = 0;
   uint16_t min_waves = 0;
   unsigned workgroup_size; /* if known; otherwise UINT_MAX */
   bool wgp_mode;
   bool early_rast = false; /* whether rasterization can start as soon as the 1st DONE pos export */

   bool needs_vcc = false;

   CompilationProgress progress;

   bool collect_statistics = false;
   uint32_t statistics[num_statistics];

   /* state applied to blocks created by insert_block() */
   float_mode next_fp_mode;
   unsigned next_loop_depth = 0;
   unsigned next_divergent_if_logical_depth = 0;
   unsigned next_uniform_if_depth = 0;

   std::vector<Definition> vs_inputs;

   /* callback and stream for compiler diagnostics */
   struct {
      FILE* output = stderr;
      bool shorten_messages = false;
      void (*func)(void* private_data, enum aco_compiler_debug_level level, const char* message);
      void* private_data;
   } debug;

   /* Allocates a new temporary id with the given register class.
    * Temp ids are limited to 24 bits (see the 16777215/16777216 bounds). */
   uint32_t allocateId(RegClass rc)
   {
      assert(allocationID <= 16777215);
      temp_rc.push_back(rc);
      return allocationID++;
   }

   /* Reserves "amount" consecutive temp ids; their register classes in temp_rc
    * are default-initialized and must be filled in by the caller. */
   void allocateRange(unsigned amount)
   {
      assert(allocationID + amount <= 16777216);
      temp_rc.resize(temp_rc.size() + amount);
      allocationID += amount;
   }

   Temp allocateTmp(RegClass rc) { return Temp(allocateId(rc), rc); }

   /* Next id that allocateId() would return, without allocating it. */
   uint32_t peekAllocationId() { return allocationID; }

   friend void reindex_ssa(Program* program);
   friend void reindex_ssa(Program* program, std::vector<IDSet>& live_out);

   Block* create_and_insert_block()
   {
      Block block;
      return insert_block(std::move(block));
   }

   /* Appends the block, assigning its index and inheriting the next_* state. */
   Block* insert_block(Block&& block)
   {
      block.index = blocks.size();
      block.fp_mode = next_fp_mode;
      block.loop_nest_depth = next_loop_depth;
      block.divergent_if_logical_depth = next_divergent_if_logical_depth;
      block.uniform_if_depth = next_uniform_if_depth;
      blocks.emplace_back(std::move(block));
      return &blocks.back();
   }

private:
   uint32_t allocationID = 1;
};
2171 
/* Result of liveness analysis. */
struct live {
   /* live temps out per block */
   std::vector<IDSet> live_out;
   /* register demand (sgpr/vgpr) per instruction per block */
   std::vector<std::vector<RegisterDemand>> register_demand;
};
2178 
/* Options used by tests to steer register allocation. */
struct ra_test_policy {
   /* Force RA to always use its pessimistic fallback algorithm */
   bool skip_optimistic_path = false;
};
2183 
/* One-time global initialization (e.g. of the debug flags). */
void init();

void init_program(Program* program, Stage stage, const struct aco_shader_info* info,
                  enum amd_gfx_level gfx_level, enum radeon_family family, bool wgp_mode,
                  ac_shader_config* config);

/* instruction selection entry points (NIR -> ACO IR) */
void select_program(Program* program, unsigned shader_count, struct nir_shader* const* shaders,
                    ac_shader_config* config, const struct aco_compiler_options* options,
                    const struct aco_shader_info* info,
                    const struct radv_shader_args* args);
void select_gs_copy_shader(Program* program, struct nir_shader* gs_shader, ac_shader_config* config,
                           const struct aco_compiler_options* options,
                           const struct aco_shader_info* info,
                           const struct radv_shader_args* args);
void select_trap_handler_shader(Program* program, struct nir_shader* shader,
                                ac_shader_config* config,
                                const struct aco_compiler_options* options,
                                const struct aco_shader_info* info,
                                const struct radv_shader_args* args);
void select_vs_prolog(Program* program, const struct aco_vs_prolog_key* key,
                      ac_shader_config* config,
                      const struct aco_compiler_options* options,
                      const struct aco_shader_info* info,
                      const struct radv_shader_args* args,
                      unsigned* num_preserved_sgprs);

void select_ps_epilog(Program* program, const struct aco_ps_epilog_key* key,
                      ac_shader_config* config,
                      const struct aco_compiler_options* options,
                      const struct aco_shader_info* info,
                      const struct radv_shader_args* args);

/* compilation passes, roughly in pipeline order */
void lower_phis(Program* program);
void calc_min_waves(Program* program);
void update_vgpr_sgpr_demand(Program* program, const RegisterDemand new_demand);
live live_var_analysis(Program* program);
std::vector<uint16_t> dead_code_analysis(Program* program);
void dominator_tree(Program* program);
void insert_exec_mask(Program* program);
void value_numbering(Program* program);
void optimize(Program* program);
void optimize_postRA(Program* program);
void setup_reduce_temp(Program* program);
void lower_to_cssa(Program* program, live& live_vars);
void register_allocation(Program* program, std::vector<IDSet>& live_out_per_block,
                         ra_test_policy = {});
void ssa_elimination(Program* program);
void lower_to_hw_instr(Program* program);
void schedule_program(Program* program, live& live_vars);
void spill(Program* program, live& live_vars);
void insert_wait_states(Program* program);
void insert_NOPs(Program* program);
void form_hard_clauses(Program* program);
/* Final binary emission; returns the size of the emitted code. */
unsigned emit_program(Program* program, std::vector<uint32_t>& code);
/**
 * Returns true if print_asm can disassemble the given program for the current build/runtime
 * configuration
 */
bool check_print_asm_support(Program* program);
bool print_asm(Program* program, std::vector<uint32_t>& binary, unsigned exec_size, FILE* output);
/* IR/RA validation; report errors via the program's debug callback. */
bool validate_ir(Program* program);
bool validate_ra(Program* program);
#ifndef NDEBUG
void perfwarn(Program* program, bool cond, const char* msg, Instruction* instr = NULL);
#else
/* no-op in release builds */
#define perfwarn(program, cond, msg, ...)                                                          \
   do {                                                                                            \
   } while (0)
#endif

/* statistics collection at various points of the pipeline */
void collect_presched_stats(Program* program);
void collect_preasm_stats(Program* program);
void collect_postasm_stats(Program* program, const std::vector<uint32_t>& code);
2257 
/* Bitmask flags for the aco_print_* functions. */
enum print_flags {
   print_no_ssa = 0x1,
   print_perf_info = 0x2,
   print_kill = 0x4,
   print_live_vars = 0x8,
};

/* IR pretty-printers; "flags" is a mask of print_flags. */
void aco_print_operand(const Operand* operand, FILE* output, unsigned flags = 0);
void aco_print_instr(const Instruction* instr, FILE* output, unsigned flags = 0);
void aco_print_program(const Program* program, FILE* output, unsigned flags = 0);
void aco_print_program(const Program* program, FILE* output, const live& live_vars,
                       unsigned flags = 0);

/* printf-style diagnostics; prefer the aco_perfwarn/aco_err macros below,
 * which fill in file and line automatically */
void _aco_perfwarn(Program* program, const char* file, unsigned line, const char* fmt, ...);
void _aco_err(Program* program, const char* file, unsigned line, const char* fmt, ...);

#define aco_perfwarn(program, ...) _aco_perfwarn(program, __FILE__, __LINE__, __VA_ARGS__)
#define aco_err(program, ...)      _aco_err(program, __FILE__, __LINE__, __VA_ARGS__)
2276 
/* utilities for dealing with register demand */
RegisterDemand get_live_changes(aco_ptr<Instruction>& instr);
RegisterDemand get_temp_registers(aco_ptr<Instruction>& instr);
RegisterDemand get_demand_before(RegisterDemand demand, aco_ptr<Instruction>& instr,
                                 aco_ptr<Instruction>& instr_before);

/* number of sgprs that need to be allocated but might not be addressable as s0-s105 */
uint16_t get_extra_sgprs(Program* program);

/* adjust num_waves for workgroup size and LDS limits */
uint16_t max_suitable_waves(Program* program, uint16_t waves);

/* get number of sgprs/vgprs allocated required to address a number of sgprs/vgprs */
uint16_t get_sgpr_alloc(Program* program, uint16_t addressable_sgprs);
uint16_t get_vgpr_alloc(Program* program, uint16_t addressable_vgprs);

/* return number of addressable sgprs/vgprs for max_waves */
uint16_t get_addr_sgpr_from_waves(Program* program, uint16_t max_waves);
uint16_t get_addr_vgpr_from_waves(Program* program, uint16_t max_waves);
2296 
/* Static per-opcode tables (hardware encodings, capabilities, names, formats),
 * indexed by aco_opcode. */
typedef struct {
   /* hardware opcode per gfx generation; presumably negative means
    * "unsupported on that generation" — verify against the table generator */
   const int16_t opcode_gfx7[static_cast<int>(aco_opcode::num_opcodes)];
   const int16_t opcode_gfx9[static_cast<int>(aco_opcode::num_opcodes)];
   const int16_t opcode_gfx10[static_cast<int>(aco_opcode::num_opcodes)];
   const std::bitset<static_cast<int>(aco_opcode::num_opcodes)> can_use_input_modifiers;
   const std::bitset<static_cast<int>(aco_opcode::num_opcodes)> can_use_output_modifiers;
   const std::bitset<static_cast<int>(aco_opcode::num_opcodes)> is_atomic;
   const char* name[static_cast<int>(aco_opcode::num_opcodes)];
   const aco::Format format[static_cast<int>(aco_opcode::num_opcodes)];
   /* sizes used for input/output modifiers and constants */
   const unsigned operand_size[static_cast<int>(aco_opcode::num_opcodes)];
   const instr_class classes[static_cast<int>(aco_opcode::num_opcodes)];
} Info;

extern const Info instr_info;
2312 
2313 } // namespace aco
2314 
2315 #endif /* ACO_IR_H */
2316