1 /*
2 * Copyright © 2018 Valve Corporation
3 *
4 * SPDX-License-Identifier: MIT
5 */
6
7 #ifndef ACO_IR_H
8 #define ACO_IR_H
9
10 #include "aco_opcodes.h"
11 #include "aco_shader_info.h"
12 #include "aco_util.h"
13
14 #include "util/compiler.h"
15
16 #include "ac_binary.h"
17 #include "ac_hw_stage.h"
18 #include "ac_shader_debug_info.h"
19 #include "ac_shader_util.h"
20 #include "amd_family.h"
21 #include <algorithm>
22 #include <bitset>
23 #include <memory>
24 #include <vector>
25
26 typedef struct nir_shader nir_shader;
27
28 namespace aco {
29
30 extern uint64_t debug_flags;
31
/* Bitmask values for the global debug_flags variable above; each flag
 * enables extra validation, disables an optimization pass, or turns on
 * additional output. */
enum {
   DEBUG_VALIDATE_IR = 0x1,
   DEBUG_VALIDATE_RA = 0x2,
   DEBUG_VALIDATE_LIVE_VARS = 0x4,
   DEBUG_FORCE_WAITCNT = 0x8,
   DEBUG_NO_VN = 0x10,
   DEBUG_NO_OPT = 0x20,
   DEBUG_NO_SCHED = 0x40,
   DEBUG_PERF_INFO = 0x80,
   DEBUG_LIVE_INFO = 0x100,
   DEBUG_FORCE_WAITDEPS = 0x200,
   DEBUG_NO_VALIDATE_IR = 0x400,
   DEBUG_NO_SCHED_ILP = 0x800,
   DEBUG_NO_SCHED_VOPD = 0x1000,
};
47
/* Bitmask identifying which kind(s) of memory an access touches; used by
 * memory_sync_info to decide which accesses may alias/interact. */
enum storage_class : uint8_t {
   storage_none = 0x0,   /* no synchronization and can be reordered around aliasing stores */
   storage_buffer = 0x1, /* SSBOs and global memory */
   storage_gds = 0x2,
   storage_image = 0x4,
   storage_shared = 0x8,       /* or TCS output */
   storage_vmem_output = 0x10, /* GS or TCS output stores using VMEM */
   storage_task_payload = 0x20,/* Task-Mesh payload */
   storage_scratch = 0x40,
   storage_vgpr_spill = 0x80,
   storage_count = 8, /* number of storage bits above, not counting storage_none */
};
60
/* Ordering/visibility semantics of a memory access or barrier. */
enum memory_semantics : uint8_t {
   semantic_none = 0x0,
   /* for loads: don't move any access after this load to before this load (even other loads)
    * for barriers: don't move any access after the barrier to before any
    * atomics/control_barriers/sendmsg_gs_done/position-primitive-export before the barrier */
   semantic_acquire = 0x1,
   /* for stores: don't move any access before this store to after this store
    * for barriers: don't move any access before the barrier to after any
    * atomics/control_barriers/sendmsg_gs_done/position-primitive-export after the barrier */
   semantic_release = 0x2,

   /* the rest are for load/stores/atomics only */
   /* cannot be DCE'd or CSE'd */
   semantic_volatile = 0x4,
   /* does not interact with barriers and assumes this lane is the only lane
    * accessing this memory */
   semantic_private = 0x8,
   /* this operation can be reordered around operations of the same storage.
    * says nothing about barriers */
   semantic_can_reorder = 0x10,
   /* this is an atomic instruction (may only read or write memory) */
   semantic_atomic = 0x20,
   /* this instruction both reads and writes memory */
   semantic_rmw = 0x40,

   semantic_acqrel = semantic_acquire | semantic_release,
   semantic_atomicrmw = semantic_volatile | semantic_atomic | semantic_rmw,
};
89
/* Synchronization scope of an access/barrier, ordered from narrowest
 * (single invocation) to widest (whole device). */
enum sync_scope : uint8_t {
   scope_invocation = 0,
   scope_subgroup = 1,
   scope_workgroup = 2,
   scope_queuefamily = 3,
   scope_device = 4,
};
97
98 struct memory_sync_info {
memory_sync_infomemory_sync_info99 memory_sync_info() : storage(storage_none), semantics(semantic_none), scope(scope_invocation) {}
100 memory_sync_info(int storage_, int semantics_ = 0, sync_scope scope_ = scope_invocation)
101 : storage((storage_class)storage_), semantics((memory_semantics)semantics_), scope(scope_)
102 {}
103
104 storage_class storage : 8;
105 memory_semantics semantics : 8;
106 sync_scope scope : 8;
107
108 bool operator==(const memory_sync_info& rhs) const
109 {
110 return storage == rhs.storage && semantics == rhs.semantics && scope == rhs.scope;
111 }
112
can_reordermemory_sync_info113 bool can_reorder() const
114 {
115 if (semantics & semantic_acqrel)
116 return false;
117 /* Also check storage so that zero-initialized memory_sync_info can be
118 * reordered. */
119 return (!storage || (semantics & semantic_can_reorder)) && !(semantics & semantic_volatile);
120 }
121 };
122 static_assert(sizeof(memory_sync_info) == 3, "Unexpected padding");
123
/* Rounding modes, matching the hardware MODE register encoding. */
enum fp_round {
   fp_round_ne = 0, /* round to nearest even */
   fp_round_pi = 1, /* round towards +infinity */
   fp_round_ni = 2, /* round towards -infinity */
   fp_round_tz = 3, /* round towards zero */
};

/* Denormal handling, matching the hardware MODE register encoding. */
enum fp_denorm {
   /* Note that v_rcp_f32, v_exp_f32, v_log_f32, v_sqrt_f32, v_rsq_f32 and
    * v_mad_f32/v_madak_f32/v_madmk_f32/v_mac_f32 always flush denormals. */
   fp_denorm_flush = 0x0,
   fp_denorm_keep_in = 0x1,
   fp_denorm_keep_out = 0x2,
   fp_denorm_keep = 0x3, /* keep_in | keep_out */
};

/* The float controls a shader (or part of it) executes under: the MODE
 * register fields plus flags describing which behaviors must be preserved. */
struct float_mode {
   /* matches encoding of the MODE register */
   union {
      struct {
         fp_round round32 : 2;
         fp_round round16_64 : 2;
         unsigned denorm32 : 2;
         unsigned denorm16_64 : 2;
      };
      struct {
         uint8_t round : 4;
         uint8_t denorm : 4;
      };
      uint8_t val = 0;
   };
   /* if false, optimizations which may remove denormal flushing can be done */
   bool must_flush_denorms32 : 1;
   bool must_flush_denorms16_64 : 1;
   bool care_about_round32 : 1;
   bool care_about_round16_64 : 1;

   /* Returns true if instructions using the mode "other" can safely use the
    * current one instead: the MODE bits must match exactly and every
    * behavior "other" requires must also be required here. */
   bool canReplace(float_mode other) const noexcept
   {
      if (val != other.val)
         return false;
      if (other.must_flush_denorms32 && !must_flush_denorms32)
         return false;
      if (other.must_flush_denorms16_64 && !must_flush_denorms16_64)
         return false;
      if (other.care_about_round32 && !care_about_round32)
         return false;
      return !other.care_about_round16_64 || care_about_round16_64;
   }
};
171
/* The hardware wait counters, used as indices into wait_imm (the member
 * layout of wait_imm must match this enum's order). */
enum wait_type {
   wait_type_exp = 0,
   wait_type_lgkm = 1,
   wait_type_vm = 2,
   /* GFX10+ */
   wait_type_vs = 3,
   /* GFX12+ */
   wait_type_sample = 4,
   wait_type_bvh = 5,
   wait_type_km = 6,
   wait_type_num = 7,
};
184
185 struct Instruction;
186 class Builder;
187
/* A set of wait-counter values, one per wait_type. unset_counter means no
 * wait is needed for that counter. Most methods are defined out of line. */
struct wait_imm {
   static const uint8_t unset_counter = 0xff;

   /* One member per hardware counter; the order must match the wait_type
    * enum because operator[] indexes these bytes directly (checked by the
    * static_asserts below). */
   uint8_t exp;
   uint8_t lgkm;
   uint8_t vm;
   uint8_t vs;
   uint8_t sample;
   uint8_t bvh;
   uint8_t km;

   wait_imm();
   wait_imm(uint16_t vm_, uint16_t exp_, uint16_t lgkm_, uint16_t vs_);

   /* Encode the counters into a hardware wait immediate for the given chip. */
   uint16_t pack(enum amd_gfx_level chip) const;

   /* wait_imm with every counter at its maximum value for the gfx level. */
   static wait_imm max(enum amd_gfx_level gfx_level);

   /* Extract counter values from a wait instruction; see the definition for
    * the exact return semantics. */
   bool unpack(enum amd_gfx_level gfx_level, const Instruction* instr);

   /* Merge the counters of `other` into this wait_imm. */
   bool combine(const wait_imm& other);

   bool empty() const;

   void print(FILE* output) const;

   void build_waitcnt(Builder& bld);

   /* Index the counters by wait_type. Relies on the members above being laid
    * out contiguously in enum order. */
   uint8_t& operator[](size_t i)
   {
      assert(i < wait_type_num);
      return *((uint8_t*)this + i);
   }

   const uint8_t& operator[](size_t i) const
   {
      assert(i < wait_type_num);
      return *((uint8_t*)this + i);
   }
};
/* Guarantee the member layout that operator[] depends on. */
static_assert(offsetof(wait_imm, exp) == wait_type_exp);
static_assert(offsetof(wait_imm, lgkm) == wait_type_lgkm);
static_assert(offsetof(wait_imm, vm) == wait_type_vm);
static_assert(offsetof(wait_imm, vs) == wait_type_vs);
static_assert(offsetof(wait_imm, sample) == wait_type_sample);
static_assert(offsetof(wait_imm, bvh) == wait_type_bvh);
static_assert(offsetof(wait_imm, km) == wait_type_km);
235
/* s_wait_event immediate bits. Note the polarity differs between GFX11
 * (bit set = don't wait) and GFX12 (bit set = wait). */
enum wait_event_imm : uint16_t {
   /* If this bit is 0, await that the export buffer space has been allocated.
    * In Primitive Ordered Pixel Shading, export ready means that the overlapped waves have exited
    * their ordered sections (by performing the `done` export), and that the current wave may enter
    * its ordered section.
    */
   wait_event_imm_dont_wait_export_ready_gfx11 = 0x1,
   wait_event_imm_wait_export_ready_gfx12 = 0x2,
};
246
247 constexpr Format
asVOP3(Format format)248 asVOP3(Format format)
249 {
250 return (Format)((uint32_t)Format::VOP3 | (uint32_t)format);
251 };
252
253 constexpr Format
asSDWA(Format format)254 asSDWA(Format format)
255 {
256 assert(format == Format::VOP1 || format == Format::VOP2 || format == Format::VOPC);
257 return (Format)((uint32_t)Format::SDWA | (uint32_t)format);
258 }
259
260 constexpr Format
withoutDPP(Format format)261 withoutDPP(Format format)
262 {
263 return (Format)((uint32_t)format & ~((uint32_t)Format::DPP16 | (uint32_t)Format::DPP8));
264 }
265
266 constexpr Format
withoutVOP3(Format format)267 withoutVOP3(Format format)
268 {
269 return (Format)((uint32_t)format & ~((uint32_t)Format::VOP3));
270 }
271
/* The two register files: scalar (SGPR) and vector (VGPR). */
enum class RegType {
   sgpr,
   vgpr,
};
276
277 struct RegClass {
278
279 enum RC : uint8_t {
280 s1 = 1,
281 s2 = 2,
282 s3 = 3,
283 s4 = 4,
284 s6 = 6,
285 s8 = 8,
286 s16 = 16,
287 v1 = s1 | (1 << 5),
288 v2 = s2 | (1 << 5),
289 v3 = s3 | (1 << 5),
290 v4 = s4 | (1 << 5),
291 v5 = 5 | (1 << 5),
292 v6 = 6 | (1 << 5),
293 v7 = 7 | (1 << 5),
294 v8 = 8 | (1 << 5),
295 /* byte-sized register class */
296 v1b = v1 | (1 << 7),
297 v2b = v2 | (1 << 7),
298 v3b = v3 | (1 << 7),
299 v4b = v4 | (1 << 7),
300 v6b = v6 | (1 << 7),
301 v8b = v8 | (1 << 7),
302 /* these are used for WWM and spills to vgpr */
303 v1_linear = v1 | (1 << 6),
304 v2_linear = v2 | (1 << 6),
305 };
306
307 RegClass() = default;
RegClassRegClass308 constexpr RegClass(RC rc_) : rc(rc_) {}
RegClassRegClass309 constexpr RegClass(RegType type, unsigned size)
310 : rc((RC)((type == RegType::vgpr ? 1 << 5 : 0) | size))
311 {}
312
RCRegClass313 constexpr operator RC() const { return rc; }
314 explicit operator bool() = delete;
315
typeRegClass316 constexpr RegType type() const { return rc <= RC::s16 ? RegType::sgpr : RegType::vgpr; }
is_linear_vgprRegClass317 constexpr bool is_linear_vgpr() const { return rc & (1 << 6); };
is_subdwordRegClass318 constexpr bool is_subdword() const { return rc & (1 << 7); }
bytesRegClass319 constexpr unsigned bytes() const { return ((unsigned)rc & 0x1F) * (is_subdword() ? 1 : 4); }
320 // TODO: use size() less in favor of bytes()
sizeRegClass321 constexpr unsigned size() const { return (bytes() + 3) >> 2; }
is_linearRegClass322 constexpr bool is_linear() const { return rc <= RC::s16 || is_linear_vgpr(); }
as_linearRegClass323 constexpr RegClass as_linear() const { return RegClass((RC)(rc | (1 << 6))); }
as_subdwordRegClass324 constexpr RegClass as_subdword() const { return RegClass((RC)(rc | 1 << 7)); }
325
getRegClass326 static constexpr RegClass get(RegType type, unsigned bytes)
327 {
328 if (type == RegType::sgpr) {
329 return RegClass(type, DIV_ROUND_UP(bytes, 4u));
330 } else {
331 return bytes % 4u ? RegClass(type, bytes).as_subdword() : RegClass(type, bytes / 4u);
332 }
333 }
334
resizeRegClass335 constexpr RegClass resize(unsigned bytes) const
336 {
337 if (is_linear_vgpr()) {
338 assert(bytes % 4u == 0);
339 return get(RegType::vgpr, bytes).as_linear();
340 }
341 return get(type(), bytes);
342 }
343
344 private:
345 RC rc;
346 };
347
/* transitional helper expressions: shorthand constants for the common
 * register classes, so code can write e.g. `v1` instead of `RegClass::v1` */
static constexpr RegClass s1{RegClass::s1};
static constexpr RegClass s2{RegClass::s2};
static constexpr RegClass s3{RegClass::s3};
static constexpr RegClass s4{RegClass::s4};
static constexpr RegClass s8{RegClass::s8};
static constexpr RegClass s16{RegClass::s16};
static constexpr RegClass v1{RegClass::v1};
static constexpr RegClass v2{RegClass::v2};
static constexpr RegClass v3{RegClass::v3};
static constexpr RegClass v4{RegClass::v4};
static constexpr RegClass v5{RegClass::v5};
static constexpr RegClass v6{RegClass::v6};
static constexpr RegClass v7{RegClass::v7};
static constexpr RegClass v8{RegClass::v8};
static constexpr RegClass v1b{RegClass::v1b};
static constexpr RegClass v2b{RegClass::v2b};
static constexpr RegClass v3b{RegClass::v3b};
static constexpr RegClass v4b{RegClass::v4b};
static constexpr RegClass v6b{RegClass::v6b};
static constexpr RegClass v8b{RegClass::v8b};
369
370 /**
371 * Temp Class
372 * Each temporary virtual register has a
373 * register class (i.e. size and type)
374 * and SSA id.
375 */
376 struct Temp {
TempTemp377 Temp() noexcept : id_(0), reg_class(0) {}
TempTemp378 constexpr Temp(uint32_t id, RegClass cls) noexcept : id_(id), reg_class(uint8_t(cls)) {}
379
idTemp380 constexpr uint32_t id() const noexcept { return id_; }
regClassTemp381 constexpr RegClass regClass() const noexcept { return (RegClass::RC)reg_class; }
382
bytesTemp383 constexpr unsigned bytes() const noexcept { return regClass().bytes(); }
sizeTemp384 constexpr unsigned size() const noexcept { return regClass().size(); }
typeTemp385 constexpr RegType type() const noexcept { return regClass().type(); }
is_linearTemp386 constexpr bool is_linear() const noexcept { return regClass().is_linear(); }
387
388 constexpr bool operator<(Temp other) const noexcept { return id() < other.id(); }
389 constexpr bool operator==(Temp other) const noexcept { return id() == other.id(); }
390 constexpr bool operator!=(Temp other) const noexcept { return id() != other.id(); }
391
392 private:
393 uint32_t id_ : 24;
394 uint32_t reg_class : 8;
395 };
396
397 /**
398 * PhysReg
399 * Represents the physical register for each
400 * Operand and Definition.
401 */
402 struct PhysReg {
403 constexpr PhysReg() = default;
PhysRegPhysReg404 explicit constexpr PhysReg(unsigned r) : reg_b(r << 2) {}
regPhysReg405 constexpr unsigned reg() const { return reg_b >> 2; }
bytePhysReg406 constexpr unsigned byte() const { return reg_b & 0x3; }
407 constexpr operator unsigned() const { return reg(); }
408 constexpr bool operator==(PhysReg other) const { return reg_b == other.reg_b; }
409 constexpr bool operator!=(PhysReg other) const { return reg_b != other.reg_b; }
410 constexpr bool operator<(PhysReg other) const { return reg_b < other.reg_b; }
advancePhysReg411 constexpr PhysReg advance(int bytes) const
412 {
413 PhysReg res = *this;
414 res.reg_b += bytes;
415 return res;
416 }
417
418 uint16_t reg_b = 0;
419 };
420
/* helper expressions for special registers, using their SGPR encoding index */
static constexpr PhysReg m0{124};
static constexpr PhysReg flat_scr_lo{102}; /* GFX8-GFX9, encoded differently on GFX6-7 */
static constexpr PhysReg flat_scr_hi{103}; /* GFX8-GFX9, encoded differently on GFX6-7 */
static constexpr PhysReg vcc{106};
static constexpr PhysReg vcc_hi{107};
static constexpr PhysReg tba_lo{108}; /* GFX6-GFX8 */
static constexpr PhysReg tba_hi{109}; /* GFX6-GFX8 */
static constexpr PhysReg tma_lo{110}; /* GFX6-GFX8 */
static constexpr PhysReg tma_hi{111}; /* GFX6-GFX8 */
static constexpr PhysReg ttmp0{112};
static constexpr PhysReg ttmp1{113};
static constexpr PhysReg ttmp2{114};
static constexpr PhysReg ttmp3{115};
static constexpr PhysReg ttmp4{116};
static constexpr PhysReg ttmp5{117};
static constexpr PhysReg ttmp6{118};
static constexpr PhysReg ttmp7{119};
static constexpr PhysReg ttmp8{120};
static constexpr PhysReg ttmp9{121};
static constexpr PhysReg ttmp10{122};
static constexpr PhysReg ttmp11{123};
static constexpr PhysReg sgpr_null{125}; /* GFX10+ */
static constexpr PhysReg exec{126};
static constexpr PhysReg exec_lo{126};
static constexpr PhysReg exec_hi{127};
static constexpr PhysReg pops_exiting_wave_id{239}; /* GFX9-GFX10.3 */
static constexpr PhysReg scc{253};
449
450 /**
451 * Operand Class
452 * Initially, each Operand refers to either
453 * a temporary virtual register
454 * or to a constant value
455 * Temporary registers get mapped to physical register during RA
456 * Constant values are inlined into the instruction sequence.
457 */
458 class Operand final {
459 public:
Operand()460 constexpr Operand()
461 : reg_(PhysReg{128}), isTemp_(false), isFixed_(true), isPrecolored_(false),
462 isConstant_(false), isKill_(false), isUndef_(true), isFirstKill_(false),
463 isLateKill_(false), isClobbered_(false), isCopyKill_(false), is16bit_(false),
464 is24bit_(false), signext(false), constSize(0)
465 {}
466
Operand(Temp r)467 explicit Operand(Temp r) noexcept
468 {
469 data_.temp = r;
470 if (r.id()) {
471 isTemp_ = true;
472 } else {
473 isUndef_ = true;
474 setFixed(PhysReg{128});
475 }
476 };
Operand(Temp r,PhysReg reg)477 explicit Operand(Temp r, PhysReg reg) noexcept
478 {
479 assert(r.id()); /* Don't allow fixing an undef to a register */
480 data_.temp = r;
481 isTemp_ = true;
482 setPrecolored(reg);
483 };
484
485 /* 8-bit constant */
c8(uint8_t v)486 static Operand c8(uint8_t v) noexcept
487 {
488 /* 8-bit constants are only used for copies and copies from any 8-bit
489 * constant can be implemented with a SDWA v_mul_u32_u24. So consider all
490 * to be inline constants. */
491 Operand op;
492 op.control_ = 0;
493 op.data_.i = v;
494 op.isConstant_ = true;
495 op.constSize = 0;
496 op.setFixed(PhysReg{0u});
497 return op;
498 };
499
500 /* 16-bit constant */
c16(uint16_t v)501 static Operand c16(uint16_t v) noexcept
502 {
503 Operand op;
504 op.control_ = 0;
505 op.data_.i = v;
506 op.isConstant_ = true;
507 op.constSize = 1;
508 if (v <= 64)
509 op.setFixed(PhysReg{128u + v});
510 else if (v >= 0xFFF0) /* [-16 .. -1] */
511 op.setFixed(PhysReg{(unsigned)(192 - (int16_t)v)});
512 else if (v == 0x3800) /* 0.5 */
513 op.setFixed(PhysReg{240});
514 else if (v == 0xB800) /* -0.5 */
515 op.setFixed(PhysReg{241});
516 else if (v == 0x3C00) /* 1.0 */
517 op.setFixed(PhysReg{242});
518 else if (v == 0xBC00) /* -1.0 */
519 op.setFixed(PhysReg{243});
520 else if (v == 0x4000) /* 2.0 */
521 op.setFixed(PhysReg{244});
522 else if (v == 0xC000) /* -2.0 */
523 op.setFixed(PhysReg{245});
524 else if (v == 0x4400) /* 4.0 */
525 op.setFixed(PhysReg{246});
526 else if (v == 0xC400) /* -4.0 */
527 op.setFixed(PhysReg{247});
528 else if (v == 0x3118) /* 1/2 PI */
529 op.setFixed(PhysReg{248});
530 else /* Literal Constant */
531 op.setFixed(PhysReg{255});
532 return op;
533 }
534
535 /* 32-bit constant */
c32(uint32_t v)536 static Operand c32(uint32_t v) noexcept { return c32_or_c64(v, false); }
537
538 /* 64-bit constant */
c64(uint64_t v)539 static Operand c64(uint64_t v) noexcept
540 {
541 Operand op;
542 op.control_ = 0;
543 op.isConstant_ = true;
544 op.constSize = 3;
545 if (v <= 64) {
546 op.data_.i = (uint32_t)v;
547 op.setFixed(PhysReg{128 + (uint32_t)v});
548 } else if (v >= 0xFFFFFFFFFFFFFFF0) { /* [-16 .. -1] */
549 op.data_.i = (uint32_t)v;
550 op.setFixed(PhysReg{192 - (uint32_t)v});
551 } else if (v == 0x3FE0000000000000) { /* 0.5 */
552 op.data_.i = 0x3f000000;
553 op.setFixed(PhysReg{240});
554 } else if (v == 0xBFE0000000000000) { /* -0.5 */
555 op.data_.i = 0xbf000000;
556 op.setFixed(PhysReg{241});
557 } else if (v == 0x3FF0000000000000) { /* 1.0 */
558 op.data_.i = 0x3f800000;
559 op.setFixed(PhysReg{242});
560 } else if (v == 0xBFF0000000000000) { /* -1.0 */
561 op.data_.i = 0xbf800000;
562 op.setFixed(PhysReg{243});
563 } else if (v == 0x4000000000000000) { /* 2.0 */
564 op.data_.i = 0x40000000;
565 op.setFixed(PhysReg{244});
566 } else if (v == 0xC000000000000000) { /* -2.0 */
567 op.data_.i = 0xc0000000;
568 op.setFixed(PhysReg{245});
569 } else if (v == 0x4010000000000000) { /* 4.0 */
570 op.data_.i = 0x40800000;
571 op.setFixed(PhysReg{246});
572 } else if (v == 0xC010000000000000) { /* -4.0 */
573 op.data_.i = 0xc0800000;
574 op.setFixed(PhysReg{247});
575 } else { /* Literal Constant: we don't know if it is a long or double.*/
576 op.signext = v >> 63;
577 op.data_.i = v & 0xffffffffu;
578 op.setFixed(PhysReg{255});
579 assert(op.constantValue64() == v &&
580 "attempt to create a unrepresentable 64-bit literal constant");
581 }
582 return op;
583 }
584
585 /* 32-bit constant stored as a 32-bit or 64-bit operand */
c32_or_c64(uint32_t v,bool is64bit)586 static Operand c32_or_c64(uint32_t v, bool is64bit) noexcept
587 {
588 Operand op;
589 op.control_ = 0;
590 op.data_.i = v;
591 op.isConstant_ = true;
592 op.constSize = is64bit ? 3 : 2;
593 if (v <= 64)
594 op.setFixed(PhysReg{128 + v});
595 else if (v >= 0xFFFFFFF0) /* [-16 .. -1] */
596 op.setFixed(PhysReg{192 - v});
597 else if (v == 0x3f000000) /* 0.5 */
598 op.setFixed(PhysReg{240});
599 else if (v == 0xbf000000) /* -0.5 */
600 op.setFixed(PhysReg{241});
601 else if (v == 0x3f800000) /* 1.0 */
602 op.setFixed(PhysReg{242});
603 else if (v == 0xbf800000) /* -1.0 */
604 op.setFixed(PhysReg{243});
605 else if (v == 0x40000000) /* 2.0 */
606 op.setFixed(PhysReg{244});
607 else if (v == 0xc0000000) /* -2.0 */
608 op.setFixed(PhysReg{245});
609 else if (v == 0x40800000) /* 4.0 */
610 op.setFixed(PhysReg{246});
611 else if (v == 0xc0800000) /* -4.0 */
612 op.setFixed(PhysReg{247});
613 else { /* Literal Constant */
614 assert(!is64bit && "attempt to create a 64-bit literal constant");
615 op.setFixed(PhysReg{255});
616 }
617 return op;
618 }
619
literal32(uint32_t v)620 static Operand literal32(uint32_t v) noexcept
621 {
622 Operand op;
623 op.control_ = 0;
624 op.data_.i = v;
625 op.isConstant_ = true;
626 op.constSize = 2;
627 op.setFixed(PhysReg{255});
628 return op;
629 }
630
Operand(RegClass type)631 explicit Operand(RegClass type) noexcept
632 {
633 isUndef_ = true;
634 data_.temp = Temp(0, type);
635 setFixed(PhysReg{128});
636 };
Operand(PhysReg reg,RegClass type)637 explicit Operand(PhysReg reg, RegClass type) noexcept
638 {
639 data_.temp = Temp(0, type);
640 setFixed(reg);
641 }
642
643 static Operand zero(unsigned bytes = 4) noexcept
644 {
645 if (bytes == 8)
646 return Operand::c64(0);
647 else if (bytes == 4)
648 return Operand::c32(0);
649 else if (bytes == 2)
650 return Operand::c16(0);
651 assert(bytes == 1);
652 return Operand::c8(0);
653 }
654
655 /* This is useful over the constructors when you want to take a gfx level
656 * for 1/2 PI or an unknown operand size.
657 */
get_const(enum amd_gfx_level chip,uint64_t val,unsigned bytes)658 static Operand get_const(enum amd_gfx_level chip, uint64_t val, unsigned bytes)
659 {
660 if (val == 0x3e22f983 && bytes == 4 && chip >= GFX8) {
661 /* 1/2 PI can be an inline constant on GFX8+ */
662 Operand op = Operand::c32(val);
663 op.setFixed(PhysReg{248});
664 return op;
665 }
666
667 if (bytes == 8)
668 return Operand::c64(val);
669 else if (bytes == 4)
670 return Operand::c32(val);
671 else if (bytes == 2)
672 return Operand::c16(val);
673 assert(bytes == 1);
674 return Operand::c8(val);
675 }
676
677 static bool is_constant_representable(uint64_t val, unsigned bytes, bool zext = false,
678 bool sext = false)
679 {
680 if (bytes <= 4)
681 return true;
682
683 if (zext && (val & 0xFFFFFFFF00000000) == 0x0000000000000000)
684 return true;
685 uint64_t upper33 = val & 0xFFFFFFFF80000000;
686 if (sext && (upper33 == 0xFFFFFFFF80000000 || upper33 == 0))
687 return true;
688
689 return val >= 0xFFFFFFFFFFFFFFF0 || val <= 64 || /* [-16 .. 64] */
690 val == 0x3FE0000000000000 || /* 0.5 */
691 val == 0xBFE0000000000000 || /* -0.5 */
692 val == 0x3FF0000000000000 || /* 1.0 */
693 val == 0xBFF0000000000000 || /* -1.0 */
694 val == 0x4000000000000000 || /* 2.0 */
695 val == 0xC000000000000000 || /* -2.0 */
696 val == 0x4010000000000000 || /* 4.0 */
697 val == 0xC010000000000000; /* -4.0 */
698 }
699
isTemp()700 constexpr bool isTemp() const noexcept { return isTemp_; }
701
setTemp(Temp t)702 constexpr void setTemp(Temp t) noexcept
703 {
704 assert(!isConstant_);
705 if (t.id() != 0)
706 isTemp_ = true;
707 data_.temp = t;
708 }
709
getTemp()710 constexpr Temp getTemp() const noexcept { return data_.temp; }
711
tempId()712 constexpr uint32_t tempId() const noexcept { return data_.temp.id(); }
713
hasRegClass()714 constexpr bool hasRegClass() const noexcept { return !isConstant(); }
715
regClass()716 constexpr RegClass regClass() const noexcept { return data_.temp.regClass(); }
717
bytes()718 constexpr unsigned bytes() const noexcept
719 {
720 if (isConstant())
721 return 1 << constSize;
722 else
723 return data_.temp.bytes();
724 }
725
size()726 constexpr unsigned size() const noexcept
727 {
728 if (isConstant())
729 return constSize > 2 ? 2 : 1;
730 else
731 return data_.temp.size();
732 }
733
isFixed()734 constexpr bool isFixed() const noexcept { return isFixed_; }
735
physReg()736 constexpr PhysReg physReg() const noexcept { return reg_; }
737
setFixed(PhysReg reg)738 constexpr void setFixed(PhysReg reg) noexcept
739 {
740 isFixed_ = reg != unsigned(-1);
741 reg_ = reg;
742 }
743
isPrecolored()744 constexpr bool isPrecolored() const noexcept { return isPrecolored_; }
setPrecolored(PhysReg reg)745 constexpr void setPrecolored(PhysReg reg) noexcept
746 {
747 setFixed(reg);
748 isPrecolored_ = isFixed_;
749 }
750
isConstant()751 constexpr bool isConstant() const noexcept { return isConstant_; }
752
isLiteral()753 constexpr bool isLiteral() const noexcept { return isConstant() && reg_ == 255; }
754
isUndefined()755 constexpr bool isUndefined() const noexcept { return isUndef_; }
756
constantValue()757 constexpr uint32_t constantValue() const noexcept { return data_.i; }
758
constantEquals(uint32_t cmp)759 constexpr bool constantEquals(uint32_t cmp) const noexcept
760 {
761 return isConstant() && constantValue() == cmp;
762 }
763
constantValue64()764 constexpr uint64_t constantValue64() const noexcept
765 {
766 if (constSize == 3) {
767 if (reg_ <= 192)
768 return reg_ - 128;
769 else if (reg_ <= 208)
770 return 0xFFFFFFFFFFFFFFFF - (reg_ - 193);
771
772 switch (reg_) {
773 case 240: return 0x3FE0000000000000;
774 case 241: return 0xBFE0000000000000;
775 case 242: return 0x3FF0000000000000;
776 case 243: return 0xBFF0000000000000;
777 case 244: return 0x4000000000000000;
778 case 245: return 0xC000000000000000;
779 case 246: return 0x4010000000000000;
780 case 247: return 0xC010000000000000;
781 case 255:
782 return (signext && (data_.i & 0x80000000u) ? 0xffffffff00000000ull : 0ull) | data_.i;
783 }
784 unreachable("invalid register for 64-bit constant");
785 } else {
786 return data_.i;
787 }
788 }
789
790 /* Value if this were used with vop3/opsel or vop3p. */
constantValue16(bool opsel)791 constexpr uint16_t constantValue16(bool opsel) const noexcept
792 {
793 assert(bytes() == 2 || bytes() == 4);
794 if (opsel) {
795 if (bytes() == 2 && int16_t(data_.i) >= -16 && int16_t(data_.i) <= 64 && !isLiteral())
796 return int16_t(data_.i) >>
797 16; /* 16-bit inline integers are sign-extended, even with fp16 instrs */
798 else
799 return data_.i >> 16;
800 }
801 return data_.i;
802 }
803
isOfType(RegType type)804 constexpr bool isOfType(RegType type) const noexcept
805 {
806 return hasRegClass() && regClass().type() == type;
807 }
808
809 /* Indicates that the killed operand's live range intersects with the
810 * instruction's definitions. Unlike isKill() and isFirstKill(), this is
811 * not set by liveness analysis. */
setLateKill(bool flag)812 constexpr void setLateKill(bool flag) noexcept { isLateKill_ = flag; }
813
isLateKill()814 constexpr bool isLateKill() const noexcept { return isLateKill_; }
815
816 /* Indicates that the Operand's register gets clobbered by the instruction. */
setClobbered(bool flag)817 constexpr void setClobbered(bool flag) noexcept { isClobbered_ = flag; }
isClobbered()818 constexpr bool isClobbered() const noexcept { return isClobbered_; }
819
820 /* Indicates that the Operand must be copied in order to satisfy register
821 * constraints. The copy is immediately killed by the instruction.
822 */
setCopyKill(bool flag)823 constexpr void setCopyKill(bool flag) noexcept
824 {
825 isCopyKill_ = flag;
826 if (flag)
827 setKill(flag);
828 }
isCopyKill()829 constexpr bool isCopyKill() const noexcept { return isCopyKill_; }
830
setKill(bool flag)831 constexpr void setKill(bool flag) noexcept
832 {
833 isKill_ = flag;
834 if (!flag) {
835 setFirstKill(false);
836 setCopyKill(false);
837 }
838 }
839
isKill()840 constexpr bool isKill() const noexcept { return isKill_ || isFirstKill(); }
841
setFirstKill(bool flag)842 constexpr void setFirstKill(bool flag) noexcept
843 {
844 isFirstKill_ = flag;
845 if (flag)
846 setKill(flag);
847 }
848
849 /* When there are multiple operands killing the same temporary,
850 * isFirstKill() is only returns true for the first one. */
isFirstKill()851 constexpr bool isFirstKill() const noexcept { return isFirstKill_; }
852
isKillBeforeDef()853 constexpr bool isKillBeforeDef() const noexcept { return isKill() && !isLateKill(); }
854
isFirstKillBeforeDef()855 constexpr bool isFirstKillBeforeDef() const noexcept { return isFirstKill() && !isLateKill(); }
856
857 constexpr bool operator==(Operand other) const noexcept
858 {
859 if (other.bytes() != bytes())
860 return false;
861 if (isFixed() != other.isFixed() || isKillBeforeDef() != other.isKillBeforeDef())
862 return false;
863 if (isFixed() && physReg() != other.physReg())
864 return false;
865 if (hasRegClass() && (!other.hasRegClass() || other.regClass() != regClass()))
866 return false;
867
868 if (isConstant())
869 return other.isConstant() && other.constantValue64() == constantValue64();
870 else if (isUndefined())
871 return other.isUndefined();
872 else if (isTemp())
873 return other.isTemp() && other.getTemp() == getTemp();
874 else
875 return true;
876 }
877
878 constexpr bool operator!=(Operand other) const noexcept { return !operator==(other); }
879
set16bit(bool flag)880 constexpr void set16bit(bool flag) noexcept { is16bit_ = flag; }
881
is16bit()882 constexpr bool is16bit() const noexcept { return is16bit_; }
883
set24bit(bool flag)884 constexpr void set24bit(bool flag) noexcept { is24bit_ = flag; }
885
is24bit()886 constexpr bool is24bit() const noexcept { return is24bit_; }
887
888 private:
889 union {
890 Temp temp;
891 uint32_t i;
892 float f;
893 } data_ = {Temp(0, s1)};
894 PhysReg reg_;
895 union {
896 struct {
897 uint8_t isTemp_ : 1;
898 uint8_t isFixed_ : 1;
899 uint8_t isPrecolored_ : 1;
900 uint8_t isConstant_ : 1;
901 uint8_t isKill_ : 1;
902 uint8_t isUndef_ : 1;
903 uint8_t isFirstKill_ : 1;
904 uint8_t isLateKill_ : 1;
905 uint8_t isClobbered_ : 1;
906 uint8_t isCopyKill_ : 1;
907 uint8_t is16bit_ : 1;
908 uint8_t is24bit_ : 1;
909 uint8_t signext : 1;
910 uint8_t constSize : 2;
911 };
912 /* can't initialize bit-fields in c++11, so work around using a union */
913 uint16_t control_ = 0;
914 };
915 };
916
917 /**
918 * Definition Class
919 * Definitions are the results of Instructions
920 * and refer to temporary virtual registers
921 * which are later mapped to physical registers
922 */
923 class Definition final {
924 public:
Definition()925 constexpr Definition()
926 : temp(Temp(0, s1)), reg_(0), isFixed_(0), isPrecolored_(0), isKill_(0), isPrecise_(0),
927 isInfPreserve_(0), isNaNPreserve_(0), isSZPreserve_(0), isNUW_(0), isNoCSE_(0)
928 {}
Definition(Temp tmp)929 explicit Definition(Temp tmp) noexcept : temp(tmp) {}
Definition(PhysReg reg,RegClass type)930 explicit Definition(PhysReg reg, RegClass type) noexcept : temp(Temp(0, type)) { setFixed(reg); }
Definition(Temp tmp,PhysReg reg)931 explicit Definition(Temp tmp, PhysReg reg) noexcept : temp(tmp) { setPrecolored(reg); }
932
isTemp()933 constexpr bool isTemp() const noexcept { return tempId() > 0; }
934
getTemp()935 constexpr Temp getTemp() const noexcept { return temp; }
936
tempId()937 constexpr uint32_t tempId() const noexcept { return temp.id(); }
938
setTemp(Temp t)939 constexpr void setTemp(Temp t) noexcept { temp = t; }
940
swapTemp(Definition & other)941 void swapTemp(Definition& other) noexcept { std::swap(temp, other.temp); }
942
regClass()943 constexpr RegClass regClass() const noexcept { return temp.regClass(); }
944
bytes()945 constexpr unsigned bytes() const noexcept { return temp.bytes(); }
946
size()947 constexpr unsigned size() const noexcept { return temp.size(); }
948
isFixed()949 constexpr bool isFixed() const noexcept { return isFixed_; }
950
physReg()951 constexpr PhysReg physReg() const noexcept { return reg_; }
952
setFixed(PhysReg reg)953 constexpr void setFixed(PhysReg reg) noexcept
954 {
955 isFixed_ = 1;
956 reg_ = reg;
957 }
958
isPrecolored()959 constexpr bool isPrecolored() const noexcept { return isPrecolored_; }
setPrecolored(PhysReg reg)960 constexpr void setPrecolored(PhysReg reg) noexcept
961 {
962 setFixed(reg);
963 isPrecolored_ = isFixed_;
964 }
965
setKill(bool flag)966 constexpr void setKill(bool flag) noexcept { isKill_ = flag; }
967
isKill()968 constexpr bool isKill() const noexcept { return isKill_; }
969
setPrecise(bool precise)970 constexpr void setPrecise(bool precise) noexcept { isPrecise_ = precise; }
971
isPrecise()972 constexpr bool isPrecise() const noexcept { return isPrecise_; }
973
setInfPreserve(bool inf_preserve)974 constexpr void setInfPreserve(bool inf_preserve) noexcept { isInfPreserve_ = inf_preserve; }
975
isInfPreserve()976 constexpr bool isInfPreserve() const noexcept { return isInfPreserve_; }
977
setNaNPreserve(bool nan_preserve)978 constexpr void setNaNPreserve(bool nan_preserve) noexcept { isNaNPreserve_ = nan_preserve; }
979
isNaNPreserve()980 constexpr bool isNaNPreserve() const noexcept { return isNaNPreserve_; }
981
setSZPreserve(bool sz_preserve)982 constexpr void setSZPreserve(bool sz_preserve) noexcept { isSZPreserve_ = sz_preserve; }
983
isSZPreserve()984 constexpr bool isSZPreserve() const noexcept { return isSZPreserve_; }
985
986 /* No Unsigned Wrap */
setNUW(bool nuw)987 constexpr void setNUW(bool nuw) noexcept { isNUW_ = nuw; }
988
isNUW()989 constexpr bool isNUW() const noexcept { return isNUW_; }
990
setNoCSE(bool noCSE)991 constexpr void setNoCSE(bool noCSE) noexcept { isNoCSE_ = noCSE; }
992
isNoCSE()993 constexpr bool isNoCSE() const noexcept { return isNoCSE_; }
994
995 private:
996 Temp temp = Temp(0, s1);
997 PhysReg reg_;
998 union {
999 struct {
1000 uint8_t isFixed_ : 1;
1001 uint8_t isPrecolored_ : 1;
1002 uint8_t isKill_ : 1;
1003 uint8_t isPrecise_ : 1;
1004 uint8_t isInfPreserve_ : 1;
1005 uint8_t isNaNPreserve_ : 1;
1006 uint8_t isSZPreserve_ : 1;
1007 uint8_t isNUW_ : 1;
1008 uint8_t isNoCSE_ : 1;
1009 };
1010 /* can't initialize bit-fields in c++11, so work around using a union */
1011 uint16_t control_ = 0;
1012 };
1013 };
1014
1015 struct RegisterDemand {
1016 constexpr RegisterDemand() = default;
RegisterDemandRegisterDemand1017 constexpr RegisterDemand(const int16_t v, const int16_t s) noexcept : vgpr{v}, sgpr{s} {}
1018 int16_t vgpr = 0;
1019 int16_t sgpr = 0;
1020
1021 constexpr friend bool operator==(const RegisterDemand a, const RegisterDemand b) noexcept
1022 {
1023 return a.vgpr == b.vgpr && a.sgpr == b.sgpr;
1024 }
1025
exceedsRegisterDemand1026 constexpr bool exceeds(const RegisterDemand other) const noexcept
1027 {
1028 return vgpr > other.vgpr || sgpr > other.sgpr;
1029 }
1030
1031 constexpr RegisterDemand operator+(const Temp t) const noexcept
1032 {
1033 if (t.type() == RegType::sgpr)
1034 return RegisterDemand(vgpr, sgpr + t.size());
1035 else
1036 return RegisterDemand(vgpr + t.size(), sgpr);
1037 }
1038
1039 constexpr RegisterDemand operator+(const RegisterDemand other) const noexcept
1040 {
1041 return RegisterDemand(vgpr + other.vgpr, sgpr + other.sgpr);
1042 }
1043
1044 constexpr RegisterDemand operator-(const RegisterDemand other) const noexcept
1045 {
1046 return RegisterDemand(vgpr - other.vgpr, sgpr - other.sgpr);
1047 }
1048
1049 constexpr RegisterDemand& operator+=(const RegisterDemand other) noexcept
1050 {
1051 vgpr += other.vgpr;
1052 sgpr += other.sgpr;
1053 return *this;
1054 }
1055
1056 constexpr RegisterDemand& operator-=(const RegisterDemand other) noexcept
1057 {
1058 vgpr -= other.vgpr;
1059 sgpr -= other.sgpr;
1060 return *this;
1061 }
1062
1063 constexpr RegisterDemand& operator+=(const Temp t) noexcept
1064 {
1065 if (t.type() == RegType::sgpr)
1066 sgpr += t.size();
1067 else
1068 vgpr += t.size();
1069 return *this;
1070 }
1071
1072 constexpr RegisterDemand& operator-=(const Temp t) noexcept
1073 {
1074 if (t.type() == RegType::sgpr)
1075 sgpr -= t.size();
1076 else
1077 vgpr -= t.size();
1078 return *this;
1079 }
1080
updateRegisterDemand1081 constexpr void update(const RegisterDemand other) noexcept
1082 {
1083 vgpr = std::max(vgpr, other.vgpr);
1084 sgpr = std::max(sgpr, other.sgpr);
1085 }
1086 };
1087
/* Forward declarations; Instruction and its format-specific subclasses are
 * defined further below. */
struct Block;
struct Instruction;
struct Pseudo_instruction;
struct SALU_instruction;
struct SMEM_instruction;
struct DS_instruction;
struct LDSDIR_instruction;
struct MTBUF_instruction;
struct MUBUF_instruction;
struct MIMG_instruction;
struct Export_instruction;
struct FLAT_instruction;
struct Pseudo_branch_instruction;
struct Pseudo_barrier_instruction;
struct Pseudo_reduction_instruction;
struct VALU_instruction;
struct VINTERP_inreg_instruction;
struct VINTRP_instruction;
struct VOPD_instruction;
struct DPP16_instruction;
struct DPP8_instruction;
struct SDWA_instruction;
1110
/* Base of all instructions. Format-specific payload lives in the subclasses
 * below; the format member selects which downcast accessor is valid. */
struct Instruction {
   aco_opcode opcode;
   Format format;
   /* Scratch storage reused by different passes; only one union member is
    * meaningful at a time. */
   union {
      uint32_t pass_flags;
      RegisterDemand register_demand;
   };

   aco::span<Operand> operands;
   aco::span<Definition> definitions;

   constexpr bool usesModifiers() const noexcept;

   /* Whether any operand is fixed to exec_lo/exec_hi. */
   constexpr bool reads_exec() const noexcept
   {
      for (const Operand& op : operands) {
         if (op.isFixed() && (op.physReg() == exec_lo || op.physReg() == exec_hi))
            return true;
      }
      return false;
   }

   /* Whether any definition is fixed to exec_lo/exec_hi. */
   constexpr bool writes_exec() const noexcept
   {
      for (const Definition& def : definitions) {
         if (def.isFixed() && (def.physReg() == exec_lo || def.physReg() == exec_hi))
            return true;
      }
      return false;
   }

   /* Checked downcasts: each accessor asserts the matching isX() predicate
    * before reinterpreting this instruction as the subclass. */
   Pseudo_instruction& pseudo() noexcept
   {
      assert(isPseudo());
      return *(Pseudo_instruction*)this;
   }
   const Pseudo_instruction& pseudo() const noexcept
   {
      assert(isPseudo());
      return *(Pseudo_instruction*)this;
   }
   constexpr bool isPseudo() const noexcept { return format == Format::PSEUDO; }

   constexpr bool isSOP1() const noexcept { return format == Format::SOP1; }
   constexpr bool isSOP2() const noexcept { return format == Format::SOP2; }
   constexpr bool isSOPK() const noexcept { return format == Format::SOPK; }
   constexpr bool isSOPP() const noexcept { return format == Format::SOPP; }
   constexpr bool isSOPC() const noexcept { return format == Format::SOPC; }

   SMEM_instruction& smem() noexcept
   {
      assert(isSMEM());
      return *(SMEM_instruction*)this;
   }
   const SMEM_instruction& smem() const noexcept
   {
      assert(isSMEM());
      return *(SMEM_instruction*)this;
   }
   constexpr bool isSMEM() const noexcept { return format == Format::SMEM; }
   DS_instruction& ds() noexcept
   {
      assert(isDS());
      return *(DS_instruction*)this;
   }
   const DS_instruction& ds() const noexcept
   {
      assert(isDS());
      return *(DS_instruction*)this;
   }
   constexpr bool isDS() const noexcept { return format == Format::DS; }
   LDSDIR_instruction& ldsdir() noexcept
   {
      assert(isLDSDIR());
      return *(LDSDIR_instruction*)this;
   }
   const LDSDIR_instruction& ldsdir() const noexcept
   {
      assert(isLDSDIR());
      return *(LDSDIR_instruction*)this;
   }
   constexpr bool isLDSDIR() const noexcept { return format == Format::LDSDIR; }
   MTBUF_instruction& mtbuf() noexcept
   {
      assert(isMTBUF());
      return *(MTBUF_instruction*)this;
   }
   const MTBUF_instruction& mtbuf() const noexcept
   {
      assert(isMTBUF());
      return *(MTBUF_instruction*)this;
   }
   constexpr bool isMTBUF() const noexcept { return format == Format::MTBUF; }
   MUBUF_instruction& mubuf() noexcept
   {
      assert(isMUBUF());
      return *(MUBUF_instruction*)this;
   }
   const MUBUF_instruction& mubuf() const noexcept
   {
      assert(isMUBUF());
      return *(MUBUF_instruction*)this;
   }
   constexpr bool isMUBUF() const noexcept { return format == Format::MUBUF; }
   MIMG_instruction& mimg() noexcept
   {
      assert(isMIMG());
      return *(MIMG_instruction*)this;
   }
   const MIMG_instruction& mimg() const noexcept
   {
      assert(isMIMG());
      return *(MIMG_instruction*)this;
   }
   constexpr bool isMIMG() const noexcept { return format == Format::MIMG; }
   Export_instruction& exp() noexcept
   {
      assert(isEXP());
      return *(Export_instruction*)this;
   }
   const Export_instruction& exp() const noexcept
   {
      assert(isEXP());
      return *(Export_instruction*)this;
   }
   constexpr bool isEXP() const noexcept { return format == Format::EXP; }
   /* FLAT, GLOBAL and SCRATCH all share the FLAT_instruction payload. */
   FLAT_instruction& flat() noexcept
   {
      assert(isFlat());
      return *(FLAT_instruction*)this;
   }
   const FLAT_instruction& flat() const noexcept
   {
      assert(isFlat());
      return *(FLAT_instruction*)this;
   }
   constexpr bool isFlat() const noexcept { return format == Format::FLAT; }
   FLAT_instruction& global() noexcept
   {
      assert(isGlobal());
      return *(FLAT_instruction*)this;
   }
   const FLAT_instruction& global() const noexcept
   {
      assert(isGlobal());
      return *(FLAT_instruction*)this;
   }
   constexpr bool isGlobal() const noexcept { return format == Format::GLOBAL; }
   FLAT_instruction& scratch() noexcept
   {
      assert(isScratch());
      return *(FLAT_instruction*)this;
   }
   const FLAT_instruction& scratch() const noexcept
   {
      assert(isScratch());
      return *(FLAT_instruction*)this;
   }
   constexpr bool isScratch() const noexcept { return format == Format::SCRATCH; }
   Pseudo_branch_instruction& branch() noexcept
   {
      assert(isBranch());
      return *(Pseudo_branch_instruction*)this;
   }
   const Pseudo_branch_instruction& branch() const noexcept
   {
      assert(isBranch());
      return *(Pseudo_branch_instruction*)this;
   }
   constexpr bool isBranch() const noexcept { return format == Format::PSEUDO_BRANCH; }
   Pseudo_barrier_instruction& barrier() noexcept
   {
      assert(isBarrier());
      return *(Pseudo_barrier_instruction*)this;
   }
   const Pseudo_barrier_instruction& barrier() const noexcept
   {
      assert(isBarrier());
      return *(Pseudo_barrier_instruction*)this;
   }
   constexpr bool isBarrier() const noexcept { return format == Format::PSEUDO_BARRIER; }
   Pseudo_reduction_instruction& reduction() noexcept
   {
      assert(isReduction());
      return *(Pseudo_reduction_instruction*)this;
   }
   const Pseudo_reduction_instruction& reduction() const noexcept
   {
      assert(isReduction());
      return *(Pseudo_reduction_instruction*)this;
   }
   constexpr bool isReduction() const noexcept { return format == Format::PSEUDO_REDUCTION; }
   /* The VOP1/2/C/3/3P, DPP and SDWA predicates test a format *bit* because
    * these encodings can be combined with a base format (e.g. VOP2 + DPP16). */
   constexpr bool isVOP3P() const noexcept { return (uint16_t)format & (uint16_t)Format::VOP3P; }
   VINTERP_inreg_instruction& vinterp_inreg() noexcept
   {
      assert(isVINTERP_INREG());
      return *(VINTERP_inreg_instruction*)this;
   }
   const VINTERP_inreg_instruction& vinterp_inreg() const noexcept
   {
      assert(isVINTERP_INREG());
      return *(VINTERP_inreg_instruction*)this;
   }
   constexpr bool isVINTERP_INREG() const noexcept { return format == Format::VINTERP_INREG; }
   VOPD_instruction& vopd() noexcept
   {
      assert(isVOPD());
      return *(VOPD_instruction*)this;
   }
   const VOPD_instruction& vopd() const noexcept
   {
      assert(isVOPD());
      return *(VOPD_instruction*)this;
   }
   constexpr bool isVOPD() const noexcept { return format == Format::VOPD; }
   constexpr bool isVOP1() const noexcept { return (uint16_t)format & (uint16_t)Format::VOP1; }
   constexpr bool isVOP2() const noexcept { return (uint16_t)format & (uint16_t)Format::VOP2; }
   constexpr bool isVOPC() const noexcept { return (uint16_t)format & (uint16_t)Format::VOPC; }
   constexpr bool isVOP3() const noexcept { return (uint16_t)format & (uint16_t)Format::VOP3; }
   VINTRP_instruction& vintrp() noexcept
   {
      assert(isVINTRP());
      return *(VINTRP_instruction*)this;
   }
   const VINTRP_instruction& vintrp() const noexcept
   {
      assert(isVINTRP());
      return *(VINTRP_instruction*)this;
   }
   constexpr bool isVINTRP() const noexcept { return format == Format::VINTRP; }
   DPP16_instruction& dpp16() noexcept
   {
      assert(isDPP16());
      return *(DPP16_instruction*)this;
   }
   const DPP16_instruction& dpp16() const noexcept
   {
      assert(isDPP16());
      return *(DPP16_instruction*)this;
   }
   constexpr bool isDPP16() const noexcept { return (uint16_t)format & (uint16_t)Format::DPP16; }
   DPP8_instruction& dpp8() noexcept
   {
      assert(isDPP8());
      return *(DPP8_instruction*)this;
   }
   const DPP8_instruction& dpp8() const noexcept
   {
      assert(isDPP8());
      return *(DPP8_instruction*)this;
   }
   constexpr bool isDPP8() const noexcept { return (uint16_t)format & (uint16_t)Format::DPP8; }
   constexpr bool isDPP() const noexcept { return isDPP16() || isDPP8(); }
   SDWA_instruction& sdwa() noexcept
   {
      assert(isSDWA());
      return *(SDWA_instruction*)this;
   }
   const SDWA_instruction& sdwa() const noexcept
   {
      assert(isSDWA());
      return *(SDWA_instruction*)this;
   }
   constexpr bool isSDWA() const noexcept { return (uint16_t)format & (uint16_t)Format::SDWA; }

   /* Unchecked cast to the FLAT payload; NOTE(review): callers are expected to
    * ensure isFlatLike() holds — no assert here. */
   FLAT_instruction& flatlike() { return *(FLAT_instruction*)this; }

   const FLAT_instruction& flatlike() const { return *(FLAT_instruction*)this; }

   constexpr bool isFlatLike() const noexcept { return isFlat() || isGlobal() || isScratch(); }

   VALU_instruction& valu() noexcept
   {
      assert(isVALU());
      return *(VALU_instruction*)this;
   }
   const VALU_instruction& valu() const noexcept
   {
      assert(isVALU());
      return *(VALU_instruction*)this;
   }
   /* Any vector-ALU encoding. */
   constexpr bool isVALU() const noexcept
   {
      return isVOP1() || isVOP2() || isVOPC() || isVOP3() || isVOP3P() || isVINTERP_INREG() ||
             isVOPD();
   }

   SALU_instruction& salu() noexcept
   {
      assert(isSALU());
      return *(SALU_instruction*)this;
   }
   const SALU_instruction& salu() const noexcept
   {
      assert(isSALU());
      return *(SALU_instruction*)this;
   }
   /* Any scalar-ALU encoding. */
   constexpr bool isSALU() const noexcept
   {
      return isSOP1() || isSOP2() || isSOPC() || isSOPK() || isSOPP();
   }

   /* Buffer/image vector-memory encodings (FLAT-like formats are separate). */
   constexpr bool isVMEM() const noexcept { return isMTBUF() || isMUBUF() || isMIMG(); }

   /* Defined out of line. */
   bool accessesLDS() const noexcept;
   bool isTrans() const noexcept;
};
static_assert(sizeof(Instruction) == 16, "Unexpected padding");
1419
struct SALU_instruction : public Instruction {
   /* For SOPP branch instructions this holds the target Block index;
    * otherwise, for SOPP and SOPK, the 16-bit signed immediate.
    */
   uint32_t imm;
};
static_assert(sizeof(SALU_instruction) == sizeof(Instruction) + 4, "Unexpected padding");
1427
/**
 * Scalar Memory Format:
 * For s_(buffer_)load_dword*:
 * Operand(0): SBASE - SGPR-pair which provides base address
 * Operand(1): Offset - immediate (un)signed offset or SGPR
 * Operand(2) / Definition(0): SDATA - SGPR for read / write result
 * Operand(n-1): SOffset - SGPR offset (Vega only)
 *
 * Having no operands is also valid for instructions such as s_dcache_inv.
 *
 */
struct SMEM_instruction : public Instruction {
   memory_sync_info sync;   /* memory synchronization semantics */
   ac_hw_cache_flags cache; /* hardware cache policy */
};
static_assert(sizeof(SMEM_instruction) == sizeof(Instruction) + 4, "Unexpected padding");
1444
struct VALU_instruction : public Instruction {
   /* All modifier fields below alias the same 32-bit word; the trailing
    * comments list the encodings for which each field is meaningful. */
   union {
      bitfield_array8<uint32_t, 0, 3> neg;    /* VOP3, SDWA, DPP16, v_fma_mix, VINTERP_inreg */
      bitfield_array8<uint32_t, 0, 3> neg_lo; /* VOP3P */

      bitfield_array8<uint32_t, 3, 3> abs;    /* VOP3, SDWA, DPP16, v_fma_mix */
      bitfield_array8<uint32_t, 3, 3> neg_hi; /* VOP3P */

      bitfield_array8<uint32_t, 6, 4> opsel;     /* VOP3, VOPC12(GFX11+), VINTERP_inreg */
      bitfield_uint8<uint32_t, 10, 2> omod;      /* VOP3, SDWA(GFX9+) */
      bitfield_array8<uint32_t, 12, 3> opsel_lo; /* VOP3P */
      bitfield_array8<uint32_t, 15, 3> opsel_hi; /* VOP3P */
      bitfield_bool<uint32_t, 18> clamp;         /* VOP3, VOP3P, SDWA, VINTERP_inreg */
   };

   /* Swaps operands idx0 and idx1 together with their per-operand modifier
    * bits; defined out of line below. */
   void swapOperands(unsigned idx0, unsigned idx1);
};
static_assert(sizeof(VALU_instruction) == sizeof(Instruction) + 4, "Unexpected padding");
1463
struct VINTERP_inreg_instruction : public VALU_instruction {
   uint8_t wait_exp : 3; /* wait_exp field of the hardware encoding */
   uint8_t padding3 : 5;
   uint8_t padding4;
   uint8_t padding5;
   uint8_t padding6;
};
static_assert(sizeof(VINTERP_inreg_instruction) == sizeof(VALU_instruction) + 4,
              "Unexpected padding");
1473
struct VOPD_instruction : public VALU_instruction {
   aco_opcode opy; /* opcode of the second (Y) half of the dual-issue pair */
   uint16_t padding;
};
static_assert(sizeof(VOPD_instruction) == sizeof(VALU_instruction) + 4, "Unexpected padding");
1479
/**
 * Data Parallel Primitives Format:
 * This format can be used for VOP1, VOP2 or VOPC instructions.
 * The swizzle applies to the src0 operand.
 *
 */
struct DPP16_instruction : public VALU_instruction {
   uint16_t dpp_ctrl; /* lane-swizzle control pattern applied to src0 */
   uint8_t row_mask : 4;
   uint8_t bank_mask : 4;
   bool bound_ctrl : 1;
   uint8_t fetch_inactive : 1;
   uint8_t padding3 : 6;
};
static_assert(sizeof(DPP16_instruction) == sizeof(VALU_instruction) + 4, "Unexpected padding");
1495
struct DPP8_instruction : public VALU_instruction {
   uint32_t lane_sel : 24; /* eight 3-bit source-lane selectors */
   uint32_t fetch_inactive : 1;
   uint32_t padding : 7;
};
static_assert(sizeof(DPP8_instruction) == sizeof(VALU_instruction) + 4, "Unexpected padding");
1502
/**
 * Describes a sub-dword selection: which bytes of a dword are accessed and
 * whether the value is sign-extended. The encoding packs the byte offset in
 * bits [1:0], the size in bytes in bits [4:2] and the sign-extension flag in
 * bit 5, so a zero value means "no selection".
 */
struct SubdwordSel {
   enum sdwa_sel : uint8_t {
      ubyte = 0x4,
      uword = 0x8,
      dword = 0x10,
      sext = 0x20,
      sbyte = ubyte | sext,
      sword = uword | sext,

      ubyte0 = ubyte,
      ubyte1 = ubyte | 1,
      ubyte2 = ubyte | 2,
      ubyte3 = ubyte | 3,
      sbyte0 = sbyte,
      sbyte1 = sbyte | 1,
      sbyte2 = sbyte | 2,
      sbyte3 = sbyte | 3,
      uword0 = uword,
      uword1 = uword | 2,
      sword0 = sword,
      sword1 = sword | 2,
   };

   SubdwordSel() : sel((sdwa_sel)0) {}
   constexpr SubdwordSel(sdwa_sel sel_) : sel(sel_) {}
   constexpr SubdwordSel(unsigned size, unsigned offset, bool sign_extend)
       : sel((sdwa_sel)((sign_extend ? sext : 0) | size << 2 | offset))
   {}
   constexpr operator sdwa_sel() const { return sel; }
   /* False for a default-constructed (empty) selection. */
   explicit operator bool() const { return sel != 0; }

   /* Size of the selection in bytes: 1, 2 or 4. */
   constexpr unsigned size() const { return ((unsigned)sel >> 2) & 0x7u; }
   /* Byte offset (0-3) of the selection within the dword. */
   constexpr unsigned offset() const { return (unsigned)sel & 0x3u; }
   constexpr bool sign_extend() const { return (sel & sext) != 0; }
   /* Translate to the hardware SDWA sel field, given the register's own byte
    * offset. */
   constexpr unsigned to_sdwa_sel(unsigned reg_byte_offset) const
   {
      const unsigned byte = reg_byte_offset + offset();
      switch (size()) {
      case 1: return byte;            /* BYTE_0..BYTE_3 */
      case 2: return 4 + (byte >> 1); /* WORD_0 / WORD_1 */
      default: return 6;              /* DWORD */
      }
   }

private:
   sdwa_sel sel;
};
1551
/**
 * Sub-Dword Addressing Format:
 * This format can be used for VOP1, VOP2 or VOPC instructions.
 *
 * omod and SGPR/constant operands are only available on GFX9+. For VOPC,
 * the definition doesn't have to be VCC on GFX9+.
 *
 */
struct SDWA_instruction : public VALU_instruction {
   /* these destination modifiers aren't available with VOPC except for
    * clamp on GFX8 */
   SubdwordSel sel[2]; /* per-source sub-dword selection */
   SubdwordSel dst_sel; /* destination sub-dword selection */
   uint8_t padding3;
};
static_assert(sizeof(SDWA_instruction) == sizeof(VALU_instruction) + 4, "Unexpected padding");
1568
/* Parameter interpolation (VINTRP) payload: selects which attribute and
 * component to interpolate. */
struct VINTRP_instruction : public Instruction {
   uint8_t attribute;
   uint8_t component;
   bool high_16bits; /* NOTE(review): presumably selects the high 16-bit half — confirm */
   uint8_t padding;
};
static_assert(sizeof(VINTRP_instruction) == sizeof(Instruction) + 4, "Unexpected padding");
1576
/**
 * Local and Global Data Sharing instructions
 * Operand(0): ADDR - VGPR which supplies the address.
 * Operand(1): DATA0 - First data VGPR.
 * Operand(2): DATA1 - Second data VGPR.
 * Operand(n-1): M0 - LDS size.
 * Definition(0): VDST - Destination VGPR when results returned to VGPRs.
 *
 */
struct DS_instruction : public Instruction {
   memory_sync_info sync; /* memory synchronization semantics */
   bool gds;              /* operate on GDS instead of LDS */
   uint16_t offset0;      /* immediate offsets; interpretation depends on the opcode */
   uint8_t offset1;
   uint8_t padding;
};
static_assert(sizeof(DS_instruction) == sizeof(Instruction) + 8, "Unexpected padding");
1594
/**
 * LDS Direct instructions
 * Operand(0): M0
 * Definition(0): VDST - Destination VGPR
 */
struct LDSDIR_instruction : public Instruction {
   memory_sync_info sync; /* memory synchronization semantics */
   uint8_t attr : 6;      /* attribute index */
   uint8_t attr_chan : 2; /* attribute channel */
   /* hardware wait fields encoded in the instruction */
   uint32_t wait_vdst : 4;
   uint32_t wait_vsrc : 1;
   uint32_t padding : 27;
};
static_assert(sizeof(LDSDIR_instruction) == sizeof(Instruction) + 8, "Unexpected padding");
1609
/**
 * Vector Memory Untyped-buffer Instructions
 * Operand(0): SRSRC - Specifies which SGPR supplies T# (resource constant)
 * Operand(1): VADDR - Address source. Can carry an index and/or offset
 * Operand(2): SOFFSET - SGPR to supply unsigned byte offset. (SGPR, M0, or inline constant)
 * Operand(3) / Definition(0): VDATA - Vector GPR for write result / read data
 *
 */
struct MUBUF_instruction : public Instruction {
   memory_sync_info sync;   /* memory synchronization semantics */
   ac_hw_cache_flags cache; /* hardware cache policy */
   bool offen : 1;          /* Supply an offset from VGPR (VADDR) */
   bool idxen : 1;          /* Supply an index from VGPR (VADDR) */
   bool addr64 : 1;         /* SI, CIK: Address size is 64-bit */
   bool tfe : 1;            /* texture fail enable */
   bool lds : 1;            /* Return read-data to LDS instead of VGPRs */
   bool disable_wqm : 1;    /* Require an exec mask without helper invocations */
   uint8_t padding0 : 2;
   uint8_t padding1;
   uint16_t offset; /* Unsigned byte offset - 12 bit */
};
static_assert(sizeof(MUBUF_instruction) == sizeof(Instruction) + 8, "Unexpected padding");
1632
/**
 * Vector Memory Typed-buffer Instructions
 * Operand(0): SRSRC - Specifies which SGPR supplies T# (resource constant)
 * Operand(1): VADDR - Address source. Can carry an index and/or offset
 * Operand(2): SOFFSET - SGPR to supply unsigned byte offset. (SGPR, M0, or inline constant)
 * Operand(3) / Definition(0): VDATA - Vector GPR for write result / read data
 *
 */
struct MTBUF_instruction : public Instruction {
   memory_sync_info sync;   /* memory synchronization semantics */
   ac_hw_cache_flags cache; /* hardware cache policy */
   uint8_t dfmt : 4;        /* Data Format of data in memory buffer */
   uint8_t nfmt : 3;        /* Numeric format of data in memory */
   bool offen : 1;          /* Supply an offset from VGPR (VADDR) */
   bool idxen : 1;          /* Supply an index from VGPR (VADDR) */
   bool tfe : 1;            /* texture fail enable */
   bool disable_wqm : 1;    /* Require an exec mask without helper invocations */
   uint8_t padding : 5;
   uint16_t offset; /* Unsigned byte offset - 12 bit */
};
static_assert(sizeof(MTBUF_instruction) == sizeof(Instruction) + 8, "Unexpected padding");
1654
/**
 * Vector Memory Image Instructions
 * Operand(0) SRSRC - Scalar GPR that specifies the resource constant.
 * Operand(1): SSAMP - Scalar GPR that specifies sampler constant.
 * Operand(2): VDATA - Vector GPR for write data or zero if TFE/LWE=1.
 * Operand(3): VADDR - Address source. Can carry an offset or an index.
 * Definition(0): VDATA - Vector GPR for read result.
 *
 */
struct MIMG_instruction : public Instruction {
   memory_sync_info sync;   /* memory synchronization semantics */
   ac_hw_cache_flags cache; /* hardware cache policy */
   uint8_t dmask;           /* Data VGPR enable mask */
   uint8_t dim : 3;         /* NAVI: dimensionality */
   bool unrm : 1;           /* Force address to be un-normalized */
   bool tfe : 1;            /* texture fail enable */
   bool da : 1;             /* declare an array */
   bool lwe : 1;            /* LOD warning enable */
   bool r128 : 1;           /* NAVI: Texture resource size */
   bool a16 : 1;            /* VEGA, NAVI: Address components are 16-bits */
   bool d16 : 1;            /* Convert 32-bit data to 16-bit data */
   bool disable_wqm : 1;    /* Require an exec mask without helper invocations */
   bool strict_wqm : 1;     /* VADDR is a linear VGPR and additional VGPRs may be copied into it */
   uint8_t padding0 : 4;
   uint8_t padding1;
};
static_assert(sizeof(MIMG_instruction) == sizeof(Instruction) + 8, "Unexpected padding");
1682
/**
 * Flat/Scratch/Global Instructions
 * Operand(0): ADDR
 * Operand(1): SADDR
 * Operand(2) / Definition(0): DATA/VDST
 *
 */
struct FLAT_instruction : public Instruction {
   memory_sync_info sync;   /* memory synchronization semantics */
   ac_hw_cache_flags cache; /* hardware cache policy */
   bool lds : 1;            /* Return read-data to LDS instead of VGPRs */
   bool nv : 1;
   bool disable_wqm : 1; /* Require an exec mask without helper invocations */
   uint8_t padding0 : 5;
   uint8_t padding1;
   int16_t offset; /* Vega/Navi only */
};
static_assert(sizeof(FLAT_instruction) == sizeof(Instruction) + 8, "Unexpected padding");
1701
struct Export_instruction : public Instruction {
   uint8_t enabled_mask; /* mask of enabled components */
   uint8_t dest;         /* export target */
   bool compressed : 1;
   bool done : 1;
   bool valid_mask : 1;
   bool row_en : 1;
   uint8_t padding0 : 4;
   uint8_t padding1;
};
static_assert(sizeof(Export_instruction) == sizeof(Instruction) + 4, "Unexpected padding");
1713
/* Payload for Format::PSEUDO instructions. */
struct Pseudo_instruction : public Instruction {
   PhysReg scratch_sgpr;   /* might not be valid if it's not needed */
   bool needs_scratch_reg; /* if scratch_sgpr/scc can be written, initialized by RA. */
};
static_assert(sizeof(Pseudo_instruction) == sizeof(Instruction) + 4, "Unexpected padding");
1719
struct Pseudo_branch_instruction : public Instruction {
   /* target[0] is the block index of the branch target.
    * For conditional branches, target[1] contains the fall-through alternative.
    * A value of 0 means the target has not been initialized (BB0 cannot be a branch target).
    */
   uint32_t target[2];

   /* Indicates that this rarely or never jumps to target[0]. */
   bool rarely_taken;
   bool never_taken;

   uint16_t padding;
};
static_assert(sizeof(Pseudo_branch_instruction) == sizeof(Instruction) + 12, "Unexpected padding");
1734
struct Pseudo_barrier_instruction : public Instruction {
   memory_sync_info sync; /* which storage classes/semantics the barrier orders */
   sync_scope exec_scope; /* scope at which execution is synchronized */
};
static_assert(sizeof(Pseudo_barrier_instruction) == sizeof(Instruction) + 4, "Unexpected padding");
1740
/* Operations supported by Pseudo_reduction_instruction; the numeric suffix
 * is the bit-width the operation works on. */
enum ReduceOp : uint16_t {
   // clang-format off
   iadd8, iadd16, iadd32, iadd64,
   imul8, imul16, imul32, imul64,
   fadd16, fadd32, fadd64,
   fmul16, fmul32, fmul64,
   imin8, imin16, imin32, imin64,
   imax8, imax16, imax32, imax64,
   umin8, umin16, umin32, umin64,
   umax8, umax16, umax32, umax64,
   fmin16, fmin32, fmin64,
   fmax16, fmax32, fmax64,
   iand8, iand16, iand32, iand64,
   ior8, ior16, ior32, ior64,
   ixor8, ixor16, ixor32, ixor64,
   num_reduce_ops,
   // clang-format on
};
1759
/**
 * Subgroup Reduction Instructions, everything except for the data to be
 * reduced and the result as inserted by setup_reduce_temp().
 * Operand(0): data to be reduced
 * Operand(1): reduce temporary
 * Operand(2): vector temporary
 * Definition(0): result
 * Definition(1): scalar temporary
 * Definition(2): scalar identity temporary (not used to store identity on GFX10)
 * Definition(3): scc clobber
 * Definition(4): vcc clobber
 *
 */
struct Pseudo_reduction_instruction : public Instruction {
   ReduceOp reduce_op;
   uint16_t cluster_size; // must be 0 for scans
};
static_assert(sizeof(Pseudo_reduction_instruction) == sizeof(Instruction) + 4,
              "Unexpected padding");
1779
1780 inline bool
accessesLDS()1781 Instruction::accessesLDS() const noexcept
1782 {
1783 return (isDS() && !ds().gds) || isLDSDIR() || isVINTRP();
1784 }
1785
/* Swaps operands idx0 and idx1 along with all their per-operand modifiers. */
inline void
VALU_instruction::swapOperands(unsigned idx0, unsigned idx1)
{
   /* SDWA has per-operand sub-dword selections for its two sources. */
   if (this->isSDWA() && idx0 != idx1) {
      assert(idx0 < 2 && idx1 < 2);
      std::swap(this->sdwa().sel[0], this->sdwa().sel[1]);
   }
   assert(idx0 < 3 && idx1 < 3);
   std::swap(this->operands[idx0], this->operands[idx1]);
   /* Keep the per-operand modifier bits consistent with the new order. */
   this->neg[idx0].swap(this->neg[idx1]);
   this->abs[idx0].swap(this->abs[idx1]);
   this->opsel[idx0].swap(this->opsel[idx1]);
   this->opsel_lo[idx0].swap(this->opsel_lo[idx1]);
   this->opsel_hi[idx0].swap(this->opsel_hi[idx1]);
}
1801
struct instr_deleter_functor {
   /* Don't yet free any instructions. They will be de-allocated
    * all at once after compilation finished.
    */
   void operator()(void* p) { return; }
};

/* unique_ptr with a no-op deleter: expresses unique ownership while the
 * memory itself is freed in bulk when compilation finishes (see above). */
template <typename T> using aco_ptr = std::unique_ptr<T, instr_deleter_functor>;
1810
/* Size in bytes of the Instruction subclass used for the given format. */
size_t get_instr_data_size(Format format);

/* Allocates an instruction of the given format with space for the requested
 * number of operands and definitions. */
Instruction* create_instruction(aco_opcode opcode, Format format, uint32_t num_operands,
                                uint32_t num_definitions);
1815
/* Whether the instruction uses any VALU modifiers (DPP/SDWA, or any of
 * neg/abs/opsel/omod/clamp). */
constexpr bool
Instruction::usesModifiers() const noexcept
{
   if (isDPP() || isSDWA())
      return true;

   if (isVOP3P()) {
      const VALU_instruction& vop3p = this->valu();
      /* opsel_hi must be 1 to not be considered a modifier - even for constants */
      return vop3p.opsel_lo || vop3p.clamp || vop3p.neg_lo || vop3p.neg_hi ||
             (vop3p.opsel_hi & BITFIELD_MASK(operands.size())) != BITFIELD_MASK(operands.size());
   } else if (isVALU()) {
      const VALU_instruction& vop3 = this->valu();
      return vop3.opsel || vop3.clamp || vop3.omod || vop3.abs || vop3.neg;
   }
   return false;
}
1833
1834 constexpr bool
is_phi(Instruction * instr)1835 is_phi(Instruction* instr)
1836 {
1837 return instr->opcode == aco_opcode::p_phi || instr->opcode == aco_opcode::p_linear_phi;
1838 }
1839
/* Convenience overload of is_phi() for aco_ptr. */
static inline bool
is_phi(aco_ptr<Instruction>& instr)
{
   return is_phi(instr.get());
}
1845
bool is_wait_export_ready(amd_gfx_level gfx_level, const Instruction* instr);
/* Returns the memory synchronization info of an instruction. */
memory_sync_info get_sync_info(const Instruction* instr);

/* Whether the instruction can be removed given the use-counts in "uses":
 * all of its results must be unused temporaries and it must have no
 * side effects that need to be kept. */
inline bool
is_dead(const std::vector<uint16_t>& uses, const Instruction* instr)
{
   /* These instructions are never considered dead, regardless of uses. */
   if (instr->definitions.empty() || instr->isBranch() || instr->opcode == aco_opcode::p_startpgm ||
       instr->opcode == aco_opcode::p_init_scratch ||
       instr->opcode == aco_opcode::p_dual_src_export_gfx11)
      return false;

   if (std::any_of(instr->definitions.begin(), instr->definitions.end(),
                   [&uses](const Definition& def) { return !def.isTemp() || uses[def.tempId()]; }))
      return false;

   /* Volatile or acquire/release memory operations have observable effects. */
   return !(get_sync_info(instr).semantics & (semantic_volatile | semantic_acqrel));
}
1863
1864 bool can_use_input_modifiers(amd_gfx_level gfx_level, aco_opcode op, int idx);
1865 bool can_use_opsel(amd_gfx_level gfx_level, aco_opcode op, int idx);
1866 bool instr_is_16bit(amd_gfx_level gfx_level, aco_opcode op);
1867 uint8_t get_gfx11_true16_mask(aco_opcode op);
1868 bool can_use_SDWA(amd_gfx_level gfx_level, const aco_ptr<Instruction>& instr, bool pre_ra);
1869 bool can_use_DPP(amd_gfx_level gfx_level, const aco_ptr<Instruction>& instr, bool dpp8);
1870 bool can_write_m0(const aco_ptr<Instruction>& instr);
1871 /* updates "instr" and returns the old instruction (or NULL if no update was needed) */
1872 aco_ptr<Instruction> convert_to_SDWA(amd_gfx_level gfx_level, aco_ptr<Instruction>& instr);
1873 aco_ptr<Instruction> convert_to_DPP(amd_gfx_level gfx_level, aco_ptr<Instruction>& instr,
1874 bool dpp8);
1875 bool needs_exec_mask(const Instruction* instr);
1876
1877 aco_opcode get_vcmp_inverse(aco_opcode op);
1878 aco_opcode get_vcmp_swapped(aco_opcode op);
1879 aco_opcode get_vcmpx(aco_opcode op);
1880 bool is_cmpx(aco_opcode op);
1881
1882 bool can_swap_operands(aco_ptr<Instruction>& instr, aco_opcode* new_op, unsigned idx0 = 0,
1883 unsigned idx1 = 1);
1884
1885 uint32_t get_reduction_identity(ReduceOp op, unsigned idx);
1886
1887 unsigned get_mimg_nsa_dwords(const Instruction* instr);
1888
1889 unsigned get_vopd_opy_start(const Instruction* instr);
1890
1891 unsigned get_operand_size(aco_ptr<Instruction>& instr, unsigned index);
1892
1893 bool should_form_clause(const Instruction* a, const Instruction* b);
1894
/* Bitmask classifying VMEM instructions; see get_vmem_type() below for how an
 * instruction is mapped to one of these. */
1895 enum vmem_type : uint8_t {
1896 vmem_nosampler = 1 << 0,
1897 vmem_sampler = 1 << 1,
/* BVH (ray tracing) intersection instructions — presumably MIMG BVH ops;
 * confirm in get_vmem_type(). */
1898 vmem_bvh = 1 << 2,
1899 };
1900
1901 /* VMEM instructions of the same type return in-order. For GFX12+, this determines which counter
1902 * is used.
1903 */
1904 uint8_t get_vmem_type(enum amd_gfx_level gfx_level, Instruction* instr);
1905
1906 /* For all of the counters, the maximum value means no wait.
1907 * Some of the counters are larger than their bit field,
1908 * but there is no wait mechanism that allows waiting only for higher values.
1909 */
1910 struct depctr_wait {
/* Anonymous union: the bitfields below (16 bits total) alias "packed", which
 * allows initializing, copying and comparing all counters at once. */
1911 union {
1912 struct {
1913 /* VALU completion, apparently even used for VALU without vgpr writes. */
1914 unsigned va_vdst : 4;
1915 /* VALU sgpr write (not including vcc/vcc_hi). */
1916 unsigned va_sdst : 3;
1917 /* VALU sgpr read. */
1918 unsigned va_ssrc : 1;
1919 /* unknown. */
1920 unsigned hold_cnt : 1;
1921 /* VMEM/DS vgpr read. */
1922 unsigned vm_vsrc : 3;
1923 /* VALU vcc/vcc_hi write. */
1924 unsigned va_vcc : 1;
1925 /* SALU sgpr, vcc/vcc_hi or scc write. */
1926 unsigned sa_sdst : 1;
1927 /* VALU exec/exec_hi write. */
1928 unsigned va_exec : 1;
1929 /* SALU exec/exec_hi write. */
1930 unsigned sa_exec : 1;
1931 };
/* All-ones default: every counter at its maximum, i.e. "wait for nothing". */
1932 unsigned packed = -1;
1933 };
1934 };
1935
1936 depctr_wait parse_depctr_wait(const Instruction* instr);
1937
/* Flags describing the role of a basic block in the CFG. Stored as a bitmask
 * in Block::kind; a block may have several of these set at once. */
1938 enum block_kind {
1939 /* uniform indicates that leaving this block,
1940 * all actives lanes stay active */
1941 block_kind_uniform = 1 << 0,
1942 block_kind_top_level = 1 << 1,
1943 block_kind_loop_preheader = 1 << 2,
1944 block_kind_loop_header = 1 << 3,
1945 block_kind_loop_exit = 1 << 4,
1946 block_kind_continue = 1 << 5,
1947 block_kind_break = 1 << 6,
1948 block_kind_continue_or_break = 1 << 7,
1949 block_kind_branch = 1 << 8,
1950 block_kind_merge = 1 << 9,
1951 block_kind_invert = 1 << 10,
1952 block_kind_discard_early_exit = 1 << 11,
1953 block_kind_uses_discard = 1 << 12,
1954 block_kind_resume = 1 << 13,
1955 block_kind_export_end = 1 << 14,
1956 block_kind_end_with_regs = 1 << 15,
1957 };
1958
1959 /* CFG */
1960 struct Block {
1961 using edge_vec = small_vec<uint32_t, 2>;
1962
1963 float_mode fp_mode;
1964 unsigned index;
1965 unsigned offset = 0;
1966 std::vector<aco_ptr<Instruction>> instructions;
1967 edge_vec logical_preds;
1968 edge_vec linear_preds;
1969 edge_vec logical_succs;
1970 edge_vec linear_succs;
1971 RegisterDemand register_demand = RegisterDemand();
1972 RegisterDemand live_in_demand = RegisterDemand();
1973 uint32_t kind = 0;
1974 int32_t logical_idom = -1;
1975 int32_t linear_idom = -1;
1976
1977 /* Preorder and postorder traversal indices of the dominance tree. Because a program can have
1978 * several dominance trees (because of block_kind_resume), these start at the block index of the
1979 * root node. */
1980 uint32_t logical_dom_pre_index = 0;
1981 uint32_t logical_dom_post_index = 0;
1982 uint32_t linear_dom_pre_index = 0;
1983 uint32_t linear_dom_post_index = 0;
1984
1985 uint16_t loop_nest_depth = 0;
1986 uint16_t divergent_if_logical_depth = 0;
1987 uint16_t uniform_if_depth = 0;
1988
BlockBlock1989 Block() : index(0) {}
1990 };
1991
/*
 * Shader stages as provided in Vulkan by the application. Contrast this to HWStage.
 *
 * Values form a bitmask so several stages can be merged into one shader.
 */
enum class SWStage : uint16_t {
   None = 0,
   VS = 1 << 0,  /* Vertex Shader */
   GS = 1 << 1,  /* Geometry Shader */
   TCS = 1 << 2, /* Tessellation Control aka Hull Shader */
   TES = 1 << 3, /* Tessellation Evaluation aka Domain Shader */
   FS = 1 << 4,  /* Fragment aka Pixel Shader */
   CS = 1 << 5,  /* Compute Shader */
   TS = 1 << 6,  /* Task Shader */
   MS = 1 << 7,  /* Mesh Shader */
   RT = 1 << 8,  /* Raytracing Shader */

   /* Stage combinations merged to run on a single HWStage */
   VS_GS = VS | GS,
   VS_TCS = VS | TCS,
   TES_GS = TES | GS,
};

/* Combine two stage masks. */
constexpr SWStage
operator|(SWStage a, SWStage b)
{
   const uint16_t lhs = static_cast<uint16_t>(a);
   const uint16_t rhs = static_cast<uint16_t>(b);
   return static_cast<SWStage>(lhs | rhs);
}
2018
2019 /*
2020 * Set of SWStages to be merged into a single shader paired with the
2021 * HWStage it will run on.
2022 */
2023 struct Stage {
2024 constexpr Stage() = default;
2025
StageStage2026 explicit constexpr Stage(ac_hw_stage hw_, SWStage sw_) : sw(sw_), hw(hw_) {}
2027
2028 /* Check if the given SWStage is included */
hasStage2029 constexpr bool has(SWStage stage) const
2030 {
2031 return (static_cast<uint16_t>(sw) & static_cast<uint16_t>(stage));
2032 }
2033
num_sw_stagesStage2034 unsigned num_sw_stages() const { return util_bitcount(static_cast<uint16_t>(sw)); }
2035
2036 constexpr bool operator==(const Stage& other) const { return sw == other.sw && hw == other.hw; }
2037
2038 constexpr bool operator!=(const Stage& other) const { return sw != other.sw || hw != other.hw; }
2039
2040 /* Mask of merged software stages */
2041 SWStage sw = SWStage::None;
2042
2043 /* Active hardware stage */
2044 ac_hw_stage hw{};
2045 };
2046
2047 /* possible settings of Program::stage */
/* Each constant pairs the hardware stage the shader runs on with the set of
 * merged software stages it contains. */
2048 static constexpr Stage vertex_vs(AC_HW_VERTEX_SHADER, SWStage::VS);
2049 static constexpr Stage fragment_fs(AC_HW_PIXEL_SHADER, SWStage::FS);
2050 static constexpr Stage compute_cs(AC_HW_COMPUTE_SHADER, SWStage::CS);
/* tessellation evaluation running as a hardware vertex shader */
2051 static constexpr Stage tess_eval_vs(AC_HW_VERTEX_SHADER, SWStage::TES);
2052 /* Mesh shading pipeline */
2053 static constexpr Stage task_cs(AC_HW_COMPUTE_SHADER, SWStage::TS);
2054 static constexpr Stage mesh_ngg(AC_HW_NEXT_GEN_GEOMETRY_SHADER, SWStage::MS);
2055 /* GFX10/NGG */
2056 static constexpr Stage vertex_ngg(AC_HW_NEXT_GEN_GEOMETRY_SHADER, SWStage::VS);
2057 static constexpr Stage vertex_geometry_ngg(AC_HW_NEXT_GEN_GEOMETRY_SHADER, SWStage::VS_GS);
2058 static constexpr Stage tess_eval_ngg(AC_HW_NEXT_GEN_GEOMETRY_SHADER, SWStage::TES);
2059 static constexpr Stage tess_eval_geometry_ngg(AC_HW_NEXT_GEN_GEOMETRY_SHADER, SWStage::TES_GS);
2060 /* GFX9 (and GFX10 if NGG isn't used) */
2061 static constexpr Stage vertex_geometry_gs(AC_HW_LEGACY_GEOMETRY_SHADER, SWStage::VS_GS);
2062 static constexpr Stage vertex_tess_control_hs(AC_HW_HULL_SHADER, SWStage::VS_TCS);
2063 static constexpr Stage tess_eval_geometry_gs(AC_HW_LEGACY_GEOMETRY_SHADER, SWStage::TES_GS);
2064 /* pre-GFX9 */
2065 static constexpr Stage vertex_ls(AC_HW_LOCAL_SHADER,
2066 SWStage::VS); /* vertex before tessellation control */
2067 static constexpr Stage vertex_es(AC_HW_EXPORT_SHADER, SWStage::VS); /* vertex before geometry */
2068 static constexpr Stage tess_control_hs(AC_HW_HULL_SHADER, SWStage::TCS);
2069 static constexpr Stage tess_eval_es(AC_HW_EXPORT_SHADER,
2070 SWStage::TES); /* tessellation evaluation before geometry */
2071 static constexpr Stage geometry_gs(AC_HW_LEGACY_GEOMETRY_SHADER, SWStage::GS);
2072 /* Raytracing */
2073 static constexpr Stage raytracing_cs(AC_HW_COMPUTE_SHADER, SWStage::RT);
2074
/* Static description of the target GPU, filled in during program setup and
 * read by the compilation passes. */
2075 struct DeviceInfo {
2076 uint16_t lds_encoding_granule;
2077 uint16_t lds_alloc_granule;
2078 uint32_t lds_limit; /* in bytes */
2079 bool has_16bank_lds;
/* presumably register counts per SIMD vs. per-wave addressable limits —
 * confirm against init_program() */
2080 uint16_t physical_sgprs;
2081 uint16_t physical_vgprs;
2082 uint16_t vgpr_limit;
2083 uint16_t sgpr_limit;
2084 uint16_t sgpr_alloc_granule;
2085 uint16_t vgpr_alloc_granule;
2086 unsigned scratch_alloc_granule;
2087 uint16_t max_waves_per_simd;
2088 unsigned simd_per_cu;
2089 bool has_fast_fma32 = false;
2090 bool has_mac_legacy32 = false;
2091 bool has_fmac_legacy32 = false;
2092 bool fused_mad_mix = false;
2093 bool xnack_enabled = false;
2094 bool sram_ecc_enabled = false;
2095
/* Valid immediate-offset range for scratch/global memory instructions. */
2096 int16_t scratch_global_offset_min;
2097 int16_t scratch_global_offset_max;
2098 unsigned max_nsa_vgprs;
2099 };
2100
/* Coarse position within the compilation pipeline; tracked in
 * Program::progress. */
2101 enum class CompilationProgress {
2102 after_isel,
2103 after_spilling,
2104 after_ra,
2105 after_lower_to_hw,
2106 };
2107
/* A complete shader program: its CFG (blocks), temporaries, target
 * description and per-pass state shared by all compilation passes. */
2108 class Program final {
2109 public:
/* Arena allocator — presumably backs instruction storage, released in bulk
 * with the Program (see instr_deleter_functor); confirm in
 * create_instruction(). */
2110 aco::monotonic_buffer_resource m{65536};
2111 std::vector<Block> blocks;
/* Register class of each temporary, indexed by temp id; ids are allocated by
 * allocateId() below, so temp_rc.size() is also the id count. */
2112 std::vector<RegClass> temp_rc = {s1};
2113 RegisterDemand max_reg_demand = RegisterDemand();
2114 ac_shader_config* config;
2115 struct aco_shader_info info;
2116 enum amd_gfx_level gfx_level;
2117 enum radeon_family family;
2118 DeviceInfo dev;
2119 unsigned wave_size;
2120 RegClass lane_mask;
2121 Stage stage;
2122 bool needs_exact = false; /* there exists an instruction with disable_wqm = true */
2123 bool needs_wqm = false; /* there exists a p_wqm instruction */
2124 bool has_smem_buffer_or_global_loads = false;
2125 bool has_pops_overlapped_waves_wait = false;
2126 bool has_color_exports = false;
2127 bool is_prolog = false;
2128 bool is_epilog = false;
2129
2130 std::vector<ac_shader_debug_info> debug_info;
2131
2132 std::vector<uint8_t> constant_data;
2133 Temp private_segment_buffer;
2134 Temp scratch_offset;
2135
2136 uint16_t num_waves = 0;
2137 uint16_t min_waves = 0;
2138 unsigned workgroup_size; /* if known; otherwise UINT_MAX */
2139 bool wgp_mode;
2140
2141 bool needs_vcc = false;
2142
2143 CompilationProgress progress;
2144
2145 bool collect_statistics = false;
2146 uint32_t statistics[aco_num_statistics];
2147
/* Defaults stamped onto each newly inserted block (see insert_block()). */
2148 float_mode next_fp_mode;
2149 unsigned next_loop_depth = 0;
2150 unsigned next_divergent_if_logical_depth = 0;
2151 unsigned next_uniform_if_depth = 0;
2152
2153 std::vector<Definition> args_pending_vmem;
2154
2155 /* For shader part with previous shader part that has lds access. */
2156 bool pending_lds_access = false;
2157
2158 bool should_repair_ssa = false;
2159
/* Liveness results (filled by live_var_analysis()). */
2160 struct {
2161 monotonic_buffer_resource memory;
2162 /* live-in temps per block */
2163 std::vector<IDSet> live_in;
2164 } live;
2165
/* Destination and callback for compiler diagnostics (see _aco_err). */
2166 struct {
2167 FILE* output = stderr;
2168 bool shorten_messages = false;
2169 void (*func)(void* private_data, enum aco_compiler_debug_level level, const char* message);
2170 void* private_data;
2171 } debug;
2172
/* Reserve `amount` consecutive fresh temp ids (capped at 2^24 total ids,
 * matching allocateId() below). */
allocateRange(unsigned amount)2173 void allocateRange(unsigned amount)
2174 {
2175 assert(temp_rc.size() + amount <= 16777216);
2176 temp_rc.resize(temp_rc.size() + amount);
2177 }
2178
/* Allocate a fresh temporary of register class `rc`. */
allocateTmp(RegClass rc)2179 Temp allocateTmp(RegClass rc) { return Temp(allocateId(rc), rc); }
2180
/* Id that the next allocateId() call would return. */
peekAllocationId()2181 uint32_t peekAllocationId() { return temp_rc.size(); }
2182
/* Append a default-constructed block to the CFG. */
create_and_insert_block()2183 Block* create_and_insert_block()
2184 {
2185 Block block;
2186 return insert_block(std::move(block));
2187 }
2188
/* Append `block`, stamping its index, fp mode and nesting depths from the
 * Program's `next_*` state. Returns a pointer into `blocks` (invalidated by
 * further insertions). */
insert_block(Block && block)2189 Block* insert_block(Block&& block)
2190 {
2191 block.index = blocks.size();
2192 block.fp_mode = next_fp_mode;
2193 block.loop_nest_depth = next_loop_depth;
2194 block.divergent_if_logical_depth = next_divergent_if_logical_depth;
2195 block.uniform_if_depth = next_uniform_if_depth;
2196 blocks.emplace_back(std::move(block));
2197 return &blocks.back();
2198 }
2199
2200 private:
/* Allocate one temp id; ids are capped at 16777215 (2^24 - 1), presumably so
 * they fit a 24-bit id field in Temp — confirm against Temp's definition. */
allocateId(RegClass rc)2201 uint32_t allocateId(RegClass rc)
2202 {
2203 assert(temp_rc.size() <= 16777215);
2204 temp_rc.push_back(rc);
2205 return temp_rc.size() - 1;
2206 }
2207 };
2208
/* Knobs that let tests steer the register allocator (see
 * register_allocation()). */
struct ra_test_policy {
   /* Force RA to always use its pessimistic fallback algorithm */
   bool skip_optimistic_path{false};
};
2213
2214 void init();
2215
2216 void init_program(Program* program, Stage stage, const struct aco_shader_info* info,
2217 enum amd_gfx_level gfx_level, enum radeon_family family, bool wgp_mode,
2218 ac_shader_config* config);
2219
2220 void select_program(Program* program, unsigned shader_count, struct nir_shader* const* shaders,
2221 ac_shader_config* config, const struct aco_compiler_options* options,
2222 const struct aco_shader_info* info, const struct ac_shader_args* args);
2223 void select_trap_handler_shader(Program* program, ac_shader_config* config,
2224 const struct aco_compiler_options* options,
2225 const struct aco_shader_info* info,
2226 const struct ac_shader_args* args);
2227 void select_rt_prolog(Program* program, ac_shader_config* config,
2228 const struct aco_compiler_options* options,
2229 const struct aco_shader_info* info, const struct ac_shader_args* in_args,
2230 const struct ac_shader_args* out_args);
2231 void select_vs_prolog(Program* program, const struct aco_vs_prolog_info* pinfo,
2232 ac_shader_config* config, const struct aco_compiler_options* options,
2233 const struct aco_shader_info* info, const struct ac_shader_args* args);
2234
2235 void select_ps_epilog(Program* program, void* pinfo, ac_shader_config* config,
2236 const struct aco_compiler_options* options,
2237 const struct aco_shader_info* info, const struct ac_shader_args* args);
2238
2239 void select_ps_prolog(Program* program, void* pinfo, ac_shader_config* config,
2240 const struct aco_compiler_options* options,
2241 const struct aco_shader_info* info, const struct ac_shader_args* args);
2242
2243 bool repair_ssa(Program* program);
2244 void lower_phis(Program* program);
2245 void lower_subdword(Program* program);
2246 void calc_min_waves(Program* program);
2247 void update_vgpr_sgpr_demand(Program* program, const RegisterDemand new_demand);
2248 void live_var_analysis(Program* program);
2249 std::vector<uint16_t> dead_code_analysis(Program* program);
2250 void dominator_tree(Program* program);
2251 void insert_exec_mask(Program* program);
2252 void value_numbering(Program* program);
2253 void optimize(Program* program);
2254 void optimize_postRA(Program* program);
2255 void lower_branches(Program* program);
2256 void setup_reduce_temp(Program* program);
2257 void lower_to_cssa(Program* program);
2258 void register_allocation(Program* program, ra_test_policy = {});
2259 void reindex_ssa(Program* program);
2260 void ssa_elimination(Program* program);
2261 void jump_threading(Program* program);
2262 void lower_to_hw_instr(Program* program);
2263 void schedule_program(Program* program);
2264 void schedule_ilp(Program* program);
2265 void schedule_vopd(Program* program);
2266 void spill(Program* program);
2267 void insert_waitcnt(Program* program);
2268 void insert_delay_alu(Program* program);
2269 void combine_delay_alu(Program* program);
2270 bool dealloc_vgprs(Program* program);
2271 void insert_NOPs(Program* program);
2272 void form_hard_clauses(Program* program);
2273 unsigned emit_program(Program* program, std::vector<uint32_t>& code,
2274 std::vector<struct aco_symbol>* symbols = NULL, bool append_endpgm = true);
2275 /**
2276 * Returns true if print_asm can disassemble the given program for the current build/runtime
2277 * configuration
2278 */
2279 bool check_print_asm_support(Program* program);
2280 bool print_asm(Program* program, std::vector<uint32_t>& binary, unsigned exec_size, FILE* output);
2281 bool validate_ir(Program* program);
2282 bool validate_cfg(Program* program);
2283 bool validate_ra(Program* program);
2284 bool validate_live_vars(Program* program);
2285
2286 void collect_presched_stats(Program* program);
2287 void collect_preasm_stats(Program* program);
2288 void collect_postasm_stats(Program* program, const std::vector<uint32_t>& code);
2289
/* Cost-model entry for one instruction, produced by get_cycle_info(). */
2290 struct Instruction_cycle_info {
2291 /* Latency until the result is ready (if not needing a waitcnt) */
2292 unsigned latency;
2293
2294 /* How many cycles issuing this instruction takes (i.e. cycles till the next instruction can be
2295 * issued)*/
2296 unsigned issue_cycles;
2297 };
2298
2299 Instruction_cycle_info get_cycle_info(const Program& program, const Instruction& instr);
2300
/* Bitmask options for aco_print_program()/aco_print_instr(). */
2301 enum print_flags {
2302 print_no_ssa = 0x1,
2303 print_perf_info = 0x2,
2304 print_kill = 0x4,
2305 print_live_vars = 0x8,
2306 };
2307
2308 void aco_print_operand(const Operand* operand, FILE* output, unsigned flags = 0);
2309 void aco_print_instr(enum amd_gfx_level gfx_level, const Instruction* instr, FILE* output,
2310 unsigned flags = 0);
2311 void aco_print_program(const Program* program, FILE* output, unsigned flags = 0);
2312
2313 void _aco_err(Program* program, const char* file, unsigned line, const char* fmt, ...);
2314
2315 #define aco_err(program, ...) _aco_err(program, __FILE__, __LINE__, __VA_ARGS__)
2316
2317 int get_op_fixed_to_def(Instruction* instr);
2318
2319 /* utilities for dealing with register demand */
2320 RegisterDemand get_live_changes(Instruction* instr);
2321 RegisterDemand get_temp_registers(Instruction* instr);
2322
2323 /* number of sgprs that need to be allocated but might notbe addressable as s0-s105 */
2324 uint16_t get_extra_sgprs(Program* program);
2325
2326 /* adjust num_waves for workgroup size and LDS limits */
2327 uint16_t max_suitable_waves(Program* program, uint16_t waves);
2328
2329 /* get number of sgprs/vgprs allocated required to address a number of sgprs/vgprs */
2330 uint16_t get_sgpr_alloc(Program* program, uint16_t addressable_sgprs);
2331 uint16_t get_vgpr_alloc(Program* program, uint16_t addressable_vgprs);
2332
2333 /* return number of addressable sgprs/vgprs for max_waves */
2334 uint16_t get_addr_sgpr_from_waves(Program* program, uint16_t max_waves);
2335 uint16_t get_addr_vgpr_from_waves(Program* program, uint16_t max_waves);
2336
2337 bool uses_scratch(Program* program);
2338
2339 inline bool
dominates_logical(const Block & parent,const Block & child)2340 dominates_logical(const Block& parent, const Block& child)
2341 {
2342 return child.logical_dom_pre_index >= parent.logical_dom_pre_index &&
2343 child.logical_dom_post_index <= parent.logical_dom_post_index;
2344 }
2345
2346 inline bool
dominates_linear(const Block & parent,const Block & child)2347 dominates_linear(const Block& parent, const Block& child)
2348 {
2349 return child.linear_dom_pre_index >= parent.linear_dom_pre_index &&
2350 child.linear_dom_post_index <= parent.linear_dom_post_index;
2351 }
2352
/* Static per-opcode information tables, each indexed by aco_opcode
 * (see the single global instance instr_info below). */
2353 typedef struct {
/* Hardware encodings per gfx generation — presumably -1 where an opcode has
 * no encoding on that generation; confirm against the table generator. */
2354 const int16_t opcode_gfx7[static_cast<int>(aco_opcode::num_opcodes)];
2355 const int16_t opcode_gfx9[static_cast<int>(aco_opcode::num_opcodes)];
2356 const int16_t opcode_gfx10[static_cast<int>(aco_opcode::num_opcodes)];
2357 const int16_t opcode_gfx11[static_cast<int>(aco_opcode::num_opcodes)];
2358 const int16_t opcode_gfx12[static_cast<int>(aco_opcode::num_opcodes)];
2359 const std::bitset<static_cast<int>(aco_opcode::num_opcodes)> can_use_input_modifiers;
2360 const std::bitset<static_cast<int>(aco_opcode::num_opcodes)> can_use_output_modifiers;
2361 const std::bitset<static_cast<int>(aco_opcode::num_opcodes)> is_atomic;
2362 const char* name[static_cast<int>(aco_opcode::num_opcodes)];
2363 const aco::Format format[static_cast<int>(aco_opcode::num_opcodes)];
2364 /* sizes used for input/output modifiers and constants */
2365 const unsigned operand_size[static_cast<int>(aco_opcode::num_opcodes)];
2366 const instr_class classes[static_cast<int>(aco_opcode::num_opcodes)];
2367 const uint32_t definitions[static_cast<int>(aco_opcode::num_opcodes)];
2368 const uint32_t operands[static_cast<int>(aco_opcode::num_opcodes)];
2369 } Info;
2370
2371 extern const Info instr_info;
2372
2373 } // namespace aco
2374
2375 #endif /* ACO_IR_H */
2376