/*
 * Copyright © 2018 Valve Corporation
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
 * IN THE SOFTWARE.
 *
 */

#ifndef ACO_IR_H
#define ACO_IR_H

#include "aco_opcodes.h"
#include "aco_util.h"
#include "aco_interface.h"
#include "aco_shader_info.h"
#include "vulkan/radv_shader.h"

#include "nir.h"

#include <bitset>
#include <memory>
#include <vector>

struct radv_shader_args;

namespace aco {

extern uint64_t debug_flags;

enum {
   DEBUG_VALIDATE_IR = 0x1,
   DEBUG_VALIDATE_RA = 0x2,
   DEBUG_PERFWARN = 0x4,
   DEBUG_FORCE_WAITCNT = 0x8,
   DEBUG_NO_VN = 0x10,
   DEBUG_NO_OPT = 0x20,
   DEBUG_NO_SCHED = 0x40,
   DEBUG_PERF_INFO = 0x80,
   DEBUG_LIVE_INFO = 0x100,
};

/**
 * Representation of the instruction's microcode encoding format
 * Note: Some Vector ALU Formats can be combined, such that:
 * - VOP2* | VOP3 represents a VOP2 instruction in VOP3 encoding
 * - VOP2* | DPP represents a VOP2 instruction with data parallel primitives.
 * - VOP2* | SDWA represents a VOP2 instruction with sub-dword addressing.
 *
 * (*) The same is applicable for VOP1 and VOPC instructions.
 */
enum class Format : std::uint16_t {
   /* Pseudo Instruction Format */
   PSEUDO = 0,
   /* Scalar ALU & Control Formats */
   SOP1 = 1,
   SOP2 = 2,
   SOPK = 3,
   SOPP = 4,
   SOPC = 5,
   /* Scalar Memory Format */
   SMEM = 6,
   /* LDS/GDS Format */
   DS = 8,
   /* Vector Memory Buffer Formats */
   MTBUF = 9,
   MUBUF = 10,
   /* Vector Memory Image Format */
   MIMG = 11,
   /* Export Format */
   EXP = 12,
   /* Flat Formats */
   FLAT = 13,
   GLOBAL = 14,
   SCRATCH = 15,

   PSEUDO_BRANCH = 16,
   PSEUDO_BARRIER = 17,
   PSEUDO_REDUCTION = 18,

   /* Vector ALU Formats */
   VOP3P = 19,
   VOP1 = 1 << 8,
   VOP2 = 1 << 9,
   VOPC = 1 << 10,
   VOP3 = 1 << 11,
   /* Vector Parameter Interpolation Format */
   VINTRP = 1 << 12,
   DPP16 = 1 << 13,
   SDWA = 1 << 14,
   DPP8 = 1 << 15,
};
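
/* Illustrative sketch only: because the Vector ALU formats occupy single
 * bits, a combined encoding keeps its base-format bit, so format tests like
 * Instruction::isVOP2() (further below) still succeed after promotion:
 *
 *    constexpr Format f = (Format)((uint16_t)Format::VOP2 | (uint16_t)Format::VOP3);
 *    static_assert((uint16_t)f & (uint16_t)Format::VOP2, "still a VOP2 instruction");
 *    static_assert((uint16_t)f & (uint16_t)Format::VOP3, "in the VOP3 encoding");
 */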

enum class instr_class : uint8_t {
   valu32 = 0,
   valu_convert32 = 1,
   valu64 = 2,
   valu_quarter_rate32 = 3,
   valu_fma = 4,
   valu_transcendental32 = 5,
   valu_double = 6,
   valu_double_add = 7,
   valu_double_convert = 8,
   valu_double_transcendental = 9,
   salu = 10,
   smem = 11,
   barrier = 12,
   branch = 13,
   sendmsg = 14,
   ds = 15,
   exp = 16,
   vmem = 17,
   waitcnt = 18,
   other = 19,
   count,
};

enum storage_class : uint8_t {
   storage_none = 0x0, /* no synchronization and can be reordered around aliasing stores */
   storage_buffer = 0x1, /* SSBOs and global memory */
   storage_atomic_counter = 0x2, /* not used for Vulkan */
   storage_image = 0x4,
   storage_shared = 0x8, /* or TCS output */
   storage_vmem_output = 0x10, /* GS or TCS output stores using VMEM */
   storage_task_payload = 0x20, /* Task-Mesh payload */
   storage_scratch = 0x40,
   storage_vgpr_spill = 0x80,
   storage_count = 8, /* not counting storage_none */
};

enum memory_semantics : uint8_t {
   semantic_none = 0x0,
   /* for loads: don't move any access after this load to before this load (even other loads)
    * for barriers: don't move any access after the barrier to before any
    * atomics/control_barriers/sendmsg_gs_done before the barrier */
   semantic_acquire = 0x1,
   /* for stores: don't move any access before this store to after this store
    * for barriers: don't move any access before the barrier to after any
    * atomics/control_barriers/sendmsg_gs_done after the barrier */
   semantic_release = 0x2,

   /* the rest are for loads/stores/atomics only */
   /* cannot be DCE'd or CSE'd */
   semantic_volatile = 0x4,
   /* does not interact with barriers and assumes this lane is the only lane
    * accessing this memory */
   semantic_private = 0x8,
   /* this operation can be reordered around operations of the same storage.
    * says nothing about barriers */
   semantic_can_reorder = 0x10,
   /* this is an atomic instruction (may only read or write memory) */
   semantic_atomic = 0x20,
   /* this instruction both reads and writes memory */
   semantic_rmw = 0x40,

   semantic_acqrel = semantic_acquire | semantic_release,
   semantic_atomicrmw = semantic_volatile | semantic_atomic | semantic_rmw,
};

enum sync_scope : uint8_t {
   scope_invocation = 0,
   scope_subgroup = 1,
   scope_workgroup = 2,
   scope_queuefamily = 3,
   scope_device = 4,
};

struct memory_sync_info {
   memory_sync_info() : storage(storage_none), semantics(semantic_none), scope(scope_invocation) {}
   memory_sync_info(int storage_, int semantics_ = 0, sync_scope scope_ = scope_invocation)
       : storage((storage_class)storage_), semantics((memory_semantics)semantics_), scope(scope_)
   {}

   storage_class storage : 8;
   memory_semantics semantics : 8;
   sync_scope scope : 8;

   bool operator==(const memory_sync_info& rhs) const
   {
      return storage == rhs.storage && semantics == rhs.semantics && scope == rhs.scope;
   }

   bool can_reorder() const
   {
      if (semantics & semantic_acqrel)
         return false;
      /* Also check storage so that zero-initialized memory_sync_info can be
       * reordered. */
      return (!storage || (semantics & semantic_can_reorder)) && !(semantics & semantic_volatile);
   }
};
static_assert(sizeof(memory_sync_info) == 3, "Unexpected padding");
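
/* Usage sketch (illustrative): an acquire operation on buffer storage pins
 * ordering, while a zero-initialized memory_sync_info stays fully reorderable:
 *
 *    memory_sync_info sync(storage_buffer, semantic_acquire, scope_device);
 *    assert(!sync.can_reorder());
 *    assert(memory_sync_info().can_reorder());
 */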

enum fp_round {
   fp_round_ne = 0,
   fp_round_pi = 1,
   fp_round_ni = 2,
   fp_round_tz = 3,
};

enum fp_denorm {
   /* Note that v_rcp_f32, v_exp_f32, v_log_f32, v_sqrt_f32, v_rsq_f32 and
    * v_mad_f32/v_madak_f32/v_madmk_f32/v_mac_f32 always flush denormals. */
   fp_denorm_flush = 0x0,
   fp_denorm_keep_in = 0x1,
   fp_denorm_keep_out = 0x2,
   fp_denorm_keep = 0x3,
};

struct float_mode {
   /* matches encoding of the MODE register */
   union {
      struct {
         fp_round round32 : 2;
         fp_round round16_64 : 2;
         unsigned denorm32 : 2;
         unsigned denorm16_64 : 2;
      };
      struct {
         uint8_t round : 4;
         uint8_t denorm : 4;
      };
      uint8_t val = 0;
   };
   /* if false, optimizations which may remove infs/nan/-0.0 can be done */
   bool preserve_signed_zero_inf_nan32 : 1;
   bool preserve_signed_zero_inf_nan16_64 : 1;
   /* if false, optimizations which may remove denormal flushing can be done */
   bool must_flush_denorms32 : 1;
   bool must_flush_denorms16_64 : 1;
   bool care_about_round32 : 1;
   bool care_about_round16_64 : 1;

   /* Returns true if instructions using the mode "other" can safely use the
    * current one instead. */
   bool canReplace(float_mode other) const noexcept
   {
      return val == other.val &&
             (preserve_signed_zero_inf_nan32 || !other.preserve_signed_zero_inf_nan32) &&
             (preserve_signed_zero_inf_nan16_64 || !other.preserve_signed_zero_inf_nan16_64) &&
             (must_flush_denorms32 || !other.must_flush_denorms32) &&
             (must_flush_denorms16_64 || !other.must_flush_denorms16_64) &&
             (care_about_round32 || !other.care_about_round32) &&
             (care_about_round16_64 || !other.care_about_round16_64);
   }
};
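
/* Sketch of the canReplace() direction (illustrative): a mode that guarantees
 * strictly more (here: denormal flushing) may stand in for a weaker mode, but
 * not the other way around; the raw MODE bits (val) must still match exactly:
 *
 *    float_mode strict = {};
 *    float_mode relaxed = {};
 *    strict.must_flush_denorms32 = true;
 *    assert(strict.canReplace(relaxed));
 *    assert(!relaxed.canReplace(strict));
 */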

struct wait_imm {
   static const uint8_t unset_counter = 0xff;

   uint8_t vm;
   uint8_t exp;
   uint8_t lgkm;
   uint8_t vs;

   wait_imm();
   wait_imm(uint16_t vm_, uint16_t exp_, uint16_t lgkm_, uint16_t vs_);
   wait_imm(enum amd_gfx_level chip, uint16_t packed);

   uint16_t pack(enum amd_gfx_level chip) const;

   bool combine(const wait_imm& other);

   bool empty() const;
};

constexpr Format
asVOP3(Format format)
{
   return (Format)((uint32_t)Format::VOP3 | (uint32_t)format);
};

constexpr Format
asSDWA(Format format)
{
   assert(format == Format::VOP1 || format == Format::VOP2 || format == Format::VOPC);
   return (Format)((uint32_t)Format::SDWA | (uint32_t)format);
}

constexpr Format
withoutDPP(Format format)
{
   return (Format)((uint32_t)format & ~((uint32_t)Format::DPP16 | (uint32_t)Format::DPP8));
}
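
/* Illustrative: these helpers only set or clear encoding bits, so the base
 * format is preserved:
 *
 *    constexpr Format f = asVOP3(Format::VOP2); // VOP2 in the VOP3 encoding
 *    constexpr Format g = (Format)((uint32_t)Format::VOP2 | (uint32_t)Format::DPP16);
 *    static_assert(withoutDPP(g) == Format::VOP2, "only DPP16/DPP8 bits are stripped");
 */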

enum class RegType {
   none = 0,
   sgpr,
   vgpr,
   linear_vgpr,
};

struct RegClass {

   enum RC : uint8_t {
      s1 = 1,
      s2 = 2,
      s3 = 3,
      s4 = 4,
      s6 = 6,
      s8 = 8,
      s16 = 16,
      v1 = s1 | (1 << 5),
      v2 = s2 | (1 << 5),
      v3 = s3 | (1 << 5),
      v4 = s4 | (1 << 5),
      v5 = 5 | (1 << 5),
      v6 = 6 | (1 << 5),
      v7 = 7 | (1 << 5),
      v8 = 8 | (1 << 5),
      /* byte-sized register class */
      v1b = v1 | (1 << 7),
      v2b = v2 | (1 << 7),
      v3b = v3 | (1 << 7),
      v4b = v4 | (1 << 7),
      v6b = v6 | (1 << 7),
      v8b = v8 | (1 << 7),
      /* these are used for WWM and spills to vgpr */
      v1_linear = v1 | (1 << 6),
      v2_linear = v2 | (1 << 6),
   };

   RegClass() = default;
   constexpr RegClass(RC rc_) : rc(rc_) {}
   constexpr RegClass(RegType type, unsigned size)
       : rc((RC)((type == RegType::vgpr ? 1 << 5 : 0) | size))
   {}

   constexpr operator RC() const { return rc; }
   explicit operator bool() = delete;

   constexpr RegType type() const { return rc <= RC::s16 ? RegType::sgpr : RegType::vgpr; }
   constexpr bool is_linear_vgpr() const { return rc & (1 << 6); };
   constexpr bool is_subdword() const { return rc & (1 << 7); }
   constexpr unsigned bytes() const { return ((unsigned)rc & 0x1F) * (is_subdword() ? 1 : 4); }
   // TODO: use size() less in favor of bytes()
   constexpr unsigned size() const { return (bytes() + 3) >> 2; }
   constexpr bool is_linear() const { return rc <= RC::s16 || is_linear_vgpr(); }
   constexpr RegClass as_linear() const { return RegClass((RC)(rc | (1 << 6))); }
   constexpr RegClass as_subdword() const { return RegClass((RC)(rc | 1 << 7)); }

   static constexpr RegClass get(RegType type, unsigned bytes)
   {
      if (type == RegType::sgpr) {
         return RegClass(type, DIV_ROUND_UP(bytes, 4u));
      } else {
         return bytes % 4u ? RegClass(type, bytes).as_subdword() : RegClass(type, bytes / 4u);
      }
   }

   constexpr RegClass resize(unsigned bytes) const
   {
      if (is_linear_vgpr()) {
         assert(bytes % 4u == 0);
         return get(RegType::vgpr, bytes).as_linear();
      }
      return get(type(), bytes);
   }

private:
   RC rc;
};

/* transitional helper expressions */
static constexpr RegClass s1{RegClass::s1};
static constexpr RegClass s2{RegClass::s2};
static constexpr RegClass s3{RegClass::s3};
static constexpr RegClass s4{RegClass::s4};
static constexpr RegClass s8{RegClass::s8};
static constexpr RegClass s16{RegClass::s16};
static constexpr RegClass v1{RegClass::v1};
static constexpr RegClass v2{RegClass::v2};
static constexpr RegClass v3{RegClass::v3};
static constexpr RegClass v4{RegClass::v4};
static constexpr RegClass v5{RegClass::v5};
static constexpr RegClass v6{RegClass::v6};
static constexpr RegClass v7{RegClass::v7};
static constexpr RegClass v8{RegClass::v8};
static constexpr RegClass v1b{RegClass::v1b};
static constexpr RegClass v2b{RegClass::v2b};
static constexpr RegClass v3b{RegClass::v3b};
static constexpr RegClass v4b{RegClass::v4b};
static constexpr RegClass v6b{RegClass::v6b};
static constexpr RegClass v8b{RegClass::v8b};
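
/* Illustrative: RegClass::get() rounds SGPR sizes up to whole dwords, while
 * unaligned VGPR sizes yield sub-dword classes:
 *
 *    static_assert(RegClass::get(RegType::sgpr, 6) == s2, "6 bytes -> 2 SGPRs");
 *    static_assert(RegClass::get(RegType::vgpr, 6) == v6b, "sub-dword VGPR class");
 *    static_assert(v6b.bytes() == 6 && v6b.size() == 2, "");
 */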

/**
 * Temp Class
 * Each temporary virtual register has a
 * register class (i.e. size and type)
 * and SSA id.
 */
struct Temp {
   Temp() noexcept : id_(0), reg_class(0) {}
   constexpr Temp(uint32_t id, RegClass cls) noexcept : id_(id), reg_class(uint8_t(cls)) {}

   constexpr uint32_t id() const noexcept { return id_; }
   constexpr RegClass regClass() const noexcept { return (RegClass::RC)reg_class; }

   constexpr unsigned bytes() const noexcept { return regClass().bytes(); }
   constexpr unsigned size() const noexcept { return regClass().size(); }
   constexpr RegType type() const noexcept { return regClass().type(); }
   constexpr bool is_linear() const noexcept { return regClass().is_linear(); }

   constexpr bool operator<(Temp other) const noexcept { return id() < other.id(); }
   constexpr bool operator==(Temp other) const noexcept { return id() == other.id(); }
   constexpr bool operator!=(Temp other) const noexcept { return id() != other.id(); }

private:
   uint32_t id_ : 24;
   uint32_t reg_class : 8;
};

/**
 * PhysReg
 * Represents the physical register for each
 * Operand and Definition.
 */
struct PhysReg {
   constexpr PhysReg() = default;
   explicit constexpr PhysReg(unsigned r) : reg_b(r << 2) {}
   constexpr unsigned reg() const { return reg_b >> 2; }
   constexpr unsigned byte() const { return reg_b & 0x3; }
   constexpr operator unsigned() const { return reg(); }
   constexpr bool operator==(PhysReg other) const { return reg_b == other.reg_b; }
   constexpr bool operator!=(PhysReg other) const { return reg_b != other.reg_b; }
   constexpr bool operator<(PhysReg other) const { return reg_b < other.reg_b; }
   constexpr PhysReg advance(int bytes) const
   {
      PhysReg res = *this;
      res.reg_b += bytes;
      return res;
   }

   uint16_t reg_b = 0;
};
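
/* Illustrative: PhysReg counts bytes internally (reg_b), so advance() can
 * address the middle of a register, which sub-dword operands rely on:
 *
 *    constexpr PhysReg r = PhysReg{0}.advance(2); // byte 2 of the first register
 *    static_assert(r.reg() == 0 && r.byte() == 2, "");
 */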

/* helper expressions for special registers */
static constexpr PhysReg m0{124};
static constexpr PhysReg flat_scr_lo{102}; /* GFX8-GFX9, encoded differently on GFX6-7 */
static constexpr PhysReg flat_scr_hi{103}; /* GFX8-GFX9, encoded differently on GFX6-7 */
static constexpr PhysReg vcc{106};
static constexpr PhysReg vcc_hi{107};
static constexpr PhysReg tba{108}; /* GFX6-GFX8 */
static constexpr PhysReg tma{110}; /* GFX6-GFX8 */
static constexpr PhysReg ttmp0{112};
static constexpr PhysReg ttmp1{113};
static constexpr PhysReg ttmp2{114};
static constexpr PhysReg ttmp3{115};
static constexpr PhysReg ttmp4{116};
static constexpr PhysReg ttmp5{117};
static constexpr PhysReg ttmp6{118};
static constexpr PhysReg ttmp7{119};
static constexpr PhysReg ttmp8{120};
static constexpr PhysReg ttmp9{121};
static constexpr PhysReg ttmp10{122};
static constexpr PhysReg ttmp11{123};
static constexpr PhysReg sgpr_null{125}; /* GFX10+ */
static constexpr PhysReg exec{126};
static constexpr PhysReg exec_lo{126};
static constexpr PhysReg exec_hi{127};
static constexpr PhysReg vccz{251};
static constexpr PhysReg execz{252};
static constexpr PhysReg scc{253};

/**
 * Operand Class
 * Initially, each Operand refers to either
 * a temporary virtual register
 * or to a constant value
 * Temporary registers get mapped to physical registers during RA
 * Constant values are inlined into the instruction sequence.
 */
class Operand final {
public:
   constexpr Operand()
       : reg_(PhysReg{128}), isTemp_(false), isFixed_(true), isConstant_(false), isKill_(false),
         isUndef_(true), isFirstKill_(false), constSize(0), isLateKill_(false), is16bit_(false),
         is24bit_(false), signext(false)
   {}

   explicit Operand(Temp r) noexcept
   {
      data_.temp = r;
      if (r.id()) {
         isTemp_ = true;
      } else {
         isUndef_ = true;
         setFixed(PhysReg{128});
      }
   };
   explicit Operand(Temp r, PhysReg reg) noexcept
   {
      assert(r.id()); /* Don't allow fixing an undef to a register */
      data_.temp = r;
      isTemp_ = true;
      setFixed(reg);
   };

   /* 8-bit constant */
   static Operand c8(uint8_t v) noexcept
   {
      /* 8-bit constants are only used for copies and copies from any 8-bit
       * constant can be implemented with a SDWA v_mul_u32_u24. So consider all
       * to be inline constants. */
      Operand op;
      op.control_ = 0;
      op.data_.i = v;
      op.isConstant_ = true;
      op.constSize = 0;
      op.setFixed(PhysReg{0u});
      return op;
   };

   /* 16-bit constant */
   static Operand c16(uint16_t v) noexcept
   {
      Operand op;
      op.control_ = 0;
      op.data_.i = v;
      op.isConstant_ = true;
      op.constSize = 1;
      if (v <= 64)
         op.setFixed(PhysReg{128u + v});
      else if (v >= 0xFFF0) /* [-16 .. -1] */
         op.setFixed(PhysReg{(unsigned)(192 - (int16_t)v)});
      else if (v == 0x3800) /* 0.5 */
         op.setFixed(PhysReg{240});
      else if (v == 0xB800) /* -0.5 */
         op.setFixed(PhysReg{241});
      else if (v == 0x3C00) /* 1.0 */
         op.setFixed(PhysReg{242});
      else if (v == 0xBC00) /* -1.0 */
         op.setFixed(PhysReg{243});
      else if (v == 0x4000) /* 2.0 */
         op.setFixed(PhysReg{244});
      else if (v == 0xC000) /* -2.0 */
         op.setFixed(PhysReg{245});
      else if (v == 0x4400) /* 4.0 */
         op.setFixed(PhysReg{246});
      else if (v == 0xC400) /* -4.0 */
         op.setFixed(PhysReg{247});
      else if (v == 0x3118) /* 1/2 PI */
         op.setFixed(PhysReg{248});
      else /* Literal Constant */
         op.setFixed(PhysReg{255});
      return op;
   }

   /* 32-bit constant */
   static Operand c32(uint32_t v) noexcept { return c32_or_c64(v, false); }

   /* 64-bit constant */
   static Operand c64(uint64_t v) noexcept
   {
      Operand op;
      op.control_ = 0;
      op.isConstant_ = true;
      op.constSize = 3;
      if (v <= 64) {
         op.data_.i = (uint32_t)v;
         op.setFixed(PhysReg{128 + (uint32_t)v});
      } else if (v >= 0xFFFFFFFFFFFFFFF0) { /* [-16 .. -1] */
         op.data_.i = (uint32_t)v;
         op.setFixed(PhysReg{192 - (uint32_t)v});
      } else if (v == 0x3FE0000000000000) { /* 0.5 */
         op.data_.i = 0x3f000000;
         op.setFixed(PhysReg{240});
      } else if (v == 0xBFE0000000000000) { /* -0.5 */
         op.data_.i = 0xbf000000;
         op.setFixed(PhysReg{241});
      } else if (v == 0x3FF0000000000000) { /* 1.0 */
         op.data_.i = 0x3f800000;
         op.setFixed(PhysReg{242});
      } else if (v == 0xBFF0000000000000) { /* -1.0 */
         op.data_.i = 0xbf800000;
         op.setFixed(PhysReg{243});
      } else if (v == 0x4000000000000000) { /* 2.0 */
         op.data_.i = 0x40000000;
         op.setFixed(PhysReg{244});
      } else if (v == 0xC000000000000000) { /* -2.0 */
         op.data_.i = 0xc0000000;
         op.setFixed(PhysReg{245});
      } else if (v == 0x4010000000000000) { /* 4.0 */
         op.data_.i = 0x40800000;
         op.setFixed(PhysReg{246});
      } else if (v == 0xC010000000000000) { /* -4.0 */
         op.data_.i = 0xc0800000;
         op.setFixed(PhysReg{247});
      } else { /* Literal Constant: we don't know if it is a long or double. */
         op.signext = v >> 63;
         op.data_.i = v & 0xffffffffu;
         op.setFixed(PhysReg{255});
         assert(op.constantValue64() == v &&
                "attempt to create an unrepresentable 64-bit literal constant");
      }
      return op;
   }

   /* 32-bit constant stored as a 32-bit or 64-bit operand */
   static Operand c32_or_c64(uint32_t v, bool is64bit) noexcept
   {
      Operand op;
      op.control_ = 0;
      op.data_.i = v;
      op.isConstant_ = true;
      op.constSize = is64bit ? 3 : 2;
      if (v <= 64)
         op.setFixed(PhysReg{128 + v});
      else if (v >= 0xFFFFFFF0) /* [-16 .. -1] */
         op.setFixed(PhysReg{192 - v});
      else if (v == 0x3f000000) /* 0.5 */
         op.setFixed(PhysReg{240});
      else if (v == 0xbf000000) /* -0.5 */
         op.setFixed(PhysReg{241});
      else if (v == 0x3f800000) /* 1.0 */
         op.setFixed(PhysReg{242});
      else if (v == 0xbf800000) /* -1.0 */
         op.setFixed(PhysReg{243});
      else if (v == 0x40000000) /* 2.0 */
         op.setFixed(PhysReg{244});
      else if (v == 0xc0000000) /* -2.0 */
         op.setFixed(PhysReg{245});
      else if (v == 0x40800000) /* 4.0 */
         op.setFixed(PhysReg{246});
      else if (v == 0xc0800000) /* -4.0 */
         op.setFixed(PhysReg{247});
      else { /* Literal Constant */
         assert(!is64bit && "attempt to create a 64-bit literal constant");
         op.setFixed(PhysReg{255});
      }
      return op;
   }

   static Operand literal32(uint32_t v) noexcept
   {
      Operand op;
      op.control_ = 0;
      op.data_.i = v;
      op.isConstant_ = true;
      op.constSize = 2;
      op.setFixed(PhysReg{255});
      return op;
   }
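
   /* Illustrative: the factories above record the hardware encoding in the
    * fixed register field, so inline constants and literals can be told apart
    * by register number alone:
    *
    *    assert(Operand::c32(0x3f800000).physReg() == PhysReg{242}); // inline 1.0
    *    assert(Operand::c32(0x12345678).isLiteral());               // register 255
    */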

   explicit Operand(RegClass type) noexcept
   {
      isUndef_ = true;
      data_.temp = Temp(0, type);
      setFixed(PhysReg{128});
   };
   explicit Operand(PhysReg reg, RegClass type) noexcept
   {
      data_.temp = Temp(0, type);
      setFixed(reg);
   }

   static Operand zero(unsigned bytes = 4) noexcept
   {
      if (bytes == 8)
         return Operand::c64(0);
      else if (bytes == 4)
         return Operand::c32(0);
      else if (bytes == 2)
         return Operand::c16(0);
      assert(bytes == 1);
      return Operand::c8(0);
   }

   /* This is useful over the constructors when you want to take a gfx level
    * for 1/2 PI or an unknown operand size.
    */
   static Operand get_const(enum amd_gfx_level chip, uint64_t val, unsigned bytes)
   {
      if (val == 0x3e22f983 && bytes == 4 && chip >= GFX8) {
         /* 1/2 PI can be an inline constant on GFX8+ */
         Operand op = Operand::c32(val);
         op.setFixed(PhysReg{248});
         return op;
      }

      if (bytes == 8)
         return Operand::c64(val);
      else if (bytes == 4)
         return Operand::c32(val);
      else if (bytes == 2)
         return Operand::c16(val);
      assert(bytes == 1);
      return Operand::c8(val);
   }

   static bool is_constant_representable(uint64_t val, unsigned bytes, bool zext = false,
                                         bool sext = false)
   {
      if (bytes <= 4)
         return true;

      if (zext && (val & 0xFFFFFFFF00000000) == 0x0000000000000000)
         return true;
      uint64_t upper33 = val & 0xFFFFFFFF80000000;
      if (sext && (upper33 == 0xFFFFFFFF80000000 || upper33 == 0))
         return true;

      return val >= 0xFFFFFFFFFFFFFFF0 || val <= 64 || /* [-16 .. 64] */
             val == 0x3FE0000000000000 ||              /* 0.5 */
             val == 0xBFE0000000000000 ||              /* -0.5 */
             val == 0x3FF0000000000000 ||              /* 1.0 */
             val == 0xBFF0000000000000 ||              /* -1.0 */
             val == 0x4000000000000000 ||              /* 2.0 */
             val == 0xC000000000000000 ||              /* -2.0 */
             val == 0x4010000000000000 ||              /* 4.0 */
             val == 0xC010000000000000;                /* -4.0 */
   }
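
   /* Illustrative: 64-bit values only fit if they are hardware inline
    * constants or recoverable from a 32-bit literal by zero-/sign-extension:
    *
    *    assert(Operand::is_constant_representable(0xC010000000000000, 8)); // -4.0
    *    assert(Operand::is_constant_representable(0xFFFFFFFF80000000, 8, false, true));
    *    assert(!Operand::is_constant_representable(0x123456789ABCDEF0, 8));
    */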

   constexpr bool isTemp() const noexcept { return isTemp_; }

   constexpr void setTemp(Temp t) noexcept
   {
      assert(!isConstant_);
      isTemp_ = true;
      data_.temp = t;
   }

   constexpr Temp getTemp() const noexcept { return data_.temp; }

   constexpr uint32_t tempId() const noexcept { return data_.temp.id(); }

   constexpr bool hasRegClass() const noexcept { return isTemp() || isUndefined(); }

   constexpr RegClass regClass() const noexcept { return data_.temp.regClass(); }

   constexpr unsigned bytes() const noexcept
   {
      if (isConstant())
         return 1 << constSize;
      else
         return data_.temp.bytes();
   }

   constexpr unsigned size() const noexcept
   {
      if (isConstant())
         return constSize > 2 ? 2 : 1;
      else
         return data_.temp.size();
   }

   constexpr bool isFixed() const noexcept { return isFixed_; }

   constexpr PhysReg physReg() const noexcept { return reg_; }

   constexpr void setFixed(PhysReg reg) noexcept
   {
      isFixed_ = reg != unsigned(-1);
      reg_ = reg;
   }

   constexpr bool isConstant() const noexcept { return isConstant_; }

   constexpr bool isLiteral() const noexcept { return isConstant() && reg_ == 255; }

   constexpr bool isUndefined() const noexcept { return isUndef_; }

   constexpr uint32_t constantValue() const noexcept { return data_.i; }

   constexpr bool constantEquals(uint32_t cmp) const noexcept
   {
      return isConstant() && constantValue() == cmp;
   }

   constexpr uint64_t constantValue64() const noexcept
   {
      if (constSize == 3) {
         if (reg_ <= 192)
            return reg_ - 128;
         else if (reg_ <= 208)
            return 0xFFFFFFFFFFFFFFFF - (reg_ - 193);

         switch (reg_) {
         case 240: return 0x3FE0000000000000;
         case 241: return 0xBFE0000000000000;
         case 242: return 0x3FF0000000000000;
         case 243: return 0xBFF0000000000000;
         case 244: return 0x4000000000000000;
         case 245: return 0xC000000000000000;
         case 246: return 0x4010000000000000;
         case 247: return 0xC010000000000000;
         case 255:
            return (signext && (data_.i & 0x80000000u) ? 0xffffffff00000000ull : 0ull) | data_.i;
         }
         unreachable("invalid register for 64-bit constant");
      } else {
         return data_.i;
      }
   }

   /* Value if this were used with vop3/opsel or vop3p. */
   constexpr uint16_t constantValue16(bool opsel) const noexcept
   {
      assert(bytes() == 2 || bytes() == 4);
      if (opsel) {
         if (bytes() == 2 && int16_t(data_.i) >= -16 && int16_t(data_.i) <= 64 && !isLiteral())
            return int16_t(data_.i) >> 16; /* 16-bit inline constants are sign-extended, even with fp16 instructions */
         else
            return data_.i >> 16;
      }
      return data_.i;
   }

   constexpr bool isOfType(RegType type) const noexcept
   {
      return hasRegClass() && regClass().type() == type;
   }

   /* Indicates that the killed operand's live range intersects with the
    * instruction's definitions. Unlike isKill() and isFirstKill(), this is
    * not set by liveness analysis. */
   constexpr void setLateKill(bool flag) noexcept { isLateKill_ = flag; }

   constexpr bool isLateKill() const noexcept { return isLateKill_; }

   constexpr void setKill(bool flag) noexcept
   {
      isKill_ = flag;
      if (!flag)
         setFirstKill(false);
   }

   constexpr bool isKill() const noexcept { return isKill_ || isFirstKill(); }

   constexpr void setFirstKill(bool flag) noexcept
   {
      isFirstKill_ = flag;
      if (flag)
         setKill(flag);
   }

   /* When there are multiple operands killing the same temporary,
    * isFirstKill() only returns true for the first one. */
   constexpr bool isFirstKill() const noexcept { return isFirstKill_; }

   constexpr bool isKillBeforeDef() const noexcept { return isKill() && !isLateKill(); }

   constexpr bool isFirstKillBeforeDef() const noexcept { return isFirstKill() && !isLateKill(); }

   constexpr bool operator==(Operand other) const noexcept
   {
      if (other.size() != size())
         return false;
      if (isFixed() != other.isFixed() || isKillBeforeDef() != other.isKillBeforeDef())
         return false;
      if (isFixed() && other.isFixed() && physReg() != other.physReg())
         return false;
      if (isLiteral())
         return other.isLiteral() && other.constantValue() == constantValue();
      else if (isConstant())
         return other.isConstant() && other.physReg() == physReg();
      else if (isUndefined())
         return other.isUndefined() && other.regClass() == regClass();
      else
         return other.isTemp() && other.getTemp() == getTemp();
   }

   constexpr bool operator!=(Operand other) const noexcept { return !operator==(other); }

   constexpr void set16bit(bool flag) noexcept { is16bit_ = flag; }

   constexpr bool is16bit() const noexcept { return is16bit_; }

   constexpr void set24bit(bool flag) noexcept { is24bit_ = flag; }

   constexpr bool is24bit() const noexcept { return is24bit_; }

private:
   union {
      Temp temp;
      uint32_t i;
      float f;
   } data_ = {Temp(0, s1)};
   PhysReg reg_;
   union {
      struct {
         uint8_t isTemp_ : 1;
         uint8_t isFixed_ : 1;
         uint8_t isConstant_ : 1;
         uint8_t isKill_ : 1;
         uint8_t isUndef_ : 1;
         uint8_t isFirstKill_ : 1;
         uint8_t constSize : 2;
         uint8_t isLateKill_ : 1;
         uint8_t is16bit_ : 1;
         uint8_t is24bit_ : 1;
         uint8_t signext : 1;
      };
      /* can't initialize bit-fields in c++11, so work around using a union */
      uint16_t control_ = 0;
   };
};
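
/* Illustrative kill-flag invariants: first-kill implies kill, and late-kill
 * extends the live range past the instruction's definitions:
 *
 *    Operand op(Temp(1, v1)); // hypothetical temporary %1
 *    op.setFirstKill(true);
 *    assert(op.isKill() && op.isFirstKillBeforeDef());
 *    op.setLateKill(true);
 *    assert(op.isKill() && !op.isKillBeforeDef());
 */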

/**
 * Definition Class
 * Definitions are the results of Instructions
 * and refer to temporary virtual registers
 * which are later mapped to physical registers
 */
class Definition final {
public:
   constexpr Definition()
       : temp(Temp(0, s1)), reg_(0), isFixed_(0), isKill_(0), isPrecise_(0), isNUW_(0), isNoCSE_(0)
   {}
   Definition(uint32_t index, RegClass type) noexcept : temp(index, type) {}
   explicit Definition(Temp tmp) noexcept : temp(tmp) {}
   Definition(PhysReg reg, RegClass type) noexcept : temp(Temp(0, type)) { setFixed(reg); }
   Definition(uint32_t tmpId, PhysReg reg, RegClass type) noexcept : temp(Temp(tmpId, type))
   {
      setFixed(reg);
   }

   constexpr bool isTemp() const noexcept { return tempId() > 0; }

   constexpr Temp getTemp() const noexcept { return temp; }

   constexpr uint32_t tempId() const noexcept { return temp.id(); }

   constexpr void setTemp(Temp t) noexcept { temp = t; }

   void swapTemp(Definition& other) noexcept { std::swap(temp, other.temp); }

   constexpr RegClass regClass() const noexcept { return temp.regClass(); }

   constexpr unsigned bytes() const noexcept { return temp.bytes(); }

   constexpr unsigned size() const noexcept { return temp.size(); }

   constexpr bool isFixed() const noexcept { return isFixed_; }

   constexpr PhysReg physReg() const noexcept { return reg_; }

   constexpr void setFixed(PhysReg reg) noexcept
   {
      isFixed_ = 1;
      reg_ = reg;
   }

   constexpr void setKill(bool flag) noexcept { isKill_ = flag; }

   constexpr bool isKill() const noexcept { return isKill_; }

   constexpr void setPrecise(bool precise) noexcept { isPrecise_ = precise; }

   constexpr bool isPrecise() const noexcept { return isPrecise_; }

   /* No Unsigned Wrap */
   constexpr void setNUW(bool nuw) noexcept { isNUW_ = nuw; }

   constexpr bool isNUW() const noexcept { return isNUW_; }

   constexpr void setNoCSE(bool noCSE) noexcept { isNoCSE_ = noCSE; }

   constexpr bool isNoCSE() const noexcept { return isNoCSE_; }

private:
   Temp temp = Temp(0, s1);
   PhysReg reg_;
   union {
      struct {
         uint8_t isFixed_ : 1;
         uint8_t isKill_ : 1;
         uint8_t isPrecise_ : 1;
         uint8_t isNUW_ : 1;
         uint8_t isNoCSE_ : 1;
      };
      /* can't initialize bit-fields in c++11, so work around using a union */
      uint8_t control_ = 0;
   };
};
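
/* Illustrative: the common ways a Definition comes into existence (temp id 42
 * is hypothetical):
 *
 *    Definition ssa(42, v1);             // SSA def, register assigned later by RA
 *    Definition fixed(PhysReg{256}, v1); // pre-colored to v0, no SSA id
 *    assert(ssa.isTemp() && !ssa.isFixed());
 *    assert(fixed.isFixed() && !fixed.isTemp());
 */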

struct Block;
struct Instruction;
struct Pseudo_instruction;
struct SOP1_instruction;
struct SOP2_instruction;
struct SOPK_instruction;
struct SOPP_instruction;
struct SOPC_instruction;
struct SMEM_instruction;
struct DS_instruction;
struct MTBUF_instruction;
struct MUBUF_instruction;
struct MIMG_instruction;
struct Export_instruction;
struct FLAT_instruction;
struct Pseudo_branch_instruction;
struct Pseudo_barrier_instruction;
struct Pseudo_reduction_instruction;
struct VOP3P_instruction;
struct VOP1_instruction;
struct VOP2_instruction;
struct VOPC_instruction;
struct VOP3_instruction;
struct Interp_instruction;
struct DPP16_instruction;
struct DPP8_instruction;
struct SDWA_instruction;

struct Instruction {
   aco_opcode opcode;
   Format format;
   uint32_t pass_flags;

   aco::span<Operand> operands;
   aco::span<Definition> definitions;

   constexpr bool usesModifiers() const noexcept;

   constexpr bool reads_exec() const noexcept
   {
      for (const Operand& op : operands) {
         if (op.isFixed() && op.physReg() == exec)
            return true;
      }
      return false;
   }

   Pseudo_instruction& pseudo() noexcept
   {
      assert(isPseudo());
      return *(Pseudo_instruction*)this;
   }
   const Pseudo_instruction& pseudo() const noexcept
   {
      assert(isPseudo());
      return *(Pseudo_instruction*)this;
   }
   constexpr bool isPseudo() const noexcept { return format == Format::PSEUDO; }
   SOP1_instruction& sop1() noexcept
   {
      assert(isSOP1());
      return *(SOP1_instruction*)this;
   }
   const SOP1_instruction& sop1() const noexcept
   {
      assert(isSOP1());
      return *(SOP1_instruction*)this;
   }
   constexpr bool isSOP1() const noexcept { return format == Format::SOP1; }
   SOP2_instruction& sop2() noexcept
   {
      assert(isSOP2());
      return *(SOP2_instruction*)this;
   }
   const SOP2_instruction& sop2() const noexcept
   {
      assert(isSOP2());
      return *(SOP2_instruction*)this;
   }
   constexpr bool isSOP2() const noexcept { return format == Format::SOP2; }
   SOPK_instruction& sopk() noexcept
   {
      assert(isSOPK());
      return *(SOPK_instruction*)this;
   }
   const SOPK_instruction& sopk() const noexcept
   {
      assert(isSOPK());
      return *(SOPK_instruction*)this;
   }
   constexpr bool isSOPK() const noexcept { return format == Format::SOPK; }
   SOPP_instruction& sopp() noexcept
   {
      assert(isSOPP());
      return *(SOPP_instruction*)this;
   }
   const SOPP_instruction& sopp() const noexcept
   {
      assert(isSOPP());
      return *(SOPP_instruction*)this;
   }
   constexpr bool isSOPP() const noexcept { return format == Format::SOPP; }
   SOPC_instruction& sopc() noexcept
   {
      assert(isSOPC());
      return *(SOPC_instruction*)this;
   }
   const SOPC_instruction& sopc() const noexcept
   {
      assert(isSOPC());
      return *(SOPC_instruction*)this;
   }
   constexpr bool isSOPC() const noexcept { return format == Format::SOPC; }
   SMEM_instruction& smem() noexcept
   {
      assert(isSMEM());
      return *(SMEM_instruction*)this;
   }
   const SMEM_instruction& smem() const noexcept
   {
      assert(isSMEM());
      return *(SMEM_instruction*)this;
   }
   constexpr bool isSMEM() const noexcept { return format == Format::SMEM; }
   DS_instruction& ds() noexcept
   {
      assert(isDS());
      return *(DS_instruction*)this;
   }
   const DS_instruction& ds() const noexcept
   {
      assert(isDS());
      return *(DS_instruction*)this;
   }
   constexpr bool isDS() const noexcept { return format == Format::DS; }
   MTBUF_instruction& mtbuf() noexcept
   {
      assert(isMTBUF());
      return *(MTBUF_instruction*)this;
   }
   const MTBUF_instruction& mtbuf() const noexcept
   {
      assert(isMTBUF());
      return *(MTBUF_instruction*)this;
   }
   constexpr bool isMTBUF() const noexcept { return format == Format::MTBUF; }
   MUBUF_instruction& mubuf() noexcept
   {
      assert(isMUBUF());
      return *(MUBUF_instruction*)this;
   }
   const MUBUF_instruction& mubuf() const noexcept
   {
      assert(isMUBUF());
      return *(MUBUF_instruction*)this;
   }
   constexpr bool isMUBUF() const noexcept { return format == Format::MUBUF; }
   MIMG_instruction& mimg() noexcept
   {
      assert(isMIMG());
      return *(MIMG_instruction*)this;
   }
   const MIMG_instruction& mimg() const noexcept
   {
      assert(isMIMG());
      return *(MIMG_instruction*)this;
   }
   constexpr bool isMIMG() const noexcept { return format == Format::MIMG; }
   Export_instruction& exp() noexcept
   {
      assert(isEXP());
      return *(Export_instruction*)this;
   }
   const Export_instruction& exp() const noexcept
   {
      assert(isEXP());
      return *(Export_instruction*)this;
   }
   constexpr bool isEXP() const noexcept { return format == Format::EXP; }
   FLAT_instruction& flat() noexcept
   {
      assert(isFlat());
      return *(FLAT_instruction*)this;
   }
   const FLAT_instruction& flat() const noexcept
   {
      assert(isFlat());
      return *(FLAT_instruction*)this;
   }
   constexpr bool isFlat() const noexcept { return format == Format::FLAT; }
   FLAT_instruction& global() noexcept
   {
      assert(isGlobal());
      return *(FLAT_instruction*)this;
   }
   const FLAT_instruction& global() const noexcept
   {
      assert(isGlobal());
      return *(FLAT_instruction*)this;
   }
   constexpr bool isGlobal() const noexcept { return format == Format::GLOBAL; }
   FLAT_instruction& scratch() noexcept
   {
      assert(isScratch());
      return *(FLAT_instruction*)this;
   }
   const FLAT_instruction& scratch() const noexcept
   {
      assert(isScratch());
      return *(FLAT_instruction*)this;
   }
   constexpr bool isScratch() const noexcept { return format == Format::SCRATCH; }
   Pseudo_branch_instruction& branch() noexcept
   {
      assert(isBranch());
      return *(Pseudo_branch_instruction*)this;
   }
   const Pseudo_branch_instruction& branch() const noexcept
   {
      assert(isBranch());
      return *(Pseudo_branch_instruction*)this;
   }
   constexpr bool isBranch() const noexcept { return format == Format::PSEUDO_BRANCH; }
   Pseudo_barrier_instruction& barrier() noexcept
   {
      assert(isBarrier());
      return *(Pseudo_barrier_instruction*)this;
   }
   const Pseudo_barrier_instruction& barrier() const noexcept
   {
      assert(isBarrier());
      return *(Pseudo_barrier_instruction*)this;
   }
   constexpr bool isBarrier() const noexcept { return format == Format::PSEUDO_BARRIER; }
   Pseudo_reduction_instruction& reduction() noexcept
   {
      assert(isReduction());
      return *(Pseudo_reduction_instruction*)this;
   }
   const Pseudo_reduction_instruction& reduction() const noexcept
   {
      assert(isReduction());
      return *(Pseudo_reduction_instruction*)this;
   }
   constexpr bool isReduction() const noexcept { return format == Format::PSEUDO_REDUCTION; }
   VOP3P_instruction& vop3p() noexcept
   {
      assert(isVOP3P());
      return *(VOP3P_instruction*)this;
   }
   const VOP3P_instruction& vop3p() const noexcept
   {
      assert(isVOP3P());
      return *(VOP3P_instruction*)this;
   }
   constexpr bool isVOP3P() const noexcept { return format == Format::VOP3P; }
   VOP1_instruction& vop1() noexcept
   {
      assert(isVOP1());
      return *(VOP1_instruction*)this;
   }
   const VOP1_instruction& vop1() const noexcept
   {
      assert(isVOP1());
      return *(VOP1_instruction*)this;
   }
   constexpr bool isVOP1() const noexcept { return (uint16_t)format & (uint16_t)Format::VOP1; }
   VOP2_instruction& vop2() noexcept
   {
      assert(isVOP2());
      return *(VOP2_instruction*)this;
   }
   const VOP2_instruction& vop2() const noexcept
   {
      assert(isVOP2());
      return *(VOP2_instruction*)this;
   }
   constexpr bool isVOP2() const noexcept { return (uint16_t)format & (uint16_t)Format::VOP2; }
   VOPC_instruction& vopc() noexcept
   {
      assert(isVOPC());
      return *(VOPC_instruction*)this;
   }
   const VOPC_instruction& vopc() const noexcept
   {
      assert(isVOPC());
      return *(VOPC_instruction*)this;
   }
   constexpr bool isVOPC() const noexcept { return (uint16_t)format & (uint16_t)Format::VOPC; }
   VOP3_instruction& vop3() noexcept
   {
      assert(isVOP3());
      return *(VOP3_instruction*)this;
   }
   const VOP3_instruction& vop3() const noexcept
   {
      assert(isVOP3());
      return *(VOP3_instruction*)this;
   }
   constexpr bool isVOP3() const noexcept { return (uint16_t)format & (uint16_t)Format::VOP3; }
   Interp_instruction& vintrp() noexcept
   {
      assert(isVINTRP());
      return *(Interp_instruction*)this;
   }
   const Interp_instruction& vintrp() const noexcept
   {
      assert(isVINTRP());
      return *(Interp_instruction*)this;
   }
   constexpr bool isVINTRP() const noexcept { return (uint16_t)format & (uint16_t)Format::VINTRP; }
   DPP16_instruction& dpp16() noexcept
   {
      assert(isDPP16());
      return *(DPP16_instruction*)this;
   }
   const DPP16_instruction& dpp16() const noexcept
   {
      assert(isDPP16());
      return *(DPP16_instruction*)this;
   }
   constexpr bool isDPP16() const noexcept { return (uint16_t)format & (uint16_t)Format::DPP16; }
   DPP8_instruction& dpp8() noexcept
   {
      assert(isDPP8());
      return *(DPP8_instruction*)this;
   }
   const DPP8_instruction& dpp8() const noexcept
   {
      assert(isDPP8());
      return *(DPP8_instruction*)this;
   }
   constexpr bool isDPP8() const noexcept { return (uint16_t)format & (uint16_t)Format::DPP8; }
   constexpr bool isDPP() const noexcept { return isDPP16() || isDPP8(); }
   SDWA_instruction& sdwa() noexcept
   {
      assert(isSDWA());
      return *(SDWA_instruction*)this;
   }
   const SDWA_instruction& sdwa() const noexcept
   {
      assert(isSDWA());
      return *(SDWA_instruction*)this;
   }
   constexpr bool isSDWA() const noexcept { return (uint16_t)format & (uint16_t)Format::SDWA; }

   FLAT_instruction& flatlike() { return *(FLAT_instruction*)this; }

   const FLAT_instruction& flatlike() const { return *(FLAT_instruction*)this; }

   constexpr bool isFlatLike() const noexcept { return isFlat() || isGlobal() || isScratch(); }

   constexpr bool isVALU() const noexcept
   {
      return isVOP1() || isVOP2() || isVOPC() || isVOP3() || isVOP3P();
   }

   constexpr bool isSALU() const noexcept
   {
      return isSOP1() || isSOP2() || isSOPC() || isSOPK() || isSOPP();
   }

   constexpr bool isVMEM() const noexcept { return isMTBUF() || isMUBUF() || isMIMG(); }
};
static_assert(sizeof(Instruction) == 16, "Unexpected padding");
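
/* Illustrative: because the VALU format bits form a mask, a VOP2 instruction
 * in the VOP3, DPP or SDWA encoding still reports isVOP2(), so passes can
 * test whole encoding families at once (classify() is a hypothetical helper):
 *
 *    void classify(const Instruction& instr, unsigned& valu, unsigned& salu)
 *    {
 *       if (instr.isVALU())      // VOP1/VOP2/VOPC/VOP3/VOP3P, incl. DPP/SDWA
 *          valu++;
 *       else if (instr.isSALU()) // SOP1/SOP2/SOPC/SOPK/SOPP
 *          salu++;
 *    }
 */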

struct SOPK_instruction : public Instruction {
   uint16_t imm;
   uint16_t padding;
};
static_assert(sizeof(SOPK_instruction) == sizeof(Instruction) + 4, "Unexpected padding");

struct SOPP_instruction : public Instruction {
   uint32_t imm;
   int block;
};
static_assert(sizeof(SOPP_instruction) == sizeof(Instruction) + 8, "Unexpected padding");

struct SOPC_instruction : public Instruction {
   uint32_t padding;
};
static_assert(sizeof(SOPC_instruction) == sizeof(Instruction) + 4, "Unexpected padding");

struct SOP1_instruction : public Instruction {};
static_assert(sizeof(SOP1_instruction) == sizeof(Instruction) + 0, "Unexpected padding");

struct SOP2_instruction : public Instruction {
   uint32_t padding;
};
static_assert(sizeof(SOP2_instruction) == sizeof(Instruction) + 4, "Unexpected padding");

/**
 * Scalar Memory Format:
 * For s_(buffer_)load_dword*:
 * Operand(0): SBASE - SGPR-pair which provides base address
 * Operand(1): Offset - immediate (un)signed offset or SGPR
 * Operand(2) / Definition(0): SDATA - SGPR for read / write result
 * Operand(n-1): SOffset - SGPR offset (Vega only)
 *
 * Having no operands is also valid for instructions such as s_dcache_inv.
 *
 */
struct SMEM_instruction : public Instruction {
   memory_sync_info sync;
   bool glc : 1; /* VI+: globally coherent */
   bool dlc : 1; /* NAVI: device level coherent */
   bool nv : 1;  /* VEGA only: Non-volatile */
   bool disable_wqm : 1;
   bool prevent_overflow : 1; /* avoid overflow when combining additions */
   uint8_t padding : 3;
};
static_assert(sizeof(SMEM_instruction) == sizeof(Instruction) + 4, "Unexpected padding");

struct VOP1_instruction : public Instruction {};
static_assert(sizeof(VOP1_instruction) == sizeof(Instruction) + 0, "Unexpected padding");

struct VOP2_instruction : public Instruction {};
static_assert(sizeof(VOP2_instruction) == sizeof(Instruction) + 0, "Unexpected padding");

struct VOPC_instruction : public Instruction {};
static_assert(sizeof(VOPC_instruction) == sizeof(Instruction) + 0, "Unexpected padding");

struct VOP3_instruction : public Instruction {
   bool abs[3];
   bool neg[3];
   uint8_t opsel : 4;
   uint8_t omod : 2;
   bool clamp : 1;
   uint8_t padding0 : 1;
   uint8_t padding1;
};
static_assert(sizeof(VOP3_instruction) == sizeof(Instruction) + 8, "Unexpected padding");

struct VOP3P_instruction : public Instruction {
   bool neg_lo[3];
   bool neg_hi[3]; /* abs modifier, for v_mad_mix/v_fma_mix */
   uint8_t opsel_lo : 3;
   uint8_t opsel_hi : 3;
   bool clamp : 1;
   uint8_t padding0 : 1;
   uint8_t padding1;
};
static_assert(sizeof(VOP3P_instruction) == sizeof(Instruction) + 8, "Unexpected padding");

/**
 * Data Parallel Primitives Format:
 * This format can be used for VOP1, VOP2 or VOPC instructions.
 * The swizzle applies to the src0 operand.
 *
 */
struct DPP16_instruction : public Instruction {
   bool abs[2];
   bool neg[2];
   uint16_t dpp_ctrl;
   uint8_t row_mask : 4;
   uint8_t bank_mask : 4;
   bool bound_ctrl : 1;
   uint8_t padding : 7;
};
static_assert(sizeof(DPP16_instruction) == sizeof(Instruction) + 8, "Unexpected padding");

struct DPP8_instruction : public Instruction {
   uint8_t lane_sel[8];
};
static_assert(sizeof(DPP8_instruction) == sizeof(Instruction) + 8, "Unexpected padding");

struct SubdwordSel {
   enum sdwa_sel : uint8_t {
      ubyte = 0x4,
      uword = 0x8,
      dword = 0x10,
      sext = 0x20,
      sbyte = ubyte | sext,
      sword = uword | sext,

      ubyte0 = ubyte,
      ubyte1 = ubyte | 1,
      ubyte2 = ubyte | 2,
      ubyte3 = ubyte | 3,
      sbyte0 = sbyte,
      sbyte1 = sbyte | 1,
      sbyte2 = sbyte | 2,
      sbyte3 = sbyte | 3,
      uword0 = uword,
      uword1 = uword | 2,
      sword0 = sword,
      sword1 = sword | 2,
   };

   SubdwordSel() : sel((sdwa_sel)0) {}
   constexpr SubdwordSel(sdwa_sel sel_) : sel(sel_) {}
   constexpr SubdwordSel(unsigned size, unsigned offset, bool sign_extend)
       : sel((sdwa_sel)((sign_extend ? sext : 0) | size << 2 | offset))
   {}
   constexpr operator sdwa_sel() const { return sel; }
   explicit operator bool() const { return sel != 0; }

   constexpr unsigned size() const { return (sel >> 2) & 0x7; }
   constexpr unsigned offset() const { return sel & 0x3; }
   constexpr bool sign_extend() const { return sel & sext; }
   constexpr unsigned to_sdwa_sel(unsigned reg_byte_offset) const
   {
      reg_byte_offset += offset();
      if (size() == 1)
         return reg_byte_offset;
      else if (size() == 2)
         return 4 + (reg_byte_offset >> 1);
      else
         return 6;
   }

private:
   sdwa_sel sel;
};
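
/* Illustrative: SubdwordSel packs size, offset and sign-extension into one
 * byte and can be lowered to the hardware SDWA selector:
 *
 *    constexpr SubdwordSel sel(1, 1, true); // signed byte at offset 1
 *    static_assert(sel == SubdwordSel::sbyte1, "");
 *    static_assert(sel.to_sdwa_sel(0) == 1, "SDWA BYTE_1 selector");
 */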
1506
1507 /**
1508 * Sub-Dword Addressing Format:
1509 * This format can be used for VOP1, VOP2 or VOPC instructions.
1510 *
1511 * omod and SGPR/constant operands are only available on GFX9+. For VOPC,
1512 * the definition doesn't have to be VCC on GFX9+.
1513 *
1514 */
1515 struct SDWA_instruction : public Instruction {
1516 /* these destination modifiers aren't available with VOPC except for
1517 * clamp on GFX8 */
1518 SubdwordSel sel[2];
1519 SubdwordSel dst_sel;
1520 bool neg[2];
1521 bool abs[2];
1522 bool clamp : 1;
1523 uint8_t omod : 2; /* GFX9+ */
1524 uint8_t padding : 5;
1525 };
1526 static_assert(sizeof(SDWA_instruction) == sizeof(Instruction) + 8, "Unexpected padding");
1527
1528 struct Interp_instruction : public Instruction {
1529 uint8_t attribute;
1530 uint8_t component;
1531 uint16_t padding;
1532 };
1533 static_assert(sizeof(Interp_instruction) == sizeof(Instruction) + 4, "Unexpected padding");
1534
1535 /**
1536 * Local and Global Data Sharing instructions
1537 * Operand(0): ADDR - VGPR which supplies the address.
1538 * Operand(1): DATA0 - First data VGPR.
1539 * Operand(2): DATA1 - Second data VGPR.
1540 * Operand(n-1): M0 - LDS size.
1541 * Definition(0): VDST - Destination VGPR when results returned to VGPRs.
1542 *
1543 */
1544 struct DS_instruction : public Instruction {
1545 memory_sync_info sync;
1546 bool gds;
1547 uint16_t offset0;
1548 uint8_t offset1;
1549 uint8_t padding;
1550 };
1551 static_assert(sizeof(DS_instruction) == sizeof(Instruction) + 8, "Unexpected padding");
1552
1553 /**
1554 * Vector Memory Untyped-buffer Instructions
1555 * Operand(0): SRSRC - Specifies which SGPR supplies T# (resource constant)
1556 * Operand(1): VADDR - Address source. Can carry an index and/or offset
1557 * Operand(2): SOFFSET - SGPR to supply unsigned byte offset. (SGPR, M0, or inline constant)
1558 * Operand(3) / Definition(0): VDATA - Vector GPR for write result / read data
1559 *
1560 */
1561 struct MUBUF_instruction : public Instruction {
1562 memory_sync_info sync;
1563 bool offen : 1; /* Supply an offset from VGPR (VADDR) */
1564 bool idxen : 1; /* Supply an index from VGPR (VADDR) */
1565 bool addr64 : 1; /* SI, CIK: Address size is 64-bit */
1566 bool glc : 1; /* globally coherent */
1567 bool dlc : 1; /* NAVI: device level coherent */
1568 bool slc : 1; /* system level coherent */
1569 bool tfe : 1; /* texture fail enable */
1570 bool lds : 1; /* Return read-data to LDS instead of VGPRs */
1571 uint16_t disable_wqm : 1; /* Require an exec mask without helper invocations */
1572 uint16_t offset : 12; /* Unsigned byte offset - 12 bit */
1573 uint16_t swizzled : 1;
1574 uint16_t padding0 : 2;
1575 uint16_t vtx_binding : 6; /* 0 if this is not a vertex attribute load */
1576 uint16_t padding1 : 10;
1577 };
1578 static_assert(sizeof(MUBUF_instruction) == sizeof(Instruction) + 8, "Unexpected padding");
1579
1580 /**
1581 * Vector Memory Typed-buffer Instructions
1582 * Operand(0): SRSRC - Specifies which SGPR supplies T# (resource constant)
1583 * Operand(1): VADDR - Address source. Can carry an index and/or offset
1584 * Operand(2): SOFFSET - SGPR to supply unsigned byte offset. (SGPR, M0, or inline constant)
1585 * Operand(3) / Definition(0): VDATA - Vector GPR for write result / read data
1586 *
1587 */
1588 struct MTBUF_instruction : public Instruction {
1589 memory_sync_info sync;
1590 uint8_t dfmt : 4; /* Data Format of data in memory buffer */
1591 uint8_t nfmt : 3; /* Numeric format of data in memory */
1592 bool offen : 1; /* Supply an offset from VGPR (VADDR) */
1593 uint16_t idxen : 1; /* Supply an index from VGPR (VADDR) */
1594 uint16_t glc : 1; /* globally coherent */
1595 uint16_t dlc : 1; /* NAVI: device level coherent */
1596 uint16_t slc : 1; /* system level coherent */
1597 uint16_t tfe : 1; /* texture fail enable */
1598 uint16_t disable_wqm : 1; /* Require an exec mask without helper invocations */
1599 uint16_t vtx_binding : 6; /* 0 if this is not a vertex attribute load */
1600 uint16_t padding : 4;
1601 uint16_t offset; /* Unsigned byte offset - 12 bit */
1602 };
1603 static_assert(sizeof(MTBUF_instruction) == sizeof(Instruction) + 8, "Unexpected padding");
1604
1605 /**
1606 * Vector Memory Image Instructions
1607 * Operand(0) SRSRC - Scalar GPR that specifies the resource constant.
1608 * Operand(1): SSAMP - Scalar GPR that specifies sampler constant.
1609 * Operand(2): VDATA - Vector GPR for write data or zero if TFE/LWE=1.
1610 * Operand(3): VADDR - Address source. Can carry an offset or an index.
1611 * Definition(0): VDATA - Vector GPR for read result.
1612 *
1613 */
1614 struct MIMG_instruction : public Instruction {
1615 memory_sync_info sync;
1616 uint8_t dmask; /* Data VGPR enable mask */
1617 uint8_t dim : 3; /* NAVI: dimensionality */
1618 bool unrm : 1; /* Force address to be un-normalized */
1619 bool dlc : 1; /* NAVI: device level coherent */
1620 bool glc : 1; /* globally coherent */
1621 bool slc : 1; /* system level coherent */
1622 bool tfe : 1; /* texture fail enable */
1623 bool da : 1; /* declare an array */
1624 bool lwe : 1; /* LOD warning enable */
1625 bool r128 : 1; /* NAVI: Texture resource size */
1626 bool a16 : 1; /* VEGA, NAVI: Address components are 16-bits */
1627 bool d16 : 1; /* Convert 32-bit data to 16-bit data */
1628 bool disable_wqm : 1; /* Require an exec mask without helper invocations */
1629 uint8_t padding0 : 2;
1630 uint8_t padding1;
1631 uint8_t padding2;
1632 };
1633 static_assert(sizeof(MIMG_instruction) == sizeof(Instruction) + 8, "Unexpected padding");
1634
1635 /**
1636 * Flat/Scratch/Global Instructions
1637 * Operand(0): ADDR
1638 * Operand(1): SADDR
1639 * Operand(2) / Definition(0): DATA/VDST
1640 *
1641 */
1642 struct FLAT_instruction : public Instruction {
1643 memory_sync_info sync;
1644 bool slc : 1; /* system level coherent */
1645 bool glc : 1; /* globally coherent */
1646 bool dlc : 1; /* NAVI: device level coherent */
1647 bool lds : 1;
1648 bool nv : 1;
1649 bool disable_wqm : 1; /* Require an exec mask without helper invocations */
1650 uint8_t padding0 : 2;
1651 int16_t offset; /* Vega/Navi only */
1652 uint16_t padding1;
1653 };
1654 static_assert(sizeof(FLAT_instruction) == sizeof(Instruction) + 8, "Unexpected padding");
1655
struct Export_instruction : public Instruction {
   uint8_t enabled_mask;
   uint8_t dest; /* export target */
   bool compressed : 1;
   bool done : 1;
   bool valid_mask : 1;
   uint8_t padding0 : 5;
   uint8_t padding1;
};
static_assert(sizeof(Export_instruction) == sizeof(Instruction) + 4, "Unexpected padding");

struct Pseudo_instruction : public Instruction {
   PhysReg scratch_sgpr; /* might not be valid if it's not needed */
   bool tmp_in_scc;
   uint8_t padding;
};
static_assert(sizeof(Pseudo_instruction) == sizeof(Instruction) + 4, "Unexpected padding");

struct Pseudo_branch_instruction : public Instruction {
   /* target[0] is the block index of the branch target.
    * For conditional branches, target[1] contains the fall-through alternative.
    * A value of 0 means the target has not been initialized (BB0 cannot be a branch target).
    */
   uint32_t target[2];
};
static_assert(sizeof(Pseudo_branch_instruction) == sizeof(Instruction) + 8, "Unexpected padding");

struct Pseudo_barrier_instruction : public Instruction {
   memory_sync_info sync;
   sync_scope exec_scope;
};
static_assert(sizeof(Pseudo_barrier_instruction) == sizeof(Instruction) + 4, "Unexpected padding");

enum ReduceOp : uint16_t {
   // clang-format off
   iadd8, iadd16, iadd32, iadd64,
   imul8, imul16, imul32, imul64,
   fadd16, fadd32, fadd64,
   fmul16, fmul32, fmul64,
   imin8, imin16, imin32, imin64,
   imax8, imax16, imax32, imax64,
   umin8, umin16, umin32, umin64,
   umax8, umax16, umax32, umax64,
   fmin16, fmin32, fmin64,
   fmax16, fmax32, fmax64,
   iand8, iand16, iand32, iand64,
   ior8, ior16, ior32, ior64,
   ixor8, ixor16, ixor32, ixor64,
   num_reduce_ops,
   // clang-format on
};

/**
 * Subgroup Reduction Instructions. Every operand and definition except the
 * data to be reduced (Operand 0) and the result (Definition 0) is inserted by
 * setup_reduce_temp().
 * Operand(0): data to be reduced
 * Operand(1): reduce temporary
 * Operand(2): vector temporary
 * Definition(0): result
 * Definition(1): scalar temporary
 * Definition(2): scalar identity temporary (not used to store identity on GFX10)
 * Definition(3): scc clobber
 * Definition(4): vcc clobber
 */
struct Pseudo_reduction_instruction : public Instruction {
   ReduceOp reduce_op;
   uint16_t cluster_size; /* must be 0 for scans */
};
static_assert(sizeof(Pseudo_reduction_instruction) == sizeof(Instruction) + 4,
              "Unexpected padding");

struct instr_deleter_functor {
   /* Instructions are allocated with calloc() in create_instruction(), so they
    * must be released with free() rather than delete. */
   void operator()(void* p) { free(p); }
};

template <typename T> using aco_ptr = std::unique_ptr<T, instr_deleter_functor>;

template <typename T>
T*
create_instruction(aco_opcode opcode, Format format, uint32_t num_operands,
                   uint32_t num_definitions)
{
   /* Operands and definitions live in the same allocation, directly after the
    * instruction struct itself. */
   std::size_t size =
      sizeof(T) + num_operands * sizeof(Operand) + num_definitions * sizeof(Definition);
   char* data = (char*)calloc(1, size);
   T* inst = (T*)data;

   inst->opcode = opcode;
   inst->format = format;

   uint16_t operands_offset = data + sizeof(T) - (char*)&inst->operands;
   inst->operands = aco::span<Operand>(operands_offset, num_operands);
   uint16_t definitions_offset = (char*)inst->operands.end() - (char*)&inst->definitions;
   inst->definitions = aco::span<Definition>(definitions_offset, num_definitions);

   return inst;
}
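
/* A minimal usage sketch (illustrative only, not part of this header): build
 * an s_and_b32 with two operands and two definitions (result + scc clobber);
 * "a" and "b" (s1 temporaries) and "program" are assumed to exist.
 *
 *    aco_ptr<SOP2_instruction> instr{create_instruction<SOP2_instruction>(
 *       aco_opcode::s_and_b32, Format::SOP2, 2, 2)};
 *    instr->operands[0] = Operand(a);
 *    instr->operands[1] = Operand(b);
 *    instr->definitions[0] = Definition(program->allocateTmp(s1));
 *    instr->definitions[1] = Definition(scc, s1);
 */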

constexpr bool
Instruction::usesModifiers() const noexcept
{
   if (isDPP() || isSDWA())
      return true;

   if (isVOP3P()) {
      const VOP3P_instruction& vop3p = this->vop3p();
      for (unsigned i = 0; i < operands.size(); i++) {
         if (vop3p.neg_lo[i] || vop3p.neg_hi[i])
            return true;

         /* opsel_hi must be 1 to not be considered a modifier - even for constants */
         if (!(vop3p.opsel_hi & (1 << i)))
            return true;
      }
      return vop3p.opsel_lo || vop3p.clamp;
   } else if (isVOP3()) {
      const VOP3_instruction& vop3 = this->vop3();
      for (unsigned i = 0; i < operands.size(); i++) {
         if (vop3.abs[i] || vop3.neg[i])
            return true;
      }
      return vop3.opsel || vop3.clamp || vop3.omod;
   }
   return false;
}

constexpr bool
is_phi(Instruction* instr)
{
   return instr->opcode == aco_opcode::p_phi || instr->opcode == aco_opcode::p_linear_phi;
}

static inline bool
is_phi(aco_ptr<Instruction>& instr)
{
   return is_phi(instr.get());
}

memory_sync_info get_sync_info(const Instruction* instr);

bool is_dead(const std::vector<uint16_t>& uses, Instruction* instr);

bool can_use_opsel(amd_gfx_level gfx_level, aco_opcode op, int idx);
bool instr_is_16bit(amd_gfx_level gfx_level, aco_opcode op);
bool can_use_SDWA(amd_gfx_level gfx_level, const aco_ptr<Instruction>& instr, bool pre_ra);
bool can_use_DPP(const aco_ptr<Instruction>& instr, bool pre_ra, bool dpp8);
/* updates "instr" and returns the old instruction (or NULL if no update was needed) */
aco_ptr<Instruction> convert_to_SDWA(amd_gfx_level gfx_level, aco_ptr<Instruction>& instr);
aco_ptr<Instruction> convert_to_DPP(aco_ptr<Instruction>& instr, bool dpp8);
bool needs_exec_mask(const Instruction* instr);

aco_opcode get_ordered(aco_opcode op);
aco_opcode get_unordered(aco_opcode op);
aco_opcode get_inverse(aco_opcode op);
aco_opcode get_f32_cmp(aco_opcode op);
aco_opcode get_vcmpx(aco_opcode op);
unsigned get_cmp_bitsize(aco_opcode op);
bool is_cmp(aco_opcode op);

bool can_swap_operands(aco_ptr<Instruction>& instr, aco_opcode* new_op);

uint32_t get_reduction_identity(ReduceOp op, unsigned idx);
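/* get_reduction_identity() returns the value that leaves any input unchanged
 * under the given reduction: e.g. 0 for iadd32, 1 for imul32, INT32_MAX for
 * imin32 and 0 for umax32. For 64-bit reductions, "idx" presumably selects
 * which dword of the identity is returned. */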

unsigned get_mimg_nsa_dwords(const Instruction* instr);

bool should_form_clause(const Instruction* a, const Instruction* b);

enum block_kind {
   /* uniform indicates that when leaving this block,
    * all active lanes stay active */
   block_kind_uniform = 1 << 0,
   block_kind_top_level = 1 << 1,
   block_kind_loop_preheader = 1 << 2,
   block_kind_loop_header = 1 << 3,
   block_kind_loop_exit = 1 << 4,
   block_kind_continue = 1 << 5,
   block_kind_break = 1 << 6,
   block_kind_continue_or_break = 1 << 7,
   block_kind_branch = 1 << 8,
   block_kind_merge = 1 << 9,
   block_kind_invert = 1 << 10,
   block_kind_uses_discard = 1 << 12,
   block_kind_needs_lowering = 1 << 13,
   block_kind_export_end = 1 << 15,
};
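
/* The kinds form a bitmask, so a block may combine several of them (e.g. a
 * loop header that is also a top-level block), and callers test membership
 * with bitwise AND: if (block->kind & block_kind_top_level) { ... } */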

struct RegisterDemand {
   constexpr RegisterDemand() = default;
   constexpr RegisterDemand(const int16_t v, const int16_t s) noexcept : vgpr{v}, sgpr{s} {}
   int16_t vgpr = 0;
   int16_t sgpr = 0;

   constexpr friend bool operator==(const RegisterDemand a, const RegisterDemand b) noexcept
   {
      return a.vgpr == b.vgpr && a.sgpr == b.sgpr;
   }

   constexpr bool exceeds(const RegisterDemand other) const noexcept
   {
      return vgpr > other.vgpr || sgpr > other.sgpr;
   }

   constexpr RegisterDemand operator+(const Temp t) const noexcept
   {
      if (t.type() == RegType::sgpr)
         return RegisterDemand(vgpr, sgpr + t.size());
      else
         return RegisterDemand(vgpr + t.size(), sgpr);
   }

   constexpr RegisterDemand operator+(const RegisterDemand other) const noexcept
   {
      return RegisterDemand(vgpr + other.vgpr, sgpr + other.sgpr);
   }

   constexpr RegisterDemand operator-(const RegisterDemand other) const noexcept
   {
      return RegisterDemand(vgpr - other.vgpr, sgpr - other.sgpr);
   }

   constexpr RegisterDemand& operator+=(const RegisterDemand other) noexcept
   {
      vgpr += other.vgpr;
      sgpr += other.sgpr;
      return *this;
   }

   constexpr RegisterDemand& operator-=(const RegisterDemand other) noexcept
   {
      vgpr -= other.vgpr;
      sgpr -= other.sgpr;
      return *this;
   }

   constexpr RegisterDemand& operator+=(const Temp t) noexcept
   {
      if (t.type() == RegType::sgpr)
         sgpr += t.size();
      else
         vgpr += t.size();
      return *this;
   }

   constexpr RegisterDemand& operator-=(const Temp t) noexcept
   {
      if (t.type() == RegType::sgpr)
         sgpr -= t.size();
      else
         vgpr -= t.size();
      return *this;
   }

   constexpr void update(const RegisterDemand other) noexcept
   {
      vgpr = std::max(vgpr, other.vgpr);
      sgpr = std::max(sgpr, other.sgpr);
   }
};
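
/* A worked example of the arithmetic above: RegisterDemand(8, 4) plus a v2
 * temporary yields (10, 4); update(RegisterDemand(6, 7)) then raises it to the
 * component-wise maximum (10, 7); and (10, 7).exceeds(RegisterDemand(12, 8))
 * is false because neither component is larger. */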

/* CFG */
struct Block {
   float_mode fp_mode;
   unsigned index;
   unsigned offset = 0;
   std::vector<aco_ptr<Instruction>> instructions;
   std::vector<unsigned> logical_preds;
   std::vector<unsigned> linear_preds;
   std::vector<unsigned> logical_succs;
   std::vector<unsigned> linear_succs;
   RegisterDemand register_demand = RegisterDemand();
   uint16_t loop_nest_depth = 0;
   uint16_t divergent_if_logical_depth = 0;
   uint16_t uniform_if_depth = 0;
   uint16_t kind = 0;
   int logical_idom = -1;
   int linear_idom = -1;

   /* this information is needed for predecessors to blocks with phis when
    * moving out of ssa */
   bool scc_live_out = false;

   Block() : index(0) {}
};

/*
 * Shader stages as provided in Vulkan by the application. Contrast this to HWStage.
 */
enum class SWStage : uint16_t {
   None = 0,
   VS = 1 << 0,     /* Vertex Shader */
   GS = 1 << 1,     /* Geometry Shader */
   TCS = 1 << 2,    /* Tessellation Control aka Hull Shader */
   TES = 1 << 3,    /* Tessellation Evaluation aka Domain Shader */
   FS = 1 << 4,     /* Fragment aka Pixel Shader */
   CS = 1 << 5,     /* Compute Shader */
   TS = 1 << 6,     /* Task Shader */
   MS = 1 << 7,     /* Mesh Shader */
   GSCopy = 1 << 8, /* GS Copy Shader (internal) */

   /* Stage combinations merged to run on a single HWStage */
   VS_GS = VS | GS,
   VS_TCS = VS | TCS,
   TES_GS = TES | GS,
};

constexpr SWStage
operator|(SWStage a, SWStage b)
{
   return static_cast<SWStage>(static_cast<uint16_t>(a) | static_cast<uint16_t>(b));
}

/*
 * Shader stages as running on the AMD GPU.
 *
 * The relation between HWStages and SWStages is not a one-to-one mapping:
 * Some SWStages are merged by ACO to run on a single HWStage.
 * See README.md for details.
 */
enum class HWStage : uint8_t {
   VS,
   ES,  /* Export shader: pre-GS (VS or TES) on GFX6-8. Combined into GS on GFX9 (and GFX10/legacy). */
   GS,  /* Geometry shader on GFX10/legacy and GFX6-9. */
   NGG, /* Primitive shader, used to implement VS, TES, GS. */
   LS,  /* Local shader: pre-TCS (VS) on GFX6-8. Combined into HS on GFX9 (and GFX10/legacy). */
   HS,  /* Hull shader: TCS on GFX6-8. Merged VS and TCS on GFX9-10. */
   FS,
   CS,
};

/*
 * Set of SWStages to be merged into a single shader paired with the
 * HWStage it will run on.
 */
struct Stage {
   constexpr Stage() = default;

   explicit constexpr Stage(HWStage hw_, SWStage sw_) : sw(sw_), hw(hw_) {}

   /* Check if the given SWStage is included */
   constexpr bool has(SWStage stage) const
   {
      return (static_cast<uint16_t>(sw) & static_cast<uint16_t>(stage));
   }

   unsigned num_sw_stages() const { return util_bitcount(static_cast<uint16_t>(sw)); }

   constexpr bool operator==(const Stage& other) const { return sw == other.sw && hw == other.hw; }

   constexpr bool operator!=(const Stage& other) const { return sw != other.sw || hw != other.hw; }

   /* Mask of merged software stages */
   SWStage sw = SWStage::None;

   /* Active hardware stage */
   HWStage hw{};
};

/* possible settings of Program::stage */
static constexpr Stage vertex_vs(HWStage::VS, SWStage::VS);
static constexpr Stage fragment_fs(HWStage::FS, SWStage::FS);
static constexpr Stage compute_cs(HWStage::CS, SWStage::CS);
static constexpr Stage tess_eval_vs(HWStage::VS, SWStage::TES);
static constexpr Stage gs_copy_vs(HWStage::VS, SWStage::GSCopy);
/* Mesh shading pipeline */
static constexpr Stage task_cs(HWStage::CS, SWStage::TS);
static constexpr Stage mesh_ngg(HWStage::NGG, SWStage::MS);
/* GFX10/NGG */
static constexpr Stage vertex_ngg(HWStage::NGG, SWStage::VS);
static constexpr Stage vertex_geometry_ngg(HWStage::NGG, SWStage::VS_GS);
static constexpr Stage tess_eval_ngg(HWStage::NGG, SWStage::TES);
static constexpr Stage tess_eval_geometry_ngg(HWStage::NGG, SWStage::TES_GS);
/* GFX9 (and GFX10 if NGG isn't used) */
static constexpr Stage vertex_geometry_gs(HWStage::GS, SWStage::VS_GS);
static constexpr Stage vertex_tess_control_hs(HWStage::HS, SWStage::VS_TCS);
static constexpr Stage tess_eval_geometry_gs(HWStage::GS, SWStage::TES_GS);
/* pre-GFX9 */
static constexpr Stage vertex_ls(HWStage::LS, SWStage::VS); /* vertex before tessellation control */
static constexpr Stage vertex_es(HWStage::ES, SWStage::VS); /* vertex before geometry */
static constexpr Stage tess_control_hs(HWStage::HS, SWStage::TCS);
static constexpr Stage tess_eval_es(HWStage::ES,
                                    SWStage::TES); /* tessellation evaluation before geometry */
static constexpr Stage geometry_gs(HWStage::GS, SWStage::GS);
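
/* For example, vertex_geometry_gs describes a merged VS+GS shader running on
 * the hardware GS stage: both vertex_geometry_gs.has(SWStage::VS) and
 * vertex_geometry_gs.has(SWStage::GS) hold, and num_sw_stages() returns 2. */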

enum statistic {
   statistic_hash,
   statistic_instructions,
   statistic_copies,
   statistic_branches,
   statistic_latency,
   statistic_inv_throughput,
   statistic_vmem_clauses,
   statistic_smem_clauses,
   statistic_sgpr_presched,
   statistic_vgpr_presched,
   num_statistics
};

struct DeviceInfo {
   uint16_t lds_encoding_granule;
   uint16_t lds_alloc_granule;
   uint32_t lds_limit; /* in bytes */
   bool has_16bank_lds;
   uint16_t physical_sgprs;
   uint16_t physical_vgprs;
   uint16_t vgpr_limit;
   uint16_t sgpr_limit;
   uint16_t sgpr_alloc_granule;
   uint16_t vgpr_alloc_granule; /* must be power of two */
   unsigned max_wave64_per_simd;
   unsigned simd_per_cu;
   bool has_fast_fma32 = false;
   bool has_mac_legacy32 = false;
   bool fused_mad_mix = false;
   bool xnack_enabled = false;
   bool sram_ecc_enabled = false;

   int16_t scratch_global_offset_min;
   int16_t scratch_global_offset_max;
};

enum class CompilationProgress {
   after_isel,
   after_spilling,
   after_ra,
};

class Program final {
public:
   std::vector<Block> blocks;
   std::vector<RegClass> temp_rc = {s1};
   RegisterDemand max_reg_demand = RegisterDemand();
   ac_shader_config* config;
   struct aco_shader_info info;
   enum amd_gfx_level gfx_level;
   enum radeon_family family;
   DeviceInfo dev;
   unsigned wave_size;
   RegClass lane_mask;
   Stage stage;
   bool needs_exact = false; /* there exists an instruction with disable_wqm = true */
   bool needs_wqm = false;   /* there exists a p_wqm instruction */

   std::vector<uint8_t> constant_data;
   Temp private_segment_buffer;
   Temp scratch_offset;

   uint16_t num_waves = 0;
   uint16_t min_waves = 0;
   unsigned workgroup_size; /* if known; otherwise UINT_MAX */
   bool wgp_mode;
   bool early_rast = false; /* whether rasterization can start as soon as the 1st DONE pos export */

   bool needs_vcc = false;

   CompilationProgress progress;

   bool collect_statistics = false;
   uint32_t statistics[num_statistics];

   float_mode next_fp_mode;
   unsigned next_loop_depth = 0;
   unsigned next_divergent_if_logical_depth = 0;
   unsigned next_uniform_if_depth = 0;

   std::vector<Definition> vs_inputs;

   struct {
      FILE* output = stderr;
      bool shorten_messages = false;
      void (*func)(void* private_data, enum aco_compiler_debug_level level, const char* message);
      void* private_data;
   } debug;

   uint32_t allocateId(RegClass rc)
   {
      /* temp IDs are limited to 24 bits (16777215 == (1 << 24) - 1) */
      assert(allocationID <= 16777215);
      temp_rc.push_back(rc);
      return allocationID++;
   }

   void allocateRange(unsigned amount)
   {
      assert(allocationID + amount <= 16777216);
      temp_rc.resize(temp_rc.size() + amount);
      allocationID += amount;
   }

   Temp allocateTmp(RegClass rc) { return Temp(allocateId(rc), rc); }

   uint32_t peekAllocationId() { return allocationID; }

   friend void reindex_ssa(Program* program);
   friend void reindex_ssa(Program* program, std::vector<IDSet>& live_out);

   Block* create_and_insert_block()
   {
      Block block;
      return insert_block(std::move(block));
   }

   Block* insert_block(Block&& block)
   {
      block.index = blocks.size();
      block.fp_mode = next_fp_mode;
      block.loop_nest_depth = next_loop_depth;
      block.divergent_if_logical_depth = next_divergent_if_logical_depth;
      block.uniform_if_depth = next_uniform_if_depth;
      blocks.emplace_back(std::move(block));
      return &blocks.back();
   }

private:
   uint32_t allocationID = 1;
};
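
/* A minimal usage sketch (illustrative only): append a fresh block and emit a
 * previously built instruction into it; "program" and "instr" are assumed to
 * exist.
 *
 *    Block* block = program->create_and_insert_block();
 *    block->kind |= block_kind_top_level;
 *    block->instructions.emplace_back(std::move(instr));
 */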

struct live {
   /* live temps out per block */
   std::vector<IDSet> live_out;
   /* register demand (sgpr/vgpr) per instruction per block */
   std::vector<std::vector<RegisterDemand>> register_demand;
};

struct ra_test_policy {
   /* Force RA to always use its pessimistic fallback algorithm */
   bool skip_optimistic_path = false;
};

void init();

void init_program(Program* program, Stage stage, const struct aco_shader_info* info,
                  enum amd_gfx_level gfx_level, enum radeon_family family, bool wgp_mode,
                  ac_shader_config* config);

void select_program(Program* program, unsigned shader_count, struct nir_shader* const* shaders,
                    ac_shader_config* config, const struct aco_compiler_options* options,
                    const struct aco_shader_info* info,
                    const struct radv_shader_args* args);
void select_gs_copy_shader(Program* program, struct nir_shader* gs_shader, ac_shader_config* config,
                           const struct aco_compiler_options* options,
                           const struct aco_shader_info* info,
                           const struct radv_shader_args* args);
void select_trap_handler_shader(Program* program, struct nir_shader* shader,
                                ac_shader_config* config,
                                const struct aco_compiler_options* options,
                                const struct aco_shader_info* info,
                                const struct radv_shader_args* args);
void select_vs_prolog(Program* program, const struct aco_vs_prolog_key* key,
                      ac_shader_config* config,
                      const struct aco_compiler_options* options,
                      const struct aco_shader_info* info,
                      const struct radv_shader_args* args,
                      unsigned* num_preserved_sgprs);

void select_ps_epilog(Program* program, const struct aco_ps_epilog_key* key,
                      ac_shader_config* config,
                      const struct aco_compiler_options* options,
                      const struct aco_shader_info* info,
                      const struct radv_shader_args* args);

void lower_phis(Program* program);
void calc_min_waves(Program* program);
void update_vgpr_sgpr_demand(Program* program, const RegisterDemand new_demand);
live live_var_analysis(Program* program);
std::vector<uint16_t> dead_code_analysis(Program* program);
void dominator_tree(Program* program);
void insert_exec_mask(Program* program);
void value_numbering(Program* program);
void optimize(Program* program);
void optimize_postRA(Program* program);
void setup_reduce_temp(Program* program);
void lower_to_cssa(Program* program, live& live_vars);
void register_allocation(Program* program, std::vector<IDSet>& live_out_per_block,
                         ra_test_policy = {});
void ssa_elimination(Program* program);
void lower_to_hw_instr(Program* program);
void schedule_program(Program* program, live& live_vars);
void spill(Program* program, live& live_vars);
void insert_wait_states(Program* program);
void insert_NOPs(Program* program);
void form_hard_clauses(Program* program);
unsigned emit_program(Program* program, std::vector<uint32_t>& code);
/**
 * Returns true if print_asm can disassemble the given program for the current build/runtime
 * configuration
 */
bool check_print_asm_support(Program* program);
bool print_asm(Program* program, std::vector<uint32_t>& binary, unsigned exec_size, FILE* output);
bool validate_ir(Program* program);
bool validate_ra(Program* program);
#ifndef NDEBUG
void perfwarn(Program* program, bool cond, const char* msg, Instruction* instr = NULL);
#else
#define perfwarn(program, cond, msg, ...)                                                          \
   do {                                                                                            \
   } while (0)
#endif

void collect_presched_stats(Program* program);
void collect_preasm_stats(Program* program);
void collect_postasm_stats(Program* program, const std::vector<uint32_t>& code);

enum print_flags {
   print_no_ssa = 0x1,
   print_perf_info = 0x2,
   print_kill = 0x4,
   print_live_vars = 0x8,
};

void aco_print_operand(const Operand* operand, FILE* output, unsigned flags = 0);
void aco_print_instr(const Instruction* instr, FILE* output, unsigned flags = 0);
void aco_print_program(const Program* program, FILE* output, unsigned flags = 0);
void aco_print_program(const Program* program, FILE* output, const live& live_vars,
                       unsigned flags = 0);

void _aco_perfwarn(Program* program, const char* file, unsigned line, const char* fmt, ...);
void _aco_err(Program* program, const char* file, unsigned line, const char* fmt, ...);

#define aco_perfwarn(program, ...) _aco_perfwarn(program, __FILE__, __LINE__, __VA_ARGS__)
#define aco_err(program, ...) _aco_err(program, __FILE__, __LINE__, __VA_ARGS__)

/* utilities for dealing with register demand */
RegisterDemand get_live_changes(aco_ptr<Instruction>& instr);
RegisterDemand get_temp_registers(aco_ptr<Instruction>& instr);
RegisterDemand get_demand_before(RegisterDemand demand, aco_ptr<Instruction>& instr,
                                 aco_ptr<Instruction>& instr_before);

/* number of sgprs that need to be allocated but might not be addressable as s0-s105 */
uint16_t get_extra_sgprs(Program* program);

/* adjust num_waves for workgroup size and LDS limits */
uint16_t max_suitable_waves(Program* program, uint16_t waves);

/* get the number of sgprs/vgprs that must be allocated to address the given
 * number of sgprs/vgprs */
uint16_t get_sgpr_alloc(Program* program, uint16_t addressable_sgprs);
uint16_t get_vgpr_alloc(Program* program, uint16_t addressable_vgprs);

/* return the number of addressable sgprs/vgprs for max_waves */
uint16_t get_addr_sgpr_from_waves(Program* program, uint16_t max_waves);
uint16_t get_addr_vgpr_from_waves(Program* program, uint16_t max_waves);

typedef struct {
   const int16_t opcode_gfx7[static_cast<int>(aco_opcode::num_opcodes)];
   const int16_t opcode_gfx9[static_cast<int>(aco_opcode::num_opcodes)];
   const int16_t opcode_gfx10[static_cast<int>(aco_opcode::num_opcodes)];
   const std::bitset<static_cast<int>(aco_opcode::num_opcodes)> can_use_input_modifiers;
   const std::bitset<static_cast<int>(aco_opcode::num_opcodes)> can_use_output_modifiers;
   const std::bitset<static_cast<int>(aco_opcode::num_opcodes)> is_atomic;
   const char* name[static_cast<int>(aco_opcode::num_opcodes)];
   const aco::Format format[static_cast<int>(aco_opcode::num_opcodes)];
   /* sizes used for input/output modifiers and constants */
   const unsigned operand_size[static_cast<int>(aco_opcode::num_opcodes)];
   const instr_class classes[static_cast<int>(aco_opcode::num_opcodes)];
} Info;

extern const Info instr_info;

} // namespace aco

#endif /* ACO_IR_H */