1 /*
2 * Copyright © 2018 Valve Corporation
3 *
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8 * and/or sell copies of the Software, and to permit persons to whom the
9 * Software is furnished to do so, subject to the following conditions:
10 *
11 * The above copyright notice and this permission notice (including the next
12 * paragraph) shall be included in all copies or substantial portions of the
13 * Software.
14 *
15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
21 * IN THE SOFTWARE.
22 *
23 */
24
25 #ifndef ACO_IR_H
26 #define ACO_IR_H
27
28 #include "aco_opcodes.h"
29 #include "aco_util.h"
30
31 #include "vulkan/radv_shader.h"
32
33 #include "nir.h"
34
35 #include <bitset>
36 #include <memory>
37 #include <vector>
38
39 struct radv_shader_args;
40 struct radv_shader_info;
41 struct radv_vs_prolog_key;
42
43 namespace aco {
44
45 extern uint64_t debug_flags;
46
/* Bit flags for the global debug_flags bitmask declared above
 * (NOTE(review): presumably parsed from an environment variable such as
 * ACO_DEBUG — confirm where debug_flags is initialized). */
enum {
   DEBUG_VALIDATE_IR = 0x1,
   DEBUG_VALIDATE_RA = 0x2,
   DEBUG_PERFWARN = 0x4,
   DEBUG_FORCE_WAITCNT = 0x8,
   DEBUG_NO_VN = 0x10,
   DEBUG_NO_OPT = 0x20,
   DEBUG_NO_SCHED = 0x40,
   DEBUG_PERF_INFO = 0x80,
   DEBUG_LIVE_INFO = 0x100,
};
58
/**
 * Representation of the instruction's microcode encoding format
 * Note: Some Vector ALU Formats can be combined, such that:
 * - VOP2* | VOP3 represents a VOP2 instruction in VOP3 encoding
 * - VOP2* | DPP represents a VOP2 instruction with data parallel primitive.
 * - VOP2* | SDWA represents a VOP2 instruction with sub-dword addressing.
 *
 * (*) The same is applicable for VOP1 and VOPC instructions.
 */
enum class Format : std::uint16_t {
   /* Pseudo Instruction Format */
   PSEUDO = 0,
   /* Scalar ALU & Control Formats */
   SOP1 = 1,
   SOP2 = 2,
   SOPK = 3,
   SOPP = 4,
   SOPC = 5,
   /* Scalar Memory Format */
   SMEM = 6,
   /* LDS/GDS Format */
   DS = 8,
   /* Vector Memory Buffer Formats */
   MTBUF = 9,
   MUBUF = 10,
   /* Vector Memory Image Format */
   MIMG = 11,
   /* Export Format */
   EXP = 12,
   /* Flat Formats */
   FLAT = 13,
   GLOBAL = 14,
   SCRATCH = 15,

   PSEUDO_BRANCH = 16,
   PSEUDO_BARRIER = 17,
   PSEUDO_REDUCTION = 18,

   /* Vector ALU Formats */
   VOP3P = 19,
   /* The formats below are distinct single bits so that they can be OR'd
    * together (see asVOP3()/asSDWA()/withoutDPP() below). */
   VOP1 = 1 << 8,
   VOP2 = 1 << 9,
   VOPC = 1 << 10,
   VOP3 = 1 << 11,
   /* Vector Parameter Interpolation Format */
   VINTRP = 1 << 12,
   DPP = 1 << 13,
   SDWA = 1 << 14,
};
108
/* Coarse classification of instructions; `count` is the number of classes.
 * NOTE(review): presumably used for scheduling/performance modelling —
 * confirm at the use sites. */
enum class instr_class : uint8_t {
   valu32 = 0,
   valu_convert32 = 1,
   valu64 = 2,
   valu_quarter_rate32 = 3,
   valu_fma = 4,
   valu_transcendental32 = 5,
   valu_double = 6,
   valu_double_add = 7,
   valu_double_convert = 8,
   valu_double_transcendental = 9,
   salu = 10,
   smem = 11,
   barrier = 12,
   branch = 13,
   sendmsg = 14,
   ds = 15,
   exp = 16,
   vmem = 17,
   waitcnt = 18,
   other = 19,
   count,
};
132
/* Memory locations an access may touch. The values are single bits so
 * several classes can be combined in one mask; storage_count is the number
 * of bits reserved for storage classes. */
enum storage_class : uint8_t {
   storage_none = 0x0, /* no synchronization and can be reordered around aliasing stores */
   storage_buffer = 0x1, /* SSBOs and global memory */
   storage_atomic_counter = 0x2, /* not used for Vulkan */
   storage_image = 0x4,
   storage_shared = 0x8, /* or TCS output */
   storage_vmem_output = 0x10, /* GS or TCS output stores using VMEM */
   storage_scratch = 0x20,
   storage_vgpr_spill = 0x40,
   storage_count = 8,
};
144
/* Ordering/visibility semantics attached to a memory access or barrier;
 * combinable bit flags. */
enum memory_semantics : uint8_t {
   semantic_none = 0x0,
   /* for loads: don't move any access after this load to before this load (even other loads)
    * for barriers: don't move any access after the barrier to before any
    * atomics/control_barriers/sendmsg_gs_done before the barrier */
   semantic_acquire = 0x1,
   /* for stores: don't move any access before this store to after this store
    * for barriers: don't move any access before the barrier to after any
    * atomics/control_barriers/sendmsg_gs_done after the barrier */
   semantic_release = 0x2,

   /* the rest are for load/stores/atomics only */
   /* cannot be DCE'd or CSE'd */
   semantic_volatile = 0x4,
   /* does not interact with barriers and assumes this lane is the only lane
    * accessing this memory */
   semantic_private = 0x8,
   /* this operation can be reordered around operations of the same storage.
    * says nothing about barriers */
   semantic_can_reorder = 0x10,
   /* this is an atomic instruction (may only read or write memory) */
   semantic_atomic = 0x20,
   /* this instruction both reads and writes memory */
   semantic_rmw = 0x40,

   semantic_acqrel = semantic_acquire | semantic_release,
   semantic_atomicrmw = semantic_volatile | semantic_atomic | semantic_rmw,
};
173
/* Synchronization scopes, ordered from narrowest to widest. */
enum sync_scope : uint8_t {
   scope_invocation = 0,
   scope_subgroup = 1,
   scope_workgroup = 2,
   scope_queuefamily = 3,
   scope_device = 4,
};
181
182 struct memory_sync_info {
memory_sync_infomemory_sync_info183 memory_sync_info() : storage(storage_none), semantics(semantic_none), scope(scope_invocation) {}
184 memory_sync_info(int storage_, int semantics_ = 0, sync_scope scope_ = scope_invocation)
185 : storage((storage_class)storage_), semantics((memory_semantics)semantics_), scope(scope_)
186 {}
187
188 storage_class storage : 8;
189 memory_semantics semantics : 8;
190 sync_scope scope : 8;
191
192 bool operator==(const memory_sync_info& rhs) const
193 {
194 return storage == rhs.storage && semantics == rhs.semantics && scope == rhs.scope;
195 }
196
can_reordermemory_sync_info197 bool can_reorder() const
198 {
199 if (semantics & semantic_acqrel)
200 return false;
201 /* Also check storage so that zero-initialized memory_sync_info can be
202 * reordered. */
203 return (!storage || (semantics & semantic_can_reorder)) && !(semantics & semantic_volatile);
204 }
205 };
206 static_assert(sizeof(memory_sync_info) == 3, "Unexpected padding");
207
/* Floating-point rounding modes (ne = nearest-even, pi = toward +inf,
 * ni = toward -inf, tz = toward zero); encoding used by float_mode below. */
enum fp_round {
   fp_round_ne = 0,
   fp_round_pi = 1,
   fp_round_ni = 2,
   fp_round_tz = 3,
};
214
/* Denormal handling: bit 0 keeps input denormals, bit 1 keeps output
 * denormals; encoding used by float_mode below. */
enum fp_denorm {
   /* Note that v_rcp_f32, v_exp_f32, v_log_f32, v_sqrt_f32, v_rsq_f32 and
    * v_mad_f32/v_madak_f32/v_madmk_f32/v_mac_f32 always flush denormals. */
   fp_denorm_flush = 0x0,
   fp_denorm_keep_in = 0x1,
   fp_denorm_keep_out = 0x2,
   fp_denorm_keep = 0x3,
};
223
/* Floating-point behavior required by a shader (or part of one). */
struct float_mode {
   /* matches encoding of the MODE register */
   union {
      struct {
         fp_round round32 : 2;
         fp_round round16_64 : 2;
         unsigned denorm32 : 2;
         unsigned denorm16_64 : 2;
      };
      /* aggregate views of the two rounding / two denormal fields above */
      struct {
         uint8_t round : 4;
         uint8_t denorm : 4;
      };
      uint8_t val = 0;
   };
   /* if false, optimizations which may remove infs/nan/-0.0 can be done */
   bool preserve_signed_zero_inf_nan32 : 1;
   bool preserve_signed_zero_inf_nan16_64 : 1;
   /* if false, optimizations which may remove denormal flushing can be done */
   bool must_flush_denorms32 : 1;
   bool must_flush_denorms16_64 : 1;
   bool care_about_round32 : 1;
   bool care_about_round16_64 : 1;

   /* Returns true if instructions using the mode "other" can safely use the
    * current one instead: the MODE bits must match exactly and each of this
    * mode's restriction flags must imply the corresponding flag of "other". */
   bool canReplace(float_mode other) const noexcept
   {
      return val == other.val &&
             (preserve_signed_zero_inf_nan32 || !other.preserve_signed_zero_inf_nan32) &&
             (preserve_signed_zero_inf_nan16_64 || !other.preserve_signed_zero_inf_nan16_64) &&
             (must_flush_denorms32 || !other.must_flush_denorms32) &&
             (must_flush_denorms16_64 || !other.must_flush_denorms16_64) &&
             (care_about_round32 || !other.care_about_round32) &&
             (care_about_round16_64 || !other.care_about_round16_64);
   }
};
261
/* Immediate of an s_waitcnt-style wait: one value per event counter.
 * NOTE(review): member functions are defined out-of-line; vm/exp/lgkm
 * presumably map to the hardware vmcnt/expcnt/lgkmcnt fields and vs to
 * vscnt — confirm against the pack() implementation. */
struct wait_imm {
   /* marks a counter that is not waited on */
   static const uint8_t unset_counter = 0xff;

   uint8_t vm;
   uint8_t exp;
   uint8_t lgkm;
   uint8_t vs;

   wait_imm();
   wait_imm(uint16_t vm_, uint16_t exp_, uint16_t lgkm_, uint16_t vs_);
   /* decode from a packed hardware immediate */
   wait_imm(enum chip_class chip, uint16_t packed);

   /* encode into a packed hardware immediate */
   uint16_t pack(enum chip_class chip) const;

   /* merge with another wait_imm; returns whether this object changed */
   bool combine(const wait_imm& other);

   bool empty() const;
};
280
281 constexpr Format
asVOP3(Format format)282 asVOP3(Format format)
283 {
284 return (Format)((uint32_t)Format::VOP3 | (uint32_t)format);
285 };
286
287 constexpr Format
asSDWA(Format format)288 asSDWA(Format format)
289 {
290 assert(format == Format::VOP1 || format == Format::VOP2 || format == Format::VOPC);
291 return (Format)((uint32_t)Format::SDWA | (uint32_t)format);
292 }
293
294 constexpr Format
withoutDPP(Format format)295 withoutDPP(Format format)
296 {
297 return (Format)((uint32_t)format & ~(uint32_t)Format::DPP);
298 }
299
/* Register file a temporary lives in (see RegClass for the linear-vgpr
 * distinction). */
enum class RegType {
   none = 0,
   sgpr,
   vgpr,
   linear_vgpr,
};
306
/* Register class: type and size packed into a single byte.
 * bits [4:0] = size (in dwords, or in bytes for sub-dword classes),
 * bit 5 = vgpr, bit 6 = linear, bit 7 = sub-dword. */
struct RegClass {

   enum RC : uint8_t {
      s1 = 1,
      s2 = 2,
      s3 = 3,
      s4 = 4,
      s6 = 6,
      s8 = 8,
      s16 = 16,
      v1 = s1 | (1 << 5),
      v2 = s2 | (1 << 5),
      v3 = s3 | (1 << 5),
      v4 = s4 | (1 << 5),
      v5 = 5 | (1 << 5),
      v6 = 6 | (1 << 5),
      v7 = 7 | (1 << 5),
      v8 = 8 | (1 << 5),
      /* byte-sized register class */
      v1b = v1 | (1 << 7),
      v2b = v2 | (1 << 7),
      v3b = v3 | (1 << 7),
      v4b = v4 | (1 << 7),
      v6b = v6 | (1 << 7),
      v8b = v8 | (1 << 7),
      /* these are used for WWM and spills to vgpr */
      v1_linear = v1 | (1 << 6),
      v2_linear = v2 | (1 << 6),
   };

   RegClass() = default;
   constexpr RegClass(RC rc_) : rc(rc_) {}
   /* size is in dwords here */
   constexpr RegClass(RegType type, unsigned size)
       : rc((RC)((type == RegType::vgpr ? 1 << 5 : 0) | size))
   {}

   constexpr operator RC() const { return rc; }
   explicit operator bool() = delete;

   constexpr RegType type() const { return rc <= RC::s16 ? RegType::sgpr : RegType::vgpr; }
   constexpr bool is_linear_vgpr() const { return rc & (1 << 6); };
   constexpr bool is_subdword() const { return rc & (1 << 7); }
   /* size in bytes: the low 5 bits count bytes for sub-dword classes,
    * dwords otherwise */
   constexpr unsigned bytes() const { return ((unsigned)rc & 0x1F) * (is_subdword() ? 1 : 4); }
   // TODO: use size() less in favor of bytes()
   /* size in dwords, rounded up for sub-dword classes */
   constexpr unsigned size() const { return (bytes() + 3) >> 2; }
   /* all sgpr classes count as linear, vgprs only with the linear bit set */
   constexpr bool is_linear() const { return rc <= RC::s16 || is_linear_vgpr(); }
   constexpr RegClass as_linear() const { return RegClass((RC)(rc | (1 << 6))); }
   constexpr RegClass as_subdword() const { return RegClass((RC)(rc | 1 << 7)); }

   /* Builds a class from a size in bytes; vgpr sizes that are not a multiple
    * of four become sub-dword classes, sgpr sizes are rounded up to dwords. */
   static constexpr RegClass get(RegType type, unsigned bytes)
   {
      if (type == RegType::sgpr) {
         return RegClass(type, DIV_ROUND_UP(bytes, 4u));
      } else {
         return bytes % 4u ? RegClass(type, bytes).as_subdword() : RegClass(type, bytes / 4u);
      }
   }

   /* Same type (including linearity) resized to the given byte count. */
   constexpr RegClass resize(unsigned bytes) const
   {
      if (is_linear_vgpr()) {
         assert(bytes % 4u == 0);
         return get(RegType::vgpr, bytes).as_linear();
      }
      return get(type(), bytes);
   }

private:
   RC rc;
};
377
/* transitional helper expressions */
/* shorthand constants mirroring the RegClass::RC enumerators of the same name */
static constexpr RegClass s1{RegClass::s1};
static constexpr RegClass s2{RegClass::s2};
static constexpr RegClass s3{RegClass::s3};
static constexpr RegClass s4{RegClass::s4};
static constexpr RegClass s8{RegClass::s8};
static constexpr RegClass s16{RegClass::s16};
static constexpr RegClass v1{RegClass::v1};
static constexpr RegClass v2{RegClass::v2};
static constexpr RegClass v3{RegClass::v3};
static constexpr RegClass v4{RegClass::v4};
static constexpr RegClass v5{RegClass::v5};
static constexpr RegClass v6{RegClass::v6};
static constexpr RegClass v7{RegClass::v7};
static constexpr RegClass v8{RegClass::v8};
static constexpr RegClass v1b{RegClass::v1b};
static constexpr RegClass v2b{RegClass::v2b};
static constexpr RegClass v3b{RegClass::v3b};
static constexpr RegClass v4b{RegClass::v4b};
static constexpr RegClass v6b{RegClass::v6b};
static constexpr RegClass v8b{RegClass::v8b};
399
400 /**
401 * Temp Class
402 * Each temporary virtual register has a
403 * register class (i.e. size and type)
404 * and SSA id.
405 */
406 struct Temp {
TempTemp407 Temp() noexcept : id_(0), reg_class(0) {}
TempTemp408 constexpr Temp(uint32_t id, RegClass cls) noexcept : id_(id), reg_class(uint8_t(cls)) {}
409
idTemp410 constexpr uint32_t id() const noexcept { return id_; }
regClassTemp411 constexpr RegClass regClass() const noexcept { return (RegClass::RC)reg_class; }
412
bytesTemp413 constexpr unsigned bytes() const noexcept { return regClass().bytes(); }
sizeTemp414 constexpr unsigned size() const noexcept { return regClass().size(); }
typeTemp415 constexpr RegType type() const noexcept { return regClass().type(); }
is_linearTemp416 constexpr bool is_linear() const noexcept { return regClass().is_linear(); }
417
418 constexpr bool operator<(Temp other) const noexcept { return id() < other.id(); }
419 constexpr bool operator==(Temp other) const noexcept { return id() == other.id(); }
420 constexpr bool operator!=(Temp other) const noexcept { return id() != other.id(); }
421
422 private:
423 uint32_t id_ : 24;
424 uint32_t reg_class : 8;
425 };
426
427 /**
428 * PhysReg
429 * Represents the physical register for each
430 * Operand and Definition.
431 */
432 struct PhysReg {
433 constexpr PhysReg() = default;
PhysRegPhysReg434 explicit constexpr PhysReg(unsigned r) : reg_b(r << 2) {}
regPhysReg435 constexpr unsigned reg() const { return reg_b >> 2; }
bytePhysReg436 constexpr unsigned byte() const { return reg_b & 0x3; }
437 constexpr operator unsigned() const { return reg(); }
438 constexpr bool operator==(PhysReg other) const { return reg_b == other.reg_b; }
439 constexpr bool operator!=(PhysReg other) const { return reg_b != other.reg_b; }
440 constexpr bool operator<(PhysReg other) const { return reg_b < other.reg_b; }
advancePhysReg441 constexpr PhysReg advance(int bytes) const
442 {
443 PhysReg res = *this;
444 res.reg_b += bytes;
445 return res;
446 }
447
448 uint16_t reg_b = 0;
449 };
450
/* helper expressions for special registers
 * (NOTE(review): 251-253 lie in the range used for source-operand encodings,
 * so vccz/execz/scc are presumably only usable as sources — confirm) */
static constexpr PhysReg m0{124};
static constexpr PhysReg vcc{106};
static constexpr PhysReg vcc_hi{107};
static constexpr PhysReg tba{108}; /* GFX6-GFX8 */
static constexpr PhysReg tma{110}; /* GFX6-GFX8 */
static constexpr PhysReg ttmp0{112};
static constexpr PhysReg ttmp1{113};
static constexpr PhysReg ttmp2{114};
static constexpr PhysReg ttmp3{115};
static constexpr PhysReg ttmp4{116};
static constexpr PhysReg ttmp5{117};
static constexpr PhysReg ttmp6{118};
static constexpr PhysReg ttmp7{119};
static constexpr PhysReg ttmp8{120};
static constexpr PhysReg ttmp9{121};
static constexpr PhysReg ttmp10{122};
static constexpr PhysReg ttmp11{123};
static constexpr PhysReg sgpr_null{125}; /* GFX10+ */
static constexpr PhysReg exec{126};
static constexpr PhysReg exec_lo{126};
static constexpr PhysReg exec_hi{127};
static constexpr PhysReg vccz{251};
static constexpr PhysReg execz{252};
static constexpr PhysReg scc{253};
476
/**
 * Operand Class
 * Initially, each Operand refers to either
 * a temporary virtual register
 * or to a constant value
 * Temporary registers get mapped to physical register during RA
 * Constant values are inlined into the instruction sequence.
 */
class Operand final {
public:
   /* Default: an undefined operand fixed to encoding 128 (the same register
    * index the inline constant 0 uses). */
   constexpr Operand()
       : reg_(PhysReg{128}), isTemp_(false), isFixed_(true), isConstant_(false), isKill_(false),
         isUndef_(true), isFirstKill_(false), constSize(0), isLateKill_(false), is16bit_(false),
         is24bit_(false), signext(false)
   {}

   /* Operand referring to a temporary; a temp with id 0 produces an
    * undefined operand instead. */
   explicit Operand(Temp r) noexcept
   {
      data_.temp = r;
      if (r.id()) {
         isTemp_ = true;
      } else {
         isUndef_ = true;
         setFixed(PhysReg{128});
      }
   };
   /* Operand referring to a temporary already assigned a physical register. */
   explicit Operand(Temp r, PhysReg reg) noexcept
   {
      assert(r.id()); /* Don't allow fixing an undef to a register */
      data_.temp = r;
      isTemp_ = true;
      setFixed(reg);
   };

   /* 8-bit constant */
   static Operand c8(uint8_t v) noexcept
   {
      /* 8-bit constants are only used for copies and copies from any 8-bit
       * constant can be implemented with a SDWA v_mul_u32_u24. So consider all
       * to be inline constants. */
      Operand op;
      op.control_ = 0;
      op.data_.i = v;
      op.isConstant_ = true;
      op.constSize = 0;
      op.setFixed(PhysReg{0u});
      return op;
   };

   /* 16-bit constant. The fixed register selects the hardware inline-constant
    * encoding: 128-192 for integers [0..64], 193-208 for [-1..-16], 240-248
    * for common floats, 255 for a literal emitted as an extra dword. */
   static Operand c16(uint16_t v) noexcept
   {
      Operand op;
      op.control_ = 0;
      op.data_.i = v;
      op.isConstant_ = true;
      op.constSize = 1;
      if (v <= 64)
         op.setFixed(PhysReg{128u + v});
      else if (v >= 0xFFF0) /* [-16 .. -1] */
         op.setFixed(PhysReg{(unsigned)(192 - (int16_t)v)});
      else if (v == 0x3800) /* 0.5 */
         op.setFixed(PhysReg{240});
      else if (v == 0xB800) /* -0.5 */
         op.setFixed(PhysReg{241});
      else if (v == 0x3C00) /* 1.0 */
         op.setFixed(PhysReg{242});
      else if (v == 0xBC00) /* -1.0 */
         op.setFixed(PhysReg{243});
      else if (v == 0x4000) /* 2.0 */
         op.setFixed(PhysReg{244});
      else if (v == 0xC000) /* -2.0 */
         op.setFixed(PhysReg{245});
      else if (v == 0x4400) /* 4.0 */
         op.setFixed(PhysReg{246});
      else if (v == 0xC400) /* -4.0 */
         op.setFixed(PhysReg{247});
      else if (v == 0x3118) /* 1/2 PI */
         op.setFixed(PhysReg{248});
      else /* Literal Constant */
         op.setFixed(PhysReg{255});
      return op;
   }

   /* 32-bit constant */
   static Operand c32(uint32_t v) noexcept { return c32_or_c64(v, false); }

   /* 64-bit constant. data_.i holds the 32-bit value the equivalent inline
    * constant would have (or the low literal dword, see constantValue64()). */
   static Operand c64(uint64_t v) noexcept
   {
      Operand op;
      op.control_ = 0;
      op.isConstant_ = true;
      op.constSize = 3;
      if (v <= 64) {
         op.data_.i = (uint32_t)v;
         op.setFixed(PhysReg{128 + (uint32_t)v});
      } else if (v >= 0xFFFFFFFFFFFFFFF0) { /* [-16 .. -1] */
         op.data_.i = (uint32_t)v;
         /* relies on unsigned wraparound: 192 - (uint32_t)(-n) == 192 + n */
         op.setFixed(PhysReg{192 - (uint32_t)v});
      } else if (v == 0x3FE0000000000000) { /* 0.5 */
         op.data_.i = 0x3f000000;
         op.setFixed(PhysReg{240});
      } else if (v == 0xBFE0000000000000) { /* -0.5 */
         op.data_.i = 0xbf000000;
         op.setFixed(PhysReg{241});
      } else if (v == 0x3FF0000000000000) { /* 1.0 */
         op.data_.i = 0x3f800000;
         op.setFixed(PhysReg{242});
      } else if (v == 0xBFF0000000000000) { /* -1.0 */
         op.data_.i = 0xbf800000;
         op.setFixed(PhysReg{243});
      } else if (v == 0x4000000000000000) { /* 2.0 */
         op.data_.i = 0x40000000;
         op.setFixed(PhysReg{244});
      } else if (v == 0xC000000000000000) { /* -2.0 */
         op.data_.i = 0xc0000000;
         op.setFixed(PhysReg{245});
      } else if (v == 0x4010000000000000) { /* 4.0 */
         op.data_.i = 0x40800000;
         op.setFixed(PhysReg{246});
      } else if (v == 0xC010000000000000) { /* -4.0 */
         op.data_.i = 0xc0800000;
         op.setFixed(PhysReg{247});
      } else { /* Literal Constant: we don't know if it is a long or double.*/
         op.signext = v >> 63;
         op.data_.i = v & 0xffffffffu;
         op.setFixed(PhysReg{255});
         assert(op.constantValue64() == v &&
                "attempt to create a unrepresentable 64-bit literal constant");
      }
      return op;
   }

   /* 32-bit constant stored as a 32-bit or 64-bit operand */
   static Operand c32_or_c64(uint32_t v, bool is64bit) noexcept
   {
      Operand op;
      op.control_ = 0;
      op.data_.i = v;
      op.isConstant_ = true;
      op.constSize = is64bit ? 3 : 2;
      if (v <= 64)
         op.setFixed(PhysReg{128 + v});
      else if (v >= 0xFFFFFFF0) /* [-16 .. -1] */
         op.setFixed(PhysReg{192 - v});
      else if (v == 0x3f000000) /* 0.5 */
         op.setFixed(PhysReg{240});
      else if (v == 0xbf000000) /* -0.5 */
         op.setFixed(PhysReg{241});
      else if (v == 0x3f800000) /* 1.0 */
         op.setFixed(PhysReg{242});
      else if (v == 0xbf800000) /* -1.0 */
         op.setFixed(PhysReg{243});
      else if (v == 0x40000000) /* 2.0 */
         op.setFixed(PhysReg{244});
      else if (v == 0xc0000000) /* -2.0 */
         op.setFixed(PhysReg{245});
      else if (v == 0x40800000) /* 4.0 */
         op.setFixed(PhysReg{246});
      else if (v == 0xc0800000) /* -4.0 */
         op.setFixed(PhysReg{247});
      else { /* Literal Constant */
         assert(!is64bit && "attempt to create a 64-bit literal constant");
         op.setFixed(PhysReg{255});
      }
      return op;
   }

   /* Undefined operand of the given register class. */
   explicit Operand(RegClass type) noexcept
   {
      isUndef_ = true;
      data_.temp = Temp(0, type);
      setFixed(PhysReg{128});
   };
   /* Operand referring directly to a physical register (temp id 0). */
   explicit Operand(PhysReg reg, RegClass type) noexcept
   {
      data_.temp = Temp(0, type);
      setFixed(reg);
   }

   /* Zero constant of the requested size in bytes (1, 2, 4 or 8). */
   static Operand zero(unsigned bytes = 4) noexcept
   {
      if (bytes == 8)
         return Operand::c64(0);
      else if (bytes == 4)
         return Operand::c32(0);
      else if (bytes == 2)
         return Operand::c16(0);
      assert(bytes == 1);
      return Operand::c8(0);
   }

   /* This is useful over the constructors when you want to take a chip class
    * for 1/2 PI or an unknown operand size.
    */
   static Operand get_const(enum chip_class chip, uint64_t val, unsigned bytes)
   {
      if (val == 0x3e22f983 && bytes == 4 && chip >= GFX8) {
         /* 1/2 PI can be an inline constant on GFX8+ */
         Operand op = Operand::c32(val);
         op.setFixed(PhysReg{248});
         return op;
      }

      if (bytes == 8)
         return Operand::c64(val);
      else if (bytes == 4)
         return Operand::c32(val);
      else if (bytes == 2)
         return Operand::c16(val);
      assert(bytes == 1);
      return Operand::c8(val);
   }

   /* Whether val can be encoded as a constant of the given width; zext/sext
    * additionally allow representing it as a zero-/sign-extended 32-bit
    * literal. */
   static bool is_constant_representable(uint64_t val, unsigned bytes, bool zext = false,
                                         bool sext = false)
   {
      if (bytes <= 4)
         return true;

      if (zext && (val & 0xFFFFFFFF00000000) == 0x0000000000000000)
         return true;
      uint64_t upper33 = val & 0xFFFFFFFF80000000;
      if (sext && (upper33 == 0xFFFFFFFF80000000 || upper33 == 0))
         return true;

      return val >= 0xFFFFFFFFFFFFFFF0 || val <= 64 || /* [-16 .. 64] */
             val == 0x3FE0000000000000 ||              /* 0.5 */
             val == 0xBFE0000000000000 ||              /* -0.5 */
             val == 0x3FF0000000000000 ||              /* 1.0 */
             val == 0xBFF0000000000000 ||              /* -1.0 */
             val == 0x4000000000000000 ||              /* 2.0 */
             val == 0xC000000000000000 ||              /* -2.0 */
             val == 0x4010000000000000 ||              /* 4.0 */
             val == 0xC010000000000000;                /* -4.0 */
   }

   constexpr bool isTemp() const noexcept { return isTemp_; }

   constexpr void setTemp(Temp t) noexcept
   {
      assert(!isConstant_);
      isTemp_ = true;
      data_.temp = t;
   }

   constexpr Temp getTemp() const noexcept { return data_.temp; }

   constexpr uint32_t tempId() const noexcept { return data_.temp.id(); }

   /* only temporaries and undefined operands carry a register class */
   constexpr bool hasRegClass() const noexcept { return isTemp() || isUndefined(); }

   constexpr RegClass regClass() const noexcept { return data_.temp.regClass(); }

   /* operand size in bytes (constants: 1 << constSize) */
   constexpr unsigned bytes() const noexcept
   {
      if (isConstant())
         return 1 << constSize;
      else
         return data_.temp.bytes();
   }

   /* operand size in dwords */
   constexpr unsigned size() const noexcept
   {
      if (isConstant())
         return constSize > 2 ? 2 : 1;
      else
         return data_.temp.size();
   }

   constexpr bool isFixed() const noexcept { return isFixed_; }

   constexpr PhysReg physReg() const noexcept { return reg_; }

   /* NOTE(review): `reg != unsigned(-1)` can never be false because
    * PhysReg::reg_b is 16 bits wide, so isFixed_ is effectively always set
    * here — looks like a leftover from a wider PhysReg; confirm. */
   constexpr void setFixed(PhysReg reg) noexcept
   {
      isFixed_ = reg != unsigned(-1);
      reg_ = reg;
   }

   constexpr bool isConstant() const noexcept { return isConstant_; }

   /* a constant that has no inline encoding (fixed to register 255) */
   constexpr bool isLiteral() const noexcept { return isConstant() && reg_ == 255; }

   constexpr bool isUndefined() const noexcept { return isUndef_; }

   constexpr uint32_t constantValue() const noexcept { return data_.i; }

   constexpr bool constantEquals(uint32_t cmp) const noexcept
   {
      return isConstant() && constantValue() == cmp;
   }

   /* Reconstructs the full value of a constant; 64-bit operands are decoded
    * from the fixed inline-constant register (plus signext for literals). */
   constexpr uint64_t constantValue64() const noexcept
   {
      if (constSize == 3) {
         if (reg_ <= 192)
            return reg_ - 128;
         else if (reg_ <= 208)
            return 0xFFFFFFFFFFFFFFFF - (reg_ - 193);

         switch (reg_) {
         case 240: return 0x3FE0000000000000;
         case 241: return 0xBFE0000000000000;
         case 242: return 0x3FF0000000000000;
         case 243: return 0xBFF0000000000000;
         case 244: return 0x4000000000000000;
         case 245: return 0xC000000000000000;
         case 246: return 0x4010000000000000;
         case 247: return 0xC010000000000000;
         case 255:
            return (signext && (data_.i & 0x80000000u) ? 0xffffffff00000000ull : 0ull) | data_.i;
         }
         unreachable("invalid register for 64-bit constant");
      } else {
         return data_.i;
      }
   }

   constexpr bool isOfType(RegType type) const noexcept
   {
      return hasRegClass() && regClass().type() == type;
   }

   /* Indicates that the killed operand's live range intersects with the
    * instruction's definitions. Unlike isKill() and isFirstKill(), this is
    * not set by liveness analysis. */
   constexpr void setLateKill(bool flag) noexcept { isLateKill_ = flag; }

   constexpr bool isLateKill() const noexcept { return isLateKill_; }

   /* clearing the kill flag also clears first-kill */
   constexpr void setKill(bool flag) noexcept
   {
      isKill_ = flag;
      if (!flag)
         setFirstKill(false);
   }

   constexpr bool isKill() const noexcept { return isKill_ || isFirstKill(); }

   /* setting first-kill implies kill */
   constexpr void setFirstKill(bool flag) noexcept
   {
      isFirstKill_ = flag;
      if (flag)
         setKill(flag);
   }

   /* When there are multiple operands killing the same temporary,
    * isFirstKill() only returns true for the first one. */
   constexpr bool isFirstKill() const noexcept { return isFirstKill_; }

   constexpr bool isKillBeforeDef() const noexcept { return isKill() && !isLateKill(); }

   constexpr bool isFirstKillBeforeDef() const noexcept { return isFirstKill() && !isLateKill(); }

   constexpr bool operator==(Operand other) const noexcept
   {
      if (other.size() != size())
         return false;
      if (isFixed() != other.isFixed() || isKillBeforeDef() != other.isKillBeforeDef())
         return false;
      if (isFixed() && other.isFixed() && physReg() != other.physReg())
         return false;
      if (isLiteral())
         return other.isLiteral() && other.constantValue() == constantValue();
      else if (isConstant())
         return other.isConstant() && other.physReg() == physReg();
      else if (isUndefined())
         return other.isUndefined() && other.regClass() == regClass();
      else
         return other.isTemp() && other.getTemp() == getTemp();
   }

   constexpr bool operator!=(Operand other) const noexcept { return !operator==(other); }

   constexpr void set16bit(bool flag) noexcept { is16bit_ = flag; }

   constexpr bool is16bit() const noexcept { return is16bit_; }

   constexpr void set24bit(bool flag) noexcept { is24bit_ = flag; }

   constexpr bool is24bit() const noexcept { return is24bit_; }

private:
   union {
      Temp temp;
      uint32_t i;
      float f;
   } data_ = {Temp(0, s1)};
   PhysReg reg_;
   union {
      struct {
         uint8_t isTemp_ : 1;
         uint8_t isFixed_ : 1;
         uint8_t isConstant_ : 1;
         uint8_t isKill_ : 1;
         uint8_t isUndef_ : 1;
         uint8_t isFirstKill_ : 1;
         uint8_t constSize : 2;  /* log2 of the constant size in bytes */
         uint8_t isLateKill_ : 1;
         uint8_t is16bit_ : 1;
         uint8_t is24bit_ : 1;
         uint8_t signext : 1;    /* sign-extend a 32-bit literal to 64 bits */
      };
      /* can't initialize bit-fields in c++11, so work around using a union */
      uint16_t control_ = 0;
   };
};
886
/**
 * Definition Class
 * Definitions are the results of Instructions
 * and refer to temporary virtual registers
 * which are later mapped to physical registers
 */
class Definition final {
public:
   constexpr Definition()
       : temp(Temp(0, s1)), reg_(0), isFixed_(0), hasHint_(0), isKill_(0), isPrecise_(0), isNUW_(0),
         isNoCSE_(0)
   {}
   /* definition of a new temporary with the given SSA index */
   Definition(uint32_t index, RegClass type) noexcept : temp(index, type) {}
   explicit Definition(Temp tmp) noexcept : temp(tmp) {}
   /* fixed definition without a temporary (temp id 0) */
   Definition(PhysReg reg, RegClass type) noexcept : temp(Temp(0, type)) { setFixed(reg); }
   /* temporary pre-assigned to a physical register */
   Definition(uint32_t tmpId, PhysReg reg, RegClass type) noexcept : temp(Temp(tmpId, type))
   {
      setFixed(reg);
   }

   constexpr bool isTemp() const noexcept { return tempId() > 0; }

   constexpr Temp getTemp() const noexcept { return temp; }

   constexpr uint32_t tempId() const noexcept { return temp.id(); }

   constexpr void setTemp(Temp t) noexcept { temp = t; }

   void swapTemp(Definition& other) noexcept { std::swap(temp, other.temp); }

   constexpr RegClass regClass() const noexcept { return temp.regClass(); }

   constexpr unsigned bytes() const noexcept { return temp.bytes(); }

   constexpr unsigned size() const noexcept { return temp.size(); }

   constexpr bool isFixed() const noexcept { return isFixed_; }

   constexpr PhysReg physReg() const noexcept { return reg_; }

   constexpr void setFixed(PhysReg reg) noexcept
   {
      isFixed_ = 1;
      reg_ = reg;
   }

   /* Register-allocation hint. Note it shares reg_ with setFixed(), so a
    * later setFixed() overwrites the hinted register and vice versa. */
   constexpr void setHint(PhysReg reg) noexcept
   {
      hasHint_ = 1;
      reg_ = reg;
   }

   constexpr bool hasHint() const noexcept { return hasHint_; }

   constexpr void setKill(bool flag) noexcept { isKill_ = flag; }

   constexpr bool isKill() const noexcept { return isKill_; }

   /* NOTE(review): "precise" presumably forbids value-changing optimizations
    * on this result — confirm at the use sites. */
   constexpr void setPrecise(bool precise) noexcept { isPrecise_ = precise; }

   constexpr bool isPrecise() const noexcept { return isPrecise_; }

   /* No Unsigned Wrap */
   constexpr void setNUW(bool nuw) noexcept { isNUW_ = nuw; }

   constexpr bool isNUW() const noexcept { return isNUW_; }

   /* excludes this definition from common-subexpression elimination */
   constexpr void setNoCSE(bool noCSE) noexcept { isNoCSE_ = noCSE; }

   constexpr bool isNoCSE() const noexcept { return isNoCSE_; }

private:
   Temp temp = Temp(0, s1);
   PhysReg reg_;
   union {
      struct {
         uint8_t isFixed_ : 1;
         uint8_t hasHint_ : 1;
         uint8_t isKill_ : 1;
         uint8_t isPrecise_ : 1;
         uint8_t isNUW_ : 1;
         uint8_t isNoCSE_ : 1;
      };
      /* can't initialize bit-fields in c++11, so work around using a union */
      uint8_t control_ = 0;
   };
};
974
/* Forward declarations for Instruction and each encoding-specific
 * subclass (one per Format). */
struct Block;
struct Instruction;
struct Pseudo_instruction;
struct SOP1_instruction;
struct SOP2_instruction;
struct SOPK_instruction;
struct SOPP_instruction;
struct SOPC_instruction;
struct SMEM_instruction;
struct DS_instruction;
struct MTBUF_instruction;
struct MUBUF_instruction;
struct MIMG_instruction;
struct Export_instruction;
struct FLAT_instruction;
struct Pseudo_branch_instruction;
struct Pseudo_barrier_instruction;
struct Pseudo_reduction_instruction;
struct VOP3P_instruction;
struct VOP1_instruction;
struct VOP2_instruction;
struct VOPC_instruction;
struct VOP3_instruction;
struct Interp_instruction;
struct DPP_instruction;
struct SDWA_instruction;
1001
/* Base class for all IR instructions. Format-specific subclasses (below) only
 * append POD fields, which makes the unchecked pointer casts in the accessors
 * valid. Instances are created by create_instruction(), which places the
 * operand/definition arrays in trailing storage of the same allocation. */
struct Instruction {
   aco_opcode opcode;
   Format format;
   uint32_t pass_flags;

   /* Views over the trailing storage allocated by create_instruction(). */
   aco::span<Operand> operands;
   aco::span<Definition> definitions;

   constexpr bool usesModifiers() const noexcept;

   /* Whether any operand is fixed to the exec register. */
   constexpr bool reads_exec() const noexcept
   {
      for (const Operand& op : operands) {
         if (op.isFixed() && op.physReg() == exec)
            return true;
      }
      return false;
   }

   /* Checked downcasts to the per-format subclasses. Each accessor asserts
    * the matching is*() predicate before casting. */
   Pseudo_instruction& pseudo() noexcept
   {
      assert(isPseudo());
      return *(Pseudo_instruction*)this;
   }
   const Pseudo_instruction& pseudo() const noexcept
   {
      assert(isPseudo());
      return *(Pseudo_instruction*)this;
   }
   constexpr bool isPseudo() const noexcept { return format == Format::PSEUDO; }
   SOP1_instruction& sop1() noexcept
   {
      assert(isSOP1());
      return *(SOP1_instruction*)this;
   }
   const SOP1_instruction& sop1() const noexcept
   {
      assert(isSOP1());
      return *(SOP1_instruction*)this;
   }
   constexpr bool isSOP1() const noexcept { return format == Format::SOP1; }
   SOP2_instruction& sop2() noexcept
   {
      assert(isSOP2());
      return *(SOP2_instruction*)this;
   }
   const SOP2_instruction& sop2() const noexcept
   {
      assert(isSOP2());
      return *(SOP2_instruction*)this;
   }
   constexpr bool isSOP2() const noexcept { return format == Format::SOP2; }
   SOPK_instruction& sopk() noexcept
   {
      assert(isSOPK());
      return *(SOPK_instruction*)this;
   }
   const SOPK_instruction& sopk() const noexcept
   {
      assert(isSOPK());
      return *(SOPK_instruction*)this;
   }
   constexpr bool isSOPK() const noexcept { return format == Format::SOPK; }
   SOPP_instruction& sopp() noexcept
   {
      assert(isSOPP());
      return *(SOPP_instruction*)this;
   }
   const SOPP_instruction& sopp() const noexcept
   {
      assert(isSOPP());
      return *(SOPP_instruction*)this;
   }
   constexpr bool isSOPP() const noexcept { return format == Format::SOPP; }
   SOPC_instruction& sopc() noexcept
   {
      assert(isSOPC());
      return *(SOPC_instruction*)this;
   }
   const SOPC_instruction& sopc() const noexcept
   {
      assert(isSOPC());
      return *(SOPC_instruction*)this;
   }
   constexpr bool isSOPC() const noexcept { return format == Format::SOPC; }
   SMEM_instruction& smem() noexcept
   {
      assert(isSMEM());
      return *(SMEM_instruction*)this;
   }
   const SMEM_instruction& smem() const noexcept
   {
      assert(isSMEM());
      return *(SMEM_instruction*)this;
   }
   constexpr bool isSMEM() const noexcept { return format == Format::SMEM; }
   DS_instruction& ds() noexcept
   {
      assert(isDS());
      return *(DS_instruction*)this;
   }
   const DS_instruction& ds() const noexcept
   {
      assert(isDS());
      return *(DS_instruction*)this;
   }
   constexpr bool isDS() const noexcept { return format == Format::DS; }
   MTBUF_instruction& mtbuf() noexcept
   {
      assert(isMTBUF());
      return *(MTBUF_instruction*)this;
   }
   const MTBUF_instruction& mtbuf() const noexcept
   {
      assert(isMTBUF());
      return *(MTBUF_instruction*)this;
   }
   constexpr bool isMTBUF() const noexcept { return format == Format::MTBUF; }
   MUBUF_instruction& mubuf() noexcept
   {
      assert(isMUBUF());
      return *(MUBUF_instruction*)this;
   }
   const MUBUF_instruction& mubuf() const noexcept
   {
      assert(isMUBUF());
      return *(MUBUF_instruction*)this;
   }
   constexpr bool isMUBUF() const noexcept { return format == Format::MUBUF; }
   MIMG_instruction& mimg() noexcept
   {
      assert(isMIMG());
      return *(MIMG_instruction*)this;
   }
   const MIMG_instruction& mimg() const noexcept
   {
      assert(isMIMG());
      return *(MIMG_instruction*)this;
   }
   constexpr bool isMIMG() const noexcept { return format == Format::MIMG; }
   Export_instruction& exp() noexcept
   {
      assert(isEXP());
      return *(Export_instruction*)this;
   }
   const Export_instruction& exp() const noexcept
   {
      assert(isEXP());
      return *(Export_instruction*)this;
   }
   constexpr bool isEXP() const noexcept { return format == Format::EXP; }
   /* FLAT, GLOBAL and SCRATCH all share the FLAT_instruction layout. */
   FLAT_instruction& flat() noexcept
   {
      assert(isFlat());
      return *(FLAT_instruction*)this;
   }
   const FLAT_instruction& flat() const noexcept
   {
      assert(isFlat());
      return *(FLAT_instruction*)this;
   }
   constexpr bool isFlat() const noexcept { return format == Format::FLAT; }
   FLAT_instruction& global() noexcept
   {
      assert(isGlobal());
      return *(FLAT_instruction*)this;
   }
   const FLAT_instruction& global() const noexcept
   {
      assert(isGlobal());
      return *(FLAT_instruction*)this;
   }
   constexpr bool isGlobal() const noexcept { return format == Format::GLOBAL; }
   FLAT_instruction& scratch() noexcept
   {
      assert(isScratch());
      return *(FLAT_instruction*)this;
   }
   const FLAT_instruction& scratch() const noexcept
   {
      assert(isScratch());
      return *(FLAT_instruction*)this;
   }
   constexpr bool isScratch() const noexcept { return format == Format::SCRATCH; }
   Pseudo_branch_instruction& branch() noexcept
   {
      assert(isBranch());
      return *(Pseudo_branch_instruction*)this;
   }
   const Pseudo_branch_instruction& branch() const noexcept
   {
      assert(isBranch());
      return *(Pseudo_branch_instruction*)this;
   }
   constexpr bool isBranch() const noexcept { return format == Format::PSEUDO_BRANCH; }
   Pseudo_barrier_instruction& barrier() noexcept
   {
      assert(isBarrier());
      return *(Pseudo_barrier_instruction*)this;
   }
   const Pseudo_barrier_instruction& barrier() const noexcept
   {
      assert(isBarrier());
      return *(Pseudo_barrier_instruction*)this;
   }
   constexpr bool isBarrier() const noexcept { return format == Format::PSEUDO_BARRIER; }
   Pseudo_reduction_instruction& reduction() noexcept
   {
      assert(isReduction());
      return *(Pseudo_reduction_instruction*)this;
   }
   const Pseudo_reduction_instruction& reduction() const noexcept
   {
      assert(isReduction());
      return *(Pseudo_reduction_instruction*)this;
   }
   constexpr bool isReduction() const noexcept { return format == Format::PSEUDO_REDUCTION; }
   VOP3P_instruction& vop3p() noexcept
   {
      assert(isVOP3P());
      return *(VOP3P_instruction*)this;
   }
   const VOP3P_instruction& vop3p() const noexcept
   {
      assert(isVOP3P());
      return *(VOP3P_instruction*)this;
   }
   constexpr bool isVOP3P() const noexcept { return format == Format::VOP3P; }
   /* The predicates below test a single format *bit* instead of comparing for
    * equality, because these encodings can be combined — e.g. VOP2 | VOP3 is
    * a VOP2 instruction in VOP3 encoding (see the Format documentation). */
   VOP1_instruction& vop1() noexcept
   {
      assert(isVOP1());
      return *(VOP1_instruction*)this;
   }
   const VOP1_instruction& vop1() const noexcept
   {
      assert(isVOP1());
      return *(VOP1_instruction*)this;
   }
   constexpr bool isVOP1() const noexcept { return (uint16_t)format & (uint16_t)Format::VOP1; }
   VOP2_instruction& vop2() noexcept
   {
      assert(isVOP2());
      return *(VOP2_instruction*)this;
   }
   const VOP2_instruction& vop2() const noexcept
   {
      assert(isVOP2());
      return *(VOP2_instruction*)this;
   }
   constexpr bool isVOP2() const noexcept { return (uint16_t)format & (uint16_t)Format::VOP2; }
   VOPC_instruction& vopc() noexcept
   {
      assert(isVOPC());
      return *(VOPC_instruction*)this;
   }
   const VOPC_instruction& vopc() const noexcept
   {
      assert(isVOPC());
      return *(VOPC_instruction*)this;
   }
   constexpr bool isVOPC() const noexcept { return (uint16_t)format & (uint16_t)Format::VOPC; }
   VOP3_instruction& vop3() noexcept
   {
      assert(isVOP3());
      return *(VOP3_instruction*)this;
   }
   const VOP3_instruction& vop3() const noexcept
   {
      assert(isVOP3());
      return *(VOP3_instruction*)this;
   }
   constexpr bool isVOP3() const noexcept { return (uint16_t)format & (uint16_t)Format::VOP3; }
   Interp_instruction& vintrp() noexcept
   {
      assert(isVINTRP());
      return *(Interp_instruction*)this;
   }
   const Interp_instruction& vintrp() const noexcept
   {
      assert(isVINTRP());
      return *(Interp_instruction*)this;
   }
   constexpr bool isVINTRP() const noexcept { return (uint16_t)format & (uint16_t)Format::VINTRP; }
   DPP_instruction& dpp() noexcept
   {
      assert(isDPP());
      return *(DPP_instruction*)this;
   }
   const DPP_instruction& dpp() const noexcept
   {
      assert(isDPP());
      return *(DPP_instruction*)this;
   }
   constexpr bool isDPP() const noexcept { return (uint16_t)format & (uint16_t)Format::DPP; }
   SDWA_instruction& sdwa() noexcept
   {
      assert(isSDWA());
      return *(SDWA_instruction*)this;
   }
   const SDWA_instruction& sdwa() const noexcept
   {
      assert(isSDWA());
      return *(SDWA_instruction*)this;
   }
   constexpr bool isSDWA() const noexcept { return (uint16_t)format & (uint16_t)Format::SDWA; }

   /* NOTE(review): unlike the accessors above, flatlike() does not assert
    * isFlatLike() — callers are responsible for checking the format. */
   FLAT_instruction& flatlike() { return *(FLAT_instruction*)this; }

   const FLAT_instruction& flatlike() const { return *(FLAT_instruction*)this; }

   constexpr bool isFlatLike() const noexcept { return isFlat() || isGlobal() || isScratch(); }

   /* Any vector-ALU encoding (bit tests, so combined encodings count too). */
   constexpr bool isVALU() const noexcept
   {
      return isVOP1() || isVOP2() || isVOPC() || isVOP3() || isVOP3P();
   }

   /* Any scalar-ALU encoding. */
   constexpr bool isSALU() const noexcept
   {
      return isSOP1() || isSOP2() || isSOPC() || isSOPK() || isSOPP();
   }

   /* Buffer/image vector-memory encodings (FLAT-like are separate). */
   constexpr bool isVMEM() const noexcept { return isMTBUF() || isMUBUF() || isMIMG(); }
};
static_assert(sizeof(Instruction) == 16, "Unexpected padding");
1327
/* Scalar ALU instruction with a 16-bit inline constant. */
struct SOPK_instruction : public Instruction {
   uint16_t imm;
   uint16_t padding;
};
static_assert(sizeof(SOPK_instruction) == sizeof(Instruction) + 4, "Unexpected padding");
1333
/* Scalar program-control instruction with a 32-bit immediate. */
struct SOPP_instruction : public Instruction {
   uint32_t imm;
   /* Target block index — presumably only meaningful for branch opcodes;
    * confirm against the users of this field. */
   int block;
};
static_assert(sizeof(SOPP_instruction) == sizeof(Instruction) + 8, "Unexpected padding");
1339
/* Scalar compare instruction; no format-specific fields. */
struct SOPC_instruction : public Instruction {};
static_assert(sizeof(SOPC_instruction) == sizeof(Instruction) + 0, "Unexpected padding");
1342
/* Scalar one-source ALU instruction; no format-specific fields. */
struct SOP1_instruction : public Instruction {};
static_assert(sizeof(SOP1_instruction) == sizeof(Instruction) + 0, "Unexpected padding");
1345
/* Scalar two-source ALU instruction; no format-specific fields. */
struct SOP2_instruction : public Instruction {};
static_assert(sizeof(SOP2_instruction) == sizeof(Instruction) + 0, "Unexpected padding");
1348
/**
 * Scalar Memory Format:
 * For s_(buffer_)load_dword*:
 * Operand(0): SBASE - SGPR-pair which provides base address
 * Operand(1): Offset - immediate (un)signed offset or SGPR
 * Operand(2) / Definition(0): SDATA - SGPR for read / write result
 * Operand(n-1): SOffset - SGPR offset (Vega only)
 *
 * Having no operands is also valid for instructions such as s_dcache_inv.
 *
 */
struct SMEM_instruction : public Instruction {
   memory_sync_info sync;
   bool glc : 1;              /* VI+: globally coherent */
   bool dlc : 1;              /* NAVI: device level coherent */
   bool nv : 1;               /* VEGA only: Non-volatile */
   bool disable_wqm : 1;
   bool prevent_overflow : 1; /* avoid overflow when combining additions */
   uint8_t padding : 3;
};
static_assert(sizeof(SMEM_instruction) == sizeof(Instruction) + 4, "Unexpected padding");
1370
/* Vector one-source ALU instruction; no format-specific fields. */
struct VOP1_instruction : public Instruction {};
static_assert(sizeof(VOP1_instruction) == sizeof(Instruction) + 0, "Unexpected padding");
1373
/* Vector two-source ALU instruction; no format-specific fields. */
struct VOP2_instruction : public Instruction {};
static_assert(sizeof(VOP2_instruction) == sizeof(Instruction) + 0, "Unexpected padding");
1376
/* Vector compare instruction; no format-specific fields. */
struct VOPC_instruction : public Instruction {};
static_assert(sizeof(VOPC_instruction) == sizeof(Instruction) + 0, "Unexpected padding");
1379
/* Three-address VALU encoding with source/destination modifiers. */
struct VOP3_instruction : public Instruction {
   bool abs[3];        /* per-source absolute-value modifier */
   bool neg[3];        /* per-source negation modifier */
   uint8_t opsel : 4;  /* half-register (16-bit) operand selects */
   uint8_t omod : 2;   /* output modifier — presumably *2/*4/÷2; confirm with ISA docs */
   bool clamp : 1;
   uint8_t padding0 : 1;
   uint8_t padding1;
};
static_assert(sizeof(VOP3_instruction) == sizeof(Instruction) + 8, "Unexpected padding");
1390
/* Packed-math VALU encoding operating on 16-bit lanes (lo/hi halves). */
struct VOP3P_instruction : public Instruction {
   bool neg_lo[3];        /* negation of the low halves, per source */
   bool neg_hi[3];        /* negation of the high halves, per source */
   uint8_t opsel_lo : 3;  /* which half feeds each low-lane source */
   uint8_t opsel_hi : 3;  /* which half feeds each high-lane source */
   bool clamp : 1;
   uint8_t padding0 : 1;
   uint8_t padding1;
};
static_assert(sizeof(VOP3P_instruction) == sizeof(Instruction) + 8, "Unexpected padding");
1401
/**
 * Data Parallel Primitives Format:
 * This format can be used for VOP1, VOP2 or VOPC instructions.
 * The swizzle applies to the src0 operand.
 *
 */
struct DPP_instruction : public Instruction {
   bool abs[2];
   bool neg[2];
   uint16_t dpp_ctrl;       /* hardware lane-swizzle control code */
   uint8_t row_mask : 4;
   uint8_t bank_mask : 4;
   bool bound_ctrl : 1;
   uint8_t padding : 7;
};
static_assert(sizeof(DPP_instruction) == sizeof(Instruction) + 8, "Unexpected padding");
1418
/* Describes a sub-dword byte/word selection for SDWA.
 *
 * The encoding packs three fields into one byte:
 *   bits [1:0]  byte offset within the dword
 *   bits [4:2]  selection size in bytes (1, 2 or 4)
 *   bit  [5]    sign-extend
 * A zero value means "no selection" (operator bool() is false).
 *
 * Change vs. previous revision: the default constructor and the explicit
 * bool conversion are now constexpr, matching every other member so a
 * SubdwordSel can be fully used in constant expressions. */
struct SubdwordSel {
   enum sdwa_sel : uint8_t {
      ubyte = 0x4,
      uword = 0x8,
      dword = 0x10,
      sext = 0x20,
      sbyte = ubyte | sext,
      sword = uword | sext,

      ubyte0 = ubyte,
      ubyte1 = ubyte | 1,
      ubyte2 = ubyte | 2,
      ubyte3 = ubyte | 3,
      sbyte0 = sbyte,
      sbyte1 = sbyte | 1,
      sbyte2 = sbyte | 2,
      sbyte3 = sbyte | 3,
      uword0 = uword,
      uword1 = uword | 2,
      sword0 = sword,
      sword1 = sword | 2,
   };

   constexpr SubdwordSel() : sel((sdwa_sel)0) {}
   constexpr SubdwordSel(sdwa_sel sel_) : sel(sel_) {}
   /* Build a selection from its components; size in bytes, offset in bytes. */
   constexpr SubdwordSel(unsigned size, unsigned offset, bool sign_extend)
       : sel((sdwa_sel)((sign_extend ? sext : 0) | size << 2 | offset))
   {}
   constexpr operator sdwa_sel() const { return sel; }
   constexpr explicit operator bool() const { return sel != 0; }

   /* Selection size in bytes (1, 2 or 4). */
   constexpr unsigned size() const { return (sel >> 2) & 0x7; }
   /* Byte offset of the selection within the dword. */
   constexpr unsigned offset() const { return sel & 0x3; }
   constexpr bool sign_extend() const { return sel & sext; }
   /* Translate to the hardware SDWA sel field: 0-3 select a byte,
    * 4-5 select a word, 6 selects the full dword. */
   constexpr unsigned to_sdwa_sel(unsigned reg_byte_offset) const
   {
      reg_byte_offset += offset();
      if (size() == 1)
         return reg_byte_offset;
      else if (size() == 2)
         return 4 + (reg_byte_offset >> 1);
      else
         return 6;
   }

private:
   sdwa_sel sel;
};
1467
/**
 * Sub-Dword Addressing Format:
 * This format can be used for VOP1, VOP2 or VOPC instructions.
 *
 * omod and SGPR/constant operands are only available on GFX9+. For VOPC,
 * the definition doesn't have to be VCC on GFX9+.
 *
 */
struct SDWA_instruction : public Instruction {
   /* these destination modifiers aren't available with VOPC except for
    * clamp on GFX8 */
   SubdwordSel sel[2];   /* per-source byte/word selection */
   SubdwordSel dst_sel;  /* destination byte/word selection */
   bool neg[2];
   bool abs[2];
   bool clamp : 1;
   uint8_t omod : 2; /* GFX9+ */
   uint8_t padding : 5;
};
static_assert(sizeof(SDWA_instruction) == sizeof(Instruction) + 8, "Unexpected padding");
1488
/* VINTRP: fragment attribute interpolation. */
struct Interp_instruction : public Instruction {
   uint8_t attribute; /* attribute slot to interpolate */
   uint8_t component; /* component (x/y/z/w) within the attribute */
   uint16_t padding;
};
static_assert(sizeof(Interp_instruction) == sizeof(Instruction) + 4, "Unexpected padding");
1495
/**
 * Local and Global Data Sharing instructions
 * Operand(0): ADDR - VGPR which supplies the address.
 * Operand(1): DATA0 - First data VGPR.
 * Operand(2): DATA1 - Second data VGPR.
 * Operand(n-1): M0 - LDS size.
 * Definition(0): VDST - Destination VGPR when results returned to VGPRs.
 *
 */
struct DS_instruction : public Instruction {
   memory_sync_info sync;
   bool gds;       /* operate on GDS instead of LDS */
   int16_t offset0;
   int8_t offset1;
   uint8_t padding;
};
static_assert(sizeof(DS_instruction) == sizeof(Instruction) + 8, "Unexpected padding");
1513
/**
 * Vector Memory Untyped-buffer Instructions
 * Operand(0): SRSRC - Specifies which SGPR supplies T# (resource constant)
 * Operand(1): VADDR - Address source. Can carry an index and/or offset
 * Operand(2): SOFFSET - SGPR to supply unsigned byte offset. (SGPR, M0, or inline constant)
 * Operand(3) / Definition(0): VDATA - Vector GPR for write result / read data
 *
 */
struct MUBUF_instruction : public Instruction {
   memory_sync_info sync;
   /* Bit-packed flags; the exact packing is locked in by the static_assert
    * below — do not reorder. */
   bool offen : 1;            /* Supply an offset from VGPR (VADDR) */
   bool idxen : 1;            /* Supply an index from VGPR (VADDR) */
   bool addr64 : 1;           /* SI, CIK: Address size is 64-bit */
   bool glc : 1;              /* globally coherent */
   bool dlc : 1;              /* NAVI: device level coherent */
   bool slc : 1;              /* system level coherent */
   bool tfe : 1;              /* texture fail enable */
   bool lds : 1;              /* Return read-data to LDS instead of VGPRs */
   uint16_t disable_wqm : 1;  /* Require an exec mask without helper invocations */
   uint16_t offset : 12;      /* Unsigned byte offset - 12 bit */
   uint16_t swizzled : 1;
   uint16_t padding0 : 2;
   uint16_t vtx_binding : 6;  /* 0 if this is not a vertex attribute load */
   uint16_t padding1 : 10;
};
static_assert(sizeof(MUBUF_instruction) == sizeof(Instruction) + 8, "Unexpected padding");
1540
/**
 * Vector Memory Typed-buffer Instructions
 * Operand(0): SRSRC - Specifies which SGPR supplies T# (resource constant)
 * Operand(1): VADDR - Address source. Can carry an index and/or offset
 * Operand(2): SOFFSET - SGPR to supply unsigned byte offset. (SGPR, M0, or inline constant)
 * Operand(3) / Definition(0): VDATA - Vector GPR for write result / read data
 *
 */
struct MTBUF_instruction : public Instruction {
   memory_sync_info sync;
   uint8_t dfmt : 4;          /* Data Format of data in memory buffer */
   uint8_t nfmt : 3;          /* Numeric format of data in memory */
   bool offen : 1;            /* Supply an offset from VGPR (VADDR) */
   uint16_t idxen : 1;        /* Supply an index from VGPR (VADDR) */
   uint16_t glc : 1;          /* globally coherent */
   uint16_t dlc : 1;          /* NAVI: device level coherent */
   uint16_t slc : 1;          /* system level coherent */
   uint16_t tfe : 1;          /* texture fail enable */
   uint16_t disable_wqm : 1;  /* Require an exec mask without helper invocations */
   uint16_t vtx_binding : 6;  /* 0 if this is not a vertex attribute load */
   uint16_t padding : 4;
   /* NOTE(review): stored in a full uint16_t although the comment says only
    * 12 bits are encoded — confirm callers keep it within range. */
   uint16_t offset; /* Unsigned byte offset - 12 bit */
};
static_assert(sizeof(MTBUF_instruction) == sizeof(Instruction) + 8, "Unexpected padding");
1565
/**
 * Vector Memory Image Instructions
 * Operand(0) SRSRC - Scalar GPR that specifies the resource constant.
 * Operand(1): SSAMP - Scalar GPR that specifies sampler constant.
 * Operand(2): VDATA - Vector GPR for write data or zero if TFE/LWE=1.
 * Operand(3): VADDR - Address source. Can carry an offset or an index.
 * Definition(0): VDATA - Vector GPR for read result.
 *
 */
struct MIMG_instruction : public Instruction {
   memory_sync_info sync;
   uint8_t dmask;        /* Data VGPR enable mask */
   uint8_t dim : 3;      /* NAVI: dimensionality */
   bool unrm : 1;        /* Force address to be un-normalized */
   bool dlc : 1;         /* NAVI: device level coherent */
   bool glc : 1;         /* globally coherent */
   bool slc : 1;         /* system level coherent */
   bool tfe : 1;         /* texture fail enable */
   bool da : 1;          /* declare an array */
   bool lwe : 1;         /* LOD warning enable */
   bool r128 : 1;        /* NAVI: Texture resource size */
   bool a16 : 1;         /* VEGA, NAVI: Address components are 16-bits */
   bool d16 : 1;         /* Convert 32-bit data to 16-bit data */
   bool disable_wqm : 1; /* Require an exec mask without helper invocations */
   uint8_t padding0 : 2;
   uint8_t padding1;
   uint8_t padding2;
};
static_assert(sizeof(MIMG_instruction) == sizeof(Instruction) + 8, "Unexpected padding");
1595
/**
 * Flat/Scratch/Global Instructions
 * Operand(0): ADDR
 * Operand(1): SADDR
 * Operand(2) / Definition(0): DATA/VDST
 *
 * Shared by the FLAT, GLOBAL and SCRATCH formats (see Instruction::flatlike()).
 */
struct FLAT_instruction : public Instruction {
   memory_sync_info sync;
   bool slc : 1;         /* system level coherent */
   bool glc : 1;         /* globally coherent */
   bool dlc : 1;         /* NAVI: device level coherent */
   bool lds : 1;
   bool nv : 1;
   bool disable_wqm : 1; /* Require an exec mask without helper invocations */
   uint8_t padding0 : 2;
   uint16_t offset;      /* Vega/Navi only */
   uint16_t padding1;
};
static_assert(sizeof(FLAT_instruction) == sizeof(Instruction) + 8, "Unexpected padding");
1616
/* EXP: export of shader outputs (color, position, parameters, ...). */
struct Export_instruction : public Instruction {
   uint8_t enabled_mask;  /* which of the four components are written */
   uint8_t dest;          /* hardware export target */
   bool compressed : 1;   /* data is 16-bit compressed */
   bool done : 1;         /* last export of the program for this target type */
   bool valid_mask : 1;
   uint8_t padding0 : 5;
   uint8_t padding1;
};
static_assert(sizeof(Export_instruction) == sizeof(Instruction) + 4, "Unexpected padding");
1627
/* Generic pseudo instruction, lowered to real instructions later. */
struct Pseudo_instruction : public Instruction {
   PhysReg scratch_sgpr; /* might not be valid if it's not needed */
   bool tmp_in_scc;      /* whether lowering clobbers/uses scc */
   uint8_t padding;
};
static_assert(sizeof(Pseudo_instruction) == sizeof(Instruction) + 4, "Unexpected padding");
1634
/* Branch pseudo instruction, lowered to real branches late. */
struct Pseudo_branch_instruction : public Instruction {
   /* target[0] is the block index of the branch target.
    * For conditional branches, target[1] contains the fall-through alternative.
    * A value of 0 means the target has not been initialized (BB0 cannot be a branch target).
    */
   uint32_t target[2];
};
static_assert(sizeof(Pseudo_branch_instruction) == sizeof(Instruction) + 8, "Unexpected padding");
1643
/* Barrier pseudo instruction: memory semantics in sync, execution
 * synchronization scope in exec_scope. */
struct Pseudo_barrier_instruction : public Instruction {
   memory_sync_info sync;
   sync_scope exec_scope;
};
static_assert(sizeof(Pseudo_barrier_instruction) == sizeof(Instruction) + 4, "Unexpected padding");
1649
/* Reduction operations; the numeric suffix is the element bit width. */
enum ReduceOp : uint16_t {
   // clang-format off
   iadd8, iadd16, iadd32, iadd64,
   imul8, imul16, imul32, imul64,
   fadd16, fadd32, fadd64,
   fmul16, fmul32, fmul64,
   imin8, imin16, imin32, imin64,
   imax8, imax16, imax32, imax64,
   umin8, umin16, umin32, umin64,
   umax8, umax16, umax32, umax64,
   fmin16, fmin32, fmin64,
   fmax16, fmax32, fmax64,
   iand8, iand16, iand32, iand64,
   ior8, ior16, ior32, ior64,
   ixor8, ixor16, ixor32, ixor64,
   num_reduce_ops,
   // clang-format on
};
1668
/**
 * Subgroup Reduction Instructions, everything except for the data to be
 * reduced and the result as inserted by setup_reduce_temp().
 * Operand(0): data to be reduced
 * Operand(1): reduce temporary
 * Operand(2): vector temporary
 * Definition(0): result
 * Definition(1): scalar temporary
 * Definition(2): scalar identity temporary (not used to store identity on GFX10)
 * Definition(3): scc clobber
 * Definition(4): vcc clobber
 *
 */
struct Pseudo_reduction_instruction : public Instruction {
   ReduceOp reduce_op;
   uint16_t cluster_size; // must be 0 for scans
};
static_assert(sizeof(Pseudo_reduction_instruction) == sizeof(Instruction) + 4,
              "Unexpected padding");
1688
/* Deleter for aco_ptr: instructions are allocated with calloc() in
 * create_instruction(), so they must be released with free(), not delete. */
struct instr_deleter_functor {
   void operator()(void* p) { free(p); }
};

/* Owning pointer type for instructions. */
template <typename T> using aco_ptr = std::unique_ptr<T, instr_deleter_functor>;
1694
/* Allocate and initialize an instruction of format-subclass T together with
 * its operand and definition arrays in one zero-initialized chunk:
 * [T | Operands... | Definitions...]. Released via free() (see aco_ptr). */
template <typename T>
T*
create_instruction(aco_opcode opcode, Format format, uint32_t num_operands,
                   uint32_t num_definitions)
{
   std::size_t size =
      sizeof(T) + num_operands * sizeof(Operand) + num_definitions * sizeof(Definition);
   /* calloc zero-initializes all format-specific fields and paddings.
    * NOTE(review): the allocation result is not checked; a null return would
    * crash on the stores below. */
   char* data = (char*)calloc(1, size);
   T* inst = (T*)data;

   inst->opcode = opcode;
   inst->format = format;

   /* The spans store (offset, length) relative to the address of the span
    * member itself, so compute each offset into the trailing storage. */
   uint16_t operands_offset = data + sizeof(T) - (char*)&inst->operands;
   inst->operands = aco::span<Operand>(operands_offset, num_operands);
   uint16_t definitions_offset = (char*)inst->operands.end() - (char*)&inst->definitions;
   inst->definitions = aco::span<Definition>(definitions_offset, num_definitions);

   return inst;
}
1715
/* Returns true if the instruction carries any VALU modifier (DPP/SDWA
 * encoding, or any abs/neg/opsel/omod/clamp bit in VOP3/VOP3P). Indexing of
 * the fixed-size modifier arrays assumes at most 3 operands for VOP3/VOP3P. */
constexpr bool
Instruction::usesModifiers() const noexcept
{
   if (isDPP() || isSDWA())
      return true;

   if (isVOP3P()) {
      const VOP3P_instruction& vop3p = this->vop3p();
      for (unsigned i = 0; i < operands.size(); i++) {
         if (vop3p.neg_lo[i] || vop3p.neg_hi[i])
            return true;

         /* opsel_hi must be 1 to not be considered a modifier - even for constants */
         if (!(vop3p.opsel_hi & (1 << i)))
            return true;
      }
      return vop3p.opsel_lo || vop3p.clamp;
   } else if (isVOP3()) {
      const VOP3_instruction& vop3 = this->vop3();
      for (unsigned i = 0; i < operands.size(); i++) {
         if (vop3.abs[i] || vop3.neg[i])
            return true;
      }
      return vop3.opsel || vop3.clamp || vop3.omod;
   }
   return false;
}
1743
/* Whether the instruction is a phi (logical or linear). */
constexpr bool
is_phi(Instruction* instr)
{
   return instr->opcode == aco_opcode::p_phi || instr->opcode == aco_opcode::p_linear_phi;
}
1749
/* Convenience overload for owning pointers. */
static inline bool
is_phi(aco_ptr<Instruction>& instr)
{
   return is_phi(instr.get());
}
1755
1756 memory_sync_info get_sync_info(const Instruction* instr);
1757
1758 bool is_dead(const std::vector<uint16_t>& uses, Instruction* instr);
1759
1760 bool can_use_opsel(chip_class chip, aco_opcode op, int idx, bool high);
1761 bool instr_is_16bit(chip_class chip, aco_opcode op);
1762 bool can_use_SDWA(chip_class chip, const aco_ptr<Instruction>& instr, bool pre_ra);
1763 bool can_use_DPP(const aco_ptr<Instruction>& instr, bool pre_ra);
1764 /* updates "instr" and returns the old instruction (or NULL if no update was needed) */
1765 aco_ptr<Instruction> convert_to_SDWA(chip_class chip, aco_ptr<Instruction>& instr);
1766 aco_ptr<Instruction> convert_to_DPP(aco_ptr<Instruction>& instr);
1767 bool needs_exec_mask(const Instruction* instr);
1768
1769 aco_opcode get_ordered(aco_opcode op);
1770 aco_opcode get_unordered(aco_opcode op);
1771 aco_opcode get_inverse(aco_opcode op);
1772 aco_opcode get_f32_cmp(aco_opcode op);
1773 unsigned get_cmp_bitsize(aco_opcode op);
1774 bool is_cmp(aco_opcode op);
1775
1776 bool can_swap_operands(aco_ptr<Instruction>& instr, aco_opcode* new_op);
1777
1778 uint32_t get_reduction_identity(ReduceOp op, unsigned idx);
1779
1780 unsigned get_mimg_nsa_dwords(const Instruction* instr);
1781
1782 bool should_form_clause(const Instruction* a, const Instruction* b);
1783
/* Bit flags describing a basic block's role in the CFG; combined into
 * Block::kind. */
enum block_kind {
   /* uniform indicates that leaving this block,
    * all actives lanes stay active */
   block_kind_uniform = 1 << 0,
   block_kind_top_level = 1 << 1,
   block_kind_loop_preheader = 1 << 2,
   block_kind_loop_header = 1 << 3,
   block_kind_loop_exit = 1 << 4,
   block_kind_continue = 1 << 5,
   block_kind_break = 1 << 6,
   block_kind_continue_or_break = 1 << 7,
   block_kind_discard = 1 << 8,
   block_kind_branch = 1 << 9,
   block_kind_merge = 1 << 10,
   block_kind_invert = 1 << 11,
   block_kind_uses_discard_if = 1 << 12,
   block_kind_needs_lowering = 1 << 13,
   block_kind_uses_demote = 1 << 14,
   block_kind_export_end = 1 << 15,
};
1804
/* Tracks live register pressure as a (vgpr, sgpr) pair, measured in the
 * allocation units returned by Temp::size(). */
struct RegisterDemand {
   constexpr RegisterDemand() = default;
   constexpr RegisterDemand(const int16_t v, const int16_t s) noexcept : vgpr{v}, sgpr{s} {}
   int16_t vgpr = 0;
   int16_t sgpr = 0;

   constexpr friend bool operator==(const RegisterDemand a, const RegisterDemand b) noexcept
   {
      return a.vgpr == b.vgpr && a.sgpr == b.sgpr;
   }

   /* True if either component is larger than the corresponding limit. */
   constexpr bool exceeds(const RegisterDemand other) const noexcept
   {
      return vgpr > other.vgpr || sgpr > other.sgpr;
   }

   /* Add a temporary's size to the matching component. */
   constexpr RegisterDemand operator+(const Temp t) const noexcept
   {
      if (t.type() == RegType::sgpr)
         return RegisterDemand(vgpr, sgpr + t.size());
      else
         return RegisterDemand(vgpr + t.size(), sgpr);
   }

   constexpr RegisterDemand operator+(const RegisterDemand other) const noexcept
   {
      return RegisterDemand(vgpr + other.vgpr, sgpr + other.sgpr);
   }

   constexpr RegisterDemand operator-(const RegisterDemand other) const noexcept
   {
      return RegisterDemand(vgpr - other.vgpr, sgpr - other.sgpr);
   }

   constexpr RegisterDemand& operator+=(const RegisterDemand other) noexcept
   {
      vgpr += other.vgpr;
      sgpr += other.sgpr;
      return *this;
   }

   constexpr RegisterDemand& operator-=(const RegisterDemand other) noexcept
   {
      vgpr -= other.vgpr;
      sgpr -= other.sgpr;
      return *this;
   }

   constexpr RegisterDemand& operator+=(const Temp t) noexcept
   {
      if (t.type() == RegType::sgpr)
         sgpr += t.size();
      else
         vgpr += t.size();
      return *this;
   }

   constexpr RegisterDemand& operator-=(const Temp t) noexcept
   {
      if (t.type() == RegType::sgpr)
         sgpr -= t.size();
      else
         vgpr -= t.size();
      return *this;
   }

   /* Component-wise maximum with another demand. */
   constexpr void update(const RegisterDemand other) noexcept
   {
      vgpr = std::max(vgpr, other.vgpr);
      sgpr = std::max(sgpr, other.sgpr);
   }
};
1877
/* CFG */
/* A single basic block. Predecessor/successor edges are kept separately for
 * two CFGs: the "logical" one and the "linear" one (see the block index
 * vectors below). */
struct Block {
   float_mode fp_mode; /* FP mode for instructions in this block; set from Program::next_fp_mode by insert_block() */
   unsigned index;     /* position of this block within Program::blocks */
   unsigned offset = 0; /* code offset once emitted — unit (bytes vs dwords) TODO confirm */
   std::vector<aco_ptr<Instruction>> instructions;
   std::vector<unsigned> logical_preds; /* indices of logical-CFG predecessor blocks */
   std::vector<unsigned> linear_preds;  /* indices of linear-CFG predecessor blocks */
   std::vector<unsigned> logical_succs; /* indices of logical-CFG successor blocks */
   std::vector<unsigned> linear_succs;  /* indices of linear-CFG successor blocks */
   RegisterDemand register_demand = RegisterDemand(); /* register demand of this block */
   uint16_t loop_nest_depth = 0;
   uint16_t divergent_if_logical_depth = 0;
   uint16_t uniform_if_depth = 0;
   uint16_t kind = 0; /* presumably a bitfield of block-kind flags — defined elsewhere; verify */
   int logical_idom = -1; /* immediate dominator in the logical CFG; -1 if unset */
   int linear_idom = -1;  /* immediate dominator in the linear CFG; -1 if unset */

   /* this information is needed for predecessors to blocks with phis when
    * moving out of ssa */
   bool scc_live_out = false;
   PhysReg scratch_sgpr = PhysReg(); /* only needs to be valid if scc_live_out != false */

   /* Note: fp_mode is not initialized here; Program::insert_block() fills it in. */
   Block() : index(0) {}
};
1903
1904 /*
1905 * Shader stages as provided in Vulkan by the application. Contrast this to HWStage.
1906 */
1907 enum class SWStage : uint8_t {
1908 None = 0,
1909 VS = 1 << 0, /* Vertex Shader */
1910 GS = 1 << 1, /* Geometry Shader */
1911 TCS = 1 << 2, /* Tessellation Control aka Hull Shader */
1912 TES = 1 << 3, /* Tessellation Evaluation aka Domain Shader */
1913 FS = 1 << 4, /* Fragment aka Pixel Shader */
1914 CS = 1 << 5, /* Compute Shader */
1915 GSCopy = 1 << 6, /* GS Copy Shader (internal) */
1916
1917 /* Stage combinations merged to run on a single HWStage */
1918 VS_GS = VS | GS,
1919 VS_TCS = VS | TCS,
1920 TES_GS = TES | GS,
1921 };
1922
1923 constexpr SWStage
1924 operator|(SWStage a, SWStage b)
1925 {
1926 return static_cast<SWStage>(static_cast<uint8_t>(a) | static_cast<uint8_t>(b));
1927 }
1928
1929 /*
1930 * Shader stages as running on the AMD GPU.
1931 *
1932 * The relation between HWStages and SWStages is not a one-to-one mapping:
1933 * Some SWStages are merged by ACO to run on a single HWStage.
1934 * See README.md for details.
1935 */
1936 enum class HWStage : uint8_t {
1937 VS,
1938 ES, /* Export shader: pre-GS (VS or TES) on GFX6-8. Combined into GS on GFX9 (and GFX10/legacy). */
1939 GS, /* Geometry shader on GFX10/legacy and GFX6-9. */
1940 NGG, /* Primitive shader, used to implement VS, TES, GS. */
1941 LS, /* Local shader: pre-TCS (VS) on GFX6-8. Combined into HS on GFX9 (and GFX10/legacy). */
1942 HS, /* Hull shader: TCS on GFX6-8. Merged VS and TCS on GFX9-10. */
1943 FS,
1944 CS,
1945 };
1946
1947 /*
1948 * Set of SWStages to be merged into a single shader paired with the
1949 * HWStage it will run on.
1950 */
1951 struct Stage {
1952 constexpr Stage() = default;
1953
StageStage1954 explicit constexpr Stage(HWStage hw_, SWStage sw_) : sw(sw_), hw(hw_) {}
1955
1956 /* Check if the given SWStage is included */
hasStage1957 constexpr bool has(SWStage stage) const
1958 {
1959 return (static_cast<uint8_t>(sw) & static_cast<uint8_t>(stage));
1960 }
1961
num_sw_stagesStage1962 unsigned num_sw_stages() const { return util_bitcount(static_cast<uint8_t>(sw)); }
1963
1964 constexpr bool operator==(const Stage& other) const { return sw == other.sw && hw == other.hw; }
1965
1966 constexpr bool operator!=(const Stage& other) const { return sw != other.sw || hw != other.hw; }
1967
1968 /* Mask of merged software stages */
1969 SWStage sw = SWStage::None;
1970
1971 /* Active hardware stage */
1972 HWStage hw{};
1973 };
1974
/* possible settings of Program::stage */
static constexpr Stage vertex_vs(HWStage::VS, SWStage::VS);
static constexpr Stage fragment_fs(HWStage::FS, SWStage::FS);
static constexpr Stage compute_cs(HWStage::CS, SWStage::CS);
static constexpr Stage tess_eval_vs(HWStage::VS, SWStage::TES);
static constexpr Stage gs_copy_vs(HWStage::VS, SWStage::GSCopy);
/* GFX10/NGG */
static constexpr Stage vertex_ngg(HWStage::NGG, SWStage::VS);
static constexpr Stage vertex_geometry_ngg(HWStage::NGG, SWStage::VS_GS);
static constexpr Stage tess_eval_ngg(HWStage::NGG, SWStage::TES);
static constexpr Stage tess_eval_geometry_ngg(HWStage::NGG, SWStage::TES_GS);
/* GFX9 (and GFX10 if NGG isn't used) */
static constexpr Stage vertex_geometry_gs(HWStage::GS, SWStage::VS_GS);
static constexpr Stage vertex_tess_control_hs(HWStage::HS, SWStage::VS_TCS);
static constexpr Stage tess_eval_geometry_gs(HWStage::GS, SWStage::TES_GS);
/* pre-GFX9 */
static constexpr Stage vertex_ls(HWStage::LS, SWStage::VS); /* vertex before tessellation control */
static constexpr Stage vertex_es(HWStage::ES, SWStage::VS); /* vertex before geometry */
static constexpr Stage tess_control_hs(HWStage::HS, SWStage::TCS);
static constexpr Stage tess_eval_es(HWStage::ES,
                                    SWStage::TES); /* tessellation evaluation before geometry */
static constexpr Stage geometry_gs(HWStage::GS, SWStage::GS);
1997
/* Indices into Program::statistics (filled when Program::collect_statistics is set). */
enum statistic {
   statistic_hash,
   statistic_instructions,
   statistic_copies,
   statistic_branches,
   statistic_latency,
   statistic_inv_throughput,
   statistic_vmem_clauses,
   statistic_smem_clauses,
   statistic_sgpr_presched, /* SGPR demand before scheduling */
   statistic_vgpr_presched, /* VGPR demand before scheduling */
   num_statistics           /* count; keep last */
};
2011
/* Hardware limits and features of the target GPU. */
struct DeviceInfo {
   uint16_t lds_encoding_granule; /* presumably the LDS-size granule used for encoding — verify */
   uint16_t lds_alloc_granule;    /* presumably the LDS allocation granule — verify */
   uint32_t lds_limit; /* in bytes */
   bool has_16bank_lds;
   uint16_t physical_sgprs; /* SGPRs physically present (per SIMD — TODO confirm) */
   uint16_t physical_vgprs;
   uint16_t vgpr_limit; /* maximum usable VGPRs */
   uint16_t sgpr_limit; /* maximum usable SGPRs */
   uint16_t sgpr_alloc_granule;
   uint16_t vgpr_alloc_granule; /* must be power of two */
   unsigned max_wave64_per_simd;
   unsigned simd_per_cu;
   bool has_fast_fma32 = false;
   bool xnack_enabled = false;
   bool sram_ecc_enabled = false;
};
2029
/* How far compilation of a Program has progressed (see Program::progress). */
enum class CompilationProgress {
   after_isel,     /* instruction selection done */
   after_spilling, /* spilling done */
   after_ra,       /* register allocation done */
};
2035
/* A complete shader program in ACO IR: the CFG plus all per-compilation state. */
class Program final {
public:
   std::vector<Block> blocks;
   /* RegClass of each temporary, indexed by temp id. Starts with one dummy
    * entry because valid ids start at 1 (see allocationID below). */
   std::vector<RegClass> temp_rc = {s1};
   RegisterDemand max_reg_demand = RegisterDemand();
   uint16_t num_waves = 0;
   uint16_t max_waves = 0; /* maximum number of waves, regardless of register usage */
   ac_shader_config* config;
   const struct radv_shader_info* info;
   enum chip_class chip_class;
   enum radeon_family family;
   DeviceInfo dev;
   unsigned wave_size;
   RegClass lane_mask; /* register class for per-lane bool masks (presumably wave-size dependent — verify) */
   Stage stage;
   bool needs_exact = false; /* there exists an instruction with disable_wqm = true */
   bool needs_wqm = false;   /* there exists a p_wqm instruction */

   std::vector<uint8_t> constant_data;
   Temp private_segment_buffer;
   Temp scratch_offset;

   uint16_t min_waves = 0;
   unsigned workgroup_size; /* if known; otherwise UINT_MAX */
   bool wgp_mode;
   bool early_rast = false; /* whether rasterization can start as soon as the 1st DONE pos export */

   bool needs_vcc = false;
   bool needs_flat_scr = false;

   CompilationProgress progress;

   bool collect_statistics = false;
   uint32_t statistics[num_statistics]; /* indexed by enum statistic */

   /* State inherited by the next block created via insert_block(). */
   float_mode next_fp_mode;
   unsigned next_loop_depth = 0;
   unsigned next_divergent_if_logical_depth = 0;
   unsigned next_uniform_if_depth = 0;

   std::vector<Definition> vs_inputs;

   /* Destination and callback for compiler diagnostics. */
   struct {
      FILE* output = stderr;
      bool shorten_messages = false;
      void (*func)(void* private_data, enum radv_compiler_debug_level level, const char* message);
      void* private_data;
   } debug;

   /* Allocate a fresh temporary id with the given register class.
    * Ids are limited to 24 bits: 16777215 == 2^24 - 1. */
   uint32_t allocateId(RegClass rc)
   {
      assert(allocationID <= 16777215);
      temp_rc.push_back(rc);
      return allocationID++;
   }

   /* Reserve 'amount' consecutive temp ids. The matching temp_rc entries are
    * default-constructed; callers are expected to fill them in afterwards. */
   void allocateRange(unsigned amount)
   {
      assert(allocationID + amount <= 16777216);
      temp_rc.resize(temp_rc.size() + amount);
      allocationID += amount;
   }

   /* Allocate an id and wrap it together with its class in a Temp. */
   Temp allocateTmp(RegClass rc) { return Temp(allocateId(rc), rc); }

   /* The id the next allocateId() call would return (no allocation happens). */
   uint32_t peekAllocationId() { return allocationID; }

   friend void reindex_ssa(Program* program);
   friend void reindex_ssa(Program* program, std::vector<IDSet>& live_out);

   /* Create an empty block and append it to the program. */
   Block* create_and_insert_block()
   {
      Block block;
      return insert_block(std::move(block));
   }

   /* Append 'block', assigning its index and copying the current next_*
    * state (fp mode and control-flow depths) into it. */
   Block* insert_block(Block&& block)
   {
      block.index = blocks.size();
      block.fp_mode = next_fp_mode;
      block.loop_nest_depth = next_loop_depth;
      block.divergent_if_logical_depth = next_divergent_if_logical_depth;
      block.uniform_if_depth = next_uniform_if_depth;
      blocks.emplace_back(std::move(block));
      return &blocks.back();
   }

private:
   uint32_t allocationID = 1; /* next temp id to hand out; id 0 is never used */
};
2126
/* Result of live_var_analysis(). */
struct live {
   /* live temps out per block */
   std::vector<IDSet> live_out;
   /* register demand (sgpr/vgpr) per instruction per block */
   std::vector<std::vector<RegisterDemand>> register_demand;
};
2133
/* Test-only knobs for register_allocation(). */
struct ra_test_policy {
   /* Force RA to always use its pessimistic fallback algorithm */
   bool skip_optimistic_path = false;
};
2138
void init();

void init_program(Program* program, Stage stage, const struct radv_shader_info* info,
                  enum chip_class chip_class, enum radeon_family family, bool wgp_mode,
                  ac_shader_config* config);

/* Instruction selection: build ACO IR from NIR. */
void select_program(Program* program, unsigned shader_count, struct nir_shader* const* shaders,
                    ac_shader_config* config, const struct radv_shader_args* args);
void select_gs_copy_shader(Program* program, struct nir_shader* gs_shader, ac_shader_config* config,
                           const struct radv_shader_args* args);
void select_trap_handler_shader(Program* program, struct nir_shader* shader,
                                ac_shader_config* config, const struct radv_shader_args* args);
void select_vs_prolog(Program* program, const struct radv_vs_prolog_key* key,
                      ac_shader_config* config, const struct radv_shader_args* args,
                      unsigned* num_preserved_sgprs);

/* Compilation passes and analyses; each operates on a whole Program. */
void lower_phis(Program* program);
void calc_min_waves(Program* program);
void update_vgpr_sgpr_demand(Program* program, const RegisterDemand new_demand);
live live_var_analysis(Program* program);
std::vector<uint16_t> dead_code_analysis(Program* program);
void dominator_tree(Program* program);
void insert_exec_mask(Program* program);
void value_numbering(Program* program);
void optimize(Program* program);
void optimize_postRA(Program* program);
void setup_reduce_temp(Program* program);
void lower_to_cssa(Program* program, live& live_vars);
void register_allocation(Program* program, std::vector<IDSet>& live_out_per_block,
                         ra_test_policy = {});
void ssa_elimination(Program* program);
void lower_to_hw_instr(Program* program);
void schedule_program(Program* program, live& live_vars);
void spill(Program* program, live& live_vars);
void insert_wait_states(Program* program);
void insert_NOPs(Program* program);
void form_hard_clauses(Program* program);
unsigned emit_program(Program* program, std::vector<uint32_t>& code);
/**
 * Returns true if print_asm can disassemble the given program for the current build/runtime
 * configuration
 */
bool check_print_asm_support(Program* program);
bool print_asm(Program* program, std::vector<uint32_t>& binary, unsigned exec_size, FILE* output);
bool validate_ir(Program* program);
bool validate_ra(Program* program);
#ifndef NDEBUG
void perfwarn(Program* program, bool cond, const char* msg, Instruction* instr = NULL);
#else
/* perfwarn is a no-op in release builds */
#define perfwarn(program, cond, msg, ...)                                                          \
   do {                                                                                            \
   } while (0)
#endif

void collect_presched_stats(Program* program);
void collect_preasm_stats(Program* program);
void collect_postasm_stats(Program* program, const std::vector<uint32_t>& code);
2196
/* Flags accepted by the aco_print_* functions below. */
enum print_flags {
   print_no_ssa = 0x1,
   print_perf_info = 0x2,
   print_kill = 0x4,
   print_live_vars = 0x8,
};
2203
/* IR pretty-printing */
void aco_print_operand(const Operand* operand, FILE* output, unsigned flags = 0);
void aco_print_instr(const Instruction* instr, FILE* output, unsigned flags = 0);
void aco_print_program(const Program* program, FILE* output, unsigned flags = 0);
void aco_print_program(const Program* program, FILE* output, const live& live_vars,
                       unsigned flags = 0);

void _aco_perfwarn(Program* program, const char* file, unsigned line, const char* fmt, ...);
void _aco_err(Program* program, const char* file, unsigned line, const char* fmt, ...);

#define aco_perfwarn(program, ...) _aco_perfwarn(program, __FILE__, __LINE__, __VA_ARGS__)
#define aco_err(program, ...) _aco_err(program, __FILE__, __LINE__, __VA_ARGS__)

/* utilities for dealing with register demand */
RegisterDemand get_live_changes(aco_ptr<Instruction>& instr);
RegisterDemand get_temp_registers(aco_ptr<Instruction>& instr);
RegisterDemand get_demand_before(RegisterDemand demand, aco_ptr<Instruction>& instr,
                                 aco_ptr<Instruction>& instr_before);

/* number of sgprs that need to be allocated but might not be addressable as s0-s105 */
uint16_t get_extra_sgprs(Program* program);

/* get number of sgprs/vgprs allocated that are required to address a number of sgprs/vgprs */
uint16_t get_sgpr_alloc(Program* program, uint16_t addressable_sgprs);
uint16_t get_vgpr_alloc(Program* program, uint16_t addressable_vgprs);

/* return number of addressable sgprs/vgprs for max_waves */
uint16_t get_addr_sgpr_from_waves(Program* program, uint16_t max_waves);
uint16_t get_addr_vgpr_from_waves(Program* program, uint16_t max_waves);
2232
/* Static per-opcode property tables, each indexed by aco_opcode. */
typedef struct {
   /* Hardware encoding per chip generation; presumably a negative value means
    * "not available on that generation" — verify against the table generator. */
   const int16_t opcode_gfx7[static_cast<int>(aco_opcode::num_opcodes)];
   const int16_t opcode_gfx9[static_cast<int>(aco_opcode::num_opcodes)];
   const int16_t opcode_gfx10[static_cast<int>(aco_opcode::num_opcodes)];
   const std::bitset<static_cast<int>(aco_opcode::num_opcodes)> can_use_input_modifiers;
   const std::bitset<static_cast<int>(aco_opcode::num_opcodes)> can_use_output_modifiers;
   const std::bitset<static_cast<int>(aco_opcode::num_opcodes)> is_atomic;
   const char* name[static_cast<int>(aco_opcode::num_opcodes)];
   const aco::Format format[static_cast<int>(aco_opcode::num_opcodes)];
   /* sizes used for input/output modifiers and constants */
   const unsigned operand_size[static_cast<int>(aco_opcode::num_opcodes)];
   const instr_class classes[static_cast<int>(aco_opcode::num_opcodes)];
} Info;

extern const Info instr_info;
2248
2249 } // namespace aco
2250
2251 #endif /* ACO_IR_H */
2252