1 /*
2 * Copyright © 2018 Valve Corporation
3 *
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8 * and/or sell copies of the Software, and to permit persons to whom the
9 * Software is furnished to do so, subject to the following conditions:
10 *
11 * The above copyright notice and this permission notice (including the next
12 * paragraph) shall be included in all copies or substantial portions of the
13 * Software.
14 *
15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
21 * IN THE SOFTWARE.
22 *
23 */
24
25 #ifndef ACO_IR_H
26 #define ACO_IR_H
27
28 #include <vector>
29 #include <set>
30 #include <unordered_set>
31 #include <bitset>
32 #include <memory>
33
34 #include "nir.h"
35 #include "ac_binary.h"
36 #include "amd_family.h"
37 #include "aco_opcodes.h"
38 #include "aco_util.h"
39
40 #include "vulkan/radv_shader.h"
41
42 struct radv_shader_args;
43 struct radv_shader_info;
44
45 namespace aco {
46
47 extern uint64_t debug_flags;
48
/* Bit flags for the aco::debug_flags bitmask (set from environment/driver
 * debug options). Each flag enables extra validation or disables a pass. */
enum {
   DEBUG_VALIDATE_IR = 0x1,    /* validate the IR (e.g. between passes) */
   DEBUG_VALIDATE_RA = 0x2,    /* validate the register assignment */
   DEBUG_PERFWARN = 0x4,       /* emit performance warnings */
   DEBUG_FORCE_WAITCNT = 0x8,  /* force conservative wait counts */
   DEBUG_NO_VN = 0x10,         /* disable value numbering */
   DEBUG_NO_OPT = 0x20,        /* disable the optimizer */
   DEBUG_NO_SCHED = 0x40,      /* disable the scheduler */
};
58
59 /**
60 * Representation of the instruction's microcode encoding format
61 * Note: Some Vector ALU Formats can be combined, such that:
62 * - VOP2* | VOP3A represents a VOP2 instruction in VOP3A encoding
63 * - VOP2* | DPP represents a VOP2 instruction with data parallel primitive.
64 * - VOP2* | SDWA represents a VOP2 instruction with sub-dword addressing.
65 *
66 * (*) The same is applicable for VOP1 and VOPC instructions.
67 */
enum class Format : std::uint16_t {
   /* Pseudo Instruction Format */
   PSEUDO = 0,
   /* Scalar ALU & Control Formats */
   SOP1 = 1,
   SOP2 = 2,
   SOPK = 3,
   SOPP = 4,
   SOPC = 5,
   /* Scalar Memory Format */
   SMEM = 6,
   /* LDS/GDS Format */
   DS = 8,
   /* Vector Memory Buffer Formats */
   MTBUF = 9,
   MUBUF = 10,
   /* Vector Memory Image Format */
   MIMG = 11,
   /* Export Format */
   EXP = 12,
   /* Flat Formats */
   FLAT = 13,
   GLOBAL = 14,
   SCRATCH = 15,

   PSEUDO_BRANCH = 16,
   PSEUDO_BARRIER = 17,
   PSEUDO_REDUCTION = 18,

   /* Vector ALU Formats */
   VOP3P = 19,
   /* The formats below are single bits so they can be OR'd with a base
    * VOP1/VOP2/VOPC format (see the class comment above). */
   VOP1 = 1 << 8,
   VOP2 = 1 << 9,
   VOPC = 1 << 10,
   VOP3 = 1 << 11,
   /* VOP3A and VOP3B share one encoding bit; they alias VOP3. */
   VOP3A = 1 << 11,
   VOP3B = 1 << 11,
   /* Vector Parameter Interpolation Format */
   VINTRP = 1 << 12,
   DPP = 1 << 13,
   SDWA = 1 << 14,
};
110
/* Bitmask describing which memory "storage" a memory access touches;
 * used by memory_sync_info for reordering decisions. */
enum storage_class : uint8_t {
   storage_none = 0x0, /* no synchronization and can be reordered around aliasing stores */
   storage_buffer = 0x1, /* SSBOs and global memory */
   storage_atomic_counter = 0x2, /* not used for Vulkan */
   storage_image = 0x4,
   storage_shared = 0x8, /* or TCS output */
   storage_vmem_output = 0x10, /* GS or TCS output stores using VMEM */
   storage_scratch = 0x20,
   storage_vgpr_spill = 0x40,
   storage_count = 8, /* number of storage-class bits reserved above */
};
122
/* Ordering/visibility semantics attached to memory operations and barriers. */
enum memory_semantics : uint8_t {
   semantic_none = 0x0,
   /* for loads: don't move any access after this load to before this load (even other loads)
    * for barriers: don't move any access after the barrier to before any
    * atomics/control_barriers/sendmsg_gs_done before the barrier */
   semantic_acquire = 0x1,
   /* for stores: don't move any access before this store to after this store
    * for barriers: don't move any access before the barrier to after any
    * atomics/control_barriers/sendmsg_gs_done after the barrier */
   semantic_release = 0x2,

   /* the rest are for load/stores/atomics only */
   /* cannot be DCE'd or CSE'd */
   semantic_volatile = 0x4,
   /* does not interact with barriers and assumes this lane is the only lane
    * accessing this memory */
   semantic_private = 0x8,
   /* this operation can be reordered around operations of the same storage. says nothing about barriers */
   semantic_can_reorder = 0x10,
   /* this is an atomic instruction (it may only read or write memory) */
   semantic_atomic = 0x20,
   /* this instruction both reads and writes memory */
   semantic_rmw = 0x40,

   semantic_acqrel = semantic_acquire | semantic_release,
   semantic_atomicrmw = semantic_volatile | semantic_atomic | semantic_rmw,
};
150
/* Synchronization scope, ordered from narrowest to widest; larger values
 * synchronize with more invocations. */
enum sync_scope : uint8_t {
   scope_invocation = 0,
   scope_subgroup = 1,
   scope_workgroup = 2,
   scope_queuefamily = 3,
   scope_device = 4,
};
158
/* Compact (3-byte) description of a memory access' synchronization behavior:
 * which storage it touches, its ordering semantics and its scope. */
struct memory_sync_info {
   memory_sync_info() : storage(storage_none), semantics(semantic_none), scope(scope_invocation) {}
   memory_sync_info(int storage, int semantics=0, sync_scope scope=scope_invocation)
      : storage((storage_class)storage), semantics((memory_semantics)semantics), scope(scope) {}

   storage_class storage:8;
   memory_semantics semantics:8;
   sync_scope scope:8;

   bool operator == (const memory_sync_info& rhs) const {
      return storage == rhs.storage &&
             semantics == rhs.semantics &&
             scope == rhs.scope;
   }

   /* Whether this access may be reordered with other accesses of the same
    * storage: never for acquire/release or volatile accesses. */
   bool can_reorder() const {
      if (semantics & semantic_acqrel)
         return false;
      /* Also check storage so that zero-initialized memory_sync_info can be
       * reordered. */
      return (!storage || (semantics & semantic_can_reorder)) && !(semantics & semantic_volatile);
   }
};
static_assert(sizeof(memory_sync_info) == 3, "Unexpected padding");
183
/* Floating-point rounding modes (2-bit encoding used in the MODE register):
 * ne = nearest-even, pi = toward +inf, ni = toward -inf, tz = toward zero. */
enum fp_round {
   fp_round_ne = 0,
   fp_round_pi = 1,
   fp_round_ni = 2,
   fp_round_tz = 3,
};
190
/* Denormal handling flags (2-bit encoding): bit 0 keeps input denormals,
 * bit 1 keeps output denormals. */
enum fp_denorm {
   /* Note that v_rcp_f32, v_exp_f32, v_log_f32, v_sqrt_f32, v_rsq_f32 and
    * v_mad_f32/v_madak_f32/v_madmk_f32/v_mac_f32 always flush denormals. */
   fp_denorm_flush = 0x0,
   fp_denorm_keep_in = 0x1,
   fp_denorm_keep_out = 0x2,
   fp_denorm_keep = 0x3,
};
199
/* Floating-point mode state: the hardware MODE register fields (first byte)
 * plus shader-level flags that restrict which FP optimizations are legal. */
struct float_mode {
   /* matches encoding of the MODE register */
   union {
      struct {
         fp_round round32:2;
         fp_round round16_64:2;
         unsigned denorm32:2;
         unsigned denorm16_64:2;
      };
      /* aggregate views of the two 4-bit groups above */
      struct {
         uint8_t round:4;
         uint8_t denorm:4;
      };
      uint8_t val = 0;
   };
   /* if false, optimizations which may remove infs/nan/-0.0 can be done */
   bool preserve_signed_zero_inf_nan32:1;
   bool preserve_signed_zero_inf_nan16_64:1;
   /* if false, optimizations which may remove denormal flushing can be done */
   bool must_flush_denorms32:1;
   bool must_flush_denorms16_64:1;
   bool care_about_round32:1;
   bool care_about_round16_64:1;

   /* Returns true if instructions using the mode "other" can safely use the
    * current one instead: the MODE bits must match exactly and every
    * restriction set in "other" must also be set here. */
   bool canReplace(float_mode other) const noexcept {
      return val == other.val &&
             (preserve_signed_zero_inf_nan32 || !other.preserve_signed_zero_inf_nan32) &&
             (preserve_signed_zero_inf_nan16_64 || !other.preserve_signed_zero_inf_nan16_64) &&
             (must_flush_denorms32 || !other.must_flush_denorms32) &&
             (must_flush_denorms16_64 || !other.must_flush_denorms16_64) &&
             (care_about_round32 || !other.care_about_round32) &&
             (care_about_round16_64 || !other.care_about_round16_64);
   }
};
236
asVOP3(Format format)237 constexpr Format asVOP3(Format format) {
238 return (Format) ((uint32_t) Format::VOP3 | (uint32_t) format);
239 };
240
asSDWA(Format format)241 constexpr Format asSDWA(Format format) {
242 assert(format == Format::VOP1 || format == Format::VOP2 || format == Format::VOPC);
243 return (Format) ((uint32_t) Format::SDWA | (uint32_t) format);
244 }
245
/* Kind of register file a register class lives in. */
enum class RegType {
   none = 0,
   sgpr,
   vgpr,
   linear_vgpr,
};
252
/* Register class: encodes type and size of a register in one byte.
 * Encoding: bits [4:0] = size (in dwords, or in bytes for subdword classes),
 * bit 5 = vgpr, bit 6 = linear, bit 7 = subdword. */
struct RegClass {

   enum RC : uint8_t {
      s1 = 1,
      s2 = 2,
      s3 = 3,
      s4 = 4,
      s6 = 6,
      s8 = 8,
      s16 = 16,
      v1 = s1 | (1 << 5),
      v2 = s2 | (1 << 5),
      v3 = s3 | (1 << 5),
      v4 = s4 | (1 << 5),
      v5 = 5 | (1 << 5),
      v6 = 6 | (1 << 5),
      v7 = 7 | (1 << 5),
      v8 = 8 | (1 << 5),
      /* byte-sized register class */
      v1b = v1 | (1 << 7),
      v2b = v2 | (1 << 7),
      v3b = v3 | (1 << 7),
      v4b = v4 | (1 << 7),
      v6b = v6 | (1 << 7),
      v8b = v8 | (1 << 7),
      /* these are used for WWM and spills to vgpr */
      v1_linear = v1 | (1 << 6),
      v2_linear = v2 | (1 << 6),
   };

   RegClass() = default;
   constexpr RegClass(RC rc)
      : rc(rc) {}
   constexpr RegClass(RegType type, unsigned size)
      : rc((RC) ((type == RegType::vgpr ? 1 << 5 : 0) | size)) {}

   constexpr operator RC() const { return rc; }
   explicit operator bool() = delete;

   /* All values <= s16 have no type bit set, so they are sgpr classes. */
   constexpr RegType type() const { return rc <= RC::s16 ? RegType::sgpr : RegType::vgpr; }
   constexpr bool is_subdword() const { return rc & (1 << 7); }
   /* For subdword classes the size field already counts bytes; otherwise dwords. */
   constexpr unsigned bytes() const { return ((unsigned) rc & 0x1F) * (is_subdword() ? 1 : 4); }
   //TODO: use size() less in favor of bytes()
   /* Size in dwords, rounded up for subdword classes. */
   constexpr unsigned size() const { return (bytes() + 3) >> 2; }
   /* sgprs are always linear; vgprs only if the linear bit is set. */
   constexpr bool is_linear() const { return rc <= RC::s16 || rc & (1 << 6); }
   constexpr RegClass as_linear() const { return RegClass((RC) (rc | (1 << 6))); }
   constexpr RegClass as_subdword() const { return RegClass((RC) (rc | 1 << 7)); }

   /* Builds the register class for "bytes" bytes of storage of the given type:
    * sgprs are always dword-granular; vgprs become subdword when the byte
    * count is not a multiple of 4. */
   static constexpr RegClass get(RegType type, unsigned bytes) {
      if (type == RegType::sgpr) {
         return RegClass(type, DIV_ROUND_UP(bytes, 4u));
      } else {
         return bytes % 4u ? RegClass(type, bytes).as_subdword() :
                             RegClass(type, bytes / 4u);
      }
   }

private:
   RC rc;
};
313
/* transitional helper expressions: shorthand constants so code can write
 * s1/v2/v4b/... instead of RegClass::s1 etc. */
static constexpr RegClass s1{RegClass::s1};
static constexpr RegClass s2{RegClass::s2};
static constexpr RegClass s3{RegClass::s3};
static constexpr RegClass s4{RegClass::s4};
static constexpr RegClass s8{RegClass::s8};
static constexpr RegClass s16{RegClass::s16};
static constexpr RegClass v1{RegClass::v1};
static constexpr RegClass v2{RegClass::v2};
static constexpr RegClass v3{RegClass::v3};
static constexpr RegClass v4{RegClass::v4};
static constexpr RegClass v5{RegClass::v5};
static constexpr RegClass v6{RegClass::v6};
static constexpr RegClass v7{RegClass::v7};
static constexpr RegClass v8{RegClass::v8};
static constexpr RegClass v1b{RegClass::v1b};
static constexpr RegClass v2b{RegClass::v2b};
static constexpr RegClass v3b{RegClass::v3b};
static constexpr RegClass v4b{RegClass::v4b};
static constexpr RegClass v6b{RegClass::v6b};
static constexpr RegClass v8b{RegClass::v8b};
335
336 /**
337 * Temp Class
338 * Each temporary virtual register has a
339 * register class (i.e. size and type)
340 * and SSA id.
341 */
/* SSA temporary: a 24-bit SSA id plus an 8-bit register class, packed into
 * 4 bytes. Comparison is on the id only (ids are unique, so two Temps with
 * the same id have the same class). */
struct Temp {
   Temp() noexcept : id_(0), reg_class(0) {}
   constexpr Temp(uint32_t id, RegClass cls) noexcept
      : id_(id), reg_class(uint8_t(cls)) {}

   constexpr uint32_t id() const noexcept { return id_; }
   constexpr RegClass regClass() const noexcept { return (RegClass::RC)reg_class; }

   constexpr unsigned bytes() const noexcept { return regClass().bytes(); }
   constexpr unsigned size() const noexcept { return regClass().size(); }
   constexpr RegType type() const noexcept { return regClass().type(); }
   constexpr bool is_linear() const noexcept { return regClass().is_linear(); }

   constexpr bool operator <(Temp other) const noexcept { return id() < other.id(); }
   constexpr bool operator==(Temp other) const noexcept { return id() == other.id(); }
   constexpr bool operator!=(Temp other) const noexcept { return id() != other.id(); }

private:
   uint32_t id_: 24;
   uint32_t reg_class : 8;
};
363
364 /**
365 * PhysReg
366 * Represents the physical register for each
367 * Operand and Definition.
368 */
/* Physical register, stored as a byte address (reg_b = register * 4 + byte
 * offset) so that subdword locations can be represented. */
struct PhysReg {
   constexpr PhysReg() = default;
   explicit constexpr PhysReg(unsigned r) : reg_b(r << 2) {}
   /* Dword-aligned register number. */
   constexpr unsigned reg() const { return reg_b >> 2; }
   /* Byte offset within the register (0-3). */
   constexpr unsigned byte() const { return reg_b & 0x3; }
   constexpr operator unsigned() const { return reg(); }
   constexpr bool operator==(PhysReg other) const { return reg_b == other.reg_b; }
   constexpr bool operator!=(PhysReg other) const { return reg_b != other.reg_b; }
   constexpr bool operator <(PhysReg other) const { return reg_b < other.reg_b; }
   /* Returns a copy advanced by "bytes" bytes (may cross register boundaries). */
   constexpr PhysReg advance(int bytes) const { PhysReg res = *this; res.reg_b += bytes; return res; }

   uint16_t reg_b = 0;
};
382
/* helper expressions for special registers (SGPR numbers / source encodings) */
static constexpr PhysReg m0{124};
static constexpr PhysReg vcc{106};
static constexpr PhysReg vcc_hi{107};
static constexpr PhysReg tba{108}; /* GFX6-GFX8 */
static constexpr PhysReg tma{110}; /* GFX6-GFX8 */
static constexpr PhysReg ttmp0{112};
static constexpr PhysReg ttmp1{113};
static constexpr PhysReg ttmp2{114};
static constexpr PhysReg ttmp3{115};
static constexpr PhysReg ttmp4{116};
static constexpr PhysReg ttmp5{117};
static constexpr PhysReg ttmp6{118};
static constexpr PhysReg ttmp7{119};
static constexpr PhysReg ttmp8{120};
static constexpr PhysReg ttmp9{121};
static constexpr PhysReg ttmp10{122};
static constexpr PhysReg ttmp11{123};
static constexpr PhysReg sgpr_null{125}; /* GFX10+ */
static constexpr PhysReg exec{126};
static constexpr PhysReg exec_lo{126};
static constexpr PhysReg exec_hi{127};
static constexpr PhysReg vccz{251};
static constexpr PhysReg execz{252};
static constexpr PhysReg scc{253};
408
409 /**
410 * Operand Class
411 * Initially, each Operand refers to either
412 * a temporary virtual register
413 * or to a constant value
414 * Temporary registers get mapped to physical register during RA
415 * Constant values are inlined into the instruction sequence.
416 */
class Operand final
{
public:
   /* Default: an undefined operand, fixed to register 128 (which encodes the
    * inline constant 0 in the constant constructors below). */
   constexpr Operand()
      : reg_(PhysReg{128}), isTemp_(false), isFixed_(true), isConstant_(false),
        isKill_(false), isUndef_(true), isFirstKill_(false), constSize(0),
        isLateKill_(false) {}

   /* Operand referencing the temporary 'r'; an id of 0 yields an undefined
    * operand instead. */
   explicit Operand(Temp r) noexcept
   {
      data_.temp = r;
      if (r.id()) {
         isTemp_ = true;
      } else {
         isUndef_ = true;
         setFixed(PhysReg{128});
      }
   };
   /* 8-bit constant operand. */
   explicit Operand(uint8_t v) noexcept
   {
      /* 8-bit constants are only used for copies and copies from any 8-bit
       * constant can be implemented with a SDWA v_mul_u32_u24. So consider all
       * to be inline constants. */
      data_.i = v;
      isConstant_ = true;
      constSize = 0;
      setFixed(PhysReg{0u});
   };
   /* 16-bit constant operand. If the value has an inline-constant encoding
    * (small integers, common half-float values), the operand is fixed to
    * that source register; otherwise to 255 (literal). */
   explicit Operand(uint16_t v) noexcept
   {
      data_.i = v;
      isConstant_ = true;
      constSize = 1;
      if (v <= 64)
         setFixed(PhysReg{128u + v});
      else if (v >= 0xFFF0) /* [-16 .. -1] */
         setFixed(PhysReg{(unsigned)(192 - (int16_t)v)});
      else if (v == 0x3800) /* 0.5 */
         setFixed(PhysReg{240});
      else if (v == 0xB800) /* -0.5 */
         setFixed(PhysReg{241});
      else if (v == 0x3C00) /* 1.0 */
         setFixed(PhysReg{242});
      else if (v == 0xBC00) /* -1.0 */
         setFixed(PhysReg{243});
      else if (v == 0x4000) /* 2.0 */
         setFixed(PhysReg{244});
      else if (v == 0xC000) /* -2.0 */
         setFixed(PhysReg{245});
      else if (v == 0x4400) /* 4.0 */
         setFixed(PhysReg{246});
      else if (v == 0xC400) /* -4.0 */
         setFixed(PhysReg{247});
      else if (v == 0x3118) /* 1/2 PI */
         setFixed(PhysReg{248});
      else /* Literal Constant */
         setFixed(PhysReg{255});
   };
   /* 32-bit constant operand (or a 64-bit one whose low dword is stored here
    * when is64bit is set). 64-bit constants must be inline-representable. */
   explicit Operand(uint32_t v, bool is64bit = false) noexcept
   {
      data_.i = v;
      isConstant_ = true;
      constSize = is64bit ? 3 : 2;
      if (v <= 64)
         setFixed(PhysReg{128 + v});
      else if (v >= 0xFFFFFFF0) /* [-16 .. -1] */
         setFixed(PhysReg{192 - v});
      else if (v == 0x3f000000) /* 0.5 */
         setFixed(PhysReg{240});
      else if (v == 0xbf000000) /* -0.5 */
         setFixed(PhysReg{241});
      else if (v == 0x3f800000) /* 1.0 */
         setFixed(PhysReg{242});
      else if (v == 0xbf800000) /* -1.0 */
         setFixed(PhysReg{243});
      else if (v == 0x40000000) /* 2.0 */
         setFixed(PhysReg{244});
      else if (v == 0xc0000000) /* -2.0 */
         setFixed(PhysReg{245});
      else if (v == 0x40800000) /* 4.0 */
         setFixed(PhysReg{246});
      else if (v == 0xc0800000) /* -4.0 */
         setFixed(PhysReg{247});
      else { /* Literal Constant */
         assert(!is64bit && "attempt to create a 64-bit literal constant");
         setFixed(PhysReg{255});
      }
   };
   /* 64-bit constant operand; only inline-representable values are allowed
    * (64-bit literals do not exist). data_.i stores the equivalent 32-bit
    * encoding for printing/hashing; the real value is derived from reg_ by
    * constantValue64(). */
   explicit Operand(uint64_t v) noexcept
   {
      isConstant_ = true;
      constSize = 3;
      if (v <= 64) {
         data_.i = (uint32_t) v;
         setFixed(PhysReg{128 + (uint32_t) v});
      } else if (v >= 0xFFFFFFFFFFFFFFF0) { /* [-16 .. -1] */
         data_.i = (uint32_t) v;
         setFixed(PhysReg{192 - (uint32_t) v});
      } else if (v == 0x3FE0000000000000) { /* 0.5 */
         data_.i = 0x3f000000;
         setFixed(PhysReg{240});
      } else if (v == 0xBFE0000000000000) { /* -0.5 */
         data_.i = 0xbf000000;
         setFixed(PhysReg{241});
      } else if (v == 0x3FF0000000000000) { /* 1.0 */
         data_.i = 0x3f800000;
         setFixed(PhysReg{242});
      } else if (v == 0xBFF0000000000000) { /* -1.0 */
         data_.i = 0xbf800000;
         setFixed(PhysReg{243});
      } else if (v == 0x4000000000000000) { /* 2.0 */
         data_.i = 0x40000000;
         setFixed(PhysReg{244});
      } else if (v == 0xC000000000000000) { /* -2.0 */
         data_.i = 0xc0000000;
         setFixed(PhysReg{245});
      } else if (v == 0x4010000000000000) { /* 4.0 */
         data_.i = 0x40800000;
         setFixed(PhysReg{246});
      } else if (v == 0xC010000000000000) { /* -4.0 */
         data_.i = 0xc0800000;
         setFixed(PhysReg{247});
      } else { /* Literal Constant: we don't know if it is a long or double.*/
         isConstant_ = 0;
         assert(false && "attempt to create a 64-bit literal constant");
      }
   };
   /* Undefined operand of the given register class. */
   explicit Operand(RegClass type) noexcept
   {
      isUndef_ = true;
      data_.temp = Temp(0, type);
      setFixed(PhysReg{128});
   };
   /* Operand fixed to a physical register (no temporary). */
   explicit Operand(PhysReg reg, RegClass type) noexcept
   {
      data_.temp = Temp(0, type);
      setFixed(reg);
   }

   constexpr bool isTemp() const noexcept
   {
      return isTemp_;
   }

   constexpr void setTemp(Temp t) noexcept {
      assert(!isConstant_);
      isTemp_ = true;
      data_.temp = t;
   }

   constexpr Temp getTemp() const noexcept
   {
      return data_.temp;
   }

   constexpr uint32_t tempId() const noexcept
   {
      return data_.temp.id();
   }

   /* Constants have no register class; temporaries and undefined operands do. */
   constexpr bool hasRegClass() const noexcept
   {
      return isTemp() || isUndefined();
   }

   constexpr RegClass regClass() const noexcept
   {
      return data_.temp.regClass();
   }

   /* Size in bytes: for constants, decoded from the 2-bit constSize
    * (1 << constSize bytes); otherwise the temporary's size. */
   constexpr unsigned bytes() const noexcept
   {
      if (isConstant())
         return 1 << constSize;
      else
         return data_.temp.bytes();
   }

   /* Size in dwords (sub-dword constants count as one dword). */
   constexpr unsigned size() const noexcept
   {
      if (isConstant())
         return constSize > 2 ? 2 : 1;
      else
         return data_.temp.size();
   }

   constexpr bool isFixed() const noexcept
   {
      return isFixed_;
   }

   constexpr PhysReg physReg() const noexcept
   {
      return reg_;
   }

   /* Fixes the operand to a physical register; passing PhysReg{unsigned(-1)}
    * clears the fixed flag. */
   constexpr void setFixed(PhysReg reg) noexcept
   {
      isFixed_ = reg != unsigned(-1);
      reg_ = reg;
   }

   constexpr bool isConstant() const noexcept
   {
      return isConstant_;
   }

   /* A constant that does not fit an inline encoding (fixed to source 255). */
   constexpr bool isLiteral() const noexcept
   {
      return isConstant() && reg_ == 255;
   }

   constexpr bool isUndefined() const noexcept
   {
      return isUndef_;
   }

   constexpr uint32_t constantValue() const noexcept
   {
      return data_.i;
   }

   constexpr bool constantEquals(uint32_t cmp) const noexcept
   {
      return isConstant() && constantValue() == cmp;
   }

   /* Full constant value, sign-extended from the operand's size if requested.
    * 64-bit constants are reconstructed from the inline-constant register
    * they were fixed to (see the uint64_t constructor). */
   constexpr uint64_t constantValue64(bool signext=false) const noexcept
   {
      if (constSize == 3) {
         if (reg_ <= 192)
            return reg_ - 128;
         else if (reg_ <= 208)
            return 0xFFFFFFFFFFFFFFFF - (reg_ - 193);

         switch (reg_) {
         case 240:
            return 0x3FE0000000000000;
         case 241:
            return 0xBFE0000000000000;
         case 242:
            return 0x3FF0000000000000;
         case 243:
            return 0xBFF0000000000000;
         case 244:
            return 0x4000000000000000;
         case 245:
            return 0xC000000000000000;
         case 246:
            return 0x4010000000000000;
         case 247:
            return 0xC010000000000000;
         }
      } else if (constSize == 1) {
         return (signext && (data_.i & 0x8000u) ? 0xffffffffffff0000ull : 0ull) | data_.i;
      } else if (constSize == 0) {
         return (signext && (data_.i & 0x80u) ? 0xffffffffffffff00ull : 0ull) | data_.i;
      }
      return (signext && (data_.i & 0x80000000u) ? 0xffffffff00000000ull : 0ull) | data_.i;
   }

   constexpr bool isOfType(RegType type) const noexcept
   {
      return hasRegClass() && regClass().type() == type;
   }

   /* Indicates that the killed operand's live range intersects with the
    * instruction's definitions. Unlike isKill() and isFirstKill(), this is
    * not set by liveness analysis. */
   constexpr void setLateKill(bool flag) noexcept
   {
      isLateKill_ = flag;
   }

   constexpr bool isLateKill() const noexcept
   {
      return isLateKill_;
   }

   /* Clearing the kill flag also clears first-kill. */
   constexpr void setKill(bool flag) noexcept
   {
      isKill_ = flag;
      if (!flag)
         setFirstKill(false);
   }

   constexpr bool isKill() const noexcept
   {
      return isKill_ || isFirstKill();
   }

   /* Setting first-kill implies kill. */
   constexpr void setFirstKill(bool flag) noexcept
   {
      isFirstKill_ = flag;
      if (flag)
         setKill(flag);
   }

   /* When there are multiple operands killing the same temporary,
    * isFirstKill() only returns true for the first one. */
   constexpr bool isFirstKill() const noexcept
   {
      return isFirstKill_;
   }

   constexpr bool isKillBeforeDef() const noexcept
   {
      return isKill() && !isLateKill();
   }

   constexpr bool isFirstKillBeforeDef() const noexcept
   {
      return isFirstKill() && !isLateKill();
   }

   /* Structural equality: same size, same fixedness/kill timing, and the same
    * constant / undef class / temporary. */
   constexpr bool operator == (Operand other) const noexcept
   {
      if (other.size() != size())
         return false;
      if (isFixed() != other.isFixed() || isKillBeforeDef() != other.isKillBeforeDef())
         return false;
      if (isFixed() && other.isFixed() && physReg() != other.physReg())
         return false;
      if (isLiteral())
         return other.isLiteral() && other.constantValue() == constantValue();
      else if (isConstant())
         return other.isConstant() && other.physReg() == physReg();
      else if (isUndefined())
         return other.isUndefined() && other.regClass() == regClass();
      else
         return other.isTemp() && other.getTemp() == getTemp();
   }
private:
   union {
      uint32_t i;
      float f;
      Temp temp = Temp(0, s1);
   } data_;
   PhysReg reg_;
   union {
      struct {
         uint8_t isTemp_:1;
         uint8_t isFixed_:1;
         uint8_t isConstant_:1;
         uint8_t isKill_:1;
         uint8_t isUndef_:1;
         uint8_t isFirstKill_:1;
         uint8_t constSize:2;
         uint8_t isLateKill_:1;
      };
      /* can't initialize bit-fields in c++11, so work around using a union */
      uint16_t control_ = 0;
   };
};
771
772 /**
773 * Definition Class
774 * Definitions are the results of Instructions
775 * and refer to temporary virtual registers
776 * which are later mapped to physical registers
777 */
class Definition final
{
public:
   constexpr Definition() : temp(Temp(0, s1)), reg_(0), isFixed_(0), hasHint_(0),
                            isKill_(0), isPrecise_(0), isNUW_(0) {}
   /* Definition of a new temporary with the given SSA id and class. */
   Definition(uint32_t index, RegClass type) noexcept
      : temp(index, type) {}
   explicit Definition(Temp tmp) noexcept
      : temp(tmp) {}
   /* Definition fixed to a physical register (no temporary, id 0). */
   Definition(PhysReg reg, RegClass type) noexcept
      : temp(Temp(0, type))
   {
      setFixed(reg);
   }
   /* Definition of an existing temporary, fixed to a physical register. */
   Definition(uint32_t tmpId, PhysReg reg, RegClass type) noexcept
      : temp(Temp(tmpId, type))
   {
      setFixed(reg);
   }

   /* A definition writes a temporary iff its id is non-zero. */
   constexpr bool isTemp() const noexcept
   {
      return tempId() > 0;
   }

   constexpr Temp getTemp() const noexcept
   {
      return temp;
   }

   constexpr uint32_t tempId() const noexcept
   {
      return temp.id();
   }

   constexpr void setTemp(Temp t) noexcept {
      temp = t;
   }

   constexpr RegClass regClass() const noexcept
   {
      return temp.regClass();
   }

   constexpr unsigned bytes() const noexcept
   {
      return temp.bytes();
   }

   constexpr unsigned size() const noexcept
   {
      return temp.size();
   }

   constexpr bool isFixed() const noexcept
   {
      return isFixed_;
   }

   constexpr PhysReg physReg() const noexcept
   {
      return reg_;
   }

   constexpr void setFixed(PhysReg reg) noexcept
   {
      isFixed_ = 1;
      reg_ = reg;
   }

   /* Records a non-binding register-allocation hint (shares reg_ with the
    * fixed register, so a hint is only meaningful while not fixed). */
   constexpr void setHint(PhysReg reg) noexcept
   {
      hasHint_ = 1;
      reg_ = reg;
   }

   constexpr bool hasHint() const noexcept
   {
      return hasHint_;
   }

   constexpr void setKill(bool flag) noexcept
   {
      isKill_ = flag;
   }

   constexpr bool isKill() const noexcept
   {
      return isKill_;
   }

   /* Precise results: value-changing FP optimizations are not allowed. */
   constexpr void setPrecise(bool precise) noexcept
   {
      isPrecise_ = precise;
   }

   constexpr bool isPrecise() const noexcept
   {
      return isPrecise_;
   }

   /* No Unsigned Wrap */
   constexpr void setNUW(bool nuw) noexcept
   {
      isNUW_ = nuw;
   }

   constexpr bool isNUW() const noexcept
   {
      return isNUW_;
   }

private:
   Temp temp = Temp(0, s1);
   PhysReg reg_;
   union {
      struct {
         uint8_t isFixed_:1;
         uint8_t hasHint_:1;
         uint8_t isKill_:1;
         uint8_t isPrecise_:1;
         uint8_t isNUW_:1;
      };
      /* can't initialize bit-fields in c++11, so work around using a union */
      uint8_t control_ = 0;
   };
};
905
906 struct Block;
907
/* Base class of all IR instructions: opcode, encoding format, and spans over
 * externally-owned operand/definition arrays. Format-specific payloads live
 * in the derived *_instruction structs below. */
struct Instruction {
   aco_opcode opcode;
   Format format;
   /* scratch field for per-pass bookkeeping */
   uint32_t pass_flags;

   aco::span<Operand> operands;
   aco::span<Definition> definitions;

   /* Vector ALU: any VOP1/VOP2/VOPC bit (possibly combined with VOP3/DPP/SDWA),
    * a plain VOP3, or VOP3P. */
   constexpr bool isVALU() const noexcept
   {
      return ((uint16_t) format & (uint16_t) Format::VOP1) == (uint16_t) Format::VOP1
          || ((uint16_t) format & (uint16_t) Format::VOP2) == (uint16_t) Format::VOP2
          || ((uint16_t) format & (uint16_t) Format::VOPC) == (uint16_t) Format::VOPC
          || ((uint16_t) format & (uint16_t) Format::VOP3A) == (uint16_t) Format::VOP3A
          || ((uint16_t) format & (uint16_t) Format::VOP3B) == (uint16_t) Format::VOP3B
          || format == Format::VOP3P;
   }

   /* Scalar ALU (SOP*). */
   constexpr bool isSALU() const noexcept
   {
      return format == Format::SOP1 ||
             format == Format::SOP2 ||
             format == Format::SOPC ||
             format == Format::SOPK ||
             format == Format::SOPP;
   }

   /* Vector memory: buffer or image formats (FLAT/GLOBAL/SCRATCH excluded). */
   constexpr bool isVMEM() const noexcept
   {
      return format == Format::MTBUF ||
             format == Format::MUBUF ||
             format == Format::MIMG;
   }

   constexpr bool isDPP() const noexcept
   {
      return (uint16_t) format & (uint16_t) Format::DPP;
   }

   constexpr bool isVOP3() const noexcept
   {
      return ((uint16_t) format & (uint16_t) Format::VOP3A) ||
             ((uint16_t) format & (uint16_t) Format::VOP3B);
   }

   constexpr bool isSDWA() const noexcept
   {
      return (uint16_t) format & (uint16_t) Format::SDWA;
   }

   constexpr bool isFlatOrGlobal() const noexcept
   {
      return format == Format::FLAT || format == Format::GLOBAL;
   }

   constexpr bool usesModifiers() const noexcept;

   /* Whether any operand is fixed to the exec mask register. */
   constexpr bool reads_exec() const noexcept
   {
      for (const Operand& op : operands) {
         if (op.isFixed() && op.physReg() == exec)
            return true;
      }
      return false;
   }
};
static_assert(sizeof(Instruction) == 16, "Unexpected padding");
975
/* SOPK: scalar ALU instruction with a 16-bit immediate. */
struct SOPK_instruction : public Instruction {
   uint16_t imm;
   uint16_t padding;
};
static_assert(sizeof(SOPK_instruction) == sizeof(Instruction) + 4, "Unexpected padding");
981
/* SOPP: scalar program control; carries an immediate and, for branches,
 * a block index (presumably the jump target — confirm against users). */
struct SOPP_instruction : public Instruction {
   uint32_t imm;
   int block;
};
static_assert(sizeof(SOPP_instruction) == sizeof(Instruction) + 8, "Unexpected padding");
987
/* SOPC: scalar compare; no extra payload. */
struct SOPC_instruction : public Instruction {
};
static_assert(sizeof(SOPC_instruction) == sizeof(Instruction) + 0, "Unexpected padding");
991
/* SOP1: scalar ALU with one input; no extra payload. */
struct SOP1_instruction : public Instruction {
};
static_assert(sizeof(SOP1_instruction) == sizeof(Instruction) + 0, "Unexpected padding");
995
/* SOP2: scalar ALU with two inputs; no extra payload. */
struct SOP2_instruction : public Instruction {
};
static_assert(sizeof(SOP2_instruction) == sizeof(Instruction) + 0, "Unexpected padding");
999
1000 /**
1001 * Scalar Memory Format:
1002 * For s_(buffer_)load_dword*:
1003 * Operand(0): SBASE - SGPR-pair which provides base address
1004 * Operand(1): Offset - immediate (un)signed offset or SGPR
1005 * Operand(2) / Definition(0): SDATA - SGPR for read / write result
1006 * Operand(n-1): SOffset - SGPR offset (Vega only)
1007 *
1008 * Having no operands is also valid for instructions such as s_dcache_inv.
1009 *
1010 */
struct SMEM_instruction : public Instruction {
   memory_sync_info sync;
   bool glc : 1; /* VI+: globally coherent */
   bool dlc : 1; /* NAVI: device level coherent */
   bool nv : 1; /* VEGA only: Non-volatile */
   bool disable_wqm : 1;
   bool prevent_overflow : 1; /* avoid overflow when combining additions */
   uint32_t padding: 3;
};
static_assert(sizeof(SMEM_instruction) == sizeof(Instruction) + 4, "Unexpected padding");
1021
/* VOP1: vector ALU with one input; no extra payload. */
struct VOP1_instruction : public Instruction {
};
static_assert(sizeof(VOP1_instruction) == sizeof(Instruction) + 0, "Unexpected padding");
1025
/* VOP2: vector ALU with two inputs; no extra payload. */
struct VOP2_instruction : public Instruction {
};
static_assert(sizeof(VOP2_instruction) == sizeof(Instruction) + 0, "Unexpected padding");
1029
/* VOPC: vector compare; no extra payload. */
struct VOPC_instruction : public Instruction {
};
static_assert(sizeof(VOPC_instruction) == sizeof(Instruction) + 0, "Unexpected padding");
1033
/* VOP3A: vector ALU with up to three sources and input/output modifiers. */
struct VOP3A_instruction : public Instruction {
   bool abs[3];        /* per-source absolute-value modifier */
   bool neg[3];        /* per-source negate modifier */
   uint8_t opsel : 4;  /* operand-select bits */
   uint8_t omod : 2;   /* output modifier */
   bool clamp : 1;
   uint32_t padding : 9;
};
static_assert(sizeof(VOP3A_instruction) == sizeof(Instruction) + 8, "Unexpected padding");
1043
/* Packed-math vector ALU encoding: modifiers apply separately to the low
 * and high 16-bit halves of each operand. */
struct VOP3P_instruction : public Instruction {
   bool neg_lo[3]; /* negate the low halves */
   bool neg_hi[3]; /* negate the high halves */
   uint8_t opsel_lo : 3;
   uint8_t opsel_hi : 3;
   bool clamp : 1;
   uint32_t padding : 9;
};
static_assert(sizeof(VOP3P_instruction) == sizeof(Instruction) + 8, "Unexpected padding");
1053
/**
 * Data Parallel Primitives Format:
 * This format can be used for VOP1, VOP2 or VOPC instructions.
 * The swizzle applies to the src0 operand.
 *
 */
struct DPP_instruction : public Instruction {
   bool abs[2]; /* input modifiers for src0/src1 */
   bool neg[2];
   uint16_t dpp_ctrl; /* encodes the cross-lane swizzle */
   uint8_t row_mask : 4;
   uint8_t bank_mask : 4;
   bool bound_ctrl : 1;
   uint32_t padding : 7;
};
static_assert(sizeof(DPP_instruction) == sizeof(Instruction) + 8, "Unexpected padding");
1070
/* Selection value for a SDWA source or destination: which byte(s)/word of
 * the dword are accessed, whether to sign-extend, and (with sdwa_isra)
 * sizes that are only resolved during register allocation. */
enum sdwa_sel : uint8_t {
   /* masks */
   sdwa_wordnum = 0x1,
   sdwa_bytenum = 0x3,
   sdwa_asuint = 0x7 | 0x10,
   sdwa_rasize = 0x3,

   /* flags */
   sdwa_isword = 0x4,
   sdwa_sext = 0x8,
   sdwa_isra = 0x10,

   /* specific values */
   sdwa_ubyte0 = 0,
   sdwa_ubyte1 = 1,
   sdwa_ubyte2 = 2,
   sdwa_ubyte3 = 3,
   sdwa_uword0 = sdwa_isword | 0,
   sdwa_uword1 = sdwa_isword | 1,
   sdwa_udword = 6,

   sdwa_sbyte0 = sdwa_ubyte0 | sdwa_sext,
   sdwa_sbyte1 = sdwa_ubyte1 | sdwa_sext,
   sdwa_sbyte2 = sdwa_ubyte2 | sdwa_sext,
   sdwa_sbyte3 = sdwa_ubyte3 | sdwa_sext,
   sdwa_sword0 = sdwa_uword0 | sdwa_sext,
   sdwa_sword1 = sdwa_uword1 | sdwa_sext,
   sdwa_sdword = sdwa_udword | sdwa_sext,

   /* register-allocated: the size is encoded, the exact byte/word is
    * chosen by the register allocator */
   sdwa_ubyte = 1 | sdwa_isra,
   sdwa_uword = 2 | sdwa_isra,
   sdwa_sbyte = sdwa_ubyte | sdwa_sext,
   sdwa_sword = sdwa_uword | sdwa_sext,
};
1106
/**
 * Sub-Dword Addressing Format:
 * This format can be used for VOP1, VOP2 or VOPC instructions.
 *
 * omod and SGPR/constant operands are only available on GFX9+. For VOPC,
 * the definition doesn't have to be VCC on GFX9+.
 *
 */
struct SDWA_instruction : public Instruction {
   /* these destination modifiers aren't available with VOPC except for
    * clamp on GFX8 */
   uint8_t sel[2]; /* src0/src1 selection (sdwa_sel values) */
   uint8_t dst_sel; /* destination selection (sdwa_sel values) */
   bool neg[2];
   bool abs[2];
   bool dst_preserve : 1;
   bool clamp : 1;
   uint8_t omod : 2; /* GFX9+ */
   uint32_t padding : 4;
};
static_assert(sizeof(SDWA_instruction) == sizeof(Instruction) + 8, "Unexpected padding");
1128
/* Attribute interpolation: selects which attribute and component to interpolate. */
struct Interp_instruction : public Instruction {
   uint8_t attribute;
   uint8_t component;
   uint16_t padding;
};
static_assert(sizeof(Interp_instruction) == sizeof(Instruction) + 4, "Unexpected padding");
1135
/**
 * Local and Global Data Sharing instructions
 * Operand(0): ADDR - VGPR which supplies the address.
 * Operand(1): DATA0 - First data VGPR.
 * Operand(2): DATA1 - Second data VGPR.
 * Operand(n-1): M0 - LDS size.
 * Definition(0): VDST - Destination VGPR when results returned to VGPRs.
 *
 */
struct DS_instruction : public Instruction {
   memory_sync_info sync;
   bool gds; /* operate on GDS instead of LDS */
   int16_t offset0;
   int8_t offset1;
   uint8_t padding;
};
static_assert(sizeof(DS_instruction) == sizeof(Instruction) + 8, "Unexpected padding");
1153
/**
 * Vector Memory Untyped-buffer Instructions
 * Operand(0): SRSRC - Specifies which SGPR supplies T# (resource constant)
 * Operand(1): VADDR - Address source. Can carry an index and/or offset
 * Operand(2): SOFFSET - SGPR to supply unsigned byte offset. (SGPR, M0, or inline constant)
 * Operand(3) / Definition(0): VDATA - Vector GPR for write result / read data
 *
 */
struct MUBUF_instruction : public Instruction {
   memory_sync_info sync;
   bool offen : 1; /* Supply an offset from VGPR (VADDR) */
   bool idxen : 1; /* Supply an index from VGPR (VADDR) */
   bool addr64 : 1; /* SI, CIK: Address size is 64-bit */
   bool glc : 1; /* globally coherent */
   bool dlc : 1; /* NAVI: device level coherent */
   bool slc : 1; /* system level coherent */
   bool tfe : 1; /* texture fail enable */
   bool lds : 1; /* Return read-data to LDS instead of VGPRs */
   bool disable_wqm : 1; /* Require an exec mask without helper invocations */
   uint16_t offset : 12; /* Unsigned byte offset - 12 bit */
   bool swizzled : 1;
   uint32_t padding1 : 18;
};
static_assert(sizeof(MUBUF_instruction) == sizeof(Instruction) + 8, "Unexpected padding");
1178
/**
 * Vector Memory Typed-buffer Instructions
 * Operand(0): SRSRC - Specifies which SGPR supplies T# (resource constant)
 * Operand(1): VADDR - Address source. Can carry an index and/or offset
 * Operand(2): SOFFSET - SGPR to supply unsigned byte offset. (SGPR, M0, or inline constant)
 * Operand(3) / Definition(0): VDATA - Vector GPR for write result / read data
 *
 */
struct MTBUF_instruction : public Instruction {
   memory_sync_info sync;
   uint8_t dfmt : 4; /* Data Format of data in memory buffer */
   uint8_t nfmt : 3; /* Numeric format of data in memory */
   bool offen : 1; /* Supply an offset from VGPR (VADDR) */
   bool idxen : 1; /* Supply an index from VGPR (VADDR) */
   bool glc : 1; /* globally coherent */
   bool dlc : 1; /* NAVI: device level coherent */
   bool slc : 1; /* system level coherent */
   bool tfe : 1; /* texture fail enable */
   bool disable_wqm : 1; /* Require an exec mask without helper invocations */
   uint32_t padding : 10;
   uint16_t offset; /* Unsigned byte offset - 12 bit */
};
static_assert(sizeof(MTBUF_instruction) == sizeof(Instruction) + 8, "Unexpected padding");
1202
/**
 * Vector Memory Image Instructions
 * Operand(0) SRSRC - Scalar GPR that specifies the resource constant.
 * Operand(1): SSAMP - Scalar GPR that specifies sampler constant.
 *             or VDATA - Vector GPR for write data.
 * Operand(2): VADDR - Address source. Can carry an offset or an index.
 * Definition(0): VDATA - Vector GPR for read result.
 *
 */
struct MIMG_instruction : public Instruction {
   memory_sync_info sync;
   uint8_t dmask; /* Data VGPR enable mask */
   uint8_t dim : 3; /* NAVI: dimensionality */
   bool unrm : 1; /* Force address to be un-normalized */
   bool dlc : 1; /* NAVI: device level coherent */
   bool glc : 1; /* globally coherent */
   bool slc : 1; /* system level coherent */
   bool tfe : 1; /* texture fail enable */
   bool da : 1; /* declare an array */
   bool lwe : 1; /* LOD warning enable (old comment wrongly duplicated unrm's) */
   bool r128 : 1; /* NAVI: Texture resource size */
   bool a16 : 1; /* VEGA, NAVI: Address components are 16-bits */
   bool d16 : 1; /* Convert 32-bit data to 16-bit data */
   bool disable_wqm : 1; /* Require an exec mask without helper invocations */
   uint32_t padding : 18;
};
static_assert(sizeof(MIMG_instruction) == sizeof(Instruction) + 8, "Unexpected padding");
1230
/**
 * Flat/Scratch/Global Instructions
 * Operand(0): ADDR
 * Operand(1): SADDR
 * Operand(2) / Definition(0): DATA/VDST
 *
 */
struct FLAT_instruction : public Instruction {
   memory_sync_info sync;
   bool slc : 1; /* system level coherent */
   bool glc : 1; /* globally coherent */
   bool dlc : 1; /* NAVI: device level coherent */
   bool lds : 1;
   bool nv : 1;
   bool disable_wqm : 1; /* Require an exec mask without helper invocations */
   uint32_t padding0 : 2;
   uint16_t offset; /* Vega/Navi only */
   uint16_t padding1;
};
static_assert(sizeof(FLAT_instruction) == sizeof(Instruction) + 8, "Unexpected padding");
1251
/* Export instructions: write shader outputs to the export target. */
struct Export_instruction : public Instruction {
   uint8_t enabled_mask; /* per-component write mask */
   uint8_t dest; /* export target */
   bool compressed : 1;
   bool done : 1;
   bool valid_mask : 1;
   uint32_t padding : 13;
};
static_assert(sizeof(Export_instruction) == sizeof(Instruction) + 4, "Unexpected padding");
1261
/* Generic pseudo instruction (p_*), lowered to real instructions later. */
struct Pseudo_instruction : public Instruction {
   PhysReg scratch_sgpr; /* might not be valid if it's not needed */
   bool tmp_in_scc; /* NOTE(review): appears to mean SCC holds a live value — confirm in lowering */
   uint8_t padding;
};
static_assert(sizeof(Pseudo_instruction) == sizeof(Instruction) + 4, "Unexpected padding");
1268
/* Branch pseudo instruction, lowered to real jumps after block layout is final. */
struct Pseudo_branch_instruction : public Instruction {
   /* target[0] is the block index of the branch target.
    * For conditional branches, target[1] contains the fall-through alternative.
    * A value of 0 means the target has not been initialized (BB0 cannot be a branch target).
    */
   uint32_t target[2];
};
static_assert(sizeof(Pseudo_branch_instruction) == sizeof(Instruction) + 8, "Unexpected padding");
1277
/* Barrier pseudo instruction: memory semantics plus an execution-barrier scope. */
struct Pseudo_barrier_instruction : public Instruction {
   memory_sync_info sync;
   sync_scope exec_scope; /* scope of the execution barrier */
};
static_assert(sizeof(Pseudo_barrier_instruction) == sizeof(Instruction) + 4, "Unexpected padding");
1283
/* Reduction operations; the numeric suffix is the operand bit width. */
enum ReduceOp : uint16_t {
   iadd8, iadd16, iadd32, iadd64,
   imul8, imul16, imul32, imul64,
   fadd16, fadd32, fadd64,
   fmul16, fmul32, fmul64,
   imin8, imin16, imin32, imin64,
   imax8, imax16, imax32, imax64,
   umin8, umin16, umin32, umin64,
   umax8, umax16, umax32, umax64,
   fmin16, fmin32, fmin64,
   fmax16, fmax32, fmax64,
   iand8, iand16, iand32, iand64,
   ior8, ior16, ior32, ior64,
   ixor8, ixor16, ixor32, ixor64,
};
1299
/**
 * Subgroup Reduction Instructions, everything except for the data to be
 * reduced and the result as inserted by setup_reduce_temp().
 * Operand(0): data to be reduced
 * Operand(1): reduce temporary
 * Operand(2): vector temporary
 * Definition(0): result
 * Definition(1): scalar temporary
 * Definition(2): scalar identity temporary (not used to store identity on GFX10)
 * Definition(3): scc clobber
 * Definition(4): vcc clobber
 *
 */
struct Pseudo_reduction_instruction : public Instruction {
   ReduceOp reduce_op;
   uint16_t cluster_size; // must be 0 for scans
};
static_assert(sizeof(Pseudo_reduction_instruction) == sizeof(Instruction) + 4, "Unexpected padding");
1318
/* Deleter for aco_ptr: instructions are allocated with calloc() (see
 * create_instruction()), so they must be released with free() instead of
 * operator delete. */
struct instr_deleter_functor {
   /* const + noexcept: freeing memory never throws and the deleter can be
    * invoked through a const reference (e.g. from const contexts). */
   void operator()(void* p) const noexcept {
      free(p);
   }
};

/* Owning pointer to an Instruction allocated by create_instruction(). */
template<typename T>
using aco_ptr = std::unique_ptr<T, instr_deleter_functor>;
1327
/* Allocates an instruction of type T together with its operand and
 * definition arrays in one zero-initialized chunk of memory.
 * The arrays live directly behind the instruction struct itself. */
template<typename T>
T* create_instruction(aco_opcode opcode, Format format, uint32_t num_operands, uint32_t num_definitions)
{
   std::size_t size = sizeof(T) + num_operands * sizeof(Operand) + num_definitions * sizeof(Definition);
   char *data = (char*) calloc(1, size); /* calloc: fields/padding start out cleared */
   T* inst = (T*) data;

   inst->opcode = opcode;
   inst->format = format;

   /* The spans store a 16-bit offset relative to their own address instead
    * of a raw pointer (see aco::span in aco_util.h): operands start right
    * after the struct, definitions right after the operands. */
   uint16_t operands_offset = data + sizeof(T) - (char*)&inst->operands;
   inst->operands = aco::span<Operand>(operands_offset, num_operands);
   uint16_t definitions_offset = (char*)inst->operands.end() - (char*)&inst->definitions;
   inst->definitions = aco::span<Definition>(definitions_offset, num_definitions);

   return inst;
}
1345
usesModifiers()1346 constexpr bool Instruction::usesModifiers() const noexcept
1347 {
1348 if (isDPP() || isSDWA())
1349 return true;
1350
1351 if (format == Format::VOP3P) {
1352 const VOP3P_instruction *vop3p = static_cast<const VOP3P_instruction*>(this);
1353 for (unsigned i = 0; i < operands.size(); i++) {
1354 if (vop3p->neg_lo[i] || vop3p->neg_hi[i])
1355 return true;
1356 }
1357 return vop3p->opsel_lo || vop3p->opsel_hi || vop3p->clamp;
1358 } else if (isVOP3()) {
1359 const VOP3A_instruction *vop3 = static_cast<const VOP3A_instruction*>(this);
1360 for (unsigned i = 0; i < operands.size(); i++) {
1361 if (vop3->abs[i] || vop3->neg[i])
1362 return true;
1363 }
1364 return vop3->opsel || vop3->clamp || vop3->omod;
1365 }
1366 return false;
1367 }
1368
is_phi(Instruction * instr)1369 constexpr bool is_phi(Instruction* instr)
1370 {
1371 return instr->opcode == aco_opcode::p_phi || instr->opcode == aco_opcode::p_linear_phi;
1372 }
1373
is_phi(aco_ptr<Instruction> & instr)1374 static inline bool is_phi(aco_ptr<Instruction>& instr)
1375 {
1376 return is_phi(instr.get());
1377 }
1378
1379 memory_sync_info get_sync_info(const Instruction* instr);
1380
1381 bool is_dead(const std::vector<uint16_t>& uses, Instruction *instr);
1382
1383 bool can_use_opsel(chip_class chip, aco_opcode op, int idx, bool high);
1384 bool can_use_SDWA(chip_class chip, const aco_ptr<Instruction>& instr);
1385 /* updates "instr" and returns the old instruction (or NULL if no update was needed) */
1386 aco_ptr<Instruction> convert_to_SDWA(chip_class chip, aco_ptr<Instruction>& instr);
1387
1388 uint32_t get_reduction_identity(ReduceOp op, unsigned idx);
1389
/* Bit flags describing a block's role in the CFG; combined in Block::kind. */
enum block_kind {
   /* uniform indicates that leaving this block,
    * all actives lanes stay active */
   block_kind_uniform = 1 << 0,
   block_kind_top_level = 1 << 1,
   block_kind_loop_preheader = 1 << 2,
   block_kind_loop_header = 1 << 3,
   block_kind_loop_exit = 1 << 4,
   block_kind_continue = 1 << 5,
   block_kind_break = 1 << 6,
   block_kind_continue_or_break = 1 << 7,
   block_kind_discard = 1 << 8,
   block_kind_branch = 1 << 9,
   block_kind_merge = 1 << 10,
   block_kind_invert = 1 << 11,
   block_kind_uses_discard_if = 1 << 12,
   block_kind_needs_lowering = 1 << 13,
   block_kind_uses_demote = 1 << 14,
   block_kind_export_end = 1 << 15,
};
1410
1411
1412 struct RegisterDemand {
1413 constexpr RegisterDemand() = default;
RegisterDemandRegisterDemand1414 constexpr RegisterDemand(const int16_t v, const int16_t s) noexcept
1415 : vgpr{v}, sgpr{s} {}
1416 int16_t vgpr = 0;
1417 int16_t sgpr = 0;
1418
1419 constexpr friend bool operator==(const RegisterDemand a, const RegisterDemand b) noexcept {
1420 return a.vgpr == b.vgpr && a.sgpr == b.sgpr;
1421 }
1422
exceedsRegisterDemand1423 constexpr bool exceeds(const RegisterDemand other) const noexcept {
1424 return vgpr > other.vgpr || sgpr > other.sgpr;
1425 }
1426
1427 constexpr RegisterDemand operator+(const Temp t) const noexcept {
1428 if (t.type() == RegType::sgpr)
1429 return RegisterDemand( vgpr, sgpr + t.size() );
1430 else
1431 return RegisterDemand( vgpr + t.size(), sgpr );
1432 }
1433
1434 constexpr RegisterDemand operator+(const RegisterDemand other) const noexcept {
1435 return RegisterDemand(vgpr + other.vgpr, sgpr + other.sgpr);
1436 }
1437
1438 constexpr RegisterDemand operator-(const RegisterDemand other) const noexcept {
1439 return RegisterDemand(vgpr - other.vgpr, sgpr - other.sgpr);
1440 }
1441
1442 constexpr RegisterDemand& operator+=(const RegisterDemand other) noexcept {
1443 vgpr += other.vgpr;
1444 sgpr += other.sgpr;
1445 return *this;
1446 }
1447
1448 constexpr RegisterDemand& operator-=(const RegisterDemand other) noexcept {
1449 vgpr -= other.vgpr;
1450 sgpr -= other.sgpr;
1451 return *this;
1452 }
1453
1454 constexpr RegisterDemand& operator+=(const Temp t) noexcept {
1455 if (t.type() == RegType::sgpr)
1456 sgpr += t.size();
1457 else
1458 vgpr += t.size();
1459 return *this;
1460 }
1461
1462 constexpr RegisterDemand& operator-=(const Temp t) noexcept {
1463 if (t.type() == RegType::sgpr)
1464 sgpr -= t.size();
1465 else
1466 vgpr -= t.size();
1467 return *this;
1468 }
1469
updateRegisterDemand1470 constexpr void update(const RegisterDemand other) noexcept {
1471 vgpr = std::max(vgpr, other.vgpr);
1472 sgpr = std::max(sgpr, other.sgpr);
1473 }
1474
1475 };
1476
/* CFG */
struct Block {
   float_mode fp_mode; /* floating point mode in effect for this block */
   unsigned index; /* position inside Program::blocks */
   unsigned offset = 0;
   std::vector<aco_ptr<Instruction>> instructions;
   /* predecessors/successors in the logical and linear CFGs */
   std::vector<unsigned> logical_preds;
   std::vector<unsigned> linear_preds;
   std::vector<unsigned> logical_succs;
   std::vector<unsigned> linear_succs;
   RegisterDemand register_demand = RegisterDemand();
   uint16_t loop_nest_depth = 0;
   uint16_t kind = 0; /* bitmask of block_kind flags */
   int logical_idom = -1; /* immediate dominator index; -1 if not computed */
   int linear_idom = -1;
   Temp live_out_exec = Temp();

   /* this information is needed for predecessors to blocks with phis when
    * moving out of ssa */
   bool scc_live_out = false;
   PhysReg scratch_sgpr = PhysReg(); /* only needs to be valid if scc_live_out != false */

   Block(unsigned idx) : index(idx) {}
   Block() : index(0) {}
};
1502
/*
 * Shader stages as provided in Vulkan by the application. Contrast this to HWStage.
 * Values are bit flags so that merged stages can be expressed as a mask.
 */
enum class SWStage : uint8_t {
   None = 0,
   VS = 1 << 0, /* Vertex Shader */
   GS = 1 << 1, /* Geometry Shader */
   TCS = 1 << 2, /* Tessellation Control aka Hull Shader */
   TES = 1 << 3, /* Tessellation Evaluation aka Domain Shader */
   FS = 1 << 4, /* Fragment aka Pixel Shader */
   CS = 1 << 5, /* Compute Shader */
   GSCopy = 1 << 6, /* GS Copy Shader (internal) */

   /* Stage combinations merged to run on a single HWStage */
   VS_GS = VS | GS,
   VS_TCS = VS | TCS,
   TES_GS = TES | GS,
};
1521
1522 constexpr SWStage operator|(SWStage a, SWStage b) {
1523 return static_cast<SWStage>(static_cast<uint8_t>(a) | static_cast<uint8_t>(b));
1524 }
1525
/*
 * Shader stages as running on the AMD GPU.
 *
 * The relation between HWStages and SWStages is not a one-to-one mapping:
 * Some SWStages are merged by ACO to run on a single HWStage.
 * See README.md for details.
 */
enum class HWStage : uint8_t {
   VS, /* Hardware vertex stage; also runs TES and the GS copy shader (see the Stage constants below). */
   ES, /* Export shader: pre-GS (VS or TES) on GFX6-8. Combined into GS on GFX9 (and GFX10/legacy). */
   GS, /* Geometry shader on GFX10/legacy and GFX6-9. */
   NGG, /* Primitive shader, used to implement VS, TES, GS. */
   LS, /* Local shader: pre-TCS (VS) on GFX6-8. Combined into HS on GFX9 (and GFX10/legacy). */
   HS, /* Hull shader: TCS on GFX6-8. Merged VS and TCS on GFX9-10. */
   FS, /* Fragment/pixel shader. */
   CS, /* Compute shader. */
};
1543
1544 /*
1545 * Set of SWStages to be merged into a single shader paired with the
1546 * HWStage it will run on.
1547 */
1548 struct Stage {
1549 constexpr Stage() = default;
1550
StageStage1551 explicit constexpr Stage(HWStage hw_, SWStage sw_) : sw(sw_), hw(hw_) { }
1552
1553 /* Check if the given SWStage is included */
hasStage1554 constexpr bool has(SWStage stage) const {
1555 return (static_cast<uint8_t>(sw) & static_cast<uint8_t>(stage));
1556 }
1557
num_sw_stagesStage1558 unsigned num_sw_stages() const {
1559 return util_bitcount(static_cast<uint8_t>(sw));
1560 }
1561
1562 constexpr bool operator==(const Stage& other) const {
1563 return sw == other.sw && hw == other.hw;
1564 }
1565
1566 constexpr bool operator!=(const Stage& other) const {
1567 return sw != other.sw || hw != other.hw;
1568 }
1569
1570 /* Mask of merged software stages */
1571 SWStage sw = SWStage::None;
1572
1573 /* Active hardware stage */
1574 HWStage hw {};
1575 };
1576
/* possible settings of Program::stage */
static constexpr Stage vertex_vs(HWStage::VS, SWStage::VS);
static constexpr Stage fragment_fs(HWStage::FS, SWStage::FS);
static constexpr Stage compute_cs(HWStage::CS, SWStage::CS);
static constexpr Stage tess_eval_vs(HWStage::VS, SWStage::TES);
static constexpr Stage gs_copy_vs(HWStage::VS, SWStage::GSCopy);
/* GFX10/NGG */
static constexpr Stage vertex_ngg(HWStage::NGG, SWStage::VS);
static constexpr Stage vertex_geometry_ngg(HWStage::NGG, SWStage::VS_GS);
static constexpr Stage tess_eval_ngg(HWStage::NGG, SWStage::TES);
static constexpr Stage tess_eval_geometry_ngg(HWStage::NGG, SWStage::TES_GS);
/* GFX9 (and GFX10 if NGG isn't used) */
static constexpr Stage vertex_geometry_gs(HWStage::GS, SWStage::VS_GS);
static constexpr Stage vertex_tess_control_hs(HWStage::HS, SWStage::VS_TCS);
static constexpr Stage tess_eval_geometry_gs(HWStage::GS, SWStage::TES_GS);
/* pre-GFX9 */
static constexpr Stage vertex_ls(HWStage::LS, SWStage::VS); /* vertex before tessellation control */
static constexpr Stage vertex_es(HWStage::ES, SWStage::VS); /* vertex before geometry */
static constexpr Stage tess_control_hs(HWStage::HS, SWStage::TCS);
static constexpr Stage tess_eval_es(HWStage::ES, SWStage::TES); /* tessellation evaluation before geometry */
static constexpr Stage geometry_gs(HWStage::GS, SWStage::GS);
1598
/* Indices into Program::statistics, gathered when collect_statistics is set. */
enum statistic {
   statistic_hash,
   statistic_instructions,
   statistic_copies,
   statistic_branches,
   statistic_cycles,
   statistic_vmem_clauses,
   statistic_smem_clauses,
   statistic_vmem_score,
   statistic_smem_score,
   statistic_sgpr_presched,
   statistic_vgpr_presched,
   num_statistics
};
1613
/* Top-level container for a shader being compiled: the CFG, per-temp
 * register classes, target information and resource limits. */
class Program final {
public:
   float_mode next_fp_mode; /* fp_mode assigned to newly created blocks */
   std::vector<Block> blocks;
   /* register class per temp id; entry 0 is a placeholder since ids start at 1 */
   std::vector<RegClass> temp_rc = {s1};
   RegisterDemand max_reg_demand = RegisterDemand();
   uint16_t num_waves = 0;
   uint16_t max_waves = 0; /* maximum number of waves, regardless of register usage */
   ac_shader_config* config;
   struct radv_shader_info *info;
   enum chip_class chip_class;
   enum radeon_family family;
   unsigned wave_size;
   RegClass lane_mask;
   Stage stage;
   bool needs_exact = false; /* there exists an instruction with disable_wqm = true */
   bool needs_wqm = false; /* there exists a p_wqm instruction */
   bool wb_smem_l1_on_end = false;

   std::vector<uint8_t> constant_data;
   Temp private_segment_buffer;
   Temp scratch_offset;

   uint16_t min_waves = 0;
   uint16_t lds_alloc_granule;
   uint32_t lds_limit; /* in bytes */
   bool has_16bank_lds;
   uint16_t vgpr_limit;
   uint16_t sgpr_limit;
   uint16_t physical_sgprs;
   uint16_t sgpr_alloc_granule; /* minus one. must be power of two */
   uint16_t vgpr_alloc_granule; /* minus one. must be power of two */
   unsigned workgroup_size; /* if known; otherwise UINT_MAX */

   bool xnack_enabled = false;
   bool sram_ecc_enabled = false;
   bool has_fast_fma32 = false;

   bool needs_vcc = false;
   bool needs_flat_scr = false;

   bool collect_statistics = false;
   uint32_t statistics[num_statistics]; /* indexed by the statistic enum */

   /* callback for reporting compiler diagnostics */
   struct {
      void (*func)(void *private_data,
                   enum radv_compiler_debug_level level,
                   const char *message);
      void *private_data;
   } debug;

   /* Allocate a fresh temporary id and record its register class. */
   uint32_t allocateId(RegClass rc)
   {
      assert(allocationID <= 16777215); /* ids are limited to 24 bits */
      temp_rc.push_back(rc);
      return allocationID++;
   }

   /* Reserve "amount" consecutive ids (register classes left default-constructed). */
   void allocateRange(unsigned amount)
   {
      assert(allocationID + amount <= 16777216);
      temp_rc.resize(temp_rc.size() + amount);
      allocationID += amount;
   }

   /* Allocate a fresh temporary of the given register class. */
   Temp allocateTmp(RegClass rc)
   {
      return Temp(allocateId(rc), rc);
   }

   /* The next id allocateId() would return, without allocating it. */
   uint32_t peekAllocationId()
   {
      return allocationID;
   }

   /* Append a new block and initialize its index and fp_mode. */
   Block* create_and_insert_block() {
      blocks.emplace_back(blocks.size());
      blocks.back().fp_mode = next_fp_mode;
      return &blocks.back();
   }

   /* Append an existing block, overwriting its index and fp_mode. */
   Block* insert_block(Block&& block) {
      block.index = blocks.size();
      block.fp_mode = next_fp_mode;
      blocks.emplace_back(std::move(block));
      return &blocks.back();
   }

private:
   uint32_t allocationID = 1; /* temp ids start at 1; id 0 stays unused */
};
1705
/* Result of live_var_analysis(). */
struct live {
   /* live temps out per block */
   std::vector<IDSet> live_out;
   /* register demand (sgpr/vgpr) per instruction per block */
   std::vector<std::vector<RegisterDemand>> register_demand;
};
1712
1713 void init();
1714
1715 void init_program(Program *program, Stage stage, struct radv_shader_info *info,
1716 enum chip_class chip_class, enum radeon_family family,
1717 ac_shader_config *config);
1718
1719 void select_program(Program *program,
1720 unsigned shader_count,
1721 struct nir_shader *const *shaders,
1722 ac_shader_config* config,
1723 struct radv_shader_args *args);
1724 void select_gs_copy_shader(Program *program, struct nir_shader *gs_shader,
1725 ac_shader_config* config,
1726 struct radv_shader_args *args);
1727 void select_trap_handler_shader(Program *program, struct nir_shader *shader,
1728 ac_shader_config* config,
1729 struct radv_shader_args *args);
1730
1731 void lower_phis(Program* program);
1732 void calc_min_waves(Program* program);
1733 void update_vgpr_sgpr_demand(Program* program, const RegisterDemand new_demand);
1734 live live_var_analysis(Program* program);
1735 std::vector<uint16_t> dead_code_analysis(Program *program);
1736 void dominator_tree(Program* program);
1737 void insert_exec_mask(Program *program);
1738 void value_numbering(Program* program);
1739 void optimize(Program* program);
1740 void setup_reduce_temp(Program* program);
1741 void lower_to_cssa(Program* program, live& live_vars);
1742 void register_allocation(Program *program, std::vector<IDSet>& live_out_per_block);
1743 void ssa_elimination(Program* program);
1744 void lower_to_hw_instr(Program* program);
1745 void schedule_program(Program* program, live& live_vars);
1746 void spill(Program* program, live& live_vars);
1747 void insert_wait_states(Program* program);
1748 void insert_NOPs(Program* program);
1749 void form_hard_clauses(Program *program);
1750 unsigned emit_program(Program* program, std::vector<uint32_t>& code);
1751 bool print_asm(Program *program, std::vector<uint32_t>& binary,
1752 unsigned exec_size, FILE *output);
1753 bool validate_ir(Program* program);
1754 bool validate_ra(Program* program);
1755 #ifndef NDEBUG
1756 void perfwarn(Program *program, bool cond, const char *msg, Instruction *instr=NULL);
1757 #else
1758 #define perfwarn(program, cond, msg, ...) do {} while(0)
1759 #endif
1760
1761 void collect_presched_stats(Program *program);
1762 void collect_preasm_stats(Program *program);
1763 void collect_postasm_stats(Program *program, const std::vector<uint32_t>& code);
1764
1765 void aco_print_instr(const Instruction *instr, FILE *output);
1766 void aco_print_program(const Program *program, FILE *output);
1767
1768 void _aco_perfwarn(Program *program, const char *file, unsigned line,
1769 const char *fmt, ...);
1770 void _aco_err(Program *program, const char *file, unsigned line,
1771 const char *fmt, ...);
1772
1773 #define aco_perfwarn(program, ...) _aco_perfwarn(program, __FILE__, __LINE__, __VA_ARGS__)
1774 #define aco_err(program, ...) _aco_err(program, __FILE__, __LINE__, __VA_ARGS__)
1775
1776 /* utilities for dealing with register demand */
1777 RegisterDemand get_live_changes(aco_ptr<Instruction>& instr);
1778 RegisterDemand get_temp_registers(aco_ptr<Instruction>& instr);
1779 RegisterDemand get_demand_before(RegisterDemand demand, aco_ptr<Instruction>& instr, aco_ptr<Instruction>& instr_before);
1780
/* number of sgprs that need to be allocated but might not be addressable as s0-s105 */
1782 uint16_t get_extra_sgprs(Program *program);
1783
1784 /* get number of sgprs/vgprs allocated required to address a number of sgprs/vgprs */
1785 uint16_t get_sgpr_alloc(Program *program, uint16_t addressable_sgprs);
1786 uint16_t get_vgpr_alloc(Program *program, uint16_t addressable_vgprs);
1787
1788 /* return number of addressable sgprs/vgprs for max_waves */
1789 uint16_t get_addr_sgpr_from_waves(Program *program, uint16_t max_waves);
1790 uint16_t get_addr_vgpr_from_waves(Program *program, uint16_t max_waves);
1791
/* Static per-opcode information tables, indexed by aco_opcode. */
typedef struct {
   /* hardware opcode per generation; negative if unsupported there */
   const int16_t opcode_gfx7[static_cast<int>(aco_opcode::num_opcodes)];
   const int16_t opcode_gfx9[static_cast<int>(aco_opcode::num_opcodes)];
   const int16_t opcode_gfx10[static_cast<int>(aco_opcode::num_opcodes)];
   const std::bitset<static_cast<int>(aco_opcode::num_opcodes)> can_use_input_modifiers;
   const std::bitset<static_cast<int>(aco_opcode::num_opcodes)> can_use_output_modifiers;
   const std::bitset<static_cast<int>(aco_opcode::num_opcodes)> is_atomic;
   const char *name[static_cast<int>(aco_opcode::num_opcodes)];
   const aco::Format format[static_cast<int>(aco_opcode::num_opcodes)];
   /* sizes used for input/output modifiers and constants */
   const unsigned operand_size[static_cast<int>(aco_opcode::num_opcodes)];
   const unsigned definition_size[static_cast<int>(aco_opcode::num_opcodes)];
} Info;
1805
1806 extern const Info instr_info;
1807
1808 }
1809
1810 #endif /* ACO_IR_H */
1811
1812