• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 /*
2  * Copyright © 2020 Valve Corporation
3  *
4  * SPDX-License-Identifier: MIT
5  */
6 
7 #include "aco_ir.h"
8 
9 #include "aco_builder.h"
10 
11 #include "util/u_debug.h"
12 
13 #include "c11/threads.h"
14 
15 namespace aco {
16 
17 thread_local aco::monotonic_buffer_resource* instruction_buffer = nullptr;
18 
19 uint64_t debug_flags = 0;
20 
21 static const struct debug_control aco_debug_options[] = {
22    {"validateir", DEBUG_VALIDATE_IR},
23    {"validatera", DEBUG_VALIDATE_RA},
24    {"validate-livevars", DEBUG_VALIDATE_LIVE_VARS},
25    {"novalidateir", DEBUG_NO_VALIDATE_IR},
26    {"force-waitcnt", DEBUG_FORCE_WAITCNT},
27    {"force-waitdeps", DEBUG_FORCE_WAITDEPS},
28    {"novn", DEBUG_NO_VN},
29    {"noopt", DEBUG_NO_OPT},
30    {"nosched", DEBUG_NO_SCHED | DEBUG_NO_SCHED_ILP | DEBUG_NO_SCHED_VOPD},
31    {"nosched-ilp", DEBUG_NO_SCHED_ILP},
32    {"nosched-vopd", DEBUG_NO_SCHED_VOPD},
33    {"perfinfo", DEBUG_PERF_INFO},
34    {"liveinfo", DEBUG_LIVE_INFO},
35    {NULL, 0}};
36 
37 static once_flag init_once_flag = ONCE_FLAG_INIT;
38 
39 static void
init_once()40 init_once()
41 {
42    debug_flags = parse_debug_string(getenv("ACO_DEBUG"), aco_debug_options);
43 
44 #ifndef NDEBUG
45    /* enable some flags by default on debug builds */
46    debug_flags |= aco::DEBUG_VALIDATE_IR;
47 #endif
48 
49    if (debug_flags & aco::DEBUG_NO_VALIDATE_IR)
50       debug_flags &= ~aco::DEBUG_VALIDATE_IR;
51 }
52 
53 void
init()54 init()
55 {
56    call_once(&init_once_flag, init_once);
57 }
58 
59 void
init_program(Program * program,Stage stage,const struct aco_shader_info * info,enum amd_gfx_level gfx_level,enum radeon_family family,bool wgp_mode,ac_shader_config * config)60 init_program(Program* program, Stage stage, const struct aco_shader_info* info,
61              enum amd_gfx_level gfx_level, enum radeon_family family, bool wgp_mode,
62              ac_shader_config* config)
63 {
64    instruction_buffer = &program->m;
65    program->stage = stage;
66    program->config = config;
67    program->info = *info;
68    program->gfx_level = gfx_level;
69    if (family == CHIP_UNKNOWN) {
70       switch (gfx_level) {
71       case GFX6: program->family = CHIP_TAHITI; break;
72       case GFX7: program->family = CHIP_BONAIRE; break;
73       case GFX8: program->family = CHIP_POLARIS10; break;
74       case GFX9: program->family = CHIP_VEGA10; break;
75       case GFX10: program->family = CHIP_NAVI10; break;
76       case GFX10_3: program->family = CHIP_NAVI21; break;
77       case GFX11: program->family = CHIP_NAVI31; break;
78       case GFX12: program->family = CHIP_GFX1200; break;
79       default: program->family = CHIP_UNKNOWN; break;
80       }
81    } else {
82       program->family = family;
83    }
84    program->wave_size = info->wave_size;
85    program->lane_mask = program->wave_size == 32 ? s1 : s2;
86 
87    program->dev.lds_encoding_granule = gfx_level >= GFX11 && stage == fragment_fs ? 1024
88                                        : gfx_level >= GFX7                        ? 512
89                                                                                   : 256;
90    program->dev.lds_alloc_granule = gfx_level >= GFX10_3 ? 1024 : program->dev.lds_encoding_granule;
91 
92    /* GFX6: There is 64KB LDS per CU, but a single workgroup can only use 32KB. */
93    program->dev.lds_limit = gfx_level >= GFX7 ? 65536 : 32768;
94 
95    /* apparently gfx702 also has 16-bank LDS but I can't find a family for that */
96    program->dev.has_16bank_lds = family == CHIP_KABINI || family == CHIP_STONEY;
97 
98    program->dev.vgpr_limit = stage == raytracing_cs ? 128 : 256;
99    program->dev.physical_vgprs = 256;
100    program->dev.vgpr_alloc_granule = 4;
101 
102    if (gfx_level >= GFX10) {
103       program->dev.physical_sgprs = 128 * 20; /* enough for max waves */
104       program->dev.sgpr_alloc_granule = 128;
105       program->dev.sgpr_limit =
106          108; /* includes VCC, which can be treated as s[106-107] on GFX10+ */
107 
108       if (family == CHIP_NAVI31 || family == CHIP_NAVI32 || family == CHIP_GFX1151 ||
109           gfx_level >= GFX12) {
110          program->dev.physical_vgprs = program->wave_size == 32 ? 1536 : 768;
111          program->dev.vgpr_alloc_granule = program->wave_size == 32 ? 24 : 12;
112       } else {
113          program->dev.physical_vgprs = program->wave_size == 32 ? 1024 : 512;
114          if (gfx_level >= GFX10_3)
115             program->dev.vgpr_alloc_granule = program->wave_size == 32 ? 16 : 8;
116          else
117             program->dev.vgpr_alloc_granule = program->wave_size == 32 ? 8 : 4;
118       }
119    } else if (program->gfx_level >= GFX8) {
120       program->dev.physical_sgprs = 800;
121       program->dev.sgpr_alloc_granule = 16;
122       program->dev.sgpr_limit = 102;
123       if (family == CHIP_TONGA || family == CHIP_ICELAND)
124          program->dev.sgpr_alloc_granule = 96; /* workaround hardware bug */
125    } else {
126       program->dev.physical_sgprs = 512;
127       program->dev.sgpr_alloc_granule = 8;
128       program->dev.sgpr_limit = 104;
129    }
130 
131    program->dev.scratch_alloc_granule = gfx_level >= GFX11 ? 256 : 1024;
132 
133    program->dev.max_waves_per_simd = 10;
134    if (program->gfx_level >= GFX10_3)
135       program->dev.max_waves_per_simd = 16;
136    else if (program->gfx_level == GFX10)
137       program->dev.max_waves_per_simd = 20;
138    else if (program->family >= CHIP_POLARIS10 && program->family <= CHIP_VEGAM)
139       program->dev.max_waves_per_simd = 8;
140 
141    program->dev.simd_per_cu = program->gfx_level >= GFX10 ? 2 : 4;
142 
143    switch (program->family) {
144    /* GFX8 APUs */
145    case CHIP_CARRIZO:
146    case CHIP_STONEY:
147    /* GFX9 APUS */
148    case CHIP_RAVEN:
149    case CHIP_RAVEN2:
150    case CHIP_RENOIR: program->dev.xnack_enabled = true; break;
151    default: break;
152    }
153 
154    program->dev.sram_ecc_enabled = program->family == CHIP_MI100;
155    /* apparently gfx702 also has fast v_fma_f32 but I can't find a family for that */
156    program->dev.has_fast_fma32 = program->gfx_level >= GFX9;
157    if (program->family == CHIP_TAHITI || program->family == CHIP_CARRIZO ||
158        program->family == CHIP_HAWAII)
159       program->dev.has_fast_fma32 = true;
160    program->dev.has_mac_legacy32 = program->gfx_level <= GFX7 || program->gfx_level == GFX10;
161    program->dev.has_fmac_legacy32 = program->gfx_level >= GFX10_3 && program->gfx_level < GFX12;
162 
163    program->dev.fused_mad_mix = program->gfx_level >= GFX10;
164    if (program->family == CHIP_VEGA12 || program->family == CHIP_VEGA20 ||
165        program->family == CHIP_MI100 || program->family == CHIP_MI200)
166       program->dev.fused_mad_mix = true;
167 
168    if (program->gfx_level >= GFX11) {
169       program->dev.scratch_global_offset_min = -4096;
170       program->dev.scratch_global_offset_max = 4095;
171    } else if (program->gfx_level >= GFX10 || program->gfx_level == GFX8) {
172       program->dev.scratch_global_offset_min = -2048;
173       program->dev.scratch_global_offset_max = 2047;
174    } else if (program->gfx_level == GFX9) {
175       /* The minimum is actually -4096, but negative offsets are broken when SADDR is used. */
176       program->dev.scratch_global_offset_min = 0;
177       program->dev.scratch_global_offset_max = 4095;
178    }
179 
180    if (program->gfx_level >= GFX12) {
181       /* Same as GFX11, except one less for VSAMPLE. */
182       program->dev.max_nsa_vgprs = 3;
183    } else if (program->gfx_level >= GFX11) {
184       /* GFX11 can have only 1 NSA dword. The last VGPR isn't included here because it contains the
185        * rest of the address.
186        */
187       program->dev.max_nsa_vgprs = 4;
188    } else if (program->gfx_level >= GFX10_3) {
189       /* GFX10.3 can have up to 3 NSA dwords. */
190       program->dev.max_nsa_vgprs = 13;
191    } else if (program->gfx_level >= GFX10) {
192       /* Limit NSA instructions to 1 NSA dword on GFX10 to avoid stability issues. */
193       program->dev.max_nsa_vgprs = 5;
194    } else {
195       program->dev.max_nsa_vgprs = 0;
196    }
197 
198    program->wgp_mode = wgp_mode;
199 
200    program->progress = CompilationProgress::after_isel;
201 
202    program->next_fp_mode.must_flush_denorms32 = false;
203    program->next_fp_mode.must_flush_denorms16_64 = false;
204    program->next_fp_mode.care_about_round32 = false;
205    program->next_fp_mode.care_about_round16_64 = false;
206    program->next_fp_mode.denorm16_64 = fp_denorm_keep;
207    program->next_fp_mode.denorm32 = 0;
208    program->next_fp_mode.round16_64 = fp_round_ne;
209    program->next_fp_mode.round32 = fp_round_ne;
210 }
211 
212 bool
is_wait_export_ready(amd_gfx_level gfx_level,const Instruction * instr)213 is_wait_export_ready(amd_gfx_level gfx_level, const Instruction* instr)
214 {
215    return instr->opcode == aco_opcode::s_wait_event &&
216           (gfx_level >= GFX12 ? (instr->salu().imm & wait_event_imm_wait_export_ready_gfx12)
217                               : !(instr->salu().imm & wait_event_imm_dont_wait_export_ready_gfx11));
218 }
219 
220 memory_sync_info
get_sync_info(const Instruction * instr)221 get_sync_info(const Instruction* instr)
222 {
223    /* Primitive Ordered Pixel Shading barriers necessary for accesses to memory shared between
224     * overlapping waves in the queue family.
225     */
226    if (instr->opcode == aco_opcode::p_pops_gfx9_overlapped_wave_wait_done ||
227        instr->opcode == aco_opcode::s_wait_event) {
228       return memory_sync_info(storage_buffer | storage_image, semantic_acquire, scope_queuefamily);
229    } else if (instr->opcode == aco_opcode::p_pops_gfx9_ordered_section_done) {
230       return memory_sync_info(storage_buffer | storage_image, semantic_release, scope_queuefamily);
231    }
232 
233    switch (instr->format) {
234    case Format::SMEM: return instr->smem().sync;
235    case Format::MUBUF: return instr->mubuf().sync;
236    case Format::MIMG: return instr->mimg().sync;
237    case Format::MTBUF: return instr->mtbuf().sync;
238    case Format::FLAT:
239    case Format::GLOBAL:
240    case Format::SCRATCH: return instr->flatlike().sync;
241    case Format::DS: return instr->ds().sync;
242    case Format::LDSDIR: return instr->ldsdir().sync;
243    default: return memory_sync_info();
244    }
245 }
246 
247 bool
can_use_SDWA(amd_gfx_level gfx_level,const aco_ptr<Instruction> & instr,bool pre_ra)248 can_use_SDWA(amd_gfx_level gfx_level, const aco_ptr<Instruction>& instr, bool pre_ra)
249 {
250    if (!instr->isVALU())
251       return false;
252 
253    if (gfx_level < GFX8 || gfx_level >= GFX11 || instr->isDPP() || instr->isVOP3P())
254       return false;
255 
256    if (instr->isSDWA())
257       return true;
258 
259    if (instr->isVOP3()) {
260       VALU_instruction& vop3 = instr->valu();
261       if (instr->format == Format::VOP3)
262          return false;
263       if (vop3.clamp && instr->isVOPC() && gfx_level != GFX8)
264          return false;
265       if (vop3.omod && gfx_level < GFX9)
266          return false;
267 
268       // TODO: return true if we know we will use vcc
269       if (!pre_ra && instr->definitions.size() >= 2)
270          return false;
271 
272       for (unsigned i = 1; i < instr->operands.size(); i++) {
273          if (instr->operands[i].isLiteral())
274             return false;
275          if (gfx_level < GFX9 && !instr->operands[i].isOfType(RegType::vgpr))
276             return false;
277       }
278    }
279 
280    if (!instr->definitions.empty() && instr->definitions[0].bytes() > 4 && !instr->isVOPC())
281       return false;
282 
283    if (!instr->operands.empty()) {
284       if (instr->operands[0].isLiteral())
285          return false;
286       if (gfx_level < GFX9 && !instr->operands[0].isOfType(RegType::vgpr))
287          return false;
288       if (instr->operands[0].bytes() > 4)
289          return false;
290       if (instr->operands.size() > 1 && instr->operands[1].bytes() > 4)
291          return false;
292    }
293 
294    bool is_mac = instr->opcode == aco_opcode::v_mac_f32 || instr->opcode == aco_opcode::v_mac_f16 ||
295                  instr->opcode == aco_opcode::v_fmac_f32 || instr->opcode == aco_opcode::v_fmac_f16;
296 
297    if (gfx_level != GFX8 && is_mac)
298       return false;
299 
300    // TODO: return true if we know we will use vcc
301    if (!pre_ra && instr->isVOPC() && gfx_level == GFX8)
302       return false;
303    if (!pre_ra && instr->operands.size() >= 3 && !is_mac)
304       return false;
305 
306    return instr->opcode != aco_opcode::v_madmk_f32 && instr->opcode != aco_opcode::v_madak_f32 &&
307           instr->opcode != aco_opcode::v_madmk_f16 && instr->opcode != aco_opcode::v_madak_f16 &&
308           instr->opcode != aco_opcode::v_fmamk_f32 && instr->opcode != aco_opcode::v_fmaak_f32 &&
309           instr->opcode != aco_opcode::v_fmamk_f16 && instr->opcode != aco_opcode::v_fmaak_f16 &&
310           instr->opcode != aco_opcode::v_readfirstlane_b32 &&
311           instr->opcode != aco_opcode::v_clrexcp && instr->opcode != aco_opcode::v_swap_b32;
312 }
313 
314 /* updates "instr" and returns the old instruction (or NULL if no update was needed) */
315 aco_ptr<Instruction>
convert_to_SDWA(amd_gfx_level gfx_level,aco_ptr<Instruction> & instr)316 convert_to_SDWA(amd_gfx_level gfx_level, aco_ptr<Instruction>& instr)
317 {
318    if (instr->isSDWA())
319       return NULL;
320 
321    aco_ptr<Instruction> tmp = std::move(instr);
322    Format format = asSDWA(withoutVOP3(tmp->format));
323    instr.reset(
324       create_instruction(tmp->opcode, format, tmp->operands.size(), tmp->definitions.size()));
325    std::copy(tmp->operands.cbegin(), tmp->operands.cend(), instr->operands.begin());
326    std::copy(tmp->definitions.cbegin(), tmp->definitions.cend(), instr->definitions.begin());
327 
328    SDWA_instruction& sdwa = instr->sdwa();
329 
330    if (tmp->isVOP3()) {
331       VALU_instruction& vop3 = tmp->valu();
332       sdwa.neg = vop3.neg;
333       sdwa.abs = vop3.abs;
334       sdwa.omod = vop3.omod;
335       sdwa.clamp = vop3.clamp;
336    }
337 
338    for (unsigned i = 0; i < instr->operands.size(); i++) {
339       /* SDWA only uses operands 0 and 1. */
340       if (i >= 2)
341          break;
342 
343       sdwa.sel[i] = SubdwordSel(instr->operands[i].bytes(), 0, false);
344    }
345 
346    sdwa.dst_sel = SubdwordSel(instr->definitions[0].bytes(), 0, false);
347 
348    if (instr->definitions[0].getTemp().type() == RegType::sgpr && gfx_level == GFX8)
349       instr->definitions[0].setPrecolored(vcc);
350    if (instr->definitions.size() >= 2)
351       instr->definitions[1].setPrecolored(vcc);
352    if (instr->operands.size() >= 3)
353       instr->operands[2].setPrecolored(vcc);
354 
355    instr->pass_flags = tmp->pass_flags;
356 
357    return tmp;
358 }
359 
360 bool
can_use_DPP(amd_gfx_level gfx_level,const aco_ptr<Instruction> & instr,bool dpp8)361 can_use_DPP(amd_gfx_level gfx_level, const aco_ptr<Instruction>& instr, bool dpp8)
362 {
363    assert(instr->isVALU() && !instr->operands.empty());
364 
365    if (instr->isDPP())
366       return instr->isDPP8() == dpp8;
367 
368    if (instr->isSDWA() || instr->isVINTERP_INREG())
369       return false;
370 
371    if ((instr->format == Format::VOP3 || instr->isVOP3P()) && gfx_level < GFX11)
372       return false;
373 
374    if ((instr->isVOPC() || instr->definitions.size() > 1) && instr->definitions.back().isFixed() &&
375        instr->definitions.back().physReg() != vcc && gfx_level < GFX11)
376       return false;
377 
378    if (instr->operands.size() >= 3 && instr->operands[2].isFixed() &&
379        instr->operands[2].isOfType(RegType::sgpr) && instr->operands[2].physReg() != vcc &&
380        gfx_level < GFX11)
381       return false;
382 
383    if (instr->isVOP3() && gfx_level < GFX11) {
384       const VALU_instruction* vop3 = &instr->valu();
385       if (vop3->clamp || vop3->omod)
386          return false;
387       if (dpp8)
388          return false;
389    }
390 
391    for (unsigned i = 0; i < instr->operands.size(); i++) {
392       if (instr->operands[i].isLiteral())
393          return false;
394       if (!instr->operands[i].isOfType(RegType::vgpr) && i < 2)
395          return false;
396    }
397 
398    /* According to LLVM, it's unsafe to combine DPP into v_cmpx. */
399    if (instr->writes_exec())
400       return false;
401 
402    /* simpler than listing all VOP3P opcodes which do not support DPP */
403    if (instr->isVOP3P()) {
404       return instr->opcode == aco_opcode::v_fma_mix_f32 ||
405              instr->opcode == aco_opcode::v_fma_mixlo_f16 ||
406              instr->opcode == aco_opcode::v_fma_mixhi_f16 ||
407              instr->opcode == aco_opcode::v_dot2_f32_f16 ||
408              instr->opcode == aco_opcode::v_dot2_f32_bf16;
409    }
410 
411    if (instr->opcode == aco_opcode::v_pk_fmac_f16)
412       return gfx_level < GFX11;
413 
414    /* there are more cases but those all take 64-bit inputs */
415    return instr->opcode != aco_opcode::v_madmk_f32 && instr->opcode != aco_opcode::v_madak_f32 &&
416           instr->opcode != aco_opcode::v_madmk_f16 && instr->opcode != aco_opcode::v_madak_f16 &&
417           instr->opcode != aco_opcode::v_fmamk_f32 && instr->opcode != aco_opcode::v_fmaak_f32 &&
418           instr->opcode != aco_opcode::v_fmamk_f16 && instr->opcode != aco_opcode::v_fmaak_f16 &&
419           instr->opcode != aco_opcode::v_readfirstlane_b32 &&
420           instr->opcode != aco_opcode::v_cvt_f64_i32 &&
421           instr->opcode != aco_opcode::v_cvt_f64_f32 &&
422           instr->opcode != aco_opcode::v_cvt_f64_u32 && instr->opcode != aco_opcode::v_mul_lo_u32 &&
423           instr->opcode != aco_opcode::v_mul_lo_i32 && instr->opcode != aco_opcode::v_mul_hi_u32 &&
424           instr->opcode != aco_opcode::v_mul_hi_i32 &&
425           instr->opcode != aco_opcode::v_qsad_pk_u16_u8 &&
426           instr->opcode != aco_opcode::v_mqsad_pk_u16_u8 &&
427           instr->opcode != aco_opcode::v_mqsad_u32_u8 &&
428           instr->opcode != aco_opcode::v_mad_u64_u32 &&
429           instr->opcode != aco_opcode::v_mad_i64_i32 &&
430           instr->opcode != aco_opcode::v_permlane16_b32 &&
431           instr->opcode != aco_opcode::v_permlanex16_b32 &&
432           instr->opcode != aco_opcode::v_permlane64_b32 &&
433           instr->opcode != aco_opcode::v_readlane_b32_e64 &&
434           instr->opcode != aco_opcode::v_writelane_b32_e64 &&
435           instr->opcode != aco_opcode::p_v_cvt_pk_u8_f32;
436 }
437 
438 aco_ptr<Instruction>
convert_to_DPP(amd_gfx_level gfx_level,aco_ptr<Instruction> & instr,bool dpp8)439 convert_to_DPP(amd_gfx_level gfx_level, aco_ptr<Instruction>& instr, bool dpp8)
440 {
441    if (instr->isDPP())
442       return NULL;
443 
444    aco_ptr<Instruction> tmp = std::move(instr);
445    Format format =
446       (Format)((uint32_t)tmp->format | (uint32_t)(dpp8 ? Format::DPP8 : Format::DPP16));
447    if (dpp8)
448       instr.reset(
449          create_instruction(tmp->opcode, format, tmp->operands.size(), tmp->definitions.size()));
450    else
451       instr.reset(
452          create_instruction(tmp->opcode, format, tmp->operands.size(), tmp->definitions.size()));
453    std::copy(tmp->operands.cbegin(), tmp->operands.cend(), instr->operands.begin());
454    std::copy(tmp->definitions.cbegin(), tmp->definitions.cend(), instr->definitions.begin());
455 
456    if (dpp8) {
457       DPP8_instruction* dpp = &instr->dpp8();
458       dpp->lane_sel = 0xfac688; /* [0,1,2,3,4,5,6,7] */
459       dpp->fetch_inactive = gfx_level >= GFX10;
460    } else {
461       DPP16_instruction* dpp = &instr->dpp16();
462       dpp->dpp_ctrl = dpp_quad_perm(0, 1, 2, 3);
463       dpp->row_mask = 0xf;
464       dpp->bank_mask = 0xf;
465       dpp->fetch_inactive = gfx_level >= GFX10;
466    }
467 
468    instr->valu().neg = tmp->valu().neg;
469    instr->valu().abs = tmp->valu().abs;
470    instr->valu().omod = tmp->valu().omod;
471    instr->valu().clamp = tmp->valu().clamp;
472    instr->valu().opsel = tmp->valu().opsel;
473    instr->valu().opsel_lo = tmp->valu().opsel_lo;
474    instr->valu().opsel_hi = tmp->valu().opsel_hi;
475 
476    if ((instr->isVOPC() || instr->definitions.size() > 1) && gfx_level < GFX11)
477       instr->definitions.back().setPrecolored(vcc);
478 
479    if (instr->operands.size() >= 3 && instr->operands[2].isOfType(RegType::sgpr) &&
480        gfx_level < GFX11)
481       instr->operands[2].setPrecolored(vcc);
482 
483    instr->pass_flags = tmp->pass_flags;
484 
485    /* DPP16 supports input modifiers, so we might no longer need VOP3. */
486    bool remove_vop3 = !dpp8 && !instr->valu().omod && !instr->valu().clamp &&
487                       (instr->isVOP1() || instr->isVOP2() || instr->isVOPC());
488 
489    /* VOPC/add_co/sub_co definition needs VCC without VOP3. */
490    remove_vop3 &= instr->definitions.back().regClass().type() != RegType::sgpr ||
491                   !instr->definitions.back().isFixed() ||
492                   instr->definitions.back().physReg() == vcc;
493 
494    /* addc/subb/cndmask 3rd operand needs VCC without VOP3. */
495    remove_vop3 &= instr->operands.size() < 3 || !instr->operands[2].isFixed() ||
496                   instr->operands[2].isOfType(RegType::vgpr) || instr->operands[2].physReg() == vcc;
497 
498    if (remove_vop3)
499       instr->format = withoutVOP3(instr->format);
500 
501    return tmp;
502 }
503 
504 bool
can_use_input_modifiers(amd_gfx_level gfx_level,aco_opcode op,int idx)505 can_use_input_modifiers(amd_gfx_level gfx_level, aco_opcode op, int idx)
506 {
507    if (op == aco_opcode::v_mov_b32)
508       return gfx_level >= GFX10;
509 
510    if (op == aco_opcode::v_ldexp_f16 || op == aco_opcode::v_ldexp_f32 ||
511        op == aco_opcode::v_ldexp_f64)
512       return idx == 0;
513 
514    return instr_info.can_use_input_modifiers[(int)op];
515 }
516 
517 bool
can_use_opsel(amd_gfx_level gfx_level,aco_opcode op,int idx)518 can_use_opsel(amd_gfx_level gfx_level, aco_opcode op, int idx)
519 {
520    /* opsel is only GFX9+ */
521    if (gfx_level < GFX9)
522       return false;
523 
524    switch (op) {
525    case aco_opcode::v_div_fixup_f16:
526    case aco_opcode::v_fma_f16:
527    case aco_opcode::v_mad_f16:
528    case aco_opcode::v_mad_u16:
529    case aco_opcode::v_mad_i16:
530    case aco_opcode::v_med3_f16:
531    case aco_opcode::v_med3_i16:
532    case aco_opcode::v_med3_u16:
533    case aco_opcode::v_min3_f16:
534    case aco_opcode::v_min3_i16:
535    case aco_opcode::v_min3_u16:
536    case aco_opcode::v_max3_f16:
537    case aco_opcode::v_max3_i16:
538    case aco_opcode::v_max3_u16:
539    case aco_opcode::v_minmax_f16:
540    case aco_opcode::v_maxmin_f16:
541    case aco_opcode::v_max_u16_e64:
542    case aco_opcode::v_max_i16_e64:
543    case aco_opcode::v_min_u16_e64:
544    case aco_opcode::v_min_i16_e64:
545    case aco_opcode::v_add_i16:
546    case aco_opcode::v_sub_i16:
547    case aco_opcode::v_add_u16_e64:
548    case aco_opcode::v_sub_u16_e64:
549    case aco_opcode::v_lshlrev_b16_e64:
550    case aco_opcode::v_lshrrev_b16_e64:
551    case aco_opcode::v_ashrrev_i16_e64:
552    case aco_opcode::v_and_b16:
553    case aco_opcode::v_or_b16:
554    case aco_opcode::v_xor_b16:
555    case aco_opcode::v_mul_lo_u16_e64: return true;
556    case aco_opcode::v_pack_b32_f16:
557    case aco_opcode::v_cvt_pknorm_i16_f16:
558    case aco_opcode::v_cvt_pknorm_u16_f16: return idx != -1;
559    case aco_opcode::v_mad_u32_u16:
560    case aco_opcode::v_mad_i32_i16: return idx >= 0 && idx < 2;
561    case aco_opcode::v_dot2_f16_f16:
562    case aco_opcode::v_dot2_bf16_bf16: return idx == -1 || idx == 2;
563    case aco_opcode::v_cndmask_b16: return idx != 2;
564    case aco_opcode::v_interp_p10_f16_f32_inreg:
565    case aco_opcode::v_interp_p10_rtz_f16_f32_inreg: return idx == 0 || idx == 2;
566    case aco_opcode::v_interp_p2_f16_f32_inreg:
567    case aco_opcode::v_interp_p2_rtz_f16_f32_inreg: return idx == -1 || idx == 0;
568    default:
569       return gfx_level >= GFX11 && (get_gfx11_true16_mask(op) & BITFIELD_BIT(idx == -1 ? 3 : idx));
570    }
571 }
572 
573 bool
can_write_m0(const aco_ptr<Instruction> & instr)574 can_write_m0(const aco_ptr<Instruction>& instr)
575 {
576    if (instr->isSALU())
577       return true;
578 
579    /* VALU can't write m0 on any GPU generations. */
580    if (instr->isVALU())
581       return false;
582 
583    switch (instr->opcode) {
584    case aco_opcode::p_parallelcopy:
585    case aco_opcode::p_extract:
586    case aco_opcode::p_insert:
587       /* These pseudo instructions are implemented with SALU when writing m0. */
588       return true;
589    default:
590       /* Assume that no other instructions can write m0. */
591       return false;
592    }
593 }
594 
595 bool
instr_is_16bit(amd_gfx_level gfx_level,aco_opcode op)596 instr_is_16bit(amd_gfx_level gfx_level, aco_opcode op)
597 {
598    /* partial register writes are GFX9+, only */
599    if (gfx_level < GFX9)
600       return false;
601 
602    switch (op) {
603    /* VOP3 */
604    case aco_opcode::v_mad_legacy_f16:
605    case aco_opcode::v_mad_legacy_u16:
606    case aco_opcode::v_mad_legacy_i16:
607    case aco_opcode::v_fma_legacy_f16:
608    case aco_opcode::v_div_fixup_legacy_f16: return false;
609    case aco_opcode::v_interp_p2_f16:
610    case aco_opcode::v_interp_p2_hi_f16:
611    case aco_opcode::v_fma_mixlo_f16:
612    case aco_opcode::v_fma_mixhi_f16:
613    /* VOP2 */
614    case aco_opcode::v_mac_f16:
615    case aco_opcode::v_madak_f16:
616    case aco_opcode::v_madmk_f16: return gfx_level >= GFX9;
617    case aco_opcode::v_add_f16:
618    case aco_opcode::v_sub_f16:
619    case aco_opcode::v_subrev_f16:
620    case aco_opcode::v_mul_f16:
621    case aco_opcode::v_max_f16:
622    case aco_opcode::v_min_f16:
623    case aco_opcode::v_ldexp_f16:
624    case aco_opcode::v_fmac_f16:
625    case aco_opcode::v_fmamk_f16:
626    case aco_opcode::v_fmaak_f16:
627    /* VOP1 */
628    case aco_opcode::v_cvt_f16_f32:
629    case aco_opcode::p_v_cvt_f16_f32_rtne:
630    case aco_opcode::v_cvt_f16_u16:
631    case aco_opcode::v_cvt_f16_i16:
632    case aco_opcode::v_rcp_f16:
633    case aco_opcode::v_sqrt_f16:
634    case aco_opcode::v_rsq_f16:
635    case aco_opcode::v_log_f16:
636    case aco_opcode::v_exp_f16:
637    case aco_opcode::v_frexp_mant_f16:
638    case aco_opcode::v_frexp_exp_i16_f16:
639    case aco_opcode::v_floor_f16:
640    case aco_opcode::v_ceil_f16:
641    case aco_opcode::v_trunc_f16:
642    case aco_opcode::v_rndne_f16:
643    case aco_opcode::v_fract_f16:
644    case aco_opcode::v_sin_f16:
645    case aco_opcode::v_cos_f16:
646    case aco_opcode::v_cvt_u16_f16:
647    case aco_opcode::v_cvt_i16_f16:
648    case aco_opcode::v_cvt_norm_i16_f16:
649    case aco_opcode::v_cvt_norm_u16_f16: return gfx_level >= GFX10;
650    /* all non legacy opsel instructions preserve the high bits */
651    default: return can_use_opsel(gfx_level, op, -1);
652    }
653 }
654 
655 /* On GFX11, for some instructions, bit 7 of the destination/operand vgpr is opsel and the field
656  * only supports v0-v127.
657  * The first three bits are used for operands 0-2, and the 4th bit is used for the destination.
658  */
659 uint8_t
get_gfx11_true16_mask(aco_opcode op)660 get_gfx11_true16_mask(aco_opcode op)
661 {
662    switch (op) {
663    case aco_opcode::v_ceil_f16:
664    case aco_opcode::v_cos_f16:
665    case aco_opcode::v_cvt_f16_i16:
666    case aco_opcode::v_cvt_f16_u16:
667    case aco_opcode::v_cvt_i16_f16:
668    case aco_opcode::v_cvt_u16_f16:
669    case aco_opcode::v_cvt_norm_i16_f16:
670    case aco_opcode::v_cvt_norm_u16_f16:
671    case aco_opcode::v_exp_f16:
672    case aco_opcode::v_floor_f16:
673    case aco_opcode::v_fract_f16:
674    case aco_opcode::v_frexp_exp_i16_f16:
675    case aco_opcode::v_frexp_mant_f16:
676    case aco_opcode::v_log_f16:
677    case aco_opcode::v_not_b16:
678    case aco_opcode::v_rcp_f16:
679    case aco_opcode::v_rndne_f16:
680    case aco_opcode::v_rsq_f16:
681    case aco_opcode::v_sin_f16:
682    case aco_opcode::v_sqrt_f16:
683    case aco_opcode::v_trunc_f16:
684    case aco_opcode::v_swap_b16:
685    case aco_opcode::v_mov_b16: return 0x1 | 0x8;
686    case aco_opcode::v_add_f16:
687    case aco_opcode::v_fmaak_f16:
688    case aco_opcode::v_fmac_f16:
689    case aco_opcode::v_fmamk_f16:
690    case aco_opcode::v_ldexp_f16:
691    case aco_opcode::v_max_f16:
692    case aco_opcode::v_min_f16:
693    case aco_opcode::v_mul_f16:
694    case aco_opcode::v_sub_f16:
695    case aco_opcode::v_subrev_f16:
696    case aco_opcode::v_and_b16:
697    case aco_opcode::v_or_b16:
698    case aco_opcode::v_xor_b16: return 0x3 | 0x8;
699    case aco_opcode::v_cvt_f32_f16:
700    case aco_opcode::v_cvt_i32_i16:
701    case aco_opcode::v_cvt_u32_u16: return 0x1;
702    case aco_opcode::v_cmp_class_f16:
703    case aco_opcode::v_cmp_eq_f16:
704    case aco_opcode::v_cmp_eq_i16:
705    case aco_opcode::v_cmp_eq_u16:
706    case aco_opcode::v_cmp_ge_f16:
707    case aco_opcode::v_cmp_ge_i16:
708    case aco_opcode::v_cmp_ge_u16:
709    case aco_opcode::v_cmp_gt_f16:
710    case aco_opcode::v_cmp_gt_i16:
711    case aco_opcode::v_cmp_gt_u16:
712    case aco_opcode::v_cmp_le_f16:
713    case aco_opcode::v_cmp_le_i16:
714    case aco_opcode::v_cmp_le_u16:
715    case aco_opcode::v_cmp_lg_f16:
716    case aco_opcode::v_cmp_lg_i16:
717    case aco_opcode::v_cmp_lg_u16:
718    case aco_opcode::v_cmp_lt_f16:
719    case aco_opcode::v_cmp_lt_i16:
720    case aco_opcode::v_cmp_lt_u16:
721    case aco_opcode::v_cmp_neq_f16:
722    case aco_opcode::v_cmp_nge_f16:
723    case aco_opcode::v_cmp_ngt_f16:
724    case aco_opcode::v_cmp_nle_f16:
725    case aco_opcode::v_cmp_nlg_f16:
726    case aco_opcode::v_cmp_nlt_f16:
727    case aco_opcode::v_cmp_o_f16:
728    case aco_opcode::v_cmp_u_f16:
729    case aco_opcode::v_cmpx_class_f16:
730    case aco_opcode::v_cmpx_eq_f16:
731    case aco_opcode::v_cmpx_eq_i16:
732    case aco_opcode::v_cmpx_eq_u16:
733    case aco_opcode::v_cmpx_ge_f16:
734    case aco_opcode::v_cmpx_ge_i16:
735    case aco_opcode::v_cmpx_ge_u16:
736    case aco_opcode::v_cmpx_gt_f16:
737    case aco_opcode::v_cmpx_gt_i16:
738    case aco_opcode::v_cmpx_gt_u16:
739    case aco_opcode::v_cmpx_le_f16:
740    case aco_opcode::v_cmpx_le_i16:
741    case aco_opcode::v_cmpx_le_u16:
742    case aco_opcode::v_cmpx_lg_f16:
743    case aco_opcode::v_cmpx_lg_i16:
744    case aco_opcode::v_cmpx_lg_u16:
745    case aco_opcode::v_cmpx_lt_f16:
746    case aco_opcode::v_cmpx_lt_i16:
747    case aco_opcode::v_cmpx_lt_u16:
748    case aco_opcode::v_cmpx_neq_f16:
749    case aco_opcode::v_cmpx_nge_f16:
750    case aco_opcode::v_cmpx_ngt_f16:
751    case aco_opcode::v_cmpx_nle_f16:
752    case aco_opcode::v_cmpx_nlg_f16:
753    case aco_opcode::v_cmpx_nlt_f16:
754    case aco_opcode::v_cmpx_o_f16:
755    case aco_opcode::v_cmpx_u_f16: return 0x3;
756    case aco_opcode::v_cvt_f16_f32:
757    case aco_opcode::v_sat_pk_u8_i16: return 0x8;
758    default: return 0x0;
759    }
760 }
761 
762 uint32_t
get_reduction_identity(ReduceOp op,unsigned idx)763 get_reduction_identity(ReduceOp op, unsigned idx)
764 {
765    switch (op) {
766    case iadd8:
767    case iadd16:
768    case iadd32:
769    case iadd64:
770    case fadd16:
771    case fadd32:
772    case fadd64:
773    case ior8:
774    case ior16:
775    case ior32:
776    case ior64:
777    case ixor8:
778    case ixor16:
779    case ixor32:
780    case ixor64:
781    case umax8:
782    case umax16:
783    case umax32:
784    case umax64: return 0;
785    case imul8:
786    case imul16:
787    case imul32:
788    case imul64: return idx ? 0 : 1;
789    case fmul16: return 0x3c00u;                /* 1.0 */
790    case fmul32: return 0x3f800000u;            /* 1.0 */
791    case fmul64: return idx ? 0x3ff00000u : 0u; /* 1.0 */
792    case imin8: return INT8_MAX;
793    case imin16: return INT16_MAX;
794    case imin32: return INT32_MAX;
795    case imin64: return idx ? 0x7fffffffu : 0xffffffffu;
796    case imax8: return INT8_MIN;
797    case imax16: return INT16_MIN;
798    case imax32: return INT32_MIN;
799    case imax64: return idx ? 0x80000000u : 0;
800    case umin8:
801    case umin16:
802    case iand8:
803    case iand16: return 0xffffffffu;
804    case umin32:
805    case umin64:
806    case iand32:
807    case iand64: return 0xffffffffu;
808    case fmin16: return 0x7c00u;                /* infinity */
809    case fmin32: return 0x7f800000u;            /* infinity */
810    case fmin64: return idx ? 0x7ff00000u : 0u; /* infinity */
811    case fmax16: return 0xfc00u;                /* negative infinity */
812    case fmax32: return 0xff800000u;            /* negative infinity */
813    case fmax64: return idx ? 0xfff00000u : 0u; /* negative infinity */
814    default: unreachable("Invalid reduction operation"); break;
815    }
816    return 0;
817 }
818 
819 unsigned
get_operand_size(aco_ptr<Instruction> & instr,unsigned index)820 get_operand_size(aco_ptr<Instruction>& instr, unsigned index)
821 {
822    if (instr->isPseudo())
823       return instr->operands[index].bytes() * 8u;
824    else if (instr->opcode == aco_opcode::v_mad_u64_u32 ||
825             instr->opcode == aco_opcode::v_mad_i64_i32)
826       return index == 2 ? 64 : 32;
827    else if (instr->opcode == aco_opcode::v_fma_mix_f32 ||
828             instr->opcode == aco_opcode::v_fma_mixlo_f16 ||
829             instr->opcode == aco_opcode::v_fma_mixhi_f16)
830       return instr->valu().opsel_hi[index] ? 16 : 32;
831    else if (instr->opcode == aco_opcode::v_interp_p10_f16_f32_inreg ||
832             instr->opcode == aco_opcode::v_interp_p10_rtz_f16_f32_inreg)
833       return index == 1 ? 32 : 16;
834    else if (instr->opcode == aco_opcode::v_interp_p2_f16_f32_inreg ||
835             instr->opcode == aco_opcode::v_interp_p2_rtz_f16_f32_inreg)
836       return index == 0 ? 16 : 32;
837    else if (instr->isVALU() || instr->isSALU())
838       return instr_info.operand_size[(int)instr->opcode];
839    else
840       return 0;
841 }
842 
843 bool
needs_exec_mask(const Instruction * instr)844 needs_exec_mask(const Instruction* instr)
845 {
846    if (instr->isVALU()) {
847       return instr->opcode != aco_opcode::v_readlane_b32 &&
848              instr->opcode != aco_opcode::v_readlane_b32_e64 &&
849              instr->opcode != aco_opcode::v_writelane_b32 &&
850              instr->opcode != aco_opcode::v_writelane_b32_e64;
851    }
852 
853    if (instr->isVMEM() || instr->isFlatLike())
854       return true;
855 
856    if (instr->isSALU() || instr->isBranch() || instr->isSMEM() || instr->isBarrier())
857       return instr->reads_exec();
858 
859    if (instr->isPseudo()) {
860       switch (instr->opcode) {
861       case aco_opcode::p_create_vector:
862       case aco_opcode::p_extract_vector:
863       case aco_opcode::p_split_vector:
864       case aco_opcode::p_phi:
865       case aco_opcode::p_parallelcopy:
866          for (Definition def : instr->definitions) {
867             if (def.getTemp().type() == RegType::vgpr)
868                return true;
869          }
870          return instr->reads_exec();
871       case aco_opcode::p_spill:
872       case aco_opcode::p_reload:
873       case aco_opcode::p_end_linear_vgpr:
874       case aco_opcode::p_logical_start:
875       case aco_opcode::p_logical_end:
876       case aco_opcode::p_startpgm:
877       case aco_opcode::p_end_wqm:
878       case aco_opcode::p_init_scratch: return instr->reads_exec();
879       case aco_opcode::p_start_linear_vgpr: return instr->operands.size();
880       default: break;
881       }
882    }
883 
884    return true;
885 }
886 
/* Related variants of a VOPC comparison opcode; a member is
 * aco_opcode::num_opcodes when the variant doesn't exist. */
struct CmpInfo {
   aco_opcode swapped; /* same comparison with the two sources exchanged */
   aco_opcode inverse; /* logical negation of the comparison */
   aco_opcode vcmpx;   /* exec-writing (v_cmpx) form */
};
892 
/* Fills `info` with the swapped/inverse/v_cmpx variants of the comparison
 * opcode `op`. Returns false if `op` is not a recognized v_cmp_* opcode
 * (notably, v_cmpx_* opcodes themselves are not in this table). */
static ALWAYS_INLINE bool
get_cmp_info(aco_opcode op, CmpInfo* info)
{
   info->swapped = aco_opcode::num_opcodes;
   info->inverse = aco_opcode::num_opcodes;
   info->vcmpx = aco_opcode::num_opcodes;
   switch (op) {
      // clang-format off
/* Float comparisons: each CMP2 instantiation handles one ordered comparison
 * and its unordered negation (v_cmp_n*) for one bit size. */
#define CMP2(ord, unord, ord_swap, unord_swap, sz)                                                 \
   case aco_opcode::v_cmp_##ord##_f##sz:                                                           \
   case aco_opcode::v_cmp_n##unord##_f##sz:                                                        \
      info->swapped = op == aco_opcode::v_cmp_##ord##_f##sz ? aco_opcode::v_cmp_##ord_swap##_f##sz \
                                                      : aco_opcode::v_cmp_n##unord_swap##_f##sz;   \
      info->inverse = op == aco_opcode::v_cmp_n##unord##_f##sz ? aco_opcode::v_cmp_##unord##_f##sz \
                                                               : aco_opcode::v_cmp_n##ord##_f##sz; \
      info->vcmpx = op == aco_opcode::v_cmp_##ord##_f##sz ? aco_opcode::v_cmpx_##ord##_f##sz       \
                                                          : aco_opcode::v_cmpx_n##unord##_f##sz;   \
      return true;
#define CMP(ord, unord, ord_swap, unord_swap)                                                      \
   CMP2(ord, unord, ord_swap, unord_swap, 16)                                                      \
   CMP2(ord, unord, ord_swap, unord_swap, 32)                                                      \
   CMP2(ord, unord, ord_swap, unord_swap, 64)
      CMP(lt, /*n*/ge, gt, /*n*/le)
      CMP(eq, /*n*/lg, eq, /*n*/lg)
      CMP(le, /*n*/gt, ge, /*n*/lt)
      CMP(gt, /*n*/le, lt, /*n*/ge)
      CMP(lg, /*n*/eq, lg, /*n*/eq)
      CMP(ge, /*n*/lt, le, /*n*/gt)
#undef CMP
#undef CMP2
/* Orderedness tests (v_cmp_o/v_cmp_u) are symmetric and mutually inverse. */
#define ORD_TEST(sz)                                                                               \
   case aco_opcode::v_cmp_u_f##sz:                                                                 \
      info->swapped = aco_opcode::v_cmp_u_f##sz;                                                   \
      info->inverse = aco_opcode::v_cmp_o_f##sz;                                                   \
      info->vcmpx = aco_opcode::v_cmpx_u_f##sz;                                                    \
      return true;                                                                                 \
   case aco_opcode::v_cmp_o_f##sz:                                                                 \
      info->swapped = aco_opcode::v_cmp_o_f##sz;                                                   \
      info->inverse = aco_opcode::v_cmp_u_f##sz;                                                   \
      info->vcmpx = aco_opcode::v_cmpx_o_f##sz;                                                    \
      return true;
      ORD_TEST(16)
      ORD_TEST(32)
      ORD_TEST(64)
#undef ORD_TEST
/* Integer comparisons: one instantiation per signedness and bit size. */
#define CMPI2(op, swap, inv, type, sz)                                                             \
   case aco_opcode::v_cmp_##op##_##type##sz:                                                       \
      info->swapped = aco_opcode::v_cmp_##swap##_##type##sz;                                       \
      info->inverse = aco_opcode::v_cmp_##inv##_##type##sz;                                        \
      info->vcmpx = aco_opcode::v_cmpx_##op##_##type##sz;                                          \
      return true;
#define CMPI(op, swap, inv)                                                                        \
   CMPI2(op, swap, inv, i, 16)                                                                     \
   CMPI2(op, swap, inv, u, 16)                                                                     \
   CMPI2(op, swap, inv, i, 32)                                                                     \
   CMPI2(op, swap, inv, u, 32)                                                                     \
   CMPI2(op, swap, inv, i, 64)                                                                     \
   CMPI2(op, swap, inv, u, 64)
      CMPI(lt, gt, ge)
      CMPI(eq, eq, lg)
      CMPI(le, ge, gt)
      CMPI(gt, lt, le)
      CMPI(lg, lg, eq)
      CMPI(ge, le, lt)
#undef CMPI
#undef CMPI2
/* Class tests only have a v_cmpx form; swapped/inverse stay num_opcodes. */
#define CMPCLASS(sz)                                                                               \
   case aco_opcode::v_cmp_class_f##sz:                                                             \
      info->vcmpx = aco_opcode::v_cmpx_class_f##sz;                                                \
      return true;
      CMPCLASS(16)
      CMPCLASS(32)
      CMPCLASS(64)
#undef CMPCLASS
      // clang-format on
   default: return false;
   }
}
971 
972 aco_opcode
get_vcmp_inverse(aco_opcode op)973 get_vcmp_inverse(aco_opcode op)
974 {
975    CmpInfo info;
976    return get_cmp_info(op, &info) ? info.inverse : aco_opcode::num_opcodes;
977 }
978 
979 aco_opcode
get_vcmp_swapped(aco_opcode op)980 get_vcmp_swapped(aco_opcode op)
981 {
982    CmpInfo info;
983    return get_cmp_info(op, &info) ? info.swapped : aco_opcode::num_opcodes;
984 }
985 
986 aco_opcode
get_vcmpx(aco_opcode op)987 get_vcmpx(aco_opcode op)
988 {
989    CmpInfo info;
990    return get_cmp_info(op, &info) ? info.vcmpx : aco_opcode::num_opcodes;
991 }
992 
993 bool
is_cmpx(aco_opcode op)994 is_cmpx(aco_opcode op)
995 {
996    CmpInfo info;
997    return !get_cmp_info(op, &info);
998 }
999 
/* Checks whether operands idx0 and idx1 of `instr` may be exchanged and, if
 * so, stores in `*new_op` the opcode to use afterwards (some opcodes, e.g.
 * v_sub_*, must be replaced by their reverse form). Returns false when the
 * swap is not possible. */
bool
can_swap_operands(aco_ptr<Instruction>& instr, aco_opcode* new_op, unsigned idx0, unsigned idx1)
{
   /* Swapping an operand with itself is trivially allowed. */
   if (idx0 == idx1) {
      *new_op = instr->opcode;
      return true;
   }

   if (idx0 > idx1)
      std::swap(idx0, idx1);

   if (instr->isDPP())
      return false;

   /* Without VOP3/VOP3P encoding, operand 0 must stay a VGPR. */
   if (!instr->isVOP3() && !instr->isVOP3P() && !instr->operands[0].isOfType(RegType::vgpr))
      return false;

   /* Comparisons swap via their operand-swapped counterpart. */
   if (instr->isVOPC()) {
      CmpInfo info;
      if (get_cmp_info(instr->opcode, &info) && info.swapped != aco_opcode::num_opcodes) {
         *new_op = info.swapped;
         return true;
      }
   }

   /* opcodes not relevant for DPP or SGPRs optimizations are not included. */
   switch (instr->opcode) {
   case aco_opcode::v_med3_f32: return false; /* order matters for clamp+GFX8+denorm ftz. */
   /* Commutative in their first two sources: opcode is kept as-is. */
   case aco_opcode::v_add_u32:
   case aco_opcode::v_add_co_u32:
   case aco_opcode::v_add_co_u32_e64:
   case aco_opcode::v_add_i32:
   case aco_opcode::v_add_i16:
   case aco_opcode::v_add_u16_e64:
   case aco_opcode::v_add3_u32:
   case aco_opcode::v_add_f16:
   case aco_opcode::v_add_f32:
   case aco_opcode::v_mul_i32_i24:
   case aco_opcode::v_mul_hi_i32_i24:
   case aco_opcode::v_mul_u32_u24:
   case aco_opcode::v_mul_hi_u32_u24:
   case aco_opcode::v_mul_lo_u16:
   case aco_opcode::v_mul_lo_u16_e64:
   case aco_opcode::v_mul_f16:
   case aco_opcode::v_mul_f32:
   case aco_opcode::v_mul_legacy_f32:
   case aco_opcode::v_or_b32:
   case aco_opcode::v_and_b32:
   case aco_opcode::v_xor_b32:
   case aco_opcode::v_xnor_b32:
   case aco_opcode::v_xor3_b32:
   case aco_opcode::v_or3_b32:
   case aco_opcode::v_and_b16:
   case aco_opcode::v_or_b16:
   case aco_opcode::v_xor_b16:
   case aco_opcode::v_max3_f32:
   case aco_opcode::v_min3_f32:
   case aco_opcode::v_max3_f16:
   case aco_opcode::v_min3_f16:
   case aco_opcode::v_med3_f16:
   case aco_opcode::v_max3_u32:
   case aco_opcode::v_min3_u32:
   case aco_opcode::v_med3_u32:
   case aco_opcode::v_max3_i32:
   case aco_opcode::v_min3_i32:
   case aco_opcode::v_med3_i32:
   case aco_opcode::v_max3_u16:
   case aco_opcode::v_min3_u16:
   case aco_opcode::v_med3_u16:
   case aco_opcode::v_max3_i16:
   case aco_opcode::v_min3_i16:
   case aco_opcode::v_med3_i16:
   case aco_opcode::v_max_f16:
   case aco_opcode::v_max_f32:
   case aco_opcode::v_min_f16:
   case aco_opcode::v_min_f32:
   case aco_opcode::v_max_i32:
   case aco_opcode::v_min_i32:
   case aco_opcode::v_max_u32:
   case aco_opcode::v_min_u32:
   case aco_opcode::v_max_i16:
   case aco_opcode::v_min_i16:
   case aco_opcode::v_max_u16:
   case aco_opcode::v_min_u16:
   case aco_opcode::v_max_i16_e64:
   case aco_opcode::v_min_i16_e64:
   case aco_opcode::v_max_u16_e64:
   case aco_opcode::v_min_u16_e64: *new_op = instr->opcode; return true;
   /* Subtractions swap via their reverse (subrev) form and vice versa. */
   case aco_opcode::v_sub_f16: *new_op = aco_opcode::v_subrev_f16; return true;
   case aco_opcode::v_sub_f32: *new_op = aco_opcode::v_subrev_f32; return true;
   case aco_opcode::v_sub_co_u32: *new_op = aco_opcode::v_subrev_co_u32; return true;
   case aco_opcode::v_sub_u16: *new_op = aco_opcode::v_subrev_u16; return true;
   case aco_opcode::v_sub_u32: *new_op = aco_opcode::v_subrev_u32; return true;
   case aco_opcode::v_sub_co_u32_e64: *new_op = aco_opcode::v_subrev_co_u32_e64; return true;
   case aco_opcode::v_subrev_f16: *new_op = aco_opcode::v_sub_f16; return true;
   case aco_opcode::v_subrev_f32: *new_op = aco_opcode::v_sub_f32; return true;
   case aco_opcode::v_subrev_co_u32: *new_op = aco_opcode::v_sub_co_u32; return true;
   case aco_opcode::v_subrev_u16: *new_op = aco_opcode::v_sub_u16; return true;
   case aco_opcode::v_subrev_u32: *new_op = aco_opcode::v_sub_u32; return true;
   case aco_opcode::v_subrev_co_u32_e64: *new_op = aco_opcode::v_sub_co_u32_e64; return true;
   /* Commutative in the first two sources only; operand 2 (carry-in or
    * accumulator) must not take part in the swap. */
   case aco_opcode::v_addc_co_u32:
   case aco_opcode::v_mad_i32_i24:
   case aco_opcode::v_mad_u32_u24:
   case aco_opcode::v_lerp_u8:
   case aco_opcode::v_sad_u8:
   case aco_opcode::v_sad_hi_u8:
   case aco_opcode::v_sad_u16:
   case aco_opcode::v_sad_u32:
   case aco_opcode::v_xad_u32:
   case aco_opcode::v_add_lshl_u32:
   case aco_opcode::v_and_or_b32:
   case aco_opcode::v_mad_u16:
   case aco_opcode::v_mad_i16:
   case aco_opcode::v_mad_u32_u16:
   case aco_opcode::v_mad_i32_i16:
   case aco_opcode::v_maxmin_f32:
   case aco_opcode::v_minmax_f32:
   case aco_opcode::v_maxmin_f16:
   case aco_opcode::v_minmax_f16:
   case aco_opcode::v_maxmin_u32:
   case aco_opcode::v_minmax_u32:
   case aco_opcode::v_maxmin_i32:
   case aco_opcode::v_minmax_i32:
   case aco_opcode::v_fma_f32:
   case aco_opcode::v_fma_legacy_f32:
   case aco_opcode::v_fmac_f32:
   case aco_opcode::v_fmac_legacy_f32:
   case aco_opcode::v_mac_f32:
   case aco_opcode::v_mac_legacy_f32:
   case aco_opcode::v_fma_f16:
   case aco_opcode::v_fmac_f16:
   case aco_opcode::v_mac_f16:
   case aco_opcode::v_dot4c_i32_i8:
   case aco_opcode::v_dot2c_f32_f16:
   case aco_opcode::v_dot2_f32_f16:
   case aco_opcode::v_dot2_f32_bf16:
   case aco_opcode::v_dot2_f16_f16:
   case aco_opcode::v_dot2_bf16_bf16:
   case aco_opcode::v_fma_mix_f32:
   case aco_opcode::v_fma_mixlo_f16:
   case aco_opcode::v_fma_mixhi_f16:
   case aco_opcode::v_pk_fmac_f16: {
      if (idx1 == 2)
         return false;
      *new_op = instr->opcode;
      return true;
   }
   case aco_opcode::v_subb_co_u32: {
      if (idx1 == 2)
         return false;
      *new_op = aco_opcode::v_subbrev_co_u32;
      return true;
   }
   case aco_opcode::v_subbrev_co_u32: {
      if (idx1 == 2)
         return false;
      *new_op = aco_opcode::v_subb_co_u32;
      return true;
   }
   default: return false;
   }
}
1162 
/* Default constructor: every counter starts as "no wait required". */
wait_imm::wait_imm()
    : exp(unset_counter), lgkm(unset_counter), vm(unset_counter), vs(unset_counter),
      sample(unset_counter), bvh(unset_counter), km(unset_counter)
{}
wait_imm(uint16_t vm_,uint16_t exp_,uint16_t lgkm_,uint16_t vs_)1167 wait_imm::wait_imm(uint16_t vm_, uint16_t exp_, uint16_t lgkm_, uint16_t vs_)
1168     : exp(exp_), lgkm(lgkm_), vm(vm_), vs(vs_), sample(unset_counter), bvh(unset_counter),
1169       km(unset_counter)
1170 {}
1171 
/* Encodes the counters into the s_waitcnt immediate format of the given
 * generation. Unset counters encode as all-ones, which waits on nothing. */
uint16_t
wait_imm::pack(enum amd_gfx_level gfx_level) const
{
   uint16_t imm = 0;
   assert(exp == unset_counter || exp <= 0x7);
   if (gfx_level >= GFX11) {
      /* GFX11: vm[15:10] lgkm[9:4] exp[2:0] */
      assert(lgkm == unset_counter || lgkm <= 0x3f);
      assert(vm == unset_counter || vm <= 0x3f);
      imm = ((vm & 0x3f) << 10) | ((lgkm & 0x3f) << 4) | (exp & 0x7);
   } else if (gfx_level >= GFX10) {
      /* GFX10: vm[5:4] in bits [15:14], lgkm[13:8], exp[6:4], vm[3:0] */
      assert(lgkm == unset_counter || lgkm <= 0x3f);
      assert(vm == unset_counter || vm <= 0x3f);
      imm = ((vm & 0x30) << 10) | ((lgkm & 0x3f) << 8) | ((exp & 0x7) << 4) | (vm & 0xf);
   } else if (gfx_level >= GFX9) {
      /* GFX9: like GFX10 but lgkm is only 4 bits. */
      assert(lgkm == unset_counter || lgkm <= 0xf);
      assert(vm == unset_counter || vm <= 0x3f);
      imm = ((vm & 0x30) << 10) | ((lgkm & 0xf) << 8) | ((exp & 0x7) << 4) | (vm & 0xf);
   } else {
      /* Pre-GFX9: lgkm[11:8] exp[6:4] vm[3:0] */
      assert(lgkm == unset_counter || lgkm <= 0xf);
      assert(vm == unset_counter || vm <= 0xf);
      imm = ((lgkm & 0xf) << 8) | ((exp & 0x7) << 4) | (vm & 0xf);
   }
   if (gfx_level < GFX9 && vm == wait_imm::unset_counter)
      imm |= 0xc000; /* should have no effect on pre-GFX9 and now we won't have to worry about the
                        architecture when interpreting the immediate */
   if (gfx_level < GFX10 && lgkm == wait_imm::unset_counter)
      imm |= 0x3000; /* should have no effect on pre-GFX10 and now we won't have to worry about the
                        architecture when interpreting the immediate */
   return imm;
}
1202 
1203 wait_imm
max(enum amd_gfx_level gfx_level)1204 wait_imm::max(enum amd_gfx_level gfx_level)
1205 {
1206    wait_imm imm;
1207    imm.vm = gfx_level >= GFX9 ? 63 : 15;
1208    imm.exp = 7;
1209    imm.lgkm = gfx_level >= GFX10 ? 63 : 15;
1210    imm.vs = gfx_level >= GFX10 ? 63 : 0;
1211    imm.sample = gfx_level >= GFX12 ? 63 : 0;
1212    imm.bvh = gfx_level >= GFX12 ? 7 : 0;
1213    imm.km = gfx_level >= GFX12 ? 31 : 0;
1214    return imm;
1215 }
1216 
/* Decodes a wait instruction into this wait_imm, combining (taking the
 * minimum) with whatever is already stored. Returns false if `instr` is not a
 * recognized wait instruction. */
bool
wait_imm::unpack(enum amd_gfx_level gfx_level, const Instruction* instr)
{
   /* Wait instructions are SALU with either no source or a null SGPR source. */
   if (!instr->isSALU() || (!instr->operands.empty() && instr->operands[0].physReg() != sgpr_null))
      return false;

   aco_opcode op = instr->opcode;
   uint16_t packed = instr->salu().imm;

   if (op == aco_opcode::s_wait_loadcnt) {
      vm = std::min<uint8_t>(vm, packed);
   } else if (op == aco_opcode::s_wait_storecnt) {
      vs = std::min<uint8_t>(vs, packed);
   } else if (op == aco_opcode::s_wait_samplecnt) {
      sample = std::min<uint8_t>(sample, packed);
   } else if (op == aco_opcode::s_wait_bvhcnt) {
      bvh = std::min<uint8_t>(bvh, packed);
   } else if (op == aco_opcode::s_wait_expcnt) {
      exp = std::min<uint8_t>(exp, packed);
   } else if (op == aco_opcode::s_wait_dscnt) {
      lgkm = std::min<uint8_t>(lgkm, packed);
   } else if (op == aco_opcode::s_wait_kmcnt) {
      km = std::min<uint8_t>(km, packed);
   } else if (op == aco_opcode::s_wait_loadcnt_dscnt) {
      /* Combined immediate: loadcnt in [13:8], dscnt in [5:0]; 0x3f means
       * "no wait" for each field. */
      uint32_t vm2 = (packed >> 8) & 0x3f;
      uint32_t ds = packed & 0x3f;
      vm = std::min<uint8_t>(vm, vm2 == 0x3f ? wait_imm::unset_counter : vm2);
      lgkm = std::min<uint8_t>(lgkm, ds == 0x3f ? wait_imm::unset_counter : ds);
   } else if (op == aco_opcode::s_wait_storecnt_dscnt) {
      uint32_t vs2 = (packed >> 8) & 0x3f;
      uint32_t ds = packed & 0x3f;
      vs = std::min<uint8_t>(vs, vs2 == 0x3f ? wait_imm::unset_counter : vs2);
      lgkm = std::min<uint8_t>(lgkm, ds == 0x3f ? wait_imm::unset_counter : ds);
   } else if (op == aco_opcode::s_waitcnt_expcnt) {
      exp = std::min<uint8_t>(exp, packed);
   } else if (op == aco_opcode::s_waitcnt_lgkmcnt) {
      lgkm = std::min<uint8_t>(lgkm, packed);
   } else if (op == aco_opcode::s_waitcnt_vmcnt) {
      vm = std::min<uint8_t>(vm, packed);
   } else if (op == aco_opcode::s_waitcnt_vscnt) {
      vs = std::min<uint8_t>(vs, packed);
   } else if (op == aco_opcode::s_waitcnt) {
      /* Legacy combined s_waitcnt: field layout differs per generation
       * (mirrors wait_imm::pack). */
      uint8_t vm2, lgkm2, exp2;
      if (gfx_level >= GFX11) {
         vm2 = (packed >> 10) & 0x3f;
         lgkm2 = (packed >> 4) & 0x3f;
         exp2 = packed & 0x7;
      } else {
         vm2 = packed & 0xf;
         if (gfx_level >= GFX9)
            vm2 |= (packed >> 10) & 0x30; /* high vm bits live in [15:14] */

         exp2 = (packed >> 4) & 0x7;

         lgkm2 = (packed >> 8) & 0xf;
         if (gfx_level >= GFX10)
            lgkm2 |= (packed >> 8) & 0x30; /* GFX10 widened lgkm to 6 bits */
      }

      /* All-ones in a field means no wait on that counter. */
      if (vm2 == (gfx_level >= GFX9 ? 0x3f : 0xf))
         vm2 = wait_imm::unset_counter;
      if (exp2 == 0x7)
         exp2 = wait_imm::unset_counter;
      if (lgkm2 == (gfx_level >= GFX10 ? 0x3f : 0xf))
         lgkm2 = wait_imm::unset_counter;

      vm = std::min(vm, vm2);
      exp = std::min(exp, exp2);
      lgkm = std::min(lgkm, lgkm2);
   } else {
      return false;
   }
   return true;
}
1291 
1292 bool
combine(const wait_imm & other)1293 wait_imm::combine(const wait_imm& other)
1294 {
1295    bool changed = false;
1296    for (unsigned i = 0; i < wait_type_num; i++) {
1297       if (other[i] < (*this)[i])
1298          changed = true;
1299       (*this)[i] = std::min((*this)[i], other[i]);
1300    }
1301    return changed;
1302 }
1303 
1304 bool
empty() const1305 wait_imm::empty() const
1306 {
1307    for (unsigned i = 0; i < wait_type_num; i++) {
1308       if ((*this)[i] != unset_counter)
1309          return false;
1310    }
1311    return true;
1312 }
1313 
1314 void
print(FILE * output) const1315 wait_imm::print(FILE* output) const
1316 {
1317    const char* names[wait_type_num];
1318    names[wait_type_exp] = "exp";
1319    names[wait_type_vm] = "vm";
1320    names[wait_type_lgkm] = "lgkm";
1321    names[wait_type_vs] = "vs";
1322    names[wait_type_sample] = "sample";
1323    names[wait_type_bvh] = "bvh";
1324    names[wait_type_km] = "km";
1325    for (unsigned i = 0; i < wait_type_num; i++) {
1326       if ((*this)[i] != unset_counter)
1327          fprintf(output, "%s: %u\n", names[i], (*this)[i]);
1328    }
1329 }
1330 
/* Emits the wait instruction(s) encoding this wait_imm at the builder's
 * current position, then resets *this to the empty wait. */
void
wait_imm::build_waitcnt(Builder& bld)
{
   enum amd_gfx_level gfx_level = bld.program->gfx_level;

   if (gfx_level >= GFX12) {
      /* Prefer the combined two-counter forms when both fields are set. */
      if (vm != wait_imm::unset_counter && lgkm != wait_imm::unset_counter) {
         bld.sopp(aco_opcode::s_wait_loadcnt_dscnt, (vm << 8) | lgkm);
         vm = wait_imm::unset_counter;
         lgkm = wait_imm::unset_counter;
      }

      /* lgkm may already have been consumed by the loadcnt_dscnt above. */
      if (vs != wait_imm::unset_counter && lgkm != wait_imm::unset_counter) {
         bld.sopp(aco_opcode::s_wait_storecnt_dscnt, (vs << 8) | lgkm);
         vs = wait_imm::unset_counter;
         lgkm = wait_imm::unset_counter;
      }

      /* Emit one single-counter wait per remaining set counter. */
      aco_opcode op[wait_type_num];
      op[wait_type_exp] = aco_opcode::s_wait_expcnt;
      op[wait_type_lgkm] = aco_opcode::s_wait_dscnt;
      op[wait_type_vm] = aco_opcode::s_wait_loadcnt;
      op[wait_type_vs] = aco_opcode::s_wait_storecnt;
      op[wait_type_sample] = aco_opcode::s_wait_samplecnt;
      op[wait_type_bvh] = aco_opcode::s_wait_bvhcnt;
      op[wait_type_km] = aco_opcode::s_wait_kmcnt;

      for (unsigned i = 0; i < wait_type_num; i++) {
         if ((*this)[i] != wait_imm::unset_counter)
            bld.sopp(op[i], (*this)[i]);
      }
   } else {
      /* vs (vscnt) has its own instruction pre-GFX12; the rest share
       * s_waitcnt. */
      if (vs != wait_imm::unset_counter) {
         assert(gfx_level >= GFX10);
         bld.sopk(aco_opcode::s_waitcnt_vscnt, Operand(sgpr_null, s1), vs);
         vs = wait_imm::unset_counter;
      }
      if (!empty())
         bld.sopp(aco_opcode::s_waitcnt, pack(gfx_level));
   }

   *this = wait_imm();
}
1374 
1375 bool
should_form_clause(const Instruction * a,const Instruction * b)1376 should_form_clause(const Instruction* a, const Instruction* b)
1377 {
1378    if (a->definitions.empty() != b->definitions.empty())
1379       return false;
1380 
1381    if (a->format != b->format)
1382       return false;
1383 
1384    if (a->operands.empty() || b->operands.empty())
1385       return false;
1386 
1387    /* Assume loads which don't use descriptors might load from similar addresses. */
1388    if (a->isFlatLike() || a->accessesLDS())
1389       return true;
1390    if (a->isSMEM() && a->operands[0].bytes() == 8 && b->operands[0].bytes() == 8)
1391       return true;
1392 
1393    /* If they load from the same descriptor, assume they might load from similar
1394     * addresses.
1395     */
1396    if (a->isVMEM() || a->isSMEM())
1397       return a->operands[0].tempId() == b->operands[0].tempId();
1398 
1399    if (a->isEXP() && b->isEXP())
1400       return true;
1401 
1402    return false;
1403 }
1404 
1405 int
get_op_fixed_to_def(Instruction * instr)1406 get_op_fixed_to_def(Instruction* instr)
1407 {
1408    if (instr->opcode == aco_opcode::v_interp_p2_f32 || instr->opcode == aco_opcode::v_mac_f32 ||
1409        instr->opcode == aco_opcode::v_fmac_f32 || instr->opcode == aco_opcode::v_mac_f16 ||
1410        instr->opcode == aco_opcode::v_fmac_f16 || instr->opcode == aco_opcode::v_mac_legacy_f32 ||
1411        instr->opcode == aco_opcode::v_fmac_legacy_f32 ||
1412        instr->opcode == aco_opcode::v_pk_fmac_f16 || instr->opcode == aco_opcode::v_writelane_b32 ||
1413        instr->opcode == aco_opcode::v_writelane_b32_e64 ||
1414        instr->opcode == aco_opcode::v_dot4c_i32_i8 || instr->opcode == aco_opcode::s_fmac_f32 ||
1415        instr->opcode == aco_opcode::s_fmac_f16) {
1416       return 2;
1417    } else if (instr->opcode == aco_opcode::s_addk_i32 || instr->opcode == aco_opcode::s_mulk_i32 ||
1418               instr->opcode == aco_opcode::s_cmovk_i32) {
1419       return 0;
1420    } else if (instr->isMUBUF() && instr->definitions.size() == 1 && instr->operands.size() == 4) {
1421       return 3;
1422    } else if (instr->isMIMG() && instr->definitions.size() == 1 &&
1423               !instr->operands[2].isUndefined()) {
1424       return 2;
1425    }
1426    return -1;
1427 }
1428 
1429 uint8_t
get_vmem_type(enum amd_gfx_level gfx_level,Instruction * instr)1430 get_vmem_type(enum amd_gfx_level gfx_level, Instruction* instr)
1431 {
1432    if (instr->opcode == aco_opcode::image_bvh64_intersect_ray)
1433       return vmem_bvh;
1434    else if (gfx_level >= GFX12 && instr->opcode == aco_opcode::image_msaa_load)
1435       return vmem_sampler;
1436    else if (instr->isMIMG() && !instr->operands[1].isUndefined() &&
1437             instr->operands[1].regClass() == s4)
1438       return vmem_sampler;
1439    else if (instr->isVMEM() || instr->isScratch() || instr->isGlobal())
1440       return vmem_nosampler;
1441    return 0;
1442 }
1443 
/* Parses an instruction's implicit data-dependency resolution:
 * returns, for each dependency counter, the value that must be
 * reached before the instruction can be issued.
 *
 * (Probably incomplete.)
 */
depctr_wait
parse_depctr_wait(const Instruction* instr)
{
   depctr_wait res;
   if (instr->isVMEM() || instr->isFlatLike() || instr->isDS() || instr->isEXP()) {
      /* These wait for VALU writes to their VGPR sources and for exec
       * updates before issuing. */
      res.va_vdst = 0;
      res.va_exec = 0;
      res.sa_exec = 0;
      if (instr->isVMEM() || instr->isFlatLike()) {
         /* VMEM/FLAT also read SGPRs (addresses/descriptors) and VCC. */
         res.sa_sdst = 0;
         res.va_sdst = 0;
         res.va_vcc = 0;
      }
   } else if (instr->isSMEM()) {
      res.sa_sdst = 0;
      res.va_sdst = 0;
      res.va_vcc = 0;
   } else if (instr->isLDSDIR()) {
      /* LDSDIR encodes its own VALU-write wait count. */
      res.va_vdst = instr->ldsdir().wait_vdst;
      res.va_exec = 0;
      res.sa_exec = 0;
   } else if (instr->opcode == aco_opcode::s_waitcnt_depctr) {
      /* Explicit wait: decode the immediate's per-counter fields.
       * Layout: va_vdst[15:12] va_sdst[11:9] va_ssrc[8] hold_cnt[7]
       * vm_vsrc[4:2] va_vcc[1] sa_sdst[0]. */
      unsigned imm = instr->salu().imm;
      res.va_vdst = (imm >> 12) & 0xf;
      res.va_sdst = (imm >> 9) & 0x7;
      res.va_ssrc = (imm >> 8) & 0x1;
      res.hold_cnt = (imm >> 7) & 0x1;
      res.vm_vsrc = (imm >> 2) & 0x7;
      res.va_vcc = (imm >> 1) & 0x1;
      res.sa_sdst = imm & 0x1;
   } else if (instr->isVALU()) {
      res.sa_exec = 0;
      /* VALU with an SGPR definition waits for prior SALU SGPR writes. */
      for (const Definition& def : instr->definitions) {
         if (def.regClass().type() == RegType::sgpr) {
            res.sa_sdst = 0;
            /* Notably, this is the only exception, even VALU that
             * reads exec doesn't implicitly wait for va_exec.
             */
            if (instr->opcode == aco_opcode::v_readfirstlane_b32)
               res.va_exec = 0;
            break;
         }
      }
   } else if (instr_info.classes[(int)instr->opcode] == instr_class::branch ||
              instr_info.classes[(int)instr->opcode] == instr_class::sendmsg) {
      res.sa_exec = 0;
      res.va_exec = 0;
      /* Conditional branches additionally depend on VCC or SCC. */
      switch (instr->opcode) {
      case aco_opcode::s_cbranch_vccz:
      case aco_opcode::s_cbranch_vccnz:
         res.va_vcc = 0;
         res.sa_sdst = 0;
         break;
      case aco_opcode::s_cbranch_scc0:
      case aco_opcode::s_cbranch_scc1:
         res.sa_sdst = 0;
         break;
      default: break;
      }
   } else if (instr->isSALU()) {
      /* SALU waits on VALU writes to any SGPR/VCC/exec it touches. */
      for (const Definition& def : instr->definitions) {
         if (def.physReg() < vcc) {
            res.va_sdst = 0;
         } else if (def.physReg() <= vcc_hi) {
            res.va_vcc = 0;
         } else if (def.physReg() == exec || def.physReg() == exec_hi) {
            res.va_exec = 0;
         }
      }
      for (const Operand& op : instr->operands) {
         if (op.physReg() < vcc) {
            res.va_sdst = 0;
         } else if (op.physReg() <= vcc_hi) {
            res.va_vcc = 0;
         } else if (op.physReg() == exec || op.physReg() == exec_hi) {
            res.va_exec = 0;
         }
      }
   }

   return res;
}
1532 
1533 bool
dealloc_vgprs(Program * program)1534 dealloc_vgprs(Program* program)
1535 {
1536    if (program->gfx_level < GFX11)
1537       return false;
1538 
1539    /* If we insert the sendmsg on GFX11.5, the export priority workaround will require us to insert
1540     * a wait after exports. There might still be pending VMEM stores for PS parameter exports,
1541     * except NGG lowering usually inserts a memory barrier. This means there is unlikely to be any
1542     * pending VMEM stores or exports if we insert the sendmsg for these stages. */
1543    if (program->gfx_level == GFX11_5 && (program->stage.hw == AC_HW_NEXT_GEN_GEOMETRY_SHADER ||
1544                                          program->stage.hw == AC_HW_PIXEL_SHADER))
1545       return false;
1546 
1547    Block& block = program->blocks.back();
1548 
1549    /* don't bother checking if there is a pending VMEM store or export: there almost always is */
1550    Builder bld(program);
1551    if (!block.instructions.empty() && block.instructions.back()->opcode == aco_opcode::s_endpgm) {
1552       bld.reset(&block.instructions, block.instructions.begin() + (block.instructions.size() - 1));
1553       bld.sopp(aco_opcode::s_sendmsg, sendmsg_dealloc_vgprs);
1554    }
1555 
1556    return true;
1557 }
1558 
1559 bool
isTrans() const1560 Instruction::isTrans() const noexcept
1561 {
1562    return instr_info.classes[(int)opcode] == instr_class::valu_transcendental32 ||
1563           instr_info.classes[(int)opcode] == instr_class::valu_double_transcendental ||
1564           instr_info.classes[(int)opcode] == instr_class::valu_pseudo_scalar_trans;
1565 }
1566 
1567 size_t
get_instr_data_size(Format format)1568 get_instr_data_size(Format format)
1569 {
1570    switch (format) {
1571    case Format::SOP1:
1572    case Format::SOP2:
1573    case Format::SOPC:
1574    case Format::SOPK:
1575    case Format::SOPP: return sizeof(SALU_instruction);
1576    case Format::SMEM: return sizeof(SMEM_instruction);
1577    case Format::PSEUDO: return sizeof(Pseudo_instruction);
1578    case Format::PSEUDO_BARRIER: return sizeof(Pseudo_barrier_instruction);
1579    case Format::PSEUDO_REDUCTION: return sizeof(Pseudo_reduction_instruction);
1580    case Format::PSEUDO_BRANCH: return sizeof(Pseudo_branch_instruction);
1581    case Format::DS: return sizeof(DS_instruction);
1582    case Format::FLAT:
1583    case Format::GLOBAL:
1584    case Format::SCRATCH: return sizeof(FLAT_instruction);
1585    case Format::LDSDIR: return sizeof(LDSDIR_instruction);
1586    case Format::MTBUF: return sizeof(MTBUF_instruction);
1587    case Format::MUBUF: return sizeof(MUBUF_instruction);
1588    case Format::MIMG: return sizeof(MIMG_instruction);
1589    case Format::VOPD: return sizeof(VOPD_instruction);
1590    case Format::VINTERP_INREG: return sizeof(VINTERP_inreg_instruction);
1591    case Format::VINTRP: return sizeof(VINTRP_instruction);
1592    case Format::EXP: return sizeof(Export_instruction);
1593    default:
1594       if ((uint16_t)format & (uint16_t)Format::DPP16)
1595          return sizeof(DPP16_instruction);
1596       else if ((uint16_t)format & (uint16_t)Format::DPP8)
1597          return sizeof(DPP8_instruction);
1598       else if ((uint16_t)format & (uint16_t)Format::SDWA)
1599          return sizeof(SDWA_instruction);
1600       else
1601          return sizeof(VALU_instruction);
1602    }
1603 }
1604 
1605 Instruction*
create_instruction(aco_opcode opcode,Format format,uint32_t num_operands,uint32_t num_definitions)1606 create_instruction(aco_opcode opcode, Format format, uint32_t num_operands,
1607                    uint32_t num_definitions)
1608 {
1609    size_t size = get_instr_data_size(format);
1610    size_t total_size = size + num_operands * sizeof(Operand) + num_definitions * sizeof(Definition);
1611 
1612    void* data = instruction_buffer->allocate(total_size, alignof(uint32_t));
1613    memset(data, 0, total_size);
1614    Instruction* inst = (Instruction*)data;
1615 
1616    inst->opcode = opcode;
1617    inst->format = format;
1618 
1619    uint16_t operands_offset = size - offsetof(Instruction, operands);
1620    inst->operands = aco::span<Operand>(operands_offset, num_operands);
1621    uint16_t definitions_offset = (char*)inst->operands.end() - (char*)&inst->definitions;
1622    inst->definitions = aco::span<Definition>(definitions_offset, num_definitions);
1623 
1624    return inst;
1625 }
1626 
1627 } // namespace aco
1628