• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 /*
2  * Copyright © 2020 Valve Corporation
3  *
4  * SPDX-License-Identifier: MIT
5  */
6 
7 #include "aco_ir.h"
8 
9 #include "aco_builder.h"
10 
11 #include "util/u_debug.h"
12 
13 #include "c11/threads.h"
14 
15 namespace aco {
16 
/* Per-thread arena used to allocate Instruction objects; init_program() points
 * it at the Program currently being compiled on this thread. */
thread_local aco::monotonic_buffer_resource* instruction_buffer = nullptr;

/* Bitmask of DEBUG_* flags, parsed once from the ACO_DEBUG environment
 * variable (see init_once()). */
uint64_t debug_flags = 0;

/* Maps ACO_DEBUG option names to their DEBUG_* flag bits; NULL-terminated for
 * parse_debug_string(). */
static const struct debug_control aco_debug_options[] = {
   {"validateir", DEBUG_VALIDATE_IR},
   {"validatera", DEBUG_VALIDATE_RA},
   {"validate-livevars", DEBUG_VALIDATE_LIVE_VARS},
   {"novalidateir", DEBUG_NO_VALIDATE_IR},
   {"force-waitcnt", DEBUG_FORCE_WAITCNT},
   {"force-waitdeps", DEBUG_FORCE_WAITDEPS},
   {"novn", DEBUG_NO_VN},
   {"noopt", DEBUG_NO_OPT},
   {"nosched", DEBUG_NO_SCHED | DEBUG_NO_SCHED_ILP | DEBUG_NO_SCHED_VOPD},
   {"nosched-ilp", DEBUG_NO_SCHED_ILP},
   {"nosched-vopd", DEBUG_NO_SCHED_VOPD},
   {"perfinfo", DEBUG_PERF_INFO},
   {"liveinfo", DEBUG_LIVE_INFO},
   {NULL, 0}};

/* Guards the one-time initialization performed by init(). */
static once_flag init_once_flag = ONCE_FLAG_INIT;
38 
39 static void
init_once()40 init_once()
41 {
42    debug_flags = parse_debug_string(getenv("ACO_DEBUG"), aco_debug_options);
43 
44 #ifndef NDEBUG
45    /* enable some flags by default on debug builds */
46    debug_flags |= aco::DEBUG_VALIDATE_IR;
47 #endif
48 
49    if (debug_flags & aco::DEBUG_NO_VALIDATE_IR)
50       debug_flags &= ~aco::DEBUG_VALIDATE_IR;
51 }
52 
/* Public entry point for global ACO initialization; call_once makes it safe
 * to invoke from multiple threads and repeatedly. */
void
init()
{
   call_once(&init_once_flag, init_once);
}
58 
/* Initializes a Program for compilation: records the shader stage/config and
 * fills program->dev with the target GPU's hardware parameters (register
 * limits, allocation granules, LDS sizes, wave counts, feature flags), then
 * sets the initial floating-point mode defaults. */
void
init_program(Program* program, Stage stage, const struct aco_shader_info* info,
             enum amd_gfx_level gfx_level, enum radeon_family family, bool wgp_mode,
             ac_shader_config* config)
{
   /* Point this thread's instruction arena at the program's buffer. */
   instruction_buffer = &program->m;
   program->stage = stage;
   program->config = config;
   program->info = *info;
   program->gfx_level = gfx_level;
   /* If no specific chip was given, pick a representative chip for the gfx
    * level so the family-based checks below behave sensibly. */
   if (family == CHIP_UNKNOWN) {
      switch (gfx_level) {
      case GFX6: program->family = CHIP_TAHITI; break;
      case GFX7: program->family = CHIP_BONAIRE; break;
      case GFX8: program->family = CHIP_POLARIS10; break;
      case GFX9: program->family = CHIP_VEGA10; break;
      case GFX10: program->family = CHIP_NAVI10; break;
      case GFX10_3: program->family = CHIP_NAVI21; break;
      case GFX11: program->family = CHIP_NAVI31; break;
      case GFX12: program->family = CHIP_GFX1200; break;
      default: program->family = CHIP_UNKNOWN; break;
      }
   } else {
      program->family = family;
   }
   program->wave_size = info->wave_size;
   /* The lane mask occupies one SGPR in wave32 mode, two in wave64. */
   program->lane_mask = program->wave_size == 32 ? s1 : s2;

   /* Granularity of the LDS size field in the shader config vs. the
    * granularity the hardware actually allocates LDS at. */
   program->dev.lds_encoding_granule = gfx_level >= GFX11 && stage == fragment_fs ? 1024
                                       : gfx_level >= GFX7                        ? 512
                                                                                  : 256;
   program->dev.lds_alloc_granule = gfx_level >= GFX10_3 ? 1024 : program->dev.lds_encoding_granule;

   /* GFX6: There is 64KB LDS per CU, but a single workgroup can only use 32KB. */
   program->dev.lds_limit = gfx_level >= GFX7 ? 65536 : 32768;

   /* apparently gfx702 also has 16-bank LDS but I can't find a family for that */
   program->dev.has_16bank_lds = family == CHIP_KABINI || family == CHIP_STONEY;

   program->dev.vgpr_limit = stage == raytracing_cs ? 128 : 256;
   program->dev.physical_vgprs = 256;
   program->dev.vgpr_alloc_granule = 4;

   /* SGPR/VGPR counts and allocation granules per generation (and per wave
    * size on GFX10+). */
   if (gfx_level >= GFX10) {
      program->dev.physical_sgprs = 128 * 20; /* enough for max waves */
      program->dev.sgpr_alloc_granule = 128;
      program->dev.sgpr_limit =
         108; /* includes VCC, which can be treated as s[106-107] on GFX10+ */

      if (family == CHIP_NAVI31 || family == CHIP_NAVI32 || family == CHIP_GFX1151 ||
          gfx_level >= GFX12) {
         program->dev.physical_vgprs = program->wave_size == 32 ? 1536 : 768;
         program->dev.vgpr_alloc_granule = program->wave_size == 32 ? 24 : 12;
      } else {
         program->dev.physical_vgprs = program->wave_size == 32 ? 1024 : 512;
         if (gfx_level >= GFX10_3)
            program->dev.vgpr_alloc_granule = program->wave_size == 32 ? 16 : 8;
         else
            program->dev.vgpr_alloc_granule = program->wave_size == 32 ? 8 : 4;
      }
   } else if (program->gfx_level >= GFX8) {
      program->dev.physical_sgprs = 800;
      program->dev.sgpr_alloc_granule = 16;
      program->dev.sgpr_limit = 102;
      if (family == CHIP_TONGA || family == CHIP_ICELAND)
         program->dev.sgpr_alloc_granule = 96; /* workaround hardware bug */
   } else {
      program->dev.physical_sgprs = 512;
      program->dev.sgpr_alloc_granule = 8;
      program->dev.sgpr_limit = 104;
   }

   program->dev.scratch_alloc_granule = gfx_level >= GFX11 ? 256 : 1024;

   /* Maximum wave occupancy per SIMD; default 10, with per-generation
    * (and per-family) overrides. */
   program->dev.max_waves_per_simd = 10;
   if (program->gfx_level >= GFX10_3)
      program->dev.max_waves_per_simd = 16;
   else if (program->gfx_level == GFX10)
      program->dev.max_waves_per_simd = 20;
   else if (program->family >= CHIP_POLARIS10 && program->family <= CHIP_VEGAM)
      program->dev.max_waves_per_simd = 8;

   program->dev.simd_per_cu = program->gfx_level >= GFX10 ? 2 : 4;

   /* XNACK (page-fault replay) is enabled on these APUs. */
   switch (program->family) {
   /* GFX8 APUs */
   case CHIP_CARRIZO:
   case CHIP_STONEY:
   /* GFX9 APUS */
   case CHIP_RAVEN:
   case CHIP_RAVEN2:
   case CHIP_RENOIR: program->dev.xnack_enabled = true; break;
   default: break;
   }

   program->dev.sram_ecc_enabled = program->family == CHIP_MI100;
   /* apparently gfx702 also has fast v_fma_f32 but I can't find a family for that */
   program->dev.has_fast_fma32 = program->gfx_level >= GFX9;
   if (program->family == CHIP_TAHITI || program->family == CHIP_CARRIZO ||
       program->family == CHIP_HAWAII)
      program->dev.has_fast_fma32 = true;
   program->dev.has_mac_legacy32 = program->gfx_level <= GFX7 || program->gfx_level == GFX10;
   program->dev.has_fmac_legacy32 = program->gfx_level >= GFX10_3 && program->gfx_level < GFX12;

   program->dev.fused_mad_mix = program->gfx_level >= GFX10;
   if (program->family == CHIP_VEGA12 || program->family == CHIP_VEGA20 ||
       program->family == CHIP_MI100 || program->family == CHIP_MI200)
      program->dev.fused_mad_mix = true;

   /* Immediate-offset range of scratch/global memory instructions. */
   if (program->gfx_level >= GFX11) {
      program->dev.scratch_global_offset_min = -4096;
      program->dev.scratch_global_offset_max = 4095;
   } else if (program->gfx_level >= GFX10 || program->gfx_level == GFX8) {
      program->dev.scratch_global_offset_min = -2048;
      program->dev.scratch_global_offset_max = 2047;
   } else if (program->gfx_level == GFX9) {
      /* The minimum is actually -4096, but negative offsets are broken when SADDR is used. */
      program->dev.scratch_global_offset_min = 0;
      program->dev.scratch_global_offset_max = 4095;
   }

   /* Maximum extra VGPRs usable by MIMG non-sequential-address encodings. */
   if (program->gfx_level >= GFX12) {
      /* Same as GFX11, except one less for VSAMPLE. */
      program->dev.max_nsa_vgprs = 3;
   } else if (program->gfx_level >= GFX11) {
      /* GFX11 can have only 1 NSA dword. The last VGPR isn't included here because it contains the
       * rest of the address.
       */
      program->dev.max_nsa_vgprs = 4;
   } else if (program->gfx_level >= GFX10_3) {
      /* GFX10.3 can have up to 3 NSA dwords. */
      program->dev.max_nsa_vgprs = 13;
   } else if (program->gfx_level >= GFX10) {
      /* Limit NSA instructions to 1 NSA dword on GFX10 to avoid stability issues. */
      program->dev.max_nsa_vgprs = 5;
   } else {
      program->dev.max_nsa_vgprs = 0;
   }

   program->wgp_mode = wgp_mode;

   program->progress = CompilationProgress::after_isel;

   /* Initial floating-point mode defaults (keep denorms, round-to-nearest-even). */
   program->next_fp_mode.must_flush_denorms32 = false;
   program->next_fp_mode.must_flush_denorms16_64 = false;
   program->next_fp_mode.care_about_round32 = false;
   program->next_fp_mode.care_about_round16_64 = false;
   program->next_fp_mode.denorm16_64 = fp_denorm_keep;
   program->next_fp_mode.denorm32 = 0;
   program->next_fp_mode.round16_64 = fp_round_ne;
   program->next_fp_mode.round32 = fp_round_ne;
}
211 
212 bool
is_wait_export_ready(amd_gfx_level gfx_level,const Instruction * instr)213 is_wait_export_ready(amd_gfx_level gfx_level, const Instruction* instr)
214 {
215    return instr->opcode == aco_opcode::s_wait_event &&
216           (gfx_level >= GFX12 ? (instr->salu().imm & wait_event_imm_wait_export_ready_gfx12)
217                               : !(instr->salu().imm & wait_event_imm_dont_wait_export_ready_gfx11));
218 }
219 
220 memory_sync_info
get_sync_info(const Instruction * instr)221 get_sync_info(const Instruction* instr)
222 {
223    /* Primitive Ordered Pixel Shading barriers necessary for accesses to memory shared between
224     * overlapping waves in the queue family.
225     */
226    if (instr->opcode == aco_opcode::p_pops_gfx9_overlapped_wave_wait_done ||
227        instr->opcode == aco_opcode::s_wait_event) {
228       return memory_sync_info(storage_buffer | storage_image, semantic_acquire, scope_queuefamily);
229    } else if (instr->opcode == aco_opcode::p_pops_gfx9_ordered_section_done) {
230       return memory_sync_info(storage_buffer | storage_image, semantic_release, scope_queuefamily);
231    }
232 
233    switch (instr->format) {
234    case Format::SMEM: return instr->smem().sync;
235    case Format::MUBUF: return instr->mubuf().sync;
236    case Format::MIMG: return instr->mimg().sync;
237    case Format::MTBUF: return instr->mtbuf().sync;
238    case Format::FLAT:
239    case Format::GLOBAL:
240    case Format::SCRATCH: return instr->flatlike().sync;
241    case Format::DS: return instr->ds().sync;
242    case Format::LDSDIR: return instr->ldsdir().sync;
243    default: return memory_sync_info();
244    }
245 }
246 
/* Returns whether the instruction can use (or be converted to) the SDWA
 * encoding on the given gfx level. With pre_ra, constraints that depend on
 * the eventual register assignment (VCC usage) are relaxed. */
bool
can_use_SDWA(amd_gfx_level gfx_level, const aco_ptr<Instruction>& instr, bool pre_ra)
{
   if (!instr->isVALU())
      return false;

   /* SDWA only exists on GFX8-GFX10.x and can't combine with DPP or VOP3P. */
   if (gfx_level < GFX8 || gfx_level >= GFX11 || instr->isDPP() || instr->isVOP3P())
      return false;

   if (instr->isSDWA())
      return true;

   if (instr->isVOP3()) {
      VALU_instruction& vop3 = instr->valu();
      /* A plain VOP3 opcode (not a promoted VOP1/VOP2/VOPC) has no SDWA form. */
      if (instr->format == Format::VOP3)
         return false;
      if (vop3.clamp && instr->isVOPC() && gfx_level != GFX8)
         return false;
      if (vop3.omod && gfx_level < GFX9)
         return false;

      // TODO: return true if we know we will use vcc
      if (!pre_ra && instr->definitions.size() >= 2)
         return false;

      for (unsigned i = 1; i < instr->operands.size(); i++) {
         if (instr->operands[i].isLiteral())
            return false;
         /* Before GFX9, sources past operand 0 must be VGPRs. */
         if (gfx_level < GFX9 && !instr->operands[i].isOfType(RegType::vgpr))
            return false;
      }
   }

   /* No 64-bit destinations, except VOPC's lane-mask definition. */
   if (!instr->definitions.empty() && instr->definitions[0].bytes() > 4 && !instr->isVOPC())
      return false;

   if (!instr->operands.empty()) {
      if (instr->operands[0].isLiteral())
         return false;
      /* Before GFX9, operand 0 must be a VGPR. */
      if (gfx_level < GFX9 && !instr->operands[0].isOfType(RegType::vgpr))
         return false;
      /* No 64-bit sources. */
      if (instr->operands[0].bytes() > 4)
         return false;
      if (instr->operands.size() > 1 && instr->operands[1].bytes() > 4)
         return false;
   }

   bool is_mac = instr->opcode == aco_opcode::v_mac_f32 || instr->opcode == aco_opcode::v_mac_f16 ||
                 instr->opcode == aco_opcode::v_fmac_f32 || instr->opcode == aco_opcode::v_fmac_f16;

   /* (f)mac only supports SDWA on GFX8. */
   if (gfx_level != GFX8 && is_mac)
      return false;

   // TODO: return true if we know we will use vcc
   if (!pre_ra && instr->isVOPC() && gfx_level == GFX8)
      return false;
   if (!pre_ra && instr->operands.size() >= 3 && !is_mac)
      return false;

   /* Opcodes with inline literals or special semantics have no SDWA form. */
   return instr->opcode != aco_opcode::v_madmk_f32 && instr->opcode != aco_opcode::v_madak_f32 &&
          instr->opcode != aco_opcode::v_madmk_f16 && instr->opcode != aco_opcode::v_madak_f16 &&
          instr->opcode != aco_opcode::v_fmamk_f32 && instr->opcode != aco_opcode::v_fmaak_f32 &&
          instr->opcode != aco_opcode::v_fmamk_f16 && instr->opcode != aco_opcode::v_fmaak_f16 &&
          instr->opcode != aco_opcode::v_readfirstlane_b32 &&
          instr->opcode != aco_opcode::v_clrexcp && instr->opcode != aco_opcode::v_swap_b32;
}
313 
/* updates "instr" and returns the old instruction (or NULL if no update was needed) */
aco_ptr<Instruction>
convert_to_SDWA(amd_gfx_level gfx_level, aco_ptr<Instruction>& instr)
{
   if (instr->isSDWA())
      return NULL;

   /* Re-create the instruction with the SDWA encoding (dropping any VOP3
    * encoding) and copy the operands/definitions over. */
   aco_ptr<Instruction> tmp = std::move(instr);
   Format format = asSDWA(withoutVOP3(tmp->format));
   instr.reset(
      create_instruction(tmp->opcode, format, tmp->operands.size(), tmp->definitions.size()));
   std::copy(tmp->operands.cbegin(), tmp->operands.cend(), instr->operands.begin());
   std::copy(tmp->definitions.cbegin(), tmp->definitions.cend(), instr->definitions.begin());

   SDWA_instruction& sdwa = instr->sdwa();

   /* Carry over the VALU modifiers, which SDWA can also encode. */
   if (tmp->isVOP3()) {
      VALU_instruction& vop3 = tmp->valu();
      sdwa.neg = vop3.neg;
      sdwa.abs = vop3.abs;
      sdwa.omod = vop3.omod;
      sdwa.clamp = vop3.clamp;
   }

   /* Default selections: use each operand's full size starting at byte 0. */
   for (unsigned i = 0; i < instr->operands.size(); i++) {
      /* SDWA only uses operands 0 and 1. */
      if (i >= 2)
         break;

      sdwa.sel[i] = SubdwordSel(instr->operands[i].bytes(), 0, false);
   }

   sdwa.dst_sel = SubdwordSel(instr->definitions[0].bytes(), 0, false);

   /* SDWA constrains an SGPR destination (GFX8), any second definition and
    * the third operand to VCC (see can_use_SDWA). */
   if (instr->definitions[0].getTemp().type() == RegType::sgpr && gfx_level == GFX8)
      instr->definitions[0].setPrecolored(vcc);
   if (instr->definitions.size() >= 2)
      instr->definitions[1].setPrecolored(vcc);
   if (instr->operands.size() >= 3)
      instr->operands[2].setPrecolored(vcc);

   instr->pass_flags = tmp->pass_flags;

   return tmp;
}
359 
360 bool
can_use_DPP(amd_gfx_level gfx_level,const aco_ptr<Instruction> & instr,bool dpp8)361 can_use_DPP(amd_gfx_level gfx_level, const aco_ptr<Instruction>& instr, bool dpp8)
362 {
363    assert(instr->isVALU() && !instr->operands.empty());
364 
365    if (instr->isDPP())
366       return instr->isDPP8() == dpp8;
367 
368    if (instr->isSDWA() || instr->isVINTERP_INREG())
369       return false;
370 
371    if ((instr->format == Format::VOP3 || instr->isVOP3P()) && gfx_level < GFX11)
372       return false;
373 
374    if ((instr->isVOPC() || instr->definitions.size() > 1) && instr->definitions.back().isFixed() &&
375        instr->definitions.back().physReg() != vcc && gfx_level < GFX11)
376       return false;
377 
378    if (instr->operands.size() >= 3 && instr->operands[2].isFixed() &&
379        instr->operands[2].isOfType(RegType::sgpr) && instr->operands[2].physReg() != vcc &&
380        gfx_level < GFX11)
381       return false;
382 
383    if (instr->isVOP3() && gfx_level < GFX11) {
384       const VALU_instruction* vop3 = &instr->valu();
385       if (vop3->clamp || vop3->omod)
386          return false;
387       if (dpp8)
388          return false;
389    }
390 
391    for (unsigned i = 0; i < instr->operands.size(); i++) {
392       if (instr->operands[i].isLiteral())
393          return false;
394       if (!instr->operands[i].isOfType(RegType::vgpr) && i < 2)
395          return false;
396    }
397 
398    /* According to LLVM, it's unsafe to combine DPP into v_cmpx. */
399    if (instr->writes_exec())
400       return false;
401 
402    /* simpler than listing all VOP3P opcodes which do not support DPP */
403    if (instr->isVOP3P()) {
404       return instr->opcode == aco_opcode::v_fma_mix_f32 ||
405              instr->opcode == aco_opcode::v_fma_mixlo_f16 ||
406              instr->opcode == aco_opcode::v_fma_mixhi_f16 ||
407              instr->opcode == aco_opcode::v_dot2_f32_f16 ||
408              instr->opcode == aco_opcode::v_dot2_f32_bf16;
409    }
410 
411    if (instr->opcode == aco_opcode::v_pk_fmac_f16)
412       return gfx_level < GFX11;
413 
414    /* there are more cases but those all take 64-bit inputs */
415    return instr->opcode != aco_opcode::v_madmk_f32 && instr->opcode != aco_opcode::v_madak_f32 &&
416           instr->opcode != aco_opcode::v_madmk_f16 && instr->opcode != aco_opcode::v_madak_f16 &&
417           instr->opcode != aco_opcode::v_fmamk_f32 && instr->opcode != aco_opcode::v_fmaak_f32 &&
418           instr->opcode != aco_opcode::v_fmamk_f16 && instr->opcode != aco_opcode::v_fmaak_f16 &&
419           instr->opcode != aco_opcode::v_readfirstlane_b32 &&
420           instr->opcode != aco_opcode::v_cvt_f64_i32 &&
421           instr->opcode != aco_opcode::v_cvt_f64_f32 &&
422           instr->opcode != aco_opcode::v_cvt_f64_u32 && instr->opcode != aco_opcode::v_mul_lo_u32 &&
423           instr->opcode != aco_opcode::v_mul_lo_i32 && instr->opcode != aco_opcode::v_mul_hi_u32 &&
424           instr->opcode != aco_opcode::v_mul_hi_i32 &&
425           instr->opcode != aco_opcode::v_qsad_pk_u16_u8 &&
426           instr->opcode != aco_opcode::v_mqsad_pk_u16_u8 &&
427           instr->opcode != aco_opcode::v_mqsad_u32_u8 &&
428           instr->opcode != aco_opcode::v_mad_u64_u32 &&
429           instr->opcode != aco_opcode::v_mad_i64_i32 &&
430           instr->opcode != aco_opcode::v_permlane16_b32 &&
431           instr->opcode != aco_opcode::v_permlanex16_b32 &&
432           instr->opcode != aco_opcode::v_permlane64_b32 &&
433           instr->opcode != aco_opcode::v_readlane_b32_e64 &&
434           instr->opcode != aco_opcode::v_writelane_b32_e64 &&
435           instr->opcode != aco_opcode::p_v_cvt_pk_u8_f32;
436 }
437 
438 aco_ptr<Instruction>
convert_to_DPP(amd_gfx_level gfx_level,aco_ptr<Instruction> & instr,bool dpp8)439 convert_to_DPP(amd_gfx_level gfx_level, aco_ptr<Instruction>& instr, bool dpp8)
440 {
441    if (instr->isDPP())
442       return NULL;
443 
444    aco_ptr<Instruction> tmp = std::move(instr);
445    Format format =
446       (Format)((uint32_t)tmp->format | (uint32_t)(dpp8 ? Format::DPP8 : Format::DPP16));
447    if (dpp8)
448       instr.reset(
449          create_instruction(tmp->opcode, format, tmp->operands.size(), tmp->definitions.size()));
450    else
451       instr.reset(
452          create_instruction(tmp->opcode, format, tmp->operands.size(), tmp->definitions.size()));
453    std::copy(tmp->operands.cbegin(), tmp->operands.cend(), instr->operands.begin());
454    std::copy(tmp->definitions.cbegin(), tmp->definitions.cend(), instr->definitions.begin());
455 
456    if (dpp8) {
457       DPP8_instruction* dpp = &instr->dpp8();
458       dpp->lane_sel = 0xfac688; /* [0,1,2,3,4,5,6,7] */
459       dpp->fetch_inactive = gfx_level >= GFX10;
460    } else {
461       DPP16_instruction* dpp = &instr->dpp16();
462       dpp->dpp_ctrl = dpp_quad_perm(0, 1, 2, 3);
463       dpp->row_mask = 0xf;
464       dpp->bank_mask = 0xf;
465       dpp->fetch_inactive = gfx_level >= GFX10;
466    }
467 
468    instr->valu().neg = tmp->valu().neg;
469    instr->valu().abs = tmp->valu().abs;
470    instr->valu().omod = tmp->valu().omod;
471    instr->valu().clamp = tmp->valu().clamp;
472    instr->valu().opsel = tmp->valu().opsel;
473    instr->valu().opsel_lo = tmp->valu().opsel_lo;
474    instr->valu().opsel_hi = tmp->valu().opsel_hi;
475 
476    if ((instr->isVOPC() || instr->definitions.size() > 1) && gfx_level < GFX11)
477       instr->definitions.back().setPrecolored(vcc);
478 
479    if (instr->operands.size() >= 3 && instr->operands[2].isOfType(RegType::sgpr) &&
480        gfx_level < GFX11)
481       instr->operands[2].setPrecolored(vcc);
482 
483    instr->pass_flags = tmp->pass_flags;
484 
485    /* DPP16 supports input modifiers, so we might no longer need VOP3. */
486    bool remove_vop3 = !dpp8 && !instr->valu().omod && !instr->valu().clamp &&
487                       (instr->isVOP1() || instr->isVOP2() || instr->isVOPC());
488 
489    /* VOPC/add_co/sub_co definition needs VCC without VOP3. */
490    remove_vop3 &= instr->definitions.back().regClass().type() != RegType::sgpr ||
491                   !instr->definitions.back().isFixed() ||
492                   instr->definitions.back().physReg() == vcc;
493 
494    /* addc/subb/cndmask 3rd operand needs VCC without VOP3. */
495    remove_vop3 &= instr->operands.size() < 3 || !instr->operands[2].isFixed() ||
496                   instr->operands[2].isOfType(RegType::vgpr) || instr->operands[2].physReg() == vcc;
497 
498    if (remove_vop3)
499       instr->format = withoutVOP3(instr->format);
500 
501    return tmp;
502 }
503 
504 bool
can_use_input_modifiers(amd_gfx_level gfx_level,aco_opcode op,int idx)505 can_use_input_modifiers(amd_gfx_level gfx_level, aco_opcode op, int idx)
506 {
507    if (op == aco_opcode::v_mov_b32)
508       return gfx_level >= GFX10;
509 
510    if (op == aco_opcode::v_ldexp_f16 || op == aco_opcode::v_ldexp_f32 ||
511        op == aco_opcode::v_ldexp_f64)
512       return idx == 0;
513 
514    return instr_info.can_use_input_modifiers[(int)op];
515 }
516 
/* Returns whether the given opcode supports the opsel bit for operand "idx"
 * (idx == -1 refers to the destination). */
bool
can_use_opsel(amd_gfx_level gfx_level, aco_opcode op, int idx)
{
   /* opsel is only GFX9+ */
   if (gfx_level < GFX9)
      return false;

   switch (op) {
   /* These support opsel on every source and the destination. */
   case aco_opcode::v_div_fixup_f16:
   case aco_opcode::v_fma_f16:
   case aco_opcode::v_mad_f16:
   case aco_opcode::v_mad_u16:
   case aco_opcode::v_mad_i16:
   case aco_opcode::v_med3_f16:
   case aco_opcode::v_med3_i16:
   case aco_opcode::v_med3_u16:
   case aco_opcode::v_min3_f16:
   case aco_opcode::v_min3_i16:
   case aco_opcode::v_min3_u16:
   case aco_opcode::v_max3_f16:
   case aco_opcode::v_max3_i16:
   case aco_opcode::v_max3_u16:
   case aco_opcode::v_minmax_f16:
   case aco_opcode::v_maxmin_f16:
   case aco_opcode::v_max_u16_e64:
   case aco_opcode::v_max_i16_e64:
   case aco_opcode::v_min_u16_e64:
   case aco_opcode::v_min_i16_e64:
   case aco_opcode::v_add_i16:
   case aco_opcode::v_sub_i16:
   case aco_opcode::v_add_u16_e64:
   case aco_opcode::v_sub_u16_e64:
   case aco_opcode::v_lshlrev_b16_e64:
   case aco_opcode::v_lshrrev_b16_e64:
   case aco_opcode::v_ashrrev_i16_e64:
   case aco_opcode::v_and_b16:
   case aco_opcode::v_or_b16:
   case aco_opcode::v_xor_b16:
   case aco_opcode::v_mul_lo_u16_e64: return true;
   /* Sources only, not the destination. */
   case aco_opcode::v_pack_b32_f16:
   case aco_opcode::v_cvt_pknorm_i16_f16:
   case aco_opcode::v_cvt_pknorm_u16_f16: return idx != -1;
   case aco_opcode::v_mad_u32_u16:
   case aco_opcode::v_mad_i32_i16: return idx >= 0 && idx < 2;
   case aco_opcode::v_dot2_f16_f16:
   case aco_opcode::v_dot2_bf16_bf16: return idx == -1 || idx == 2;
   case aco_opcode::v_cndmask_b16: return idx != 2;
   case aco_opcode::v_interp_p10_f16_f32_inreg:
   case aco_opcode::v_interp_p10_rtz_f16_f32_inreg: return idx == 0 || idx == 2;
   case aco_opcode::v_interp_p2_f16_f32_inreg:
   case aco_opcode::v_interp_p2_rtz_f16_f32_inreg: return idx == -1 || idx == 0;
   default:
      /* GFX11 true16 instructions use opsel as the high-half bit; bit 3 of
       * the mask stands for the destination. */
      return gfx_level >= GFX11 && (get_gfx11_true16_mask(op) & BITFIELD_BIT(idx == -1 ? 3 : idx));
   }
}
572 
573 bool
can_write_m0(const aco_ptr<Instruction> & instr)574 can_write_m0(const aco_ptr<Instruction>& instr)
575 {
576    if (instr->isSALU())
577       return true;
578 
579    /* VALU can't write m0 on any GPU generations. */
580    if (instr->isVALU())
581       return false;
582 
583    switch (instr->opcode) {
584    case aco_opcode::p_parallelcopy:
585    case aco_opcode::p_extract:
586    case aco_opcode::p_insert:
587       /* These pseudo instructions are implemented with SALU when writing m0. */
588       return true;
589    default:
590       /* Assume that no other instructions can write m0. */
591       return false;
592    }
593 }
594 
/* Returns whether this opcode writes only the low 16 bits of its destination
 * VGPR, preserving the high half. */
bool
instr_is_16bit(amd_gfx_level gfx_level, aco_opcode op)
{
   /* partial register writes are GFX9+, only */
   if (gfx_level < GFX9)
      return false;

   switch (op) {
   /* VOP3 */
   /* Legacy opcodes write the full dword. */
   case aco_opcode::v_mad_legacy_f16:
   case aco_opcode::v_mad_legacy_u16:
   case aco_opcode::v_mad_legacy_i16:
   case aco_opcode::v_fma_legacy_f16:
   case aco_opcode::v_div_fixup_legacy_f16: return false;
   case aco_opcode::v_interp_p2_f16:
   case aco_opcode::v_interp_p2_hi_f16:
   case aco_opcode::v_fma_mixlo_f16:
   case aco_opcode::v_fma_mixhi_f16:
   /* VOP2 */
   case aco_opcode::v_mac_f16:
   case aco_opcode::v_madak_f16:
   case aco_opcode::v_madmk_f16: return gfx_level >= GFX9; /* always true after the check above */
   case aco_opcode::v_add_f16:
   case aco_opcode::v_sub_f16:
   case aco_opcode::v_subrev_f16:
   case aco_opcode::v_mul_f16:
   case aco_opcode::v_max_f16:
   case aco_opcode::v_min_f16:
   case aco_opcode::v_ldexp_f16:
   case aco_opcode::v_fmac_f16:
   case aco_opcode::v_fmamk_f16:
   case aco_opcode::v_fmaak_f16:
   /* VOP1 */
   case aco_opcode::v_cvt_f16_f32:
   case aco_opcode::p_v_cvt_f16_f32_rtne:
   case aco_opcode::v_cvt_f16_u16:
   case aco_opcode::v_cvt_f16_i16:
   case aco_opcode::v_rcp_f16:
   case aco_opcode::v_sqrt_f16:
   case aco_opcode::v_rsq_f16:
   case aco_opcode::v_log_f16:
   case aco_opcode::v_exp_f16:
   case aco_opcode::v_frexp_mant_f16:
   case aco_opcode::v_frexp_exp_i16_f16:
   case aco_opcode::v_floor_f16:
   case aco_opcode::v_ceil_f16:
   case aco_opcode::v_trunc_f16:
   case aco_opcode::v_rndne_f16:
   case aco_opcode::v_fract_f16:
   case aco_opcode::v_sin_f16:
   case aco_opcode::v_cos_f16:
   case aco_opcode::v_cvt_u16_f16:
   case aco_opcode::v_cvt_i16_f16:
   case aco_opcode::v_cvt_norm_i16_f16:
   case aco_opcode::v_cvt_norm_u16_f16: return gfx_level >= GFX10;
   /* all non legacy opsel instructions preserve the high bits */
   default: return can_use_opsel(gfx_level, op, -1);
   }
}
654 
/* On GFX11, for some instructions, bit 7 of the destination/operand vgpr is opsel and the field
 * only supports v0-v127.
 * The first three bits are used for operands 0-2, and the 4th bit is used for the destination.
 */
uint8_t
get_gfx11_true16_mask(aco_opcode op)
{
   switch (op) {
   /* VOP1-style: 16-bit destination and source 0. */
   case aco_opcode::v_ceil_f16:
   case aco_opcode::v_cos_f16:
   case aco_opcode::v_cvt_f16_i16:
   case aco_opcode::v_cvt_f16_u16:
   case aco_opcode::v_cvt_i16_f16:
   case aco_opcode::v_cvt_u16_f16:
   case aco_opcode::v_cvt_norm_i16_f16:
   case aco_opcode::v_cvt_norm_u16_f16:
   case aco_opcode::v_exp_f16:
   case aco_opcode::v_floor_f16:
   case aco_opcode::v_fract_f16:
   case aco_opcode::v_frexp_exp_i16_f16:
   case aco_opcode::v_frexp_mant_f16:
   case aco_opcode::v_log_f16:
   case aco_opcode::v_not_b16:
   case aco_opcode::v_rcp_f16:
   case aco_opcode::v_rndne_f16:
   case aco_opcode::v_rsq_f16:
   case aco_opcode::v_sin_f16:
   case aco_opcode::v_sqrt_f16:
   case aco_opcode::v_trunc_f16:
   case aco_opcode::v_swap_b16:
   case aco_opcode::v_mov_b16: return 0x1 | 0x8;
   /* VOP2-style: 16-bit destination and sources 0 and 1. */
   case aco_opcode::v_add_f16:
   case aco_opcode::v_fmaak_f16:
   case aco_opcode::v_fmac_f16:
   case aco_opcode::v_fmamk_f16:
   case aco_opcode::v_ldexp_f16:
   case aco_opcode::v_max_f16:
   case aco_opcode::v_min_f16:
   case aco_opcode::v_mul_f16:
   case aco_opcode::v_sub_f16:
   case aco_opcode::v_subrev_f16:
   case aco_opcode::v_and_b16:
   case aco_opcode::v_or_b16:
   case aco_opcode::v_xor_b16: return 0x3 | 0x8;
   /* 16-bit source 0 only (32-bit destination). */
   case aco_opcode::v_cvt_f32_f16:
   case aco_opcode::v_cvt_i32_i16:
   case aco_opcode::v_cvt_u32_u16: return 0x1;
   /* Comparisons: 16-bit sources 0 and 1 (lane-mask destination). */
   case aco_opcode::v_cmp_class_f16:
   case aco_opcode::v_cmp_eq_f16:
   case aco_opcode::v_cmp_eq_i16:
   case aco_opcode::v_cmp_eq_u16:
   case aco_opcode::v_cmp_ge_f16:
   case aco_opcode::v_cmp_ge_i16:
   case aco_opcode::v_cmp_ge_u16:
   case aco_opcode::v_cmp_gt_f16:
   case aco_opcode::v_cmp_gt_i16:
   case aco_opcode::v_cmp_gt_u16:
   case aco_opcode::v_cmp_le_f16:
   case aco_opcode::v_cmp_le_i16:
   case aco_opcode::v_cmp_le_u16:
   case aco_opcode::v_cmp_lg_f16:
   case aco_opcode::v_cmp_lg_i16:
   case aco_opcode::v_cmp_lg_u16:
   case aco_opcode::v_cmp_lt_f16:
   case aco_opcode::v_cmp_lt_i16:
   case aco_opcode::v_cmp_lt_u16:
   case aco_opcode::v_cmp_neq_f16:
   case aco_opcode::v_cmp_nge_f16:
   case aco_opcode::v_cmp_ngt_f16:
   case aco_opcode::v_cmp_nle_f16:
   case aco_opcode::v_cmp_nlg_f16:
   case aco_opcode::v_cmp_nlt_f16:
   case aco_opcode::v_cmp_o_f16:
   case aco_opcode::v_cmp_u_f16:
   case aco_opcode::v_cmpx_class_f16:
   case aco_opcode::v_cmpx_eq_f16:
   case aco_opcode::v_cmpx_eq_i16:
   case aco_opcode::v_cmpx_eq_u16:
   case aco_opcode::v_cmpx_ge_f16:
   case aco_opcode::v_cmpx_ge_i16:
   case aco_opcode::v_cmpx_ge_u16:
   case aco_opcode::v_cmpx_gt_f16:
   case aco_opcode::v_cmpx_gt_i16:
   case aco_opcode::v_cmpx_gt_u16:
   case aco_opcode::v_cmpx_le_f16:
   case aco_opcode::v_cmpx_le_i16:
   case aco_opcode::v_cmpx_le_u16:
   case aco_opcode::v_cmpx_lg_f16:
   case aco_opcode::v_cmpx_lg_i16:
   case aco_opcode::v_cmpx_lg_u16:
   case aco_opcode::v_cmpx_lt_f16:
   case aco_opcode::v_cmpx_lt_i16:
   case aco_opcode::v_cmpx_lt_u16:
   case aco_opcode::v_cmpx_neq_f16:
   case aco_opcode::v_cmpx_nge_f16:
   case aco_opcode::v_cmpx_ngt_f16:
   case aco_opcode::v_cmpx_nle_f16:
   case aco_opcode::v_cmpx_nlg_f16:
   case aco_opcode::v_cmpx_nlt_f16:
   case aco_opcode::v_cmpx_o_f16:
   case aco_opcode::v_cmpx_u_f16: return 0x3;
   /* 16-bit destination only. */
   case aco_opcode::v_cvt_f16_f32:
   case aco_opcode::v_sat_pk_u8_i16: return 0x8;
   default: return 0x0;
   }
}
761 
/* Returns the identity element of the given reduction operation. For 64-bit
 * operations, idx selects the low (0) or high (1) dword of the identity. */
uint32_t
get_reduction_identity(ReduceOp op, unsigned idx)
{
   switch (op) {
   case iadd8:
   case iadd16:
   case iadd32:
   case iadd64:
   case fadd16:
   case fadd32:
   case fadd64:
   case ior8:
   case ior16:
   case ior32:
   case ior64:
   case ixor8:
   case ixor16:
   case ixor32:
   case ixor64:
   case umax8:
   case umax16:
   case umax32:
   case umax64: return 0;
   case imul8:
   case imul16:
   case imul32:
   case imul64: return idx ? 0 : 1;
   case fmul16: return 0x3c00u;                /* 1.0 */
   case fmul32: return 0x3f800000u;            /* 1.0 */
   case fmul64: return idx ? 0x3ff00000u : 0u; /* 1.0 */
   case imin8: return INT8_MAX;
   case imin16: return INT16_MAX;
   case imin32: return INT32_MAX;
   case imin64: return idx ? 0x7fffffffu : 0xffffffffu;
   case imax8: return INT8_MIN;
   case imax16: return INT16_MIN;
   case imax32: return INT32_MIN;
   case imax64: return idx ? 0x80000000u : 0;
   /* NOTE(review): the 8/16-bit cases return all-ones; presumably only the
    * low 8/16 bits are consumed by callers — confirm. */
   case umin8:
   case umin16:
   case iand8:
   case iand16: return 0xffffffffu;
   case umin32:
   case umin64:
   case iand32:
   case iand64: return 0xffffffffu;
   case fmin16: return 0x7c00u;                /* infinity */
   case fmin32: return 0x7f800000u;            /* infinity */
   case fmin64: return idx ? 0x7ff00000u : 0u; /* infinity */
   case fmax16: return 0xfc00u;                /* negative infinity */
   case fmax32: return 0xff800000u;            /* negative infinity */
   case fmax64: return idx ? 0xfff00000u : 0u; /* negative infinity */
   default: unreachable("Invalid reduction operation"); break;
   }
   return 0;
}
818 
819 unsigned
get_operand_size(aco_ptr<Instruction> & instr,unsigned index)820 get_operand_size(aco_ptr<Instruction>& instr, unsigned index)
821 {
822    if (instr->isPseudo())
823       return instr->operands[index].bytes() * 8u;
824    else if (instr->opcode == aco_opcode::v_mad_u64_u32 ||
825             instr->opcode == aco_opcode::v_mad_i64_i32)
826       return index == 2 ? 64 : 32;
827    else if (instr->opcode == aco_opcode::v_fma_mix_f32 ||
828             instr->opcode == aco_opcode::v_fma_mixlo_f16 ||
829             instr->opcode == aco_opcode::v_fma_mixhi_f16)
830       return instr->valu().opsel_hi[index] ? 16 : 32;
831    else if (instr->opcode == aco_opcode::v_interp_p10_f16_f32_inreg ||
832             instr->opcode == aco_opcode::v_interp_p10_rtz_f16_f32_inreg)
833       return index == 1 ? 32 : 16;
834    else if (instr->opcode == aco_opcode::v_interp_p2_f16_f32_inreg ||
835             instr->opcode == aco_opcode::v_interp_p2_rtz_f16_f32_inreg)
836       return index == 0 ? 16 : 32;
837    else if (instr->isVALU() || instr->isSALU())
838       return instr_info.operand_size[(int)instr->opcode];
839    else
840       return 0;
841 }
842 
bool
needs_exec_mask(const Instruction* instr)
{
   /* Returns whether this instruction's result depends on the exec mask. */
   if (instr->isVALU()) {
      /* Lane-access instructions address explicit lanes and therefore don't
       * depend on exec. */
      return instr->opcode != aco_opcode::v_readlane_b32 &&
             instr->opcode != aco_opcode::v_readlane_b32_e64 &&
             instr->opcode != aco_opcode::v_writelane_b32 &&
             instr->opcode != aco_opcode::v_writelane_b32_e64;
   }

   if (instr->isVMEM() || instr->isFlatLike())
      return true;

   /* Scalar-side instructions only depend on exec if they branch on it or
    * read it explicitly. */
   if (instr->isSALU() || instr->isBranch() || instr->isSMEM() || instr->isBarrier())
      return instr->opcode == aco_opcode::s_cbranch_execz ||
             instr->opcode == aco_opcode::s_cbranch_execnz || instr->reads_exec();

   if (instr->isPseudo()) {
      switch (instr->opcode) {
      case aco_opcode::p_create_vector:
      case aco_opcode::p_extract_vector:
      case aco_opcode::p_split_vector:
      case aco_opcode::p_phi:
      case aco_opcode::p_parallelcopy:
         /* These need exec iff they produce a VGPR result (presumably lowered
          * to lane-wise copies) or read exec themselves. */
         for (Definition def : instr->definitions) {
            if (def.getTemp().type() == RegType::vgpr)
               return true;
         }
         return instr->reads_exec();
      case aco_opcode::p_spill:
      case aco_opcode::p_reload:
      case aco_opcode::p_end_linear_vgpr:
      case aco_opcode::p_logical_start:
      case aco_opcode::p_logical_end:
      case aco_opcode::p_startpgm:
      case aco_opcode::p_end_wqm:
      case aco_opcode::p_init_scratch: return instr->reads_exec();
      /* Needs exec only if there are operands to initialize the linear VGPR. */
      case aco_opcode::p_start_linear_vgpr: return instr->operands.size();
      default: break;
      }
   }

   /* Conservative default for any other instruction. */
   return true;
}
887 
/* Opcodes related to a VALU comparison. Fields are set to
 * aco_opcode::num_opcodes by get_cmp_info() when no such variant exists. */
struct CmpInfo {
   aco_opcode swapped; /* same comparison with swapped operands (lt <-> gt) */
   aco_opcode inverse; /* logical inverse of the comparison (lt <-> nlt) */
   aco_opcode vcmpx;   /* exec-writing v_cmpx_* variant of the comparison */
};
893 
/* Fills *info with the swapped/inverse/exec-writing variants of comparison
 * opcode op. Returns false (leaving all fields at num_opcodes) if op is not
 * a known non-exec-writing comparison. */
static ALWAYS_INLINE bool
get_cmp_info(aco_opcode op, CmpInfo* info)
{
   info->swapped = aco_opcode::num_opcodes;
   info->inverse = aco_opcode::num_opcodes;
   info->vcmpx = aco_opcode::num_opcodes;
   switch (op) {
      // clang-format off
/* Float comparisons: each CMP2 expansion handles one ordered comparison and
 * its unordered negation (v_cmp_<ord>_f<sz> / v_cmp_n<unord>_f<sz>) for one
 * bit size. */
#define CMP2(ord, unord, ord_swap, unord_swap, sz)                                                 \
   case aco_opcode::v_cmp_##ord##_f##sz:                                                           \
   case aco_opcode::v_cmp_n##unord##_f##sz:                                                        \
      info->swapped = op == aco_opcode::v_cmp_##ord##_f##sz ? aco_opcode::v_cmp_##ord_swap##_f##sz \
                                                      : aco_opcode::v_cmp_n##unord_swap##_f##sz;   \
      info->inverse = op == aco_opcode::v_cmp_n##unord##_f##sz ? aco_opcode::v_cmp_##unord##_f##sz \
                                                               : aco_opcode::v_cmp_n##ord##_f##sz; \
      info->vcmpx = op == aco_opcode::v_cmp_##ord##_f##sz ? aco_opcode::v_cmpx_##ord##_f##sz       \
                                                          : aco_opcode::v_cmpx_n##unord##_f##sz;   \
      return true;
#define CMP(ord, unord, ord_swap, unord_swap)                                                      \
   CMP2(ord, unord, ord_swap, unord_swap, 16)                                                      \
   CMP2(ord, unord, ord_swap, unord_swap, 32)                                                      \
   CMP2(ord, unord, ord_swap, unord_swap, 64)
      CMP(lt, /*n*/ge, gt, /*n*/le)
      CMP(eq, /*n*/lg, eq, /*n*/lg)
      CMP(le, /*n*/gt, ge, /*n*/lt)
      CMP(gt, /*n*/le, lt, /*n*/ge)
      CMP(lg, /*n*/eq, lg, /*n*/eq)
      CMP(ge, /*n*/lt, le, /*n*/gt)
#undef CMP
#undef CMP2
/* Ordered/unordered tests: symmetric (swap is a no-op) and inverses of each
 * other. */
#define ORD_TEST(sz)                                                                               \
   case aco_opcode::v_cmp_u_f##sz:                                                                 \
      info->swapped = aco_opcode::v_cmp_u_f##sz;                                                   \
      info->inverse = aco_opcode::v_cmp_o_f##sz;                                                   \
      info->vcmpx = aco_opcode::v_cmpx_u_f##sz;                                                    \
      return true;                                                                                 \
   case aco_opcode::v_cmp_o_f##sz:                                                                 \
      info->swapped = aco_opcode::v_cmp_o_f##sz;                                                   \
      info->inverse = aco_opcode::v_cmp_u_f##sz;                                                   \
      info->vcmpx = aco_opcode::v_cmpx_o_f##sz;                                                    \
      return true;
      ORD_TEST(16)
      ORD_TEST(32)
      ORD_TEST(64)
#undef ORD_TEST
/* Integer comparisons, signed (i) and unsigned (u), per bit size. */
#define CMPI2(op, swap, inv, type, sz)                                                             \
   case aco_opcode::v_cmp_##op##_##type##sz:                                                       \
      info->swapped = aco_opcode::v_cmp_##swap##_##type##sz;                                       \
      info->inverse = aco_opcode::v_cmp_##inv##_##type##sz;                                        \
      info->vcmpx = aco_opcode::v_cmpx_##op##_##type##sz;                                          \
      return true;
#define CMPI(op, swap, inv)                                                                        \
   CMPI2(op, swap, inv, i, 16)                                                                     \
   CMPI2(op, swap, inv, u, 16)                                                                     \
   CMPI2(op, swap, inv, i, 32)                                                                     \
   CMPI2(op, swap, inv, u, 32)                                                                     \
   CMPI2(op, swap, inv, i, 64)                                                                     \
   CMPI2(op, swap, inv, u, 64)
      CMPI(lt, gt, ge)
      CMPI(eq, eq, lg)
      CMPI(le, ge, gt)
      CMPI(gt, lt, le)
      CMPI(lg, lg, eq)
      CMPI(ge, le, lt)
#undef CMPI
#undef CMPI2
/* Class tests only have an exec-writing variant; no swap/inverse. */
#define CMPCLASS(sz)                                                                               \
   case aco_opcode::v_cmp_class_f##sz:                                                             \
      info->vcmpx = aco_opcode::v_cmpx_class_f##sz;                                                \
      return true;
      CMPCLASS(16)
      CMPCLASS(32)
      CMPCLASS(64)
#undef CMPCLASS
      // clang-format on
   default: return false;
   }
}
972 
973 aco_opcode
get_vcmp_inverse(aco_opcode op)974 get_vcmp_inverse(aco_opcode op)
975 {
976    CmpInfo info;
977    return get_cmp_info(op, &info) ? info.inverse : aco_opcode::num_opcodes;
978 }
979 
980 aco_opcode
get_vcmp_swapped(aco_opcode op)981 get_vcmp_swapped(aco_opcode op)
982 {
983    CmpInfo info;
984    return get_cmp_info(op, &info) ? info.swapped : aco_opcode::num_opcodes;
985 }
986 
987 aco_opcode
get_vcmpx(aco_opcode op)988 get_vcmpx(aco_opcode op)
989 {
990    CmpInfo info;
991    return get_cmp_info(op, &info) ? info.vcmpx : aco_opcode::num_opcodes;
992 }
993 
bool
is_cmpx(aco_opcode op)
{
   /* get_cmp_info() only recognizes the non-exec-writing comparisons, so a
    * comparison it rejects is treated as a v_cmpx_* variant.
    * NOTE(review): non-comparison opcodes would also return true here;
    * presumably callers only pass comparison opcodes — confirm. */
   CmpInfo info;
   return !get_cmp_info(op, &info);
}
1000 
/* Returns whether operands idx0 and idx1 of instr may be swapped. On success,
 * *new_op receives the opcode to use after the swap: the same opcode for
 * commutative operations, or the reversed form (e.g. v_sub -> v_subrev). */
bool
can_swap_operands(aco_ptr<Instruction>& instr, aco_opcode* new_op, unsigned idx0, unsigned idx1)
{
   /* Swapping an operand with itself is trivially allowed. */
   if (idx0 == idx1) {
      *new_op = instr->opcode;
      return true;
   }

   /* Normalize so idx0 < idx1; the checks below only look at idx1. */
   if (idx0 > idx1)
      std::swap(idx0, idx1);

   if (instr->isDPP())
      return false;

   /* Non-VOP3(P) encodings restrict operand types per slot (presumably src1
    * must be a VGPR), so src0 must already be VGPR-compatible to swap. */
   if (!instr->isVOP3() && !instr->isVOP3P() && !instr->operands[0].isOfType(RegType::vgpr))
      return false;

   /* Comparisons swap by switching to the mirrored comparison opcode. */
   if (instr->isVOPC()) {
      CmpInfo info;
      if (get_cmp_info(instr->opcode, &info) && info.swapped != aco_opcode::num_opcodes) {
         *new_op = info.swapped;
         return true;
      }
   }

   /* opcodes not relevant for DPP or SGPRs optimizations are not included. */
   switch (instr->opcode) {
   case aco_opcode::v_med3_f32: return false; /* order matters for clamp+GFX8+denorm ftz. */
   /* Fully commutative operations: any pair of operands may be swapped. */
   case aco_opcode::v_add_u32:
   case aco_opcode::v_add_co_u32:
   case aco_opcode::v_add_co_u32_e64:
   case aco_opcode::v_add_i32:
   case aco_opcode::v_add_i16:
   case aco_opcode::v_add_u16_e64:
   case aco_opcode::v_add3_u32:
   case aco_opcode::v_add_f16:
   case aco_opcode::v_add_f32:
   case aco_opcode::v_mul_i32_i24:
   case aco_opcode::v_mul_hi_i32_i24:
   case aco_opcode::v_mul_u32_u24:
   case aco_opcode::v_mul_hi_u32_u24:
   case aco_opcode::v_mul_lo_u16:
   case aco_opcode::v_mul_lo_u16_e64:
   case aco_opcode::v_mul_f16:
   case aco_opcode::v_mul_f32:
   case aco_opcode::v_mul_legacy_f32:
   case aco_opcode::v_or_b32:
   case aco_opcode::v_and_b32:
   case aco_opcode::v_xor_b32:
   case aco_opcode::v_xnor_b32:
   case aco_opcode::v_xor3_b32:
   case aco_opcode::v_or3_b32:
   case aco_opcode::v_and_b16:
   case aco_opcode::v_or_b16:
   case aco_opcode::v_xor_b16:
   case aco_opcode::v_max3_f32:
   case aco_opcode::v_min3_f32:
   case aco_opcode::v_max3_f16:
   case aco_opcode::v_min3_f16:
   case aco_opcode::v_med3_f16:
   case aco_opcode::v_max3_u32:
   case aco_opcode::v_min3_u32:
   case aco_opcode::v_med3_u32:
   case aco_opcode::v_max3_i32:
   case aco_opcode::v_min3_i32:
   case aco_opcode::v_med3_i32:
   case aco_opcode::v_max3_u16:
   case aco_opcode::v_min3_u16:
   case aco_opcode::v_med3_u16:
   case aco_opcode::v_max3_i16:
   case aco_opcode::v_min3_i16:
   case aco_opcode::v_med3_i16:
   case aco_opcode::v_max_f16:
   case aco_opcode::v_max_f32:
   case aco_opcode::v_min_f16:
   case aco_opcode::v_min_f32:
   case aco_opcode::v_max_i32:
   case aco_opcode::v_min_i32:
   case aco_opcode::v_max_u32:
   case aco_opcode::v_min_u32:
   case aco_opcode::v_max_i16:
   case aco_opcode::v_min_i16:
   case aco_opcode::v_max_u16:
   case aco_opcode::v_min_u16:
   case aco_opcode::v_max_i16_e64:
   case aco_opcode::v_min_i16_e64:
   case aco_opcode::v_max_u16_e64:
   case aco_opcode::v_min_u16_e64: *new_op = instr->opcode; return true;
   /* Subtractions swap by toggling between the sub and subrev opcode. */
   case aco_opcode::v_sub_f16: *new_op = aco_opcode::v_subrev_f16; return true;
   case aco_opcode::v_sub_f32: *new_op = aco_opcode::v_subrev_f32; return true;
   case aco_opcode::v_sub_co_u32: *new_op = aco_opcode::v_subrev_co_u32; return true;
   case aco_opcode::v_sub_u16: *new_op = aco_opcode::v_subrev_u16; return true;
   case aco_opcode::v_sub_u32: *new_op = aco_opcode::v_subrev_u32; return true;
   case aco_opcode::v_sub_co_u32_e64: *new_op = aco_opcode::v_subrev_co_u32_e64; return true;
   case aco_opcode::v_subrev_f16: *new_op = aco_opcode::v_sub_f16; return true;
   case aco_opcode::v_subrev_f32: *new_op = aco_opcode::v_sub_f32; return true;
   case aco_opcode::v_subrev_co_u32: *new_op = aco_opcode::v_sub_co_u32; return true;
   case aco_opcode::v_subrev_u16: *new_op = aco_opcode::v_sub_u16; return true;
   case aco_opcode::v_subrev_u32: *new_op = aco_opcode::v_sub_u32; return true;
   case aco_opcode::v_subrev_co_u32_e64: *new_op = aco_opcode::v_sub_co_u32_e64; return true;
   /* Commutative in the first two operands only; the third operand (addend,
    * carry-in, accumulator, ...) must stay in place. */
   case aco_opcode::v_addc_co_u32:
   case aco_opcode::v_mad_i32_i24:
   case aco_opcode::v_mad_u32_u24:
   case aco_opcode::v_lerp_u8:
   case aco_opcode::v_sad_u8:
   case aco_opcode::v_sad_hi_u8:
   case aco_opcode::v_sad_u16:
   case aco_opcode::v_sad_u32:
   case aco_opcode::v_xad_u32:
   case aco_opcode::v_add_lshl_u32:
   case aco_opcode::v_and_or_b32:
   case aco_opcode::v_mad_u16:
   case aco_opcode::v_mad_i16:
   case aco_opcode::v_mad_u32_u16:
   case aco_opcode::v_mad_i32_i16:
   case aco_opcode::v_maxmin_f32:
   case aco_opcode::v_minmax_f32:
   case aco_opcode::v_maxmin_f16:
   case aco_opcode::v_minmax_f16:
   case aco_opcode::v_maxmin_u32:
   case aco_opcode::v_minmax_u32:
   case aco_opcode::v_maxmin_i32:
   case aco_opcode::v_minmax_i32:
   case aco_opcode::v_fma_f32:
   case aco_opcode::v_fma_legacy_f32:
   case aco_opcode::v_fmac_f32:
   case aco_opcode::v_fmac_legacy_f32:
   case aco_opcode::v_mac_f32:
   case aco_opcode::v_mac_legacy_f32:
   case aco_opcode::v_fma_f16:
   case aco_opcode::v_fmac_f16:
   case aco_opcode::v_mac_f16:
   case aco_opcode::v_dot4c_i32_i8:
   case aco_opcode::v_dot2c_f32_f16:
   case aco_opcode::v_dot2_f32_f16:
   case aco_opcode::v_dot2_f32_bf16:
   case aco_opcode::v_dot2_f16_f16:
   case aco_opcode::v_dot2_bf16_bf16:
   case aco_opcode::v_fma_mix_f32:
   case aco_opcode::v_fma_mixlo_f16:
   case aco_opcode::v_fma_mixhi_f16:
   case aco_opcode::v_pk_fmac_f16: {
      if (idx1 == 2)
         return false;
      *new_op = instr->opcode;
      return true;
   }
   /* Borrow variants: first two operands swap via the subbrev form; the
    * borrow-in operand (index 2) cannot move. */
   case aco_opcode::v_subb_co_u32: {
      if (idx1 == 2)
         return false;
      *new_op = aco_opcode::v_subbrev_co_u32;
      return true;
   }
   case aco_opcode::v_subbrev_co_u32: {
      if (idx1 == 2)
         return false;
      *new_op = aco_opcode::v_subb_co_u32;
      return true;
   }
   default: return false;
   }
}
1163 
/* Default constructor: all counters unset, i.e. wait on nothing. */
wait_imm::wait_imm()
    : exp(unset_counter), lgkm(unset_counter), vm(unset_counter), vs(unset_counter),
      sample(unset_counter), bvh(unset_counter), km(unset_counter)
{}
/* Constructor for the pre-GFX12 counters; the counters only used on GFX12+
 * (sample, bvh, km) are left unset. */
wait_imm::wait_imm(uint16_t vm_, uint16_t exp_, uint16_t lgkm_, uint16_t vs_)
    : exp(exp_), lgkm(lgkm_), vm(vm_), vs(vs_), sample(unset_counter), bvh(unset_counter),
      km(unset_counter)
{}
1172 
/* Encodes vm/lgkm/exp into an s_waitcnt immediate. The bit layout differs by
 * generation:
 *   GFX11+: vm[15:10] | lgkm[9:4]  | exp[2:0]
 *   GFX10:  vm[15:14] | lgkm[13:8] | exp[6:4] | vm[3:0]
 *   GFX9:   vm[15:14] | lgkm[11:8] | exp[6:4] | vm[3:0]
 *   older:              lgkm[11:8] | exp[6:4] | vm[3:0]
 */
uint16_t
wait_imm::pack(enum amd_gfx_level gfx_level) const
{
   uint16_t imm = 0;
   assert(exp == unset_counter || exp <= 0x7);
   if (gfx_level >= GFX11) {
      assert(lgkm == unset_counter || lgkm <= 0x3f);
      assert(vm == unset_counter || vm <= 0x3f);
      imm = ((vm & 0x3f) << 10) | ((lgkm & 0x3f) << 4) | (exp & 0x7);
   } else if (gfx_level >= GFX10) {
      assert(lgkm == unset_counter || lgkm <= 0x3f);
      assert(vm == unset_counter || vm <= 0x3f);
      /* vm is split: low 4 bits in [3:0], high 2 bits in [15:14]. */
      imm = ((vm & 0x30) << 10) | ((lgkm & 0x3f) << 8) | ((exp & 0x7) << 4) | (vm & 0xf);
   } else if (gfx_level >= GFX9) {
      assert(lgkm == unset_counter || lgkm <= 0xf);
      assert(vm == unset_counter || vm <= 0x3f);
      imm = ((vm & 0x30) << 10) | ((lgkm & 0xf) << 8) | ((exp & 0x7) << 4) | (vm & 0xf);
   } else {
      assert(lgkm == unset_counter || lgkm <= 0xf);
      assert(vm == unset_counter || vm <= 0xf);
      imm = ((lgkm & 0xf) << 8) | ((exp & 0x7) << 4) | (vm & 0xf);
   }
   /* Force the bits of unset counters to all-ones so the immediate can be
    * interpreted the same way on every generation. */
   if (gfx_level < GFX9 && vm == wait_imm::unset_counter)
      imm |= 0xc000; /* should have no effect on pre-GFX9 and now we won't have to worry about the
                        architecture when interpreting the immediate */
   if (gfx_level < GFX10 && lgkm == wait_imm::unset_counter)
      imm |= 0x3000; /* should have no effect on pre-GFX10 and now we won't have to worry about the
                        architecture when interpreting the immediate */
   return imm;
}
1203 
1204 wait_imm
max(enum amd_gfx_level gfx_level)1205 wait_imm::max(enum amd_gfx_level gfx_level)
1206 {
1207    wait_imm imm;
1208    imm.vm = gfx_level >= GFX9 ? 63 : 15;
1209    imm.exp = 7;
1210    imm.lgkm = gfx_level >= GFX10 ? 63 : 15;
1211    imm.vs = gfx_level >= GFX10 ? 63 : 0;
1212    imm.sample = gfx_level >= GFX12 ? 63 : 0;
1213    imm.bvh = gfx_level >= GFX12 ? 7 : 0;
1214    imm.km = gfx_level >= GFX12 ? 31 : 0;
1215    return imm;
1216 }
1217 
/* If instr is a recognized wait instruction, folds the counters it waits on
 * into this wait_imm (taking the minimum per counter) and returns true;
 * otherwise returns false. */
bool
wait_imm::unpack(enum amd_gfx_level gfx_level, const Instruction* instr)
{
   /* Wait instructions are SALU; variants with an SGPR operand (e.g.
    * s_waitcnt_vscnt) are only handled when that operand is null. */
   if (!instr->isSALU() || (!instr->operands.empty() && instr->operands[0].physReg() != sgpr_null))
      return false;

   aco_opcode op = instr->opcode;
   uint16_t packed = instr->salu().imm;

   /* GFX12-style separate counter waits: the immediate is the counter value. */
   if (op == aco_opcode::s_wait_loadcnt) {
      vm = std::min<uint8_t>(vm, packed);
   } else if (op == aco_opcode::s_wait_storecnt) {
      vs = std::min<uint8_t>(vs, packed);
   } else if (op == aco_opcode::s_wait_samplecnt) {
      sample = std::min<uint8_t>(sample, packed);
   } else if (op == aco_opcode::s_wait_bvhcnt) {
      bvh = std::min<uint8_t>(bvh, packed);
   } else if (op == aco_opcode::s_wait_expcnt) {
      exp = std::min<uint8_t>(exp, packed);
   } else if (op == aco_opcode::s_wait_dscnt) {
      lgkm = std::min<uint8_t>(lgkm, packed);
   } else if (op == aco_opcode::s_wait_kmcnt) {
      km = std::min<uint8_t>(km, packed);
   } else if (op == aco_opcode::s_wait_loadcnt_dscnt) {
      /* Combined wait: loadcnt in [13:8], dscnt in [5:0]; 0x3f means unset. */
      uint32_t vm2 = (packed >> 8) & 0x3f;
      uint32_t ds = packed & 0x3f;
      vm = std::min<uint8_t>(vm, vm2 == 0x3f ? wait_imm::unset_counter : vm2);
      lgkm = std::min<uint8_t>(lgkm, ds == 0x3f ? wait_imm::unset_counter : ds);
   } else if (op == aco_opcode::s_wait_storecnt_dscnt) {
      uint32_t vs2 = (packed >> 8) & 0x3f;
      uint32_t ds = packed & 0x3f;
      vs = std::min<uint8_t>(vs, vs2 == 0x3f ? wait_imm::unset_counter : vs2);
      lgkm = std::min<uint8_t>(lgkm, ds == 0x3f ? wait_imm::unset_counter : ds);
   } else if (op == aco_opcode::s_waitcnt_expcnt) {
      exp = std::min<uint8_t>(exp, packed);
   } else if (op == aco_opcode::s_waitcnt_lgkmcnt) {
      lgkm = std::min<uint8_t>(lgkm, packed);
   } else if (op == aco_opcode::s_waitcnt_vmcnt) {
      vm = std::min<uint8_t>(vm, packed);
   } else if (op == aco_opcode::s_waitcnt_vscnt) {
      vs = std::min<uint8_t>(vs, packed);
   } else if (op == aco_opcode::s_waitcnt) {
      /* Legacy combined s_waitcnt: decode per-generation layout (inverse of
       * wait_imm::pack()). */
      uint8_t vm2, lgkm2, exp2;
      if (gfx_level >= GFX11) {
         vm2 = (packed >> 10) & 0x3f;
         lgkm2 = (packed >> 4) & 0x3f;
         exp2 = packed & 0x7;
      } else {
         vm2 = packed & 0xf;
         if (gfx_level >= GFX9)
            vm2 |= (packed >> 10) & 0x30;

         exp2 = (packed >> 4) & 0x7;

         lgkm2 = (packed >> 8) & 0xf;
         if (gfx_level >= GFX10)
            lgkm2 |= (packed >> 8) & 0x30;
      }

      /* A field at its maximum value means "don't wait on this counter". */
      if (vm2 == (gfx_level >= GFX9 ? 0x3f : 0xf))
         vm2 = wait_imm::unset_counter;
      if (exp2 == 0x7)
         exp2 = wait_imm::unset_counter;
      if (lgkm2 == (gfx_level >= GFX10 ? 0x3f : 0xf))
         lgkm2 = wait_imm::unset_counter;

      vm = std::min(vm, vm2);
      exp = std::min(exp, exp2);
      lgkm = std::min(lgkm, lgkm2);
   } else {
      return false;
   }
   return true;
}
1292 
1293 bool
combine(const wait_imm & other)1294 wait_imm::combine(const wait_imm& other)
1295 {
1296    bool changed = false;
1297    for (unsigned i = 0; i < wait_type_num; i++) {
1298       if (other[i] < (*this)[i])
1299          changed = true;
1300       (*this)[i] = std::min((*this)[i], other[i]);
1301    }
1302    return changed;
1303 }
1304 
1305 bool
empty() const1306 wait_imm::empty() const
1307 {
1308    for (unsigned i = 0; i < wait_type_num; i++) {
1309       if ((*this)[i] != unset_counter)
1310          return false;
1311    }
1312    return true;
1313 }
1314 
1315 void
print(FILE * output) const1316 wait_imm::print(FILE* output) const
1317 {
1318    const char* names[wait_type_num];
1319    names[wait_type_exp] = "exp";
1320    names[wait_type_vm] = "vm";
1321    names[wait_type_lgkm] = "lgkm";
1322    names[wait_type_vs] = "vs";
1323    names[wait_type_sample] = "sample";
1324    names[wait_type_bvh] = "bvh";
1325    names[wait_type_km] = "km";
1326    for (unsigned i = 0; i < wait_type_num; i++) {
1327       if ((*this)[i] != unset_counter)
1328          fprintf(output, "%s: %u\n", names[i], (*this)[i]);
1329    }
1330 }
1331 
/* Emits the wait instruction(s) encoding this wait_imm at the builder's
 * current position, then resets *this to an empty wait. */
void
wait_imm::build_waitcnt(Builder& bld)
{
   enum amd_gfx_level gfx_level = bld.program->gfx_level;

   if (gfx_level >= GFX12) {
      /* GFX12 has one s_wait_* instruction per counter, plus combined forms
       * that wait on two counters at once; prefer those when both are set. */
      if (vm != wait_imm::unset_counter && lgkm != wait_imm::unset_counter) {
         bld.sopp(aco_opcode::s_wait_loadcnt_dscnt, (vm << 8) | lgkm);
         vm = wait_imm::unset_counter;
         lgkm = wait_imm::unset_counter;
      }

      if (vs != wait_imm::unset_counter && lgkm != wait_imm::unset_counter) {
         bld.sopp(aco_opcode::s_wait_storecnt_dscnt, (vs << 8) | lgkm);
         vs = wait_imm::unset_counter;
         lgkm = wait_imm::unset_counter;
      }

      /* Emit a separate wait for each remaining set counter. */
      aco_opcode op[wait_type_num];
      op[wait_type_exp] = aco_opcode::s_wait_expcnt;
      op[wait_type_lgkm] = aco_opcode::s_wait_dscnt;
      op[wait_type_vm] = aco_opcode::s_wait_loadcnt;
      op[wait_type_vs] = aco_opcode::s_wait_storecnt;
      op[wait_type_sample] = aco_opcode::s_wait_samplecnt;
      op[wait_type_bvh] = aco_opcode::s_wait_bvhcnt;
      op[wait_type_km] = aco_opcode::s_wait_kmcnt;

      for (unsigned i = 0; i < wait_type_num; i++) {
         if ((*this)[i] != wait_imm::unset_counter)
            bld.sopp(op[i], (*this)[i]);
      }
   } else {
      /* Pre-GFX12: vscnt needs its own instruction; everything else is
       * packed into a single s_waitcnt immediate. */
      if (vs != wait_imm::unset_counter) {
         assert(gfx_level >= GFX10);
         bld.sopk(aco_opcode::s_waitcnt_vscnt, Operand(sgpr_null, s1), vs);
         vs = wait_imm::unset_counter;
      }
      if (!empty())
         bld.sopp(aco_opcode::s_waitcnt, pack(gfx_level));
   }

   *this = wait_imm();
}
1375 
/* Heuristic for whether two memory instructions should be placed in the same
 * clause. Because the formats must match, checks on 'a' alone also hold for
 * 'b'. */
bool
should_form_clause(const Instruction* a, const Instruction* b)
{
   /* Don't mix instructions with and without results (presumably loads vs
    * stores). */
   if (a->definitions.empty() != b->definitions.empty())
      return false;

   if (a->format != b->format)
      return false;

   if (a->operands.empty() || b->operands.empty())
      return false;

   /* Assume loads which don't use descriptors might load from similar addresses. */
   if (a->isFlatLike() || a->accessesLDS())
      return true;
   if (a->isSMEM() && a->operands[0].bytes() == 8 && b->operands[0].bytes() == 8)
      return true;

   /* If they load from the same descriptor, assume they might load from similar
    * addresses.
    */
   if (a->isVMEM() || a->isSMEM())
      return a->operands[0].tempId() == b->operands[0].tempId();

   if (a->isEXP() && b->isEXP())
      return true;

   return false;
}
1405 
1406 int
get_op_fixed_to_def(Instruction * instr)1407 get_op_fixed_to_def(Instruction* instr)
1408 {
1409    if (instr->opcode == aco_opcode::v_interp_p2_f32 || instr->opcode == aco_opcode::v_mac_f32 ||
1410        instr->opcode == aco_opcode::v_fmac_f32 || instr->opcode == aco_opcode::v_mac_f16 ||
1411        instr->opcode == aco_opcode::v_fmac_f16 || instr->opcode == aco_opcode::v_mac_legacy_f32 ||
1412        instr->opcode == aco_opcode::v_fmac_legacy_f32 ||
1413        instr->opcode == aco_opcode::v_pk_fmac_f16 || instr->opcode == aco_opcode::v_writelane_b32 ||
1414        instr->opcode == aco_opcode::v_writelane_b32_e64 ||
1415        instr->opcode == aco_opcode::v_dot4c_i32_i8 || instr->opcode == aco_opcode::s_fmac_f32 ||
1416        instr->opcode == aco_opcode::s_fmac_f16) {
1417       return 2;
1418    } else if (instr->opcode == aco_opcode::s_addk_i32 || instr->opcode == aco_opcode::s_mulk_i32 ||
1419               instr->opcode == aco_opcode::s_cmovk_i32) {
1420       return 0;
1421    } else if (instr->isMUBUF() && instr->definitions.size() == 1 && instr->operands.size() == 4) {
1422       return 3;
1423    } else if (instr->isMIMG() && instr->definitions.size() == 1 &&
1424               !instr->operands[2].isUndefined()) {
1425       return 2;
1426    }
1427    return -1;
1428 }
1429 
1430 uint8_t
get_vmem_type(enum amd_gfx_level gfx_level,Instruction * instr)1431 get_vmem_type(enum amd_gfx_level gfx_level, Instruction* instr)
1432 {
1433    if (instr->opcode == aco_opcode::image_bvh64_intersect_ray)
1434       return vmem_bvh;
1435    else if (gfx_level >= GFX12 && instr->opcode == aco_opcode::image_msaa_load)
1436       return vmem_sampler;
1437    else if (instr->isMIMG() && !instr->operands[1].isUndefined() &&
1438             instr->operands[1].regClass() == s4)
1439       return vmem_sampler;
1440    else if (instr->isVMEM() || instr->isScratch() || instr->isGlobal())
1441       return vmem_nosampler;
1442    return 0;
1443 }
1444 
1445 /* Parse implicit data dependency resolution:
1446  * Returns the value of each counter that must be reached
1447  * before an instruction is issued.
1448  *
1449  * (Probably incomplete.)
1450  */
depctr_wait
parse_depctr_wait(const Instruction* instr)
{
   /* Returns, per dependency counter, the value that must be reached before
    * this instruction issues (see the comment above this function). */
   depctr_wait res;
   if (instr->isVMEM() || instr->isFlatLike() || instr->isDS() || instr->isEXP()) {
      res.va_vdst = 0;
      res.va_exec = 0;
      res.sa_exec = 0;
      if (instr->isVMEM() || instr->isFlatLike()) {
         res.sa_sdst = 0;
         res.va_sdst = 0;
         res.va_vcc = 0;
      }
   } else if (instr->isSMEM()) {
      res.sa_sdst = 0;
      res.va_sdst = 0;
      res.va_vcc = 0;
   } else if (instr->isLDSDIR()) {
      /* LDSDIR carries its own VALU-dest wait count. */
      res.va_vdst = instr->ldsdir().wait_vdst;
      res.va_exec = 0;
      res.sa_exec = 0;
   } else if (instr->opcode == aco_opcode::s_waitcnt_depctr) {
      /* Explicit wait: decode the immediate's bitfields —
       * va_vdst[15:12], va_sdst[11:9], va_ssrc[8], hold_cnt[7],
       * vm_vsrc[4:2], va_vcc[1], sa_sdst[0]. */
      unsigned imm = instr->salu().imm;
      res.va_vdst = (imm >> 12) & 0xf;
      res.va_sdst = (imm >> 9) & 0x7;
      res.va_ssrc = (imm >> 8) & 0x1;
      res.hold_cnt = (imm >> 7) & 0x1;
      res.vm_vsrc = (imm >> 2) & 0x7;
      res.va_vcc = (imm >> 1) & 0x1;
      res.sa_sdst = imm & 0x1;
   } else if (instr->isVALU()) {
      res.sa_exec = 0;
      /* A VALU that writes an SGPR also waits for prior SALU SGPR writes. */
      for (const Definition& def : instr->definitions) {
         if (def.regClass().type() == RegType::sgpr) {
            res.sa_sdst = 0;
            /* Notably, this is the only exception, even VALU that
             * reads exec doesn't implicitly wait for va_exec.
             */
            if (instr->opcode == aco_opcode::v_readfirstlane_b32)
               res.va_exec = 0;
            break;
         }
      }
   } else if (instr_info.classes[(int)instr->opcode] == instr_class::branch ||
              instr_info.classes[(int)instr->opcode] == instr_class::sendmsg) {
      res.sa_exec = 0;
      res.va_exec = 0;
      /* Conditional branches additionally wait on the condition source. */
      switch (instr->opcode) {
      case aco_opcode::s_cbranch_vccz:
      case aco_opcode::s_cbranch_vccnz:
         res.va_vcc = 0;
         res.sa_sdst = 0;
         break;
      case aco_opcode::s_cbranch_scc0:
      case aco_opcode::s_cbranch_scc1:
         res.sa_sdst = 0;
         break;
      default: break;
      }
   } else if (instr->isSALU()) {
      /* SALU waits on any VALU write to registers it touches: plain SGPRs
       * (below vcc), vcc itself, or exec. */
      for (const Definition& def : instr->definitions) {
         if (def.physReg() < vcc) {
            res.va_sdst = 0;
         } else if (def.physReg() <= vcc_hi) {
            res.va_vcc = 0;
         } else if (def.physReg() == exec || def.physReg() == exec_hi) {
            res.va_exec = 0;
         }
      }
      for (const Operand& op : instr->operands) {
         if (op.physReg() < vcc) {
            res.va_sdst = 0;
         } else if (op.physReg() <= vcc_hi) {
            res.va_vcc = 0;
         } else if (op.physReg() == exec || op.physReg() == exec_hi) {
            res.va_exec = 0;
         }
      }
   }

   return res;
}
1533 
/* Appends a sendmsg(dealloc_vgprs) before the final s_endpgm so the wave
 * releases its VGPRs early. Returns false when the message must not be used
 * for this program (pre-GFX11, or the GFX11.5 cases below). */
bool
dealloc_vgprs(Program* program)
{
   if (program->gfx_level < GFX11)
      return false;

   /* If we insert the sendmsg on GFX11.5, the export priority workaround will require us to insert
    * a wait after exports. There might still be pending VMEM stores for PS parameter exports,
    * except NGG lowering usually inserts a memory barrier. This means there is unlikely to be any
    * pending VMEM stores or exports if we insert the sendmsg for these stages. */
   if (program->gfx_level == GFX11_5 && (program->stage.hw == AC_HW_NEXT_GEN_GEOMETRY_SHADER ||
                                         program->stage.hw == AC_HW_PIXEL_SHADER))
      return false;

   Block& block = program->blocks.back();

   /* don't bother checking if there is a pending VMEM store or export: there almost always is */
   Builder bld(program);
   if (!block.instructions.empty() && block.instructions.back()->opcode == aco_opcode::s_endpgm) {
      bld.reset(&block.instructions, block.instructions.begin() + (block.instructions.size() - 1));
      bld.sopp(aco_opcode::s_sendmsg, sendmsg_dealloc_vgprs);
   }

   return true;
}
1559 
1560 bool
isTrans() const1561 Instruction::isTrans() const noexcept
1562 {
1563    return instr_info.classes[(int)opcode] == instr_class::valu_transcendental32 ||
1564           instr_info.classes[(int)opcode] == instr_class::valu_double_transcendental ||
1565           instr_info.classes[(int)opcode] == instr_class::valu_pseudo_scalar_trans;
1566 }
1567 
1568 size_t
get_instr_data_size(Format format)1569 get_instr_data_size(Format format)
1570 {
1571    switch (format) {
1572    case Format::SOP1:
1573    case Format::SOP2:
1574    case Format::SOPC:
1575    case Format::SOPK:
1576    case Format::SOPP: return sizeof(SALU_instruction);
1577    case Format::SMEM: return sizeof(SMEM_instruction);
1578    case Format::PSEUDO: return sizeof(Pseudo_instruction);
1579    case Format::PSEUDO_BARRIER: return sizeof(Pseudo_barrier_instruction);
1580    case Format::PSEUDO_REDUCTION: return sizeof(Pseudo_reduction_instruction);
1581    case Format::PSEUDO_BRANCH: return sizeof(Pseudo_branch_instruction);
1582    case Format::DS: return sizeof(DS_instruction);
1583    case Format::FLAT:
1584    case Format::GLOBAL:
1585    case Format::SCRATCH: return sizeof(FLAT_instruction);
1586    case Format::LDSDIR: return sizeof(LDSDIR_instruction);
1587    case Format::MTBUF: return sizeof(MTBUF_instruction);
1588    case Format::MUBUF: return sizeof(MUBUF_instruction);
1589    case Format::MIMG: return sizeof(MIMG_instruction);
1590    case Format::VOPD: return sizeof(VOPD_instruction);
1591    case Format::VINTERP_INREG: return sizeof(VINTERP_inreg_instruction);
1592    case Format::VINTRP: return sizeof(VINTRP_instruction);
1593    case Format::EXP: return sizeof(Export_instruction);
1594    default:
1595       if ((uint16_t)format & (uint16_t)Format::DPP16)
1596          return sizeof(DPP16_instruction);
1597       else if ((uint16_t)format & (uint16_t)Format::DPP8)
1598          return sizeof(DPP8_instruction);
1599       else if ((uint16_t)format & (uint16_t)Format::SDWA)
1600          return sizeof(SDWA_instruction);
1601       else
1602          return sizeof(VALU_instruction);
1603    }
1604 }
1605 
/* Allocate and zero-initialize a new instruction from the thread-local
 * instruction buffer. The format-specific struct and its trailing operand and
 * definition arrays are allocated as one contiguous chunk:
 *   [concrete instruction struct | num_operands Operands | num_definitions Definitions]
 */
Instruction*
create_instruction(aco_opcode opcode, Format format, uint32_t num_operands,
                   uint32_t num_definitions)
{
   /* Size of the concrete struct for this encoding, plus the trailing arrays. */
   size_t size = get_instr_data_size(format);
   size_t total_size = size + num_operands * sizeof(Operand) + num_definitions * sizeof(Definition);

   void* data = instruction_buffer->allocate(total_size, alignof(uint32_t));
   memset(data, 0, total_size); /* all fields and flags start cleared */
   Instruction* inst = (Instruction*)data;

   inst->opcode = opcode;
   inst->format = format;

   /* The spans store 16-bit offsets measured from the span member's own
    * address. The operand array starts right after the concrete struct, i.e.
    * `size` bytes from the start of the instruction; the definition array
    * follows immediately after the operand array. */
   uint16_t operands_offset = size - offsetof(Instruction, operands);
   inst->operands = aco::span<Operand>(operands_offset, num_operands);
   uint16_t definitions_offset = (char*)inst->operands.end() - (char*)&inst->definitions;
   inst->definitions = aco::span<Definition>(definitions_offset, num_definitions);

   return inst;
}
1627 
1628 } // namespace aco
1629