/*
 * Copyright © 2020 Valve Corporation
 *
 * SPDX-License-Identifier: MIT
 */

#include "aco_ir.h"

#include "aco_builder.h"

#include "util/u_debug.h"

#include "c11/threads.h"

namespace aco {

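/* Thread-local arena that create_instruction() allocates from; init_program()
 * points it at the current Program's monotonic_buffer_resource. */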
thread_local aco::monotonic_buffer_resource* instruction_buffer = nullptr;

uint64_t debug_flags = 0;

static const struct debug_control aco_debug_options[] = {
   {"validateir", DEBUG_VALIDATE_IR},
   {"validatera", DEBUG_VALIDATE_RA},
   {"validate-livevars", DEBUG_VALIDATE_LIVE_VARS},
   {"novalidateir", DEBUG_NO_VALIDATE_IR},
   {"force-waitcnt", DEBUG_FORCE_WAITCNT},
   {"force-waitdeps", DEBUG_FORCE_WAITDEPS},
   {"novn", DEBUG_NO_VN},
   {"noopt", DEBUG_NO_OPT},
   {"nosched", DEBUG_NO_SCHED | DEBUG_NO_SCHED_ILP | DEBUG_NO_SCHED_VOPD},
   {"nosched-ilp", DEBUG_NO_SCHED_ILP},
   {"nosched-vopd", DEBUG_NO_SCHED_VOPD},
   {"perfinfo", DEBUG_PERF_INFO},
   {"liveinfo", DEBUG_LIVE_INFO},
   {NULL, 0}};

static once_flag init_once_flag = ONCE_FLAG_INIT;

static void
init_once()
{
   debug_flags = parse_debug_string(getenv("ACO_DEBUG"), aco_debug_options);

#ifndef NDEBUG
   /* enable some flags by default on debug builds */
   debug_flags |= aco::DEBUG_VALIDATE_IR;
#endif

   if (debug_flags & aco::DEBUG_NO_VALIDATE_IR)
      debug_flags &= ~aco::DEBUG_VALIDATE_IR;
}

void
init()
{
   call_once(&init_once_flag, init_once);
}

void
init_program(Program* program, Stage stage, const struct aco_shader_info* info,
             enum amd_gfx_level gfx_level, enum radeon_family family, bool wgp_mode,
             ac_shader_config* config)
{
   instruction_buffer = &program->m;
   program->stage = stage;
   program->config = config;
   program->info = *info;
   program->gfx_level = gfx_level;
   if (family == CHIP_UNKNOWN) {
      switch (gfx_level) {
      case GFX6: program->family = CHIP_TAHITI; break;
      case GFX7: program->family = CHIP_BONAIRE; break;
      case GFX8: program->family = CHIP_POLARIS10; break;
      case GFX9: program->family = CHIP_VEGA10; break;
      case GFX10: program->family = CHIP_NAVI10; break;
      case GFX10_3: program->family = CHIP_NAVI21; break;
      case GFX11: program->family = CHIP_NAVI31; break;
      case GFX12: program->family = CHIP_GFX1200; break;
      default: program->family = CHIP_UNKNOWN; break;
      }
   } else {
      program->family = family;
   }
   program->wave_size = info->wave_size;
   program->lane_mask = program->wave_size == 32 ? s1 : s2;

   program->dev.lds_encoding_granule = gfx_level >= GFX11 && stage == fragment_fs ? 1024
                                       : gfx_level >= GFX7                        ? 512
                                                                                  : 256;
   program->dev.lds_alloc_granule = gfx_level >= GFX10_3 ? 1024 : program->dev.lds_encoding_granule;

   /* GFX6: There is 64KB LDS per CU, but a single workgroup can only use 32KB. */
   program->dev.lds_limit = gfx_level >= GFX7 ? 65536 : 32768;

   /* apparently gfx702 also has 16-bank LDS but I can't find a family for that */
   program->dev.has_16bank_lds = family == CHIP_KABINI || family == CHIP_STONEY;

   program->dev.vgpr_limit = stage == raytracing_cs ? 128 : 256;
   program->dev.physical_vgprs = 256;
   program->dev.vgpr_alloc_granule = 4;

   if (gfx_level >= GFX10) {
      program->dev.physical_sgprs = 128 * 20; /* enough for max waves */
      program->dev.sgpr_alloc_granule = 128;
      program->dev.sgpr_limit =
         108; /* includes VCC, which can be treated as s[106-107] on GFX10+ */

      if (family == CHIP_NAVI31 || family == CHIP_NAVI32 || family == CHIP_GFX1151 ||
          gfx_level >= GFX12) {
         program->dev.physical_vgprs = program->wave_size == 32 ? 1536 : 768;
         program->dev.vgpr_alloc_granule = program->wave_size == 32 ? 24 : 12;
      } else {
         program->dev.physical_vgprs = program->wave_size == 32 ? 1024 : 512;
         if (gfx_level >= GFX10_3)
            program->dev.vgpr_alloc_granule = program->wave_size == 32 ? 16 : 8;
         else
            program->dev.vgpr_alloc_granule = program->wave_size == 32 ? 8 : 4;
      }
   } else if (program->gfx_level >= GFX8) {
      program->dev.physical_sgprs = 800;
      program->dev.sgpr_alloc_granule = 16;
      program->dev.sgpr_limit = 102;
      if (family == CHIP_TONGA || family == CHIP_ICELAND)
         program->dev.sgpr_alloc_granule = 96; /* workaround hardware bug */
   } else {
      program->dev.physical_sgprs = 512;
      program->dev.sgpr_alloc_granule = 8;
      program->dev.sgpr_limit = 104;
   }

   program->dev.scratch_alloc_granule = gfx_level >= GFX11 ? 256 : 1024;

   program->dev.max_waves_per_simd = 10;
   if (program->gfx_level >= GFX10_3)
      program->dev.max_waves_per_simd = 16;
   else if (program->gfx_level == GFX10)
      program->dev.max_waves_per_simd = 20;
   else if (program->family >= CHIP_POLARIS10 && program->family <= CHIP_VEGAM)
      program->dev.max_waves_per_simd = 8;

   program->dev.simd_per_cu = program->gfx_level >= GFX10 ? 2 : 4;

   switch (program->family) {
   /* GFX8 APUs */
   case CHIP_CARRIZO:
   case CHIP_STONEY:
   /* GFX9 APUs */
   case CHIP_RAVEN:
   case CHIP_RAVEN2:
   case CHIP_RENOIR: program->dev.xnack_enabled = true; break;
   default: break;
   }

   program->dev.sram_ecc_enabled = program->family == CHIP_MI100;
   /* apparently gfx702 also has fast v_fma_f32 but I can't find a family for that */
   program->dev.has_fast_fma32 = program->gfx_level >= GFX9;
   if (program->family == CHIP_TAHITI || program->family == CHIP_CARRIZO ||
       program->family == CHIP_HAWAII)
      program->dev.has_fast_fma32 = true;
   program->dev.has_mac_legacy32 = program->gfx_level <= GFX7 || program->gfx_level == GFX10;
   program->dev.has_fmac_legacy32 = program->gfx_level >= GFX10_3 && program->gfx_level < GFX12;

   program->dev.fused_mad_mix = program->gfx_level >= GFX10;
   if (program->family == CHIP_VEGA12 || program->family == CHIP_VEGA20 ||
       program->family == CHIP_MI100 || program->family == CHIP_MI200)
      program->dev.fused_mad_mix = true;

   if (program->gfx_level >= GFX11) {
      program->dev.scratch_global_offset_min = -4096;
      program->dev.scratch_global_offset_max = 4095;
   } else if (program->gfx_level >= GFX10 || program->gfx_level == GFX8) {
      program->dev.scratch_global_offset_min = -2048;
      program->dev.scratch_global_offset_max = 2047;
   } else if (program->gfx_level == GFX9) {
      /* The minimum is actually -4096, but negative offsets are broken when SADDR is used. */
      program->dev.scratch_global_offset_min = 0;
      program->dev.scratch_global_offset_max = 4095;
   }

   if (program->gfx_level >= GFX12) {
      /* Same as GFX11, except one less for VSAMPLE. */
      program->dev.max_nsa_vgprs = 3;
   } else if (program->gfx_level >= GFX11) {
      /* GFX11 can have only 1 NSA dword. The last VGPR isn't included here because it contains the
       * rest of the address.
       */
      program->dev.max_nsa_vgprs = 4;
   } else if (program->gfx_level >= GFX10_3) {
      /* GFX10.3 can have up to 3 NSA dwords. */
      program->dev.max_nsa_vgprs = 13;
   } else if (program->gfx_level >= GFX10) {
      /* Limit NSA instructions to 1 NSA dword on GFX10 to avoid stability issues. */
      program->dev.max_nsa_vgprs = 5;
   } else {
      program->dev.max_nsa_vgprs = 0;
   }

   program->wgp_mode = wgp_mode;

   program->progress = CompilationProgress::after_isel;

   program->next_fp_mode.must_flush_denorms32 = false;
   program->next_fp_mode.must_flush_denorms16_64 = false;
   program->next_fp_mode.care_about_round32 = false;
   program->next_fp_mode.care_about_round16_64 = false;
   program->next_fp_mode.denorm16_64 = fp_denorm_keep;
   program->next_fp_mode.denorm32 = 0;
   program->next_fp_mode.round16_64 = fp_round_ne;
   program->next_fp_mode.round32 = fp_round_ne;
}

bool
is_wait_export_ready(amd_gfx_level gfx_level, const Instruction* instr)
{
   return instr->opcode == aco_opcode::s_wait_event &&
          (gfx_level >= GFX12 ? (instr->salu().imm & wait_event_imm_wait_export_ready_gfx12)
                              : !(instr->salu().imm & wait_event_imm_dont_wait_export_ready_gfx11));
}

memory_sync_info
get_sync_info(const Instruction* instr)
{
   /* Primitive Ordered Pixel Shading barriers are necessary for accesses to memory shared between
    * overlapping waves in the queue family.
    */
   if (instr->opcode == aco_opcode::p_pops_gfx9_overlapped_wave_wait_done ||
       instr->opcode == aco_opcode::s_wait_event) {
      return memory_sync_info(storage_buffer | storage_image, semantic_acquire, scope_queuefamily);
   } else if (instr->opcode == aco_opcode::p_pops_gfx9_ordered_section_done) {
      return memory_sync_info(storage_buffer | storage_image, semantic_release, scope_queuefamily);
   }

   switch (instr->format) {
   case Format::SMEM: return instr->smem().sync;
   case Format::MUBUF: return instr->mubuf().sync;
   case Format::MIMG: return instr->mimg().sync;
   case Format::MTBUF: return instr->mtbuf().sync;
   case Format::FLAT:
   case Format::GLOBAL:
   case Format::SCRATCH: return instr->flatlike().sync;
   case Format::DS: return instr->ds().sync;
   case Format::LDSDIR: return instr->ldsdir().sync;
   default: return memory_sync_info();
   }
}

bool
can_use_SDWA(amd_gfx_level gfx_level, const aco_ptr<Instruction>& instr, bool pre_ra)
{
   if (!instr->isVALU())
      return false;

   if (gfx_level < GFX8 || gfx_level >= GFX11 || instr->isDPP() || instr->isVOP3P())
      return false;

   if (instr->isSDWA())
      return true;

   if (instr->isVOP3()) {
      VALU_instruction& vop3 = instr->valu();
      if (instr->format == Format::VOP3)
         return false;
      if (vop3.clamp && instr->isVOPC() && gfx_level != GFX8)
         return false;
      if (vop3.omod && gfx_level < GFX9)
         return false;

      // TODO: return true if we know we will use vcc
      if (!pre_ra && instr->definitions.size() >= 2)
         return false;

      for (unsigned i = 1; i < instr->operands.size(); i++) {
         if (instr->operands[i].isLiteral())
            return false;
         if (gfx_level < GFX9 && !instr->operands[i].isOfType(RegType::vgpr))
            return false;
      }
   }

   if (!instr->definitions.empty() && instr->definitions[0].bytes() > 4 && !instr->isVOPC())
      return false;

   if (!instr->operands.empty()) {
      if (instr->operands[0].isLiteral())
         return false;
      if (gfx_level < GFX9 && !instr->operands[0].isOfType(RegType::vgpr))
         return false;
      if (instr->operands[0].bytes() > 4)
         return false;
      if (instr->operands.size() > 1 && instr->operands[1].bytes() > 4)
         return false;
   }

   bool is_mac = instr->opcode == aco_opcode::v_mac_f32 || instr->opcode == aco_opcode::v_mac_f16 ||
                 instr->opcode == aco_opcode::v_fmac_f32 || instr->opcode == aco_opcode::v_fmac_f16;

   if (gfx_level != GFX8 && is_mac)
      return false;

   // TODO: return true if we know we will use vcc
   if (!pre_ra && instr->isVOPC() && gfx_level == GFX8)
      return false;
   if (!pre_ra && instr->operands.size() >= 3 && !is_mac)
      return false;

   return instr->opcode != aco_opcode::v_madmk_f32 && instr->opcode != aco_opcode::v_madak_f32 &&
          instr->opcode != aco_opcode::v_madmk_f16 && instr->opcode != aco_opcode::v_madak_f16 &&
          instr->opcode != aco_opcode::v_fmamk_f32 && instr->opcode != aco_opcode::v_fmaak_f32 &&
          instr->opcode != aco_opcode::v_fmamk_f16 && instr->opcode != aco_opcode::v_fmaak_f16 &&
          instr->opcode != aco_opcode::v_readfirstlane_b32 &&
          instr->opcode != aco_opcode::v_clrexcp && instr->opcode != aco_opcode::v_swap_b32;
}

/* updates "instr" and returns the old instruction (or NULL if no update was needed) */
aco_ptr<Instruction>
convert_to_SDWA(amd_gfx_level gfx_level, aco_ptr<Instruction>& instr)
{
   if (instr->isSDWA())
      return NULL;

   aco_ptr<Instruction> tmp = std::move(instr);
   Format format = asSDWA(withoutVOP3(tmp->format));
   instr.reset(
      create_instruction(tmp->opcode, format, tmp->operands.size(), tmp->definitions.size()));
   std::copy(tmp->operands.cbegin(), tmp->operands.cend(), instr->operands.begin());
   std::copy(tmp->definitions.cbegin(), tmp->definitions.cend(), instr->definitions.begin());

   SDWA_instruction& sdwa = instr->sdwa();

   if (tmp->isVOP3()) {
      VALU_instruction& vop3 = tmp->valu();
      sdwa.neg = vop3.neg;
      sdwa.abs = vop3.abs;
      sdwa.omod = vop3.omod;
      sdwa.clamp = vop3.clamp;
   }

   for (unsigned i = 0; i < instr->operands.size(); i++) {
      /* SDWA only uses operands 0 and 1. */
      if (i >= 2)
         break;

      sdwa.sel[i] = SubdwordSel(instr->operands[i].bytes(), 0, false);
   }

   sdwa.dst_sel = SubdwordSel(instr->definitions[0].bytes(), 0, false);

   if (instr->definitions[0].getTemp().type() == RegType::sgpr && gfx_level == GFX8)
      instr->definitions[0].setPrecolored(vcc);
   if (instr->definitions.size() >= 2)
      instr->definitions[1].setPrecolored(vcc);
   if (instr->operands.size() >= 3)
      instr->operands[2].setPrecolored(vcc);

   instr->pass_flags = tmp->pass_flags;

   return tmp;
}

bool
can_use_DPP(amd_gfx_level gfx_level, const aco_ptr<Instruction>& instr, bool dpp8)
{
   assert(instr->isVALU() && !instr->operands.empty());

   if (instr->isDPP())
      return instr->isDPP8() == dpp8;

   if (instr->isSDWA() || instr->isVINTERP_INREG())
      return false;

   if ((instr->format == Format::VOP3 || instr->isVOP3P()) && gfx_level < GFX11)
      return false;

   if ((instr->isVOPC() || instr->definitions.size() > 1) && instr->definitions.back().isFixed() &&
       instr->definitions.back().physReg() != vcc && gfx_level < GFX11)
      return false;

   if (instr->operands.size() >= 3 && instr->operands[2].isFixed() &&
       instr->operands[2].isOfType(RegType::sgpr) && instr->operands[2].physReg() != vcc &&
       gfx_level < GFX11)
      return false;

   if (instr->isVOP3() && gfx_level < GFX11) {
      const VALU_instruction* vop3 = &instr->valu();
      if (vop3->clamp || vop3->omod)
         return false;
      if (dpp8)
         return false;
   }

   for (unsigned i = 0; i < instr->operands.size(); i++) {
      if (instr->operands[i].isLiteral())
         return false;
      if (!instr->operands[i].isOfType(RegType::vgpr) && i < 2)
         return false;
   }

   /* According to LLVM, it's unsafe to combine DPP into v_cmpx. */
   if (instr->writes_exec())
      return false;

   /* simpler than listing all VOP3P opcodes which do not support DPP */
   if (instr->isVOP3P()) {
      return instr->opcode == aco_opcode::v_fma_mix_f32 ||
             instr->opcode == aco_opcode::v_fma_mixlo_f16 ||
             instr->opcode == aco_opcode::v_fma_mixhi_f16 ||
             instr->opcode == aco_opcode::v_dot2_f32_f16 ||
             instr->opcode == aco_opcode::v_dot2_f32_bf16;
   }

   if (instr->opcode == aco_opcode::v_pk_fmac_f16)
      return gfx_level < GFX11;

   /* there are more cases but those all take 64-bit inputs */
   return instr->opcode != aco_opcode::v_madmk_f32 && instr->opcode != aco_opcode::v_madak_f32 &&
          instr->opcode != aco_opcode::v_madmk_f16 && instr->opcode != aco_opcode::v_madak_f16 &&
          instr->opcode != aco_opcode::v_fmamk_f32 && instr->opcode != aco_opcode::v_fmaak_f32 &&
          instr->opcode != aco_opcode::v_fmamk_f16 && instr->opcode != aco_opcode::v_fmaak_f16 &&
          instr->opcode != aco_opcode::v_readfirstlane_b32 &&
          instr->opcode != aco_opcode::v_cvt_f64_i32 &&
          instr->opcode != aco_opcode::v_cvt_f64_f32 &&
          instr->opcode != aco_opcode::v_cvt_f64_u32 && instr->opcode != aco_opcode::v_mul_lo_u32 &&
          instr->opcode != aco_opcode::v_mul_lo_i32 && instr->opcode != aco_opcode::v_mul_hi_u32 &&
          instr->opcode != aco_opcode::v_mul_hi_i32 &&
          instr->opcode != aco_opcode::v_qsad_pk_u16_u8 &&
          instr->opcode != aco_opcode::v_mqsad_pk_u16_u8 &&
          instr->opcode != aco_opcode::v_mqsad_u32_u8 &&
          instr->opcode != aco_opcode::v_mad_u64_u32 &&
          instr->opcode != aco_opcode::v_mad_i64_i32 &&
          instr->opcode != aco_opcode::v_permlane16_b32 &&
          instr->opcode != aco_opcode::v_permlanex16_b32 &&
          instr->opcode != aco_opcode::v_permlane64_b32 &&
          instr->opcode != aco_opcode::v_readlane_b32_e64 &&
          instr->opcode != aco_opcode::v_writelane_b32_e64 &&
          instr->opcode != aco_opcode::p_v_cvt_pk_u8_f32;
}

aco_ptr<Instruction>
convert_to_DPP(amd_gfx_level gfx_level, aco_ptr<Instruction>& instr, bool dpp8)
{
   if (instr->isDPP())
      return NULL;

   aco_ptr<Instruction> tmp = std::move(instr);
   Format format =
      (Format)((uint32_t)tmp->format | (uint32_t)(dpp8 ? Format::DPP8 : Format::DPP16));
   instr.reset(
      create_instruction(tmp->opcode, format, tmp->operands.size(), tmp->definitions.size()));
   std::copy(tmp->operands.cbegin(), tmp->operands.cend(), instr->operands.begin());
   std::copy(tmp->definitions.cbegin(), tmp->definitions.cend(), instr->definitions.begin());

   if (dpp8) {
      DPP8_instruction* dpp = &instr->dpp8();
      dpp->lane_sel = 0xfac688; /* [0,1,2,3,4,5,6,7] */
      dpp->fetch_inactive = gfx_level >= GFX10;
   } else {
      DPP16_instruction* dpp = &instr->dpp16();
      dpp->dpp_ctrl = dpp_quad_perm(0, 1, 2, 3);
      dpp->row_mask = 0xf;
      dpp->bank_mask = 0xf;
      dpp->fetch_inactive = gfx_level >= GFX10;
   }

   instr->valu().neg = tmp->valu().neg;
   instr->valu().abs = tmp->valu().abs;
   instr->valu().omod = tmp->valu().omod;
   instr->valu().clamp = tmp->valu().clamp;
   instr->valu().opsel = tmp->valu().opsel;
   instr->valu().opsel_lo = tmp->valu().opsel_lo;
   instr->valu().opsel_hi = tmp->valu().opsel_hi;

   if ((instr->isVOPC() || instr->definitions.size() > 1) && gfx_level < GFX11)
      instr->definitions.back().setPrecolored(vcc);

   if (instr->operands.size() >= 3 && instr->operands[2].isOfType(RegType::sgpr) &&
       gfx_level < GFX11)
      instr->operands[2].setPrecolored(vcc);

   instr->pass_flags = tmp->pass_flags;

   /* DPP16 supports input modifiers, so we might no longer need VOP3. */
   bool remove_vop3 = !dpp8 && !instr->valu().omod && !instr->valu().clamp &&
                      (instr->isVOP1() || instr->isVOP2() || instr->isVOPC());

   /* VOPC/add_co/sub_co definition needs VCC without VOP3. */
   remove_vop3 &= instr->definitions.back().regClass().type() != RegType::sgpr ||
                  !instr->definitions.back().isFixed() ||
                  instr->definitions.back().physReg() == vcc;

   /* addc/subb/cndmask 3rd operand needs VCC without VOP3. */
   remove_vop3 &= instr->operands.size() < 3 || !instr->operands[2].isFixed() ||
                  instr->operands[2].isOfType(RegType::vgpr) || instr->operands[2].physReg() == vcc;

   if (remove_vop3)
      instr->format = withoutVOP3(instr->format);

   return tmp;
}

bool
can_use_input_modifiers(amd_gfx_level gfx_level, aco_opcode op, int idx)
{
   if (op == aco_opcode::v_mov_b32)
      return gfx_level >= GFX10;

   if (op == aco_opcode::v_ldexp_f16 || op == aco_opcode::v_ldexp_f32 ||
       op == aco_opcode::v_ldexp_f64)
      return idx == 0;

   return instr_info.can_use_input_modifiers[(int)op];
}

bool
can_use_opsel(amd_gfx_level gfx_level, aco_opcode op, int idx)
{
   /* opsel is only GFX9+ */
   if (gfx_level < GFX9)
      return false;

   switch (op) {
   case aco_opcode::v_div_fixup_f16:
   case aco_opcode::v_fma_f16:
   case aco_opcode::v_mad_f16:
   case aco_opcode::v_mad_u16:
   case aco_opcode::v_mad_i16:
   case aco_opcode::v_med3_f16:
   case aco_opcode::v_med3_i16:
   case aco_opcode::v_med3_u16:
   case aco_opcode::v_min3_f16:
   case aco_opcode::v_min3_i16:
   case aco_opcode::v_min3_u16:
   case aco_opcode::v_max3_f16:
   case aco_opcode::v_max3_i16:
   case aco_opcode::v_max3_u16:
   case aco_opcode::v_minmax_f16:
   case aco_opcode::v_maxmin_f16:
   case aco_opcode::v_max_u16_e64:
   case aco_opcode::v_max_i16_e64:
   case aco_opcode::v_min_u16_e64:
   case aco_opcode::v_min_i16_e64:
   case aco_opcode::v_add_i16:
   case aco_opcode::v_sub_i16:
   case aco_opcode::v_add_u16_e64:
   case aco_opcode::v_sub_u16_e64:
   case aco_opcode::v_lshlrev_b16_e64:
   case aco_opcode::v_lshrrev_b16_e64:
   case aco_opcode::v_ashrrev_i16_e64:
   case aco_opcode::v_and_b16:
   case aco_opcode::v_or_b16:
   case aco_opcode::v_xor_b16:
   case aco_opcode::v_mul_lo_u16_e64: return true;
   case aco_opcode::v_pack_b32_f16:
   case aco_opcode::v_cvt_pknorm_i16_f16:
   case aco_opcode::v_cvt_pknorm_u16_f16: return idx != -1;
   case aco_opcode::v_mad_u32_u16:
   case aco_opcode::v_mad_i32_i16: return idx >= 0 && idx < 2;
   case aco_opcode::v_dot2_f16_f16:
   case aco_opcode::v_dot2_bf16_bf16: return idx == -1 || idx == 2;
   case aco_opcode::v_cndmask_b16: return idx != 2;
   case aco_opcode::v_interp_p10_f16_f32_inreg:
   case aco_opcode::v_interp_p10_rtz_f16_f32_inreg: return idx == 0 || idx == 2;
   case aco_opcode::v_interp_p2_f16_f32_inreg:
   case aco_opcode::v_interp_p2_rtz_f16_f32_inreg: return idx == -1 || idx == 0;
   default:
      return gfx_level >= GFX11 && (get_gfx11_true16_mask(op) & BITFIELD_BIT(idx == -1 ? 3 : idx));
   }
}

bool
can_write_m0(const aco_ptr<Instruction>& instr)
{
   if (instr->isSALU())
      return true;

   /* VALU can't write m0 on any GPU generation. */
   if (instr->isVALU())
      return false;

   switch (instr->opcode) {
   case aco_opcode::p_parallelcopy:
   case aco_opcode::p_extract:
   case aco_opcode::p_insert:
      /* These pseudo instructions are implemented with SALU when writing m0. */
      return true;
   default:
      /* Assume that no other instructions can write m0. */
      return false;
   }
}

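/* Whether the instruction writes only 16 bits of its 32-bit destination VGPR
 * and preserves the remaining bits (such partial VGPR writes exist on GFX9+). */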
bool
instr_is_16bit(amd_gfx_level gfx_level, aco_opcode op)
{
   /* partial register writes are GFX9+ only */
   if (gfx_level < GFX9)
      return false;

   switch (op) {
   /* VOP3 */
   case aco_opcode::v_mad_legacy_f16:
   case aco_opcode::v_mad_legacy_u16:
   case aco_opcode::v_mad_legacy_i16:
   case aco_opcode::v_fma_legacy_f16:
   case aco_opcode::v_div_fixup_legacy_f16: return false;
   case aco_opcode::v_interp_p2_f16:
   case aco_opcode::v_interp_p2_hi_f16:
   case aco_opcode::v_fma_mixlo_f16:
   case aco_opcode::v_fma_mixhi_f16:
   /* VOP2 */
   case aco_opcode::v_mac_f16:
   case aco_opcode::v_madak_f16:
   case aco_opcode::v_madmk_f16: return gfx_level >= GFX9;
   case aco_opcode::v_add_f16:
   case aco_opcode::v_sub_f16:
   case aco_opcode::v_subrev_f16:
   case aco_opcode::v_mul_f16:
   case aco_opcode::v_max_f16:
   case aco_opcode::v_min_f16:
   case aco_opcode::v_ldexp_f16:
   case aco_opcode::v_fmac_f16:
   case aco_opcode::v_fmamk_f16:
   case aco_opcode::v_fmaak_f16:
   /* VOP1 */
   case aco_opcode::v_cvt_f16_f32:
   case aco_opcode::p_v_cvt_f16_f32_rtne:
   case aco_opcode::v_cvt_f16_u16:
   case aco_opcode::v_cvt_f16_i16:
   case aco_opcode::v_rcp_f16:
   case aco_opcode::v_sqrt_f16:
   case aco_opcode::v_rsq_f16:
   case aco_opcode::v_log_f16:
   case aco_opcode::v_exp_f16:
   case aco_opcode::v_frexp_mant_f16:
   case aco_opcode::v_frexp_exp_i16_f16:
   case aco_opcode::v_floor_f16:
   case aco_opcode::v_ceil_f16:
   case aco_opcode::v_trunc_f16:
   case aco_opcode::v_rndne_f16:
   case aco_opcode::v_fract_f16:
   case aco_opcode::v_sin_f16:
   case aco_opcode::v_cos_f16:
   case aco_opcode::v_cvt_u16_f16:
   case aco_opcode::v_cvt_i16_f16:
   case aco_opcode::v_cvt_norm_i16_f16:
   case aco_opcode::v_cvt_norm_u16_f16: return gfx_level >= GFX10;
   /* all non-legacy opsel instructions preserve the high bits */
   default: return can_use_opsel(gfx_level, op, -1);
   }
}

/* On GFX11, for some instructions, bit 7 of the destination/operand VGPR acts as opsel, so the
 * field only supports v0-v127.
 * In the returned mask, the first three bits correspond to operands 0-2 and the fourth bit to the
 * destination.
 */
uint8_t
get_gfx11_true16_mask(aco_opcode op)
{
   switch (op) {
   case aco_opcode::v_ceil_f16:
   case aco_opcode::v_cos_f16:
   case aco_opcode::v_cvt_f16_i16:
   case aco_opcode::v_cvt_f16_u16:
   case aco_opcode::v_cvt_i16_f16:
   case aco_opcode::v_cvt_u16_f16:
   case aco_opcode::v_cvt_norm_i16_f16:
   case aco_opcode::v_cvt_norm_u16_f16:
   case aco_opcode::v_exp_f16:
   case aco_opcode::v_floor_f16:
   case aco_opcode::v_fract_f16:
   case aco_opcode::v_frexp_exp_i16_f16:
   case aco_opcode::v_frexp_mant_f16:
   case aco_opcode::v_log_f16:
   case aco_opcode::v_not_b16:
   case aco_opcode::v_rcp_f16:
   case aco_opcode::v_rndne_f16:
   case aco_opcode::v_rsq_f16:
   case aco_opcode::v_sin_f16:
   case aco_opcode::v_sqrt_f16:
   case aco_opcode::v_trunc_f16:
   case aco_opcode::v_swap_b16:
   case aco_opcode::v_mov_b16: return 0x1 | 0x8;
   case aco_opcode::v_add_f16:
   case aco_opcode::v_fmaak_f16:
   case aco_opcode::v_fmac_f16:
   case aco_opcode::v_fmamk_f16:
   case aco_opcode::v_ldexp_f16:
   case aco_opcode::v_max_f16:
   case aco_opcode::v_min_f16:
   case aco_opcode::v_mul_f16:
   case aco_opcode::v_sub_f16:
   case aco_opcode::v_subrev_f16:
   case aco_opcode::v_and_b16:
   case aco_opcode::v_or_b16:
   case aco_opcode::v_xor_b16: return 0x3 | 0x8;
   case aco_opcode::v_cvt_f32_f16:
   case aco_opcode::v_cvt_i32_i16:
   case aco_opcode::v_cvt_u32_u16: return 0x1;
   case aco_opcode::v_cmp_class_f16:
   case aco_opcode::v_cmp_eq_f16:
   case aco_opcode::v_cmp_eq_i16:
   case aco_opcode::v_cmp_eq_u16:
   case aco_opcode::v_cmp_ge_f16:
   case aco_opcode::v_cmp_ge_i16:
   case aco_opcode::v_cmp_ge_u16:
   case aco_opcode::v_cmp_gt_f16:
   case aco_opcode::v_cmp_gt_i16:
   case aco_opcode::v_cmp_gt_u16:
   case aco_opcode::v_cmp_le_f16:
   case aco_opcode::v_cmp_le_i16:
   case aco_opcode::v_cmp_le_u16:
   case aco_opcode::v_cmp_lg_f16:
   case aco_opcode::v_cmp_lg_i16:
   case aco_opcode::v_cmp_lg_u16:
   case aco_opcode::v_cmp_lt_f16:
   case aco_opcode::v_cmp_lt_i16:
   case aco_opcode::v_cmp_lt_u16:
   case aco_opcode::v_cmp_neq_f16:
   case aco_opcode::v_cmp_nge_f16:
   case aco_opcode::v_cmp_ngt_f16:
   case aco_opcode::v_cmp_nle_f16:
   case aco_opcode::v_cmp_nlg_f16:
   case aco_opcode::v_cmp_nlt_f16:
   case aco_opcode::v_cmp_o_f16:
   case aco_opcode::v_cmp_u_f16:
   case aco_opcode::v_cmpx_class_f16:
   case aco_opcode::v_cmpx_eq_f16:
   case aco_opcode::v_cmpx_eq_i16:
   case aco_opcode::v_cmpx_eq_u16:
   case aco_opcode::v_cmpx_ge_f16:
   case aco_opcode::v_cmpx_ge_i16:
   case aco_opcode::v_cmpx_ge_u16:
   case aco_opcode::v_cmpx_gt_f16:
   case aco_opcode::v_cmpx_gt_i16:
   case aco_opcode::v_cmpx_gt_u16:
   case aco_opcode::v_cmpx_le_f16:
   case aco_opcode::v_cmpx_le_i16:
   case aco_opcode::v_cmpx_le_u16:
   case aco_opcode::v_cmpx_lg_f16:
   case aco_opcode::v_cmpx_lg_i16:
   case aco_opcode::v_cmpx_lg_u16:
   case aco_opcode::v_cmpx_lt_f16:
   case aco_opcode::v_cmpx_lt_i16:
   case aco_opcode::v_cmpx_lt_u16:
   case aco_opcode::v_cmpx_neq_f16:
   case aco_opcode::v_cmpx_nge_f16:
   case aco_opcode::v_cmpx_ngt_f16:
   case aco_opcode::v_cmpx_nle_f16:
   case aco_opcode::v_cmpx_nlg_f16:
   case aco_opcode::v_cmpx_nlt_f16:
   case aco_opcode::v_cmpx_o_f16:
   case aco_opcode::v_cmpx_u_f16: return 0x3;
   case aco_opcode::v_cvt_f16_f32:
   case aco_opcode::v_sat_pk_u8_i16: return 0x8;
   default: return 0x0;
   }
}

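/* Identity value of the reduction, returned one dword at a time: for 64-bit
 * operations, idx selects the low (0) or high (1) half. */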
uint32_t
get_reduction_identity(ReduceOp op, unsigned idx)
{
   switch (op) {
   case iadd8:
   case iadd16:
   case iadd32:
   case iadd64:
   case fadd16:
   case fadd32:
   case fadd64:
   case ior8:
   case ior16:
   case ior32:
   case ior64:
   case ixor8:
   case ixor16:
   case ixor32:
   case ixor64:
   case umax8:
   case umax16:
   case umax32:
   case umax64: return 0;
   case imul8:
   case imul16:
   case imul32:
   case imul64: return idx ? 0 : 1;
   case fmul16: return 0x3c00u;                /* 1.0 */
   case fmul32: return 0x3f800000u;            /* 1.0 */
   case fmul64: return idx ? 0x3ff00000u : 0u; /* 1.0 */
   case imin8: return INT8_MAX;
   case imin16: return INT16_MAX;
   case imin32: return INT32_MAX;
   case imin64: return idx ? 0x7fffffffu : 0xffffffffu;
   case imax8: return INT8_MIN;
   case imax16: return INT16_MIN;
   case imax32: return INT32_MIN;
   case imax64: return idx ? 0x80000000u : 0;
   case umin8:
   case umin16:
   case iand8:
   case iand16: return 0xffffffffu;
   case umin32:
   case umin64:
   case iand32:
   case iand64: return 0xffffffffu;
   case fmin16: return 0x7c00u;                /* infinity */
   case fmin32: return 0x7f800000u;            /* infinity */
   case fmin64: return idx ? 0x7ff00000u : 0u; /* infinity */
   case fmax16: return 0xfc00u;                /* negative infinity */
   case fmax32: return 0xff800000u;            /* negative infinity */
   case fmax64: return idx ? 0xfff00000u : 0u; /* negative infinity */
   default: unreachable("Invalid reduction operation"); break;
   }
   return 0;
}

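/* Bit size of the given operand, accounting for opcodes with mixed
 * 16/32/64-bit operands. */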
unsigned
get_operand_size(aco_ptr<Instruction>& instr, unsigned index)
{
   if (instr->isPseudo())
      return instr->operands[index].bytes() * 8u;
   else if (instr->opcode == aco_opcode::v_mad_u64_u32 ||
            instr->opcode == aco_opcode::v_mad_i64_i32)
      return index == 2 ? 64 : 32;
   else if (instr->opcode == aco_opcode::v_fma_mix_f32 ||
            instr->opcode == aco_opcode::v_fma_mixlo_f16 ||
            instr->opcode == aco_opcode::v_fma_mixhi_f16)
      return instr->valu().opsel_hi[index] ? 16 : 32;
   else if (instr->opcode == aco_opcode::v_interp_p10_f16_f32_inreg ||
            instr->opcode == aco_opcode::v_interp_p10_rtz_f16_f32_inreg)
      return index == 1 ? 32 : 16;
   else if (instr->opcode == aco_opcode::v_interp_p2_f16_f32_inreg ||
            instr->opcode == aco_opcode::v_interp_p2_rtz_f16_f32_inreg)
      return index == 0 ? 16 : 32;
   else if (instr->isVALU() || instr->isSALU())
      return instr_info.operand_size[(int)instr->opcode];
   else
      return 0;
}

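/* Whether the instruction's effects depend on the exec mask (lane-masked
 * execution); readlane/writelane and most SALU instructions do not. */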
bool
needs_exec_mask(const Instruction* instr)
{
   if (instr->isVALU()) {
      return instr->opcode != aco_opcode::v_readlane_b32 &&
             instr->opcode != aco_opcode::v_readlane_b32_e64 &&
             instr->opcode != aco_opcode::v_writelane_b32 &&
             instr->opcode != aco_opcode::v_writelane_b32_e64;
   }

   if (instr->isVMEM() || instr->isFlatLike())
      return true;

   if (instr->isSALU() || instr->isBranch() || instr->isSMEM() || instr->isBarrier())
      return instr->opcode == aco_opcode::s_cbranch_execz ||
             instr->opcode == aco_opcode::s_cbranch_execnz || instr->reads_exec();

   if (instr->isPseudo()) {
      switch (instr->opcode) {
      case aco_opcode::p_create_vector:
      case aco_opcode::p_extract_vector:
      case aco_opcode::p_split_vector:
      case aco_opcode::p_phi:
      case aco_opcode::p_parallelcopy:
         for (Definition def : instr->definitions) {
            if (def.getTemp().type() == RegType::vgpr)
               return true;
         }
         return instr->reads_exec();
      case aco_opcode::p_spill:
      case aco_opcode::p_reload:
      case aco_opcode::p_end_linear_vgpr:
      case aco_opcode::p_logical_start:
      case aco_opcode::p_logical_end:
      case aco_opcode::p_startpgm:
      case aco_opcode::p_end_wqm:
      case aco_opcode::p_init_scratch: return instr->reads_exec();
      case aco_opcode::p_start_linear_vgpr: return instr->operands.size();
      default: break;
      }
   }

   return true;
}

struct CmpInfo {
   aco_opcode swapped;
   aco_opcode inverse;
   aco_opcode vcmpx;
};

static ALWAYS_INLINE bool
get_cmp_info(aco_opcode op, CmpInfo* info)
{
   info->swapped = aco_opcode::num_opcodes;
   info->inverse = aco_opcode::num_opcodes;
   info->vcmpx = aco_opcode::num_opcodes;
   switch (op) {
      // clang-format off
#define CMP2(ord, unord, ord_swap, unord_swap, sz) \
   case aco_opcode::v_cmp_##ord##_f##sz: \
   case aco_opcode::v_cmp_n##unord##_f##sz: \
      info->swapped = op == aco_opcode::v_cmp_##ord##_f##sz ? aco_opcode::v_cmp_##ord_swap##_f##sz \
                                                            : aco_opcode::v_cmp_n##unord_swap##_f##sz; \
      info->inverse = op == aco_opcode::v_cmp_n##unord##_f##sz ? aco_opcode::v_cmp_##unord##_f##sz \
                                                               : aco_opcode::v_cmp_n##ord##_f##sz; \
      info->vcmpx = op == aco_opcode::v_cmp_##ord##_f##sz ? aco_opcode::v_cmpx_##ord##_f##sz \
                                                          : aco_opcode::v_cmpx_n##unord##_f##sz; \
      return true;
#define CMP(ord, unord, ord_swap, unord_swap) \
   CMP2(ord, unord, ord_swap, unord_swap, 16) \
   CMP2(ord, unord, ord_swap, unord_swap, 32) \
   CMP2(ord, unord, ord_swap, unord_swap, 64)
      CMP(lt, /*n*/ge, gt, /*n*/le)
      CMP(eq, /*n*/lg, eq, /*n*/lg)
      CMP(le, /*n*/gt, ge, /*n*/lt)
      CMP(gt, /*n*/le, lt, /*n*/ge)
      CMP(lg, /*n*/eq, lg, /*n*/eq)
      CMP(ge, /*n*/lt, le, /*n*/gt)
#undef CMP
#undef CMP2
#define ORD_TEST(sz) \
   case aco_opcode::v_cmp_u_f##sz: \
      info->swapped = aco_opcode::v_cmp_u_f##sz; \
      info->inverse = aco_opcode::v_cmp_o_f##sz; \
      info->vcmpx = aco_opcode::v_cmpx_u_f##sz; \
      return true; \
   case aco_opcode::v_cmp_o_f##sz: \
      info->swapped = aco_opcode::v_cmp_o_f##sz; \
      info->inverse = aco_opcode::v_cmp_u_f##sz; \
      info->vcmpx = aco_opcode::v_cmpx_o_f##sz; \
      return true;
      ORD_TEST(16)
      ORD_TEST(32)
      ORD_TEST(64)
#undef ORD_TEST
#define CMPI2(op, swap, inv, type, sz) \
   case aco_opcode::v_cmp_##op##_##type##sz: \
      info->swapped = aco_opcode::v_cmp_##swap##_##type##sz; \
      info->inverse = aco_opcode::v_cmp_##inv##_##type##sz; \
      info->vcmpx = aco_opcode::v_cmpx_##op##_##type##sz; \
      return true;
#define CMPI(op, swap, inv) \
   CMPI2(op, swap, inv, i, 16) \
   CMPI2(op, swap, inv, u, 16) \
   CMPI2(op, swap, inv, i, 32) \
   CMPI2(op, swap, inv, u, 32) \
   CMPI2(op, swap, inv, i, 64) \
   CMPI2(op, swap, inv, u, 64)
      CMPI(lt, gt, ge)
      CMPI(eq, eq, lg)
      CMPI(le, ge, gt)
      CMPI(gt, lt, le)
      CMPI(lg, lg, eq)
      CMPI(ge, le, lt)
#undef CMPI
#undef CMPI2
#define CMPCLASS(sz) \
   case aco_opcode::v_cmp_class_f##sz: \
      info->vcmpx = aco_opcode::v_cmpx_class_f##sz; \
      return true;
      CMPCLASS(16)
      CMPCLASS(32)
      CMPCLASS(64)
#undef CMPCLASS
      // clang-format on
   default: return false;
   }
}

aco_opcode
get_vcmp_inverse(aco_opcode op)
{
   CmpInfo info;
   return get_cmp_info(op, &info) ? info.inverse : aco_opcode::num_opcodes;
}

aco_opcode
get_vcmp_swapped(aco_opcode op)
{
   CmpInfo info;
   return get_cmp_info(op, &info) ? info.swapped : aco_opcode::num_opcodes;
}

aco_opcode
get_vcmpx(aco_opcode op)
{
   CmpInfo info;
   return get_cmp_info(op, &info) ? info.vcmpx : aco_opcode::num_opcodes;
}

bool
is_cmpx(aco_opcode op)
{
   CmpInfo info;
   return !get_cmp_info(op, &info);
}

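/* Checks whether operands idx0 and idx1 of the instruction can be swapped,
 * writing the opcode to use after the swap to *new_op (e.g. v_sub_f32 turns
 * into v_subrev_f32; commutative opcodes stay unchanged). */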
bool
can_swap_operands(aco_ptr<Instruction>& instr, aco_opcode* new_op, unsigned idx0, unsigned idx1)
{
   if (idx0 == idx1) {
      *new_op = instr->opcode;
      return true;
   }

   if (idx0 > idx1)
      std::swap(idx0, idx1);

   if (instr->isDPP())
      return false;

   if (!instr->isVOP3() && !instr->isVOP3P() && !instr->operands[0].isOfType(RegType::vgpr))
      return false;

   if (instr->isVOPC()) {
      CmpInfo info;
      if (get_cmp_info(instr->opcode, &info) && info.swapped != aco_opcode::num_opcodes) {
         *new_op = info.swapped;
         return true;
      }
   }

   /* opcodes not relevant for DPP or SGPR optimizations are not included. */
   switch (instr->opcode) {
   case aco_opcode::v_med3_f32: return false; /* order matters for clamp+GFX8+denorm ftz. */
   case aco_opcode::v_add_u32:
   case aco_opcode::v_add_co_u32:
   case aco_opcode::v_add_co_u32_e64:
   case aco_opcode::v_add_i32:
   case aco_opcode::v_add_i16:
   case aco_opcode::v_add_u16_e64:
   case aco_opcode::v_add3_u32:
   case aco_opcode::v_add_f16:
   case aco_opcode::v_add_f32:
   case aco_opcode::v_mul_i32_i24:
   case aco_opcode::v_mul_hi_i32_i24:
   case aco_opcode::v_mul_u32_u24:
   case aco_opcode::v_mul_hi_u32_u24:
   case aco_opcode::v_mul_lo_u16:
   case aco_opcode::v_mul_lo_u16_e64:
   case aco_opcode::v_mul_f16:
   case aco_opcode::v_mul_f32:
   case aco_opcode::v_mul_legacy_f32:
   case aco_opcode::v_or_b32:
   case aco_opcode::v_and_b32:
   case aco_opcode::v_xor_b32:
   case aco_opcode::v_xnor_b32:
   case aco_opcode::v_xor3_b32:
   case aco_opcode::v_or3_b32:
   case aco_opcode::v_and_b16:
   case aco_opcode::v_or_b16:
   case aco_opcode::v_xor_b16:
   case aco_opcode::v_max3_f32:
   case aco_opcode::v_min3_f32:
   case aco_opcode::v_max3_f16:
   case aco_opcode::v_min3_f16:
   case aco_opcode::v_med3_f16:
   case aco_opcode::v_max3_u32:
   case aco_opcode::v_min3_u32:
   case aco_opcode::v_med3_u32:
   case aco_opcode::v_max3_i32:
   case aco_opcode::v_min3_i32:
   case aco_opcode::v_med3_i32:
   case aco_opcode::v_max3_u16:
   case aco_opcode::v_min3_u16:
   case aco_opcode::v_med3_u16:
   case aco_opcode::v_max3_i16:
   case aco_opcode::v_min3_i16:
   case aco_opcode::v_med3_i16:
   case aco_opcode::v_max_f16:
   case aco_opcode::v_max_f32:
   case aco_opcode::v_min_f16:
   case aco_opcode::v_min_f32:
   case aco_opcode::v_max_i32:
   case aco_opcode::v_min_i32:
   case aco_opcode::v_max_u32:
   case aco_opcode::v_min_u32:
   case aco_opcode::v_max_i16:
   case aco_opcode::v_min_i16:
   case aco_opcode::v_max_u16:
   case aco_opcode::v_min_u16:
   case aco_opcode::v_max_i16_e64:
   case aco_opcode::v_min_i16_e64:
   case aco_opcode::v_max_u16_e64:
   case aco_opcode::v_min_u16_e64: *new_op = instr->opcode; return true;
   case aco_opcode::v_sub_f16: *new_op = aco_opcode::v_subrev_f16; return true;
   case aco_opcode::v_sub_f32: *new_op = aco_opcode::v_subrev_f32; return true;
   case aco_opcode::v_sub_co_u32: *new_op = aco_opcode::v_subrev_co_u32; return true;
   case aco_opcode::v_sub_u16: *new_op = aco_opcode::v_subrev_u16; return true;
   case aco_opcode::v_sub_u32: *new_op = aco_opcode::v_subrev_u32; return true;
   case aco_opcode::v_sub_co_u32_e64: *new_op = aco_opcode::v_subrev_co_u32_e64; return true;
   case aco_opcode::v_subrev_f16: *new_op = aco_opcode::v_sub_f16; return true;
   case aco_opcode::v_subrev_f32: *new_op = aco_opcode::v_sub_f32; return true;
   case aco_opcode::v_subrev_co_u32: *new_op = aco_opcode::v_sub_co_u32; return true;
   case aco_opcode::v_subrev_u16: *new_op = aco_opcode::v_sub_u16; return true;
   case aco_opcode::v_subrev_u32: *new_op = aco_opcode::v_sub_u32; return true;
   case aco_opcode::v_subrev_co_u32_e64: *new_op = aco_opcode::v_sub_co_u32_e64; return true;
   case aco_opcode::v_addc_co_u32:
   case aco_opcode::v_mad_i32_i24:
   case aco_opcode::v_mad_u32_u24:
   case aco_opcode::v_lerp_u8:
   case aco_opcode::v_sad_u8:
   case aco_opcode::v_sad_hi_u8:
   case aco_opcode::v_sad_u16:
   case aco_opcode::v_sad_u32:
   case aco_opcode::v_xad_u32:
   case aco_opcode::v_add_lshl_u32:
   case aco_opcode::v_and_or_b32:
   case aco_opcode::v_mad_u16:
   case aco_opcode::v_mad_i16:
   case aco_opcode::v_mad_u32_u16:
   case aco_opcode::v_mad_i32_i16:
   case aco_opcode::v_maxmin_f32:
   case aco_opcode::v_minmax_f32:
   case aco_opcode::v_maxmin_f16:
   case aco_opcode::v_minmax_f16:
   case aco_opcode::v_maxmin_u32:
   case aco_opcode::v_minmax_u32:
   case aco_opcode::v_maxmin_i32:
   case aco_opcode::v_minmax_i32:
   case aco_opcode::v_fma_f32:
   case aco_opcode::v_fma_legacy_f32:
   case aco_opcode::v_fmac_f32:
   case aco_opcode::v_fmac_legacy_f32:
   case aco_opcode::v_mac_f32:
   case aco_opcode::v_mac_legacy_f32:
   case aco_opcode::v_fma_f16:
   case aco_opcode::v_fmac_f16:
   case aco_opcode::v_mac_f16:
   case aco_opcode::v_dot4c_i32_i8:
   case aco_opcode::v_dot2c_f32_f16:
   case aco_opcode::v_dot2_f32_f16:
   case aco_opcode::v_dot2_f32_bf16:
   case aco_opcode::v_dot2_f16_f16:
   case aco_opcode::v_dot2_bf16_bf16:
   case aco_opcode::v_fma_mix_f32:
   case aco_opcode::v_fma_mixlo_f16:
   case aco_opcode::v_fma_mixhi_f16:
   case aco_opcode::v_pk_fmac_f16: {
      if (idx1 == 2)
         return false;
      *new_op = instr->opcode;
      return true;
   }
   case aco_opcode::v_subb_co_u32: {
      if (idx1 == 2)
         return false;
      *new_op = aco_opcode::v_subbrev_co_u32;
      return true;
   }
   case aco_opcode::v_subbrev_co_u32: {
      if (idx1 == 2)
         return false;
      *new_op = aco_opcode::v_subb_co_u32;
      return true;
   }
   default: return false;
   }
}

wait_imm::wait_imm()
    : exp(unset_counter), lgkm(unset_counter), vm(unset_counter), vs(unset_counter),
      sample(unset_counter), bvh(unset_counter), km(unset_counter)
{}
wait_imm::wait_imm(uint16_t vm_, uint16_t exp_, uint16_t lgkm_, uint16_t vs_)
    : exp(exp_), lgkm(lgkm_), vm(vm_), vs(vs_), sample(unset_counter), bvh(unset_counter),
      km(unset_counter)
{}

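/* Packs the counters into the raw s_waitcnt immediate for the given generation.
 * A summary of the bit layouts implemented below:
 *   GFX11+: vm[15:10] lgkm[9:4] exp[2:0]
 *   GFX10:  vm[15:14,3:0] lgkm[13:8] exp[6:4]
 *   GFX9:   vm[15:14,3:0] lgkm[11:8] exp[6:4]
 *   GFX6-8: lgkm[11:8] exp[6:4] vm[3:0]
 */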
uint16_t
wait_imm::pack(enum amd_gfx_level gfx_level) const
{
   uint16_t imm = 0;
   assert(exp == unset_counter || exp <= 0x7);
   if (gfx_level >= GFX11) {
      assert(lgkm == unset_counter || lgkm <= 0x3f);
      assert(vm == unset_counter || vm <= 0x3f);
      imm = ((vm & 0x3f) << 10) | ((lgkm & 0x3f) << 4) | (exp & 0x7);
   } else if (gfx_level >= GFX10) {
      assert(lgkm == unset_counter || lgkm <= 0x3f);
      assert(vm == unset_counter || vm <= 0x3f);
      imm = ((vm & 0x30) << 10) | ((lgkm & 0x3f) << 8) | ((exp & 0x7) << 4) | (vm & 0xf);
   } else if (gfx_level >= GFX9) {
      assert(lgkm == unset_counter || lgkm <= 0xf);
      assert(vm == unset_counter || vm <= 0x3f);
      imm = ((vm & 0x30) << 10) | ((lgkm & 0xf) << 8) | ((exp & 0x7) << 4) | (vm & 0xf);
   } else {
      assert(lgkm == unset_counter || lgkm <= 0xf);
      assert(vm == unset_counter || vm <= 0xf);
      imm = ((lgkm & 0xf) << 8) | ((exp & 0x7) << 4) | (vm & 0xf);
   }
   if (gfx_level < GFX9 && vm == wait_imm::unset_counter)
      imm |= 0xc000; /* should have no effect on pre-GFX9 and now we won't have to worry about the
                        architecture when interpreting the immediate */
   if (gfx_level < GFX10 && lgkm == wait_imm::unset_counter)
      imm |= 0x3000; /* should have no effect on pre-GFX10 and now we won't have to worry about the
                        architecture when interpreting the immediate */
   return imm;
}

wait_imm
wait_imm::max(enum amd_gfx_level gfx_level)
{
   wait_imm imm;
   imm.vm = gfx_level >= GFX9 ? 63 : 15;
   imm.exp = 7;
   imm.lgkm = gfx_level >= GFX10 ? 63 : 15;
   imm.vs = gfx_level >= GFX10 ? 63 : 0;
   imm.sample = gfx_level >= GFX12 ? 63 : 0;
   imm.bvh = gfx_level >= GFX12 ? 7 : 0;
   imm.km = gfx_level >= GFX12 ? 31 : 0;
   return imm;
}

bool
wait_imm::unpack(enum amd_gfx_level gfx_level, const Instruction* instr)
{
   if (!instr->isSALU() || (!instr->operands.empty() && instr->operands[0].physReg() != sgpr_null))
      return false;

   aco_opcode op = instr->opcode;
   uint16_t packed = instr->salu().imm;

   if (op == aco_opcode::s_wait_loadcnt) {
      vm = std::min<uint8_t>(vm, packed);
   } else if (op == aco_opcode::s_wait_storecnt) {
      vs = std::min<uint8_t>(vs, packed);
   } else if (op == aco_opcode::s_wait_samplecnt) {
      sample = std::min<uint8_t>(sample, packed);
   } else if (op == aco_opcode::s_wait_bvhcnt) {
      bvh = std::min<uint8_t>(bvh, packed);
   } else if (op == aco_opcode::s_wait_expcnt) {
      exp = std::min<uint8_t>(exp, packed);
   } else if (op == aco_opcode::s_wait_dscnt) {
      lgkm = std::min<uint8_t>(lgkm, packed);
   } else if (op == aco_opcode::s_wait_kmcnt) {
      km = std::min<uint8_t>(km, packed);
   } else if (op == aco_opcode::s_wait_loadcnt_dscnt) {
      uint32_t vm2 = (packed >> 8) & 0x3f;
      uint32_t ds = packed & 0x3f;
      vm = std::min<uint8_t>(vm, vm2 == 0x3f ? wait_imm::unset_counter : vm2);
      lgkm = std::min<uint8_t>(lgkm, ds == 0x3f ? wait_imm::unset_counter : ds);
   } else if (op == aco_opcode::s_wait_storecnt_dscnt) {
      uint32_t vs2 = (packed >> 8) & 0x3f;
      uint32_t ds = packed & 0x3f;
      vs = std::min<uint8_t>(vs, vs2 == 0x3f ? wait_imm::unset_counter : vs2);
      lgkm = std::min<uint8_t>(lgkm, ds == 0x3f ? wait_imm::unset_counter : ds);
   } else if (op == aco_opcode::s_waitcnt_expcnt) {
      exp = std::min<uint8_t>(exp, packed);
   } else if (op == aco_opcode::s_waitcnt_lgkmcnt) {
      lgkm = std::min<uint8_t>(lgkm, packed);
   } else if (op == aco_opcode::s_waitcnt_vmcnt) {
      vm = std::min<uint8_t>(vm, packed);
   } else if (op == aco_opcode::s_waitcnt_vscnt) {
      vs = std::min<uint8_t>(vs, packed);
   } else if (op == aco_opcode::s_waitcnt) {
      uint8_t vm2, lgkm2, exp2;
      if (gfx_level >= GFX11) {
         vm2 = (packed >> 10) & 0x3f;
         lgkm2 = (packed >> 4) & 0x3f;
         exp2 = packed & 0x7;
      } else {
         vm2 = packed & 0xf;
         if (gfx_level >= GFX9)
            vm2 |= (packed >> 10) & 0x30;

         exp2 = (packed >> 4) & 0x7;

         lgkm2 = (packed >> 8) & 0xf;
         if (gfx_level >= GFX10)
            lgkm2 |= (packed >> 8) & 0x30;
      }

      if (vm2 == (gfx_level >= GFX9 ? 0x3f : 0xf))
         vm2 = wait_imm::unset_counter;
      if (exp2 == 0x7)
         exp2 = wait_imm::unset_counter;
      if (lgkm2 == (gfx_level >= GFX10 ? 0x3f : 0xf))
         lgkm2 = wait_imm::unset_counter;

      vm = std::min(vm, vm2);
      exp = std::min(exp, exp2);
      lgkm = std::min(lgkm, lgkm2);
   } else {
      return false;
   }
   return true;
}

bool
wait_imm::combine(const wait_imm& other)
{
   bool changed = false;
   for (unsigned i = 0; i < wait_type_num; i++) {
      if (other[i] < (*this)[i])
         changed = true;
      (*this)[i] = std::min((*this)[i], other[i]);
   }
   return changed;
}

bool
wait_imm::empty() const
{
   for (unsigned i = 0; i < wait_type_num; i++) {
      if ((*this)[i] != unset_counter)
         return false;
   }
   return true;
}

void
wait_imm::print(FILE* output) const
{
   const char* names[wait_type_num];
   names[wait_type_exp] = "exp";
   names[wait_type_vm] = "vm";
   names[wait_type_lgkm] = "lgkm";
   names[wait_type_vs] = "vs";
   names[wait_type_sample] = "sample";
   names[wait_type_bvh] = "bvh";
   names[wait_type_km] = "km";
   for (unsigned i = 0; i < wait_type_num; i++) {
      if ((*this)[i] != unset_counter)
         fprintf(output, "%s: %u\n", names[i], (*this)[i]);
   }
}

void
wait_imm::build_waitcnt(Builder& bld)
{
   enum amd_gfx_level gfx_level = bld.program->gfx_level;

   if (gfx_level >= GFX12) {
      if (vm != wait_imm::unset_counter && lgkm != wait_imm::unset_counter) {
         bld.sopp(aco_opcode::s_wait_loadcnt_dscnt, (vm << 8) | lgkm);
         vm = wait_imm::unset_counter;
         lgkm = wait_imm::unset_counter;
      }

      if (vs != wait_imm::unset_counter && lgkm != wait_imm::unset_counter) {
         bld.sopp(aco_opcode::s_wait_storecnt_dscnt, (vs << 8) | lgkm);
         vs = wait_imm::unset_counter;
         lgkm = wait_imm::unset_counter;
      }

      aco_opcode op[wait_type_num];
      op[wait_type_exp] = aco_opcode::s_wait_expcnt;
      op[wait_type_lgkm] = aco_opcode::s_wait_dscnt;
      op[wait_type_vm] = aco_opcode::s_wait_loadcnt;
      op[wait_type_vs] = aco_opcode::s_wait_storecnt;
      op[wait_type_sample] = aco_opcode::s_wait_samplecnt;
      op[wait_type_bvh] = aco_opcode::s_wait_bvhcnt;
      op[wait_type_km] = aco_opcode::s_wait_kmcnt;

      for (unsigned i = 0; i < wait_type_num; i++) {
         if ((*this)[i] != wait_imm::unset_counter)
            bld.sopp(op[i], (*this)[i]);
      }
   } else {
      if (vs != wait_imm::unset_counter) {
         assert(gfx_level >= GFX10);
         bld.sopk(aco_opcode::s_waitcnt_vscnt, Operand(sgpr_null, s1), vs);
         vs = wait_imm::unset_counter;
      }
      if (!empty())
         bld.sopp(aco_opcode::s_waitcnt, pack(gfx_level));
   }

   *this = wait_imm();
}

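/* Heuristic: whether two memory instructions of the same format are likely to
 * access nearby addresses and are therefore worth grouping into a clause. */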
bool
should_form_clause(const Instruction* a, const Instruction* b)
{
   if (a->definitions.empty() != b->definitions.empty())
      return false;

   if (a->format != b->format)
      return false;

   if (a->operands.empty() || b->operands.empty())
      return false;

   /* Assume loads which don't use descriptors might load from similar addresses. */
   if (a->isFlatLike() || a->accessesLDS())
      return true;
   if (a->isSMEM() && a->operands[0].bytes() == 8 && b->operands[0].bytes() == 8)
      return true;

   /* If they load from the same descriptor, assume they might load from similar
    * addresses.
    */
   if (a->isVMEM() || a->isSMEM())
      return a->operands[0].tempId() == b->operands[0].tempId();

   if (a->isEXP() && b->isEXP())
      return true;

   return false;
}

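/* Returns the index of the operand that must be assigned the same register as
 * the definition (e.g. the accumulator of v_mac/v_fmac), or -1 if there is no
 * such operand. */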
int
get_op_fixed_to_def(Instruction* instr)
{
   if (instr->opcode == aco_opcode::v_interp_p2_f32 || instr->opcode == aco_opcode::v_mac_f32 ||
       instr->opcode == aco_opcode::v_fmac_f32 || instr->opcode == aco_opcode::v_mac_f16 ||
       instr->opcode == aco_opcode::v_fmac_f16 || instr->opcode == aco_opcode::v_mac_legacy_f32 ||
       instr->opcode == aco_opcode::v_fmac_legacy_f32 ||
       instr->opcode == aco_opcode::v_pk_fmac_f16 || instr->opcode == aco_opcode::v_writelane_b32 ||
       instr->opcode == aco_opcode::v_writelane_b32_e64 ||
       instr->opcode == aco_opcode::v_dot4c_i32_i8 || instr->opcode == aco_opcode::s_fmac_f32 ||
       instr->opcode == aco_opcode::s_fmac_f16) {
      return 2;
   } else if (instr->opcode == aco_opcode::s_addk_i32 || instr->opcode == aco_opcode::s_mulk_i32 ||
              instr->opcode == aco_opcode::s_cmovk_i32) {
      return 0;
   } else if (instr->isMUBUF() && instr->definitions.size() == 1 && instr->operands.size() == 4) {
      return 3;
   } else if (instr->isMIMG() && instr->definitions.size() == 1 &&
              !instr->operands[2].isUndefined()) {
      return 2;
   }
   return -1;
}

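/* Classifies the instruction into one of the vmem_* types (BVH, sampler or
 * non-sampler access) used for waitcnt insertion; returns 0 for non-VMEM
 * instructions. */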
uint8_t
get_vmem_type(enum amd_gfx_level gfx_level, Instruction* instr)
{
   if (instr->opcode == aco_opcode::image_bvh64_intersect_ray)
      return vmem_bvh;
   else if (gfx_level >= GFX12 && instr->opcode == aco_opcode::image_msaa_load)
      return vmem_sampler;
   else if (instr->isMIMG() && !instr->operands[1].isUndefined() &&
            instr->operands[1].regClass() == s4)
      return vmem_sampler;
   else if (instr->isVMEM() || instr->isScratch() || instr->isGlobal())
      return vmem_nosampler;
   return 0;
}

/* Parses implicit data-dependency resolution: returns the value each counter
 * must reach before the instruction can be issued.
 *
 * (Probably incomplete.)
 */
depctr_wait
parse_depctr_wait(const Instruction* instr)
{
   depctr_wait res;
   if (instr->isVMEM() || instr->isFlatLike() || instr->isDS() || instr->isEXP()) {
      res.va_vdst = 0;
      res.va_exec = 0;
      res.sa_exec = 0;
      if (instr->isVMEM() || instr->isFlatLike()) {
         res.sa_sdst = 0;
         res.va_sdst = 0;
         res.va_vcc = 0;
      }
   } else if (instr->isSMEM()) {
      res.sa_sdst = 0;
      res.va_sdst = 0;
      res.va_vcc = 0;
   } else if (instr->isLDSDIR()) {
      res.va_vdst = instr->ldsdir().wait_vdst;
      res.va_exec = 0;
      res.sa_exec = 0;
   } else if (instr->opcode == aco_opcode::s_waitcnt_depctr) {
      unsigned imm = instr->salu().imm;
      res.va_vdst = (imm >> 12) & 0xf;
      res.va_sdst = (imm >> 9) & 0x7;
      res.va_ssrc = (imm >> 8) & 0x1;
      res.hold_cnt = (imm >> 7) & 0x1;
      res.vm_vsrc = (imm >> 2) & 0x7;
      res.va_vcc = (imm >> 1) & 0x1;
      res.sa_sdst = imm & 0x1;
   } else if (instr->isVALU()) {
      res.sa_exec = 0;
      for (const Definition& def : instr->definitions) {
         if (def.regClass().type() == RegType::sgpr) {
            res.sa_sdst = 0;
            /* Notably, this is the only exception; even a VALU that
             * reads exec doesn't implicitly wait for va_exec.
             */
            if (instr->opcode == aco_opcode::v_readfirstlane_b32)
               res.va_exec = 0;
            break;
         }
      }
   } else if (instr_info.classes[(int)instr->opcode] == instr_class::branch ||
              instr_info.classes[(int)instr->opcode] == instr_class::sendmsg) {
      res.sa_exec = 0;
      res.va_exec = 0;
      switch (instr->opcode) {
      case aco_opcode::s_cbranch_vccz:
      case aco_opcode::s_cbranch_vccnz:
         res.va_vcc = 0;
         res.sa_sdst = 0;
         break;
      case aco_opcode::s_cbranch_scc0:
      case aco_opcode::s_cbranch_scc1:
         res.sa_sdst = 0;
         break;
      default: break;
      }
   } else if (instr->isSALU()) {
      for (const Definition& def : instr->definitions) {
         if (def.physReg() < vcc) {
            res.va_sdst = 0;
         } else if (def.physReg() <= vcc_hi) {
            res.va_vcc = 0;
         } else if (def.physReg() == exec || def.physReg() == exec_hi) {
            res.va_exec = 0;
         }
      }
      for (const Operand& op : instr->operands) {
         if (op.physReg() < vcc) {
            res.va_sdst = 0;
         } else if (op.physReg() <= vcc_hi) {
            res.va_vcc = 0;
         } else if (op.physReg() == exec || op.physReg() == exec_hi) {
            res.va_exec = 0;
         }
      }
   }

   return res;
}

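/* On GFX11+, inserts a sendmsg before s_endpgm so the wave's VGPRs can be
 * deallocated without waiting for the program to finish completely. */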
bool
dealloc_vgprs(Program* program)
{
   if (program->gfx_level < GFX11)
      return false;

   /* If we insert the sendmsg on GFX11.5, the export priority workaround will require us to insert
    * a wait after exports. There might still be pending VMEM stores for PS parameter exports,
    * except NGG lowering usually inserts a memory barrier. This means there is unlikely to be any
    * pending VMEM stores or exports if we insert the sendmsg for these stages. */
   if (program->gfx_level == GFX11_5 && (program->stage.hw == AC_HW_NEXT_GEN_GEOMETRY_SHADER ||
                                         program->stage.hw == AC_HW_PIXEL_SHADER))
      return false;

   Block& block = program->blocks.back();

   /* don't bother checking if there is a pending VMEM store or export: there almost always is */
   Builder bld(program);
   if (!block.instructions.empty() && block.instructions.back()->opcode == aco_opcode::s_endpgm) {
      bld.reset(&block.instructions, block.instructions.begin() + (block.instructions.size() - 1));
      bld.sopp(aco_opcode::s_sendmsg, sendmsg_dealloc_vgprs);
   }

   return true;
}

bool
Instruction::isTrans() const noexcept
{
   return instr_info.classes[(int)opcode] == instr_class::valu_transcendental32 ||
          instr_info.classes[(int)opcode] == instr_class::valu_double_transcendental ||
          instr_info.classes[(int)opcode] == instr_class::valu_pseudo_scalar_trans;
}

size_t
get_instr_data_size(Format format)
{
   switch (format) {
   case Format::SOP1:
   case Format::SOP2:
   case Format::SOPC:
   case Format::SOPK:
   case Format::SOPP: return sizeof(SALU_instruction);
   case Format::SMEM: return sizeof(SMEM_instruction);
   case Format::PSEUDO: return sizeof(Pseudo_instruction);
   case Format::PSEUDO_BARRIER: return sizeof(Pseudo_barrier_instruction);
   case Format::PSEUDO_REDUCTION: return sizeof(Pseudo_reduction_instruction);
   case Format::PSEUDO_BRANCH: return sizeof(Pseudo_branch_instruction);
   case Format::DS: return sizeof(DS_instruction);
   case Format::FLAT:
   case Format::GLOBAL:
   case Format::SCRATCH: return sizeof(FLAT_instruction);
   case Format::LDSDIR: return sizeof(LDSDIR_instruction);
   case Format::MTBUF: return sizeof(MTBUF_instruction);
   case Format::MUBUF: return sizeof(MUBUF_instruction);
   case Format::MIMG: return sizeof(MIMG_instruction);
   case Format::VOPD: return sizeof(VOPD_instruction);
   case Format::VINTERP_INREG: return sizeof(VINTERP_inreg_instruction);
   case Format::VINTRP: return sizeof(VINTRP_instruction);
   case Format::EXP: return sizeof(Export_instruction);
   default:
      if ((uint16_t)format & (uint16_t)Format::DPP16)
         return sizeof(DPP16_instruction);
      else if ((uint16_t)format & (uint16_t)Format::DPP8)
         return sizeof(DPP8_instruction);
      else if ((uint16_t)format & (uint16_t)Format::SDWA)
         return sizeof(SDWA_instruction);
      else
         return sizeof(VALU_instruction);
   }
}

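/* Instructions are allocated as a single block from the thread-local
 * instruction_buffer: the format-specific struct, followed by the operands and
 * then the definitions. The operand/definition spans store offsets rather than
 * pointers. */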
Instruction*
create_instruction(aco_opcode opcode, Format format, uint32_t num_operands,
                   uint32_t num_definitions)
{
   size_t size = get_instr_data_size(format);
   size_t total_size = size + num_operands * sizeof(Operand) + num_definitions * sizeof(Definition);

   void* data = instruction_buffer->allocate(total_size, alignof(uint32_t));
   memset(data, 0, total_size);
   Instruction* inst = (Instruction*)data;

   inst->opcode = opcode;
   inst->format = format;

   uint16_t operands_offset = size - offsetof(Instruction, operands);
   inst->operands = aco::span<Operand>(operands_offset, num_operands);
   uint16_t definitions_offset = (char*)inst->operands.end() - (char*)&inst->definitions;
   inst->definitions = aco::span<Definition>(definitions_offset, num_definitions);

   return inst;
}

} // namespace aco