1 /*
2 * Copyright © 2020 Valve Corporation
3 *
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8 * and/or sell copies of the Software, and to permit persons to whom the
9 * Software is furnished to do so, subject to the following conditions:
10 *
11 * The above copyright notice and this permission notice (including the next
12 * paragraph) shall be included in all copies or substantial portions of the
13 * Software.
14 *
15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
21 * IN THE SOFTWARE.
22 *
23 */
24
25 #include "aco_ir.h"
26
27 #include "aco_builder.h"
28
29 #include "util/u_debug.h"
30
31 #include "c11/threads.h"
32
33 namespace aco {
34
/* Per-thread pointer to the memory arena instructions are allocated from;
 * set to the current Program's arena by init_program(). */
thread_local aco::monotonic_buffer_resource* instruction_buffer = nullptr;

/* Bitmask of DEBUG_* flags, parsed once from the ACO_DEBUG environment
 * variable (see init_once()). */
uint64_t debug_flags = 0;

/* Maps ACO_DEBUG option names to their DEBUG_* flag bits. */
static const struct debug_control aco_debug_options[] = {
   {"validateir", DEBUG_VALIDATE_IR},
   {"validatera", DEBUG_VALIDATE_RA},
   {"novalidateir", DEBUG_NO_VALIDATE_IR},
   {"perfwarn", DEBUG_PERFWARN},
   {"force-waitcnt", DEBUG_FORCE_WAITCNT},
   {"force-waitdeps", DEBUG_FORCE_WAITDEPS},
   {"novn", DEBUG_NO_VN},
   {"noopt", DEBUG_NO_OPT},
   {"nosched", DEBUG_NO_SCHED | DEBUG_NO_SCHED_ILP | DEBUG_NO_SCHED_VOPD},
   {"nosched-ilp", DEBUG_NO_SCHED_ILP},
   {"nosched-vopd", DEBUG_NO_SCHED_VOPD},
   {"perfinfo", DEBUG_PERF_INFO},
   {"liveinfo", DEBUG_LIVE_INFO},
   {NULL, 0}};

/* Guards one-time execution of init_once() from init(). */
static once_flag init_once_flag = ONCE_FLAG_INIT;
56
/* Parses ACO_DEBUG into debug_flags and applies build-type defaults.
 * Runs exactly once, via call_once() from init(). */
static void
init_once()
{
   debug_flags = parse_debug_string(getenv("ACO_DEBUG"), aco_debug_options);

#ifndef NDEBUG
   /* enable some flags by default on debug builds */
   debug_flags |= aco::DEBUG_VALIDATE_IR;
#endif

   /* "novalidateir" overrides the debug-build default set above. */
   if (debug_flags & aco::DEBUG_NO_VALIDATE_IR)
      debug_flags &= ~aco::DEBUG_VALIDATE_IR;
}
70
/* Thread-safe, idempotent entry point for global ACO initialization
 * (currently just the debug flags). */
void
init()
{
   call_once(&init_once_flag, init_once);
}
76
/* Initializes "program" for compilation: stores the stage/config/shader info
 * and fills program->dev with per-GPU limits and feature flags (register and
 * LDS granules, wave limits, scratch offsets, NSA limits, ...).
 * If "family" is CHIP_UNKNOWN, a representative chip is picked per gfx_level.
 */
void
init_program(Program* program, Stage stage, const struct aco_shader_info* info,
             enum amd_gfx_level gfx_level, enum radeon_family family, bool wgp_mode,
             ac_shader_config* config)
{
   /* Point this thread's instruction arena at the program's memory. */
   instruction_buffer = &program->m;
   program->stage = stage;
   program->config = config;
   program->info = *info;
   program->gfx_level = gfx_level;
   if (family == CHIP_UNKNOWN) {
      /* Fall back to a representative chip of the generation. */
      switch (gfx_level) {
      case GFX6: program->family = CHIP_TAHITI; break;
      case GFX7: program->family = CHIP_BONAIRE; break;
      case GFX8: program->family = CHIP_POLARIS10; break;
      case GFX9: program->family = CHIP_VEGA10; break;
      case GFX10: program->family = CHIP_NAVI10; break;
      case GFX10_3: program->family = CHIP_NAVI21; break;
      case GFX11: program->family = CHIP_NAVI31; break;
      default: program->family = CHIP_UNKNOWN; break;
      }
   } else {
      program->family = family;
   }
   program->wave_size = info->wave_size;
   /* Lane masks are one bool per lane: one SGPR in wave32, a pair in wave64. */
   program->lane_mask = program->wave_size == 32 ? s1 : s2;

   /* LDS size granularity as encoded in the binary vs. as actually allocated
    * by the hardware. */
   program->dev.lds_encoding_granule = gfx_level >= GFX11 && stage == fragment_fs ? 1024
                                       : gfx_level >= GFX7                        ? 512
                                                                                  : 256;
   program->dev.lds_alloc_granule = gfx_level >= GFX10_3 ? 1024 : program->dev.lds_encoding_granule;

   /* GFX6: There is 64KB LDS per CU, but a single workgroup can only use 32KB. */
   program->dev.lds_limit = gfx_level >= GFX7 ? 65536 : 32768;

   /* apparently gfx702 also has 16-bank LDS but I can't find a family for that */
   program->dev.has_16bank_lds = family == CHIP_KABINI || family == CHIP_STONEY;

   /* Register-file defaults; refined per generation below. */
   program->dev.vgpr_limit = 256;
   program->dev.physical_vgprs = 256;
   program->dev.vgpr_alloc_granule = 4;

   if (gfx_level >= GFX10) {
      program->dev.physical_sgprs = 128 * 20; /* enough for max waves */
      program->dev.sgpr_alloc_granule = 128;
      program->dev.sgpr_limit =
         108; /* includes VCC, which can be treated as s[106-107] on GFX10+ */

      if (family == CHIP_NAVI31 || family == CHIP_NAVI32) {
         program->dev.physical_vgprs = program->wave_size == 32 ? 1536 : 768;
         program->dev.vgpr_alloc_granule = program->wave_size == 32 ? 24 : 12;
      } else {
         program->dev.physical_vgprs = program->wave_size == 32 ? 1024 : 512;
         if (gfx_level >= GFX10_3)
            program->dev.vgpr_alloc_granule = program->wave_size == 32 ? 16 : 8;
         else
            program->dev.vgpr_alloc_granule = program->wave_size == 32 ? 8 : 4;
      }
   } else if (program->gfx_level >= GFX8) {
      program->dev.physical_sgprs = 800;
      program->dev.sgpr_alloc_granule = 16;
      program->dev.sgpr_limit = 102;
      if (family == CHIP_TONGA || family == CHIP_ICELAND)
         program->dev.sgpr_alloc_granule = 96; /* workaround hardware bug */
   } else {
      program->dev.physical_sgprs = 512;
      program->dev.sgpr_alloc_granule = 8;
      program->dev.sgpr_limit = 104;
   }

   program->dev.scratch_alloc_granule = gfx_level >= GFX11 ? 256 : 1024;

   /* Maximum waves in flight per SIMD; varies by generation/family. */
   program->dev.max_waves_per_simd = 10;
   if (program->gfx_level >= GFX10_3)
      program->dev.max_waves_per_simd = 16;
   else if (program->gfx_level == GFX10)
      program->dev.max_waves_per_simd = 20;
   else if (program->family >= CHIP_POLARIS10 && program->family <= CHIP_VEGAM)
      program->dev.max_waves_per_simd = 8;

   program->dev.simd_per_cu = program->gfx_level >= GFX10 ? 2 : 4;

   /* XNACK (page-fault replay) is enabled on these APUs. */
   switch (program->family) {
   /* GFX8 APUs */
   case CHIP_CARRIZO:
   case CHIP_STONEY:
   /* GFX9 APUS */
   case CHIP_RAVEN:
   case CHIP_RAVEN2:
   case CHIP_RENOIR: program->dev.xnack_enabled = true; break;
   default: break;
   }

   program->dev.sram_ecc_enabled = program->family == CHIP_MI100;
   /* apparently gfx702 also has fast v_fma_f32 but I can't find a family for that */
   program->dev.has_fast_fma32 = program->gfx_level >= GFX9;
   if (program->family == CHIP_TAHITI || program->family == CHIP_CARRIZO ||
       program->family == CHIP_HAWAII)
      program->dev.has_fast_fma32 = true;
   program->dev.has_mac_legacy32 = program->gfx_level <= GFX7 || program->gfx_level >= GFX10;

   program->dev.fused_mad_mix = program->gfx_level >= GFX10;
   if (program->family == CHIP_VEGA12 || program->family == CHIP_VEGA20 ||
       program->family == CHIP_MI100 || program->family == CHIP_MI200)
      program->dev.fused_mad_mix = true;

   /* Signed immediate offset range for scratch/global memory instructions.
    * Note: left zero-initialized for GFX6/GFX7, which fall through here. */
   if (program->gfx_level >= GFX11) {
      program->dev.scratch_global_offset_min = -4096;
      program->dev.scratch_global_offset_max = 4095;
   } else if (program->gfx_level >= GFX10 || program->gfx_level == GFX8) {
      program->dev.scratch_global_offset_min = -2048;
      program->dev.scratch_global_offset_max = 2047;
   } else if (program->gfx_level == GFX9) {
      /* The minimum is actually -4096, but negative offsets are broken when SADDR is used. */
      program->dev.scratch_global_offset_min = 0;
      program->dev.scratch_global_offset_max = 4095;
   }

   if (program->gfx_level >= GFX11) {
      /* GFX11 can have only 1 NSA dword. The last VGPR isn't included here because it contains the
       * rest of the address.
       */
      program->dev.max_nsa_vgprs = 4;
   } else if (program->gfx_level >= GFX10_3) {
      /* GFX10.3 can have up to 3 NSA dwords. */
      program->dev.max_nsa_vgprs = 13;
   } else if (program->gfx_level >= GFX10) {
      /* Limit NSA instructions to 1 NSA dword on GFX10 to avoid stability issues. */
      program->dev.max_nsa_vgprs = 5;
   } else {
      program->dev.max_nsa_vgprs = 0;
   }

   program->wgp_mode = wgp_mode;

   program->progress = CompilationProgress::after_isel;

   /* Default FP mode for subsequently created instructions. */
   program->next_fp_mode.preserve_signed_zero_inf_nan32 = false;
   program->next_fp_mode.preserve_signed_zero_inf_nan16_64 = false;
   program->next_fp_mode.must_flush_denorms32 = false;
   program->next_fp_mode.must_flush_denorms16_64 = false;
   program->next_fp_mode.care_about_round32 = false;
   program->next_fp_mode.care_about_round16_64 = false;
   program->next_fp_mode.denorm16_64 = fp_denorm_keep;
   program->next_fp_mode.denorm32 = 0;
   program->next_fp_mode.round16_64 = fp_round_ne;
   program->next_fp_mode.round32 = fp_round_ne;
}
225
226 memory_sync_info
get_sync_info(const Instruction * instr)227 get_sync_info(const Instruction* instr)
228 {
229 /* Primitive Ordered Pixel Shading barriers necessary for accesses to memory shared between
230 * overlapping waves in the queue family.
231 */
232 if (instr->opcode == aco_opcode::p_pops_gfx9_overlapped_wave_wait_done ||
233 (instr->opcode == aco_opcode::s_wait_event &&
234 !(instr->sopp().imm & wait_event_imm_dont_wait_export_ready))) {
235 return memory_sync_info(storage_buffer | storage_image, semantic_acquire, scope_queuefamily);
236 } else if (instr->opcode == aco_opcode::p_pops_gfx9_ordered_section_done) {
237 return memory_sync_info(storage_buffer | storage_image, semantic_release, scope_queuefamily);
238 }
239
240 switch (instr->format) {
241 case Format::SMEM: return instr->smem().sync;
242 case Format::MUBUF: return instr->mubuf().sync;
243 case Format::MIMG: return instr->mimg().sync;
244 case Format::MTBUF: return instr->mtbuf().sync;
245 case Format::FLAT:
246 case Format::GLOBAL:
247 case Format::SCRATCH: return instr->flatlike().sync;
248 case Format::DS: return instr->ds().sync;
249 case Format::LDSDIR: return instr->ldsdir().sync;
250 default: return memory_sync_info();
251 }
252 }
253
/* Returns whether "instr" can be encoded as (or converted to) SDWA on the
 * given gfx_level. With pre_ra=true, restrictions that depend on register
 * assignment (implicit VCC uses) are assumed satisfiable. */
bool
can_use_SDWA(amd_gfx_level gfx_level, const aco_ptr<Instruction>& instr, bool pre_ra)
{
   if (!instr->isVALU())
      return false;

   /* SDWA only exists on GFX8-GFX10.x and can't be combined with DPP/VOP3P. */
   if (gfx_level < GFX8 || gfx_level >= GFX11 || instr->isDPP() || instr->isVOP3P())
      return false;

   if (instr->isSDWA())
      return true;

   if (instr->isVOP3()) {
      VALU_instruction& vop3 = instr->valu();
      /* Only VOP1/VOP2/VOPC promoted to VOP3 qualify; plain VOP3 opcodes
       * have no SDWA form. */
      if (instr->format == Format::VOP3)
         return false;
      if (vop3.clamp && instr->isVOPC() && gfx_level != GFX8)
         return false;
      if (vop3.omod && gfx_level < GFX9)
         return false;

      // TODO: return true if we know we will use vcc
      if (!pre_ra && instr->definitions.size() >= 2)
         return false;

      for (unsigned i = 1; i < instr->operands.size(); i++) {
         /* Literals can't be encoded; before GFX9, non-first sources must be
          * VGPRs. */
         if (instr->operands[i].isLiteral())
            return false;
         if (gfx_level < GFX9 && !instr->operands[i].isOfType(RegType::vgpr))
            return false;
      }
   }

   /* Definitions larger than a dword are only allowed for VOPC. */
   if (!instr->definitions.empty() && instr->definitions[0].bytes() > 4 && !instr->isVOPC())
      return false;

   if (!instr->operands.empty()) {
      if (instr->operands[0].isLiteral())
         return false;
      if (gfx_level < GFX9 && !instr->operands[0].isOfType(RegType::vgpr))
         return false;
      /* Sources larger than a dword aren't supported. */
      if (instr->operands[0].bytes() > 4)
         return false;
      if (instr->operands.size() > 1 && instr->operands[1].bytes() > 4)
         return false;
   }

   bool is_mac = instr->opcode == aco_opcode::v_mac_f32 || instr->opcode == aco_opcode::v_mac_f16 ||
                 instr->opcode == aco_opcode::v_fmac_f32 || instr->opcode == aco_opcode::v_fmac_f16;

   if (gfx_level != GFX8 && is_mac)
      return false;

   // TODO: return true if we know we will use vcc
   if (!pre_ra && instr->isVOPC() && gfx_level == GFX8)
      return false;
   /* A third source would have to be VCC after RA (MAC's destination read is
    * the exception) — presumably not provable here; verify at call sites. */
   if (!pre_ra && instr->operands.size() >= 3 && !is_mac)
      return false;

   /* Remaining opcodes with no SDWA encoding. */
   return instr->opcode != aco_opcode::v_madmk_f32 && instr->opcode != aco_opcode::v_madak_f32 &&
          instr->opcode != aco_opcode::v_madmk_f16 && instr->opcode != aco_opcode::v_madak_f16 &&
          instr->opcode != aco_opcode::v_fmamk_f32 && instr->opcode != aco_opcode::v_fmaak_f32 &&
          instr->opcode != aco_opcode::v_fmamk_f16 && instr->opcode != aco_opcode::v_fmaak_f16 &&
          instr->opcode != aco_opcode::v_readfirstlane_b32 &&
          instr->opcode != aco_opcode::v_clrexcp && instr->opcode != aco_opcode::v_swap_b32;
}
320
321 /* updates "instr" and returns the old instruction (or NULL if no update was needed) */
322 aco_ptr<Instruction>
convert_to_SDWA(amd_gfx_level gfx_level,aco_ptr<Instruction> & instr)323 convert_to_SDWA(amd_gfx_level gfx_level, aco_ptr<Instruction>& instr)
324 {
325 if (instr->isSDWA())
326 return NULL;
327
328 aco_ptr<Instruction> tmp = std::move(instr);
329 Format format = asSDWA(withoutVOP3(tmp->format));
330 instr.reset(create_instruction<SDWA_instruction>(tmp->opcode, format, tmp->operands.size(),
331 tmp->definitions.size()));
332 std::copy(tmp->operands.cbegin(), tmp->operands.cend(), instr->operands.begin());
333 std::copy(tmp->definitions.cbegin(), tmp->definitions.cend(), instr->definitions.begin());
334
335 SDWA_instruction& sdwa = instr->sdwa();
336
337 if (tmp->isVOP3()) {
338 VALU_instruction& vop3 = tmp->valu();
339 sdwa.neg = vop3.neg;
340 sdwa.abs = vop3.abs;
341 sdwa.omod = vop3.omod;
342 sdwa.clamp = vop3.clamp;
343 }
344
345 for (unsigned i = 0; i < instr->operands.size(); i++) {
346 /* SDWA only uses operands 0 and 1. */
347 if (i >= 2)
348 break;
349
350 sdwa.sel[i] = SubdwordSel(instr->operands[i].bytes(), 0, false);
351 }
352
353 sdwa.dst_sel = SubdwordSel(instr->definitions[0].bytes(), 0, false);
354
355 if (instr->definitions[0].getTemp().type() == RegType::sgpr && gfx_level == GFX8)
356 instr->definitions[0].setFixed(vcc);
357 if (instr->definitions.size() >= 2)
358 instr->definitions[1].setFixed(vcc);
359 if (instr->operands.size() >= 3)
360 instr->operands[2].setFixed(vcc);
361
362 instr->pass_flags = tmp->pass_flags;
363
364 return tmp;
365 }
366
367 bool
can_use_DPP(amd_gfx_level gfx_level,const aco_ptr<Instruction> & instr,bool dpp8)368 can_use_DPP(amd_gfx_level gfx_level, const aco_ptr<Instruction>& instr, bool dpp8)
369 {
370 assert(instr->isVALU() && !instr->operands.empty());
371
372 if (instr->isDPP())
373 return instr->isDPP8() == dpp8;
374
375 if (instr->isSDWA() || instr->isVINTERP_INREG())
376 return false;
377
378 if ((instr->format == Format::VOP3 || instr->isVOP3P()) && gfx_level < GFX11)
379 return false;
380
381 if ((instr->isVOPC() || instr->definitions.size() > 1) && instr->definitions.back().isFixed() &&
382 instr->definitions.back().physReg() != vcc && gfx_level < GFX11)
383 return false;
384
385 if (instr->operands.size() >= 3 && instr->operands[2].isFixed() &&
386 instr->operands[2].isOfType(RegType::sgpr) && instr->operands[2].physReg() != vcc &&
387 gfx_level < GFX11)
388 return false;
389
390 if (instr->isVOP3() && gfx_level < GFX11) {
391 const VALU_instruction* vop3 = &instr->valu();
392 if (vop3->clamp || vop3->omod)
393 return false;
394 if (dpp8)
395 return false;
396 }
397
398 for (unsigned i = 0; i < instr->operands.size(); i++) {
399 if (instr->operands[i].isLiteral())
400 return false;
401 if (!instr->operands[i].isOfType(RegType::vgpr) && i < 2)
402 return false;
403 }
404
405 /* According to LLVM, it's unsafe to combine DPP into v_cmpx. */
406 if (instr->writes_exec())
407 return false;
408
409 /* simpler than listing all VOP3P opcodes which do not support DPP */
410 if (instr->isVOP3P()) {
411 return instr->opcode == aco_opcode::v_fma_mix_f32 ||
412 instr->opcode == aco_opcode::v_fma_mixlo_f16 ||
413 instr->opcode == aco_opcode::v_fma_mixhi_f16 ||
414 instr->opcode == aco_opcode::v_dot2_f32_f16 ||
415 instr->opcode == aco_opcode::v_dot2_f32_bf16;
416 }
417
418 if (instr->opcode == aco_opcode::v_pk_fmac_f16)
419 return gfx_level < GFX11;
420
421 /* there are more cases but those all take 64-bit inputs */
422 return instr->opcode != aco_opcode::v_madmk_f32 && instr->opcode != aco_opcode::v_madak_f32 &&
423 instr->opcode != aco_opcode::v_madmk_f16 && instr->opcode != aco_opcode::v_madak_f16 &&
424 instr->opcode != aco_opcode::v_fmamk_f32 && instr->opcode != aco_opcode::v_fmaak_f32 &&
425 instr->opcode != aco_opcode::v_fmamk_f16 && instr->opcode != aco_opcode::v_fmaak_f16 &&
426 instr->opcode != aco_opcode::v_readfirstlane_b32 &&
427 instr->opcode != aco_opcode::v_cvt_f64_i32 &&
428 instr->opcode != aco_opcode::v_cvt_f64_f32 &&
429 instr->opcode != aco_opcode::v_cvt_f64_u32 && instr->opcode != aco_opcode::v_mul_lo_u32 &&
430 instr->opcode != aco_opcode::v_mul_lo_i32 && instr->opcode != aco_opcode::v_mul_hi_u32 &&
431 instr->opcode != aco_opcode::v_mul_hi_i32 &&
432 instr->opcode != aco_opcode::v_qsad_pk_u16_u8 &&
433 instr->opcode != aco_opcode::v_mqsad_pk_u16_u8 &&
434 instr->opcode != aco_opcode::v_mqsad_u32_u8 &&
435 instr->opcode != aco_opcode::v_mad_u64_u32 &&
436 instr->opcode != aco_opcode::v_mad_i64_i32 &&
437 instr->opcode != aco_opcode::v_permlane16_b32 &&
438 instr->opcode != aco_opcode::v_permlanex16_b32 &&
439 instr->opcode != aco_opcode::v_permlane64_b32 &&
440 instr->opcode != aco_opcode::v_readlane_b32_e64 &&
441 instr->opcode != aco_opcode::v_writelane_b32_e64;
442 }
443
/* Converts "instr" to a DPP8/DPP16 instruction with identity swizzle and
 * returns the old instruction (or NULL if no update was needed). */
aco_ptr<Instruction>
convert_to_DPP(amd_gfx_level gfx_level, aco_ptr<Instruction>& instr, bool dpp8)
{
   if (instr->isDPP())
      return NULL;

   /* Allocate the DPP variant and copy operands/definitions over. */
   aco_ptr<Instruction> tmp = std::move(instr);
   Format format =
      (Format)((uint32_t)tmp->format | (uint32_t)(dpp8 ? Format::DPP8 : Format::DPP16));
   if (dpp8)
      instr.reset(create_instruction<DPP8_instruction>(tmp->opcode, format, tmp->operands.size(),
                                                       tmp->definitions.size()));
   else
      instr.reset(create_instruction<DPP16_instruction>(tmp->opcode, format, tmp->operands.size(),
                                                        tmp->definitions.size()));
   std::copy(tmp->operands.cbegin(), tmp->operands.cend(), instr->operands.begin());
   std::copy(tmp->definitions.cbegin(), tmp->definitions.cend(), instr->definitions.begin());

   /* Initialize the DPP fields with an identity (no-op) lane selection. */
   if (dpp8) {
      DPP8_instruction* dpp = &instr->dpp8();
      dpp->lane_sel = 0xfac688; /* [0,1,2,3,4,5,6,7] */
      dpp->fetch_inactive = gfx_level >= GFX10;
   } else {
      DPP16_instruction* dpp = &instr->dpp16();
      dpp->dpp_ctrl = dpp_quad_perm(0, 1, 2, 3);
      dpp->row_mask = 0xf;
      dpp->bank_mask = 0xf;
      dpp->fetch_inactive = gfx_level >= GFX10;
   }

   /* Preserve all VALU modifiers. */
   instr->valu().neg = tmp->valu().neg;
   instr->valu().abs = tmp->valu().abs;
   instr->valu().omod = tmp->valu().omod;
   instr->valu().clamp = tmp->valu().clamp;
   instr->valu().opsel = tmp->valu().opsel;
   instr->valu().opsel_lo = tmp->valu().opsel_lo;
   instr->valu().opsel_hi = tmp->valu().opsel_hi;

   /* Pre-GFX11, SGPR definitions and the third source must be VCC with DPP
    * (matching the checks in can_use_DPP). */
   if ((instr->isVOPC() || instr->definitions.size() > 1) && gfx_level < GFX11)
      instr->definitions.back().setFixed(vcc);

   if (instr->operands.size() >= 3 && instr->operands[2].isOfType(RegType::sgpr) &&
       gfx_level < GFX11)
      instr->operands[2].setFixed(vcc);

   instr->pass_flags = tmp->pass_flags;

   /* DPP16 supports input modifiers, so we might no longer need VOP3. */
   bool remove_vop3 = !dpp8 && !instr->valu().omod && !instr->valu().clamp &&
                      (instr->isVOP1() || instr->isVOP2() || instr->isVOPC());

   /* VOPC/add_co/sub_co definition needs VCC without VOP3. */
   remove_vop3 &= instr->definitions.back().regClass().type() != RegType::sgpr ||
                  !instr->definitions.back().isFixed() ||
                  instr->definitions.back().physReg() == vcc;

   /* addc/subb/cndmask 3rd operand needs VCC without VOP3. */
   remove_vop3 &= instr->operands.size() < 3 || !instr->operands[2].isFixed() ||
                  instr->operands[2].isOfType(RegType::vgpr) || instr->operands[2].physReg() == vcc;

   if (remove_vop3)
      instr->format = withoutVOP3(instr->format);

   return tmp;
}
509
510 bool
can_use_input_modifiers(amd_gfx_level gfx_level,aco_opcode op,int idx)511 can_use_input_modifiers(amd_gfx_level gfx_level, aco_opcode op, int idx)
512 {
513 if (op == aco_opcode::v_mov_b32)
514 return gfx_level >= GFX10;
515
516 if (op == aco_opcode::v_ldexp_f16 || op == aco_opcode::v_ldexp_f32 ||
517 op == aco_opcode::v_ldexp_f64)
518 return idx == 0;
519
520 return instr_info.can_use_input_modifiers[(int)op];
521 }
522
/* Returns whether the opsel modifier can be used for operand "idx" of
 * opcode "op" (idx == -1 refers to the definition). */
bool
can_use_opsel(amd_gfx_level gfx_level, aco_opcode op, int idx)
{
   /* opsel is only GFX9+ */
   if (gfx_level < GFX9)
      return false;

   switch (op) {
   /* These support opsel on the definition and on every operand. */
   case aco_opcode::v_div_fixup_f16:
   case aco_opcode::v_fma_f16:
   case aco_opcode::v_mad_f16:
   case aco_opcode::v_mad_u16:
   case aco_opcode::v_mad_i16:
   case aco_opcode::v_med3_f16:
   case aco_opcode::v_med3_i16:
   case aco_opcode::v_med3_u16:
   case aco_opcode::v_min3_f16:
   case aco_opcode::v_min3_i16:
   case aco_opcode::v_min3_u16:
   case aco_opcode::v_max3_f16:
   case aco_opcode::v_max3_i16:
   case aco_opcode::v_max3_u16:
   case aco_opcode::v_minmax_f16:
   case aco_opcode::v_maxmin_f16:
   case aco_opcode::v_max_u16_e64:
   case aco_opcode::v_max_i16_e64:
   case aco_opcode::v_min_u16_e64:
   case aco_opcode::v_min_i16_e64:
   case aco_opcode::v_add_i16:
   case aco_opcode::v_sub_i16:
   case aco_opcode::v_add_u16_e64:
   case aco_opcode::v_sub_u16_e64:
   case aco_opcode::v_lshlrev_b16_e64:
   case aco_opcode::v_lshrrev_b16_e64:
   case aco_opcode::v_ashrrev_i16_e64:
   case aco_opcode::v_and_b16:
   case aco_opcode::v_or_b16:
   case aco_opcode::v_xor_b16:
   case aco_opcode::v_mul_lo_u16_e64: return true;
   /* Packing instructions: opsel on sources only, not the definition. */
   case aco_opcode::v_pack_b32_f16:
   case aco_opcode::v_cvt_pknorm_i16_f16:
   case aco_opcode::v_cvt_pknorm_u16_f16: return idx != -1;
   /* Only the two 16-bit multiplicands support opsel. */
   case aco_opcode::v_mad_u32_u16:
   case aco_opcode::v_mad_i32_i16: return idx >= 0 && idx < 2;
   case aco_opcode::v_dot2_f16_f16:
   case aco_opcode::v_dot2_bf16_bf16: return idx == -1 || idx == 2;
   case aco_opcode::v_cndmask_b16: return idx != 2;
   case aco_opcode::v_interp_p10_f16_f32_inreg:
   case aco_opcode::v_interp_p10_rtz_f16_f32_inreg: return idx == 0 || idx == 2;
   case aco_opcode::v_interp_p2_f16_f32_inreg:
   case aco_opcode::v_interp_p2_rtz_f16_f32_inreg: return idx == -1 || idx == 0;
   default:
      /* On GFX11+, true16 opcodes use opsel bits for the high-half selection
       * (bit 3 of the mask covers the definition). */
      return gfx_level >= GFX11 && (get_gfx11_true16_mask(op) & BITFIELD_BIT(idx == -1 ? 3 : idx));
   }
}
578
579 bool
can_write_m0(const aco_ptr<Instruction> & instr)580 can_write_m0(const aco_ptr<Instruction>& instr)
581 {
582 if (instr->isSALU())
583 return true;
584
585 /* VALU can't write m0 on any GPU generations. */
586 if (instr->isVALU())
587 return false;
588
589 switch (instr->opcode) {
590 case aco_opcode::p_parallelcopy:
591 case aco_opcode::p_extract:
592 case aco_opcode::p_insert:
593 /* These pseudo instructions are implemented with SALU when writing m0. */
594 return true;
595 default:
596 /* Assume that no other instructions can write m0. */
597 return false;
598 }
599 }
600
/* Returns whether opcode "op" performs a true 16-bit write on the given
 * gfx_level, i.e. leaves the other half of the destination VGPR intact
 * (presumably relevant for packing two 16-bit values into one VGPR). */
bool
instr_is_16bit(amd_gfx_level gfx_level, aco_opcode op)
{
   /* partial register writes are GFX9+, only */
   if (gfx_level < GFX9)
      return false;

   switch (op) {
   /* VOP3 */
   case aco_opcode::v_mad_f16:
   case aco_opcode::v_mad_u16:
   case aco_opcode::v_mad_i16:
   case aco_opcode::v_fma_f16:
   case aco_opcode::v_div_fixup_f16:
   case aco_opcode::v_interp_p2_f16:
   case aco_opcode::v_fma_mixlo_f16:
   case aco_opcode::v_fma_mixhi_f16:
   /* VOP2 */
   case aco_opcode::v_mac_f16:
   case aco_opcode::v_madak_f16:
   case aco_opcode::v_madmk_f16: return gfx_level >= GFX9;
   /* The following are 16-bit writes only from GFX10 onwards. */
   case aco_opcode::v_add_f16:
   case aco_opcode::v_sub_f16:
   case aco_opcode::v_subrev_f16:
   case aco_opcode::v_mul_f16:
   case aco_opcode::v_max_f16:
   case aco_opcode::v_min_f16:
   case aco_opcode::v_ldexp_f16:
   case aco_opcode::v_fmac_f16:
   case aco_opcode::v_fmamk_f16:
   case aco_opcode::v_fmaak_f16:
   /* VOP1 */
   case aco_opcode::v_cvt_f16_f32:
   case aco_opcode::p_cvt_f16_f32_rtne:
   case aco_opcode::v_cvt_f16_u16:
   case aco_opcode::v_cvt_f16_i16:
   case aco_opcode::v_rcp_f16:
   case aco_opcode::v_sqrt_f16:
   case aco_opcode::v_rsq_f16:
   case aco_opcode::v_log_f16:
   case aco_opcode::v_exp_f16:
   case aco_opcode::v_frexp_mant_f16:
   case aco_opcode::v_frexp_exp_i16_f16:
   case aco_opcode::v_floor_f16:
   case aco_opcode::v_ceil_f16:
   case aco_opcode::v_trunc_f16:
   case aco_opcode::v_rndne_f16:
   case aco_opcode::v_fract_f16:
   case aco_opcode::v_sin_f16:
   case aco_opcode::v_cos_f16:
   case aco_opcode::v_cvt_u16_f16:
   case aco_opcode::v_cvt_i16_f16:
   case aco_opcode::v_cvt_norm_i16_f16:
   case aco_opcode::v_cvt_norm_u16_f16: return gfx_level >= GFX10;
   /* on GFX10, all opsel instructions preserve the high bits */
   default: return gfx_level >= GFX10 && can_use_opsel(gfx_level, op, -1);
   }
}
659
660 /* On GFX11, for some instructions, bit 7 of the destination/operand vgpr is opsel and the field
661 * only supports v0-v127.
662 * The first three bits are used for operands 0-2, and the 4th bit is used for the destination.
663 */
/* Returns a bitmask describing which operands/definition of "op" support the
 * GFX11 true16 encoding: bits 0-2 for operands 0-2, bit 3 for the
 * destination (0x0 means no true16 support). */
uint8_t
get_gfx11_true16_mask(aco_opcode op)
{
   switch (op) {
   /* VOP1-style: source 0 and the destination. */
   case aco_opcode::v_ceil_f16:
   case aco_opcode::v_cos_f16:
   case aco_opcode::v_cvt_f16_i16:
   case aco_opcode::v_cvt_f16_u16:
   case aco_opcode::v_cvt_i16_f16:
   case aco_opcode::v_cvt_u16_f16:
   case aco_opcode::v_cvt_norm_i16_f16:
   case aco_opcode::v_cvt_norm_u16_f16:
   case aco_opcode::v_exp_f16:
   case aco_opcode::v_floor_f16:
   case aco_opcode::v_fract_f16:
   case aco_opcode::v_frexp_exp_i16_f16:
   case aco_opcode::v_frexp_mant_f16:
   case aco_opcode::v_log_f16:
   case aco_opcode::v_not_b16:
   case aco_opcode::v_rcp_f16:
   case aco_opcode::v_rndne_f16:
   case aco_opcode::v_rsq_f16:
   case aco_opcode::v_sin_f16:
   case aco_opcode::v_sqrt_f16:
   case aco_opcode::v_trunc_f16:
   case aco_opcode::v_mov_b16: return 0x1 | 0x8;
   /* VOP2-style: sources 0-1 and the destination. */
   case aco_opcode::v_add_f16:
   case aco_opcode::v_fmaak_f16:
   case aco_opcode::v_fmac_f16:
   case aco_opcode::v_fmamk_f16:
   case aco_opcode::v_ldexp_f16:
   case aco_opcode::v_max_f16:
   case aco_opcode::v_min_f16:
   case aco_opcode::v_mul_f16:
   case aco_opcode::v_sub_f16:
   case aco_opcode::v_subrev_f16:
   case aco_opcode::v_and_b16:
   case aco_opcode::v_or_b16:
   case aco_opcode::v_xor_b16: return 0x3 | 0x8;
   /* 16-bit source, 32-bit destination: source 0 only. */
   case aco_opcode::v_cvt_f32_f16:
   case aco_opcode::v_cvt_i32_i16:
   case aco_opcode::v_cvt_u32_u16: return 0x1;
   /* Comparisons: both sources, but the definition is a lane mask. */
   case aco_opcode::v_cmp_class_f16:
   case aco_opcode::v_cmp_eq_f16:
   case aco_opcode::v_cmp_eq_i16:
   case aco_opcode::v_cmp_eq_u16:
   case aco_opcode::v_cmp_ge_f16:
   case aco_opcode::v_cmp_ge_i16:
   case aco_opcode::v_cmp_ge_u16:
   case aco_opcode::v_cmp_gt_f16:
   case aco_opcode::v_cmp_gt_i16:
   case aco_opcode::v_cmp_gt_u16:
   case aco_opcode::v_cmp_le_f16:
   case aco_opcode::v_cmp_le_i16:
   case aco_opcode::v_cmp_le_u16:
   case aco_opcode::v_cmp_lg_f16:
   case aco_opcode::v_cmp_lg_i16:
   case aco_opcode::v_cmp_lg_u16:
   case aco_opcode::v_cmp_lt_f16:
   case aco_opcode::v_cmp_lt_i16:
   case aco_opcode::v_cmp_lt_u16:
   case aco_opcode::v_cmp_neq_f16:
   case aco_opcode::v_cmp_nge_f16:
   case aco_opcode::v_cmp_ngt_f16:
   case aco_opcode::v_cmp_nle_f16:
   case aco_opcode::v_cmp_nlg_f16:
   case aco_opcode::v_cmp_nlt_f16:
   case aco_opcode::v_cmp_o_f16:
   case aco_opcode::v_cmp_u_f16:
   case aco_opcode::v_cmpx_class_f16:
   case aco_opcode::v_cmpx_eq_f16:
   case aco_opcode::v_cmpx_eq_i16:
   case aco_opcode::v_cmpx_eq_u16:
   case aco_opcode::v_cmpx_ge_f16:
   case aco_opcode::v_cmpx_ge_i16:
   case aco_opcode::v_cmpx_ge_u16:
   case aco_opcode::v_cmpx_gt_f16:
   case aco_opcode::v_cmpx_gt_i16:
   case aco_opcode::v_cmpx_gt_u16:
   case aco_opcode::v_cmpx_le_f16:
   case aco_opcode::v_cmpx_le_i16:
   case aco_opcode::v_cmpx_le_u16:
   case aco_opcode::v_cmpx_lg_f16:
   case aco_opcode::v_cmpx_lg_i16:
   case aco_opcode::v_cmpx_lg_u16:
   case aco_opcode::v_cmpx_lt_f16:
   case aco_opcode::v_cmpx_lt_i16:
   case aco_opcode::v_cmpx_lt_u16:
   case aco_opcode::v_cmpx_neq_f16:
   case aco_opcode::v_cmpx_nge_f16:
   case aco_opcode::v_cmpx_ngt_f16:
   case aco_opcode::v_cmpx_nle_f16:
   case aco_opcode::v_cmpx_nlg_f16:
   case aco_opcode::v_cmpx_nlt_f16:
   case aco_opcode::v_cmpx_o_f16:
   case aco_opcode::v_cmpx_u_f16: return 0x3;
   /* 32-bit (or packed) sources, 16-bit destination: destination only. */
   case aco_opcode::v_cvt_f16_f32:
   case aco_opcode::v_sat_pk_u8_i16: return 0x8;
   default: return 0x0;
   }
}
765
766 uint32_t
get_reduction_identity(ReduceOp op,unsigned idx)767 get_reduction_identity(ReduceOp op, unsigned idx)
768 {
769 switch (op) {
770 case iadd8:
771 case iadd16:
772 case iadd32:
773 case iadd64:
774 case fadd16:
775 case fadd32:
776 case fadd64:
777 case ior8:
778 case ior16:
779 case ior32:
780 case ior64:
781 case ixor8:
782 case ixor16:
783 case ixor32:
784 case ixor64:
785 case umax8:
786 case umax16:
787 case umax32:
788 case umax64: return 0;
789 case imul8:
790 case imul16:
791 case imul32:
792 case imul64: return idx ? 0 : 1;
793 case fmul16: return 0x3c00u; /* 1.0 */
794 case fmul32: return 0x3f800000u; /* 1.0 */
795 case fmul64: return idx ? 0x3ff00000u : 0u; /* 1.0 */
796 case imin8: return INT8_MAX;
797 case imin16: return INT16_MAX;
798 case imin32: return INT32_MAX;
799 case imin64: return idx ? 0x7fffffffu : 0xffffffffu;
800 case imax8: return INT8_MIN;
801 case imax16: return INT16_MIN;
802 case imax32: return INT32_MIN;
803 case imax64: return idx ? 0x80000000u : 0;
804 case umin8:
805 case umin16:
806 case iand8:
807 case iand16: return 0xffffffffu;
808 case umin32:
809 case umin64:
810 case iand32:
811 case iand64: return 0xffffffffu;
812 case fmin16: return 0x7c00u; /* infinity */
813 case fmin32: return 0x7f800000u; /* infinity */
814 case fmin64: return idx ? 0x7ff00000u : 0u; /* infinity */
815 case fmax16: return 0xfc00u; /* negative infinity */
816 case fmax32: return 0xff800000u; /* negative infinity */
817 case fmax64: return idx ? 0xfff00000u : 0u; /* negative infinity */
818 default: unreachable("Invalid reduction operation"); break;
819 }
820 return 0;
821 }
822
823 unsigned
get_operand_size(aco_ptr<Instruction> & instr,unsigned index)824 get_operand_size(aco_ptr<Instruction>& instr, unsigned index)
825 {
826 if (instr->isPseudo())
827 return instr->operands[index].bytes() * 8u;
828 else if (instr->opcode == aco_opcode::v_mad_u64_u32 ||
829 instr->opcode == aco_opcode::v_mad_i64_i32)
830 return index == 2 ? 64 : 32;
831 else if (instr->opcode == aco_opcode::v_fma_mix_f32 ||
832 instr->opcode == aco_opcode::v_fma_mixlo_f16 ||
833 instr->opcode == aco_opcode::v_fma_mixhi_f16)
834 return instr->valu().opsel_hi[index] ? 16 : 32;
835 else if (instr->isVALU() || instr->isSALU())
836 return instr_info.operand_size[(int)instr->opcode];
837 else
838 return 0;
839 }
840
/* Returns whether "instr" requires a correct exec mask to behave as intended
 * (used e.g. to decide whether exec must be maintained around it). */
bool
needs_exec_mask(const Instruction* instr)
{
   /* All VALU is implicitly predicated by exec, except the lane-access ops. */
   if (instr->isVALU()) {
      return instr->opcode != aco_opcode::v_readlane_b32 &&
             instr->opcode != aco_opcode::v_readlane_b32_e64 &&
             instr->opcode != aco_opcode::v_writelane_b32 &&
             instr->opcode != aco_opcode::v_writelane_b32_e64;
   }

   if (instr->isVMEM() || instr->isFlatLike())
      return true;

   /* Scalar instructions only depend on exec if they explicitly read it. */
   if (instr->isSALU() || instr->isBranch() || instr->isSMEM() || instr->isBarrier())
      return instr->reads_exec();

   if (instr->isPseudo()) {
      switch (instr->opcode) {
      case aco_opcode::p_create_vector:
      case aco_opcode::p_extract_vector:
      case aco_opcode::p_split_vector:
      case aco_opcode::p_phi:
      case aco_opcode::p_parallelcopy:
         /* These need exec as soon as any VGPR is written (lowered to VALU). */
         for (Definition def : instr->definitions) {
            if (def.getTemp().type() == RegType::vgpr)
               return true;
         }
         return instr->reads_exec();
      case aco_opcode::p_spill:
      case aco_opcode::p_reload:
      case aco_opcode::p_end_linear_vgpr:
      case aco_opcode::p_logical_start:
      case aco_opcode::p_logical_end:
      case aco_opcode::p_startpgm:
      case aco_opcode::p_end_wqm:
      case aco_opcode::p_init_scratch: return instr->reads_exec();
      /* Only needs exec when it copies an initial value into the VGPR. */
      case aco_opcode::p_start_linear_vgpr: return instr->operands.size();
      default: break;
      }
   }

   /* Conservative default for anything not classified above. */
   return true;
}
884
/* Related opcode variants of a VALU comparison, filled in by get_cmp_info().
 * Variants that don't exist for a given opcode are set to num_opcodes.
 */
struct CmpInfo {
   aco_opcode ordered;   /* ordered version of the FP comparison (lt/eq/...) */
   aco_opcode unordered; /* unordered version (nge/nlg/...) */
   aco_opcode swapped;   /* same comparison with the operands swapped */
   aco_opcode inverse;   /* logical negation of the comparison */
   aco_opcode vcmpx;     /* exec-writing v_cmpx variant */
   aco_opcode f32;       /* same FP comparison operating on 32-bit floats */
   unsigned size;        /* operand bit size: 16, 32 or 64 */
};
894
/* Fills *info with the related opcode variants of the given comparison opcode.
 * Returns false (leaving *info mostly unset) when op is not a recognized
 * v_cmp_* opcode; note that v_cmpx_* opcodes are not in this table.
 */
ALWAYS_INLINE bool
get_cmp_info(aco_opcode op, CmpInfo* info)
{
   /* Default every variant to "does not exist". */
   info->ordered = aco_opcode::num_opcodes;
   info->unordered = aco_opcode::num_opcodes;
   info->swapped = aco_opcode::num_opcodes;
   info->inverse = aco_opcode::num_opcodes;
   info->f32 = aco_opcode::num_opcodes;
   info->vcmpx = aco_opcode::num_opcodes;
   switch (op) {
      // clang-format off
   /* CMP2 handles one ordered/unordered FP comparison pair at bit size sz:
    * both the ordered (v_cmp_<ord>) and unordered (v_cmp_n<unord>) opcodes
    * fall into the same case body, and each field is selected based on which
    * of the two `op` actually is. */
#define CMP2(ord, unord, ord_swap, unord_swap, sz)                                                 \
   case aco_opcode::v_cmp_##ord##_f##sz:                                                           \
   case aco_opcode::v_cmp_n##unord##_f##sz:                                                        \
      info->ordered = aco_opcode::v_cmp_##ord##_f##sz;                                             \
      info->unordered = aco_opcode::v_cmp_n##unord##_f##sz;                                        \
      info->swapped = op == aco_opcode::v_cmp_##ord##_f##sz ? aco_opcode::v_cmp_##ord_swap##_f##sz \
                                                            : aco_opcode::v_cmp_n##unord_swap##_f##sz; \
      info->inverse = op == aco_opcode::v_cmp_n##unord##_f##sz ? aco_opcode::v_cmp_##unord##_f##sz \
                                                               : aco_opcode::v_cmp_n##ord##_f##sz; \
      info->f32 = op == aco_opcode::v_cmp_##ord##_f##sz ? aco_opcode::v_cmp_##ord##_f32            \
                                                        : aco_opcode::v_cmp_n##unord##_f32;        \
      info->vcmpx = op == aco_opcode::v_cmp_##ord##_f##sz ? aco_opcode::v_cmpx_##ord##_f##sz       \
                                                          : aco_opcode::v_cmpx_n##unord##_f##sz;   \
      info->size = sz;                                                                             \
      return true;
   /* Instantiate CMP2 for all three FP sizes. */
#define CMP(ord, unord, ord_swap, unord_swap)                                                      \
   CMP2(ord, unord, ord_swap, unord_swap, 16)                                                      \
   CMP2(ord, unord, ord_swap, unord_swap, 32)                                                      \
   CMP2(ord, unord, ord_swap, unord_swap, 64)
      CMP(lt, /*n*/ge, gt, /*n*/le)
      CMP(eq, /*n*/lg, eq, /*n*/lg)
      CMP(le, /*n*/gt, ge, /*n*/lt)
      CMP(gt, /*n*/le, lt, /*n*/ge)
      CMP(lg, /*n*/eq, lg, /*n*/eq)
      CMP(ge, /*n*/lt, le, /*n*/gt)
#undef CMP
#undef CMP2
   /* Pure orderedness tests: v_cmp_u (unordered, i.e. either operand NaN) and
    * v_cmp_o (ordered). They are symmetric, so `swapped` is the opcode itself,
    * and u/o are each other's inverse. */
#define ORD_TEST(sz)                                                                               \
   case aco_opcode::v_cmp_u_f##sz:                                                                 \
      info->f32 = aco_opcode::v_cmp_u_f32;                                                         \
      info->swapped = aco_opcode::v_cmp_u_f##sz;                                                   \
      info->inverse = aco_opcode::v_cmp_o_f##sz;                                                   \
      info->vcmpx = aco_opcode::v_cmpx_u_f##sz;                                                    \
      info->size = sz;                                                                             \
      return true;                                                                                 \
   case aco_opcode::v_cmp_o_f##sz:                                                                 \
      info->f32 = aco_opcode::v_cmp_o_f32;                                                         \
      info->swapped = aco_opcode::v_cmp_o_f##sz;                                                   \
      info->inverse = aco_opcode::v_cmp_u_f##sz;                                                   \
      info->vcmpx = aco_opcode::v_cmpx_o_f##sz;                                                    \
      info->size = sz;                                                                             \
      return true;
      ORD_TEST(16)
      ORD_TEST(32)
      ORD_TEST(64)
#undef ORD_TEST
   /* Integer comparisons: no ordered/unordered/f32 variants, only
    * swapped/inverse/vcmpx. */
#define CMPI2(op, swap, inv, type, sz)                                                             \
   case aco_opcode::v_cmp_##op##_##type##sz:                                                       \
      info->swapped = aco_opcode::v_cmp_##swap##_##type##sz;                                       \
      info->inverse = aco_opcode::v_cmp_##inv##_##type##sz;                                        \
      info->vcmpx = aco_opcode::v_cmpx_##op##_##type##sz;                                          \
      info->size = sz;                                                                             \
      return true;
   /* Instantiate for signed and unsigned at all three sizes. */
#define CMPI(op, swap, inv)                                                                        \
   CMPI2(op, swap, inv, i, 16)                                                                     \
   CMPI2(op, swap, inv, u, 16)                                                                     \
   CMPI2(op, swap, inv, i, 32)                                                                     \
   CMPI2(op, swap, inv, u, 32)                                                                     \
   CMPI2(op, swap, inv, i, 64)                                                                     \
   CMPI2(op, swap, inv, u, 64)
      CMPI(lt, gt, ge)
      CMPI(eq, eq, lg)
      CMPI(le, ge, gt)
      CMPI(gt, lt, le)
      CMPI(lg, lg, eq)
      CMPI(ge, le, lt)
#undef CMPI
#undef CMPI2
   /* v_cmp_class: only a vcmpx variant exists. */
#define CMPCLASS(sz)                                                                               \
   case aco_opcode::v_cmp_class_f##sz:                                                             \
      info->vcmpx = aco_opcode::v_cmpx_class_f##sz;                                                \
      info->size = sz;                                                                             \
      return true;
      CMPCLASS(16)
      CMPCLASS(32)
      CMPCLASS(64)
#undef CMPCLASS
      // clang-format on
   default: return false;
   }
}
987
988 aco_opcode
get_ordered(aco_opcode op)989 get_ordered(aco_opcode op)
990 {
991 CmpInfo info;
992 return get_cmp_info(op, &info) ? info.ordered : aco_opcode::num_opcodes;
993 }
994
995 aco_opcode
get_unordered(aco_opcode op)996 get_unordered(aco_opcode op)
997 {
998 CmpInfo info;
999 return get_cmp_info(op, &info) ? info.unordered : aco_opcode::num_opcodes;
1000 }
1001
1002 aco_opcode
get_inverse(aco_opcode op)1003 get_inverse(aco_opcode op)
1004 {
1005 CmpInfo info;
1006 return get_cmp_info(op, &info) ? info.inverse : aco_opcode::num_opcodes;
1007 }
1008
1009 aco_opcode
get_swapped(aco_opcode op)1010 get_swapped(aco_opcode op)
1011 {
1012 CmpInfo info;
1013 return get_cmp_info(op, &info) ? info.swapped : aco_opcode::num_opcodes;
1014 }
1015
1016 aco_opcode
get_f32_cmp(aco_opcode op)1017 get_f32_cmp(aco_opcode op)
1018 {
1019 CmpInfo info;
1020 return get_cmp_info(op, &info) ? info.f32 : aco_opcode::num_opcodes;
1021 }
1022
1023 aco_opcode
get_vcmpx(aco_opcode op)1024 get_vcmpx(aco_opcode op)
1025 {
1026 CmpInfo info;
1027 return get_cmp_info(op, &info) ? info.vcmpx : aco_opcode::num_opcodes;
1028 }
1029
1030 unsigned
get_cmp_bitsize(aco_opcode op)1031 get_cmp_bitsize(aco_opcode op)
1032 {
1033 CmpInfo info;
1034 return get_cmp_info(op, &info) ? info.size : 0;
1035 }
1036
1037 bool
is_fp_cmp(aco_opcode op)1038 is_fp_cmp(aco_opcode op)
1039 {
1040 CmpInfo info;
1041 return get_cmp_info(op, &info) && info.ordered != aco_opcode::num_opcodes;
1042 }
1043
/* Returns true when op is not in the v_cmp_* table; since get_cmp_info() only
 * recognizes the non-exec-writing comparisons, this identifies v_cmpx_*.
 * NOTE(review): this also returns true for any non-comparison opcode —
 * presumably callers only pass comparison opcodes; verify at call sites.
 */
bool
is_cmpx(aco_opcode op)
{
   CmpInfo info;
   return !get_cmp_info(op, &info);
}
1050
/* Returns whether operands idx0 and idx1 of the instruction can be swapped.
 * On success, *new_op receives the opcode to use after the swap: the same
 * opcode for commutative operations, or a counterpart such as
 * v_sub <-> v_subrev for operations with a reversed form.
 */
bool
can_swap_operands(aco_ptr<Instruction>& instr, aco_opcode* new_op, unsigned idx0, unsigned idx1)
{
   /* Swapping an operand with itself is trivially allowed. */
   if (idx0 == idx1) {
      *new_op = instr->opcode;
      return true;
   }

   /* Normalize so that idx0 < idx1. */
   if (idx0 > idx1)
      std::swap(idx0, idx1);

   /* Swapping is not supported for DPP instructions. */
   if (instr->isDPP())
      return false;

   /* For non-VOP3/VOP3P encodings, only swap when src0 is a VGPR — presumably
    * so the swap cannot move an operand into a slot that can't encode it. */
   if (!instr->isVOP3() && !instr->isVOP3P() && !instr->operands[0].isOfType(RegType::vgpr))
      return false;

   /* Comparisons swap by switching to the mirrored comparison opcode. */
   if (instr->isVOPC()) {
      CmpInfo info;
      if (get_cmp_info(instr->opcode, &info) && info.swapped != aco_opcode::num_opcodes) {
         *new_op = info.swapped;
         return true;
      }
   }

   /* opcodes not relevant for DPP or SGPRs optimizations are not included. */
   switch (instr->opcode) {
   case aco_opcode::v_med3_f32: return false; /* order matters for clamp+GFX8+denorm ftz. */
   /* Fully commutative operations: the opcode stays the same. */
   case aco_opcode::v_add_u32:
   case aco_opcode::v_add_co_u32:
   case aco_opcode::v_add_co_u32_e64:
   case aco_opcode::v_add_i32:
   case aco_opcode::v_add_i16:
   case aco_opcode::v_add_u16_e64:
   case aco_opcode::v_add3_u32:
   case aco_opcode::v_add_f16:
   case aco_opcode::v_add_f32:
   case aco_opcode::v_mul_i32_i24:
   case aco_opcode::v_mul_hi_i32_i24:
   case aco_opcode::v_mul_u32_u24:
   case aco_opcode::v_mul_hi_u32_u24:
   case aco_opcode::v_mul_lo_u16:
   case aco_opcode::v_mul_lo_u16_e64:
   case aco_opcode::v_mul_f16:
   case aco_opcode::v_mul_f32:
   case aco_opcode::v_mul_legacy_f32:
   case aco_opcode::v_or_b32:
   case aco_opcode::v_and_b32:
   case aco_opcode::v_xor_b32:
   case aco_opcode::v_xnor_b32:
   case aco_opcode::v_xor3_b32:
   case aco_opcode::v_or3_b32:
   case aco_opcode::v_and_b16:
   case aco_opcode::v_or_b16:
   case aco_opcode::v_xor_b16:
   case aco_opcode::v_max3_f32:
   case aco_opcode::v_min3_f32:
   case aco_opcode::v_max3_f16:
   case aco_opcode::v_min3_f16:
   case aco_opcode::v_med3_f16:
   case aco_opcode::v_max3_u32:
   case aco_opcode::v_min3_u32:
   case aco_opcode::v_med3_u32:
   case aco_opcode::v_max3_i32:
   case aco_opcode::v_min3_i32:
   case aco_opcode::v_med3_i32:
   case aco_opcode::v_max3_u16:
   case aco_opcode::v_min3_u16:
   case aco_opcode::v_med3_u16:
   case aco_opcode::v_max3_i16:
   case aco_opcode::v_min3_i16:
   case aco_opcode::v_med3_i16:
   case aco_opcode::v_max_f16:
   case aco_opcode::v_max_f32:
   case aco_opcode::v_min_f16:
   case aco_opcode::v_min_f32:
   case aco_opcode::v_max_i32:
   case aco_opcode::v_min_i32:
   case aco_opcode::v_max_u32:
   case aco_opcode::v_min_u32:
   case aco_opcode::v_max_i16:
   case aco_opcode::v_min_i16:
   case aco_opcode::v_max_u16:
   case aco_opcode::v_min_u16:
   case aco_opcode::v_max_i16_e64:
   case aco_opcode::v_min_i16_e64:
   case aco_opcode::v_max_u16_e64:
   case aco_opcode::v_min_u16_e64: *new_op = instr->opcode; return true;
   /* Subtractions swap by switching to/from the reversed (subrev) opcode. */
   case aco_opcode::v_sub_f16: *new_op = aco_opcode::v_subrev_f16; return true;
   case aco_opcode::v_sub_f32: *new_op = aco_opcode::v_subrev_f32; return true;
   case aco_opcode::v_sub_co_u32: *new_op = aco_opcode::v_subrev_co_u32; return true;
   case aco_opcode::v_sub_u16: *new_op = aco_opcode::v_subrev_u16; return true;
   case aco_opcode::v_sub_u32: *new_op = aco_opcode::v_subrev_u32; return true;
   case aco_opcode::v_sub_co_u32_e64: *new_op = aco_opcode::v_subrev_co_u32_e64; return true;
   case aco_opcode::v_subrev_f16: *new_op = aco_opcode::v_sub_f16; return true;
   case aco_opcode::v_subrev_f32: *new_op = aco_opcode::v_sub_f32; return true;
   case aco_opcode::v_subrev_co_u32: *new_op = aco_opcode::v_sub_co_u32; return true;
   case aco_opcode::v_subrev_u16: *new_op = aco_opcode::v_sub_u16; return true;
   case aco_opcode::v_subrev_u32: *new_op = aco_opcode::v_sub_u32; return true;
   case aco_opcode::v_subrev_co_u32_e64: *new_op = aco_opcode::v_sub_co_u32_e64; return true;
   /* Only the first two operands commute; operand 2 (addend/accumulator or
    * carry-in) must stay in place. */
   case aco_opcode::v_addc_co_u32:
   case aco_opcode::v_mad_i32_i24:
   case aco_opcode::v_mad_u32_u24:
   case aco_opcode::v_lerp_u8:
   case aco_opcode::v_sad_u8:
   case aco_opcode::v_sad_hi_u8:
   case aco_opcode::v_sad_u16:
   case aco_opcode::v_sad_u32:
   case aco_opcode::v_xad_u32:
   case aco_opcode::v_add_lshl_u32:
   case aco_opcode::v_and_or_b32:
   case aco_opcode::v_mad_u16:
   case aco_opcode::v_mad_i16:
   case aco_opcode::v_mad_u32_u16:
   case aco_opcode::v_mad_i32_i16:
   case aco_opcode::v_maxmin_f32:
   case aco_opcode::v_minmax_f32:
   case aco_opcode::v_maxmin_f16:
   case aco_opcode::v_minmax_f16:
   case aco_opcode::v_maxmin_u32:
   case aco_opcode::v_minmax_u32:
   case aco_opcode::v_maxmin_i32:
   case aco_opcode::v_minmax_i32:
   case aco_opcode::v_fma_f32:
   case aco_opcode::v_fma_legacy_f32:
   case aco_opcode::v_fmac_f32:
   case aco_opcode::v_fmac_legacy_f32:
   case aco_opcode::v_mac_f32:
   case aco_opcode::v_mac_legacy_f32:
   case aco_opcode::v_fma_f16:
   case aco_opcode::v_fmac_f16:
   case aco_opcode::v_mac_f16:
   case aco_opcode::v_dot4c_i32_i8:
   case aco_opcode::v_dot2c_f32_f16:
   case aco_opcode::v_dot2_f32_f16:
   case aco_opcode::v_dot2_f32_bf16:
   case aco_opcode::v_dot2_f16_f16:
   case aco_opcode::v_dot2_bf16_bf16:
   case aco_opcode::v_fma_mix_f32:
   case aco_opcode::v_fma_mixlo_f16:
   case aco_opcode::v_fma_mixhi_f16:
   case aco_opcode::v_pk_fmac_f16: {
      if (idx1 == 2)
         return false;
      *new_op = instr->opcode;
      return true;
   }
   /* Subtract-with-borrow: operands 0/1 swap via the reversed opcode; the
    * borrow-in (operand 2) must stay in place. */
   case aco_opcode::v_subb_co_u32: {
      if (idx1 == 2)
         return false;
      *new_op = aco_opcode::v_subbrev_co_u32;
      return true;
   }
   case aco_opcode::v_subbrev_co_u32: {
      if (idx1 == 2)
         return false;
      *new_op = aco_opcode::v_subb_co_u32;
      return true;
   }
   default: return false;
   }
}
1213
/* Default-construct with every counter unset (i.e. no wait required). */
wait_imm::wait_imm() : vm(unset_counter), exp(unset_counter), lgkm(unset_counter), vs(unset_counter)
{}
/* Construct with explicit counter values; pass unset_counter to leave a
 * counter unconstrained. */
wait_imm::wait_imm(uint16_t vm_, uint16_t exp_, uint16_t lgkm_, uint16_t vs_)
    : vm(vm_), exp(exp_), lgkm(lgkm_), vs(vs_)
{}
1219
/* Decodes a packed hardware s_waitcnt immediate into separate counters.
 * vs is never part of this immediate and is left unset. A counter decoded at
 * its maximum encodable value means "don't wait" and is mapped to
 * unset_counter.
 */
wait_imm::wait_imm(enum amd_gfx_level gfx_level, uint16_t packed) : vs(unset_counter)
{
   if (gfx_level >= GFX11) {
      /* GFX11+ layout: vm=[15:10], lgkm=[9:4], exp=[2:0]. */
      vm = (packed >> 10) & 0x3f;
      lgkm = (packed >> 4) & 0x3f;
      exp = packed & 0x7;
   } else {
      /* Pre-GFX11: vm low bits at [3:0]; GFX9+ adds vm[5:4] at bits [15:14]. */
      vm = packed & 0xf;
      if (gfx_level >= GFX9)
         vm |= (packed >> 10) & 0x30;

      /* exp at [6:4]. */
      exp = (packed >> 4) & 0x7;

      /* lgkm low bits at [11:8]; GFX10+ adds lgkm[5:4] at bits [13:12]. */
      lgkm = (packed >> 8) & 0xf;
      if (gfx_level >= GFX10)
         lgkm |= (packed >> 8) & 0x30;
   }

   /* The all-ones field value is the hardware's "no wait" encoding. */
   if (vm == (gfx_level >= GFX9 ? 0x3f : 0xf))
      vm = wait_imm::unset_counter;
   if (exp == 0x7)
      exp = wait_imm::unset_counter;
   if (lgkm == (gfx_level >= GFX10 ? 0x3f : 0xf))
      lgkm = wait_imm::unset_counter;
}
1245
/* Encodes the counters into a hardware s_waitcnt immediate for the given
 * gfx_level. Unset counters rely on the field masks to produce the all-ones
 * ("no wait") field value; vs is not encoded here.
 */
uint16_t
wait_imm::pack(enum amd_gfx_level gfx_level) const
{
   uint16_t imm = 0;
   assert(exp == unset_counter || exp <= 0x7);
   if (gfx_level >= GFX11) {
      /* GFX11+ layout: vm=[15:10], lgkm=[9:4], exp=[2:0]. */
      assert(lgkm == unset_counter || lgkm <= 0x3f);
      assert(vm == unset_counter || vm <= 0x3f);
      imm = ((vm & 0x3f) << 10) | ((lgkm & 0x3f) << 4) | (exp & 0x7);
   } else if (gfx_level >= GFX10) {
      /* GFX10: 6-bit lgkm at [13:8]; vm split into [3:0] and [15:14]. */
      assert(lgkm == unset_counter || lgkm <= 0x3f);
      assert(vm == unset_counter || vm <= 0x3f);
      imm = ((vm & 0x30) << 10) | ((lgkm & 0x3f) << 8) | ((exp & 0x7) << 4) | (vm & 0xf);
   } else if (gfx_level >= GFX9) {
      /* GFX9: 4-bit lgkm at [11:8]; vm split into [3:0] and [15:14]. */
      assert(lgkm == unset_counter || lgkm <= 0xf);
      assert(vm == unset_counter || vm <= 0x3f);
      imm = ((vm & 0x30) << 10) | ((lgkm & 0xf) << 8) | ((exp & 0x7) << 4) | (vm & 0xf);
   } else {
      /* Pre-GFX9: 4-bit fields only. */
      assert(lgkm == unset_counter || lgkm <= 0xf);
      assert(vm == unset_counter || vm <= 0xf);
      imm = ((lgkm & 0xf) << 8) | ((exp & 0x7) << 4) | (vm & 0xf);
   }
   if (gfx_level < GFX9 && vm == wait_imm::unset_counter)
      imm |= 0xc000; /* should have no effect on pre-GFX9 and now we won't have to worry about the
                        architecture when interpreting the immediate */
   if (gfx_level < GFX10 && lgkm == wait_imm::unset_counter)
      imm |= 0x3000; /* should have no effect on pre-GFX10 and now we won't have to worry about the
                        architecture when interpreting the immediate */
   return imm;
}
1276
1277 bool
combine(const wait_imm & other)1278 wait_imm::combine(const wait_imm& other)
1279 {
1280 bool changed = other.vm < vm || other.exp < exp || other.lgkm < lgkm || other.vs < vs;
1281 vm = std::min(vm, other.vm);
1282 exp = std::min(exp, other.exp);
1283 lgkm = std::min(lgkm, other.lgkm);
1284 vs = std::min(vs, other.vs);
1285 return changed;
1286 }
1287
1288 bool
empty() const1289 wait_imm::empty() const
1290 {
1291 return vm == unset_counter && exp == unset_counter && lgkm == unset_counter &&
1292 vs == unset_counter;
1293 }
1294
/* Prints every set counter to the given stream, one "name: value" per line;
 * unset counters are omitted. */
void
wait_imm::print(FILE* output) const
{
   if (exp != unset_counter)
      fprintf(output, "exp: %u\n", exp);
   if (vm != unset_counter)
      fprintf(output, "vm: %u\n", vm);
   if (lgkm != unset_counter)
      fprintf(output, "lgkm: %u\n", lgkm);
   if (vs != unset_counter)
      fprintf(output, "vs: %u\n", vs);
}
1307
/* Heuristic: returns whether two memory instructions are likely to access
 * nearby addresses and should therefore be grouped into the same clause.
 */
bool
should_form_clause(const Instruction* a, const Instruction* b)
{
   /* Don't mix loads (with definitions) and stores (without) in one clause. */
   if (a->definitions.empty() != b->definitions.empty())
      return false;

   /* Only instructions of the same encoding can form a clause. */
   if (a->format != b->format)
      return false;

   if (a->operands.empty() || b->operands.empty())
      return false;

   /* Assume loads which don't use descriptors might load from similar addresses. */
   if (a->isFlatLike() || a->accessesLDS())
      return true;
   if (a->isSMEM() && a->operands[0].bytes() == 8 && b->operands[0].bytes() == 8)
      return true;

   /* If they load from the same descriptor, assume they might load from similar
    * addresses.
    */
   if (a->isVMEM() || a->isSMEM())
      return a->operands[0].tempId() == b->operands[0].tempId();

   return false;
}
1334
1335 int
get_op_fixed_to_def(Instruction * instr)1336 get_op_fixed_to_def(Instruction* instr)
1337 {
1338 if (instr->opcode == aco_opcode::v_interp_p2_f32 || instr->opcode == aco_opcode::v_mac_f32 ||
1339 instr->opcode == aco_opcode::v_fmac_f32 || instr->opcode == aco_opcode::v_mac_f16 ||
1340 instr->opcode == aco_opcode::v_fmac_f16 || instr->opcode == aco_opcode::v_mac_legacy_f32 ||
1341 instr->opcode == aco_opcode::v_fmac_legacy_f32 ||
1342 instr->opcode == aco_opcode::v_pk_fmac_f16 || instr->opcode == aco_opcode::v_writelane_b32 ||
1343 instr->opcode == aco_opcode::v_writelane_b32_e64 ||
1344 instr->opcode == aco_opcode::v_dot4c_i32_i8) {
1345 return 2;
1346 } else if (instr->opcode == aco_opcode::s_addk_i32 || instr->opcode == aco_opcode::s_mulk_i32 ||
1347 instr->opcode == aco_opcode::s_cmovk_i32) {
1348 return 0;
1349 } else if (instr->isMUBUF() && instr->definitions.size() == 1 && instr->operands.size() == 4) {
1350 return 3;
1351 } else if (instr->isMIMG() && instr->definitions.size() == 1 &&
1352 !instr->operands[2].isUndefined()) {
1353 return 2;
1354 }
1355 return -1;
1356 }
1357
/* On GFX11+, inserts "s_sendmsg sendmsg_dealloc_vgprs" before the final
 * s_endpgm so the wave releases its VGPRs early. Returns whether deallocation
 * is applicable for this program (the sendmsg itself is only inserted when the
 * last block ends in s_endpgm).
 */
bool
dealloc_vgprs(Program* program)
{
   if (program->gfx_level < GFX11)
      return false;

   /* skip if deallocating VGPRs won't increase occupancy */
   uint16_t max_waves = max_suitable_waves(program, program->dev.max_waves_per_simd);
   if (program->max_reg_demand.vgpr <= get_addr_vgpr_from_waves(program, max_waves))
      return false;

   /* sendmsg(dealloc_vgprs) releases scratch, so this isn't safe if there is a in-progress scratch
    * store. */
   if (uses_scratch(program))
      return false;

   Block& block = program->blocks.back();

   /* don't bother checking if there is a pending VMEM store or export: there almost always is */
   Builder bld(program);
   if (!block.instructions.empty() && block.instructions.back()->opcode == aco_opcode::s_endpgm) {
      /* Insert just before the trailing s_endpgm. */
      bld.reset(&block.instructions, block.instructions.begin() + (block.instructions.size() - 1));
      /* Due to a hazard, an s_nop is needed before "s_sendmsg sendmsg_dealloc_vgprs". */
      bld.sopp(aco_opcode::s_nop, -1, 0);
      bld.sopp(aco_opcode::s_sendmsg, -1, sendmsg_dealloc_vgprs);
   }

   return true;
}
1387
1388 bool
isTrans() const1389 Instruction::isTrans() const noexcept
1390 {
1391 return instr_info.classes[(int)opcode] == instr_class::valu_transcendental32 ||
1392 instr_info.classes[(int)opcode] == instr_class::valu_double_transcendental;
1393 }
1394
1395 } // namespace aco
1396