/*
 * Copyright © 2020 Valve Corporation
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
 * IN THE SOFTWARE.
 *
 */

#include "aco_ir.h"

#include "aco_builder.h"

#include "util/debug.h"

#include "c11/threads.h"

namespace aco {

uint64_t debug_flags = 0;

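/* Runtime debug flags, parsed once from the comma-separated ACO_DEBUG
 * environment variable, e.g. ACO_DEBUG=validateir,perfwarn. The keys below
 * map onto the DEBUG_* bits declared in aco_ir.h.
 */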
static const struct debug_control aco_debug_options[] = {{"validateir", DEBUG_VALIDATE_IR},
                                                         {"validatera", DEBUG_VALIDATE_RA},
                                                         {"perfwarn", DEBUG_PERFWARN},
                                                         {"force-waitcnt", DEBUG_FORCE_WAITCNT},
                                                         {"novn", DEBUG_NO_VN},
                                                         {"noopt", DEBUG_NO_OPT},
                                                         {"nosched", DEBUG_NO_SCHED},
                                                         {"perfinfo", DEBUG_PERF_INFO},
                                                         {"liveinfo", DEBUG_LIVE_INFO},
                                                         {NULL, 0}};

static once_flag init_once_flag = ONCE_FLAG_INIT;

static void
init_once()
{
   debug_flags = parse_debug_string(getenv("ACO_DEBUG"), aco_debug_options);

#ifndef NDEBUG
   /* enable some flags by default on debug builds */
   debug_flags |= aco::DEBUG_VALIDATE_IR;
#endif
}

void
init()
{
   call_once(&init_once_flag, init_once);
}

void
init_program(Program* program, Stage stage, const struct aco_shader_info* info,
             enum amd_gfx_level gfx_level, enum radeon_family family, bool wgp_mode,
             ac_shader_config* config)
{
   program->stage = stage;
   program->config = config;
   program->info = *info;
   program->gfx_level = gfx_level;
   if (family == CHIP_UNKNOWN) {
      switch (gfx_level) {
      case GFX6: program->family = CHIP_TAHITI; break;
      case GFX7: program->family = CHIP_BONAIRE; break;
      case GFX8: program->family = CHIP_POLARIS10; break;
      case GFX9: program->family = CHIP_VEGA10; break;
      case GFX10: program->family = CHIP_NAVI10; break;
      default: program->family = CHIP_UNKNOWN; break;
      }
   } else {
      program->family = family;
   }
   program->wave_size = info->wave_size;
   program->lane_mask = program->wave_size == 32 ? s1 : s2;

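   /* LDS sizes below are in bytes: the encoding granule is the unit the
    * hardware's LDS-size register field counts in, while the allocation
    * granule is the unit LDS is actually allocated in per workgroup.
    */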
   program->dev.lds_encoding_granule = gfx_level >= GFX11 && stage == fragment_fs ? 1024 :
                                       gfx_level >= GFX7 ? 512 : 256;
   program->dev.lds_alloc_granule = gfx_level >= GFX10_3 ? 1024 : program->dev.lds_encoding_granule;
   program->dev.lds_limit = gfx_level >= GFX7 ? 65536 : 32768;
   /* apparently gfx702 also has 16-bank LDS but I can't find a family for that */
   program->dev.has_16bank_lds = family == CHIP_KABINI || family == CHIP_STONEY;

   program->dev.vgpr_limit = 256;
   program->dev.physical_vgprs = 256;
   program->dev.vgpr_alloc_granule = 4;

   if (gfx_level >= GFX10) {
      program->dev.physical_sgprs = 5120; /* doesn't matter as long as it's at least 128 * 40 */
      program->dev.physical_vgprs = program->wave_size == 32 ? 1024 : 512;
      program->dev.sgpr_alloc_granule = 128;
      program->dev.sgpr_limit =
         108; /* includes VCC, which can be treated as s[106-107] on GFX10+ */
      if (gfx_level == GFX10_3)
         program->dev.vgpr_alloc_granule = program->wave_size == 32 ? 16 : 8;
      else
         program->dev.vgpr_alloc_granule = program->wave_size == 32 ? 8 : 4;
   } else if (program->gfx_level >= GFX8) {
      program->dev.physical_sgprs = 800;
      program->dev.sgpr_alloc_granule = 16;
      program->dev.sgpr_limit = 102;
      if (family == CHIP_TONGA || family == CHIP_ICELAND)
         program->dev.sgpr_alloc_granule = 96; /* work around hardware bug */
   } else {
      program->dev.physical_sgprs = 512;
      program->dev.sgpr_alloc_granule = 8;
      program->dev.sgpr_limit = 104;
   }
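   /* Illustration (an assumption about how the granule is used elsewhere in
    * the backend): with the GFX8 numbers above, a shader needing 20 SGPRs
    * would be billed for align(20, 16) = 32 registers when occupancy is
    * computed.
    */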

   program->dev.max_wave64_per_simd = 10;
   if (program->gfx_level >= GFX10_3)
      program->dev.max_wave64_per_simd = 16;
   else if (program->gfx_level == GFX10)
      program->dev.max_wave64_per_simd = 20;
   else if (program->family >= CHIP_POLARIS10 && program->family <= CHIP_VEGAM)
      program->dev.max_wave64_per_simd = 8;

   program->dev.simd_per_cu = program->gfx_level >= GFX10 ? 2 : 4;

   switch (program->family) {
   /* GFX8 APUs */
   case CHIP_CARRIZO:
   case CHIP_STONEY:
   /* GFX9 APUs */
   case CHIP_RAVEN:
   case CHIP_RAVEN2:
   case CHIP_RENOIR: program->dev.xnack_enabled = true; break;
   default: break;
   }

   program->dev.sram_ecc_enabled = program->family == CHIP_ARCTURUS;
   /* apparently gfx702 also has fast v_fma_f32 but I can't find a family for that */
   program->dev.has_fast_fma32 = program->gfx_level >= GFX9;
   if (program->family == CHIP_TAHITI || program->family == CHIP_CARRIZO ||
       program->family == CHIP_HAWAII)
      program->dev.has_fast_fma32 = true;
   program->dev.has_mac_legacy32 = program->gfx_level <= GFX7 || program->gfx_level >= GFX10;

   program->dev.fused_mad_mix = program->gfx_level >= GFX10;
   if (program->family == CHIP_VEGA12 || program->family == CHIP_VEGA20 ||
       program->family == CHIP_ARCTURUS || program->family == CHIP_ALDEBARAN)
      program->dev.fused_mad_mix = true;

   if (program->gfx_level >= GFX11) {
      program->dev.scratch_global_offset_min = -4096;
      program->dev.scratch_global_offset_max = 4095;
   } else if (program->gfx_level >= GFX10 || program->gfx_level == GFX8) {
      program->dev.scratch_global_offset_min = -2048;
      program->dev.scratch_global_offset_max = 2047;
   } else if (program->gfx_level == GFX9) {
      /* The minimum is actually -4096, but negative offsets are broken when SADDR is used. */
      program->dev.scratch_global_offset_min = 0;
      program->dev.scratch_global_offset_max = 4095;
   }

   program->wgp_mode = wgp_mode;

   program->progress = CompilationProgress::after_isel;

   program->next_fp_mode.preserve_signed_zero_inf_nan32 = false;
   program->next_fp_mode.preserve_signed_zero_inf_nan16_64 = false;
   program->next_fp_mode.must_flush_denorms32 = false;
   program->next_fp_mode.must_flush_denorms16_64 = false;
   program->next_fp_mode.care_about_round32 = false;
   program->next_fp_mode.care_about_round16_64 = false;
   program->next_fp_mode.denorm16_64 = fp_denorm_keep;
   program->next_fp_mode.denorm32 = 0;
   program->next_fp_mode.round16_64 = fp_round_ne;
   program->next_fp_mode.round32 = fp_round_ne;
}

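/* Extracts the memory_sync_info from whichever format-specific encoding the
 * instruction uses; non-memory formats get a default-constructed (empty) one.
 */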
memory_sync_info
get_sync_info(const Instruction* instr)
{
   switch (instr->format) {
   case Format::SMEM: return instr->smem().sync;
   case Format::MUBUF: return instr->mubuf().sync;
   case Format::MIMG: return instr->mimg().sync;
   case Format::MTBUF: return instr->mtbuf().sync;
   case Format::FLAT:
   case Format::GLOBAL:
   case Format::SCRATCH: return instr->flatlike().sync;
   case Format::DS: return instr->ds().sync;
   default: return memory_sync_info();
   }
}

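/* SDWA (Sub-DWord Addressing) is a VALU encoding on GFX8-GFX10.x that lets an
 * instruction read and write individual bytes or words of its 32-bit
 * operands. This checks whether "instr" could be rewritten to the SDWA
 * encoding at all; before RA (pre_ra), restrictions that only concern
 * register placement are ignored.
 */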
bool
can_use_SDWA(amd_gfx_level gfx_level, const aco_ptr<Instruction>& instr, bool pre_ra)
{
   if (!instr->isVALU())
      return false;

   if (gfx_level < GFX8 || gfx_level >= GFX11 || instr->isDPP() || instr->isVOP3P())
      return false;

   if (instr->isSDWA())
      return true;

   if (instr->isVOP3()) {
      VOP3_instruction& vop3 = instr->vop3();
      if (instr->format == Format::VOP3)
         return false;
      if (vop3.clamp && instr->isVOPC() && gfx_level != GFX8)
         return false;
      if (vop3.omod && gfx_level < GFX9)
         return false;

      // TODO: return true if we know we will use vcc
      if (!pre_ra && instr->definitions.size() >= 2)
         return false;

      for (unsigned i = 1; i < instr->operands.size(); i++) {
         if (instr->operands[i].isLiteral())
            return false;
         if (gfx_level < GFX9 && !instr->operands[i].isOfType(RegType::vgpr))
            return false;
      }
   }

   if (!instr->definitions.empty() && instr->definitions[0].bytes() > 4 && !instr->isVOPC())
      return false;

   if (!instr->operands.empty()) {
      if (instr->operands[0].isLiteral())
         return false;
      if (gfx_level < GFX9 && !instr->operands[0].isOfType(RegType::vgpr))
         return false;
      if (instr->operands[0].bytes() > 4)
         return false;
      if (instr->operands.size() > 1 && instr->operands[1].bytes() > 4)
         return false;
   }

   bool is_mac = instr->opcode == aco_opcode::v_mac_f32 || instr->opcode == aco_opcode::v_mac_f16 ||
                 instr->opcode == aco_opcode::v_fmac_f32 || instr->opcode == aco_opcode::v_fmac_f16;

   if (gfx_level != GFX8 && is_mac)
      return false;

   // TODO: return true if we know we will use vcc
   if (!pre_ra && instr->isVOPC() && gfx_level == GFX8)
      return false;
   if (!pre_ra && instr->operands.size() >= 3 && !is_mac)
      return false;

   return instr->opcode != aco_opcode::v_madmk_f32 && instr->opcode != aco_opcode::v_madak_f32 &&
          instr->opcode != aco_opcode::v_madmk_f16 && instr->opcode != aco_opcode::v_madak_f16 &&
          instr->opcode != aco_opcode::v_readfirstlane_b32 &&
          instr->opcode != aco_opcode::v_clrexcp && instr->opcode != aco_opcode::v_swap_b32;
}

/* updates "instr" and returns the old instruction (or NULL if no update was needed) */
aco_ptr<Instruction>
convert_to_SDWA(amd_gfx_level gfx_level, aco_ptr<Instruction>& instr)
{
   if (instr->isSDWA())
      return NULL;

   aco_ptr<Instruction> tmp = std::move(instr);
   Format format =
      (Format)(((uint16_t)tmp->format & ~(uint16_t)Format::VOP3) | (uint16_t)Format::SDWA);
   instr.reset(create_instruction<SDWA_instruction>(tmp->opcode, format, tmp->operands.size(),
                                                    tmp->definitions.size()));
   std::copy(tmp->operands.cbegin(), tmp->operands.cend(), instr->operands.begin());
   std::copy(tmp->definitions.cbegin(), tmp->definitions.cend(), instr->definitions.begin());

   SDWA_instruction& sdwa = instr->sdwa();

   if (tmp->isVOP3()) {
      VOP3_instruction& vop3 = tmp->vop3();
      memcpy(sdwa.neg, vop3.neg, sizeof(sdwa.neg));
      memcpy(sdwa.abs, vop3.abs, sizeof(sdwa.abs));
      sdwa.omod = vop3.omod;
      sdwa.clamp = vop3.clamp;
   }

   for (unsigned i = 0; i < instr->operands.size(); i++) {
      /* SDWA only uses operands 0 and 1. */
      if (i >= 2)
         break;

      sdwa.sel[i] = SubdwordSel(instr->operands[i].bytes(), 0, false);
   }

   sdwa.dst_sel = SubdwordSel(instr->definitions[0].bytes(), 0, false);

   if (instr->definitions[0].getTemp().type() == RegType::sgpr && gfx_level == GFX8)
      instr->definitions[0].setFixed(vcc);
   if (instr->definitions.size() >= 2)
      instr->definitions[1].setFixed(vcc);
   if (instr->operands.size() >= 3)
      instr->operands[2].setFixed(vcc);

   instr->pass_flags = tmp->pass_flags;

   return tmp;
}

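/* DPP (Data Parallel Primitives) modifiers let a VALU instruction read its
 * first source from another lane of the same wave: DPP8 uses an arbitrary
 * swizzle within groups of 8 lanes, DPP16 uses fixed patterns within rows of
 * 16. This checks whether "instr" could carry such a modifier at all.
 */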
bool
can_use_DPP(const aco_ptr<Instruction>& instr, bool pre_ra, bool dpp8)
{
   assert(instr->isVALU() && !instr->operands.empty());

   if (instr->isDPP())
      return instr->isDPP8() == dpp8;

   if (instr->operands.size() && instr->operands[0].isLiteral())
      return false;

   if (instr->isSDWA())
      return false;

   if (!pre_ra && (instr->isVOPC() || instr->definitions.size() > 1) &&
       instr->definitions.back().physReg() != vcc)
      return false;

   if (!pre_ra && instr->operands.size() >= 3 && instr->operands[2].physReg() != vcc)
      return false;

   if (instr->isVOP3()) {
      const VOP3_instruction* vop3 = &instr->vop3();
      if (vop3->clamp || vop3->omod || vop3->opsel)
         return false;
      if (dpp8)
         return false;
      if (instr->format == Format::VOP3)
         return false;
      if (instr->operands.size() > 1 && !instr->operands[1].isOfType(RegType::vgpr))
         return false;
   }

   /* there are more cases but those all take 64-bit inputs */
   return instr->opcode != aco_opcode::v_madmk_f32 && instr->opcode != aco_opcode::v_madak_f32 &&
          instr->opcode != aco_opcode::v_madmk_f16 && instr->opcode != aco_opcode::v_madak_f16 &&
          instr->opcode != aco_opcode::v_readfirstlane_b32 &&
          instr->opcode != aco_opcode::v_cvt_f64_i32 &&
          instr->opcode != aco_opcode::v_cvt_f64_f32 && instr->opcode != aco_opcode::v_cvt_f64_u32;
}

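/* Like convert_to_SDWA, this rewrites "instr" to a DPP encoding and returns
 * the old instruction. The swizzle is initialized to identity (lane_sel[i] = i
 * for DPP8, quad_perm(0, 1, 2, 3) for DPP16) so callers can fill in the
 * pattern they actually want afterwards.
 */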
aco_ptr<Instruction>
convert_to_DPP(aco_ptr<Instruction>& instr, bool dpp8)
{
   if (instr->isDPP())
      return NULL;

   aco_ptr<Instruction> tmp = std::move(instr);
   Format format = (Format)(((uint32_t)tmp->format & ~(uint32_t)Format::VOP3) |
                            (dpp8 ? (uint32_t)Format::DPP8 : (uint32_t)Format::DPP16));
   if (dpp8)
      instr.reset(create_instruction<DPP8_instruction>(tmp->opcode, format, tmp->operands.size(),
                                                       tmp->definitions.size()));
   else
      instr.reset(create_instruction<DPP16_instruction>(tmp->opcode, format, tmp->operands.size(),
                                                        tmp->definitions.size()));
   std::copy(tmp->operands.cbegin(), tmp->operands.cend(), instr->operands.begin());
   for (unsigned i = 0; i < instr->definitions.size(); i++)
      instr->definitions[i] = tmp->definitions[i];

   if (dpp8) {
      DPP8_instruction* dpp = &instr->dpp8();
      for (unsigned i = 0; i < 8; i++)
         dpp->lane_sel[i] = i;
   } else {
      DPP16_instruction* dpp = &instr->dpp16();
      dpp->dpp_ctrl = dpp_quad_perm(0, 1, 2, 3);
      dpp->row_mask = 0xf;
      dpp->bank_mask = 0xf;

      if (tmp->isVOP3()) {
         const VOP3_instruction* vop3 = &tmp->vop3();
         memcpy(dpp->neg, vop3->neg, sizeof(dpp->neg));
         memcpy(dpp->abs, vop3->abs, sizeof(dpp->abs));
      }
   }

   if (instr->isVOPC() || instr->definitions.size() > 1)
      instr->definitions.back().setFixed(vcc);

   if (instr->operands.size() >= 3)
      instr->operands[2].setFixed(vcc);

   instr->pass_flags = tmp->pass_flags;

   return tmp;
}

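/* opsel is the VOP3 modifier (GFX9+) that selects the high or low 16-bit half
 * of each 32-bit register for 16-bit instructions. idx is the operand index
 * to query, or -1 for the destination.
 */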
bool
can_use_opsel(amd_gfx_level gfx_level, aco_opcode op, int idx)
{
   /* opsel is only GFX9+ */
   if (gfx_level < GFX9)
      return false;

   switch (op) {
   case aco_opcode::v_div_fixup_f16:
   case aco_opcode::v_fma_f16:
   case aco_opcode::v_mad_f16:
   case aco_opcode::v_mad_u16:
   case aco_opcode::v_mad_i16:
   case aco_opcode::v_med3_f16:
   case aco_opcode::v_med3_i16:
   case aco_opcode::v_med3_u16:
   case aco_opcode::v_min3_f16:
   case aco_opcode::v_min3_i16:
   case aco_opcode::v_min3_u16:
   case aco_opcode::v_max3_f16:
   case aco_opcode::v_max3_i16:
   case aco_opcode::v_max3_u16:
   case aco_opcode::v_max_u16_e64:
   case aco_opcode::v_max_i16_e64:
   case aco_opcode::v_min_u16_e64:
   case aco_opcode::v_min_i16_e64:
   case aco_opcode::v_add_i16:
   case aco_opcode::v_sub_i16:
   case aco_opcode::v_add_u16_e64:
   case aco_opcode::v_sub_u16_e64:
   case aco_opcode::v_lshlrev_b16_e64:
   case aco_opcode::v_lshrrev_b16_e64:
   case aco_opcode::v_ashrrev_i16_e64:
   case aco_opcode::v_mul_lo_u16_e64: return true;
   case aco_opcode::v_pack_b32_f16:
   case aco_opcode::v_cvt_pknorm_i16_f16:
   case aco_opcode::v_cvt_pknorm_u16_f16: return idx != -1;
   case aco_opcode::v_mad_u32_u16:
   case aco_opcode::v_mad_i32_i16: return idx >= 0 && idx < 2;
   default: return false;
   }
}

bool
instr_is_16bit(amd_gfx_level gfx_level, aco_opcode op)
{
   /* partial register writes are only GFX9+ */
   if (gfx_level < GFX9)
      return false;

   switch (op) {
   /* VOP3 */
   case aco_opcode::v_mad_f16:
   case aco_opcode::v_mad_u16:
   case aco_opcode::v_mad_i16:
   case aco_opcode::v_fma_f16:
   case aco_opcode::v_div_fixup_f16:
   case aco_opcode::v_interp_p2_f16:
   case aco_opcode::v_fma_mixlo_f16:
   case aco_opcode::v_fma_mixhi_f16:
   /* VOP2 */
   case aco_opcode::v_mac_f16:
   case aco_opcode::v_madak_f16:
   case aco_opcode::v_madmk_f16: return gfx_level >= GFX9;
   case aco_opcode::v_add_f16:
   case aco_opcode::v_sub_f16:
   case aco_opcode::v_subrev_f16:
   case aco_opcode::v_mul_f16:
   case aco_opcode::v_max_f16:
   case aco_opcode::v_min_f16:
   case aco_opcode::v_ldexp_f16:
   case aco_opcode::v_fmac_f16:
   case aco_opcode::v_fmamk_f16:
   case aco_opcode::v_fmaak_f16:
   /* VOP1 */
   case aco_opcode::v_cvt_f16_f32:
   case aco_opcode::v_cvt_f16_u16:
   case aco_opcode::v_cvt_f16_i16:
   case aco_opcode::v_rcp_f16:
   case aco_opcode::v_sqrt_f16:
   case aco_opcode::v_rsq_f16:
   case aco_opcode::v_log_f16:
   case aco_opcode::v_exp_f16:
   case aco_opcode::v_frexp_mant_f16:
   case aco_opcode::v_frexp_exp_i16_f16:
   case aco_opcode::v_floor_f16:
   case aco_opcode::v_ceil_f16:
   case aco_opcode::v_trunc_f16:
   case aco_opcode::v_rndne_f16:
   case aco_opcode::v_fract_f16:
   case aco_opcode::v_sin_f16:
   case aco_opcode::v_cos_f16: return gfx_level >= GFX10;
   // TODO: confirm whether these write 16 or 32 bit on GFX10+
   // case aco_opcode::v_cvt_u16_f16:
   // case aco_opcode::v_cvt_i16_f16:
   // case aco_opcode::p_cvt_f16_f32_rtne:
   // case aco_opcode::v_cvt_norm_i16_f16:
   // case aco_opcode::v_cvt_norm_u16_f16:
   /* on GFX10, all opsel instructions preserve the high bits */
   default: return gfx_level >= GFX10 && can_use_opsel(gfx_level, op, -1);
   }
}

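/* Returns the identity element for a subgroup reduction. 64-bit identities
 * are returned as two 32-bit halves: idx = 0 selects the low dword and
 * idx = 1 the high dword (e.g. imin64's identity INT64_MAX is
 * 0xffffffff / 0x7fffffff).
 */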
uint32_t
get_reduction_identity(ReduceOp op, unsigned idx)
{
   switch (op) {
   case iadd8:
   case iadd16:
   case iadd32:
   case iadd64:
   case fadd16:
   case fadd32:
   case fadd64:
   case ior8:
   case ior16:
   case ior32:
   case ior64:
   case ixor8:
   case ixor16:
   case ixor32:
   case ixor64:
   case umax8:
   case umax16:
   case umax32:
   case umax64: return 0;
   case imul8:
   case imul16:
   case imul32:
   case imul64: return idx ? 0 : 1;
   case fmul16: return 0x3c00u;                /* 1.0 */
   case fmul32: return 0x3f800000u;            /* 1.0 */
   case fmul64: return idx ? 0x3ff00000u : 0u; /* 1.0 */
   case imin8: return INT8_MAX;
   case imin16: return INT16_MAX;
   case imin32: return INT32_MAX;
   case imin64: return idx ? 0x7fffffffu : 0xffffffffu;
   case imax8: return INT8_MIN;
   case imax16: return INT16_MIN;
   case imax32: return INT32_MIN;
   case imax64: return idx ? 0x80000000u : 0;
   case umin8:
   case umin16:
   case iand8:
   case iand16: return 0xffffffffu;
   case umin32:
   case umin64:
   case iand32:
   case iand64: return 0xffffffffu;
   case fmin16: return 0x7c00u;                /* infinity */
   case fmin32: return 0x7f800000u;            /* infinity */
   case fmin64: return idx ? 0x7ff00000u : 0u; /* infinity */
   case fmax16: return 0xfc00u;                /* negative infinity */
   case fmax32: return 0xff800000u;            /* negative infinity */
   case fmax64: return idx ? 0xfff00000u : 0u; /* negative infinity */
   default: unreachable("Invalid reduction operation"); break;
   }
   return 0;
}

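/* Whether the instruction's behaviour depends on the exec mask: lane-wise
 * VALU/VMEM instructions do, readlane/writelane and most scalar instructions
 * do not (unless they read exec explicitly). Pseudo-instructions count as
 * exec-dependent when they write VGPRs, since they lower to lane-wise moves.
 */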
bool
needs_exec_mask(const Instruction* instr)
{
   if (instr->isVALU()) {
      return instr->opcode != aco_opcode::v_readlane_b32 &&
             instr->opcode != aco_opcode::v_readlane_b32_e64 &&
             instr->opcode != aco_opcode::v_writelane_b32 &&
             instr->opcode != aco_opcode::v_writelane_b32_e64;
   }

   if (instr->isVMEM() || instr->isFlatLike())
      return true;

   if (instr->isSALU() || instr->isBranch() || instr->isSMEM() || instr->isBarrier())
      return instr->reads_exec();

   if (instr->isPseudo()) {
      switch (instr->opcode) {
      case aco_opcode::p_create_vector:
      case aco_opcode::p_extract_vector:
      case aco_opcode::p_split_vector:
      case aco_opcode::p_phi:
      case aco_opcode::p_parallelcopy:
         for (Definition def : instr->definitions) {
            if (def.getTemp().type() == RegType::vgpr)
               return true;
         }
         return instr->reads_exec();
      case aco_opcode::p_spill:
      case aco_opcode::p_reload:
      case aco_opcode::p_end_linear_vgpr:
      case aco_opcode::p_logical_start:
      case aco_opcode::p_logical_end:
      case aco_opcode::p_startpgm:
      case aco_opcode::p_init_scratch: return instr->reads_exec();
      default: break;
      }
   }

   return true;
}

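/* Lookup record for a comparison opcode: its ordered/unordered float
 * variants, the opcode with swapped operands, the inverse (negated)
 * predicate, the exec-writing v_cmpx form, the f32-sized equivalent, and the
 * operand bit size. Fields that don't apply stay num_opcodes.
 */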
struct CmpInfo {
   aco_opcode ordered;
   aco_opcode unordered;
   aco_opcode swapped;
   aco_opcode inverse;
   aco_opcode vcmpx;
   aco_opcode f32;
   unsigned size;
};

ALWAYS_INLINE bool
get_cmp_info(aco_opcode op, CmpInfo* info)
{
   info->ordered = aco_opcode::num_opcodes;
   info->unordered = aco_opcode::num_opcodes;
   info->swapped = aco_opcode::num_opcodes;
   info->inverse = aco_opcode::num_opcodes;
   info->f32 = aco_opcode::num_opcodes;
   switch (op) {
      // clang-format off
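/* For example, CMP2(lt, ge, gt, le, 32) expands to the cases for
 * v_cmp_lt_f32 (ordered) and v_cmp_nge_f32 (its unordered counterpart):
 * swapping the operands of "lt" gives "gt", and negating it gives "nge".
 */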
#define CMP2(ord, unord, ord_swap, unord_swap, sz) \
   case aco_opcode::v_cmp_##ord##_f##sz: \
   case aco_opcode::v_cmp_n##unord##_f##sz: \
      info->ordered = aco_opcode::v_cmp_##ord##_f##sz; \
      info->unordered = aco_opcode::v_cmp_n##unord##_f##sz; \
      info->swapped = op == aco_opcode::v_cmp_##ord##_f##sz ? aco_opcode::v_cmp_##ord_swap##_f##sz \
                                                            : aco_opcode::v_cmp_n##unord_swap##_f##sz; \
      info->inverse = op == aco_opcode::v_cmp_n##unord##_f##sz ? aco_opcode::v_cmp_##unord##_f##sz \
                                                               : aco_opcode::v_cmp_n##ord##_f##sz; \
      info->f32 = op == aco_opcode::v_cmp_##ord##_f##sz ? aco_opcode::v_cmp_##ord##_f32 \
                                                        : aco_opcode::v_cmp_n##unord##_f32; \
      info->vcmpx = op == aco_opcode::v_cmp_##ord##_f##sz ? aco_opcode::v_cmpx_##ord##_f##sz \
                                                          : aco_opcode::v_cmpx_n##unord##_f##sz; \
      info->size = sz; \
      return true;
#define CMP(ord, unord, ord_swap, unord_swap) \
   CMP2(ord, unord, ord_swap, unord_swap, 16) \
   CMP2(ord, unord, ord_swap, unord_swap, 32) \
   CMP2(ord, unord, ord_swap, unord_swap, 64)
      CMP(lt, /*n*/ge, gt, /*n*/le)
      CMP(eq, /*n*/lg, eq, /*n*/lg)
      CMP(le, /*n*/gt, ge, /*n*/lt)
      CMP(gt, /*n*/le, lt, /*n*/ge)
      CMP(lg, /*n*/eq, lg, /*n*/eq)
      CMP(ge, /*n*/lt, le, /*n*/gt)
#undef CMP
#undef CMP2
#define ORD_TEST(sz) \
   case aco_opcode::v_cmp_u_f##sz: \
      info->f32 = aco_opcode::v_cmp_u_f32; \
      info->swapped = aco_opcode::v_cmp_u_f##sz; \
      info->inverse = aco_opcode::v_cmp_o_f##sz; \
      info->vcmpx = aco_opcode::v_cmpx_u_f##sz; \
      info->size = sz; \
      return true; \
   case aco_opcode::v_cmp_o_f##sz: \
      info->f32 = aco_opcode::v_cmp_o_f32; \
      info->swapped = aco_opcode::v_cmp_o_f##sz; \
      info->inverse = aco_opcode::v_cmp_u_f##sz; \
      info->vcmpx = aco_opcode::v_cmpx_o_f##sz; \
      info->size = sz; \
      return true;
      ORD_TEST(16)
      ORD_TEST(32)
      ORD_TEST(64)
#undef ORD_TEST
#define CMPI2(op, swap, inv, type, sz) \
   case aco_opcode::v_cmp_##op##_##type##sz: \
      info->swapped = aco_opcode::v_cmp_##swap##_##type##sz; \
      info->inverse = aco_opcode::v_cmp_##inv##_##type##sz; \
      info->vcmpx = aco_opcode::v_cmpx_##op##_##type##sz; \
      info->size = sz; \
      return true;
#define CMPI(op, swap, inv) \
   CMPI2(op, swap, inv, i, 16) \
   CMPI2(op, swap, inv, u, 16) \
   CMPI2(op, swap, inv, i, 32) \
   CMPI2(op, swap, inv, u, 32) \
   CMPI2(op, swap, inv, i, 64) \
   CMPI2(op, swap, inv, u, 64)
      CMPI(lt, gt, ge)
      CMPI(eq, eq, lg)
      CMPI(le, ge, gt)
      CMPI(gt, lt, le)
      CMPI(lg, lg, eq)
      CMPI(ge, le, lt)
#undef CMPI
#undef CMPI2
#define CMPCLASS(sz) \
   case aco_opcode::v_cmp_class_f##sz: \
      info->vcmpx = aco_opcode::v_cmpx_class_f##sz; \
      info->size = sz; \
      return true;
      CMPCLASS(16)
      CMPCLASS(32)
      CMPCLASS(64)
#undef CMPCLASS
      // clang-format on
   default: return false;
   }
}

aco_opcode
get_ordered(aco_opcode op)
{
   CmpInfo info;
   return get_cmp_info(op, &info) ? info.ordered : aco_opcode::num_opcodes;
}

aco_opcode
get_unordered(aco_opcode op)
{
   CmpInfo info;
   return get_cmp_info(op, &info) ? info.unordered : aco_opcode::num_opcodes;
}

aco_opcode
get_inverse(aco_opcode op)
{
   CmpInfo info;
   return get_cmp_info(op, &info) ? info.inverse : aco_opcode::num_opcodes;
}

aco_opcode
get_f32_cmp(aco_opcode op)
{
   CmpInfo info;
   return get_cmp_info(op, &info) ? info.f32 : aco_opcode::num_opcodes;
}

aco_opcode
get_vcmpx(aco_opcode op)
{
   CmpInfo info;
   return get_cmp_info(op, &info) ? info.vcmpx : aco_opcode::num_opcodes;
}

unsigned
get_cmp_bitsize(aco_opcode op)
{
   CmpInfo info;
   return get_cmp_info(op, &info) ? info.size : 0;
}

bool
is_cmp(aco_opcode op)
{
   CmpInfo info;
   return get_cmp_info(op, &info) && info.ordered != aco_opcode::num_opcodes;
}

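/* Whether the first two operands can be exchanged, and which opcode the
 * instruction must use afterwards: commutative instructions keep their
 * opcode, subtractions switch to their *rev form, and comparisons switch to
 * the swapped predicate from get_cmp_info.
 */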
bool
can_swap_operands(aco_ptr<Instruction>& instr, aco_opcode* new_op)
{
   if (instr->isDPP())
      return false;

   if (instr->operands[0].isConstant() ||
       (instr->operands[0].isTemp() && instr->operands[0].getTemp().type() == RegType::sgpr))
      return false;

   switch (instr->opcode) {
   case aco_opcode::v_add_u32:
   case aco_opcode::v_add_co_u32:
   case aco_opcode::v_add_co_u32_e64:
   case aco_opcode::v_add_i32:
   case aco_opcode::v_add_f16:
   case aco_opcode::v_add_f32:
   case aco_opcode::v_mul_f16:
   case aco_opcode::v_mul_f32:
   case aco_opcode::v_or_b32:
   case aco_opcode::v_and_b32:
   case aco_opcode::v_xor_b32:
   case aco_opcode::v_max_f16:
   case aco_opcode::v_max_f32:
   case aco_opcode::v_min_f16:
   case aco_opcode::v_min_f32:
   case aco_opcode::v_max_i32:
   case aco_opcode::v_min_i32:
   case aco_opcode::v_max_u32:
   case aco_opcode::v_min_u32:
   case aco_opcode::v_max_i16:
   case aco_opcode::v_min_i16:
   case aco_opcode::v_max_u16:
   case aco_opcode::v_min_u16:
   case aco_opcode::v_max_i16_e64:
   case aco_opcode::v_min_i16_e64:
   case aco_opcode::v_max_u16_e64:
   case aco_opcode::v_min_u16_e64: *new_op = instr->opcode; return true;
   case aco_opcode::v_sub_f16: *new_op = aco_opcode::v_subrev_f16; return true;
   case aco_opcode::v_sub_f32: *new_op = aco_opcode::v_subrev_f32; return true;
   case aco_opcode::v_sub_co_u32: *new_op = aco_opcode::v_subrev_co_u32; return true;
   case aco_opcode::v_sub_u16: *new_op = aco_opcode::v_subrev_u16; return true;
   case aco_opcode::v_sub_u32: *new_op = aco_opcode::v_subrev_u32; return true;
   default: {
      CmpInfo info;
      if (get_cmp_info(instr->opcode, &info) && info.swapped != aco_opcode::num_opcodes) {
         *new_op = info.swapped;
         return true;
      }
      return false;
   }
   }
}

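/* wait_imm models the counters of an s_waitcnt immediate; a counter left at
 * unset_counter means "don't wait on it". An illustrative sketch of typical
 * use (only names from this file):
 *
 *    wait_imm imm;                            // all counters unset
 *    imm.lgkm = 0;                            // wait for all outstanding LGKM ops
 *    uint16_t encoded = imm.pack(gfx_level);  // immediate for s_waitcnt
 */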
wait_imm::wait_imm() : vm(unset_counter), exp(unset_counter), lgkm(unset_counter), vs(unset_counter)
{}
wait_imm::wait_imm(uint16_t vm_, uint16_t exp_, uint16_t lgkm_, uint16_t vs_)
    : vm(vm_), exp(exp_), lgkm(lgkm_), vs(vs_)
{}

wait_imm::wait_imm(enum amd_gfx_level gfx_level, uint16_t packed) : vs(unset_counter)
{
   vm = packed & 0xf;
   if (gfx_level >= GFX9)
      vm |= (packed >> 10) & 0x30;

   exp = (packed >> 4) & 0x7;

   lgkm = (packed >> 8) & 0xf;
   if (gfx_level >= GFX10)
      lgkm |= (packed >> 8) & 0x30;
}

uint16_t
wait_imm::pack(enum amd_gfx_level gfx_level) const
{
   uint16_t imm = 0;
   assert(exp == unset_counter || exp <= 0x7);
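   /* Immediate layouts, as implied by the shifts below:
    *   GFX11:      vm[5:0] in bits 15:10, lgkm[5:0] in bits 9:4, exp[2:0] in bits 2:0
    *   GFX10/10.3: vm[5:4] in bits 15:14, lgkm[5:0] in bits 13:8, exp in 6:4, vm[3:0] in 3:0
    *   GFX9:       vm[5:4] in bits 15:14, lgkm[3:0] in bits 11:8, exp in 6:4, vm[3:0] in 3:0
    *   older:      lgkm[3:0] in bits 11:8, exp in 6:4, vm[3:0] in 3:0
    */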
   switch (gfx_level) {
   case GFX11:
      assert(lgkm == unset_counter || lgkm <= 0x3f);
      assert(vm == unset_counter || vm <= 0x3f);
      imm = ((vm & 0x3f) << 10) | ((lgkm & 0x3f) << 4) | (exp & 0x7);
      break;
   case GFX10:
   case GFX10_3:
      assert(lgkm == unset_counter || lgkm <= 0x3f);
      assert(vm == unset_counter || vm <= 0x3f);
      imm = ((vm & 0x30) << 10) | ((lgkm & 0x3f) << 8) | ((exp & 0x7) << 4) | (vm & 0xf);
      break;
   case GFX9:
      assert(lgkm == unset_counter || lgkm <= 0xf);
      assert(vm == unset_counter || vm <= 0x3f);
      imm = ((vm & 0x30) << 10) | ((lgkm & 0xf) << 8) | ((exp & 0x7) << 4) | (vm & 0xf);
      break;
   default:
      assert(lgkm == unset_counter || lgkm <= 0xf);
      assert(vm == unset_counter || vm <= 0xf);
      imm = ((lgkm & 0xf) << 8) | ((exp & 0x7) << 4) | (vm & 0xf);
      break;
   }
   if (gfx_level < GFX9 && vm == wait_imm::unset_counter)
      imm |= 0xc000; /* should have no effect on pre-GFX9 and now we won't have to worry about the
                        architecture when interpreting the immediate */
   if (gfx_level < GFX10 && lgkm == wait_imm::unset_counter)
      imm |= 0x3000; /* should have no effect on pre-GFX10 and now we won't have to worry about the
                        architecture when interpreting the immediate */
   return imm;
}

bool
wait_imm::combine(const wait_imm& other)
{
   bool changed = other.vm < vm || other.exp < exp || other.lgkm < lgkm || other.vs < vs;
   vm = std::min(vm, other.vm);
   exp = std::min(exp, other.exp);
   lgkm = std::min(lgkm, other.lgkm);
   vs = std::min(vs, other.vs);
   return changed;
}

bool
wait_imm::empty() const
{
   return vm == unset_counter && exp == unset_counter && lgkm == unset_counter &&
          vs == unset_counter;
}

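/* Heuristic used by the schedulers: group "a" and "b" into one memory clause
 * when they plausibly access nearby addresses, so the hardware can issue them
 * back-to-back.
 */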
bool
should_form_clause(const Instruction* a, const Instruction* b)
{
   /* Vertex attribute loads from the same binding likely load from similar addresses */
   unsigned a_vtx_binding =
      a->isMUBUF() ? a->mubuf().vtx_binding : (a->isMTBUF() ? a->mtbuf().vtx_binding : 0);
   unsigned b_vtx_binding =
      b->isMUBUF() ? b->mubuf().vtx_binding : (b->isMTBUF() ? b->mtbuf().vtx_binding : 0);
   if (a_vtx_binding && a_vtx_binding == b_vtx_binding)
      return true;

   if (a->format != b->format)
      return false;

   /* Assume loads which don't use descriptors might load from similar addresses. */
   if (a->isFlatLike())
      return true;
   if (a->isSMEM() && a->operands[0].bytes() == 8 && b->operands[0].bytes() == 8)
      return true;

   /* If they load from the same descriptor, assume they might load from similar
    * addresses.
    */
   if (a->isVMEM() || a->isSMEM())
      return a->operands[0].tempId() == b->operands[0].tempId();

   return false;
}

} // namespace aco