1 /* 2 * Copyright © 2020 Valve Corporation 3 * 4 * Permission is hereby granted, free of charge, to any person obtaining a 5 * copy of this software and associated documentation files (the "Software"), 6 * to deal in the Software without restriction, including without limitation 7 * the rights to use, copy, modify, merge, publish, distribute, sublicense, 8 * and/or sell copies of the Software, and to permit persons to whom the 9 * Software is furnished to do so, subject to the following conditions: 10 * 11 * The above copyright notice and this permission notice (including the next 12 * paragraph) shall be included in all copies or substantial portions of the 13 * Software. 14 * 15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL 18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING 20 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS 21 * IN THE SOFTWARE. 22 * 23 */ 24 25 #include "aco_ir.h" 26 27 #include "aco_builder.h" 28 29 #include "util/debug.h" 30 31 #include "c11/threads.h" 32 33 namespace aco { 34 35 uint64_t debug_flags = 0; 36 37 static const struct debug_control aco_debug_options[] = {{"validateir", DEBUG_VALIDATE_IR}, 38 {"validatera", DEBUG_VALIDATE_RA}, 39 {"perfwarn", DEBUG_PERFWARN}, 40 {"force-waitcnt", DEBUG_FORCE_WAITCNT}, 41 {"novn", DEBUG_NO_VN}, 42 {"noopt", DEBUG_NO_OPT}, 43 {"nosched", DEBUG_NO_SCHED}, 44 {"perfinfo", DEBUG_PERF_INFO}, 45 {"liveinfo", DEBUG_LIVE_INFO}, 46 {NULL, 0}}; 47 48 static once_flag init_once_flag = ONCE_FLAG_INIT; 49 50 static void init_once()51 init_once() 52 { 53 debug_flags = parse_debug_string(getenv("ACO_DEBUG"), aco_debug_options); 54 55 #ifndef NDEBUG 56 /* enable some flags by default on debug builds */ 57 debug_flags |= aco::DEBUG_VALIDATE_IR; 58 #endif 59 } 60 61 void init()62 init() 63 { 64 call_once(&init_once_flag, init_once); 65 } 66 67 void init_program(Program * program,Stage stage,const struct radv_shader_info * info,enum chip_class chip_class,enum radeon_family family,bool wgp_mode,ac_shader_config * config)68 init_program(Program* program, Stage stage, const struct radv_shader_info* info, 69 enum chip_class chip_class, enum radeon_family family, bool wgp_mode, 70 ac_shader_config* config) 71 { 72 program->stage = stage; 73 program->config = config; 74 program->info = info; 75 program->chip_class = chip_class; 76 if (family == CHIP_UNKNOWN) { 77 switch (chip_class) { 78 case GFX6: program->family = CHIP_TAHITI; break; 79 case GFX7: program->family = CHIP_BONAIRE; break; 80 case GFX8: program->family = CHIP_POLARIS10; break; 81 case GFX9: program->family = CHIP_VEGA10; break; 82 case GFX10: program->family = CHIP_NAVI10; break; 83 default: program->family = CHIP_UNKNOWN; break; 84 } 85 } else { 86 program->family = family; 87 } 88 program->wave_size = info->wave_size; 89 program->lane_mask = program->wave_size == 32 ? s1 : s2; 90 91 program->dev.lds_encoding_granule = chip_class >= GFX7 ? 512 : 256; 92 program->dev.lds_alloc_granule = 93 chip_class >= GFX10_3 ? 1024 : program->dev.lds_encoding_granule; 94 program->dev.lds_limit = chip_class >= GFX7 ? 65536 : 32768; 95 /* apparently gfx702 also has 16-bank LDS but I can't find a family for that */ 96 program->dev.has_16bank_lds = family == CHIP_KABINI || family == CHIP_STONEY; 97 98 program->dev.vgpr_limit = 256; 99 program->dev.physical_vgprs = 256; 100 program->dev.vgpr_alloc_granule = 4; 101 102 if (chip_class >= GFX10) { 103 program->dev.physical_sgprs = 5120; /* doesn't matter as long as it's at least 128 * 40 */ 104 program->dev.physical_vgprs = program->wave_size == 32 ? 1024 : 512; 105 program->dev.sgpr_alloc_granule = 128; 106 program->dev.sgpr_limit = 107 108; /* includes VCC, which can be treated as s[106-107] on GFX10+ */ 108 if (chip_class >= GFX10_3) 109 program->dev.vgpr_alloc_granule = program->wave_size == 32 ? 16 : 8; 110 else 111 program->dev.vgpr_alloc_granule = program->wave_size == 32 ? 8 : 4; 112 } else if (program->chip_class >= GFX8) { 113 program->dev.physical_sgprs = 800; 114 program->dev.sgpr_alloc_granule = 16; 115 program->dev.sgpr_limit = 102; 116 if (family == CHIP_TONGA || family == CHIP_ICELAND) 117 program->dev.sgpr_alloc_granule = 96; /* workaround hardware bug */ 118 } else { 119 program->dev.physical_sgprs = 512; 120 program->dev.sgpr_alloc_granule = 8; 121 program->dev.sgpr_limit = 104; 122 } 123 124 program->dev.max_wave64_per_simd = 10; 125 if (program->chip_class >= GFX10_3) 126 program->dev.max_wave64_per_simd = 16; 127 else if (program->chip_class == GFX10) 128 program->dev.max_wave64_per_simd = 20; 129 else if (program->family >= CHIP_POLARIS10 && program->family <= CHIP_VEGAM) 130 program->dev.max_wave64_per_simd = 8; 131 132 program->dev.simd_per_cu = program->chip_class >= GFX10 ? 2 : 4; 133 134 switch (program->family) { 135 /* GFX8 APUs */ 136 case CHIP_CARRIZO: 137 case CHIP_STONEY: 138 /* GFX9 APUS */ 139 case CHIP_RAVEN: 140 case CHIP_RAVEN2: 141 case CHIP_RENOIR: program->dev.xnack_enabled = true; break; 142 default: break; 143 } 144 145 program->dev.sram_ecc_enabled = program->family == CHIP_ARCTURUS; 146 /* apparently gfx702 also has fast v_fma_f32 but I can't find a family for that */ 147 program->dev.has_fast_fma32 = program->chip_class >= GFX9; 148 if (program->family == CHIP_TAHITI || program->family == CHIP_CARRIZO || 149 program->family == CHIP_HAWAII) 150 program->dev.has_fast_fma32 = true; 151 152 program->wgp_mode = wgp_mode; 153 154 program->progress = CompilationProgress::after_isel; 155 156 program->next_fp_mode.preserve_signed_zero_inf_nan32 = false; 157 program->next_fp_mode.preserve_signed_zero_inf_nan16_64 = false; 158 program->next_fp_mode.must_flush_denorms32 = false; 159 program->next_fp_mode.must_flush_denorms16_64 = false; 160 program->next_fp_mode.care_about_round32 = false; 161 program->next_fp_mode.care_about_round16_64 = false; 162 program->next_fp_mode.denorm16_64 = fp_denorm_keep; 163 program->next_fp_mode.denorm32 = 0; 164 program->next_fp_mode.round16_64 = fp_round_ne; 165 program->next_fp_mode.round32 = fp_round_ne; 166 } 167 168 memory_sync_info get_sync_info(const Instruction * instr)169 get_sync_info(const Instruction* instr) 170 { 171 switch (instr->format) { 172 case Format::SMEM: return instr->smem().sync; 173 case Format::MUBUF: return instr->mubuf().sync; 174 case Format::MIMG: return instr->mimg().sync; 175 case Format::MTBUF: return instr->mtbuf().sync; 176 case Format::FLAT: 177 case Format::GLOBAL: 178 case Format::SCRATCH: return instr->flatlike().sync; 179 case Format::DS: return instr->ds().sync; 180 default: return memory_sync_info(); 181 } 182 } 183 184 bool can_use_SDWA(chip_class chip,const aco_ptr<Instruction> & instr,bool pre_ra)185 can_use_SDWA(chip_class chip, const aco_ptr<Instruction>& instr, bool pre_ra) 186 { 187 if (!instr->isVALU()) 188 return false; 189 190 if (chip < GFX8 || instr->isDPP()) 191 return false; 192 193 if (instr->isSDWA()) 194 return true; 195 196 if (instr->isVOP3()) { 197 VOP3_instruction& vop3 = instr->vop3(); 198 if (instr->format == Format::VOP3) 199 return false; 200 if (vop3.clamp && instr->isVOPC() && chip != GFX8) 201 return false; 202 if (vop3.omod && chip < GFX9) 203 return false; 204 205 // TODO: return true if we know we will use vcc 206 if (!pre_ra && instr->definitions.size() >= 2) 207 return false; 208 209 for (unsigned i = 1; i < instr->operands.size(); i++) { 210 if (instr->operands[i].isLiteral()) 211 return false; 212 if (chip < GFX9 && !instr->operands[i].isOfType(RegType::vgpr)) 213 return false; 214 } 215 } 216 217 if (!instr->definitions.empty() && instr->definitions[0].bytes() > 4 && !instr->isVOPC()) 218 return false; 219 220 if (!instr->operands.empty()) { 221 if (instr->operands[0].isLiteral()) 222 return false; 223 if (chip < GFX9 && !instr->operands[0].isOfType(RegType::vgpr)) 224 return false; 225 if (instr->operands[0].bytes() > 4) 226 return false; 227 if (instr->operands.size() > 1 && instr->operands[1].bytes() > 4) 228 return false; 229 } 230 231 bool is_mac = instr->opcode == aco_opcode::v_mac_f32 || instr->opcode == aco_opcode::v_mac_f16 || 232 instr->opcode == aco_opcode::v_fmac_f32 || instr->opcode == aco_opcode::v_fmac_f16; 233 234 if (chip != GFX8 && is_mac) 235 return false; 236 237 // TODO: return true if we know we will use vcc 238 if (!pre_ra && instr->isVOPC() && chip == GFX8) 239 return false; 240 if (!pre_ra && instr->operands.size() >= 3 && !is_mac) 241 return false; 242 243 return instr->opcode != aco_opcode::v_madmk_f32 && instr->opcode != aco_opcode::v_madak_f32 && 244 instr->opcode != aco_opcode::v_madmk_f16 && instr->opcode != aco_opcode::v_madak_f16 && 245 instr->opcode != aco_opcode::v_readfirstlane_b32 && 246 instr->opcode != aco_opcode::v_clrexcp && instr->opcode != aco_opcode::v_swap_b32; 247 } 248 249 /* updates "instr" and returns the old instruction (or NULL if no update was needed) */ 250 aco_ptr<Instruction> convert_to_SDWA(chip_class chip,aco_ptr<Instruction> & instr)251 convert_to_SDWA(chip_class chip, aco_ptr<Instruction>& instr) 252 { 253 if (instr->isSDWA()) 254 return NULL; 255 256 aco_ptr<Instruction> tmp = std::move(instr); 257 Format format = 258 (Format)(((uint16_t)tmp->format & ~(uint16_t)Format::VOP3) | (uint16_t)Format::SDWA); 259 instr.reset(create_instruction<SDWA_instruction>(tmp->opcode, format, tmp->operands.size(), 260 tmp->definitions.size())); 261 std::copy(tmp->operands.cbegin(), tmp->operands.cend(), instr->operands.begin()); 262 std::copy(tmp->definitions.cbegin(), tmp->definitions.cend(), instr->definitions.begin()); 263 264 SDWA_instruction& sdwa = instr->sdwa(); 265 266 if (tmp->isVOP3()) { 267 VOP3_instruction& vop3 = tmp->vop3(); 268 memcpy(sdwa.neg, vop3.neg, sizeof(sdwa.neg)); 269 memcpy(sdwa.abs, vop3.abs, sizeof(sdwa.abs)); 270 sdwa.omod = vop3.omod; 271 sdwa.clamp = vop3.clamp; 272 } 273 274 for (unsigned i = 0; i < instr->operands.size(); i++) { 275 /* SDWA only uses operands 0 and 1. */ 276 if (i >= 2) 277 break; 278 279 sdwa.sel[i] = SubdwordSel(instr->operands[i].bytes(), 0, false); 280 } 281 282 sdwa.dst_sel = SubdwordSel(instr->definitions[0].bytes(), 0, false); 283 284 if (instr->definitions[0].getTemp().type() == RegType::sgpr && chip == GFX8) 285 instr->definitions[0].setFixed(vcc); 286 if (instr->definitions.size() >= 2) 287 instr->definitions[1].setFixed(vcc); 288 if (instr->operands.size() >= 3) 289 instr->operands[2].setFixed(vcc); 290 291 return tmp; 292 } 293 294 bool can_use_DPP(const aco_ptr<Instruction> & instr,bool pre_ra)295 can_use_DPP(const aco_ptr<Instruction>& instr, bool pre_ra) 296 { 297 assert(instr->isVALU() && !instr->operands.empty()); 298 299 if (instr->isDPP()) 300 return true; 301 302 if (instr->operands.size() && instr->operands[0].isLiteral()) 303 return false; 304 305 if (instr->isSDWA()) 306 return false; 307 308 if (!pre_ra && (instr->isVOPC() || instr->definitions.size() > 1) && 309 instr->definitions.back().physReg() != vcc) 310 return false; 311 312 if (!pre_ra && instr->operands.size() >= 3 && instr->operands[2].physReg() != vcc) 313 return false; 314 315 if (instr->isVOP3()) { 316 const VOP3_instruction* vop3 = &instr->vop3(); 317 if (vop3->clamp || vop3->omod || vop3->opsel) 318 return false; 319 if (instr->format == Format::VOP3) 320 return false; 321 if (instr->operands.size() > 1 && !instr->operands[1].isOfType(RegType::vgpr)) 322 return false; 323 } 324 325 /* there are more cases but those all take 64-bit inputs */ 326 return instr->opcode != aco_opcode::v_madmk_f32 && instr->opcode != aco_opcode::v_madak_f32 && 327 instr->opcode != aco_opcode::v_madmk_f16 && instr->opcode != aco_opcode::v_madak_f16 && 328 instr->opcode != aco_opcode::v_readfirstlane_b32 && 329 instr->opcode != aco_opcode::v_cvt_f64_i32 && 330 instr->opcode != aco_opcode::v_cvt_f64_f32 && instr->opcode != aco_opcode::v_cvt_f64_u32; 331 } 332 333 aco_ptr<Instruction> convert_to_DPP(aco_ptr<Instruction> & instr)334 convert_to_DPP(aco_ptr<Instruction>& instr) 335 { 336 if (instr->isDPP()) 337 return NULL; 338 339 aco_ptr<Instruction> tmp = std::move(instr); 340 Format format = 341 (Format)(((uint32_t)tmp->format & ~(uint32_t)Format::VOP3) | (uint32_t)Format::DPP); 342 instr.reset(create_instruction<DPP_instruction>(tmp->opcode, format, tmp->operands.size(), 343 tmp->definitions.size())); 344 std::copy(tmp->operands.cbegin(), tmp->operands.cend(), instr->operands.begin()); 345 for (unsigned i = 0; i < instr->definitions.size(); i++) 346 instr->definitions[i] = tmp->definitions[i]; 347 348 DPP_instruction* dpp = &instr->dpp(); 349 dpp->dpp_ctrl = dpp_quad_perm(0, 1, 2, 3); 350 dpp->row_mask = 0xf; 351 dpp->bank_mask = 0xf; 352 353 if (tmp->isVOP3()) { 354 const VOP3_instruction* vop3 = &tmp->vop3(); 355 memcpy(dpp->neg, vop3->neg, sizeof(dpp->neg)); 356 memcpy(dpp->abs, vop3->abs, sizeof(dpp->abs)); 357 } 358 359 if (instr->isVOPC() || instr->definitions.size() > 1) 360 instr->definitions.back().setFixed(vcc); 361 362 if (instr->operands.size() >= 3) 363 instr->operands[2].setFixed(vcc); 364 365 return tmp; 366 } 367 368 bool can_use_opsel(chip_class chip,aco_opcode op,int idx,bool high)369 can_use_opsel(chip_class chip, aco_opcode op, int idx, bool high) 370 { 371 /* opsel is only GFX9+ */ 372 if ((high || idx == -1) && chip < GFX9) 373 return false; 374 375 switch (op) { 376 case aco_opcode::v_div_fixup_f16: 377 case aco_opcode::v_fma_f16: 378 case aco_opcode::v_mad_f16: 379 case aco_opcode::v_mad_u16: 380 case aco_opcode::v_mad_i16: 381 case aco_opcode::v_med3_f16: 382 case aco_opcode::v_med3_i16: 383 case aco_opcode::v_med3_u16: 384 case aco_opcode::v_min3_f16: 385 case aco_opcode::v_min3_i16: 386 case aco_opcode::v_min3_u16: 387 case aco_opcode::v_max3_f16: 388 case aco_opcode::v_max3_i16: 389 case aco_opcode::v_max3_u16: 390 case aco_opcode::v_max_u16_e64: 391 case aco_opcode::v_max_i16_e64: 392 case aco_opcode::v_min_u16_e64: 393 case aco_opcode::v_min_i16_e64: 394 case aco_opcode::v_add_i16: 395 case aco_opcode::v_sub_i16: 396 case aco_opcode::v_add_u16_e64: 397 case aco_opcode::v_sub_u16_e64: 398 case aco_opcode::v_lshlrev_b16_e64: 399 case aco_opcode::v_lshrrev_b16_e64: 400 case aco_opcode::v_ashrrev_i16_e64: 401 case aco_opcode::v_mul_lo_u16_e64: return true; 402 case aco_opcode::v_pack_b32_f16: 403 case aco_opcode::v_cvt_pknorm_i16_f16: 404 case aco_opcode::v_cvt_pknorm_u16_f16: return idx != -1; 405 case aco_opcode::v_mad_u32_u16: 406 case aco_opcode::v_mad_i32_i16: return idx >= 0 && idx < 2; 407 default: return false; 408 } 409 } 410 411 bool instr_is_16bit(chip_class chip,aco_opcode op)412 instr_is_16bit(chip_class chip, aco_opcode op) 413 { 414 /* partial register writes are GFX9+, only */ 415 if (chip < GFX9) 416 return false; 417 418 switch (op) { 419 /* VOP3 */ 420 case aco_opcode::v_mad_f16: 421 case aco_opcode::v_mad_u16: 422 case aco_opcode::v_mad_i16: 423 case aco_opcode::v_fma_f16: 424 case aco_opcode::v_div_fixup_f16: 425 case aco_opcode::v_interp_p2_f16: 426 case aco_opcode::v_fma_mixlo_f16: 427 /* VOP2 */ 428 case aco_opcode::v_mac_f16: 429 case aco_opcode::v_madak_f16: 430 case aco_opcode::v_madmk_f16: return chip >= GFX9; 431 case aco_opcode::v_add_f16: 432 case aco_opcode::v_sub_f16: 433 case aco_opcode::v_subrev_f16: 434 case aco_opcode::v_mul_f16: 435 case aco_opcode::v_max_f16: 436 case aco_opcode::v_min_f16: 437 case aco_opcode::v_ldexp_f16: 438 case aco_opcode::v_fmac_f16: 439 case aco_opcode::v_fmamk_f16: 440 case aco_opcode::v_fmaak_f16: 441 /* VOP1 */ 442 case aco_opcode::v_cvt_f16_f32: 443 case aco_opcode::v_cvt_f16_u16: 444 case aco_opcode::v_cvt_f16_i16: 445 case aco_opcode::v_rcp_f16: 446 case aco_opcode::v_sqrt_f16: 447 case aco_opcode::v_rsq_f16: 448 case aco_opcode::v_log_f16: 449 case aco_opcode::v_exp_f16: 450 case aco_opcode::v_frexp_mant_f16: 451 case aco_opcode::v_frexp_exp_i16_f16: 452 case aco_opcode::v_floor_f16: 453 case aco_opcode::v_ceil_f16: 454 case aco_opcode::v_trunc_f16: 455 case aco_opcode::v_rndne_f16: 456 case aco_opcode::v_fract_f16: 457 case aco_opcode::v_sin_f16: 458 case aco_opcode::v_cos_f16: return chip >= GFX10; 459 // TODO: confirm whether these write 16 or 32 bit on GFX10+ 460 // case aco_opcode::v_cvt_u16_f16: 461 // case aco_opcode::v_cvt_i16_f16: 462 // case aco_opcode::p_cvt_f16_f32_rtne: 463 // case aco_opcode::v_cvt_norm_i16_f16: 464 // case aco_opcode::v_cvt_norm_u16_f16: 465 /* on GFX10, all opsel instructions preserve the high bits */ 466 default: return chip >= GFX10 && can_use_opsel(chip, op, -1, false); 467 } 468 } 469 470 uint32_t get_reduction_identity(ReduceOp op,unsigned idx)471 get_reduction_identity(ReduceOp op, unsigned idx) 472 { 473 switch (op) { 474 case iadd8: 475 case iadd16: 476 case iadd32: 477 case iadd64: 478 case fadd16: 479 case fadd32: 480 case fadd64: 481 case ior8: 482 case ior16: 483 case ior32: 484 case ior64: 485 case ixor8: 486 case ixor16: 487 case ixor32: 488 case ixor64: 489 case umax8: 490 case umax16: 491 case umax32: 492 case umax64: return 0; 493 case imul8: 494 case imul16: 495 case imul32: 496 case imul64: return idx ? 0 : 1; 497 case fmul16: return 0x3c00u; /* 1.0 */ 498 case fmul32: return 0x3f800000u; /* 1.0 */ 499 case fmul64: return idx ? 0x3ff00000u : 0u; /* 1.0 */ 500 case imin8: return INT8_MAX; 501 case imin16: return INT16_MAX; 502 case imin32: return INT32_MAX; 503 case imin64: return idx ? 0x7fffffffu : 0xffffffffu; 504 case imax8: return INT8_MIN; 505 case imax16: return INT16_MIN; 506 case imax32: return INT32_MIN; 507 case imax64: return idx ? 0x80000000u : 0; 508 case umin8: 509 case umin16: 510 case iand8: 511 case iand16: return 0xffffffffu; 512 case umin32: 513 case umin64: 514 case iand32: 515 case iand64: return 0xffffffffu; 516 case fmin16: return 0x7c00u; /* infinity */ 517 case fmin32: return 0x7f800000u; /* infinity */ 518 case fmin64: return idx ? 0x7ff00000u : 0u; /* infinity */ 519 case fmax16: return 0xfc00u; /* negative infinity */ 520 case fmax32: return 0xff800000u; /* negative infinity */ 521 case fmax64: return idx ? 0xfff00000u : 0u; /* negative infinity */ 522 default: unreachable("Invalid reduction operation"); break; 523 } 524 return 0; 525 } 526 527 bool needs_exec_mask(const Instruction * instr)528 needs_exec_mask(const Instruction* instr) 529 { 530 if (instr->isVALU()) { 531 return instr->opcode != aco_opcode::v_readlane_b32 && 532 instr->opcode != aco_opcode::v_readlane_b32_e64 && 533 instr->opcode != aco_opcode::v_writelane_b32 && 534 instr->opcode != aco_opcode::v_writelane_b32_e64; 535 } 536 537 if (instr->isVMEM() || instr->isFlatLike()) 538 return true; 539 540 if (instr->isSALU() || instr->isBranch() || instr->isSMEM() || instr->isBarrier()) 541 return instr->reads_exec(); 542 543 if (instr->isPseudo()) { 544 switch (instr->opcode) { 545 case aco_opcode::p_create_vector: 546 case aco_opcode::p_extract_vector: 547 case aco_opcode::p_split_vector: 548 case aco_opcode::p_phi: 549 case aco_opcode::p_parallelcopy: 550 for (Definition def : instr->definitions) { 551 if (def.getTemp().type() == RegType::vgpr) 552 return true; 553 } 554 return instr->reads_exec(); 555 case aco_opcode::p_spill: 556 case aco_opcode::p_reload: 557 case aco_opcode::p_logical_start: 558 case aco_opcode::p_logical_end: 559 case aco_opcode::p_startpgm: return instr->reads_exec(); 560 default: break; 561 } 562 } 563 564 return true; 565 } 566 567 struct CmpInfo { 568 aco_opcode ordered; 569 aco_opcode unordered; 570 aco_opcode ordered_swapped; 571 aco_opcode unordered_swapped; 572 aco_opcode inverse; 573 aco_opcode f32; 574 unsigned size; 575 }; 576 577 ALWAYS_INLINE bool get_cmp_info(aco_opcode op,CmpInfo * info)578 get_cmp_info(aco_opcode op, CmpInfo* info) 579 { 580 info->ordered = aco_opcode::num_opcodes; 581 info->unordered = aco_opcode::num_opcodes; 582 info->ordered_swapped = aco_opcode::num_opcodes; 583 info->unordered_swapped = aco_opcode::num_opcodes; 584 switch (op) { 585 // clang-format off 586 #define CMP2(ord, unord, ord_swap, unord_swap, sz) \ 587 case aco_opcode::v_cmp_##ord##_f##sz: \ 588 case aco_opcode::v_cmp_n##unord##_f##sz: \ 589 info->ordered = aco_opcode::v_cmp_##ord##_f##sz; \ 590 info->unordered = aco_opcode::v_cmp_n##unord##_f##sz; \ 591 info->ordered_swapped = aco_opcode::v_cmp_##ord_swap##_f##sz; \ 592 info->unordered_swapped = aco_opcode::v_cmp_n##unord_swap##_f##sz; \ 593 info->inverse = op == aco_opcode::v_cmp_n##unord##_f##sz ? aco_opcode::v_cmp_##unord##_f##sz \ 594 : aco_opcode::v_cmp_n##ord##_f##sz; \ 595 info->f32 = op == aco_opcode::v_cmp_##ord##_f##sz ? aco_opcode::v_cmp_##ord##_f32 \ 596 : aco_opcode::v_cmp_n##unord##_f32; \ 597 info->size = sz; \ 598 return true; 599 #define CMP(ord, unord, ord_swap, unord_swap) \ 600 CMP2(ord, unord, ord_swap, unord_swap, 16) \ 601 CMP2(ord, unord, ord_swap, unord_swap, 32) \ 602 CMP2(ord, unord, ord_swap, unord_swap, 64) 603 CMP(lt, /*n*/ge, gt, /*n*/le) 604 CMP(eq, /*n*/lg, eq, /*n*/lg) 605 CMP(le, /*n*/gt, ge, /*n*/lt) 606 CMP(gt, /*n*/le, lt, /*n*/le) 607 CMP(lg, /*n*/eq, lg, /*n*/eq) 608 CMP(ge, /*n*/lt, le, /*n*/gt) 609 #undef CMP 610 #undef CMP2 611 #define ORD_TEST(sz) \ 612 case aco_opcode::v_cmp_u_f##sz: \ 613 info->f32 = aco_opcode::v_cmp_u_f32; \ 614 info->inverse = aco_opcode::v_cmp_o_f##sz; \ 615 info->size = sz; \ 616 return true; \ 617 case aco_opcode::v_cmp_o_f##sz: \ 618 info->f32 = aco_opcode::v_cmp_o_f32; \ 619 info->inverse = aco_opcode::v_cmp_u_f##sz; \ 620 info->size = sz; \ 621 return true; 622 ORD_TEST(16) 623 ORD_TEST(32) 624 ORD_TEST(64) 625 #undef ORD_TEST 626 // clang-format on 627 default: return false; 628 } 629 } 630 631 aco_opcode get_ordered(aco_opcode op)632 get_ordered(aco_opcode op) 633 { 634 CmpInfo info; 635 return get_cmp_info(op, &info) ? info.ordered : aco_opcode::num_opcodes; 636 } 637 638 aco_opcode get_unordered(aco_opcode op)639 get_unordered(aco_opcode op) 640 { 641 CmpInfo info; 642 return get_cmp_info(op, &info) ? info.unordered : aco_opcode::num_opcodes; 643 } 644 645 aco_opcode get_inverse(aco_opcode op)646 get_inverse(aco_opcode op) 647 { 648 CmpInfo info; 649 return get_cmp_info(op, &info) ? info.inverse : aco_opcode::num_opcodes; 650 } 651 652 aco_opcode get_f32_cmp(aco_opcode op)653 get_f32_cmp(aco_opcode op) 654 { 655 CmpInfo info; 656 return get_cmp_info(op, &info) ? info.f32 : aco_opcode::num_opcodes; 657 } 658 659 unsigned get_cmp_bitsize(aco_opcode op)660 get_cmp_bitsize(aco_opcode op) 661 { 662 CmpInfo info; 663 return get_cmp_info(op, &info) ? info.size : 0; 664 } 665 666 bool is_cmp(aco_opcode op)667 is_cmp(aco_opcode op) 668 { 669 CmpInfo info; 670 return get_cmp_info(op, &info) && info.ordered != aco_opcode::num_opcodes; 671 } 672 673 bool can_swap_operands(aco_ptr<Instruction> & instr,aco_opcode * new_op)674 can_swap_operands(aco_ptr<Instruction>& instr, aco_opcode* new_op) 675 { 676 if (instr->isDPP()) 677 return false; 678 679 if (instr->operands[0].isConstant() || 680 (instr->operands[0].isTemp() && instr->operands[0].getTemp().type() == RegType::sgpr)) 681 return false; 682 683 switch (instr->opcode) { 684 case aco_opcode::v_add_u32: 685 case aco_opcode::v_add_co_u32: 686 case aco_opcode::v_add_co_u32_e64: 687 case aco_opcode::v_add_i32: 688 case aco_opcode::v_add_f16: 689 case aco_opcode::v_add_f32: 690 case aco_opcode::v_mul_f16: 691 case aco_opcode::v_mul_f32: 692 case aco_opcode::v_or_b32: 693 case aco_opcode::v_and_b32: 694 case aco_opcode::v_xor_b32: 695 case aco_opcode::v_max_f16: 696 case aco_opcode::v_max_f32: 697 case aco_opcode::v_min_f16: 698 case aco_opcode::v_min_f32: 699 case aco_opcode::v_max_i32: 700 case aco_opcode::v_min_i32: 701 case aco_opcode::v_max_u32: 702 case aco_opcode::v_min_u32: 703 case aco_opcode::v_max_i16: 704 case aco_opcode::v_min_i16: 705 case aco_opcode::v_max_u16: 706 case aco_opcode::v_min_u16: 707 case aco_opcode::v_max_i16_e64: 708 case aco_opcode::v_min_i16_e64: 709 case aco_opcode::v_max_u16_e64: 710 case aco_opcode::v_min_u16_e64: *new_op = instr->opcode; return true; 711 case aco_opcode::v_sub_f16: *new_op = aco_opcode::v_subrev_f16; return true; 712 case aco_opcode::v_sub_f32: *new_op = aco_opcode::v_subrev_f32; return true; 713 case aco_opcode::v_sub_co_u32: *new_op = aco_opcode::v_subrev_co_u32; return true; 714 case aco_opcode::v_sub_u16: *new_op = aco_opcode::v_subrev_u16; return true; 715 case aco_opcode::v_sub_u32: *new_op = aco_opcode::v_subrev_u32; return true; 716 default: { 717 CmpInfo info; 718 get_cmp_info(instr->opcode, &info); 719 if (info.ordered == instr->opcode) { 720 *new_op = info.ordered_swapped; 721 return true; 722 } 723 if (info.unordered == instr->opcode) { 724 *new_op = info.unordered_swapped; 725 return true; 726 } 727 return false; 728 } 729 } 730 } 731 wait_imm()732 wait_imm::wait_imm() : vm(unset_counter), exp(unset_counter), lgkm(unset_counter), vs(unset_counter) 733 {} wait_imm(uint16_t vm_,uint16_t exp_,uint16_t lgkm_,uint16_t vs_)734 wait_imm::wait_imm(uint16_t vm_, uint16_t exp_, uint16_t lgkm_, uint16_t vs_) 735 : vm(vm_), exp(exp_), lgkm(lgkm_), vs(vs_) 736 {} 737 wait_imm(enum chip_class chip,uint16_t packed)738 wait_imm::wait_imm(enum chip_class chip, uint16_t packed) : vs(unset_counter) 739 { 740 vm = packed & 0xf; 741 if (chip >= GFX9) 742 vm |= (packed >> 10) & 0x30; 743 744 exp = (packed >> 4) & 0x7; 745 746 lgkm = (packed >> 8) & 0xf; 747 if (chip >= GFX10) 748 lgkm |= (packed >> 8) & 0x30; 749 } 750 751 uint16_t pack(enum chip_class chip) const752 wait_imm::pack(enum chip_class chip) const 753 { 754 uint16_t imm = 0; 755 assert(exp == unset_counter || exp <= 0x7); 756 switch (chip) { 757 case GFX10: 758 case GFX10_3: 759 assert(lgkm == unset_counter || lgkm <= 0x3f); 760 assert(vm == unset_counter || vm <= 0x3f); 761 imm = ((vm & 0x30) << 10) | ((lgkm & 0x3f) << 8) | ((exp & 0x7) << 4) | (vm & 0xf); 762 break; 763 case GFX9: 764 assert(lgkm == unset_counter || lgkm <= 0xf); 765 assert(vm == unset_counter || vm <= 0x3f); 766 imm = ((vm & 0x30) << 10) | ((lgkm & 0xf) << 8) | ((exp & 0x7) << 4) | (vm & 0xf); 767 break; 768 default: 769 assert(lgkm == unset_counter || lgkm <= 0xf); 770 assert(vm == unset_counter || vm <= 0xf); 771 imm = ((lgkm & 0xf) << 8) | ((exp & 0x7) << 4) | (vm & 0xf); 772 break; 773 } 774 if (chip < GFX9 && vm == wait_imm::unset_counter) 775 imm |= 0xc000; /* should have no effect on pre-GFX9 and now we won't have to worry about the 776 architecture when interpreting the immediate */ 777 if (chip < GFX10 && lgkm == wait_imm::unset_counter) 778 imm |= 0x3000; /* should have no effect on pre-GFX10 and now we won't have to worry about the 779 architecture when interpreting the immediate */ 780 return imm; 781 } 782 783 bool combine(const wait_imm & other)784 wait_imm::combine(const wait_imm& other) 785 { 786 bool changed = other.vm < vm || other.exp < exp || other.lgkm < lgkm || other.vs < vs; 787 vm = std::min(vm, other.vm); 788 exp = std::min(exp, other.exp); 789 lgkm = std::min(lgkm, other.lgkm); 790 vs = std::min(vs, other.vs); 791 return changed; 792 } 793 794 bool empty() const795 wait_imm::empty() const 796 { 797 return vm == unset_counter && exp == unset_counter && lgkm == unset_counter && 798 vs == unset_counter; 799 } 800 801 bool should_form_clause(const Instruction * a,const Instruction * b)802 should_form_clause(const Instruction* a, const Instruction* b) 803 { 804 /* Vertex attribute loads from the same binding likely load from similar addresses */ 805 unsigned a_vtx_binding = 806 a->isMUBUF() ? a->mubuf().vtx_binding : (a->isMTBUF() ? a->mtbuf().vtx_binding : 0); 807 unsigned b_vtx_binding = 808 b->isMUBUF() ? b->mubuf().vtx_binding : (b->isMTBUF() ? b->mtbuf().vtx_binding : 0); 809 if (a_vtx_binding && a_vtx_binding == b_vtx_binding) 810 return true; 811 812 if (a->format != b->format) 813 return false; 814 815 /* Assume loads which don't use descriptors might load from similar addresses. */ 816 if (a->isFlatLike()) 817 return true; 818 if (a->isSMEM() && a->operands[0].bytes() == 8 && b->operands[0].bytes() == 8) 819 return true; 820 821 /* If they load from the same descriptor, assume they might load from similar 822 * addresses. 823 */ 824 if (a->isVMEM() || a->isSMEM()) 825 return a->operands[0].tempId() == b->operands[0].tempId(); 826 827 return false; 828 } 829 830 } // namespace aco 831