• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 /*
2  * Copyright © 2020 Valve Corporation
3  *
4  * Permission is hereby granted, free of charge, to any person obtaining a
5  * copy of this software and associated documentation files (the "Software"),
6  * to deal in the Software without restriction, including without limitation
7  * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8  * and/or sell copies of the Software, and to permit persons to whom the
9  * Software is furnished to do so, subject to the following conditions:
10  *
11  * The above copyright notice and this permission notice (including the next
12  * paragraph) shall be included in all copies or substantial portions of the
13  * Software.
14  *
15  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
18  * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20  * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
21  * IN THE SOFTWARE.
22  *
23  */
24 
25 #include "aco_ir.h"
26 
27 #include "aco_builder.h"
28 
29 #include "util/u_debug.h"
30 
31 #include "c11/threads.h"
32 
33 namespace aco {
34 
/* Per-thread pointer to the arena used for Instruction allocations; it is
 * pointed at the current Program's monotonic buffer by init_program(). */
thread_local aco::monotonic_buffer_resource* instruction_buffer = nullptr;

/* Bitmask of DEBUG_* flags, populated once from the ACO_DEBUG environment
 * variable (see init_once()). */
uint64_t debug_flags = 0;

/* Maps ACO_DEBUG option names to their DEBUG_* flag bits; terminated by a
 * NULL entry as required by parse_debug_string(). */
static const struct debug_control aco_debug_options[] = {
   {"validateir", DEBUG_VALIDATE_IR},
   {"validatera", DEBUG_VALIDATE_RA},
   {"novalidateir", DEBUG_NO_VALIDATE_IR},
   {"perfwarn", DEBUG_PERFWARN},
   {"force-waitcnt", DEBUG_FORCE_WAITCNT},
   {"force-waitdeps", DEBUG_FORCE_WAITDEPS},
   {"novn", DEBUG_NO_VN},
   {"noopt", DEBUG_NO_OPT},
   {"nosched", DEBUG_NO_SCHED | DEBUG_NO_SCHED_ILP | DEBUG_NO_SCHED_VOPD},
   {"nosched-ilp", DEBUG_NO_SCHED_ILP},
   {"nosched-vopd", DEBUG_NO_SCHED_VOPD},
   {"perfinfo", DEBUG_PERF_INFO},
   {"liveinfo", DEBUG_LIVE_INFO},
   {NULL, 0}};

/* Guards the one-time initialization performed by init(). */
static once_flag init_once_flag = ONCE_FLAG_INIT;
56 
57 static void
init_once()58 init_once()
59 {
60    debug_flags = parse_debug_string(getenv("ACO_DEBUG"), aco_debug_options);
61 
62 #ifndef NDEBUG
63    /* enable some flags by default on debug builds */
64    debug_flags |= aco::DEBUG_VALIDATE_IR;
65 #endif
66 
67    if (debug_flags & aco::DEBUG_NO_VALIDATE_IR)
68       debug_flags &= ~aco::DEBUG_VALIDATE_IR;
69 }
70 
/* Thread-safe global ACO initialization: runs init_once() at most once per
 * process, regardless of how many threads call this. */
void
init()
{
   call_once(&init_once_flag, init_once);
}
76 
/* Initializes a Program's target-dependent state: picks a concrete chip when
 * family is CHIP_UNKNOWN, fills in per-device limits (LDS sizes, SGPR/VGPR
 * counts and allocation granules, wave limits, scratch offsets, NSA dword
 * limits), sets hardware feature flags, and resets the default FP mode used
 * during instruction selection.
 */
void
init_program(Program* program, Stage stage, const struct aco_shader_info* info,
             enum amd_gfx_level gfx_level, enum radeon_family family, bool wgp_mode,
             ac_shader_config* config)
{
   /* Route this thread's instruction allocations into the program's arena. */
   instruction_buffer = &program->m;
   program->stage = stage;
   program->config = config;
   program->info = *info;
   program->gfx_level = gfx_level;
   if (family == CHIP_UNKNOWN) {
      /* No specific chip given: pick a representative one per generation. */
      switch (gfx_level) {
      case GFX6: program->family = CHIP_TAHITI; break;
      case GFX7: program->family = CHIP_BONAIRE; break;
      case GFX8: program->family = CHIP_POLARIS10; break;
      case GFX9: program->family = CHIP_VEGA10; break;
      case GFX10: program->family = CHIP_NAVI10; break;
      case GFX10_3: program->family = CHIP_NAVI21; break;
      case GFX11: program->family = CHIP_NAVI31; break;
      default: program->family = CHIP_UNKNOWN; break;
      }
   } else {
      program->family = family;
   }
   program->wave_size = info->wave_size;
   /* The execution mask needs one bit per lane: one SGPR for wave32, two for
    * wave64. */
   program->lane_mask = program->wave_size == 32 ? s1 : s2;

   /* LDS size granularity as encoded in the program config vs. the actual
    * allocation granularity of the hardware. */
   program->dev.lds_encoding_granule = gfx_level >= GFX11 && stage == fragment_fs ? 1024
                                       : gfx_level >= GFX7                        ? 512
                                                                                  : 256;
   program->dev.lds_alloc_granule = gfx_level >= GFX10_3 ? 1024 : program->dev.lds_encoding_granule;

   /* GFX6: There is 64KB LDS per CU, but a single workgroup can only use 32KB. */
   program->dev.lds_limit = gfx_level >= GFX7 ? 65536 : 32768;

   /* apparently gfx702 also has 16-bank LDS but I can't find a family for that */
   program->dev.has_16bank_lds = family == CHIP_KABINI || family == CHIP_STONEY;

   /* Baseline VGPR limits; refined per generation/family below. */
   program->dev.vgpr_limit = 256;
   program->dev.physical_vgprs = 256;
   program->dev.vgpr_alloc_granule = 4;

   if (gfx_level >= GFX10) {
      program->dev.physical_sgprs = 128 * 20; /* enough for max waves */
      program->dev.sgpr_alloc_granule = 128;
      program->dev.sgpr_limit =
         108; /* includes VCC, which can be treated as s[106-107] on GFX10+ */

      if (family == CHIP_NAVI31 || family == CHIP_NAVI32) {
         /* These chips have a larger VGPR file; granule scales accordingly. */
         program->dev.physical_vgprs = program->wave_size == 32 ? 1536 : 768;
         program->dev.vgpr_alloc_granule = program->wave_size == 32 ? 24 : 12;
      } else {
         program->dev.physical_vgprs = program->wave_size == 32 ? 1024 : 512;
         if (gfx_level >= GFX10_3)
            program->dev.vgpr_alloc_granule = program->wave_size == 32 ? 16 : 8;
         else
            program->dev.vgpr_alloc_granule = program->wave_size == 32 ? 8 : 4;
      }
   } else if (program->gfx_level >= GFX8) {
      program->dev.physical_sgprs = 800;
      program->dev.sgpr_alloc_granule = 16;
      program->dev.sgpr_limit = 102;
      if (family == CHIP_TONGA || family == CHIP_ICELAND)
         program->dev.sgpr_alloc_granule = 96; /* workaround hardware bug */
   } else {
      program->dev.physical_sgprs = 512;
      program->dev.sgpr_alloc_granule = 8;
      program->dev.sgpr_limit = 104;
   }

   program->dev.scratch_alloc_granule = gfx_level >= GFX11 ? 256 : 1024;

   /* Wave occupancy limits per SIMD, by generation/family. */
   program->dev.max_waves_per_simd = 10;
   if (program->gfx_level >= GFX10_3)
      program->dev.max_waves_per_simd = 16;
   else if (program->gfx_level == GFX10)
      program->dev.max_waves_per_simd = 20;
   else if (program->family >= CHIP_POLARIS10 && program->family <= CHIP_VEGAM)
      program->dev.max_waves_per_simd = 8;

   program->dev.simd_per_cu = program->gfx_level >= GFX10 ? 2 : 4;

   /* XNACK (page-fault retry) is enabled on these APUs. */
   switch (program->family) {
   /* GFX8 APUs */
   case CHIP_CARRIZO:
   case CHIP_STONEY:
   /* GFX9 APUS */
   case CHIP_RAVEN:
   case CHIP_RAVEN2:
   case CHIP_RENOIR: program->dev.xnack_enabled = true; break;
   default: break;
   }

   program->dev.sram_ecc_enabled = program->family == CHIP_MI100;
   /* apparently gfx702 also has fast v_fma_f32 but I can't find a family for that */
   program->dev.has_fast_fma32 = program->gfx_level >= GFX9;
   if (program->family == CHIP_TAHITI || program->family == CHIP_CARRIZO ||
       program->family == CHIP_HAWAII)
      program->dev.has_fast_fma32 = true;
   program->dev.has_mac_legacy32 = program->gfx_level <= GFX7 || program->gfx_level >= GFX10;

   /* Whether v_fma_mix* fuses the multiply-add (GFX10+ plus some GFX9 chips). */
   program->dev.fused_mad_mix = program->gfx_level >= GFX10;
   if (program->family == CHIP_VEGA12 || program->family == CHIP_VEGA20 ||
       program->family == CHIP_MI100 || program->family == CHIP_MI200)
      program->dev.fused_mad_mix = true;

   /* Immediate offset range for scratch/global memory instructions. */
   if (program->gfx_level >= GFX11) {
      program->dev.scratch_global_offset_min = -4096;
      program->dev.scratch_global_offset_max = 4095;
   } else if (program->gfx_level >= GFX10 || program->gfx_level == GFX8) {
      program->dev.scratch_global_offset_min = -2048;
      program->dev.scratch_global_offset_max = 2047;
   } else if (program->gfx_level == GFX9) {
      /* The minimum is actually -4096, but negative offsets are broken when SADDR is used. */
      program->dev.scratch_global_offset_min = 0;
      program->dev.scratch_global_offset_max = 4095;
   }

   if (program->gfx_level >= GFX11) {
      /* GFX11 can have only 1 NSA dword. The last VGPR isn't included here because it contains the
       * rest of the address.
       */
      program->dev.max_nsa_vgprs = 4;
   } else if (program->gfx_level >= GFX10_3) {
      /* GFX10.3 can have up to 3 NSA dwords. */
      program->dev.max_nsa_vgprs = 13;
   } else if (program->gfx_level >= GFX10) {
      /* Limit NSA instructions to 1 NSA dword on GFX10 to avoid stability issues. */
      program->dev.max_nsa_vgprs = 5;
   } else {
      program->dev.max_nsa_vgprs = 0;
   }

   program->wgp_mode = wgp_mode;

   program->progress = CompilationProgress::after_isel;

   /* Reset the FP-mode defaults that instruction selection starts from. */
   program->next_fp_mode.preserve_signed_zero_inf_nan32 = false;
   program->next_fp_mode.preserve_signed_zero_inf_nan16_64 = false;
   program->next_fp_mode.must_flush_denorms32 = false;
   program->next_fp_mode.must_flush_denorms16_64 = false;
   program->next_fp_mode.care_about_round32 = false;
   program->next_fp_mode.care_about_round16_64 = false;
   program->next_fp_mode.denorm16_64 = fp_denorm_keep;
   program->next_fp_mode.denorm32 = 0;
   program->next_fp_mode.round16_64 = fp_round_ne;
   program->next_fp_mode.round32 = fp_round_ne;
}
225 
226 memory_sync_info
get_sync_info(const Instruction * instr)227 get_sync_info(const Instruction* instr)
228 {
229    /* Primitive Ordered Pixel Shading barriers necessary for accesses to memory shared between
230     * overlapping waves in the queue family.
231     */
232    if (instr->opcode == aco_opcode::p_pops_gfx9_overlapped_wave_wait_done ||
233        (instr->opcode == aco_opcode::s_wait_event &&
234         !(instr->sopp().imm & wait_event_imm_dont_wait_export_ready))) {
235       return memory_sync_info(storage_buffer | storage_image, semantic_acquire, scope_queuefamily);
236    } else if (instr->opcode == aco_opcode::p_pops_gfx9_ordered_section_done) {
237       return memory_sync_info(storage_buffer | storage_image, semantic_release, scope_queuefamily);
238    }
239 
240    switch (instr->format) {
241    case Format::SMEM: return instr->smem().sync;
242    case Format::MUBUF: return instr->mubuf().sync;
243    case Format::MIMG: return instr->mimg().sync;
244    case Format::MTBUF: return instr->mtbuf().sync;
245    case Format::FLAT:
246    case Format::GLOBAL:
247    case Format::SCRATCH: return instr->flatlike().sync;
248    case Format::DS: return instr->ds().sync;
249    case Format::LDSDIR: return instr->ldsdir().sync;
250    default: return memory_sync_info();
251    }
252 }
253 
/* Returns whether the instruction can be encoded as (or converted to) SDWA.
 *
 * With pre_ra set, checks that depend on the eventual register assignment
 * (e.g. whether carry in/out ends up in VCC) are skipped, so a "true" result
 * before RA may still need fixups afterwards.
 */
bool
can_use_SDWA(amd_gfx_level gfx_level, const aco_ptr<Instruction>& instr, bool pre_ra)
{
   if (!instr->isVALU())
      return false;

   /* SDWA only exists on GFX8-GFX10.3 and can't be combined with DPP or VOP3P. */
   if (gfx_level < GFX8 || gfx_level >= GFX11 || instr->isDPP() || instr->isVOP3P())
      return false;

   if (instr->isSDWA())
      return true;

   if (instr->isVOP3()) {
      VALU_instruction& vop3 = instr->valu();
      /* Opcodes that only exist as VOP3 can't drop the VOP3 encoding. */
      if (instr->format == Format::VOP3)
         return false;
      /* clamp on a comparison is only representable in SDWA on GFX8. */
      if (vop3.clamp && instr->isVOPC() && gfx_level != GFX8)
         return false;
      /* SDWA omod is GFX9+. */
      if (vop3.omod && gfx_level < GFX9)
         return false;

      // TODO: return true if we know we will use vcc
      if (!pre_ra && instr->definitions.size() >= 2)
         return false;

      for (unsigned i = 1; i < instr->operands.size(); i++) {
         /* Literals have no SDWA encoding, and pre-GFX9 all sources must be
          * VGPRs. */
         if (instr->operands[i].isLiteral())
            return false;
         if (gfx_level < GFX9 && !instr->operands[i].isOfType(RegType::vgpr))
            return false;
      }
   }

   /* No destinations wider than a dword, unless it's a comparison. */
   if (!instr->definitions.empty() && instr->definitions[0].bytes() > 4 && !instr->isVOPC())
      return false;

   if (!instr->operands.empty()) {
      /* src0 can't be a literal, must be a VGPR before GFX9, and neither of
       * the first two sources may be wider than a dword. */
      if (instr->operands[0].isLiteral())
         return false;
      if (gfx_level < GFX9 && !instr->operands[0].isOfType(RegType::vgpr))
         return false;
      if (instr->operands[0].bytes() > 4)
         return false;
      if (instr->operands.size() > 1 && instr->operands[1].bytes() > 4)
         return false;
   }

   bool is_mac = instr->opcode == aco_opcode::v_mac_f32 || instr->opcode == aco_opcode::v_mac_f16 ||
                 instr->opcode == aco_opcode::v_fmac_f32 || instr->opcode == aco_opcode::v_fmac_f16;

   /* MAC-style opcodes only keep SDWA support on GFX8. */
   if (gfx_level != GFX8 && is_mac)
      return false;

   // TODO: return true if we know we will use vcc
   if (!pre_ra && instr->isVOPC() && gfx_level == GFX8)
      return false;
   /* After RA, a third source can't be encoded unless the opcode is a MAC. */
   if (!pre_ra && instr->operands.size() >= 3 && !is_mac)
      return false;

   /* These opcodes never support SDWA. */
   return instr->opcode != aco_opcode::v_madmk_f32 && instr->opcode != aco_opcode::v_madak_f32 &&
          instr->opcode != aco_opcode::v_madmk_f16 && instr->opcode != aco_opcode::v_madak_f16 &&
          instr->opcode != aco_opcode::v_fmamk_f32 && instr->opcode != aco_opcode::v_fmaak_f32 &&
          instr->opcode != aco_opcode::v_fmamk_f16 && instr->opcode != aco_opcode::v_fmaak_f16 &&
          instr->opcode != aco_opcode::v_readfirstlane_b32 &&
          instr->opcode != aco_opcode::v_clrexcp && instr->opcode != aco_opcode::v_swap_b32;
}
320 
321 /* updates "instr" and returns the old instruction (or NULL if no update was needed) */
322 aco_ptr<Instruction>
convert_to_SDWA(amd_gfx_level gfx_level,aco_ptr<Instruction> & instr)323 convert_to_SDWA(amd_gfx_level gfx_level, aco_ptr<Instruction>& instr)
324 {
325    if (instr->isSDWA())
326       return NULL;
327 
328    aco_ptr<Instruction> tmp = std::move(instr);
329    Format format = asSDWA(withoutVOP3(tmp->format));
330    instr.reset(create_instruction<SDWA_instruction>(tmp->opcode, format, tmp->operands.size(),
331                                                     tmp->definitions.size()));
332    std::copy(tmp->operands.cbegin(), tmp->operands.cend(), instr->operands.begin());
333    std::copy(tmp->definitions.cbegin(), tmp->definitions.cend(), instr->definitions.begin());
334 
335    SDWA_instruction& sdwa = instr->sdwa();
336 
337    if (tmp->isVOP3()) {
338       VALU_instruction& vop3 = tmp->valu();
339       sdwa.neg = vop3.neg;
340       sdwa.abs = vop3.abs;
341       sdwa.omod = vop3.omod;
342       sdwa.clamp = vop3.clamp;
343    }
344 
345    for (unsigned i = 0; i < instr->operands.size(); i++) {
346       /* SDWA only uses operands 0 and 1. */
347       if (i >= 2)
348          break;
349 
350       sdwa.sel[i] = SubdwordSel(instr->operands[i].bytes(), 0, false);
351    }
352 
353    sdwa.dst_sel = SubdwordSel(instr->definitions[0].bytes(), 0, false);
354 
355    if (instr->definitions[0].getTemp().type() == RegType::sgpr && gfx_level == GFX8)
356       instr->definitions[0].setFixed(vcc);
357    if (instr->definitions.size() >= 2)
358       instr->definitions[1].setFixed(vcc);
359    if (instr->operands.size() >= 3)
360       instr->operands[2].setFixed(vcc);
361 
362    instr->pass_flags = tmp->pass_flags;
363 
364    return tmp;
365 }
366 
/* Returns whether the instruction can be encoded as (or converted to) DPP8
 * (dpp8 == true) or DPP16 (dpp8 == false). */
bool
can_use_DPP(amd_gfx_level gfx_level, const aco_ptr<Instruction>& instr, bool dpp8)
{
   assert(instr->isVALU() && !instr->operands.empty());

   /* Already DPP: only valid if it's the requested flavor. */
   if (instr->isDPP())
      return instr->isDPP8() == dpp8;

   /* DPP can't be combined with SDWA or the VINTERP encoding. */
   if (instr->isSDWA() || instr->isVINTERP_INREG())
      return false;

   /* VOP3/VOP3P opcodes only gained DPP on GFX11. */
   if ((instr->format == Format::VOP3 || instr->isVOP3P()) && gfx_level < GFX11)
      return false;

   /* Before GFX11, an SGPR definition fixed to anything but VCC rules DPP out. */
   if ((instr->isVOPC() || instr->definitions.size() > 1) && instr->definitions.back().isFixed() &&
       instr->definitions.back().physReg() != vcc && gfx_level < GFX11)
      return false;

   /* Likewise for a third SGPR source fixed to something other than VCC. */
   if (instr->operands.size() >= 3 && instr->operands[2].isFixed() &&
       instr->operands[2].isOfType(RegType::sgpr) && instr->operands[2].physReg() != vcc &&
       gfx_level < GFX11)
      return false;

   if (instr->isVOP3() && gfx_level < GFX11) {
      const VALU_instruction* vop3 = &instr->valu();
      /* clamp/omod have no encoding once the VOP3 form is dropped pre-GFX11. */
      if (vop3->clamp || vop3->omod)
         return false;
      if (dpp8)
         return false;
   }

   for (unsigned i = 0; i < instr->operands.size(); i++) {
      /* Literals have no DPP encoding; the first two sources must be VGPRs. */
      if (instr->operands[i].isLiteral())
         return false;
      if (!instr->operands[i].isOfType(RegType::vgpr) && i < 2)
         return false;
   }

   /* According to LLVM, it's unsafe to combine DPP into v_cmpx. */
   if (instr->writes_exec())
      return false;

   /* simpler than listing all VOP3P opcodes which do not support DPP */
   if (instr->isVOP3P()) {
      return instr->opcode == aco_opcode::v_fma_mix_f32 ||
             instr->opcode == aco_opcode::v_fma_mixlo_f16 ||
             instr->opcode == aco_opcode::v_fma_mixhi_f16 ||
             instr->opcode == aco_opcode::v_dot2_f32_f16 ||
             instr->opcode == aco_opcode::v_dot2_f32_bf16;
   }

   if (instr->opcode == aco_opcode::v_pk_fmac_f16)
      return gfx_level < GFX11;

   /* there are more cases but those all take 64-bit inputs */
   return instr->opcode != aco_opcode::v_madmk_f32 && instr->opcode != aco_opcode::v_madak_f32 &&
          instr->opcode != aco_opcode::v_madmk_f16 && instr->opcode != aco_opcode::v_madak_f16 &&
          instr->opcode != aco_opcode::v_fmamk_f32 && instr->opcode != aco_opcode::v_fmaak_f32 &&
          instr->opcode != aco_opcode::v_fmamk_f16 && instr->opcode != aco_opcode::v_fmaak_f16 &&
          instr->opcode != aco_opcode::v_readfirstlane_b32 &&
          instr->opcode != aco_opcode::v_cvt_f64_i32 &&
          instr->opcode != aco_opcode::v_cvt_f64_f32 &&
          instr->opcode != aco_opcode::v_cvt_f64_u32 && instr->opcode != aco_opcode::v_mul_lo_u32 &&
          instr->opcode != aco_opcode::v_mul_lo_i32 && instr->opcode != aco_opcode::v_mul_hi_u32 &&
          instr->opcode != aco_opcode::v_mul_hi_i32 &&
          instr->opcode != aco_opcode::v_qsad_pk_u16_u8 &&
          instr->opcode != aco_opcode::v_mqsad_pk_u16_u8 &&
          instr->opcode != aco_opcode::v_mqsad_u32_u8 &&
          instr->opcode != aco_opcode::v_mad_u64_u32 &&
          instr->opcode != aco_opcode::v_mad_i64_i32 &&
          instr->opcode != aco_opcode::v_permlane16_b32 &&
          instr->opcode != aco_opcode::v_permlanex16_b32 &&
          instr->opcode != aco_opcode::v_permlane64_b32 &&
          instr->opcode != aco_opcode::v_readlane_b32_e64 &&
          instr->opcode != aco_opcode::v_writelane_b32_e64;
}
443 
/* Converts "instr" in place to a DPP8/DPP16 instruction with an identity lane
 * selection, and returns the old instruction (or NULL if it was already DPP).
 */
aco_ptr<Instruction>
convert_to_DPP(amd_gfx_level gfx_level, aco_ptr<Instruction>& instr, bool dpp8)
{
   if (instr->isDPP())
      return NULL;

   /* Move the original out and allocate the DPP variant with the same opcode
    * and operand/definition counts. */
   aco_ptr<Instruction> tmp = std::move(instr);
   Format format =
      (Format)((uint32_t)tmp->format | (uint32_t)(dpp8 ? Format::DPP8 : Format::DPP16));
   if (dpp8)
      instr.reset(create_instruction<DPP8_instruction>(tmp->opcode, format, tmp->operands.size(),
                                                       tmp->definitions.size()));
   else
      instr.reset(create_instruction<DPP16_instruction>(tmp->opcode, format, tmp->operands.size(),
                                                        tmp->definitions.size()));
   std::copy(tmp->operands.cbegin(), tmp->operands.cend(), instr->operands.begin());
   std::copy(tmp->definitions.cbegin(), tmp->definitions.cend(), instr->definitions.begin());

   if (dpp8) {
      DPP8_instruction* dpp = &instr->dpp8();
      /* Identity selection: each lane reads itself. */
      dpp->lane_sel = 0xfac688; /* [0,1,2,3,4,5,6,7] */
      dpp->fetch_inactive = gfx_level >= GFX10;
   } else {
      DPP16_instruction* dpp = &instr->dpp16();
      /* Identity quad permutation with all rows/banks enabled. */
      dpp->dpp_ctrl = dpp_quad_perm(0, 1, 2, 3);
      dpp->row_mask = 0xf;
      dpp->bank_mask = 0xf;
      dpp->fetch_inactive = gfx_level >= GFX10;
   }

   /* Preserve all VALU modifiers of the original instruction. */
   instr->valu().neg = tmp->valu().neg;
   instr->valu().abs = tmp->valu().abs;
   instr->valu().omod = tmp->valu().omod;
   instr->valu().clamp = tmp->valu().clamp;
   instr->valu().opsel = tmp->valu().opsel;
   instr->valu().opsel_lo = tmp->valu().opsel_lo;
   instr->valu().opsel_hi = tmp->valu().opsel_hi;

   /* Pre-GFX11 DPP requires the lane-mask definition and any SGPR third
    * source to live in VCC (see can_use_DPP()). */
   if ((instr->isVOPC() || instr->definitions.size() > 1) && gfx_level < GFX11)
      instr->definitions.back().setFixed(vcc);

   if (instr->operands.size() >= 3 && instr->operands[2].isOfType(RegType::sgpr) &&
       gfx_level < GFX11)
      instr->operands[2].setFixed(vcc);

   instr->pass_flags = tmp->pass_flags;

   /* DPP16 supports input modifiers, so we might no longer need VOP3. */
   bool remove_vop3 = !dpp8 && !instr->valu().omod && !instr->valu().clamp &&
                      (instr->isVOP1() || instr->isVOP2() || instr->isVOPC());

   /* VOPC/add_co/sub_co definition needs VCC without VOP3. */
   remove_vop3 &= instr->definitions.back().regClass().type() != RegType::sgpr ||
                  !instr->definitions.back().isFixed() ||
                  instr->definitions.back().physReg() == vcc;

   /* addc/subb/cndmask 3rd operand needs VCC without VOP3. */
   remove_vop3 &= instr->operands.size() < 3 || !instr->operands[2].isFixed() ||
                  instr->operands[2].isOfType(RegType::vgpr) || instr->operands[2].physReg() == vcc;

   if (remove_vop3)
      instr->format = withoutVOP3(instr->format);

   return tmp;
}
509 
510 bool
can_use_input_modifiers(amd_gfx_level gfx_level,aco_opcode op,int idx)511 can_use_input_modifiers(amd_gfx_level gfx_level, aco_opcode op, int idx)
512 {
513    if (op == aco_opcode::v_mov_b32)
514       return gfx_level >= GFX10;
515 
516    if (op == aco_opcode::v_ldexp_f16 || op == aco_opcode::v_ldexp_f32 ||
517        op == aco_opcode::v_ldexp_f64)
518       return idx == 0;
519 
520    return instr_info.can_use_input_modifiers[(int)op];
521 }
522 
/* Returns whether the opsel bit can be used for operand "idx" of the given
 * opcode; idx == -1 refers to the destination. */
bool
can_use_opsel(amd_gfx_level gfx_level, aco_opcode op, int idx)
{
   /* opsel is only GFX9+ */
   if (gfx_level < GFX9)
      return false;

   switch (op) {
   /* These opcodes support opsel on every source and the destination. */
   case aco_opcode::v_div_fixup_f16:
   case aco_opcode::v_fma_f16:
   case aco_opcode::v_mad_f16:
   case aco_opcode::v_mad_u16:
   case aco_opcode::v_mad_i16:
   case aco_opcode::v_med3_f16:
   case aco_opcode::v_med3_i16:
   case aco_opcode::v_med3_u16:
   case aco_opcode::v_min3_f16:
   case aco_opcode::v_min3_i16:
   case aco_opcode::v_min3_u16:
   case aco_opcode::v_max3_f16:
   case aco_opcode::v_max3_i16:
   case aco_opcode::v_max3_u16:
   case aco_opcode::v_minmax_f16:
   case aco_opcode::v_maxmin_f16:
   case aco_opcode::v_max_u16_e64:
   case aco_opcode::v_max_i16_e64:
   case aco_opcode::v_min_u16_e64:
   case aco_opcode::v_min_i16_e64:
   case aco_opcode::v_add_i16:
   case aco_opcode::v_sub_i16:
   case aco_opcode::v_add_u16_e64:
   case aco_opcode::v_sub_u16_e64:
   case aco_opcode::v_lshlrev_b16_e64:
   case aco_opcode::v_lshrrev_b16_e64:
   case aco_opcode::v_ashrrev_i16_e64:
   case aco_opcode::v_and_b16:
   case aco_opcode::v_or_b16:
   case aco_opcode::v_xor_b16:
   case aco_opcode::v_mul_lo_u16_e64: return true;
   /* Packing opcodes: sources only, not the destination. */
   case aco_opcode::v_pack_b32_f16:
   case aco_opcode::v_cvt_pknorm_i16_f16:
   case aco_opcode::v_cvt_pknorm_u16_f16: return idx != -1;
   /* 32-bit-accumulator opcodes: only the two 16-bit sources. */
   case aco_opcode::v_mad_u32_u16:
   case aco_opcode::v_mad_i32_i16: return idx >= 0 && idx < 2;
   case aco_opcode::v_dot2_f16_f16:
   case aco_opcode::v_dot2_bf16_bf16: return idx == -1 || idx == 2;
   case aco_opcode::v_cndmask_b16: return idx != 2;
   case aco_opcode::v_interp_p10_f16_f32_inreg:
   case aco_opcode::v_interp_p10_rtz_f16_f32_inreg: return idx == 0 || idx == 2;
   case aco_opcode::v_interp_p2_f16_f32_inreg:
   case aco_opcode::v_interp_p2_rtz_f16_f32_inreg: return idx == -1 || idx == 0;
   default:
      /* On GFX11+, true16 opcodes use opsel bits; bit 3 is the destination. */
      return gfx_level >= GFX11 && (get_gfx11_true16_mask(op) & BITFIELD_BIT(idx == -1 ? 3 : idx));
   }
}
578 
579 bool
can_write_m0(const aco_ptr<Instruction> & instr)580 can_write_m0(const aco_ptr<Instruction>& instr)
581 {
582    if (instr->isSALU())
583       return true;
584 
585    /* VALU can't write m0 on any GPU generations. */
586    if (instr->isVALU())
587       return false;
588 
589    switch (instr->opcode) {
590    case aco_opcode::p_parallelcopy:
591    case aco_opcode::p_extract:
592    case aco_opcode::p_insert:
593       /* These pseudo instructions are implemented with SALU when writing m0. */
594       return true;
595    default:
596       /* Assume that no other instructions can write m0. */
597       return false;
598    }
599 }
600 
/* Returns whether the opcode writes only 16 bits of its destination VGPR,
 * leaving the other half of the register intact. */
bool
instr_is_16bit(amd_gfx_level gfx_level, aco_opcode op)
{
   /* partial register writes are GFX9+, only */
   if (gfx_level < GFX9)
      return false;

   switch (op) {
   /* VOP3 */
   case aco_opcode::v_mad_f16:
   case aco_opcode::v_mad_u16:
   case aco_opcode::v_mad_i16:
   case aco_opcode::v_fma_f16:
   case aco_opcode::v_div_fixup_f16:
   case aco_opcode::v_interp_p2_f16:
   case aco_opcode::v_fma_mixlo_f16:
   case aco_opcode::v_fma_mixhi_f16:
   /* VOP2 */
   case aco_opcode::v_mac_f16:
   case aco_opcode::v_madak_f16:
   case aco_opcode::v_madmk_f16: return gfx_level >= GFX9;
   /* The following are partial writes only from GFX10 onwards. */
   case aco_opcode::v_add_f16:
   case aco_opcode::v_sub_f16:
   case aco_opcode::v_subrev_f16:
   case aco_opcode::v_mul_f16:
   case aco_opcode::v_max_f16:
   case aco_opcode::v_min_f16:
   case aco_opcode::v_ldexp_f16:
   case aco_opcode::v_fmac_f16:
   case aco_opcode::v_fmamk_f16:
   case aco_opcode::v_fmaak_f16:
   /* VOP1 */
   case aco_opcode::v_cvt_f16_f32:
   case aco_opcode::p_cvt_f16_f32_rtne:
   case aco_opcode::v_cvt_f16_u16:
   case aco_opcode::v_cvt_f16_i16:
   case aco_opcode::v_rcp_f16:
   case aco_opcode::v_sqrt_f16:
   case aco_opcode::v_rsq_f16:
   case aco_opcode::v_log_f16:
   case aco_opcode::v_exp_f16:
   case aco_opcode::v_frexp_mant_f16:
   case aco_opcode::v_frexp_exp_i16_f16:
   case aco_opcode::v_floor_f16:
   case aco_opcode::v_ceil_f16:
   case aco_opcode::v_trunc_f16:
   case aco_opcode::v_rndne_f16:
   case aco_opcode::v_fract_f16:
   case aco_opcode::v_sin_f16:
   case aco_opcode::v_cos_f16:
   case aco_opcode::v_cvt_u16_f16:
   case aco_opcode::v_cvt_i16_f16:
   case aco_opcode::v_cvt_norm_i16_f16:
   case aco_opcode::v_cvt_norm_u16_f16: return gfx_level >= GFX10;
   /* on GFX10, all opsel instructions preserve the high bits */
   default: return gfx_level >= GFX10 && can_use_opsel(gfx_level, op, -1);
   }
}
659 
/* On GFX11, for some instructions, bit 7 of the destination/operand vgpr is opsel and the field
 * only supports v0-v127.
 * The first three bits are used for operands 0-2, and the 4th bit is used for the destination.
 *
 * Returns that 4-bit mask of which operands/destination of "op" are true16
 * (0x0 if the opcode has no true16 form).
 */
uint8_t
get_gfx11_true16_mask(aco_opcode op)
{
   switch (op) {
   /* Single-source 16-bit ops: src0 and the destination are true16. */
   case aco_opcode::v_ceil_f16:
   case aco_opcode::v_cos_f16:
   case aco_opcode::v_cvt_f16_i16:
   case aco_opcode::v_cvt_f16_u16:
   case aco_opcode::v_cvt_i16_f16:
   case aco_opcode::v_cvt_u16_f16:
   case aco_opcode::v_cvt_norm_i16_f16:
   case aco_opcode::v_cvt_norm_u16_f16:
   case aco_opcode::v_exp_f16:
   case aco_opcode::v_floor_f16:
   case aco_opcode::v_fract_f16:
   case aco_opcode::v_frexp_exp_i16_f16:
   case aco_opcode::v_frexp_mant_f16:
   case aco_opcode::v_log_f16:
   case aco_opcode::v_not_b16:
   case aco_opcode::v_rcp_f16:
   case aco_opcode::v_rndne_f16:
   case aco_opcode::v_rsq_f16:
   case aco_opcode::v_sin_f16:
   case aco_opcode::v_sqrt_f16:
   case aco_opcode::v_trunc_f16:
   case aco_opcode::v_mov_b16: return 0x1 | 0x8;
   /* Two-source 16-bit ops: src0, src1 and the destination are true16. */
   case aco_opcode::v_add_f16:
   case aco_opcode::v_fmaak_f16:
   case aco_opcode::v_fmac_f16:
   case aco_opcode::v_fmamk_f16:
   case aco_opcode::v_ldexp_f16:
   case aco_opcode::v_max_f16:
   case aco_opcode::v_min_f16:
   case aco_opcode::v_mul_f16:
   case aco_opcode::v_sub_f16:
   case aco_opcode::v_subrev_f16:
   case aco_opcode::v_and_b16:
   case aco_opcode::v_or_b16:
   case aco_opcode::v_xor_b16: return 0x3 | 0x8;
   /* 16-bit source, 32-bit destination: only src0 is true16. */
   case aco_opcode::v_cvt_f32_f16:
   case aco_opcode::v_cvt_i32_i16:
   case aco_opcode::v_cvt_u32_u16: return 0x1;
   /* 16-bit comparisons: both sources are true16 (no VGPR destination). */
   case aco_opcode::v_cmp_class_f16:
   case aco_opcode::v_cmp_eq_f16:
   case aco_opcode::v_cmp_eq_i16:
   case aco_opcode::v_cmp_eq_u16:
   case aco_opcode::v_cmp_ge_f16:
   case aco_opcode::v_cmp_ge_i16:
   case aco_opcode::v_cmp_ge_u16:
   case aco_opcode::v_cmp_gt_f16:
   case aco_opcode::v_cmp_gt_i16:
   case aco_opcode::v_cmp_gt_u16:
   case aco_opcode::v_cmp_le_f16:
   case aco_opcode::v_cmp_le_i16:
   case aco_opcode::v_cmp_le_u16:
   case aco_opcode::v_cmp_lg_f16:
   case aco_opcode::v_cmp_lg_i16:
   case aco_opcode::v_cmp_lg_u16:
   case aco_opcode::v_cmp_lt_f16:
   case aco_opcode::v_cmp_lt_i16:
   case aco_opcode::v_cmp_lt_u16:
   case aco_opcode::v_cmp_neq_f16:
   case aco_opcode::v_cmp_nge_f16:
   case aco_opcode::v_cmp_ngt_f16:
   case aco_opcode::v_cmp_nle_f16:
   case aco_opcode::v_cmp_nlg_f16:
   case aco_opcode::v_cmp_nlt_f16:
   case aco_opcode::v_cmp_o_f16:
   case aco_opcode::v_cmp_u_f16:
   case aco_opcode::v_cmpx_class_f16:
   case aco_opcode::v_cmpx_eq_f16:
   case aco_opcode::v_cmpx_eq_i16:
   case aco_opcode::v_cmpx_eq_u16:
   case aco_opcode::v_cmpx_ge_f16:
   case aco_opcode::v_cmpx_ge_i16:
   case aco_opcode::v_cmpx_ge_u16:
   case aco_opcode::v_cmpx_gt_f16:
   case aco_opcode::v_cmpx_gt_i16:
   case aco_opcode::v_cmpx_gt_u16:
   case aco_opcode::v_cmpx_le_f16:
   case aco_opcode::v_cmpx_le_i16:
   case aco_opcode::v_cmpx_le_u16:
   case aco_opcode::v_cmpx_lg_f16:
   case aco_opcode::v_cmpx_lg_i16:
   case aco_opcode::v_cmpx_lg_u16:
   case aco_opcode::v_cmpx_lt_f16:
   case aco_opcode::v_cmpx_lt_i16:
   case aco_opcode::v_cmpx_lt_u16:
   case aco_opcode::v_cmpx_neq_f16:
   case aco_opcode::v_cmpx_nge_f16:
   case aco_opcode::v_cmpx_ngt_f16:
   case aco_opcode::v_cmpx_nle_f16:
   case aco_opcode::v_cmpx_nlg_f16:
   case aco_opcode::v_cmpx_nlt_f16:
   case aco_opcode::v_cmpx_o_f16:
   case aco_opcode::v_cmpx_u_f16: return 0x3;
   /* 32-bit sources, 16-bit destination: only the destination is true16. */
   case aco_opcode::v_cvt_f16_f32:
   case aco_opcode::v_sat_pk_u8_i16: return 0x8;
   default: return 0x0;
   }
}
765 
766 uint32_t
get_reduction_identity(ReduceOp op,unsigned idx)767 get_reduction_identity(ReduceOp op, unsigned idx)
768 {
769    switch (op) {
770    case iadd8:
771    case iadd16:
772    case iadd32:
773    case iadd64:
774    case fadd16:
775    case fadd32:
776    case fadd64:
777    case ior8:
778    case ior16:
779    case ior32:
780    case ior64:
781    case ixor8:
782    case ixor16:
783    case ixor32:
784    case ixor64:
785    case umax8:
786    case umax16:
787    case umax32:
788    case umax64: return 0;
789    case imul8:
790    case imul16:
791    case imul32:
792    case imul64: return idx ? 0 : 1;
793    case fmul16: return 0x3c00u;                /* 1.0 */
794    case fmul32: return 0x3f800000u;            /* 1.0 */
795    case fmul64: return idx ? 0x3ff00000u : 0u; /* 1.0 */
796    case imin8: return INT8_MAX;
797    case imin16: return INT16_MAX;
798    case imin32: return INT32_MAX;
799    case imin64: return idx ? 0x7fffffffu : 0xffffffffu;
800    case imax8: return INT8_MIN;
801    case imax16: return INT16_MIN;
802    case imax32: return INT32_MIN;
803    case imax64: return idx ? 0x80000000u : 0;
804    case umin8:
805    case umin16:
806    case iand8:
807    case iand16: return 0xffffffffu;
808    case umin32:
809    case umin64:
810    case iand32:
811    case iand64: return 0xffffffffu;
812    case fmin16: return 0x7c00u;                /* infinity */
813    case fmin32: return 0x7f800000u;            /* infinity */
814    case fmin64: return idx ? 0x7ff00000u : 0u; /* infinity */
815    case fmax16: return 0xfc00u;                /* negative infinity */
816    case fmax32: return 0xff800000u;            /* negative infinity */
817    case fmax64: return idx ? 0xfff00000u : 0u; /* negative infinity */
818    default: unreachable("Invalid reduction operation"); break;
819    }
820    return 0;
821 }
822 
823 unsigned
get_operand_size(aco_ptr<Instruction> & instr,unsigned index)824 get_operand_size(aco_ptr<Instruction>& instr, unsigned index)
825 {
826    if (instr->isPseudo())
827       return instr->operands[index].bytes() * 8u;
828    else if (instr->opcode == aco_opcode::v_mad_u64_u32 ||
829             instr->opcode == aco_opcode::v_mad_i64_i32)
830       return index == 2 ? 64 : 32;
831    else if (instr->opcode == aco_opcode::v_fma_mix_f32 ||
832             instr->opcode == aco_opcode::v_fma_mixlo_f16 ||
833             instr->opcode == aco_opcode::v_fma_mixhi_f16)
834       return instr->valu().opsel_hi[index] ? 16 : 32;
835    else if (instr->isVALU() || instr->isSALU())
836       return instr_info.operand_size[(int)instr->opcode];
837    else
838       return 0;
839 }
840 
/* Returns whether the instruction's behavior depends on the exec mask, i.e.
 * whether it must not be moved across or combined with exec writes. */
bool
needs_exec_mask(const Instruction* instr)
{
   if (instr->isVALU()) {
      /* lane instructions access specific lanes regardless of exec */
      return instr->opcode != aco_opcode::v_readlane_b32 &&
             instr->opcode != aco_opcode::v_readlane_b32_e64 &&
             instr->opcode != aco_opcode::v_writelane_b32 &&
             instr->opcode != aco_opcode::v_writelane_b32_e64;
   }

   /* per-lane memory accesses are implicitly predicated by exec */
   if (instr->isVMEM() || instr->isFlatLike())
      return true;

   /* scalar instructions only care about exec if they read it explicitly */
   if (instr->isSALU() || instr->isBranch() || instr->isSMEM() || instr->isBarrier())
      return instr->reads_exec();

   if (instr->isPseudo()) {
      switch (instr->opcode) {
      case aco_opcode::p_create_vector:
      case aco_opcode::p_extract_vector:
      case aco_opcode::p_split_vector:
      case aco_opcode::p_phi:
      case aco_opcode::p_parallelcopy:
         /* these lower to VALU copies if any definition is a VGPR */
         for (Definition def : instr->definitions) {
            if (def.getTemp().type() == RegType::vgpr)
               return true;
         }
         return instr->reads_exec();
      case aco_opcode::p_spill:
      case aco_opcode::p_reload:
      case aco_opcode::p_end_linear_vgpr:
      case aco_opcode::p_logical_start:
      case aco_opcode::p_logical_end:
      case aco_opcode::p_startpgm:
      case aco_opcode::p_end_wqm:
      case aco_opcode::p_init_scratch: return instr->reads_exec();
      /* presumably only initialization of the linear VGPR (i.e. when there
       * are operands) needs exec — TODO confirm against the lowering */
      case aco_opcode::p_start_linear_vgpr: return instr->operands.size();
      default: break;
      }
   }

   /* be conservative for anything not classified above */
   return true;
}
884 
/* Lookup record describing the relatives of a VALU comparison opcode, filled
 * in by get_cmp_info(). Fields that do not apply to a given opcode are set to
 * aco_opcode::num_opcodes. */
struct CmpInfo {
   aco_opcode ordered;   /* ordered float compare (false if an operand is NaN) */
   aco_opcode unordered; /* unordered float compare (true if an operand is NaN) */
   aco_opcode swapped;   /* same compare with the two sources exchanged */
   aco_opcode inverse;   /* logical negation of the compare */
   aco_opcode vcmpx;     /* v_cmpx_* variant that also writes exec */
   aco_opcode f32;       /* the 32-bit float equivalent of the compare */
   unsigned size;        /* operand bit size: 16, 32 or 64 */
};
894 
/* Fills `info` with the related opcodes and operand size of the comparison
 * `op`. Returns false if `op` is not a (non-exec-writing) comparison. */
ALWAYS_INLINE bool
get_cmp_info(aco_opcode op, CmpInfo* info)
{
   /* default: no related opcode of that kind exists */
   info->ordered = aco_opcode::num_opcodes;
   info->unordered = aco_opcode::num_opcodes;
   info->swapped = aco_opcode::num_opcodes;
   info->inverse = aco_opcode::num_opcodes;
   info->f32 = aco_opcode::num_opcodes;
   info->vcmpx = aco_opcode::num_opcodes;
   switch (op) {
      // clang-format off
   /* Float comparisons: each ordered compare is paired with its unordered
    * (NaN-true) counterpart; one CMP2 expansion handles both directions. */
#define CMP2(ord, unord, ord_swap, unord_swap, sz)                                                 \
   case aco_opcode::v_cmp_##ord##_f##sz:                                                           \
   case aco_opcode::v_cmp_n##unord##_f##sz:                                                        \
      info->ordered = aco_opcode::v_cmp_##ord##_f##sz;                                             \
      info->unordered = aco_opcode::v_cmp_n##unord##_f##sz;                                        \
      info->swapped = op == aco_opcode::v_cmp_##ord##_f##sz ? aco_opcode::v_cmp_##ord_swap##_f##sz \
                                                      : aco_opcode::v_cmp_n##unord_swap##_f##sz;   \
      info->inverse = op == aco_opcode::v_cmp_n##unord##_f##sz ? aco_opcode::v_cmp_##unord##_f##sz \
                                                               : aco_opcode::v_cmp_n##ord##_f##sz; \
      info->f32 = op == aco_opcode::v_cmp_##ord##_f##sz ? aco_opcode::v_cmp_##ord##_f32            \
                                                        : aco_opcode::v_cmp_n##unord##_f32;        \
      info->vcmpx = op == aco_opcode::v_cmp_##ord##_f##sz ? aco_opcode::v_cmpx_##ord##_f##sz       \
                                                          : aco_opcode::v_cmpx_n##unord##_f##sz;   \
      info->size = sz;                                                                             \
      return true;
#define CMP(ord, unord, ord_swap, unord_swap)                                                      \
   CMP2(ord, unord, ord_swap, unord_swap, 16)                                                      \
   CMP2(ord, unord, ord_swap, unord_swap, 32)                                                      \
   CMP2(ord, unord, ord_swap, unord_swap, 64)
      CMP(lt, /*n*/ge, gt, /*n*/le)
      CMP(eq, /*n*/lg, eq, /*n*/lg)
      CMP(le, /*n*/gt, ge, /*n*/lt)
      CMP(gt, /*n*/le, lt, /*n*/ge)
      CMP(lg, /*n*/eq, lg, /*n*/eq)
      CMP(ge, /*n*/lt, le, /*n*/gt)
#undef CMP
#undef CMP2
   /* v_cmp_u (unordered test) and v_cmp_o (ordered test): both are symmetric,
    * so the swapped opcode is the opcode itself, and they are each other's
    * inverse. They have no ordered/unordered pair. */
#define ORD_TEST(sz)                                                                               \
   case aco_opcode::v_cmp_u_f##sz:                                                                 \
      info->f32 = aco_opcode::v_cmp_u_f32;                                                         \
      info->swapped = aco_opcode::v_cmp_u_f##sz;                                                   \
      info->inverse = aco_opcode::v_cmp_o_f##sz;                                                   \
      info->vcmpx = aco_opcode::v_cmpx_u_f##sz;                                                    \
      info->size = sz;                                                                             \
      return true;                                                                                 \
   case aco_opcode::v_cmp_o_f##sz:                                                                 \
      info->f32 = aco_opcode::v_cmp_o_f32;                                                         \
      info->swapped = aco_opcode::v_cmp_o_f##sz;                                                   \
      info->inverse = aco_opcode::v_cmp_u_f##sz;                                                   \
      info->vcmpx = aco_opcode::v_cmpx_o_f##sz;                                                    \
      info->size = sz;                                                                             \
      return true;
      ORD_TEST(16)
      ORD_TEST(32)
      ORD_TEST(64)
#undef ORD_TEST
   /* Integer comparisons: no ordered/unordered/f32 relatives. */
#define CMPI2(op, swap, inv, type, sz)                                                             \
   case aco_opcode::v_cmp_##op##_##type##sz:                                                       \
      info->swapped = aco_opcode::v_cmp_##swap##_##type##sz;                                       \
      info->inverse = aco_opcode::v_cmp_##inv##_##type##sz;                                        \
      info->vcmpx = aco_opcode::v_cmpx_##op##_##type##sz;                                          \
      info->size = sz;                                                                             \
      return true;
#define CMPI(op, swap, inv)                                                                        \
   CMPI2(op, swap, inv, i, 16)                                                                     \
   CMPI2(op, swap, inv, u, 16)                                                                     \
   CMPI2(op, swap, inv, i, 32)                                                                     \
   CMPI2(op, swap, inv, u, 32)                                                                     \
   CMPI2(op, swap, inv, i, 64)                                                                     \
   CMPI2(op, swap, inv, u, 64)
      CMPI(lt, gt, ge)
      CMPI(eq, eq, lg)
      CMPI(le, ge, gt)
      CMPI(gt, lt, le)
      CMPI(lg, lg, eq)
      CMPI(ge, le, lt)
#undef CMPI
#undef CMPI2
   /* v_cmp_class: only an exec-writing variant exists; it is neither
    * swappable nor trivially invertible. */
#define CMPCLASS(sz)                                                                               \
   case aco_opcode::v_cmp_class_f##sz:                                                             \
      info->vcmpx = aco_opcode::v_cmpx_class_f##sz;                                                \
      info->size = sz;                                                                             \
      return true;
      CMPCLASS(16)
      CMPCLASS(32)
      CMPCLASS(64)
#undef CMPCLASS
      // clang-format on
   default: return false;
   }
}
987 
988 aco_opcode
get_ordered(aco_opcode op)989 get_ordered(aco_opcode op)
990 {
991    CmpInfo info;
992    return get_cmp_info(op, &info) ? info.ordered : aco_opcode::num_opcodes;
993 }
994 
995 aco_opcode
get_unordered(aco_opcode op)996 get_unordered(aco_opcode op)
997 {
998    CmpInfo info;
999    return get_cmp_info(op, &info) ? info.unordered : aco_opcode::num_opcodes;
1000 }
1001 
1002 aco_opcode
get_inverse(aco_opcode op)1003 get_inverse(aco_opcode op)
1004 {
1005    CmpInfo info;
1006    return get_cmp_info(op, &info) ? info.inverse : aco_opcode::num_opcodes;
1007 }
1008 
1009 aco_opcode
get_swapped(aco_opcode op)1010 get_swapped(aco_opcode op)
1011 {
1012    CmpInfo info;
1013    return get_cmp_info(op, &info) ? info.swapped : aco_opcode::num_opcodes;
1014 }
1015 
1016 aco_opcode
get_f32_cmp(aco_opcode op)1017 get_f32_cmp(aco_opcode op)
1018 {
1019    CmpInfo info;
1020    return get_cmp_info(op, &info) ? info.f32 : aco_opcode::num_opcodes;
1021 }
1022 
1023 aco_opcode
get_vcmpx(aco_opcode op)1024 get_vcmpx(aco_opcode op)
1025 {
1026    CmpInfo info;
1027    return get_cmp_info(op, &info) ? info.vcmpx : aco_opcode::num_opcodes;
1028 }
1029 
1030 unsigned
get_cmp_bitsize(aco_opcode op)1031 get_cmp_bitsize(aco_opcode op)
1032 {
1033    CmpInfo info;
1034    return get_cmp_info(op, &info) ? info.size : 0;
1035 }
1036 
1037 bool
is_fp_cmp(aco_opcode op)1038 is_fp_cmp(aco_opcode op)
1039 {
1040    CmpInfo info;
1041    return get_cmp_info(op, &info) && info.ordered != aco_opcode::num_opcodes;
1042 }
1043 
1044 bool
is_cmpx(aco_opcode op)1045 is_cmpx(aco_opcode op)
1046 {
1047    CmpInfo info;
1048    return !get_cmp_info(op, &info);
1049 }
1050 
/* Returns whether operands idx0 and idx1 of `instr` can be exchanged. On
 * success, *new_op is set to the opcode to use after the swap (which may be
 * the same opcode for commutative operations). */
bool
can_swap_operands(aco_ptr<Instruction>& instr, aco_opcode* new_op, unsigned idx0, unsigned idx1)
{
   /* swapping an operand with itself is trivially allowed */
   if (idx0 == idx1) {
      *new_op = instr->opcode;
      return true;
   }

   /* normalize so that idx0 < idx1 */
   if (idx0 > idx1)
      std::swap(idx0, idx1);

   /* DPP modifiers apply to a specific source, so swapping would change
    * which operand the DPP is performed on */
   if (instr->isDPP())
      return false;

   /* outside VOP3/VOP3P encodings, constants/SGPRs are generally only
    * allowed in src0, so a swap could create an invalid encoding */
   if (!instr->isVOP3() && !instr->isVOP3P() && !instr->operands[0].isOfType(RegType::vgpr))
      return false;

   /* comparisons: swap by switching to the mirrored comparison opcode */
   if (instr->isVOPC()) {
      CmpInfo info;
      if (get_cmp_info(instr->opcode, &info) && info.swapped != aco_opcode::num_opcodes) {
         *new_op = info.swapped;
         return true;
      }
   }

   /* opcodes not relevant for DPP or SGPRs optimizations are not included. */
   switch (instr->opcode) {
   case aco_opcode::v_med3_f32: return false; /* order matters for clamp+GFX8+denorm ftz. */
   /* fully commutative operations: any two operands may be exchanged */
   case aco_opcode::v_add_u32:
   case aco_opcode::v_add_co_u32:
   case aco_opcode::v_add_co_u32_e64:
   case aco_opcode::v_add_i32:
   case aco_opcode::v_add_i16:
   case aco_opcode::v_add_u16_e64:
   case aco_opcode::v_add3_u32:
   case aco_opcode::v_add_f16:
   case aco_opcode::v_add_f32:
   case aco_opcode::v_mul_i32_i24:
   case aco_opcode::v_mul_hi_i32_i24:
   case aco_opcode::v_mul_u32_u24:
   case aco_opcode::v_mul_hi_u32_u24:
   case aco_opcode::v_mul_lo_u16:
   case aco_opcode::v_mul_lo_u16_e64:
   case aco_opcode::v_mul_f16:
   case aco_opcode::v_mul_f32:
   case aco_opcode::v_mul_legacy_f32:
   case aco_opcode::v_or_b32:
   case aco_opcode::v_and_b32:
   case aco_opcode::v_xor_b32:
   case aco_opcode::v_xnor_b32:
   case aco_opcode::v_xor3_b32:
   case aco_opcode::v_or3_b32:
   case aco_opcode::v_and_b16:
   case aco_opcode::v_or_b16:
   case aco_opcode::v_xor_b16:
   case aco_opcode::v_max3_f32:
   case aco_opcode::v_min3_f32:
   case aco_opcode::v_max3_f16:
   case aco_opcode::v_min3_f16:
   case aco_opcode::v_med3_f16:
   case aco_opcode::v_max3_u32:
   case aco_opcode::v_min3_u32:
   case aco_opcode::v_med3_u32:
   case aco_opcode::v_max3_i32:
   case aco_opcode::v_min3_i32:
   case aco_opcode::v_med3_i32:
   case aco_opcode::v_max3_u16:
   case aco_opcode::v_min3_u16:
   case aco_opcode::v_med3_u16:
   case aco_opcode::v_max3_i16:
   case aco_opcode::v_min3_i16:
   case aco_opcode::v_med3_i16:
   case aco_opcode::v_max_f16:
   case aco_opcode::v_max_f32:
   case aco_opcode::v_min_f16:
   case aco_opcode::v_min_f32:
   case aco_opcode::v_max_i32:
   case aco_opcode::v_min_i32:
   case aco_opcode::v_max_u32:
   case aco_opcode::v_min_u32:
   case aco_opcode::v_max_i16:
   case aco_opcode::v_min_i16:
   case aco_opcode::v_max_u16:
   case aco_opcode::v_min_u16:
   case aco_opcode::v_max_i16_e64:
   case aco_opcode::v_min_i16_e64:
   case aco_opcode::v_max_u16_e64:
   case aco_opcode::v_min_u16_e64: *new_op = instr->opcode; return true;
   /* subtractions: swap by switching between the sub and subrev forms */
   case aco_opcode::v_sub_f16: *new_op = aco_opcode::v_subrev_f16; return true;
   case aco_opcode::v_sub_f32: *new_op = aco_opcode::v_subrev_f32; return true;
   case aco_opcode::v_sub_co_u32: *new_op = aco_opcode::v_subrev_co_u32; return true;
   case aco_opcode::v_sub_u16: *new_op = aco_opcode::v_subrev_u16; return true;
   case aco_opcode::v_sub_u32: *new_op = aco_opcode::v_subrev_u32; return true;
   case aco_opcode::v_sub_co_u32_e64: *new_op = aco_opcode::v_subrev_co_u32_e64; return true;
   case aco_opcode::v_subrev_f16: *new_op = aco_opcode::v_sub_f16; return true;
   case aco_opcode::v_subrev_f32: *new_op = aco_opcode::v_sub_f32; return true;
   case aco_opcode::v_subrev_co_u32: *new_op = aco_opcode::v_sub_co_u32; return true;
   case aco_opcode::v_subrev_u16: *new_op = aco_opcode::v_sub_u16; return true;
   case aco_opcode::v_subrev_u32: *new_op = aco_opcode::v_sub_u32; return true;
   case aco_opcode::v_subrev_co_u32_e64: *new_op = aco_opcode::v_sub_co_u32_e64; return true;
   /* three-source operations that are commutative only in src0/src1;
    * src2 (addend/accumulator/carry-in) must stay in place */
   case aco_opcode::v_addc_co_u32:
   case aco_opcode::v_mad_i32_i24:
   case aco_opcode::v_mad_u32_u24:
   case aco_opcode::v_lerp_u8:
   case aco_opcode::v_sad_u8:
   case aco_opcode::v_sad_hi_u8:
   case aco_opcode::v_sad_u16:
   case aco_opcode::v_sad_u32:
   case aco_opcode::v_xad_u32:
   case aco_opcode::v_add_lshl_u32:
   case aco_opcode::v_and_or_b32:
   case aco_opcode::v_mad_u16:
   case aco_opcode::v_mad_i16:
   case aco_opcode::v_mad_u32_u16:
   case aco_opcode::v_mad_i32_i16:
   case aco_opcode::v_maxmin_f32:
   case aco_opcode::v_minmax_f32:
   case aco_opcode::v_maxmin_f16:
   case aco_opcode::v_minmax_f16:
   case aco_opcode::v_maxmin_u32:
   case aco_opcode::v_minmax_u32:
   case aco_opcode::v_maxmin_i32:
   case aco_opcode::v_minmax_i32:
   case aco_opcode::v_fma_f32:
   case aco_opcode::v_fma_legacy_f32:
   case aco_opcode::v_fmac_f32:
   case aco_opcode::v_fmac_legacy_f32:
   case aco_opcode::v_mac_f32:
   case aco_opcode::v_mac_legacy_f32:
   case aco_opcode::v_fma_f16:
   case aco_opcode::v_fmac_f16:
   case aco_opcode::v_mac_f16:
   case aco_opcode::v_dot4c_i32_i8:
   case aco_opcode::v_dot2c_f32_f16:
   case aco_opcode::v_dot2_f32_f16:
   case aco_opcode::v_dot2_f32_bf16:
   case aco_opcode::v_dot2_f16_f16:
   case aco_opcode::v_dot2_bf16_bf16:
   case aco_opcode::v_fma_mix_f32:
   case aco_opcode::v_fma_mixlo_f16:
   case aco_opcode::v_fma_mixhi_f16:
   case aco_opcode::v_pk_fmac_f16: {
      if (idx1 == 2)
         return false;
      *new_op = instr->opcode;
      return true;
   }
   /* sub-with-borrow: src0/src1 swap by switching to the *rev form;
    * the carry-in (src2) must stay in place */
   case aco_opcode::v_subb_co_u32: {
      if (idx1 == 2)
         return false;
      *new_op = aco_opcode::v_subbrev_co_u32;
      return true;
   }
   case aco_opcode::v_subbrev_co_u32: {
      if (idx1 == 2)
         return false;
      *new_op = aco_opcode::v_subb_co_u32;
      return true;
   }
   default: return false;
   }
}
1213 
/* Default: all counters start out unset (no wait required). */
wait_imm::wait_imm() : vm(unset_counter), exp(unset_counter), lgkm(unset_counter), vs(unset_counter)
{}
/* Construct with explicit counter values; pass unset_counter for counters
 * that should not be waited on. */
wait_imm::wait_imm(uint16_t vm_, uint16_t exp_, uint16_t lgkm_, uint16_t vs_)
    : vm(vm_), exp(exp_), lgkm(lgkm_), vs(vs_)
{}
1219 
/* Decode a packed s_waitcnt immediate into individual counters. The bit
 * layout depends on the GFX generation:
 *  - GFX11+: vm at [15:10], lgkm at [9:4], exp at [2:0]
 *  - before: vm low bits at [3:0] (high bits at [15:14] on GFX9+),
 *            exp at [6:4], lgkm at [11:8] (plus [13:12] on GFX10+)
 * vs (vscnt) is not part of this immediate and is left unset. A counter at
 * its field maximum is decoded as "unset". */
wait_imm::wait_imm(enum amd_gfx_level gfx_level, uint16_t packed) : vs(unset_counter)
{
   if (gfx_level >= GFX11) {
      vm = (packed >> 10) & 0x3f;
      lgkm = (packed >> 4) & 0x3f;
      exp = packed & 0x7;
   } else {
      vm = packed & 0xf;
      if (gfx_level >= GFX9)
         vm |= (packed >> 10) & 0x30; /* two extra vmcnt bits at [15:14] */

      exp = (packed >> 4) & 0x7;

      lgkm = (packed >> 8) & 0xf;
      if (gfx_level >= GFX10)
         lgkm |= (packed >> 8) & 0x30; /* two extra lgkmcnt bits at [13:12] */
   }

   /* a counter at its maximum encodes "don't wait on this counter" */
   if (vm == (gfx_level >= GFX9 ? 0x3f : 0xf))
      vm = wait_imm::unset_counter;
   if (exp == 0x7)
      exp = wait_imm::unset_counter;
   if (lgkm == (gfx_level >= GFX10 ? 0x3f : 0xf))
      lgkm = wait_imm::unset_counter;
}
1245 
/* Encode the counters into a packed s_waitcnt immediate for the given GFX
 * level (inverse of the unpacking constructor above). Unset counters are all
 * ones, so masking them yields the field maximum, which means "don't wait".
 * vs is not part of this immediate and is ignored. */
uint16_t
wait_imm::pack(enum amd_gfx_level gfx_level) const
{
   uint16_t imm = 0;
   assert(exp == unset_counter || exp <= 0x7);
   if (gfx_level >= GFX11) {
      assert(lgkm == unset_counter || lgkm <= 0x3f);
      assert(vm == unset_counter || vm <= 0x3f);
      imm = ((vm & 0x3f) << 10) | ((lgkm & 0x3f) << 4) | (exp & 0x7);
   } else if (gfx_level >= GFX10) {
      assert(lgkm == unset_counter || lgkm <= 0x3f);
      assert(vm == unset_counter || vm <= 0x3f);
      imm = ((vm & 0x30) << 10) | ((lgkm & 0x3f) << 8) | ((exp & 0x7) << 4) | (vm & 0xf);
   } else if (gfx_level >= GFX9) {
      assert(lgkm == unset_counter || lgkm <= 0xf);
      assert(vm == unset_counter || vm <= 0x3f);
      imm = ((vm & 0x30) << 10) | ((lgkm & 0xf) << 8) | ((exp & 0x7) << 4) | (vm & 0xf);
   } else {
      assert(lgkm == unset_counter || lgkm <= 0xf);
      assert(vm == unset_counter || vm <= 0xf);
      imm = ((lgkm & 0xf) << 8) | ((exp & 0x7) << 4) | (vm & 0xf);
   }
   /* set the bits of fields that don't exist on older hardware, so the
    * immediate decodes the same regardless of architecture */
   if (gfx_level < GFX9 && vm == wait_imm::unset_counter)
      imm |= 0xc000; /* should have no effect on pre-GFX9 and now we won't have to worry about the
                        architecture when interpreting the immediate */
   if (gfx_level < GFX10 && lgkm == wait_imm::unset_counter)
      imm |= 0x3000; /* should have no effect on pre-GFX10 and now we won't have to worry about the
                        architecture when interpreting the immediate */
   return imm;
}
1276 
1277 bool
combine(const wait_imm & other)1278 wait_imm::combine(const wait_imm& other)
1279 {
1280    bool changed = other.vm < vm || other.exp < exp || other.lgkm < lgkm || other.vs < vs;
1281    vm = std::min(vm, other.vm);
1282    exp = std::min(exp, other.exp);
1283    lgkm = std::min(lgkm, other.lgkm);
1284    vs = std::min(vs, other.vs);
1285    return changed;
1286 }
1287 
1288 bool
empty() const1289 wait_imm::empty() const
1290 {
1291    return vm == unset_counter && exp == unset_counter && lgkm == unset_counter &&
1292           vs == unset_counter;
1293 }
1294 
1295 void
print(FILE * output) const1296 wait_imm::print(FILE* output) const
1297 {
1298    if (exp != unset_counter)
1299       fprintf(output, "exp: %u\n", exp);
1300    if (vm != unset_counter)
1301       fprintf(output, "vm: %u\n", vm);
1302    if (lgkm != unset_counter)
1303       fprintf(output, "lgkm: %u\n", lgkm);
1304    if (vs != unset_counter)
1305       fprintf(output, "vs: %u\n", vs);
1306 }
1307 
1308 bool
should_form_clause(const Instruction * a,const Instruction * b)1309 should_form_clause(const Instruction* a, const Instruction* b)
1310 {
1311    if (a->definitions.empty() != b->definitions.empty())
1312       return false;
1313 
1314    if (a->format != b->format)
1315       return false;
1316 
1317    if (a->operands.empty() || b->operands.empty())
1318       return false;
1319 
1320    /* Assume loads which don't use descriptors might load from similar addresses. */
1321    if (a->isFlatLike() || a->accessesLDS())
1322       return true;
1323    if (a->isSMEM() && a->operands[0].bytes() == 8 && b->operands[0].bytes() == 8)
1324       return true;
1325 
1326    /* If they load from the same descriptor, assume they might load from similar
1327     * addresses.
1328     */
1329    if (a->isVMEM() || a->isSMEM())
1330       return a->operands[0].tempId() == b->operands[0].tempId();
1331 
1332    return false;
1333 }
1334 
1335 int
get_op_fixed_to_def(Instruction * instr)1336 get_op_fixed_to_def(Instruction* instr)
1337 {
1338    if (instr->opcode == aco_opcode::v_interp_p2_f32 || instr->opcode == aco_opcode::v_mac_f32 ||
1339        instr->opcode == aco_opcode::v_fmac_f32 || instr->opcode == aco_opcode::v_mac_f16 ||
1340        instr->opcode == aco_opcode::v_fmac_f16 || instr->opcode == aco_opcode::v_mac_legacy_f32 ||
1341        instr->opcode == aco_opcode::v_fmac_legacy_f32 ||
1342        instr->opcode == aco_opcode::v_pk_fmac_f16 || instr->opcode == aco_opcode::v_writelane_b32 ||
1343        instr->opcode == aco_opcode::v_writelane_b32_e64 ||
1344        instr->opcode == aco_opcode::v_dot4c_i32_i8) {
1345       return 2;
1346    } else if (instr->opcode == aco_opcode::s_addk_i32 || instr->opcode == aco_opcode::s_mulk_i32 ||
1347               instr->opcode == aco_opcode::s_cmovk_i32) {
1348       return 0;
1349    } else if (instr->isMUBUF() && instr->definitions.size() == 1 && instr->operands.size() == 4) {
1350       return 3;
1351    } else if (instr->isMIMG() && instr->definitions.size() == 1 &&
1352               !instr->operands[2].isUndefined()) {
1353       return 2;
1354    }
1355    return -1;
1356 }
1357 
/* On GFX11+, insert "s_sendmsg sendmsg_dealloc_vgprs" before the final
 * s_endpgm so the wave releases its VGPRs early, letting new waves launch
 * before this one fully retires. Returns true if early deallocation is
 * applicable (the checks pass), false otherwise. */
bool
dealloc_vgprs(Program* program)
{
   if (program->gfx_level < GFX11)
      return false;

   /* skip if deallocating VGPRs won't increase occupancy */
   uint16_t max_waves = max_suitable_waves(program, program->dev.max_waves_per_simd);
   if (program->max_reg_demand.vgpr <= get_addr_vgpr_from_waves(program, max_waves))
      return false;

   /* sendmsg(dealloc_vgprs) releases scratch, so this isn't safe if there is a in-progress scratch
    * store. */
   if (uses_scratch(program))
      return false;

   Block& block = program->blocks.back();

   /* don't bother checking if there is a pending VMEM store or export: there almost always is */
   Builder bld(program);
   if (!block.instructions.empty() && block.instructions.back()->opcode == aco_opcode::s_endpgm) {
      /* insert right before the trailing s_endpgm */
      bld.reset(&block.instructions, block.instructions.begin() + (block.instructions.size() - 1));
      /* Due to a hazard, an s_nop is needed before "s_sendmsg sendmsg_dealloc_vgprs". */
      bld.sopp(aco_opcode::s_nop, -1, 0);
      bld.sopp(aco_opcode::s_sendmsg, -1, sendmsg_dealloc_vgprs);
   }

   return true;
}
1387 
1388 bool
isTrans() const1389 Instruction::isTrans() const noexcept
1390 {
1391    return instr_info.classes[(int)opcode] == instr_class::valu_transcendental32 ||
1392           instr_info.classes[(int)opcode] == instr_class::valu_double_transcendental;
1393 }
1394 
1395 } // namespace aco
1396