• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 /*
2  * Copyright © 2020 Valve Corporation
3  *
4  * Permission is hereby granted, free of charge, to any person obtaining a
5  * copy of this software and associated documentation files (the "Software"),
6  * to deal in the Software without restriction, including without limitation
7  * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8  * and/or sell copies of the Software, and to permit persons to whom the
9  * Software is furnished to do so, subject to the following conditions:
10  *
11  * The above copyright notice and this permission notice (including the next
12  * paragraph) shall be included in all copies or substantial portions of the
13  * Software.
14  *
15  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
18  * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20  * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
21  * IN THE SOFTWARE.
22  *
23  */
24 
25 #include "aco_ir.h"
26 
27 #include "aco_builder.h"
28 
29 #include "util/debug.h"
30 
31 #include "c11/threads.h"
32 
33 namespace aco {
34 
/* Bitmask of DEBUG_* flags, filled once from the ACO_DEBUG environment
 * variable (see init_once() below). */
uint64_t debug_flags = 0;

/* Maps ACO_DEBUG option names to their DEBUG_* bits for
 * parse_debug_string(). The list is terminated by a {NULL, 0} sentinel. */
static const struct debug_control aco_debug_options[] = {{"validateir", DEBUG_VALIDATE_IR},
                                                         {"validatera", DEBUG_VALIDATE_RA},
                                                         {"perfwarn", DEBUG_PERFWARN},
                                                         {"force-waitcnt", DEBUG_FORCE_WAITCNT},
                                                         {"novn", DEBUG_NO_VN},
                                                         {"noopt", DEBUG_NO_OPT},
                                                         {"nosched", DEBUG_NO_SCHED},
                                                         {"perfinfo", DEBUG_PERF_INFO},
                                                         {"liveinfo", DEBUG_LIVE_INFO},
                                                         {NULL, 0}};

/* Guards the one-time initialization performed by init()/init_once(). */
static once_flag init_once_flag = ONCE_FLAG_INIT;
49 
50 static void
init_once()51 init_once()
52 {
53    debug_flags = parse_debug_string(getenv("ACO_DEBUG"), aco_debug_options);
54 
55 #ifndef NDEBUG
56    /* enable some flags by default on debug builds */
57    debug_flags |= aco::DEBUG_VALIDATE_IR;
58 #endif
59 }
60 
61 void
init()62 init()
63 {
64    call_once(&init_once_flag, init_once);
65 }
66 
/* Fills in "program"'s target description (GPU family, register-file and LDS
 * parameters, hardware quirks) and its initial compilation state.
 *
 * program   - mutated in place; no return value.
 * stage     - hardware shader stage the program compiles for.
 * info      - shader info, copied into the program (also supplies wave_size).
 * gfx_level - target gfx generation; drives most device parameters below.
 * family    - exact chip; CHIP_UNKNOWN selects a representative chip per gfx level.
 * wgp_mode  - whether the target runs in WGP (vs CU) mode.
 * config    - output shader config, stored by pointer.
 */
void
init_program(Program* program, Stage stage, const struct aco_shader_info* info,
             enum amd_gfx_level gfx_level, enum radeon_family family, bool wgp_mode,
             ac_shader_config* config)
{
   program->stage = stage;
   program->config = config;
   program->info = *info;
   program->gfx_level = gfx_level;
   if (family == CHIP_UNKNOWN) {
      /* Pick a representative chip when the exact family isn't known. */
      switch (gfx_level) {
      case GFX6: program->family = CHIP_TAHITI; break;
      case GFX7: program->family = CHIP_BONAIRE; break;
      case GFX8: program->family = CHIP_POLARIS10; break;
      case GFX9: program->family = CHIP_VEGA10; break;
      case GFX10: program->family = CHIP_NAVI10; break;
      default: program->family = CHIP_UNKNOWN; break;
      }
   } else {
      program->family = family;
   }
   program->wave_size = info->wave_size;
   /* Lane masks hold one bit per lane: one SGPR for wave32, two for wave64. */
   program->lane_mask = program->wave_size == 32 ? s1 : s2;

   /* LDS size granularities (bytes) for encoding vs. actual allocation. */
   program->dev.lds_encoding_granule = gfx_level >= GFX11 && stage == fragment_fs ? 1024 :
                                       gfx_level >= GFX7 ? 512 : 256;
   program->dev.lds_alloc_granule = gfx_level >= GFX10_3 ? 1024 : program->dev.lds_encoding_granule;
   program->dev.lds_limit = gfx_level >= GFX7 ? 65536 : 32768;
   /* apparently gfx702 also has 16-bank LDS but I can't find a family for that */
   program->dev.has_16bank_lds = family == CHIP_KABINI || family == CHIP_STONEY;

   /* Defaults for pre-GFX10 targets; overridden below where needed. */
   program->dev.vgpr_limit = 256;
   program->dev.physical_vgprs = 256;
   program->dev.vgpr_alloc_granule = 4;

   if (gfx_level >= GFX10) {
      program->dev.physical_sgprs = 5120; /* doesn't matter as long as it's at least 128 * 40 */
      program->dev.physical_vgprs = program->wave_size == 32 ? 1024 : 512;
      program->dev.sgpr_alloc_granule = 128;
      program->dev.sgpr_limit =
         108; /* includes VCC, which can be treated as s[106-107] on GFX10+ */
      if (gfx_level == GFX10_3)
         program->dev.vgpr_alloc_granule = program->wave_size == 32 ? 16 : 8;
      else
         program->dev.vgpr_alloc_granule = program->wave_size == 32 ? 8 : 4;
   } else if (program->gfx_level >= GFX8) {
      program->dev.physical_sgprs = 800;
      program->dev.sgpr_alloc_granule = 16;
      program->dev.sgpr_limit = 102;
      if (family == CHIP_TONGA || family == CHIP_ICELAND)
         program->dev.sgpr_alloc_granule = 96; /* workaround hardware bug */
   } else {
      program->dev.physical_sgprs = 512;
      program->dev.sgpr_alloc_granule = 8;
      program->dev.sgpr_limit = 104;
   }

   /* Upper bound on wave64 slots per SIMD (occupancy limit). */
   program->dev.max_wave64_per_simd = 10;
   if (program->gfx_level >= GFX10_3)
      program->dev.max_wave64_per_simd = 16;
   else if (program->gfx_level == GFX10)
      program->dev.max_wave64_per_simd = 20;
   else if (program->family >= CHIP_POLARIS10 && program->family <= CHIP_VEGAM)
      program->dev.max_wave64_per_simd = 8;

   program->dev.simd_per_cu = program->gfx_level >= GFX10 ? 2 : 4;

   /* XNACK (page-fault retry) is enabled on these APUs. */
   switch (program->family) {
   /* GFX8 APUs */
   case CHIP_CARRIZO:
   case CHIP_STONEY:
   /* GFX9 APUS */
   case CHIP_RAVEN:
   case CHIP_RAVEN2:
   case CHIP_RENOIR: program->dev.xnack_enabled = true; break;
   default: break;
   }

   program->dev.sram_ecc_enabled = program->family == CHIP_ARCTURUS;
   /* apparently gfx702 also has fast v_fma_f32 but I can't find a family for that */
   program->dev.has_fast_fma32 = program->gfx_level >= GFX9;
   if (program->family == CHIP_TAHITI || program->family == CHIP_CARRIZO ||
       program->family == CHIP_HAWAII)
      program->dev.has_fast_fma32 = true;
   program->dev.has_mac_legacy32 = program->gfx_level <= GFX7 || program->gfx_level >= GFX10;

   program->dev.fused_mad_mix = program->gfx_level >= GFX10;
   if (program->family == CHIP_VEGA12 || program->family == CHIP_VEGA20 ||
       program->family == CHIP_ARCTURUS || program->family == CHIP_ALDEBARAN)
      program->dev.fused_mad_mix = true;

   /* Valid immediate-offset ranges for scratch/global memory instructions. */
   if (program->gfx_level >= GFX11) {
      program->dev.scratch_global_offset_min = -4096;
      program->dev.scratch_global_offset_max = 4095;
   } else if (program->gfx_level >= GFX10 || program->gfx_level == GFX8) {
      program->dev.scratch_global_offset_min = -2048;
      program->dev.scratch_global_offset_max = 2047;
   } else if (program->gfx_level == GFX9) {
      /* The minimum is actually -4096, but negative offsets are broken when SADDR is used. */
      program->dev.scratch_global_offset_min = 0;
      program->dev.scratch_global_offset_max = 4095;
   }

   program->wgp_mode = wgp_mode;

   program->progress = CompilationProgress::after_isel;

   /* Default FP mode: keep 16/64-bit denormals (denorm32 left at 0),
    * round-to-nearest-even, no strict signed-zero/inf/NaN preservation. */
   program->next_fp_mode.preserve_signed_zero_inf_nan32 = false;
   program->next_fp_mode.preserve_signed_zero_inf_nan16_64 = false;
   program->next_fp_mode.must_flush_denorms32 = false;
   program->next_fp_mode.must_flush_denorms16_64 = false;
   program->next_fp_mode.care_about_round32 = false;
   program->next_fp_mode.care_about_round16_64 = false;
   program->next_fp_mode.denorm16_64 = fp_denorm_keep;
   program->next_fp_mode.denorm32 = 0;
   program->next_fp_mode.round16_64 = fp_round_ne;
   program->next_fp_mode.round32 = fp_round_ne;
}
185 
186 memory_sync_info
get_sync_info(const Instruction * instr)187 get_sync_info(const Instruction* instr)
188 {
189    switch (instr->format) {
190    case Format::SMEM: return instr->smem().sync;
191    case Format::MUBUF: return instr->mubuf().sync;
192    case Format::MIMG: return instr->mimg().sync;
193    case Format::MTBUF: return instr->mtbuf().sync;
194    case Format::FLAT:
195    case Format::GLOBAL:
196    case Format::SCRATCH: return instr->flatlike().sync;
197    case Format::DS: return instr->ds().sync;
198    default: return memory_sync_info();
199    }
200 }
201 
/* Returns whether "instr" can be encoded as (or converted to) an SDWA
 * instruction on the given gfx_level. When pre_ra is true, checks that
 * depend on register assignment (VCC placement) are skipped. */
bool
can_use_SDWA(amd_gfx_level gfx_level, const aco_ptr<Instruction>& instr, bool pre_ra)
{
   if (!instr->isVALU())
      return false;

   /* SDWA exists only on GFX8..GFX10 and can't combine with DPP or VOP3P. */
   if (gfx_level < GFX8 || gfx_level >= GFX11 || instr->isDPP() || instr->isVOP3P())
      return false;

   if (instr->isSDWA())
      return true;

   if (instr->isVOP3()) {
      VOP3_instruction& vop3 = instr->vop3();
      /* "Pure" VOP3 (no VOP1/VOP2/VOPC base encoding) has no SDWA form. */
      if (instr->format == Format::VOP3)
         return false;
      if (vop3.clamp && instr->isVOPC() && gfx_level != GFX8)
         return false;
      /* omod is only encodable in SDWA on GFX9+. */
      if (vop3.omod && gfx_level < GFX9)
         return false;

      // TODO: return true if we know we will use vcc
      if (!pre_ra && instr->definitions.size() >= 2)
         return false;

      for (unsigned i = 1; i < instr->operands.size(); i++) {
         /* SDWA can't encode literal constants. */
         if (instr->operands[i].isLiteral())
            return false;
         /* GFX8 allows only VGPR sources. */
         if (gfx_level < GFX9 && !instr->operands[i].isOfType(RegType::vgpr))
            return false;
      }
   }

   /* Destinations wider than a dword are out, except VOPC lane masks. */
   if (!instr->definitions.empty() && instr->definitions[0].bytes() > 4 && !instr->isVOPC())
      return false;

   if (!instr->operands.empty()) {
      if (instr->operands[0].isLiteral())
         return false;
      /* GFX8 allows only VGPR sources. */
      if (gfx_level < GFX9 && !instr->operands[0].isOfType(RegType::vgpr))
         return false;
      /* Sources must be at most 32 bits wide. */
      if (instr->operands[0].bytes() > 4)
         return false;
      if (instr->operands.size() > 1 && instr->operands[1].bytes() > 4)
         return false;
   }

   bool is_mac = instr->opcode == aco_opcode::v_mac_f32 || instr->opcode == aco_opcode::v_mac_f16 ||
                 instr->opcode == aco_opcode::v_fmac_f32 || instr->opcode == aco_opcode::v_fmac_f16;

   /* v_mac/v_fmac only have an SDWA form on GFX8. */
   if (gfx_level != GFX8 && is_mac)
      return false;

   // TODO: return true if we know we will use vcc
   if (!pre_ra && instr->isVOPC() && gfx_level == GFX8)
      return false;
   /* A third operand can't be encoded, except for MAC's implicit accumulator. */
   if (!pre_ra && instr->operands.size() >= 3 && !is_mac)
      return false;

   /* Opcodes that have no SDWA encoding at all. */
   return instr->opcode != aco_opcode::v_madmk_f32 && instr->opcode != aco_opcode::v_madak_f32 &&
          instr->opcode != aco_opcode::v_madmk_f16 && instr->opcode != aco_opcode::v_madak_f16 &&
          instr->opcode != aco_opcode::v_readfirstlane_b32 &&
          instr->opcode != aco_opcode::v_clrexcp && instr->opcode != aco_opcode::v_swap_b32;
}
266 
/* updates "instr" and returns the old instruction (or NULL if no update was needed).
 * The caller must have checked can_use_SDWA() first. The new instruction
 * selects the full width of each operand/definition by default. */
aco_ptr<Instruction>
convert_to_SDWA(amd_gfx_level gfx_level, aco_ptr<Instruction>& instr)
{
   if (instr->isSDWA())
      return NULL;

   /* Take ownership of the original and allocate a fresh SDWA instruction
    * with the same opcode and operand/definition counts. */
   aco_ptr<Instruction> tmp = std::move(instr);
   /* Drop the VOP3 bit from the format and set the SDWA bit instead. */
   Format format =
      (Format)(((uint16_t)tmp->format & ~(uint16_t)Format::VOP3) | (uint16_t)Format::SDWA);
   instr.reset(create_instruction<SDWA_instruction>(tmp->opcode, format, tmp->operands.size(),
                                                    tmp->definitions.size()));
   std::copy(tmp->operands.cbegin(), tmp->operands.cend(), instr->operands.begin());
   std::copy(tmp->definitions.cbegin(), tmp->definitions.cend(), instr->definitions.begin());

   SDWA_instruction& sdwa = instr->sdwa();

   if (tmp->isVOP3()) {
      /* Carry over the VOP3 modifiers, which SDWA can also encode. */
      VOP3_instruction& vop3 = tmp->vop3();
      memcpy(sdwa.neg, vop3.neg, sizeof(sdwa.neg));
      memcpy(sdwa.abs, vop3.abs, sizeof(sdwa.abs));
      sdwa.omod = vop3.omod;
      sdwa.clamp = vop3.clamp;
   }

   for (unsigned i = 0; i < instr->operands.size(); i++) {
      /* SDWA only uses operands 0 and 1. */
      if (i >= 2)
         break;

      /* Full-width selection starting at byte 0. */
      sdwa.sel[i] = SubdwordSel(instr->operands[i].bytes(), 0, false);
   }

   sdwa.dst_sel = SubdwordSel(instr->definitions[0].bytes(), 0, false);

   /* On GFX8, an SGPR definition of an SDWA instruction must be VCC. */
   if (instr->definitions[0].getTemp().type() == RegType::sgpr && gfx_level == GFX8)
      instr->definitions[0].setFixed(vcc);
   /* A second definition / third operand (carry, if any) is pinned to VCC,
    * matching the post-RA assumptions in can_use_SDWA(). */
   if (instr->definitions.size() >= 2)
      instr->definitions[1].setFixed(vcc);
   if (instr->operands.size() >= 3)
      instr->operands[2].setFixed(vcc);

   instr->pass_flags = tmp->pass_flags;

   return tmp;
}
313 
314 bool
can_use_DPP(const aco_ptr<Instruction> & instr,bool pre_ra,bool dpp8)315 can_use_DPP(const aco_ptr<Instruction>& instr, bool pre_ra, bool dpp8)
316 {
317    assert(instr->isVALU() && !instr->operands.empty());
318 
319    if (instr->isDPP())
320       return instr->isDPP8() == dpp8;
321 
322    if (instr->operands.size() && instr->operands[0].isLiteral())
323       return false;
324 
325    if (instr->isSDWA())
326       return false;
327 
328    if (!pre_ra && (instr->isVOPC() || instr->definitions.size() > 1) &&
329        instr->definitions.back().physReg() != vcc)
330       return false;
331 
332    if (!pre_ra && instr->operands.size() >= 3 && instr->operands[2].physReg() != vcc)
333       return false;
334 
335    if (instr->isVOP3()) {
336       const VOP3_instruction* vop3 = &instr->vop3();
337       if (vop3->clamp || vop3->omod || vop3->opsel)
338          return false;
339       if (dpp8)
340          return false;
341       if (instr->format == Format::VOP3)
342          return false;
343       if (instr->operands.size() > 1 && !instr->operands[1].isOfType(RegType::vgpr))
344          return false;
345    }
346 
347    /* there are more cases but those all take 64-bit inputs */
348    return instr->opcode != aco_opcode::v_madmk_f32 && instr->opcode != aco_opcode::v_madak_f32 &&
349           instr->opcode != aco_opcode::v_madmk_f16 && instr->opcode != aco_opcode::v_madak_f16 &&
350           instr->opcode != aco_opcode::v_readfirstlane_b32 &&
351           instr->opcode != aco_opcode::v_cvt_f64_i32 &&
352           instr->opcode != aco_opcode::v_cvt_f64_f32 && instr->opcode != aco_opcode::v_cvt_f64_u32;
353 }
354 
355 aco_ptr<Instruction>
convert_to_DPP(aco_ptr<Instruction> & instr,bool dpp8)356 convert_to_DPP(aco_ptr<Instruction>& instr, bool dpp8)
357 {
358    if (instr->isDPP())
359       return NULL;
360 
361    aco_ptr<Instruction> tmp = std::move(instr);
362    Format format = (Format)(((uint32_t)tmp->format & ~(uint32_t)Format::VOP3) |
363                             (dpp8 ? (uint32_t)Format::DPP8 : (uint32_t)Format::DPP16));
364    if (dpp8)
365       instr.reset(create_instruction<DPP8_instruction>(tmp->opcode, format, tmp->operands.size(),
366                                                        tmp->definitions.size()));
367    else
368       instr.reset(create_instruction<DPP16_instruction>(tmp->opcode, format, tmp->operands.size(),
369                                                         tmp->definitions.size()));
370    std::copy(tmp->operands.cbegin(), tmp->operands.cend(), instr->operands.begin());
371    for (unsigned i = 0; i < instr->definitions.size(); i++)
372       instr->definitions[i] = tmp->definitions[i];
373 
374    if (dpp8) {
375       DPP8_instruction* dpp = &instr->dpp8();
376       for (unsigned i = 0; i < 8; i++)
377          dpp->lane_sel[i] = i;
378    } else {
379       DPP16_instruction* dpp = &instr->dpp16();
380       dpp->dpp_ctrl = dpp_quad_perm(0, 1, 2, 3);
381       dpp->row_mask = 0xf;
382       dpp->bank_mask = 0xf;
383 
384       if (tmp->isVOP3()) {
385          const VOP3_instruction* vop3 = &tmp->vop3();
386          memcpy(dpp->neg, vop3->neg, sizeof(dpp->neg));
387          memcpy(dpp->abs, vop3->abs, sizeof(dpp->abs));
388       }
389    }
390 
391    if (instr->isVOPC() || instr->definitions.size() > 1)
392       instr->definitions.back().setFixed(vcc);
393 
394    if (instr->operands.size() >= 3)
395       instr->operands[2].setFixed(vcc);
396 
397    instr->pass_flags = tmp->pass_flags;
398 
399    return tmp;
400 }
401 
402 bool
can_use_opsel(amd_gfx_level gfx_level,aco_opcode op,int idx)403 can_use_opsel(amd_gfx_level gfx_level, aco_opcode op, int idx)
404 {
405    /* opsel is only GFX9+ */
406    if (gfx_level < GFX9)
407       return false;
408 
409    switch (op) {
410    case aco_opcode::v_div_fixup_f16:
411    case aco_opcode::v_fma_f16:
412    case aco_opcode::v_mad_f16:
413    case aco_opcode::v_mad_u16:
414    case aco_opcode::v_mad_i16:
415    case aco_opcode::v_med3_f16:
416    case aco_opcode::v_med3_i16:
417    case aco_opcode::v_med3_u16:
418    case aco_opcode::v_min3_f16:
419    case aco_opcode::v_min3_i16:
420    case aco_opcode::v_min3_u16:
421    case aco_opcode::v_max3_f16:
422    case aco_opcode::v_max3_i16:
423    case aco_opcode::v_max3_u16:
424    case aco_opcode::v_max_u16_e64:
425    case aco_opcode::v_max_i16_e64:
426    case aco_opcode::v_min_u16_e64:
427    case aco_opcode::v_min_i16_e64:
428    case aco_opcode::v_add_i16:
429    case aco_opcode::v_sub_i16:
430    case aco_opcode::v_add_u16_e64:
431    case aco_opcode::v_sub_u16_e64:
432    case aco_opcode::v_lshlrev_b16_e64:
433    case aco_opcode::v_lshrrev_b16_e64:
434    case aco_opcode::v_ashrrev_i16_e64:
435    case aco_opcode::v_mul_lo_u16_e64: return true;
436    case aco_opcode::v_pack_b32_f16:
437    case aco_opcode::v_cvt_pknorm_i16_f16:
438    case aco_opcode::v_cvt_pknorm_u16_f16: return idx != -1;
439    case aco_opcode::v_mad_u32_u16:
440    case aco_opcode::v_mad_i32_i16: return idx >= 0 && idx < 2;
441    default: return false;
442    }
443 }
444 
/* Returns whether "op" writes only 16 bits of its destination VGPR,
 * preserving the other half, on the given gfx_level. */
bool
instr_is_16bit(amd_gfx_level gfx_level, aco_opcode op)
{
   /* partial register writes are GFX9+, only */
   if (gfx_level < GFX9)
      return false;

   switch (op) {
   /* VOP3 */
   case aco_opcode::v_mad_f16:
   case aco_opcode::v_mad_u16:
   case aco_opcode::v_mad_i16:
   case aco_opcode::v_fma_f16:
   case aco_opcode::v_div_fixup_f16:
   case aco_opcode::v_interp_p2_f16:
   case aco_opcode::v_fma_mixlo_f16:
   case aco_opcode::v_fma_mixhi_f16:
   /* VOP2 */
   case aco_opcode::v_mac_f16:
   case aco_opcode::v_madak_f16:
   case aco_opcode::v_madmk_f16: return gfx_level >= GFX9;
   /* These are 16-bit writes on GFX10+ only; on GFX9 they write 32 bits. */
   case aco_opcode::v_add_f16:
   case aco_opcode::v_sub_f16:
   case aco_opcode::v_subrev_f16:
   case aco_opcode::v_mul_f16:
   case aco_opcode::v_max_f16:
   case aco_opcode::v_min_f16:
   case aco_opcode::v_ldexp_f16:
   case aco_opcode::v_fmac_f16:
   case aco_opcode::v_fmamk_f16:
   case aco_opcode::v_fmaak_f16:
   /* VOP1 */
   case aco_opcode::v_cvt_f16_f32:
   case aco_opcode::v_cvt_f16_u16:
   case aco_opcode::v_cvt_f16_i16:
   case aco_opcode::v_rcp_f16:
   case aco_opcode::v_sqrt_f16:
   case aco_opcode::v_rsq_f16:
   case aco_opcode::v_log_f16:
   case aco_opcode::v_exp_f16:
   case aco_opcode::v_frexp_mant_f16:
   case aco_opcode::v_frexp_exp_i16_f16:
   case aco_opcode::v_floor_f16:
   case aco_opcode::v_ceil_f16:
   case aco_opcode::v_trunc_f16:
   case aco_opcode::v_rndne_f16:
   case aco_opcode::v_fract_f16:
   case aco_opcode::v_sin_f16:
   case aco_opcode::v_cos_f16: return gfx_level >= GFX10;
   // TODO: confirm whether these write 16 or 32 bit on GFX10+
   // case aco_opcode::v_cvt_u16_f16:
   // case aco_opcode::v_cvt_i16_f16:
   // case aco_opcode::p_cvt_f16_f32_rtne:
   // case aco_opcode::v_cvt_norm_i16_f16:
   // case aco_opcode::v_cvt_norm_u16_f16:
   /* on GFX10, all opsel instructions preserve the high bits */
   default: return gfx_level >= GFX10 && can_use_opsel(gfx_level, op, -1);
   }
}
504 
505 uint32_t
get_reduction_identity(ReduceOp op,unsigned idx)506 get_reduction_identity(ReduceOp op, unsigned idx)
507 {
508    switch (op) {
509    case iadd8:
510    case iadd16:
511    case iadd32:
512    case iadd64:
513    case fadd16:
514    case fadd32:
515    case fadd64:
516    case ior8:
517    case ior16:
518    case ior32:
519    case ior64:
520    case ixor8:
521    case ixor16:
522    case ixor32:
523    case ixor64:
524    case umax8:
525    case umax16:
526    case umax32:
527    case umax64: return 0;
528    case imul8:
529    case imul16:
530    case imul32:
531    case imul64: return idx ? 0 : 1;
532    case fmul16: return 0x3c00u;                /* 1.0 */
533    case fmul32: return 0x3f800000u;            /* 1.0 */
534    case fmul64: return idx ? 0x3ff00000u : 0u; /* 1.0 */
535    case imin8: return INT8_MAX;
536    case imin16: return INT16_MAX;
537    case imin32: return INT32_MAX;
538    case imin64: return idx ? 0x7fffffffu : 0xffffffffu;
539    case imax8: return INT8_MIN;
540    case imax16: return INT16_MIN;
541    case imax32: return INT32_MIN;
542    case imax64: return idx ? 0x80000000u : 0;
543    case umin8:
544    case umin16:
545    case iand8:
546    case iand16: return 0xffffffffu;
547    case umin32:
548    case umin64:
549    case iand32:
550    case iand64: return 0xffffffffu;
551    case fmin16: return 0x7c00u;                /* infinity */
552    case fmin32: return 0x7f800000u;            /* infinity */
553    case fmin64: return idx ? 0x7ff00000u : 0u; /* infinity */
554    case fmax16: return 0xfc00u;                /* negative infinity */
555    case fmax32: return 0xff800000u;            /* negative infinity */
556    case fmax64: return idx ? 0xfff00000u : 0u; /* negative infinity */
557    default: unreachable("Invalid reduction operation"); break;
558    }
559    return 0;
560 }
561 
562 bool
needs_exec_mask(const Instruction * instr)563 needs_exec_mask(const Instruction* instr)
564 {
565    if (instr->isVALU()) {
566       return instr->opcode != aco_opcode::v_readlane_b32 &&
567              instr->opcode != aco_opcode::v_readlane_b32_e64 &&
568              instr->opcode != aco_opcode::v_writelane_b32 &&
569              instr->opcode != aco_opcode::v_writelane_b32_e64;
570    }
571 
572    if (instr->isVMEM() || instr->isFlatLike())
573       return true;
574 
575    if (instr->isSALU() || instr->isBranch() || instr->isSMEM() || instr->isBarrier())
576       return instr->reads_exec();
577 
578    if (instr->isPseudo()) {
579       switch (instr->opcode) {
580       case aco_opcode::p_create_vector:
581       case aco_opcode::p_extract_vector:
582       case aco_opcode::p_split_vector:
583       case aco_opcode::p_phi:
584       case aco_opcode::p_parallelcopy:
585          for (Definition def : instr->definitions) {
586             if (def.getTemp().type() == RegType::vgpr)
587                return true;
588          }
589          return instr->reads_exec();
590       case aco_opcode::p_spill:
591       case aco_opcode::p_reload:
592       case aco_opcode::p_end_linear_vgpr:
593       case aco_opcode::p_logical_start:
594       case aco_opcode::p_logical_end:
595       case aco_opcode::p_startpgm:
596       case aco_opcode::p_init_scratch: return instr->reads_exec();
597       default: break;
598       }
599    }
600 
601    return true;
602 }
603 
/* Opcodes related to a comparison instruction, filled by get_cmp_info().
 * Fields with no matching opcode hold aco_opcode::num_opcodes. */
struct CmpInfo {
   aco_opcode ordered;   /* ordered float comparison (false on NaN operands) */
   aco_opcode unordered; /* unordered float comparison (true on NaN operands) */
   aco_opcode swapped;   /* same comparison with the two sources swapped */
   aco_opcode inverse;   /* logical negation of the comparison */
   aco_opcode vcmpx;     /* v_cmpx variant (also writes exec) */
   aco_opcode f32;       /* equivalent comparison on 32-bit floats */
   unsigned size;        /* comparison bit size: 16, 32 or 64 */
};
613 
/* Fills "info" with the opcodes related to the comparison "op" and returns
 * true; returns false if "op" is not a comparison. Fields without a related
 * opcode keep the aco_opcode::num_opcodes sentinel set below. */
ALWAYS_INLINE bool
get_cmp_info(aco_opcode op, CmpInfo* info)
{
   info->ordered = aco_opcode::num_opcodes;
   info->unordered = aco_opcode::num_opcodes;
   info->swapped = aco_opcode::num_opcodes;
   info->inverse = aco_opcode::num_opcodes;
   info->f32 = aco_opcode::num_opcodes;
   switch (op) {
      // clang-format off
   /* Float comparisons: each CMP2 expansion handles an ordered opcode and its
    * unordered ("n"-prefixed) counterpart for one bit size. */
#define CMP2(ord, unord, ord_swap, unord_swap, sz)                                                 \
   case aco_opcode::v_cmp_##ord##_f##sz:                                                           \
   case aco_opcode::v_cmp_n##unord##_f##sz:                                                        \
      info->ordered = aco_opcode::v_cmp_##ord##_f##sz;                                             \
      info->unordered = aco_opcode::v_cmp_n##unord##_f##sz;                                        \
      info->swapped = op == aco_opcode::v_cmp_##ord##_f##sz ? aco_opcode::v_cmp_##ord_swap##_f##sz \
                                                      : aco_opcode::v_cmp_n##unord_swap##_f##sz;   \
      info->inverse = op == aco_opcode::v_cmp_n##unord##_f##sz ? aco_opcode::v_cmp_##unord##_f##sz \
                                                               : aco_opcode::v_cmp_n##ord##_f##sz; \
      info->f32 = op == aco_opcode::v_cmp_##ord##_f##sz ? aco_opcode::v_cmp_##ord##_f32            \
                                                        : aco_opcode::v_cmp_n##unord##_f32;        \
      info->vcmpx = op == aco_opcode::v_cmp_##ord##_f##sz ? aco_opcode::v_cmpx_##ord##_f##sz       \
                                                          : aco_opcode::v_cmpx_n##unord##_f##sz;   \
      info->size = sz;                                                                             \
      return true;
#define CMP(ord, unord, ord_swap, unord_swap)                                                      \
   CMP2(ord, unord, ord_swap, unord_swap, 16)                                                      \
   CMP2(ord, unord, ord_swap, unord_swap, 32)                                                      \
   CMP2(ord, unord, ord_swap, unord_swap, 64)
      CMP(lt, /*n*/ge, gt, /*n*/le)
      CMP(eq, /*n*/lg, eq, /*n*/lg)
      CMP(le, /*n*/gt, ge, /*n*/lt)
      CMP(gt, /*n*/le, lt, /*n*/ge)
      CMP(lg, /*n*/eq, lg, /*n*/eq)
      CMP(ge, /*n*/lt, le, /*n*/gt)
#undef CMP
#undef CMP2
   /* Pure ordered/unordered tests (v_cmp_o / v_cmp_u): symmetric, so the
    * swapped opcode is the opcode itself, and each is the other's inverse. */
#define ORD_TEST(sz)                                                                               \
   case aco_opcode::v_cmp_u_f##sz:                                                                 \
      info->f32 = aco_opcode::v_cmp_u_f32;                                                         \
      info->swapped = aco_opcode::v_cmp_u_f##sz;                                                   \
      info->inverse = aco_opcode::v_cmp_o_f##sz;                                                   \
      info->vcmpx = aco_opcode::v_cmpx_u_f##sz;                                                    \
      info->size = sz;                                                                             \
      return true;                                                                                 \
   case aco_opcode::v_cmp_o_f##sz:                                                                 \
      info->f32 = aco_opcode::v_cmp_o_f32;                                                         \
      info->swapped = aco_opcode::v_cmp_o_f##sz;                                                   \
      info->inverse = aco_opcode::v_cmp_u_f##sz;                                                   \
      info->vcmpx = aco_opcode::v_cmpx_o_f##sz;                                                    \
      info->size = sz;                                                                             \
      return true;
      ORD_TEST(16)
      ORD_TEST(32)
      ORD_TEST(64)
#undef ORD_TEST
   /* Integer comparisons: no ordered/unordered/f32 variants exist. */
#define CMPI2(op, swap, inv, type, sz)                                                             \
   case aco_opcode::v_cmp_##op##_##type##sz:                                                       \
      info->swapped = aco_opcode::v_cmp_##swap##_##type##sz;                                       \
      info->inverse = aco_opcode::v_cmp_##inv##_##type##sz;                                        \
      info->vcmpx = aco_opcode::v_cmpx_##op##_##type##sz;                                          \
      info->size = sz;                                                                             \
      return true;
#define CMPI(op, swap, inv)                                                                        \
   CMPI2(op, swap, inv, i, 16)                                                                     \
   CMPI2(op, swap, inv, u, 16)                                                                     \
   CMPI2(op, swap, inv, i, 32)                                                                     \
   CMPI2(op, swap, inv, u, 32)                                                                     \
   CMPI2(op, swap, inv, i, 64)                                                                     \
   CMPI2(op, swap, inv, u, 64)
      CMPI(lt, gt, ge)
      CMPI(eq, eq, lg)
      CMPI(le, ge, gt)
      CMPI(gt, lt, le)
      CMPI(lg, lg, eq)
      CMPI(ge, le, lt)
#undef CMPI
#undef CMPI2
   /* Class comparisons: only a vcmpx variant exists. */
#define CMPCLASS(sz)                                                                               \
   case aco_opcode::v_cmp_class_f##sz:                                                             \
      info->vcmpx = aco_opcode::v_cmpx_class_f##sz;                                                \
      info->size = sz;                                                                             \
      return true;
      CMPCLASS(16)
      CMPCLASS(32)
      CMPCLASS(64)
#undef CMPCLASS
      // clang-format on
   default: return false;
   }
}
705 
706 aco_opcode
get_ordered(aco_opcode op)707 get_ordered(aco_opcode op)
708 {
709    CmpInfo info;
710    return get_cmp_info(op, &info) ? info.ordered : aco_opcode::num_opcodes;
711 }
712 
713 aco_opcode
get_unordered(aco_opcode op)714 get_unordered(aco_opcode op)
715 {
716    CmpInfo info;
717    return get_cmp_info(op, &info) ? info.unordered : aco_opcode::num_opcodes;
718 }
719 
720 aco_opcode
get_inverse(aco_opcode op)721 get_inverse(aco_opcode op)
722 {
723    CmpInfo info;
724    return get_cmp_info(op, &info) ? info.inverse : aco_opcode::num_opcodes;
725 }
726 
727 aco_opcode
get_f32_cmp(aco_opcode op)728 get_f32_cmp(aco_opcode op)
729 {
730    CmpInfo info;
731    return get_cmp_info(op, &info) ? info.f32 : aco_opcode::num_opcodes;
732 }
733 
734 aco_opcode
get_vcmpx(aco_opcode op)735 get_vcmpx(aco_opcode op)
736 {
737    CmpInfo info;
738    return get_cmp_info(op, &info) ? info.vcmpx : aco_opcode::num_opcodes;
739 }
740 
741 unsigned
get_cmp_bitsize(aco_opcode op)742 get_cmp_bitsize(aco_opcode op)
743 {
744    CmpInfo info;
745    return get_cmp_info(op, &info) ? info.size : 0;
746 }
747 
748 bool
is_cmp(aco_opcode op)749 is_cmp(aco_opcode op)
750 {
751    CmpInfo info;
752    return get_cmp_info(op, &info) && info.ordered != aco_opcode::num_opcodes;
753 }
754 
/* Returns whether the first two operands of the instruction can be swapped,
 * and if so, stores in *new_op the opcode to use afterwards (the same opcode
 * for commutative operations, the *rev variant for subtractions, the swapped
 * comparison for compares).
 */
bool
can_swap_operands(aco_ptr<Instruction>& instr, aco_opcode* new_op)
{
   /* DPP reads its data operand from a specific source slot; don't reorder. */
   if (instr->isDPP())
      return false;

   /* After the swap, operand 0 would land in src1, which presumably cannot
    * hold a constant/literal or an SGPR for these encodings — so refuse the
    * swap in that case (NOTE(review): confirm against the VOP2 constraints). */
   if (instr->operands[0].isConstant() ||
       (instr->operands[0].isTemp() && instr->operands[0].getTemp().type() == RegType::sgpr))
      return false;

   switch (instr->opcode) {
   /* Commutative operations: the opcode is unchanged by the swap. */
   case aco_opcode::v_add_u32:
   case aco_opcode::v_add_co_u32:
   case aco_opcode::v_add_co_u32_e64:
   case aco_opcode::v_add_i32:
   case aco_opcode::v_add_f16:
   case aco_opcode::v_add_f32:
   case aco_opcode::v_mul_f16:
   case aco_opcode::v_mul_f32:
   case aco_opcode::v_or_b32:
   case aco_opcode::v_and_b32:
   case aco_opcode::v_xor_b32:
   case aco_opcode::v_max_f16:
   case aco_opcode::v_max_f32:
   case aco_opcode::v_min_f16:
   case aco_opcode::v_min_f32:
   case aco_opcode::v_max_i32:
   case aco_opcode::v_min_i32:
   case aco_opcode::v_max_u32:
   case aco_opcode::v_min_u32:
   case aco_opcode::v_max_i16:
   case aco_opcode::v_min_i16:
   case aco_opcode::v_max_u16:
   case aco_opcode::v_min_u16:
   case aco_opcode::v_max_i16_e64:
   case aco_opcode::v_min_i16_e64:
   case aco_opcode::v_max_u16_e64:
   case aco_opcode::v_min_u16_e64: *new_op = instr->opcode; return true;
   /* Subtraction: a - b with swapped operands becomes the *rev variant. */
   case aco_opcode::v_sub_f16: *new_op = aco_opcode::v_subrev_f16; return true;
   case aco_opcode::v_sub_f32: *new_op = aco_opcode::v_subrev_f32; return true;
   case aco_opcode::v_sub_co_u32: *new_op = aco_opcode::v_subrev_co_u32; return true;
   case aco_opcode::v_sub_u16: *new_op = aco_opcode::v_subrev_u16; return true;
   case aco_opcode::v_sub_u32: *new_op = aco_opcode::v_subrev_u32; return true;
   default: {
      /* Comparisons: use the mirrored predicate (e.g. lt -> gt) if one exists. */
      CmpInfo info;
      if (get_cmp_info(instr->opcode, &info) && info.swapped != aco_opcode::num_opcodes) {
         *new_op = info.swapped;
         return true;
      }
      return false;
   }
   }
}
808 
/* Default-construct with every counter "unset", i.e. no wait required. */
wait_imm::wait_imm() : vm(unset_counter), exp(unset_counter), lgkm(unset_counter), vs(unset_counter)
{}
/* Construct from explicit per-counter wait values (unset_counter = no wait). */
wait_imm::wait_imm(uint16_t vm_, uint16_t exp_, uint16_t lgkm_, uint16_t vs_)
    : vm(vm_), exp(exp_), lgkm(lgkm_), vs(vs_)
{}
814 
/* Decode a packed s_waitcnt immediate into separate counters.  vscnt (vs) is
 * encoded outside this immediate and is left unset.
 * NOTE(review): the field positions here mirror the pre-GFX11 layouts used in
 * pack(); presumably this constructor is never fed a GFX11-packed immediate,
 * whose layout differs — confirm at the call sites. */
wait_imm::wait_imm(enum amd_gfx_level gfx_level, uint16_t packed) : vs(unset_counter)
{
   /* vmcnt: low nibble at bits [3:0]; GFX9+ widens it with packed[15:14]
    * becoming vm[5:4]. */
   vm = packed & 0xf;
   if (gfx_level >= GFX9)
      vm |= (packed >> 10) & 0x30;

   /* expcnt: 3 bits at [6:4]. */
   exp = (packed >> 4) & 0x7;

   /* lgkmcnt: 4 bits at [11:8]; GFX10+ widens it with packed[13:12] becoming
    * lgkm[5:4]. */
   lgkm = (packed >> 8) & 0xf;
   if (gfx_level >= GFX10)
      lgkm |= (packed >> 8) & 0x30;
}
827 
/* Encode the counters into the s_waitcnt immediate layout of the given
 * generation.  Unset counters are all-ones bit patterns (unset_counter), so
 * masking leaves their field at its maximum, which means "no wait". */
uint16_t
wait_imm::pack(enum amd_gfx_level gfx_level) const
{
   uint16_t imm = 0;
   assert(exp == unset_counter || exp <= 0x7);
   switch (gfx_level) {
   case GFX11:
      /* GFX11: vm[15:10] | lgkm[9:4] | exp[2:0]. */
      assert(lgkm == unset_counter || lgkm <= 0x3f);
      assert(vm == unset_counter || vm <= 0x3f);
      imm = ((vm & 0x3f) << 10) | ((lgkm & 0x3f) << 4) | (exp & 0x7);
      break;
   case GFX10:
   case GFX10_3:
      /* GFX10.x: vm_hi[15:14] | lgkm[13:8] | exp[6:4] | vm_lo[3:0]. */
      assert(lgkm == unset_counter || lgkm <= 0x3f);
      assert(vm == unset_counter || vm <= 0x3f);
      imm = ((vm & 0x30) << 10) | ((lgkm & 0x3f) << 8) | ((exp & 0x7) << 4) | (vm & 0xf);
      break;
   case GFX9:
      /* GFX9: vm_hi[15:14] | lgkm[11:8] | exp[6:4] | vm_lo[3:0]. */
      assert(lgkm == unset_counter || lgkm <= 0xf);
      assert(vm == unset_counter || vm <= 0x3f);
      imm = ((vm & 0x30) << 10) | ((lgkm & 0xf) << 8) | ((exp & 0x7) << 4) | (vm & 0xf);
      break;
   default:
      /* Pre-GFX9: lgkm[11:8] | exp[6:4] | vm[3:0]. */
      assert(lgkm == unset_counter || lgkm <= 0xf);
      assert(vm == unset_counter || vm <= 0xf);
      imm = ((lgkm & 0xf) << 8) | ((exp & 0x7) << 4) | (vm & 0xf);
      break;
   }
   /* Force the fields that don't exist on older generations to all-ones, so
    * the packed value can later be interpreted uniformly (an absent field
    * then reads back as unset). */
   if (gfx_level < GFX9 && vm == wait_imm::unset_counter)
      imm |= 0xc000; /* should have no effect on pre-GFX9 and now we won't have to worry about the
                        architecture when interpreting the immediate */
   if (gfx_level < GFX10 && lgkm == wait_imm::unset_counter)
      imm |= 0x3000; /* should have no effect on pre-GFX10 and now we won't have to worry about the
                        architecture when interpreting the immediate */
   return imm;
}
864 
865 bool
combine(const wait_imm & other)866 wait_imm::combine(const wait_imm& other)
867 {
868    bool changed = other.vm < vm || other.exp < exp || other.lgkm < lgkm || other.vs < vs;
869    vm = std::min(vm, other.vm);
870    exp = std::min(exp, other.exp);
871    lgkm = std::min(lgkm, other.lgkm);
872    vs = std::min(vs, other.vs);
873    return changed;
874 }
875 
876 bool
empty() const877 wait_imm::empty() const
878 {
879    return vm == unset_counter && exp == unset_counter && lgkm == unset_counter &&
880           vs == unset_counter;
881 }
882 
883 bool
should_form_clause(const Instruction * a,const Instruction * b)884 should_form_clause(const Instruction* a, const Instruction* b)
885 {
886    /* Vertex attribute loads from the same binding likely load from similar addresses */
887    unsigned a_vtx_binding =
888       a->isMUBUF() ? a->mubuf().vtx_binding : (a->isMTBUF() ? a->mtbuf().vtx_binding : 0);
889    unsigned b_vtx_binding =
890       b->isMUBUF() ? b->mubuf().vtx_binding : (b->isMTBUF() ? b->mtbuf().vtx_binding : 0);
891    if (a_vtx_binding && a_vtx_binding == b_vtx_binding)
892       return true;
893 
894    if (a->format != b->format)
895       return false;
896 
897    /* Assume loads which don't use descriptors might load from similar addresses. */
898    if (a->isFlatLike())
899       return true;
900    if (a->isSMEM() && a->operands[0].bytes() == 8 && b->operands[0].bytes() == 8)
901       return true;
902 
903    /* If they load from the same descriptor, assume they might load from similar
904     * addresses.
905     */
906    if (a->isVMEM() || a->isSMEM())
907       return a->operands[0].tempId() == b->operands[0].tempId();
908 
909    return false;
910 }
911 
912 } // namespace aco
913