• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1  /*
2   * Copyright © 2020 Valve Corporation
3   *
4   * Permission is hereby granted, free of charge, to any person obtaining a
5   * copy of this software and associated documentation files (the "Software"),
6   * to deal in the Software without restriction, including without limitation
7   * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8   * and/or sell copies of the Software, and to permit persons to whom the
9   * Software is furnished to do so, subject to the following conditions:
10   *
11   * The above copyright notice and this permission notice (including the next
12   * paragraph) shall be included in all copies or substantial portions of the
13   * Software.
14   *
15   * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16   * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17   * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
18   * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19   * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20   * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
21   * IN THE SOFTWARE.
22   *
23   */
24  
25  #include "aco_ir.h"
26  
27  #include "aco_builder.h"
28  
29  #include "util/debug.h"
30  
31  #include "c11/threads.h"
32  
33  namespace aco {
34  
/* Bitmask of DEBUG_* flags, parsed once from the ACO_DEBUG environment
 * variable (see init_once below). */
uint64_t debug_flags = 0;

/* Mapping of ACO_DEBUG option names to their DEBUG_* flag bits;
 * terminated by a {NULL, 0} sentinel entry. */
static const struct debug_control aco_debug_options[] = {{"validateir", DEBUG_VALIDATE_IR},
                                                         {"validatera", DEBUG_VALIDATE_RA},
                                                         {"perfwarn", DEBUG_PERFWARN},
                                                         {"force-waitcnt", DEBUG_FORCE_WAITCNT},
                                                         {"novn", DEBUG_NO_VN},
                                                         {"noopt", DEBUG_NO_OPT},
                                                         {"nosched", DEBUG_NO_SCHED},
                                                         {"perfinfo", DEBUG_PERF_INFO},
                                                         {"liveinfo", DEBUG_LIVE_INFO},
                                                         {NULL, 0}};

/* Guards the one-time initialization of debug_flags (see init/init_once). */
static once_flag init_once_flag = ONCE_FLAG_INIT;
49  
50  static void
init_once()51  init_once()
52  {
53     debug_flags = parse_debug_string(getenv("ACO_DEBUG"), aco_debug_options);
54  
55  #ifndef NDEBUG
56     /* enable some flags by default on debug builds */
57     debug_flags |= aco::DEBUG_VALIDATE_IR;
58  #endif
59  }
60  
/* Thread-safe entry point for global ACO initialization; safe to call
 * multiple times — the underlying init_once runs only once. */
void
init()
{
   call_once(&init_once_flag, init_once);
}
66  
/* Initialize compilation-wide state on "program": target identification,
 * hardware resource limits (LDS sizes, SGPR/VGPR counts and allocation
 * granules), per-family feature bits and the default floating-point mode. */
void
init_program(Program* program, Stage stage, const struct radv_shader_info* info,
             enum chip_class chip_class, enum radeon_family family, bool wgp_mode,
             ac_shader_config* config)
{
   program->stage = stage;
   program->config = config;
   program->info = info;
   program->chip_class = chip_class;
   if (family == CHIP_UNKNOWN) {
      /* No specific family given: pick a representative family per generation. */
      switch (chip_class) {
      case GFX6: program->family = CHIP_TAHITI; break;
      case GFX7: program->family = CHIP_BONAIRE; break;
      case GFX8: program->family = CHIP_POLARIS10; break;
      case GFX9: program->family = CHIP_VEGA10; break;
      case GFX10: program->family = CHIP_NAVI10; break;
      default: program->family = CHIP_UNKNOWN; break;
      }
   } else {
      program->family = family;
   }
   program->wave_size = info->wave_size;
   /* A lane mask holds one bit per lane: 1 SGPR for wave32, 2 for wave64. */
   program->lane_mask = program->wave_size == 32 ? s1 : s2;

   program->dev.lds_encoding_granule = chip_class >= GFX7 ? 512 : 256;
   program->dev.lds_alloc_granule =
      chip_class >= GFX10_3 ? 1024 : program->dev.lds_encoding_granule;
   program->dev.lds_limit = chip_class >= GFX7 ? 65536 : 32768;
   /* apparently gfx702 also has 16-bank LDS but I can't find a family for that */
   program->dev.has_16bank_lds = family == CHIP_KABINI || family == CHIP_STONEY;

   /* Baseline register-file parameters; refined per generation below. */
   program->dev.vgpr_limit = 256;
   program->dev.physical_vgprs = 256;
   program->dev.vgpr_alloc_granule = 4;

   if (chip_class >= GFX10) {
      program->dev.physical_sgprs = 5120; /* doesn't matter as long as it's at least 128 * 40 */
      program->dev.physical_vgprs = program->wave_size == 32 ? 1024 : 512;
      program->dev.sgpr_alloc_granule = 128;
      program->dev.sgpr_limit =
         108; /* includes VCC, which can be treated as s[106-107] on GFX10+ */
      if (chip_class >= GFX10_3)
         program->dev.vgpr_alloc_granule = program->wave_size == 32 ? 16 : 8;
      else
         program->dev.vgpr_alloc_granule = program->wave_size == 32 ? 8 : 4;
   } else if (program->chip_class >= GFX8) {
      program->dev.physical_sgprs = 800;
      program->dev.sgpr_alloc_granule = 16;
      program->dev.sgpr_limit = 102;
      if (family == CHIP_TONGA || family == CHIP_ICELAND)
         program->dev.sgpr_alloc_granule = 96; /* workaround hardware bug */
   } else {
      program->dev.physical_sgprs = 512;
      program->dev.sgpr_alloc_granule = 8;
      program->dev.sgpr_limit = 104;
   }

   /* Upper bound on wave64 waves resident per SIMD (occupancy limit). */
   program->dev.max_wave64_per_simd = 10;
   if (program->chip_class >= GFX10_3)
      program->dev.max_wave64_per_simd = 16;
   else if (program->chip_class == GFX10)
      program->dev.max_wave64_per_simd = 20;
   else if (program->family >= CHIP_POLARIS10 && program->family <= CHIP_VEGAM)
      program->dev.max_wave64_per_simd = 8;

   program->dev.simd_per_cu = program->chip_class >= GFX10 ? 2 : 4;

   /* XNACK (page-fault replay) is enabled on these APUs. */
   switch (program->family) {
   /* GFX8 APUs */
   case CHIP_CARRIZO:
   case CHIP_STONEY:
   /* GFX9 APUS */
   case CHIP_RAVEN:
   case CHIP_RAVEN2:
   case CHIP_RENOIR: program->dev.xnack_enabled = true; break;
   default: break;
   }

   program->dev.sram_ecc_enabled = program->family == CHIP_ARCTURUS;
   /* apparently gfx702 also has fast v_fma_f32 but I can't find a family for that */
   program->dev.has_fast_fma32 = program->chip_class >= GFX9;
   if (program->family == CHIP_TAHITI || program->family == CHIP_CARRIZO ||
       program->family == CHIP_HAWAII)
      program->dev.has_fast_fma32 = true;

   program->wgp_mode = wgp_mode;

   program->progress = CompilationProgress::after_isel;

   /* Default FP mode: no strict signed-zero/inf/NaN preservation, denormals
    * kept for 16/64-bit (denorm32 = 0, i.e. not fp_denorm_keep — presumably
    * flush; confirm against the fp_denorm enum), round-to-nearest-even. */
   program->next_fp_mode.preserve_signed_zero_inf_nan32 = false;
   program->next_fp_mode.preserve_signed_zero_inf_nan16_64 = false;
   program->next_fp_mode.must_flush_denorms32 = false;
   program->next_fp_mode.must_flush_denorms16_64 = false;
   program->next_fp_mode.care_about_round32 = false;
   program->next_fp_mode.care_about_round16_64 = false;
   program->next_fp_mode.denorm16_64 = fp_denorm_keep;
   program->next_fp_mode.denorm32 = 0;
   program->next_fp_mode.round16_64 = fp_round_ne;
   program->next_fp_mode.round32 = fp_round_ne;
}
167  
/* Return the memory_sync_info stored on a memory instruction; instructions
 * without a sync field yield a default-constructed (empty) sync info. */
memory_sync_info
get_sync_info(const Instruction* instr)
{
   switch (instr->format) {
   case Format::SMEM: return instr->smem().sync;
   case Format::MUBUF: return instr->mubuf().sync;
   case Format::MIMG: return instr->mimg().sync;
   case Format::MTBUF: return instr->mtbuf().sync;
   /* FLAT, GLOBAL and SCRATCH share the same instruction layout. */
   case Format::FLAT:
   case Format::GLOBAL:
   case Format::SCRATCH: return instr->flatlike().sync;
   case Format::DS: return instr->ds().sync;
   default: return memory_sync_info();
   }
}
183  
/* Check whether "instr" can be encoded (or converted to) SDWA on "chip".
 * With pre_ra set, restrictions that depend on the register assignment
 * (implicit VCC uses) are skipped, since they can still be satisfied by RA. */
bool
can_use_SDWA(chip_class chip, const aco_ptr<Instruction>& instr, bool pre_ra)
{
   if (!instr->isVALU())
      return false;

   /* SDWA is a GFX8+ encoding and cannot be combined with DPP. */
   if (chip < GFX8 || instr->isDPP())
      return false;

   if (instr->isSDWA())
      return true;

   if (instr->isVOP3()) {
      VOP3_instruction& vop3 = instr->vop3();
      /* A "pure" VOP3 instruction has no VOP1/VOP2/VOPC base encoding that
       * could be re-encoded as SDWA. */
      if (instr->format == Format::VOP3)
         return false;
      if (vop3.clamp && instr->isVOPC() && chip != GFX8)
         return false;
      if (vop3.omod && chip < GFX9)
         return false;

      // TODO: return true if we know we will use vcc
      if (!pre_ra && instr->definitions.size() >= 2)
         return false;

      for (unsigned i = 1; i < instr->operands.size(); i++) {
         /* SDWA has no literal slot. */
         if (instr->operands[i].isLiteral())
            return false;
         /* Before GFX9, SDWA sources other than src0 must be VGPRs. */
         if (chip < GFX9 && !instr->operands[i].isOfType(RegType::vgpr))
            return false;
      }
   }

   /* >32-bit data destinations cannot be expressed with SDWA selects
    * (VOPC is exempt: its definition is a lane mask, not data). */
   if (!instr->definitions.empty() && instr->definitions[0].bytes() > 4 && !instr->isVOPC())
      return false;

   if (!instr->operands.empty()) {
      if (instr->operands[0].isLiteral())
         return false;
      if (chip < GFX9 && !instr->operands[0].isOfType(RegType::vgpr))
         return false;
      /* >32-bit sources cannot be expressed with SDWA selects. */
      if (instr->operands[0].bytes() > 4)
         return false;
      if (instr->operands.size() > 1 && instr->operands[1].bytes() > 4)
         return false;
   }

   bool is_mac = instr->opcode == aco_opcode::v_mac_f32 || instr->opcode == aco_opcode::v_mac_f16 ||
                 instr->opcode == aco_opcode::v_fmac_f32 || instr->opcode == aco_opcode::v_fmac_f16;

   /* v_(f)mac_* with SDWA is only allowed on GFX8 here. */
   if (chip != GFX8 && is_mac)
      return false;

   // TODO: return true if we know we will use vcc
   if (!pre_ra && instr->isVOPC() && chip == GFX8)
      return false;
   if (!pre_ra && instr->operands.size() >= 3 && !is_mac)
      return false;

   /* Opcodes with special encodings/semantics that never support SDWA. */
   return instr->opcode != aco_opcode::v_madmk_f32 && instr->opcode != aco_opcode::v_madak_f32 &&
          instr->opcode != aco_opcode::v_madmk_f16 && instr->opcode != aco_opcode::v_madak_f16 &&
          instr->opcode != aco_opcode::v_readfirstlane_b32 &&
          instr->opcode != aco_opcode::v_clrexcp && instr->opcode != aco_opcode::v_swap_b32;
}
248  
/* updates "instr" and returns the old instruction (or NULL if no update was needed).
 * Callers should have checked can_use_SDWA() first. Operands, definitions and
 * (if present) VOP3 modifiers are carried over; byte selects default to full
 * dword/operand width. */
aco_ptr<Instruction>
convert_to_SDWA(chip_class chip, aco_ptr<Instruction>& instr)
{
   if (instr->isSDWA())
      return NULL;

   /* Re-create the instruction with the SDWA bit set (and VOP3 cleared). */
   aco_ptr<Instruction> tmp = std::move(instr);
   Format format =
      (Format)(((uint16_t)tmp->format & ~(uint16_t)Format::VOP3) | (uint16_t)Format::SDWA);
   instr.reset(create_instruction<SDWA_instruction>(tmp->opcode, format, tmp->operands.size(),
                                                    tmp->definitions.size()));
   std::copy(tmp->operands.cbegin(), tmp->operands.cend(), instr->operands.begin());
   std::copy(tmp->definitions.cbegin(), tmp->definitions.cend(), instr->definitions.begin());

   SDWA_instruction& sdwa = instr->sdwa();

   /* Preserve VOP3 modifiers, which SDWA can also encode. */
   if (tmp->isVOP3()) {
      VOP3_instruction& vop3 = tmp->vop3();
      memcpy(sdwa.neg, vop3.neg, sizeof(sdwa.neg));
      memcpy(sdwa.abs, vop3.abs, sizeof(sdwa.abs));
      sdwa.omod = vop3.omod;
      sdwa.clamp = vop3.clamp;
   }

   for (unsigned i = 0; i < instr->operands.size(); i++) {
      /* SDWA only uses operands 0 and 1. */
      if (i >= 2)
         break;

      sdwa.sel[i] = SubdwordSel(instr->operands[i].bytes(), 0, false);
   }

   sdwa.dst_sel = SubdwordSel(instr->definitions[0].bytes(), 0, false);

   /* On GFX8, an SGPR data definition is not encodable: force VCC. Extra
    * definitions/operands (carry/clamp) are implicitly VCC in SDWA. */
   if (instr->definitions[0].getTemp().type() == RegType::sgpr && chip == GFX8)
      instr->definitions[0].setFixed(vcc);
   if (instr->definitions.size() >= 2)
      instr->definitions[1].setFixed(vcc);
   if (instr->operands.size() >= 3)
      instr->operands[2].setFixed(vcc);

   return tmp;
}
293  
/* Check whether "instr" can be encoded (or converted to) DPP.
 * With pre_ra set, restrictions that depend on the register assignment
 * (operands/definitions that must land in VCC) are skipped. */
bool
can_use_DPP(const aco_ptr<Instruction>& instr, bool pre_ra)
{
   assert(instr->isVALU() && !instr->operands.empty());

   if (instr->isDPP())
      return true;

   /* DPP has no literal slot. */
   if (instr->operands.size() && instr->operands[0].isLiteral())
      return false;

   /* DPP cannot be combined with SDWA. */
   if (instr->isSDWA())
      return false;

   /* After RA, the lane-mask definition and any third operand must already
    * be in VCC, as DPP encodes them implicitly. */
   if (!pre_ra && (instr->isVOPC() || instr->definitions.size() > 1) &&
       instr->definitions.back().physReg() != vcc)
      return false;

   if (!pre_ra && instr->operands.size() >= 3 && instr->operands[2].physReg() != vcc)
      return false;

   if (instr->isVOP3()) {
      const VOP3_instruction* vop3 = &instr->vop3();
      /* clamp/omod/opsel have no DPP encoding. */
      if (vop3->clamp || vop3->omod || vop3->opsel)
         return false;
      /* A "pure" VOP3 instruction has no VOP1/VOP2/VOPC base encoding. */
      if (instr->format == Format::VOP3)
         return false;
      if (instr->operands.size() > 1 && !instr->operands[1].isOfType(RegType::vgpr))
         return false;
   }

   /* there are more cases but those all take 64-bit inputs */
   return instr->opcode != aco_opcode::v_madmk_f32 && instr->opcode != aco_opcode::v_madak_f32 &&
          instr->opcode != aco_opcode::v_madmk_f16 && instr->opcode != aco_opcode::v_madak_f16 &&
          instr->opcode != aco_opcode::v_readfirstlane_b32 &&
          instr->opcode != aco_opcode::v_cvt_f64_i32 &&
          instr->opcode != aco_opcode::v_cvt_f64_f32 && instr->opcode != aco_opcode::v_cvt_f64_u32;
}
332  
/* Convert "instr" to DPP in place and return the old instruction (or NULL if
 * it already was DPP). The DPP control defaults to an identity quad-perm with
 * all rows/banks enabled. Callers should have checked can_use_DPP() first. */
aco_ptr<Instruction>
convert_to_DPP(aco_ptr<Instruction>& instr)
{
   if (instr->isDPP())
      return NULL;

   /* Re-create the instruction with the DPP bit set (and VOP3 cleared). */
   aco_ptr<Instruction> tmp = std::move(instr);
   Format format =
      (Format)(((uint32_t)tmp->format & ~(uint32_t)Format::VOP3) | (uint32_t)Format::DPP);
   instr.reset(create_instruction<DPP_instruction>(tmp->opcode, format, tmp->operands.size(),
                                                   tmp->definitions.size()));
   std::copy(tmp->operands.cbegin(), tmp->operands.cend(), instr->operands.begin());
   for (unsigned i = 0; i < instr->definitions.size(); i++)
      instr->definitions[i] = tmp->definitions[i];

   DPP_instruction* dpp = &instr->dpp();
   /* Identity swizzle; callers overwrite dpp_ctrl with the desired pattern. */
   dpp->dpp_ctrl = dpp_quad_perm(0, 1, 2, 3);
   dpp->row_mask = 0xf;
   dpp->bank_mask = 0xf;

   /* Preserve VOP3 neg/abs modifiers, which DPP can also encode. */
   if (tmp->isVOP3()) {
      const VOP3_instruction* vop3 = &tmp->vop3();
      memcpy(dpp->neg, vop3->neg, sizeof(dpp->neg));
      memcpy(dpp->abs, vop3->abs, sizeof(dpp->abs));
   }

   /* Lane-mask definition and any third operand are implicitly VCC in DPP. */
   if (instr->isVOPC() || instr->definitions.size() > 1)
      instr->definitions.back().setFixed(vcc);

   if (instr->operands.size() >= 3)
      instr->operands[2].setFixed(vcc);

   return tmp;
}
367  
/* Check whether source "idx" of VOP3 opcode "op" supports the opsel bit.
 * idx == -1 presumably refers to the destination — TODO confirm against
 * callers (instr_is_16bit passes -1). "high" requests selecting/writing the
 * upper 16 bits. */
bool
can_use_opsel(chip_class chip, aco_opcode op, int idx, bool high)
{
   /* opsel is only GFX9+ */
   if ((high || idx == -1) && chip < GFX9)
      return false;

   switch (op) {
   /* 16-bit VOP3 ALU ops: opsel usable on all sources and the destination. */
   case aco_opcode::v_div_fixup_f16:
   case aco_opcode::v_fma_f16:
   case aco_opcode::v_mad_f16:
   case aco_opcode::v_mad_u16:
   case aco_opcode::v_mad_i16:
   case aco_opcode::v_med3_f16:
   case aco_opcode::v_med3_i16:
   case aco_opcode::v_med3_u16:
   case aco_opcode::v_min3_f16:
   case aco_opcode::v_min3_i16:
   case aco_opcode::v_min3_u16:
   case aco_opcode::v_max3_f16:
   case aco_opcode::v_max3_i16:
   case aco_opcode::v_max3_u16:
   case aco_opcode::v_max_u16_e64:
   case aco_opcode::v_max_i16_e64:
   case aco_opcode::v_min_u16_e64:
   case aco_opcode::v_min_i16_e64:
   case aco_opcode::v_add_i16:
   case aco_opcode::v_sub_i16:
   case aco_opcode::v_add_u16_e64:
   case aco_opcode::v_sub_u16_e64:
   case aco_opcode::v_lshlrev_b16_e64:
   case aco_opcode::v_lshrrev_b16_e64:
   case aco_opcode::v_ashrrev_i16_e64:
   case aco_opcode::v_mul_lo_u16_e64: return true;
   /* Packing ops: opsel on sources only, not the (32-bit) destination. */
   case aco_opcode::v_pack_b32_f16:
   case aco_opcode::v_cvt_pknorm_i16_f16:
   case aco_opcode::v_cvt_pknorm_u16_f16: return idx != -1;
   /* Mixed-width mads: only the two 16-bit multiplicands take opsel. */
   case aco_opcode::v_mad_u32_u16:
   case aco_opcode::v_mad_i32_i16: return idx >= 0 && idx < 2;
   default: return false;
   }
}
410  
/* Check whether opcode "op" writes only the low 16 bits of its destination
 * VGPR (preserving the upper half) on the given chip. */
bool
instr_is_16bit(chip_class chip, aco_opcode op)
{
   /* partial register writes are GFX9+, only */
   if (chip < GFX9)
      return false;

   switch (op) {
   /* VOP3 */
   case aco_opcode::v_mad_f16:
   case aco_opcode::v_mad_u16:
   case aco_opcode::v_mad_i16:
   case aco_opcode::v_fma_f16:
   case aco_opcode::v_div_fixup_f16:
   case aco_opcode::v_interp_p2_f16:
   case aco_opcode::v_fma_mixlo_f16:
   /* VOP2 */
   case aco_opcode::v_mac_f16:
   case aco_opcode::v_madak_f16:
   case aco_opcode::v_madmk_f16: return chip >= GFX9;
   /* These only write 16 bits starting with GFX10. */
   case aco_opcode::v_add_f16:
   case aco_opcode::v_sub_f16:
   case aco_opcode::v_subrev_f16:
   case aco_opcode::v_mul_f16:
   case aco_opcode::v_max_f16:
   case aco_opcode::v_min_f16:
   case aco_opcode::v_ldexp_f16:
   case aco_opcode::v_fmac_f16:
   case aco_opcode::v_fmamk_f16:
   case aco_opcode::v_fmaak_f16:
   /* VOP1 */
   case aco_opcode::v_cvt_f16_f32:
   case aco_opcode::v_cvt_f16_u16:
   case aco_opcode::v_cvt_f16_i16:
   case aco_opcode::v_rcp_f16:
   case aco_opcode::v_sqrt_f16:
   case aco_opcode::v_rsq_f16:
   case aco_opcode::v_log_f16:
   case aco_opcode::v_exp_f16:
   case aco_opcode::v_frexp_mant_f16:
   case aco_opcode::v_frexp_exp_i16_f16:
   case aco_opcode::v_floor_f16:
   case aco_opcode::v_ceil_f16:
   case aco_opcode::v_trunc_f16:
   case aco_opcode::v_rndne_f16:
   case aco_opcode::v_fract_f16:
   case aco_opcode::v_sin_f16:
   case aco_opcode::v_cos_f16: return chip >= GFX10;
   // TODO: confirm whether these write 16 or 32 bit on GFX10+
   // case aco_opcode::v_cvt_u16_f16:
   // case aco_opcode::v_cvt_i16_f16:
   // case aco_opcode::p_cvt_f16_f32_rtne:
   // case aco_opcode::v_cvt_norm_i16_f16:
   // case aco_opcode::v_cvt_norm_u16_f16:
   /* on GFX10, all opsel instructions preserve the high bits */
   default: return chip >= GFX10 && can_use_opsel(chip, op, -1, false);
   }
}
469  
470  uint32_t
get_reduction_identity(ReduceOp op,unsigned idx)471  get_reduction_identity(ReduceOp op, unsigned idx)
472  {
473     switch (op) {
474     case iadd8:
475     case iadd16:
476     case iadd32:
477     case iadd64:
478     case fadd16:
479     case fadd32:
480     case fadd64:
481     case ior8:
482     case ior16:
483     case ior32:
484     case ior64:
485     case ixor8:
486     case ixor16:
487     case ixor32:
488     case ixor64:
489     case umax8:
490     case umax16:
491     case umax32:
492     case umax64: return 0;
493     case imul8:
494     case imul16:
495     case imul32:
496     case imul64: return idx ? 0 : 1;
497     case fmul16: return 0x3c00u;                /* 1.0 */
498     case fmul32: return 0x3f800000u;            /* 1.0 */
499     case fmul64: return idx ? 0x3ff00000u : 0u; /* 1.0 */
500     case imin8: return INT8_MAX;
501     case imin16: return INT16_MAX;
502     case imin32: return INT32_MAX;
503     case imin64: return idx ? 0x7fffffffu : 0xffffffffu;
504     case imax8: return INT8_MIN;
505     case imax16: return INT16_MIN;
506     case imax32: return INT32_MIN;
507     case imax64: return idx ? 0x80000000u : 0;
508     case umin8:
509     case umin16:
510     case iand8:
511     case iand16: return 0xffffffffu;
512     case umin32:
513     case umin64:
514     case iand32:
515     case iand64: return 0xffffffffu;
516     case fmin16: return 0x7c00u;                /* infinity */
517     case fmin32: return 0x7f800000u;            /* infinity */
518     case fmin64: return idx ? 0x7ff00000u : 0u; /* infinity */
519     case fmax16: return 0xfc00u;                /* negative infinity */
520     case fmax32: return 0xff800000u;            /* negative infinity */
521     case fmax64: return idx ? 0xfff00000u : 0u; /* negative infinity */
522     default: unreachable("Invalid reduction operation"); break;
523     }
524     return 0;
525  }
526  
/* Check whether the result of "instr" depends on the exec mask (i.e. whether
 * it may not be moved across exec-mask changes). */
bool
needs_exec_mask(const Instruction* instr)
{
   if (instr->isVALU()) {
      /* Lane-access instructions operate on explicit lanes, not exec. */
      return instr->opcode != aco_opcode::v_readlane_b32 &&
             instr->opcode != aco_opcode::v_readlane_b32_e64 &&
             instr->opcode != aco_opcode::v_writelane_b32 &&
             instr->opcode != aco_opcode::v_writelane_b32_e64;
   }

   if (instr->isVMEM() || instr->isFlatLike())
      return true;

   if (instr->isSALU() || instr->isBranch() || instr->isSMEM() || instr->isBarrier())
      return instr->reads_exec();

   if (instr->isPseudo()) {
      switch (instr->opcode) {
      case aco_opcode::p_create_vector:
      case aco_opcode::p_extract_vector:
      case aco_opcode::p_split_vector:
      case aco_opcode::p_phi:
      case aco_opcode::p_parallelcopy:
         /* These lower to per-lane VALU copies if any definition is a VGPR. */
         for (Definition def : instr->definitions) {
            if (def.getTemp().type() == RegType::vgpr)
               return true;
         }
         return instr->reads_exec();
      case aco_opcode::p_spill:
      case aco_opcode::p_reload:
      case aco_opcode::p_logical_start:
      case aco_opcode::p_logical_end:
      case aco_opcode::p_startpgm: return instr->reads_exec();
      default: break;
      }
   }

   /* Conservative default: assume exec-dependent. */
   return true;
}
566  
/* Related forms of a VALU floating-point comparison opcode, filled in by
 * get_cmp_info(). Fields that do not apply hold aco_opcode::num_opcodes. */
struct CmpInfo {
   aco_opcode ordered;           /* ordered version of the comparison */
   aco_opcode unordered;         /* unordered (true-on-NaN) version */
   aco_opcode ordered_swapped;   /* ordered version with swapped operands */
   aco_opcode unordered_swapped; /* unordered version with swapped operands */
   aco_opcode inverse;           /* logical negation of the comparison */
   aco_opcode f32;               /* same comparison on 32-bit operands */
   unsigned size;                /* operand bit size: 16, 32 or 64 */
};
576  
577  ALWAYS_INLINE bool
get_cmp_info(aco_opcode op,CmpInfo * info)578  get_cmp_info(aco_opcode op, CmpInfo* info)
579  {
580     info->ordered = aco_opcode::num_opcodes;
581     info->unordered = aco_opcode::num_opcodes;
582     info->ordered_swapped = aco_opcode::num_opcodes;
583     info->unordered_swapped = aco_opcode::num_opcodes;
584     switch (op) {
585        // clang-format off
586  #define CMP2(ord, unord, ord_swap, unord_swap, sz)                                                 \
587     case aco_opcode::v_cmp_##ord##_f##sz:                                                           \
588     case aco_opcode::v_cmp_n##unord##_f##sz:                                                        \
589        info->ordered = aco_opcode::v_cmp_##ord##_f##sz;                                             \
590        info->unordered = aco_opcode::v_cmp_n##unord##_f##sz;                                        \
591        info->ordered_swapped = aco_opcode::v_cmp_##ord_swap##_f##sz;                                \
592        info->unordered_swapped = aco_opcode::v_cmp_n##unord_swap##_f##sz;                           \
593        info->inverse = op == aco_opcode::v_cmp_n##unord##_f##sz ? aco_opcode::v_cmp_##unord##_f##sz \
594                                                                 : aco_opcode::v_cmp_n##ord##_f##sz; \
595        info->f32 = op == aco_opcode::v_cmp_##ord##_f##sz ? aco_opcode::v_cmp_##ord##_f32            \
596                                                          : aco_opcode::v_cmp_n##unord##_f32;        \
597        info->size = sz;                                                                             \
598        return true;
599  #define CMP(ord, unord, ord_swap, unord_swap)                                                      \
600     CMP2(ord, unord, ord_swap, unord_swap, 16)                                                      \
601     CMP2(ord, unord, ord_swap, unord_swap, 32)                                                      \
602     CMP2(ord, unord, ord_swap, unord_swap, 64)
603        CMP(lt, /*n*/ge, gt, /*n*/le)
604        CMP(eq, /*n*/lg, eq, /*n*/lg)
605        CMP(le, /*n*/gt, ge, /*n*/lt)
606        CMP(gt, /*n*/le, lt, /*n*/le)
607        CMP(lg, /*n*/eq, lg, /*n*/eq)
608        CMP(ge, /*n*/lt, le, /*n*/gt)
609  #undef CMP
610  #undef CMP2
611  #define ORD_TEST(sz)                                                                               \
612     case aco_opcode::v_cmp_u_f##sz:                                                                 \
613        info->f32 = aco_opcode::v_cmp_u_f32;                                                         \
614        info->inverse = aco_opcode::v_cmp_o_f##sz;                                                   \
615        info->size = sz;                                                                             \
616        return true;                                                                                 \
617     case aco_opcode::v_cmp_o_f##sz:                                                                 \
618        info->f32 = aco_opcode::v_cmp_o_f32;                                                         \
619        info->inverse = aco_opcode::v_cmp_u_f##sz;                                                   \
620        info->size = sz;                                                                             \
621        return true;
622        ORD_TEST(16)
623        ORD_TEST(32)
624        ORD_TEST(64)
625  #undef ORD_TEST
626        // clang-format on
627     default: return false;
628     }
629  }
630  
631  aco_opcode
get_ordered(aco_opcode op)632  get_ordered(aco_opcode op)
633  {
634     CmpInfo info;
635     return get_cmp_info(op, &info) ? info.ordered : aco_opcode::num_opcodes;
636  }
637  
638  aco_opcode
get_unordered(aco_opcode op)639  get_unordered(aco_opcode op)
640  {
641     CmpInfo info;
642     return get_cmp_info(op, &info) ? info.unordered : aco_opcode::num_opcodes;
643  }
644  
645  aco_opcode
get_inverse(aco_opcode op)646  get_inverse(aco_opcode op)
647  {
648     CmpInfo info;
649     return get_cmp_info(op, &info) ? info.inverse : aco_opcode::num_opcodes;
650  }
651  
652  aco_opcode
get_f32_cmp(aco_opcode op)653  get_f32_cmp(aco_opcode op)
654  {
655     CmpInfo info;
656     return get_cmp_info(op, &info) ? info.f32 : aco_opcode::num_opcodes;
657  }
658  
659  unsigned
get_cmp_bitsize(aco_opcode op)660  get_cmp_bitsize(aco_opcode op)
661  {
662     CmpInfo info;
663     return get_cmp_info(op, &info) ? info.size : 0;
664  }
665  
666  bool
is_cmp(aco_opcode op)667  is_cmp(aco_opcode op)
668  {
669     CmpInfo info;
670     return get_cmp_info(op, &info) && info.ordered != aco_opcode::num_opcodes;
671  }
672  
/* Check whether the first two operands of "instr" may be swapped; if so,
 * store the opcode the instruction must use after the swap in "new_op". */
bool
can_swap_operands(aco_ptr<Instruction>& instr, aco_opcode* new_op)
{
   if (instr->isDPP())
      return false;

   /* The swap would move src0 into the src1 slot; src0 being a constant or
    * SGPR is rejected here — presumably because src1 cannot encode those
    * (confirm against the VOP encoding rules). */
   if (instr->operands[0].isConstant() ||
       (instr->operands[0].isTemp() && instr->operands[0].getTemp().type() == RegType::sgpr))
      return false;

   switch (instr->opcode) {
   /* Commutative operations: the opcode is unchanged. */
   case aco_opcode::v_add_u32:
   case aco_opcode::v_add_co_u32:
   case aco_opcode::v_add_co_u32_e64:
   case aco_opcode::v_add_i32:
   case aco_opcode::v_add_f16:
   case aco_opcode::v_add_f32:
   case aco_opcode::v_mul_f16:
   case aco_opcode::v_mul_f32:
   case aco_opcode::v_or_b32:
   case aco_opcode::v_and_b32:
   case aco_opcode::v_xor_b32:
   case aco_opcode::v_max_f16:
   case aco_opcode::v_max_f32:
   case aco_opcode::v_min_f16:
   case aco_opcode::v_min_f32:
   case aco_opcode::v_max_i32:
   case aco_opcode::v_min_i32:
   case aco_opcode::v_max_u32:
   case aco_opcode::v_min_u32:
   case aco_opcode::v_max_i16:
   case aco_opcode::v_min_i16:
   case aco_opcode::v_max_u16:
   case aco_opcode::v_min_u16:
   case aco_opcode::v_max_i16_e64:
   case aco_opcode::v_min_i16_e64:
   case aco_opcode::v_max_u16_e64:
   case aco_opcode::v_min_u16_e64: *new_op = instr->opcode; return true;
   /* Subtractions swap to their reverse ("subrev") forms. */
   case aco_opcode::v_sub_f16: *new_op = aco_opcode::v_subrev_f16; return true;
   case aco_opcode::v_sub_f32: *new_op = aco_opcode::v_subrev_f32; return true;
   case aco_opcode::v_sub_co_u32: *new_op = aco_opcode::v_subrev_co_u32; return true;
   case aco_opcode::v_sub_u16: *new_op = aco_opcode::v_subrev_u16; return true;
   case aco_opcode::v_sub_u32: *new_op = aco_opcode::v_subrev_u32; return true;
   default: {
      /* Float comparisons swap to their operand-swapped counterparts. The
       * unchecked call is safe: get_cmp_info always initializes the
       * ordered/unordered fields (to num_opcodes on failure), so neither
       * branch below matches a non-comparison opcode. */
      CmpInfo info;
      get_cmp_info(instr->opcode, &info);
      if (info.ordered == instr->opcode) {
         *new_op = info.ordered_swapped;
         return true;
      }
      if (info.unordered == instr->opcode) {
         *new_op = info.unordered_swapped;
         return true;
      }
      return false;
   }
   }
}
731  
/* Default constructor: all counters start out unset. */
wait_imm::wait_imm() : vm(unset_counter), exp(unset_counter), lgkm(unset_counter), vs(unset_counter)
{}
/* Construct from explicit counter values (unset_counter for "don't care"). */
wait_imm::wait_imm(uint16_t vm_, uint16_t exp_, uint16_t lgkm_, uint16_t vs_)
    : vm(vm_), exp(exp_), lgkm(lgkm_), vs(vs_)
{}
737  
/* Decode a hardware s_waitcnt immediate into separate counters.
 * Layout: vm = bits[3:0] (plus bits[15:14] as vm[5:4] on GFX9+),
 * exp = bits[6:4], lgkm = bits[11:8] (plus bits[13:12] as lgkm[5:4] on
 * GFX10+). The vs counter has no encoding here and stays unset. */
wait_imm::wait_imm(enum chip_class chip, uint16_t packed) : vs(unset_counter)
{
   vm = packed & 0xf;
   if (chip >= GFX9)
      vm |= (packed >> 10) & 0x30;

   exp = (packed >> 4) & 0x7;

   lgkm = (packed >> 8) & 0xf;
   if (chip >= GFX10)
      lgkm |= (packed >> 8) & 0x30;
}
750  
/* Encode the counters into a hardware s_waitcnt immediate for "chip".
 * Inverse of the unpacking constructor above: vm[3:0] at bits 3:0 (vm[5:4]
 * at bits 15:14 on GFX9+), exp at bits 6:4, lgkm at bits 11:8 (13:8 on
 * GFX10+). The vs counter is not part of this immediate. */
uint16_t
wait_imm::pack(enum chip_class chip) const
{
   uint16_t imm = 0;
   assert(exp == unset_counter || exp <= 0x7);
   switch (chip) {
   case GFX10:
   case GFX10_3:
      assert(lgkm == unset_counter || lgkm <= 0x3f);
      assert(vm == unset_counter || vm <= 0x3f);
      imm = ((vm & 0x30) << 10) | ((lgkm & 0x3f) << 8) | ((exp & 0x7) << 4) | (vm & 0xf);
      break;
   case GFX9:
      assert(lgkm == unset_counter || lgkm <= 0xf);
      assert(vm == unset_counter || vm <= 0x3f);
      imm = ((vm & 0x30) << 10) | ((lgkm & 0xf) << 8) | ((exp & 0x7) << 4) | (vm & 0xf);
      break;
   default:
      assert(lgkm == unset_counter || lgkm <= 0xf);
      assert(vm == unset_counter || vm <= 0xf);
      imm = ((lgkm & 0xf) << 8) | ((exp & 0x7) << 4) | (vm & 0xf);
      break;
   }
   /* Set the bits an unset counter could not express on the target, so the
    * immediate can later be interpreted without knowing the architecture. */
   if (chip < GFX9 && vm == wait_imm::unset_counter)
      imm |= 0xc000; /* should have no effect on pre-GFX9 and now we won't have to worry about the
                        architecture when interpreting the immediate */
   if (chip < GFX10 && lgkm == wait_imm::unset_counter)
      imm |= 0x3000; /* should have no effect on pre-GFX10 and now we won't have to worry about the
                        architecture when interpreting the immediate */
   return imm;
}
782  
783  bool
combine(const wait_imm & other)784  wait_imm::combine(const wait_imm& other)
785  {
786     bool changed = other.vm < vm || other.exp < exp || other.lgkm < lgkm || other.vs < vs;
787     vm = std::min(vm, other.vm);
788     exp = std::min(exp, other.exp);
789     lgkm = std::min(lgkm, other.lgkm);
790     vs = std::min(vs, other.vs);
791     return changed;
792  }
793  
794  bool
empty() const795  wait_imm::empty() const
796  {
797     return vm == unset_counter && exp == unset_counter && lgkm == unset_counter &&
798            vs == unset_counter;
799  }
800  
/* Heuristic: check whether two memory instructions should be placed in the
 * same clause, i.e. whether they are likely to access nearby addresses. */
bool
should_form_clause(const Instruction* a, const Instruction* b)
{
   /* Vertex attribute loads from the same binding likely load from similar addresses */
   unsigned a_vtx_binding =
      a->isMUBUF() ? a->mubuf().vtx_binding : (a->isMTBUF() ? a->mtbuf().vtx_binding : 0);
   unsigned b_vtx_binding =
      b->isMUBUF() ? b->mubuf().vtx_binding : (b->isMTBUF() ? b->mtbuf().vtx_binding : 0);
   if (a_vtx_binding && a_vtx_binding == b_vtx_binding)
      return true;

   /* Beyond this point, only instructions of the same encoding are clustered. */
   if (a->format != b->format)
      return false;

   /* Assume loads which don't use descriptors might load from similar addresses. */
   if (a->isFlatLike())
      return true;
   if (a->isSMEM() && a->operands[0].bytes() == 8 && b->operands[0].bytes() == 8)
      return true;

   /* If they load from the same descriptor, assume they might load from similar
    * addresses.
    */
   if (a->isVMEM() || a->isSMEM())
      return a->operands[0].tempId() == b->operands[0].tempId();

   return false;
}
829  
830  } // namespace aco
831