1 /*
2  * Copyright © 2018 Valve Corporation
3  * Copyright © 2018 Google
4  *
5  * SPDX-License-Identifier: MIT
6  */
7 
8 #include "aco_instruction_selection.h"
9 
10 #include "aco_builder.h"
11 #include "aco_interface.h"
12 #include "aco_ir.h"
13 
14 #include "common/ac_descriptors.h"
15 #include "common/ac_gpu_info.h"
16 #include "common/nir/ac_nir.h"
17 #include "common/sid.h"
18 
19 #include "util/fast_idiv_by_const.h"
20 #include "util/memstream.h"
21 
22 #include <array>
23 #include <functional>
24 #include <map>
25 #include <numeric>
26 #include <stack>
27 #include <utility>
28 #include <vector>
29 
30 namespace aco {
31 namespace {
32 
33 #define isel_err(...) _isel_err(ctx, __FILE__, __LINE__, __VA_ARGS__)
34 
35 static void
36 _isel_err(isel_context* ctx, const char* file, unsigned line, const nir_instr* instr,
37           const char* msg)
38 {
39    char* out;
40    size_t outsize;
41    struct u_memstream mem;
42    u_memstream_open(&mem, &out, &outsize);
43    FILE* const memf = u_memstream_get(&mem);
44 
45    fprintf(memf, "%s: ", msg);
46    nir_print_instr(instr, memf);
47    u_memstream_close(&mem);
48 
49    _aco_err(ctx->program, file, line, out);
50    free(out);
51 }
52 
53 struct loop_context {
54    Block loop_exit;
55 
56    unsigned header_idx_old;
57    Block* exit_old;
58    bool divergent_cont_old;
59    bool divergent_branch_old;
60    bool divergent_if_old;
61 };
62 
63 static void visit_cf_list(struct isel_context* ctx, struct exec_list* list);
64 
65 static void
66 add_logical_edge(unsigned pred_idx, Block* succ)
67 {
68    succ->logical_preds.emplace_back(pred_idx);
69 }
70 
71 static void
72 add_linear_edge(unsigned pred_idx, Block* succ)
73 {
74    succ->linear_preds.emplace_back(pred_idx);
75 }
76 
77 static void
78 add_edge(unsigned pred_idx, Block* succ)
79 {
80    add_logical_edge(pred_idx, succ);
81    add_linear_edge(pred_idx, succ);
82 }
83 
84 static void
85 append_logical_start(Block* b)
86 {
87    Builder(NULL, b).pseudo(aco_opcode::p_logical_start);
88 }
89 
90 static void
91 append_logical_end(Block* b)
92 {
93    Builder(NULL, b).pseudo(aco_opcode::p_logical_end);
94 }
95 
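/* Returns the Temp that was pre-allocated for the given NIR SSA def
 * (the def index is offset by ctx->first_temp_id). */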
96 Temp
97 get_ssa_temp(struct isel_context* ctx, nir_def* def)
98 {
99    uint32_t id = ctx->first_temp_id + def->index;
100    return Temp(id, ctx->program->temp_rc[id]);
101 }
102 
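/* Creates a Builder for the current block with the float-controls flags
 * (exact, signed-zero/inf/nan preserve) taken from the NIR ALU instruction. */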
103 static Builder
104 create_alu_builder(isel_context* ctx, nir_alu_instr* instr)
105 {
106    Builder bld(ctx->program, ctx->block);
107    bld.is_precise = instr->exact;
108    bld.is_sz_preserve = nir_alu_instr_is_signed_zero_preserve(instr);
109    bld.is_inf_preserve = nir_alu_instr_is_inf_preserve(instr);
110    bld.is_nan_preserve = nir_alu_instr_is_nan_preserve(instr);
111    return bld;
112 }
113 
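/* Writes base plus the number of set bits in mask below the current lane to dst
 * (mask defaults to all lanes). Wave64 splits the mask across v_mbcnt_lo/hi. */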
114 Temp
115 emit_mbcnt(isel_context* ctx, Temp dst, Operand mask = Operand(), Operand base = Operand::zero())
116 {
117    Builder bld(ctx->program, ctx->block);
118    assert(mask.isUndefined() || mask.isTemp() || (mask.isFixed() && mask.physReg() == exec));
119    assert(mask.isUndefined() || mask.bytes() == bld.lm.bytes());
120 
121    if (ctx->program->wave_size == 32) {
122       Operand mask_lo = mask.isUndefined() ? Operand::c32(-1u) : mask;
123       return bld.vop3(aco_opcode::v_mbcnt_lo_u32_b32, Definition(dst), mask_lo, base);
124    }
125 
126    Operand mask_lo = Operand::c32(-1u);
127    Operand mask_hi = Operand::c32(-1u);
128 
129    if (mask.isTemp()) {
130       RegClass rc = RegClass(mask.regClass().type(), 1);
131       Builder::Result mask_split =
132          bld.pseudo(aco_opcode::p_split_vector, bld.def(rc), bld.def(rc), mask);
133       mask_lo = Operand(mask_split.def(0).getTemp());
134       mask_hi = Operand(mask_split.def(1).getTemp());
135    } else if (mask.physReg() == exec) {
136       mask_lo = Operand(exec_lo, s1);
137       mask_hi = Operand(exec_hi, s1);
138    }
139 
140    Temp mbcnt_lo = bld.vop3(aco_opcode::v_mbcnt_lo_u32_b32, bld.def(v1), mask_lo, base);
141 
142    if (ctx->program->gfx_level <= GFX7)
143       return bld.vop2(aco_opcode::v_mbcnt_hi_u32_b32, Definition(dst), mask_hi, mbcnt_lo);
144    else
145       return bld.vop3(aco_opcode::v_mbcnt_hi_u32_b32_e64, Definition(dst), mask_hi, mbcnt_lo);
146 }
147 
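/* Marks the current position as requiring WQM in fragment shaders;
 * enable_helpers additionally requests whole-quad-mode helper invocations. */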
148 inline void
149 set_wqm(isel_context* ctx, bool enable_helpers = false)
150 {
151    if (ctx->program->stage == fragment_fs) {
152       ctx->wqm_block_idx = ctx->block->index;
153       ctx->wqm_instruction_idx = ctx->block->instructions.size();
154       if (ctx->shader)
155          enable_helpers |= ctx->shader->info.fs.require_full_quads;
156       ctx->program->needs_wqm |= enable_helpers;
157    }
158 }
159 
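/* Reads data from the lane selected by index (a wave-wide shuffle). The
 * emitted sequence depends on the GPU generation and wave size. */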
160 static Temp
161 emit_bpermute(isel_context* ctx, Builder& bld, Temp index, Temp data)
162 {
163    if (index.regClass() == s1)
164       return bld.readlane(bld.def(s1), data, index);
165 
166    /* Avoid using shared VGPRs for shuffle on GFX10 when the shader consists
167     * of multiple binaries, because the VGPR use is not known when choosing
168     * which registers to use for the shared VGPRs.
169     */
170    const bool avoid_shared_vgprs =
171       ctx->options->gfx_level >= GFX10 && ctx->options->gfx_level < GFX11 &&
172       ctx->program->wave_size == 64 &&
173       (ctx->program->info.ps.has_epilog || ctx->program->info.merged_shader_compiled_separately ||
174        ctx->program->info.vs.has_prolog || ctx->stage == raytracing_cs);
175 
176    if (ctx->options->gfx_level <= GFX7 || avoid_shared_vgprs) {
177       /* GFX6-7: there is no bpermute instruction */
178       return bld.pseudo(aco_opcode::p_bpermute_readlane, bld.def(v1), bld.def(bld.lm),
179                         bld.def(bld.lm, vcc), index, data);
180    } else if (ctx->options->gfx_level >= GFX10 && ctx->options->gfx_level <= GFX11_5 &&
181               ctx->program->wave_size == 64) {
182 
183       /* GFX10-11.5 wave64 mode: emulate full-wave bpermute */
184       Temp index_is_lo =
185          bld.vopc(aco_opcode::v_cmp_ge_u32, bld.def(bld.lm), Operand::c32(31u), index);
186       Builder::Result index_is_lo_split =
187          bld.pseudo(aco_opcode::p_split_vector, bld.def(s1), bld.def(s1), index_is_lo);
188       Temp index_is_lo_n1 = bld.sop1(aco_opcode::s_not_b32, bld.def(s1), bld.def(s1, scc),
189                                      index_is_lo_split.def(1).getTemp());
190       Operand same_half = bld.pseudo(aco_opcode::p_create_vector, bld.def(s2),
191                                      index_is_lo_split.def(0).getTemp(), index_is_lo_n1);
192       Operand index_x4 = bld.vop2(aco_opcode::v_lshlrev_b32, bld.def(v1), Operand::c32(2u), index);
193 
194       if (ctx->options->gfx_level <= GFX10_3) {
195          /* We need one pair of shared VGPRs:
196           * Note that these have twice the allocation granularity of normal VGPRs
197           */
198          ctx->program->config->num_shared_vgprs = 2 * ctx->program->dev.vgpr_alloc_granule;
199 
200          return bld.pseudo(aco_opcode::p_bpermute_shared_vgpr, bld.def(v1), bld.def(s2),
201                            bld.def(s1, scc), index_x4, data, same_half);
202       } else {
203          return bld.pseudo(aco_opcode::p_bpermute_permlane, bld.def(v1), bld.def(s2),
204                            bld.def(s1, scc), Operand(v1.as_linear()), index_x4, data, same_half);
205       }
206    } else {
207       /* wave32 or GFX8-9, GFX12+: bpermute works normally */
208       Temp index_x4 = bld.vop2(aco_opcode::v_lshlrev_b32, bld.def(v1), Operand::c32(2u), index);
209       return bld.ds(aco_opcode::ds_bpermute_b32, bld.def(v1), index_x4, data);
210    }
211 }
212 
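/* Lane swizzle using a ds_swizzle-style and/or/xor mask. On GFX8+ the mask is
 * lowered to DPP or v_permlane(x)16 where possible, otherwise ds_swizzle_b32 is used. */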
213 static Temp
214 emit_masked_swizzle(isel_context* ctx, Builder& bld, Temp src, unsigned mask, bool allow_fi)
215 {
216    if (ctx->options->gfx_level >= GFX8) {
217       unsigned and_mask = mask & 0x1f;
218       unsigned or_mask = (mask >> 5) & 0x1f;
219       unsigned xor_mask = (mask >> 10) & 0x1f;
220 
221       /* Eliminate or_mask. */
222       and_mask &= ~or_mask;
223       xor_mask ^= or_mask;
224 
225       uint16_t dpp_ctrl = 0xffff;
226 
227       /* Prefer DPP16 over DPP8 over v_permlane(x)16_b32, because DPP16
228        * supports modifiers and v_permlane can't be folded into VALU
229        * instructions.
230        */
231       if ((and_mask & 0x1c) == 0x1c && xor_mask < 4) {
232          unsigned res[4];
233          for (unsigned i = 0; i < 4; i++)
234             res[i] = ((i & and_mask) ^ xor_mask);
235          dpp_ctrl = dpp_quad_perm(res[0], res[1], res[2], res[3]);
236       } else if (and_mask == 0x1f && xor_mask == 8) {
237          dpp_ctrl = dpp_row_rr(8);
238       } else if (and_mask == 0x1f && xor_mask == 0xf) {
239          dpp_ctrl = dpp_row_mirror;
240       } else if (and_mask == 0x1f && xor_mask == 0x7) {
241          dpp_ctrl = dpp_row_half_mirror;
242       } else if (ctx->options->gfx_level >= GFX11 && and_mask == 0x10 && xor_mask < 0x10) {
243          dpp_ctrl = dpp_row_share(xor_mask);
244       } else if (ctx->options->gfx_level >= GFX11 && and_mask == 0x1f && xor_mask < 0x10) {
245          dpp_ctrl = dpp_row_xmask(xor_mask);
246       } else if (ctx->options->gfx_level >= GFX10 && (and_mask & 0x18) == 0x18 && xor_mask < 8) {
247          uint32_t lane_sel = 0;
248          for (unsigned i = 0; i < 8; i++)
249             lane_sel |= ((i & and_mask) ^ xor_mask) << (i * 3);
250          return bld.vop1_dpp8(aco_opcode::v_mov_b32, bld.def(v1), src, lane_sel, allow_fi);
251       } else if (ctx->options->gfx_level >= GFX10 && (and_mask & 0x10) == 0x10) {
252          uint64_t lane_mask = 0;
253          for (unsigned i = 0; i < 16; i++)
254             lane_mask |= uint64_t((i & and_mask) ^ (xor_mask & 0xf)) << i * 4;
255          aco_opcode opcode =
256             xor_mask & 0x10 ? aco_opcode::v_permlanex16_b32 : aco_opcode::v_permlane16_b32;
257          Temp op1 = bld.copy(bld.def(s1), Operand::c32(lane_mask & 0xffffffff));
258          Temp op2 = bld.copy(bld.def(s1), Operand::c32(lane_mask >> 32));
259          Builder::Result ret = bld.vop3(opcode, bld.def(v1), src, op1, op2);
260          ret->valu().opsel[0] = allow_fi; /* set FETCH_INACTIVE */
261          ret->valu().opsel[1] = true;     /* set BOUND_CTRL */
262          return ret;
263       }
264 
265       if (dpp_ctrl != 0xffff)
266          return bld.vop1_dpp(aco_opcode::v_mov_b32, bld.def(v1), src, dpp_ctrl, 0xf, 0xf, true,
267                              allow_fi);
268    }
269 
270    return bld.ds(aco_opcode::ds_swizzle_b32, bld.def(v1), src, mask, 0, false);
271 }
272 
273 Temp
274 as_vgpr(Builder& bld, Temp val)
275 {
276    if (val.type() == RegType::sgpr)
277       return bld.copy(bld.def(RegType::vgpr, val.size()), val);
278    assert(val.type() == RegType::vgpr);
279    return val;
280 }
281 
282 Temp
283 as_vgpr(isel_context* ctx, Temp val)
284 {
285    Builder bld(ctx->program, ctx->block);
286    return as_vgpr(bld, val);
287 }
288 
289 void
290 emit_extract_vector(isel_context* ctx, Temp src, uint32_t idx, Temp dst)
291 {
292    Builder bld(ctx->program, ctx->block);
293    bld.pseudo(aco_opcode::p_extract_vector, Definition(dst), src, Operand::c32(idx));
294 }
295 
296 Temp
297 emit_extract_vector(isel_context* ctx, Temp src, uint32_t idx, RegClass dst_rc)
298 {
299    /* no need to extract the whole vector */
300    if (src.regClass() == dst_rc) {
301       assert(idx == 0);
302       return src;
303    }
304 
305    assert(src.bytes() > (idx * dst_rc.bytes()));
306    Builder bld(ctx->program, ctx->block);
307    auto it = ctx->allocated_vec.find(src.id());
308    if (it != ctx->allocated_vec.end() && dst_rc.bytes() == it->second[idx].regClass().bytes()) {
309       if (it->second[idx].regClass() == dst_rc) {
310          return it->second[idx];
311       } else {
312          assert(!dst_rc.is_subdword());
313          assert(dst_rc.type() == RegType::vgpr && it->second[idx].type() == RegType::sgpr);
314          return bld.copy(bld.def(dst_rc), it->second[idx]);
315       }
316    }
317 
318    if (dst_rc.is_subdword())
319       src = as_vgpr(ctx, src);
320 
321    if (src.bytes() == dst_rc.bytes()) {
322       assert(idx == 0);
323       return bld.copy(bld.def(dst_rc), src);
324    } else {
325       Temp dst = bld.tmp(dst_rc);
326       emit_extract_vector(ctx, src, idx, dst);
327       return dst;
328    }
329 }
330 
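/* Splits a vector into num_components elements and caches them in
 * ctx->allocated_vec so that later emit_extract_vector() calls can reuse them. */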
331 void
332 emit_split_vector(isel_context* ctx, Temp vec_src, unsigned num_components)
333 {
334    if (num_components == 1)
335       return;
336    if (ctx->allocated_vec.find(vec_src.id()) != ctx->allocated_vec.end())
337       return;
338    RegClass rc;
339    if (num_components > vec_src.size()) {
340       if (vec_src.type() == RegType::sgpr) {
341          /* should still help get_alu_src() */
342          emit_split_vector(ctx, vec_src, vec_src.size());
343          return;
344       }
345       /* sub-dword split */
346       rc = RegClass(RegType::vgpr, vec_src.bytes() / num_components).as_subdword();
347    } else {
348       rc = RegClass(vec_src.type(), vec_src.size() / num_components);
349    }
350    aco_ptr<Instruction> split{
351       create_instruction(aco_opcode::p_split_vector, Format::PSEUDO, 1, num_components)};
352    split->operands[0] = Operand(vec_src);
353    std::array<Temp, NIR_MAX_VEC_COMPONENTS> elems;
354    for (unsigned i = 0; i < num_components; i++) {
355       elems[i] = ctx->program->allocateTmp(rc);
356       split->definitions[i] = Definition(elems[i]);
357    }
358    ctx->block->instructions.emplace_back(std::move(split));
359    ctx->allocated_vec.emplace(vec_src.id(), elems);
360 }
361 
362 /* This vector expansion uses a mask to determine which elements in the new vector
363  * come from the original vector. The other elements are undefined. */
364 void
365 expand_vector(isel_context* ctx, Temp vec_src, Temp dst, unsigned num_components, unsigned mask,
366               bool zero_padding = false)
367 {
368    assert(vec_src.type() == RegType::vgpr);
369    Builder bld(ctx->program, ctx->block);
370 
371    if (dst.type() == RegType::sgpr && num_components > dst.size()) {
372       Temp tmp_dst = bld.tmp(RegClass::get(RegType::vgpr, 2 * num_components));
373       expand_vector(ctx, vec_src, tmp_dst, num_components, mask, zero_padding);
374       bld.pseudo(aco_opcode::p_as_uniform, Definition(dst), tmp_dst);
375       ctx->allocated_vec[dst.id()] = ctx->allocated_vec[tmp_dst.id()];
376       return;
377    }
378 
379    emit_split_vector(ctx, vec_src, util_bitcount(mask));
380 
381    if (vec_src == dst)
382       return;
383 
384    if (num_components == 1) {
385       if (dst.type() == RegType::sgpr)
386          bld.pseudo(aco_opcode::p_as_uniform, Definition(dst), vec_src);
387       else
388          bld.copy(Definition(dst), vec_src);
389       return;
390    }
391 
392    unsigned component_bytes = dst.bytes() / num_components;
393    RegClass src_rc = RegClass::get(RegType::vgpr, component_bytes);
394    RegClass dst_rc = RegClass::get(dst.type(), component_bytes);
395    assert(dst.type() == RegType::vgpr || !src_rc.is_subdword());
396    std::array<Temp, NIR_MAX_VEC_COMPONENTS> elems;
397 
398    Temp padding = Temp(0, dst_rc);
399    if (zero_padding)
400       padding = bld.copy(bld.def(dst_rc), Operand::zero(component_bytes));
401 
402    aco_ptr<Instruction> vec{
403       create_instruction(aco_opcode::p_create_vector, Format::PSEUDO, num_components, 1)};
404    vec->definitions[0] = Definition(dst);
405    unsigned k = 0;
406    for (unsigned i = 0; i < num_components; i++) {
407       if (mask & (1 << i)) {
408          Temp src = emit_extract_vector(ctx, vec_src, k++, src_rc);
409          if (dst.type() == RegType::sgpr)
410             src = bld.as_uniform(src);
411          vec->operands[i] = Operand(src);
412          elems[i] = src;
413       } else {
414          vec->operands[i] = Operand::zero(component_bytes);
415          elems[i] = padding;
416       }
417    }
418    ctx->block->instructions.emplace_back(std::move(vec));
419    ctx->allocated_vec.emplace(dst.id(), elems);
420 }
421 
422 Temp
423 get_ssa_temp_tex(struct isel_context* ctx, nir_def* def, bool is_16bit)
424 {
425    RegClass rc = RegClass::get(RegType::vgpr, (is_16bit ? 2 : 4) * def->num_components);
426    Temp tmp = get_ssa_temp(ctx, def);
427    if (tmp.bytes() != rc.bytes())
428       return emit_extract_vector(ctx, tmp, 0, rc);
429    else
430       return tmp;
431 }
432 
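/* Converts a uniform boolean (s1, read via SCC) into a lane mask:
 * all bits set if true, zero otherwise. */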
433 Temp
434 bool_to_vector_condition(isel_context* ctx, Temp val, Temp dst = Temp(0, s2))
435 {
436    Builder bld(ctx->program, ctx->block);
437    if (!dst.id())
438       dst = bld.tmp(bld.lm);
439 
440    assert(val.regClass() == s1);
441    assert(dst.regClass() == bld.lm);
442 
443    return bld.sop2(Builder::s_cselect, Definition(dst), Operand::c32(-1), Operand::zero(),
444                    bld.scc(val));
445 }
446 
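/* Converts a divergent boolean (lane mask) into a uniform s1 value that is
 * true if the condition holds in any active lane. */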
447 Temp
448 bool_to_scalar_condition(isel_context* ctx, Temp val, Temp dst = Temp(0, s1))
449 {
450    Builder bld(ctx->program, ctx->block);
451    if (!dst.id())
452       dst = bld.tmp(s1);
453 
454    assert(val.regClass() == bld.lm);
455    assert(dst.regClass() == s1);
456 
457    /* if we're currently in WQM mode, ensure that the source is also computed in WQM */
458    bld.sop2(Builder::s_and, bld.def(bld.lm), bld.scc(Definition(dst)), val, Operand(exec, bld.lm));
459    return dst;
460 }
461 
462 /**
463  * Copies the first src_bits of the input to the output Temp. Input bits at positions larger than
464  * src_bits and dst_bits are truncated.
465  *
466  * Sign extension may be applied using the sign_extend parameter. The position of the input sign
467  * bit is indicated by src_bits in this case.
468  *
469  * If dst.bytes() is larger than dst_bits/8, the value of the upper bits is undefined.
470  */
471 Temp
472 convert_int(isel_context* ctx, Builder& bld, Temp src, unsigned src_bits, unsigned dst_bits,
473             bool sign_extend, Temp dst = Temp())
474 {
475    assert(!(sign_extend && dst_bits < src_bits) &&
476           "Shrinking integers is not supported for signed inputs");
477 
478    if (!dst.id()) {
479       if (dst_bits % 32 == 0 || src.type() == RegType::sgpr)
480          dst = bld.tmp(src.type(), DIV_ROUND_UP(dst_bits, 32u));
481       else
482          dst = bld.tmp(RegClass(RegType::vgpr, dst_bits / 8u).as_subdword());
483    }
484 
485    assert(src.type() == RegType::sgpr || src_bits == src.bytes() * 8);
486    assert(dst.type() == RegType::sgpr || dst_bits == dst.bytes() * 8);
487 
488    if (dst.bytes() == src.bytes() && dst_bits < src_bits) {
489       /* Copy the raw value, leaving an undefined value in the upper bits for
490        * the caller to handle appropriately */
491       return bld.copy(Definition(dst), src);
492    } else if (dst.bytes() < src.bytes()) {
493       return bld.pseudo(aco_opcode::p_extract_vector, Definition(dst), src, Operand::zero());
494    }
495 
496    Temp tmp = dst;
497    if (dst_bits == 64)
498       tmp = src_bits == 32 ? src : bld.tmp(src.type(), 1);
499 
500    if (tmp == src) {
501    } else if (src.regClass() == s1) {
502       assert(src_bits < 32);
503       bld.pseudo(aco_opcode::p_extract, Definition(tmp), bld.def(s1, scc), src, Operand::zero(),
504                  Operand::c32(src_bits), Operand::c32((unsigned)sign_extend));
505    } else {
506       assert(src_bits < 32);
507       bld.pseudo(aco_opcode::p_extract, Definition(tmp), src, Operand::zero(),
508                  Operand::c32(src_bits), Operand::c32((unsigned)sign_extend));
509    }
510 
511    if (dst_bits == 64) {
512       if (sign_extend && dst.regClass() == s2) {
513          Temp high =
514             bld.sop2(aco_opcode::s_ashr_i32, bld.def(s1), bld.def(s1, scc), tmp, Operand::c32(31u));
515          bld.pseudo(aco_opcode::p_create_vector, Definition(dst), tmp, high);
516       } else if (sign_extend && dst.regClass() == v2) {
517          Temp high = bld.vop2(aco_opcode::v_ashrrev_i32, bld.def(v1), Operand::c32(31u), tmp);
518          bld.pseudo(aco_opcode::p_create_vector, Definition(dst), tmp, high);
519       } else {
520          bld.pseudo(aco_opcode::p_create_vector, Definition(dst), tmp, Operand::zero());
521       }
522    }
523 
524    return dst;
525 }
526 
527 enum sgpr_extract_mode {
528    sgpr_extract_sext,
529    sgpr_extract_zext,
530    sgpr_extract_undef,
531 };
532 
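/* Extracts a single 8- or 16-bit element from an SGPR source according to the
 * ALU swizzle, with sign-, zero- or undef-extension as requested by mode. */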
533 Temp
534 extract_8_16_bit_sgpr_element(isel_context* ctx, Temp dst, nir_alu_src* src, sgpr_extract_mode mode)
535 {
536    Temp vec = get_ssa_temp(ctx, src->src.ssa);
537    unsigned src_size = src->src.ssa->bit_size;
538    unsigned swizzle = src->swizzle[0];
539 
540    if (vec.size() > 1) {
541       assert(src_size == 16);
542       vec = emit_extract_vector(ctx, vec, swizzle / 2, s1);
543       swizzle = swizzle & 1;
544    }
545 
546    Builder bld(ctx->program, ctx->block);
547    Temp tmp = dst.regClass() == s2 ? bld.tmp(s1) : dst;
548 
549    if (mode == sgpr_extract_undef && swizzle == 0)
550       bld.copy(Definition(tmp), vec);
551    else
552       bld.pseudo(aco_opcode::p_extract, Definition(tmp), bld.def(s1, scc), Operand(vec),
553                  Operand::c32(swizzle), Operand::c32(src_size),
554                  Operand::c32((mode == sgpr_extract_sext)));
555 
556    if (dst.regClass() == s2)
557       convert_int(ctx, bld, tmp, 32, 64, mode == sgpr_extract_sext, dst);
558 
559    return dst;
560 }
561 
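/* Returns 'size' components of a NIR ALU source as a single Temp, applying the
 * source swizzle and building a new vector when the swizzle is not the identity. */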
562 Temp
563 get_alu_src(struct isel_context* ctx, nir_alu_src src, unsigned size = 1)
564 {
565    if (src.src.ssa->num_components == 1 && size == 1)
566       return get_ssa_temp(ctx, src.src.ssa);
567 
568    Temp vec = get_ssa_temp(ctx, src.src.ssa);
569    unsigned elem_size = src.src.ssa->bit_size / 8u;
570    bool identity_swizzle = true;
571 
572    for (unsigned i = 0; identity_swizzle && i < size; i++) {
573       if (src.swizzle[i] != i)
574          identity_swizzle = false;
575    }
576    if (identity_swizzle)
577       return emit_extract_vector(ctx, vec, 0, RegClass::get(vec.type(), elem_size * size));
578 
579    assert(elem_size > 0);
580    assert(vec.bytes() % elem_size == 0);
581 
582    if (elem_size < 4 && vec.type() == RegType::sgpr && size == 1) {
583       assert(src.src.ssa->bit_size == 8 || src.src.ssa->bit_size == 16);
584       return extract_8_16_bit_sgpr_element(ctx, ctx->program->allocateTmp(s1), &src,
585                                            sgpr_extract_undef);
586    }
587 
588    bool as_uniform = elem_size < 4 && vec.type() == RegType::sgpr;
589    if (as_uniform)
590       vec = as_vgpr(ctx, vec);
591 
592    RegClass elem_rc = elem_size < 4 ? RegClass(vec.type(), elem_size).as_subdword()
593                                     : RegClass(vec.type(), elem_size / 4);
594    if (size == 1) {
595       return emit_extract_vector(ctx, vec, src.swizzle[0], elem_rc);
596    } else {
597       assert(size <= 4);
598       std::array<Temp, NIR_MAX_VEC_COMPONENTS> elems;
599       aco_ptr<Instruction> vec_instr{
600          create_instruction(aco_opcode::p_create_vector, Format::PSEUDO, size, 1)};
601       for (unsigned i = 0; i < size; ++i) {
602          elems[i] = emit_extract_vector(ctx, vec, src.swizzle[i], elem_rc);
603          vec_instr->operands[i] = Operand{elems[i]};
604       }
605       Temp dst = ctx->program->allocateTmp(RegClass(vec.type(), elem_size * size / 4));
606       vec_instr->definitions[0] = Definition(dst);
607       ctx->block->instructions.emplace_back(std::move(vec_instr));
608       ctx->allocated_vec.emplace(dst.id(), elems);
609       return as_uniform ? Builder(ctx->program, ctx->block).as_uniform(dst) : dst;
610    }
611 }
612 
613 Temp
614 get_alu_src_vop3p(struct isel_context* ctx, nir_alu_src src)
615 {
616    /* returns v2b or v1 for vop3p usage.
617     * The source expects exactly 2 16bit components
618     * which are within the same dword
619     */
620    assert(src.src.ssa->bit_size == 16);
621    assert(src.swizzle[0] >> 1 == src.swizzle[1] >> 1);
622 
623    Temp tmp = get_ssa_temp(ctx, src.src.ssa);
624    if (tmp.size() == 1)
625       return tmp;
626 
627    /* the size is larger than 1 dword: check the swizzle */
628    unsigned dword = src.swizzle[0] >> 1;
629 
630    /* extract a full dword if possible */
631    if (tmp.bytes() >= (dword + 1) * 4) {
632       /* if the source is split into components, use p_create_vector */
633       auto it = ctx->allocated_vec.find(tmp.id());
634       if (it != ctx->allocated_vec.end()) {
635          unsigned index = dword << 1;
636          Builder bld(ctx->program, ctx->block);
637          if (it->second[index].regClass() == v2b)
638             return bld.pseudo(aco_opcode::p_create_vector, bld.def(v1), it->second[index],
639                               it->second[index + 1]);
640       }
641       return emit_extract_vector(ctx, tmp, dword, v1);
642    } else {
643       /* This must be a swizzled access to %a.zz where %a is v6b */
644       assert(((src.swizzle[0] | src.swizzle[1]) & 1) == 0);
645       assert(tmp.regClass() == v6b && dword == 1);
646       return emit_extract_vector(ctx, tmp, dword * 2, v2b);
647    }
648 }
649 
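/* Returns an upper bound on the unsigned value of an ALU source, using NIR's
 * range analysis. */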
650 uint32_t
651 get_alu_src_ub(isel_context* ctx, nir_alu_instr* instr, int src_idx)
652 {
653    nir_scalar scalar = nir_scalar{instr->src[src_idx].src.ssa, instr->src[src_idx].swizzle[0]};
654    return nir_unsigned_upper_bound(ctx->shader, ctx->range_ht, scalar, &ctx->ub_config);
655 }
656 
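/* Extends a 32-bit address to a 64-bit pointer by appending the constant high
 * dword (options->address32_hi). */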
657 Temp
658 convert_pointer_to_64_bit(isel_context* ctx, Temp ptr, bool non_uniform = false)
659 {
660    if (ptr.size() == 2)
661       return ptr;
662    Builder bld(ctx->program, ctx->block);
663    if (ptr.type() == RegType::vgpr && !non_uniform)
664       ptr = bld.as_uniform(ptr);
665    return bld.pseudo(aco_opcode::p_create_vector, bld.def(RegClass(ptr.type(), 2)), ptr,
666                      Operand::c32((unsigned)ctx->options->address32_hi));
667 }
668 
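/* Emits an SOP2 instruction for a NIR ALU op. uses_ub is a bitmask of sources
 * whose upper bound may allow marking them as 16-bit or 24-bit operands. */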
669 void
670 emit_sop2_instruction(isel_context* ctx, nir_alu_instr* instr, aco_opcode op, Temp dst,
671                       bool writes_scc, uint8_t uses_ub = 0)
672 {
673    Builder bld = create_alu_builder(ctx, instr);
674    bld.is_nuw = instr->no_unsigned_wrap;
675 
676    Operand operands[2] = {Operand(get_alu_src(ctx, instr->src[0])),
677                           Operand(get_alu_src(ctx, instr->src[1]))};
678    u_foreach_bit (i, uses_ub) {
679       uint32_t src_ub = get_alu_src_ub(ctx, instr, i);
680       if (src_ub <= 0xffff)
681          operands[i].set16bit(true);
682       else if (src_ub <= 0xffffff)
683          operands[i].set24bit(true);
684    }
685 
686    if (writes_scc)
687       bld.sop2(op, Definition(dst), bld.def(s1, scc), operands[0], operands[1]);
688    else
689       bld.sop2(op, Definition(dst), operands[0], operands[1]);
690 }
691 
692 void
693 emit_vop2_instruction(isel_context* ctx, nir_alu_instr* instr, aco_opcode opc, Temp dst,
694                       bool commutative, bool swap_srcs = false, bool flush_denorms = false,
695                       bool nuw = false, uint8_t uses_ub = 0)
696 {
697    Builder bld = create_alu_builder(ctx, instr);
698    bld.is_nuw = nuw;
699 
700    Operand operands[2] = {Operand(get_alu_src(ctx, instr->src[0])),
701                           Operand(get_alu_src(ctx, instr->src[1]))};
702    u_foreach_bit (i, uses_ub) {
703       uint32_t src_ub = get_alu_src_ub(ctx, instr, i);
704       if (src_ub <= 0xffff)
705          operands[i].set16bit(true);
706       else if (src_ub <= 0xffffff)
707          operands[i].set24bit(true);
708    }
709 
710    if (swap_srcs)
711       std::swap(operands[0], operands[1]);
712 
713    if (operands[1].isOfType(RegType::sgpr)) {
714       if (commutative && operands[0].isOfType(RegType::vgpr)) {
715          std::swap(operands[0], operands[1]);
716       } else {
717          operands[1] = bld.copy(bld.def(RegType::vgpr, operands[1].size()), operands[1]);
718       }
719    }
720 
721    if (flush_denorms && ctx->program->gfx_level < GFX9) {
722       assert(dst.size() == 1);
723       Temp tmp = bld.vop2(opc, bld.def(dst.regClass()), operands[0], operands[1]);
724       if (dst.bytes() == 2)
725          bld.vop2(aco_opcode::v_mul_f16, Definition(dst), Operand::c16(0x3c00), tmp);
726       else
727          bld.vop2(aco_opcode::v_mul_f32, Definition(dst), Operand::c32(0x3f800000u), tmp);
728    } else {
729       bld.vop2(opc, Definition(dst), operands[0], operands[1]);
730    }
731 }
732 
733 void
734 emit_vop2_instruction_logic64(isel_context* ctx, nir_alu_instr* instr, aco_opcode op, Temp dst)
735 {
736    Builder bld = create_alu_builder(ctx, instr);
737 
738    Temp src0 = get_alu_src(ctx, instr->src[0]);
739    Temp src1 = get_alu_src(ctx, instr->src[1]);
740 
741    if (src1.type() == RegType::sgpr) {
742       assert(src0.type() == RegType::vgpr);
743       std::swap(src0, src1);
744    }
745 
746    Temp src00 = bld.tmp(src0.type(), 1);
747    Temp src01 = bld.tmp(src0.type(), 1);
748    bld.pseudo(aco_opcode::p_split_vector, Definition(src00), Definition(src01), src0);
749    Temp src10 = bld.tmp(v1);
750    Temp src11 = bld.tmp(v1);
751    bld.pseudo(aco_opcode::p_split_vector, Definition(src10), Definition(src11), src1);
752    Temp lo = bld.vop2(op, bld.def(v1), src00, src10);
753    Temp hi = bld.vop2(op, bld.def(v1), src01, src11);
754    bld.pseudo(aco_opcode::p_create_vector, Definition(dst), lo, hi);
755 }
756 
757 void
758 emit_vop3a_instruction(isel_context* ctx, nir_alu_instr* instr, aco_opcode op, Temp dst,
759                        bool flush_denorms = false, unsigned num_sources = 2, bool swap_srcs = false)
760 {
761    assert(num_sources == 2 || num_sources == 3);
762    Temp src[3] = {Temp(0, v1), Temp(0, v1), Temp(0, v1)};
763    bool has_sgpr = false;
764    for (unsigned i = 0; i < num_sources; i++) {
765       src[i] = get_alu_src(ctx, instr->src[(swap_srcs && i < 2) ? 1 - i : i]);
766       if (has_sgpr)
767          src[i] = as_vgpr(ctx, src[i]);
768       else
769          has_sgpr = src[i].type() == RegType::sgpr;
770    }
771 
772    Builder bld = create_alu_builder(ctx, instr);
773    if (flush_denorms && ctx->program->gfx_level < GFX9) {
774       Temp tmp;
775       if (num_sources == 3)
776          tmp = bld.vop3(op, bld.def(dst.regClass()), src[0], src[1], src[2]);
777       else
778          tmp = bld.vop3(op, bld.def(dst.regClass()), src[0], src[1]);
779       if (dst.size() == 1)
780          bld.vop2(aco_opcode::v_mul_f32, Definition(dst), Operand::c32(0x3f800000u), tmp);
781       else
782          bld.vop3(aco_opcode::v_mul_f64_e64, Definition(dst), Operand::c64(0x3FF0000000000000),
783                   tmp);
784    } else if (num_sources == 3) {
785       bld.vop3(op, Definition(dst), src[0], src[1], src[2]);
786    } else {
787       bld.vop3(op, Definition(dst), src[0], src[1]);
788    }
789 }
790 
791 Builder::Result
792 emit_vop3p_instruction(isel_context* ctx, nir_alu_instr* instr, aco_opcode op, Temp dst,
793                        bool swap_srcs = false)
794 {
795    Temp src0 = get_alu_src_vop3p(ctx, instr->src[swap_srcs]);
796    Temp src1 = get_alu_src_vop3p(ctx, instr->src[!swap_srcs]);
797    if (src0.type() == RegType::sgpr && src1.type() == RegType::sgpr)
798       src1 = as_vgpr(ctx, src1);
799    assert(instr->def.num_components == 2);
800 
801    /* swizzle to opsel: all swizzles are either 0 (x) or 1 (y) */
802    unsigned opsel_lo =
803       (instr->src[!swap_srcs].swizzle[0] & 1) << 1 | (instr->src[swap_srcs].swizzle[0] & 1);
804    unsigned opsel_hi =
805       (instr->src[!swap_srcs].swizzle[1] & 1) << 1 | (instr->src[swap_srcs].swizzle[1] & 1);
806 
807    Builder bld = create_alu_builder(ctx, instr);
808    Builder::Result res = bld.vop3p(op, Definition(dst), src0, src1, opsel_lo, opsel_hi);
809    emit_split_vector(ctx, dst, 2);
810    return res;
811 }
812 
813 void
814 emit_idot_instruction(isel_context* ctx, nir_alu_instr* instr, aco_opcode op, Temp dst, bool clamp,
815                       unsigned neg_lo = 0)
816 {
817    Temp src[3] = {Temp(0, v1), Temp(0, v1), Temp(0, v1)};
818    bool has_sgpr = false;
819    for (unsigned i = 0; i < 3; i++) {
820       src[i] = get_alu_src(ctx, instr->src[i]);
821       if (has_sgpr)
822          src[i] = as_vgpr(ctx, src[i]);
823       else
824          has_sgpr = src[i].type() == RegType::sgpr;
825    }
826 
827    Builder bld = create_alu_builder(ctx, instr);
828    VALU_instruction& vop3p =
829       bld.vop3p(op, Definition(dst), src[0], src[1], src[2], 0x0, 0x7)->valu();
830    vop3p.clamp = clamp;
831    vop3p.neg_lo = neg_lo;
832 }
833 
834 void
835 emit_vop1_instruction(isel_context* ctx, nir_alu_instr* instr, aco_opcode op, Temp dst)
836 {
837    Builder bld = create_alu_builder(ctx, instr);
838    if (dst.type() == RegType::sgpr)
839       bld.pseudo(aco_opcode::p_as_uniform, Definition(dst),
840                  bld.vop1(op, bld.def(RegType::vgpr, dst.size()), get_alu_src(ctx, instr->src[0])));
841    else
842       bld.vop1(op, Definition(dst), get_alu_src(ctx, instr->src[0]));
843 }
844 
845 void
846 emit_vopc_instruction(isel_context* ctx, nir_alu_instr* instr, aco_opcode op, Temp dst)
847 {
848    Temp src0 = get_alu_src(ctx, instr->src[0]);
849    Temp src1 = get_alu_src(ctx, instr->src[1]);
850    assert(src0.size() == src1.size());
851 
852    aco_ptr<Instruction> vopc;
853    if (src1.type() == RegType::sgpr) {
854       if (src0.type() == RegType::vgpr) {
855          /* to swap the operands, we might also have to change the opcode */
856          op = get_vcmp_swapped(op);
857          Temp t = src0;
858          src0 = src1;
859          src1 = t;
860       } else {
861          src1 = as_vgpr(ctx, src1);
862       }
863    }
864 
865    Builder bld = create_alu_builder(ctx, instr);
866    bld.vopc(op, Definition(dst), src0, src1);
867 }
868 
869 void
870 emit_sopc_instruction(isel_context* ctx, nir_alu_instr* instr, aco_opcode op, Temp dst)
871 {
872    Temp src0 = get_alu_src(ctx, instr->src[0]);
873    Temp src1 = get_alu_src(ctx, instr->src[1]);
874    Builder bld = create_alu_builder(ctx, instr);
875 
876    assert(dst.regClass() == bld.lm);
877    assert(src0.type() == RegType::sgpr);
878    assert(src1.type() == RegType::sgpr);
879 
880    /* Emit the SALU comparison instruction */
881    Temp cmp = bld.sopc(op, bld.scc(bld.def(s1)), src0, src1);
882    /* Turn the result into a per-lane bool */
883    bool_to_vector_condition(ctx, cmp, dst);
884 }
885 
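/* Emits a comparison, selecting the opcode by source bit size and using the
 * SALU form for uniform operands when one is provided, the VALU form otherwise. */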
886 void
887 emit_comparison(isel_context* ctx, nir_alu_instr* instr, Temp dst, aco_opcode v16_op,
888                 aco_opcode v32_op, aco_opcode v64_op, aco_opcode s16_op = aco_opcode::num_opcodes,
889                 aco_opcode s32_op = aco_opcode::num_opcodes,
890                 aco_opcode s64_op = aco_opcode::num_opcodes)
891 {
892    aco_opcode s_op = instr->src[0].src.ssa->bit_size == 64   ? s64_op
893                      : instr->src[0].src.ssa->bit_size == 32 ? s32_op
894                                                              : s16_op;
895    aco_opcode v_op = instr->src[0].src.ssa->bit_size == 64   ? v64_op
896                      : instr->src[0].src.ssa->bit_size == 32 ? v32_op
897                                                              : v16_op;
898    bool use_valu = s_op == aco_opcode::num_opcodes || instr->def.divergent ||
899                    get_ssa_temp(ctx, instr->src[0].src.ssa).type() == RegType::vgpr ||
900                    get_ssa_temp(ctx, instr->src[1].src.ssa).type() == RegType::vgpr;
901    aco_opcode op = use_valu ? v_op : s_op;
902    assert(op != aco_opcode::num_opcodes);
903    assert(dst.regClass() == ctx->program->lane_mask);
904 
905    if (use_valu)
906       emit_vopc_instruction(ctx, instr, op, dst);
907    else
908       emit_sopc_instruction(ctx, instr, op, dst);
909 }
910 
911 void
912 emit_boolean_logic(isel_context* ctx, nir_alu_instr* instr, Builder::WaveSpecificOpcode op,
913                    Temp dst)
914 {
915    Builder bld(ctx->program, ctx->block);
916    Temp src0 = get_alu_src(ctx, instr->src[0]);
917    Temp src1 = get_alu_src(ctx, instr->src[1]);
918 
919    assert(dst.regClass() == bld.lm);
920    assert(src0.regClass() == bld.lm);
921    assert(src1.regClass() == bld.lm);
922 
923    bld.sop2(op, Definition(dst), bld.def(s1, scc), src0, src1);
924 }
925 
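/* 64-bit select: splits both values into dwords and selects each half with
 * v_cndmask_b32. */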
926 void
927 select_vec2(isel_context* ctx, Temp dst, Temp cond, Temp then, Temp els)
928 {
929    Builder bld(ctx->program, ctx->block);
930 
931    Temp then_lo = bld.tmp(v1), then_hi = bld.tmp(v1);
932    bld.pseudo(aco_opcode::p_split_vector, Definition(then_lo), Definition(then_hi), then);
933    Temp else_lo = bld.tmp(v1), else_hi = bld.tmp(v1);
934    bld.pseudo(aco_opcode::p_split_vector, Definition(else_lo), Definition(else_hi), els);
935 
936    Temp dst0 = bld.vop2(aco_opcode::v_cndmask_b32, bld.def(v1), else_lo, then_lo, cond);
937    Temp dst1 = bld.vop2(aco_opcode::v_cndmask_b32, bld.def(v1), else_hi, then_hi, cond);
938 
939    bld.pseudo(aco_opcode::p_create_vector, Definition(dst), dst0, dst1);
940 }
941 
942 void
943 emit_bcsel(isel_context* ctx, nir_alu_instr* instr, Temp dst)
944 {
945    Builder bld(ctx->program, ctx->block);
946    Temp cond = get_alu_src(ctx, instr->src[0]);
947    Temp then = get_alu_src(ctx, instr->src[1]);
948    Temp els = get_alu_src(ctx, instr->src[2]);
949 
950    assert(cond.regClass() == bld.lm);
951 
952    if (dst.type() == RegType::vgpr) {
953       aco_ptr<Instruction> bcsel;
954       if (dst.size() == 1) {
955          then = as_vgpr(ctx, then);
956          els = as_vgpr(ctx, els);
957 
958          bld.vop2(aco_opcode::v_cndmask_b32, Definition(dst), els, then, cond);
959       } else if (dst.size() == 2) {
960          select_vec2(ctx, dst, cond, then, els);
961       } else {
962          isel_err(&instr->instr, "Unimplemented NIR instr bit size");
963       }
964       return;
965    }
966 
967    if (instr->def.bit_size == 1) {
968       assert(dst.regClass() == bld.lm);
969       assert(then.regClass() == bld.lm);
970       assert(els.regClass() == bld.lm);
971    }
972 
973    if (!nir_src_is_divergent(&instr->src[0].src)) { /* uniform condition and values in sgpr */
974       if (dst.regClass() == s1 || dst.regClass() == s2) {
975          assert((then.regClass() == s1 || then.regClass() == s2) &&
976                 els.regClass() == then.regClass());
977          assert(dst.size() == then.size());
978          aco_opcode op =
979             dst.regClass() == s1 ? aco_opcode::s_cselect_b32 : aco_opcode::s_cselect_b64;
980          bld.sop2(op, Definition(dst), then, els, bld.scc(bool_to_scalar_condition(ctx, cond)));
981       } else {
982          isel_err(&instr->instr, "Unimplemented uniform bcsel bit size");
983       }
984       return;
985    }
986 
987    /* divergent boolean bcsel
988     * this implements bcsel on bools: dst = s0 ? s1 : s2,
989     * which is lowered to: dst = (s0 & s1) | (~s0 & s2) */
990    assert(instr->def.bit_size == 1);
991 
992    if (cond.id() != then.id())
993       then = bld.sop2(Builder::s_and, bld.def(bld.lm), bld.def(s1, scc), cond, then);
994 
995    if (cond.id() == els.id())
996       bld.copy(Definition(dst), then);
997    else
998       bld.sop2(Builder::s_or, Definition(dst), bld.def(s1, scc), then,
999                bld.sop2(Builder::s_andn2, bld.def(bld.lm), bld.def(s1, scc), els, cond));
1000 }
1001 
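/* Emits a scalar-input math op (rcp/rsq/sqrt/log) with a denormal workaround:
 * when 32-bit denormals are kept, the input is pre-scaled by 2^24 and the
 * result adjusted with the per-op 'undo' constant. */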
1002 void
1003 emit_scaled_op(isel_context* ctx, Builder& bld, Definition dst, Temp val, aco_opcode vop,
1004                aco_opcode sop, uint32_t undo)
1005 {
1006    if (ctx->block->fp_mode.denorm32 == 0) {
1007       if (dst.regClass() == v1)
1008          bld.vop1(vop, dst, val);
1009       else if (ctx->options->gfx_level >= GFX12)
1010          bld.vop3(sop, dst, val);
1011       else
1012          bld.pseudo(aco_opcode::p_as_uniform, dst, bld.vop1(vop, bld.def(v1), val));
1013       return;
1014    }
1015 
1016    /* multiply by 16777216 to handle denormals */
1017    Temp scale, unscale;
1018    if (val.regClass() == v1) {
1019       val = as_vgpr(bld, val);
1020       Temp is_denormal = bld.tmp(bld.lm);
1021       VALU_instruction& valu = bld.vopc_e64(aco_opcode::v_cmp_class_f32, Definition(is_denormal),
1022                                             val, Operand::c32(1u << 4))
1023                                   ->valu();
1024       valu.neg[0] = true;
1025       valu.abs[0] = true;
1026       scale = bld.vop2_e64(aco_opcode::v_cndmask_b32, bld.def(v1), Operand::c32(0x3f800000),
1027                            bld.copy(bld.def(s1), Operand::c32(0x4b800000u)), is_denormal);
1028       unscale = bld.vop2_e64(aco_opcode::v_cndmask_b32, bld.def(v1), Operand::c32(0x3f800000),
1029                              bld.copy(bld.def(s1), Operand::c32(undo)), is_denormal);
1030    } else {
1031       Temp abs = bld.sop2(aco_opcode::s_and_b32, bld.def(s1), bld.def(s1, scc), val,
1032                           bld.copy(bld.def(s1), Operand::c32(0x7fffffff)));
1033       Temp denorm_cmp = bld.copy(bld.def(s1), Operand::c32(0x00800000));
1034       Temp is_denormal = bld.sopc(aco_opcode::s_cmp_lt_u32, bld.def(s1, scc), abs, denorm_cmp);
1035       scale = bld.sop2(aco_opcode::s_cselect_b32, bld.def(s1),
1036                        bld.copy(bld.def(s1), Operand::c32(0x4b800000u)), Operand::c32(0x3f800000),
1037                        bld.scc(is_denormal));
1038       unscale =
1039          bld.sop2(aco_opcode::s_cselect_b32, bld.def(s1), bld.copy(bld.def(s1), Operand::c32(undo)),
1040                   Operand::c32(0x3f800000), bld.scc(is_denormal));
1041    }
1042 
1043    if (dst.regClass() == v1) {
1044       Temp scaled = bld.vop2(aco_opcode::v_mul_f32, bld.def(v1), scale, as_vgpr(bld, val));
1045       scaled = bld.vop1(vop, bld.def(v1), scaled);
1046       bld.vop2(aco_opcode::v_mul_f32, dst, unscale, scaled);
1047    } else {
1048       assert(ctx->options->gfx_level >= GFX11_5);
1049       Temp scaled = bld.sop2(aco_opcode::s_mul_f32, bld.def(s1), scale, val);
1050       if (ctx->options->gfx_level >= GFX12)
1051          scaled = bld.vop3(sop, bld.def(s1), scaled);
1052       else
1053          scaled = bld.as_uniform(bld.vop1(vop, bld.def(v1), scaled));
1054       bld.sop2(aco_opcode::s_mul_f32, dst, unscale, scaled);
1055    }
1056 }
1057 
1058 void
1059 emit_rcp(isel_context* ctx, Builder& bld, Definition dst, Temp val)
1060 {
1061    emit_scaled_op(ctx, bld, dst, val, aco_opcode::v_rcp_f32, aco_opcode::v_s_rcp_f32, 0x4b800000u);
1062 }
1063 
1064 void
1065 emit_rsq(isel_context* ctx, Builder& bld, Definition dst, Temp val)
1066 {
1067    emit_scaled_op(ctx, bld, dst, val, aco_opcode::v_rsq_f32, aco_opcode::v_s_rsq_f32, 0x45800000u);
1068 }
1069 
1070 void
1071 emit_sqrt(isel_context* ctx, Builder& bld, Definition dst, Temp val)
1072 {
1073    emit_scaled_op(ctx, bld, dst, val, aco_opcode::v_sqrt_f32, aco_opcode::v_s_sqrt_f32,
1074                   0x39800000u);
1075 }
1076 
1077 void
1078 emit_log2(isel_context* ctx, Builder& bld, Definition dst, Temp val)
1079 {
1080    emit_scaled_op(ctx, bld, dst, val, aco_opcode::v_log_f32, aco_opcode::v_s_log_f32, 0xc1c00000u);
1081 }
1082 
1083 Temp
1084 emit_trunc_f64(isel_context* ctx, Builder& bld, Definition dst, Temp val)
1085 {
1086    if (ctx->options->gfx_level >= GFX7)
1087       return bld.vop1(aco_opcode::v_trunc_f64, Definition(dst), val);
1088 
1089    /* GFX6 doesn't support V_TRUNC_F64, lower it. */
1090    /* TODO: create more efficient code! */
1091    if (val.type() == RegType::sgpr)
1092       val = as_vgpr(ctx, val);
1093 
1094    /* Split the input value. */
1095    Temp val_lo = bld.tmp(v1), val_hi = bld.tmp(v1);
1096    bld.pseudo(aco_opcode::p_split_vector, Definition(val_lo), Definition(val_hi), val);
1097 
1098    /* Extract the exponent and compute the unbiased value. */
1099    Temp exponent =
1100       bld.vop3(aco_opcode::v_bfe_u32, bld.def(v1), val_hi, Operand::c32(20u), Operand::c32(11u));
1101    exponent = bld.vsub32(bld.def(v1), exponent, Operand::c32(1023u));
1102 
1103    /* Extract the fractional part. */
1104    Temp fract_mask = bld.pseudo(aco_opcode::p_create_vector, bld.def(v2), Operand::c32(-1u),
1105                                 Operand::c32(0x000fffffu));
1106    fract_mask = bld.vop3(aco_opcode::v_lshr_b64, bld.def(v2), fract_mask, exponent);
1107 
1108    Temp fract_mask_lo = bld.tmp(v1), fract_mask_hi = bld.tmp(v1);
1109    bld.pseudo(aco_opcode::p_split_vector, Definition(fract_mask_lo), Definition(fract_mask_hi),
1110               fract_mask);
1111 
1112    Temp fract_lo = bld.tmp(v1), fract_hi = bld.tmp(v1);
1113    Temp tmp = bld.vop1(aco_opcode::v_not_b32, bld.def(v1), fract_mask_lo);
1114    fract_lo = bld.vop2(aco_opcode::v_and_b32, bld.def(v1), val_lo, tmp);
1115    tmp = bld.vop1(aco_opcode::v_not_b32, bld.def(v1), fract_mask_hi);
1116    fract_hi = bld.vop2(aco_opcode::v_and_b32, bld.def(v1), val_hi, tmp);
1117 
1118    /* Get the sign bit. */
1119    Temp sign = bld.vop2(aco_opcode::v_and_b32, bld.def(v1), Operand::c32(0x80000000u), val_hi);
1120 
1121    /* Decide the operation to apply depending on the unbiased exponent. */
1122    Temp exp_lt0 =
1123       bld.vopc_e64(aco_opcode::v_cmp_lt_i32, bld.def(bld.lm), exponent, Operand::zero());
1124    Temp dst_lo = bld.vop2(aco_opcode::v_cndmask_b32, bld.def(v1), fract_lo,
1125                           bld.copy(bld.def(v1), Operand::zero()), exp_lt0);
1126    Temp dst_hi = bld.vop2(aco_opcode::v_cndmask_b32, bld.def(v1), fract_hi, sign, exp_lt0);
1127    Temp exp_gt51 = bld.vopc_e64(aco_opcode::v_cmp_gt_i32, bld.def(s2), exponent, Operand::c32(51u));
1128    dst_lo = bld.vop2(aco_opcode::v_cndmask_b32, bld.def(v1), dst_lo, val_lo, exp_gt51);
1129    dst_hi = bld.vop2(aco_opcode::v_cndmask_b32, bld.def(v1), dst_hi, val_hi, exp_gt51);
1130 
1131    return bld.pseudo(aco_opcode::p_create_vector, Definition(dst), dst_lo, dst_hi);
1132 }
1133 
1134 Temp
1135 emit_floor_f64(isel_context* ctx, Builder& bld, Definition dst, Temp val)
1136 {
1137    if (ctx->options->gfx_level >= GFX7)
1138       return bld.vop1(aco_opcode::v_floor_f64, Definition(dst), val);
1139 
1140    /* GFX6 doesn't support V_FLOOR_F64, lower it (note that it's actually
1141     * lowered at NIR level for precision reasons). */
1142    Temp src0 = as_vgpr(ctx, val);
1143 
1144    Temp min_val = bld.pseudo(aco_opcode::p_create_vector, bld.def(s2), Operand::c32(-1u),
1145                              Operand::c32(0x3fefffffu));
1146 
1147    Temp isnan = bld.vopc(aco_opcode::v_cmp_neq_f64, bld.def(bld.lm), src0, src0);
1148    Temp fract = bld.vop1(aco_opcode::v_fract_f64, bld.def(v2), src0);
1149    Temp min = bld.vop3(aco_opcode::v_min_f64_e64, bld.def(v2), fract, min_val);
1150 
1151    Temp then_lo = bld.tmp(v1), then_hi = bld.tmp(v1);
1152    bld.pseudo(aco_opcode::p_split_vector, Definition(then_lo), Definition(then_hi), src0);
1153    Temp else_lo = bld.tmp(v1), else_hi = bld.tmp(v1);
1154    bld.pseudo(aco_opcode::p_split_vector, Definition(else_lo), Definition(else_hi), min);
1155 
1156    Temp dst0 = bld.vop2(aco_opcode::v_cndmask_b32, bld.def(v1), else_lo, then_lo, isnan);
1157    Temp dst1 = bld.vop2(aco_opcode::v_cndmask_b32, bld.def(v1), else_hi, then_hi, isnan);
1158 
1159    Temp v = bld.pseudo(aco_opcode::p_create_vector, bld.def(v2), dst0, dst1);
1160 
1161    Instruction* add = bld.vop3(aco_opcode::v_add_f64_e64, Definition(dst), src0, v);
1162    add->valu().neg[1] = true;
1163 
1164    return add->definitions[0].getTemp();
1165 }
1166 
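/* Saturating unsigned 32-bit add: pre-GFX8 selects UINT32_MAX on carry-out,
 * GFX8+ uses the VALU clamp bit. */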
1167 Temp
1168 uadd32_sat(Builder& bld, Definition dst, Temp src0, Temp src1)
1169 {
1170    if (bld.program->gfx_level < GFX8) {
1171       Builder::Result add = bld.vadd32(bld.def(v1), src0, src1, true);
1172       return bld.vop2_e64(aco_opcode::v_cndmask_b32, dst, add.def(0).getTemp(), Operand::c32(-1),
1173                           add.def(1).getTemp());
1174    }
1175 
1176    Builder::Result add(NULL);
1177    if (bld.program->gfx_level >= GFX9) {
1178       add = bld.vop2_e64(aco_opcode::v_add_u32, dst, src0, src1);
1179    } else {
1180       add = bld.vop2_e64(aco_opcode::v_add_co_u32, dst, bld.def(bld.lm), src0, src1);
1181    }
1182    add->valu().clamp = 1;
1183    return dst.getTemp();
1184 }
1185 
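/* Saturating unsigned 32-bit subtract: pre-GFX8 selects 0 on borrow,
 * GFX8+ uses the VALU clamp bit. */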
1186 Temp
1187 usub32_sat(Builder& bld, Definition dst, Temp src0, Temp src1)
1188 {
1189    if (bld.program->gfx_level < GFX8) {
1190       Builder::Result sub = bld.vsub32(bld.def(v1), src0, src1, true);
1191       return bld.vop2_e64(aco_opcode::v_cndmask_b32, dst, sub.def(0).getTemp(), Operand::c32(0u),
1192                           sub.def(1).getTemp());
1193    }
1194 
1195    Builder::Result sub(NULL);
1196    if (bld.program->gfx_level >= GFX9) {
1197       sub = bld.vop2_e64(aco_opcode::v_sub_u32, dst, src0, src1);
1198    } else {
1199       sub = bld.vop2_e64(aco_opcode::v_sub_co_u32, dst, bld.def(bld.lm), src0, src1);
1200    }
1201    sub->valu().clamp = 1;
1202    return dst.getTemp();
1203 }
1204 
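/* Converts two 32-bit float components to f16 and packs them into a single
 * dword using the round-toward-zero pack instructions. */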
1205 void
1206 emit_vec2_f2f16(isel_context* ctx, nir_alu_instr* instr, Temp dst)
1207 {
1208    Builder bld = create_alu_builder(ctx, instr);
1209    Temp src = get_ssa_temp(ctx, instr->src[0].src.ssa);
1210    RegClass rc = RegClass(src.regClass().type(), instr->src[0].src.ssa->bit_size / 32);
1211    Temp src0 = emit_extract_vector(ctx, src, instr->src[0].swizzle[0], rc);
1212    Temp src1 = emit_extract_vector(ctx, src, instr->src[0].swizzle[1], rc);
1213 
1214    if (dst.regClass() == s1) {
1215       bld.sop2(aco_opcode::s_cvt_pk_rtz_f16_f32, Definition(dst), src0, src1);
1216    } else {
1217       src1 = as_vgpr(ctx, src1);
1218       if (ctx->program->gfx_level == GFX8 || ctx->program->gfx_level == GFX9)
1219          bld.vop3(aco_opcode::v_cvt_pkrtz_f16_f32_e64, Definition(dst), src0, src1);
1220       else
1221          bld.vop2(aco_opcode::v_cvt_pkrtz_f16_f32, Definition(dst), src0, src1);
1222       emit_split_vector(ctx, dst, 2);
1223    }
1224 }
1225 
1226 void
1227 visit_alu_instr(isel_context* ctx, nir_alu_instr* instr)
1228 {
1229    Builder bld = create_alu_builder(ctx, instr);
1230    Temp dst = get_ssa_temp(ctx, &instr->def);
1231    switch (instr->op) {
1232    case nir_op_vec2:
1233    case nir_op_vec3:
1234    case nir_op_vec4:
1235    case nir_op_vec5:
1236    case nir_op_vec8:
1237    case nir_op_vec16: {
1238       std::array<Temp, NIR_MAX_VEC_COMPONENTS> elems;
1239       unsigned num = instr->def.num_components;
1240       for (unsigned i = 0; i < num; ++i)
1241          elems[i] = get_alu_src(ctx, instr->src[i]);
1242 
1243       if (instr->def.bit_size >= 32 || dst.type() == RegType::vgpr) {
1244          aco_ptr<Instruction> vec{create_instruction(aco_opcode::p_create_vector, Format::PSEUDO,
1245                                                      instr->def.num_components, 1)};
1246          RegClass elem_rc = RegClass::get(dst.type(), instr->def.bit_size / 8u);
1247          for (unsigned i = 0; i < num; ++i) {
1248             if (elems[i].type() == RegType::sgpr && elem_rc.is_subdword())
1249                elems[i] = emit_extract_vector(ctx, elems[i], 0, elem_rc);
1250 
1251             if (nir_src_is_undef(instr->src[i].src))
1252                vec->operands[i] = Operand{elem_rc};
1253             else
1254                vec->operands[i] = Operand{elems[i]};
1255          }
1256          vec->definitions[0] = Definition(dst);
1257          ctx->block->instructions.emplace_back(std::move(vec));
1258          ctx->allocated_vec.emplace(dst.id(), elems);
1259       } else {
1260          bool use_s_pack = ctx->program->gfx_level >= GFX9;
1261          Temp mask = bld.copy(bld.def(s1), Operand::c32((1u << instr->def.bit_size) - 1));
1262 
1263          std::array<Temp, NIR_MAX_VEC_COMPONENTS> packed;
1264          uint32_t const_vals[NIR_MAX_VEC_COMPONENTS] = {};
1265          bitarray32 undef_mask = UINT32_MAX;
1266          for (unsigned i = 0; i < num; i++) {
1267             unsigned packed_size = use_s_pack ? 16 : 32;
1268             unsigned idx = i * instr->def.bit_size / packed_size;
1269             unsigned offset = i * instr->def.bit_size % packed_size;
1270             if (nir_src_is_undef(instr->src[i].src))
1271                continue;
1272             else
1273                undef_mask[idx] = false;
1274 
1275             if (nir_src_is_const(instr->src[i].src)) {
1276                const_vals[idx] |= nir_src_as_uint(instr->src[i].src) << offset;
1277                continue;
1278             }
1279 
1280             if (offset != packed_size - instr->def.bit_size)
1281                elems[i] =
1282                   bld.sop2(aco_opcode::s_and_b32, bld.def(s1), bld.def(s1, scc), elems[i], mask);
1283 
1284             if (offset)
1285                elems[i] = bld.sop2(aco_opcode::s_lshl_b32, bld.def(s1), bld.def(s1, scc), elems[i],
1286                                    Operand::c32(offset));
1287 
1288             if (packed[idx].id())
1289                packed[idx] = bld.sop2(aco_opcode::s_or_b32, bld.def(s1), bld.def(s1, scc), elems[i],
1290                                       packed[idx]);
1291             else
1292                packed[idx] = elems[i];
1293          }
1294 
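         /* On GFX9+, pairs of 16-bit slots are merged with s_pack_ll_b32_b16;
          * slots that only hold constants are passed as inline constants.
          * Remaining constant bits are ORed in (or copied, if nothing else
          * wrote the dword and it is not fully undef) by the loop that follows. */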
1295          if (use_s_pack) {
1296             for (unsigned i = 0; i < dst.size(); i++) {
1297                bool same = !!packed[i * 2].id() == !!packed[i * 2 + 1].id();
1298 
1299                if (packed[i * 2].id() && packed[i * 2 + 1].id())
1300                   packed[i] = bld.sop2(aco_opcode::s_pack_ll_b32_b16, bld.def(s1), packed[i * 2],
1301                                        packed[i * 2 + 1]);
1302                else if (packed[i * 2 + 1].id())
1303                   packed[i] = bld.sop2(aco_opcode::s_pack_ll_b32_b16, bld.def(s1),
1304                                        Operand::c32(const_vals[i * 2]), packed[i * 2 + 1]);
1305                else if (packed[i * 2].id())
1306                   packed[i] = bld.sop2(aco_opcode::s_pack_ll_b32_b16, bld.def(s1), packed[i * 2],
1307                                        Operand::c32(const_vals[i * 2 + 1]));
1308                else
1309                   packed[i] = Temp(0, s1); /* Both constants, so reset the entry */
1310 
1311                undef_mask[i] = undef_mask[i * 2] && undef_mask[i * 2 + 1];
1312 
1313                if (same)
1314                   const_vals[i] = const_vals[i * 2] | (const_vals[i * 2 + 1] << 16);
1315                else
1316                   const_vals[i] = 0;
1317             }
1318          }
1319 
1320          for (unsigned i = 0; i < dst.size(); i++) {
1321             if (const_vals[i] && packed[i].id())
1322                packed[i] = bld.sop2(aco_opcode::s_or_b32, bld.def(s1), bld.def(s1, scc),
1323                                     Operand::c32(const_vals[i]), packed[i]);
1324             else if (!packed[i].id() && !undef_mask[i])
1325                packed[i] = bld.copy(bld.def(s1), Operand::c32(const_vals[i]));
1326          }
1327 
1328          if (dst.size() == 1 && packed[0].id())
1329             bld.copy(Definition(dst), packed[0]);
1330          else {
1331             aco_ptr<Instruction> vec{
1332                create_instruction(aco_opcode::p_create_vector, Format::PSEUDO, dst.size(), 1)};
1333             vec->definitions[0] = Definition(dst);
1334             for (unsigned i = 0; i < dst.size(); ++i)
1335                vec->operands[i] = Operand(packed[i]);
1336             bld.insert(std::move(vec));
1337          }
1338       }
1339       break;
1340    }
1341    case nir_op_mov: {
1342       Temp src = get_alu_src(ctx, instr->src[0]);
1343       if (src.type() == RegType::vgpr && dst.type() == RegType::sgpr) {
1344          /* use size() instead of bytes() for 8/16-bit */
1345          assert(src.size() == dst.size() && "wrong src or dst register class for nir_op_mov");
1346          bld.pseudo(aco_opcode::p_as_uniform, Definition(dst), src);
1347       } else {
1348          assert(src.bytes() == dst.bytes() && "wrong src or dst register class for nir_op_mov");
1349          bld.copy(Definition(dst), src);
1350       }
1351       break;
1352    }
1353    case nir_op_inot: {
1354       Temp src = get_alu_src(ctx, instr->src[0]);
1355       if (dst.regClass() == v1 || dst.regClass() == v2b || dst.regClass() == v1b) {
1356          emit_vop1_instruction(ctx, instr, aco_opcode::v_not_b32, dst);
1357       } else if (dst.regClass() == v2) {
1358          Temp lo = bld.tmp(v1), hi = bld.tmp(v1);
1359          bld.pseudo(aco_opcode::p_split_vector, Definition(lo), Definition(hi), src);
1360          lo = bld.vop1(aco_opcode::v_not_b32, bld.def(v1), lo);
1361          hi = bld.vop1(aco_opcode::v_not_b32, bld.def(v1), hi);
1362          bld.pseudo(aco_opcode::p_create_vector, Definition(dst), lo, hi);
1363       } else if (dst.type() == RegType::sgpr) {
1364          aco_opcode opcode = dst.size() == 1 ? aco_opcode::s_not_b32 : aco_opcode::s_not_b64;
1365          bld.sop1(opcode, Definition(dst), bld.def(s1, scc), src);
1366       } else {
1367          isel_err(&instr->instr, "Unimplemented NIR instr bit size");
1368       }
1369       break;
1370    }
1371    case nir_op_iabs: {
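      /* Packed 16-bit iabs: compute 0 - x with v_pk_sub_u16 and take the signed
       * maximum with x; opsel_lo/opsel_hi forward the per-half source swizzle.
       * Other register classes use max(x, -x) or s_abs_i32 below. */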
1372       if (dst.regClass() == v1 && instr->def.bit_size == 16) {
1373          Temp src = get_alu_src_vop3p(ctx, instr->src[0]);
1374 
1375          unsigned opsel_lo = (instr->src[0].swizzle[0] & 1) << 1;
1376          unsigned opsel_hi = ((instr->src[0].swizzle[1] & 1) << 1) | 1;
1377 
1378          Temp sub = bld.vop3p(aco_opcode::v_pk_sub_u16, Definition(bld.tmp(v1)), Operand::zero(),
1379                               src, opsel_lo, opsel_hi);
1380          bld.vop3p(aco_opcode::v_pk_max_i16, Definition(dst), sub, src, opsel_lo, opsel_hi);
1381          emit_split_vector(ctx, dst, 2);
1382          break;
1383       }
1384       Temp src = get_alu_src(ctx, instr->src[0]);
1385       if (dst.regClass() == s1) {
1386          bld.sop1(aco_opcode::s_abs_i32, Definition(dst), bld.def(s1, scc), src);
1387       } else if (dst.regClass() == v1) {
1388          bld.vop2(aco_opcode::v_max_i32, Definition(dst), src,
1389                   bld.vsub32(bld.def(v1), Operand::zero(), src));
1390       } else if (dst.regClass() == v2b && ctx->program->gfx_level >= GFX10) {
1391          bld.vop3(
1392             aco_opcode::v_max_i16_e64, Definition(dst), src,
1393             bld.vop3(aco_opcode::v_sub_u16_e64, Definition(bld.tmp(v2b)), Operand::zero(2), src));
1394       } else if (dst.regClass() == v2b) {
1395          src = as_vgpr(ctx, src);
1396          bld.vop2(aco_opcode::v_max_i16, Definition(dst), src,
1397                   bld.vop2(aco_opcode::v_sub_u16, Definition(bld.tmp(v2b)), Operand::zero(2), src));
1398       } else {
1399          isel_err(&instr->instr, "Unimplemented NIR instr bit size");
1400       }
1401       break;
1402    }
1403    case nir_op_isign: {
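      /* isign(x) = clamp(x, -1, 1): s_max/s_min on SGPRs, v_med3 on VGPRs where
       * available. The 64-bit scalar path combines the sign (arithmetic shift by
       * 63) with (x != 0); the 64-bit vector path selects between the sign mask
       * and the constant 1 per half. */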
1404       Temp src = get_alu_src(ctx, instr->src[0]);
1405       if (dst.regClass() == s1) {
1406          Temp tmp =
1407             bld.sop2(aco_opcode::s_max_i32, bld.def(s1), bld.def(s1, scc), src, Operand::c32(-1));
1408          bld.sop2(aco_opcode::s_min_i32, Definition(dst), bld.def(s1, scc), tmp, Operand::c32(1u));
1409       } else if (dst.regClass() == s2) {
1410          Temp neg =
1411             bld.sop2(aco_opcode::s_ashr_i64, bld.def(s2), bld.def(s1, scc), src, Operand::c32(63u));
1412          Temp neqz;
1413          if (ctx->program->gfx_level >= GFX8)
1414             neqz = bld.sopc(aco_opcode::s_cmp_lg_u64, bld.def(s1, scc), src, Operand::zero());
1415          else
1416             neqz =
1417                bld.sop2(aco_opcode::s_or_b64, bld.def(s2), bld.def(s1, scc), src, Operand::zero())
1418                   .def(1)
1419                   .getTemp();
1420          /* SCC gets zero-extended to 64 bit */
1421          bld.sop2(aco_opcode::s_or_b64, Definition(dst), bld.def(s1, scc), neg, bld.scc(neqz));
1422       } else if (dst.regClass() == v1) {
1423          bld.vop3(aco_opcode::v_med3_i32, Definition(dst), Operand::c32(-1), src, Operand::c32(1u));
1424       } else if (dst.regClass() == v2b && ctx->program->gfx_level >= GFX9) {
1425          bld.vop3(aco_opcode::v_med3_i16, Definition(dst), Operand::c16(-1), src, Operand::c16(1u));
1426       } else if (dst.regClass() == v2b) {
1427          src = as_vgpr(ctx, src);
1428          bld.vop2(aco_opcode::v_max_i16, Definition(dst), Operand::c16(-1),
1429                   bld.vop2(aco_opcode::v_min_i16, Definition(bld.tmp(v1)), Operand::c16(1u), src));
1430       } else if (dst.regClass() == v2) {
1431          Temp upper = emit_extract_vector(ctx, src, 1, v1);
1432          Temp neg = bld.vop2(aco_opcode::v_ashrrev_i32, bld.def(v1), Operand::c32(31u), upper);
1433          Temp gtz = bld.vopc(aco_opcode::v_cmp_ge_i64, bld.def(bld.lm), Operand::zero(), src);
1434          Temp lower = bld.vop2(aco_opcode::v_cndmask_b32, bld.def(v1), Operand::c32(1u), neg, gtz);
1435          upper = bld.vop2(aco_opcode::v_cndmask_b32, bld.def(v1), Operand::zero(), neg, gtz);
1436          bld.pseudo(aco_opcode::p_create_vector, Definition(dst), lower, upper);
1437       } else {
1438          isel_err(&instr->instr, "Unimplemented NIR instr bit size");
1439       }
1440       break;
1441    }
1442    case nir_op_imax: {
1443       if (dst.regClass() == v2b && ctx->program->gfx_level >= GFX10) {
1444          emit_vop3a_instruction(ctx, instr, aco_opcode::v_max_i16_e64, dst);
1445       } else if (dst.regClass() == v2b) {
1446          emit_vop2_instruction(ctx, instr, aco_opcode::v_max_i16, dst, true);
1447       } else if (dst.regClass() == v1 && instr->def.bit_size == 16) {
1448          emit_vop3p_instruction(ctx, instr, aco_opcode::v_pk_max_i16, dst);
1449       } else if (dst.regClass() == v1) {
1450          emit_vop2_instruction(ctx, instr, aco_opcode::v_max_i32, dst, true);
1451       } else if (dst.regClass() == s1) {
1452          emit_sop2_instruction(ctx, instr, aco_opcode::s_max_i32, dst, true);
1453       } else {
1454          isel_err(&instr->instr, "Unimplemented NIR instr bit size");
1455       }
1456       break;
1457    }
1458    case nir_op_umax: {
1459       if (dst.regClass() == v2b && ctx->program->gfx_level >= GFX10) {
1460          emit_vop3a_instruction(ctx, instr, aco_opcode::v_max_u16_e64, dst);
1461       } else if (dst.regClass() == v2b) {
1462          emit_vop2_instruction(ctx, instr, aco_opcode::v_max_u16, dst, true);
1463       } else if (dst.regClass() == v1 && instr->def.bit_size == 16) {
1464          emit_vop3p_instruction(ctx, instr, aco_opcode::v_pk_max_u16, dst);
1465       } else if (dst.regClass() == v1) {
1466          emit_vop2_instruction(ctx, instr, aco_opcode::v_max_u32, dst, true);
1467       } else if (dst.regClass() == s1) {
1468          emit_sop2_instruction(ctx, instr, aco_opcode::s_max_u32, dst, true);
1469       } else {
1470          isel_err(&instr->instr, "Unimplemented NIR instr bit size");
1471       }
1472       break;
1473    }
1474    case nir_op_imin: {
1475       if (dst.regClass() == v2b && ctx->program->gfx_level >= GFX10) {
1476          emit_vop3a_instruction(ctx, instr, aco_opcode::v_min_i16_e64, dst);
1477       } else if (dst.regClass() == v2b) {
1478          emit_vop2_instruction(ctx, instr, aco_opcode::v_min_i16, dst, true);
1479       } else if (dst.regClass() == v1 && instr->def.bit_size == 16) {
1480          emit_vop3p_instruction(ctx, instr, aco_opcode::v_pk_min_i16, dst);
1481       } else if (dst.regClass() == v1) {
1482          emit_vop2_instruction(ctx, instr, aco_opcode::v_min_i32, dst, true);
1483       } else if (dst.regClass() == s1) {
1484          emit_sop2_instruction(ctx, instr, aco_opcode::s_min_i32, dst, true);
1485       } else {
1486          isel_err(&instr->instr, "Unimplemented NIR instr bit size");
1487       }
1488       break;
1489    }
1490    case nir_op_umin: {
1491       if (dst.regClass() == v2b && ctx->program->gfx_level >= GFX10) {
1492          emit_vop3a_instruction(ctx, instr, aco_opcode::v_min_u16_e64, dst);
1493       } else if (dst.regClass() == v2b) {
1494          emit_vop2_instruction(ctx, instr, aco_opcode::v_min_u16, dst, true);
1495       } else if (dst.regClass() == v1 && instr->def.bit_size == 16) {
1496          emit_vop3p_instruction(ctx, instr, aco_opcode::v_pk_min_u16, dst);
1497       } else if (dst.regClass() == v1) {
1498          emit_vop2_instruction(ctx, instr, aco_opcode::v_min_u32, dst, true);
1499       } else if (dst.regClass() == s1) {
1500          emit_sop2_instruction(ctx, instr, aco_opcode::s_min_u32, dst, true);
1501       } else {
1502          isel_err(&instr->instr, "Unimplemented NIR instr bit size");
1503       }
1504       break;
1505    }
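   /* Bitwise logic ops: 1-bit booleans are handled on the lane masks by
    * emit_boolean_logic, 64-bit VGPR values per 32-bit half by
    * emit_vop2_instruction_logic64, and everything else maps directly to the
    * 32/64-bit VALU or SALU opcode. */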
1506    case nir_op_ior: {
1507       if (instr->def.bit_size == 1) {
1508          emit_boolean_logic(ctx, instr, Builder::s_or, dst);
1509       } else if (dst.regClass() == v1 || dst.regClass() == v2b || dst.regClass() == v1b) {
1510          emit_vop2_instruction(ctx, instr, aco_opcode::v_or_b32, dst, true);
1511       } else if (dst.regClass() == v2) {
1512          emit_vop2_instruction_logic64(ctx, instr, aco_opcode::v_or_b32, dst);
1513       } else if (dst.regClass() == s1) {
1514          emit_sop2_instruction(ctx, instr, aco_opcode::s_or_b32, dst, true);
1515       } else if (dst.regClass() == s2) {
1516          emit_sop2_instruction(ctx, instr, aco_opcode::s_or_b64, dst, true);
1517       } else {
1518          isel_err(&instr->instr, "Unimplemented NIR instr bit size");
1519       }
1520       break;
1521    }
1522    case nir_op_iand: {
1523       if (instr->def.bit_size == 1) {
1524          emit_boolean_logic(ctx, instr, Builder::s_and, dst);
1525       } else if (dst.regClass() == v1 || dst.regClass() == v2b || dst.regClass() == v1b) {
1526          emit_vop2_instruction(ctx, instr, aco_opcode::v_and_b32, dst, true);
1527       } else if (dst.regClass() == v2) {
1528          emit_vop2_instruction_logic64(ctx, instr, aco_opcode::v_and_b32, dst);
1529       } else if (dst.regClass() == s1) {
1530          emit_sop2_instruction(ctx, instr, aco_opcode::s_and_b32, dst, true);
1531       } else if (dst.regClass() == s2) {
1532          emit_sop2_instruction(ctx, instr, aco_opcode::s_and_b64, dst, true);
1533       } else {
1534          isel_err(&instr->instr, "Unimplemented NIR instr bit size");
1535       }
1536       break;
1537    }
1538    case nir_op_ixor: {
1539       if (instr->def.bit_size == 1) {
1540          emit_boolean_logic(ctx, instr, Builder::s_xor, dst);
1541       } else if (dst.regClass() == v1 || dst.regClass() == v2b || dst.regClass() == v1b) {
1542          emit_vop2_instruction(ctx, instr, aco_opcode::v_xor_b32, dst, true);
1543       } else if (dst.regClass() == v2) {
1544          emit_vop2_instruction_logic64(ctx, instr, aco_opcode::v_xor_b32, dst);
1545       } else if (dst.regClass() == s1) {
1546          emit_sop2_instruction(ctx, instr, aco_opcode::s_xor_b32, dst, true);
1547       } else if (dst.regClass() == s2) {
1548          emit_sop2_instruction(ctx, instr, aco_opcode::s_xor_b64, dst, true);
1549       } else {
1550          isel_err(&instr->instr, "Unimplemented NIR instr bit size");
1551       }
1552       break;
1553    }
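   /* Shifts: the VALU opcodes are the "*rev" variants, which take the shift
    * amount as the first operand, so the NIR sources end up swapped (visible in
    * the 64-bit paths, where src[1] is passed before src[0]). */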
1554    case nir_op_ushr: {
1555       if (dst.regClass() == v2b && ctx->program->gfx_level >= GFX10) {
1556          emit_vop3a_instruction(ctx, instr, aco_opcode::v_lshrrev_b16_e64, dst, false, 2, true);
1557       } else if (dst.regClass() == v2b) {
1558          emit_vop2_instruction(ctx, instr, aco_opcode::v_lshrrev_b16, dst, false, true);
1559       } else if (dst.regClass() == v1 && instr->def.bit_size == 16) {
1560          emit_vop3p_instruction(ctx, instr, aco_opcode::v_pk_lshrrev_b16, dst, true);
1561       } else if (dst.regClass() == v1) {
1562          emit_vop2_instruction(ctx, instr, aco_opcode::v_lshrrev_b32, dst, false, true);
1563       } else if (dst.regClass() == v2 && ctx->program->gfx_level >= GFX8) {
1564          bld.vop3(aco_opcode::v_lshrrev_b64, Definition(dst), get_alu_src(ctx, instr->src[1]),
1565                   get_alu_src(ctx, instr->src[0]));
1566       } else if (dst.regClass() == v2) {
1567          emit_vop3a_instruction(ctx, instr, aco_opcode::v_lshr_b64, dst);
1568       } else if (dst.regClass() == s2) {
1569          emit_sop2_instruction(ctx, instr, aco_opcode::s_lshr_b64, dst, true);
1570       } else if (dst.regClass() == s1) {
1571          emit_sop2_instruction(ctx, instr, aco_opcode::s_lshr_b32, dst, true);
1572       } else {
1573          isel_err(&instr->instr, "Unimplemented NIR instr bit size");
1574       }
1575       break;
1576    }
1577    case nir_op_ishl: {
1578       if (dst.regClass() == v2b && ctx->program->gfx_level >= GFX10) {
1579          emit_vop3a_instruction(ctx, instr, aco_opcode::v_lshlrev_b16_e64, dst, false, 2, true);
1580       } else if (dst.regClass() == v2b) {
1581          emit_vop2_instruction(ctx, instr, aco_opcode::v_lshlrev_b16, dst, false, true);
1582       } else if (dst.regClass() == v1 && instr->def.bit_size == 16) {
1583          emit_vop3p_instruction(ctx, instr, aco_opcode::v_pk_lshlrev_b16, dst, true);
1584       } else if (dst.regClass() == v1) {
1585          emit_vop2_instruction(ctx, instr, aco_opcode::v_lshlrev_b32, dst, false, true, false,
1586                                false, 1);
1587       } else if (dst.regClass() == v2 && ctx->program->gfx_level >= GFX8) {
1588          bld.vop3(aco_opcode::v_lshlrev_b64_e64, Definition(dst), get_alu_src(ctx, instr->src[1]),
1589                   get_alu_src(ctx, instr->src[0]));
1590       } else if (dst.regClass() == v2) {
1591          emit_vop3a_instruction(ctx, instr, aco_opcode::v_lshl_b64, dst);
1592       } else if (dst.regClass() == s1) {
1593          emit_sop2_instruction(ctx, instr, aco_opcode::s_lshl_b32, dst, true, 1);
1594       } else if (dst.regClass() == s2) {
1595          emit_sop2_instruction(ctx, instr, aco_opcode::s_lshl_b64, dst, true);
1596       } else {
1597          isel_err(&instr->instr, "Unimplemented NIR instr bit size");
1598       }
1599       break;
1600    }
1601    case nir_op_ishr: {
1602       if (dst.regClass() == v2b && ctx->program->gfx_level >= GFX10) {
1603          emit_vop3a_instruction(ctx, instr, aco_opcode::v_ashrrev_i16_e64, dst, false, 2, true);
1604       } else if (dst.regClass() == v2b) {
1605          emit_vop2_instruction(ctx, instr, aco_opcode::v_ashrrev_i16, dst, false, true);
1606       } else if (dst.regClass() == v1 && instr->def.bit_size == 16) {
1607          emit_vop3p_instruction(ctx, instr, aco_opcode::v_pk_ashrrev_i16, dst, true);
1608       } else if (dst.regClass() == v1) {
1609          emit_vop2_instruction(ctx, instr, aco_opcode::v_ashrrev_i32, dst, false, true);
1610       } else if (dst.regClass() == v2 && ctx->program->gfx_level >= GFX8) {
1611          bld.vop3(aco_opcode::v_ashrrev_i64, Definition(dst), get_alu_src(ctx, instr->src[1]),
1612                   get_alu_src(ctx, instr->src[0]));
1613       } else if (dst.regClass() == v2) {
1614          emit_vop3a_instruction(ctx, instr, aco_opcode::v_ashr_i64, dst);
1615       } else if (dst.regClass() == s1) {
1616          emit_sop2_instruction(ctx, instr, aco_opcode::s_ashr_i32, dst, true);
1617       } else if (dst.regClass() == s2) {
1618          emit_sop2_instruction(ctx, instr, aco_opcode::s_ashr_i64, dst, true);
1619       } else {
1620          isel_err(&instr->instr, "Unimplemented NIR instr bit size");
1621       }
1622       break;
1623    }
1624    case nir_op_find_lsb: {
1625       Temp src = get_alu_src(ctx, instr->src[0]);
1626       if (src.regClass() == s1) {
1627          bld.sop1(aco_opcode::s_ff1_i32_b32, Definition(dst), src);
1628       } else if (src.regClass() == v1) {
1629          emit_vop1_instruction(ctx, instr, aco_opcode::v_ffbl_b32, dst);
1630       } else if (src.regClass() == s2) {
1631          bld.sop1(aco_opcode::s_ff1_i32_b64, Definition(dst), src);
1632       } else if (src.regClass() == v2) {
1633          Temp lo = bld.tmp(v1), hi = bld.tmp(v1);
1634          bld.pseudo(aco_opcode::p_split_vector, Definition(lo), Definition(hi), src);
1635          lo = bld.vop1(aco_opcode::v_ffbl_b32, bld.def(v1), lo);
1636          hi = bld.vop1(aco_opcode::v_ffbl_b32, bld.def(v1), hi);
1637          hi = bld.vop2(aco_opcode::v_or_b32, bld.def(v1), Operand::c32(32u), hi);
1638          bld.vop2(aco_opcode::v_min_u32, Definition(dst), lo, hi);
1639       } else {
1640          isel_err(&instr->instr, "Unimplemented NIR instr bit size");
1641       }
1642       break;
1643    }
1644    case nir_op_ufind_msb:
1645    case nir_op_ifind_msb: {
1646       Temp src = get_alu_src(ctx, instr->src[0]);
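      /* s_flbit/v_ffbh count the position of the most significant (or first
       * non-sign) bit from the top, so convert with (bits - 1) - msb_rev. A
       * source without such a bit yields -1, which makes the subtraction
       * underflow; the resulting carry/borrow selects the -1 result. */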
1647       if (src.regClass() == s1 || src.regClass() == s2) {
1648          aco_opcode op = src.regClass() == s2
1649                             ? (instr->op == nir_op_ufind_msb ? aco_opcode::s_flbit_i32_b64
1650                                                              : aco_opcode::s_flbit_i32_i64)
1651                             : (instr->op == nir_op_ufind_msb ? aco_opcode::s_flbit_i32_b32
1652                                                              : aco_opcode::s_flbit_i32);
1653          Temp msb_rev = bld.sop1(op, bld.def(s1), src);
1654 
1655          Builder::Result sub = bld.sop2(aco_opcode::s_sub_u32, bld.def(s1), bld.def(s1, scc),
1656                                         Operand::c32(src.size() * 32u - 1u), msb_rev);
1657          Temp msb = sub.def(0).getTemp();
1658          Temp carry = sub.def(1).getTemp();
1659 
1660          bld.sop2(aco_opcode::s_cselect_b32, Definition(dst), Operand::c32(-1), msb,
1661                   bld.scc(carry));
1662       } else if (src.regClass() == v1) {
1663          aco_opcode op =
1664             instr->op == nir_op_ufind_msb ? aco_opcode::v_ffbh_u32 : aco_opcode::v_ffbh_i32;
1665          Temp msb_rev = bld.tmp(v1);
1666          emit_vop1_instruction(ctx, instr, op, msb_rev);
1667          Temp msb = bld.tmp(v1);
1668          Temp carry =
1669             bld.vsub32(Definition(msb), Operand::c32(31u), Operand(msb_rev), true).def(1).getTemp();
1670          bld.vop2(aco_opcode::v_cndmask_b32, Definition(dst), msb, msb_rev, carry);
1671       } else if (src.regClass() == v2) {
1672          aco_opcode op =
1673             instr->op == nir_op_ufind_msb ? aco_opcode::v_ffbh_u32 : aco_opcode::v_ffbh_i32;
1674 
1675          Temp lo = bld.tmp(v1), hi = bld.tmp(v1);
1676          bld.pseudo(aco_opcode::p_split_vector, Definition(lo), Definition(hi), src);
1677 
1678          lo = bld.vop1(op, bld.def(v1), lo);
1679          lo = bld.vop2(aco_opcode::v_or_b32, bld.def(v1), Operand::c32(32), lo);
1680          hi = bld.vop1(op, bld.def(v1), hi);
1681          Temp msb_rev = bld.vop2(aco_opcode::v_min_u32, bld.def(v1), lo, hi);
1682 
1683          Temp msb = bld.tmp(v1);
1684          Temp carry =
1685             bld.vsub32(Definition(msb), Operand::c32(63u), Operand(msb_rev), true).def(1).getTemp();
1686          bld.vop2(aco_opcode::v_cndmask_b32, Definition(dst), msb, msb_rev, carry);
1687       } else {
1688          isel_err(&instr->instr, "Unimplemented NIR instr bit size");
1689       }
1690       break;
1691    }
1692    case nir_op_ufind_msb_rev:
1693    case nir_op_ifind_msb_rev: {
1694       Temp src = get_alu_src(ctx, instr->src[0]);
1695       if (src.regClass() == s1) {
1696          aco_opcode op = instr->op == nir_op_ufind_msb_rev ? aco_opcode::s_flbit_i32_b32
1697                                                            : aco_opcode::s_flbit_i32;
1698          bld.sop1(op, Definition(dst), src);
1699       } else if (src.regClass() == v1) {
1700          aco_opcode op =
1701             instr->op == nir_op_ufind_msb_rev ? aco_opcode::v_ffbh_u32 : aco_opcode::v_ffbh_i32;
1702          emit_vop1_instruction(ctx, instr, op, dst);
1703       } else {
1704          isel_err(&instr->instr, "Unimplemented NIR instr bit size");
1705       }
1706       break;
1707    }
1708    case nir_op_bitfield_reverse: {
1709       if (dst.regClass() == s1) {
1710          bld.sop1(aco_opcode::s_brev_b32, Definition(dst), get_alu_src(ctx, instr->src[0]));
1711       } else if (dst.regClass() == v1) {
1712          bld.vop1(aco_opcode::v_bfrev_b32, Definition(dst), get_alu_src(ctx, instr->src[0]));
1713       } else {
1714          isel_err(&instr->instr, "Unimplemented NIR instr bit size");
1715       }
1716       break;
1717    }
1718    case nir_op_iadd: {
1719       if (dst.regClass() == s1) {
1720          emit_sop2_instruction(ctx, instr, aco_opcode::s_add_u32, dst, true);
1721          break;
1722       } else if (dst.bytes() <= 2 && ctx->program->gfx_level >= GFX10) {
1723          emit_vop3a_instruction(ctx, instr, aco_opcode::v_add_u16_e64, dst);
1724          break;
1725       } else if (dst.bytes() <= 2 && ctx->program->gfx_level >= GFX8) {
1726          emit_vop2_instruction(ctx, instr, aco_opcode::v_add_u16, dst, true);
1727          break;
1728       } else if (dst.regClass() == v1 && instr->def.bit_size == 16) {
1729          emit_vop3p_instruction(ctx, instr, aco_opcode::v_pk_add_u16, dst);
1730          break;
1731       }
1732 
1733       Temp src0 = get_alu_src(ctx, instr->src[0]);
1734       Temp src1 = get_alu_src(ctx, instr->src[1]);
1735       if (dst.type() == RegType::vgpr && dst.bytes() <= 4) {
1736          if (instr->no_unsigned_wrap)
1737             bld.nuw().vadd32(Definition(dst), Operand(src0), Operand(src1));
1738          else
1739             bld.vadd32(Definition(dst), Operand(src0), Operand(src1));
1740          break;
1741       }
1742 
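      /* 64-bit addition: split both operands into dword halves, add the low
       * halves, and feed the carry into s_addc_u32 (SGPR) or into the carry-in
       * of the second vadd32 (VGPR) before recombining with p_create_vector. */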
1743       assert(src0.size() == 2 && src1.size() == 2);
1744       Temp src00 = bld.tmp(src0.type(), 1);
1745       Temp src01 = bld.tmp(dst.type(), 1);
1746       bld.pseudo(aco_opcode::p_split_vector, Definition(src00), Definition(src01), src0);
1747       Temp src10 = bld.tmp(src1.type(), 1);
1748       Temp src11 = bld.tmp(dst.type(), 1);
1749       bld.pseudo(aco_opcode::p_split_vector, Definition(src10), Definition(src11), src1);
1750 
1751       if (dst.regClass() == s2) {
1752          Temp carry = bld.tmp(s1);
1753          Temp dst0 =
1754             bld.sop2(aco_opcode::s_add_u32, bld.def(s1), bld.scc(Definition(carry)), src00, src10);
1755          Temp dst1 = bld.sop2(aco_opcode::s_addc_u32, bld.def(s1), bld.def(s1, scc), src01, src11,
1756                               bld.scc(carry));
1757          bld.pseudo(aco_opcode::p_create_vector, Definition(dst), dst0, dst1);
1758       } else if (dst.regClass() == v2) {
1759          Temp dst0 = bld.tmp(v1);
1760          Temp carry = bld.vadd32(Definition(dst0), src00, src10, true).def(1).getTemp();
1761          Temp dst1 = bld.vadd32(bld.def(v1), src01, src11, false, carry);
1762          bld.pseudo(aco_opcode::p_create_vector, Definition(dst), dst0, dst1);
1763       } else {
1764          isel_err(&instr->instr, "Unimplemented NIR instr bit size");
1765       }
1766       break;
1767    }
1768    case nir_op_uadd_sat: {
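      /* Unsigned saturating add: the 16-bit VALU forms set the clamp bit, the
       * scalar 32-bit form selects UINT32_MAX on carry-out, 32-bit VGPR uses the
       * uadd32_sat helper, and the 64-bit paths chain the carries and saturate
       * on the final carry-out. */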
1769       if (dst.regClass() == v1 && instr->def.bit_size == 16) {
1770          Instruction* add_instr = emit_vop3p_instruction(ctx, instr, aco_opcode::v_pk_add_u16, dst);
1771          add_instr->valu().clamp = 1;
1772          break;
1773       }
1774       Temp src0 = get_alu_src(ctx, instr->src[0]);
1775       Temp src1 = get_alu_src(ctx, instr->src[1]);
1776       if (dst.regClass() == s1) {
1777          Temp tmp = bld.tmp(s1), carry = bld.tmp(s1);
1778          bld.sop2(aco_opcode::s_add_u32, Definition(tmp), bld.scc(Definition(carry)), src0, src1);
1779          bld.sop2(aco_opcode::s_cselect_b32, Definition(dst), Operand::c32(-1), tmp,
1780                   bld.scc(carry));
1781          break;
1782       } else if (dst.regClass() == v2b) {
1783          Instruction* add_instr;
1784          if (ctx->program->gfx_level >= GFX10) {
1785             add_instr = bld.vop3(aco_opcode::v_add_u16_e64, Definition(dst), src0, src1).instr;
1786          } else {
1787             if (src1.type() == RegType::sgpr)
1788                std::swap(src0, src1);
1789             add_instr =
1790                bld.vop2_e64(aco_opcode::v_add_u16, Definition(dst), src0, as_vgpr(ctx, src1)).instr;
1791          }
1792          add_instr->valu().clamp = 1;
1793          break;
1794       } else if (dst.regClass() == v1) {
1795          uadd32_sat(bld, Definition(dst), src0, src1);
1796          break;
1797       }
1798 
1799       assert(src0.size() == 2 && src1.size() == 2);
1800 
1801       Temp src00 = bld.tmp(src0.type(), 1);
1802       Temp src01 = bld.tmp(src0.type(), 1);
1803       bld.pseudo(aco_opcode::p_split_vector, Definition(src00), Definition(src01), src0);
1804       Temp src10 = bld.tmp(src1.type(), 1);
1805       Temp src11 = bld.tmp(src1.type(), 1);
1806       bld.pseudo(aco_opcode::p_split_vector, Definition(src10), Definition(src11), src1);
1807 
1808       if (dst.regClass() == s2) {
1809          Temp carry0 = bld.tmp(s1);
1810          Temp carry1 = bld.tmp(s1);
1811 
1812          Temp no_sat0 =
1813             bld.sop2(aco_opcode::s_add_u32, bld.def(s1), bld.scc(Definition(carry0)), src00, src10);
1814          Temp no_sat1 = bld.sop2(aco_opcode::s_addc_u32, bld.def(s1), bld.scc(Definition(carry1)),
1815                                  src01, src11, bld.scc(carry0));
1816 
1817          Temp no_sat = bld.pseudo(aco_opcode::p_create_vector, bld.def(s2), no_sat0, no_sat1);
1818 
1819          bld.sop2(aco_opcode::s_cselect_b64, Definition(dst), Operand::c64(-1), no_sat,
1820                   bld.scc(carry1));
1821       } else if (dst.regClass() == v2) {
1822          Temp no_sat0 = bld.tmp(v1);
1823          Temp dst0 = bld.tmp(v1);
1824          Temp dst1 = bld.tmp(v1);
1825 
1826          Temp carry0 = bld.vadd32(Definition(no_sat0), src00, src10, true).def(1).getTemp();
1827          Temp carry1;
1828 
1829          if (ctx->program->gfx_level >= GFX8) {
1830             carry1 = bld.tmp(bld.lm);
1831             bld.vop2_e64(aco_opcode::v_addc_co_u32, Definition(dst1), Definition(carry1),
1832                          as_vgpr(ctx, src01), as_vgpr(ctx, src11), carry0)
1833                ->valu()
1834                .clamp = 1;
1835          } else {
1836             Temp no_sat1 = bld.tmp(v1);
1837             carry1 = bld.vadd32(Definition(no_sat1), src01, src11, true, carry0).def(1).getTemp();
1838             bld.vop2_e64(aco_opcode::v_cndmask_b32, Definition(dst1), no_sat1, Operand::c32(-1),
1839                          carry1);
1840          }
1841 
1842          bld.vop2_e64(aco_opcode::v_cndmask_b32, Definition(dst0), no_sat0, Operand::c32(-1),
1843                       carry1);
1844          bld.pseudo(aco_opcode::p_create_vector, Definition(dst), dst0, dst1);
1845       } else {
1846          isel_err(&instr->instr, "Unimplemented NIR instr bit size");
1847       }
1848       break;
1849    }
1850    case nir_op_iadd_sat: {
1851       if (dst.regClass() == v1 && instr->def.bit_size == 16) {
1852          Instruction* add_instr = emit_vop3p_instruction(ctx, instr, aco_opcode::v_pk_add_i16, dst);
1853          add_instr->valu().clamp = 1;
1854          break;
1855       }
1856       Temp src0 = get_alu_src(ctx, instr->src[0]);
1857       Temp src1 = get_alu_src(ctx, instr->src[1]);
1858       if (dst.regClass() == s1) {
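         /* bound = INT32_MAX + (src1 < 0), i.e. INT32_MAX when adding a
          * non-negative value and INT32_MIN when adding a negative one; it is
          * selected whenever s_add_i32 reports signed overflow. */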
1859          Temp cond = bld.sopc(aco_opcode::s_cmp_lt_i32, bld.def(s1, scc), src1, Operand::zero());
1860          Temp bound = bld.sop2(aco_opcode::s_add_u32, bld.def(s1), bld.scc(bld.def(s1, scc)),
1861                                Operand::c32(INT32_MAX), cond);
1862          Temp overflow = bld.tmp(s1);
1863          Temp add =
1864             bld.sop2(aco_opcode::s_add_i32, bld.def(s1), bld.scc(Definition(overflow)), src0, src1);
1865          bld.sop2(aco_opcode::s_cselect_b32, Definition(dst), bound, add, bld.scc(overflow));
1866          break;
1867       }
1868 
1869       src1 = as_vgpr(ctx, src1);
1870 
1871       if (dst.regClass() == v2b) {
1872          Instruction* add_instr =
1873             bld.vop3(aco_opcode::v_add_i16, Definition(dst), src0, src1).instr;
1874          add_instr->valu().clamp = 1;
1875       } else if (dst.regClass() == v1) {
1876          Instruction* add_instr =
1877             bld.vop3(aco_opcode::v_add_i32, Definition(dst), src0, src1).instr;
1878          add_instr->valu().clamp = 1;
1879       } else {
1880          isel_err(&instr->instr, "Unimplemented NIR instr bit size");
1881       }
1882       break;
1883    }
1884    case nir_op_uadd_carry: {
1885       Temp src0 = get_alu_src(ctx, instr->src[0]);
1886       Temp src1 = get_alu_src(ctx, instr->src[1]);
1887       if (dst.regClass() == s1) {
1888          bld.sop2(aco_opcode::s_add_u32, bld.def(s1), bld.scc(Definition(dst)), src0, src1);
1889          break;
1890       }
1891       if (dst.regClass() == v1) {
1892          Temp carry = bld.vadd32(bld.def(v1), src0, src1, true).def(1).getTemp();
1893          bld.vop2_e64(aco_opcode::v_cndmask_b32, Definition(dst), Operand::zero(), Operand::c32(1u),
1894                       carry);
1895          break;
1896       }
1897 
1898       Temp src00 = bld.tmp(src0.type(), 1);
1899       Temp src01 = bld.tmp(dst.type(), 1);
1900       bld.pseudo(aco_opcode::p_split_vector, Definition(src00), Definition(src01), src0);
1901       Temp src10 = bld.tmp(src1.type(), 1);
1902       Temp src11 = bld.tmp(dst.type(), 1);
1903       bld.pseudo(aco_opcode::p_split_vector, Definition(src10), Definition(src11), src1);
1904       if (dst.regClass() == s2) {
1905          Temp carry = bld.tmp(s1);
1906          bld.sop2(aco_opcode::s_add_u32, bld.def(s1), bld.scc(Definition(carry)), src00, src10);
1907          carry = bld.sop2(aco_opcode::s_addc_u32, bld.def(s1), bld.scc(bld.def(s1)), src01, src11,
1908                           bld.scc(carry))
1909                     .def(1)
1910                     .getTemp();
1911          bld.pseudo(aco_opcode::p_create_vector, Definition(dst), carry, Operand::zero());
1912       } else if (dst.regClass() == v2) {
1913          Temp carry = bld.vadd32(bld.def(v1), src00, src10, true).def(1).getTemp();
1914          carry = bld.vadd32(bld.def(v1), src01, src11, true, carry).def(1).getTemp();
1915          carry = bld.vop2_e64(aco_opcode::v_cndmask_b32, bld.def(v1), Operand::zero(),
1916                               Operand::c32(1u), carry);
1917          bld.pseudo(aco_opcode::p_create_vector, Definition(dst), carry, Operand::zero());
1918       } else {
1919          isel_err(&instr->instr, "Unimplemented NIR instr bit size");
1920       }
1921       break;
1922    }
1923    case nir_op_isub: {
1924       if (dst.regClass() == s1) {
1925          emit_sop2_instruction(ctx, instr, aco_opcode::s_sub_i32, dst, true);
1926          break;
1927       } else if (dst.regClass() == v1 && instr->def.bit_size == 16) {
1928          emit_vop3p_instruction(ctx, instr, aco_opcode::v_pk_sub_u16, dst);
1929          break;
1930       }
1931 
1932       Temp src0 = get_alu_src(ctx, instr->src[0]);
1933       Temp src1 = get_alu_src(ctx, instr->src[1]);
1934       if (dst.regClass() == v1) {
1935          bld.vsub32(Definition(dst), src0, src1);
1936          break;
1937       } else if (dst.bytes() <= 2) {
1938          if (ctx->program->gfx_level >= GFX10)
1939             bld.vop3(aco_opcode::v_sub_u16_e64, Definition(dst), src0, src1);
1940          else if (src1.type() == RegType::sgpr)
1941             bld.vop2(aco_opcode::v_subrev_u16, Definition(dst), src1, as_vgpr(ctx, src0));
1942          else if (ctx->program->gfx_level >= GFX8)
1943             bld.vop2(aco_opcode::v_sub_u16, Definition(dst), src0, as_vgpr(ctx, src1));
1944          else
1945             bld.vsub32(Definition(dst), src0, src1);
1946          break;
1947       }
1948 
1949       Temp src00 = bld.tmp(src0.type(), 1);
1950       Temp src01 = bld.tmp(dst.type(), 1);
1951       bld.pseudo(aco_opcode::p_split_vector, Definition(src00), Definition(src01), src0);
1952       Temp src10 = bld.tmp(src1.type(), 1);
1953       Temp src11 = bld.tmp(dst.type(), 1);
1954       bld.pseudo(aco_opcode::p_split_vector, Definition(src10), Definition(src11), src1);
1955       if (dst.regClass() == s2) {
1956          Temp borrow = bld.tmp(s1);
1957          Temp dst0 =
1958             bld.sop2(aco_opcode::s_sub_u32, bld.def(s1), bld.scc(Definition(borrow)), src00, src10);
1959          Temp dst1 = bld.sop2(aco_opcode::s_subb_u32, bld.def(s1), bld.def(s1, scc), src01, src11,
1960                               bld.scc(borrow));
1961          bld.pseudo(aco_opcode::p_create_vector, Definition(dst), dst0, dst1);
1962       } else if (dst.regClass() == v2) {
1963          Temp lower = bld.tmp(v1);
1964          Temp borrow = bld.vsub32(Definition(lower), src00, src10, true).def(1).getTemp();
1965          Temp upper = bld.vsub32(bld.def(v1), src01, src11, false, borrow);
1966          bld.pseudo(aco_opcode::p_create_vector, Definition(dst), lower, upper);
1967       } else {
1968          isel_err(&instr->instr, "Unimplemented NIR instr bit size");
1969       }
1970       break;
1971    }
1972    case nir_op_usub_borrow: {
1973       Temp src0 = get_alu_src(ctx, instr->src[0]);
1974       Temp src1 = get_alu_src(ctx, instr->src[1]);
1975       if (dst.regClass() == s1) {
1976          bld.sop2(aco_opcode::s_sub_u32, bld.def(s1), bld.scc(Definition(dst)), src0, src1);
1977          break;
1978       } else if (dst.regClass() == v1) {
1979          Temp borrow = bld.vsub32(bld.def(v1), src0, src1, true).def(1).getTemp();
1980          bld.vop2_e64(aco_opcode::v_cndmask_b32, Definition(dst), Operand::zero(), Operand::c32(1u),
1981                       borrow);
1982          break;
1983       }
1984 
1985       Temp src00 = bld.tmp(src0.type(), 1);
1986       Temp src01 = bld.tmp(dst.type(), 1);
1987       bld.pseudo(aco_opcode::p_split_vector, Definition(src00), Definition(src01), src0);
1988       Temp src10 = bld.tmp(src1.type(), 1);
1989       Temp src11 = bld.tmp(dst.type(), 1);
1990       bld.pseudo(aco_opcode::p_split_vector, Definition(src10), Definition(src11), src1);
1991       if (dst.regClass() == s2) {
1992          Temp borrow = bld.tmp(s1);
1993          bld.sop2(aco_opcode::s_sub_u32, bld.def(s1), bld.scc(Definition(borrow)), src00, src10);
1994          borrow = bld.sop2(aco_opcode::s_subb_u32, bld.def(s1), bld.scc(bld.def(s1)), src01, src11,
1995                            bld.scc(borrow))
1996                      .def(1)
1997                      .getTemp();
1998          bld.pseudo(aco_opcode::p_create_vector, Definition(dst), borrow, Operand::zero());
1999       } else if (dst.regClass() == v2) {
2000          Temp borrow = bld.vsub32(bld.def(v1), src00, src10, true).def(1).getTemp();
2001          borrow = bld.vsub32(bld.def(v1), src01, src11, true, Operand(borrow)).def(1).getTemp();
2002          borrow = bld.vop2_e64(aco_opcode::v_cndmask_b32, bld.def(v1), Operand::zero(),
2003                                Operand::c32(1u), borrow);
2004          bld.pseudo(aco_opcode::p_create_vector, Definition(dst), borrow, Operand::zero());
2005       } else {
2006          isel_err(&instr->instr, "Unimplemented NIR instr bit size");
2007       }
2008       break;
2009    }
2010    case nir_op_usub_sat: {
2011       if (dst.regClass() == v1 && instr->def.bit_size == 16) {
2012          Instruction* sub_instr = emit_vop3p_instruction(ctx, instr, aco_opcode::v_pk_sub_u16, dst);
2013          sub_instr->valu().clamp = 1;
2014          break;
2015       }
2016       Temp src0 = get_alu_src(ctx, instr->src[0]);
2017       Temp src1 = get_alu_src(ctx, instr->src[1]);
2018       if (dst.regClass() == s1) {
2019          Temp tmp = bld.tmp(s1), carry = bld.tmp(s1);
2020          bld.sop2(aco_opcode::s_sub_u32, Definition(tmp), bld.scc(Definition(carry)), src0, src1);
2021          bld.sop2(aco_opcode::s_cselect_b32, Definition(dst), Operand::c32(0), tmp, bld.scc(carry));
2022          break;
2023       } else if (dst.regClass() == v2b) {
2024          Instruction* sub_instr;
2025          if (ctx->program->gfx_level >= GFX10) {
2026             sub_instr = bld.vop3(aco_opcode::v_sub_u16_e64, Definition(dst), src0, src1).instr;
2027          } else {
2028             aco_opcode op = aco_opcode::v_sub_u16;
2029             if (src1.type() == RegType::sgpr) {
2030                std::swap(src0, src1);
2031                op = aco_opcode::v_subrev_u16;
2032             }
2033             sub_instr = bld.vop2_e64(op, Definition(dst), src0, as_vgpr(ctx, src1)).instr;
2034          }
2035          sub_instr->valu().clamp = 1;
2036          break;
2037       } else if (dst.regClass() == v1) {
2038          usub32_sat(bld, Definition(dst), src0, as_vgpr(ctx, src1));
2039          break;
2040       }
2041 
2042       assert(src0.size() == 2 && src1.size() == 2);
2043       Temp src00 = bld.tmp(src0.type(), 1);
2044       Temp src01 = bld.tmp(src0.type(), 1);
2045       bld.pseudo(aco_opcode::p_split_vector, Definition(src00), Definition(src01), src0);
2046       Temp src10 = bld.tmp(src1.type(), 1);
2047       Temp src11 = bld.tmp(src1.type(), 1);
2048       bld.pseudo(aco_opcode::p_split_vector, Definition(src10), Definition(src11), src1);
2049 
2050       if (dst.regClass() == s2) {
2051          Temp carry0 = bld.tmp(s1);
2052          Temp carry1 = bld.tmp(s1);
2053 
2054          Temp no_sat0 =
2055             bld.sop2(aco_opcode::s_sub_u32, bld.def(s1), bld.scc(Definition(carry0)), src00, src10);
2056          Temp no_sat1 = bld.sop2(aco_opcode::s_subb_u32, bld.def(s1), bld.scc(Definition(carry1)),
2057                                  src01, src11, bld.scc(carry0));
2058 
2059          Temp no_sat = bld.pseudo(aco_opcode::p_create_vector, bld.def(s2), no_sat0, no_sat1);
2060 
2061          bld.sop2(aco_opcode::s_cselect_b64, Definition(dst), Operand::c64(0ull), no_sat,
2062                   bld.scc(carry1));
2063       } else if (dst.regClass() == v2) {
2064          Temp no_sat0 = bld.tmp(v1);
2065          Temp dst0 = bld.tmp(v1);
2066          Temp dst1 = bld.tmp(v1);
2067 
2068          Temp carry0 = bld.vsub32(Definition(no_sat0), src00, src10, true).def(1).getTemp();
2069          Temp carry1;
2070 
2071          if (ctx->program->gfx_level >= GFX8) {
2072             carry1 = bld.tmp(bld.lm);
2073             bld.vop2_e64(aco_opcode::v_subb_co_u32, Definition(dst1), Definition(carry1),
2074                          as_vgpr(ctx, src01), as_vgpr(ctx, src11), carry0)
2075                ->valu()
2076                .clamp = 1;
2077          } else {
2078             Temp no_sat1 = bld.tmp(v1);
2079             carry1 = bld.vsub32(Definition(no_sat1), src01, src11, true, carry0).def(1).getTemp();
2080             bld.vop2_e64(aco_opcode::v_cndmask_b32, Definition(dst1), no_sat1, Operand::c32(0u),
2081                          carry1);
2082          }
2083 
2084          bld.vop2_e64(aco_opcode::v_cndmask_b32, Definition(dst0), no_sat0, Operand::c32(0u),
2085                       carry1);
2086          bld.pseudo(aco_opcode::p_create_vector, Definition(dst), dst0, dst1);
2087       } else {
2088          isel_err(&instr->instr, "Unimplemented NIR instr bit size");
2089       }
2090       break;
2091    }
2092    case nir_op_isub_sat: {
2093       if (dst.regClass() == v1 && instr->def.bit_size == 16) {
2094          Instruction* sub_instr = emit_vop3p_instruction(ctx, instr, aco_opcode::v_pk_sub_i16, dst);
2095          sub_instr->valu().clamp = 1;
2096          break;
2097       }
2098       Temp src0 = get_alu_src(ctx, instr->src[0]);
2099       Temp src1 = get_alu_src(ctx, instr->src[1]);
2100       if (dst.regClass() == s1) {
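         /* bound = INT32_MAX + (src1 > 0), i.e. INT32_MIN when subtracting a
          * positive value and INT32_MAX otherwise; it is selected whenever
          * s_sub_i32 reports signed overflow. */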
2101          Temp cond = bld.sopc(aco_opcode::s_cmp_gt_i32, bld.def(s1, scc), src1, Operand::zero());
2102          Temp bound = bld.sop2(aco_opcode::s_add_u32, bld.def(s1), bld.scc(bld.def(s1, scc)),
2103                                Operand::c32(INT32_MAX), cond);
2104          Temp overflow = bld.tmp(s1);
2105          Temp sub =
2106             bld.sop2(aco_opcode::s_sub_i32, bld.def(s1), bld.scc(Definition(overflow)), src0, src1);
2107          bld.sop2(aco_opcode::s_cselect_b32, Definition(dst), bound, sub, bld.scc(overflow));
2108          break;
2109       }
2110 
2111       src1 = as_vgpr(ctx, src1);
2112 
2113       if (dst.regClass() == v2b) {
2114          Instruction* sub_instr =
2115             bld.vop3(aco_opcode::v_sub_i16, Definition(dst), src0, src1).instr;
2116          sub_instr->valu().clamp = 1;
2117       } else if (dst.regClass() == v1) {
2118          Instruction* sub_instr =
2119             bld.vop3(aco_opcode::v_sub_i32, Definition(dst), src0, src1).instr;
2120          sub_instr->valu().clamp = 1;
2121       } else {
2122          isel_err(&instr->instr, "Unimplemented NIR instr bit size");
2123       }
2124       break;
2125    }
2126    case nir_op_imul: {
2127       if (dst.bytes() <= 2 && ctx->program->gfx_level >= GFX10) {
2128          emit_vop3a_instruction(ctx, instr, aco_opcode::v_mul_lo_u16_e64, dst);
2129       } else if (dst.bytes() <= 2 && ctx->program->gfx_level >= GFX8) {
2130          emit_vop2_instruction(ctx, instr, aco_opcode::v_mul_lo_u16, dst, true);
2131       } else if (dst.regClass() == v1 && instr->def.bit_size == 16) {
2132          emit_vop3p_instruction(ctx, instr, aco_opcode::v_pk_mul_lo_u16, dst);
2133       } else if (dst.type() == RegType::vgpr) {
2134          uint32_t src0_ub = get_alu_src_ub(ctx, instr, 0);
2135          uint32_t src1_ub = get_alu_src_ub(ctx, instr, 1);
2136 
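         /* If range analysis proves both operands fit in 24 bits, the cheaper
          * v_mul_u32_u24 suffices (with nuw when the product also fits in 16
          * bits); a known-constant operand goes through the v_mul_imm helper;
          * otherwise a full v_mul_lo_u32 is emitted. */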
2137          if (src0_ub <= 0xffffff && src1_ub <= 0xffffff) {
2138             bool nuw_16bit = src0_ub <= 0xffff && src1_ub <= 0xffff && src0_ub * src1_ub <= 0xffff;
2139             emit_vop2_instruction(ctx, instr, aco_opcode::v_mul_u32_u24, dst,
2140                                   true /* commutative */, false, false, nuw_16bit, 0x3);
2141          } else if (nir_src_is_const(instr->src[0].src)) {
2142             bld.v_mul_imm(Definition(dst), get_alu_src(ctx, instr->src[1]),
2143                           nir_src_as_uint(instr->src[0].src), false);
2144          } else if (nir_src_is_const(instr->src[1].src)) {
2145             bld.v_mul_imm(Definition(dst), get_alu_src(ctx, instr->src[0]),
2146                           nir_src_as_uint(instr->src[1].src), false);
2147          } else {
2148             emit_vop3a_instruction(ctx, instr, aco_opcode::v_mul_lo_u32, dst);
2149          }
2150       } else if (dst.regClass() == s1) {
2151          emit_sop2_instruction(ctx, instr, aco_opcode::s_mul_i32, dst, false);
2152       } else {
2153          isel_err(&instr->instr, "Unimplemented NIR instr bit size");
2154       }
2155       break;
2156    }
2157    case nir_op_umul_high: {
2158       if (dst.regClass() == s1 && ctx->options->gfx_level >= GFX9) {
2159          emit_sop2_instruction(ctx, instr, aco_opcode::s_mul_hi_u32, dst, false);
2160       } else if (dst.bytes() == 4) {
2161          uint32_t src0_ub = get_alu_src_ub(ctx, instr, 0);
2162          uint32_t src1_ub = get_alu_src_ub(ctx, instr, 1);
2163 
2164          Temp tmp = dst.regClass() == s1 ? bld.tmp(v1) : dst;
2165          if (src0_ub <= 0xffffff && src1_ub <= 0xffffff) {
2166             emit_vop2_instruction(ctx, instr, aco_opcode::v_mul_hi_u32_u24, tmp, true);
2167          } else {
2168             emit_vop3a_instruction(ctx, instr, aco_opcode::v_mul_hi_u32, tmp);
2169          }
2170 
2171          if (dst.regClass() == s1)
2172             bld.pseudo(aco_opcode::p_as_uniform, Definition(dst), tmp);
2173       } else {
2174          isel_err(&instr->instr, "Unimplemented NIR instr bit size");
2175       }
2176       break;
2177    }
2178    case nir_op_imul_high: {
2179       if (dst.regClass() == v1) {
2180          emit_vop3a_instruction(ctx, instr, aco_opcode::v_mul_hi_i32, dst);
2181       } else if (dst.regClass() == s1 && ctx->options->gfx_level >= GFX9) {
2182          emit_sop2_instruction(ctx, instr, aco_opcode::s_mul_hi_i32, dst, false);
2183       } else if (dst.regClass() == s1) {
2184          Temp tmp = bld.vop3(aco_opcode::v_mul_hi_i32, bld.def(v1), get_alu_src(ctx, instr->src[0]),
2185                              as_vgpr(ctx, get_alu_src(ctx, instr->src[1])));
2186          bld.pseudo(aco_opcode::p_as_uniform, Definition(dst), tmp);
2187       } else {
2188          isel_err(&instr->instr, "Unimplemented NIR instr bit size");
2189       }
2190       break;
2191    }
2192    case nir_op_fmul: {
2193       if (dst.regClass() == v2b) {
2194          emit_vop2_instruction(ctx, instr, aco_opcode::v_mul_f16, dst, true);
2195       } else if (dst.regClass() == v1 && instr->def.bit_size == 16) {
2196          emit_vop3p_instruction(ctx, instr, aco_opcode::v_pk_mul_f16, dst);
2197       } else if (dst.regClass() == v1) {
2198          emit_vop2_instruction(ctx, instr, aco_opcode::v_mul_f32, dst, true);
2199       } else if (dst.regClass() == v2) {
2200          emit_vop3a_instruction(ctx, instr, aco_opcode::v_mul_f64_e64, dst);
2201       } else if (dst.regClass() == s1 && instr->def.bit_size == 16) {
2202          emit_sop2_instruction(ctx, instr, aco_opcode::s_mul_f16, dst, false);
2203       } else if (dst.regClass() == s1 && instr->def.bit_size == 32) {
2204          emit_sop2_instruction(ctx, instr, aco_opcode::s_mul_f32, dst, false);
2205       } else {
2206          isel_err(&instr->instr, "Unimplemented NIR instr bit size");
2207       }
2208       break;
2209    }
2210    case nir_op_fmulz: {
2211       if (dst.regClass() == v1) {
2212          emit_vop2_instruction(ctx, instr, aco_opcode::v_mul_legacy_f32, dst, true);
2213       } else {
2214          isel_err(&instr->instr, "Unimplemented NIR instr bit size");
2215       }
2216       break;
2217    }
2218    case nir_op_fadd: {
2219       if (dst.regClass() == v2b) {
2220          emit_vop2_instruction(ctx, instr, aco_opcode::v_add_f16, dst, true);
2221       } else if (dst.regClass() == v1 && instr->def.bit_size == 16) {
2222          emit_vop3p_instruction(ctx, instr, aco_opcode::v_pk_add_f16, dst);
2223       } else if (dst.regClass() == v1) {
2224          emit_vop2_instruction(ctx, instr, aco_opcode::v_add_f32, dst, true);
2225       } else if (dst.regClass() == v2) {
2226          emit_vop3a_instruction(ctx, instr, aco_opcode::v_add_f64_e64, dst);
2227       } else if (dst.regClass() == s1 && instr->def.bit_size == 16) {
2228          emit_sop2_instruction(ctx, instr, aco_opcode::s_add_f16, dst, false);
2229       } else if (dst.regClass() == s1 && instr->def.bit_size == 32) {
2230          emit_sop2_instruction(ctx, instr, aco_opcode::s_add_f32, dst, false);
2231       } else {
2232          isel_err(&instr->instr, "Unimplemented NIR instr bit size");
2233       }
2234       break;
2235    }
2236    case nir_op_fsub: {
2237       if (dst.regClass() == v1 && instr->def.bit_size == 16) {
2238          Instruction* add = emit_vop3p_instruction(ctx, instr, aco_opcode::v_pk_add_f16, dst);
2239          VALU_instruction& sub = add->valu();
2240          sub.neg_lo[1] = true;
2241          sub.neg_hi[1] = true;
2242          break;
2243       }
2244 
2245       Temp src0 = get_alu_src(ctx, instr->src[0]);
2246       Temp src1 = get_alu_src(ctx, instr->src[1]);
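      /* v_subrev (src1 - src0) is used when only the first NIR source is a
       * VGPR, so the other operand can stay in the first VOP2 slot. f64 has no
       * dedicated sub, so v_add_f64 with the neg modifier on the second source
       * is emitted instead. */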
2247       if (dst.regClass() == v2b) {
2248          if (src1.type() == RegType::vgpr || src0.type() != RegType::vgpr)
2249             emit_vop2_instruction(ctx, instr, aco_opcode::v_sub_f16, dst, false);
2250          else
2251             emit_vop2_instruction(ctx, instr, aco_opcode::v_subrev_f16, dst, true);
2252       } else if (dst.regClass() == v1) {
2253          if (src1.type() == RegType::vgpr || src0.type() != RegType::vgpr)
2254             emit_vop2_instruction(ctx, instr, aco_opcode::v_sub_f32, dst, false);
2255          else
2256             emit_vop2_instruction(ctx, instr, aco_opcode::v_subrev_f32, dst, true);
2257       } else if (dst.regClass() == v2) {
2258          Instruction* add = bld.vop3(aco_opcode::v_add_f64_e64, Definition(dst), as_vgpr(ctx, src0),
2259                                      as_vgpr(ctx, src1));
2260          add->valu().neg[1] = true;
2261       } else if (dst.regClass() == s1 && instr->def.bit_size == 16) {
2262          emit_sop2_instruction(ctx, instr, aco_opcode::s_sub_f16, dst, false);
2263       } else if (dst.regClass() == s1 && instr->def.bit_size == 32) {
2264          emit_sop2_instruction(ctx, instr, aco_opcode::s_sub_f32, dst, false);
2265       } else {
2266          isel_err(&instr->instr, "Unimplemented NIR instr bit size");
2267       }
2268       break;
2269    }
2270    case nir_op_ffma: {
2271       if (dst.regClass() == v2b) {
2272          emit_vop3a_instruction(ctx, instr, aco_opcode::v_fma_f16, dst, false, 3);
2273       } else if (dst.regClass() == v1 && instr->def.bit_size == 16) {
2274          assert(instr->def.num_components == 2);
2275 
2276          Temp src0 = as_vgpr(ctx, get_alu_src_vop3p(ctx, instr->src[0]));
2277          Temp src1 = as_vgpr(ctx, get_alu_src_vop3p(ctx, instr->src[1]));
2278          Temp src2 = as_vgpr(ctx, get_alu_src_vop3p(ctx, instr->src[2]));
2279 
2280          /* swizzle to opsel: all swizzles are either 0 (x) or 1 (y) */
2281          unsigned opsel_lo = 0, opsel_hi = 0;
2282          for (unsigned i = 0; i < 3; i++) {
2283             opsel_lo |= (instr->src[i].swizzle[0] & 1) << i;
2284             opsel_hi |= (instr->src[i].swizzle[1] & 1) << i;
2285          }
2286 
2287          bld.vop3p(aco_opcode::v_pk_fma_f16, Definition(dst), src0, src1, src2, opsel_lo, opsel_hi);
2288          emit_split_vector(ctx, dst, 2);
2289       } else if (dst.regClass() == v1) {
2290          emit_vop3a_instruction(ctx, instr, aco_opcode::v_fma_f32, dst,
2291                                 ctx->block->fp_mode.must_flush_denorms32, 3);
2292       } else if (dst.regClass() == v2) {
2293          emit_vop3a_instruction(ctx, instr, aco_opcode::v_fma_f64, dst, false, 3);
2294       } else if (dst.regClass() == s1) {
2295          Temp src0 = get_alu_src(ctx, instr->src[0]);
2296          Temp src1 = get_alu_src(ctx, instr->src[1]);
2297          Temp src2 = get_alu_src(ctx, instr->src[2]);
2298          aco_opcode op =
2299             instr->def.bit_size == 16 ? aco_opcode::s_fmac_f16 : aco_opcode::s_fmac_f32;
2300          bld.sop2(op, Definition(dst), src0, src1, src2);
2301       } else {
2302          isel_err(&instr->instr, "Unimplemented NIR instr bit size");
2303       }
2304       break;
2305    }
2306    case nir_op_ffmaz: {
2307       if (dst.regClass() == v1) {
2308          emit_vop3a_instruction(ctx, instr, aco_opcode::v_fma_legacy_f32, dst,
2309                                 ctx->block->fp_mode.must_flush_denorms32, 3);
2310       } else {
2311          isel_err(&instr->instr, "Unimplemented NIR instr bit size");
2312       }
2313       break;
2314    }
2315    case nir_op_fmax: {
2316       if (dst.regClass() == v2b) {
2317          emit_vop2_instruction(ctx, instr, aco_opcode::v_max_f16, dst, true, false,
2318                                ctx->block->fp_mode.must_flush_denorms16_64);
2319       } else if (dst.regClass() == v1 && instr->def.bit_size == 16) {
2320          emit_vop3p_instruction(ctx, instr, aco_opcode::v_pk_max_f16, dst);
2321       } else if (dst.regClass() == v1) {
2322          emit_vop2_instruction(ctx, instr, aco_opcode::v_max_f32, dst, true, false,
2323                                ctx->block->fp_mode.must_flush_denorms32);
2324       } else if (dst.regClass() == v2) {
2325          emit_vop3a_instruction(ctx, instr, aco_opcode::v_max_f64_e64, dst,
2326                                 ctx->block->fp_mode.must_flush_denorms16_64);
2327       } else if (dst.regClass() == s1 && instr->def.bit_size == 16) {
2328          emit_sop2_instruction(ctx, instr, aco_opcode::s_max_f16, dst, false);
2329       } else if (dst.regClass() == s1 && instr->def.bit_size == 32) {
2330          emit_sop2_instruction(ctx, instr, aco_opcode::s_max_f32, dst, false);
2331       } else {
2332          isel_err(&instr->instr, "Unimplemented NIR instr bit size");
2333       }
2334       break;
2335    }
2336    case nir_op_fmin: {
2337       if (dst.regClass() == v2b) {
2338          emit_vop2_instruction(ctx, instr, aco_opcode::v_min_f16, dst, true, false,
2339                                ctx->block->fp_mode.must_flush_denorms16_64);
2340       } else if (dst.regClass() == v1 && instr->def.bit_size == 16) {
2341          emit_vop3p_instruction(ctx, instr, aco_opcode::v_pk_min_f16, dst, true);
2342       } else if (dst.regClass() == v1) {
2343          emit_vop2_instruction(ctx, instr, aco_opcode::v_min_f32, dst, true, false,
2344                                ctx->block->fp_mode.must_flush_denorms32);
2345       } else if (dst.regClass() == v2) {
2346          emit_vop3a_instruction(ctx, instr, aco_opcode::v_min_f64_e64, dst,
2347                                 ctx->block->fp_mode.must_flush_denorms16_64);
2348       } else if (dst.regClass() == s1 && instr->def.bit_size == 16) {
2349          emit_sop2_instruction(ctx, instr, aco_opcode::s_min_f16, dst, false);
2350       } else if (dst.regClass() == s1 && instr->def.bit_size == 32) {
2351          emit_sop2_instruction(ctx, instr, aco_opcode::s_min_f32, dst, false);
2352       } else {
2353          isel_err(&instr->instr, "Unimplemented NIR instr bit size");
2354       }
2355       break;
2356    }
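   /* Integer dot products map directly to the v_dot* instructions; the bool
    * argument of emit_idot_instruction requests clamping/saturation. GFX11
    * replaces v_dot4_i32_i8 with the mixed-signedness v_dot4_i32_iu8, where the
    * 0x3/0x1 mask marks which of the first two sources are signed. */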
2357    case nir_op_sdot_4x8_iadd: {
2358       if (ctx->options->gfx_level >= GFX11)
2359          emit_idot_instruction(ctx, instr, aco_opcode::v_dot4_i32_iu8, dst, false, 0x3);
2360       else
2361          emit_idot_instruction(ctx, instr, aco_opcode::v_dot4_i32_i8, dst, false);
2362       break;
2363    }
2364    case nir_op_sdot_4x8_iadd_sat: {
2365       if (ctx->options->gfx_level >= GFX11)
2366          emit_idot_instruction(ctx, instr, aco_opcode::v_dot4_i32_iu8, dst, true, 0x3);
2367       else
2368          emit_idot_instruction(ctx, instr, aco_opcode::v_dot4_i32_i8, dst, true);
2369       break;
2370    }
2371    case nir_op_sudot_4x8_iadd: {
2372       emit_idot_instruction(ctx, instr, aco_opcode::v_dot4_i32_iu8, dst, false, 0x1);
2373       break;
2374    }
2375    case nir_op_sudot_4x8_iadd_sat: {
2376       emit_idot_instruction(ctx, instr, aco_opcode::v_dot4_i32_iu8, dst, true, 0x1);
2377       break;
2378    }
2379    case nir_op_udot_4x8_uadd: {
2380       emit_idot_instruction(ctx, instr, aco_opcode::v_dot4_u32_u8, dst, false);
2381       break;
2382    }
2383    case nir_op_udot_4x8_uadd_sat: {
2384       emit_idot_instruction(ctx, instr, aco_opcode::v_dot4_u32_u8, dst, true);
2385       break;
2386    }
2387    case nir_op_sdot_2x16_iadd: {
2388       emit_idot_instruction(ctx, instr, aco_opcode::v_dot2_i32_i16, dst, false);
2389       break;
2390    }
2391    case nir_op_sdot_2x16_iadd_sat: {
2392       emit_idot_instruction(ctx, instr, aco_opcode::v_dot2_i32_i16, dst, true);
2393       break;
2394    }
2395    case nir_op_udot_2x16_uadd: {
2396       emit_idot_instruction(ctx, instr, aco_opcode::v_dot2_u32_u16, dst, false);
2397       break;
2398    }
2399    case nir_op_udot_2x16_uadd_sat: {
2400       emit_idot_instruction(ctx, instr, aco_opcode::v_dot2_u32_u16, dst, true);
2401       break;
2402    }
2403    case nir_op_cube_amd: {
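           /* v_cubema/v_cubesc/v_cubetc/v_cubeid compute the major-axis term, the per-face S/T
            * coordinates and the face index of a cube map lookup from the 3-component direction. */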
2404       Temp in = get_alu_src(ctx, instr->src[0], 3);
2405       Temp src[3] = {emit_extract_vector(ctx, in, 0, v1), emit_extract_vector(ctx, in, 1, v1),
2406                      emit_extract_vector(ctx, in, 2, v1)};
2407       Temp ma = bld.vop3(aco_opcode::v_cubema_f32, bld.def(v1), src[0], src[1], src[2]);
2408       Temp sc = bld.vop3(aco_opcode::v_cubesc_f32, bld.def(v1), src[0], src[1], src[2]);
2409       Temp tc = bld.vop3(aco_opcode::v_cubetc_f32, bld.def(v1), src[0], src[1], src[2]);
2410       Temp id = bld.vop3(aco_opcode::v_cubeid_f32, bld.def(v1), src[0], src[1], src[2]);
2411       bld.pseudo(aco_opcode::p_create_vector, Definition(dst), tc, sc, ma, id);
2412       break;
2413    }
2414    case nir_op_bcsel: {
2415       emit_bcsel(ctx, instr, dst);
2416       break;
2417    }
2418    case nir_op_frsq: {
2419       if (instr->def.bit_size == 16) {
2420          if (dst.regClass() == s1 && ctx->program->gfx_level >= GFX12)
2421             bld.vop3(aco_opcode::v_s_rsq_f16, Definition(dst), get_alu_src(ctx, instr->src[0]));
2422          else
2423             emit_vop1_instruction(ctx, instr, aco_opcode::v_rsq_f16, dst);
2424       } else if (instr->def.bit_size == 32) {
2425          emit_rsq(ctx, bld, Definition(dst), get_alu_src(ctx, instr->src[0]));
2426       } else if (instr->def.bit_size == 64) {
2427          /* Lowered at NIR level for precision reasons. */
2428          emit_vop1_instruction(ctx, instr, aco_opcode::v_rsq_f64, dst);
2429       } else {
2430          isel_err(&instr->instr, "Unimplemented NIR instr bit size");
2431       }
2432       break;
2433    }
2434    case nir_op_fneg: {
2435       if (dst.regClass() == v1 && instr->def.bit_size == 16) {
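              /* Packed f16: negate both halves by multiplying with +1.0 and setting the
               * neg_lo/neg_hi modifiers on the source. */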
2436          Temp src = get_alu_src_vop3p(ctx, instr->src[0]);
2437          Instruction* vop3p =
2438             bld.vop3p(aco_opcode::v_pk_mul_f16, Definition(dst), src, Operand::c16(0x3C00),
2439                       instr->src[0].swizzle[0] & 1, instr->src[0].swizzle[1] & 1);
2440          vop3p->valu().neg_lo[0] = true;
2441          vop3p->valu().neg_hi[0] = true;
2442          emit_split_vector(ctx, dst, 2);
2443          break;
2444       }
2445       Temp src = get_alu_src(ctx, instr->src[0]);
2446       if (dst.regClass() == v2b) {
2447          bld.vop2(aco_opcode::v_mul_f16, Definition(dst), Operand::c16(0xbc00u), as_vgpr(ctx, src));
2448       } else if (dst.regClass() == v1) {
2449          bld.vop2(aco_opcode::v_mul_f32, Definition(dst), Operand::c32(0xbf800000u),
2450                   as_vgpr(ctx, src));
2451       } else if (dst.regClass() == v2) {
2452          if (ctx->block->fp_mode.must_flush_denorms16_64)
2453             src = bld.vop3(aco_opcode::v_mul_f64_e64, bld.def(v2), Operand::c64(0x3FF0000000000000),
2454                            as_vgpr(ctx, src));
2455          Temp upper = bld.tmp(v1), lower = bld.tmp(v1);
2456          bld.pseudo(aco_opcode::p_split_vector, Definition(lower), Definition(upper), src);
2457          upper = bld.vop2(aco_opcode::v_xor_b32, bld.def(v1), Operand::c32(0x80000000u), upper);
2458          bld.pseudo(aco_opcode::p_create_vector, Definition(dst), lower, upper);
2459       } else if (dst.regClass() == s1 && instr->def.bit_size == 16) {
2460          bld.sop2(aco_opcode::s_mul_f16, Definition(dst), Operand::c16(0xbc00u), src);
2461       } else if (dst.regClass() == s1 && instr->def.bit_size == 32) {
2462          bld.sop2(aco_opcode::s_mul_f32, Definition(dst), Operand::c32(0xbf800000u), src);
2463       } else {
2464          isel_err(&instr->instr, "Unimplemented NIR instr bit size");
2465       }
2466       break;
2467    }
2468    case nir_op_fabs: {
2469       if (dst.regClass() == v1 && instr->def.bit_size == 16) {
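              /* Packed f16: |x| = max(x, -x), implemented by negating the second source via
               * the neg_lo/neg_hi modifiers. */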
2470          Temp src = get_alu_src_vop3p(ctx, instr->src[0]);
2471          Instruction* vop3p =
2472             bld.vop3p(aco_opcode::v_pk_max_f16, Definition(dst), src, src,
2473                       instr->src[0].swizzle[0] & 1 ? 3 : 0, instr->src[0].swizzle[1] & 1 ? 3 : 0)
2474                .instr;
2475          vop3p->valu().neg_lo[1] = true;
2476          vop3p->valu().neg_hi[1] = true;
2477          emit_split_vector(ctx, dst, 2);
2478          break;
2479       }
2480       Temp src = get_alu_src(ctx, instr->src[0]);
2481       if (dst.regClass() == v2b) {
2482          Instruction* mul = bld.vop2_e64(aco_opcode::v_mul_f16, Definition(dst),
2483                                          Operand::c16(0x3c00), as_vgpr(ctx, src))
2484                                .instr;
2485          mul->valu().abs[1] = true;
2486       } else if (dst.regClass() == v1) {
2487          Instruction* mul = bld.vop2_e64(aco_opcode::v_mul_f32, Definition(dst),
2488                                          Operand::c32(0x3f800000u), as_vgpr(ctx, src))
2489                                .instr;
2490          mul->valu().abs[1] = true;
2491       } else if (dst.regClass() == v2) {
2492          if (ctx->block->fp_mode.must_flush_denorms16_64)
2493             src = bld.vop3(aco_opcode::v_mul_f64_e64, bld.def(v2), Operand::c64(0x3FF0000000000000),
2494                            as_vgpr(ctx, src));
2495          Temp upper = bld.tmp(v1), lower = bld.tmp(v1);
2496          bld.pseudo(aco_opcode::p_split_vector, Definition(lower), Definition(upper), src);
2497          upper = bld.vop2(aco_opcode::v_and_b32, bld.def(v1), Operand::c32(0x7FFFFFFFu), upper);
2498          bld.pseudo(aco_opcode::p_create_vector, Definition(dst), lower, upper);
2499       } else if (dst.regClass() == s1 && instr->def.bit_size == 16) {
2500          Temp mask = bld.copy(bld.def(s1), Operand::c32(0x7fff));
2501          if (ctx->block->fp_mode.denorm16_64 == fp_denorm_keep) {
2502             bld.sop2(aco_opcode::s_and_b32, Definition(dst), bld.def(s1, scc), mask, src);
2503          } else {
2504             Temp tmp = bld.sop2(aco_opcode::s_and_b32, bld.def(s1), bld.def(s1, scc), mask, src);
2505             bld.sop2(aco_opcode::s_mul_f16, Definition(dst), Operand::c16(0x3c00), tmp);
2506          }
2507       } else if (dst.regClass() == s1 && instr->def.bit_size == 32) {
2508          Temp mask = bld.copy(bld.def(s1), Operand::c32(0x7fffffff));
2509          if (ctx->block->fp_mode.denorm32 == fp_denorm_keep) {
2510             bld.sop2(aco_opcode::s_and_b32, Definition(dst), bld.def(s1, scc), mask, src);
2511          } else {
2512             Temp tmp = bld.sop2(aco_opcode::s_and_b32, bld.def(s1), bld.def(s1, scc), mask, src);
2513             bld.sop2(aco_opcode::s_mul_f32, Definition(dst), Operand::c32(0x3f800000), tmp);
2514          }
2515       } else {
2516          isel_err(&instr->instr, "Unimplemented NIR instr bit size");
2517       }
2518       break;
2519    }
2520    case nir_op_fsat: {
2521       if (dst.regClass() == v1 && instr->def.bit_size == 16) {
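              /* Packed f16: multiply by +1.0 with the clamp modifier to saturate both halves
               * to [0.0, 1.0]. */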
2522          Temp src = get_alu_src_vop3p(ctx, instr->src[0]);
2523          Instruction* vop3p =
2524             bld.vop3p(aco_opcode::v_pk_mul_f16, Definition(dst), src, Operand::c16(0x3C00),
2525                       instr->src[0].swizzle[0] & 1, instr->src[0].swizzle[1] & 1);
2526          vop3p->valu().clamp = true;
2527          emit_split_vector(ctx, dst, 2);
2528          break;
2529       }
2530       Temp src = get_alu_src(ctx, instr->src[0]);
2531       if (dst.regClass() == v2b && ctx->program->gfx_level >= GFX9) {
2532          bld.vop3(aco_opcode::v_med3_f16, Definition(dst), Operand::c16(0u), Operand::c16(0x3c00),
2533                   src);
2534       } else if (dst.regClass() == v2b) {
2535          bld.vop2_e64(aco_opcode::v_mul_f16, Definition(dst), Operand::c16(0x3c00), src)
2536             ->valu()
2537             .clamp = true;
2538       } else if (dst.regClass() == v1) {
2539          bld.vop3(aco_opcode::v_med3_f32, Definition(dst), Operand::zero(),
2540                   Operand::c32(0x3f800000u), src);
2541          /* Apparently it is not necessary to flush denorms when this instruction is used
2542           * with these operands. */
2543          // TODO: confirm that this holds under all circumstances
2544       } else if (dst.regClass() == v2) {
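              /* f64: add +0.0 with the clamp modifier to saturate to [0.0, 1.0]. */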
2545          Instruction* add =
2546             bld.vop3(aco_opcode::v_add_f64_e64, Definition(dst), src, Operand::zero());
2547          add->valu().clamp = true;
2548       } else if (dst.regClass() == s1 && instr->def.bit_size == 16) {
2549          Temp low = bld.sop2(aco_opcode::s_max_f16, bld.def(s1), src, Operand::c16(0));
2550          bld.sop2(aco_opcode::s_min_f16, Definition(dst), low, Operand::c16(0x3C00));
2551       } else if (dst.regClass() == s1 && instr->def.bit_size == 32) {
2552          Temp low = bld.sop2(aco_opcode::s_max_f32, bld.def(s1), src, Operand::c32(0));
2553          bld.sop2(aco_opcode::s_min_f32, Definition(dst), low, Operand::c32(0x3f800000));
2554       } else {
2555          isel_err(&instr->instr, "Unimplemented NIR instr bit size");
2556       }
2557       break;
2558    }
2559    case nir_op_flog2: {
2560       if (instr->def.bit_size == 16) {
2561          if (dst.regClass() == s1 && ctx->program->gfx_level >= GFX12)
2562             bld.vop3(aco_opcode::v_s_log_f16, Definition(dst), get_alu_src(ctx, instr->src[0]));
2563          else
2564             emit_vop1_instruction(ctx, instr, aco_opcode::v_log_f16, dst);
2565       } else if (instr->def.bit_size == 32) {
2566          emit_log2(ctx, bld, Definition(dst), get_alu_src(ctx, instr->src[0]));
2567       } else {
2568          isel_err(&instr->instr, "Unimplemented NIR instr bit size");
2569       }
2570       break;
2571    }
2572    case nir_op_frcp: {
2573       if (instr->def.bit_size == 16) {
2574          if (dst.regClass() == s1 && ctx->program->gfx_level >= GFX12)
2575             bld.vop3(aco_opcode::v_s_rcp_f16, Definition(dst), get_alu_src(ctx, instr->src[0]));
2576          else
2577             emit_vop1_instruction(ctx, instr, aco_opcode::v_rcp_f16, dst);
2578       } else if (instr->def.bit_size == 32) {
2579          emit_rcp(ctx, bld, Definition(dst), get_alu_src(ctx, instr->src[0]));
2580       } else if (instr->def.bit_size == 64) {
2581          /* Lowered at NIR level for precision reasons. */
2582          emit_vop1_instruction(ctx, instr, aco_opcode::v_rcp_f64, dst);
2583       } else {
2584          isel_err(&instr->instr, "Unimplemented NIR instr bit size");
2585       }
2586       break;
2587    }
2588    case nir_op_fexp2: {
2589       if (dst.regClass() == s1 && ctx->options->gfx_level >= GFX12) {
2590          aco_opcode opcode =
2591             instr->def.bit_size == 16 ? aco_opcode::v_s_exp_f16 : aco_opcode::v_s_exp_f32;
2592          bld.vop3(opcode, Definition(dst), get_alu_src(ctx, instr->src[0]));
2593       } else if (instr->def.bit_size == 16) {
2594          emit_vop1_instruction(ctx, instr, aco_opcode::v_exp_f16, dst);
2595       } else if (instr->def.bit_size == 32) {
2596          emit_vop1_instruction(ctx, instr, aco_opcode::v_exp_f32, dst);
2597       } else {
2598          isel_err(&instr->instr, "Unimplemented NIR instr bit size");
2599       }
2600       break;
2601    }
2602    case nir_op_fsqrt: {
2603       if (instr->def.bit_size == 16) {
2604          if (dst.regClass() == s1 && ctx->program->gfx_level >= GFX12)
2605             bld.vop3(aco_opcode::v_s_sqrt_f16, Definition(dst), get_alu_src(ctx, instr->src[0]));
2606          else
2607             emit_vop1_instruction(ctx, instr, aco_opcode::v_sqrt_f16, dst);
2608       } else if (instr->def.bit_size == 32) {
2609          emit_sqrt(ctx, bld, Definition(dst), get_alu_src(ctx, instr->src[0]));
2610       } else if (instr->def.bit_size == 64) {
2611          /* Lowered at NIR level for precision reasons. */
2612          emit_vop1_instruction(ctx, instr, aco_opcode::v_sqrt_f64, dst);
2613       } else {
2614          isel_err(&instr->instr, "Unimplemented NIR instr bit size");
2615       }
2616       break;
2617    }
2618    case nir_op_ffract: {
2619       if (dst.regClass() == v2b) {
2620          emit_vop1_instruction(ctx, instr, aco_opcode::v_fract_f16, dst);
2621       } else if (dst.regClass() == v1) {
2622          emit_vop1_instruction(ctx, instr, aco_opcode::v_fract_f32, dst);
2623       } else if (dst.regClass() == v2) {
2624          emit_vop1_instruction(ctx, instr, aco_opcode::v_fract_f64, dst);
2625       } else if (dst.regClass() == s1) {
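              /* Scalar path: fract(x) = x - floor(x). */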
2626          Temp src = get_alu_src(ctx, instr->src[0]);
2627          aco_opcode op =
2628             instr->def.bit_size == 16 ? aco_opcode::s_floor_f16 : aco_opcode::s_floor_f32;
2629          Temp floor = bld.sop1(op, bld.def(s1), src);
2630          op = instr->def.bit_size == 16 ? aco_opcode::s_sub_f16 : aco_opcode::s_sub_f32;
2631          bld.sop2(op, Definition(dst), src, floor);
2632       } else {
2633          isel_err(&instr->instr, "Unimplemented NIR instr bit size");
2634       }
2635       break;
2636    }
2637    case nir_op_ffloor: {
2638       if (dst.regClass() == v2b) {
2639          emit_vop1_instruction(ctx, instr, aco_opcode::v_floor_f16, dst);
2640       } else if (dst.regClass() == v1) {
2641          emit_vop1_instruction(ctx, instr, aco_opcode::v_floor_f32, dst);
2642       } else if (dst.regClass() == v2) {
2643          Temp src = get_alu_src(ctx, instr->src[0]);
2644          emit_floor_f64(ctx, bld, Definition(dst), src);
2645       } else if (dst.regClass() == s1) {
2646          Temp src = get_alu_src(ctx, instr->src[0]);
2647          aco_opcode op =
2648             instr->def.bit_size == 16 ? aco_opcode::s_floor_f16 : aco_opcode::s_floor_f32;
2649          bld.sop1(op, Definition(dst), src);
2650       } else {
2651          isel_err(&instr->instr, "Unimplemented NIR instr bit size");
2652       }
2653       break;
2654    }
2655    case nir_op_fceil: {
2656       if (dst.regClass() == v2b) {
2657          emit_vop1_instruction(ctx, instr, aco_opcode::v_ceil_f16, dst);
2658       } else if (dst.regClass() == v1) {
2659          emit_vop1_instruction(ctx, instr, aco_opcode::v_ceil_f32, dst);
2660       } else if (dst.regClass() == v2) {
2661          if (ctx->options->gfx_level >= GFX7) {
2662             emit_vop1_instruction(ctx, instr, aco_opcode::v_ceil_f64, dst);
2663          } else {
2664             /* GFX6 doesn't support V_CEIL_F64, lower it. */
2665             /* trunc = trunc(src0)
2666              * if (src0 > 0.0 && src0 != trunc)
2667              *    trunc += 1.0
2668              */
2669             Temp src0 = get_alu_src(ctx, instr->src[0]);
2670             Temp trunc = emit_trunc_f64(ctx, bld, bld.def(v2), src0);
2671             Temp tmp0 =
2672                bld.vopc_e64(aco_opcode::v_cmp_gt_f64, bld.def(bld.lm), src0, Operand::zero());
2673             Temp tmp1 = bld.vopc(aco_opcode::v_cmp_lg_f64, bld.def(bld.lm), src0, trunc);
2674             Temp cond = bld.sop2(aco_opcode::s_and_b64, bld.def(s2), bld.def(s1, scc), tmp0, tmp1);
2675             Temp add = bld.vop2(aco_opcode::v_cndmask_b32, bld.def(v1),
2676                                 bld.copy(bld.def(v1), Operand::zero()),
2677                                 bld.copy(bld.def(v1), Operand::c32(0x3ff00000u)), cond);
2678             add = bld.pseudo(aco_opcode::p_create_vector, bld.def(v2),
2679                              bld.copy(bld.def(v1), Operand::zero()), add);
2680             bld.vop3(aco_opcode::v_add_f64_e64, Definition(dst), trunc, add);
2681          }
2682       } else if (dst.regClass() == s1) {
2683          Temp src = get_alu_src(ctx, instr->src[0]);
2684          aco_opcode op =
2685             instr->def.bit_size == 16 ? aco_opcode::s_ceil_f16 : aco_opcode::s_ceil_f32;
2686          bld.sop1(op, Definition(dst), src);
2687       } else {
2688          isel_err(&instr->instr, "Unimplemented NIR instr bit size");
2689       }
2690       break;
2691    }
2692    case nir_op_ftrunc: {
2693       if (dst.regClass() == v2b) {
2694          emit_vop1_instruction(ctx, instr, aco_opcode::v_trunc_f16, dst);
2695       } else if (dst.regClass() == v1) {
2696          emit_vop1_instruction(ctx, instr, aco_opcode::v_trunc_f32, dst);
2697       } else if (dst.regClass() == v2) {
2698          Temp src = get_alu_src(ctx, instr->src[0]);
2699          emit_trunc_f64(ctx, bld, Definition(dst), src);
2700       } else if (dst.regClass() == s1) {
2701          Temp src = get_alu_src(ctx, instr->src[0]);
2702          aco_opcode op =
2703             instr->def.bit_size == 16 ? aco_opcode::s_trunc_f16 : aco_opcode::s_trunc_f32;
2704          bld.sop1(op, Definition(dst), src);
2705       } else {
2706          isel_err(&instr->instr, "Unimplemented NIR instr bit size");
2707       }
2708       break;
2709    }
2710    case nir_op_fround_even: {
2711       if (dst.regClass() == v2b) {
2712          emit_vop1_instruction(ctx, instr, aco_opcode::v_rndne_f16, dst);
2713       } else if (dst.regClass() == v1) {
2714          emit_vop1_instruction(ctx, instr, aco_opcode::v_rndne_f32, dst);
2715       } else if (dst.regClass() == v2) {
2716          if (ctx->options->gfx_level >= GFX7) {
2717             emit_vop1_instruction(ctx, instr, aco_opcode::v_rndne_f64, dst);
2718          } else {
2719             /* GFX6 doesn't support V_RNDNE_F64, lower it. */
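                 /* Round to nearest-even by adding and subtracting +-2^52: v_bfi copies the
                  * sign of the source into the 0x43300000 (2^52) high dword. Sources with
                  * |x| >= 2^52 are already integral and are passed through unchanged by the
                  * final v_cndmask. */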
2720             Temp src0_lo = bld.tmp(v1), src0_hi = bld.tmp(v1);
2721             Temp src0 = get_alu_src(ctx, instr->src[0]);
2722             bld.pseudo(aco_opcode::p_split_vector, Definition(src0_lo), Definition(src0_hi), src0);
2723 
2724             Temp bitmask = bld.sop1(aco_opcode::s_brev_b32, bld.def(s1),
2725                                     bld.copy(bld.def(s1), Operand::c32(-2u)));
2726             Temp bfi =
2727                bld.vop3(aco_opcode::v_bfi_b32, bld.def(v1), bitmask,
2728                         bld.copy(bld.def(v1), Operand::c32(0x43300000u)), as_vgpr(ctx, src0_hi));
2729             Temp tmp =
2730                bld.vop3(aco_opcode::v_add_f64_e64, bld.def(v2), src0,
2731                         bld.pseudo(aco_opcode::p_create_vector, bld.def(v2), Operand::zero(), bfi));
2732             Instruction* sub =
2733                bld.vop3(aco_opcode::v_add_f64_e64, bld.def(v2), tmp,
2734                         bld.pseudo(aco_opcode::p_create_vector, bld.def(v2), Operand::zero(), bfi));
2735             sub->valu().neg[1] = true;
2736             tmp = sub->definitions[0].getTemp();
2737 
2738             Temp v = bld.pseudo(aco_opcode::p_create_vector, bld.def(v2), Operand::c32(-1u),
2739                                 Operand::c32(0x432fffffu));
2740             Instruction* vop3 = bld.vopc_e64(aco_opcode::v_cmp_gt_f64, bld.def(bld.lm), src0, v);
2741             vop3->valu().abs[0] = true;
2742             Temp cond = vop3->definitions[0].getTemp();
2743 
2744             Temp tmp_lo = bld.tmp(v1), tmp_hi = bld.tmp(v1);
2745             bld.pseudo(aco_opcode::p_split_vector, Definition(tmp_lo), Definition(tmp_hi), tmp);
2746             Temp dst0 = bld.vop2_e64(aco_opcode::v_cndmask_b32, bld.def(v1), tmp_lo,
2747                                      as_vgpr(ctx, src0_lo), cond);
2748             Temp dst1 = bld.vop2_e64(aco_opcode::v_cndmask_b32, bld.def(v1), tmp_hi,
2749                                      as_vgpr(ctx, src0_hi), cond);
2750 
2751             bld.pseudo(aco_opcode::p_create_vector, Definition(dst), dst0, dst1);
2752          }
2753       } else if (dst.regClass() == s1) {
2754          Temp src = get_alu_src(ctx, instr->src[0]);
2755          aco_opcode op =
2756             instr->def.bit_size == 16 ? aco_opcode::s_rndne_f16 : aco_opcode::s_rndne_f32;
2757          bld.sop1(op, Definition(dst), src);
2758       } else {
2759          isel_err(&instr->instr, "Unimplemented NIR instr bit size");
2760       }
2761       break;
2762    }
2763    case nir_op_fsin_amd:
2764    case nir_op_fcos_amd: {
2765       if (instr->def.bit_size == 16 || instr->def.bit_size == 32) {
2766          bool is_sin = instr->op == nir_op_fsin_amd;
2767          aco_opcode opcode, fract;
2768          RegClass rc;
2769          if (instr->def.bit_size == 16) {
2770             opcode = is_sin ? aco_opcode::v_sin_f16 : aco_opcode::v_cos_f16;
2771             fract = aco_opcode::v_fract_f16;
2772             rc = v2b;
2773          } else {
2774             opcode = is_sin ? aco_opcode::v_sin_f32 : aco_opcode::v_cos_f32;
2775             fract = aco_opcode::v_fract_f32;
2776             rc = v1;
2777          }
2778 
2779          Temp src = get_alu_src(ctx, instr->src[0]);
2780          /* before GFX9, v_sin and v_cos had a valid input domain of [-256, +256] */
2781          if (ctx->options->gfx_level < GFX9)
2782             src = bld.vop1(fract, bld.def(rc), src);
2783 
2784          if (dst.regClass() == rc) {
2785             bld.vop1(opcode, Definition(dst), src);
2786          } else {
2787             Temp tmp = bld.vop1(opcode, bld.def(rc), src);
2788             bld.pseudo(aco_opcode::p_as_uniform, Definition(dst), tmp);
2789          }
2790       } else {
2791          isel_err(&instr->instr, "Unimplemented NIR instr bit size");
2792       }
2793       break;
2794    }
2795    case nir_op_ldexp: {
2796       if (dst.regClass() == v2b) {
2797          emit_vop2_instruction(ctx, instr, aco_opcode::v_ldexp_f16, dst, false);
2798       } else if (dst.regClass() == v1) {
2799          emit_vop3a_instruction(ctx, instr, aco_opcode::v_ldexp_f32, dst);
2800       } else if (dst.regClass() == v2) {
2801          emit_vop3a_instruction(ctx, instr, aco_opcode::v_ldexp_f64, dst);
2802       } else {
2803          isel_err(&instr->instr, "Unimplemented NIR instr bit size");
2804       }
2805       break;
2806    }
2807    case nir_op_frexp_sig: {
2808       if (dst.regClass() == v2b) {
2809          emit_vop1_instruction(ctx, instr, aco_opcode::v_frexp_mant_f16, dst);
2810       } else if (dst.regClass() == v1) {
2811          emit_vop1_instruction(ctx, instr, aco_opcode::v_frexp_mant_f32, dst);
2812       } else if (dst.regClass() == v2) {
2813          emit_vop1_instruction(ctx, instr, aco_opcode::v_frexp_mant_f64, dst);
2814       } else {
2815          isel_err(&instr->instr, "Unimplemented NIR instr bit size");
2816       }
2817       break;
2818    }
2819    case nir_op_frexp_exp: {
2820       if (instr->src[0].src.ssa->bit_size == 16) {
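              /* The f16 exponent returned by v_frexp_exp_i16_f16 fits in a signed byte;
               * extract the low byte and sign-extend it to 32 bits. */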
2821          Temp src = get_alu_src(ctx, instr->src[0]);
2822          Temp tmp = bld.vop1(aco_opcode::v_frexp_exp_i16_f16, bld.def(v1), src);
2823          tmp = bld.pseudo(aco_opcode::p_extract_vector, bld.def(v1b), tmp, Operand::zero());
2824          convert_int(ctx, bld, tmp, 8, 32, true, dst);
2825       } else if (instr->src[0].src.ssa->bit_size == 32) {
2826          emit_vop1_instruction(ctx, instr, aco_opcode::v_frexp_exp_i32_f32, dst);
2827       } else if (instr->src[0].src.ssa->bit_size == 64) {
2828          emit_vop1_instruction(ctx, instr, aco_opcode::v_frexp_exp_i32_f64, dst);
2829       } else {
2830          isel_err(&instr->instr, "Unimplemented NIR instr bit size");
2831       }
2832       break;
2833    }
2834    case nir_op_fsign: {
2835       Temp src = get_alu_src(ctx, instr->src[0]);
2836       if (dst.regClass() == v2b) {
2837          /* replace negative zero with positive zero */
2838          src = bld.vop2(aco_opcode::v_add_f16, bld.def(v2b), Operand::zero(), as_vgpr(ctx, src));
2839          if (ctx->program->gfx_level >= GFX9) {
2840             src = bld.vop3(aco_opcode::v_med3_i16, bld.def(v2b), Operand::c16(-1), src,
2841                            Operand::c16(1u));
2842             bld.vop1(aco_opcode::v_cvt_f16_i16, Definition(dst), src);
2843          } else {
2844             src = convert_int(ctx, bld, src, 16, 32, true);
2845             src = bld.vop3(aco_opcode::v_med3_i32, bld.def(v1), Operand::c32(-1), src,
2846                            Operand::c32(1u));
2847             bld.vop1(aco_opcode::v_cvt_f16_i16, Definition(dst), src);
2848          }
2849       } else if (dst.regClass() == v1) {
2850          /* Legacy multiply with +Inf means +-0.0 becomes +0.0 and all other numbers become
2851           * the correctly signed Inf. After that, we only need to clamp between -1.0 and +1.0.
2852           */
2853          Temp inf = bld.copy(bld.def(s1), Operand::c32(0x7f800000));
2854          src = bld.vop2(aco_opcode::v_mul_legacy_f32, bld.def(v1), inf, as_vgpr(ctx, src));
2855          bld.vop3(aco_opcode::v_med3_f32, Definition(dst), Operand::c32(0x3f800000), src,
2856                   Operand::c32(0xbf800000));
2857       } else if (dst.regClass() == v2) {
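              /* f64: build the high dword directly: +1.0 for positive inputs, -1.0 for
               * negative ones and the source's own high dword for +-0.0 (preserving the sign
               * of zero). The low dword is always zero. */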
2858          src = as_vgpr(ctx, src);
2859          Temp cond = bld.vopc(aco_opcode::v_cmp_nlt_f64, bld.def(bld.lm), Operand::zero(), src);
2860          Temp tmp = bld.copy(bld.def(v1), Operand::c32(0x3FF00000u));
2861          Temp upper = bld.vop2_e64(aco_opcode::v_cndmask_b32, bld.def(v1), tmp,
2862                                    emit_extract_vector(ctx, src, 1, v1), cond);
2863 
2864          cond = bld.vopc(aco_opcode::v_cmp_le_f64, bld.def(bld.lm), Operand::zero(), src);
2865          tmp = bld.copy(bld.def(v1), Operand::c32(0xBFF00000u));
2866          upper = bld.vop2(aco_opcode::v_cndmask_b32, bld.def(v1), tmp, upper, cond);
2867 
2868          bld.pseudo(aco_opcode::p_create_vector, Definition(dst), Operand::zero(), upper);
2869       } else if (dst.regClass() == s1 && instr->def.bit_size == 16) {
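              /* Scalar path: select +1.0 where src > 0, then replace anything that is still
               * not >= 0 with -1.0; zeroes keep their value. */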
2870          Temp cond = bld.sopc(aco_opcode::s_cmp_lt_f16, bld.def(s1, scc), Operand::c16(0), src);
2871          src = bld.sop2(aco_opcode::s_cselect_b32, bld.def(s1), Operand::c32(0x3c00), src,
2872                         bld.scc(cond));
2873          cond = bld.sopc(aco_opcode::s_cmp_ge_f16, bld.def(s1, scc), src, Operand::c16(0));
2874          bld.sop2(aco_opcode::s_cselect_b32, Definition(dst), src, Operand::c32(0xbc00),
2875                   bld.scc(cond));
2876       } else if (dst.regClass() == s1 && instr->def.bit_size == 32) {
2877          Temp cond = bld.sopc(aco_opcode::s_cmp_lt_f32, bld.def(s1, scc), Operand::c32(0), src);
2878          src = bld.sop2(aco_opcode::s_cselect_b32, bld.def(s1), Operand::c32(0x3f800000), src,
2879                         bld.scc(cond));
2880          cond = bld.sopc(aco_opcode::s_cmp_ge_f32, bld.def(s1, scc), src, Operand::c32(0));
2881          bld.sop2(aco_opcode::s_cselect_b32, Definition(dst), src, Operand::c32(0xbf800000),
2882                   bld.scc(cond));
2883       } else {
2884          isel_err(&instr->instr, "Unimplemented NIR instr bit size");
2885       }
2886       break;
2887    }
2888    case nir_op_f2f16:
2889    case nir_op_f2f16_rtne: {
2890       assert(instr->src[0].src.ssa->bit_size == 32);
2891       if (instr->def.num_components == 2) {
2892          /* Vectorizing f2f16 is only possible with rtz. */
2893          assert(instr->op != nir_op_f2f16_rtne);
2894          assert(ctx->block->fp_mode.round16_64 == fp_round_tz ||
2895                 !ctx->block->fp_mode.care_about_round16_64);
2896          emit_vec2_f2f16(ctx, instr, dst);
2897          break;
2898       }
2899       Temp src = get_alu_src(ctx, instr->src[0]);
2900       if (instr->op == nir_op_f2f16_rtne && ctx->block->fp_mode.round16_64 != fp_round_ne) {
2901          /* We emit s_round_mode/s_setreg_imm32 in lower_to_hw_instr to
2902           * keep value numbering and the scheduler simpler.
2903           */
2904          if (dst.regClass() == v2b)
2905             bld.vop1(aco_opcode::p_v_cvt_f16_f32_rtne, Definition(dst), src);
2906          else
2907             bld.sop1(aco_opcode::p_s_cvt_f16_f32_rtne, Definition(dst), src);
2908       } else {
2909          if (dst.regClass() == v2b)
2910             bld.vop1(aco_opcode::v_cvt_f16_f32, Definition(dst), src);
2911          else
2912             bld.sop1(aco_opcode::s_cvt_f16_f32, Definition(dst), src);
2913       }
2914       break;
2915    }
2916    case nir_op_f2f16_rtz: {
2917       assert(instr->src[0].src.ssa->bit_size == 32);
2918       if (instr->def.num_components == 2) {
2919          emit_vec2_f2f16(ctx, instr, dst);
2920          break;
2921       }
2922       Temp src = get_alu_src(ctx, instr->src[0]);
2923       if (ctx->block->fp_mode.round16_64 == fp_round_tz) {
2924          if (dst.regClass() == v2b)
2925             bld.vop1(aco_opcode::v_cvt_f16_f32, Definition(dst), src);
2926          else
2927             bld.sop1(aco_opcode::s_cvt_f16_f32, Definition(dst), src);
2928       } else if (dst.regClass() == s1) {
2929          bld.sop2(aco_opcode::s_cvt_pk_rtz_f16_f32, Definition(dst), src, Operand::zero());
2930       } else if (ctx->program->gfx_level == GFX8 || ctx->program->gfx_level == GFX9) {
2931          bld.vop3(aco_opcode::v_cvt_pkrtz_f16_f32_e64, Definition(dst), src, Operand::zero());
2932       } else {
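                 /* v_cvt_pkrtz rounds towards zero; only the low half of the packed result
                  * feeds the 16-bit destination, and the second source just has to be a VGPR
                  * for the VOP2 encoding. */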
2933          bld.vop2(aco_opcode::v_cvt_pkrtz_f16_f32, Definition(dst), src, as_vgpr(ctx, src));
2934       }
2935       break;
2936    }
2937    case nir_op_f2f32: {
2938       if (dst.regClass() == s1) {
2939          assert(instr->src[0].src.ssa->bit_size == 16);
2940          Temp src = get_alu_src(ctx, instr->src[0]);
2941          bld.sop1(aco_opcode::s_cvt_f32_f16, Definition(dst), src);
2942       } else if (instr->src[0].src.ssa->bit_size == 16) {
2943          emit_vop1_instruction(ctx, instr, aco_opcode::v_cvt_f32_f16, dst);
2944       } else if (instr->src[0].src.ssa->bit_size == 64) {
2945          emit_vop1_instruction(ctx, instr, aco_opcode::v_cvt_f32_f64, dst);
2946       } else {
2947          isel_err(&instr->instr, "Unimplemented NIR instr bit size");
2948       }
2949       break;
2950    }
2951    case nir_op_f2f64: {
2952       assert(instr->src[0].src.ssa->bit_size == 32);
2953       Temp src = get_alu_src(ctx, instr->src[0]);
2954       bld.vop1(aco_opcode::v_cvt_f64_f32, Definition(dst), src);
2955       break;
2956    }
2957    case nir_op_i2f16: {
2958       Temp src = get_alu_src(ctx, instr->src[0]);
2959       const unsigned input_size = instr->src[0].src.ssa->bit_size;
2960       if (dst.regClass() == v2b) {
2961          if (input_size <= 16) {
2962             /* Expand integer to the size expected by the int→float converter used below */
2963             unsigned target_size = (ctx->program->gfx_level >= GFX8 ? 16 : 32);
2964             if (input_size != target_size) {
2965                src = convert_int(ctx, bld, src, input_size, target_size, true);
2966             }
2967          }
2968 
2969          if (ctx->program->gfx_level >= GFX8 && input_size <= 16) {
2970             bld.vop1(aco_opcode::v_cvt_f16_i16, Definition(dst), src);
2971          } else {
2972             /* Large 32-bit inputs need to return +-inf/FLOAT_MAX.
2973              *
2974              * This is also the fallback path taken on GFX7 and earlier, which
2975              * do not support direct f16⟷i16 conversions.
2976              */
2977             src = bld.vop1(aco_opcode::v_cvt_f32_i32, bld.def(v1), src);
2978             bld.vop1(aco_opcode::v_cvt_f16_f32, Definition(dst), src);
2979          }
2980       } else if (dst.regClass() == s1) {
2981          if (input_size <= 16) {
2982             src = convert_int(ctx, bld, src, input_size, 32, true);
2983          }
2984          src = bld.sop1(aco_opcode::s_cvt_f32_i32, bld.def(s1), src);
2985          bld.sop1(aco_opcode::s_cvt_f16_f32, Definition(dst), src);
2986       } else {
2987          isel_err(&instr->instr, "Unimplemented NIR instr bit size");
2988       }
2989       break;
2990    }
2991    case nir_op_i2f32: {
2992       assert(dst.size() == 1);
2993       Temp src = get_alu_src(ctx, instr->src[0]);
2994       const unsigned input_size = instr->src[0].src.ssa->bit_size;
2995       if (input_size <= 32) {
2996          if (input_size <= 16) {
2997             /* Sign-extend to 32 bits */
2998             src = convert_int(ctx, bld, src, input_size, 32, true);
2999          }
3000          if (dst.regClass() == v1)
3001             bld.vop1(aco_opcode::v_cvt_f32_i32, Definition(dst), src);
3002          else
3003             bld.sop1(aco_opcode::s_cvt_f32_i32, Definition(dst), src);
3004       } else {
3005          isel_err(&instr->instr, "Unimplemented NIR instr bit size");
3006       }
3007       break;
3008    }
3009    case nir_op_i2f64: {
3010       if (instr->src[0].src.ssa->bit_size <= 32) {
3011          Temp src = get_alu_src(ctx, instr->src[0]);
3012          if (instr->src[0].src.ssa->bit_size <= 16)
3013             src = convert_int(ctx, bld, src, instr->src[0].src.ssa->bit_size, 32, true);
3014          bld.vop1(aco_opcode::v_cvt_f64_i32, Definition(dst), src);
3015       } else {
3016          isel_err(&instr->instr, "Unimplemented NIR instr bit size");
3017       }
3018       break;
3019    }
3020    case nir_op_u2f16: {
3021       Temp src = get_alu_src(ctx, instr->src[0]);
3022       const unsigned input_size = instr->src[0].src.ssa->bit_size;
3023       if (dst.regClass() == v2b) {
3024          if (input_size <= 16) {
3025             /* Expand integer to the size expected by the uint→float converter used below */
3026             unsigned target_size = (ctx->program->gfx_level >= GFX8 ? 16 : 32);
3027             if (input_size != target_size) {
3028                src = convert_int(ctx, bld, src, input_size, target_size, false);
3029             }
3030          }
3031 
3032          if (ctx->program->gfx_level >= GFX8 && input_size <= 16) {
3033             bld.vop1(aco_opcode::v_cvt_f16_u16, Definition(dst), src);
3034          } else {
3035             /* Large 32-bit inputs need to return inf/FLOAT_MAX.
3036              *
3037              * This is also the fallback path taken on GFX7 and earlier, which
3038              * do not support direct f16⟷u16 conversions.
3039              */
3040             src = bld.vop1(aco_opcode::v_cvt_f32_u32, bld.def(v1), src);
3041             bld.vop1(aco_opcode::v_cvt_f16_f32, Definition(dst), src);
3042          }
3043       } else if (dst.regClass() == s1) {
3044          if (input_size <= 16) {
3045             src = convert_int(ctx, bld, src, input_size, 32, false);
3046          }
3047          src = bld.sop1(aco_opcode::s_cvt_f32_u32, bld.def(s1), src);
3048          bld.sop1(aco_opcode::s_cvt_f16_f32, Definition(dst), src);
3049       } else {
3050          isel_err(&instr->instr, "Unimplemented NIR instr bit size");
3051       }
3052       break;
3053    }
3054    case nir_op_u2f32: {
3055       assert(dst.size() == 1);
3056       Temp src = get_alu_src(ctx, instr->src[0]);
3057       const unsigned input_size = instr->src[0].src.ssa->bit_size;
3058       if (input_size == 8 && dst.regClass() == v1) {
3059          bld.vop1(aco_opcode::v_cvt_f32_ubyte0, Definition(dst), src);
3060       } else if (input_size <= 32) {
3061          if (input_size <= 16)
3062             src = convert_int(ctx, bld, src, instr->src[0].src.ssa->bit_size, 32, false);
3063          if (dst.regClass() == v1)
3064             bld.vop1(aco_opcode::v_cvt_f32_u32, Definition(dst), src);
3065          else
3066             bld.sop1(aco_opcode::s_cvt_f32_u32, Definition(dst), src);
3067       } else {
3068          isel_err(&instr->instr, "Unimplemented NIR instr bit size");
3069       }
3070       break;
3071    }
3072    case nir_op_u2f64: {
3073       if (instr->src[0].src.ssa->bit_size <= 32) {
3074          Temp src = get_alu_src(ctx, instr->src[0]);
3075          if (instr->src[0].src.ssa->bit_size <= 16)
3076             src = convert_int(ctx, bld, src, instr->src[0].src.ssa->bit_size, 32, false);
3077          bld.vop1(aco_opcode::v_cvt_f64_u32, Definition(dst), src);
3078       } else {
3079          isel_err(&instr->instr, "Unimplemented NIR instr bit size");
3080       }
3081       break;
3082    }
3083    case nir_op_f2i8:
3084    case nir_op_f2i16: {
3085       if (instr->src[0].src.ssa->bit_size <= 32 && dst.regClass() == s1 &&
3086           ctx->program->gfx_level >= GFX11_5) {
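              /* GFX11.5+ has scalar float->int conversions: widen f16 sources to f32 first,
               * then use s_cvt_i32_f32. */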
3087          Temp src = get_alu_src(ctx, instr->src[0]);
3088          Temp tmp = bld.as_uniform(src);
3089          if (instr->src[0].src.ssa->bit_size == 16)
3090             tmp = bld.sop1(aco_opcode::s_cvt_f32_f16, bld.def(s1), tmp);
3091          bld.sop1(aco_opcode::s_cvt_i32_f32, Definition(dst), tmp);
3092       } else if (instr->src[0].src.ssa->bit_size == 16) {
3093          if (ctx->program->gfx_level >= GFX8) {
3094             emit_vop1_instruction(ctx, instr, aco_opcode::v_cvt_i16_f16, dst);
3095          } else {
3096             /* GFX7 and earlier do not support direct f16⟷i16 conversions */
3097             Temp tmp = bld.tmp(v1);
3098             emit_vop1_instruction(ctx, instr, aco_opcode::v_cvt_f32_f16, tmp);
3099             tmp = bld.vop1(aco_opcode::v_cvt_i32_f32, bld.def(v1), tmp);
3100             tmp = convert_int(ctx, bld, tmp, 32, instr->def.bit_size, false,
3101                               (dst.type() == RegType::sgpr) ? Temp() : dst);
3102             if (dst.type() == RegType::sgpr) {
3103                bld.pseudo(aco_opcode::p_as_uniform, Definition(dst), tmp);
3104             }
3105          }
3106       } else if (instr->src[0].src.ssa->bit_size == 32) {
3107          emit_vop1_instruction(ctx, instr, aco_opcode::v_cvt_i32_f32, dst);
3108       } else {
3109          emit_vop1_instruction(ctx, instr, aco_opcode::v_cvt_i32_f64, dst);
3110       }
3111       break;
3112    }
3113    case nir_op_f2u8:
3114    case nir_op_f2u16: {
3115       if (instr->src[0].src.ssa->bit_size <= 32 && dst.regClass() == s1 &&
3116           ctx->program->gfx_level >= GFX11_5) {
3117          Temp src = get_alu_src(ctx, instr->src[0]);
3118          Temp tmp = bld.as_uniform(src);
3119          if (instr->src[0].src.ssa->bit_size == 16)
3120             tmp = bld.sop1(aco_opcode::s_cvt_f32_f16, bld.def(s1), tmp);
3121          bld.sop1(aco_opcode::s_cvt_u32_f32, Definition(dst), tmp);
3122       } else if (instr->src[0].src.ssa->bit_size == 16) {
3123          if (ctx->program->gfx_level >= GFX8) {
3124             emit_vop1_instruction(ctx, instr, aco_opcode::v_cvt_u16_f16, dst);
3125          } else {
3126             /* GFX7 and earlier do not support direct f16⟷u16 conversions */
3127             Temp tmp = bld.tmp(v1);
3128             emit_vop1_instruction(ctx, instr, aco_opcode::v_cvt_f32_f16, tmp);
3129             tmp = bld.vop1(aco_opcode::v_cvt_u32_f32, bld.def(v1), tmp);
3130             tmp = convert_int(ctx, bld, tmp, 32, instr->def.bit_size, false,
3131                               (dst.type() == RegType::sgpr) ? Temp() : dst);
3132             if (dst.type() == RegType::sgpr) {
3133                bld.pseudo(aco_opcode::p_as_uniform, Definition(dst), tmp);
3134             }
3135          }
3136       } else if (instr->src[0].src.ssa->bit_size == 32) {
3137          if (dst.regClass() == v1b && ctx->program->gfx_level >= GFX11)
3138             bld.vop3(aco_opcode::p_v_cvt_pk_u8_f32, Definition(dst),
3139                      get_alu_src(ctx, instr->src[0]));
3140          else
3141             emit_vop1_instruction(ctx, instr, aco_opcode::v_cvt_u32_f32, dst);
3142       } else {
3143          emit_vop1_instruction(ctx, instr, aco_opcode::v_cvt_u32_f64, dst);
3144       }
3145       break;
3146    }
3147    case nir_op_f2i32: {
3148       Temp src = get_alu_src(ctx, instr->src[0]);
3149       if (instr->src[0].src.ssa->bit_size <= 32 && dst.regClass() == s1 &&
3150           ctx->program->gfx_level >= GFX11_5) {
3151          Temp tmp = bld.as_uniform(src);
3152          if (instr->src[0].src.ssa->bit_size == 16)
3153             tmp = bld.sop1(aco_opcode::s_cvt_f32_f16, bld.def(s1), tmp);
3154          bld.sop1(aco_opcode::s_cvt_i32_f32, Definition(dst), tmp);
3155       } else if (instr->src[0].src.ssa->bit_size == 16) {
3156          Temp tmp = bld.vop1(aco_opcode::v_cvt_f32_f16, bld.def(v1), src);
3157          if (dst.type() == RegType::vgpr) {
3158             bld.vop1(aco_opcode::v_cvt_i32_f32, Definition(dst), tmp);
3159          } else {
3160             bld.pseudo(aco_opcode::p_as_uniform, Definition(dst),
3161                        bld.vop1(aco_opcode::v_cvt_i32_f32, bld.def(v1), tmp));
3162          }
3163       } else if (instr->src[0].src.ssa->bit_size == 32) {
3164          emit_vop1_instruction(ctx, instr, aco_opcode::v_cvt_i32_f32, dst);
3165       } else if (instr->src[0].src.ssa->bit_size == 64) {
3166          emit_vop1_instruction(ctx, instr, aco_opcode::v_cvt_i32_f64, dst);
3167       } else {
3168          isel_err(&instr->instr, "Unimplemented NIR instr bit size");
3169       }
3170       break;
3171    }
3172    case nir_op_f2u32: {
3173       Temp src = get_alu_src(ctx, instr->src[0]);
3174       if (instr->src[0].src.ssa->bit_size <= 32 && dst.regClass() == s1 &&
3175           ctx->program->gfx_level >= GFX11_5) {
3176          Temp tmp = bld.as_uniform(src);
3177          if (instr->src[0].src.ssa->bit_size == 16)
3178             tmp = bld.sop1(aco_opcode::s_cvt_f32_f16, bld.def(s1), tmp);
3179          bld.sop1(aco_opcode::s_cvt_u32_f32, Definition(dst), tmp);
3180       } else if (instr->src[0].src.ssa->bit_size == 16) {
3181          Temp tmp = bld.vop1(aco_opcode::v_cvt_f32_f16, bld.def(v1), src);
3182          if (dst.type() == RegType::vgpr) {
3183             bld.vop1(aco_opcode::v_cvt_u32_f32, Definition(dst), tmp);
3184          } else {
3185             bld.pseudo(aco_opcode::p_as_uniform, Definition(dst),
3186                        bld.vop1(aco_opcode::v_cvt_u32_f32, bld.def(v1), tmp));
3187          }
3188       } else if (instr->src[0].src.ssa->bit_size == 32) {
3189          emit_vop1_instruction(ctx, instr, aco_opcode::v_cvt_u32_f32, dst);
3190       } else if (instr->src[0].src.ssa->bit_size == 64) {
3191          emit_vop1_instruction(ctx, instr, aco_opcode::v_cvt_u32_f64, dst);
3192       } else {
3193          isel_err(&instr->instr, "Unimplemented NIR instr bit size");
3194       }
3195       break;
3196    }
3197    case nir_op_b2f16: {
3198       Temp src = get_alu_src(ctx, instr->src[0]);
3199       assert(src.regClass() == bld.lm);
3200 
3201       if (dst.regClass() == s1) {
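              /* Scalar: turn the lane mask into a 0/1 value and multiply it by the f16 bit
               * pattern of 1.0 (0x3c00). */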
3202          src = bool_to_scalar_condition(ctx, src);
3203          bld.sop2(aco_opcode::s_mul_i32, Definition(dst), Operand::c32(0x3c00u), src);
3204       } else if (dst.regClass() == v2b) {
3205          Temp one = bld.copy(bld.def(v1), Operand::c32(0x3c00u));
3206          bld.vop2(aco_opcode::v_cndmask_b32, Definition(dst), Operand::zero(), one, src);
3207       } else {
3208          unreachable("Wrong destination register class for nir_op_b2f16.");
3209       }
3210       break;
3211    }
3212    case nir_op_b2f32: {
3213       Temp src = get_alu_src(ctx, instr->src[0]);
3214       assert(src.regClass() == bld.lm);
3215 
3216       if (dst.regClass() == s1) {
3217          src = bool_to_scalar_condition(ctx, src);
3218          bld.sop2(aco_opcode::s_mul_i32, Definition(dst), Operand::c32(0x3f800000u), src);
3219       } else if (dst.regClass() == v1) {
3220          bld.vop2_e64(aco_opcode::v_cndmask_b32, Definition(dst), Operand::zero(),
3221                       Operand::c32(0x3f800000u), src);
3222       } else {
3223          unreachable("Wrong destination register class for nir_op_b2f32.");
3224       }
3225       break;
3226    }
3227    case nir_op_b2f64: {
3228       Temp src = get_alu_src(ctx, instr->src[0]);
3229       assert(src.regClass() == bld.lm);
3230 
3231       if (dst.regClass() == s2) {
3232          src = bool_to_scalar_condition(ctx, src);
3233          bld.sop2(aco_opcode::s_cselect_b64, Definition(dst), Operand::c32(0x3f800000u),
3234                   Operand::zero(), bld.scc(src));
3235       } else if (dst.regClass() == v2) {
3236          Temp one = bld.copy(bld.def(v1), Operand::c32(0x3FF00000u));
3237          Temp upper =
3238             bld.vop2_e64(aco_opcode::v_cndmask_b32, bld.def(v1), Operand::zero(), one, src);
3239          bld.pseudo(aco_opcode::p_create_vector, Definition(dst), Operand::zero(), upper);
3240       } else {
3241          unreachable("Wrong destination register class for nir_op_b2f64.");
3242       }
3243       break;
3244    }
3245    case nir_op_i2i8:
3246    case nir_op_i2i16:
3247    case nir_op_i2i32: {
3248       if (dst.type() == RegType::sgpr && instr->src[0].src.ssa->bit_size < 32) {
3249          /* no need to do the extract in get_alu_src() */
3250          sgpr_extract_mode mode = instr->def.bit_size > instr->src[0].src.ssa->bit_size
3251                                      ? sgpr_extract_sext
3252                                      : sgpr_extract_undef;
3253          extract_8_16_bit_sgpr_element(ctx, dst, &instr->src[0], mode);
3254       } else {
3255          const unsigned input_bitsize = instr->src[0].src.ssa->bit_size;
3256          const unsigned output_bitsize = instr->def.bit_size;
3257          convert_int(ctx, bld, get_alu_src(ctx, instr->src[0]), input_bitsize, output_bitsize,
3258                      output_bitsize > input_bitsize, dst);
3259       }
3260       break;
3261    }
3262    case nir_op_u2u8:
3263    case nir_op_u2u16:
3264    case nir_op_u2u32: {
3265       if (dst.type() == RegType::sgpr && instr->src[0].src.ssa->bit_size < 32) {
3266          /* no need to do the extract in get_alu_src() */
3267          sgpr_extract_mode mode = instr->def.bit_size > instr->src[0].src.ssa->bit_size
3268                                      ? sgpr_extract_zext
3269                                      : sgpr_extract_undef;
3270          extract_8_16_bit_sgpr_element(ctx, dst, &instr->src[0], mode);
3271       } else {
3272          convert_int(ctx, bld, get_alu_src(ctx, instr->src[0]), instr->src[0].src.ssa->bit_size,
3273                      instr->def.bit_size, false, dst);
3274       }
3275       break;
3276    }
3277    case nir_op_b2b32:
3278    case nir_op_b2i8:
3279    case nir_op_b2i16:
3280    case nir_op_b2i32: {
3281       Temp src = get_alu_src(ctx, instr->src[0]);
3282       assert(src.regClass() == bld.lm);
3283 
3284       if (dst.regClass() == s1) {
3285          bool_to_scalar_condition(ctx, src, dst);
3286       } else if (dst.type() == RegType::vgpr) {
3287          bld.vop2_e64(aco_opcode::v_cndmask_b32, Definition(dst), Operand::zero(), Operand::c32(1u),
3288                       src);
3289       } else {
3290          unreachable("Invalid register class for b2i32");
3291       }
3292       break;
3293    }
3294    case nir_op_b2b1: {
3295       Temp src = get_alu_src(ctx, instr->src[0]);
3296       assert(dst.regClass() == bld.lm);
3297 
3298       if (src.type() == RegType::vgpr) {
3299          assert(src.regClass() == v1 || src.regClass() == v2);
3300          assert(dst.regClass() == bld.lm);
3301          bld.vopc(src.size() == 2 ? aco_opcode::v_cmp_lg_u64 : aco_opcode::v_cmp_lg_u32,
3302                   Definition(dst), Operand::zero(), src);
3303       } else {
3304          assert(src.regClass() == s1 || src.regClass() == s2);
3305          Temp tmp;
3306          if (src.regClass() == s2 && ctx->program->gfx_level <= GFX7) {
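                 /* s_cmp_lg_u64 is not available before GFX8, so OR the value with zero and
                  * use the SCC result instead. */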
3307             tmp =
3308                bld.sop2(aco_opcode::s_or_b64, bld.def(s2), bld.def(s1, scc), Operand::zero(), src)
3309                   .def(1)
3310                   .getTemp();
3311          } else {
3312             tmp = bld.sopc(src.size() == 2 ? aco_opcode::s_cmp_lg_u64 : aco_opcode::s_cmp_lg_u32,
3313                            bld.scc(bld.def(s1)), Operand::zero(), src);
3314          }
3315          bool_to_vector_condition(ctx, tmp, dst);
3316       }
3317       break;
3318    }
3319    case nir_op_unpack_64_2x32:
3320    case nir_op_unpack_32_2x16:
3321    case nir_op_unpack_64_4x16:
3322    case nir_op_unpack_32_4x8:
3323       bld.copy(Definition(dst), get_alu_src(ctx, instr->src[0]));
3324       emit_split_vector(
3325          ctx, dst, instr->op == nir_op_unpack_32_4x8 || instr->op == nir_op_unpack_64_4x16 ? 4 : 2);
3326       break;
3327    case nir_op_pack_64_2x32_split: {
3328       Operand src[2];
3329       RegClass elem_rc = dst.regClass() == s2 ? s1 : v1;
3330       for (unsigned i = 0; i < 2; i++) {
3331          if (nir_src_is_undef(instr->src[i].src))
3332             src[i] = Operand(elem_rc);
3333          else
3334             src[i] = Operand(get_alu_src(ctx, instr->src[i]));
3335       }
3336 
3337       bld.pseudo(aco_opcode::p_create_vector, Definition(dst), src[0], src[1]);
3338       break;
3339    }
3340    case nir_op_unpack_64_2x32_split_x:
3341       bld.pseudo(aco_opcode::p_split_vector, Definition(dst), bld.def(dst.regClass()),
3342                  get_alu_src(ctx, instr->src[0]));
3343       break;
3344    case nir_op_unpack_64_2x32_split_y:
3345       bld.pseudo(aco_opcode::p_split_vector, bld.def(dst.regClass()), Definition(dst),
3346                  get_alu_src(ctx, instr->src[0]));
3347       break;
3348    case nir_op_unpack_32_2x16_split_x:
3349       if (dst.type() == RegType::vgpr) {
3350          bld.pseudo(aco_opcode::p_split_vector, Definition(dst), bld.def(dst.regClass()),
3351                     get_alu_src(ctx, instr->src[0]));
3352       } else {
3353          bld.copy(Definition(dst), get_alu_src(ctx, instr->src[0]));
3354       }
3355       break;
3356    case nir_op_unpack_32_2x16_split_y:
3357       if (dst.type() == RegType::vgpr) {
3358          bld.pseudo(aco_opcode::p_split_vector, bld.def(dst.regClass()), Definition(dst),
3359                     get_alu_src(ctx, instr->src[0]));
3360       } else {
3361          bld.pseudo(aco_opcode::p_extract, Definition(dst), bld.def(s1, scc),
3362                     get_alu_src(ctx, instr->src[0]), Operand::c32(1u), Operand::c32(16u),
3363                     Operand::zero());
3364       }
3365       break;
3366    case nir_op_pack_32_2x16_split: {
3367       Operand src0 = Operand(get_alu_src(ctx, instr->src[0]));
3368       Operand src1 = Operand(get_alu_src(ctx, instr->src[1]));
3369       if (dst.regClass() == v1) {
3370          if (nir_src_is_undef(instr->src[0].src))
3371             src0 = Operand(v2b);
3372          else
3373             src0 = Operand(emit_extract_vector(ctx, src0.getTemp(), 0, v2b));
3374 
3375          if (nir_src_is_undef(instr->src[1].src))
3376             src1 = Operand(v2b);
3377          else
3378             src1 = Operand(emit_extract_vector(ctx, src1.getTemp(), 0, v2b));
3379 
3380          bld.pseudo(aco_opcode::p_create_vector, Definition(dst), src0, src1);
3381       } else if (nir_src_is_undef(instr->src[1].src)) {
3382          bld.copy(Definition(dst), src0);
3383       } else if (nir_src_is_undef(instr->src[0].src)) {
3384          bld.pseudo(aco_opcode::p_insert, Definition(dst), bld.def(s1, scc), src1, Operand::c32(1),
3385                     Operand::c32(16));
3386       } else if (ctx->program->gfx_level >= GFX9) {
3387          bld.sop2(aco_opcode::s_pack_ll_b32_b16, Definition(dst), src0, src1);
3388       } else {
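              /* No s_pack_ll_b32_b16 before GFX9: mask the low half and shift/OR the halves
               * together manually. */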
3389          src0 = bld.sop2(aco_opcode::s_and_b32, bld.def(s1), bld.def(s1, scc), src0,
3390                          Operand::c32(0xFFFFu));
3391          src1 = bld.sop2(aco_opcode::s_lshl_b32, bld.def(s1), bld.def(s1, scc), src1,
3392                          Operand::c32(16u));
3393          bld.sop2(aco_opcode::s_or_b32, Definition(dst), bld.def(s1, scc), src0, src1);
3394       }
3395       break;
3396    }
3397    case nir_op_pack_32_4x8: bld.copy(Definition(dst), get_alu_src(ctx, instr->src[0], 4)); break;
3398    case nir_op_pack_half_2x16_rtz_split:
3399    case nir_op_pack_half_2x16_split: {
3400       if (dst.regClass() == v1) {
3401          if (ctx->program->gfx_level == GFX8 || ctx->program->gfx_level == GFX9)
3402             emit_vop3a_instruction(ctx, instr, aco_opcode::v_cvt_pkrtz_f16_f32_e64, dst);
3403          else
3404             emit_vop2_instruction(ctx, instr, aco_opcode::v_cvt_pkrtz_f16_f32, dst, false);
3405       } else if (dst.regClass() == s1) {
3406          emit_sop2_instruction(ctx, instr, aco_opcode::s_cvt_pk_rtz_f16_f32, dst, false);
3407       } else {
3408          isel_err(&instr->instr, "Unimplemented NIR instr bit size");
3409       }
3410       break;
3411    }
3412    case nir_op_pack_unorm_2x16:
3413    case nir_op_pack_snorm_2x16: {
3414       unsigned bit_size = instr->src[0].src.ssa->bit_size;
3415       /* Only 16 and 32-bit sources are supported. */
3416       assert(bit_size == 32 || bit_size == 16);
3417 
3418       RegClass src_rc = bit_size == 32 ? v1 : v2b;
3419       Temp src = get_alu_src(ctx, instr->src[0], 2);
3420       Temp src0 = emit_extract_vector(ctx, src, 0, src_rc);
3421       Temp src1 = emit_extract_vector(ctx, src, 1, src_rc);
3422 
3423       /* Workaround for pre-GFX9 GPUs, which don't have an fp16 pknorm instruction. */
3424       if (bit_size == 16 && ctx->program->gfx_level < GFX9) {
3425          src0 = bld.vop1(aco_opcode::v_cvt_f32_f16, bld.def(v1), src0);
3426          src1 = bld.vop1(aco_opcode::v_cvt_f32_f16, bld.def(v1), src1);
3427          bit_size = 32;
3428       }
3429 
3430       aco_opcode opcode;
3431       if (bit_size == 32) {
3432          opcode = instr->op == nir_op_pack_unorm_2x16 ? aco_opcode::v_cvt_pknorm_u16_f32
3433                                                       : aco_opcode::v_cvt_pknorm_i16_f32;
3434       } else {
3435          opcode = instr->op == nir_op_pack_unorm_2x16 ? aco_opcode::v_cvt_pknorm_u16_f16
3436                                                       : aco_opcode::v_cvt_pknorm_i16_f16;
3437       }
3438       bld.vop3(opcode, Definition(dst), src0, src1);
3439       break;
3440    }
3441    case nir_op_pack_uint_2x16:
3442    case nir_op_pack_sint_2x16: {
3443       Temp src = get_alu_src(ctx, instr->src[0], 2);
3444       Temp src0 = emit_extract_vector(ctx, src, 0, v1);
3445       Temp src1 = emit_extract_vector(ctx, src, 1, v1);
3446       aco_opcode opcode = instr->op == nir_op_pack_uint_2x16 ? aco_opcode::v_cvt_pk_u16_u32
3447                                                              : aco_opcode::v_cvt_pk_i16_i32;
3448       bld.vop3(opcode, Definition(dst), src0, src1);
3449       break;
3450    }
3451    case nir_op_unpack_half_2x16_split_x: {
3452       Temp src = get_alu_src(ctx, instr->src[0]);
3453       if (dst.regClass() == s1) {
3454          bld.sop1(aco_opcode::s_cvt_f32_f16, Definition(dst), src);
3455          break;
3456       }
3457       if (src.regClass() == v1)
3458          src = bld.pseudo(aco_opcode::p_split_vector, bld.def(v2b), bld.def(v2b), src);
3459       if (dst.regClass() == v1) {
3460          bld.vop1(aco_opcode::v_cvt_f32_f16, Definition(dst), src);
3461       } else {
3462          isel_err(&instr->instr, "Unimplemented NIR instr bit size");
3463       }
3464       break;
3465    }
3466    case nir_op_unpack_half_2x16_split_y: {
3467       Temp src = get_alu_src(ctx, instr->src[0]);
3468       if (dst.regClass() == s1) {
3469          bld.sop1(aco_opcode::s_cvt_hi_f32_f16, Definition(dst), src);
3470          break;
3471       }
3472       if (src.regClass() == s1)
3473          src = bld.pseudo(aco_opcode::p_extract, bld.def(s1), bld.def(s1, scc), src,
3474                           Operand::c32(1u), Operand::c32(16u), Operand::zero());
3475       else
3476          src =
3477             bld.pseudo(aco_opcode::p_split_vector, bld.def(v2b), bld.def(v2b), src).def(1).getTemp();
3478       if (dst.regClass() == v1) {
3479          bld.vop1(aco_opcode::v_cvt_f32_f16, Definition(dst), src);
3480       } else {
3481          isel_err(&instr->instr, "Unimplemented NIR instr bit size");
3482       }
3483       break;
3484    }
3485    case nir_op_msad_4x8: {
3486       assert(dst.regClass() == v1);
3487       emit_vop3a_instruction(ctx, instr, aco_opcode::v_msad_u8, dst, false, 3u, true);
3488       break;
3489    }
3490    case nir_op_mqsad_4x8: {
3491       assert(dst.regClass() == v4);
3492       Temp ref = get_alu_src(ctx, instr->src[0]);
3493       Temp src = get_alu_src(ctx, instr->src[1], 2);
3494       Temp accum = get_alu_src(ctx, instr->src[2], 4);
3495       bld.vop3(aco_opcode::v_mqsad_u32_u8, Definition(dst), as_vgpr(ctx, src), as_vgpr(ctx, ref),
3496                as_vgpr(ctx, accum));
3497       emit_split_vector(ctx, dst, 4);
3498       break;
3499    }
3500    case nir_op_shfr: {
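      /* Funnel shift right. On the SALU path below, src1 forms the low dword and src0 the
       * high dword of a 64-bit value, which is shifted right and truncated to 32 bits. */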
3501       if (dst.regClass() == s1) {
3502          Temp src0 = get_alu_src(ctx, instr->src[0]);
3503          Temp src1 = get_alu_src(ctx, instr->src[1]);
3504 
3505          Temp amount;
3506          if (nir_src_is_const(instr->src[2].src)) {
3507             unsigned camount = nir_src_as_uint(instr->src[2].src) & 0x1f;
3508             if (camount == 16 && ctx->program->gfx_level >= GFX11) {
3509                bld.sop2(aco_opcode::s_pack_hl_b32_b16, Definition(dst), src1, src0);
3510                break;
3511             }
3512             amount = bld.copy(bld.def(s1), Operand::c32(camount));
3513          } else if (get_alu_src_ub(ctx, instr, 2) >= 32) {
3514             amount = bld.sop2(aco_opcode::s_and_b32, bld.def(s1), bld.def(s1, scc),
3515                               get_alu_src(ctx, instr->src[2]), Operand::c32(0x1f));
3516          } else {
3517             amount = get_alu_src(ctx, instr->src[2]);
3518          }
3519 
3520          Temp src = bld.pseudo(aco_opcode::p_create_vector, bld.def(s2), src1, src0);
3521 
3522          Temp res = bld.sop2(aco_opcode::s_lshr_b64, bld.def(s2), bld.def(s1, scc), src, amount);
3523          bld.pseudo(aco_opcode::p_extract_vector, Definition(dst), res, Operand::zero());
3524       } else if (dst.regClass() == v1) {
3525          emit_vop3a_instruction(ctx, instr, aco_opcode::v_alignbit_b32, dst, false, 3u);
3526       } else {
3527          isel_err(&instr->instr, "Unimplemented NIR instr bit size");
3528       }
3529       break;
3530    }
3531    case nir_op_alignbyte_amd: {
3532       if (dst.regClass() == v1) {
3533          emit_vop3a_instruction(ctx, instr, aco_opcode::v_alignbyte_b32, dst, false, 3u);
3534       } else {
3535          isel_err(&instr->instr, "Unimplemented NIR instr bit size");
3536       }
3537       break;
3538    }
3539    case nir_op_fquantize2f16: {
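      /* Quantize by converting to f16 and back to f32. If the f16 denormal mode flushes,
       * the round trip is enough; otherwise denormal results are flushed to zero manually,
       * keeping the sign when signed zeros must be preserved. */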
3540       Temp src = get_alu_src(ctx, instr->src[0]);
3541       if (dst.regClass() == v1) {
3542          Temp f16;
3543          if (ctx->block->fp_mode.round16_64 != fp_round_ne)
3544             f16 = bld.vop1(aco_opcode::p_v_cvt_f16_f32_rtne, bld.def(v2b), src);
3545          else
3546             f16 = bld.vop1(aco_opcode::v_cvt_f16_f32, bld.def(v2b), src);
3547 
3548          if (ctx->block->fp_mode.denorm16_64 != fp_denorm_keep) {
3549             bld.vop1(aco_opcode::v_cvt_f32_f16, Definition(dst), f16);
3550             break;
3551          }
3552 
3553          Temp denorm_zero;
3554          Temp f32 = bld.vop1(aco_opcode::v_cvt_f32_f16, bld.def(v1), f16);
3555          if (ctx->program->gfx_level >= GFX8) {
3556             /* value is a negative/positive denormal or zero */
3557             Instruction* tmp0 =
3558                bld.vopc_e64(aco_opcode::v_cmp_class_f16, bld.def(bld.lm), f16, Operand::c32(0x30));
3559             tmp0->valu().abs[0] = true;
3560             tmp0->valu().neg[0] = true;
3561             denorm_zero = tmp0->definitions[0].getTemp();
3562          } else {
3563             /* 0x38800000 is the smallest normal half-float value (2^-14) as a 32-bit float,
3564              * so compare the result and flush to 0 if it's smaller.
3565              */
3566             Temp smallest = bld.copy(bld.def(s1), Operand::c32(0x38800000u));
3567             Instruction* tmp0 =
3568                bld.vopc_e64(aco_opcode::v_cmp_lt_f32, bld.def(bld.lm), f32, smallest);
3569             tmp0->valu().abs[0] = true;
3570             denorm_zero = tmp0->definitions[0].getTemp();
3571          }
3572          if (nir_alu_instr_is_signed_zero_preserve(instr)) {
3573             Temp copysign_0 =
3574                bld.vop2(aco_opcode::v_mul_f32, bld.def(v1), Operand::zero(), as_vgpr(ctx, src));
3575             bld.vop2(aco_opcode::v_cndmask_b32, Definition(dst), f32, copysign_0, denorm_zero);
3576          } else {
3577             bld.vop2_e64(aco_opcode::v_cndmask_b32, Definition(dst), f32, Operand::zero(),
3578                          denorm_zero);
3579          }
3580       } else if (dst.regClass() == s1) {
3581          Temp f16;
3582          if (ctx->block->fp_mode.round16_64 != fp_round_ne)
3583             f16 = bld.sop1(aco_opcode::p_s_cvt_f16_f32_rtne, bld.def(s1), src);
3584          else
3585             f16 = bld.sop1(aco_opcode::s_cvt_f16_f32, bld.def(s1), src);
3586 
3587          if (ctx->block->fp_mode.denorm16_64 != fp_denorm_keep) {
3588             bld.sop1(aco_opcode::s_cvt_f32_f16, Definition(dst), f16);
3589          } else {
3590             Temp f32 = bld.sop1(aco_opcode::s_cvt_f32_f16, bld.def(s1), f16);
3591             Temp abs_mask = bld.copy(bld.def(s1), Operand::c32(0x7fffffff));
3592             Temp abs =
3593                bld.sop2(aco_opcode::s_and_b32, bld.def(s1), bld.def(s1, scc), f32, abs_mask);
3594             Operand sign;
3595             if (nir_alu_instr_is_signed_zero_preserve(instr)) {
3596                sign =
3597                   bld.sop2(aco_opcode::s_andn2_b32, bld.def(s1), bld.def(s1, scc), f32, abs_mask);
3598             } else {
3599                sign = Operand::c32(0);
3600             }
3601             Temp smallest = bld.copy(bld.def(s1), Operand::c32(0x38800000u));
3602             Temp denorm_zero = bld.sopc(aco_opcode::s_cmp_lt_u32, bld.def(s1, scc), abs, smallest);
3603             bld.sop2(aco_opcode::s_cselect_b32, Definition(dst), sign, f32, bld.scc(denorm_zero));
3604          }
3605       } else {
3606          isel_err(&instr->instr, "Unimplemented NIR instr bit size");
3607       }
3608       break;
3609    }
3610    case nir_op_bfm: {
3611       Temp bits = get_alu_src(ctx, instr->src[0]);
3612       Temp offset = get_alu_src(ctx, instr->src[1]);
3613 
3614       if (dst.regClass() == s1) {
3615          bld.sop2(aco_opcode::s_bfm_b32, Definition(dst), bits, offset);
3616       } else if (dst.regClass() == v1) {
3617          bld.vop3(aco_opcode::v_bfm_b32, Definition(dst), bits, offset);
3618       } else {
3619          isel_err(&instr->instr, "Unimplemented NIR instr bit size");
3620       }
3621       break;
3622    }
3623    case nir_op_bitfield_select: {
3624 
3625       /* dst = (insert & bitmask) | (base & ~bitmask) */
3626       if (dst.regClass() == s1) {
3627          Temp bitmask = get_alu_src(ctx, instr->src[0]);
3628          Temp insert = get_alu_src(ctx, instr->src[1]);
3629          Temp base = get_alu_src(ctx, instr->src[2]);
3630          aco_ptr<Instruction> sop2;
3631          nir_const_value* const_bitmask = nir_src_as_const_value(instr->src[0].src);
3632          nir_const_value* const_insert = nir_src_as_const_value(instr->src[1].src);
3633 
3634          if (const_bitmask && ctx->program->gfx_level >= GFX9 &&
3635              (const_bitmask->u32 == 0xffff || const_bitmask->u32 == 0xffff0000)) {
3636             if (const_bitmask->u32 == 0xffff) {
3637                bld.sop2(aco_opcode::s_pack_lh_b32_b16, Definition(dst), insert, base);
3638             } else {
3639                bld.sop2(aco_opcode::s_pack_lh_b32_b16, Definition(dst), base, insert);
3640             }
3641             break;
3642          }
3643 
3644          Operand lhs;
3645          if (const_insert && const_bitmask) {
3646             lhs = Operand::c32(const_insert->u32 & const_bitmask->u32);
3647          } else {
3648             insert =
3649                bld.sop2(aco_opcode::s_and_b32, bld.def(s1), bld.def(s1, scc), insert, bitmask);
3650             lhs = Operand(insert);
3651          }
3652 
3653          Operand rhs;
3654          nir_const_value* const_base = nir_src_as_const_value(instr->src[2].src);
3655          if (const_base && const_bitmask) {
3656             rhs = Operand::c32(const_base->u32 & ~const_bitmask->u32);
3657          } else {
3658             base = bld.sop2(aco_opcode::s_andn2_b32, bld.def(s1), bld.def(s1, scc), base, bitmask);
3659             rhs = Operand(base);
3660          }
3661 
3662          bld.sop2(aco_opcode::s_or_b32, Definition(dst), bld.def(s1, scc), rhs, lhs);
3663 
3664       } else if (dst.regClass() == v1) {
3665          emit_vop3a_instruction(ctx, instr, aco_opcode::v_bfi_b32, dst, false, 3);
3666       } else {
3667          isel_err(&instr->instr, "Unimplemented NIR instr bit size");
3668       }
3669       break;
3670    }
3671    case nir_op_ubfe:
3672    case nir_op_ibfe: {
3673       if (dst.bytes() != 4)
3674          unreachable("Unsupported BFE bit size");
3675 
3676       if (dst.type() == RegType::sgpr) {
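         /* s_bfe takes the field offset in the low bits of its second source and the
          * field width starting at bit 16. */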
3677          Temp base = get_alu_src(ctx, instr->src[0]);
3678 
3679          nir_const_value* const_offset = nir_src_as_const_value(instr->src[1].src);
3680          nir_const_value* const_bits = nir_src_as_const_value(instr->src[2].src);
3681          aco_opcode opcode =
3682             instr->op == nir_op_ubfe ? aco_opcode::s_bfe_u32 : aco_opcode::s_bfe_i32;
3683          if (const_offset && const_bits) {
3684             uint32_t extract = ((const_bits->u32 & 0x1f) << 16) | (const_offset->u32 & 0x1f);
3685             bld.sop2(opcode, Definition(dst), bld.def(s1, scc), base, Operand::c32(extract));
3686             break;
3687          }
3688 
3689          Temp offset = get_alu_src(ctx, instr->src[1]);
3690          Temp bits = get_alu_src(ctx, instr->src[2]);
3691 
3692          if (ctx->program->gfx_level >= GFX9) {
3693             Operand bits_op = const_bits ? Operand::c32(const_bits->u32 & 0x1f)
3694                                          : bld.sop2(aco_opcode::s_and_b32, bld.def(s1),
3695                                                     bld.def(s1, scc), bits, Operand::c32(0x1fu));
3696             Temp extract = bld.sop2(aco_opcode::s_pack_ll_b32_b16, bld.def(s1), offset, bits_op);
3697             bld.sop2(opcode, Definition(dst), bld.def(s1, scc), base, extract);
3698          } else if (instr->op == nir_op_ubfe) {
3699             Temp mask = bld.sop2(aco_opcode::s_bfm_b32, bld.def(s1), bits, offset);
3700             Temp masked =
3701                bld.sop2(aco_opcode::s_and_b32, bld.def(s1), bld.def(s1, scc), base, mask);
3702             bld.sop2(aco_opcode::s_lshr_b32, Definition(dst), bld.def(s1, scc), masked, offset);
3703          } else {
3704             Operand bits_op = const_bits
3705                                  ? Operand::c32((const_bits->u32 & 0x1f) << 16)
3706                                  : bld.sop2(aco_opcode::s_lshl_b32, bld.def(s1), bld.def(s1, scc),
3707                                             bld.sop2(aco_opcode::s_and_b32, bld.def(s1),
3708                                                      bld.def(s1, scc), bits, Operand::c32(0x1fu)),
3709                                             Operand::c32(16u));
3710             Operand offset_op = const_offset
3711                                    ? Operand::c32(const_offset->u32 & 0x1fu)
3712                                    : bld.sop2(aco_opcode::s_and_b32, bld.def(s1), bld.def(s1, scc),
3713                                               offset, Operand::c32(0x1fu));
3714 
3715             Temp extract =
3716                bld.sop2(aco_opcode::s_or_b32, bld.def(s1), bld.def(s1, scc), bits_op, offset_op);
3717             bld.sop2(aco_opcode::s_bfe_i32, Definition(dst), bld.def(s1, scc), base, extract);
3718          }
3719 
3720       } else {
3721          aco_opcode opcode =
3722             instr->op == nir_op_ubfe ? aco_opcode::v_bfe_u32 : aco_opcode::v_bfe_i32;
3723          emit_vop3a_instruction(ctx, instr, opcode, dst, false, 3);
3724       }
3725       break;
3726    }
3727    case nir_op_extract_u8:
3728    case nir_op_extract_i8:
3729    case nir_op_extract_u16:
3730    case nir_op_extract_i16: {
3731       bool is_signed = instr->op == nir_op_extract_i16 || instr->op == nir_op_extract_i8;
3732       unsigned comp = instr->op == nir_op_extract_u8 || instr->op == nir_op_extract_i8 ? 4 : 2;
3733       uint32_t bits = comp == 4 ? 8 : 16;
3734       unsigned index = nir_src_as_uint(instr->src[1].src);
3735       if (bits >= instr->def.bit_size || index * bits >= instr->def.bit_size) {
3736          assert(index == 0);
3737          bld.copy(Definition(dst), get_alu_src(ctx, instr->src[0]));
3738       } else if (dst.regClass() == s1 && instr->def.bit_size == 16) {
3739          Temp vec = get_ssa_temp(ctx, instr->src[0].src.ssa);
3740          unsigned swizzle = instr->src[0].swizzle[0];
3741          if (vec.size() > 1) {
3742             vec = emit_extract_vector(ctx, vec, swizzle / 2, s1);
3743             swizzle = swizzle & 1;
3744          }
3745          index += swizzle * instr->def.bit_size / bits;
3746          bld.pseudo(aco_opcode::p_extract, Definition(dst), bld.def(s1, scc), Operand(vec),
3747                     Operand::c32(index), Operand::c32(bits), Operand::c32(is_signed));
3748       } else if (dst.regClass() == s1) {
3749          Temp src = get_alu_src(ctx, instr->src[0]);
3750          bld.pseudo(aco_opcode::p_extract, Definition(dst), bld.def(s1, scc), Operand(src),
3751                     Operand::c32(index), Operand::c32(bits), Operand::c32(is_signed));
3752       } else if (dst.regClass() == s2) {
3753          Temp src = get_alu_src(ctx, instr->src[0]);
3754          aco_opcode op = is_signed ? aco_opcode::s_bfe_i64 : aco_opcode::s_bfe_u64;
3755          Temp extract = bld.copy(bld.def(s1), Operand::c32((bits << 16) | (index * bits)));
3756          bld.sop2(op, Definition(dst), bld.def(s1, scc), src, extract);
3757       } else {
3758          assert(dst.regClass().type() == RegType::vgpr);
3759          Temp src = get_alu_src(ctx, instr->src[0]);
3760          Definition def(dst);
3761 
3762          if (dst.bytes() == 8) {
3763             src = emit_extract_vector(ctx, src, index / comp, v1);
3764             index %= comp;
3765             def = bld.def(v1);
3766          }
3767 
3768          assert(def.bytes() <= 4);
3769          src = emit_extract_vector(ctx, src, 0, def.regClass());
3770          bld.pseudo(aco_opcode::p_extract, def, Operand(src), Operand::c32(index),
3771                     Operand::c32(bits), Operand::c32(is_signed));
3772 
3773          if (dst.size() == 2) {
3774             Temp lo = def.getTemp();
3775             Operand hi = Operand::zero();
3776             if (is_signed)
3777                hi = bld.vop2(aco_opcode::v_ashrrev_i32, bld.def(v1), Operand::c32(31), lo);
3778             bld.pseudo(aco_opcode::p_create_vector, Definition(dst), lo, hi);
3779          }
3780       }
3781       break;
3782    }
3783    case nir_op_insert_u8:
3784    case nir_op_insert_u16: {
3785       unsigned comp = instr->op == nir_op_insert_u8 ? 4 : 2;
3786       uint32_t bits = comp == 4 ? 8 : 16;
3787       unsigned index = nir_src_as_uint(instr->src[1].src);
3788       if (bits >= instr->def.bit_size || index * bits >= instr->def.bit_size) {
3789          assert(index == 0);
3790          bld.copy(Definition(dst), get_alu_src(ctx, instr->src[0]));
3791       } else {
3792          Temp src = get_alu_src(ctx, instr->src[0]);
3793          Definition def(dst);
3794          bool swap = false;
3795          if (dst.bytes() == 8) {
3796             src = emit_extract_vector(ctx, src, 0u, RegClass(src.type(), 1));
3797             swap = index >= comp;
3798             index %= comp;
3799             def = bld.def(src.type(), 1);
3800          }
3801          if (def.regClass() == s1) {
3802             bld.pseudo(aco_opcode::p_insert, def, bld.def(s1, scc), Operand(src),
3803                        Operand::c32(index), Operand::c32(bits));
3804          } else {
3805             src = emit_extract_vector(ctx, src, 0, def.regClass());
3806             bld.pseudo(aco_opcode::p_insert, def, Operand(src), Operand::c32(index),
3807                        Operand::c32(bits));
3808          }
3809          if (dst.size() == 2 && swap)
3810             bld.pseudo(aco_opcode::p_create_vector, Definition(dst), Operand::zero(),
3811                        def.getTemp());
3812          else if (dst.size() == 2)
3813             bld.pseudo(aco_opcode::p_create_vector, Definition(dst), def.getTemp(),
3814                        Operand::zero());
3815       }
3816       break;
3817    }
3818    case nir_op_bit_count: {
3819       Temp src = get_alu_src(ctx, instr->src[0]);
3820       if (src.regClass() == s1) {
3821          bld.sop1(aco_opcode::s_bcnt1_i32_b32, Definition(dst), bld.def(s1, scc), src);
3822       } else if (src.regClass() == v1) {
3823          bld.vop3(aco_opcode::v_bcnt_u32_b32, Definition(dst), src, Operand::zero());
3824       } else if (src.regClass() == v2) {
3825          bld.vop3(aco_opcode::v_bcnt_u32_b32, Definition(dst), emit_extract_vector(ctx, src, 1, v1),
3826                   bld.vop3(aco_opcode::v_bcnt_u32_b32, bld.def(v1),
3827                            emit_extract_vector(ctx, src, 0, v1), Operand::zero()));
3828       } else if (src.regClass() == s2) {
3829          bld.sop1(aco_opcode::s_bcnt1_i32_b64, Definition(dst), bld.def(s1, scc), src);
3830       } else {
3831          isel_err(&instr->instr, "Unimplemented NIR instr bit size");
3832       }
3833       break;
3834    }
3835    case nir_op_flt: {
3836       emit_comparison(
3837          ctx, instr, dst, aco_opcode::v_cmp_lt_f16, aco_opcode::v_cmp_lt_f32,
3838          aco_opcode::v_cmp_lt_f64,
3839          ctx->program->gfx_level >= GFX11_5 ? aco_opcode::s_cmp_lt_f16 : aco_opcode::num_opcodes,
3840          ctx->program->gfx_level >= GFX11_5 ? aco_opcode::s_cmp_lt_f32 : aco_opcode::num_opcodes);
3841       break;
3842    }
3843    case nir_op_fge: {
3844       emit_comparison(
3845          ctx, instr, dst, aco_opcode::v_cmp_ge_f16, aco_opcode::v_cmp_ge_f32,
3846          aco_opcode::v_cmp_ge_f64,
3847          ctx->program->gfx_level >= GFX11_5 ? aco_opcode::s_cmp_ge_f16 : aco_opcode::num_opcodes,
3848          ctx->program->gfx_level >= GFX11_5 ? aco_opcode::s_cmp_ge_f32 : aco_opcode::num_opcodes);
3849       break;
3850    }
3851    case nir_op_fltu: {
3852       emit_comparison(
3853          ctx, instr, dst, aco_opcode::v_cmp_nge_f16, aco_opcode::v_cmp_nge_f32,
3854          aco_opcode::v_cmp_nge_f64,
3855          ctx->program->gfx_level >= GFX11_5 ? aco_opcode::s_cmp_nge_f16 : aco_opcode::num_opcodes,
3856          ctx->program->gfx_level >= GFX11_5 ? aco_opcode::s_cmp_nge_f32 : aco_opcode::num_opcodes);
3857       break;
3858    }
3859    case nir_op_fgeu: {
3860       emit_comparison(
3861          ctx, instr, dst, aco_opcode::v_cmp_nlt_f16, aco_opcode::v_cmp_nlt_f32,
3862          aco_opcode::v_cmp_nlt_f64,
3863          ctx->program->gfx_level >= GFX11_5 ? aco_opcode::s_cmp_nlt_f16 : aco_opcode::num_opcodes,
3864          ctx->program->gfx_level >= GFX11_5 ? aco_opcode::s_cmp_nlt_f32 : aco_opcode::num_opcodes);
3865       break;
3866    }
3867    case nir_op_feq: {
3868       emit_comparison(
3869          ctx, instr, dst, aco_opcode::v_cmp_eq_f16, aco_opcode::v_cmp_eq_f32,
3870          aco_opcode::v_cmp_eq_f64,
3871          ctx->program->gfx_level >= GFX11_5 ? aco_opcode::s_cmp_eq_f16 : aco_opcode::num_opcodes,
3872          ctx->program->gfx_level >= GFX11_5 ? aco_opcode::s_cmp_eq_f32 : aco_opcode::num_opcodes);
3873       break;
3874    }
3875    case nir_op_fneu: {
3876       emit_comparison(
3877          ctx, instr, dst, aco_opcode::v_cmp_neq_f16, aco_opcode::v_cmp_neq_f32,
3878          aco_opcode::v_cmp_neq_f64,
3879          ctx->program->gfx_level >= GFX11_5 ? aco_opcode::s_cmp_neq_f16 : aco_opcode::num_opcodes,
3880          ctx->program->gfx_level >= GFX11_5 ? aco_opcode::s_cmp_neq_f32 : aco_opcode::num_opcodes);
3881       break;
3882    }
3883    case nir_op_fequ: {
3884       emit_comparison(
3885          ctx, instr, dst, aco_opcode::v_cmp_nlg_f16, aco_opcode::v_cmp_nlg_f32,
3886          aco_opcode::v_cmp_nlg_f64,
3887          ctx->program->gfx_level >= GFX11_5 ? aco_opcode::s_cmp_nlg_f16 : aco_opcode::num_opcodes,
3888          ctx->program->gfx_level >= GFX11_5 ? aco_opcode::s_cmp_nlg_f32 : aco_opcode::num_opcodes);
3889       break;
3890    }
3891    case nir_op_fneo: {
3892       emit_comparison(
3893          ctx, instr, dst, aco_opcode::v_cmp_lg_f16, aco_opcode::v_cmp_lg_f32,
3894          aco_opcode::v_cmp_lg_f64,
3895          ctx->program->gfx_level >= GFX11_5 ? aco_opcode::s_cmp_lg_f16 : aco_opcode::num_opcodes,
3896          ctx->program->gfx_level >= GFX11_5 ? aco_opcode::s_cmp_lg_f32 : aco_opcode::num_opcodes);
3897       break;
3898    }
3899    case nir_op_funord: {
3900       emit_comparison(
3901          ctx, instr, dst, aco_opcode::v_cmp_u_f16, aco_opcode::v_cmp_u_f32, aco_opcode::v_cmp_u_f64,
3902          ctx->program->gfx_level >= GFX11_5 ? aco_opcode::s_cmp_u_f16 : aco_opcode::num_opcodes,
3903          ctx->program->gfx_level >= GFX11_5 ? aco_opcode::s_cmp_u_f32 : aco_opcode::num_opcodes);
3904       break;
3905    }
3906    case nir_op_ford: {
3907       emit_comparison(
3908          ctx, instr, dst, aco_opcode::v_cmp_o_f16, aco_opcode::v_cmp_o_f32, aco_opcode::v_cmp_o_f64,
3909          ctx->program->gfx_level >= GFX11_5 ? aco_opcode::s_cmp_o_f16 : aco_opcode::num_opcodes,
3910          ctx->program->gfx_level >= GFX11_5 ? aco_opcode::s_cmp_o_f32 : aco_opcode::num_opcodes);
3911       break;
3912    }
3913    case nir_op_ilt: {
3914       emit_comparison(ctx, instr, dst, aco_opcode::v_cmp_lt_i16, aco_opcode::v_cmp_lt_i32,
3915                       aco_opcode::v_cmp_lt_i64, aco_opcode::num_opcodes, aco_opcode::s_cmp_lt_i32);
3916       break;
3917    }
3918    case nir_op_ige: {
3919       emit_comparison(ctx, instr, dst, aco_opcode::v_cmp_ge_i16, aco_opcode::v_cmp_ge_i32,
3920                       aco_opcode::v_cmp_ge_i64, aco_opcode::num_opcodes, aco_opcode::s_cmp_ge_i32);
3921       break;
3922    }
3923    case nir_op_ieq: {
3924       if (instr->src[0].src.ssa->bit_size == 1)
3925          emit_boolean_logic(ctx, instr, Builder::s_xnor, dst);
3926       else
3927          emit_comparison(
3928             ctx, instr, dst, aco_opcode::v_cmp_eq_i16, aco_opcode::v_cmp_eq_i32,
3929             aco_opcode::v_cmp_eq_i64, aco_opcode::num_opcodes, aco_opcode::s_cmp_eq_i32,
3930             ctx->program->gfx_level >= GFX8 ? aco_opcode::s_cmp_eq_u64 : aco_opcode::num_opcodes);
3931       break;
3932    }
3933    case nir_op_ine: {
3934       if (instr->src[0].src.ssa->bit_size == 1)
3935          emit_boolean_logic(ctx, instr, Builder::s_xor, dst);
3936       else
3937          emit_comparison(
3938             ctx, instr, dst, aco_opcode::v_cmp_lg_i16, aco_opcode::v_cmp_lg_i32,
3939             aco_opcode::v_cmp_lg_i64, aco_opcode::num_opcodes, aco_opcode::s_cmp_lg_i32,
3940             ctx->program->gfx_level >= GFX8 ? aco_opcode::s_cmp_lg_u64 : aco_opcode::num_opcodes);
3941       break;
3942    }
3943    case nir_op_ult: {
3944       emit_comparison(ctx, instr, dst, aco_opcode::v_cmp_lt_u16, aco_opcode::v_cmp_lt_u32,
3945                       aco_opcode::v_cmp_lt_u64, aco_opcode::num_opcodes, aco_opcode::s_cmp_lt_u32);
3946       break;
3947    }
3948    case nir_op_uge: {
3949       emit_comparison(ctx, instr, dst, aco_opcode::v_cmp_ge_u16, aco_opcode::v_cmp_ge_u32,
3950                       aco_opcode::v_cmp_ge_u64, aco_opcode::num_opcodes, aco_opcode::s_cmp_ge_u32);
3951       break;
3952    }
3953    case nir_op_bitz:
3954    case nir_op_bitnz: {
3955       assert(instr->src[0].src.ssa->bit_size != 1);
3956       bool test0 = instr->op == nir_op_bitz;
3957       Temp src0 = get_alu_src(ctx, instr->src[0]);
3958       Temp src1 = get_alu_src(ctx, instr->src[1]);
3959       bool use_valu = src0.type() == RegType::vgpr || src1.type() == RegType::vgpr;
3960       if (!use_valu) {
3961          aco_opcode op = instr->src[0].src.ssa->bit_size == 64 ? aco_opcode::s_bitcmp1_b64
3962                                                                : aco_opcode::s_bitcmp1_b32;
3963          if (test0)
3964             op = instr->src[0].src.ssa->bit_size == 64 ? aco_opcode::s_bitcmp0_b64
3965                                                        : aco_opcode::s_bitcmp0_b32;
3966          emit_sopc_instruction(ctx, instr, op, dst);
3967          break;
3968       }
3969 
3970       /* We do not have a VALU version of s_bitcmp.
3971        * But if the second source is constant, we can use
3972        * v_cmp_class_f32's LUT to check the bit.
3973        * The LUT only has 10 entries, so extract a higher byte if we have to.
3974        * For sign bits, a comparison with 0 is better because v_cmp_class
3975        * can't be inverted.
3976        */
3977       if (nir_src_is_const(instr->src[1].src)) {
3978          uint32_t bit = nir_alu_src_as_uint(instr->src[1]);
3979          bit &= instr->src[0].src.ssa->bit_size - 1;
3980          src0 = as_vgpr(ctx, src0);
3981 
3982          if (src0.regClass() == v2) {
3983             src0 = emit_extract_vector(ctx, src0, (bit & 32) != 0, v1);
3984             bit &= 31;
3985          }
3986 
3987          if (bit == 31) {
3988             bld.vopc(test0 ? aco_opcode::v_cmp_le_i32 : aco_opcode::v_cmp_gt_i32, Definition(dst),
3989                      Operand::c32(0), src0);
3990             break;
3991          }
3992 
3993          if (bit == 15 && ctx->program->gfx_level >= GFX8) {
3994             bld.vopc(test0 ? aco_opcode::v_cmp_le_i16 : aco_opcode::v_cmp_gt_i16, Definition(dst),
3995                      Operand::c32(0), src0);
3996             break;
3997          }
3998 
3999          /* Set max_bit lower to avoid +inf if we can use sdwa+qnan instead. */
4000          const bool can_sdwa = ctx->program->gfx_level >= GFX8 && ctx->program->gfx_level < GFX11;
4001          const unsigned max_bit = can_sdwa ? 0x8 : 0x9;
4002          const bool use_opsel = bit > 0xf && (bit & 0xf) <= max_bit;
4003          if (use_opsel) {
4004             src0 = bld.pseudo(aco_opcode::p_extract, bld.def(v1), src0, Operand::c32(1),
4005                               Operand::c32(16), Operand::c32(0));
4006             bit &= 0xf;
4007          }
4008 
4009          /* If we can use sdwa, the extract is free, while test0's s_not is not. */
4010          if (bit == 7 && test0 && can_sdwa) {
4011             src0 = bld.pseudo(aco_opcode::p_extract, bld.def(v1), src0, Operand::c32(bit / 8),
4012                               Operand::c32(8), Operand::c32(1));
4013             bld.vopc(test0 ? aco_opcode::v_cmp_le_i32 : aco_opcode::v_cmp_gt_i32, Definition(dst),
4014                      Operand::c32(0), src0);
4015             break;
4016          }
4017 
4018          if (bit > max_bit) {
4019             src0 = bld.pseudo(aco_opcode::p_extract, bld.def(v1), src0, Operand::c32(bit / 8),
4020                               Operand::c32(8), Operand::c32(0));
4021             bit &= 0x7;
4022          }
4023 
4024          /* denorm and snan/qnan inputs are preserved using all float control modes. */
4025          static const struct {
4026             uint32_t fp32;
4027             uint32_t fp16;
4028             bool negate;
4029          } float_lut[10] = {
4030             {0x7f800001, 0x7c01, false}, /* snan */
4031             {~0u, ~0u, false},           /* qnan */
4032             {0xff800000, 0xfc00, false}, /* -inf */
4033             {0xbf800000, 0xbc00, false}, /* -normal (-1.0) */
4034             {1, 1, true},                /* -denormal */
4035             {0, 0, true},                /* -0.0 */
4036             {0, 0, false},               /* +0.0 */
4037             {1, 1, false},               /* +denormal */
4038             {0x3f800000, 0x3c00, false}, /* +normal (+1.0) */
4039             {0x7f800000, 0x7c00, false}, /* +inf */
4040          };
4041 
4042          Temp tmp = test0 ? bld.tmp(bld.lm) : dst;
4043          /* fp16 can use s_movk for bit 0. It also supports opsel on gfx11. */
4044          const bool use_fp16 = (ctx->program->gfx_level >= GFX8 && bit == 0) ||
4045                                (ctx->program->gfx_level >= GFX11 && use_opsel);
4046          const aco_opcode op = use_fp16 ? aco_opcode::v_cmp_class_f16 : aco_opcode::v_cmp_class_f32;
4047          const uint32_t c = use_fp16 ? float_lut[bit].fp16 : float_lut[bit].fp32;
4048 
4049          VALU_instruction& res =
4050             bld.vopc(op, Definition(tmp), bld.copy(bld.def(s1), Operand::c32(c)), src0)->valu();
4051          if (float_lut[bit].negate) {
4052             res.format = asVOP3(res.format);
4053             res.neg[0] = true;
4054          }
4055 
4056          if (test0)
4057             bld.sop1(Builder::s_not, Definition(dst), bld.def(s1, scc), tmp);
4058 
4059          break;
4060       }
4061 
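      /* Non-constant bit index: isolate the selected bit and compare the result against zero. */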
4062       Temp res;
4063       aco_opcode op = test0 ? aco_opcode::v_cmp_eq_i32 : aco_opcode::v_cmp_lg_i32;
4064       if (instr->src[0].src.ssa->bit_size == 16) {
4065          op = test0 ? aco_opcode::v_cmp_eq_i16 : aco_opcode::v_cmp_lg_i16;
4066          if (ctx->program->gfx_level < GFX10)
4067             res = bld.vop2_e64(aco_opcode::v_lshlrev_b16, bld.def(v2b), src1, Operand::c32(1));
4068          else
4069             res = bld.vop3(aco_opcode::v_lshlrev_b16_e64, bld.def(v2b), src1, Operand::c32(1));
4070 
4071          res = bld.vop2(aco_opcode::v_and_b32, bld.def(v2b), src0, res);
4072       } else if (instr->src[0].src.ssa->bit_size == 32) {
4073          res = bld.vop3(aco_opcode::v_bfe_u32, bld.def(v1), src0, src1, Operand::c32(1));
4074       } else if (instr->src[0].src.ssa->bit_size == 64) {
4075          if (ctx->program->gfx_level < GFX8)
4076             res = bld.vop3(aco_opcode::v_lshr_b64, bld.def(v2), src0, src1);
4077          else
4078             res = bld.vop3(aco_opcode::v_lshrrev_b64, bld.def(v2), src1, src0);
4079 
4080          res = emit_extract_vector(ctx, res, 0, v1);
4081          res = bld.vop2(aco_opcode::v_and_b32, bld.def(v1), Operand::c32(0x1), res);
4082       } else {
4083          isel_err(&instr->instr, "Unimplemented NIR instr bit size");
4084       }
4085       bld.vopc(op, Definition(dst), Operand::c32(0), res);
4086       break;
4087    }
4088    default: isel_err(&instr->instr, "Unknown NIR ALU instr");
4089    }
4090 }
4091 
4092 void
4093 visit_load_const(isel_context* ctx, nir_load_const_instr* instr)
4094 {
4095    Temp dst = get_ssa_temp(ctx, &instr->def);
4096 
4097    // TODO: we really want to have the resulting type, as this would allow for 64-bit literals
4098    // (which would get their lsb truncated if double and their msb if int);
4099    // for now, we only use s_mov_b64 with 64-bit inline constants
4100    assert(instr->def.num_components == 1 && "Vector load_const should be lowered to scalar.");
4101    assert(dst.type() == RegType::sgpr);
4102 
4103    Builder bld(ctx->program, ctx->block);
4104 
4105    if (instr->def.bit_size == 1) {
4106       assert(dst.regClass() == bld.lm);
4107       int val = instr->value[0].b ? -1 : 0;
4108       Operand op = bld.lm.size() == 1 ? Operand::c32(val) : Operand::c64(val);
4109       bld.copy(Definition(dst), op);
4110    } else if (instr->def.bit_size == 8) {
4111       bld.copy(Definition(dst), Operand::c32(instr->value[0].u8));
4112    } else if (instr->def.bit_size == 16) {
4113       /* sign-extend to use s_movk_i32 instead of a literal */
4114       bld.copy(Definition(dst), Operand::c32(instr->value[0].i16));
4115    } else if (dst.size() == 1) {
4116       bld.copy(Definition(dst), Operand::c32(instr->value[0].u32));
4117    } else {
4118       assert(dst.size() != 1);
4119       aco_ptr<Instruction> vec{
4120          create_instruction(aco_opcode::p_create_vector, Format::PSEUDO, dst.size(), 1)};
4121       if (instr->def.bit_size == 64)
4122          for (unsigned i = 0; i < dst.size(); i++)
4123             vec->operands[i] = Operand::c32(instr->value[0].u64 >> i * 32);
4124       else {
4125          for (unsigned i = 0; i < dst.size(); i++)
4126             vec->operands[i] = Operand::c32(instr->value[i].u32);
4127       }
4128       vec->definitions[0] = Definition(dst);
4129       ctx->block->instructions.emplace_back(std::move(vec));
4130    }
4131 }
4132 
4133 Temp
4134 emit_readfirstlane(isel_context* ctx, Temp src, Temp dst)
4135 {
4136    Builder bld(ctx->program, ctx->block);
4137 
4138    if (src.regClass().type() == RegType::sgpr) {
4139       bld.copy(Definition(dst), src);
4140    } else if (src.size() == 1) {
4141       bld.vop1(aco_opcode::v_readfirstlane_b32, Definition(dst), src);
4142    } else {
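      /* Multi-dword source: split it into dwords, read the first lane of each, and
       * recombine them into dst. */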
4143       aco_ptr<Instruction> split{
4144          create_instruction(aco_opcode::p_split_vector, Format::PSEUDO, 1, src.size())};
4145       split->operands[0] = Operand(src);
4146 
4147       for (unsigned i = 0; i < src.size(); i++) {
4148          split->definitions[i] =
4149             bld.def(RegClass::get(RegType::vgpr, MIN2(src.bytes() - i * 4, 4)));
4150       }
4151 
4152       Instruction* split_raw = split.get();
4153       ctx->block->instructions.emplace_back(std::move(split));
4154 
4155       aco_ptr<Instruction> vec{
4156          create_instruction(aco_opcode::p_create_vector, Format::PSEUDO, src.size(), 1)};
4157       vec->definitions[0] = Definition(dst);
4158       for (unsigned i = 0; i < src.size(); i++) {
4159          vec->operands[i] = bld.vop1(aco_opcode::v_readfirstlane_b32, bld.def(s1),
4160                                      split_raw->definitions[i].getTemp());
4161       }
4162 
4163       ctx->block->instructions.emplace_back(std::move(vec));
4164       if (src.bytes() % 4 == 0)
4165          emit_split_vector(ctx, dst, src.size());
4166    }
4167 
4168    return dst;
4169 }
4170 
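/* Describes a load for emit_load(): destination, component count/size, addressing
 * (resource, offsets, index) and per-target options used when splitting the load into
 * hardware-sized chunks. */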
4171 struct LoadEmitInfo {
4172    Operand offset;
4173    Temp dst;
4174    unsigned num_components;
4175    unsigned component_size;
4176    Temp resource = Temp(0, s1); /* buffer resource or base 64-bit address */
4177    Temp idx = Temp(0, v1);      /* buffer index */
4178    unsigned component_stride = 0;
4179    unsigned const_offset = 0;
4180    unsigned align_mul = 0;
4181    unsigned align_offset = 0;
4182    pipe_format format;
4183 
4184    ac_hw_cache_flags cache = {{0, 0, 0, 0, 0}};
4185    bool split_by_component_stride = true;
4186    bool readfirstlane_for_uniform = false;
4187    unsigned swizzle_component_size = 0;
4188    memory_sync_info sync;
4189    Temp soffset = Temp(0, s1);
4190 };
4191 
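/* A load emitter for one address space: the callback issues one load covering up to
 * bytes_needed bytes (possibly fewer), and max_const_offset_plus_one bounds the constant
 * offset that can be folded into the instruction. */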
4192 struct EmitLoadParameters {
4193    using Callback = Temp (*)(Builder& bld, const LoadEmitInfo& info, Temp offset,
4194                              unsigned bytes_needed, unsigned align, unsigned const_offset,
4195                              Temp dst_hint);
4196 
4197    Callback callback;
4198    unsigned max_const_offset_plus_one;
4199 };
4200 
4201 void
4202 emit_load(isel_context* ctx, Builder& bld, const LoadEmitInfo& info,
4203           const EmitLoadParameters& params)
4204 {
4205    unsigned load_size = info.num_components * info.component_size;
4206    unsigned component_size = info.component_size;
4207 
4208    unsigned num_vals = 0;
4209    Temp* const vals = (Temp*)alloca(info.dst.bytes() * sizeof(Temp));
4210 
4211    unsigned const_offset = info.const_offset;
4212 
4213    const unsigned align_mul = info.align_mul ? info.align_mul : component_size;
4214    unsigned align_offset = info.align_offset % align_mul;
4215 
4216    unsigned bytes_read = 0;
4217    while (bytes_read < load_size) {
4218       unsigned bytes_needed = load_size - bytes_read;
4219 
4220       if (info.split_by_component_stride) {
4221          if (info.swizzle_component_size)
4222             bytes_needed = MIN2(bytes_needed, info.swizzle_component_size);
4223          if (info.component_stride)
4224             bytes_needed = MIN2(bytes_needed, info.component_size);
4225       }
4226 
4227       /* reduce constant offset */
4228       Operand offset = info.offset;
4229       unsigned reduced_const_offset = const_offset;
4230       if (const_offset && (const_offset >= params.max_const_offset_plus_one)) {
4231          unsigned to_add =
4232             const_offset / params.max_const_offset_plus_one * params.max_const_offset_plus_one;
4233          reduced_const_offset %= params.max_const_offset_plus_one;
4234          Temp offset_tmp = offset.isTemp() ? offset.getTemp() : Temp();
4235          if (offset.isConstant()) {
4236             offset = Operand::c32(offset.constantValue() + to_add);
4237          } else if (offset.isUndefined()) {
4238             offset = Operand::c32(to_add);
4239          } else if (offset_tmp.regClass() == s1) {
4240             offset = bld.sop2(aco_opcode::s_add_i32, bld.def(s1), bld.def(s1, scc), offset_tmp,
4241                               Operand::c32(to_add));
4242          } else if (offset_tmp.regClass() == v1) {
4243             offset = bld.vadd32(bld.def(v1), offset_tmp, Operand::c32(to_add));
4244          } else {
4245             Temp lo = bld.tmp(offset_tmp.type(), 1);
4246             Temp hi = bld.tmp(offset_tmp.type(), 1);
4247             bld.pseudo(aco_opcode::p_split_vector, Definition(lo), Definition(hi), offset_tmp);
4248 
4249             if (offset_tmp.regClass() == s2) {
4250                Temp carry = bld.tmp(s1);
4251                lo = bld.sop2(aco_opcode::s_add_u32, bld.def(s1), bld.scc(Definition(carry)), lo,
4252                              Operand::c32(to_add));
4253                hi = bld.sop2(aco_opcode::s_add_u32, bld.def(s1), bld.def(s1, scc), hi, carry);
4254                offset = bld.pseudo(aco_opcode::p_create_vector, bld.def(s2), lo, hi);
4255             } else {
4256                Temp new_lo = bld.tmp(v1);
4257                Temp carry =
4258                   bld.vadd32(Definition(new_lo), lo, Operand::c32(to_add), true).def(1).getTemp();
4259                hi = bld.vadd32(bld.def(v1), hi, Operand::zero(), false, carry);
4260                offset = bld.pseudo(aco_opcode::p_create_vector, bld.def(v2), new_lo, hi);
4261             }
4262          }
4263       }
4264 
4265       unsigned align = align_offset ? 1 << (ffs(align_offset) - 1) : align_mul;
4266       Temp offset_tmp = offset.isTemp()       ? offset.getTemp()
4267                         : offset.isConstant() ? bld.copy(bld.def(s1), offset)
4268                                               : Temp(0, s1);
4269 
4270       Temp val = params.callback(bld, info, offset_tmp, bytes_needed, align, reduced_const_offset,
4271                                  info.dst);
4272 
4273       /* the callback wrote directly to dst */
4274       if (val == info.dst) {
4275          assert(num_vals == 0);
4276          emit_split_vector(ctx, info.dst, info.num_components);
4277          return;
4278       }
4279 
4280       /* add result to list and advance */
4281       if (info.component_stride) {
4282          assert(val.bytes() % info.component_size == 0);
4283          unsigned num_loaded_components = val.bytes() / info.component_size;
4284          unsigned advance_bytes = info.component_stride * num_loaded_components;
4285          const_offset += advance_bytes;
4286          align_offset = (align_offset + advance_bytes) % align_mul;
4287       } else {
4288          const_offset += val.bytes();
4289          align_offset = (align_offset + val.bytes()) % align_mul;
4290       }
4291       bytes_read += val.bytes();
4292       vals[num_vals++] = val;
4293    }
4294 
4295    /* create array of components */
4296    unsigned components_split = 0;
4297    std::array<Temp, NIR_MAX_VEC_COMPONENTS> allocated_vec;
4298    bool has_vgprs = false;
4299    for (unsigned i = 0; i < num_vals;) {
4300       Temp* const tmp = (Temp*)alloca(num_vals * sizeof(Temp));
4301       unsigned num_tmps = 0;
4302       unsigned tmp_size = 0;
4303       RegType reg_type = RegType::sgpr;
4304       while ((!tmp_size || (tmp_size % component_size)) && i < num_vals) {
4305          if (vals[i].type() == RegType::vgpr)
4306             reg_type = RegType::vgpr;
4307          tmp_size += vals[i].bytes();
4308          tmp[num_tmps++] = vals[i++];
4309       }
4310       if (num_tmps > 1) {
4311          aco_ptr<Instruction> vec{
4312             create_instruction(aco_opcode::p_create_vector, Format::PSEUDO, num_tmps, 1)};
4313          for (unsigned j = 0; j < num_tmps; j++)
4314             vec->operands[j] = Operand(tmp[j]);
4315          tmp[0] = bld.tmp(RegClass::get(reg_type, tmp_size));
4316          vec->definitions[0] = Definition(tmp[0]);
4317          bld.insert(std::move(vec));
4318       }
4319 
4320       if (tmp[0].bytes() % component_size) {
4321          /* trim tmp[0] */
4322          assert(i == num_vals);
4323          RegClass new_rc =
4324             RegClass::get(reg_type, tmp[0].bytes() / component_size * component_size);
4325          tmp[0] =
4326             bld.pseudo(aco_opcode::p_extract_vector, bld.def(new_rc), tmp[0], Operand::zero());
4327       }
4328 
4329       RegClass elem_rc = RegClass::get(reg_type, component_size);
4330 
4331       unsigned start = components_split;
4332 
4333       if (tmp_size == elem_rc.bytes()) {
4334          allocated_vec[components_split++] = tmp[0];
4335       } else {
4336          assert(tmp_size % elem_rc.bytes() == 0);
4337          aco_ptr<Instruction> split{create_instruction(aco_opcode::p_split_vector, Format::PSEUDO,
4338                                                        1, tmp_size / elem_rc.bytes())};
4339          for (auto& def : split->definitions) {
4340             Temp component = bld.tmp(elem_rc);
4341             allocated_vec[components_split++] = component;
4342             def = Definition(component);
4343          }
4344          split->operands[0] = Operand(tmp[0]);
4345          bld.insert(std::move(split));
4346       }
4347 
4348       /* try to p_as_uniform early so we can create more optimizable code and
4349        * also update allocated_vec */
4350       for (unsigned j = start; j < components_split; j++) {
4351          if (allocated_vec[j].bytes() % 4 == 0 && info.dst.type() == RegType::sgpr) {
4352             if (info.readfirstlane_for_uniform) {
4353                allocated_vec[j] = emit_readfirstlane(
4354                   ctx, allocated_vec[j], bld.tmp(RegClass(RegType::sgpr, allocated_vec[j].size())));
4355             } else {
4356                allocated_vec[j] = bld.as_uniform(allocated_vec[j]);
4357             }
4358          }
4359          has_vgprs |= allocated_vec[j].type() == RegType::vgpr;
4360       }
4361    }
4362 
4363    /* concatenate components and p_as_uniform() result if needed */
4364    if (info.dst.type() == RegType::vgpr || !has_vgprs)
4365       ctx->allocated_vec.emplace(info.dst.id(), allocated_vec);
4366 
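   /* If the destination is larger than the loaded components, pad the final vector with an
    * undefined operand. */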
4367    int padding_bytes =
4368       MAX2((int)info.dst.bytes() - int(allocated_vec[0].bytes() * info.num_components), 0);
4369 
4370    aco_ptr<Instruction> vec{create_instruction(aco_opcode::p_create_vector, Format::PSEUDO,
4371                                                info.num_components + !!padding_bytes, 1)};
4372    for (unsigned i = 0; i < info.num_components; i++)
4373       vec->operands[i] = Operand(allocated_vec[i]);
4374    if (padding_bytes)
4375       vec->operands[info.num_components] = Operand(RegClass::get(RegType::vgpr, padding_bytes));
4376    if (info.dst.type() == RegType::sgpr && has_vgprs) {
4377       Temp tmp = bld.tmp(RegType::vgpr, info.dst.size());
4378       vec->definitions[0] = Definition(tmp);
4379       bld.insert(std::move(vec));
4380       if (info.readfirstlane_for_uniform)
4381          emit_readfirstlane(ctx, tmp, info.dst);
4382       else
4383          bld.pseudo(aco_opcode::p_as_uniform, Definition(info.dst), tmp);
4384    } else {
4385       vec->definitions[0] = Definition(info.dst);
4386       bld.insert(std::move(vec));
4387    }
4388 }
4389 
4390 Operand
4391 load_lds_size_m0(Builder& bld)
4392 {
4393    /* m0 does not need to be initialized on GFX9+ */
4394    if (bld.program->gfx_level >= GFX9)
4395       return Operand(s1);
4396 
4397    return bld.m0((Temp)bld.copy(bld.def(s1, m0), Operand::c32(0xffffffffu)));
4398 }
4399 
4400 Temp
4401 lds_load_callback(Builder& bld, const LoadEmitInfo& info, Temp offset, unsigned bytes_needed,
4402                   unsigned align, unsigned const_offset, Temp dst_hint)
4403 {
4404    offset = offset.regClass() == s1 ? bld.copy(bld.def(v1), offset) : offset;
4405 
4406    Operand m = load_lds_size_m0(bld);
4407 
4408    bool large_ds_read = bld.program->gfx_level >= GFX7;
4409    bool usable_read2 = bld.program->gfx_level >= GFX7;
4410 
4411    bool read2 = false;
4412    unsigned size = 0;
4413    aco_opcode op;
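   /* Pick the widest DS read that the alignment and remaining size allow. */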
4414    if (bytes_needed >= 16 && align % 16 == 0 && large_ds_read) {
4415       size = 16;
4416       op = aco_opcode::ds_read_b128;
4417    } else if (bytes_needed >= 16 && align % 8 == 0 && const_offset % 8 == 0 && usable_read2) {
4418       size = 16;
4419       read2 = true;
4420       op = aco_opcode::ds_read2_b64;
4421    } else if (bytes_needed >= 12 && align % 16 == 0 && large_ds_read) {
4422       size = 12;
4423       op = aco_opcode::ds_read_b96;
4424    } else if (bytes_needed >= 8 && align % 8 == 0) {
4425       size = 8;
4426       op = aco_opcode::ds_read_b64;
4427    } else if (bytes_needed >= 8 && align % 4 == 0 && const_offset % 4 == 0 && usable_read2) {
4428       size = 8;
4429       read2 = true;
4430       op = aco_opcode::ds_read2_b32;
4431    } else if (bytes_needed >= 4 && align % 4 == 0) {
4432       size = 4;
4433       op = aco_opcode::ds_read_b32;
4434    } else if (bytes_needed >= 2 && align % 2 == 0) {
4435       size = 2;
4436       op = bld.program->gfx_level >= GFX9 ? aco_opcode::ds_read_u16_d16 : aco_opcode::ds_read_u16;
4437    } else {
4438       size = 1;
4439       op = bld.program->gfx_level >= GFX9 ? aco_opcode::ds_read_u8_d16 : aco_opcode::ds_read_u8;
4440    }
4441 
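   /* ds_read2 encodes two 8-bit offsets in units of the element size, while single reads
    * take a 16-bit byte offset; fold anything that doesn't fit into the address. */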
4442    unsigned const_offset_unit = read2 ? size / 2u : 1u;
4443    unsigned const_offset_range = read2 ? 255 * const_offset_unit : 65536;
4444 
4445    if (const_offset > (const_offset_range - const_offset_unit)) {
4446       unsigned excess = const_offset - (const_offset % const_offset_range);
4447       offset = bld.vadd32(bld.def(v1), offset, Operand::c32(excess));
4448       const_offset -= excess;
4449    }
4450 
4451    const_offset /= const_offset_unit;
4452 
4453    RegClass rc = RegClass::get(RegType::vgpr, size);
4454    Temp val = rc == info.dst.regClass() && dst_hint.id() ? dst_hint : bld.tmp(rc);
4455    Instruction* instr;
4456    if (read2)
4457       instr = bld.ds(op, Definition(val), offset, m, const_offset, const_offset + 1);
4458    else
4459       instr = bld.ds(op, Definition(val), offset, m, const_offset);
4460    instr->ds().sync = info.sync;
4461 
4462    if (m.isUndefined())
4463       instr->operands.pop_back();
4464 
4465    return val;
4466 }
4467 
4468 const EmitLoadParameters lds_load_params{lds_load_callback, UINT32_MAX};
4469 
4470 Temp
4471 smem_load_callback(Builder& bld, const LoadEmitInfo& info, Temp offset, unsigned bytes_needed,
4472                    unsigned align, unsigned const_offset, Temp dst_hint)
4473 {
4474    assert(align >= 4u);
4475 
4476    bld.program->has_smem_buffer_or_global_loads = true;
4477 
4478    bool buffer = info.resource.id() && info.resource.bytes() == 16;
4479    Temp addr = info.resource;
4480    if (!buffer && !addr.id()) {
4481       addr = offset;
4482       offset = Temp();
4483    }
4484 
4485    bytes_needed = MIN2(bytes_needed, 64);
4486    unsigned needed_round_up = util_next_power_of_two(bytes_needed);
4487    unsigned needed_round_down = needed_round_up >> (needed_round_up != bytes_needed ? 1 : 0);
4488    /* Only round up global loads if the access is aligned, so that it won't cross pages */
4489    bytes_needed = buffer || align % needed_round_up == 0 ? needed_round_up : needed_round_down;
4490 
4491    aco_opcode op;
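   /* SMEM loads fetch 1, 2, 4, 8 or 16 dwords. */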
4492    if (bytes_needed <= 4) {
4493       op = buffer ? aco_opcode::s_buffer_load_dword : aco_opcode::s_load_dword;
4494    } else if (bytes_needed <= 8) {
4495       op = buffer ? aco_opcode::s_buffer_load_dwordx2 : aco_opcode::s_load_dwordx2;
4496    } else if (bytes_needed <= 16) {
4497       op = buffer ? aco_opcode::s_buffer_load_dwordx4 : aco_opcode::s_load_dwordx4;
4498    } else if (bytes_needed <= 32) {
4499       op = buffer ? aco_opcode::s_buffer_load_dwordx8 : aco_opcode::s_load_dwordx8;
4500    } else {
4501       assert(bytes_needed == 64);
4502       op = buffer ? aco_opcode::s_buffer_load_dwordx16 : aco_opcode::s_load_dwordx16;
4503    }
4504 
4505    aco_ptr<Instruction> load{create_instruction(op, Format::SMEM, 2, 1)};
4506    if (buffer) {
4507       if (const_offset)
4508          offset = bld.sop2(aco_opcode::s_add_u32, bld.def(s1), bld.def(s1, scc), offset,
4509                            Operand::c32(const_offset));
4510       load->operands[0] = Operand(info.resource);
4511       load->operands[1] = Operand(offset);
4512    } else {
4513       load->operands[0] = Operand(addr);
4514       if (offset.id() && const_offset)
4515          load->operands[1] = bld.sop2(aco_opcode::s_add_u32, bld.def(s1), bld.def(s1, scc), offset,
4516                                       Operand::c32(const_offset));
4517       else if (offset.id())
4518          load->operands[1] = Operand(offset);
4519       else
4520          load->operands[1] = Operand::c32(const_offset);
4521    }
4522    RegClass rc(RegType::sgpr, DIV_ROUND_UP(bytes_needed, 4u));
4523    Temp val = dst_hint.id() && dst_hint.regClass() == rc ? dst_hint : bld.tmp(rc);
4524    load->definitions[0] = Definition(val);
4525    load->smem().cache = info.cache;
4526    load->smem().sync = info.sync;
4527    bld.insert(std::move(load));
4528    return val;
4529 }
4530 
4531 const EmitLoadParameters smem_load_params{smem_load_callback, 1024};
4532 
4533 Temp
4534 mubuf_load_callback(Builder& bld, const LoadEmitInfo& info, Temp offset, unsigned bytes_needed,
4535                     unsigned align_, unsigned const_offset, Temp dst_hint)
4536 {
4537    Operand vaddr = offset.type() == RegType::vgpr ? Operand(offset) : Operand(v1);
4538    Operand soffset = offset.type() == RegType::sgpr ? Operand(offset) : Operand::c32(0);
4539 
4540    if (info.soffset.id()) {
4541       if (soffset.isTemp())
4542          vaddr = bld.copy(bld.def(v1), soffset);
4543       soffset = Operand(info.soffset);
4544    }
4545 
4546    if (soffset.isUndefined())
4547       soffset = Operand::zero();
4548 
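   /* With both offen and idxen, the buffer index goes in the first address VGPR and the
    * offset in the second. */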
4549    bool offen = !vaddr.isUndefined();
4550    bool idxen = info.idx.id();
4551 
4552    if (offen && idxen)
4553       vaddr = bld.pseudo(aco_opcode::p_create_vector, bld.def(v2), info.idx, vaddr);
4554    else if (idxen)
4555       vaddr = Operand(info.idx);
4556 
4557    unsigned bytes_size = 0;
4558    aco_opcode op;
4559    if (bytes_needed == 1 || align_ % 2) {
4560       bytes_size = 1;
4561       op = aco_opcode::buffer_load_ubyte;
4562    } else if (bytes_needed == 2 || align_ % 4) {
4563       bytes_size = 2;
4564       op = aco_opcode::buffer_load_ushort;
4565    } else if (bytes_needed <= 4) {
4566       bytes_size = 4;
4567       op = aco_opcode::buffer_load_dword;
4568    } else if (bytes_needed <= 8) {
4569       bytes_size = 8;
4570       op = aco_opcode::buffer_load_dwordx2;
4571    } else if (bytes_needed <= 12 && bld.program->gfx_level > GFX6) {
4572       bytes_size = 12;
4573       op = aco_opcode::buffer_load_dwordx3;
4574    } else {
4575       bytes_size = 16;
4576       op = aco_opcode::buffer_load_dwordx4;
4577    }
4578    aco_ptr<Instruction> mubuf{create_instruction(op, Format::MUBUF, 3, 1)};
4579    mubuf->operands[0] = Operand(info.resource);
4580    mubuf->operands[1] = vaddr;
4581    mubuf->operands[2] = soffset;
4582    mubuf->mubuf().offen = offen;
4583    mubuf->mubuf().idxen = idxen;
4584    mubuf->mubuf().cache = info.cache;
4585    mubuf->mubuf().sync = info.sync;
4586    mubuf->mubuf().offset = const_offset;
4587    RegClass rc = RegClass::get(RegType::vgpr, bytes_size);
4588    Temp val = dst_hint.id() && rc == dst_hint.regClass() ? dst_hint : bld.tmp(rc);
4589    mubuf->definitions[0] = Definition(val);
4590    bld.insert(std::move(mubuf));
4591 
4592    return val;
4593 }
4594 
4595 const EmitLoadParameters mubuf_load_params{mubuf_load_callback, 4096};
4596 
4597 Temp
4598 mubuf_load_format_callback(Builder& bld, const LoadEmitInfo& info, Temp offset,
4599                            unsigned bytes_needed, unsigned align_, unsigned const_offset,
4600                            Temp dst_hint)
4601 {
4602    Operand vaddr = offset.type() == RegType::vgpr ? Operand(offset) : Operand(v1);
4603    Operand soffset = offset.type() == RegType::sgpr ? Operand(offset) : Operand::c32(0);
4604 
4605    if (info.soffset.id()) {
4606       if (soffset.isTemp())
4607          vaddr = bld.copy(bld.def(v1), soffset);
4608       soffset = Operand(info.soffset);
4609    }
4610 
4611    if (soffset.isUndefined())
4612       soffset = Operand::zero();
4613 
4614    bool offen = !vaddr.isUndefined();
4615    bool idxen = info.idx.id();
4616 
4617    if (offen && idxen)
4618       vaddr = bld.pseudo(aco_opcode::p_create_vector, bld.def(v2), info.idx, vaddr);
4619    else if (idxen)
4620       vaddr = Operand(info.idx);
4621 
4622    aco_opcode op = aco_opcode::num_opcodes;
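   /* Pick the typed buffer load matching the component size: d16 variants for 16-bit
    * components, the 32-bit x/xy/xyz/xyzw variants otherwise. */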
4623    if (info.component_size == 2) {
4624       switch (bytes_needed) {
4625       case 2: op = aco_opcode::buffer_load_format_d16_x; break;
4626       case 4: op = aco_opcode::buffer_load_format_d16_xy; break;
4627       case 6: op = aco_opcode::buffer_load_format_d16_xyz; break;
4628       case 8: op = aco_opcode::buffer_load_format_d16_xyzw; break;
4629       default: unreachable("invalid buffer load format size"); break;
4630       }
4631    } else {
4632       assert(info.component_size == 4);
4633       switch (bytes_needed) {
4634       case 4: op = aco_opcode::buffer_load_format_x; break;
4635       case 8: op = aco_opcode::buffer_load_format_xy; break;
4636       case 12: op = aco_opcode::buffer_load_format_xyz; break;
4637       case 16: op = aco_opcode::buffer_load_format_xyzw; break;
4638       default: unreachable("invalid buffer load format size"); break;
4639       }
4640    }
4641 
4642    aco_ptr<Instruction> mubuf{create_instruction(op, Format::MUBUF, 3, 1)};
4643    mubuf->operands[0] = Operand(info.resource);
4644    mubuf->operands[1] = vaddr;
4645    mubuf->operands[2] = soffset;
4646    mubuf->mubuf().offen = offen;
4647    mubuf->mubuf().idxen = idxen;
4648    mubuf->mubuf().cache = info.cache;
4649    mubuf->mubuf().sync = info.sync;
4650    mubuf->mubuf().offset = const_offset;
4651    RegClass rc = RegClass::get(RegType::vgpr, bytes_needed);
4652    Temp val = dst_hint.id() && rc == dst_hint.regClass() ? dst_hint : bld.tmp(rc);
4653    mubuf->definitions[0] = Definition(val);
4654    bld.insert(std::move(mubuf));
4655 
4656    return val;
4657 }
4658 
4659 const EmitLoadParameters mubuf_load_format_params{mubuf_load_format_callback, 4096};
4660 
4661 Temp
4662 scratch_load_callback(Builder& bld, const LoadEmitInfo& info, Temp offset, unsigned bytes_needed,
4663                       unsigned align_, unsigned const_offset, Temp dst_hint)
4664 {
4665    unsigned bytes_size = 0;
4666    aco_opcode op;
4667    if (bytes_needed == 1 || align_ % 2u) {
4668       bytes_size = 1;
4669       op = aco_opcode::scratch_load_ubyte;
4670    } else if (bytes_needed == 2 || align_ % 4u) {
4671       bytes_size = 2;
4672       op = aco_opcode::scratch_load_ushort;
4673    } else if (bytes_needed <= 4) {
4674       bytes_size = 4;
4675       op = aco_opcode::scratch_load_dword;
4676    } else if (bytes_needed <= 8) {
4677       bytes_size = 8;
4678       op = aco_opcode::scratch_load_dwordx2;
4679    } else if (bytes_needed <= 12) {
4680       bytes_size = 12;
4681       op = aco_opcode::scratch_load_dwordx3;
4682    } else {
4683       bytes_size = 16;
4684       op = aco_opcode::scratch_load_dwordx4;
4685    }
4686    RegClass rc = RegClass::get(RegType::vgpr, bytes_size);
4687    Temp val = dst_hint.id() && rc == dst_hint.regClass() ? dst_hint : bld.tmp(rc);
4688    aco_ptr<Instruction> flat{create_instruction(op, Format::SCRATCH, 2, 1)};
4689    flat->operands[0] = offset.regClass() == s1 ? Operand(v1) : Operand(offset);
4690    flat->operands[1] = offset.regClass() == s1 ? Operand(offset) : Operand(s1);
4691    flat->scratch().sync = info.sync;
4692    flat->scratch().offset = const_offset;
4693    flat->definitions[0] = Definition(val);
4694    bld.insert(std::move(flat));
4695 
4696    return val;
4697 }
4698 
4699 const EmitLoadParameters scratch_mubuf_load_params{mubuf_load_callback, 4096};
4700 const EmitLoadParameters scratch_flat_load_params{scratch_load_callback, 2048};
4701 
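/* Builds the raw buffer descriptor used to emulate global memory access with MUBUF on GFX6.
 * For VGPR addresses the descriptor's base is left at zero because the address is passed
 * through the addr64 VADDR operand instead.
 */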
4702 Temp
4703 get_gfx6_global_rsrc(Builder& bld, Temp addr)
4704 {
4705    uint32_t desc[4];
4706    ac_build_raw_buffer_descriptor(bld.program->gfx_level, 0, 0xffffffff, desc);
4707 
4708    if (addr.type() == RegType::vgpr)
4709       return bld.pseudo(aco_opcode::p_create_vector, bld.def(s4), Operand::zero(), Operand::zero(),
4710                         Operand::c32(desc[2]), Operand::c32(desc[3]));
4711    return bld.pseudo(aco_opcode::p_create_vector, bld.def(s4), addr, Operand::c32(desc[2]),
4712                      Operand::c32(desc[3]));
4713 }
4714 
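/* Adds a 32-bit value to a 64-bit address and returns the 64-bit sum, using VALU adds with
 * carry when either source lives in VGPRs and SALU adds otherwise.
 */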
4715 Temp
4716 add64_32(Builder& bld, Temp src0, Temp src1)
4717 {
4718    Temp src00 = bld.tmp(src0.type(), 1);
4719    Temp src01 = bld.tmp(src0.type(), 1);
4720    bld.pseudo(aco_opcode::p_split_vector, Definition(src00), Definition(src01), src0);
4721 
4722    if (src0.type() == RegType::vgpr || src1.type() == RegType::vgpr) {
4723       Temp dst0 = bld.tmp(v1);
4724       Temp carry = bld.vadd32(Definition(dst0), src00, src1, true).def(1).getTemp();
4725       Temp dst1 = bld.vadd32(bld.def(v1), src01, Operand::zero(), false, carry);
4726       return bld.pseudo(aco_opcode::p_create_vector, bld.def(v2), dst0, dst1);
4727    } else {
4728       Temp carry = bld.tmp(s1);
4729       Temp dst0 =
4730          bld.sop2(aco_opcode::s_add_u32, bld.def(s1), bld.scc(Definition(carry)), src00, src1);
4731       Temp dst1 = bld.sop2(aco_opcode::s_add_u32, bld.def(s1), bld.def(s1, scc), src01, carry);
4732       return bld.pseudo(aco_opcode::p_create_vector, bld.def(s2), dst0, dst1);
4733    }
4734 }
4735 
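/* Rewrites address + offset + const_offset into a combination that the target's global access
 * instructions can encode: any constant offset that doesn't fit into the instruction's offset
 * field is folded into either the variable offset or the address itself.
 */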
4736 void
4737 lower_global_address(Builder& bld, uint32_t offset_in, Temp* address_inout,
4738                      uint32_t* const_offset_inout, Temp* offset_inout)
4739 {
4740    Temp address = *address_inout;
4741    uint64_t const_offset = *const_offset_inout + offset_in;
4742    Temp offset = *offset_inout;
4743 
4744    uint64_t max_const_offset_plus_one =
4745       1; /* GFX7/8/9: FLAT loads do not support constant offsets */
4746    if (bld.program->gfx_level >= GFX9)
4747       max_const_offset_plus_one = bld.program->dev.scratch_global_offset_max;
4748    else if (bld.program->gfx_level == GFX6)
4749       max_const_offset_plus_one = 4096; /* MUBUF has a 12-bit unsigned offset field */
4750    uint64_t excess_offset = const_offset - (const_offset % max_const_offset_plus_one);
4751    const_offset %= max_const_offset_plus_one;
4752 
4753    if (!offset.id()) {
4754       while (unlikely(excess_offset > UINT32_MAX)) {
4755          address = add64_32(bld, address, bld.copy(bld.def(s1), Operand::c32(UINT32_MAX)));
4756          excess_offset -= UINT32_MAX;
4757       }
4758       if (excess_offset)
4759          offset = bld.copy(bld.def(s1), Operand::c32(excess_offset));
4760    } else {
4761       /* If we added to "offset", we would transform the intended
4762        * "address + u2u64(offset) + u2u64(const_offset)" into
4763        * "address + u2u64(offset + const_offset)", so add to the address instead.
4764        * If excess_offset > UINT32_MAX, this could be done more efficiently with a full 64-bit
4765        * addition, but that case should be very rare.
4766        */
4767       while (excess_offset) {
4768          uint32_t src2 = MIN2(excess_offset, UINT32_MAX);
4769          address = add64_32(bld, address, bld.copy(bld.def(s1), Operand::c32(src2)));
4770          excess_offset -= src2;
4771       }
4772    }
4773 
4774    if (bld.program->gfx_level == GFX6) {
4775       /* GFX6 (MUBUF): (SGPR address, SGPR offset) or (VGPR address, SGPR offset) */
4776       if (offset.type() != RegType::sgpr) {
4777          address = add64_32(bld, address, offset);
4778          offset = Temp();
4779       }
4780       offset = offset.id() ? offset : bld.copy(bld.def(s1), Operand::zero());
4781    } else if (bld.program->gfx_level <= GFX8) {
4782       /* GFX7,8 (FLAT): VGPR address */
4783       if (offset.id()) {
4784          address = add64_32(bld, address, offset);
4785          offset = Temp();
4786       }
4787       address = as_vgpr(bld, address);
4788    } else {
4789       /* GFX9+ (GLOBAL): (VGPR address), or (SGPR address and VGPR offset) */
4790       if (address.type() == RegType::vgpr && offset.id()) {
4791          address = add64_32(bld, address, offset);
4792          offset = Temp();
4793       } else if (address.type() == RegType::sgpr && offset.id()) {
4794          offset = as_vgpr(bld, offset);
4795       }
4796       if (address.type() == RegType::sgpr && !offset.id())
4797          offset = bld.copy(bld.def(v1), bld.copy(bld.def(s1), Operand::zero()));
4798    }
4799 
4800    *address_inout = address;
4801    *const_offset_inout = const_offset;
4802    *offset_inout = offset;
4803 }
4804 
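/* Callback for emit_load() that emits a single global memory load: MUBUF on GFX6, FLAT on
 * GFX7-8 and GLOBAL on GFX9+, loading up to 16 bytes at a time.
 */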
4805 Temp
4806 global_load_callback(Builder& bld, const LoadEmitInfo& info, Temp offset, unsigned bytes_needed,
4807                      unsigned align_, unsigned const_offset, Temp dst_hint)
4808 {
4809    Temp addr = info.resource;
4810    if (!addr.id()) {
4811       addr = offset;
4812       offset = Temp();
4813    }
4814    lower_global_address(bld, 0, &addr, &const_offset, &offset);
4815 
4816    unsigned bytes_size = 0;
4817    bool use_mubuf = bld.program->gfx_level == GFX6;
4818    bool global = bld.program->gfx_level >= GFX9;
4819    aco_opcode op;
4820    if (bytes_needed == 1 || align_ % 2u) {
4821       bytes_size = 1;
4822       op = use_mubuf ? aco_opcode::buffer_load_ubyte
4823            : global  ? aco_opcode::global_load_ubyte
4824                      : aco_opcode::flat_load_ubyte;
4825    } else if (bytes_needed == 2 || align_ % 4u) {
4826       bytes_size = 2;
4827       op = use_mubuf ? aco_opcode::buffer_load_ushort
4828            : global  ? aco_opcode::global_load_ushort
4829                      : aco_opcode::flat_load_ushort;
4830    } else if (bytes_needed <= 4) {
4831       bytes_size = 4;
4832       op = use_mubuf ? aco_opcode::buffer_load_dword
4833            : global  ? aco_opcode::global_load_dword
4834                      : aco_opcode::flat_load_dword;
4835    } else if (bytes_needed <= 8 || (bytes_needed <= 12 && use_mubuf)) {
4836       bytes_size = 8;
4837       op = use_mubuf ? aco_opcode::buffer_load_dwordx2
4838            : global  ? aco_opcode::global_load_dwordx2
4839                      : aco_opcode::flat_load_dwordx2;
4840    } else if (bytes_needed <= 12 && !use_mubuf) {
4841       bytes_size = 12;
4842       op = global ? aco_opcode::global_load_dwordx3 : aco_opcode::flat_load_dwordx3;
4843    } else {
4844       bytes_size = 16;
4845       op = use_mubuf ? aco_opcode::buffer_load_dwordx4
4846            : global  ? aco_opcode::global_load_dwordx4
4847                      : aco_opcode::flat_load_dwordx4;
4848    }
4849    RegClass rc = RegClass::get(RegType::vgpr, bytes_size);
4850    Temp val = dst_hint.id() && rc == dst_hint.regClass() ? dst_hint : bld.tmp(rc);
4851    if (use_mubuf) {
4852       aco_ptr<Instruction> mubuf{create_instruction(op, Format::MUBUF, 3, 1)};
4853       mubuf->operands[0] = Operand(get_gfx6_global_rsrc(bld, addr));
4854       mubuf->operands[1] = addr.type() == RegType::vgpr ? Operand(addr) : Operand(v1);
4855       mubuf->operands[2] = Operand(offset);
4856       mubuf->mubuf().cache = info.cache;
4857       mubuf->mubuf().offset = const_offset;
4858       mubuf->mubuf().addr64 = addr.type() == RegType::vgpr;
4859       mubuf->mubuf().disable_wqm = false;
4860       mubuf->mubuf().sync = info.sync;
4861       mubuf->definitions[0] = Definition(val);
4862       bld.insert(std::move(mubuf));
4863    } else {
4864       aco_ptr<Instruction> flat{
4865          create_instruction(op, global ? Format::GLOBAL : Format::FLAT, 2, 1)};
4866       if (addr.regClass() == s2) {
4867          assert(global && offset.id() && offset.type() == RegType::vgpr);
4868          flat->operands[0] = Operand(offset);
4869          flat->operands[1] = Operand(addr);
4870       } else {
4871          assert(addr.type() == RegType::vgpr && !offset.id());
4872          flat->operands[0] = Operand(addr);
4873          flat->operands[1] = Operand(s1);
4874       }
4875       flat->flatlike().cache = info.cache;
4876       flat->flatlike().sync = info.sync;
4877       assert(global || !const_offset);
4878       flat->flatlike().offset = const_offset;
4879       flat->definitions[0] = Definition(val);
4880       bld.insert(std::move(flat));
4881    }
4882 
4883    return val;
4884 }
4885 
4886 const EmitLoadParameters global_load_params{global_load_callback, UINT32_MAX};
4887 
4888 Temp
4889 load_lds(isel_context* ctx, unsigned elem_size_bytes, unsigned num_components, Temp dst,
4890          Temp address, unsigned base_offset, unsigned align)
4891 {
4892    assert(util_is_power_of_two_nonzero(align));
4893 
4894    Builder bld(ctx->program, ctx->block);
4895 
4896    LoadEmitInfo info = {Operand(as_vgpr(ctx, address)), dst, num_components, elem_size_bytes};
4897    info.align_mul = align;
4898    info.align_offset = 0;
4899    info.sync = memory_sync_info(storage_shared);
4900    info.const_offset = base_offset;
4901    /* The 2 separate loads for gfx10+ wave64 can see different values, even for uniform addresses,
4902     * if another wave writes LDS in between. Use v_readfirstlane instead of p_as_uniform in order
4903     * to avoid copy-propagation.
4904     */
4905    info.readfirstlane_for_uniform = ctx->options->gfx_level >= GFX10 &&
4906                                     ctx->program->wave_size == 64 &&
4907                                     ctx->program->workgroup_size > 64;
4908    emit_load(ctx, bld, info, lds_load_params);
4909 
4910    return dst;
4911 }
4912 
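/* Splits store data into "count" pieces with the byte sizes given in "bytes", reusing the
 * components recorded in ctx->allocated_vec where possible to avoid emitting extra splits.
 */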
4913 void
4914 split_store_data(isel_context* ctx, RegType dst_type, unsigned count, Temp* dst, unsigned* bytes,
4915                  Temp src)
4916 {
4917    if (!count)
4918       return;
4919 
4920    Builder bld(ctx->program, ctx->block);
4921 
4922    /* count == 1 fast path */
4923    if (count == 1) {
4924       if (dst_type == RegType::sgpr)
4925          dst[0] = bld.as_uniform(src);
4926       else
4927          dst[0] = as_vgpr(ctx, src);
4928       return;
4929    }
4930 
4931    /* elem_size_bytes is the greatest common divisor which is a power of 2 */
4932    unsigned elem_size_bytes =
4933       1u << (ffs(std::accumulate(bytes, bytes + count, 8, std::bit_or<>{})) - 1);
4934 
4935    ASSERTED bool is_subdword = elem_size_bytes < 4;
4936    assert(!is_subdword || dst_type == RegType::vgpr);
4937 
4938    for (unsigned i = 0; i < count; i++)
4939       dst[i] = bld.tmp(RegClass::get(dst_type, bytes[i]));
4940 
4941    std::vector<Temp> temps;
4942    /* use allocated_vec if possible */
4943    auto it = ctx->allocated_vec.find(src.id());
4944    if (it != ctx->allocated_vec.end()) {
4945       if (!it->second[0].id())
4946          goto split;
4947       unsigned elem_size = it->second[0].bytes();
4948       assert(src.bytes() % elem_size == 0);
4949 
4950       for (unsigned i = 0; i < src.bytes() / elem_size; i++) {
4951          if (!it->second[i].id())
4952             goto split;
4953       }
4954       if (elem_size_bytes % elem_size)
4955          goto split;
4956 
4957       temps.insert(temps.end(), it->second.begin(), it->second.begin() + src.bytes() / elem_size);
4958       elem_size_bytes = elem_size;
4959    }
4960 
4961 split:
4962    /* split src if necessary */
4963    if (temps.empty()) {
4964       if (is_subdword && src.type() == RegType::sgpr)
4965          src = as_vgpr(ctx, src);
4966       if (dst_type == RegType::sgpr)
4967          src = bld.as_uniform(src);
4968 
4969       unsigned num_elems = src.bytes() / elem_size_bytes;
4970       aco_ptr<Instruction> split{
4971          create_instruction(aco_opcode::p_split_vector, Format::PSEUDO, 1, num_elems)};
4972       split->operands[0] = Operand(src);
4973       for (unsigned i = 0; i < num_elems; i++) {
4974          temps.emplace_back(bld.tmp(RegClass::get(dst_type, elem_size_bytes)));
4975          split->definitions[i] = Definition(temps.back());
4976       }
4977       bld.insert(std::move(split));
4978    }
4979 
4980    unsigned idx = 0;
4981    for (unsigned i = 0; i < count; i++) {
4982       unsigned op_count = dst[i].bytes() / elem_size_bytes;
4983       if (op_count == 1) {
4984          if (dst_type == RegType::sgpr)
4985             dst[i] = bld.as_uniform(temps[idx++]);
4986          else
4987             dst[i] = as_vgpr(ctx, temps[idx++]);
4988          continue;
4989       }
4990 
4991       aco_ptr<Instruction> vec{
4992          create_instruction(aco_opcode::p_create_vector, Format::PSEUDO, op_count, 1)};
4993       for (unsigned j = 0; j < op_count; j++) {
4994          Temp tmp = temps[idx++];
4995          if (dst_type == RegType::sgpr)
4996             tmp = bld.as_uniform(tmp);
4997          vec->operands[j] = Operand(tmp);
4998       }
4999       vec->definitions[0] = Definition(dst[i]);
5000       bld.insert(std::move(vec));
5001    }
5002    return;
5003 }
5004 
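/* Returns the next consecutive byte range of "todo_mask", starting at its lowest set bit, that
 * is either entirely inside or entirely outside the write mask. The return value is false for
 * ranges that are not written and should therefore be skipped.
 */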
5005 bool
5006 scan_write_mask(uint32_t mask, uint32_t todo_mask, int* start, int* count)
5007 {
5008    unsigned start_elem = ffs(todo_mask) - 1;
5009    bool skip = !(mask & (1 << start_elem));
5010    if (skip)
5011       mask = ~mask & todo_mask;
5012 
5013    mask &= todo_mask;
5014 
5015    u_bit_scan_consecutive_range(&mask, start, count);
5016 
5017    return !skip;
5018 }
5019 
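/* Removes the processed byte range (and everything below it) from "todo_mask". */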
5020 void
5021 advance_write_mask(uint32_t* todo_mask, int start, int count)
5022 {
5023    *todo_mask &= ~u_bit_consecutive(0, count) << start;
5024 }
5025 
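/* Stores "data" to LDS at address + base_offset. The write is split according to "wrmask" and
 * the alignment into ds_write_b8/b16/b32/b64/b96/b128 instructions, and pairs of equally-sized
 * dword/qword writes are merged into ds_write2 where possible.
 */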
5026 void
5027 store_lds(isel_context* ctx, unsigned elem_size_bytes, Temp data, uint32_t wrmask, Temp address,
5028           unsigned base_offset, unsigned align)
5029 {
5030    assert(util_is_power_of_two_nonzero(align));
5031    assert(util_is_power_of_two_nonzero(elem_size_bytes) && elem_size_bytes <= 8);
5032 
5033    Builder bld(ctx->program, ctx->block);
5034    bool large_ds_write = ctx->options->gfx_level >= GFX7;
5035    bool usable_write2 = ctx->options->gfx_level >= GFX7;
5036 
5037    unsigned write_count = 0;
5038    Temp write_datas[32];
5039    unsigned offsets[32];
5040    unsigned bytes[32];
5041    aco_opcode opcodes[32];
5042 
5043    wrmask = util_widen_mask(wrmask, elem_size_bytes);
5044 
5045    const unsigned wrmask_bitcnt = util_bitcount(wrmask);
5046    uint32_t todo = u_bit_consecutive(0, data.bytes());
5047 
5048    if (u_bit_consecutive(0, wrmask_bitcnt) == wrmask)
5049       todo = MIN2(todo, wrmask);
5050 
5051    while (todo) {
5052       int offset, byte;
5053       if (!scan_write_mask(wrmask, todo, &offset, &byte)) {
5054          offsets[write_count] = offset;
5055          bytes[write_count] = byte;
5056          opcodes[write_count] = aco_opcode::num_opcodes;
5057          write_count++;
5058          advance_write_mask(&todo, offset, byte);
5059          continue;
5060       }
5061 
5062       bool aligned2 = offset % 2 == 0 && align % 2 == 0;
5063       bool aligned4 = offset % 4 == 0 && align % 4 == 0;
5064       bool aligned8 = offset % 8 == 0 && align % 8 == 0;
5065       bool aligned16 = offset % 16 == 0 && align % 16 == 0;
5066 
5067       // TODO: use ds_write_b8_d16_hi/ds_write_b16_d16_hi if beneficial
5068       aco_opcode op = aco_opcode::num_opcodes;
5069       if (byte >= 16 && aligned16 && large_ds_write) {
5070          op = aco_opcode::ds_write_b128;
5071          byte = 16;
5072       } else if (byte >= 12 && aligned16 && large_ds_write) {
5073          op = aco_opcode::ds_write_b96;
5074          byte = 12;
5075       } else if (byte >= 8 && aligned8) {
5076          op = aco_opcode::ds_write_b64;
5077          byte = 8;
5078       } else if (byte >= 4 && aligned4) {
5079          op = aco_opcode::ds_write_b32;
5080          byte = 4;
5081       } else if (byte >= 2 && aligned2) {
5082          op = aco_opcode::ds_write_b16;
5083          byte = 2;
5084       } else if (byte >= 1) {
5085          op = aco_opcode::ds_write_b8;
5086          byte = 1;
5087       } else {
5088          assert(false);
5089       }
5090 
5091       offsets[write_count] = offset;
5092       bytes[write_count] = byte;
5093       opcodes[write_count] = op;
5094       write_count++;
5095       advance_write_mask(&todo, offset, byte);
5096    }
5097 
5098    Operand m = load_lds_size_m0(bld);
5099 
5100    split_store_data(ctx, RegType::vgpr, write_count, write_datas, bytes, data);
5101 
5102    for (unsigned i = 0; i < write_count; i++) {
5103       aco_opcode op = opcodes[i];
5104       if (op == aco_opcode::num_opcodes)
5105          continue;
5106 
5107       Temp split_data = write_datas[i];
5108 
5109       unsigned second = write_count;
5110       if (usable_write2 && (op == aco_opcode::ds_write_b32 || op == aco_opcode::ds_write_b64)) {
5111          for (second = i + 1; second < write_count; second++) {
5112             if (opcodes[second] == op && (offsets[second] - offsets[i]) % split_data.bytes() == 0) {
5113                op = split_data.bytes() == 4 ? aco_opcode::ds_write2_b32 : aco_opcode::ds_write2_b64;
5114                opcodes[second] = aco_opcode::num_opcodes;
5115                break;
5116             }
5117          }
5118       }
5119 
5120       bool write2 = op == aco_opcode::ds_write2_b32 || op == aco_opcode::ds_write2_b64;
5121       unsigned write2_off = (offsets[second] - offsets[i]) / split_data.bytes();
5122 
5123       unsigned inline_offset = base_offset + offsets[i];
5124       unsigned max_offset = write2 ? (255 - write2_off) * split_data.bytes() : 65535;
5125       Temp address_offset = address;
5126       if (inline_offset > max_offset) {
5127          address_offset = bld.vadd32(bld.def(v1), Operand::c32(base_offset), address_offset);
5128          inline_offset = offsets[i];
5129       }
5130 
5131       /* offsets[i] shouldn't be large enough for this to happen */
5132       assert(inline_offset <= max_offset);
5133 
5134       Instruction* instr;
5135       if (write2) {
5136          Temp second_data = write_datas[second];
5137          inline_offset /= split_data.bytes();
5138          instr = bld.ds(op, address_offset, split_data, second_data, m, inline_offset,
5139                         inline_offset + write2_off);
5140       } else {
5141          instr = bld.ds(op, address_offset, split_data, m, inline_offset);
5142       }
5143       instr->ds().sync = memory_sync_info(storage_shared);
5144 
5145       if (m.isUndefined())
5146          instr->operands.pop_back();
5147    }
5148 }
5149 
5150 aco_opcode
5151 get_buffer_store_op(unsigned bytes)
5152 {
5153    switch (bytes) {
5154    case 1: return aco_opcode::buffer_store_byte;
5155    case 2: return aco_opcode::buffer_store_short;
5156    case 4: return aco_opcode::buffer_store_dword;
5157    case 8: return aco_opcode::buffer_store_dwordx2;
5158    case 12: return aco_opcode::buffer_store_dwordx3;
5159    case 16: return aco_opcode::buffer_store_dwordx4;
5160    }
5161    unreachable("Unexpected store size");
5162    return aco_opcode::num_opcodes;
5163 }
5164 
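/* Splits buffer store data according to the write mask, the alignment and the swizzle element
 * size into pieces that a single buffer store instruction can write, returning the split values
 * and their byte offsets.
 */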
5165 void
5166 split_buffer_store(isel_context* ctx, nir_intrinsic_instr* instr, bool smem, RegType dst_type,
5167                    Temp data, unsigned writemask, int swizzle_element_size, unsigned* write_count,
5168                    Temp* write_datas, unsigned* offsets)
5169 {
5170    unsigned write_count_with_skips = 0;
5171    bool skips[16];
5172    unsigned bytes[16];
5173 
5174    /* determine how to split the data */
5175    unsigned todo = u_bit_consecutive(0, data.bytes());
5176    while (todo) {
5177       int offset, byte;
5178       skips[write_count_with_skips] = !scan_write_mask(writemask, todo, &offset, &byte);
5179       offsets[write_count_with_skips] = offset;
5180       if (skips[write_count_with_skips]) {
5181          bytes[write_count_with_skips] = byte;
5182          advance_write_mask(&todo, offset, byte);
5183          write_count_with_skips++;
5184          continue;
5185       }
5186 
5187       /* The only supported sizes are 1, 2, 4, 8, 12 and 16 bytes, and they can't be
5188        * larger than swizzle_element_size. */
5189       byte = MIN2(byte, swizzle_element_size);
5190       if (byte % 4)
5191          byte = byte > 4 ? byte & ~0x3 : MIN2(byte, 2);
5192 
5193       /* SMEM and GFX6 VMEM can't emit 12-byte stores */
5194       if ((ctx->program->gfx_level == GFX6 || smem) && byte == 12)
5195          byte = 8;
5196 
5197       /* dword or larger stores have to be dword-aligned */
5198       unsigned align_mul = instr ? nir_intrinsic_align_mul(instr) : 4;
5199       unsigned align_offset = (instr ? nir_intrinsic_align_offset(instr) : 0) + offset;
5200       bool dword_aligned = align_offset % 4 == 0 && align_mul % 4 == 0;
5201       if (!dword_aligned)
5202          byte = MIN2(byte, (align_offset % 2 == 0 && align_mul % 2 == 0) ? 2 : 1);
5203 
5204       bytes[write_count_with_skips] = byte;
5205       advance_write_mask(&todo, offset, byte);
5206       write_count_with_skips++;
5207    }
5208 
5209    /* actually split data */
5210    split_store_data(ctx, dst_type, write_count_with_skips, write_datas, bytes, data);
5211 
5212    /* remove skips */
5213    for (unsigned i = 0; i < write_count_with_skips; i++) {
5214       if (skips[i])
5215          continue;
5216       write_datas[*write_count] = write_datas[i];
5217       offsets[*write_count] = offsets[i];
5218       (*write_count)++;
5219    }
5220 }
5221 
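/* Creates a vector from "cnt" elements of "arr", substituting zero for unset elements, and
 * records the components in ctx->allocated_vec (through emit_split_vector if split_cnt is set).
 */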
5222 Temp
5223 create_vec_from_array(isel_context* ctx, Temp arr[], unsigned cnt, RegType reg_type,
5224                       unsigned elem_size_bytes, unsigned split_cnt = 0u, Temp dst = Temp())
5225 {
5226    Builder bld(ctx->program, ctx->block);
5227    unsigned dword_size = elem_size_bytes / 4;
5228 
5229    if (!dst.id())
5230       dst = bld.tmp(RegClass(reg_type, cnt * dword_size));
5231 
5232    std::array<Temp, NIR_MAX_VEC_COMPONENTS> allocated_vec;
5233    aco_ptr<Instruction> instr{
5234       create_instruction(aco_opcode::p_create_vector, Format::PSEUDO, cnt, 1)};
5235    instr->definitions[0] = Definition(dst);
5236 
5237    for (unsigned i = 0; i < cnt; ++i) {
5238       if (arr[i].id()) {
5239          assert(arr[i].size() == dword_size);
5240          allocated_vec[i] = arr[i];
5241          instr->operands[i] = Operand(arr[i]);
5242       } else {
5243          Temp zero = bld.copy(bld.def(RegClass(reg_type, dword_size)),
5244                               Operand::zero(dword_size == 2 ? 8 : 4));
5245          allocated_vec[i] = zero;
5246          instr->operands[i] = Operand(zero);
5247       }
5248    }
5249 
5250    bld.insert(std::move(instr));
5251 
5252    if (split_cnt)
5253       emit_split_vector(ctx, dst, split_cnt);
5254    else
5255       ctx->allocated_vec.emplace(dst.id(), allocated_vec); /* emit_split_vector already does this */
5256 
5257    return dst;
5258 }
5259 
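/* Folds the part of a VMEM constant offset that exceeds the 12-bit offset field (multiples of
 * 4096) into voffset and returns the remaining constant offset.
 */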
5260 inline unsigned
5261 resolve_excess_vmem_const_offset(Builder& bld, Temp& voffset, unsigned const_offset)
5262 {
5263    if (const_offset >= 4096) {
5264       unsigned excess_const_offset = const_offset / 4096u * 4096u;
5265       const_offset %= 4096u;
5266 
5267       if (!voffset.id())
5268          voffset = bld.copy(bld.def(v1), Operand::c32(excess_const_offset));
5269       else if (unlikely(voffset.regClass() == s1))
5270          voffset = bld.sop2(aco_opcode::s_add_u32, bld.def(s1), bld.def(s1, scc),
5271                             Operand::c32(excess_const_offset), Operand(voffset));
5272       else if (likely(voffset.regClass() == v1))
5273          voffset = bld.vadd32(bld.def(v1), Operand(voffset), Operand::c32(excess_const_offset));
5274       else
5275          unreachable("Unsupported register class of voffset");
5276    }
5277 
5278    return const_offset;
5279 }
5280 
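/* Records the output components written by this intrinsic in ctx->outputs instead of emitting a
 * store. Returns false for non-constant or non-zero output offsets, which are not supported here.
 */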
5281 bool
5282 store_output_to_temps(isel_context* ctx, nir_intrinsic_instr* instr)
5283 {
5284    unsigned write_mask = nir_intrinsic_write_mask(instr);
5285    unsigned component = nir_intrinsic_component(instr);
5286    nir_src offset = *nir_get_io_offset_src(instr);
5287 
5288    if (!nir_src_is_const(offset) || nir_src_as_uint(offset))
5289       return false;
5290 
5291    Temp src = get_ssa_temp(ctx, instr->src[0].ssa);
5292 
5293    if (instr->src[0].ssa->bit_size == 64)
5294       write_mask = util_widen_mask(write_mask, 2);
5295 
5296    RegClass rc = instr->src[0].ssa->bit_size == 16 ? v2b : v1;
5297 
5298    /* Use the semantic location as the index. radv already uses it as the intrinsic base,
5299     * but radeonsi does not. The LS output and TCS input indices have to match each other,
5300     * so use the semantic location explicitly. The TCS epilog also indexes the tess factor
5301     * temps by semantic location directly.
5302     */
5303    nir_io_semantics sem = nir_intrinsic_io_semantics(instr);
5304    unsigned base = sem.location;
5305    if (ctx->stage == fragment_fs) {
5306       /* The color result is a legacy slot which never appears together with a data
5307        * result. Here we just use the data slot for it to simplify handling of both
5308        * of them.
5309        */
5310       if (base == FRAG_RESULT_COLOR)
5311          base = FRAG_RESULT_DATA0;
5312 
5313       /* The second output of dual-source blending just uses the data1 slot for simplicity,
5314        * because dual-source blending does not support multiple render targets.
5315        */
5316       base += sem.dual_source_blend_index;
5317    }
5318    unsigned idx = base * 4u + component;
5319 
5320    for (unsigned i = 0; i < 8; ++i) {
5321       if (write_mask & (1 << i)) {
5322          ctx->outputs.mask[idx / 4u] |= 1 << (idx % 4u);
5323          ctx->outputs.temps[idx] = emit_extract_vector(ctx, src, i, rc);
5324       }
5325       idx++;
5326    }
5327 
5328    if (ctx->stage == fragment_fs && ctx->program->info.ps.has_epilog && base >= FRAG_RESULT_DATA0) {
5329       unsigned index = base - FRAG_RESULT_DATA0;
5330 
5331       if (nir_intrinsic_src_type(instr) == nir_type_float16) {
5332          ctx->output_color_types |= ACO_TYPE_FLOAT16 << (index * 2);
5333       } else if (nir_intrinsic_src_type(instr) == nir_type_int16) {
5334          ctx->output_color_types |= ACO_TYPE_INT16 << (index * 2);
5335       } else if (nir_intrinsic_src_type(instr) == nir_type_uint16) {
5336          ctx->output_color_types |= ACO_TYPE_UINT16 << (index * 2);
5337       }
5338    }
5339 
5340    return true;
5341 }
5342 
5343 bool
5344 load_input_from_temps(isel_context* ctx, nir_intrinsic_instr* instr, Temp dst)
5345 {
5346    /* Only TCS per-vertex inputs are supported by this function.
5347     * Per-vertex inputs only match between the VS and TCS invocation IDs when the number of
5348     * invocations is the same.
5349     */
5350    if (ctx->shader->info.stage != MESA_SHADER_TESS_CTRL || !ctx->tcs_in_out_eq)
5351       return false;
5352 
5353    /* This can only be indexing with invocation_id because all other access has been lowered
5354     * to load_shared.
5355     */
5356    nir_src* off_src = nir_get_io_offset_src(instr);
5357    assert(nir_src_is_const(*off_src));
5358 
5359    nir_io_semantics sem = nir_intrinsic_io_semantics(instr);
5360 
5361    unsigned idx =
5362       sem.location * 4u + nir_intrinsic_component(instr) + 4 * nir_src_as_uint(*off_src);
5363    Temp* src = &ctx->inputs.temps[idx];
5364    create_vec_from_array(ctx, src, dst.size(), dst.regClass().type(), 4u, 0, dst);
5365 
5366    return true;
5367 }
5368 
5369 void
5370 visit_store_output(isel_context* ctx, nir_intrinsic_instr* instr)
5371 {
5372    /* The LS passes its outputs to the TCS via temps if both have the same in/out patch size. */
5373    bool ls_need_output = ctx->stage == vertex_tess_control_hs &&
5374                          ctx->shader->info.stage == MESA_SHADER_VERTEX && ctx->tcs_in_out_eq;
5375 
5376    bool ps_need_output = ctx->stage == fragment_fs;
5377 
5378    if (ls_need_output || ps_need_output) {
5379       bool stored_to_temps = store_output_to_temps(ctx, instr);
5380       if (!stored_to_temps) {
5381          isel_err(instr->src[1].ssa->parent_instr, "Unimplemented output offset instruction");
5382          abort();
5383       }
5384    } else {
5385       unreachable("Shader stage not implemented");
5386    }
5387 }
5388 
5389 bool
5390 in_exec_divergent_or_in_loop(isel_context* ctx)
5391 {
5392    return ctx->block->loop_nest_depth || ctx->cf_info.parent_if.is_divergent ||
5393           ctx->cf_info.had_divergent_discard;
5394 }
5395 
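/* GFX11+ input interpolation: loads the per-vertex parameters with lds_param_load and
 * interpolates them with v_interp_*_inreg. Inside divergent control flow or loops this is
 * deferred to the p_interp_gfx11 pseudo instruction, which is lowered later.
 */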
5396 void
5397 emit_interp_instr_gfx11(isel_context* ctx, unsigned idx, unsigned component, Temp src, Temp dst,
5398                         Temp prim_mask, bool high_16bits)
5399 {
5400    Temp coord1 = emit_extract_vector(ctx, src, 0, v1);
5401    Temp coord2 = emit_extract_vector(ctx, src, 1, v1);
5402 
5403    Builder bld(ctx->program, ctx->block);
5404 
5405    if (in_exec_divergent_or_in_loop(ctx)) {
5406       bld.pseudo(aco_opcode::p_interp_gfx11, Definition(dst), Operand(v1.as_linear()),
5407                  Operand::c32(idx), Operand::c32(component), Operand::c32(high_16bits), coord1,
5408                  coord2, bld.m0(prim_mask));
5409       return;
5410    }
5411 
5412    Temp p = bld.ldsdir(aco_opcode::lds_param_load, bld.def(v1), bld.m0(prim_mask), idx, component);
5413 
5414    Temp res;
5415    if (dst.regClass() == v2b) {
5416       Temp p10 = bld.vinterp_inreg(aco_opcode::v_interp_p10_f16_f32_inreg, bld.def(v1), p, coord1,
5417                                    p, high_16bits ? 0x5 : 0);
5418       bld.vinterp_inreg(aco_opcode::v_interp_p2_f16_f32_inreg, Definition(dst), p, coord2, p10,
5419                         high_16bits ? 0x1 : 0);
5420    } else {
5421       Temp p10 = bld.vinterp_inreg(aco_opcode::v_interp_p10_f32_inreg, bld.def(v1), p, coord1, p);
5422       bld.vinterp_inreg(aco_opcode::v_interp_p2_f32_inreg, Definition(dst), p, coord2, p10);
5423    }
5424    /* lds_param_load must be done in WQM, and the result kept valid for helper lanes. */
5425    set_wqm(ctx, true);
5426 }
5427 
5428 void
5429 emit_interp_instr(isel_context* ctx, unsigned idx, unsigned component, Temp src, Temp dst,
5430                   Temp prim_mask, bool high_16bits)
5431 {
5432    if (ctx->options->gfx_level >= GFX11) {
5433       emit_interp_instr_gfx11(ctx, idx, component, src, dst, prim_mask, high_16bits);
5434       return;
5435    }
5436 
5437    Temp coord1 = emit_extract_vector(ctx, src, 0, v1);
5438    Temp coord2 = emit_extract_vector(ctx, src, 1, v1);
5439 
5440    Builder bld(ctx->program, ctx->block);
5441 
5442    if (dst.regClass() == v2b) {
5443       if (ctx->program->dev.has_16bank_lds) {
5444          assert(ctx->options->gfx_level <= GFX8);
5445          Builder::Result interp_p1 =
5446             bld.vintrp(aco_opcode::v_interp_mov_f32, bld.def(v1), Operand::c32(2u) /* P0 */,
5447                        bld.m0(prim_mask), idx, component);
5448          interp_p1 = bld.vintrp(aco_opcode::v_interp_p1lv_f16, bld.def(v1), coord1,
5449                                 bld.m0(prim_mask), interp_p1, idx, component, high_16bits);
5450          bld.vintrp(aco_opcode::v_interp_p2_legacy_f16, Definition(dst), coord2, bld.m0(prim_mask),
5451                     interp_p1, idx, component, high_16bits);
5452       } else {
5453          aco_opcode interp_p2_op = aco_opcode::v_interp_p2_f16;
5454 
5455          if (ctx->options->gfx_level == GFX8)
5456             interp_p2_op = aco_opcode::v_interp_p2_legacy_f16;
5457 
5458          Builder::Result interp_p1 = bld.vintrp(aco_opcode::v_interp_p1ll_f16, bld.def(v1), coord1,
5459                                                 bld.m0(prim_mask), idx, component, high_16bits);
5460          bld.vintrp(interp_p2_op, Definition(dst), coord2, bld.m0(prim_mask), interp_p1, idx,
5461                     component, high_16bits);
5462       }
5463    } else {
5464       assert(!high_16bits);
5465       Temp interp_p1 = bld.vintrp(aco_opcode::v_interp_p1_f32, bld.def(v1), coord1,
5466                                   bld.m0(prim_mask), idx, component);
5467 
5468       bld.vintrp(aco_opcode::v_interp_p2_f32, Definition(dst), coord2, bld.m0(prim_mask), interp_p1,
5469                  idx, component);
5470    }
5471 }
5472 
5473 void
5474 emit_interp_mov_instr(isel_context* ctx, unsigned idx, unsigned component, unsigned vertex_id,
5475                       Temp dst, Temp prim_mask, bool high_16bits)
5476 {
5477    Builder bld(ctx->program, ctx->block);
5478    Temp tmp = dst.bytes() == 2 ? bld.tmp(v1) : dst;
5479    if (ctx->options->gfx_level >= GFX11) {
5480       uint16_t dpp_ctrl = dpp_quad_perm(vertex_id, vertex_id, vertex_id, vertex_id);
5481       if (in_exec_divergent_or_in_loop(ctx)) {
5482          bld.pseudo(aco_opcode::p_interp_gfx11, Definition(tmp), Operand(v1.as_linear()),
5483                     Operand::c32(idx), Operand::c32(component), Operand::c32(dpp_ctrl),
5484                     bld.m0(prim_mask));
5485       } else {
5486          Temp p =
5487             bld.ldsdir(aco_opcode::lds_param_load, bld.def(v1), bld.m0(prim_mask), idx, component);
5488          bld.vop1_dpp(aco_opcode::v_mov_b32, Definition(tmp), p, dpp_ctrl);
5489          /* lds_param_load must be done in WQM, and the result kept valid for helper lanes. */
5490          set_wqm(ctx, true);
5491       }
5492    } else {
5493       bld.vintrp(aco_opcode::v_interp_mov_f32, Definition(tmp), Operand::c32((vertex_id + 2) % 3),
5494                  bld.m0(prim_mask), idx, component);
5495    }
5496 
5497    if (dst.id() != tmp.id())
5498       emit_extract_vector(ctx, tmp, high_16bits, dst);
5499 }
5500 
5501 void
5502 visit_load_interpolated_input(isel_context* ctx, nir_intrinsic_instr* instr)
5503 {
5504    Temp dst = get_ssa_temp(ctx, &instr->def);
5505    Temp coords = get_ssa_temp(ctx, instr->src[0].ssa);
5506    unsigned idx = nir_intrinsic_base(instr);
5507    unsigned component = nir_intrinsic_component(instr);
5508    bool high_16bits = nir_intrinsic_io_semantics(instr).high_16bits;
5509    Temp prim_mask = get_arg(ctx, ctx->args->prim_mask);
5510 
5511    assert(nir_src_is_const(instr->src[1]) && !nir_src_as_uint(instr->src[1]));
5512 
5513    if (instr->def.num_components == 1) {
5514       emit_interp_instr(ctx, idx, component, coords, dst, prim_mask, high_16bits);
5515    } else {
5516       aco_ptr<Instruction> vec(create_instruction(aco_opcode::p_create_vector, Format::PSEUDO,
5517                                                   instr->def.num_components, 1));
5518       for (unsigned i = 0; i < instr->def.num_components; i++) {
5519          Temp tmp = ctx->program->allocateTmp(instr->def.bit_size == 16 ? v2b : v1);
5520          emit_interp_instr(ctx, idx, component + i, coords, tmp, prim_mask, high_16bits);
5521          vec->operands[i] = Operand(tmp);
5522       }
5523       vec->definitions[0] = Definition(dst);
5524       ctx->block->instructions.emplace_back(std::move(vec));
5525    }
5526 }
5527 
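/* Callback for emit_load() that emits a single typed buffer load (tbuffer_load_format_*),
 * clamping the number of fetched components to what the vertex format and alignment allow.
 */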
5528 Temp
5529 mtbuf_load_callback(Builder& bld, const LoadEmitInfo& info, Temp offset, unsigned bytes_needed,
5530                     unsigned alignment, unsigned const_offset, Temp dst_hint)
5531 {
5532    Operand vaddr = offset.type() == RegType::vgpr ? Operand(offset) : Operand(v1);
5533    Operand soffset = offset.type() == RegType::sgpr ? Operand(offset) : Operand::c32(0);
5534 
5535    if (info.soffset.id()) {
5536       if (soffset.isTemp())
5537          vaddr = bld.copy(bld.def(v1), soffset);
5538       soffset = Operand(info.soffset);
5539    }
5540 
5541    if (soffset.isUndefined())
5542       soffset = Operand::zero();
5543 
5544    const bool offen = !vaddr.isUndefined();
5545    const bool idxen = info.idx.id();
5546 
5547    if (offen && idxen)
5548       vaddr = bld.pseudo(aco_opcode::p_create_vector, bld.def(v2), info.idx, vaddr);
5549    else if (idxen)
5550       vaddr = Operand(info.idx);
5551 
5552    /* Determine number of fetched components.
5553     * Note that ACO IR works with the GFX6-8 nfmt + dfmt fields; these are converted for GFX10+ later.
5554     */
5555    const struct ac_vtx_format_info* vtx_info =
5556       ac_get_vtx_format_info(GFX8, CHIP_POLARIS10, info.format);
5557    /* The number of channels in the format determines the memory range. */
5558    const unsigned max_components = vtx_info->num_channels;
5559    /* Calculate maximum number of components loaded according to alignment. */
5560    unsigned max_fetched_components = bytes_needed / info.component_size;
5561    max_fetched_components =
5562       ac_get_safe_fetch_size(bld.program->gfx_level, vtx_info, const_offset, max_components,
5563                              alignment, max_fetched_components);
5564    const unsigned fetch_fmt = vtx_info->hw_format[max_fetched_components - 1];
5565    /* Adjust bytes needed in case we need to do a smaller load due to alignment.
5566     * If a larger format is selected, it's still OK to load a smaller amount from it.
5567     */
5568    bytes_needed = MIN2(bytes_needed, max_fetched_components * info.component_size);
5569    unsigned bytes_size = 0;
5570    const unsigned bit_size = info.component_size * 8;
5571    aco_opcode op = aco_opcode::num_opcodes;
5572 
5573    if (bytes_needed == 2) {
5574       bytes_size = 2;
5575       op = aco_opcode::tbuffer_load_format_d16_x;
5576    } else if (bytes_needed <= 4) {
5577       bytes_size = 4;
5578       if (bit_size == 16)
5579          op = aco_opcode::tbuffer_load_format_d16_xy;
5580       else
5581          op = aco_opcode::tbuffer_load_format_x;
5582    } else if (bytes_needed <= 6) {
5583       bytes_size = 6;
5584       if (bit_size == 16)
5585          op = aco_opcode::tbuffer_load_format_d16_xyz;
5586       else
5587          op = aco_opcode::tbuffer_load_format_xy;
5588    } else if (bytes_needed <= 8) {
5589       bytes_size = 8;
5590       if (bit_size == 16)
5591          op = aco_opcode::tbuffer_load_format_d16_xyzw;
5592       else
5593          op = aco_opcode::tbuffer_load_format_xy;
5594    } else if (bytes_needed <= 12) {
5595       bytes_size = 12;
5596       op = aco_opcode::tbuffer_load_format_xyz;
5597    } else {
5598       bytes_size = 16;
5599       op = aco_opcode::tbuffer_load_format_xyzw;
5600    }
5601 
5602    /* Abort when a suitable opcode wasn't found so we don't compile buggy shaders. */
5603    if (op == aco_opcode::num_opcodes) {
5604       aco_err(bld.program, "unsupported bit size for typed buffer load");
5605       abort();
5606    }
5607 
5608    aco_ptr<Instruction> mtbuf{create_instruction(op, Format::MTBUF, 3, 1)};
5609    mtbuf->operands[0] = Operand(info.resource);
5610    mtbuf->operands[1] = vaddr;
5611    mtbuf->operands[2] = soffset;
5612    mtbuf->mtbuf().offen = offen;
5613    mtbuf->mtbuf().idxen = idxen;
5614    mtbuf->mtbuf().cache = info.cache;
5615    mtbuf->mtbuf().sync = info.sync;
5616    mtbuf->mtbuf().offset = const_offset;
5617    mtbuf->mtbuf().dfmt = fetch_fmt & 0xf;
5618    mtbuf->mtbuf().nfmt = fetch_fmt >> 4;
5619    RegClass rc = RegClass::get(RegType::vgpr, bytes_size);
5620    Temp val = dst_hint.id() && rc == dst_hint.regClass() ? dst_hint : bld.tmp(rc);
5621    mtbuf->definitions[0] = Definition(val);
5622    bld.insert(std::move(mtbuf));
5623 
5624    return val;
5625 }
5626 
5627 const EmitLoadParameters mtbuf_load_params{mtbuf_load_callback, 4096};
5628 
5629 void
5630 visit_load_fs_input(isel_context* ctx, nir_intrinsic_instr* instr)
5631 {
5632    Builder bld(ctx->program, ctx->block);
5633    Temp dst = get_ssa_temp(ctx, &instr->def);
5634    nir_src offset = *nir_get_io_offset_src(instr);
5635 
5636    if (!nir_src_is_const(offset) || nir_src_as_uint(offset))
5637       isel_err(offset.ssa->parent_instr, "Unimplemented non-zero nir_intrinsic_load_input offset");
5638 
5639    Temp prim_mask = get_arg(ctx, ctx->args->prim_mask);
5640 
5641    unsigned idx = nir_intrinsic_base(instr);
5642    unsigned component = nir_intrinsic_component(instr);
5643    bool high_16bits = nir_intrinsic_io_semantics(instr).high_16bits;
5644    unsigned vertex_id = 0; /* P0 */
5645 
5646    if (instr->intrinsic == nir_intrinsic_load_input_vertex)
5647       vertex_id = nir_src_as_uint(instr->src[0]);
5648 
5649    if (instr->def.num_components == 1 && instr->def.bit_size != 64) {
5650       emit_interp_mov_instr(ctx, idx, component, vertex_id, dst, prim_mask, high_16bits);
5651    } else {
5652       unsigned num_components = instr->def.num_components;
5653       if (instr->def.bit_size == 64)
5654          num_components *= 2;
5655       aco_ptr<Instruction> vec{
5656          create_instruction(aco_opcode::p_create_vector, Format::PSEUDO, num_components, 1)};
5657       for (unsigned i = 0; i < num_components; i++) {
5658          unsigned chan_component = (component + i) % 4;
5659          unsigned chan_idx = idx + (component + i) / 4;
5660          vec->operands[i] = Operand(bld.tmp(instr->def.bit_size == 16 ? v2b : v1));
5661          emit_interp_mov_instr(ctx, chan_idx, chan_component, vertex_id, vec->operands[i].getTemp(),
5662                                prim_mask, high_16bits);
5663       }
5664       vec->definitions[0] = Definition(dst);
5665       bld.insert(std::move(vec));
5666    }
5667 }
5668 
5669 void
5670 visit_load_tcs_per_vertex_input(isel_context* ctx, nir_intrinsic_instr* instr)
5671 {
5672    assert(ctx->shader->info.stage == MESA_SHADER_TESS_CTRL);
5673 
5674    Builder bld(ctx->program, ctx->block);
5675    Temp dst = get_ssa_temp(ctx, &instr->def);
5676 
5677    if (load_input_from_temps(ctx, instr, dst))
5678       return;
5679 
5680    unreachable("LDS-based TCS input should have been lowered in NIR.");
5681 }
5682 
5683 void
5684 visit_load_per_vertex_input(isel_context* ctx, nir_intrinsic_instr* instr)
5685 {
5686    switch (ctx->shader->info.stage) {
5687    case MESA_SHADER_TESS_CTRL: visit_load_tcs_per_vertex_input(ctx, instr); break;
5688    default: unreachable("Unimplemented shader stage");
5689    }
5690 }
5691 
5692 ac_hw_cache_flags
5693 get_cache_flags(isel_context* ctx, unsigned access)
5694 {
5695    return ac_get_hw_cache_flags(ctx->program->gfx_level, (gl_access_qualifier)access);
5696 }
5697 
5698 ac_hw_cache_flags
5699 get_atomic_cache_flags(isel_context* ctx, bool return_previous)
5700 {
5701    ac_hw_cache_flags cache = get_cache_flags(ctx, ACCESS_TYPE_ATOMIC);
5702    if (return_previous && ctx->program->gfx_level >= GFX12)
5703       cache.gfx12.temporal_hint |= gfx12_atomic_return;
5704    else if (return_previous)
5705       cache.value |= ac_glc;
5706    return cache;
5707 }
5708 
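/* Loads from a buffer resource, using SMEM when ACCESS_SMEM_AMD is set and MUBUF otherwise. */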
5709 void
5710 load_buffer(isel_context* ctx, unsigned num_components, unsigned component_size, Temp dst,
5711             Temp rsrc, Temp offset, unsigned align_mul, unsigned align_offset,
5712             unsigned access = ACCESS_CAN_REORDER, memory_sync_info sync = memory_sync_info())
5713 {
5714    assert(!(access & ACCESS_SMEM_AMD) || (component_size >= 4));
5715 
5716    Builder bld(ctx->program, ctx->block);
5717 
5718    bool use_smem = access & ACCESS_SMEM_AMD;
5719    if (use_smem) {
5720       offset = bld.as_uniform(offset);
5721    } else {
5722       /* GFX6-7 are affected by a hw bug that prevents address clamping from
5723        * working correctly when the SGPR offset is used.
5724        */
5725       if (offset.type() == RegType::sgpr && ctx->options->gfx_level < GFX8)
5726          offset = as_vgpr(ctx, offset);
5727    }
5728 
5729    LoadEmitInfo info = {Operand(offset), dst, num_components, component_size, rsrc};
5730    info.cache = get_cache_flags(ctx, access | ACCESS_TYPE_LOAD | (use_smem ? ACCESS_TYPE_SMEM : 0));
5731    info.sync = sync;
5732    info.align_mul = align_mul;
5733    info.align_offset = align_offset;
5734    if (use_smem)
5735       emit_load(ctx, bld, info, smem_load_params);
5736    else
5737       emit_load(ctx, bld, info, mubuf_load_params);
5738 }
5739 
5740 void
5741 visit_load_ubo(isel_context* ctx, nir_intrinsic_instr* instr)
5742 {
5743    Temp dst = get_ssa_temp(ctx, &instr->def);
5744    Builder bld(ctx->program, ctx->block);
5745    Temp rsrc = bld.as_uniform(get_ssa_temp(ctx, instr->src[0].ssa));
5746 
5747    unsigned size = instr->def.bit_size / 8;
5748    load_buffer(ctx, instr->num_components, size, dst, rsrc, get_ssa_temp(ctx, instr->src[1].ssa),
5749                nir_intrinsic_align_mul(instr), nir_intrinsic_align_offset(instr),
5750                nir_intrinsic_access(instr) | ACCESS_CAN_REORDER);
5751 }
5752 
5753 void
5754 visit_load_constant(isel_context* ctx, nir_intrinsic_instr* instr)
5755 {
5756    Temp dst = get_ssa_temp(ctx, &instr->def);
5757 
5758    Builder bld(ctx->program, ctx->block);
5759 
5760    uint32_t desc[4];
5761    ac_build_raw_buffer_descriptor(ctx->options->gfx_level, 0, 0, desc);
5762 
5763    unsigned base = nir_intrinsic_base(instr);
5764    unsigned range = nir_intrinsic_range(instr);
5765 
5766    Temp offset = get_ssa_temp(ctx, instr->src[0].ssa);
5767    if (base && offset.type() == RegType::sgpr)
5768       offset = bld.nuw().sop2(aco_opcode::s_add_u32, bld.def(s1), bld.def(s1, scc), offset,
5769                               Operand::c32(base));
5770    else if (base && offset.type() == RegType::vgpr)
5771       offset = bld.vadd32(bld.def(v1), Operand::c32(base), offset);
5772 
5773    Temp rsrc = bld.pseudo(aco_opcode::p_create_vector, bld.def(s4),
5774                           bld.pseudo(aco_opcode::p_constaddr, bld.def(s2), bld.def(s1, scc),
5775                                      Operand::c32(ctx->constant_data_offset)),
5776                           Operand::c32(MIN2(base + range, ctx->shader->constant_data_size)),
5777                           Operand::c32(desc[3]));
5778    unsigned size = instr->def.bit_size / 8;
5779    load_buffer(ctx, instr->num_components, size, dst, rsrc, offset, nir_intrinsic_align_mul(instr),
5780                nir_intrinsic_align_offset(instr), nir_intrinsic_access(instr) | ACCESS_CAN_REORDER);
5781 }
5782 
5783 /* Packs multiple Temps of different sizes into a vector of v1 Temps.
5784  * The byte count of each input Temp must be a multiple of 2.
5785  */
5786 static std::vector<Temp>
5787 emit_pack_v1(isel_context* ctx, const std::vector<Temp>& unpacked)
5788 {
5789    Builder bld(ctx->program, ctx->block);
5790    std::vector<Temp> packed;
5791    Temp low = Temp();
5792    for (Temp tmp : unpacked) {
5793       assert(tmp.bytes() % 2 == 0);
5794       unsigned byte_idx = 0;
5795       while (byte_idx < tmp.bytes()) {
5796          if (low != Temp()) {
5797             Temp high = emit_extract_vector(ctx, tmp, byte_idx / 2, v2b);
5798             Temp dword = bld.pseudo(aco_opcode::p_create_vector, bld.def(v1), low, high);
5799             low = Temp();
5800             packed.push_back(dword);
5801             byte_idx += 2;
5802          } else if (byte_idx % 4 == 0 && (byte_idx + 4) <= tmp.bytes()) {
5803             packed.emplace_back(emit_extract_vector(ctx, tmp, byte_idx / 4, v1));
5804             byte_idx += 4;
5805          } else {
5806             low = emit_extract_vector(ctx, tmp, byte_idx / 2, v2b);
5807             byte_idx += 2;
5808          }
5809       }
5810    }
5811    if (low != Temp()) {
5812       Temp dword = bld.pseudo(aco_opcode::p_create_vector, bld.def(v1), low, Operand(v2b));
5813       packed.push_back(dword);
5814    }
5815    return packed;
5816 }
5817 
5818 static bool
5819 should_declare_array(ac_image_dim dim)
5820 {
5821    return dim == ac_image_cube || dim == ac_image_1darray || dim == ac_image_2darray ||
5822           dim == ac_image_2darraymsaa;
5823 }
5824 
5825 static int
5826 image_type_to_components_count(enum glsl_sampler_dim dim, bool array)
5827 {
5828    switch (dim) {
5829    case GLSL_SAMPLER_DIM_BUF: return 1;
5830    case GLSL_SAMPLER_DIM_1D: return array ? 2 : 1;
5831    case GLSL_SAMPLER_DIM_2D: return array ? 3 : 2;
5832    case GLSL_SAMPLER_DIM_MS: return array ? 3 : 2;
5833    case GLSL_SAMPLER_DIM_3D:
5834    case GLSL_SAMPLER_DIM_CUBE: return 3;
5835    case GLSL_SAMPLER_DIM_RECT:
5836    case GLSL_SAMPLER_DIM_SUBPASS: return 2;
5837    case GLSL_SAMPLER_DIM_SUBPASS_MS: return 2;
5838    default: break;
5839    }
5840    return 0;
5841 }
5842 
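/* Builds a MIMG instruction with the given coordinates as VADDR operands. Trailing coordinates
 * that exceed the target's NSA limit are packed into a single vector operand.
 */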
5843 static MIMG_instruction*
5844 emit_mimg(Builder& bld, aco_opcode op, Temp dst, Temp rsrc, Operand samp, std::vector<Temp> coords,
5845           Operand vdata = Operand(v1))
5846 {
5847    bool is_vsample = !samp.isUndefined() || op == aco_opcode::image_msaa_load;
5848 
5849    size_t nsa_size = bld.program->dev.max_nsa_vgprs;
5850    if (!is_vsample && bld.program->gfx_level >= GFX12)
5851       nsa_size++; /* VIMAGE can encode one more VADDR */
5852    nsa_size = bld.program->gfx_level >= GFX11 || coords.size() <= nsa_size ? nsa_size : 0;
5853 
5854    const bool strict_wqm = coords[0].regClass().is_linear_vgpr();
5855    if (strict_wqm)
5856       nsa_size = coords.size();
5857 
5858    for (unsigned i = 0; i < std::min(coords.size(), nsa_size); i++) {
5859       if (!coords[i].id())
5860          continue;
5861 
5862       coords[i] = as_vgpr(bld, coords[i]);
5863    }
5864 
5865    if (nsa_size < coords.size()) {
5866       Temp coord = coords[nsa_size];
5867       if (coords.size() - nsa_size > 1) {
5868          aco_ptr<Instruction> vec{create_instruction(aco_opcode::p_create_vector, Format::PSEUDO,
5869                                                      coords.size() - nsa_size, 1)};
5870 
5871          unsigned coord_size = 0;
5872          for (unsigned i = nsa_size; i < coords.size(); i++) {
5873             vec->operands[i - nsa_size] = Operand(coords[i]);
5874             coord_size += coords[i].size();
5875          }
5876 
5877          coord = bld.tmp(RegType::vgpr, coord_size);
5878          vec->definitions[0] = Definition(coord);
5879          bld.insert(std::move(vec));
5880       } else {
5881          coord = as_vgpr(bld, coord);
5882       }
5883 
5884       coords[nsa_size] = coord;
5885       coords.resize(nsa_size + 1);
5886    }
5887 
5888    bool has_dst = dst.id() != 0;
5889 
5890    aco_ptr<Instruction> mimg{create_instruction(op, Format::MIMG, 3 + coords.size(), has_dst)};
5891    if (has_dst)
5892       mimg->definitions[0] = Definition(dst);
5893    mimg->operands[0] = Operand(rsrc);
5894    mimg->operands[1] = samp;
5895    mimg->operands[2] = vdata;
5896    for (unsigned i = 0; i < coords.size(); i++)
5897       mimg->operands[3 + i] = Operand(coords[i]);
5898    mimg->mimg().strict_wqm = strict_wqm;
5899 
5900    return &bld.insert(std::move(mimg))->mimg();
5901 }
5902 
5903 void
5904 visit_bvh64_intersect_ray_amd(isel_context* ctx, nir_intrinsic_instr* instr)
5905 {
5906    Builder bld(ctx->program, ctx->block);
5907    Temp dst = get_ssa_temp(ctx, &instr->def);
5908    Temp resource = get_ssa_temp(ctx, instr->src[0].ssa);
5909    Temp node = get_ssa_temp(ctx, instr->src[1].ssa);
5910    Temp tmax = get_ssa_temp(ctx, instr->src[2].ssa);
5911    Temp origin = get_ssa_temp(ctx, instr->src[3].ssa);
5912    Temp dir = get_ssa_temp(ctx, instr->src[4].ssa);
5913    Temp inv_dir = get_ssa_temp(ctx, instr->src[5].ssa);
5914 
5915    /* On GFX11 image_bvh64_intersect_ray has a special vaddr layout with NSA:
5916     * There are five smaller vector groups:
5917     * node_pointer, ray_extent, ray_origin, ray_dir, ray_inv_dir.
5918     * These directly match the NIR intrinsic sources.
5919     */
5920    std::vector<Temp> args = {
5921       node, tmax, origin, dir, inv_dir,
5922    };
5923 
5924    if (bld.program->gfx_level == GFX10_3) {
5925       std::vector<Temp> scalar_args;
5926       for (Temp tmp : args) {
5927          for (unsigned i = 0; i < tmp.size(); i++)
5928             scalar_args.push_back(emit_extract_vector(ctx, tmp, i, v1));
5929       }
5930       args = std::move(scalar_args);
5931    }
5932 
5933    MIMG_instruction* mimg =
5934       emit_mimg(bld, aco_opcode::image_bvh64_intersect_ray, dst, resource, Operand(s4), args);
5935    mimg->dim = ac_image_1d;
5936    mimg->dmask = 0xf;
5937    mimg->unrm = true;
5938    mimg->r128 = true;
5939 
5940    emit_split_vector(ctx, dst, instr->def.num_components);
5941 }
5942 
5943 static std::vector<Temp>
5944 get_image_coords(isel_context* ctx, const nir_intrinsic_instr* instr)
5945 {
5946 
5947    Temp src0 = get_ssa_temp(ctx, instr->src[1].ssa);
5948    bool a16 = instr->src[1].ssa->bit_size == 16;
5949    RegClass rc = a16 ? v2b : v1;
5950    enum glsl_sampler_dim dim = nir_intrinsic_image_dim(instr);
5951    bool is_array = nir_intrinsic_image_array(instr);
5952    ASSERTED bool add_frag_pos =
5953       (dim == GLSL_SAMPLER_DIM_SUBPASS || dim == GLSL_SAMPLER_DIM_SUBPASS_MS);
5954    assert(!add_frag_pos && "Input attachments should be lowered.");
5955    bool is_ms = (dim == GLSL_SAMPLER_DIM_MS || dim == GLSL_SAMPLER_DIM_SUBPASS_MS);
5956    bool gfx9_1d = ctx->options->gfx_level == GFX9 && dim == GLSL_SAMPLER_DIM_1D;
5957    int count = image_type_to_components_count(dim, is_array);
5958    std::vector<Temp> coords;
5959    Builder bld(ctx->program, ctx->block);
5960 
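   /* GFX9 addresses 1D images as 2D: insert a zero y coordinate (the layer, if any, follows it). */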
5961    if (gfx9_1d) {
5962       coords.emplace_back(emit_extract_vector(ctx, src0, 0, rc));
5963       coords.emplace_back(bld.copy(bld.def(rc), Operand::zero(a16 ? 2 : 4)));
5964       if (is_array)
5965          coords.emplace_back(emit_extract_vector(ctx, src0, 1, rc));
5966    } else {
5967       for (int i = 0; i < count; i++)
5968          coords.emplace_back(emit_extract_vector(ctx, src0, i, rc));
5969    }
5970 
5971    bool has_lod = false;
5972    Temp lod;
5973 
5974    if (instr->intrinsic == nir_intrinsic_bindless_image_load ||
5975        instr->intrinsic == nir_intrinsic_bindless_image_sparse_load ||
5976        instr->intrinsic == nir_intrinsic_bindless_image_store) {
5977       int lod_index = instr->intrinsic == nir_intrinsic_bindless_image_store ? 4 : 3;
5978       assert(instr->src[lod_index].ssa->bit_size == (a16 ? 16 : 32));
5979       has_lod =
5980          !nir_src_is_const(instr->src[lod_index]) || nir_src_as_uint(instr->src[lod_index]) != 0;
5981 
5982       if (has_lod)
5983          lod = get_ssa_temp_tex(ctx, instr->src[lod_index].ssa, a16);
5984    }
5985 
5986    if (ctx->program->info.image_2d_view_of_3d && dim == GLSL_SAMPLER_DIM_2D && !is_array) {
5987       /* The hw can't bind a slice of a 3D image as a 2D image, because it
5988        * ignores BASE_ARRAY if the target is 3D. The workaround is to read
5989        * BASE_ARRAY and set it as the 3rd address operand for all 2D images.
5990        */
5991       assert(ctx->options->gfx_level == GFX9);
5992       Temp rsrc = bld.as_uniform(get_ssa_temp(ctx, instr->src[0].ssa));
5993       Temp rsrc_word5 = emit_extract_vector(ctx, rsrc, 5, v1);
5994       /* Extract the BASE_ARRAY field [0:12] from the descriptor. */
5995       Temp first_layer = bld.vop3(aco_opcode::v_bfe_u32, bld.def(v1), rsrc_word5, Operand::c32(0u),
5996                                   Operand::c32(13u));
5997 
5998       if (has_lod) {
5999          /* If there's a lod parameter, it matters whether the image is 3D or 2D,
6000           * because the hw reads the lod from the fourth or the third component
6001           * respectively. So detect 3D images and otherwise place the lod at the
6002           * third component. For non-3D descriptors we effectively add the lod to
6003           * the coords twice, but the hw only reads the first one; the second is ignored.
6004           */
6005          Temp rsrc_word3 = emit_extract_vector(ctx, rsrc, 3, s1);
6006          Temp type = bld.sop2(aco_opcode::s_bfe_u32, bld.def(s1), bld.def(s1, scc), rsrc_word3,
6007                               Operand::c32(28 | (4 << 16))); /* extract last 4 bits */
6008          Temp is_3d = bld.vopc_e64(aco_opcode::v_cmp_eq_u32, bld.def(bld.lm), type,
6009                                    Operand::c32(V_008F1C_SQ_RSRC_IMG_3D));
6010          first_layer =
6011             bld.vop2(aco_opcode::v_cndmask_b32, bld.def(v1), as_vgpr(ctx, lod), first_layer, is_3d);
6012       }
6013 
6014       if (a16)
6015          coords.emplace_back(emit_extract_vector(ctx, first_layer, 0, v2b));
6016       else
6017          coords.emplace_back(first_layer);
6018    }
6019 
6020    if (is_ms && instr->intrinsic != nir_intrinsic_bindless_image_fragment_mask_load_amd) {
6021       assert(instr->src[2].ssa->bit_size == (a16 ? 16 : 32));
6022       coords.emplace_back(get_ssa_temp_tex(ctx, instr->src[2].ssa, a16));
6023    }
6024 
6025    if (has_lod)
6026       coords.emplace_back(lod);
6027 
6028    return emit_pack_v1(ctx, coords);
6029 }
6030 
6031 memory_sync_info
6032 get_memory_sync_info(nir_intrinsic_instr* instr, storage_class storage, unsigned semantics)
6033 {
6034    /* atomicrmw might not have NIR_INTRINSIC_ACCESS and there's nothing interesting there anyway */
6035    if (semantics & semantic_atomicrmw)
6036       return memory_sync_info(storage, semantics);
6037 
6038    unsigned access = nir_intrinsic_access(instr);
6039 
6040    if (access & ACCESS_VOLATILE)
6041       semantics |= semantic_volatile;
6042    if (access & ACCESS_CAN_REORDER)
6043       semantics |= semantic_can_reorder | semantic_private;
6044 
6045    return memory_sync_info(storage, semantics);
6046 }
6047 
6048 Operand
6049 emit_tfe_init(Builder& bld, Temp dst)
6050 {
6051    Temp tmp = bld.tmp(dst.regClass());
6052 
6053    aco_ptr<Instruction> vec{
6054       create_instruction(aco_opcode::p_create_vector, Format::PSEUDO, dst.size(), 1)};
6055    for (unsigned i = 0; i < dst.size(); i++)
6056       vec->operands[i] = Operand::zero();
6057    vec->definitions[0] = Definition(tmp);
6058    /* Since this is fixed to an instruction's definition register, any CSE will
6059     * just create copies. Copying costs about the same as zero-initialization,
6060     * but these copies can break up clauses.
6061     */
6062    vec->definitions[0].setNoCSE(true);
6063    bld.insert(std::move(vec));
6064 
6065    return Operand(tmp);
6066 }
6067 
6068 void
6069 visit_image_load(isel_context* ctx, nir_intrinsic_instr* instr)
6070 {
6071    Builder bld(ctx->program, ctx->block);
6072    const enum glsl_sampler_dim dim = nir_intrinsic_image_dim(instr);
6073    bool is_array = nir_intrinsic_image_array(instr);
6074    bool is_sparse = instr->intrinsic == nir_intrinsic_bindless_image_sparse_load;
6075    Temp dst = get_ssa_temp(ctx, &instr->def);
6076 
6077    memory_sync_info sync = get_memory_sync_info(instr, storage_image, 0);
6078 
6079    unsigned result_size = instr->def.num_components - is_sparse;
6080    unsigned expand_mask = nir_def_components_read(&instr->def) & u_bit_consecutive(0, result_size);
6081    expand_mask = MAX2(expand_mask, 1); /* this can be zero in the case of sparse image loads */
6082    if (dim == GLSL_SAMPLER_DIM_BUF)
6083       expand_mask = (1u << util_last_bit(expand_mask)) - 1u;
6084    unsigned dmask = expand_mask;
6085    if (instr->def.bit_size == 64) {
6086       expand_mask &= 0x9;
6087       /* only R64_UINT and R64_SINT supported. x is in xy of the result, w in zw */
6088       dmask = ((expand_mask & 0x1) ? 0x3 : 0) | ((expand_mask & 0x8) ? 0xc : 0);
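      /* e.g. reading both the x and w components of a 64-bit result gives expand_mask 0x9 and dmask 0xf. */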
6089    }
6090    if (is_sparse)
6091       expand_mask |= 1 << result_size;
6092 
6093    bool d16 = instr->def.bit_size == 16;
6094    assert(!d16 || !is_sparse);
6095 
6096    unsigned num_bytes = util_bitcount(dmask) * (d16 ? 2 : 4) + is_sparse * 4;
6097 
6098    Temp tmp;
6099    if (num_bytes == dst.bytes() && dst.type() == RegType::vgpr)
6100       tmp = dst;
6101    else
6102       tmp = bld.tmp(RegClass::get(RegType::vgpr, num_bytes));
6103 
6104    Temp resource = bld.as_uniform(get_ssa_temp(ctx, instr->src[0].ssa));
6105 
6106    if (dim == GLSL_SAMPLER_DIM_BUF) {
6107       Temp vindex = emit_extract_vector(ctx, get_ssa_temp(ctx, instr->src[1].ssa), 0, v1);
6108 
6109       aco_opcode opcode;
6110       if (!d16) {
6111          switch (util_bitcount(dmask)) {
6112          case 1: opcode = aco_opcode::buffer_load_format_x; break;
6113          case 2: opcode = aco_opcode::buffer_load_format_xy; break;
6114          case 3: opcode = aco_opcode::buffer_load_format_xyz; break;
6115          case 4: opcode = aco_opcode::buffer_load_format_xyzw; break;
6116          default: unreachable(">4 channel buffer image load");
6117          }
6118       } else {
6119          switch (util_bitcount(dmask)) {
6120          case 1: opcode = aco_opcode::buffer_load_format_d16_x; break;
6121          case 2: opcode = aco_opcode::buffer_load_format_d16_xy; break;
6122          case 3: opcode = aco_opcode::buffer_load_format_d16_xyz; break;
6123          case 4: opcode = aco_opcode::buffer_load_format_d16_xyzw; break;
6124          default: unreachable(">4 channel buffer image load");
6125          }
6126       }
6127       aco_ptr<Instruction> load{create_instruction(opcode, Format::MUBUF, 3 + is_sparse, 1)};
6128       load->operands[0] = Operand(resource);
6129       load->operands[1] = Operand(vindex);
6130       load->operands[2] = Operand::c32(0);
6131       load->definitions[0] = Definition(tmp);
6132       load->mubuf().idxen = true;
6133       load->mubuf().cache = get_cache_flags(ctx, nir_intrinsic_access(instr) | ACCESS_TYPE_LOAD);
6134       load->mubuf().sync = sync;
6135       load->mubuf().tfe = is_sparse;
6136       if (load->mubuf().tfe)
6137          load->operands[3] = emit_tfe_init(bld, tmp);
6138       ctx->block->instructions.emplace_back(std::move(load));
6139    } else {
6140       std::vector<Temp> coords = get_image_coords(ctx, instr);
6141 
6142       aco_opcode opcode;
6143       if (instr->intrinsic == nir_intrinsic_bindless_image_fragment_mask_load_amd) {
6144          opcode = aco_opcode::image_load;
6145       } else {
6146          bool level_zero = nir_src_is_const(instr->src[3]) && nir_src_as_uint(instr->src[3]) == 0;
6147          opcode = level_zero ? aco_opcode::image_load : aco_opcode::image_load_mip;
6148       }
6149 
6150       Operand vdata = is_sparse ? emit_tfe_init(bld, tmp) : Operand(v1);
6151       MIMG_instruction* load = emit_mimg(bld, opcode, tmp, resource, Operand(s4), coords, vdata);
6152       load->cache = get_cache_flags(ctx, nir_intrinsic_access(instr) | ACCESS_TYPE_LOAD);
6153       load->a16 = instr->src[1].ssa->bit_size == 16;
6154       load->d16 = d16;
6155       load->dmask = dmask;
6156       load->unrm = true;
6157       load->tfe = is_sparse;
6158 
6159       if (instr->intrinsic == nir_intrinsic_bindless_image_fragment_mask_load_amd) {
6160          load->dim = is_array ? ac_image_2darray : ac_image_2d;
6161          load->da = is_array;
6162          load->sync = memory_sync_info();
6163       } else {
6164          ac_image_dim sdim = ac_get_image_dim(ctx->options->gfx_level, dim, is_array);
6165          load->dim = sdim;
6166          load->da = should_declare_array(sdim);
6167          load->sync = sync;
6168       }
6169    }
6170 
6171    if (is_sparse && instr->def.bit_size == 64) {
6172       /* The result components are 64-bit but the sparse residency code is
6173        * 32-bit. So add a zero to the end so expand_vector() works correctly.
6174        */
6175       tmp = bld.pseudo(aco_opcode::p_create_vector, bld.def(RegType::vgpr, tmp.size() + 1), tmp,
6176                        Operand::zero());
6177    }
6178 
6179    expand_vector(ctx, tmp, dst, instr->def.num_components, expand_mask, instr->def.bit_size == 64);
6180 }
6181 
6182 void
6183 visit_image_store(isel_context* ctx, nir_intrinsic_instr* instr)
6184 {
6185    Builder bld(ctx->program, ctx->block);
6186    const enum glsl_sampler_dim dim = nir_intrinsic_image_dim(instr);
6187    bool is_array = nir_intrinsic_image_array(instr);
6188    Temp data = get_ssa_temp(ctx, instr->src[3].ssa);
6189    bool d16 = instr->src[3].ssa->bit_size == 16;
6190 
6191    /* only R64_UINT and R64_SINT supported */
6192    if (instr->src[3].ssa->bit_size == 64 && data.bytes() > 8)
6193       data = emit_extract_vector(ctx, data, 0, RegClass(data.type(), 2));
6194    data = as_vgpr(ctx, data);
6195 
6196    uint32_t num_components = d16 ? instr->src[3].ssa->num_components : data.size();
6197 
6198    memory_sync_info sync = get_memory_sync_info(instr, storage_image, 0);
6199    unsigned access = nir_intrinsic_access(instr);
6200    ac_hw_cache_flags cache =
6201       get_cache_flags(ctx, access | ACCESS_TYPE_STORE | ACCESS_MAY_STORE_SUBDWORD);
6202 
6203    uint32_t dmask = BITFIELD_MASK(num_components);
6204    if (instr->src[3].ssa->bit_size == 32 || instr->src[3].ssa->bit_size == 16) {
6205       for (uint32_t i = 0; i < instr->num_components; i++) {
6206          /* components not in dmask receive:
6207           * GFX6-11.5:  zero
6208           * GFX12+: first component in dmask
6209           */
6210          nir_scalar comp = nir_scalar_resolved(instr->src[3].ssa, i);
6211          if (nir_scalar_is_undef(comp)) {
6212             dmask &= ~BITFIELD_BIT(i);
6213          } else if (ctx->options->gfx_level <= GFX11_5) {
6214             if (nir_scalar_is_const(comp) && nir_scalar_as_uint(comp) == 0)
6215                dmask &= ~BITFIELD_BIT(i);
6216          } else {
6217             unsigned first = dim == GLSL_SAMPLER_DIM_BUF ? 0 : ffs(dmask) - 1;
6218             if (i != first && nir_scalar_equal(nir_scalar_resolved(instr->src[3].ssa, first), comp))
6219                dmask &= ~BITFIELD_BIT(i);
6220          }
6221       }
6222 
6223       /* dmask cannot be 0, at least one vgpr is always read */
6224       if (dmask == 0)
6225          dmask = 1;
6226       /* buffer store only supports consecutive components. */
6227       if (dim == GLSL_SAMPLER_DIM_BUF)
6228          dmask = BITFIELD_MASK(util_last_bit(dmask));
6229 
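      /* If any components were dropped from dmask, repack the remaining ones into a contiguous vector. */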
6230       if (dmask != BITFIELD_MASK(num_components)) {
6231          uint32_t dmask_count = util_bitcount(dmask);
6232          RegClass rc = d16 ? v2b : v1;
6233          if (dmask_count == 1) {
6234             data = emit_extract_vector(ctx, data, ffs(dmask) - 1, rc);
6235          } else {
6236             aco_ptr<Instruction> vec{
6237                create_instruction(aco_opcode::p_create_vector, Format::PSEUDO, dmask_count, 1)};
6238             uint32_t index = 0;
6239             u_foreach_bit (bit, dmask) {
6240                vec->operands[index++] = Operand(emit_extract_vector(ctx, data, bit, rc));
6241             }
6242             data = bld.tmp(RegClass::get(RegType::vgpr, dmask_count * rc.bytes()));
6243             vec->definitions[0] = Definition(data);
6244             bld.insert(std::move(vec));
6245          }
6246       }
6247    }
6248 
6249    if (dim == GLSL_SAMPLER_DIM_BUF) {
6250       Temp rsrc = bld.as_uniform(get_ssa_temp(ctx, instr->src[0].ssa));
6251       Temp vindex = emit_extract_vector(ctx, get_ssa_temp(ctx, instr->src[1].ssa), 0, v1);
6252       aco_opcode opcode;
6253       if (!d16) {
6254          switch (dmask) {
6255          case 0x1: opcode = aco_opcode::buffer_store_format_x; break;
6256          case 0x3: opcode = aco_opcode::buffer_store_format_xy; break;
6257          case 0x7: opcode = aco_opcode::buffer_store_format_xyz; break;
6258          case 0xf: opcode = aco_opcode::buffer_store_format_xyzw; break;
6259          default: unreachable(">4 channel buffer image store");
6260          }
6261       } else {
6262          switch (dmask) {
6263          case 0x1: opcode = aco_opcode::buffer_store_format_d16_x; break;
6264          case 0x3: opcode = aco_opcode::buffer_store_format_d16_xy; break;
6265          case 0x7: opcode = aco_opcode::buffer_store_format_d16_xyz; break;
6266          case 0xf: opcode = aco_opcode::buffer_store_format_d16_xyzw; break;
6267          default: unreachable(">4 channel buffer image store");
6268          }
6269       }
6270       aco_ptr<Instruction> store{create_instruction(opcode, Format::MUBUF, 4, 0)};
6271       store->operands[0] = Operand(rsrc);
6272       store->operands[1] = Operand(vindex);
6273       store->operands[2] = Operand::c32(0);
6274       store->operands[3] = Operand(data);
6275       store->mubuf().idxen = true;
6276       store->mubuf().cache = cache;
6277       store->mubuf().disable_wqm = true;
6278       store->mubuf().sync = sync;
6279       ctx->program->needs_exact = true;
6280       ctx->block->instructions.emplace_back(std::move(store));
6281       return;
6282    }
6283 
6284    assert(data.type() == RegType::vgpr);
6285    std::vector<Temp> coords = get_image_coords(ctx, instr);
6286    Temp resource = bld.as_uniform(get_ssa_temp(ctx, instr->src[0].ssa));
6287 
6288    bool level_zero = nir_src_is_const(instr->src[4]) && nir_src_as_uint(instr->src[4]) == 0;
6289    aco_opcode opcode = level_zero ? aco_opcode::image_store : aco_opcode::image_store_mip;
6290 
6291    MIMG_instruction* store =
6292       emit_mimg(bld, opcode, Temp(0, v1), resource, Operand(s4), coords, Operand(data));
6293    store->cache = cache;
6294    store->a16 = instr->src[1].ssa->bit_size == 16;
6295    store->d16 = d16;
6296    store->dmask = dmask;
6297    store->unrm = true;
6298    ac_image_dim sdim = ac_get_image_dim(ctx->options->gfx_level, dim, is_array);
6299    store->dim = sdim;
6300    store->da = should_declare_array(sdim);
6301    store->disable_wqm = true;
6302    store->sync = sync;
6303    ctx->program->needs_exact = true;
6304    return;
6305 }
6306 
6307 void
6308 translate_buffer_image_atomic_op(const nir_atomic_op op, aco_opcode* buf_op, aco_opcode* buf_op64,
6309                                  aco_opcode* image_op)
6310 {
6311    switch (op) {
6312    case nir_atomic_op_iadd:
6313       *buf_op = aco_opcode::buffer_atomic_add;
6314       *buf_op64 = aco_opcode::buffer_atomic_add_x2;
6315       *image_op = aco_opcode::image_atomic_add;
6316       break;
6317    case nir_atomic_op_umin:
6318       *buf_op = aco_opcode::buffer_atomic_umin;
6319       *buf_op64 = aco_opcode::buffer_atomic_umin_x2;
6320       *image_op = aco_opcode::image_atomic_umin;
6321       break;
6322    case nir_atomic_op_imin:
6323       *buf_op = aco_opcode::buffer_atomic_smin;
6324       *buf_op64 = aco_opcode::buffer_atomic_smin_x2;
6325       *image_op = aco_opcode::image_atomic_smin;
6326       break;
6327    case nir_atomic_op_umax:
6328       *buf_op = aco_opcode::buffer_atomic_umax;
6329       *buf_op64 = aco_opcode::buffer_atomic_umax_x2;
6330       *image_op = aco_opcode::image_atomic_umax;
6331       break;
6332    case nir_atomic_op_imax:
6333       *buf_op = aco_opcode::buffer_atomic_smax;
6334       *buf_op64 = aco_opcode::buffer_atomic_smax_x2;
6335       *image_op = aco_opcode::image_atomic_smax;
6336       break;
6337    case nir_atomic_op_iand:
6338       *buf_op = aco_opcode::buffer_atomic_and;
6339       *buf_op64 = aco_opcode::buffer_atomic_and_x2;
6340       *image_op = aco_opcode::image_atomic_and;
6341       break;
6342    case nir_atomic_op_ior:
6343       *buf_op = aco_opcode::buffer_atomic_or;
6344       *buf_op64 = aco_opcode::buffer_atomic_or_x2;
6345       *image_op = aco_opcode::image_atomic_or;
6346       break;
6347    case nir_atomic_op_ixor:
6348       *buf_op = aco_opcode::buffer_atomic_xor;
6349       *buf_op64 = aco_opcode::buffer_atomic_xor_x2;
6350       *image_op = aco_opcode::image_atomic_xor;
6351       break;
6352    case nir_atomic_op_xchg:
6353       *buf_op = aco_opcode::buffer_atomic_swap;
6354       *buf_op64 = aco_opcode::buffer_atomic_swap_x2;
6355       *image_op = aco_opcode::image_atomic_swap;
6356       break;
6357    case nir_atomic_op_cmpxchg:
6358       *buf_op = aco_opcode::buffer_atomic_cmpswap;
6359       *buf_op64 = aco_opcode::buffer_atomic_cmpswap_x2;
6360       *image_op = aco_opcode::image_atomic_cmpswap;
6361       break;
6362    case nir_atomic_op_inc_wrap:
6363       *buf_op = aco_opcode::buffer_atomic_inc;
6364       *buf_op64 = aco_opcode::buffer_atomic_inc_x2;
6365       *image_op = aco_opcode::image_atomic_inc;
6366       break;
6367    case nir_atomic_op_dec_wrap:
6368       *buf_op = aco_opcode::buffer_atomic_dec;
6369       *buf_op64 = aco_opcode::buffer_atomic_dec_x2;
6370       *image_op = aco_opcode::image_atomic_dec;
6371       break;
6372    case nir_atomic_op_fadd:
6373       *buf_op = aco_opcode::buffer_atomic_add_f32;
6374       *buf_op64 = aco_opcode::num_opcodes;
6375       *image_op = aco_opcode::num_opcodes;
6376       break;
6377    case nir_atomic_op_fmin:
6378       *buf_op = aco_opcode::buffer_atomic_fmin;
6379       *buf_op64 = aco_opcode::buffer_atomic_fmin_x2;
6380       *image_op = aco_opcode::image_atomic_fmin;
6381       break;
6382    case nir_atomic_op_fmax:
6383       *buf_op = aco_opcode::buffer_atomic_fmax;
6384       *buf_op64 = aco_opcode::buffer_atomic_fmax_x2;
6385       *image_op = aco_opcode::image_atomic_fmax;
6386       break;
6387    default: unreachable("unsupported atomic operation");
6388    }
6389 }
6390 
6391 void
6392 visit_image_atomic(isel_context* ctx, nir_intrinsic_instr* instr)
6393 {
6394    bool return_previous = !nir_def_is_unused(&instr->def);
6395    const enum glsl_sampler_dim dim = nir_intrinsic_image_dim(instr);
6396    bool is_array = nir_intrinsic_image_array(instr);
6397    Builder bld(ctx->program, ctx->block);
6398 
6399    const nir_atomic_op op = nir_intrinsic_atomic_op(instr);
6400    const bool cmpswap = op == nir_atomic_op_cmpxchg;
6401 
6402    aco_opcode buf_op, buf_op64, image_op;
6403    translate_buffer_image_atomic_op(op, &buf_op, &buf_op64, &image_op);
6404 
6405    Temp data = as_vgpr(ctx, get_ssa_temp(ctx, instr->src[3].ssa));
6406    bool is_64bit = data.bytes() == 8;
6407    assert((data.bytes() == 4 || data.bytes() == 8) && "only 32/64-bit image atomics implemented.");
6408 
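   /* cmpswap needs both the compare value and the new value packed into one data vector. */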
6409    if (cmpswap)
6410       data = bld.pseudo(aco_opcode::p_create_vector, bld.def(is_64bit ? v4 : v2),
6411                         get_ssa_temp(ctx, instr->src[4].ssa), data);
6412 
6413    Temp dst = get_ssa_temp(ctx, &instr->def);
6414    memory_sync_info sync = get_memory_sync_info(instr, storage_image, semantic_atomicrmw);
6415 
6416    if (dim == GLSL_SAMPLER_DIM_BUF) {
6417       Temp vindex = emit_extract_vector(ctx, get_ssa_temp(ctx, instr->src[1].ssa), 0, v1);
6418       Temp resource = bld.as_uniform(get_ssa_temp(ctx, instr->src[0].ssa));
6419       // assert(ctx->options->gfx_level < GFX9 && "GFX9 stride size workaround not yet
6420       // implemented.");
6421       aco_ptr<Instruction> mubuf{create_instruction(is_64bit ? buf_op64 : buf_op, Format::MUBUF, 4,
6422                                                     return_previous ? 1 : 0)};
6423       mubuf->operands[0] = Operand(resource);
6424       mubuf->operands[1] = Operand(vindex);
6425       mubuf->operands[2] = Operand::c32(0);
6426       mubuf->operands[3] = Operand(data);
6427       Definition def =
6428          return_previous ? (cmpswap ? bld.def(data.regClass()) : Definition(dst)) : Definition();
6429       if (return_previous)
6430          mubuf->definitions[0] = def;
6431       mubuf->mubuf().offset = 0;
6432       mubuf->mubuf().idxen = true;
6433       mubuf->mubuf().cache = get_atomic_cache_flags(ctx, return_previous);
6434       mubuf->mubuf().disable_wqm = true;
6435       mubuf->mubuf().sync = sync;
6436       ctx->program->needs_exact = true;
6437       ctx->block->instructions.emplace_back(std::move(mubuf));
6438       if (return_previous && cmpswap)
6439          bld.pseudo(aco_opcode::p_extract_vector, Definition(dst), def.getTemp(), Operand::zero());
6440       return;
6441    }
6442 
6443    std::vector<Temp> coords = get_image_coords(ctx, instr);
6444    Temp resource = bld.as_uniform(get_ssa_temp(ctx, instr->src[0].ssa));
6445    Temp tmp = return_previous ? (cmpswap ? bld.tmp(data.regClass()) : dst) : Temp(0, v1);
6446    MIMG_instruction* mimg =
6447       emit_mimg(bld, image_op, tmp, resource, Operand(s4), coords, Operand(data));
6448    mimg->cache = get_atomic_cache_flags(ctx, return_previous);
6449    mimg->dmask = (1 << data.size()) - 1;
6450    mimg->a16 = instr->src[1].ssa->bit_size == 16;
6451    mimg->unrm = true;
6452    ac_image_dim sdim = ac_get_image_dim(ctx->options->gfx_level, dim, is_array);
6453    mimg->dim = sdim;
6454    mimg->da = should_declare_array(sdim);
6455    mimg->disable_wqm = true;
6456    mimg->sync = sync;
6457    ctx->program->needs_exact = true;
6458    if (return_previous && cmpswap)
6459       bld.pseudo(aco_opcode::p_extract_vector, Definition(dst), tmp, Operand::zero());
6460    return;
6461 }
6462 
6463 void
6464 visit_load_ssbo(isel_context* ctx, nir_intrinsic_instr* instr)
6465 {
6466    Builder bld(ctx->program, ctx->block);
6467    unsigned num_components = instr->num_components;
6468 
6469    Temp dst = get_ssa_temp(ctx, &instr->def);
6470    Temp rsrc = bld.as_uniform(get_ssa_temp(ctx, instr->src[0].ssa));
6471 
6472    unsigned access = nir_intrinsic_access(instr);
6473    unsigned size = instr->def.bit_size / 8;
6474 
6475    load_buffer(ctx, num_components, size, dst, rsrc, get_ssa_temp(ctx, instr->src[1].ssa),
6476                nir_intrinsic_align_mul(instr), nir_intrinsic_align_offset(instr), access,
6477                get_memory_sync_info(instr, storage_buffer, 0));
6478 }
6479 
6480 void
6481 visit_store_ssbo(isel_context* ctx, nir_intrinsic_instr* instr)
6482 {
6483    Builder bld(ctx->program, ctx->block);
6484    Temp data = get_ssa_temp(ctx, instr->src[0].ssa);
6485    unsigned elem_size_bytes = instr->src[0].ssa->bit_size / 8;
6486    unsigned writemask = util_widen_mask(nir_intrinsic_write_mask(instr), elem_size_bytes);
6487    Temp offset = get_ssa_temp(ctx, instr->src[2].ssa);
6488 
6489    Temp rsrc = bld.as_uniform(get_ssa_temp(ctx, instr->src[1].ssa));
6490 
6491    memory_sync_info sync = get_memory_sync_info(instr, storage_buffer, 0);
6492 
6493    unsigned write_count = 0;
6494    Temp write_datas[32];
6495    unsigned offsets[32];
6496    split_buffer_store(ctx, instr, false, RegType::vgpr, data, writemask, 16, &write_count,
6497                       write_datas, offsets);
6498 
6499    /* GFX6-7 are affected by a hw bug that prevents address clamping from working
6500     * correctly when the SGPR offset is used.
6501     */
6502    if (offset.type() == RegType::sgpr && ctx->options->gfx_level < GFX8)
6503       offset = as_vgpr(ctx, offset);
6504 
6505    for (unsigned i = 0; i < write_count; i++) {
6506       aco_opcode op = get_buffer_store_op(write_datas[i].bytes());
6507       unsigned access = nir_intrinsic_access(instr) | ACCESS_TYPE_STORE;
6508       if (write_datas[i].bytes() < 4)
6509          access |= ACCESS_MAY_STORE_SUBDWORD;
6510 
6511       aco_ptr<Instruction> store{create_instruction(op, Format::MUBUF, 4, 0)};
6512       store->operands[0] = Operand(rsrc);
6513       store->operands[1] = offset.type() == RegType::vgpr ? Operand(offset) : Operand(v1);
6514       store->operands[2] = offset.type() == RegType::sgpr ? Operand(offset) : Operand::c32(0);
6515       store->operands[3] = Operand(write_datas[i]);
6516       store->mubuf().offset = offsets[i];
6517       store->mubuf().offen = (offset.type() == RegType::vgpr);
6518       store->mubuf().cache = get_cache_flags(ctx, access);
6519       store->mubuf().disable_wqm = true;
6520       store->mubuf().sync = sync;
6521       ctx->program->needs_exact = true;
6522       ctx->block->instructions.emplace_back(std::move(store));
6523    }
6524 }
6525 
6526 void
6527 visit_atomic_ssbo(isel_context* ctx, nir_intrinsic_instr* instr)
6528 {
6529    Builder bld(ctx->program, ctx->block);
6530    bool return_previous = !nir_def_is_unused(&instr->def);
6531    Temp data = as_vgpr(ctx, get_ssa_temp(ctx, instr->src[2].ssa));
6532 
6533    const nir_atomic_op nir_op = nir_intrinsic_atomic_op(instr);
6534    const bool cmpswap = nir_op == nir_atomic_op_cmpxchg;
6535 
6536    aco_opcode op32, op64, image_op;
6537    translate_buffer_image_atomic_op(nir_op, &op32, &op64, &image_op);
6538 
6539    if (cmpswap)
6540       data = bld.pseudo(aco_opcode::p_create_vector, bld.def(RegType::vgpr, data.size() * 2),
6541                         get_ssa_temp(ctx, instr->src[3].ssa), data);
6542 
6543    Temp offset = get_ssa_temp(ctx, instr->src[1].ssa);
6544    Temp rsrc = bld.as_uniform(get_ssa_temp(ctx, instr->src[0].ssa));
6545    Temp dst = get_ssa_temp(ctx, &instr->def);
6546 
6547    aco_opcode op = instr->def.bit_size == 32 ? op32 : op64;
6548    aco_ptr<Instruction> mubuf{create_instruction(op, Format::MUBUF, 4, return_previous ? 1 : 0)};
6549    mubuf->operands[0] = Operand(rsrc);
6550    mubuf->operands[1] = offset.type() == RegType::vgpr ? Operand(offset) : Operand(v1);
6551    mubuf->operands[2] = offset.type() == RegType::sgpr ? Operand(offset) : Operand::c32(0);
6552    mubuf->operands[3] = Operand(data);
6553    Definition def =
6554       return_previous ? (cmpswap ? bld.def(data.regClass()) : Definition(dst)) : Definition();
6555    if (return_previous)
6556       mubuf->definitions[0] = def;
6557    mubuf->mubuf().offset = 0;
6558    mubuf->mubuf().offen = (offset.type() == RegType::vgpr);
6559    mubuf->mubuf().cache = get_atomic_cache_flags(ctx, return_previous);
6560    mubuf->mubuf().disable_wqm = true;
6561    mubuf->mubuf().sync = get_memory_sync_info(instr, storage_buffer, semantic_atomicrmw);
6562    ctx->program->needs_exact = true;
6563    ctx->block->instructions.emplace_back(std::move(mubuf));
6564    if (return_previous && cmpswap)
6565       bld.pseudo(aco_opcode::p_extract_vector, Definition(dst), def.getTemp(), Operand::zero());
6566 }
6567 
6568 void
6569 parse_global(isel_context* ctx, nir_intrinsic_instr* intrin, Temp* address, uint32_t* const_offset,
6570              Temp* offset)
6571 {
6572    bool is_store = intrin->intrinsic == nir_intrinsic_store_global_amd;
6573    *address = get_ssa_temp(ctx, intrin->src[is_store ? 1 : 0].ssa);
6574 
6575    *const_offset = nir_intrinsic_base(intrin);
6576 
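   /* The last source is a separate offset operand; drop it when it is a constant zero so only the address is used. */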
6577    unsigned num_src = nir_intrinsic_infos[intrin->intrinsic].num_srcs;
6578    nir_src offset_src = intrin->src[num_src - 1];
6579    if (!nir_src_is_const(offset_src) || nir_src_as_uint(offset_src))
6580       *offset = get_ssa_temp(ctx, offset_src.ssa);
6581    else
6582       *offset = Temp();
6583 }
6584 
6585 void
6586 visit_load_global(isel_context* ctx, nir_intrinsic_instr* instr)
6587 {
6588    Builder bld(ctx->program, ctx->block);
6589    unsigned num_components = instr->num_components;
6590    unsigned component_size = instr->def.bit_size / 8;
6591 
6592    Temp addr, offset;
6593    uint32_t const_offset;
6594    parse_global(ctx, instr, &addr, &const_offset, &offset);
6595 
6596    LoadEmitInfo info = {Operand(addr), get_ssa_temp(ctx, &instr->def), num_components,
6597                         component_size};
6598    if (offset.id()) {
6599       info.resource = addr;
6600       info.offset = Operand(offset);
6601    }
6602    info.const_offset = const_offset;
6603    info.align_mul = nir_intrinsic_align_mul(instr);
6604    info.align_offset = nir_intrinsic_align_offset(instr);
6605    info.sync = get_memory_sync_info(instr, storage_buffer, 0);
6606 
6607    unsigned access = nir_intrinsic_access(instr) | ACCESS_TYPE_LOAD;
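   /* ACCESS_SMEM_AMD means the load can use the scalar unit: uniformize the address and emit a SMEM load. */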
6608    if (access & ACCESS_SMEM_AMD) {
6609       assert(component_size >= 4);
6610       if (info.resource.id())
6611          info.resource = bld.as_uniform(info.resource);
6612       info.offset = Operand(bld.as_uniform(info.offset));
6613       info.cache = get_cache_flags(ctx, access | ACCESS_TYPE_SMEM);
6614       emit_load(ctx, bld, info, smem_load_params);
6615    } else {
6616       EmitLoadParameters params = global_load_params;
6617       info.cache = get_cache_flags(ctx, access);
6618       emit_load(ctx, bld, info, params);
6619    }
6620 }
6621 
6622 void
6623 visit_store_global(isel_context* ctx, nir_intrinsic_instr* instr)
6624 {
6625    Builder bld(ctx->program, ctx->block);
6626    unsigned elem_size_bytes = instr->src[0].ssa->bit_size / 8;
6627    unsigned writemask = util_widen_mask(nir_intrinsic_write_mask(instr), elem_size_bytes);
6628 
6629    Temp data = as_vgpr(ctx, get_ssa_temp(ctx, instr->src[0].ssa));
6630    memory_sync_info sync = get_memory_sync_info(instr, storage_buffer, 0);
6631 
6632    unsigned write_count = 0;
6633    Temp write_datas[32];
6634    unsigned offsets[32];
6635    split_buffer_store(ctx, instr, false, RegType::vgpr, data, writemask, 16, &write_count,
6636                       write_datas, offsets);
6637 
6638    Temp addr, offset;
6639    uint32_t const_offset;
6640    parse_global(ctx, instr, &addr, &const_offset, &offset);
6641 
6642    for (unsigned i = 0; i < write_count; i++) {
6643       Temp write_address = addr;
6644       uint32_t write_const_offset = const_offset;
6645       Temp write_offset = offset;
6646       lower_global_address(bld, offsets[i], &write_address, &write_const_offset, &write_offset);
6647 
6648       unsigned access = nir_intrinsic_access(instr) | ACCESS_TYPE_STORE;
6649       if (write_datas[i].bytes() < 4)
6650          access |= ACCESS_MAY_STORE_SUBDWORD;
6651 
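      /* GFX9+ uses GLOBAL instructions (SGPR address and immediate offset supported); GFX7-8 use FLAT
       * without an immediate offset, and GFX6 falls back to MUBUF with a manually built resource below. */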
6652       if (ctx->options->gfx_level >= GFX7) {
6653          bool global = ctx->options->gfx_level >= GFX9;
6654          aco_opcode op;
6655          switch (write_datas[i].bytes()) {
6656          case 1: op = global ? aco_opcode::global_store_byte : aco_opcode::flat_store_byte; break;
6657          case 2: op = global ? aco_opcode::global_store_short : aco_opcode::flat_store_short; break;
6658          case 4: op = global ? aco_opcode::global_store_dword : aco_opcode::flat_store_dword; break;
6659          case 8:
6660             op = global ? aco_opcode::global_store_dwordx2 : aco_opcode::flat_store_dwordx2;
6661             break;
6662          case 12:
6663             op = global ? aco_opcode::global_store_dwordx3 : aco_opcode::flat_store_dwordx3;
6664             break;
6665          case 16:
6666             op = global ? aco_opcode::global_store_dwordx4 : aco_opcode::flat_store_dwordx4;
6667             break;
6668          default: unreachable("store_global not implemented for this size.");
6669          }
6670 
6671          aco_ptr<Instruction> flat{
6672             create_instruction(op, global ? Format::GLOBAL : Format::FLAT, 3, 0)};
6673          if (write_address.regClass() == s2) {
6674             assert(global && write_offset.id() && write_offset.type() == RegType::vgpr);
6675             flat->operands[0] = Operand(write_offset);
6676             flat->operands[1] = Operand(write_address);
6677          } else {
6678             assert(write_address.type() == RegType::vgpr && !write_offset.id());
6679             flat->operands[0] = Operand(write_address);
6680             flat->operands[1] = Operand(s1);
6681          }
6682          flat->operands[2] = Operand(write_datas[i]);
6683          flat->flatlike().cache = get_cache_flags(ctx, access);
6684          assert(global || !write_const_offset);
6685          flat->flatlike().offset = write_const_offset;
6686          flat->flatlike().disable_wqm = true;
6687          flat->flatlike().sync = sync;
6688          ctx->program->needs_exact = true;
6689          ctx->block->instructions.emplace_back(std::move(flat));
6690       } else {
6691          assert(ctx->options->gfx_level == GFX6);
6692 
6693          aco_opcode op = get_buffer_store_op(write_datas[i].bytes());
6694 
6695          Temp rsrc = get_gfx6_global_rsrc(bld, write_address);
6696 
6697          aco_ptr<Instruction> mubuf{create_instruction(op, Format::MUBUF, 4, 0)};
6698          mubuf->operands[0] = Operand(rsrc);
6699          mubuf->operands[1] =
6700             write_address.type() == RegType::vgpr ? Operand(write_address) : Operand(v1);
6701          mubuf->operands[2] = Operand(write_offset);
6702          mubuf->operands[3] = Operand(write_datas[i]);
6703          mubuf->mubuf().cache = get_cache_flags(ctx, access);
6704          mubuf->mubuf().offset = write_const_offset;
6705          mubuf->mubuf().addr64 = write_address.type() == RegType::vgpr;
6706          mubuf->mubuf().disable_wqm = true;
6707          mubuf->mubuf().sync = sync;
6708          ctx->program->needs_exact = true;
6709          ctx->block->instructions.emplace_back(std::move(mubuf));
6710       }
6711    }
6712 }
6713 
6714 void
6715 visit_global_atomic(isel_context* ctx, nir_intrinsic_instr* instr)
6716 {
6717    Builder bld(ctx->program, ctx->block);
6718    bool return_previous = !nir_def_is_unused(&instr->def);
6719    Temp data = as_vgpr(ctx, get_ssa_temp(ctx, instr->src[1].ssa));
6720 
6721    const nir_atomic_op nir_op = nir_intrinsic_atomic_op(instr);
6722    const bool cmpswap = nir_op == nir_atomic_op_cmpxchg;
6723 
6724    if (cmpswap)
6725       data = bld.pseudo(aco_opcode::p_create_vector, bld.def(RegType::vgpr, data.size() * 2),
6726                         get_ssa_temp(ctx, instr->src[2].ssa), data);
6727 
6728    Temp dst = get_ssa_temp(ctx, &instr->def);
6729 
6730    aco_opcode op32, op64;
6731 
6732    Temp addr, offset;
6733    uint32_t const_offset;
6734    parse_global(ctx, instr, &addr, &const_offset, &offset);
6735    lower_global_address(bld, 0, &addr, &const_offset, &offset);
6736 
6737    if (ctx->options->gfx_level >= GFX7) {
6738       bool global = ctx->options->gfx_level >= GFX9;
6739       switch (nir_op) {
6740       case nir_atomic_op_iadd:
6741          op32 = global ? aco_opcode::global_atomic_add : aco_opcode::flat_atomic_add;
6742          op64 = global ? aco_opcode::global_atomic_add_x2 : aco_opcode::flat_atomic_add_x2;
6743          break;
6744       case nir_atomic_op_imin:
6745          op32 = global ? aco_opcode::global_atomic_smin : aco_opcode::flat_atomic_smin;
6746          op64 = global ? aco_opcode::global_atomic_smin_x2 : aco_opcode::flat_atomic_smin_x2;
6747          break;
6748       case nir_atomic_op_umin:
6749          op32 = global ? aco_opcode::global_atomic_umin : aco_opcode::flat_atomic_umin;
6750          op64 = global ? aco_opcode::global_atomic_umin_x2 : aco_opcode::flat_atomic_umin_x2;
6751          break;
6752       case nir_atomic_op_imax:
6753          op32 = global ? aco_opcode::global_atomic_smax : aco_opcode::flat_atomic_smax;
6754          op64 = global ? aco_opcode::global_atomic_smax_x2 : aco_opcode::flat_atomic_smax_x2;
6755          break;
6756       case nir_atomic_op_umax:
6757          op32 = global ? aco_opcode::global_atomic_umax : aco_opcode::flat_atomic_umax;
6758          op64 = global ? aco_opcode::global_atomic_umax_x2 : aco_opcode::flat_atomic_umax_x2;
6759          break;
6760       case nir_atomic_op_iand:
6761          op32 = global ? aco_opcode::global_atomic_and : aco_opcode::flat_atomic_and;
6762          op64 = global ? aco_opcode::global_atomic_and_x2 : aco_opcode::flat_atomic_and_x2;
6763          break;
6764       case nir_atomic_op_ior:
6765          op32 = global ? aco_opcode::global_atomic_or : aco_opcode::flat_atomic_or;
6766          op64 = global ? aco_opcode::global_atomic_or_x2 : aco_opcode::flat_atomic_or_x2;
6767          break;
6768       case nir_atomic_op_ixor:
6769          op32 = global ? aco_opcode::global_atomic_xor : aco_opcode::flat_atomic_xor;
6770          op64 = global ? aco_opcode::global_atomic_xor_x2 : aco_opcode::flat_atomic_xor_x2;
6771          break;
6772       case nir_atomic_op_xchg:
6773          op32 = global ? aco_opcode::global_atomic_swap : aco_opcode::flat_atomic_swap;
6774          op64 = global ? aco_opcode::global_atomic_swap_x2 : aco_opcode::flat_atomic_swap_x2;
6775          break;
6776       case nir_atomic_op_cmpxchg:
6777          op32 = global ? aco_opcode::global_atomic_cmpswap : aco_opcode::flat_atomic_cmpswap;
6778          op64 = global ? aco_opcode::global_atomic_cmpswap_x2 : aco_opcode::flat_atomic_cmpswap_x2;
6779          break;
6780       case nir_atomic_op_fadd:
6781          op32 = global ? aco_opcode::global_atomic_add_f32 : aco_opcode::flat_atomic_add_f32;
6782          op64 = aco_opcode::num_opcodes;
6783          break;
6784       case nir_atomic_op_fmin:
6785          op32 = global ? aco_opcode::global_atomic_fmin : aco_opcode::flat_atomic_fmin;
6786          op64 = global ? aco_opcode::global_atomic_fmin_x2 : aco_opcode::flat_atomic_fmin_x2;
6787          break;
6788       case nir_atomic_op_fmax:
6789          op32 = global ? aco_opcode::global_atomic_fmax : aco_opcode::flat_atomic_fmax;
6790          op64 = global ? aco_opcode::global_atomic_fmax_x2 : aco_opcode::flat_atomic_fmax_x2;
6791          break;
6792       case nir_atomic_op_ordered_add_gfx12_amd:
6793          assert(ctx->options->gfx_level >= GFX12 && instr->def.bit_size == 64);
6794          op32 = aco_opcode::num_opcodes;
6795          op64 = aco_opcode::global_atomic_ordered_add_b64;
6796          break;
6797       default: unreachable("unsupported atomic operation");
6798       }
6799 
6800       aco_opcode op = instr->def.bit_size == 32 ? op32 : op64;
6801       aco_ptr<Instruction> flat{create_instruction(op, global ? Format::GLOBAL : Format::FLAT, 3,
6802                                                    return_previous ? 1 : 0)};
6803       if (addr.regClass() == s2) {
6804          assert(global && offset.id() && offset.type() == RegType::vgpr);
6805          flat->operands[0] = Operand(offset);
6806          flat->operands[1] = Operand(addr);
6807       } else {
6808          assert(addr.type() == RegType::vgpr && !offset.id());
6809          flat->operands[0] = Operand(addr);
6810          flat->operands[1] = Operand(s1);
6811       }
6812       flat->operands[2] = Operand(data);
6813       if (return_previous)
6814          flat->definitions[0] = Definition(dst);
6815       flat->flatlike().cache = get_atomic_cache_flags(ctx, return_previous);
6816       assert(global || !const_offset);
6817       flat->flatlike().offset = const_offset;
6818       flat->flatlike().disable_wqm = true;
6819       flat->flatlike().sync = get_memory_sync_info(instr, storage_buffer, semantic_atomicrmw);
6820       ctx->program->needs_exact = true;
6821       ctx->block->instructions.emplace_back(std::move(flat));
6822    } else {
6823       assert(ctx->options->gfx_level == GFX6);
6824 
6825       UNUSED aco_opcode image_op;
6826       translate_buffer_image_atomic_op(nir_op, &op32, &op64, &image_op);
6827 
6828       Temp rsrc = get_gfx6_global_rsrc(bld, addr);
6829 
6830       aco_opcode op = instr->def.bit_size == 32 ? op32 : op64;
6831 
6832       aco_ptr<Instruction> mubuf{create_instruction(op, Format::MUBUF, 4, return_previous ? 1 : 0)};
6833       mubuf->operands[0] = Operand(rsrc);
6834       mubuf->operands[1] = addr.type() == RegType::vgpr ? Operand(addr) : Operand(v1);
6835       mubuf->operands[2] = Operand(offset);
6836       mubuf->operands[3] = Operand(data);
6837       Definition def =
6838          return_previous ? (cmpswap ? bld.def(data.regClass()) : Definition(dst)) : Definition();
6839       if (return_previous)
6840          mubuf->definitions[0] = def;
6841       mubuf->mubuf().cache = get_atomic_cache_flags(ctx, return_previous);
6842       mubuf->mubuf().offset = const_offset;
6843       mubuf->mubuf().addr64 = addr.type() == RegType::vgpr;
6844       mubuf->mubuf().disable_wqm = true;
6845       mubuf->mubuf().sync = get_memory_sync_info(instr, storage_buffer, semantic_atomicrmw);
6846       ctx->program->needs_exact = true;
6847       ctx->block->instructions.emplace_back(std::move(mubuf));
6848       if (return_previous && cmpswap)
6849          bld.pseudo(aco_opcode::p_extract_vector, Definition(dst), def.getTemp(), Operand::zero());
6850    }
6851 }
6852 
6853 unsigned
6854 aco_storage_mode_from_nir_mem_mode(unsigned mem_mode)
6855 {
6856    unsigned storage = storage_none;
6857 
6858    if (mem_mode & nir_var_shader_out)
6859       storage |= storage_vmem_output;
6860    if ((mem_mode & nir_var_mem_ssbo) || (mem_mode & nir_var_mem_global))
6861       storage |= storage_buffer;
6862    if (mem_mode & nir_var_mem_task_payload)
6863       storage |= storage_task_payload;
6864    if (mem_mode & nir_var_mem_shared)
6865       storage |= storage_shared;
6866    if (mem_mode & nir_var_image)
6867       storage |= storage_image;
6868 
6869    return storage;
6870 }
6871 
6872 void
6873 visit_load_buffer(isel_context* ctx, nir_intrinsic_instr* intrin)
6874 {
6875    Builder bld(ctx->program, ctx->block);
6876 
6877    /* Swizzled buffer addressing seems to be broken on GFX11 without the idxen bit. */
6878    bool swizzled = nir_intrinsic_access(intrin) & ACCESS_IS_SWIZZLED_AMD;
6879    bool idxen = (swizzled && ctx->program->gfx_level >= GFX11) ||
6880                 !nir_src_is_const(intrin->src[3]) || nir_src_as_uint(intrin->src[3]);
6881    bool v_offset_zero = nir_src_is_const(intrin->src[1]) && !nir_src_as_uint(intrin->src[1]);
6882    bool s_offset_zero = nir_src_is_const(intrin->src[2]) && !nir_src_as_uint(intrin->src[2]);
6883 
6884    Temp dst = get_ssa_temp(ctx, &intrin->def);
6885    Temp descriptor = bld.as_uniform(get_ssa_temp(ctx, intrin->src[0].ssa));
6886    Temp v_offset =
6887       v_offset_zero ? Temp(0, v1) : as_vgpr(ctx, get_ssa_temp(ctx, intrin->src[1].ssa));
6888    Temp s_offset =
6889       s_offset_zero ? Temp(0, s1) : bld.as_uniform(get_ssa_temp(ctx, intrin->src[2].ssa));
6890    Temp idx = idxen ? as_vgpr(ctx, get_ssa_temp(ctx, intrin->src[3].ssa)) : Temp();
6891 
6892    ac_hw_cache_flags cache = get_cache_flags(ctx, nir_intrinsic_access(intrin) | ACCESS_TYPE_LOAD);
6893 
6894    unsigned const_offset = nir_intrinsic_base(intrin);
6895    unsigned elem_size_bytes = intrin->def.bit_size / 8u;
6896    unsigned num_components = intrin->def.num_components;
6897 
6898    nir_variable_mode mem_mode = nir_intrinsic_memory_modes(intrin);
6899    memory_sync_info sync(aco_storage_mode_from_nir_mem_mode(mem_mode));
6900 
6901    LoadEmitInfo info = {Operand(v_offset), dst, num_components, elem_size_bytes, descriptor};
6902    info.idx = idx;
6903    info.cache = cache;
6904    info.soffset = s_offset;
6905    info.const_offset = const_offset;
6906    info.sync = sync;
6907 
6908    if (intrin->intrinsic == nir_intrinsic_load_typed_buffer_amd) {
6909       const pipe_format format = nir_intrinsic_format(intrin);
6910       const struct ac_vtx_format_info* vtx_info =
6911          ac_get_vtx_format_info(ctx->program->gfx_level, ctx->program->family, format);
6912       const struct util_format_description* f = util_format_description(format);
6913       const unsigned align_mul = nir_intrinsic_align_mul(intrin);
6914       const unsigned align_offset = nir_intrinsic_align_offset(intrin);
6915 
6916       /* Avoid splitting:
6917        * - non-array formats because that would result in incorrect code
6918        * - when the element size is the same as the component size (to reduce instruction count)
6919        */
6920       const bool can_split = f->is_array && elem_size_bytes != vtx_info->chan_byte_size;
6921 
6922       info.align_mul = align_mul;
6923       info.align_offset = align_offset;
6924       info.format = format;
6925       info.component_stride = can_split ? vtx_info->chan_byte_size : 0;
6926       info.split_by_component_stride = false;
6927 
6928       emit_load(ctx, bld, info, mtbuf_load_params);
6929    } else {
6930       assert(intrin->intrinsic == nir_intrinsic_load_buffer_amd);
6931 
6932       if (nir_intrinsic_access(intrin) & ACCESS_USES_FORMAT_AMD) {
6933          assert(!swizzled);
6934 
6935          emit_load(ctx, bld, info, mubuf_load_format_params);
6936       } else {
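         /* Swizzled buffers are loaded in chunks of at most one swizzle element
          * (4 bytes on GFX6-8, 16 bytes on newer chips) so loads do not cross element boundaries. */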
6937          const unsigned swizzle_element_size =
6938             swizzled ? (ctx->program->gfx_level <= GFX8 ? 4 : 16) : 0;
6939 
6940          info.component_stride = swizzle_element_size;
6941          info.swizzle_component_size = swizzle_element_size ? 4 : 0;
6942          info.align_mul = MIN2(elem_size_bytes, 4);
6943          info.align_offset = 0;
6944 
6945          emit_load(ctx, bld, info, mubuf_load_params);
6946       }
6947    }
6948 }
6949 
6950 void
6951 visit_store_buffer(isel_context* ctx, nir_intrinsic_instr* intrin)
6952 {
6953    Builder bld(ctx->program, ctx->block);
6954 
6955    /* Swizzled buffer addressing seems to be broken on GFX11 without the idxen bit. */
6956    bool swizzled = nir_intrinsic_access(intrin) & ACCESS_IS_SWIZZLED_AMD;
6957    bool idxen = (swizzled && ctx->program->gfx_level >= GFX11) ||
6958                 !nir_src_is_const(intrin->src[4]) || nir_src_as_uint(intrin->src[4]);
6959    bool offen = !nir_src_is_const(intrin->src[2]) || nir_src_as_uint(intrin->src[2]);
6960 
6961    Temp store_src = get_ssa_temp(ctx, intrin->src[0].ssa);
6962    Temp descriptor = bld.as_uniform(get_ssa_temp(ctx, intrin->src[1].ssa));
6963    Temp v_offset = offen ? as_vgpr(ctx, get_ssa_temp(ctx, intrin->src[2].ssa)) : Temp();
6964    Temp s_offset = bld.as_uniform(get_ssa_temp(ctx, intrin->src[3].ssa));
6965    Temp idx = idxen ? as_vgpr(ctx, get_ssa_temp(ctx, intrin->src[4].ssa)) : Temp();
6966 
6967    unsigned elem_size_bytes = intrin->src[0].ssa->bit_size / 8u;
6968    assert(elem_size_bytes == 1 || elem_size_bytes == 2 || elem_size_bytes == 4 ||
6969           elem_size_bytes == 8);
6970 
6971    unsigned write_mask = nir_intrinsic_write_mask(intrin);
6972    write_mask = util_widen_mask(write_mask, elem_size_bytes);
6973 
6974    nir_variable_mode mem_mode = nir_intrinsic_memory_modes(intrin);
6975    /* GS outputs are only written once. */
6976    const bool written_once =
6977       mem_mode == nir_var_shader_out && ctx->shader->info.stage == MESA_SHADER_GEOMETRY;
6978    memory_sync_info sync(aco_storage_mode_from_nir_mem_mode(mem_mode),
6979                          written_once ? semantic_can_reorder : semantic_none);
6980 
6981    unsigned write_count = 0;
6982    Temp write_datas[32];
6983    unsigned offsets[32];
6984    split_buffer_store(ctx, NULL, false, RegType::vgpr, store_src, write_mask,
6985                       swizzled && ctx->program->gfx_level <= GFX8 ? 4 : 16, &write_count,
6986                       write_datas, offsets);
6987 
6988    for (unsigned i = 0; i < write_count; i++) {
6989       aco_opcode op = get_buffer_store_op(write_datas[i].bytes());
6990       Temp write_voffset = v_offset;
6991       unsigned const_offset = resolve_excess_vmem_const_offset(
6992          bld, write_voffset, offsets[i] + nir_intrinsic_base(intrin));
6993 
6994       /* write_voffset may be updated in resolve_excess_vmem_const_offset(). */
6995       offen = write_voffset.id();
6996 
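      /* MUBUF vaddr holds the index and/or the offset: {idx, voffset} when both idxen and offen are set,
       * otherwise whichever of the two is present. */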
6997       Operand vaddr_op(v1);
6998       if (offen && idxen)
6999          vaddr_op = bld.pseudo(aco_opcode::p_create_vector, bld.def(v2), idx, write_voffset);
7000       else if (offen)
7001          vaddr_op = Operand(write_voffset);
7002       else if (idxen)
7003          vaddr_op = Operand(idx);
7004 
7005       unsigned access = nir_intrinsic_access(intrin);
7006       if (write_datas[i].bytes() < 4)
7007          access |= ACCESS_MAY_STORE_SUBDWORD;
7008       ac_hw_cache_flags cache = get_cache_flags(ctx, access | ACCESS_TYPE_STORE);
7009 
7010       Instruction* mubuf = bld.mubuf(op, Operand(descriptor), vaddr_op, s_offset,
7011                                      Operand(write_datas[i]), const_offset, offen, idxen,
7012                                      /* addr64 */ false, /* disable_wqm */ false, cache)
7013                               .instr;
7014       mubuf->mubuf().sync = sync;
7015    }
7016 }
7017 
7018 void
7019 visit_load_smem(isel_context* ctx, nir_intrinsic_instr* instr)
7020 {
7021    Builder bld(ctx->program, ctx->block);
7022    Temp dst = get_ssa_temp(ctx, &instr->def);
7023    Temp base = bld.as_uniform(get_ssa_temp(ctx, instr->src[0].ssa));
7024    Temp offset = bld.as_uniform(get_ssa_temp(ctx, instr->src[1].ssa));
7025 
7026    /* If the base address is 32-bit, extend it to 64-bit by appending the known high 32 bits. */
7027    if (base.bytes() == 4) {
7028       base = bld.pseudo(aco_opcode::p_create_vector, bld.def(s2), base,
7029                         Operand::c32(ctx->options->address32_hi));
7030    }
7031 
7032    aco_opcode opcode = aco_opcode::s_load_dword;
7033    unsigned size = 1;
7034 
7035    assert(dst.bytes() <= 64);
7036 
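   /* Select the smallest s_load_dwordxN that covers dst; if it loads more dwords than needed,
    * the excess is trimmed with p_extract_vector below. */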
7037    if (dst.bytes() > 32) {
7038       opcode = aco_opcode::s_load_dwordx16;
7039       size = 16;
7040    } else if (dst.bytes() > 16) {
7041       opcode = aco_opcode::s_load_dwordx8;
7042       size = 8;
7043    } else if (dst.bytes() > 8) {
7044       opcode = aco_opcode::s_load_dwordx4;
7045       size = 4;
7046    } else if (dst.bytes() > 4) {
7047       opcode = aco_opcode::s_load_dwordx2;
7048       size = 2;
7049    }
7050 
7051    if (dst.size() != size) {
7052       bld.pseudo(aco_opcode::p_extract_vector, Definition(dst),
7053                  bld.smem(opcode, bld.def(RegType::sgpr, size), base, offset), Operand::c32(0u));
7054    } else {
7055       bld.smem(opcode, Definition(dst), base, offset);
7056    }
7057    emit_split_vector(ctx, dst, instr->def.num_components);
7058 }
7059 
7060 sync_scope
7061 translate_nir_scope(mesa_scope scope)
7062 {
7063    switch (scope) {
7064    case SCOPE_NONE:
7065    case SCOPE_INVOCATION: return scope_invocation;
7066    case SCOPE_SUBGROUP: return scope_subgroup;
7067    case SCOPE_WORKGROUP: return scope_workgroup;
7068    case SCOPE_QUEUE_FAMILY: return scope_queuefamily;
7069    case SCOPE_DEVICE: return scope_device;
7070    case SCOPE_SHADER_CALL: return scope_invocation;
7071    }
7072    unreachable("invalid scope");
7073 }
7074 
7075 void
7076 emit_barrier(isel_context* ctx, nir_intrinsic_instr* instr)
7077 {
7078    Builder bld(ctx->program, ctx->block);
7079 
7080    unsigned storage_allowed = storage_buffer | storage_image;
7081    unsigned semantics = 0;
7082    sync_scope mem_scope = translate_nir_scope(nir_intrinsic_memory_scope(instr));
7083    sync_scope exec_scope = translate_nir_scope(nir_intrinsic_execution_scope(instr));
7084 
7085    /* We use shared storage for the following:
7086     * - compute shaders expose it in their API
7087     * - when tessellation is used, TCS and VS I/O is lowered to shared memory
7088     * - when GS is used on GFX9+, VS->GS and TES->GS I/O is lowered to shared memory
7089     * - additionally, when NGG is used on GFX10+, shared memory is used for certain features
7090     */
7091    bool shared_storage_used =
7092       ctx->stage.hw == AC_HW_COMPUTE_SHADER || ctx->stage.hw == AC_HW_LOCAL_SHADER ||
7093       ctx->stage.hw == AC_HW_HULL_SHADER ||
7094       (ctx->stage.hw == AC_HW_LEGACY_GEOMETRY_SHADER && ctx->program->gfx_level >= GFX9) ||
7095       ctx->stage.hw == AC_HW_NEXT_GEN_GEOMETRY_SHADER;
7096 
7097    if (shared_storage_used)
7098       storage_allowed |= storage_shared;
7099 
7100    /* Task payload: Task Shader output, Mesh Shader input */
7101    if (ctx->stage.has(SWStage::MS) || ctx->stage.has(SWStage::TS))
7102       storage_allowed |= storage_task_payload;
7103 
7104    /* Allow VMEM output for all stages that can have outputs. */
7105    if ((ctx->stage.hw != AC_HW_COMPUTE_SHADER && ctx->stage.hw != AC_HW_PIXEL_SHADER) ||
7106        ctx->stage.has(SWStage::TS))
7107       storage_allowed |= storage_vmem_output;
7108 
7109    /* Workgroup barriers can hang merged shaders that can potentially have 0 threads in either half.
7110     * They are allowed in CS, TCS, and in any NGG shader.
7111     */
7112    ASSERTED bool workgroup_scope_allowed = ctx->stage.hw == AC_HW_COMPUTE_SHADER ||
7113                                            ctx->stage.hw == AC_HW_HULL_SHADER ||
7114                                            ctx->stage.hw == AC_HW_NEXT_GEN_GEOMETRY_SHADER;
7115 
7116    unsigned nir_storage = nir_intrinsic_memory_modes(instr);
7117    unsigned storage = aco_storage_mode_from_nir_mem_mode(nir_storage);
7118    storage &= storage_allowed;
7119 
7120    unsigned nir_semantics = nir_intrinsic_memory_semantics(instr);
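   /* Either NIR flag is treated conservatively as full acquire+release semantics. */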
7121    if (nir_semantics & NIR_MEMORY_ACQUIRE)
7122       semantics |= semantic_acquire | semantic_release;
7123    if (nir_semantics & NIR_MEMORY_RELEASE)
7124       semantics |= semantic_acquire | semantic_release;
7125 
7126    assert(!(nir_semantics & (NIR_MEMORY_MAKE_AVAILABLE | NIR_MEMORY_MAKE_VISIBLE)));
7127    assert(exec_scope != scope_workgroup || workgroup_scope_allowed);
7128 
7129    bld.barrier(aco_opcode::p_barrier,
7130                memory_sync_info((storage_class)storage, (memory_semantics)semantics, mem_scope),
7131                exec_scope);
7132 }
7133 
7134 void
7135 visit_load_shared(isel_context* ctx, nir_intrinsic_instr* instr)
7136 {
7137    // TODO: implement sparse reads using ds_read2_b32 and nir_def_components_read()
7138    Temp dst = get_ssa_temp(ctx, &instr->def);
7139    Temp address = as_vgpr(ctx, get_ssa_temp(ctx, instr->src[0].ssa));
7140    Builder bld(ctx->program, ctx->block);
7141 
7142    unsigned elem_size_bytes = instr->def.bit_size / 8;
7143    unsigned num_components = instr->def.num_components;
7144    unsigned align = nir_intrinsic_align_mul(instr) ? nir_intrinsic_align(instr) : elem_size_bytes;
7145    load_lds(ctx, elem_size_bytes, num_components, dst, address, nir_intrinsic_base(instr), align);
7146 }
7147 
7148 void
7149 visit_store_shared(isel_context* ctx, nir_intrinsic_instr* instr)
7150 {
7151    unsigned writemask = nir_intrinsic_write_mask(instr);
7152    Temp data = get_ssa_temp(ctx, instr->src[0].ssa);
7153    Temp address = as_vgpr(ctx, get_ssa_temp(ctx, instr->src[1].ssa));
7154    unsigned elem_size_bytes = instr->src[0].ssa->bit_size / 8;
7155 
7156    unsigned align = nir_intrinsic_align_mul(instr) ? nir_intrinsic_align(instr) : elem_size_bytes;
7157    store_lds(ctx, elem_size_bytes, data, writemask, address, nir_intrinsic_base(instr), align);
7158 }
7159 
7160 void
7161 visit_shared_atomic(isel_context* ctx, nir_intrinsic_instr* instr)
7162 {
7163    unsigned offset = nir_intrinsic_base(instr);
7164    Builder bld(ctx->program, ctx->block);
7165    Operand m = load_lds_size_m0(bld);
7166    Temp data = as_vgpr(ctx, get_ssa_temp(ctx, instr->src[1].ssa));
7167    Temp address = as_vgpr(ctx, get_ssa_temp(ctx, instr->src[0].ssa));
7168 
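   /* Select the DS atomic opcode: separate 32-bit and 64-bit variants, each with a
    * _rtn form that returns the previous value.
    */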
7169    unsigned num_operands = 3;
7170    aco_opcode op32, op64, op32_rtn, op64_rtn;
7171    switch (nir_intrinsic_atomic_op(instr)) {
7172    case nir_atomic_op_iadd:
7173       op32 = aco_opcode::ds_add_u32;
7174       op64 = aco_opcode::ds_add_u64;
7175       op32_rtn = aco_opcode::ds_add_rtn_u32;
7176       op64_rtn = aco_opcode::ds_add_rtn_u64;
7177       break;
7178    case nir_atomic_op_imin:
7179       op32 = aco_opcode::ds_min_i32;
7180       op64 = aco_opcode::ds_min_i64;
7181       op32_rtn = aco_opcode::ds_min_rtn_i32;
7182       op64_rtn = aco_opcode::ds_min_rtn_i64;
7183       break;
7184    case nir_atomic_op_umin:
7185       op32 = aco_opcode::ds_min_u32;
7186       op64 = aco_opcode::ds_min_u64;
7187       op32_rtn = aco_opcode::ds_min_rtn_u32;
7188       op64_rtn = aco_opcode::ds_min_rtn_u64;
7189       break;
7190    case nir_atomic_op_imax:
7191       op32 = aco_opcode::ds_max_i32;
7192       op64 = aco_opcode::ds_max_i64;
7193       op32_rtn = aco_opcode::ds_max_rtn_i32;
7194       op64_rtn = aco_opcode::ds_max_rtn_i64;
7195       break;
7196    case nir_atomic_op_umax:
7197       op32 = aco_opcode::ds_max_u32;
7198       op64 = aco_opcode::ds_max_u64;
7199       op32_rtn = aco_opcode::ds_max_rtn_u32;
7200       op64_rtn = aco_opcode::ds_max_rtn_u64;
7201       break;
7202    case nir_atomic_op_iand:
7203       op32 = aco_opcode::ds_and_b32;
7204       op64 = aco_opcode::ds_and_b64;
7205       op32_rtn = aco_opcode::ds_and_rtn_b32;
7206       op64_rtn = aco_opcode::ds_and_rtn_b64;
7207       break;
7208    case nir_atomic_op_ior:
7209       op32 = aco_opcode::ds_or_b32;
7210       op64 = aco_opcode::ds_or_b64;
7211       op32_rtn = aco_opcode::ds_or_rtn_b32;
7212       op64_rtn = aco_opcode::ds_or_rtn_b64;
7213       break;
7214    case nir_atomic_op_ixor:
7215       op32 = aco_opcode::ds_xor_b32;
7216       op64 = aco_opcode::ds_xor_b64;
7217       op32_rtn = aco_opcode::ds_xor_rtn_b32;
7218       op64_rtn = aco_opcode::ds_xor_rtn_b64;
7219       break;
7220    case nir_atomic_op_xchg:
7221       op32 = aco_opcode::ds_write_b32;
7222       op64 = aco_opcode::ds_write_b64;
7223       op32_rtn = aco_opcode::ds_wrxchg_rtn_b32;
7224       op64_rtn = aco_opcode::ds_wrxchg_rtn_b64;
7225       break;
7226    case nir_atomic_op_cmpxchg:
7227       op32 = aco_opcode::ds_cmpst_b32;
7228       op64 = aco_opcode::ds_cmpst_b64;
7229       op32_rtn = aco_opcode::ds_cmpst_rtn_b32;
7230       op64_rtn = aco_opcode::ds_cmpst_rtn_b64;
7231       num_operands = 4;
7232       break;
7233    case nir_atomic_op_fadd:
7234       op32 = aco_opcode::ds_add_f32;
7235       op32_rtn = aco_opcode::ds_add_rtn_f32;
7236       op64 = aco_opcode::num_opcodes;
7237       op64_rtn = aco_opcode::num_opcodes;
7238       break;
7239    case nir_atomic_op_fmin:
7240       op32 = aco_opcode::ds_min_f32;
7241       op32_rtn = aco_opcode::ds_min_rtn_f32;
7242       op64 = aco_opcode::ds_min_f64;
7243       op64_rtn = aco_opcode::ds_min_rtn_f64;
7244       break;
7245    case nir_atomic_op_fmax:
7246       op32 = aco_opcode::ds_max_f32;
7247       op32_rtn = aco_opcode::ds_max_rtn_f32;
7248       op64 = aco_opcode::ds_max_f64;
7249       op64_rtn = aco_opcode::ds_max_rtn_f64;
7250       break;
7251    default: unreachable("Unhandled shared atomic intrinsic");
7252    }
7253 
7254    bool return_previous = !nir_def_is_unused(&instr->def);
7255 
7256    aco_opcode op;
7257    if (data.size() == 1) {
7258       assert(instr->def.bit_size == 32);
7259       op = return_previous ? op32_rtn : op32;
7260    } else {
7261       assert(instr->def.bit_size == 64);
7262       op = return_previous ? op64_rtn : op64;
7263    }
7264 
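   /* The 16-bit DS offset field can't hold larger constant offsets, so fold them into the address. */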
7265    if (offset > 65535) {
7266       address = bld.vadd32(bld.def(v1), Operand::c32(offset), address);
7267       offset = 0;
7268    }
7269 
7270    aco_ptr<Instruction> ds;
7271    ds.reset(create_instruction(op, Format::DS, num_operands, return_previous ? 1 : 0));
7272    ds->operands[0] = Operand(address);
7273    ds->operands[1] = Operand(data);
7274    if (num_operands == 4) {
7275       Temp data2 = as_vgpr(ctx, get_ssa_temp(ctx, instr->src[2].ssa));
7276       ds->operands[2] = Operand(data2);
7277       if (bld.program->gfx_level >= GFX11)
7278          std::swap(ds->operands[1], ds->operands[2]);
7279    }
7280    ds->operands[num_operands - 1] = m;
7281    ds->ds().offset0 = offset;
7282    if (return_previous)
7283       ds->definitions[0] = Definition(get_ssa_temp(ctx, &instr->def));
7284    ds->ds().sync = memory_sync_info(storage_shared, semantic_atomicrmw);
7285 
7286    if (m.isUndefined())
7287       ds->operands.pop_back();
7288 
7289    ctx->block->instructions.emplace_back(std::move(ds));
7290 }
7291 
7292 void
7293 visit_shared_append(isel_context* ctx, nir_intrinsic_instr* instr)
7294 {
7295    Builder bld(ctx->program, ctx->block);
7296    unsigned address = nir_intrinsic_base(instr);
7297    assert(address <= 65535 && (address % 4 == 0));
7298 
7299    aco_opcode op;
7300    switch (instr->intrinsic) {
7301    case nir_intrinsic_shared_append_amd: op = aco_opcode::ds_append; break;
7302    case nir_intrinsic_shared_consume_amd: op = aco_opcode::ds_consume; break;
7303    default: unreachable("not shared_append/consume");
7304    }
7305 
7306    Temp tmp = bld.tmp(v1);
7307    Instruction *ds;
7308    Operand m = load_lds_size_m0(bld);
7309    if (m.isUndefined())
7310       ds = bld.ds(op, Definition(tmp), address);
7311    else
7312       ds = bld.ds(op, Definition(tmp), m, address);
7313    ds->ds().sync = memory_sync_info(storage_shared, semantic_atomicrmw);
7314 
7315    /* In wave64 on hw with native wave32, ds_append seems to be split into a load for the low half
7316     * and an atomic for the high half, and other LDS instructions can be scheduled between the two,
7317     * which means the result of the low half is unusable because it might be out of date.
7318     */
7319    if (ctx->program->gfx_level >= GFX10 && ctx->program->wave_size == 64 &&
7320        ctx->program->workgroup_size > 64) {
7321       Temp last_lane = bld.sop1(aco_opcode::s_flbit_i32_b64, bld.def(s1), Operand(exec, s2));
7322       last_lane = bld.sop2(aco_opcode::s_sub_u32, bld.def(s1), bld.def(s1, scc), Operand::c32(63),
7323                            last_lane);
7324       bld.readlane(Definition(get_ssa_temp(ctx, &instr->def)), tmp, last_lane);
7325    } else {
7326       bld.pseudo(aco_opcode::p_as_uniform, Definition(get_ssa_temp(ctx, &instr->def)), tmp);
7327    }
7328 }
7329 
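/* load/store_shared2_amd access two LDS locations at independent 8-bit offsets,
 * optionally with a 64-element stride (st64), using a single ds_read2/ds_write2
 * instruction.
 */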
7330 void
7331 visit_access_shared2_amd(isel_context* ctx, nir_intrinsic_instr* instr)
7332 {
7333    bool is_store = instr->intrinsic == nir_intrinsic_store_shared2_amd;
7334    Temp address = as_vgpr(ctx, get_ssa_temp(ctx, instr->src[is_store].ssa));
7335    Builder bld(ctx->program, ctx->block);
7336 
7337    assert(bld.program->gfx_level >= GFX7);
7338 
7339    bool is64bit = (is_store ? instr->src[0].ssa->bit_size : instr->def.bit_size) == 64;
7340    uint8_t offset0 = nir_intrinsic_offset0(instr);
7341    uint8_t offset1 = nir_intrinsic_offset1(instr);
7342    bool st64 = nir_intrinsic_st64(instr);
7343 
7344    Operand m = load_lds_size_m0(bld);
7345    Instruction* ds;
7346    if (is_store) {
7347       aco_opcode op = st64
7348                          ? (is64bit ? aco_opcode::ds_write2st64_b64 : aco_opcode::ds_write2st64_b32)
7349                          : (is64bit ? aco_opcode::ds_write2_b64 : aco_opcode::ds_write2_b32);
7350       Temp data = get_ssa_temp(ctx, instr->src[0].ssa);
7351       RegClass comp_rc = is64bit ? v2 : v1;
7352       Temp data0 = emit_extract_vector(ctx, data, 0, comp_rc);
7353       Temp data1 = emit_extract_vector(ctx, data, 1, comp_rc);
7354       ds = bld.ds(op, address, data0, data1, m, offset0, offset1);
7355    } else {
7356       Temp dst = get_ssa_temp(ctx, &instr->def);
7357       Definition tmp_dst(dst.type() == RegType::vgpr ? dst : bld.tmp(is64bit ? v4 : v2));
7358       aco_opcode op = st64 ? (is64bit ? aco_opcode::ds_read2st64_b64 : aco_opcode::ds_read2st64_b32)
7359                            : (is64bit ? aco_opcode::ds_read2_b64 : aco_opcode::ds_read2_b32);
7360       ds = bld.ds(op, tmp_dst, address, m, offset0, offset1);
7361    }
7362    ds->ds().sync = memory_sync_info(storage_shared);
7363    if (m.isUndefined())
7364       ds->operands.pop_back();
7365 
7366    if (!is_store) {
7367       Temp dst = get_ssa_temp(ctx, &instr->def);
7368       if (dst.type() == RegType::sgpr) {
7369          emit_split_vector(ctx, ds->definitions[0].getTemp(), dst.size());
7370          Temp comp[4];
7371          /* Use scalar v_readfirstlane_b32 for better 32-bit copy propagation */
7372          for (unsigned i = 0; i < dst.size(); i++)
7373             comp[i] = bld.as_uniform(emit_extract_vector(ctx, ds->definitions[0].getTemp(), i, v1));
7374          if (is64bit) {
7375             Temp comp0 = bld.pseudo(aco_opcode::p_create_vector, bld.def(s2), comp[0], comp[1]);
7376             Temp comp1 = bld.pseudo(aco_opcode::p_create_vector, bld.def(s2), comp[2], comp[3]);
7377             ctx->allocated_vec[comp0.id()] = {comp[0], comp[1]};
7378             ctx->allocated_vec[comp1.id()] = {comp[2], comp[3]};
7379             bld.pseudo(aco_opcode::p_create_vector, Definition(dst), comp0, comp1);
7380             ctx->allocated_vec[dst.id()] = {comp0, comp1};
7381          } else {
7382             bld.pseudo(aco_opcode::p_create_vector, Definition(dst), comp[0], comp[1]);
7383          }
7384       }
7385 
7386       emit_split_vector(ctx, dst, 2);
7387    }
7388 }
7389 
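/* Build the buffer descriptor used for MUBUF-based scratch access: swizzled,
 * per-thread addressing (add_tid/index_stride), with the base taken from the
 * private segment buffer or loaded via the aco scratch address symbols.
 */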
7390 Temp
7391 get_scratch_resource(isel_context* ctx)
7392 {
7393    Builder bld(ctx->program, ctx->block);
7394    Temp scratch_addr = ctx->program->private_segment_buffer;
7395    if (!scratch_addr.bytes()) {
7396       Temp addr_lo =
7397          bld.sop1(aco_opcode::p_load_symbol, bld.def(s1), Operand::c32(aco_symbol_scratch_addr_lo));
7398       Temp addr_hi =
7399          bld.sop1(aco_opcode::p_load_symbol, bld.def(s1), Operand::c32(aco_symbol_scratch_addr_hi));
7400       scratch_addr = bld.pseudo(aco_opcode::p_create_vector, bld.def(s2), addr_lo, addr_hi);
7401    } else if (ctx->stage.hw != AC_HW_COMPUTE_SHADER) {
7402       scratch_addr =
7403          bld.smem(aco_opcode::s_load_dwordx2, bld.def(s2), scratch_addr, Operand::zero());
7404    }
7405 
7406    struct ac_buffer_state ac_state = {0};
7407    uint32_t desc[4];
7408 
7409    ac_state.size = 0xffffffff;
7410    ac_state.format = PIPE_FORMAT_R32_FLOAT;
7411    for (int i = 0; i < 4; i++)
7412       ac_state.swizzle[i] = PIPE_SWIZZLE_0;
7413    /* Older generations need an element size of 4 bytes; the element_size field was removed in GFX9. */
7414    ac_state.element_size = ctx->program->gfx_level <= GFX8 ? 1u : 0u;
7415    ac_state.index_stride = ctx->program->wave_size == 64 ? 3u : 2u;
7416    ac_state.add_tid = true;
7417    ac_state.gfx10_oob_select = V_008F0C_OOB_SELECT_RAW;
7418 
7419    ac_build_buffer_descriptor(ctx->program->gfx_level, &ac_state, desc);
7420 
7421    return bld.pseudo(aco_opcode::p_create_vector, bld.def(s4), scratch_addr, Operand::c32(desc[2]),
7422                      Operand::c32(desc[3]));
7423 }
7424 
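/* On GFX9+ scratch uses scratch_* (FLAT) addressing with a bounded constant offset;
 * older GPUs go through the MUBUF scratch descriptor with the scratch offset in soffset.
 */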
7425 void
7426 visit_load_scratch(isel_context* ctx, nir_intrinsic_instr* instr)
7427 {
7428    Builder bld(ctx->program, ctx->block);
7429    Temp dst = get_ssa_temp(ctx, &instr->def);
7430 
7431    LoadEmitInfo info = {Operand(v1), dst, instr->def.num_components, instr->def.bit_size / 8u};
7432    info.align_mul = nir_intrinsic_align_mul(instr);
7433    info.align_offset = nir_intrinsic_align_offset(instr);
7434    info.cache = get_cache_flags(ctx, ACCESS_TYPE_LOAD | ACCESS_IS_SWIZZLED_AMD);
7435    info.swizzle_component_size = ctx->program->gfx_level <= GFX8 ? 4 : 0;
7436    info.sync = memory_sync_info(storage_scratch, semantic_private);
7437    if (ctx->program->gfx_level >= GFX9) {
7438       if (nir_src_is_const(instr->src[0])) {
7439          uint32_t max = ctx->program->dev.scratch_global_offset_max + 1;
7440          info.offset =
7441             bld.copy(bld.def(s1), Operand::c32(ROUND_DOWN_TO(nir_src_as_uint(instr->src[0]), max)));
7442          info.const_offset = nir_src_as_uint(instr->src[0]) % max;
7443       } else {
7444          info.offset = Operand(get_ssa_temp(ctx, instr->src[0].ssa));
7445       }
7446       EmitLoadParameters params = scratch_flat_load_params;
7447       params.max_const_offset_plus_one = ctx->program->dev.scratch_global_offset_max + 1;
7448       emit_load(ctx, bld, info, params);
7449    } else {
7450       info.resource = get_scratch_resource(ctx);
7451       info.offset = Operand(as_vgpr(ctx, get_ssa_temp(ctx, instr->src[0].ssa)));
7452       info.soffset = ctx->program->scratch_offset;
7453       emit_load(ctx, bld, info, scratch_mubuf_load_params);
7454    }
7455 }
7456 
7457 void
7458 visit_store_scratch(isel_context* ctx, nir_intrinsic_instr* instr)
7459 {
7460    Builder bld(ctx->program, ctx->block);
7461    Temp data = as_vgpr(ctx, get_ssa_temp(ctx, instr->src[0].ssa));
7462    Temp offset = get_ssa_temp(ctx, instr->src[1].ssa);
7463 
7464    unsigned elem_size_bytes = instr->src[0].ssa->bit_size / 8;
7465    unsigned writemask = util_widen_mask(nir_intrinsic_write_mask(instr), elem_size_bytes);
7466 
7467    unsigned write_count = 0;
7468    Temp write_datas[32];
7469    unsigned offsets[32];
7470    unsigned swizzle_component_size = ctx->program->gfx_level <= GFX8 ? 4 : 16;
7471    split_buffer_store(ctx, instr, false, RegType::vgpr, data, writemask, swizzle_component_size,
7472                       &write_count, write_datas, offsets);
7473 
7474    if (ctx->program->gfx_level >= GFX9) {
7475       uint32_t max = ctx->program->dev.scratch_global_offset_max + 1;
7476       offset = nir_src_is_const(instr->src[1]) ? Temp(0, s1) : offset;
7477       uint32_t base_const_offset =
7478          nir_src_is_const(instr->src[1]) ? nir_src_as_uint(instr->src[1]) : 0;
7479 
7480       for (unsigned i = 0; i < write_count; i++) {
7481          aco_opcode op;
7482          switch (write_datas[i].bytes()) {
7483          case 1: op = aco_opcode::scratch_store_byte; break;
7484          case 2: op = aco_opcode::scratch_store_short; break;
7485          case 4: op = aco_opcode::scratch_store_dword; break;
7486          case 8: op = aco_opcode::scratch_store_dwordx2; break;
7487          case 12: op = aco_opcode::scratch_store_dwordx3; break;
7488          case 16: op = aco_opcode::scratch_store_dwordx4; break;
7489          default: unreachable("Unexpected store size");
7490          }
7491 
7492          uint32_t const_offset = base_const_offset + offsets[i];
7493          assert(const_offset < max || offset.id() == 0);
7494 
7495          Operand addr = offset.regClass() == s1 ? Operand(v1) : Operand(offset);
7496          Operand saddr = offset.regClass() == s1 ? Operand(offset) : Operand(s1);
7497          if (offset.id() == 0)
7498             saddr = bld.copy(bld.def(s1), Operand::c32(ROUND_DOWN_TO(const_offset, max)));
7499 
7500          bld.scratch(op, addr, saddr, write_datas[i], const_offset % max,
7501                      memory_sync_info(storage_scratch, semantic_private));
7502       }
7503    } else {
7504       Temp rsrc = get_scratch_resource(ctx);
7505       offset = as_vgpr(ctx, offset);
7506       for (unsigned i = 0; i < write_count; i++) {
7507          aco_opcode op = get_buffer_store_op(write_datas[i].bytes());
7508          Instruction* mubuf = bld.mubuf(op, rsrc, offset, ctx->program->scratch_offset,
7509                                         write_datas[i], offsets[i], true);
7510          mubuf->mubuf().sync = memory_sync_info(storage_scratch, semantic_private);
7511          unsigned access = ACCESS_TYPE_STORE | ACCESS_IS_SWIZZLED_AMD |
7512                            (write_datas[i].bytes() < 4 ? ACCESS_MAY_STORE_SUBDWORD : 0);
7513          mubuf->mubuf().cache = get_cache_flags(ctx, access);
7514       }
7515    }
7516 }
7517 
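/* Translate a NIR reduction opcode and bit size into the matching ACO ReduceOp. */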
7518 ReduceOp
7519 get_reduce_op(nir_op op, unsigned bit_size)
7520 {
7521    switch (op) {
7522 #define CASEI(name)                                                                                \
7523    case nir_op_##name:                                                                             \
7524       return (bit_size == 32)   ? name##32                                                         \
7525              : (bit_size == 16) ? name##16                                                         \
7526              : (bit_size == 8)  ? name##8                                                          \
7527                                 : name##64;
7528 #define CASEF(name)                                                                                \
7529    case nir_op_##name: return (bit_size == 32) ? name##32 : (bit_size == 16) ? name##16 : name##64;
7530       CASEI(iadd)
7531       CASEI(imul)
7532       CASEI(imin)
7533       CASEI(umin)
7534       CASEI(imax)
7535       CASEI(umax)
7536       CASEI(iand)
7537       CASEI(ior)
7538       CASEI(ixor)
7539       CASEF(fadd)
7540       CASEF(fmul)
7541       CASEF(fmin)
7542       CASEF(fmax)
7543    default: unreachable("unknown reduction op");
7544 #undef CASEI
7545 #undef CASEF
7546    }
7547 }
7548 
7549 void
7550 emit_uniform_subgroup(isel_context* ctx, nir_intrinsic_instr* instr, Temp src)
7551 {
7552    Builder bld(ctx->program, ctx->block);
7553    Definition dst(get_ssa_temp(ctx, &instr->def));
7554    assert(dst.regClass().type() != RegType::vgpr);
7555    if (src.regClass().type() == RegType::vgpr)
7556       bld.pseudo(aco_opcode::p_as_uniform, dst, src);
7557    else
7558       bld.copy(dst, src);
7559 }
7560 
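/* With a uniform source, an additive reduction or scan collapses to a multiply:
 * src * count for iadd/fadd and src * (count & 1) for ixor, where count is the
 * number of contributing lanes.
 */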
7561 void
7562 emit_addition_uniform_reduce(isel_context* ctx, nir_op op, Definition dst, nir_src src, Temp count)
7563 {
7564    Builder bld(ctx->program, ctx->block);
7565    Temp src_tmp = get_ssa_temp(ctx, src.ssa);
7566 
7567    if (op == nir_op_fadd) {
7568       src_tmp = as_vgpr(ctx, src_tmp);
7569       Temp tmp = dst.regClass() == s1 ? bld.tmp(RegClass::get(RegType::vgpr, src.ssa->bit_size / 8))
7570                                       : dst.getTemp();
7571 
7572       if (src.ssa->bit_size == 16) {
7573          count = bld.vop1(aco_opcode::v_cvt_f16_u16, bld.def(v2b), count);
7574          bld.vop2(aco_opcode::v_mul_f16, Definition(tmp), count, src_tmp);
7575       } else {
7576          assert(src.ssa->bit_size == 32);
7577          count = bld.vop1(aco_opcode::v_cvt_f32_u32, bld.def(v1), count);
7578          bld.vop2(aco_opcode::v_mul_f32, Definition(tmp), count, src_tmp);
7579       }
7580 
7581       if (tmp != dst.getTemp())
7582          bld.pseudo(aco_opcode::p_as_uniform, dst, tmp);
7583 
7584       return;
7585    }
7586 
7587    if (dst.regClass() == s1)
7588       src_tmp = bld.as_uniform(src_tmp);
7589 
7590    if (op == nir_op_ixor && count.type() == RegType::sgpr)
7591       count =
7592          bld.sop2(aco_opcode::s_and_b32, bld.def(s1), bld.def(s1, scc), count, Operand::c32(1u));
7593    else if (op == nir_op_ixor)
7594       count = bld.vop2(aco_opcode::v_and_b32, bld.def(v1), Operand::c32(1u), count);
7595 
7596    assert(dst.getTemp().type() == count.type());
7597 
7598    if (nir_src_is_const(src)) {
7599       uint32_t imm = nir_src_as_uint(src);
7600       if (imm == 1 && dst.bytes() <= 2)
7601          bld.pseudo(aco_opcode::p_extract_vector, dst, count, Operand::zero());
7602       else if (imm == 1)
7603          bld.copy(dst, count);
7604       else if (imm == 0)
7605          bld.copy(dst, Operand::zero(dst.bytes()));
7606       else if (count.type() == RegType::vgpr)
7607          bld.v_mul_imm(dst, count, imm, true, true);
7608       else if (imm == 0xffffffff)
7609          bld.sop2(aco_opcode::s_sub_i32, dst, bld.def(s1, scc), Operand::zero(), count);
7610       else if (util_is_power_of_two_or_zero(imm))
7611          bld.sop2(aco_opcode::s_lshl_b32, dst, bld.def(s1, scc), count,
7612                   Operand::c32(ffs(imm) - 1u));
7613       else
7614          bld.sop2(aco_opcode::s_mul_i32, dst, src_tmp, count);
7615    } else if (dst.bytes() <= 2 && ctx->program->gfx_level >= GFX10) {
7616       bld.vop3(aco_opcode::v_mul_lo_u16_e64, dst, src_tmp, count);
7617    } else if (dst.bytes() <= 2 && ctx->program->gfx_level >= GFX8) {
7618       bld.vop2(aco_opcode::v_mul_lo_u16, dst, src_tmp, count);
7619    } else if (dst.getTemp().type() == RegType::vgpr) {
7620       bld.vop3(aco_opcode::v_mul_lo_u32, dst, src_tmp, count);
7621    } else {
7622       bld.sop2(aco_opcode::s_mul_i32, dst, src_tmp, count);
7623    }
7624 }
7625 
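/* Reductions of a uniform source: additive ops scale the value by the active lane
 * count, while min/max/and/or reductions are just the value itself. imul/fmul still
 * go through the full reduction path (return false).
 */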
7626 bool
7627 emit_uniform_reduce(isel_context* ctx, nir_intrinsic_instr* instr)
7628 {
7629    nir_op op = (nir_op)nir_intrinsic_reduction_op(instr);
7630    if (op == nir_op_imul || op == nir_op_fmul)
7631       return false;
7632 
7633    if (op == nir_op_iadd || op == nir_op_ixor || op == nir_op_fadd) {
7634       Builder bld(ctx->program, ctx->block);
7635       Definition dst(get_ssa_temp(ctx, &instr->def));
7636       unsigned bit_size = instr->src[0].ssa->bit_size;
7637       if (bit_size > 32)
7638          return false;
7639 
7640       Temp thread_count =
7641          bld.sop1(Builder::s_bcnt1_i32, bld.def(s1), bld.def(s1, scc), Operand(exec, bld.lm));
7642       set_wqm(ctx);
7643 
7644       emit_addition_uniform_reduce(ctx, op, dst, instr->src[0], thread_count);
7645    } else {
7646       emit_uniform_subgroup(ctx, instr, get_ssa_temp(ctx, instr->src[0].ssa));
7647    }
7648 
7649    return true;
7650 }
7651 
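/* Scans of a uniform source: additive ops scale the value by each lane's prefix
 * count (mbcnt), other inclusive scans are just the value, and other exclusive scans
 * copy the value and write the operation's identity into the first active lane.
 */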
7652 bool
7653 emit_uniform_scan(isel_context* ctx, nir_intrinsic_instr* instr)
7654 {
7655    Builder bld(ctx->program, ctx->block);
7656    Definition dst(get_ssa_temp(ctx, &instr->def));
7657    nir_op op = (nir_op)nir_intrinsic_reduction_op(instr);
7658    bool inc = instr->intrinsic == nir_intrinsic_inclusive_scan;
7659 
7660    if (op == nir_op_imul || op == nir_op_fmul)
7661       return false;
7662 
7663    if (op == nir_op_iadd || op == nir_op_ixor || op == nir_op_fadd) {
7664       if (instr->src[0].ssa->bit_size > 32)
7665          return false;
7666 
7667       Temp packed_tid;
7668       if (inc)
7669          packed_tid = emit_mbcnt(ctx, bld.tmp(v1), Operand(exec, bld.lm), Operand::c32(1u));
7670       else
7671          packed_tid = emit_mbcnt(ctx, bld.tmp(v1), Operand(exec, bld.lm));
7672       set_wqm(ctx);
7673 
7674       emit_addition_uniform_reduce(ctx, op, dst, instr->src[0], packed_tid);
7675       return true;
7676    }
7677 
7678    assert(op == nir_op_imin || op == nir_op_umin || op == nir_op_imax || op == nir_op_umax ||
7679           op == nir_op_iand || op == nir_op_ior || op == nir_op_fmin || op == nir_op_fmax);
7680 
7681    if (inc) {
7682       emit_uniform_subgroup(ctx, instr, get_ssa_temp(ctx, instr->src[0].ssa));
7683       return true;
7684    }
7685 
7686    /* Copy the source and write the reduction operation identity to the first lane. */
7687    Temp lane = bld.sop1(Builder::s_ff1_i32, bld.def(s1), Operand(exec, bld.lm));
7688    Temp src = get_ssa_temp(ctx, instr->src[0].ssa);
7689    ReduceOp reduce_op = get_reduce_op(op, instr->src[0].ssa->bit_size);
7690    if (dst.bytes() == 8) {
7691       Temp lo = bld.tmp(v1), hi = bld.tmp(v1);
7692       bld.pseudo(aco_opcode::p_split_vector, Definition(lo), Definition(hi), src);
7693       uint32_t identity_lo = get_reduction_identity(reduce_op, 0);
7694       uint32_t identity_hi = get_reduction_identity(reduce_op, 1);
7695 
7696       lo =
7697          bld.writelane(bld.def(v1), bld.copy(bld.def(s1, m0), Operand::c32(identity_lo)), lane, lo);
7698       hi =
7699          bld.writelane(bld.def(v1), bld.copy(bld.def(s1, m0), Operand::c32(identity_hi)), lane, hi);
7700       bld.pseudo(aco_opcode::p_create_vector, dst, lo, hi);
7701    } else {
7702       uint32_t identity = get_reduction_identity(reduce_op, 0);
7703       bld.writelane(dst, bld.copy(bld.def(s1, m0), Operand::c32(identity)), lane,
7704                     as_vgpr(ctx, src));
7705    }
7706 
7707    set_wqm(ctx);
7708    return true;
7709 }
7710 
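/* Emit the PSEUDO_REDUCTION instruction. Besides the result it declares the extra
 * definitions (exec save/restore, an optional scalar identity temporary, scc and
 * sometimes vcc clobbers) and linear temporary operands the reduction lowering needs.
 */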
7711 Temp
7712 emit_reduction_instr(isel_context* ctx, aco_opcode aco_op, ReduceOp op, unsigned cluster_size,
7713                      Definition dst, Temp src)
7714 {
7715    assert(src.bytes() <= 8);
7716    assert(src.type() == RegType::vgpr);
7717 
7718    Builder bld(ctx->program, ctx->block);
7719 
7720    unsigned num_defs = 0;
7721    Definition defs[5];
7722    defs[num_defs++] = dst;
7723    defs[num_defs++] = bld.def(bld.lm); /* used internally to save/restore exec */
7724 
7725    /* scalar identity temporary */
7726    bool need_sitmp = (ctx->program->gfx_level <= GFX7 || ctx->program->gfx_level >= GFX10) &&
7727                      aco_op != aco_opcode::p_reduce;
7728    if (aco_op == aco_opcode::p_exclusive_scan) {
7729       need_sitmp |= (op == imin8 || op == imin16 || op == imin32 || op == imin64 || op == imax8 ||
7730                      op == imax16 || op == imax32 || op == imax64 || op == fmin16 || op == fmin32 ||
7731                      op == fmin64 || op == fmax16 || op == fmax32 || op == fmax64 || op == fmul16 ||
7732                      op == fmul64);
7733    }
7734    if (need_sitmp)
7735       defs[num_defs++] = bld.def(RegType::sgpr, dst.size());
7736 
7737    /* scc clobber */
7738    defs[num_defs++] = bld.def(s1, scc);
7739 
7740    /* vcc clobber */
7741    bool clobber_vcc = false;
7742    if ((op == iadd32 || op == imul64) && ctx->program->gfx_level < GFX9)
7743       clobber_vcc = true;
7744    if ((op == iadd8 || op == iadd16) && ctx->program->gfx_level < GFX8)
7745       clobber_vcc = true;
7746    if (op == iadd64 || op == umin64 || op == umax64 || op == imin64 || op == imax64)
7747       clobber_vcc = true;
7748 
7749    if (clobber_vcc)
7750       defs[num_defs++] = bld.def(bld.lm, vcc);
7751 
7752    Instruction* reduce = create_instruction(aco_op, Format::PSEUDO_REDUCTION, 3, num_defs);
7753    reduce->operands[0] = Operand(src);
7754    /* setup_reduce_temp will update these undef operands if needed */
7755    reduce->operands[1] = Operand(RegClass(RegType::vgpr, dst.size()).as_linear());
7756    reduce->operands[2] = Operand(v1.as_linear());
7757    std::copy(defs, defs + num_defs, reduce->definitions.begin());
7758 
7759    reduce->reduction().reduce_op = op;
7760    reduce->reduction().cluster_size = cluster_size;
7761    bld.insert(std::move(reduce));
7762 
7763    return dst.getTemp();
7764 }
7765 
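/* Turn an inclusive scan result into an exclusive one by subtracting (iadd) or
 * XOR-ing (ixor) each lane's own source back out.
 */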
7766 Temp
7767 inclusive_scan_to_exclusive(isel_context* ctx, ReduceOp op, Definition dst, Temp src)
7768 {
7769    Builder bld(ctx->program, ctx->block);
7770 
7771    Temp scan = emit_reduction_instr(ctx, aco_opcode::p_inclusive_scan, op, ctx->program->wave_size,
7772                                     bld.def(dst.regClass()), src);
7773 
7774    switch (op) {
7775    case iadd8:
7776    case iadd16:
7777    case iadd32: return bld.vsub32(dst, scan, src);
7778    case ixor64:
7779    case iadd64: {
7780       Temp src00 = bld.tmp(v1);
7781       Temp src01 = bld.tmp(v1);
7782       bld.pseudo(aco_opcode::p_split_vector, Definition(src00), Definition(src01), scan);
7783       Temp src10 = bld.tmp(v1);
7784       Temp src11 = bld.tmp(v1);
7785       bld.pseudo(aco_opcode::p_split_vector, Definition(src10), Definition(src11), src);
7786 
7787       Temp lower = bld.tmp(v1);
7788       Temp upper = bld.tmp(v1);
7789       if (op == iadd64) {
7790          Temp borrow = bld.vsub32(Definition(lower), src00, src10, true).def(1).getTemp();
7791          bld.vsub32(Definition(upper), src01, src11, false, borrow);
7792       } else {
7793          bld.vop2(aco_opcode::v_xor_b32, Definition(lower), src00, src10);
7794          bld.vop2(aco_opcode::v_xor_b32, Definition(upper), src01, src11);
7795       }
7796       return bld.pseudo(aco_opcode::p_create_vector, dst, lower, upper);
7797    }
7798    case ixor8:
7799    case ixor16:
7800    case ixor32: return bld.vop2(aco_opcode::v_xor_b32, dst, scan, src);
7801    default: unreachable("Unsupported op");
7802    }
7803 }
7804 
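/* Try to perform a subgroup rotate by a constant amount with a single swizzle, DPP
 * or permlane instruction. Returns false (dst id 0) when no suitable instruction
 * exists for this cluster size and GPU generation.
 */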
7805 bool
7806 emit_rotate_by_constant(isel_context* ctx, Temp& dst, Temp src, unsigned cluster_size,
7807                         uint64_t delta)
7808 {
7809    Builder bld(ctx->program, ctx->block);
7810    RegClass rc = src.regClass();
7811    dst = Temp(0, rc);
7812    delta %= cluster_size;
7813 
7814    if (delta == 0) {
7815       dst = bld.copy(bld.def(rc), src);
7816    } else if (delta * 2 == cluster_size && cluster_size <= 32) {
7817       dst = emit_masked_swizzle(ctx, bld, src, ds_pattern_bitmode(0x1f, 0, delta), true);
7818    } else if (cluster_size == 4) {
7819       unsigned res[4];
7820       for (unsigned i = 0; i < 4; i++)
7821          res[i] = (i + delta) & 0x3;
7822       uint32_t dpp_ctrl = dpp_quad_perm(res[0], res[1], res[2], res[3]);
7823       if (ctx->program->gfx_level >= GFX8)
7824          dst = bld.vop1_dpp(aco_opcode::v_mov_b32, bld.def(rc), src, dpp_ctrl);
7825       else
7826          dst = bld.ds(aco_opcode::ds_swizzle_b32, bld.def(v1), src, (1 << 15) | dpp_ctrl);
7827    } else if (cluster_size == 8 && ctx->program->gfx_level >= GFX10) {
7828       uint32_t lane_sel = 0;
7829       for (unsigned i = 0; i < 8; i++)
7830          lane_sel |= ((i + delta) & 0x7) << (i * 3);
7831       dst = bld.vop1_dpp8(aco_opcode::v_mov_b32, bld.def(rc), src, lane_sel);
7832    } else if (cluster_size == 16 && ctx->program->gfx_level >= GFX8) {
7833       dst = bld.vop1_dpp(aco_opcode::v_mov_b32, bld.def(rc), src, dpp_row_rr(16 - delta));
7834    } else if (cluster_size <= 32 && ctx->program->gfx_level >= GFX8) {
7835       uint32_t ctrl = ds_pattern_rotate(delta, ~(cluster_size - 1) & 0x1f);
7836       dst = bld.ds(aco_opcode::ds_swizzle_b32, bld.def(v1), src, ctrl);
7837    } else if (cluster_size == 64) {
7838       bool has_wf_dpp = ctx->program->gfx_level >= GFX8 && ctx->program->gfx_level < GFX10;
7839       if (delta == 32 && ctx->program->gfx_level >= GFX11) {
7840          dst = bld.vop1(aco_opcode::v_permlane64_b32, bld.def(rc), src);
7841       } else if (delta == 1 && has_wf_dpp) {
7842          dst = bld.vop1_dpp(aco_opcode::v_mov_b32, bld.def(rc), src, dpp_wf_rl1);
7843       } else if (delta == 63 && has_wf_dpp) {
7844          dst = bld.vop1_dpp(aco_opcode::v_mov_b32, bld.def(rc), src, dpp_wf_rr1);
7845       }
7846    }
7847 
7848    return dst.id() != 0;
7849 }
7850 
7851 Temp merged_wave_info_to_mask(isel_context* ctx, unsigned i);
7852 Temp lanecount_to_mask(isel_context* ctx, Temp count, unsigned bit_offset);
7853 void pops_await_overlapped_waves(isel_context* ctx);
7854 
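/* Decode the ordered-count index operand and pack the ds_ordered_count offsets:
 * offset0 is the dword index of the counter, offset1 packs wave_release, wave_done,
 * the count dword and (before GFX11) the GS shader type.
 */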
7855 void
7856 ds_ordered_count_offsets(isel_context* ctx, unsigned index_operand, unsigned wave_release,
7857                          unsigned wave_done, unsigned* offset0, unsigned* offset1)
7858 {
7859    unsigned ordered_count_index = index_operand & 0x3f;
7860    unsigned count_dword = (index_operand >> 24) & 0xf;
7861 
7862    assert(ctx->options->gfx_level >= GFX10);
7863    assert(count_dword >= 1 && count_dword <= 4);
7864 
7865    *offset0 = ordered_count_index << 2;
7866    *offset1 = wave_release | (wave_done << 1) | ((count_dword - 1) << 6);
7867 
7868    if (ctx->options->gfx_level < GFX11)
7869       *offset1 |= 3 /* GS shader type */ << 2;
7870 }
7871 
7872 struct aco_export_mrt {
7873    Operand out[4];
7874    unsigned enabled_channels;
7875    unsigned target;
7876    bool compr;
7877 };
7878 
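/* GFX11 dual-source blend export: both MRT0 and MRT1 go into one pseudo instruction
 * (presumably so later lowering can emit the real exports together); the extra
 * definitions are temporaries and clobbers for that lowering.
 */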
7879 static void
7880 create_fs_dual_src_export_gfx11(isel_context* ctx, const struct aco_export_mrt* mrt0,
7881                                 const struct aco_export_mrt* mrt1)
7882 {
7883    Builder bld(ctx->program, ctx->block);
7884 
7885    aco_ptr<Instruction> exp{
7886       create_instruction(aco_opcode::p_dual_src_export_gfx11, Format::PSEUDO, 8, 6)};
7887    for (unsigned i = 0; i < 4; i++) {
7888       exp->operands[i] = mrt0 ? mrt0->out[i] : Operand(v1);
7889       exp->operands[i + 4] = mrt1 ? mrt1->out[i] : Operand(v1);
7890    }
7891 
7892    RegClass type = RegClass(RegType::vgpr, util_bitcount(mrt0->enabled_channels));
7893    exp->definitions[0] = bld.def(type); /* mrt0 */
7894    exp->definitions[1] = bld.def(type); /* mrt1 */
7895    exp->definitions[2] = bld.def(bld.lm);
7896    exp->definitions[3] = bld.def(bld.lm);
7897    exp->definitions[4] = bld.def(bld.lm, vcc);
7898    exp->definitions[5] = bld.def(s1, scc);
7899    ctx->block->instructions.emplace_back(std::move(exp));
7900 
7901    ctx->program->has_color_exports = true;
7902 }
7903 
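/* Cooperative matrix multiply-add: choose the WMMA opcode from the source and
 * destination bit sizes; for 8-bit integer inputs the signedness mask is encoded in
 * neg_lo and saturation in clamp.
 */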
7904 static void
7905 visit_cmat_muladd(isel_context* ctx, nir_intrinsic_instr* instr)
7906 {
7907    aco_opcode opcode = aco_opcode::num_opcodes;
7908    unsigned signed_mask = 0;
7909    bool clamp = false;
7910 
7911    switch (instr->src[0].ssa->bit_size) {
7912    case 16:
7913       switch (instr->def.bit_size) {
7914       case 32: opcode = aco_opcode::v_wmma_f32_16x16x16_f16; break;
7915       case 16: opcode = aco_opcode::v_wmma_f16_16x16x16_f16; break;
7916       }
7917       break;
7918    case 8:
7919       opcode = aco_opcode::v_wmma_i32_16x16x16_iu8;
7920       signed_mask = nir_intrinsic_cmat_signed_mask(instr);
7921       clamp = nir_intrinsic_saturate(instr);
7922       break;
7923    }
7924 
7925    if (opcode == aco_opcode::num_opcodes)
7926       unreachable("visit_cmat_muladd: invalid bit size combination");
7927 
7928    Builder bld(ctx->program, ctx->block);
7929 
7930    Temp dst = get_ssa_temp(ctx, &instr->def);
7931    Operand A(as_vgpr(ctx, get_ssa_temp(ctx, instr->src[0].ssa)));
7932    Operand B(as_vgpr(ctx, get_ssa_temp(ctx, instr->src[1].ssa)));
7933    Operand C(as_vgpr(ctx, get_ssa_temp(ctx, instr->src[2].ssa)));
7934 
7935    VALU_instruction& vop3p = bld.vop3p(opcode, Definition(dst), A, B, C, 0, 0)->valu();
7936    vop3p.neg_lo[0] = (signed_mask & 0x1) != 0;
7937    vop3p.neg_lo[1] = (signed_mask & 0x2) != 0;
7938    vop3p.clamp = clamp;
7939 
7940    emit_split_vector(ctx, dst, instr->def.num_components);
7941 }
7942 
7943 static void begin_empty_exec_skip(isel_context* ctx, nir_instr* instr, nir_block* block);
7944 
7945 static void end_empty_exec_skip(isel_context* ctx);
7946 
7947 void
7948 visit_intrinsic(isel_context* ctx, nir_intrinsic_instr* instr)
7949 {
7950    Builder bld(ctx->program, ctx->block);
7951    switch (instr->intrinsic) {
7952    case nir_intrinsic_load_interpolated_input: visit_load_interpolated_input(ctx, instr); break;
7953    case nir_intrinsic_store_output: visit_store_output(ctx, instr); break;
7954    case nir_intrinsic_load_input:
7955    case nir_intrinsic_load_per_primitive_input:
7956    case nir_intrinsic_load_input_vertex:
7957       if (ctx->program->stage == fragment_fs)
7958          visit_load_fs_input(ctx, instr);
7959       else
7960          isel_err(&instr->instr, "Shader inputs should have been lowered in NIR.");
7961       break;
7962    case nir_intrinsic_load_per_vertex_input: visit_load_per_vertex_input(ctx, instr); break;
7963    case nir_intrinsic_load_ubo: visit_load_ubo(ctx, instr); break;
7964    case nir_intrinsic_load_constant: visit_load_constant(ctx, instr); break;
7965    case nir_intrinsic_load_shared: visit_load_shared(ctx, instr); break;
7966    case nir_intrinsic_store_shared: visit_store_shared(ctx, instr); break;
7967    case nir_intrinsic_shared_atomic:
7968    case nir_intrinsic_shared_atomic_swap: visit_shared_atomic(ctx, instr); break;
7969    case nir_intrinsic_shared_append_amd:
7970    case nir_intrinsic_shared_consume_amd: visit_shared_append(ctx, instr); break;
7971    case nir_intrinsic_load_shared2_amd:
7972    case nir_intrinsic_store_shared2_amd: visit_access_shared2_amd(ctx, instr); break;
7973    case nir_intrinsic_bindless_image_load:
7974    case nir_intrinsic_bindless_image_fragment_mask_load_amd:
7975    case nir_intrinsic_bindless_image_sparse_load: visit_image_load(ctx, instr); break;
7976    case nir_intrinsic_bindless_image_store: visit_image_store(ctx, instr); break;
7977    case nir_intrinsic_bindless_image_atomic:
7978    case nir_intrinsic_bindless_image_atomic_swap: visit_image_atomic(ctx, instr); break;
7979    case nir_intrinsic_load_ssbo: visit_load_ssbo(ctx, instr); break;
7980    case nir_intrinsic_store_ssbo: visit_store_ssbo(ctx, instr); break;
7981    case nir_intrinsic_load_typed_buffer_amd:
7982    case nir_intrinsic_load_buffer_amd: visit_load_buffer(ctx, instr); break;
7983    case nir_intrinsic_store_buffer_amd: visit_store_buffer(ctx, instr); break;
7984    case nir_intrinsic_load_smem_amd: visit_load_smem(ctx, instr); break;
7985    case nir_intrinsic_load_global_amd: visit_load_global(ctx, instr); break;
7986    case nir_intrinsic_store_global_amd: visit_store_global(ctx, instr); break;
7987    case nir_intrinsic_global_atomic_amd:
7988    case nir_intrinsic_global_atomic_swap_amd: visit_global_atomic(ctx, instr); break;
7989    case nir_intrinsic_ssbo_atomic:
7990    case nir_intrinsic_ssbo_atomic_swap: visit_atomic_ssbo(ctx, instr); break;
7991    case nir_intrinsic_load_scratch: visit_load_scratch(ctx, instr); break;
7992    case nir_intrinsic_store_scratch: visit_store_scratch(ctx, instr); break;
7993    case nir_intrinsic_barrier: emit_barrier(ctx, instr); break;
7994    case nir_intrinsic_load_num_workgroups: {
7995       Temp dst = get_ssa_temp(ctx, &instr->def);
7996       if (ctx->options->load_grid_size_from_user_sgpr) {
7997          bld.copy(Definition(dst), get_arg(ctx, ctx->args->num_work_groups));
7998       } else {
7999          Temp addr = get_arg(ctx, ctx->args->num_work_groups);
8000          assert(addr.regClass() == s2);
8001          bld.pseudo(aco_opcode::p_create_vector, Definition(dst),
8002                     bld.smem(aco_opcode::s_load_dwordx2, bld.def(s2), addr, Operand::zero()),
8003                     bld.smem(aco_opcode::s_load_dword, bld.def(s1), addr, Operand::c32(8)));
8004       }
8005       emit_split_vector(ctx, dst, 3);
8006       break;
8007    }
8008    case nir_intrinsic_load_workgroup_id: {
8009       Temp dst = get_ssa_temp(ctx, &instr->def);
8010       if (ctx->stage.hw == AC_HW_COMPUTE_SHADER) {
8011          bld.pseudo(aco_opcode::p_create_vector, Definition(dst), ctx->workgroup_id[0],
8012                     ctx->workgroup_id[1], ctx->workgroup_id[2]);
8013          emit_split_vector(ctx, dst, 3);
8014       } else {
8015          isel_err(&instr->instr, "Unsupported stage for load_workgroup_id");
8016       }
8017       break;
8018    }
8019    case nir_intrinsic_load_subgroup_id: {
8020       assert(ctx->options->gfx_level >= GFX12 && ctx->stage.hw == AC_HW_COMPUTE_SHADER);
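      /* The subgroup id is read from ttmp8; s_bfe extracts the 5-bit field at bit 25. */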
8021       bld.sop2(aco_opcode::s_bfe_u32, Definition(get_ssa_temp(ctx, &instr->def)), bld.def(s1, scc),
8022                ctx->ttmp8, Operand::c32(25 | (5 << 16)));
8023       break;
8024    }
8025    case nir_intrinsic_ddx:
8026    case nir_intrinsic_ddy:
8027    case nir_intrinsic_ddx_fine:
8028    case nir_intrinsic_ddy_fine:
8029    case nir_intrinsic_ddx_coarse:
8030    case nir_intrinsic_ddy_coarse: {
8031       Temp src = as_vgpr(ctx, get_ssa_temp(ctx, instr->src[0].ssa));
8032       Temp dst = get_ssa_temp(ctx, &instr->def);
8033 
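      /* Derivatives are computed as the difference of two quad lanes selected with DPP
       * quad_perm: fine variants produce per-pixel-pair results, coarse variants reuse
       * one difference for the whole quad.
       */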
8034       uint16_t dpp_ctrl1, dpp_ctrl2;
8035       if (instr->intrinsic == nir_intrinsic_ddx_fine) {
8036          if (nir_def_all_uses_ignore_sign_bit(&instr->def)) {
8037             dpp_ctrl1 = dpp_quad_perm(1, 0, 3, 2);
8038             dpp_ctrl2 = dpp_quad_perm(0, 1, 2, 3);
8039          } else {
8040             dpp_ctrl1 = dpp_quad_perm(0, 0, 2, 2);
8041             dpp_ctrl2 = dpp_quad_perm(1, 1, 3, 3);
8042          }
8043       } else if (instr->intrinsic == nir_intrinsic_ddy_fine) {
8044          if (nir_def_all_uses_ignore_sign_bit(&instr->def)) {
8045             dpp_ctrl1 = dpp_quad_perm(2, 3, 0, 1);
8046             dpp_ctrl2 = dpp_quad_perm(0, 1, 2, 3);
8047          } else {
8048             dpp_ctrl1 = dpp_quad_perm(0, 1, 0, 1);
8049             dpp_ctrl2 = dpp_quad_perm(2, 3, 2, 3);
8050          }
8051       } else {
8052          dpp_ctrl1 = dpp_quad_perm(0, 0, 0, 0);
8053          if (instr->intrinsic == nir_intrinsic_ddx ||
8054              instr->intrinsic == nir_intrinsic_ddx_coarse)
8055             dpp_ctrl2 = dpp_quad_perm(1, 1, 1, 1);
8056          else
8057             dpp_ctrl2 = dpp_quad_perm(2, 2, 2, 2);
8058       }
8059 
8060       if (dst.regClass() == v1 && instr->def.bit_size == 16) {
8061          assert(instr->def.num_components == 2);
8062 
8063          /* identity swizzle expressed through opsel: low halves feed the low result, high halves the high result */
8064          unsigned opsel_lo = 0b00;
8065          unsigned opsel_hi = 0b11;
8066 
8067          Temp tl = src;
8068          if (nir_src_is_divergent(&instr->src[0]))
8069             tl = bld.vop1_dpp(aco_opcode::v_mov_b32, bld.def(v1), src, dpp_ctrl1);
8070 
8071          Builder::Result sub =
8072             bld.vop3p(aco_opcode::v_pk_add_f16, bld.def(v1), src, tl, opsel_lo, opsel_hi);
8073          sub->valu().neg_lo[1] = true;
8074          sub->valu().neg_hi[1] = true;
8075 
8076          if (nir_src_is_divergent(&instr->src[0]) && dpp_ctrl2 != dpp_quad_perm(0, 1, 2, 3))
8077             bld.vop1_dpp(aco_opcode::v_mov_b32, Definition(dst), sub, dpp_ctrl2);
8078          else
8079             bld.copy(Definition(dst), sub);
8080          emit_split_vector(ctx, dst, 2);
8081       } else {
8082          aco_opcode subrev =
8083             instr->def.bit_size == 16 ? aco_opcode::v_subrev_f16 : aco_opcode::v_subrev_f32;
8084 
8085          /* v_interp with constant sources only works on GFX11/11.5,
8086           * and it's only faster on GFX11.5.
8087           */
8088          bool use_interp = dpp_ctrl1 == dpp_quad_perm(0, 0, 0, 0) && instr->def.bit_size == 32 &&
8089                            ctx->program->gfx_level == GFX11_5;
8090          if (!nir_src_is_divergent(&instr->src[0])) {
8091             bld.vop2(subrev, Definition(dst), src, src);
8092          } else if (use_interp && dpp_ctrl2 == dpp_quad_perm(1, 1, 1, 1)) {
8093             bld.vinterp_inreg(aco_opcode::v_interp_p10_f32_inreg, Definition(dst), src,
8094                               Operand::c32(0x3f800000), src)
8095                ->valu()
8096                .neg[2] = true;
8097          } else if (use_interp && dpp_ctrl2 == dpp_quad_perm(2, 2, 2, 2)) {
8098             Builder::Result tmp = bld.vinterp_inreg(aco_opcode::v_interp_p10_f32_inreg, bld.def(v1),
8099                                                     Operand::c32(0), Operand::c32(0), src);
8100             tmp->valu().neg = 0x6;
8101             bld.vinterp_inreg(aco_opcode::v_interp_p2_f32_inreg, Definition(dst), src,
8102                               Operand::c32(0x3f800000), tmp);
8103          } else if (ctx->program->gfx_level >= GFX8 && dpp_ctrl2 == dpp_quad_perm(0, 1, 2, 3)) {
8104             bld.vop2_dpp(subrev, Definition(dst), src, src, dpp_ctrl1);
8105          } else if (ctx->program->gfx_level >= GFX8) {
8106             Temp tmp = bld.vop2_dpp(subrev, bld.def(v1), src, src, dpp_ctrl1);
8107             bld.vop1_dpp(aco_opcode::v_mov_b32, Definition(dst), tmp, dpp_ctrl2);
8108          } else {
8109             Temp tl = bld.ds(aco_opcode::ds_swizzle_b32, bld.def(v1), src, (1 << 15) | dpp_ctrl1);
8110             Temp tr = src;
8111             if (dpp_ctrl2 != dpp_quad_perm(0, 1, 2, 3))
8112                tr = bld.ds(aco_opcode::ds_swizzle_b32, bld.def(v1), src, (1 << 15) | dpp_ctrl2);
8113             bld.vop2(subrev, Definition(dst), tl, tr);
8114          }
8115       }
8116       set_wqm(ctx, true);
8117       break;
8118    }
8119 
8120    case nir_intrinsic_ballot_relaxed:
8121    case nir_intrinsic_ballot: {
8122       Temp src = get_ssa_temp(ctx, instr->src[0].ssa);
8123       Temp dst = get_ssa_temp(ctx, &instr->def);
8124 
8125       if (instr->src[0].ssa->bit_size == 1) {
8126          assert(src.regClass() == bld.lm);
8127       } else if (instr->src[0].ssa->bit_size == 32 && src.regClass() == v1) {
8128          src = bld.vopc(aco_opcode::v_cmp_lg_u32, bld.def(bld.lm), Operand::zero(), src);
8129       } else if (instr->src[0].ssa->bit_size == 64 && src.regClass() == v2) {
8130          src = bld.vopc(aco_opcode::v_cmp_lg_u64, bld.def(bld.lm), Operand::zero(), src);
8131       } else {
8132          isel_err(&instr->instr, "Unimplemented NIR instr bit size");
8133       }
8134 
8135       /* Make sure that all inactive lanes return zero.
8136        * Value-numbering might remove the comparison above */
8137       Definition def = dst.size() == bld.lm.size() ? Definition(dst) : bld.def(bld.lm);
8138       if (instr->intrinsic == nir_intrinsic_ballot_relaxed)
8139          src = bld.copy(def, src);
8140       else
8141          src = bld.sop2(Builder::s_and, def, bld.def(s1, scc), src, Operand(exec, bld.lm));
8142       if (dst.size() != bld.lm.size()) {
8143          /* Wave32 with ballot size set to 64 */
8144          bld.pseudo(aco_opcode::p_create_vector, Definition(dst), src, Operand::zero());
8145       }
8146 
8147       set_wqm(ctx);
8148       break;
8149    }
8150    case nir_intrinsic_inverse_ballot: {
8151       Temp src = bld.as_uniform(get_ssa_temp(ctx, instr->src[0].ssa));
8152       Temp dst = get_ssa_temp(ctx, &instr->def);
8153 
8154       assert(dst.size() == bld.lm.size());
8155       if (src.size() > dst.size()) {
8156          emit_extract_vector(ctx, src, 0, dst);
8157       } else if (src.size() < dst.size()) {
8158          bld.pseudo(aco_opcode::p_create_vector, Definition(dst), src, Operand::zero());
8159       } else {
8160          bld.copy(Definition(dst), src);
8161       }
8162       break;
8163    }
8164    case nir_intrinsic_shuffle:
8165    case nir_intrinsic_read_invocation: {
8166       Temp src = get_ssa_temp(ctx, instr->src[0].ssa);
8167       assert(instr->def.bit_size != 1);
8168       if (!nir_src_is_divergent(&instr->src[0])) {
8169          emit_uniform_subgroup(ctx, instr, src);
8170       } else {
8171          Temp tid = get_ssa_temp(ctx, instr->src[1].ssa);
8172          if (instr->intrinsic == nir_intrinsic_read_invocation ||
8173              !nir_src_is_divergent(&instr->src[1]))
8174             tid = bld.as_uniform(tid);
8175          Temp dst = get_ssa_temp(ctx, &instr->def);
8176 
8177          src = as_vgpr(ctx, src);
8178 
8179          if (src.regClass() == v1b || src.regClass() == v2b) {
8180             Temp tmp = bld.tmp(v1);
8181             tmp = emit_bpermute(ctx, bld, tid, src);
8182             if (dst.type() == RegType::vgpr)
8183                bld.pseudo(aco_opcode::p_split_vector, Definition(dst),
8184                           bld.def(src.regClass() == v1b ? v3b : v2b), tmp);
8185             else
8186                bld.pseudo(aco_opcode::p_as_uniform, Definition(dst), tmp);
8187          } else if (src.regClass() == v1) {
8188             Temp tmp = emit_bpermute(ctx, bld, tid, src);
8189             bld.copy(Definition(dst), tmp);
8190          } else if (src.regClass() == v2) {
8191             Temp lo = bld.tmp(v1), hi = bld.tmp(v1);
8192             bld.pseudo(aco_opcode::p_split_vector, Definition(lo), Definition(hi), src);
8193             lo = emit_bpermute(ctx, bld, tid, lo);
8194             hi = emit_bpermute(ctx, bld, tid, hi);
8195             bld.pseudo(aco_opcode::p_create_vector, Definition(dst), lo, hi);
8196             emit_split_vector(ctx, dst, 2);
8197          } else {
8198             isel_err(&instr->instr, "Unimplemented NIR instr bit size");
8199          }
8200          set_wqm(ctx);
8201       }
8202       break;
8203    }
8204    case nir_intrinsic_rotate: {
8205       Temp src = get_ssa_temp(ctx, instr->src[0].ssa);
8206       Temp delta = get_ssa_temp(ctx, instr->src[1].ssa);
8207       Temp dst = get_ssa_temp(ctx, &instr->def);
8208       assert(instr->def.bit_size > 1 && instr->def.bit_size <= 32);
8209 
8210       if (!nir_src_is_divergent(&instr->src[0])) {
8211          emit_uniform_subgroup(ctx, instr, src);
8212          break;
8213       }
8214 
8215       unsigned cluster_size = nir_intrinsic_cluster_size(instr);
8216       cluster_size = util_next_power_of_two(
8217          MIN2(cluster_size ? cluster_size : ctx->program->wave_size, ctx->program->wave_size));
8218 
8219       if (cluster_size == 1) {
8220          bld.copy(Definition(dst), src);
8221          break;
8222       }
8223 
8224       delta = bld.as_uniform(delta);
8225       src = as_vgpr(ctx, src);
8226 
8227       Temp tmp;
8228       if (nir_src_is_const(instr->src[1]) &&
8229           emit_rotate_by_constant(ctx, tmp, src, cluster_size, nir_src_as_uint(instr->src[1]))) {
8230       } else if (cluster_size == 2) {
8231          Temp noswap =
8232             bld.sopc(aco_opcode::s_bitcmp0_b32, bld.def(s1, scc), delta, Operand::c32(0));
8233          noswap = bool_to_vector_condition(ctx, noswap);
8234          Temp swapped = emit_masked_swizzle(ctx, bld, src, ds_pattern_bitmode(0x1f, 0, 0x1), true);
8235          tmp = bld.vop2(aco_opcode::v_cndmask_b32, bld.def(src.regClass()), swapped, src, noswap);
8236       } else if (ctx->program->gfx_level >= GFX10 && cluster_size <= 16) {
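         /* Build the v_permlane16 lane-select pair: each nibble of {hi,lo} is a source
          * lane index, so the rotate becomes a (4 * delta)-bit rotate of the nibble
          * pattern, adjusted for cluster sizes smaller than 16.
          */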
8237          if (cluster_size == 4) /* shift mask already does this for 8/16. */
8238             delta = bld.sop2(aco_opcode::s_and_b32, bld.def(s1), bld.def(s1, scc), delta,
8239                              Operand::c32(0x3));
8240          delta =
8241             bld.sop2(aco_opcode::s_lshl_b32, bld.def(s1), bld.def(s1, scc), delta, Operand::c32(2));
8242 
8243          Temp lo = bld.copy(bld.def(s1), Operand::c32(cluster_size == 4 ? 0x32103210 : 0x76543210));
8244          Temp hi;
8245 
8246          if (cluster_size <= 8) {
8247             Temp shr = bld.sop2(aco_opcode::s_lshr_b32, bld.def(s1), bld.def(s1, scc), lo, delta);
8248             if (cluster_size == 4) {
8249                Temp lotolohi = bld.copy(bld.def(s1), Operand::c32(0x4444));
8250                Temp lohi =
8251                   bld.sop2(aco_opcode::s_or_b32, bld.def(s1), bld.def(s1, scc), shr, lotolohi);
8252                lo = bld.sop2(aco_opcode::s_pack_ll_b32_b16, bld.def(s1), shr, lohi);
8253             } else {
8254                delta = bld.sop2(aco_opcode::s_sub_u32, bld.def(s1), bld.def(s1, scc),
8255                                 Operand::c32(32), delta);
8256                Temp shl =
8257                   bld.sop2(aco_opcode::s_lshl_b32, bld.def(s1), bld.def(s1, scc), lo, delta);
8258                lo = bld.sop2(aco_opcode::s_or_b32, bld.def(s1), bld.def(s1, scc), shr, shl);
8259             }
8260             Temp lotohi = bld.copy(bld.def(s1), Operand::c32(0x88888888));
8261             hi = bld.sop2(aco_opcode::s_or_b32, bld.def(s1), bld.def(s1, scc), lo, lotohi);
8262          } else {
8263             hi = bld.copy(bld.def(s1), Operand::c32(0xfedcba98));
8264 
8265             Temp lohi = bld.pseudo(aco_opcode::p_create_vector, bld.def(s2), lo, hi);
8266 
8267             Temp shr = bld.sop2(aco_opcode::s_lshr_b64, bld.def(s2), bld.def(s1, scc), lohi, delta);
8268             delta = bld.sop2(aco_opcode::s_sub_u32, bld.def(s1), bld.def(s1, scc), Operand::c32(64),
8269                              delta);
8270             Temp shl = bld.sop2(aco_opcode::s_lshl_b64, bld.def(s2), bld.def(s1, scc), lohi, delta);
8271 
8272             lohi = bld.sop2(aco_opcode::s_or_b64, bld.def(s2), bld.def(s1, scc), shr, shl);
8273             lo = bld.tmp(s1);
8274             hi = bld.tmp(s1);
8275             bld.pseudo(aco_opcode::p_split_vector, Definition(lo), Definition(hi), lohi);
8276          }
8277 
8278          Builder::Result ret =
8279             bld.vop3(aco_opcode::v_permlane16_b32, bld.def(src.regClass()), src, lo, hi);
8280          ret->valu().opsel[0] = true; /* set FETCH_INACTIVE */
8281          ret->valu().opsel[1] = true; /* set BOUND_CTRL */
8282          tmp = ret;
8283       } else {
8284          /* Fallback to ds_bpermute if we can't find a special instruction. */
8285          Temp tid = emit_mbcnt(ctx, bld.tmp(v1));
8286          Temp src_lane = bld.vadd32(bld.def(v1), tid, delta);
8287 
8288          if (ctx->program->gfx_level >= GFX10 && ctx->program->gfx_level <= GFX11_5 &&
8289              cluster_size == 32) {
8290             /* ds_bpermute is restricted to 32 lanes on GFX10-GFX11.5. */
8291             Temp index_x4 =
8292                bld.vop2(aco_opcode::v_lshlrev_b32, bld.def(v1), Operand::c32(2u), src_lane);
8293             tmp = bld.ds(aco_opcode::ds_bpermute_b32, bld.def(v1), index_x4, src);
8294          } else {
8295             /* Technically, full wave rotate doesn't need this, but it breaks the pseudo ops. */
8296             src_lane = bld.vop3(aco_opcode::v_bfi_b32, bld.def(v1), Operand::c32(cluster_size - 1),
8297                                 src_lane, tid);
8298             tmp = emit_bpermute(ctx, bld, src_lane, src);
8299          }
8300       }
8301 
8302       tmp = emit_extract_vector(ctx, tmp, 0, dst.regClass());
8303       bld.copy(Definition(dst), tmp);
8304       set_wqm(ctx);
8305       break;
8306    }
8307    case nir_intrinsic_read_first_invocation: {
8308       Temp src = get_ssa_temp(ctx, instr->src[0].ssa);
8309       Temp dst = get_ssa_temp(ctx, &instr->def);
8310       if (instr->def.bit_size == 1) {
8311          assert(src.regClass() == bld.lm);
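              /* s_ff1 gives the index of the first active lane in exec; s_bitcmp1 tests that
               * lane's bit of the boolean mask, and the scalar result is rebroadcast to a
               * lane mask below. */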
8312          Temp tmp = bld.sopc(Builder::s_bitcmp1, bld.def(s1, scc), src,
8313                              bld.sop1(Builder::s_ff1_i32, bld.def(s1), Operand(exec, bld.lm)));
8314          bool_to_vector_condition(ctx, tmp, dst);
8315       } else {
8316          emit_readfirstlane(ctx, src, dst);
8317       }
8318       set_wqm(ctx);
8319       break;
8320    }
8321    case nir_intrinsic_as_uniform: {
8322       Temp src = get_ssa_temp(ctx, instr->src[0].ssa);
8323       Temp dst = get_ssa_temp(ctx, &instr->def);
8324       if (src.type() == RegType::vgpr)
8325          bld.pseudo(aco_opcode::p_as_uniform, Definition(dst), src);
8326       else
8327          bld.copy(Definition(dst), src);
8328       break;
8329    }
8330    case nir_intrinsic_vote_all: {
8331       Temp src = get_ssa_temp(ctx, instr->src[0].ssa);
8332       Temp dst = get_ssa_temp(ctx, &instr->def);
8333       assert(src.regClass() == bld.lm);
8334       assert(dst.regClass() == bld.lm);
8335 
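           /* all(x) == !any(!x): invert the mask, AND with exec, and take the scc definition
            * of s_and (result != 0, i.e. some active lane was false), then broadcast that
            * condition and invert it. */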
8336       Temp tmp = bld.sop1(Builder::s_not, bld.def(bld.lm), bld.def(s1, scc), src);
8337       tmp = bld.sop2(Builder::s_and, bld.def(bld.lm), bld.def(s1, scc), tmp, Operand(exec, bld.lm))
8338                .def(1)
8339                .getTemp();
8340       Temp cond = bool_to_vector_condition(ctx, tmp);
8341       bld.sop1(Builder::s_not, Definition(dst), bld.def(s1, scc), cond);
8342       set_wqm(ctx);
8343       break;
8344    }
8345    case nir_intrinsic_vote_any: {
8346       Temp src = get_ssa_temp(ctx, instr->src[0].ssa);
8347       Temp dst = get_ssa_temp(ctx, &instr->def);
8348       assert(src.regClass() == bld.lm);
8349       assert(dst.regClass() == bld.lm);
8350 
8351       Temp tmp = bool_to_scalar_condition(ctx, src);
8352       bool_to_vector_condition(ctx, tmp, dst);
8353       set_wqm(ctx);
8354       break;
8355    }
8356    case nir_intrinsic_quad_vote_any: {
8357       Temp src = get_ssa_temp(ctx, instr->src[0].ssa);
8358       src = bld.sop2(Builder::s_and, bld.def(bld.lm), bld.def(s1, scc), src, Operand(exec, bld.lm));
8359       bld.sop1(Builder::s_wqm, Definition(get_ssa_temp(ctx, &instr->def)), bld.def(s1, scc), src);
8360       set_wqm(ctx);
8361       break;
8362    }
8363    case nir_intrinsic_quad_vote_all: {
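           /* Same De Morgan trick per quad: all == !any(!x). s_wqm ORs each group of four
            * bits and replicates the result across the quad, i.e. a per-quad "any". */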
8364       Temp src = get_ssa_temp(ctx, instr->src[0].ssa);
8365       src = bld.sop1(Builder::s_not, bld.def(bld.lm), bld.def(s1, scc), src);
8366       src = bld.sop2(Builder::s_and, bld.def(bld.lm), bld.def(s1, scc), src, Operand(exec, bld.lm));
8367       src = bld.sop1(Builder::s_wqm, bld.def(bld.lm), bld.def(s1, scc), src);
8368       bld.sop1(Builder::s_not, Definition(get_ssa_temp(ctx, &instr->def)), bld.def(s1, scc), src);
8369       set_wqm(ctx);
8370       break;
8371    }
8372    case nir_intrinsic_reduce:
8373    case nir_intrinsic_inclusive_scan:
8374    case nir_intrinsic_exclusive_scan: {
8375       Temp src = get_ssa_temp(ctx, instr->src[0].ssa);
8376       Temp dst = get_ssa_temp(ctx, &instr->def);
8377       nir_op op = (nir_op)nir_intrinsic_reduction_op(instr);
8378       unsigned cluster_size =
8379          instr->intrinsic == nir_intrinsic_reduce ? nir_intrinsic_cluster_size(instr) : 0;
8380       cluster_size = util_next_power_of_two(
8381          MIN2(cluster_size ? cluster_size : ctx->program->wave_size, ctx->program->wave_size));
8382       const unsigned bit_size = instr->src[0].ssa->bit_size;
8383       assert(bit_size != 1);
8384 
8385       if (!nir_src_is_divergent(&instr->src[0])) {
8386          /* We use divergence analysis to assign the regclass, so check if it's
8387           * working as expected */
8388          ASSERTED bool expected_divergent = instr->intrinsic == nir_intrinsic_exclusive_scan;
8389          if (instr->intrinsic == nir_intrinsic_inclusive_scan ||
8390              cluster_size != ctx->program->wave_size)
8391             expected_divergent = op == nir_op_iadd || op == nir_op_fadd || op == nir_op_ixor ||
8392                                  op == nir_op_imul || op == nir_op_fmul;
8393          assert(instr->def.divergent == expected_divergent);
8394 
8395          if (instr->intrinsic == nir_intrinsic_reduce) {
8396             if (!instr->def.divergent && emit_uniform_reduce(ctx, instr))
8397                break;
8398          } else if (emit_uniform_scan(ctx, instr)) {
8399             break;
8400          }
8401       }
8402 
8403       src = emit_extract_vector(ctx, src, 0, RegClass::get(RegType::vgpr, bit_size / 8));
8404       ReduceOp reduce_op = get_reduce_op(op, bit_size);
8405 
8406       aco_opcode aco_op;
8407       switch (instr->intrinsic) {
8408       case nir_intrinsic_reduce: aco_op = aco_opcode::p_reduce; break;
8409       case nir_intrinsic_inclusive_scan: aco_op = aco_opcode::p_inclusive_scan; break;
8410       case nir_intrinsic_exclusive_scan: aco_op = aco_opcode::p_exclusive_scan; break;
8411       default: unreachable("unknown reduce intrinsic");
8412       }
8413 
8414       /* Avoid whole wave shift. */
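           /* For iadd/ixor, the exclusive result can be derived from the inclusive scan by
            * undoing each lane's own contribution again, which inclusive_scan_to_exclusive
            * presumably does, instead of shifting the scan result across the whole wave. */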
8415       const bool use_inclusive_for_exclusive = aco_op == aco_opcode::p_exclusive_scan &&
8416                                                (op == nir_op_iadd || op == nir_op_ixor) &&
8417                                                dst.type() == RegType::vgpr;
8418       if (use_inclusive_for_exclusive)
8419          inclusive_scan_to_exclusive(ctx, reduce_op, Definition(dst), src);
8420       else
8421          emit_reduction_instr(ctx, aco_op, reduce_op, cluster_size, Definition(dst), src);
8422 
8423       set_wqm(ctx);
8424       break;
8425    }
8426    case nir_intrinsic_dpp16_shift_amd: {
8427       Temp src = as_vgpr(ctx, get_ssa_temp(ctx, instr->src[0].ssa));
8428       Temp dst = get_ssa_temp(ctx, &instr->def);
8429       int delta = nir_intrinsic_base(instr);
8430       assert(delta >= -15 && delta <= 15 && delta != 0);
8431       assert(instr->def.bit_size != 1 && instr->def.bit_size < 64);
8432       assert(ctx->options->gfx_level >= GFX8);
8433 
8434       uint16_t dpp_ctrl = delta < 0 ? dpp_row_sr(-delta) : dpp_row_sl(delta);
8435       bld.vop1_dpp(aco_opcode::v_mov_b32, Definition(dst), src, dpp_ctrl);
8436 
8437       set_wqm(ctx);
8438       break;
8439    }
8440    case nir_intrinsic_quad_broadcast:
8441    case nir_intrinsic_quad_swap_horizontal:
8442    case nir_intrinsic_quad_swap_vertical:
8443    case nir_intrinsic_quad_swap_diagonal:
8444    case nir_intrinsic_quad_swizzle_amd: {
8445       Temp src = get_ssa_temp(ctx, instr->src[0].ssa);
8446 
8447       if (!instr->def.divergent) {
8448          emit_uniform_subgroup(ctx, instr, src);
8449          break;
8450       }
8451 
8452       /* Quad broadcast lane. */
8453       unsigned lane = 0;
8454       /* Use VALU for the bool instructions that don't have a SALU-only special case. */
8455       bool bool_use_valu = instr->def.bit_size == 1;
8456 
8457       uint16_t dpp_ctrl = 0;
8458 
8459       bool allow_fi = true;
8460       switch (instr->intrinsic) {
8461       case nir_intrinsic_quad_swap_horizontal: dpp_ctrl = dpp_quad_perm(1, 0, 3, 2); break;
8462       case nir_intrinsic_quad_swap_vertical: dpp_ctrl = dpp_quad_perm(2, 3, 0, 1); break;
8463       case nir_intrinsic_quad_swap_diagonal: dpp_ctrl = dpp_quad_perm(3, 2, 1, 0); break;
8464       case nir_intrinsic_quad_swizzle_amd:
8465          dpp_ctrl = nir_intrinsic_swizzle_mask(instr);
8466          allow_fi &= nir_intrinsic_fetch_inactive(instr);
8467          break;
8468       case nir_intrinsic_quad_broadcast:
8469          lane = nir_src_as_const_value(instr->src[1])->u32;
8470          dpp_ctrl = dpp_quad_perm(lane, lane, lane, lane);
8471          bool_use_valu = false;
8472          break;
8473       default: break;
8474       }
8475 
8476       Temp dst = get_ssa_temp(ctx, &instr->def);
8477 
8478       /* Setup source. */
8479       if (bool_use_valu)
8480          src = bld.vop2_e64(aco_opcode::v_cndmask_b32, bld.def(v1), Operand::zero(),
8481                             Operand::c32(-1), src);
8482       else if (instr->def.bit_size != 1)
8483          src = as_vgpr(ctx, src);
8484 
8485       if (instr->def.bit_size == 1 && instr->intrinsic == nir_intrinsic_quad_broadcast) {
8486          /* Special case for quad broadcast using SALU only. */
8487          assert(src.regClass() == bld.lm && dst.regClass() == bld.lm);
8488 
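              /* 0x11111111 << lane has exactly the selected lane's bit set in every quad;
               * the AND below extracts that bit per quad and s_wqm replicates it to all four
               * lanes of the quad. */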
8489          uint32_t half_mask = 0x11111111u << lane;
8490          Operand mask_tmp = bld.lm.bytes() == 4
8491                                ? Operand::c32(half_mask)
8492                                : bld.pseudo(aco_opcode::p_create_vector, bld.def(bld.lm),
8493                                             Operand::c32(half_mask), Operand::c32(half_mask));
8494 
8495          src =
8496             bld.sop2(Builder::s_and, bld.def(bld.lm), bld.def(s1, scc), src, Operand(exec, bld.lm));
8497          src = bld.sop2(Builder::s_and, bld.def(bld.lm), bld.def(s1, scc), mask_tmp, src);
8498          bld.sop1(Builder::s_wqm, Definition(dst), bld.def(s1, scc), src);
8499       } else if (instr->def.bit_size <= 32 || bool_use_valu) {
8500          unsigned excess_bytes = bool_use_valu ? 0 : 4 - instr->def.bit_size / 8;
8501          Definition def = (excess_bytes || bool_use_valu) ? bld.def(v1) : Definition(dst);
8502 
8503          if (ctx->program->gfx_level >= GFX8)
8504             bld.vop1_dpp(aco_opcode::v_mov_b32, def, src, dpp_ctrl, 0xf, 0xf, true, allow_fi);
8505          else
8506             bld.ds(aco_opcode::ds_swizzle_b32, def, src, (1 << 15) | dpp_ctrl);
8507 
8508          if (excess_bytes)
8509             bld.pseudo(aco_opcode::p_split_vector, Definition(dst),
8510                        bld.def(RegClass::get(dst.type(), excess_bytes)), def.getTemp());
8511          if (bool_use_valu)
8512             bld.vopc(aco_opcode::v_cmp_lg_u32, Definition(dst), Operand::zero(), def.getTemp());
8513       } else if (instr->def.bit_size == 64) {
8514          Temp lo = bld.tmp(v1), hi = bld.tmp(v1);
8515          bld.pseudo(aco_opcode::p_split_vector, Definition(lo), Definition(hi), src);
8516 
8517          if (ctx->program->gfx_level >= GFX8) {
8518             lo = bld.vop1_dpp(aco_opcode::v_mov_b32, bld.def(v1), lo, dpp_ctrl, 0xf, 0xf, true,
8519                               allow_fi);
8520             hi = bld.vop1_dpp(aco_opcode::v_mov_b32, bld.def(v1), hi, dpp_ctrl, 0xf, 0xf, true,
8521                               allow_fi);
8522          } else {
8523             lo = bld.ds(aco_opcode::ds_swizzle_b32, bld.def(v1), lo, (1 << 15) | dpp_ctrl);
8524             hi = bld.ds(aco_opcode::ds_swizzle_b32, bld.def(v1), hi, (1 << 15) | dpp_ctrl);
8525          }
8526 
8527          bld.pseudo(aco_opcode::p_create_vector, Definition(dst), lo, hi);
8528          emit_split_vector(ctx, dst, 2);
8529       } else {
8530          isel_err(&instr->instr, "Unimplemented NIR quad group instruction bit size.");
8531       }
8532 
8533       set_wqm(ctx);
8534       break;
8535    }
8536    case nir_intrinsic_masked_swizzle_amd: {
8537       Temp src = get_ssa_temp(ctx, instr->src[0].ssa);
8538       if (!instr->def.divergent) {
8539          emit_uniform_subgroup(ctx, instr, src);
8540          break;
8541       }
8542       Temp dst = get_ssa_temp(ctx, &instr->def);
8543       uint32_t mask = nir_intrinsic_swizzle_mask(instr);
8544       bool allow_fi = nir_intrinsic_fetch_inactive(instr);
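           /* The 15-bit mask presumably uses the ds_swizzle "full mask" encoding,
            * new_lane = ((lane & and_mask) | or_mask) ^ xor_mask with three 5-bit fields;
            * emit_masked_swizzle is expected to lower it to DPP where possible. */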
8545 
8546       if (instr->def.bit_size != 1)
8547          src = as_vgpr(ctx, src);
8548 
8549       if (instr->def.bit_size == 1) {
8550          assert(src.regClass() == bld.lm);
8551          src = bld.vop2_e64(aco_opcode::v_cndmask_b32, bld.def(v1), Operand::zero(),
8552                             Operand::c32(-1), src);
8553          src = emit_masked_swizzle(ctx, bld, src, mask, allow_fi);
8554          bld.vopc(aco_opcode::v_cmp_lg_u32, Definition(dst), Operand::zero(), src);
8555       } else if (dst.regClass() == v1b) {
8556          Temp tmp = emit_masked_swizzle(ctx, bld, src, mask, allow_fi);
8557          emit_extract_vector(ctx, tmp, 0, dst);
8558       } else if (dst.regClass() == v2b) {
8559          Temp tmp = emit_masked_swizzle(ctx, bld, src, mask, allow_fi);
8560          emit_extract_vector(ctx, tmp, 0, dst);
8561       } else if (dst.regClass() == v1) {
8562          bld.copy(Definition(dst), emit_masked_swizzle(ctx, bld, src, mask, allow_fi));
8563       } else if (dst.regClass() == v2) {
8564          Temp lo = bld.tmp(v1), hi = bld.tmp(v1);
8565          bld.pseudo(aco_opcode::p_split_vector, Definition(lo), Definition(hi), src);
8566          lo = emit_masked_swizzle(ctx, bld, lo, mask, allow_fi);
8567          hi = emit_masked_swizzle(ctx, bld, hi, mask, allow_fi);
8568          bld.pseudo(aco_opcode::p_create_vector, Definition(dst), lo, hi);
8569          emit_split_vector(ctx, dst, 2);
8570       } else {
8571          isel_err(&instr->instr, "Unimplemented NIR instr bit size");
8572       }
8573       set_wqm(ctx);
8574       break;
8575    }
8576    case nir_intrinsic_write_invocation_amd: {
8577       Temp src = as_vgpr(ctx, get_ssa_temp(ctx, instr->src[0].ssa));
8578       Temp val = bld.as_uniform(get_ssa_temp(ctx, instr->src[1].ssa));
8579       Temp lane = bld.as_uniform(get_ssa_temp(ctx, instr->src[2].ssa));
8580       Temp dst = get_ssa_temp(ctx, &instr->def);
8581       if (dst.regClass() == v1) {
8582          /* src2 is ignored for writelane. RA assigns the same reg for dst */
8583          bld.writelane(Definition(dst), val, lane, src);
8584       } else if (dst.regClass() == v2) {
8585          Temp src_lo = bld.tmp(v1), src_hi = bld.tmp(v1);
8586          Temp val_lo = bld.tmp(s1), val_hi = bld.tmp(s1);
8587          bld.pseudo(aco_opcode::p_split_vector, Definition(src_lo), Definition(src_hi), src);
8588          bld.pseudo(aco_opcode::p_split_vector, Definition(val_lo), Definition(val_hi), val);
8589          Temp lo = bld.writelane(bld.def(v1), val_lo, lane, src_lo);
8590          Temp hi = bld.writelane(bld.def(v1), val_hi, lane, src_hi);
8591          bld.pseudo(aco_opcode::p_create_vector, Definition(dst), lo, hi);
8592          emit_split_vector(ctx, dst, 2);
8593       } else {
8594          isel_err(&instr->instr, "Unimplemented NIR instr bit size");
8595       }
8596       break;
8597    }
8598    case nir_intrinsic_mbcnt_amd: {
8599       Temp src = get_ssa_temp(ctx, instr->src[0].ssa);
8600       Temp add_src = as_vgpr(ctx, get_ssa_temp(ctx, instr->src[1].ssa));
8601       Temp dst = get_ssa_temp(ctx, &instr->def);
8602       /* Fit 64-bit mask for wave32 */
8603       src = emit_extract_vector(ctx, src, 0, RegClass(src.type(), bld.lm.size()));
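           /* mbcnt yields, per lane, popcount(mask & lanes_below_this_one) + add_src, i.e. a
            * prefix population count of the mask. */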
8604       emit_mbcnt(ctx, dst, Operand(src), Operand(add_src));
8605       set_wqm(ctx);
8606       break;
8607    }
8608    case nir_intrinsic_lane_permute_16_amd: {
8609       /* NOTE: If we use divergence analysis information here instead of the src regclass,
8610        * skip_uniformize_merge_phi() should be updated.
8611        */
8612       Temp src = get_ssa_temp(ctx, instr->src[0].ssa);
8613       Temp dst = get_ssa_temp(ctx, &instr->def);
8614       assert(ctx->program->gfx_level >= GFX10);
8615 
8616       if (src.regClass() == s1) {
8617          bld.copy(Definition(dst), src);
8618       } else if (dst.regClass() == v1 && src.regClass() == v1) {
8619          bld.vop3(aco_opcode::v_permlane16_b32, Definition(dst), src,
8620                   bld.as_uniform(get_ssa_temp(ctx, instr->src[1].ssa)),
8621                   bld.as_uniform(get_ssa_temp(ctx, instr->src[2].ssa)));
8622       } else {
8623          isel_err(&instr->instr, "Unimplemented lane_permute_16_amd");
8624       }
8625       break;
8626    }
8627    case nir_intrinsic_load_helper_invocation:
8628    case nir_intrinsic_is_helper_invocation: {
8629       /* load_helper() after demote() gets lowered to is_helper().
8630        * Otherwise, these two behave the same. */
8631       Temp dst = get_ssa_temp(ctx, &instr->def);
8632       bld.pseudo(aco_opcode::p_is_helper, Definition(dst), Operand(exec, bld.lm));
8633       ctx->program->needs_exact = true;
8634       break;
8635    }
8636    case nir_intrinsic_demote:
8637    case nir_intrinsic_demote_if: {
8638       Operand cond = Operand::c32(-1u);
8639       if (instr->intrinsic == nir_intrinsic_demote_if) {
8640          Temp src = get_ssa_temp(ctx, instr->src[0].ssa);
8641          assert(src.regClass() == bld.lm);
8642          if (in_exec_divergent_or_in_loop(ctx)) {
8643             cond = bld.sop2(Builder::s_and, bld.def(bld.lm), bld.def(s1, scc), src,
8644                             Operand(exec, bld.lm));
8645          } else {
8646             cond = Operand(src);
8647          }
8648       }
8649 
8650       bld.pseudo(aco_opcode::p_demote_to_helper, cond);
8651 
8652       /* Perform the demote in WQM so that it doesn't make exec empty. WQM should last until at
8653        * least the next top-level block.
8654        */
8655       if (ctx->block->loop_nest_depth || ctx->cf_info.parent_if.is_divergent)
8656          set_wqm(ctx);
8657 
8658       ctx->block->kind |= block_kind_uses_discard;
8659       ctx->program->needs_exact = true;
8660 
8661       /* Enable WQM in order to prevent helper lanes from getting terminated. */
8662       if (ctx->shader->info.maximally_reconverges)
8663          ctx->program->needs_wqm = true;
8664 
8665       break;
8666    }
8667    case nir_intrinsic_terminate:
8668    case nir_intrinsic_terminate_if: {
8669       Operand cond = Operand::c32(-1u);
8670       if (instr->intrinsic == nir_intrinsic_terminate_if) {
8671          Temp src = get_ssa_temp(ctx, instr->src[0].ssa);
8672          assert(src.regClass() == bld.lm);
8673          if (in_exec_divergent_or_in_loop(ctx)) {
8674             cond = bld.sop2(Builder::s_and, bld.def(bld.lm), bld.def(s1, scc), src,
8675                             Operand(exec, bld.lm));
8676          } else {
8677             cond = Operand(src);
8678          }
8679 
8680          ctx->cf_info.had_divergent_discard |= nir_src_is_divergent(&instr->src[0]);
8681       }
8682 
8683       bld.pseudo(aco_opcode::p_discard_if, cond);
8684       ctx->block->kind |= block_kind_uses_discard;
8685 
8686       if (ctx->block->loop_nest_depth || ctx->cf_info.parent_if.is_divergent) {
8687          ctx->cf_info.exec.potentially_empty_discard = true;
8688          begin_empty_exec_skip(ctx, &instr->instr, instr->instr.block);
8689       }
8690       ctx->cf_info.had_divergent_discard |= in_exec_divergent_or_in_loop(ctx);
8691       ctx->program->needs_exact = true;
8692       break;
8693    }
8694    case nir_intrinsic_debug_break: {
8695       bld.sopp(aco_opcode::s_trap, 1u);
8696       break;
8697    }
8698    case nir_intrinsic_first_invocation: {
8699       bld.sop1(Builder::s_ff1_i32, Definition(get_ssa_temp(ctx, &instr->def)),
8700                Operand(exec, bld.lm));
8701       set_wqm(ctx);
8702       break;
8703    }
8704    case nir_intrinsic_last_invocation: {
8705       Temp flbit = bld.sop1(Builder::s_flbit_i32, bld.def(s1), Operand(exec, bld.lm));
8706       bld.sop2(aco_opcode::s_sub_i32, Definition(get_ssa_temp(ctx, &instr->def)), bld.def(s1, scc),
8707                Operand::c32(ctx->program->wave_size - 1u), flbit);
8708       set_wqm(ctx);
8709       break;
8710    }
8711    case nir_intrinsic_elect: {
8712       /* p_elect is lowered in aco_insert_exec_mask.
8713        * Use exec as an operand so value numbering and the pre-RA optimizer won't recognize
8714        * two p_elect with different exec masks as the same.
8715        */
8716       bld.pseudo(aco_opcode::p_elect, Definition(get_ssa_temp(ctx, &instr->def)),
8717                  Operand(exec, bld.lm));
8718       set_wqm(ctx);
8719       break;
8720    }
8721    case nir_intrinsic_shader_clock: {
8722       Temp dst = get_ssa_temp(ctx, &instr->def);
8723       if (nir_intrinsic_memory_scope(instr) == SCOPE_SUBGROUP &&
8724           ctx->options->gfx_level >= GFX12) {
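              /* Read hi, lo, hi in one pseudo so a low-half rollover between the reads can be
               * detected: if the two high reads differ, lo is replaced with 0, which pairs
               * consistently with the newer high word. */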
8725          Temp hi0 = bld.tmp(s1);
8726          Temp hi1 = bld.tmp(s1);
8727          Temp lo = bld.tmp(s1);
8728          bld.pseudo(aco_opcode::p_shader_cycles_hi_lo_hi, Definition(hi0), Definition(lo), Definition(hi1));
8729          Temp hi_eq = bld.sopc(aco_opcode::s_cmp_eq_u32, bld.def(s1, scc), hi0, hi1);
8730          lo = bld.sop2(aco_opcode::s_cselect_b32, bld.def(s1), lo, Operand::zero(), bld.scc(hi_eq));
8731          bld.pseudo(aco_opcode::p_create_vector, Definition(dst), lo, hi1);
8732       } else if (nir_intrinsic_memory_scope(instr) == SCOPE_SUBGROUP &&
8733           ctx->options->gfx_level >= GFX10_3) {
8734          /* "((size - 1) << 11) | register" (SHADER_CYCLES is encoded as register 29) */
8735          Temp clock = bld.sopk(aco_opcode::s_getreg_b32, bld.def(s1), ((20 - 1) << 11) | 29);
8736          bld.pseudo(aco_opcode::p_create_vector, Definition(dst), clock, Operand::zero());
8737       } else if (nir_intrinsic_memory_scope(instr) == SCOPE_DEVICE &&
8738                  ctx->options->gfx_level >= GFX11) {
8739          bld.sop1(aco_opcode::s_sendmsg_rtn_b64, Definition(dst),
8740                   Operand::c32(sendmsg_rtn_get_realtime));
8741       } else {
8742          aco_opcode opcode = nir_intrinsic_memory_scope(instr) == SCOPE_DEVICE
8743                                 ? aco_opcode::s_memrealtime
8744                                 : aco_opcode::s_memtime;
8745          bld.smem(opcode, Definition(dst), memory_sync_info(0, semantic_volatile));
8746       }
8747       emit_split_vector(ctx, dst, 2);
8748       break;
8749    }
8750    case nir_intrinsic_sendmsg_amd: {
8751       unsigned imm = nir_intrinsic_base(instr);
8752       Temp m0_content = bld.as_uniform(get_ssa_temp(ctx, instr->src[0].ssa));
8753       bld.sopp(aco_opcode::s_sendmsg, bld.m0(m0_content), imm);
8754       break;
8755    }
8756    case nir_intrinsic_is_subgroup_invocation_lt_amd: {
8757       Temp src = bld.as_uniform(get_ssa_temp(ctx, instr->src[0].ssa));
8758       unsigned offset = nir_intrinsic_base(instr);
8759       bld.copy(Definition(get_ssa_temp(ctx, &instr->def)), lanecount_to_mask(ctx, src, offset));
8760       break;
8761    }
8762    case nir_intrinsic_gds_atomic_add_amd: {
8763       Temp store_val = get_ssa_temp(ctx, instr->src[0].ssa);
8764       Temp gds_addr = get_ssa_temp(ctx, instr->src[1].ssa);
8765       Temp m0_val = get_ssa_temp(ctx, instr->src[2].ssa);
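           /* GDS access needs m0 to hold the GDS address window, so src[2] is copied into
            * m0; the trailing 'true' presumably selects GDS instead of LDS for the ds_add. */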
8766       Operand m = bld.m0((Temp)bld.copy(bld.def(s1, m0), bld.as_uniform(m0_val)));
8767       bld.ds(aco_opcode::ds_add_u32, as_vgpr(ctx, gds_addr), as_vgpr(ctx, store_val), m, 0u, 0u,
8768              true);
8769       break;
8770    }
8771    case nir_intrinsic_load_sbt_base_amd: {
8772       Temp dst = get_ssa_temp(ctx, &instr->def);
8773       Temp addr = get_arg(ctx, ctx->args->rt.sbt_descriptors);
8774       assert(addr.regClass() == s2);
8775       bld.copy(Definition(dst), Operand(addr));
8776       break;
8777    }
8778    case nir_intrinsic_bvh64_intersect_ray_amd: visit_bvh64_intersect_ray_amd(ctx, instr); break;
8779    case nir_intrinsic_load_resume_shader_address_amd: {
8780       bld.pseudo(aco_opcode::p_resume_shader_address, Definition(get_ssa_temp(ctx, &instr->def)),
8781                  bld.def(s1, scc), Operand::c32(nir_intrinsic_call_idx(instr)));
8782       break;
8783    }
8784    case nir_intrinsic_load_scalar_arg_amd:
8785    case nir_intrinsic_load_vector_arg_amd: {
8786       assert(nir_intrinsic_base(instr) < ctx->args->arg_count);
8787       Temp dst = get_ssa_temp(ctx, &instr->def);
8788       Temp src = ctx->arg_temps[nir_intrinsic_base(instr)];
8789       assert(src.id());
8790       assert(src.type() == (instr->intrinsic == nir_intrinsic_load_scalar_arg_amd ? RegType::sgpr
8791                                                                                   : RegType::vgpr));
8792       bld.copy(Definition(dst), src);
8793       emit_split_vector(ctx, dst, dst.size());
8794       break;
8795    }
8796    case nir_intrinsic_ordered_xfb_counter_add_gfx11_amd: {
8797       Temp dst = get_ssa_temp(ctx, &instr->def);
8798       Temp ordered_id = get_ssa_temp(ctx, instr->src[0].ssa);
8799       Temp counter = get_ssa_temp(ctx, instr->src[1].ssa);
8800 
8801       Temp gds_base = bld.copy(bld.def(v1), Operand::c32(0u));
8802       unsigned offset0, offset1;
8803       Instruction* ds_instr;
8804       Operand m;
8805 
8806       /* Lock a GDS mutex. */
8807       ds_ordered_count_offsets(ctx, 1 << 24u, false, false, &offset0, &offset1);
8808       m = bld.m0(bld.as_uniform(ordered_id));
8809       ds_instr =
8810          bld.ds(aco_opcode::ds_ordered_count, bld.def(v1), gds_base, m, offset0, offset1, true);
8811       ds_instr->ds().sync = memory_sync_info(storage_gds, semantic_volatile);
8812 
8813       aco_ptr<Instruction> vec{
8814          create_instruction(aco_opcode::p_create_vector, Format::PSEUDO, instr->num_components, 1)};
8815       unsigned write_mask = nir_intrinsic_write_mask(instr);
8816 
8817       for (unsigned i = 0; i < instr->num_components; i++) {
8818          if (write_mask & (1 << i)) {
8819             Temp chan_counter = emit_extract_vector(ctx, counter, i, v1);
8820 
8821             ds_instr = bld.ds(aco_opcode::ds_add_gs_reg_rtn, bld.def(v1), Operand(), chan_counter,
8822                               i * 4, 0u, true);
8823             ds_instr->ds().sync = memory_sync_info(storage_gds, semantic_atomicrmw);
8824 
8825             vec->operands[i] = Operand(ds_instr->definitions[0].getTemp());
8826          } else {
8827             vec->operands[i] = Operand::zero();
8828          }
8829       }
8830 
8831       vec->definitions[0] = Definition(dst);
8832       ctx->block->instructions.emplace_back(std::move(vec));
8833 
8834       /* Unlock a GDS mutex. */
8835       ds_ordered_count_offsets(ctx, 1 << 24u, true, true, &offset0, &offset1);
8836       m = bld.m0(bld.as_uniform(ordered_id));
8837       ds_instr =
8838          bld.ds(aco_opcode::ds_ordered_count, bld.def(v1), gds_base, m, offset0, offset1, true);
8839       ds_instr->ds().sync = memory_sync_info(storage_gds, semantic_volatile);
8840 
8841       emit_split_vector(ctx, dst, instr->num_components);
8842       break;
8843    }
8844    case nir_intrinsic_xfb_counter_sub_gfx11_amd: {
8845       unsigned write_mask = nir_intrinsic_write_mask(instr);
8846       Temp counter = get_ssa_temp(ctx, instr->src[0].ssa);
8847 
8848       u_foreach_bit (i, write_mask) {
8849          Temp chan_counter = emit_extract_vector(ctx, counter, i, v1);
8850          Instruction* ds_instr;
8851 
8852          ds_instr = bld.ds(aco_opcode::ds_sub_gs_reg_rtn, bld.def(v1), Operand(), chan_counter,
8853                            i * 4, 0u, true);
8854          ds_instr->ds().sync = memory_sync_info(storage_gds, semantic_atomicrmw);
8855       }
8856       break;
8857    }
8858    case nir_intrinsic_export_amd:
8859    case nir_intrinsic_export_row_amd: {
8860       unsigned flags = nir_intrinsic_flags(instr);
8861       unsigned target = nir_intrinsic_base(instr);
8862       unsigned write_mask = nir_intrinsic_write_mask(instr);
8863 
8864       /* Mark vertex export block. */
8865       if (target == V_008DFC_SQ_EXP_POS || target <= V_008DFC_SQ_EXP_NULL)
8866          ctx->block->kind |= block_kind_export_end;
8867 
8868       if (target < V_008DFC_SQ_EXP_MRTZ)
8869          ctx->program->has_color_exports = true;
8870 
8871       const bool row_en = instr->intrinsic == nir_intrinsic_export_row_amd;
8872 
8873       aco_ptr<Instruction> exp{create_instruction(aco_opcode::exp, Format::EXP, 4 + row_en, 0)};
8874 
8875       exp->exp().dest = target;
8876       exp->exp().enabled_mask = write_mask;
8877       exp->exp().compressed = flags & AC_EXP_FLAG_COMPRESSED;
8878 
8879       /* ACO may reorder position/mrt export instructions, then mark done for last
8880        * export instruction. So don't respect the nir AC_EXP_FLAG_DONE for position/mrt
8881        * exports here and leave it to ACO.
8882        */
8883       if (target == V_008DFC_SQ_EXP_PRIM)
8884          exp->exp().done = flags & AC_EXP_FLAG_DONE;
8885       else
8886          exp->exp().done = false;
8887 
8888       /* ACO may reorder mrt export instructions, then mark valid mask for last
8889        * export instruction. So don't respect the nir AC_EXP_FLAG_VALID_MASK for mrt
8890        * exports here and leave it to ACO.
8891        */
8892       if (target > V_008DFC_SQ_EXP_NULL)
8893          exp->exp().valid_mask = flags & AC_EXP_FLAG_VALID_MASK;
8894       else
8895          exp->exp().valid_mask = false;
8896 
8897       exp->exp().row_en = row_en;
8898 
8899       /* Compressed export uses two bits for a channel. */
8900       uint32_t channel_mask = exp->exp().compressed
8901                                  ? (write_mask & 0x3 ? 1 : 0) | (write_mask & 0xc ? 2 : 0)
8902                                  : write_mask;
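           /* e.g. a full write_mask of 0xf becomes channel_mask 0x3 when compressed: each
            * enabled bit then covers one dword holding two 16-bit channels. */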
8903 
8904       Temp value = get_ssa_temp(ctx, instr->src[0].ssa);
8905       for (unsigned i = 0; i < 4; i++) {
8906          exp->operands[i] = channel_mask & BITFIELD_BIT(i)
8907                                ? Operand(emit_extract_vector(ctx, value, i, v1))
8908                                : Operand(v1);
8909       }
8910 
8911       if (row_en) {
8912          Temp row = bld.as_uniform(get_ssa_temp(ctx, instr->src[1].ssa));
8913          /* Hack to prevent the RA from moving the source into m0 and then back to a normal SGPR. */
8914          row = bld.copy(bld.def(s1, m0), row);
8915          exp->operands[4] = bld.m0(row);
8916       }
8917 
8918       ctx->block->instructions.emplace_back(std::move(exp));
8919       break;
8920    }
8921    case nir_intrinsic_export_dual_src_blend_amd: {
8922       Temp val0 = get_ssa_temp(ctx, instr->src[0].ssa);
8923       Temp val1 = get_ssa_temp(ctx, instr->src[1].ssa);
8924       unsigned write_mask = nir_intrinsic_write_mask(instr);
8925 
8926       struct aco_export_mrt mrt0, mrt1;
8927       for (unsigned i = 0; i < 4; i++) {
8928          mrt0.out[i] = write_mask & BITFIELD_BIT(i) ? Operand(emit_extract_vector(ctx, val0, i, v1))
8929                                                     : Operand(v1);
8930 
8931          mrt1.out[i] = write_mask & BITFIELD_BIT(i) ? Operand(emit_extract_vector(ctx, val1, i, v1))
8932                                                     : Operand(v1);
8933       }
8934       mrt0.enabled_channels = mrt1.enabled_channels = write_mask;
8935 
8936       create_fs_dual_src_export_gfx11(ctx, &mrt0, &mrt1);
8937 
8938       ctx->block->kind |= block_kind_export_end;
8939       break;
8940    }
8941    case nir_intrinsic_strict_wqm_coord_amd: {
8942       Temp dst = get_ssa_temp(ctx, &instr->def);
8943       Temp src = get_ssa_temp(ctx, instr->src[0].ssa);
8944       unsigned begin_size = nir_intrinsic_base(instr);
8945 
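           /* begin_size bytes are presumably reserved at the start of the linear VGPR by the
            * undefined first operand added below, so the coordinates land at a fixed offset. */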
8946       unsigned num_src = 1;
8947       auto it = ctx->allocated_vec.find(src.id());
8948       if (it != ctx->allocated_vec.end())
8949          num_src = src.bytes() / it->second[0].bytes();
8950 
8951       aco_ptr<Instruction> vec{create_instruction(aco_opcode::p_start_linear_vgpr, Format::PSEUDO,
8952                                                   num_src + !!begin_size, 1)};
8953 
8954       if (begin_size)
8955          vec->operands[0] = Operand(RegClass::get(RegType::vgpr, begin_size));
8956       for (unsigned i = 0; i < num_src; i++) {
8957          Temp comp = it != ctx->allocated_vec.end() ? it->second[i] : src;
8958          vec->operands[i + !!begin_size] = Operand(comp);
8959       }
8960 
8961       vec->definitions[0] = Definition(dst);
8962       ctx->block->instructions.emplace_back(std::move(vec));
8963       break;
8964    }
8965    case nir_intrinsic_load_lds_ngg_scratch_base_amd: {
8966       Temp dst = get_ssa_temp(ctx, &instr->def);
8967       bld.sop1(aco_opcode::p_load_symbol, Definition(dst),
8968                Operand::c32(aco_symbol_lds_ngg_scratch_base));
8969       break;
8970    }
8971    case nir_intrinsic_load_lds_ngg_gs_out_vertex_base_amd: {
8972       Temp dst = get_ssa_temp(ctx, &instr->def);
8973       bld.sop1(aco_opcode::p_load_symbol, Definition(dst),
8974                Operand::c32(aco_symbol_lds_ngg_gs_out_vertex_base));
8975       break;
8976    }
8977    case nir_intrinsic_store_scalar_arg_amd: {
8978       BITSET_SET(ctx->output_args, nir_intrinsic_base(instr));
8979       ctx->arg_temps[nir_intrinsic_base(instr)] =
8980          bld.as_uniform(get_ssa_temp(ctx, instr->src[0].ssa));
8981       break;
8982    }
8983    case nir_intrinsic_store_vector_arg_amd: {
8984       BITSET_SET(ctx->output_args, nir_intrinsic_base(instr));
8985       ctx->arg_temps[nir_intrinsic_base(instr)] =
8986          as_vgpr(ctx, get_ssa_temp(ctx, instr->src[0].ssa));
8987       break;
8988    }
8989    case nir_intrinsic_begin_invocation_interlock: {
8990       pops_await_overlapped_waves(ctx);
8991       break;
8992    }
8993    case nir_intrinsic_end_invocation_interlock: {
8994       if (ctx->options->gfx_level < GFX11)
8995          bld.pseudo(aco_opcode::p_pops_gfx9_ordered_section_done);
8996       break;
8997    }
8998    case nir_intrinsic_cmat_muladd_amd: visit_cmat_muladd(ctx, instr); break;
8999    case nir_intrinsic_nop_amd: bld.sopp(aco_opcode::s_nop, nir_intrinsic_base(instr)); break;
9000    case nir_intrinsic_sleep_amd: bld.sopp(aco_opcode::s_sleep, nir_intrinsic_base(instr)); break;
9001    case nir_intrinsic_unit_test_amd:
9002       bld.pseudo(aco_opcode::p_unit_test, Operand::c32(nir_intrinsic_base(instr)),
9003                  get_ssa_temp(ctx, instr->src[0].ssa));
9004       break;
9005    case nir_intrinsic_unit_test_uniform_amd:
9006    case nir_intrinsic_unit_test_divergent_amd:
9007       bld.pseudo(aco_opcode::p_unit_test, Definition(get_ssa_temp(ctx, &instr->def)),
9008                  Operand::c32(nir_intrinsic_base(instr)));
9009       break;
9010    default:
9011       isel_err(&instr->instr, "Unimplemented intrinsic instr");
9012       abort();
9013 
9014       break;
9015    }
9016 }
9017 
9018 void
9019 get_const_vec(nir_def* vec, nir_const_value* cv[4])
9020 {
9021    if (vec->parent_instr->type != nir_instr_type_alu)
9022       return;
9023    nir_alu_instr* vec_instr = nir_instr_as_alu(vec->parent_instr);
9024    if (vec_instr->op != nir_op_vec(vec->num_components))
9025       return;
9026 
9027    for (unsigned i = 0; i < vec->num_components; i++) {
9028       cv[i] =
9029          vec_instr->src[i].swizzle[0] == 0 ? nir_src_as_const_value(vec_instr->src[i].src) : NULL;
9030    }
9031 }
9032 
9033 void
9034 visit_tex(isel_context* ctx, nir_tex_instr* instr)
9035 {
9036    assert(instr->op != nir_texop_samples_identical);
9037 
9038    Builder bld(ctx->program, ctx->block);
9039    bool has_bias = false, has_lod = false, level_zero = false, has_compare = false,
9040         has_offset = false, has_ddx = false, has_ddy = false, has_derivs = false,
9041         has_sample_index = false, has_clamped_lod = false, has_wqm_coord = false;
9042    Temp resource, sampler, bias = Temp(), compare = Temp(), sample_index = Temp(), lod = Temp(),
9043                            offset = Temp(), ddx = Temp(), ddy = Temp(), clamped_lod = Temp(),
9044                            coord = Temp(), wqm_coord = Temp();
9045    std::vector<Temp> coords;
9046    std::vector<Temp> derivs;
9047    nir_const_value* const_offset[4] = {NULL, NULL, NULL, NULL};
9048 
9049    for (unsigned i = 0; i < instr->num_srcs; i++) {
9050       switch (instr->src[i].src_type) {
9051       case nir_tex_src_texture_handle:
9052          resource = bld.as_uniform(get_ssa_temp(ctx, instr->src[i].src.ssa));
9053          break;
9054       case nir_tex_src_sampler_handle:
9055          sampler = bld.as_uniform(get_ssa_temp(ctx, instr->src[i].src.ssa));
9056          break;
9057       default: break;
9058       }
9059    }
9060 
9061    bool tg4_integer_workarounds = ctx->options->gfx_level <= GFX8 && instr->op == nir_texop_tg4 &&
9062                                   (instr->dest_type & (nir_type_int | nir_type_uint));
9063    bool tg4_integer_cube_workaround =
9064       tg4_integer_workarounds && instr->sampler_dim == GLSL_SAMPLER_DIM_CUBE;
9065 
9066    bool a16 = false, g16 = false;
9067 
9068    int coord_idx = nir_tex_instr_src_index(instr, nir_tex_src_coord);
9069    if (coord_idx > 0)
9070       a16 = instr->src[coord_idx].src.ssa->bit_size == 16;
9071 
9072    int ddx_idx = nir_tex_instr_src_index(instr, nir_tex_src_ddx);
9073    if (ddx_idx > 0)
9074       g16 = instr->src[ddx_idx].src.ssa->bit_size == 16;
9075 
9076    for (unsigned i = 0; i < instr->num_srcs; i++) {
9077       switch (instr->src[i].src_type) {
9078       case nir_tex_src_coord: {
9079          assert(instr->src[i].src.ssa->bit_size == (a16 ? 16 : 32));
9080          coord = get_ssa_temp_tex(ctx, instr->src[i].src.ssa, a16);
9081          break;
9082       }
9083       case nir_tex_src_backend1: {
9084          assert(instr->src[i].src.ssa->bit_size == 32);
9085          wqm_coord = get_ssa_temp(ctx, instr->src[i].src.ssa);
9086          has_wqm_coord = true;
9087          break;
9088       }
9089       case nir_tex_src_bias:
9090          assert(instr->src[i].src.ssa->bit_size == (a16 ? 16 : 32));
9091          /* Doesn't need get_ssa_temp_tex because we pack it into its own dword anyway. */
9092          bias = get_ssa_temp(ctx, instr->src[i].src.ssa);
9093          has_bias = true;
9094          break;
9095       case nir_tex_src_lod: {
9096          if (nir_src_is_const(instr->src[i].src) && nir_src_as_uint(instr->src[i].src) == 0) {
9097             level_zero = true;
9098          } else {
9099             assert(instr->src[i].src.ssa->bit_size == (a16 ? 16 : 32));
9100             lod = get_ssa_temp_tex(ctx, instr->src[i].src.ssa, a16);
9101             has_lod = true;
9102          }
9103          break;
9104       }
9105       case nir_tex_src_min_lod:
9106          assert(instr->src[i].src.ssa->bit_size == (a16 ? 16 : 32));
9107          clamped_lod = get_ssa_temp_tex(ctx, instr->src[i].src.ssa, a16);
9108          has_clamped_lod = true;
9109          break;
9110       case nir_tex_src_comparator:
9111          if (instr->is_shadow) {
9112             assert(instr->src[i].src.ssa->bit_size == 32);
9113             compare = get_ssa_temp(ctx, instr->src[i].src.ssa);
9114             has_compare = true;
9115          }
9116          break;
9117       case nir_tex_src_offset:
9118       case nir_tex_src_backend2:
9119          assert(instr->src[i].src.ssa->bit_size == 32);
9120          offset = get_ssa_temp(ctx, instr->src[i].src.ssa);
9121          get_const_vec(instr->src[i].src.ssa, const_offset);
9122          has_offset = true;
9123          break;
9124       case nir_tex_src_ddx:
9125          assert(instr->src[i].src.ssa->bit_size == (g16 ? 16 : 32));
9126          ddx = get_ssa_temp_tex(ctx, instr->src[i].src.ssa, g16);
9127          has_ddx = true;
9128          break;
9129       case nir_tex_src_ddy:
9130          assert(instr->src[i].src.ssa->bit_size == (g16 ? 16 : 32));
9131          ddy = get_ssa_temp_tex(ctx, instr->src[i].src.ssa, g16);
9132          has_ddy = true;
9133          break;
9134       case nir_tex_src_ms_index:
9135          assert(instr->src[i].src.ssa->bit_size == (a16 ? 16 : 32));
9136          sample_index = get_ssa_temp_tex(ctx, instr->src[i].src.ssa, a16);
9137          has_sample_index = true;
9138          break;
9139       case nir_tex_src_texture_offset:
9140       case nir_tex_src_sampler_offset:
9141       default: break;
9142       }
9143    }
9144 
9145    if (has_wqm_coord) {
9146       assert(instr->op == nir_texop_tex || instr->op == nir_texop_txb ||
9147              instr->op == nir_texop_lod);
9148       assert(wqm_coord.regClass().is_linear_vgpr());
9149       assert(!a16 && !g16);
9150    }
9151 
9152    if (instr->op == nir_texop_tg4 && !has_lod && !instr->is_gather_implicit_lod)
9153       level_zero = true;
9154 
9155    if (has_offset) {
9156       assert(instr->op != nir_texop_txf);
9157 
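           /* Each offset component is packed as a 6-bit field into its own byte of a single
            * dword (presumably the MIMG offset encoding): constant components are folded into
            * pack_const, the remaining ones are masked and shifted below. */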
9158       aco_ptr<Instruction> tmp_instr;
9159       Temp acc, pack = Temp();
9160 
9161       uint32_t pack_const = 0;
9162       for (unsigned i = 0; i < offset.size(); i++) {
9163          if (!const_offset[i])
9164             continue;
9165          pack_const |= (const_offset[i]->u32 & 0x3Fu) << (8u * i);
9166       }
9167 
9168       if (offset.type() == RegType::sgpr) {
9169          for (unsigned i = 0; i < offset.size(); i++) {
9170             if (const_offset[i])
9171                continue;
9172 
9173             acc = emit_extract_vector(ctx, offset, i, s1);
9174             acc = bld.sop2(aco_opcode::s_and_b32, bld.def(s1), bld.def(s1, scc), acc,
9175                            Operand::c32(0x3Fu));
9176 
9177             if (i) {
9178                acc = bld.sop2(aco_opcode::s_lshl_b32, bld.def(s1), bld.def(s1, scc), acc,
9179                               Operand::c32(8u * i));
9180             }
9181 
9182             if (pack == Temp()) {
9183                pack = acc;
9184             } else {
9185                pack = bld.sop2(aco_opcode::s_or_b32, bld.def(s1), bld.def(s1, scc), pack, acc);
9186             }
9187          }
9188 
9189          if (pack_const && pack != Temp())
9190             pack = bld.sop2(aco_opcode::s_or_b32, bld.def(s1), bld.def(s1, scc),
9191                             Operand::c32(pack_const), pack);
9192       } else {
9193          for (unsigned i = 0; i < offset.size(); i++) {
9194             if (const_offset[i])
9195                continue;
9196 
9197             acc = emit_extract_vector(ctx, offset, i, v1);
9198             acc = bld.vop2(aco_opcode::v_and_b32, bld.def(v1), Operand::c32(0x3Fu), acc);
9199 
9200             if (i) {
9201                acc = bld.vop2(aco_opcode::v_lshlrev_b32, bld.def(v1), Operand::c32(8u * i), acc);
9202             }
9203 
9204             if (pack == Temp()) {
9205                pack = acc;
9206             } else {
9207                pack = bld.vop2(aco_opcode::v_or_b32, bld.def(v1), pack, acc);
9208             }
9209          }
9210 
9211          if (pack_const && pack != Temp())
9212             pack = bld.vop2(aco_opcode::v_or_b32, bld.def(v1), Operand::c32(pack_const), pack);
9213       }
9214       if (pack == Temp())
9215          offset = bld.copy(bld.def(v1), Operand::c32(pack_const));
9216       else
9217          offset = pack;
9218    }
9219 
9220    std::vector<Temp> unpacked_coord;
9221    if (coord != Temp())
9222       unpacked_coord.push_back(coord);
9223    if (has_sample_index)
9224       unpacked_coord.push_back(sample_index);
9225    if (has_lod)
9226       unpacked_coord.push_back(lod);
9227    if (has_clamped_lod)
9228       unpacked_coord.push_back(clamped_lod);
9229 
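        /* emit_pack_v1 presumably packs the components into whole VGPRs, two 16-bit values
         * per register when a16 is used. */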
9230    coords = emit_pack_v1(ctx, unpacked_coord);
9231 
9232    /* pack derivatives */
9233    if (has_ddx || has_ddy) {
9234       assert(a16 == g16 || ctx->options->gfx_level >= GFX10);
9235       std::array<Temp, 2> ddxddy = {ddx, ddy};
9236       for (Temp tmp : ddxddy) {
9237          if (tmp == Temp())
9238             continue;
9239          std::vector<Temp> unpacked = {tmp};
9240          for (Temp derv : emit_pack_v1(ctx, unpacked))
9241             derivs.push_back(derv);
9242       }
9243       has_derivs = true;
9244    }
9245 
9246    unsigned dim = 0;
9247    bool da = false;
9248    if (instr->sampler_dim != GLSL_SAMPLER_DIM_BUF) {
9249       dim = ac_get_sampler_dim(ctx->options->gfx_level, instr->sampler_dim, instr->is_array);
9250       da = should_declare_array((ac_image_dim)dim);
9251    }
9252 
9253    /* Build tex instruction */
9254    unsigned dmask = nir_def_components_read(&instr->def) & 0xf;
9255    if (instr->sampler_dim == GLSL_SAMPLER_DIM_BUF)
9256       dmask = u_bit_consecutive(0, util_last_bit(dmask));
9257    if (instr->is_sparse)
9258       dmask = MAX2(dmask, 1) | 0x10;
9259    bool d16 = instr->def.bit_size == 16;
9260    Temp dst = get_ssa_temp(ctx, &instr->def);
9261    Temp tmp_dst = dst;
9262 
9263    /* gather4 selects the component by dmask and always returns vec4 (vec5 if sparse) */
9264    if (instr->op == nir_texop_tg4) {
9265       assert(instr->def.num_components == (4 + instr->is_sparse));
9266       if (instr->is_shadow)
9267          dmask = 1;
9268       else
9269          dmask = 1 << instr->component;
9270       if (tg4_integer_cube_workaround || dst.type() == RegType::sgpr)
9271          tmp_dst = bld.tmp(instr->is_sparse ? v5 : (d16 ? v2 : v4));
9272    } else if (instr->op == nir_texop_fragment_mask_fetch_amd) {
9273       tmp_dst = bld.tmp(v1);
9274    } else if (util_bitcount(dmask) != instr->def.num_components || dst.type() == RegType::sgpr) {
9275       unsigned bytes = util_bitcount(dmask) * instr->def.bit_size / 8;
9276       tmp_dst = bld.tmp(RegClass::get(RegType::vgpr, bytes));
9277    }
9278 
9279    Temp tg4_compare_cube_wa64 = Temp();
9280 
9281    if (tg4_integer_workarounds) {
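           /* On GFX8 and earlier, tg4 on integer formats apparently picks the wrong texels;
            * nudging the coordinates by -0.5 texel (computed from resinfo below) compensates.
            * See also lower_gather4_integer() in ac_nir_to_llvm.c. */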
9282       Temp half_texel[2];
9283       if (instr->sampler_dim == GLSL_SAMPLER_DIM_RECT) {
9284          half_texel[0] = half_texel[1] = bld.copy(bld.def(v1), Operand::c32(0xbf000000 /*-0.5*/));
9285       } else {
9286          Temp tg4_lod = bld.copy(bld.def(v1), Operand::zero());
9287          Temp size = bld.tmp(v2);
9288          MIMG_instruction* tex = emit_mimg(bld, aco_opcode::image_get_resinfo, size, resource,
9289                                            Operand(s4), std::vector<Temp>{tg4_lod});
9290          tex->dim = dim;
9291          tex->dmask = 0x3;
9292          tex->da = da;
9293          emit_split_vector(ctx, size, size.size());
9294 
9295          for (unsigned i = 0; i < 2; i++) {
9296             half_texel[i] = emit_extract_vector(ctx, size, i, v1);
9297             half_texel[i] = bld.vop1(aco_opcode::v_cvt_f32_i32, bld.def(v1), half_texel[i]);
9298             half_texel[i] = bld.vop1(aco_opcode::v_rcp_iflag_f32, bld.def(v1), half_texel[i]);
9299             half_texel[i] = bld.vop2(aco_opcode::v_mul_f32, bld.def(v1),
9300                                      Operand::c32(0xbf000000 /*-0.5*/), half_texel[i]);
9301          }
9302 
9303          if (instr->sampler_dim == GLSL_SAMPLER_DIM_2D && !instr->is_array) {
9304             /* In vulkan, whether the sampler uses unnormalized
9305              * coordinates or not is a dynamic property of the
9306              * sampler. Hence, to figure out whether or not we
9307              * need to divide by the texture size, we need to test
9308              * the sampler at runtime. This tests the bit set by
9309              * radv_init_sampler().
9310              */
9311             unsigned bit_idx = ffs(S_008F30_FORCE_UNNORMALIZED(1)) - 1;
9312             Temp dword0 = emit_extract_vector(ctx, sampler, 0, s1);
9313             Temp not_needed =
9314                bld.sopc(aco_opcode::s_bitcmp0_b32, bld.def(s1, scc), dword0, Operand::c32(bit_idx));
9315 
9316             not_needed = bool_to_vector_condition(ctx, not_needed);
9317             half_texel[0] = bld.vop2(aco_opcode::v_cndmask_b32, bld.def(v1),
9318                                      Operand::c32(0xbf000000 /*-0.5*/), half_texel[0], not_needed);
9319             half_texel[1] = bld.vop2(aco_opcode::v_cndmask_b32, bld.def(v1),
9320                                      Operand::c32(0xbf000000 /*-0.5*/), half_texel[1], not_needed);
9321          }
9322       }
9323 
9324       Temp new_coords[2] = {bld.vop2(aco_opcode::v_add_f32, bld.def(v1), coords[0], half_texel[0]),
9325                             bld.vop2(aco_opcode::v_add_f32, bld.def(v1), coords[1], half_texel[1])};
9326 
9327       if (tg4_integer_cube_workaround) {
9328          /* see comment in ac_nir_to_llvm.c's lower_gather4_integer() */
9329          Temp* const desc = (Temp*)alloca(resource.size() * sizeof(Temp));
9330          aco_ptr<Instruction> split{
9331             create_instruction(aco_opcode::p_split_vector, Format::PSEUDO, 1, resource.size())};
9332          split->operands[0] = Operand(resource);
9333          for (unsigned i = 0; i < resource.size(); i++) {
9334             desc[i] = bld.tmp(s1);
9335             split->definitions[i] = Definition(desc[i]);
9336          }
9337          ctx->block->instructions.emplace_back(std::move(split));
9338 
9339          Temp dfmt = bld.sop2(aco_opcode::s_bfe_u32, bld.def(s1), bld.def(s1, scc), desc[1],
9340                               Operand::c32(20u | (6u << 16)));
9341          Temp compare_cube_wa = bld.sopc(aco_opcode::s_cmp_eq_u32, bld.def(s1, scc), dfmt,
9342                                          Operand::c32(V_008F14_IMG_DATA_FORMAT_8_8_8_8));
9343 
9344          Temp nfmt;
9345          if (instr->dest_type & nir_type_uint) {
9346             nfmt = bld.sop2(aco_opcode::s_cselect_b32, bld.def(s1),
9347                             Operand::c32(V_008F14_IMG_NUM_FORMAT_USCALED),
9348                             Operand::c32(V_008F14_IMG_NUM_FORMAT_UINT), bld.scc(compare_cube_wa));
9349          } else {
9350             nfmt = bld.sop2(aco_opcode::s_cselect_b32, bld.def(s1),
9351                             Operand::c32(V_008F14_IMG_NUM_FORMAT_SSCALED),
9352                             Operand::c32(V_008F14_IMG_NUM_FORMAT_SINT), bld.scc(compare_cube_wa));
9353          }
9354          tg4_compare_cube_wa64 = bld.tmp(bld.lm);
9355          bool_to_vector_condition(ctx, compare_cube_wa, tg4_compare_cube_wa64);
9356 
9357          nfmt = bld.sop2(aco_opcode::s_lshl_b32, bld.def(s1), bld.def(s1, scc), nfmt,
9358                          Operand::c32(26u));
9359 
9360          desc[1] = bld.sop2(aco_opcode::s_and_b32, bld.def(s1), bld.def(s1, scc), desc[1],
9361                             Operand::c32(C_008F14_NUM_FORMAT));
9362          desc[1] = bld.sop2(aco_opcode::s_or_b32, bld.def(s1), bld.def(s1, scc), desc[1], nfmt);
9363 
9364          aco_ptr<Instruction> vec{
9365             create_instruction(aco_opcode::p_create_vector, Format::PSEUDO, resource.size(), 1)};
9366          for (unsigned i = 0; i < resource.size(); i++)
9367             vec->operands[i] = Operand(desc[i]);
9368          resource = bld.tmp(resource.regClass());
9369          vec->definitions[0] = Definition(resource);
9370          ctx->block->instructions.emplace_back(std::move(vec));
9371 
9372          new_coords[0] = bld.vop2(aco_opcode::v_cndmask_b32, bld.def(v1), new_coords[0], coords[0],
9373                                   tg4_compare_cube_wa64);
9374          new_coords[1] = bld.vop2(aco_opcode::v_cndmask_b32, bld.def(v1), new_coords[1], coords[1],
9375                                   tg4_compare_cube_wa64);
9376       }
9377       coords[0] = new_coords[0];
9378       coords[1] = new_coords[1];
9379    }
9380 
9381    if (instr->sampler_dim == GLSL_SAMPLER_DIM_BUF) {
9382       // FIXME: if (ctx->abi->gfx9_stride_size_workaround) return
9383       // ac_build_buffer_load_format_gfx9_safe()
9384 
9385       assert(coords.size() == 1);
9386       aco_opcode op;
9387       if (d16) {
9388          switch (util_last_bit(dmask & 0xf)) {
9389          case 1: op = aco_opcode::buffer_load_format_d16_x; break;
9390          case 2: op = aco_opcode::buffer_load_format_d16_xy; break;
9391          case 3: op = aco_opcode::buffer_load_format_d16_xyz; break;
9392          case 4: op = aco_opcode::buffer_load_format_d16_xyzw; break;
9393          default: unreachable("Tex instruction loads more than 4 components.");
9394          }
9395       } else {
9396          switch (util_last_bit(dmask & 0xf)) {
9397          case 1: op = aco_opcode::buffer_load_format_x; break;
9398          case 2: op = aco_opcode::buffer_load_format_xy; break;
9399          case 3: op = aco_opcode::buffer_load_format_xyz; break;
9400          case 4: op = aco_opcode::buffer_load_format_xyzw; break;
9401          default: unreachable("Tex instruction loads more than 4 components.");
9402          }
9403       }
9404 
9405       aco_ptr<Instruction> mubuf{create_instruction(op, Format::MUBUF, 3 + instr->is_sparse, 1)};
9406       mubuf->operands[0] = Operand(resource);
9407       mubuf->operands[1] = Operand(coords[0]);
9408       mubuf->operands[2] = Operand::c32(0);
9409       mubuf->definitions[0] = Definition(tmp_dst);
9410       mubuf->mubuf().idxen = true;
9411       mubuf->mubuf().tfe = instr->is_sparse;
9412       if (mubuf->mubuf().tfe)
9413          mubuf->operands[3] = emit_tfe_init(bld, tmp_dst);
9414       ctx->block->instructions.emplace_back(std::move(mubuf));
9415 
9416       expand_vector(ctx, tmp_dst, dst, instr->def.num_components, dmask);
9417       return;
9418    }
9419 
9420    /* gather MIMG address components */
9421    std::vector<Temp> args;
9422    if (has_wqm_coord) {
9423       args.emplace_back(wqm_coord);
9424       if (!(ctx->block->kind & block_kind_top_level))
9425          ctx->unended_linear_vgprs.push_back(wqm_coord);
9426    }
9427    if (has_offset)
9428       args.emplace_back(offset);
9429    if (has_bias)
9430       args.emplace_back(emit_pack_v1(ctx, {bias})[0]);
9431    if (has_compare)
9432       args.emplace_back(compare);
9433    if (has_derivs)
9434       args.insert(args.end(), derivs.begin(), derivs.end());
9435 
9436    args.insert(args.end(), coords.begin(), coords.end());
9437 
9438    if (instr->op == nir_texop_txf || instr->op == nir_texop_fragment_fetch_amd ||
9439        instr->op == nir_texop_fragment_mask_fetch_amd || instr->op == nir_texop_txf_ms) {
9440       aco_opcode op = level_zero || instr->sampler_dim == GLSL_SAMPLER_DIM_MS ||
9441                             instr->sampler_dim == GLSL_SAMPLER_DIM_SUBPASS_MS
9442                          ? aco_opcode::image_load
9443                          : aco_opcode::image_load_mip;
9444       Operand vdata = instr->is_sparse ? emit_tfe_init(bld, tmp_dst) : Operand(v1);
9445       MIMG_instruction* tex = emit_mimg(bld, op, tmp_dst, resource, Operand(s4), args, vdata);
9446       if (instr->op == nir_texop_fragment_mask_fetch_amd)
9447          tex->dim = da ? ac_image_2darray : ac_image_2d;
9448       else
9449          tex->dim = dim;
9450       tex->dmask = dmask & 0xf;
9451       tex->unrm = true;
9452       tex->da = da;
9453       tex->tfe = instr->is_sparse;
9454       tex->d16 = d16;
9455       tex->a16 = a16;
9456 
9457       if (instr->op == nir_texop_fragment_mask_fetch_amd) {
9458          /* Use 0x76543210 if the image doesn't have FMASK. */
9459          assert(dmask == 1 && dst.bytes() == 4);
9460          assert(dst.id() != tmp_dst.id());
9461 
9462          if (dst.regClass() == s1) {
9463             Temp is_not_null = bld.sopc(aco_opcode::s_cmp_lg_u32, bld.def(s1, scc), Operand::zero(),
9464                                         emit_extract_vector(ctx, resource, 1, s1));
9465             bld.sop2(aco_opcode::s_cselect_b32, Definition(dst), bld.as_uniform(tmp_dst),
9466                      Operand::c32(0x76543210), bld.scc(is_not_null));
9467          } else {
9468             Temp is_not_null = bld.tmp(bld.lm);
9469             bld.vopc_e64(aco_opcode::v_cmp_lg_u32, Definition(is_not_null), Operand::zero(),
9470                          emit_extract_vector(ctx, resource, 1, s1));
9471             bld.vop2(aco_opcode::v_cndmask_b32, Definition(dst),
9472                      bld.copy(bld.def(v1), Operand::c32(0x76543210)), tmp_dst, is_not_null);
9473          }
9474       } else {
9475          expand_vector(ctx, tmp_dst, dst, instr->def.num_components, dmask);
9476       }
9477       return;
9478    }
9479 
9480    bool separate_g16 = ctx->options->gfx_level >= GFX10 && g16;
9481 
9482    // TODO: would be better to do this by adding offsets, but needs the opcodes ordered.
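        /* Opcode suffixes compose as: _c compare, _d derivatives, _b bias, _l explicit LOD,
         * _lz LOD zero, _cl LOD clamp (min_lod), _o offsets, _g16 16-bit derivatives; the
         * chains below pick the combination matching the present operands. */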
9483    aco_opcode opcode = aco_opcode::image_sample;
9484    if (has_offset) { /* image_sample_*_o */
9485       if (has_clamped_lod) {
9486          if (has_compare) {
9487             opcode = aco_opcode::image_sample_c_cl_o;
9488             if (separate_g16)
9489                opcode = aco_opcode::image_sample_c_d_cl_o_g16;
9490             else if (has_derivs)
9491                opcode = aco_opcode::image_sample_c_d_cl_o;
9492             if (has_bias)
9493                opcode = aco_opcode::image_sample_c_b_cl_o;
9494          } else {
9495             opcode = aco_opcode::image_sample_cl_o;
9496             if (separate_g16)
9497                opcode = aco_opcode::image_sample_d_cl_o_g16;
9498             else if (has_derivs)
9499                opcode = aco_opcode::image_sample_d_cl_o;
9500             if (has_bias)
9501                opcode = aco_opcode::image_sample_b_cl_o;
9502          }
9503       } else if (has_compare) {
9504          opcode = aco_opcode::image_sample_c_o;
9505          if (separate_g16)
9506             opcode = aco_opcode::image_sample_c_d_o_g16;
9507          else if (has_derivs)
9508             opcode = aco_opcode::image_sample_c_d_o;
9509          if (has_bias)
9510             opcode = aco_opcode::image_sample_c_b_o;
9511          if (level_zero)
9512             opcode = aco_opcode::image_sample_c_lz_o;
9513          if (has_lod)
9514             opcode = aco_opcode::image_sample_c_l_o;
9515       } else {
9516          opcode = aco_opcode::image_sample_o;
9517          if (separate_g16)
9518             opcode = aco_opcode::image_sample_d_o_g16;
9519          else if (has_derivs)
9520             opcode = aco_opcode::image_sample_d_o;
9521          if (has_bias)
9522             opcode = aco_opcode::image_sample_b_o;
9523          if (level_zero)
9524             opcode = aco_opcode::image_sample_lz_o;
9525          if (has_lod)
9526             opcode = aco_opcode::image_sample_l_o;
9527       }
9528    } else if (has_clamped_lod) { /* image_sample_*_cl */
9529       if (has_compare) {
9530          opcode = aco_opcode::image_sample_c_cl;
9531          if (separate_g16)
9532             opcode = aco_opcode::image_sample_c_d_cl_g16;
9533          else if (has_derivs)
9534             opcode = aco_opcode::image_sample_c_d_cl;
9535          if (has_bias)
9536             opcode = aco_opcode::image_sample_c_b_cl;
9537       } else {
9538          opcode = aco_opcode::image_sample_cl;
9539          if (separate_g16)
9540             opcode = aco_opcode::image_sample_d_cl_g16;
9541          else if (has_derivs)
9542             opcode = aco_opcode::image_sample_d_cl;
9543          if (has_bias)
9544             opcode = aco_opcode::image_sample_b_cl;
9545       }
9546    } else { /* no offset */
9547       if (has_compare) {
9548          opcode = aco_opcode::image_sample_c;
9549          if (separate_g16)
9550             opcode = aco_opcode::image_sample_c_d_g16;
9551          else if (has_derivs)
9552             opcode = aco_opcode::image_sample_c_d;
9553          if (has_bias)
9554             opcode = aco_opcode::image_sample_c_b;
9555          if (level_zero)
9556             opcode = aco_opcode::image_sample_c_lz;
9557          if (has_lod)
9558             opcode = aco_opcode::image_sample_c_l;
9559       } else {
9560          opcode = aco_opcode::image_sample;
9561          if (separate_g16)
9562             opcode = aco_opcode::image_sample_d_g16;
9563          else if (has_derivs)
9564             opcode = aco_opcode::image_sample_d;
9565          if (has_bias)
9566             opcode = aco_opcode::image_sample_b;
9567          if (level_zero)
9568             opcode = aco_opcode::image_sample_lz;
9569          if (has_lod)
9570             opcode = aco_opcode::image_sample_l;
9571       }
9572    }
9573 
9574    if (instr->op == nir_texop_tg4) {
9575       /* GFX11 supports implicit LOD, but the extension is unsupported. */
9576       assert(level_zero || ctx->options->gfx_level < GFX11);
9577 
9578       if (has_offset) { /* image_gather4_*_o */
9579          if (has_compare) {
9580             opcode = aco_opcode::image_gather4_c_o;
9581             if (level_zero)
9582                opcode = aco_opcode::image_gather4_c_lz_o;
9583             if (has_lod)
9584                opcode = aco_opcode::image_gather4_c_l_o;
9585             if (has_bias)
9586                opcode = aco_opcode::image_gather4_c_b_o;
9587          } else {
9588             opcode = aco_opcode::image_gather4_o;
9589             if (level_zero)
9590                opcode = aco_opcode::image_gather4_lz_o;
9591             if (has_lod)
9592                opcode = aco_opcode::image_gather4_l_o;
9593             if (has_bias)
9594                opcode = aco_opcode::image_gather4_b_o;
9595          }
9596       } else {
9597          if (has_compare) {
9598             opcode = aco_opcode::image_gather4_c;
9599             if (level_zero)
9600                opcode = aco_opcode::image_gather4_c_lz;
9601             if (has_lod)
9602                opcode = aco_opcode::image_gather4_c_l;
9603             if (has_bias)
9604                opcode = aco_opcode::image_gather4_c_b;
9605          } else {
9606             opcode = aco_opcode::image_gather4;
9607             if (level_zero)
9608                opcode = aco_opcode::image_gather4_lz;
9609             if (has_lod)
9610                opcode = aco_opcode::image_gather4_l;
9611             if (has_bias)
9612                opcode = aco_opcode::image_gather4_b;
9613          }
9614       }
9615    } else if (instr->op == nir_texop_lod) {
9616       opcode = aco_opcode::image_get_lod;
9617    }
9618 
9619    bool implicit_derivs = bld.program->stage == fragment_fs && !has_derivs && !has_lod &&
9620                           !level_zero && instr->sampler_dim != GLSL_SAMPLER_DIM_MS &&
9621                           instr->sampler_dim != GLSL_SAMPLER_DIM_SUBPASS_MS;
9622 
9623    Operand vdata = instr->is_sparse ? emit_tfe_init(bld, tmp_dst) : Operand(v1);
9624    MIMG_instruction* tex = emit_mimg(bld, opcode, tmp_dst, resource, Operand(sampler), args, vdata);
9625    tex->dim = dim;
9626    tex->dmask = dmask & 0xf;
9627    tex->da = da;
9628    tex->unrm = instr->sampler_dim == GLSL_SAMPLER_DIM_RECT;
9629    tex->tfe = instr->is_sparse;
9630    tex->d16 = d16;
9631    tex->a16 = a16;
9632    if (implicit_derivs)
9633       set_wqm(ctx, true);
9634 
9635    if (tg4_integer_cube_workaround) {
9636       assert(tmp_dst.id() != dst.id());
9637       assert(tmp_dst.size() == dst.size());
9638 
9639       emit_split_vector(ctx, tmp_dst, tmp_dst.size());
9640       Temp val[4];
9641       for (unsigned i = 0; i < 4; i++) {
9642          val[i] = emit_extract_vector(ctx, tmp_dst, i, v1);
9643          Temp cvt_val;
9644          if (instr->dest_type & nir_type_uint)
9645             cvt_val = bld.vop1(aco_opcode::v_cvt_u32_f32, bld.def(v1), val[i]);
9646          else
9647             cvt_val = bld.vop1(aco_opcode::v_cvt_i32_f32, bld.def(v1), val[i]);
9648          val[i] = bld.vop2(aco_opcode::v_cndmask_b32, bld.def(v1), val[i], cvt_val,
9649                            tg4_compare_cube_wa64);
9650       }
9651 
9652       Temp tmp = dst.regClass() == tmp_dst.regClass() ? dst : bld.tmp(tmp_dst.regClass());
9653       if (instr->is_sparse)
9654          tmp_dst = bld.pseudo(aco_opcode::p_create_vector, Definition(tmp), val[0], val[1], val[2],
9655                               val[3], emit_extract_vector(ctx, tmp_dst, 4, v1));
9656       else
9657          tmp_dst = bld.pseudo(aco_opcode::p_create_vector, Definition(tmp), val[0], val[1], val[2],
9658                               val[3]);
9659    }
9660    unsigned mask = instr->op == nir_texop_tg4 ? (instr->is_sparse ? 0x1F : 0xF) : dmask;
9661    expand_vector(ctx, tmp_dst, dst, instr->def.num_components, mask);
9662 }
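
/* Worked example (illustrative): a tg4 gather always returns all four gathered
 * components, so the expansion mask above is 0xF, or 0x1F when the instruction is
 * sparse and a fifth dword carries the residency value. Other texture ops keep the
 * dmask computed from the components that are actually used.
 */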
9663 
9664 Operand
9665 get_phi_operand(isel_context* ctx, nir_def* ssa, RegClass rc)
9666 {
9667    Temp tmp = get_ssa_temp(ctx, ssa);
9668    if (ssa->parent_instr->type == nir_instr_type_undef) {
9669       return Operand(rc);
9670    } else if (ssa->bit_size == 1 && ssa->parent_instr->type == nir_instr_type_load_const) {
9671       bool val = nir_instr_as_load_const(ssa->parent_instr)->value[0].b;
9672       return Operand::c32_or_c64(val ? -1 : 0, ctx->program->lane_mask == s2);
9673    } else {
9674       return Operand(tmp);
9675    }
9676 }
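
/* Illustrative sketch of the cases above (hypothetical inputs): an undefined source
 * becomes an undefined Operand of the phi's register class; a 1-bit constant `true`
 * becomes an all-ones lane-mask constant, i.e. Operand::c32(0xffffffffu) for wave32
 * (lane_mask == s1) and Operand::c64(~0ull) for wave64 (lane_mask == s2); any other
 * source simply uses its SSA temporary.
 */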
9677 
9678 void
9679 visit_phi(isel_context* ctx, nir_phi_instr* instr)
9680 {
9681    Temp dst = get_ssa_temp(ctx, &instr->def);
9682    assert(instr->def.bit_size != 1 || dst.regClass() == ctx->program->lane_mask);
9683    aco_opcode opcode = instr->def.bit_size == 1 ? aco_opcode::p_boolean_phi : aco_opcode::p_phi;
9684 
9685    /* we want a sorted list of sources, since the predecessor list is also sorted */
9686    std::map<unsigned, nir_def*> phi_src;
9687    nir_foreach_phi_src (src, instr)
9688       phi_src[src->pred->index] = src->src.ssa;
9689 
9690    Instruction* phi = create_instruction(opcode, Format::PSEUDO, phi_src.size(), 1);
9691    unsigned i = 0;
9692    for (std::pair<unsigned, nir_def*> src : phi_src)
9693       phi->operands[i++] = get_phi_operand(ctx, src.second, dst.regClass());
9694    phi->definitions[0] = Definition(dst);
9695    ctx->block->instructions.emplace(ctx->block->instructions.begin(), std::move(phi));
9696 }
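
/* Worked example (hypothetical): if the NIR phi lists its sources as
 * { pred block 7 -> a, pred block 3 -> b }, the std::map above reorders them by
 * predecessor index so the emitted pseudo instruction reads
 *    dst = p_phi(b, a)
 * matching the sorted predecessor list of the merge block. The phi is inserted at
 * the front of the block's instruction list, before any non-phi instructions.
 */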
9697 
9698 void
9699 visit_undef(isel_context* ctx, nir_undef_instr* instr)
9700 {
9701    Temp dst = get_ssa_temp(ctx, &instr->def);
9702 
9703    assert(dst.type() == RegType::sgpr);
9704 
9705    if (dst.size() == 1) {
9706       Builder(ctx->program, ctx->block).copy(Definition(dst), Operand::zero());
9707    } else {
9708       aco_ptr<Instruction> vec{
9709          create_instruction(aco_opcode::p_create_vector, Format::PSEUDO, dst.size(), 1)};
9710       for (unsigned i = 0; i < dst.size(); i++)
9711          vec->operands[i] = Operand::zero();
9712       vec->definitions[0] = Definition(dst);
9713       ctx->block->instructions.emplace_back(std::move(vec));
9714    }
9715 }
9716 
9717 void
9718 begin_loop(isel_context* ctx, loop_context* lc)
9719 {
9720    append_logical_end(ctx->block);
9721    ctx->block->kind |= block_kind_loop_preheader | block_kind_uniform;
9722    Builder bld(ctx->program, ctx->block);
9723    bld.branch(aco_opcode::p_branch);
9724    unsigned loop_preheader_idx = ctx->block->index;
9725 
9726    lc->loop_exit.kind |= (block_kind_loop_exit | (ctx->block->kind & block_kind_top_level));
9727 
9728    ctx->program->next_loop_depth++;
9729 
9730    Block* loop_header = ctx->program->create_and_insert_block();
9731    loop_header->kind |= block_kind_loop_header;
9732    add_edge(loop_preheader_idx, loop_header);
9733    ctx->block = loop_header;
9734 
9735    append_logical_start(ctx->block);
9736 
9737    lc->header_idx_old = std::exchange(ctx->cf_info.parent_loop.header_idx, loop_header->index);
9738    lc->exit_old = std::exchange(ctx->cf_info.parent_loop.exit, &lc->loop_exit);
9739    lc->divergent_cont_old = std::exchange(ctx->cf_info.parent_loop.has_divergent_continue, false);
9740    lc->divergent_branch_old = std::exchange(ctx->cf_info.parent_loop.has_divergent_branch, false);
9741    lc->divergent_if_old = std::exchange(ctx->cf_info.parent_if.is_divergent, false);
9742 }
9743 
9744 void
9745 update_exec_info(isel_context* ctx)
9746 {
9747    if (!ctx->block->loop_nest_depth && !ctx->cf_info.parent_if.is_divergent)
9748       ctx->cf_info.exec.potentially_empty_discard = false;
9749 
9750    ctx->cf_info.exec.potentially_empty_break &=
9751       ctx->block->loop_nest_depth >= ctx->cf_info.exec.potentially_empty_break_depth;
9752    ctx->cf_info.exec.potentially_empty_continue &=
9753       ctx->block->loop_nest_depth >= ctx->cf_info.exec.potentially_empty_continue_depth;
9754 
9755    if (ctx->block->loop_nest_depth == ctx->cf_info.exec.potentially_empty_break_depth &&
9756        !ctx->cf_info.parent_if.is_divergent && !ctx->cf_info.parent_loop.has_divergent_continue) {
9757       ctx->cf_info.exec.potentially_empty_break = false;
9758    }
9759    if (ctx->block->loop_nest_depth == ctx->cf_info.exec.potentially_empty_continue_depth &&
9760        !ctx->cf_info.parent_if.is_divergent) {
9761       ctx->cf_info.exec.potentially_empty_continue = false;
9762    }
9763 
9764    if (!ctx->cf_info.exec.potentially_empty_break)
9765       ctx->cf_info.exec.potentially_empty_break_depth = UINT16_MAX;
9766    if (!ctx->cf_info.exec.potentially_empty_continue)
9767       ctx->cf_info.exec.potentially_empty_continue_depth = UINT16_MAX;
9768 }
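
/* Worked example (illustrative): a divergent break emitted at loop_nest_depth 2
 * records potentially_empty_break with potentially_empty_break_depth = 2. The flag
 * survives while blocks at depth >= 2 are emitted (unless the divergent control
 * flow at that depth has already ended); once emission reaches a block at depth 1,
 * the flag is cleared and the depth is reset to UINT16_MAX.
 */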
9769 
9770 void
9771 end_loop(isel_context* ctx, loop_context* lc)
9772 {
9773    // TODO: what if a loop ends with an unconditional or uniformly branched continue
9774    //       and this branch is never taken?
9775    if (!ctx->cf_info.has_branch) {
9776       unsigned loop_header_idx = ctx->cf_info.parent_loop.header_idx;
9777       Builder bld(ctx->program, ctx->block);
9778       append_logical_end(ctx->block);
9779 
9780       /* No need to check exec.potentially_empty_break/continue originating inside the loop. In the
9781        * only case where it's possible at this point (divergent break after divergent continue), we
9782        * should continue anyway. */
9783       if (ctx->cf_info.exec.potentially_empty_discard ||
9784           (ctx->cf_info.exec.potentially_empty_break &&
9785            ctx->cf_info.exec.potentially_empty_break_depth < ctx->block->loop_nest_depth) ||
9786           (ctx->cf_info.exec.potentially_empty_continue &&
9787            ctx->cf_info.exec.potentially_empty_continue_depth < ctx->block->loop_nest_depth)) {
9788          /* Discards can result in code running with an empty exec mask.
9789           * This would result in divergent breaks never being taken. As a
9790           * workaround, break the loop when the exec mask is empty instead of
9791           * always continuing. */
9792          ctx->block->kind |= (block_kind_continue_or_break | block_kind_uniform);
9793          unsigned block_idx = ctx->block->index;
9794 
9795          /* create helper blocks to avoid critical edges */
9796          Block* break_block = ctx->program->create_and_insert_block();
9797          break_block->kind = block_kind_uniform;
9798          bld.reset(break_block);
9799          bld.branch(aco_opcode::p_branch);
9800          add_linear_edge(block_idx, break_block);
9801          add_linear_edge(break_block->index, &lc->loop_exit);
9802 
9803          Block* continue_block = ctx->program->create_and_insert_block();
9804          continue_block->kind = block_kind_uniform;
9805          bld.reset(continue_block);
9806          bld.branch(aco_opcode::p_branch);
9807          add_linear_edge(block_idx, continue_block);
9808          add_linear_edge(continue_block->index, &ctx->program->blocks[loop_header_idx]);
9809 
9810          if (!ctx->cf_info.parent_loop.has_divergent_branch)
9811             add_logical_edge(block_idx, &ctx->program->blocks[loop_header_idx]);
9812          ctx->block = &ctx->program->blocks[block_idx];
9813 
9814          /* SGPR temporaries might need loop exit phis to be created. */
9815          ctx->program->should_repair_ssa = true;
9816       } else {
9817          ctx->block->kind |= (block_kind_continue | block_kind_uniform);
9818          if (!ctx->cf_info.parent_loop.has_divergent_branch)
9819             add_edge(ctx->block->index, &ctx->program->blocks[loop_header_idx]);
9820          else
9821             add_linear_edge(ctx->block->index, &ctx->program->blocks[loop_header_idx]);
9822       }
9823 
9824       bld.reset(ctx->block);
9825       bld.branch(aco_opcode::p_branch);
9826    }
9827 
9828    ctx->cf_info.has_branch = false;
9829    ctx->program->next_loop_depth--;
9830 
9831    /* emit loop successor block */
9832    ctx->block = ctx->program->insert_block(std::move(lc->loop_exit));
9833    append_logical_start(ctx->block);
9834 
9835    ctx->cf_info.parent_loop.header_idx = lc->header_idx_old;
9836    ctx->cf_info.parent_loop.exit = lc->exit_old;
9837    ctx->cf_info.parent_loop.has_divergent_continue = lc->divergent_cont_old;
9838    ctx->cf_info.parent_loop.has_divergent_branch = lc->divergent_branch_old;
9839    ctx->cf_info.parent_if.is_divergent = lc->divergent_if_old;
9840    update_exec_info(ctx);
9841 }
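
/* Sketch of the continue_or_break case above (illustrative):
 *
 *        latch (continue_or_break)
 *           |                   \
 *     continue helper        break helper
 *           |                     \
 *      loop header             loop exit
 *
 * Both helper blocks contain only a p_branch; they exist to keep the linear CFG
 * free of critical edges while letting a wave with an empty exec mask leave the
 * loop through the break edge.
 */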
9842 
9843 void
9844 emit_loop_jump(isel_context* ctx, bool is_break)
9845 {
9846    Builder bld(ctx->program, ctx->block);
9847    Block* logical_target;
9848    append_logical_end(ctx->block);
9849    unsigned idx = ctx->block->index;
9850 
9851    /* If exec is empty inside uniform control flow in a loop, we can assume that all invocations
9852     * of the loop are inactive. Breaking from the loop is the right thing to do in that case.
9853     * We shouldn't perform a uniform continue, or else we might never reach a break.
9854     */
9855    bool potentially_empty_exec = ctx->cf_info.exec.potentially_empty_discard ||
9856                                  ctx->cf_info.exec.potentially_empty_break ||
9857                                  ctx->cf_info.exec.potentially_empty_continue;
9858 
9859    if (is_break) {
9860       logical_target = ctx->cf_info.parent_loop.exit;
9861       add_logical_edge(idx, logical_target);
9862       ctx->block->kind |= block_kind_break;
9863 
9864       if (!ctx->cf_info.parent_if.is_divergent &&
9865           !ctx->cf_info.parent_loop.has_divergent_continue) {
9866          /* uniform break - directly jump out of the loop */
9867          ctx->block->kind |= block_kind_uniform;
9868          ctx->cf_info.has_branch = true;
9869          bld.branch(aco_opcode::p_branch);
9870          add_linear_edge(idx, logical_target);
9871          return;
9872       }
9873       ctx->cf_info.parent_loop.has_divergent_branch = true;
9874 
9875       if (!ctx->cf_info.exec.potentially_empty_break) {
9876          ctx->cf_info.exec.potentially_empty_break = true;
9877          ctx->cf_info.exec.potentially_empty_break_depth = ctx->block->loop_nest_depth;
9878       }
9879    } else {
9880       logical_target = &ctx->program->blocks[ctx->cf_info.parent_loop.header_idx];
9881       add_logical_edge(idx, logical_target);
9882       ctx->block->kind |= block_kind_continue;
9883 
9884       if (!ctx->cf_info.parent_if.is_divergent && !potentially_empty_exec) {
9885          /* uniform continue - directly jump to the loop header */
9886          ctx->block->kind |= block_kind_uniform;
9887          ctx->cf_info.has_branch = true;
9888          bld.branch(aco_opcode::p_branch);
9889          add_linear_edge(idx, logical_target);
9890          return;
9891       }
9892 
9893       ctx->cf_info.parent_loop.has_divergent_branch = true;
9894 
9895       if (ctx->cf_info.parent_if.is_divergent) {
9896          /* for potential uniform breaks after this continue,
9897             we must ensure that they are handled correctly */
9898          ctx->cf_info.parent_loop.has_divergent_continue = true;
9899 
9900          if (!ctx->cf_info.exec.potentially_empty_continue) {
9901             ctx->cf_info.exec.potentially_empty_continue = true;
9902             ctx->cf_info.exec.potentially_empty_continue_depth = ctx->block->loop_nest_depth;
9903          }
9904       }
9905    }
9906 
9907    /* remove critical edges from linear CFG */
9908    bld.branch(aco_opcode::p_branch);
9909    Block* break_block = ctx->program->create_and_insert_block();
9910    break_block->kind |= block_kind_uniform;
9911    add_linear_edge(idx, break_block);
9912    /* the loop_header pointer might be invalidated by this point */
9913    if (!is_break)
9914       logical_target = &ctx->program->blocks[ctx->cf_info.parent_loop.header_idx];
9915    add_linear_edge(break_block->index, logical_target);
9916    bld.reset(break_block);
9917    bld.branch(aco_opcode::p_branch);
9918 
9919    Block* continue_block = ctx->program->create_and_insert_block();
9920    add_linear_edge(idx, continue_block);
9921    append_logical_start(continue_block);
9922    ctx->block = continue_block;
9923 }
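
/* Worked example for the potentially_empty_exec rule above (the shader is
 * hypothetical):
 *
 *    loop {
 *       if (divergent_cond) demote;   // exec may now be empty
 *       if (uniform_cond) continue;
 *       if (uniform_cond2) break;
 *    }
 *
 * A wave whose exec mask became empty must not turn the continue into an
 * unconditional jump back to the header, or it might loop forever without ever
 * reaching the break; the potentially_empty_exec check above therefore skips the
 * uniform-continue fast path and emits the non-uniform continue instead.
 */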
9924 
9925 void
9926 emit_loop_break(isel_context* ctx)
9927 {
9928    emit_loop_jump(ctx, true);
9929 }
9930 
9931 void
9932 emit_loop_continue(isel_context* ctx)
9933 {
9934    emit_loop_jump(ctx, false);
9935 }
9936 
9937 void
9938 visit_jump(isel_context* ctx, nir_jump_instr* instr)
9939 {
9940    end_empty_exec_skip(ctx);
9941 
9942    switch (instr->type) {
9943    case nir_jump_break: emit_loop_break(ctx); break;
9944    case nir_jump_continue: emit_loop_continue(ctx); break;
9945    default: isel_err(&instr->instr, "Unknown NIR jump instr"); abort();
9946    }
9947 }
9948 
9949 void
9950 visit_debug_info(isel_context* ctx, nir_debug_info_instr* instr)
9951 {
9952    ac_shader_debug_info info;
9953    memset(&info, 0, sizeof(info));
9954 
9955    switch (instr->type) {
9956    case nir_debug_info_src_loc:
9957       info.type = ac_shader_debug_info_src_loc;
9958       info.src_loc.file = strdup(nir_src_as_string(instr->src_loc.filename));
9959       info.src_loc.line = instr->src_loc.line;
9960       info.src_loc.column = instr->src_loc.column;
9961       info.src_loc.spirv_offset = instr->src_loc.spirv_offset;
9962       break;
9963    default:
9964       return;
9965    }
9966 
9967    Builder bld(ctx->program, ctx->block);
9968    bld.pseudo(aco_opcode::p_debug_info, Operand::c32(ctx->program->debug_info.size()));
9969 
9970    ctx->program->debug_info.push_back(info);
9971 }
9972 
9973 void
9974 visit_block(isel_context* ctx, nir_block* block)
9975 {
9976    if (ctx->block->kind & block_kind_top_level) {
9977       Builder bld(ctx->program, ctx->block);
9978       for (Temp tmp : ctx->unended_linear_vgprs) {
9979          bld.pseudo(aco_opcode::p_end_linear_vgpr, tmp);
9980       }
9981       ctx->unended_linear_vgprs.clear();
9982    }
9983 
9984    nir_foreach_phi (instr, block)
9985       visit_phi(ctx, instr);
9986 
9987    nir_phi_instr* last_phi = nir_block_last_phi_instr(block);
9988    begin_empty_exec_skip(ctx, last_phi ? &last_phi->instr : NULL, block);
9989 
9990    ctx->block->instructions.reserve(ctx->block->instructions.size() +
9991                                     exec_list_length(&block->instr_list) * 2);
9992    nir_foreach_instr (instr, block) {
9993       switch (instr->type) {
9994       case nir_instr_type_alu: visit_alu_instr(ctx, nir_instr_as_alu(instr)); break;
9995       case nir_instr_type_load_const: visit_load_const(ctx, nir_instr_as_load_const(instr)); break;
9996       case nir_instr_type_intrinsic: visit_intrinsic(ctx, nir_instr_as_intrinsic(instr)); break;
9997       case nir_instr_type_tex: visit_tex(ctx, nir_instr_as_tex(instr)); break;
9998       case nir_instr_type_phi: break;
9999       case nir_instr_type_undef: visit_undef(ctx, nir_instr_as_undef(instr)); break;
10000       case nir_instr_type_deref: break;
10001       case nir_instr_type_jump: visit_jump(ctx, nir_instr_as_jump(instr)); break;
10002       case nir_instr_type_debug_info: visit_debug_info(ctx, nir_instr_as_debug_info(instr)); break;
10003       default: isel_err(instr, "Unknown NIR instr type");
10004       }
10005    }
10006 }
10007 
10008 static void begin_uniform_if_then(isel_context* ctx, if_context* ic, Temp cond);
10009 static void begin_uniform_if_else(isel_context* ctx, if_context* ic, bool logical_else = true);
10010 static void end_uniform_if(isel_context* ctx, if_context* ic, bool logical_else = true);
10011 
10012 static void
10013 visit_loop(isel_context* ctx, nir_loop* loop)
10014 {
10015    assert(!nir_loop_has_continue_construct(loop));
10016    loop_context lc;
10017    begin_loop(ctx, &lc);
10018 
10019    visit_cf_list(ctx, &loop->body);
10020 
10021    end_loop(ctx, &lc);
10022 }
10023 
10024 static void
10025 begin_divergent_if_then(isel_context* ctx, if_context* ic, Temp cond,
10026                         nir_selection_control sel_ctrl = nir_selection_control_none)
10027 {
10028    append_logical_end(ctx->block);
10029    ctx->block->kind |= block_kind_branch;
10030 
10031    /* branch to linear then block */
10032    assert(cond.regClass() == ctx->program->lane_mask);
10033    aco_ptr<Instruction> branch;
10034    branch.reset(create_instruction(aco_opcode::p_cbranch_z, Format::PSEUDO_BRANCH, 1, 0));
10035    branch->operands[0] = Operand(cond);
10036    bool never_taken =
10037       sel_ctrl == nir_selection_control_divergent_always_taken &&
10038       !(ctx->cf_info.exec.potentially_empty_discard || ctx->cf_info.exec.potentially_empty_break ||
10039         ctx->cf_info.exec.potentially_empty_continue);
10040    branch->branch().rarely_taken = sel_ctrl == nir_selection_control_flatten || never_taken;
10041    branch->branch().never_taken = never_taken;
10042    ctx->block->instructions.push_back(std::move(branch));
10043 
10044    ic->BB_if_idx = ctx->block->index;
10045    ic->BB_invert = Block();
10046    /* Invert blocks are intentionally not marked as top level because they
10047     * are not part of the logical CFG. */
10048    ic->BB_invert.kind |= block_kind_invert;
10049    ic->BB_endif = Block();
10050    ic->BB_endif.kind |= (block_kind_merge | (ctx->block->kind & block_kind_top_level));
10051 
10052    ic->exec_old = ctx->cf_info.exec;
10053    ic->divergent_old = ctx->cf_info.parent_if.is_divergent;
10054    ic->had_divergent_discard_old = ctx->cf_info.had_divergent_discard;
10055    ctx->cf_info.parent_if.is_divergent = true;
10056 
10057    /* divergent branches use cbranch_execz */
10058    ctx->cf_info.exec = exec_info();
10059 
10060    /** emit logical then block */
10061    ctx->program->next_divergent_if_logical_depth++;
10062    Block* BB_then_logical = ctx->program->create_and_insert_block();
10063    add_edge(ic->BB_if_idx, BB_then_logical);
10064    ctx->block = BB_then_logical;
10065    append_logical_start(BB_then_logical);
10066 }
10067 
10068 static void
10069 begin_divergent_if_else(isel_context* ctx, if_context* ic,
10070                         nir_selection_control sel_ctrl = nir_selection_control_none)
10071 {
10072    Block* BB_then_logical = ctx->block;
10073    append_logical_end(BB_then_logical);
10074    /* branch from logical then block to invert block */
10075    aco_ptr<Instruction> branch;
10076    branch.reset(create_instruction(aco_opcode::p_branch, Format::PSEUDO_BRANCH, 0, 0));
10077    BB_then_logical->instructions.emplace_back(std::move(branch));
10078    add_linear_edge(BB_then_logical->index, &ic->BB_invert);
10079    if (!ctx->cf_info.parent_loop.has_divergent_branch)
10080       add_logical_edge(BB_then_logical->index, &ic->BB_endif);
10081    BB_then_logical->kind |= block_kind_uniform;
10082    assert(!ctx->cf_info.has_branch);
10083    ctx->cf_info.parent_loop.has_divergent_branch = false;
10084    ctx->program->next_divergent_if_logical_depth--;
10085 
10086    /** emit linear then block */
10087    Block* BB_then_linear = ctx->program->create_and_insert_block();
10088    BB_then_linear->kind |= block_kind_uniform;
10089    add_linear_edge(ic->BB_if_idx, BB_then_linear);
10090    /* branch from linear then block to invert block */
10091    branch.reset(create_instruction(aco_opcode::p_branch, Format::PSEUDO_BRANCH, 0, 0));
10092    BB_then_linear->instructions.emplace_back(std::move(branch));
10093    add_linear_edge(BB_then_linear->index, &ic->BB_invert);
10094 
10095    /** emit invert merge block */
10096    ctx->block = ctx->program->insert_block(std::move(ic->BB_invert));
10097    ic->invert_idx = ctx->block->index;
10098 
10099    /* branch to linear else block (skip else) */
10100    branch.reset(create_instruction(aco_opcode::p_branch, Format::PSEUDO_BRANCH, 0, 0));
10101    bool never_taken =
10102       sel_ctrl == nir_selection_control_divergent_always_taken &&
10103       !(ctx->cf_info.exec.potentially_empty_discard || ctx->cf_info.exec.potentially_empty_break ||
10104         ctx->cf_info.exec.potentially_empty_continue);
10105    branch->branch().rarely_taken = sel_ctrl == nir_selection_control_flatten || never_taken;
10106    branch->branch().never_taken = never_taken;
10107    ctx->block->instructions.push_back(std::move(branch));
10108 
10109    ic->exec_old.combine(ctx->cf_info.exec);
10110    /* divergent branches use cbranch_execz */
10111    ctx->cf_info.exec = exec_info();
10112 
10113    ic->had_divergent_discard_then = ctx->cf_info.had_divergent_discard;
10114    ctx->cf_info.had_divergent_discard = ic->had_divergent_discard_old;
10115 
10116    /** emit logical else block */
10117    ctx->program->next_divergent_if_logical_depth++;
10118    Block* BB_else_logical = ctx->program->create_and_insert_block();
10119    add_logical_edge(ic->BB_if_idx, BB_else_logical);
10120    add_linear_edge(ic->invert_idx, BB_else_logical);
10121    ctx->block = BB_else_logical;
10122    append_logical_start(BB_else_logical);
10123 }
10124 
10125 static void
10126 end_divergent_if(isel_context* ctx, if_context* ic)
10127 {
10128    Block* BB_else_logical = ctx->block;
10129    append_logical_end(BB_else_logical);
10130 
10131    /* branch from logical else block to endif block */
10132    aco_ptr<Instruction> branch;
10133    branch.reset(create_instruction(aco_opcode::p_branch, Format::PSEUDO_BRANCH, 0, 0));
10134    BB_else_logical->instructions.emplace_back(std::move(branch));
10135    add_linear_edge(BB_else_logical->index, &ic->BB_endif);
10136    if (!ctx->cf_info.parent_loop.has_divergent_branch)
10137       add_logical_edge(BB_else_logical->index, &ic->BB_endif);
10138    BB_else_logical->kind |= block_kind_uniform;
10139    ctx->program->next_divergent_if_logical_depth--;
10140 
10141    assert(!ctx->cf_info.has_branch);
10142    ctx->cf_info.parent_loop.has_divergent_branch = false;
10143 
10144    /** emit linear else block */
10145    Block* BB_else_linear = ctx->program->create_and_insert_block();
10146    BB_else_linear->kind |= block_kind_uniform;
10147    add_linear_edge(ic->invert_idx, BB_else_linear);
10148 
10149    /* branch from linear else block to endif block */
10150    branch.reset(create_instruction(aco_opcode::p_branch, Format::PSEUDO_BRANCH, 0, 0));
10151    BB_else_linear->instructions.emplace_back(std::move(branch));
10152    add_linear_edge(BB_else_linear->index, &ic->BB_endif);
10153 
10154    /** emit endif merge block */
10155    ctx->block = ctx->program->insert_block(std::move(ic->BB_endif));
10156    append_logical_start(ctx->block);
10157 
10158    ctx->cf_info.parent_if.is_divergent = ic->divergent_old;
10159    ctx->cf_info.exec.combine(ic->exec_old);
10160    update_exec_info(ctx);
10161    ctx->cf_info.had_divergent_discard |= ic->had_divergent_discard_then;
10162 
10163    /* We shouldn't create unreachable blocks. */
10164    assert(!ctx->block->logical_preds.empty());
10165 }
10166 
10167 static void
10168 begin_uniform_if_then(isel_context* ctx, if_context* ic, Temp cond)
10169 {
10170    assert(!cond.id() || cond.regClass() == s1);
10171 
10172    ic->cond = cond;
10173 
10174    append_logical_end(ctx->block);
10175    ctx->block->kind |= block_kind_uniform;
10176 
10177    aco_ptr<Instruction> branch;
10178    aco_opcode branch_opcode = aco_opcode::p_cbranch_z;
10179    branch.reset(create_instruction(branch_opcode, Format::PSEUDO_BRANCH, 1, 0));
10180    if (cond.id()) {
10181       branch->operands[0] = Operand(cond);
10182       branch->operands[0].setPrecolored(scc);
10183    } else {
10184       branch->operands[0] = Operand(exec, ctx->program->lane_mask);
10185       branch->branch().rarely_taken = true;
10186    }
10187    ctx->block->instructions.emplace_back(std::move(branch));
10188 
10189    ic->BB_if_idx = ctx->block->index;
10190    ic->BB_endif = Block();
10191    ic->BB_endif.kind |= ctx->block->kind & block_kind_top_level;
10192 
10193    ctx->cf_info.has_branch = false;
10194    ctx->cf_info.parent_loop.has_divergent_branch = false;
10195 
10196    ic->had_divergent_discard_old = ctx->cf_info.had_divergent_discard;
10197    ic->has_divergent_continue_old = ctx->cf_info.parent_loop.has_divergent_continue;
10198 
10199    /** emit then block */
10200    if (ic->cond.id())
10201       ctx->program->next_uniform_if_depth++;
10202    Block* BB_then = ctx->program->create_and_insert_block();
10203    add_edge(ic->BB_if_idx, BB_then);
10204    append_logical_start(BB_then);
10205    ctx->block = BB_then;
10206 }
10207 
10208 static void
10209 begin_uniform_if_else(isel_context* ctx, if_context* ic, bool logical_else)
10210 {
10211    Block* BB_then = ctx->block;
10212 
10213    if (!ctx->cf_info.has_branch) {
10214       append_logical_end(BB_then);
10215       /* branch from then block to endif block */
10216       aco_ptr<Instruction> branch;
10217       branch.reset(create_instruction(aco_opcode::p_branch, Format::PSEUDO_BRANCH, 0, 0));
10218       BB_then->instructions.emplace_back(std::move(branch));
10219       add_linear_edge(BB_then->index, &ic->BB_endif);
10220       if (!ctx->cf_info.parent_loop.has_divergent_branch)
10221          add_logical_edge(BB_then->index, &ic->BB_endif);
10222       BB_then->kind |= block_kind_uniform;
10223    }
10224 
10225    ctx->cf_info.has_branch = false;
10226    ctx->cf_info.parent_loop.has_divergent_branch = false;
10227 
10228    ic->had_divergent_discard_then = ctx->cf_info.had_divergent_discard;
10229    ctx->cf_info.had_divergent_discard = ic->had_divergent_discard_old;
10230 
10231    ic->has_divergent_continue_then = ctx->cf_info.parent_loop.has_divergent_continue;
10232    ctx->cf_info.parent_loop.has_divergent_continue = ic->has_divergent_continue_old;
10233 
10234    /** emit else block */
10235    Block* BB_else = ctx->program->create_and_insert_block();
10236    if (logical_else) {
10237       add_edge(ic->BB_if_idx, BB_else);
10238       append_logical_start(BB_else);
10239    } else {
10240       add_linear_edge(ic->BB_if_idx, BB_else);
10241    }
10242    ctx->block = BB_else;
10243 }
10244 
10245 static void
10246 end_uniform_if(isel_context* ctx, if_context* ic, bool logical_else)
10247 {
10248    Block* BB_else = ctx->block;
10249 
10250    if (!ctx->cf_info.has_branch) {
10251       if (logical_else)
10252          append_logical_end(BB_else);
10253       /* branch from then block to endif block */
10254       aco_ptr<Instruction> branch;
10255       branch.reset(create_instruction(aco_opcode::p_branch, Format::PSEUDO_BRANCH, 0, 0));
10256       BB_else->instructions.emplace_back(std::move(branch));
10257       add_linear_edge(BB_else->index, &ic->BB_endif);
10258       if (logical_else && !ctx->cf_info.parent_loop.has_divergent_branch)
10259          add_logical_edge(BB_else->index, &ic->BB_endif);
10260       BB_else->kind |= block_kind_uniform;
10261    }
10262 
10263    ctx->cf_info.has_branch = false;
10264    ctx->cf_info.parent_loop.has_divergent_branch = false;
10265    ctx->cf_info.had_divergent_discard |= ic->had_divergent_discard_then;
10266    ctx->cf_info.parent_loop.has_divergent_continue |= ic->has_divergent_continue_then;
10267 
10268    /** emit endif merge block */
10269    if (ic->cond.id())
10270       ctx->program->next_uniform_if_depth--;
10271    ctx->block = ctx->program->insert_block(std::move(ic->BB_endif));
10272    append_logical_start(ctx->block);
10273 
10274    /* We shouldn't create unreachable blocks. */
10275    assert(!ctx->block->logical_preds.empty());
10276 }
10277 
10278 static void
10279 end_empty_exec_skip(isel_context* ctx)
10280 {
10281    if (ctx->cf_info.skipping_empty_exec) {
10282       begin_uniform_if_else(ctx, &ctx->cf_info.empty_exec_skip, false);
10283       end_uniform_if(ctx, &ctx->cf_info.empty_exec_skip, false);
10284       ctx->cf_info.skipping_empty_exec = false;
10285 
10286       ctx->cf_info.exec.combine(ctx->cf_info.empty_exec_skip.exec_old);
10287    }
10288 }
10289 
10290 /*
10291  * If necessary, begin a branch which skips over instructions if exec is empty.
10292  *
10293  * The linear CFG:
10294  *                        BB_IF
10295  *                        /    \
10296  *       BB_THEN (logical)      BB_ELSE (linear)
10297  *                        \    /
10298  *                        BB_ENDIF
10299  *
10300  * The logical CFG:
10301  *                        BB_IF
10302  *                          |
10303  *                       BB_THEN (logical)
10304  *                          |
10305  *                       BB_ENDIF
10306  *
10307  * BB_THEN should not end with a branch, since that would make BB_ENDIF unreachable.
10308  */
10309 static void
10310 begin_empty_exec_skip(isel_context* ctx, nir_instr* after_instr, nir_block* block)
10311 {
10312    if (!ctx->cf_info.exec.potentially_empty_discard && !ctx->cf_info.exec.potentially_empty_break &&
10313        !ctx->cf_info.exec.potentially_empty_continue)
10314       return;
10315 
10316    assert(!(ctx->block->kind & block_kind_top_level));
10317 
10318    bool further_cf_empty = !nir_cf_node_next(&block->cf_node);
10319 
10320    bool rest_of_block_empty = false;
10321    if (after_instr) {
10322       rest_of_block_empty =
10323          nir_instr_is_last(after_instr) || nir_instr_next(after_instr)->type == nir_instr_type_jump;
10324    } else {
10325       rest_of_block_empty = exec_list_is_empty(&block->instr_list) ||
10326                             nir_block_first_instr(block)->type == nir_instr_type_jump;
10327    }
10328 
10329    assert(!(ctx->block->kind & block_kind_export_end) || rest_of_block_empty);
10330 
10331    if (rest_of_block_empty && further_cf_empty)
10332       return;
10333 
10334    /* Don't nest these skipping branches. It is not worth the complexity. */
10335    end_empty_exec_skip(ctx);
10336 
10337    begin_uniform_if_then(ctx, &ctx->cf_info.empty_exec_skip, Temp());
10338    ctx->cf_info.skipping_empty_exec = true;
10339 
10340    ctx->cf_info.empty_exec_skip.exec_old = ctx->cf_info.exec;
10341    ctx->cf_info.exec = exec_info();
10342 
10343    ctx->program->should_repair_ssa = true;
10344 }
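
/* Illustrative example (the shader is hypothetical): after a divergent demote, the
 * rest of the NIR block may still contain expensive work:
 *
 *    if (divergent_cond) demote;
 *    x = texture(...);            // would still be executed by an all-inactive wave
 *
 * Because exec.potentially_empty_discard is set, begin_empty_exec_skip() wraps the
 * remaining instructions in a branch on exec (begin_uniform_if_then() with an empty
 * cond), so a wave whose exec mask is zero takes the linear else edge and skips the
 * work until the next merge block.
 */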
10345 
10346 static void
10347 visit_if(isel_context* ctx, nir_if* if_stmt)
10348 {
10349    Temp cond = get_ssa_temp(ctx, if_stmt->condition.ssa);
10350    Builder bld(ctx->program, ctx->block);
10351    aco_ptr<Instruction> branch;
10352    if_context ic;
10353 
10354    if (!nir_src_is_divergent(&if_stmt->condition)) { /* uniform condition */
10355       /**
10356        * Uniform conditionals are represented in the following way*) :
10357        *
10358        * The linear and logical CFG:
10359        *                        BB_IF
10360        *                        /    \
10361        *       BB_THEN (logical)      BB_ELSE (logical)
10362        *                        \    /
10363        *                        BB_ENDIF
10364        *
10365        * *) Exceptions may be due to break and continue statements within loops.
10366        *    If a break/continue happens within uniform control flow, it branches
10367        *    to the loop exit/entry block. Otherwise, it branches to the next
10368        *    merge block.
10369        **/
10370 
10371       assert(cond.regClass() == ctx->program->lane_mask);
10372       cond = bool_to_scalar_condition(ctx, cond);
10373 
10374       begin_uniform_if_then(ctx, &ic, cond);
10375       visit_cf_list(ctx, &if_stmt->then_list);
10376 
10377       begin_uniform_if_else(ctx, &ic);
10378       visit_cf_list(ctx, &if_stmt->else_list);
10379 
10380       end_uniform_if(ctx, &ic);
10381    } else { /* non-uniform condition */
10382       /**
10383        * To maintain a logical and linear CFG without critical edges,
10384        * non-uniform conditionals are represented in the following way*) :
10385        *
10386        * The linear CFG:
10387        *                        BB_IF
10388        *                        /    \
10389        *       BB_THEN (logical)      BB_THEN (linear)
10390        *                        \    /
10391        *                        BB_INVERT (linear)
10392        *                        /    \
10393        *       BB_ELSE (logical)      BB_ELSE (linear)
10394        *                        \    /
10395        *                        BB_ENDIF
10396        *
10397        * The logical CFG:
10398        *                        BB_IF
10399        *                        /    \
10400        *       BB_THEN (logical)      BB_ELSE (logical)
10401        *                        \    /
10402        *                        BB_ENDIF
10403        *
10404        * *) Exceptions may be due to break and continue statements within loops.
10405        **/
10406 
10407       begin_divergent_if_then(ctx, &ic, cond, if_stmt->control);
10408       visit_cf_list(ctx, &if_stmt->then_list);
10409 
10410       begin_divergent_if_else(ctx, &ic, if_stmt->control);
10411       visit_cf_list(ctx, &if_stmt->else_list);
10412 
10413       end_divergent_if(ctx, &ic);
10414    }
10415 }
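
/* Sketch of how a divergent if maps onto the helpers above (illustrative):
 *
 *    if (divergent_cond) { A } else { B }
 *
 *    begin_divergent_if_then()  ->  BB_IF: p_cbranch_z(cond); BB_THEN (logical): A
 *    begin_divergent_if_else()  ->  BB_THEN (linear), BB_INVERT; BB_ELSE (logical): B
 *    end_divergent_if()         ->  BB_ELSE (linear), BB_ENDIF merge block
 *
 * which produces exactly the linear and logical CFGs drawn in the comment above.
 */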
10416 
10417 static void
10418 visit_cf_list(isel_context* ctx, struct exec_list* list)
10419 {
10420    if (nir_cf_list_is_empty_block(list))
10421       return;
10422 
10423    bool skipping_empty_exec_old = ctx->cf_info.skipping_empty_exec;
10424    if_context empty_exec_skip_old = std::move(ctx->cf_info.empty_exec_skip);
10425    ctx->cf_info.skipping_empty_exec = false;
10426 
10427    foreach_list_typed (nir_cf_node, node, node, list) {
10428       switch (node->type) {
10429       case nir_cf_node_block: visit_block(ctx, nir_cf_node_as_block(node)); break;
10430       case nir_cf_node_if: visit_if(ctx, nir_cf_node_as_if(node)); break;
10431       case nir_cf_node_loop: visit_loop(ctx, nir_cf_node_as_loop(node)); break;
10432       default: unreachable("unimplemented cf list type");
10433       }
10434    }
10435 
10436    end_empty_exec_skip(ctx);
10437    ctx->cf_info.skipping_empty_exec = skipping_empty_exec_old;
10438    ctx->cf_info.empty_exec_skip = std::move(empty_exec_skip_old);
10439 }
10440 
10441 static void
10442 export_mrt(isel_context* ctx, const struct aco_export_mrt* mrt)
10443 {
10444    Builder bld(ctx->program, ctx->block);
10445 
10446    bld.exp(aco_opcode::exp, mrt->out[0], mrt->out[1], mrt->out[2], mrt->out[3],
10447            mrt->enabled_channels, mrt->target, mrt->compr);
10448 
10449    ctx->program->has_color_exports = true;
10450 }
10451 
10452 static bool
10453 export_fs_mrt_color(isel_context* ctx, const struct aco_ps_epilog_info* info, Temp colors[4],
10454                     unsigned slot, struct aco_export_mrt* mrt)
10455 {
10456    unsigned col_format = (info->spi_shader_col_format >> (slot * 4)) & 0xf;
10457 
10458    if (col_format == V_028714_SPI_SHADER_ZERO)
10459       return false;
10460 
10461    Builder bld(ctx->program, ctx->block);
10462    Operand values[4];
10463 
10464    for (unsigned i = 0; i < 4; ++i) {
10465       values[i] = Operand(colors[i]);
10466    }
10467 
10468    unsigned enabled_channels = 0;
10469    aco_opcode compr_op = aco_opcode::num_opcodes;
10470    bool compr = false;
10471    bool is_16bit = colors[0].regClass() == v2b;
10472    bool is_int8 = (info->color_is_int8 >> slot) & 1;
10473    bool is_int10 = (info->color_is_int10 >> slot) & 1;
10474    bool enable_mrt_output_nan_fixup = (ctx->options->enable_mrt_output_nan_fixup >> slot) & 1;
10475 
10476    /* Replace NaN by zero (only 32-bit) to fix game bugs if requested. */
10477    if (enable_mrt_output_nan_fixup && !is_16bit &&
10478        (col_format == V_028714_SPI_SHADER_32_R || col_format == V_028714_SPI_SHADER_32_GR ||
10479         col_format == V_028714_SPI_SHADER_32_AR || col_format == V_028714_SPI_SHADER_32_ABGR ||
10480         col_format == V_028714_SPI_SHADER_FP16_ABGR)) {
10481       for (unsigned i = 0; i < 4; i++) {
10482          Temp is_not_nan =
10483             bld.vopc(aco_opcode::v_cmp_eq_f32, bld.def(bld.lm), values[i], values[i]);
10484          values[i] = bld.vop2(aco_opcode::v_cndmask_b32, bld.def(v1), Operand::zero(), values[i],
10485                               is_not_nan);
10486       }
10487    }
10488 
10489    switch (col_format) {
10490    case V_028714_SPI_SHADER_32_R: enabled_channels = 1; break;
10491 
10492    case V_028714_SPI_SHADER_32_GR: enabled_channels = 0x3; break;
10493 
10494    case V_028714_SPI_SHADER_32_AR:
10495       if (ctx->options->gfx_level >= GFX10) {
10496          /* Special case: on GFX10, the outputs are different for 32_AR */
10497          enabled_channels = 0x3;
10498          values[1] = values[3];
10499          values[3] = Operand(v1);
10500       } else {
10501          enabled_channels = 0x9;
10502       }
10503       break;
10504 
10505    case V_028714_SPI_SHADER_FP16_ABGR:
10506       for (int i = 0; i < 2; i++) {
10507          if (is_16bit) {
10508             values[i] = bld.pseudo(aco_opcode::p_create_vector, bld.def(v1), values[i * 2],
10509                                    values[i * 2 + 1]);
10510          } else if (ctx->options->gfx_level == GFX8 || ctx->options->gfx_level == GFX9) {
10511             values[i] = bld.vop3(aco_opcode::v_cvt_pkrtz_f16_f32_e64, bld.def(v1), values[i * 2],
10512                                  values[i * 2 + 1]);
10513          } else {
10514             values[i] = bld.vop2(aco_opcode::v_cvt_pkrtz_f16_f32, bld.def(v1), values[i * 2],
10515                                  values[i * 2 + 1]);
10516          }
10517       }
10518       values[2] = Operand(v1);
10519       values[3] = Operand(v1);
10520       enabled_channels = 0xf;
10521       compr = true;
10522       break;
10523 
10524    case V_028714_SPI_SHADER_UNORM16_ABGR:
10525       if (is_16bit && ctx->options->gfx_level >= GFX9) {
10526          compr_op = aco_opcode::v_cvt_pknorm_u16_f16;
10527       } else {
10528          compr_op = aco_opcode::v_cvt_pknorm_u16_f32;
10529       }
10530       break;
10531 
10532    case V_028714_SPI_SHADER_SNORM16_ABGR:
10533       if (is_16bit && ctx->options->gfx_level >= GFX9) {
10534          compr_op = aco_opcode::v_cvt_pknorm_i16_f16;
10535       } else {
10536          compr_op = aco_opcode::v_cvt_pknorm_i16_f32;
10537       }
10538       break;
10539 
10540    case V_028714_SPI_SHADER_UINT16_ABGR:
10541       compr_op = aco_opcode::v_cvt_pk_u16_u32;
10542       if (is_int8 || is_int10) {
10543          /* clamp */
10544          uint32_t max_rgb = is_int8 ? 255 : is_int10 ? 1023 : 0;
10545 
10546          for (unsigned i = 0; i < 4; i++) {
10547             uint32_t max = i == 3 && is_int10 ? 3 : max_rgb;
10548 
10549             values[i] = bld.vop2(aco_opcode::v_min_u32, bld.def(v1), Operand::c32(max), values[i]);
10550          }
10551       } else if (is_16bit) {
10552          for (unsigned i = 0; i < 4; i++) {
10553             Temp tmp = convert_int(ctx, bld, values[i].getTemp(), 16, 32, false);
10554             values[i] = Operand(tmp);
10555          }
10556       }
10557       break;
10558 
10559    case V_028714_SPI_SHADER_SINT16_ABGR:
10560       compr_op = aco_opcode::v_cvt_pk_i16_i32;
10561       if (is_int8 || is_int10) {
10562          /* clamp */
10563          uint32_t max_rgb = is_int8 ? 127 : is_int10 ? 511 : 0;
10564          uint32_t min_rgb = is_int8 ? -128 : is_int10 ? -512 : 0;
10565 
10566          for (unsigned i = 0; i < 4; i++) {
10567             uint32_t max = i == 3 && is_int10 ? 1 : max_rgb;
10568             uint32_t min = i == 3 && is_int10 ? -2u : min_rgb;
10569 
10570             values[i] = bld.vop2(aco_opcode::v_min_i32, bld.def(v1), Operand::c32(max), values[i]);
10571             values[i] = bld.vop2(aco_opcode::v_max_i32, bld.def(v1), Operand::c32(min), values[i]);
10572          }
10573       } else if (is_16bit) {
10574          for (unsigned i = 0; i < 4; i++) {
10575             Temp tmp = convert_int(ctx, bld, values[i].getTemp(), 16, 32, true);
10576             values[i] = Operand(tmp);
10577          }
10578       }
10579       break;
10580 
10581    case V_028714_SPI_SHADER_32_ABGR: enabled_channels = 0xF; break;
10582 
10583    case V_028714_SPI_SHADER_ZERO:
10584    default: return false;
10585    }
10586 
10587    if (compr_op != aco_opcode::num_opcodes) {
10588       values[0] = bld.vop3(compr_op, bld.def(v1), values[0], values[1]);
10589       values[1] = bld.vop3(compr_op, bld.def(v1), values[2], values[3]);
10590       values[2] = Operand(v1);
10591       values[3] = Operand(v1);
10592       enabled_channels = 0xf;
10593       compr = true;
10594    } else if (!compr) {
10595       for (int i = 0; i < 4; i++)
10596          values[i] = enabled_channels & (1 << i) ? values[i] : Operand(v1);
10597    }
10598 
10599    if (ctx->program->gfx_level >= GFX11) {
10600       /* GFX11 doesn't use the COMPR flag for exports; the channel mask
10601        * should be 0x3 instead.
10602        */
10603       enabled_channels = compr ? 0x3 : enabled_channels;
10604       compr = false;
10605    }
10606 
10607    for (unsigned i = 0; i < 4; i++)
10608       mrt->out[i] = values[i];
10609    mrt->target = V_008DFC_SQ_EXP_MRT;
10610    mrt->enabled_channels = enabled_channels;
10611    mrt->compr = compr;
10612 
10613    return true;
10614 }
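
/* Worked example (illustrative): a slot with col_format == V_028714_SPI_SHADER_UINT16_ABGR
 * and color_is_int10 set. The 32-bit channels are first clamped (RGB to 1023, alpha
 * to 3), then packed pairwise:
 *
 *    values[0] = v_cvt_pk_u16_u32(r, g)
 *    values[1] = v_cvt_pk_u16_u32(b, a)
 *
 * and exported with enabled_channels = 0xf and the COMPR flag, or with channels 0x3
 * and no COMPR on GFX11.
 */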
10615 
10616 static void
10617 export_fs_mrtz(isel_context* ctx, const struct aco_ps_epilog_info* info, Temp depth, Temp stencil,
10618                Temp samplemask, Temp alpha)
10619 {
10620    Builder bld(ctx->program, ctx->block);
10621    unsigned enabled_channels = 0;
10622    bool compr = false;
10623    Operand values[4];
10624 
10625    for (unsigned i = 0; i < 4; ++i) {
10626       values[i] = Operand(v1);
10627    }
10628 
10629    const unsigned format =
10630       ac_get_spi_shader_z_format(depth.id(), stencil.id(), samplemask.id(), alpha.id());
10631    assert(format != V_028710_SPI_SHADER_ZERO);
10632 
10633    /* Both stencil and sample mask only need 16 bits. */
10634    if (format == V_028710_SPI_SHADER_UINT16_ABGR) {
10635       compr = ctx->program->gfx_level < GFX11; /* COMPR flag */
10636 
10637       if (stencil.id()) {
10638          /* Stencil should be in X[23:16]. */
10639          values[0] = bld.vop2(aco_opcode::v_lshlrev_b32, bld.def(v1), Operand::c32(16u), stencil);
10640          enabled_channels |= ctx->program->gfx_level >= GFX11 ? 0x1 : 0x3;
10641       }
10642 
10643       if (samplemask.id()) {
10644          /* SampleMask should be in Y[15:0]. */
10645          values[1] = Operand(samplemask);
10646          enabled_channels |= ctx->program->gfx_level >= GFX11 ? 0x2 : 0xc;
10647       }
10648    } else {
10649       if (depth.id()) {
10650          values[0] = Operand(depth);
10651          enabled_channels |= 0x1;
10652       }
10653 
10654       if (stencil.id()) {
10655          assert(format == V_028710_SPI_SHADER_32_GR || format == V_028710_SPI_SHADER_32_ABGR);
10656          values[1] = Operand(stencil);
10657          enabled_channels |= 0x2;
10658       }
10659 
10660       if (samplemask.id()) {
10661          assert(format == V_028710_SPI_SHADER_32_ABGR);
10662          values[2] = Operand(samplemask);
10663          enabled_channels |= 0x4;
10664       }
10665 
10666       if (alpha.id()) {
10667          assert(format == V_028710_SPI_SHADER_32_AR || format == V_028710_SPI_SHADER_32_ABGR);
10668          assert(ctx->program->gfx_level >= GFX11 || info->alpha_to_one);
10669          values[3] = Operand(alpha);
10670          enabled_channels |= 0x8;
10671       }
10672    }
10673 
10674    /* GFX6 (except OLAND and HAINAN) has a bug where it only looks at the X
10675     * writemask component.
10676     */
10677    if (ctx->options->gfx_level == GFX6 && ctx->options->family != CHIP_OLAND &&
10678        ctx->options->family != CHIP_HAINAN) {
10679       enabled_channels |= 0x1;
10680    }
10681 
10682    bld.exp(aco_opcode::exp, values[0], values[1], values[2], values[3], enabled_channels,
10683            V_008DFC_SQ_EXP_MRTZ, compr);
10684 }
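
/* Worked example (illustrative): a stencil-only export where the selected format is
 * V_028710_SPI_SHADER_UINT16_ABGR. The stencil value is shifted into X[23:16] with
 * v_lshlrev_b32, enabled_channels becomes 0x3 (0x1 on GFX11) and the export targets
 * MRTZ with the COMPR flag set on pre-GFX11 parts.
 */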
10685 
10686 static void
10687 create_fs_null_export(isel_context* ctx)
10688 {
10689    /* FS must always have exports.
10690     * So when there are none, we need to add a null export.
10691     */
10692 
10693    Builder bld(ctx->program, ctx->block);
10694    /* GFX11 doesn't support NULL exports, and MRT0 should be exported instead. */
10695    unsigned dest = ctx->options->gfx_level >= GFX11 ? V_008DFC_SQ_EXP_MRT : V_008DFC_SQ_EXP_NULL;
10696    bld.exp(aco_opcode::exp, Operand(v1), Operand(v1), Operand(v1), Operand(v1),
10697            /* enabled_mask */ 0, dest, /* compr */ false, /* done */ true, /* vm */ true);
10698 
10699    ctx->program->has_color_exports = true;
10700 }
10701 
10702 static void
10703 create_fs_jump_to_epilog(isel_context* ctx)
10704 {
10705    Builder bld(ctx->program, ctx->block);
10706    std::vector<Operand> exports;
10707    unsigned vgpr = 256; /* VGPR 0 */
10708 
10709    if (ctx->outputs.mask[FRAG_RESULT_DEPTH])
10710       exports.emplace_back(Operand(ctx->outputs.temps[FRAG_RESULT_DEPTH * 4u], PhysReg{vgpr++}));
10711 
10712    if (ctx->outputs.mask[FRAG_RESULT_STENCIL])
10713       exports.emplace_back(Operand(ctx->outputs.temps[FRAG_RESULT_STENCIL * 4u], PhysReg{vgpr++}));
10714 
10715    if (ctx->outputs.mask[FRAG_RESULT_SAMPLE_MASK])
10716       exports.emplace_back(
10717          Operand(ctx->outputs.temps[FRAG_RESULT_SAMPLE_MASK * 4u], PhysReg{vgpr++}));
10718 
10719    PhysReg exports_start(vgpr);
10720 
10721    for (unsigned slot = FRAG_RESULT_DATA0; slot < FRAG_RESULT_DATA7 + 1; ++slot) {
10722       unsigned color_index = slot - FRAG_RESULT_DATA0;
10723       unsigned color_type = (ctx->output_color_types >> (color_index * 2)) & 0x3;
10724       unsigned write_mask = ctx->outputs.mask[slot];
10725 
10726       if (!write_mask)
10727          continue;
10728 
10729       PhysReg color_start(exports_start.reg() + color_index * 4);
10730 
10731       for (unsigned i = 0; i < 4; i++) {
10732          if (!(write_mask & BITFIELD_BIT(i))) {
10733             exports.emplace_back(Operand(v1));
10734             continue;
10735          }
10736 
10737          PhysReg chan_reg = color_start.advance(i * 4u);
10738          Operand chan(ctx->outputs.temps[slot * 4u + i]);
10739 
10740          if (color_type == ACO_TYPE_FLOAT16) {
10741             chan = bld.vop1(aco_opcode::v_cvt_f32_f16, bld.def(v1), chan);
10742          } else if (color_type == ACO_TYPE_INT16 || color_type == ACO_TYPE_UINT16) {
10743             bool sign_ext = color_type == ACO_TYPE_INT16;
10744             Temp tmp = convert_int(ctx, bld, chan.getTemp(), 16, 32, sign_ext);
10745             chan = Operand(tmp);
10746          }
10747 
10748          chan.setPrecolored(chan_reg);
10749          exports.emplace_back(chan);
10750       }
10751    }
10752 
10753    Temp continue_pc = convert_pointer_to_64_bit(ctx, get_arg(ctx, ctx->program->info.epilog_pc));
10754 
10755    aco_ptr<Instruction> jump{
10756       create_instruction(aco_opcode::p_jump_to_epilog, Format::PSEUDO, 1 + exports.size(), 0)};
10757    jump->operands[0] = Operand(continue_pc);
10758    for (unsigned i = 0; i < exports.size(); i++) {
10759       jump->operands[i + 1] = exports[i];
10760    }
10761    ctx->block->instructions.emplace_back(std::move(jump));
10762 }
10763 
10764 PhysReg
10765 get_arg_reg(const struct ac_shader_args* args, struct ac_arg arg)
10766 {
10767    assert(arg.used);
10768    enum ac_arg_regfile file = args->args[arg.arg_index].file;
10769    unsigned reg = args->args[arg.arg_index].offset;
10770    return PhysReg(file == AC_ARG_SGPR ? reg : reg + 256);
10771 }
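
/* Worked example (illustrative): an SGPR argument at offset 5 maps to PhysReg(5),
 * while a VGPR argument at offset 2 maps to PhysReg(258), since VGPRs start at
 * physical register index 256 in this encoding.
 */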
10772 
10773 static Operand
10774 get_arg_for_end(isel_context* ctx, struct ac_arg arg)
10775 {
10776    return Operand(get_arg(ctx, arg), get_arg_reg(ctx->args, arg));
10777 }
10778 
10779 static void
10780 passthrough_all_args(isel_context* ctx, std::vector<Operand>& regs)
10781 {
10782    struct ac_arg arg;
10783    arg.used = true;
10784 
10785    for (arg.arg_index = 0; arg.arg_index < ctx->args->arg_count; arg.arg_index++)
10786       regs.emplace_back(get_arg_for_end(ctx, arg));
10787 }
10788 
10789 static void
10790 build_end_with_regs(isel_context* ctx, std::vector<Operand>& regs)
10791 {
10792    aco_ptr<Instruction> end{
10793       create_instruction(aco_opcode::p_end_with_regs, Format::PSEUDO, regs.size(), 0)};
10794 
10795    for (unsigned i = 0; i < regs.size(); i++)
10796       end->operands[i] = regs[i];
10797 
10798    ctx->block->instructions.emplace_back(std::move(end));
10799 
10800    ctx->block->kind |= block_kind_end_with_regs;
10801 }
10802 
10803 static void
10804 create_fs_end_for_epilog(isel_context* ctx)
10805 {
10806    Builder bld(ctx->program, ctx->block);
10807 
10808    std::vector<Operand> regs;
10809 
10810    regs.emplace_back(get_arg_for_end(ctx, ctx->program->info.ps.alpha_reference));
10811 
10812    unsigned vgpr = 256;
10813 
10814    for (unsigned slot = FRAG_RESULT_DATA0; slot <= FRAG_RESULT_DATA7; slot++) {
10815       unsigned index = slot - FRAG_RESULT_DATA0;
10816       unsigned type = (ctx->output_color_types >> (index * 2)) & 0x3;
10817       unsigned write_mask = ctx->outputs.mask[slot];
10818 
10819       if (!write_mask)
10820          continue;
10821 
10822       if (type == ACO_TYPE_ANY32) {
10823          u_foreach_bit (i, write_mask) {
10824             regs.emplace_back(Operand(ctx->outputs.temps[slot * 4 + i], PhysReg{vgpr + i}));
10825          }
10826       } else {
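         /* 16-bit color components: pack each pair of v2b channels into one 32-bit VGPR so the
          * epilog receives them as packed registers (layout assumed from the per-slot VGPR
          * accounting below). */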
10827          for (unsigned i = 0; i < 2; i++) {
10828             unsigned mask = (write_mask >> (i * 2)) & 0x3;
10829             if (!mask)
10830                continue;
10831 
10832             unsigned chan = slot * 4 + i * 2;
10833             Operand lo = mask & 0x1 ? Operand(ctx->outputs.temps[chan]) : Operand(v2b);
10834             Operand hi = mask & 0x2 ? Operand(ctx->outputs.temps[chan + 1]) : Operand(v2b);
10835 
10836             Temp dst = bld.pseudo(aco_opcode::p_create_vector, bld.def(v1), lo, hi);
10837             regs.emplace_back(Operand(dst, PhysReg{vgpr + i}));
10838          }
10839       }
10840       vgpr += 4;
10841    }
10842 
10843    if (ctx->outputs.mask[FRAG_RESULT_DEPTH])
10844       regs.emplace_back(Operand(ctx->outputs.temps[FRAG_RESULT_DEPTH * 4], PhysReg{vgpr++}));
10845 
10846    if (ctx->outputs.mask[FRAG_RESULT_STENCIL])
10847       regs.emplace_back(Operand(ctx->outputs.temps[FRAG_RESULT_STENCIL * 4], PhysReg{vgpr++}));
10848 
10849    if (ctx->outputs.mask[FRAG_RESULT_SAMPLE_MASK])
10850       regs.emplace_back(Operand(ctx->outputs.temps[FRAG_RESULT_SAMPLE_MASK * 4], PhysReg{vgpr++}));
10851 
10852    build_end_with_regs(ctx, regs);
10853 
10854    /* Finally, exit WQM mode. */
10855    ctx->program->needs_exact = true;
10856 }
10857 
10858 Instruction*
10859 add_startpgm(struct isel_context* ctx)
10860 {
10861    unsigned def_count = 0;
10862    for (unsigned i = 0; i < ctx->args->arg_count; i++) {
10863       if (ctx->args->args[i].skip)
10864          continue;
10865       unsigned align = MIN2(4, util_next_power_of_two(ctx->args->args[i].size));
10866       if (ctx->args->args[i].file == AC_ARG_SGPR && ctx->args->args[i].offset % align)
10867          def_count += ctx->args->args[i].size;
10868       else
10869          def_count++;
10870    }
10871 
10872    if (ctx->stage.hw == AC_HW_COMPUTE_SHADER && ctx->program->gfx_level >= GFX12)
10873       def_count += 3;
10874 
10875    Instruction* startpgm = create_instruction(aco_opcode::p_startpgm, Format::PSEUDO, 0, def_count);
10876    ctx->block->instructions.emplace_back(startpgm);
10877    for (unsigned i = 0, arg = 0; i < ctx->args->arg_count; i++) {
10878       if (ctx->args->args[i].skip)
10879          continue;
10880 
10881       enum ac_arg_regfile file = ctx->args->args[i].file;
10882       unsigned size = ctx->args->args[i].size;
10883       unsigned reg = ctx->args->args[i].offset;
10884       RegClass type = RegClass(file == AC_ARG_SGPR ? RegType::sgpr : RegType::vgpr, size);
10885 
10886       if (file == AC_ARG_SGPR && reg % MIN2(4, util_next_power_of_two(size))) {
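         /* The SGPR argument is not aligned to its size, so (presumably to satisfy alignment
          * requirements on multi-dword SGPR temporaries) define each dword separately and
          * reassemble them into one vector. */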
10887          Temp elems[16];
10888          for (unsigned j = 0; j < size; j++) {
10889             elems[j] = ctx->program->allocateTmp(s1);
10890             startpgm->definitions[arg++] = Definition(elems[j], PhysReg{reg + j});
10891          }
10892          ctx->arg_temps[i] = create_vec_from_array(ctx, elems, size, RegType::sgpr, 4);
10893       } else {
10894          Temp dst = ctx->program->allocateTmp(type);
10895          Definition def(dst);
10896          def.setPrecolored(PhysReg{file == AC_ARG_SGPR ? reg : reg + 256});
10897          ctx->arg_temps[i] = dst;
10898          startpgm->definitions[arg++] = def;
10899 
10900          if (ctx->args->args[i].pending_vmem) {
10901             assert(file == AC_ARG_VGPR);
10902             ctx->program->args_pending_vmem.push_back(def);
10903          }
10904       }
10905    }
10906 
10907    if (ctx->program->gfx_level >= GFX12 && ctx->stage.hw == AC_HW_COMPUTE_SHADER) {
10908       Temp idx = ctx->program->allocateTmp(s1);
10909       Temp idy = ctx->program->allocateTmp(s1);
10910       ctx->ttmp8 = ctx->program->allocateTmp(s1);
10911       startpgm->definitions[def_count - 3] = Definition(idx);
10912       startpgm->definitions[def_count - 3].setPrecolored(PhysReg(108 + 9 /*ttmp9*/));
10913       startpgm->definitions[def_count - 2] = Definition(ctx->ttmp8);
10914       startpgm->definitions[def_count - 2].setPrecolored(PhysReg(108 + 8 /*ttmp8*/));
10915       startpgm->definitions[def_count - 1] = Definition(idy);
10916       startpgm->definitions[def_count - 1].setPrecolored(PhysReg(108 + 7 /*ttmp7*/));
10917       ctx->workgroup_id[0] = Operand(idx);
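      /* The 16-bit p_extract calls below assume ttmp7 packs the Y workgroup ID in bits [15:0]
       * and the Z workgroup ID in bits [31:16]. */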
10918       if (ctx->args->workgroup_ids[2].used) {
10919          Builder bld(ctx->program, ctx->block);
10920          ctx->workgroup_id[1] =
10921             bld.pseudo(aco_opcode::p_extract, bld.def(s1), bld.def(s1, scc), idy, Operand::zero(),
10922                        Operand::c32(16u), Operand::zero());
10923          ctx->workgroup_id[2] =
10924             bld.pseudo(aco_opcode::p_extract, bld.def(s1), bld.def(s1, scc), idy, Operand::c32(1u),
10925                        Operand::c32(16u), Operand::zero());
10926       } else {
10927          ctx->workgroup_id[1] = Operand(idy);
10928          ctx->workgroup_id[2] = Operand::zero();
10929       }
10930    } else if (ctx->stage.hw == AC_HW_COMPUTE_SHADER) {
10931       const struct ac_arg* ids = ctx->args->workgroup_ids;
10932       for (unsigned i = 0; i < 3; i++)
10933          ctx->workgroup_id[i] = ids[i].used ? Operand(get_arg(ctx, ids[i])) : Operand::zero();
10934    }
10935 
10936    /* epilog has no scratch */
10937    if (ctx->args->scratch_offset.used) {
10938       if (ctx->program->gfx_level < GFX9) {
10939          /* Stash these in the program so that they can be accessed later when
10940           * handling spilling.
10941           */
10942          if (ctx->args->ring_offsets.used)
10943             ctx->program->private_segment_buffer = get_arg(ctx, ctx->args->ring_offsets);
10944 
10945          ctx->program->scratch_offset = get_arg(ctx, ctx->args->scratch_offset);
10946       } else if (ctx->program->gfx_level <= GFX10_3 && ctx->program->stage != raytracing_cs) {
10947          /* Manually initialize scratch. For RT stages scratch initialization is done in the prolog.
10948           */
10949          Operand scratch_addr = ctx->args->ring_offsets.used
10950                                    ? Operand(get_arg(ctx, ctx->args->ring_offsets))
10951                                    : Operand(s2);
10952 
10953          Builder bld(ctx->program, ctx->block);
10954          bld.pseudo(aco_opcode::p_init_scratch, bld.def(s2), bld.def(s1, scc), scratch_addr,
10955                     get_arg(ctx, ctx->args->scratch_offset));
10956       }
10957    }
10958 
10959    return startpgm;
10960 }
10961 
10962 void
10963 split_arguments(isel_context* ctx, Instruction* startpgm)
10964 {
10965    /* Split all arguments except for the first (ring_offsets) and the last
10966     * (exec) so that the dead channels don't stay live throughout the program.
10967     */
10968    for (int i = 1; i < startpgm->definitions.size(); i++) {
10969       if (startpgm->definitions[i].regClass().size() > 1) {
10970          emit_split_vector(ctx, startpgm->definitions[i].getTemp(),
10971                            startpgm->definitions[i].regClass().size());
10972       }
10973    }
10974 }
10975 
10976 void
10977 setup_fp_mode(isel_context* ctx, nir_shader* shader)
10978 {
10979    Program* program = ctx->program;
10980 
10981    unsigned float_controls = shader->info.float_controls_execution_mode;
10982 
10983    program->next_fp_mode.must_flush_denorms32 =
10984       float_controls & FLOAT_CONTROLS_DENORM_FLUSH_TO_ZERO_FP32;
10985    program->next_fp_mode.must_flush_denorms16_64 =
10986       float_controls &
10987       (FLOAT_CONTROLS_DENORM_FLUSH_TO_ZERO_FP16 | FLOAT_CONTROLS_DENORM_FLUSH_TO_ZERO_FP64);
10988 
10989    program->next_fp_mode.care_about_round32 =
10990       float_controls &
10991       (FLOAT_CONTROLS_ROUNDING_MODE_RTZ_FP32 | FLOAT_CONTROLS_ROUNDING_MODE_RTE_FP32);
10992 
10993    program->next_fp_mode.care_about_round16_64 =
10994       float_controls &
10995       (FLOAT_CONTROLS_ROUNDING_MODE_RTZ_FP16 | FLOAT_CONTROLS_ROUNDING_MODE_RTZ_FP64 |
10996        FLOAT_CONTROLS_ROUNDING_MODE_RTE_FP16 | FLOAT_CONTROLS_ROUNDING_MODE_RTE_FP64);
10997 
10998    /* default to preserving fp16 and fp64 denorms, since it's free for fp64 and
10999     * the precision seems needed for Wolfenstein: Youngblood to render correctly */
11000    if (program->next_fp_mode.must_flush_denorms16_64)
11001       program->next_fp_mode.denorm16_64 = 0;
11002    else
11003       program->next_fp_mode.denorm16_64 = fp_denorm_keep;
11004 
11005    /* preserving fp32 denorms is expensive, so only do it if asked */
11006    if (float_controls & FLOAT_CONTROLS_DENORM_PRESERVE_FP32)
11007       program->next_fp_mode.denorm32 = fp_denorm_keep;
11008    else
11009       program->next_fp_mode.denorm32 = 0;
11010 
11011    if (float_controls & FLOAT_CONTROLS_ROUNDING_MODE_RTZ_FP32)
11012       program->next_fp_mode.round32 = fp_round_tz;
11013    else
11014       program->next_fp_mode.round32 = fp_round_ne;
11015 
11016    if (float_controls &
11017        (FLOAT_CONTROLS_ROUNDING_MODE_RTZ_FP16 | FLOAT_CONTROLS_ROUNDING_MODE_RTZ_FP64))
11018       program->next_fp_mode.round16_64 = fp_round_tz;
11019    else
11020       program->next_fp_mode.round16_64 = fp_round_ne;
11021 
11022    ctx->block->fp_mode = program->next_fp_mode;
11023 }
11024 
11025 void
11026 cleanup_cfg(Program* program)
11027 {
11028    /* create linear_succs/logical_succs */
11029    for (Block& BB : program->blocks) {
11030       for (unsigned idx : BB.linear_preds)
11031          program->blocks[idx].linear_succs.emplace_back(BB.index);
11032       for (unsigned idx : BB.logical_preds)
11033          program->blocks[idx].logical_succs.emplace_back(BB.index);
11034    }
11035 }
11036 
11037 void
11038 finish_program(isel_context* ctx)
11039 {
11040    cleanup_cfg(ctx->program);
11041 
11042    /* Insert a single p_end_wqm instruction after the last derivative calculation */
11043    if (ctx->program->stage == fragment_fs && ctx->program->needs_wqm && ctx->program->needs_exact) {
11044       /* Find the next block at the top level of the CFG. */
11045       while (!(ctx->program->blocks[ctx->wqm_block_idx].kind & block_kind_top_level)) {
11046          ctx->wqm_block_idx++;
11047          ctx->wqm_instruction_idx = 0;
11048       }
11049 
11050       std::vector<aco_ptr<Instruction>>* instrs =
11051          &ctx->program->blocks[ctx->wqm_block_idx].instructions;
11052       auto it = instrs->begin() + ctx->wqm_instruction_idx;
11053 
11054       /* Delay the transition to Exact to help optimizations and scheduling */
11055       while (it != instrs->end()) {
11056          aco_ptr<Instruction>& instr = *it;
11057          /* End WQM before: */
11058          if (instr->isVMEM() || instr->isFlatLike() || instr->isDS() || instr->isEXP() ||
11059              instr->opcode == aco_opcode::p_dual_src_export_gfx11 ||
11060              instr->opcode == aco_opcode::p_jump_to_epilog ||
11061              instr->opcode == aco_opcode::p_logical_start)
11062             break;
11063 
11064          ++it;
11065 
11066          /* End WQM after: */
11067          if (instr->opcode == aco_opcode::p_logical_end ||
11068              instr->opcode == aco_opcode::p_discard_if ||
11069              instr->opcode == aco_opcode::p_demote_to_helper ||
11070              instr->opcode == aco_opcode::p_end_with_regs)
11071             break;
11072       }
11073 
11074       Builder bld(ctx->program);
11075       bld.reset(instrs, it);
11076       bld.pseudo(aco_opcode::p_end_wqm);
11077    }
11078 }
11079 
11080 Temp
11081 lanecount_to_mask(isel_context* ctx, Temp count, unsigned bit_offset)
11082 {
11083    assert(count.regClass() == s1);
11084 
11085    Builder bld(ctx->program, ctx->block);
11086 
11087    /* We could optimize other cases, but they are unused at the moment. */
11088    if (bit_offset != 0 && bit_offset != 8) {
11089       assert(bit_offset < 32);
11090       count = bld.sop2(aco_opcode::s_lshr_b32, bld.def(s1), bld.def(s1, scc), count,
11091                        Operand::c32(bit_offset));
11092       bit_offset = 0;
11093    }
11094 
11095    if (ctx->program->wave_size == 32 && bit_offset == 0) {
11096       /* We use s_bfm_b64 (not _b32), which handles a count of 32, but we then need to extract the
11097        * lower half of the 64-bit result. A count of 64 doesn't work because it only reads 6 bits. */
11098       Temp mask = bld.sop2(aco_opcode::s_bfm_b64, bld.def(s2), count, Operand::zero());
11099       return emit_extract_vector(ctx, mask, 0, bld.lm);
11100    } else {
11101       /* s_bfe (both u32 and u64) uses 7 bits for the size, but it needs them in the high word.
11102        * The low word is used for the offset, which has to be zero for our use case.
11103        */
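      /* Illustrative example: with bit_offset == 0 and count == 3, the shifted count is 3 << 16,
       * so s_bfe_u32(0xffffffff, 3 << 16) extracts 3 bits starting at offset 0 and yields 0b111,
       * i.e. a mask of the first 3 lanes. */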
11104       if (bit_offset == 0 && ctx->program->gfx_level >= GFX9) {
11105          /* Avoid writing scc for better scheduling. */
11106          count = bld.sop2(aco_opcode::s_pack_ll_b32_b16, bld.def(s1), Operand::c32(0), count);
11107       } else {
11108          count = bld.sop2(aco_opcode::s_lshl_b32, bld.def(s1), bld.def(s1, scc), count,
11109                           Operand::c32(16 - bit_offset));
11110       }
11111 
11112       if (ctx->program->wave_size == 32) {
11113          return bld.sop2(aco_opcode::s_bfe_u32, bld.def(bld.lm), bld.def(s1, scc), Operand::c32(-1),
11114                          count);
11115       } else {
11116          return bld.sop2(aco_opcode::s_bfe_u64, bld.def(bld.lm), bld.def(s1, scc),
11117                          Operand::c64(-1ll), count);
11118       }
11119    }
11120 }
11121 
11122 Temp
11123 merged_wave_info_to_mask(isel_context* ctx, unsigned i)
11124 {
11125    /* lanecount_to_mask() only cares about s0.byte[i].[6:0]
11126     * so we need neither s_bfe nor s_and here.
11127     */
11128    Temp count = get_arg(ctx, ctx->args->merged_wave_info);
11129 
11130    return lanecount_to_mask(ctx, count, i * 8u);
11131 }
11132 
11133 static void
11134 insert_rt_jump_next(isel_context& ctx, const struct ac_shader_args* args)
11135 {
11136    unsigned src_count = 0;
11137    for (unsigned i = 0; i < ctx.args->arg_count; i++)
11138       src_count += !!BITSET_TEST(ctx.output_args, i);
11139 
11140    Instruction* ret = create_instruction(aco_opcode::p_return, Format::PSEUDO, src_count, 0);
11141    ctx.block->instructions.emplace_back(ret);
11142 
11143    src_count = 0;
11144    for (unsigned i = 0; i < ctx.args->arg_count; i++) {
11145       if (!BITSET_TEST(ctx.output_args, i))
11146          continue;
11147 
11148       enum ac_arg_regfile file = ctx.args->args[i].file;
11149       unsigned size = ctx.args->args[i].size;
11150       unsigned reg = ctx.args->args[i].offset + (file == AC_ARG_SGPR ? 0 : 256);
11151       RegClass type = RegClass(file == AC_ARG_SGPR ? RegType::sgpr : RegType::vgpr, size);
11152       Operand op = ctx.arg_temps[i].id() ? Operand(ctx.arg_temps[i], PhysReg{reg})
11153                                          : Operand(PhysReg{reg}, type);
11154       ret->operands[src_count] = op;
11155       src_count++;
11156    }
11157 
11158    Builder bld(ctx.program, ctx.block);
11159    bld.sop1(aco_opcode::s_setpc_b64, get_arg(&ctx, ctx.args->rt.uniform_shader_addr));
11160 }
11161 
11162 void
11163 select_program_rt(isel_context& ctx, unsigned shader_count, struct nir_shader* const* shaders,
11164                   const struct ac_shader_args* args)
11165 {
11166    for (unsigned i = 0; i < shader_count; i++) {
11167       if (i) {
11168          ctx.block = ctx.program->create_and_insert_block();
11169          ctx.block->kind = block_kind_top_level | block_kind_resume;
11170       }
11171 
11172       nir_shader* nir = shaders[i];
11173       init_context(&ctx, nir);
11174       setup_fp_mode(&ctx, nir);
11175 
11176       Instruction* startpgm = add_startpgm(&ctx);
11177       append_logical_start(ctx.block);
11178       split_arguments(&ctx, startpgm);
11179       visit_cf_list(&ctx, &nir_shader_get_entrypoint(nir)->body);
11180       append_logical_end(ctx.block);
11181       ctx.block->kind |= block_kind_uniform;
11182 
11183       /* Fix output registers and jump to next shader. We can skip this when dealing with a raygen
11184        * shader without shader calls.
11185        */
11186       if (shader_count > 1 || shaders[i]->info.stage != MESA_SHADER_RAYGEN)
11187          insert_rt_jump_next(ctx, args);
11188 
11189       cleanup_context(&ctx);
11190    }
11191 
11192    ctx.program->config->float_mode = ctx.program->blocks[0].fp_mode.val;
11193    finish_program(&ctx);
11194 }
11195 
11196 void
11197 pops_await_overlapped_waves(isel_context* ctx)
11198 {
11199    ctx->program->has_pops_overlapped_waves_wait = true;
11200 
11201    Builder bld(ctx->program, ctx->block);
11202 
11203    if (ctx->program->gfx_level >= GFX11) {
11204       /* GFX11+ - waiting for the export from the overlapped waves.
11205        * Await the export_ready event (bit wait_event_imm_dont_wait_export_ready clear).
11206        */
11207       bld.sopp(aco_opcode::s_wait_event,
11208                ctx->program->gfx_level >= GFX12 ? wait_event_imm_wait_export_ready_gfx12 : 0);
11209       return;
11210    }
11211 
11212    /* Pre-GFX11 - sleep loop polling the exiting wave ID. */
11213 
11214    const Temp collision = get_arg(ctx, ctx->args->pops_collision_wave_id);
11215 
11216    /* Check if there's an overlap in the current wave - otherwise, the wait may result in a hang. */
11217    const Temp did_overlap =
11218       bld.sopc(aco_opcode::s_bitcmp1_b32, bld.def(s1, scc), collision, Operand::c32(31));
11219    if_context did_overlap_if_context;
11220    begin_uniform_if_then(ctx, &did_overlap_if_context, did_overlap);
11221    bld.reset(ctx->block);
11222 
11223    /* Set the packer register - after this, pops_exiting_wave_id can be polled. */
11224    if (ctx->program->gfx_level >= GFX10) {
11225       /* 2 packer ID bits on GFX10-10.3. */
11226       const Temp packer_id = bld.sop2(aco_opcode::s_bfe_u32, bld.def(s1), bld.def(s1, scc),
11227                                       collision, Operand::c32(0x2001c));
11228       /* POPS_PACKER register: bit 0 - POPS enabled for this wave, bits 2:1 - packer ID. */
11229       const Temp packer_id_hwreg_bits = bld.sop2(aco_opcode::s_lshl1_add_u32, bld.def(s1),
11230                                                  bld.def(s1, scc), packer_id, Operand::c32(1));
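      /* s_setreg_b32 immediate encoding (as assumed here): hwreg ID in bits [5:0], field offset in
       * bits [10:6], field size minus one in bits [15:11]; hwreg 25 is the POPS_PACKER register on
       * these generations. */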
11231       bld.sopk(aco_opcode::s_setreg_b32, packer_id_hwreg_bits, ((3 - 1) << 11) | 25);
11232    } else {
11233       /* 1 packer ID bit on GFX9. */
11234       const Temp packer_id = bld.sop2(aco_opcode::s_bfe_u32, bld.def(s1), bld.def(s1, scc),
11235                                       collision, Operand::c32(0x1001c));
11236       /* MODE register: bit 24 - wave is associated with packer 0, bit 25 - with packer 1.
11237        * Packer index to packer bits: 0 to 0b01, 1 to 0b10.
11238        */
11239       const Temp packer_id_hwreg_bits =
11240          bld.sop2(aco_opcode::s_add_i32, bld.def(s1), bld.def(s1, scc), packer_id, Operand::c32(1));
11241       bld.sopk(aco_opcode::s_setreg_b32, packer_id_hwreg_bits, ((2 - 1) << 11) | (24 << 6) | 1);
11242    }
11243 
11244    Temp newest_overlapped_wave_id = bld.sop2(aco_opcode::s_bfe_u32, bld.def(s1), bld.def(s1, scc),
11245                                              collision, Operand::c32(0xa0010));
11246    if (ctx->program->gfx_level < GFX10) {
11247       /* On GFX9, the newest overlapped wave ID value passed to the shader is smaller than the
11248        * actual wave ID by 1 in case of wraparound.
11249        */
11250       const Temp current_wave_id = bld.sop2(aco_opcode::s_and_b32, bld.def(s1), bld.def(s1, scc),
11251                                             collision, Operand::c32(0x3ff));
11252       const Temp newest_overlapped_wave_id_wrapped = bld.sopc(
11253          aco_opcode::s_cmp_gt_u32, bld.def(s1, scc), newest_overlapped_wave_id, current_wave_id);
11254       newest_overlapped_wave_id =
11255          bld.sop2(aco_opcode::s_add_i32, bld.def(s1), bld.def(s1, scc), newest_overlapped_wave_id,
11256                   newest_overlapped_wave_id_wrapped);
11257    }
11258 
11259    /* The wave IDs are the low 10 bits of a monotonically increasing wave counter.
11260     * The overlapped and the exiting wave IDs can't be larger than the current wave ID, and they are
11261     * no more than 1023 values behind the current wave ID.
11262     * Remap the overlapped and the exiting wave IDs from wrapping to monotonic so an unsigned
11263     * comparison can be used: the wave `current - 1023` becomes 0, it's followed by a piece growing
11264     * away from 0, then a piece increasing until UINT32_MAX, and the current wave is UINT32_MAX.
11265     * To do that, subtract `current - 1023`, which with wrapping arithmetic is (current + 1), and
11266     * `a - (b + 1)` is `a + ~b`.
11267     * Note that if the 10-bit current wave ID is 1023 (thus 1024 will be subtracted), the wave
11268     * `current - 1023` will become `UINT32_MAX - 1023` rather than 0, but all the possible wave IDs
11269     * will still grow monotonically in the 32-bit value, and the unsigned comparison will behave as
11270     * expected.
11271     */
11272    const Temp wave_id_offset = bld.sop2(aco_opcode::s_nand_b32, bld.def(s1), bld.def(s1, scc),
11273                                         collision, Operand::c32(0x3ff));
11274    newest_overlapped_wave_id = bld.sop2(aco_opcode::s_add_i32, bld.def(s1), bld.def(s1, scc),
11275                                         newest_overlapped_wave_id, wave_id_offset);
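   /* Illustrative example of the remapping above: if the current 10-bit wave ID is 5, then
    * wave_id_offset == ~5 == 0xfffffffa; an overlapped ID equal to the current wave (5) maps to
    * 0xffffffff, while an older wrapped ID such as 1022 (current - 7) maps to 1016, preserving
    * the ordering under unsigned comparison. */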
11276 
11277    /* Await the overlapped waves. */
11278 
11279    loop_context wait_loop_context;
11280    begin_loop(ctx, &wait_loop_context);
11281    bld.reset(ctx->block);
11282 
11283    const Temp exiting_wave_id = bld.pseudo(aco_opcode::p_pops_gfx9_add_exiting_wave_id, bld.def(s1),
11284                                            bld.def(s1, scc), wave_id_offset);
11285    /* If the exiting (not exited) wave ID is larger than the newest overlapped wave ID (after
11286     * remapping both to monotonically increasing unsigned integers), the newest overlapped wave has
11287     * exited the ordered section.
11288     */
11289    const Temp newest_overlapped_wave_exited = bld.sopc(aco_opcode::s_cmp_lt_u32, bld.def(s1, scc),
11290                                                        newest_overlapped_wave_id, exiting_wave_id);
11291    if_context newest_overlapped_wave_exited_if_context;
11292    begin_uniform_if_then(ctx, &newest_overlapped_wave_exited_if_context,
11293                          newest_overlapped_wave_exited);
11294    emit_loop_break(ctx);
11295    begin_uniform_if_else(ctx, &newest_overlapped_wave_exited_if_context);
11296    end_uniform_if(ctx, &newest_overlapped_wave_exited_if_context);
11297    bld.reset(ctx->block);
11298 
11299    /* Sleep before rechecking to let overlapped waves run for some time. */
11300    bld.sopp(aco_opcode::s_sleep, ctx->program->gfx_level >= GFX10 ? UINT16_MAX : 3);
11301 
11302    end_loop(ctx, &wait_loop_context);
11303    bld.reset(ctx->block);
11304 
11305    /* Indicate the wait has been done to subsequent compilation stages. */
11306    bld.pseudo(aco_opcode::p_pops_gfx9_overlapped_wave_wait_done);
11307 
11308    begin_uniform_if_else(ctx, &did_overlap_if_context);
11309    end_uniform_if(ctx, &did_overlap_if_context);
11310    bld.reset(ctx->block);
11311 }
11312 
11313 static void
11314 create_merged_jump_to_epilog(isel_context* ctx)
11315 {
11316    Builder bld(ctx->program, ctx->block);
11317    std::vector<Operand> regs;
11318 
11319    for (unsigned i = 0; i < ctx->args->arg_count; i++) {
11320       if (!ctx->args->args[i].preserved)
11321          continue;
11322 
11323       const enum ac_arg_regfile file = ctx->args->args[i].file;
11324       const unsigned reg = ctx->args->args[i].offset;
11325 
11326       Operand op(ctx->arg_temps[i]);
11327       op.setPrecolored(PhysReg{file == AC_ARG_SGPR ? reg : reg + 256});
11328       regs.emplace_back(op);
11329    }
11330 
11331    Temp continue_pc =
11332       convert_pointer_to_64_bit(ctx, get_arg(ctx, ctx->program->info.next_stage_pc));
11333 
11334    aco_ptr<Instruction> jump{
11335       create_instruction(aco_opcode::p_jump_to_epilog, Format::PSEUDO, 1 + regs.size(), 0)};
11336    jump->operands[0] = Operand(continue_pc);
11337    for (unsigned i = 0; i < regs.size(); i++) {
11338       jump->operands[i + 1] = regs[i];
11339    }
11340    ctx->block->instructions.emplace_back(std::move(jump));
11341 }
11342 
11343 static void
11344 create_end_for_merged_shader(isel_context* ctx)
11345 {
11346    std::vector<Operand> regs;
11347 
11348    unsigned max_args;
11349    if (ctx->stage.sw == SWStage::VS) {
11350       assert(ctx->args->vertex_id.used);
11351       max_args = ctx->args->vertex_id.arg_index;
11352    } else {
11353       assert(ctx->stage.sw == SWStage::TES);
11354       assert(ctx->args->tes_u.used);
11355       max_args = ctx->args->tes_u.arg_index;
11356    }
11357 
11358    struct ac_arg arg;
11359    arg.used = true;
11360 
11361    for (arg.arg_index = 0; arg.arg_index < max_args; arg.arg_index++)
11362       regs.emplace_back(get_arg_for_end(ctx, arg));
11363 
11364    build_end_with_regs(ctx, regs);
11365 }
11366 
11367 void
11368 select_shader(isel_context& ctx, nir_shader* nir, const bool need_startpgm, const bool need_endpgm,
11369               const bool need_barrier, if_context* ic_merged_wave_info,
11370               const bool check_merged_wave_info, const bool endif_merged_wave_info)
11371 {
11372    init_context(&ctx, nir);
11373    setup_fp_mode(&ctx, nir);
11374 
11375    Program* program = ctx.program;
11376 
11377    if (need_startpgm) {
11378       /* Needs to be after init_context() for FS. */
11379       Instruction* startpgm = add_startpgm(&ctx);
11380 
11381       if (!program->info.vs.has_prolog &&
11382           (program->stage.has(SWStage::VS) || program->stage.has(SWStage::TES))) {
11383          Builder(ctx.program, ctx.block).sopp(aco_opcode::s_setprio, 0x3u);
11384       }
11385 
11386       append_logical_start(ctx.block);
11387       split_arguments(&ctx, startpgm);
11388    }
11389 
11390    if (program->gfx_level == GFX10 && program->stage.hw == AC_HW_NEXT_GEN_GEOMETRY_SHADER &&
11391        !program->stage.has(SWStage::GS)) {
11392       /* Workaround for Navi1x HW bug to ensure that all NGG waves launch before
11393        * s_sendmsg(GS_ALLOC_REQ).
11394        */
11395       Builder(ctx.program, ctx.block).sopp(aco_opcode::s_barrier, 0u);
11396    }
11397 
11398    if (check_merged_wave_info) {
11399       const unsigned i =
11400          nir->info.stage == MESA_SHADER_VERTEX || nir->info.stage == MESA_SHADER_TESS_EVAL ? 0 : 1;
11401       const Temp cond = merged_wave_info_to_mask(&ctx, i);
11402       begin_divergent_if_then(&ctx, ic_merged_wave_info, cond);
11403    }
11404 
11405    if (need_barrier) {
11406       const sync_scope scope = ctx.stage == vertex_tess_control_hs && ctx.tcs_in_out_eq &&
11407                                      program->wave_size % nir->info.tess.tcs_vertices_out == 0
11408                                   ? scope_subgroup
11409                                   : scope_workgroup;
11410 
11411       Builder(ctx.program, ctx.block)
11412          .barrier(aco_opcode::p_barrier, memory_sync_info(storage_shared, semantic_acqrel, scope),
11413                   scope);
11414    }
11415 
11416    nir_function_impl* func = nir_shader_get_entrypoint(nir);
11417    visit_cf_list(&ctx, &func->body);
11418 
11419    if (ctx.program->info.ps.has_epilog) {
11420       if (ctx.stage == fragment_fs) {
11421          if (ctx.options->is_opengl)
11422             create_fs_end_for_epilog(&ctx);
11423          else
11424             create_fs_jump_to_epilog(&ctx);
11425 
11426          /* FS epilogs always have at least one color/null export. */
11427          ctx.program->has_color_exports = true;
11428       }
11429    }
11430 
11431    if (endif_merged_wave_info) {
11432       begin_divergent_if_else(&ctx, ic_merged_wave_info);
11433       end_divergent_if(&ctx, ic_merged_wave_info);
11434    }
11435 
11436    bool is_first_stage_of_merged_shader = false;
11437 
11438    if (ctx.program->info.merged_shader_compiled_separately &&
11439        (ctx.stage.sw == SWStage::VS || ctx.stage.sw == SWStage::TES)) {
11440       assert(program->gfx_level >= GFX9);
11441       if (ctx.options->is_opengl)
11442          create_end_for_merged_shader(&ctx);
11443       else
11444          create_merged_jump_to_epilog(&ctx);
11445 
11446       is_first_stage_of_merged_shader = true;
11447    }
11448 
11449    cleanup_context(&ctx);
11450 
11451    if (need_endpgm) {
11452       program->config->float_mode = program->blocks[0].fp_mode.val;
11453 
11454       append_logical_end(ctx.block);
11455       ctx.block->kind |= block_kind_uniform;
11456 
11457       if ((!program->info.ps.has_epilog && !is_first_stage_of_merged_shader) ||
11458           (nir->info.stage == MESA_SHADER_TESS_CTRL && program->gfx_level >= GFX9)) {
11459          Builder(program, ctx.block).sopp(aco_opcode::s_endpgm);
11460       }
11461 
11462       finish_program(&ctx);
11463    }
11464 }
11465 
11466 void
11467 select_program_merged(isel_context& ctx, const unsigned shader_count, nir_shader* const* shaders)
11468 {
11469    if_context ic_merged_wave_info;
11470    const bool ngg_gs = ctx.stage.hw == AC_HW_NEXT_GEN_GEOMETRY_SHADER && ctx.stage.has(SWStage::GS);
11471 
11472    for (unsigned i = 0; i < shader_count; i++) {
11473       nir_shader* nir = shaders[i];
11474 
11475       /* We always need to insert p_startpgm at the beginning of the first shader.  */
11476       const bool need_startpgm = i == 0;
11477 
11478       /* Need to handle program end for last shader stage. */
11479       const bool need_endpgm = i == shader_count - 1;
11480 
11481       /* In a merged VS+TCS HS, the VS implementation can be completely empty. */
11482       nir_function_impl* func = nir_shader_get_entrypoint(nir);
11483       const bool empty_shader =
11484          nir_cf_list_is_empty_block(&func->body) &&
11485          ((nir->info.stage == MESA_SHADER_VERTEX &&
11486            (ctx.stage == vertex_tess_control_hs || ctx.stage == vertex_geometry_gs)) ||
11487           (nir->info.stage == MESA_SHADER_TESS_EVAL && ctx.stage == tess_eval_geometry_gs));
11488 
11489       /* See if we need to emit a check of the merged wave info SGPR. */
11490       const bool check_merged_wave_info =
11491          ctx.tcs_in_out_eq ? i == 0 : (shader_count >= 2 && !empty_shader && !(ngg_gs && i == 1));
11492       const bool endif_merged_wave_info =
11493          ctx.tcs_in_out_eq ? i == 1 : (check_merged_wave_info && !(ngg_gs && i == 1));
11494 
11495       /* Skip s_barrier from TCS when VS outputs are not stored in the LDS. */
11496       const bool tcs_skip_barrier =
11497          ctx.stage == vertex_tess_control_hs && !ctx.any_tcs_inputs_via_lds;
11498 
11499       /* A barrier is usually needed at the beginning of the second shader, with exceptions. */
11500       const bool need_barrier = i != 0 && !ngg_gs && !tcs_skip_barrier;
11501 
11502       select_shader(ctx, nir, need_startpgm, need_endpgm, need_barrier, &ic_merged_wave_info,
11503                     check_merged_wave_info, endif_merged_wave_info);
11504 
11505       if (i == 0 && ctx.stage == vertex_tess_control_hs && ctx.tcs_in_out_eq) {
11506          /* Special handling when TCS input and output patch size is the same.
11507           * Outputs of the previous stage are inputs to the next stage.
11508           */
11509          ctx.inputs = ctx.outputs;
11510          ctx.outputs = shader_io_state();
11511       }
11512    }
11513 }
11514 
11515 void
11516 emit_polygon_stipple(isel_context* ctx, const struct aco_ps_prolog_info* finfo)
11517 {
11518    Builder bld(ctx->program, ctx->block);
11519 
11520    /* Use the fixed-point gl_FragCoord input.
11521     * Since the stipple pattern is 32x32 and it repeats, just get 5 bits
11522     * per coordinate to get the repeating effect.
11523     */
11524    Temp pos_fixed_pt = get_arg(ctx, ctx->args->pos_fixed_pt);
11525    Temp addr0 = bld.vop2(aco_opcode::v_and_b32, bld.def(v1), Operand::c32(0x1f), pos_fixed_pt);
11526    Temp addr1 = bld.vop3(aco_opcode::v_bfe_u32, bld.def(v1), pos_fixed_pt, Operand::c32(16u),
11527                          Operand::c32(5u));
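   /* This assumes pos_fixed_pt packs the fixed-point X coordinate in bits [15:0] and Y in bits
    * [31:16], so addr0 == x & 31 and addr1 == (y >> 16) & 31. */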
11528 
11529    /* Load the buffer descriptor. */
11530    Temp list = get_arg(ctx, finfo->internal_bindings);
11531    list = convert_pointer_to_64_bit(ctx, list);
11532    Temp desc = bld.smem(aco_opcode::s_load_dwordx4, bld.def(s4), list,
11533                         Operand::c32(finfo->poly_stipple_buf_offset));
11534 
11535    /* The stipple pattern is 32x32, each row has 32 bits. */
11536    Temp offset = bld.vop2(aco_opcode::v_lshlrev_b32, bld.def(v1), Operand::c32(2), addr1);
11537    Temp row = bld.mubuf(aco_opcode::buffer_load_dword, bld.def(v1), desc, offset, Operand::c32(0u),
11538                         0, true);
11539    Temp bit = bld.vop3(aco_opcode::v_bfe_u32, bld.def(v1), row, addr0, Operand::c32(1u));
11540    Temp cond = bld.vopc(aco_opcode::v_cmp_eq_u32, bld.def(bld.lm), Operand::zero(), bit);
11541    bld.pseudo(aco_opcode::p_demote_to_helper, cond);
11542 
11543    ctx->block->kind |= block_kind_uses_discard;
11544    ctx->program->needs_exact = true;
11545 }
11546 
11547 void
11548 overwrite_interp_args(isel_context* ctx, const struct aco_ps_prolog_info* finfo)
11549 {
11550    Builder bld(ctx->program, ctx->block);
11551 
11552    if (finfo->bc_optimize_for_persp || finfo->bc_optimize_for_linear) {
11553       /* The shader should do: if (PRIM_MASK[31]) CENTROID = CENTER;
11554        * The hw doesn't compute CENTROID if the whole wave only
11555        * contains fully-covered quads.
11556        */
11557       Temp bc_optimize = get_arg(ctx, ctx->args->prim_mask);
11558 
11559       /* enabled when bit 31 is set */
11560       Temp cond =
11561          bld.sopc(aco_opcode::s_bitcmp1_b32, bld.def(s1, scc), bc_optimize, Operand::c32(31u));
11562 
11563       /* scale 1bit scc to wave size bits used by v_cndmask */
11564       cond = bool_to_vector_condition(ctx, cond);
11565 
11566       if (finfo->bc_optimize_for_persp) {
11567          Temp center = get_arg(ctx, ctx->args->persp_center);
11568          Temp centroid = get_arg(ctx, ctx->args->persp_centroid);
11569 
11570          Temp dst = bld.tmp(v2);
11571          select_vec2(ctx, dst, cond, center, centroid);
11572          ctx->arg_temps[ctx->args->persp_centroid.arg_index] = dst;
11573       }
11574 
11575       if (finfo->bc_optimize_for_linear) {
11576          Temp center = get_arg(ctx, ctx->args->linear_center);
11577          Temp centroid = get_arg(ctx, ctx->args->linear_centroid);
11578 
11579          Temp dst = bld.tmp(v2);
11580          select_vec2(ctx, dst, cond, center, centroid);
11581          ctx->arg_temps[ctx->args->linear_centroid.arg_index] = dst;
11582       }
11583    }
11584 
11585    if (finfo->force_persp_sample_interp) {
11586       Temp persp_sample = get_arg(ctx, ctx->args->persp_sample);
11587       ctx->arg_temps[ctx->args->persp_center.arg_index] = persp_sample;
11588       ctx->arg_temps[ctx->args->persp_centroid.arg_index] = persp_sample;
11589    }
11590 
11591    if (finfo->force_linear_sample_interp) {
11592       Temp linear_sample = get_arg(ctx, ctx->args->linear_sample);
11593       ctx->arg_temps[ctx->args->linear_center.arg_index] = linear_sample;
11594       ctx->arg_temps[ctx->args->linear_centroid.arg_index] = linear_sample;
11595    }
11596 
11597    if (finfo->force_persp_center_interp) {
11598       Temp persp_center = get_arg(ctx, ctx->args->persp_center);
11599       ctx->arg_temps[ctx->args->persp_sample.arg_index] = persp_center;
11600       ctx->arg_temps[ctx->args->persp_centroid.arg_index] = persp_center;
11601    }
11602 
11603    if (finfo->force_linear_center_interp) {
11604       Temp linear_center = get_arg(ctx, ctx->args->linear_center);
11605       ctx->arg_temps[ctx->args->linear_sample.arg_index] = linear_center;
11606       ctx->arg_temps[ctx->args->linear_centroid.arg_index] = linear_center;
11607    }
11608 }
11609 
11610 void
11611 overwrite_samplemask_arg(isel_context* ctx, const struct aco_ps_prolog_info* finfo)
11612 {
11613    Builder bld(ctx->program, ctx->block);
11614 
11615    /* Section 15.2.2 (Shader Inputs) of the OpenGL 4.5 (Core Profile) spec
11616     * says:
11617     *
11618     *    "When per-sample shading is active due to the use of a fragment
11619     *     input qualified by sample or due to the use of the gl_SampleID
11620     *     or gl_SamplePosition variables, only the bit for the current
11621     *     sample is set in gl_SampleMaskIn. When state specifies multiple
11622     *     fragment shader invocations for a given fragment, the sample
11623     *     mask for any single fragment shader invocation may specify a
11624     *     subset of the covered samples for the fragment. In this case,
11625     *     the bit corresponding to each covered sample will be set in
11626     *     exactly one fragment shader invocation."
11627     *
11628     * The samplemask loaded by hardware is always the coverage of the
11629     * entire pixel/fragment, so mask bits out based on the sample ID.
11630     */
11631    if (finfo->samplemask_log_ps_iter) {
11632       Temp ancillary = get_arg(ctx, ctx->args->ancillary);
11633       Temp sampleid = bld.vop3(aco_opcode::v_bfe_u32, bld.def(v1), ancillary, Operand::c32(8u),
11634                                Operand::c32(4u));
11635       Temp samplemask = get_arg(ctx, ctx->args->sample_coverage);
11636 
11637       uint32_t ps_iter_mask = ac_get_ps_iter_mask(1 << finfo->samplemask_log_ps_iter);
11638       Temp iter_mask = bld.copy(bld.def(v1), Operand::c32(ps_iter_mask));
11639 
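      /* Shift the per-invocation iteration mask to this invocation's sample position, then AND it
       * with the full coverage so only the samples belonging to this invocation remain set. */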
11640       Temp mask = bld.vop2(aco_opcode::v_lshlrev_b32, bld.def(v1), sampleid, iter_mask);
11641       samplemask = bld.vop2(aco_opcode::v_and_b32, bld.def(v1), samplemask, mask);
11642 
11643       ctx->arg_temps[ctx->args->sample_coverage.arg_index] = samplemask;
11644    }
11645 }
11646 
11647 Temp
11648 get_interp_color(isel_context* ctx, int interp_vgpr, unsigned attr_index, unsigned comp)
11649 {
11650    Builder bld(ctx->program, ctx->block);
11651 
11652    Temp dst = bld.tmp(v1);
11653 
11654    Temp prim_mask = get_arg(ctx, ctx->args->prim_mask);
11655 
11656    if (interp_vgpr != -1) {
11657       /* interp args are all 2 vgprs */
11658       int arg_index = ctx->args->persp_sample.arg_index + interp_vgpr / 2;
11659       Temp interp_ij = ctx->arg_temps[arg_index];
11660 
11661       emit_interp_instr(ctx, attr_index, comp, interp_ij, dst, prim_mask, false);
11662    } else {
11663       emit_interp_mov_instr(ctx, attr_index, comp, 0, dst, prim_mask, false);
11664    }
11665 
11666    return dst;
11667 }
11668 
11669 void
11670 interpolate_color_args(isel_context* ctx, const struct aco_ps_prolog_info* finfo,
11671                        std::vector<Operand>& regs)
11672 {
11673    if (!finfo->colors_read)
11674       return;
11675 
11676    Builder bld(ctx->program, ctx->block);
11677 
11678    unsigned vgpr = 256 + ctx->args->num_vgprs_used;
11679 
11680    if (finfo->color_two_side) {
11681       Temp face = get_arg(ctx, ctx->args->front_face);
11682       Temp is_face_positive =
11683          bld.vopc(aco_opcode::v_cmp_lt_f32, bld.def(bld.lm), Operand::zero(), face);
11684 
11685       u_foreach_bit (i, finfo->colors_read) {
11686          unsigned color_index = i / 4;
11687          unsigned front_index = finfo->color_attr_index[color_index];
11688          int interp_vgpr = finfo->color_interp_vgpr_index[color_index];
11689 
11690          /* If BCOLOR0 is used, BCOLOR1 is at offset "num_inputs + 1",
11691           * otherwise it's at offset "num_inputs".
11692           */
11693          unsigned back_index = finfo->num_interp_inputs;
11694          if (color_index == 1 && finfo->colors_read & 0xf)
11695             back_index++;
11696 
11697          Temp front = get_interp_color(ctx, interp_vgpr, front_index, i % 4);
11698          Temp back = get_interp_color(ctx, interp_vgpr, back_index, i % 4);
11699 
11700          Temp color =
11701             bld.vop2(aco_opcode::v_cndmask_b32, bld.def(v1), back, front, is_face_positive);
11702 
11703          regs.emplace_back(Operand(color, PhysReg{vgpr++}));
11704       }
11705    } else {
11706       u_foreach_bit (i, finfo->colors_read) {
11707          unsigned color_index = i / 4;
11708          unsigned attr_index = finfo->color_attr_index[color_index];
11709          int interp_vgpr = finfo->color_interp_vgpr_index[color_index];
11710          Temp color = get_interp_color(ctx, interp_vgpr, attr_index, i % 4);
11711 
11712          regs.emplace_back(Operand(color, PhysReg{vgpr++}));
11713       }
11714    }
11715 }
11716 
11717 void
11718 emit_clamp_alpha_test(isel_context* ctx, const struct aco_ps_epilog_info* info, Temp colors[4],
11719                       unsigned color_index)
11720 {
11721    Builder bld(ctx->program, ctx->block);
11722 
11723    if (info->clamp_color) {
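      /* Clamp each written component to [0, 1]; 0x3c00 is 1.0 in fp16 and 0x3f800000 is 1.0f. */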
11724       for (unsigned i = 0; i < 4; i++) {
11725          if (colors[i].regClass() == v2b) {
11726             colors[i] = bld.vop3(aco_opcode::v_med3_f16, bld.def(v2b), Operand::c16(0u),
11727                                  Operand::c16(0x3c00), colors[i]);
11728          } else {
11729             assert(colors[i].regClass() == v1);
11730             colors[i] = bld.vop3(aco_opcode::v_med3_f32, bld.def(v1), Operand::zero(),
11731                                  Operand::c32(0x3f800000u), colors[i]);
11732          }
11733       }
11734    }
11735 
11736    if (info->alpha_to_one) {
11737       if (colors[3].regClass() == v2b)
11738          colors[3] = bld.copy(bld.def(v2b), Operand::c16(0x3c00));
11739       else
11740          colors[3] = bld.copy(bld.def(v1), Operand::c32(0x3f800000u));
11741    }
11742 
11743    if (color_index == 0 && info->alpha_func != COMPARE_FUNC_ALWAYS) {
11744       Operand cond = Operand::c32(-1u);
11745       if (info->alpha_func != COMPARE_FUNC_NEVER) {
11746          aco_opcode opcode = aco_opcode::num_opcodes;
11747 
11748          switch (info->alpha_func) {
11749          case COMPARE_FUNC_LESS: opcode = aco_opcode::v_cmp_ngt_f32; break;
11750          case COMPARE_FUNC_EQUAL: opcode = aco_opcode::v_cmp_neq_f32; break;
11751          case COMPARE_FUNC_LEQUAL: opcode = aco_opcode::v_cmp_nge_f32; break;
11752          case COMPARE_FUNC_GREATER: opcode = aco_opcode::v_cmp_nlt_f32; break;
11753          case COMPARE_FUNC_NOTEQUAL: opcode = aco_opcode::v_cmp_nlg_f32; break;
11754          case COMPARE_FUNC_GEQUAL: opcode = aco_opcode::v_cmp_nle_f32; break;
11755          default: unreachable("invalid alpha func");
11756          }
11757 
11758          Temp ref = get_arg(ctx, info->alpha_reference);
11759 
11760          Temp alpha = colors[3].regClass() == v2b
11761                          ? bld.vop1(aco_opcode::v_cvt_f32_f16, bld.def(v1), colors[3])
11762                          : colors[3];
11763 
11764          /* true if not pass */
11765          cond = bld.vopc(opcode, bld.def(bld.lm), ref, alpha);
11766       }
11767 
11768       bld.pseudo(aco_opcode::p_discard_if, cond);
11769       ctx->block->kind |= block_kind_uses_discard;
11770       ctx->program->needs_exact = true;
11771    }
11772 }
11773 
11774 } /* end namespace */
11775 
11776 void
11777 select_program(Program* program, unsigned shader_count, struct nir_shader* const* shaders,
11778                ac_shader_config* config, const struct aco_compiler_options* options,
11779                const struct aco_shader_info* info, const struct ac_shader_args* args)
11780 {
11781    isel_context ctx =
11782       setup_isel_context(program, shader_count, shaders, config, options, info, args);
11783 
11784    if (ctx.stage == raytracing_cs)
11785       return select_program_rt(ctx, shader_count, shaders, args);
11786 
11787    if (shader_count >= 2) {
11788       select_program_merged(ctx, shader_count, shaders);
11789    } else {
11790       bool need_barrier = false, check_merged_wave_info = false, endif_merged_wave_info = false;
11791       if_context ic_merged_wave_info;
11792 
11793       /* Handle separate compilation of VS+TCS and {VS,TES}+GS on GFX9+. */
11794       if (ctx.program->info.merged_shader_compiled_separately) {
11795          assert(ctx.program->gfx_level >= GFX9);
11796          if (ctx.stage.sw == SWStage::VS || ctx.stage.sw == SWStage::TES) {
11797             check_merged_wave_info = endif_merged_wave_info = true;
11798          } else {
11799             const bool ngg_gs =
11800                ctx.stage.hw == AC_HW_NEXT_GEN_GEOMETRY_SHADER && ctx.stage.sw == SWStage::GS;
11801             assert(ctx.stage == tess_control_hs || ctx.stage == geometry_gs || ngg_gs);
11802             check_merged_wave_info = endif_merged_wave_info = !ngg_gs;
11803             need_barrier = !ngg_gs;
11804          }
11805       }
11806 
11807       select_shader(ctx, shaders[0], true, true, need_barrier, &ic_merged_wave_info,
11808                     check_merged_wave_info, endif_merged_wave_info);
11809    }
11810 }
11811 
11812 void
11813 dump_sgpr_to_mem(isel_context* ctx, Operand rsrc, Operand data, uint32_t offset)
11814 {
11815    Builder bld(ctx->program, ctx->block);
11816 
11817    ac_hw_cache_flags cache_glc;
11818    cache_glc.value = ac_glc;
11819 
11820    if (ctx->program->gfx_level >= GFX9) {
11821       bld.copy(Definition(PhysReg{256}, v1) /* v0 */, data);
11822 
11823       bld.mubuf(aco_opcode::buffer_store_dword, Operand(rsrc), Operand(v1), Operand::c32(0u),
11824                 Operand(PhysReg{256}, v1) /* v0 */, offset, false /* offen */, false /* idxen */,
11825                 /* addr64 */ false, /* disable_wqm */ false, cache_glc);
11826    } else {
11827       bld.smem(aco_opcode::s_buffer_store_dword, Operand(rsrc), Operand::c32(offset), data,
11828                memory_sync_info(), cache_glc);
11829    }
11830 }
11831 
11832 void
11833 enable_thread_indexing(isel_context* ctx, Operand rsrc)
11834 {
11835    Builder bld(ctx->program, ctx->block);
11836    PhysReg rsrc_word3(rsrc.physReg() + 3);
11837 
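   /* Setting ADD_TID_ENABLE in dword 3 of the buffer descriptor makes the hardware add the lane
    * index (scaled by the descriptor's stride) to each lane's address, so the dump code can store
    * one value per lane without computing per-lane offsets (descriptor field semantics assumed
    * from how the callers below lay out the buffer). */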
11838    bld.sop2(aco_opcode::s_or_b32, Definition(rsrc_word3, s1), bld.def(s1, scc),
11839             Operand(rsrc_word3, s1), Operand::c32(S_008F0C_ADD_TID_ENABLE(1)));
11840    if (ctx->program->gfx_level < GFX10) {
11841       /* This is part of the stride if ADD_TID_ENABLE=1. */
11842       bld.sop2(aco_opcode::s_and_b32, Definition(rsrc_word3, s1), bld.def(s1, scc),
11843                Operand(rsrc_word3, s1), Operand::c32(C_008F0C_DATA_FORMAT));
11844    }
11845 }
11846 
11847 void
11848 disable_thread_indexing(isel_context* ctx, Operand rsrc)
11849 {
11850    Builder bld(ctx->program, ctx->block);
11851    PhysReg rsrc_word3(rsrc.physReg() + 3);
11852 
11853    bld.sop2(aco_opcode::s_and_b32, Definition(rsrc_word3, s1), bld.def(s1, scc),
11854             Operand(rsrc_word3, s1), Operand::c32(C_008F0C_ADD_TID_ENABLE));
11855    if (ctx->program->gfx_level < GFX10) {
11856       bld.sop2(aco_opcode::s_or_b32, Definition(rsrc_word3, s1), bld.def(s1, scc),
11857                Operand(rsrc_word3, s1),
11858                Operand::c32(S_008F0C_DATA_FORMAT(V_008F0C_BUF_DATA_FORMAT_32)));
11859    }
11860 }
11861 
11862 void
11863 save_or_restore_vgprs(isel_context* ctx, Operand rsrc, bool save)
11864 {
11865    Builder bld(ctx->program, ctx->block);
11866    uint32_t offset = offsetof(struct aco_trap_handler_layout, saved_vgprs[0]);
11867 
11868    ac_hw_cache_flags cache_glc;
11869    cache_glc.value = ac_glc;
11870 
11871    enable_thread_indexing(ctx, rsrc);
11872 
11873    for (uint32_t i = 0; i < NUM_SAVED_VGPRS; i++) {
11874       if (save) {
11875          bld.mubuf(aco_opcode::buffer_store_dword, Operand(rsrc), Operand(v1), Operand::c32(0u),
11876                    Operand(PhysReg{256 + i}, v1) /* v0 */, offset, false /* offen */,
11877                    false /* idxen */,
11878                    /* addr64 */ false, /* disable_wqm */ false, cache_glc);
11879       } else {
11880          bld.mubuf(aco_opcode::buffer_load_dword, Definition(PhysReg{256 + i}, v1), Operand(rsrc),
11881                    Operand(v1), Operand::c32(0u), offset, false /* offen */, false /* idxen */,
11882                    /* addr64 */ false, /* disable_wqm */ false, cache_glc);
11883       }
11884 
11885       offset += 256;
11886    }
11887 
11888    disable_thread_indexing(ctx, rsrc);
11889 }
11890 
11891 void
11892 save_vgprs_to_mem(isel_context* ctx, Operand rsrc)
11893 {
11894    save_or_restore_vgprs(ctx, rsrc, true);
11895 }
11896 
11897 void
11898 restore_vgprs_from_mem(isel_context* ctx, Operand rsrc)
11899 {
11900    save_or_restore_vgprs(ctx, rsrc, false);
11901 }
11902 
11903 void
11904 dump_vgprs_to_mem(isel_context* ctx, Builder& bld, Operand rsrc)
11905 {
11906    const uint32_t ttmp0_idx = ctx->program->gfx_level >= GFX9 ? 108 : 112;
11907    const uint32_t base_offset = offsetof(struct aco_trap_handler_layout, vgprs[0]);
11908 
11909    ac_hw_cache_flags cache_glc;
11910    cache_glc.value = ac_glc;
11911 
11912    PhysReg num_vgprs{ttmp0_idx + 2};
11913    PhysReg soffset{ttmp0_idx + 3};
11914 
11915    enable_thread_indexing(ctx, rsrc);
11916 
11917    /* Determine the number of VGPRs to dump, at a granularity of 4 VGPRs. */
11918    const uint32_t vgpr_size_offset = ctx->program->gfx_level >= GFX11 ? 12 : 8;
11919    const uint32_t vgpr_size_width = ctx->program->gfx_level >= GFX10 ? 8 : 6;
11920 
11921    bld.sopk(aco_opcode::s_getreg_b32, Definition(num_vgprs, s1),
11922             ((32 - 1) << 11) | 5 /* GPR_ALLOC */);
11923    bld.sop2(aco_opcode::s_bfe_u32, Definition(num_vgprs, s1), bld.def(s1, scc),
11924             Operand(num_vgprs, s1), Operand::c32((vgpr_size_width << 16) | vgpr_size_offset));
11925    bld.sop2(aco_opcode::s_add_u32, Definition(num_vgprs, s1), bld.def(s1, scc),
11926             Operand(num_vgprs, s1), Operand::c32(1u));
11927    bld.sop2(aco_opcode::s_lshl_b32, Definition(num_vgprs, s1), bld.def(s1, scc),
11928             Operand(num_vgprs, s1), Operand::c32(2u));
11929    bld.sop2(aco_opcode::s_mul_i32, Definition(num_vgprs, s1), Operand::c32(256),
11930             Operand(num_vgprs, s1));
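   /* num_vgprs now holds (allocated VGPR count) * 256 bytes, i.e. one wave64 row (64 lanes * 4
    * bytes) per VGPR; the loop below compares soffset against it as the exit condition. */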
11931 
11932    /* Initialize m0/soffset to zero. */
11933    bld.copy(Definition(m0, s1), Operand::c32(0u));
11934    bld.copy(Definition(soffset, s1), Operand::c32(0u));
11935 
11936    if (ctx->program->gfx_level < GFX10) {
11937       /* Enable VGPR indexing with m0 as source index. */
11938       bld.sopc(aco_opcode::s_set_gpr_idx_on, Definition(m0, s1), Operand(m0, s1),
11939                Operand(PhysReg{1}, s1) /* SRC0 mode */);
11940    }
11941 
11942    loop_context lc;
11943    begin_loop(ctx, &lc);
11944    {
11945       bld.reset(ctx->block);
11946 
11947       /* Move from a relative source addr (v0 = v[0 + m0]). */
11948       if (ctx->program->gfx_level >= GFX10) {
11949          bld.vop1(aco_opcode::v_movrels_b32, Definition(PhysReg{256}, v1),
11950                   Operand(PhysReg{256}, v1), Operand(m0, s1));
11951       } else {
11952          bld.vop1(aco_opcode::v_mov_b32, Definition(PhysReg{256}, v1), Operand(PhysReg{256}, v1));
11953       }
11954 
11955       bld.mubuf(aco_opcode::buffer_store_dword, Operand(rsrc), Operand(v1),
11956                 Operand(PhysReg{soffset}, s1), Operand(PhysReg{256}, v1) /* v0 */, base_offset,
11957                 false /* offen */, false /* idxen */,
11958                 /* addr64 */ false, /* disable_wqm */ false, cache_glc);
11959 
11960       /* Increase m0 and the offset assuming it's wave64. */
11961       bld.sop2(aco_opcode::s_add_u32, Definition(m0, s1), bld.def(s1, scc), Operand(m0, s1),
11962                Operand::c32(1u));
11963       bld.sop2(aco_opcode::s_add_u32, Definition(soffset, s1), bld.def(s1, scc),
11964                Operand(soffset, s1), Operand::c32(256u));
11965 
11966       const Temp cond = bld.sopc(aco_opcode::s_cmp_ge_u32, bld.def(s1, scc), Operand(soffset, s1),
11967                                  Operand(num_vgprs, s1));
11968 
11969       if_context loop_break;
11970       begin_uniform_if_then(ctx, &loop_break, cond);
11971       {
11972          emit_loop_break(ctx);
11973       }
11974       begin_uniform_if_else(ctx, &loop_break);
11975       end_uniform_if(ctx, &loop_break);
11976    }
11977    end_loop(ctx, &lc);
11978    bld.reset(ctx->block);
11979 
11980    if (ctx->program->gfx_level < GFX10) {
11981       /* Disable VGPR indexing. */
11982       bld.sopp(aco_opcode::s_set_gpr_idx_off);
11983    }
11984 
11985    disable_thread_indexing(ctx, rsrc);
11986 }
11987 
11988 void
11989 dump_lds_to_mem(isel_context* ctx, Builder& bld, Operand rsrc)
11990 {
11991    const uint32_t ttmp0_idx = ctx->program->gfx_level >= GFX9 ? 108 : 112;
11992    const uint32_t base_offset = offsetof(struct aco_trap_handler_layout, lds[0]);
11993 
11994    ac_hw_cache_flags cache_glc;
11995    cache_glc.value = ac_glc;
11996 
11997    PhysReg lds_size{ttmp0_idx + 2};
11998    PhysReg soffset{ttmp0_idx + 3};
11999 
12000    enable_thread_indexing(ctx, rsrc);
12001 
12002    /* Determine the LDS size. */
12003    const uint32_t lds_size_offset = 12;
12004    const uint32_t lds_size_width = 9;
12005 
12006    bld.sopk(aco_opcode::s_getreg_b32, Definition(lds_size, s1),
12007             ((lds_size_width - 1) << 11) | (lds_size_offset << 6) | 6 /* LDS_ALLOC */);
12008    Temp lds_size_non_zero =
12009       bld.sopc(aco_opcode::s_cmp_lg_i32, bld.def(s1, scc), Operand(lds_size, s1), Operand::c32(0));
12010 
12011    if_context ic;
12012    begin_uniform_if_then(ctx, &ic, lds_size_non_zero);
12013    {
12014       bld.reset(ctx->block);
12015 
12016       /* Wait for other waves in the same threadgroup. */
12017       bld.sopp(aco_opcode::s_barrier, 0u);
12018 
12019       /* Compute the LDS size in bytes (64 dw * 4). */
12020       bld.sop2(aco_opcode::s_lshl_b32, Definition(lds_size, s1), bld.def(s1, scc),
12021                Operand(lds_size, s1), Operand::c32(8u));
12022 
12023       /* Add the base offset because lds_size is compared against soffset to exit the loop. */
12024       bld.sop2(aco_opcode::s_add_u32, Definition(lds_size, s1), bld.def(s1, scc),
12025                Operand(lds_size, s1), Operand::c32(base_offset));
12026 
12027       /* Initialize soffset to base offset. */
12028       bld.copy(Definition(soffset, s1), Operand::c32(base_offset));
12029 
12030       /* Compute the LDS offset from the thread ID. */
12031       bld.vop3(aco_opcode::v_mbcnt_lo_u32_b32, Definition(PhysReg{256}, v1), Operand::c32(-1u),
12032                Operand::c32(0u));
12033       bld.vop3(aco_opcode::v_mbcnt_hi_u32_b32_e64, Definition(PhysReg{256}, v1), Operand::c32(-1u),
12034                Operand(PhysReg{256}, v1));
12035       bld.vop2(aco_opcode::v_mul_u32_u24, Definition(PhysReg{256}, v1), Operand::c32(4u),
12036                Operand(PhysReg{256}, v1));
12037 
12038       Operand m = load_lds_size_m0(bld);
12039 
12040       loop_context lc;
12041       begin_loop(ctx, &lc);
12042       {
12043          bld.reset(ctx->block);
12044 
12045          if (ctx->program->gfx_level >= GFX9) {
12046             bld.ds(aco_opcode::ds_read_b32, Definition(PhysReg{257}, v1), Operand(PhysReg{256}, v1),
12047                    0);
12048          } else {
12049             bld.ds(aco_opcode::ds_read_b32, Definition(PhysReg{257}, v1), Operand(PhysReg{256}, v1),
12050                    m, 0);
12051          }
12052 
12053          bld.mubuf(aco_opcode::buffer_store_dword, Operand(rsrc), Operand(v1),
12054                    Operand(PhysReg{soffset}, s1), Operand(PhysReg{257}, v1) /* v0 */,
12055                    0 /* offset */, false /* offen */, false /* idxen */,
12056                    /* addr64 */ false, /* disable_wqm */ false, cache_glc);
12057 
12058          /* Increase v0 and the offset assuming it's wave64. */
12059          bld.vop3(aco_opcode::v_mad_u32_u24, Definition(PhysReg{256}, v1), Operand::c32(4u),
12060                   Operand::c32(64u), Operand(PhysReg{256}, v1));
12061          bld.sop2(aco_opcode::s_add_u32, Definition(soffset, s1), bld.def(s1, scc),
12062                   Operand(soffset, s1), Operand::c32(256u));
12063 
12064          const Temp cond = bld.sopc(aco_opcode::s_cmp_ge_u32, bld.def(s1, scc),
12065                                     Operand(soffset, s1), Operand(lds_size, s1));
12066 
12067          if_context loop_break;
12068          begin_uniform_if_then(ctx, &loop_break, cond);
12069          {
12070             emit_loop_break(ctx);
12071          }
12072          begin_uniform_if_else(ctx, &loop_break);
12073          end_uniform_if(ctx, &loop_break);
12074       }
12075       end_loop(ctx, &lc);
12076       bld.reset(ctx->block);
12077    }
12078    begin_uniform_if_else(ctx, &ic);
12079    end_uniform_if(ctx, &ic);
12080    bld.reset(ctx->block);
12081 
12082    disable_thread_indexing(ctx, rsrc);
12083 }
12084 
12085 void
12086 select_trap_handler_shader(Program* program, ac_shader_config* config,
12087                            const struct aco_compiler_options* options,
12088                            const struct aco_shader_info* info, const struct ac_shader_args* args)
12089 {
12090    uint32_t offset = 0;
12091 
12092    assert(options->gfx_level >= GFX8 && options->gfx_level <= GFX11);
12093 
12094    init_program(program, compute_cs, info, options->gfx_level, options->family, options->wgp_mode,
12095                 config);
12096 
12097    isel_context ctx = {};
12098    ctx.program = program;
12099    ctx.args = args;
12100    ctx.options = options;
12101    ctx.stage = program->stage;
12102 
12103    ctx.block = ctx.program->create_and_insert_block();
12104    ctx.block->kind = block_kind_top_level;
12105 
12106    program->workgroup_size = 1; /* XXX */
12107 
12108    add_startpgm(&ctx);
12109    append_logical_start(ctx.block);
12110 
12111    Builder bld(ctx.program, ctx.block);
12112 
12113    ac_hw_cache_flags cache_glc;
12114    cache_glc.value = ac_glc;
12115 
12116    const uint32_t ttmp0_idx = ctx.program->gfx_level >= GFX9 ? 108 : 112;
12117    PhysReg ttmp0_reg{ttmp0_idx};
12118    PhysReg ttmp1_reg{ttmp0_idx + 1};
12119    PhysReg ttmp2_reg{ttmp0_idx + 2};
12120    PhysReg ttmp3_reg{ttmp0_idx + 3};
12121    PhysReg tma_rsrc{ttmp0_idx + 4}; /* s4 */
12122    PhysReg save_wave_status{ttmp0_idx + 8};
12123    PhysReg save_m0{ttmp0_idx + 9};
12124    PhysReg save_exec{ttmp0_idx + 10}; /* s2 */
12125 
12126    /* Save SQ_WAVE_STATUS because SCC needs to be restored. */
12127    bld.sopk(aco_opcode::s_getreg_b32, Definition(save_wave_status, s1), ((32 - 1) << 11) | 2);
12128 
12129    /* Save m0. */
12130    bld.copy(Definition(save_m0, s1), Operand(m0, s1));
12131 
12132    /* Save exec and use all invocations from the wave. */
12133    bld.sop1(Builder::s_or_saveexec, Definition(save_exec, bld.lm), Definition(scc, s1),
12134             Definition(exec, bld.lm), Operand::c32_or_c64(-1u, bld.lm == s2),
12135             Operand(exec, bld.lm));
12136 
12137    if (options->gfx_level < GFX11) {
12138       /* Clear the current wave exception; this is required to re-enable VALU
12139        * instructions in this wave. This seems to only be needed for float exceptions.
12140        */
12141       bld.vop1(aco_opcode::v_clrexcp);
12142    }
12143 
12144    offset = offsetof(struct aco_trap_handler_layout, ttmp0);
12145 
12146    if (ctx.program->gfx_level >= GFX9) {
12147       /* Get TMA. */
12148       if (ctx.program->gfx_level >= GFX11) {
12149          bld.sop1(aco_opcode::s_sendmsg_rtn_b32, Definition(ttmp2_reg, s1),
12150                   Operand::c32(sendmsg_rtn_get_tma));
12151       } else {
12152          bld.sopk(aco_opcode::s_getreg_b32, Definition(ttmp2_reg, s1), ((32 - 1) << 11) | 18);
12153       }
12154 
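      /* The TMA value read above holds the trap buffer address shifted right by 8;
       * rebuild the full 64-bit address (the high half comes from address32_hi).
       */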
12155       bld.sop2(aco_opcode::s_lshl_b32, Definition(ttmp2_reg, s1), Definition(scc, s1),
12156                Operand(ttmp2_reg, s1), Operand::c32(8u));
12157       bld.copy(Definition(ttmp3_reg, s1), Operand::c32((unsigned)ctx.options->address32_hi));
12158 
12159       /* Load the buffer descriptor from TMA. */
12160       bld.smem(aco_opcode::s_load_dwordx4, Definition(tma_rsrc, s4), Operand(ttmp2_reg, s2),
12161                Operand::c32(0u));
12162 
12163       /* Save VGPRs that need to be restored. */
12164       save_vgprs_to_mem(&ctx, Operand(tma_rsrc, s4));
12165 
12166       /* Dump VGPRs. */
12167       dump_vgprs_to_mem(&ctx, bld, Operand(tma_rsrc, s4));
12168 
12169       /* Store TTMP0-TTMP1. */
12170       bld.copy(Definition(PhysReg{256}, v2) /* v[0-1] */, Operand(ttmp0_reg, s2));
12171 
12172       bld.mubuf(aco_opcode::buffer_store_dwordx2, Operand(tma_rsrc, s4), Operand(v1),
12173                 Operand::c32(0u), Operand(PhysReg{256}, v2) /* v[0-1] */, offset /* offset */,
12174                 false /* offen */, false /* idxen */, /* addr64 */ false,
12175                 /* disable_wqm */ false, cache_glc);
12176    } else {
12177       /* Load the buffer descriptor from TMA. */
12178       bld.smem(aco_opcode::s_load_dwordx4, Definition(tma_rsrc, s4), Operand(PhysReg{tma_lo}, s2),
12179                Operand::zero());
12180 
12181       /* Save VGPRs that need to be restored. */
12182       save_vgprs_to_mem(&ctx, Operand(tma_rsrc, s4));
12183 
12184       /* Dump VGPRs. */
12185       dump_vgprs_to_mem(&ctx, bld, Operand(tma_rsrc, s4));
12186 
12187       /* Store TTMP0-TTMP1. */
12188       bld.smem(aco_opcode::s_buffer_store_dwordx2, Operand(tma_rsrc, s4), Operand::c32(offset),
12189                Operand(ttmp0_reg, s2), memory_sync_info(), cache_glc);
12190    }
12191 
12192    /* Store some hardware registers. */
12193    const uint32_t hw_regs_idx[] = {
12194       1, /* HW_REG_MODE */
12195       3, /* HW_REG_TRAP_STS */
12196       4, /* HW_REG_HW_ID */
12197       5, /* HW_REG_GPR_ALLOC */
12198       6, /* HW_REG_LDS_ALLOC */
12199       7, /* HW_REG_IB_STS */
12200    };
12201 
12202    offset = offsetof(struct aco_trap_handler_layout, sq_wave_regs.status);
12203 
12204    /* Store saved SQ_WAVE_STATUS which can change inside the trap. */
12205    dump_sgpr_to_mem(&ctx, Operand(tma_rsrc, s4), Operand(save_wave_status, s1), offset);
12206    offset += 4;
12207 
12208    for (unsigned i = 0; i < ARRAY_SIZE(hw_regs_idx); i++) {
12209       /* "((size - 1) << 11) | register" */
12210       bld.sopk(aco_opcode::s_getreg_b32, Definition(ttmp0_reg, s1),
12211                ((32 - 1) << 11) | hw_regs_idx[i]);
12212 
12213       dump_sgpr_to_mem(&ctx, Operand(tma_rsrc, s4), Operand(ttmp0_reg, s1), offset);
12214       offset += 4;
12215    }
12216 
12217    assert(offset == offsetof(struct aco_trap_handler_layout, m0));
12218 
12219    /* Dump shader registers (m0, exec). */
12220    dump_sgpr_to_mem(&ctx, Operand(tma_rsrc, s4), Operand(save_m0, s1), offset);
12221    offset += 4;
12222    dump_sgpr_to_mem(&ctx, Operand(tma_rsrc, s4), Operand(save_exec, s1), offset);
12223    offset += 4;
12224    dump_sgpr_to_mem(&ctx, Operand(tma_rsrc, s4), Operand(save_exec.advance(4), s1), offset);
12225    offset += 4;
12226 
12227    assert(offset == offsetof(struct aco_trap_handler_layout, sgprs[0]));
12228 
12229    /* Dump all SGPRs. */
12230    for (uint32_t i = 0; i < program->dev.sgpr_limit; i++) {
12231       dump_sgpr_to_mem(&ctx, Operand(tma_rsrc, s4), Operand(PhysReg{i}, s1), offset);
12232       offset += 4;
12233    }
12234 
12235    /* Dump LDS. */
12236    dump_lds_to_mem(&ctx, bld, Operand(tma_rsrc, s4));
12237 
12238    /* Restore VGPRs. */
12239    restore_vgprs_from_mem(&ctx, Operand(tma_rsrc, s4));
12240 
12241    /* Restore m0 and exec. */
12242    bld.copy(Definition(m0, s1), Operand(save_m0, s1));
12243    bld.copy(Definition(exec, bld.lm), Operand(save_exec, bld.lm));
12244 
12245    /* Restore SCC which is the first bit of SQ_WAVE_STATUS. */
12246    bld.sopc(aco_opcode::s_bitcmp1_b32, bld.def(s1, scc), Operand(save_wave_status, s1),
12247             Operand::c32(0u));
12248 
12249    program->config->float_mode = program->blocks[0].fp_mode.val;
12250 
12251    append_logical_end(ctx.block);
12252    ctx.block->kind |= block_kind_uniform;
12253    bld.sopp(aco_opcode::s_endpgm);
12254 
12255    finish_program(&ctx);
12256 }
12257 
12258 Operand
12259 get_arg_fixed(const struct ac_shader_args* args, struct ac_arg arg)
12260 {
12261    enum ac_arg_regfile file = args->args[arg.arg_index].file;
12262    unsigned size = args->args[arg.arg_index].size;
12263    RegClass rc = RegClass(file == AC_ARG_SGPR ? RegType::sgpr : RegType::vgpr, size);
12264    return Operand(get_arg_reg(args, arg), rc);
12265 }
12266 
12267 unsigned
12268 load_vb_descs(Builder& bld, PhysReg dest, Operand base, unsigned start, unsigned max)
12269 {
12270    unsigned sgpr_limit = get_addr_sgpr_from_waves(bld.program, bld.program->min_waves);
12271    unsigned count = MIN2((sgpr_limit - dest.reg()) / 4u, max);
12272    for (unsigned i = 0; i < count;) {
12273       unsigned size = 1u << util_logbase2(MIN2(count - i, 4));
12274 
12275       if (size == 4)
12276          bld.smem(aco_opcode::s_load_dwordx16, Definition(dest, s16), base,
12277                   Operand::c32((start + i) * 16u));
12278       else if (size == 2)
12279          bld.smem(aco_opcode::s_load_dwordx8, Definition(dest, s8), base,
12280                   Operand::c32((start + i) * 16u));
12281       else
12282          bld.smem(aco_opcode::s_load_dwordx4, Definition(dest, s4), base,
12283                   Operand::c32((start + i) * 16u));
12284 
12285       dest = dest.advance(size * 16u);
12286       i += size;
12287    }
12288 
12289    return count;
12290 }
12291 
12292 void
12293 wait_for_smem_loads(Builder& bld)
12294 {
12295    if (bld.program->gfx_level >= GFX12) {
12296       bld.sopp(aco_opcode::s_wait_kmcnt, 0);
12297    } else {
12298       wait_imm lgkm_imm;
12299       lgkm_imm.lgkm = 0;
12300       bld.sopp(aco_opcode::s_waitcnt, lgkm_imm.pack(bld.program->gfx_level));
12301    }
12302 }
12303 
12304 void
12305 wait_for_vmem_loads(Builder& bld)
12306 {
12307    if (bld.program->gfx_level >= GFX12) {
12308       bld.sopp(aco_opcode::s_wait_loadcnt, 0);
12309    } else {
12310       wait_imm vm_imm;
12311       vm_imm.vm = 0;
12312       bld.sopp(aco_opcode::s_waitcnt, vm_imm.pack(bld.program->gfx_level));
12313    }
12314 }
12315 
12316 Operand
12317 calc_nontrivial_instance_id(Builder& bld, const struct ac_shader_args* args,
12318                             const struct aco_vs_prolog_info* pinfo, unsigned index,
12319                             Operand instance_id, Operand start_instance, PhysReg tmp_sgpr,
12320                             PhysReg tmp_vgpr0, PhysReg tmp_vgpr1)
12321 {
12322    bld.smem(aco_opcode::s_load_dwordx2, Definition(tmp_sgpr, s2),
12323             get_arg_fixed(args, pinfo->inputs), Operand::c32(8u + index * 8u));
12324 
12325    wait_for_smem_loads(bld);
12326 
12327    Definition fetch_index_def(tmp_vgpr0, v1);
12328    Operand fetch_index(tmp_vgpr0, v1);
12329 
12330    Operand div_info(tmp_sgpr, s1);
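   /* div_info packs the pre-shift, increment and post-shift of a precomputed
    * fast division by the instance divisor (cf. util/fast_idiv_by_const);
    * the 32-bit multiplier lives in the following SGPR.
    */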
12331    if (bld.program->gfx_level >= GFX8 && bld.program->gfx_level < GFX11) {
12332       /* use SDWA */
12333       if (bld.program->gfx_level < GFX9) {
12334          bld.vop1(aco_opcode::v_mov_b32, Definition(tmp_vgpr1, v1), div_info);
12335          div_info = Operand(tmp_vgpr1, v1);
12336       }
12337 
12338       bld.vop2(aco_opcode::v_lshrrev_b32, fetch_index_def, div_info, instance_id);
12339 
12340       Instruction* instr;
12341       if (bld.program->gfx_level >= GFX9)
12342          instr = bld.vop2_sdwa(aco_opcode::v_add_u32, fetch_index_def, div_info, fetch_index).instr;
12343       else
12344          instr = bld.vop2_sdwa(aco_opcode::v_add_co_u32, fetch_index_def, Definition(vcc, bld.lm),
12345                                div_info, fetch_index)
12346                     .instr;
12347       instr->sdwa().sel[0] = SubdwordSel::ubyte1;
12348 
12349       bld.vop3(aco_opcode::v_mul_hi_u32, fetch_index_def, Operand(tmp_sgpr.advance(4), s1),
12350                fetch_index);
12351 
12352       instr =
12353          bld.vop2_sdwa(aco_opcode::v_lshrrev_b32, fetch_index_def, div_info, fetch_index).instr;
12354       instr->sdwa().sel[0] = SubdwordSel::ubyte2;
12355    } else {
12356       Operand tmp_op(tmp_vgpr1, v1);
12357       Definition tmp_def(tmp_vgpr1, v1);
12358 
12359       bld.vop2(aco_opcode::v_lshrrev_b32, fetch_index_def, div_info, instance_id);
12360 
12361       bld.vop3(aco_opcode::v_bfe_u32, tmp_def, div_info, Operand::c32(8u), Operand::c32(8u));
12362       bld.vadd32(fetch_index_def, tmp_op, fetch_index, false, Operand(s2), true);
12363 
12364       bld.vop3(aco_opcode::v_mul_hi_u32, fetch_index_def, fetch_index,
12365                Operand(tmp_sgpr.advance(4), s1));
12366 
12367       bld.vop3(aco_opcode::v_bfe_u32, tmp_def, div_info, Operand::c32(16u), Operand::c32(8u));
12368       bld.vop2(aco_opcode::v_lshrrev_b32, fetch_index_def, tmp_op, fetch_index);
12369    }
12370 
12371    bld.vadd32(fetch_index_def, start_instance, fetch_index, false, Operand(s2), true);
12372 
12373    return fetch_index;
12374 }
12375 
12376 void
12377 select_rt_prolog(Program* program, ac_shader_config* config,
12378                  const struct aco_compiler_options* options, const struct aco_shader_info* info,
12379                  const struct ac_shader_args* in_args, const struct ac_shader_args* out_args)
12380 {
12381    init_program(program, compute_cs, info, options->gfx_level, options->family, options->wgp_mode,
12382                 config);
12383    Block* block = program->create_and_insert_block();
12384    block->kind = block_kind_top_level;
12385    program->workgroup_size = info->workgroup_size;
12386    program->wave_size = info->workgroup_size;
12387    calc_min_waves(program);
12388    Builder bld(program, block);
12389    block->instructions.reserve(32);
12390    unsigned num_sgprs = MAX2(in_args->num_sgprs_used, out_args->num_sgprs_used);
12391    unsigned num_vgprs = MAX2(in_args->num_vgprs_used, out_args->num_vgprs_used);
12392 
12393    /* Inputs:
12394     * Ring offsets:                s[0-1]
12395     * Indirect descriptor sets:    s[2]
12396     * Push constants pointer:      s[3]
12397     * SBT descriptors:             s[4-5]
12398     * Traversal shader address:    s[6-7]
12399     * Ray launch size address:     s[8-9]
12400     * Dynamic callable stack base: s[10]
12401     * Workgroup IDs (xyz):         s[11], s[12], s[13]
12402     * Scratch offset:              s[14]
12403     * Local invocation IDs:        v[0-2]
12404     */
12405    PhysReg in_ring_offsets = get_arg_reg(in_args, in_args->ring_offsets);
12406    PhysReg in_sbt_desc = get_arg_reg(in_args, in_args->rt.sbt_descriptors);
12407    PhysReg in_launch_size_addr = get_arg_reg(in_args, in_args->rt.launch_size_addr);
12408    PhysReg in_stack_base = get_arg_reg(in_args, in_args->rt.dynamic_callable_stack_base);
12409    PhysReg in_wg_id_x;
12410    PhysReg in_wg_id_y;
12411    PhysReg in_wg_id_z;
12412    PhysReg in_scratch_offset;
12413    if (options->gfx_level < GFX12) {
12414       in_wg_id_x = get_arg_reg(in_args, in_args->workgroup_ids[0]);
12415       in_wg_id_y = get_arg_reg(in_args, in_args->workgroup_ids[1]);
12416       in_wg_id_z = get_arg_reg(in_args, in_args->workgroup_ids[2]);
12417    } else {
12418       in_wg_id_x = PhysReg(108 + 9 /*ttmp9*/);
12419       in_wg_id_y = PhysReg(108 + 7 /*ttmp7*/);
12420    }
12421    if (options->gfx_level < GFX11)
12422       in_scratch_offset = get_arg_reg(in_args, in_args->scratch_offset);
12423    struct ac_arg arg_id = options->gfx_level >= GFX11 ? in_args->local_invocation_ids_packed
12424                                                       : in_args->local_invocation_id_x;
12425    PhysReg in_local_ids[2] = {
12426       get_arg_reg(in_args, arg_id),
12427       get_arg_reg(in_args, arg_id).advance(4),
12428    };
12429 
12430    /* Outputs:
12431     * Callee shader PC:            s[0-1]
12432     * Indirect descriptor sets:    s[2]
12433     * Push constants pointer:      s[3]
12434     * SBT descriptors:             s[4-5]
12435     * Traversal shader address:    s[6-7]
12436     * Ray launch sizes (xyz):      s[8], s[9], s[10]
12437     * Scratch offset (<GFX9 only): s[11]
12438     * Ring offsets (<GFX9 only):   s[12-13]
12439     * Ray launch IDs:              v[0-2]
12440     * Stack pointer:               v[3]
12441     * Shader VA:                   v[4-5]
12442     * Shader Record Ptr:           v[6-7]
12443     */
12444    PhysReg out_uniform_shader_addr = get_arg_reg(out_args, out_args->rt.uniform_shader_addr);
12445    PhysReg out_launch_size_x = get_arg_reg(out_args, out_args->rt.launch_sizes[0]);
12446    PhysReg out_launch_size_y = get_arg_reg(out_args, out_args->rt.launch_sizes[1]);
12447    PhysReg out_launch_size_z = get_arg_reg(out_args, out_args->rt.launch_sizes[2]);
12448    PhysReg out_launch_ids[3];
12449    for (unsigned i = 0; i < 3; i++)
12450       out_launch_ids[i] = get_arg_reg(out_args, out_args->rt.launch_ids[i]);
12451    PhysReg out_stack_ptr = get_arg_reg(out_args, out_args->rt.dynamic_callable_stack_base);
12452    PhysReg out_record_ptr = get_arg_reg(out_args, out_args->rt.shader_record);
12453 
12454    /* Temporaries: */
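   /* Keep the 64-bit temporaries 2-aligned, as required for SGPR-pair operands. */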
12455    num_sgprs = align(num_sgprs, 2);
12456    PhysReg tmp_raygen_sbt = PhysReg{num_sgprs};
12457    num_sgprs += 2;
12458    PhysReg tmp_ring_offsets = PhysReg{num_sgprs};
12459    num_sgprs += 2;
12460    PhysReg tmp_wg_id_x_times_size = PhysReg{num_sgprs};
12461    num_sgprs++;
12462 
12463    PhysReg tmp_invocation_idx = PhysReg{256 + num_vgprs++};
12464 
12465    /* Confirm some assumptions about register aliasing */
12466    assert(in_ring_offsets == out_uniform_shader_addr);
12467    assert(get_arg_reg(in_args, in_args->push_constants) ==
12468           get_arg_reg(out_args, out_args->push_constants));
12469    assert(get_arg_reg(in_args, in_args->rt.sbt_descriptors) ==
12470           get_arg_reg(out_args, out_args->rt.sbt_descriptors));
12471    assert(in_launch_size_addr == out_launch_size_x);
12472    assert(in_stack_base == out_launch_size_z);
12473    assert(in_local_ids[0] == out_launch_ids[0]);
12474 
12475    /* <gfx9 reads in_scratch_offset at the end of the prolog to write out the scratch_offset
12476     * arg. Make sure no other outputs have overwritten it by then.
12477     */
12478    assert(options->gfx_level >= GFX9 || in_scratch_offset.reg() >= out_args->num_sgprs_used);
12479 
12480    /* load raygen sbt */
12481    bld.smem(aco_opcode::s_load_dwordx2, Definition(tmp_raygen_sbt, s2), Operand(in_sbt_desc, s2),
12482             Operand::c32(0u));
12483 
12484    /* init scratch */
12485    if (options->gfx_level < GFX9) {
12486       /* copy ring offsets to temporary location */
12487       bld.sop1(aco_opcode::s_mov_b64, Definition(tmp_ring_offsets, s2),
12488                Operand(in_ring_offsets, s2));
12489    } else if (options->gfx_level < GFX11) {
12490       hw_init_scratch(bld, Definition(in_ring_offsets, s1), Operand(in_ring_offsets, s2),
12491                       Operand(in_scratch_offset, s1));
12492    }
12493 
12494    /* set stack ptr */
12495    bld.vop1(aco_opcode::v_mov_b32, Definition(out_stack_ptr, v1), Operand(in_stack_base, s1));
12496 
12497    /* load raygen address */
12498    bld.smem(aco_opcode::s_load_dwordx2, Definition(out_uniform_shader_addr, s2),
12499             Operand(tmp_raygen_sbt, s2), Operand::c32(0u));
12500 
12501    /* load ray launch sizes */
12502    bld.smem(aco_opcode::s_load_dword, Definition(out_launch_size_z, s1),
12503             Operand(in_launch_size_addr, s2), Operand::c32(8u));
12504    bld.smem(aco_opcode::s_load_dwordx2, Definition(out_launch_size_x, s2),
12505             Operand(in_launch_size_addr, s2), Operand::c32(0u));
12506 
12507    /* calculate ray launch ids */
12508    if (options->gfx_level >= GFX11) {
12509       /* Thread IDs are packed in VGPR0, 10 bits per component. */
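      /* Only 3 bits per component are needed here: the RT dispatch uses at most
       * 8 threads per dimension (see the workgroup strides used below).
       */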
12510       bld.vop3(aco_opcode::v_bfe_u32, Definition(in_local_ids[1], v1), Operand(in_local_ids[0], v1),
12511                Operand::c32(10u), Operand::c32(3u));
12512       bld.vop2(aco_opcode::v_and_b32, Definition(in_local_ids[0], v1), Operand::c32(0x7),
12513                Operand(in_local_ids[0], v1));
12514    }
12515    /* Do this backwards to reduce some RAW hazards on GFX11+ */
12516    if (options->gfx_level >= GFX12) {
12517       bld.vop2_e64(aco_opcode::v_lshrrev_b32, Definition(out_launch_ids[2], v1), Operand::c32(16),
12518                    Operand(in_wg_id_y, s1));
12519       bld.vop3(aco_opcode::v_mad_u32_u16, Definition(out_launch_ids[1], v1),
12520                Operand(in_wg_id_y, s1), Operand::c32(program->workgroup_size == 32 ? 4 : 8),
12521                Operand(in_local_ids[1], v1));
12522    } else {
12523       bld.vop1(aco_opcode::v_mov_b32, Definition(out_launch_ids[2], v1), Operand(in_wg_id_z, s1));
12524       bld.vop3(aco_opcode::v_mad_u32_u24, Definition(out_launch_ids[1], v1),
12525                Operand(in_wg_id_y, s1), Operand::c32(program->workgroup_size == 32 ? 4 : 8),
12526                Operand(in_local_ids[1], v1));
12527    }
12528    bld.vop3(aco_opcode::v_mad_u32_u24, Definition(out_launch_ids[0], v1), Operand(in_wg_id_x, s1),
12529             Operand::c32(8), Operand(in_local_ids[0], v1));
12530 
12531    /* calculate shader record ptr: SBT + RADV_RT_HANDLE_SIZE */
12532    if (options->gfx_level < GFX9) {
12533       bld.vop2_e64(aco_opcode::v_add_co_u32, Definition(out_record_ptr, v1), Definition(vcc, s2),
12534                    Operand(tmp_raygen_sbt, s1), Operand::c32(32u));
12535    } else {
12536       bld.vop2_e64(aco_opcode::v_add_u32, Definition(out_record_ptr, v1),
12537                    Operand(tmp_raygen_sbt, s1), Operand::c32(32u));
12538    }
12539    bld.vop1(aco_opcode::v_mov_b32, Definition(out_record_ptr.advance(4), v1),
12540             Operand(tmp_raygen_sbt.advance(4), s1));
12541 
12542    /* For 1D dispatches converted into 2D ones, we need to fix up the launch IDs.
12543     * The 1D launch ID is: id = local_invocation_index + (wg_id.x * wg_size).
12544     * tmp_wg_id_x_times_size holds wg_id.x * wg_size after the shift below.
12545     */
12546    bld.sop2(aco_opcode::s_lshl_b32, Definition(tmp_wg_id_x_times_size, s1), Definition(scc, s1),
12547             Operand(in_wg_id_x, s1), Operand::c32(program->workgroup_size == 32 ? 5 : 6));
12548 
12549    /* Calculate and add local_invocation_index */
12550    bld.vop3(aco_opcode::v_mbcnt_lo_u32_b32, Definition(tmp_invocation_idx, v1), Operand::c32(-1u),
12551             Operand(tmp_wg_id_x_times_size, s1));
12552    if (program->wave_size == 64) {
12553       if (program->gfx_level <= GFX7)
12554          bld.vop2(aco_opcode::v_mbcnt_hi_u32_b32, Definition(tmp_invocation_idx, v1),
12555                   Operand::c32(-1u), Operand(tmp_invocation_idx, v1));
12556       else
12557          bld.vop3(aco_opcode::v_mbcnt_hi_u32_b32_e64, Definition(tmp_invocation_idx, v1),
12558                   Operand::c32(-1u), Operand(tmp_invocation_idx, v1));
12559    }
12560 
12561    /* Make fixup operations a no-op if this is not a converted 2D dispatch. */
12562    bld.sopc(aco_opcode::s_cmp_lg_u32, Definition(scc, s1),
12563             Operand::c32(ACO_RT_CONVERTED_2D_LAUNCH_SIZE), Operand(out_launch_size_y, s1));
12564    bld.sop2(Builder::s_cselect, Definition(vcc, bld.lm),
12565             Operand::c32_or_c64(-1u, program->wave_size == 64),
12566             Operand::c32_or_c64(0, program->wave_size == 64), Operand(scc, s1));
12567    bld.vop2(aco_opcode::v_cndmask_b32, Definition(out_launch_ids[0], v1),
12568             Operand(tmp_invocation_idx, v1), Operand(out_launch_ids[0], v1), Operand(vcc, bld.lm));
12569    bld.vop2(aco_opcode::v_cndmask_b32, Definition(out_launch_ids[1], v1), Operand::zero(),
12570             Operand(out_launch_ids[1], v1), Operand(vcc, bld.lm));
12571 
12572    if (options->gfx_level < GFX9) {
12573       /* write scratch/ring offsets to outputs, if needed */
12574       bld.sop1(aco_opcode::s_mov_b32,
12575                Definition(get_arg_reg(out_args, out_args->scratch_offset), s1),
12576                Operand(in_scratch_offset, s1));
12577       bld.sop1(aco_opcode::s_mov_b64, Definition(get_arg_reg(out_args, out_args->ring_offsets), s2),
12578                Operand(tmp_ring_offsets, s2));
12579    }
12580 
12581    /* jump to raygen */
12582    bld.sop1(aco_opcode::s_setpc_b64, Operand(out_uniform_shader_addr, s2));
12583 
12584    program->config->float_mode = program->blocks[0].fp_mode.val;
12585    program->config->num_vgprs = get_vgpr_alloc(program, num_vgprs);
12586    program->config->num_sgprs = get_sgpr_alloc(program, num_sgprs);
12587 }
12588 
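/* Allocate "size" consecutive VGPRs and return the first one. Without "offset",
 * this simply grows *num; with "offset", allocations are packed before the current
 * high-water mark and *num only grows when it would be exceeded.
 */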
12589 PhysReg
12590 get_next_vgpr(unsigned size, unsigned* num, int *offset = NULL)
12591 {
12592    unsigned reg = *num + (offset ? *offset : 0);
12593    if (reg + size >= *num) {
12594       *num = reg + size;
12595       if (offset)
12596          *offset = 0;
12597    } else if (offset) {
12598       *offset += size;
12599    }
12600    return PhysReg(256 + reg);
12601 }
12602 
12603 struct UnalignedVsAttribLoad {
12604    /* dst/scratch are PhysReg converted to unsigned */
12605    unsigned dst;
12606    unsigned scratch;
12607    bool d16;
12608    const struct ac_vtx_format_info* vtx_info;
12609 };
12610 
12611 struct UnalignedVsAttribLoadState {
12612    unsigned max_vgprs;
12613    unsigned initial_num_vgprs;
12614    unsigned* num_vgprs;
12615    unsigned overflow_num_vgprs;
12616    aco::small_vec<UnalignedVsAttribLoad, 16> current_loads;
12617 };
12618 
12619 void
12620 convert_unaligned_vs_attrib(Builder& bld, UnalignedVsAttribLoad load)
12621 {
12622    PhysReg dst(load.dst);
12623    PhysReg scratch(load.scratch);
12624    const struct ac_vtx_format_info* vtx_info = load.vtx_info;
12625    unsigned dfmt = vtx_info->hw_format[0] & 0xf;
12626    unsigned nfmt = vtx_info->hw_format[0] >> 4;
12627 
12628    unsigned size = vtx_info->chan_byte_size ? vtx_info->chan_byte_size : vtx_info->element_size;
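   /* Merge the bytes that load_unaligned_vs_attrib() loaded into dst/scratch back
    * into a single dword in dst.
    */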
12629    if (load.d16) {
12630       bld.vop3(aco_opcode::v_lshl_or_b32, Definition(dst, v1), Operand(scratch, v1),
12631                Operand::c32(8), Operand(dst, v1));
12632    } else {
12633       for (unsigned i = 1; i < size; i++) {
12634          PhysReg byte_reg = scratch.advance(i * 4 - 4);
12635          if (bld.program->gfx_level >= GFX9) {
12636             bld.vop3(aco_opcode::v_lshl_or_b32, Definition(dst, v1), Operand(byte_reg, v1),
12637                      Operand::c32(i * 8), Operand(dst, v1));
12638          } else {
12639             bld.vop2(aco_opcode::v_lshlrev_b32, Definition(byte_reg, v1), Operand::c32(i * 8),
12640                      Operand(byte_reg, v1));
12641             bld.vop2(aco_opcode::v_or_b32, Definition(dst, v1), Operand(dst, v1),
12642                      Operand(byte_reg, v1));
12643          }
12644       }
12645    }
12646 
12647    unsigned num_channels = vtx_info->chan_byte_size ? 1 : vtx_info->num_channels;
12648    PhysReg chan[4] = {dst, dst.advance(4), dst.advance(8), dst.advance(12)};
12649 
12650    if (dfmt == V_008F0C_BUF_DATA_FORMAT_10_11_11) {
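      /* Unpack the 11/11/10-bit floats and shift them into 16-bit half-float bit
       * patterns (same 5-bit exponent, the mantissa just needs realignment);
       * v_cvt_f32_f16 below finishes the conversion.
       */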
12651       bld.vop3(aco_opcode::v_bfe_u32, Definition(chan[2], v1), Operand(dst, v1), Operand::c32(22),
12652                Operand::c32(10));
12653       bld.vop3(aco_opcode::v_bfe_u32, Definition(chan[1], v1), Operand(dst, v1), Operand::c32(11),
12654                Operand::c32(11));
12655       bld.vop3(aco_opcode::v_bfe_u32, Definition(chan[0], v1), Operand(dst, v1), Operand::c32(0),
12656                Operand::c32(11));
12657       bld.vop2(aco_opcode::v_lshlrev_b32, Definition(chan[2], v1), Operand::c32(5),
12658                Operand(chan[2], v1));
12659       bld.vop2(aco_opcode::v_lshlrev_b32, Definition(chan[1], v1), Operand::c32(4),
12660                Operand(chan[1], v1));
12661       bld.vop2(aco_opcode::v_lshlrev_b32, Definition(chan[0], v1), Operand::c32(4),
12662                Operand(chan[0], v1));
12663    } else if (dfmt == V_008F0C_BUF_DATA_FORMAT_2_10_10_10) {
12664       aco_opcode bfe = aco_opcode::v_bfe_u32;
12665       switch (nfmt) {
12666       case V_008F0C_BUF_NUM_FORMAT_SNORM:
12667       case V_008F0C_BUF_NUM_FORMAT_SSCALED:
12668       case V_008F0C_BUF_NUM_FORMAT_SINT: bfe = aco_opcode::v_bfe_i32; break;
12669       default: break;
12670       }
12671 
12672       bool swapxz = G_008F0C_DST_SEL_X(vtx_info->dst_sel) != V_008F0C_SQ_SEL_X;
12673       bld.vop3(bfe, Definition(chan[3], v1), Operand(dst, v1), Operand::c32(30), Operand::c32(2));
12674       bld.vop3(bfe, Definition(chan[2], v1), Operand(dst, v1), Operand::c32(swapxz ? 0 : 20),
12675                Operand::c32(10));
12676       bld.vop3(bfe, Definition(chan[1], v1), Operand(dst, v1), Operand::c32(10), Operand::c32(10));
12677       bld.vop3(bfe, Definition(chan[0], v1), Operand(dst, v1), Operand::c32(swapxz ? 20 : 0),
12678                Operand::c32(10));
12679    } else if (dfmt == V_008F0C_BUF_DATA_FORMAT_8 || dfmt == V_008F0C_BUF_DATA_FORMAT_16) {
12680       unsigned bits = dfmt == V_008F0C_BUF_DATA_FORMAT_8 ? 8 : 16;
12681       switch (nfmt) {
12682       case V_008F0C_BUF_NUM_FORMAT_SNORM:
12683       case V_008F0C_BUF_NUM_FORMAT_SSCALED:
12684       case V_008F0C_BUF_NUM_FORMAT_SINT:
12685          bld.vop3(aco_opcode::v_bfe_i32, Definition(dst, v1), Operand(dst, v1), Operand::c32(0),
12686                   Operand::c32(bits));
12687          break;
12688       default: break;
12689       }
12690    }
12691 
12692    if (nfmt == V_008F0C_BUF_NUM_FORMAT_FLOAT &&
12693        (dfmt == V_008F0C_BUF_DATA_FORMAT_16 || dfmt == V_008F0C_BUF_DATA_FORMAT_10_11_11)) {
12694       for (unsigned i = 0; i < num_channels; i++)
12695          bld.vop1(aco_opcode::v_cvt_f32_f16, Definition(chan[i], v1), Operand(chan[i], v1));
12696    } else if (nfmt == V_008F0C_BUF_NUM_FORMAT_USCALED || nfmt == V_008F0C_BUF_NUM_FORMAT_UNORM) {
12697       for (unsigned i = 0; i < num_channels; i++)
12698          bld.vop1(aco_opcode::v_cvt_f32_u32, Definition(chan[i], v1), Operand(chan[i], v1));
12699    } else if (nfmt == V_008F0C_BUF_NUM_FORMAT_SSCALED || nfmt == V_008F0C_BUF_NUM_FORMAT_SNORM) {
12700       for (unsigned i = 0; i < num_channels; i++)
12701          bld.vop1(aco_opcode::v_cvt_f32_i32, Definition(chan[i], v1), Operand(chan[i], v1));
12702    }
12703 
12704    std::array<unsigned, 4> chan_max;
12705    switch (dfmt) {
12706    case V_008F0C_BUF_DATA_FORMAT_2_10_10_10: chan_max = {1023, 1023, 1023, 3}; break;
12707    case V_008F0C_BUF_DATA_FORMAT_8: chan_max = {255, 255, 255, 255}; break;
12708    case V_008F0C_BUF_DATA_FORMAT_16: chan_max = {65535, 65535, 65535, 65535}; break;
12709    }
12710 
12711    if (nfmt == V_008F0C_BUF_NUM_FORMAT_UNORM) {
12712       for (unsigned i = 0; i < num_channels; i++)
12713          bld.vop2(aco_opcode::v_mul_f32, Definition(chan[i], v1),
12714                   Operand::c32(fui(1.0 / chan_max[i])), Operand(chan[i], v1));
12715    } else if (nfmt == V_008F0C_BUF_NUM_FORMAT_SNORM) {
12716       for (unsigned i = 0; i < num_channels; i++) {
12717          bld.vop2(aco_opcode::v_mul_f32, Definition(chan[i], v1),
12718                   Operand::c32(fui(1.0 / (chan_max[i] >> 1))), Operand(chan[i], v1));
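         /* Clamp to -1.0 (0xbf800000): the most negative SNORM encoding would
          * otherwise map to slightly below -1.0.
          */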
12719          bld.vop2(aco_opcode::v_max_f32, Definition(chan[i], v1), Operand::c32(0xbf800000),
12720                   Operand(chan[i], v1));
12721       }
12722    }
12723 }
12724 
12725 void
12726 convert_current_unaligned_vs_attribs(Builder& bld, UnalignedVsAttribLoadState* state)
12727 {
12728    if (state->current_loads.empty())
12729       return;
12730 
12731    wait_for_vmem_loads(bld);
12732 
12733    for (UnalignedVsAttribLoad load : state->current_loads)
12734       convert_unaligned_vs_attrib(bld, load);
12735    state->current_loads.clear();
12736 
12737    state->overflow_num_vgprs = state->initial_num_vgprs;
12738    state->num_vgprs = &state->overflow_num_vgprs;
12739 }
12740 
12741 void
12742 load_unaligned_vs_attrib(Builder& bld, PhysReg dst, Operand desc, Operand index, uint32_t offset,
12743                          const struct ac_vtx_format_info* vtx_info,
12744                          UnalignedVsAttribLoadState* state)
12745 {
12746    unsigned size = vtx_info->chan_byte_size ? vtx_info->chan_byte_size : vtx_info->element_size;
12747 
12748    UnalignedVsAttribLoad load;
12749    load.dst = dst;
12750    load.vtx_info = vtx_info;
12751    load.d16 = bld.program->gfx_level >= GFX9 && !bld.program->dev.sram_ecc_enabled && size == 4;
12752 
12753    unsigned num_scratch_vgprs = load.d16 ? 1 : (size - 1);
12754    if (!vtx_info->chan_byte_size) {
12755       /* When chan_byte_size==0, we're loading the entire attribute, so we can use the last 3
12756        * components of the destination.
12757        */
12758       assert(num_scratch_vgprs <= 3);
12759       load.scratch = dst.advance(4);
12760    } else {
12761       if (*state->num_vgprs + num_scratch_vgprs > state->max_vgprs)
12762          convert_current_unaligned_vs_attribs(bld, state);
12763 
12764       load.scratch = get_next_vgpr(num_scratch_vgprs, state->num_vgprs, NULL);
12765    }
12766 
12767    PhysReg scratch(load.scratch);
12768    if (load.d16) {
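      /* Even bytes go into the low/high halves of dst, odd bytes into scratch;
       * convert_unaligned_vs_attrib() merges them with a single v_lshl_or_b32.
       */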
12769       bld.mubuf(aco_opcode::buffer_load_ubyte_d16, Definition(dst, v1), desc, index,
12770                 Operand::c32(0u), offset, false, true);
12771       bld.mubuf(aco_opcode::buffer_load_ubyte_d16_hi, Definition(dst, v1), desc, index,
12772                 Operand::c32(0u), offset + 2, false, true);
12773       bld.mubuf(aco_opcode::buffer_load_ubyte_d16, Definition(scratch, v1), desc, index,
12774                 Operand::c32(0u), offset + 1, false, true);
12775       bld.mubuf(aco_opcode::buffer_load_ubyte_d16_hi, Definition(scratch, v1), desc, index,
12776                 Operand::c32(0u), offset + 3, false, true);
12777    } else {
12778       for (unsigned i = 0; i < size; i++) {
12779          Definition def(i ? scratch.advance(i * 4 - 4) : dst, v1);
12780          unsigned soffset = 0, const_offset = 0;
12781 
12782          if (bld.program->gfx_level >= GFX12) {
12783             const_offset = offset + i;
12784          } else {
12785             soffset = offset + i;
12786          }
12787 
12788          bld.mubuf(aco_opcode::buffer_load_ubyte, def, desc, index, Operand::c32(soffset),
12789                    const_offset, false, true);
12790       }
12791    }
12792 
12793    state->current_loads.push_back(load);
12794 }
12795 
12796 void
12797 select_vs_prolog(Program* program, const struct aco_vs_prolog_info* pinfo, ac_shader_config* config,
12798                  const struct aco_compiler_options* options, const struct aco_shader_info* info,
12799                  const struct ac_shader_args* args)
12800 {
12801    assert(pinfo->num_attributes > 0);
12802 
12803    /* This should be enough for any shader/stage. */
12804    unsigned max_user_sgprs = options->gfx_level >= GFX9 ? 32 : 16;
12805 
12806    init_program(program, compute_cs, info, options->gfx_level, options->family, options->wgp_mode,
12807                 config);
12808    program->dev.vgpr_limit = 256;
12809 
12810    Block* block = program->create_and_insert_block();
12811    block->kind = block_kind_top_level;
12812 
12813    program->workgroup_size = 64;
12814    calc_min_waves(program);
12815 
12816    /* Addition on GFX6-8 requires a carry-out (we use VCC) */
12817    program->needs_vcc = program->gfx_level <= GFX8;
12818 
12819    Builder bld(program, block);
12820 
12821    block->instructions.reserve(16 + pinfo->num_attributes * 4);
12822 
12823    /* Besides performance, setting a high priority also works around the
12824     * FeatureRequiredExportPriority issue on GFX11.5. */
12825    bld.sopp(aco_opcode::s_setprio, 3);
12826 
12827    uint32_t attrib_mask = BITFIELD_MASK(pinfo->num_attributes);
12828    bool has_nontrivial_divisors = pinfo->nontrivial_divisors;
12829 
12830    /* choose sgprs */
12831    PhysReg vertex_buffers(align(max_user_sgprs + 14, 2));
12832    PhysReg prolog_input = vertex_buffers.advance(8);
12833    PhysReg desc(
12834       align((has_nontrivial_divisors ? prolog_input : vertex_buffers).advance(8).reg(), 4));
12835 
12836    Operand start_instance = get_arg_fixed(args, args->start_instance);
12837    Operand instance_id = get_arg_fixed(args, args->instance_id);
12838 
12839    bool needs_instance_index =
12840       pinfo->instance_rate_inputs &
12841       ~(pinfo->zero_divisors | pinfo->nontrivial_divisors); /* divisor is 1 */
12842    bool needs_start_instance = pinfo->instance_rate_inputs & pinfo->zero_divisors;
12843    bool needs_vertex_index = ~pinfo->instance_rate_inputs & attrib_mask;
12844    bool needs_tmp_vgpr0 = has_nontrivial_divisors;
12845    bool needs_tmp_vgpr1 = has_nontrivial_divisors &&
12846                           (program->gfx_level <= GFX8 || program->gfx_level >= GFX11);
12847 
12848    int vgpr_offset = pinfo->misaligned_mask & (1u << (pinfo->num_attributes - 1)) ? 0 : -4;
12849 
12850    unsigned num_vgprs = args->num_vgprs_used;
12851    PhysReg attributes_start = get_next_vgpr(pinfo->num_attributes * 4, &num_vgprs);
12852    PhysReg vertex_index, instance_index, start_instance_vgpr, nontrivial_tmp_vgpr0, nontrivial_tmp_vgpr1;
12853    if (needs_vertex_index)
12854       vertex_index = get_next_vgpr(1, &num_vgprs, &vgpr_offset);
12855    if (needs_instance_index)
12856       instance_index = get_next_vgpr(1, &num_vgprs, &vgpr_offset);
12857    if (needs_start_instance)
12858       start_instance_vgpr = get_next_vgpr(1, &num_vgprs, &vgpr_offset);
12859    if (needs_tmp_vgpr0)
12860       nontrivial_tmp_vgpr0 = get_next_vgpr(1, &num_vgprs, &vgpr_offset);
12861    if (needs_tmp_vgpr1)
12862       nontrivial_tmp_vgpr1 = get_next_vgpr(1, &num_vgprs, &vgpr_offset);
12863 
12864    bld.sop1(aco_opcode::s_mov_b32, Definition(vertex_buffers, s1),
12865             get_arg_fixed(args, args->vertex_buffers));
12866    if (options->address32_hi >= 0xffff8000 || options->address32_hi <= 0x7fff) {
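      /* s_movk_i32 sign-extends its 16-bit immediate, so it can only materialize
       * values whose upper 17 bits are all ones or all zeroes.
       */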
12867       bld.sopk(aco_opcode::s_movk_i32, Definition(vertex_buffers.advance(4), s1),
12868                options->address32_hi & 0xFFFF);
12869    } else {
12870       bld.sop1(aco_opcode::s_mov_b32, Definition(vertex_buffers.advance(4), s1),
12871                Operand::c32((unsigned)options->address32_hi));
12872    }
12873 
12874    const struct ac_vtx_format_info* vtx_info_table =
12875       ac_get_vtx_format_info_table(GFX8, CHIP_POLARIS10);
12876 
12877    UnalignedVsAttribLoadState unaligned_state;
12878    unaligned_state.max_vgprs = MAX2(84, num_vgprs + 8);
12879    unaligned_state.initial_num_vgprs = num_vgprs;
12880    unaligned_state.num_vgprs = &num_vgprs;
12881 
12882    unsigned num_sgprs = 0;
12883    for (unsigned loc = 0; loc < pinfo->num_attributes;) {
12884       unsigned num_descs =
12885          load_vb_descs(bld, desc, Operand(vertex_buffers, s2), loc, pinfo->num_attributes - loc);
12886       num_sgprs = MAX2(num_sgprs, desc.advance(num_descs * 16u).reg());
12887 
12888       if (loc == 0) {
12889          /* perform setup while we load the descriptors */
12890          if (pinfo->is_ngg || pinfo->next_stage != MESA_SHADER_VERTEX) {
12891             Operand count = get_arg_fixed(args, args->merged_wave_info);
12892             bld.sop2(aco_opcode::s_bfm_b64, Definition(exec, s2), count, Operand::c32(0u));
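            /* s_bfm_b64 only uses the low 6 bits of the count, so a full wave
             * (count == 64) would produce an empty mask; the cselect below fixes
             * that case up to all ones.
             */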
12893             if (program->wave_size == 64) {
12894                bld.sopc(aco_opcode::s_bitcmp1_b32, Definition(scc, s1), count,
12895                         Operand::c32(6u /* log2(64) */));
12896                bld.sop2(aco_opcode::s_cselect_b64, Definition(exec, s2), Operand::c64(UINT64_MAX),
12897                         Operand(exec, s2), Operand(scc, s1));
12898             }
12899          }
12900 
12901          /* If there are no HS threads, SPI mistakenly loads the LS VGPRs starting at VGPR 0. */
12902          if (info->hw_stage == AC_HW_HULL_SHADER && options->has_ls_vgpr_init_bug) {
12903             /* We don't want load_vb_descs() to write vcc. */
12904             assert(program->dev.sgpr_limit <= vcc.reg());
12905 
12906             bld.sop2(aco_opcode::s_bfe_u32, Definition(vcc, s1), Definition(scc, s1),
12907                      get_arg_fixed(args, args->merged_wave_info), Operand::c32((8u << 16) | 8u));
12908             bld.sop2(Builder::s_cselect, Definition(vcc, bld.lm), Operand::c32(-1), Operand::zero(),
12909                      Operand(scc, s1));
12910 
12911             /* These copies are ordered so that vertex_id=tcs_patch_id doesn't overwrite vertex_id
12912              * before instance_id=vertex_id. */
12913             ac_arg src_args[] = {args->vertex_id, args->tcs_rel_ids, args->tcs_patch_id};
12914             ac_arg dst_args[] = {args->instance_id, args->vs_rel_patch_id, args->vertex_id};
12915             for (unsigned i = 0; i < 3; i++) {
12916                bld.vop2(aco_opcode::v_cndmask_b32, Definition(get_arg_reg(args, dst_args[i]), v1),
12917                         get_arg_fixed(args, src_args[i]), get_arg_fixed(args, dst_args[i]),
12918                         Operand(vcc, bld.lm));
12919             }
12920          }
12921 
12922          if (needs_vertex_index)
12923             bld.vadd32(Definition(vertex_index, v1), get_arg_fixed(args, args->base_vertex),
12924                        get_arg_fixed(args, args->vertex_id), false, Operand(s2), true);
12925          if (needs_instance_index)
12926             bld.vadd32(Definition(instance_index, v1), start_instance, instance_id, false,
12927                        Operand(s2), true);
12928          if (needs_start_instance)
12929             bld.vop1(aco_opcode::v_mov_b32, Definition(start_instance_vgpr, v1), start_instance);
12930       }
12931 
12932       wait_for_smem_loads(bld);
12933 
12934       for (unsigned i = 0; i < num_descs;) {
12935          PhysReg dest(attributes_start.reg() + loc * 4u);
12936 
12937          /* calculate index */
12938          Operand fetch_index = Operand(vertex_index, v1);
12939          if (pinfo->instance_rate_inputs & (1u << loc)) {
12940             if (!(pinfo->zero_divisors & (1u << loc))) {
12941                fetch_index = instance_id;
12942                if (pinfo->nontrivial_divisors & (1u << loc)) {
12943                   unsigned index = util_bitcount(pinfo->nontrivial_divisors & BITFIELD_MASK(loc));
12944                   fetch_index = calc_nontrivial_instance_id(
12945                      bld, args, pinfo, index, instance_id, start_instance, prolog_input,
12946                      nontrivial_tmp_vgpr0, nontrivial_tmp_vgpr1);
12947                } else {
12948                   fetch_index = Operand(instance_index, v1);
12949                }
12950             } else {
12951                fetch_index = Operand(start_instance_vgpr, v1);
12952             }
12953          }
12954 
12955          /* perform load */
12956          PhysReg cur_desc = desc.advance(i * 16);
12957          if ((pinfo->misaligned_mask & (1u << loc))) {
12958             const struct ac_vtx_format_info* vtx_info = &vtx_info_table[pinfo->formats[loc]];
12959 
12960             assert(vtx_info->has_hw_format & 0x1);
12961             unsigned dfmt = vtx_info->hw_format[0] & 0xf;
12962             unsigned nfmt = vtx_info->hw_format[0] >> 4;
12963 
12964             for (unsigned j = 0; j < (vtx_info->chan_byte_size ? vtx_info->num_channels : 1); j++) {
12965                bool post_shuffle = pinfo->post_shuffle & (1u << loc);
12966                unsigned offset = vtx_info->chan_byte_size * (post_shuffle && j < 3 ? 2 - j : j);
12967                unsigned soffset = 0, const_offset = 0;
12968 
12969                /* We need to use soffset on GFX6-7 to avoid being considered
12970                 * out-of-bounds when offset>=stride. GFX12 doesn't support a
12971                 * non-zero constant soffset.
12972                 */
12973                if (program->gfx_level >= GFX12) {
12974                   const_offset = offset;
12975                } else {
12976                   soffset = offset;
12977                }
12978 
12979                if ((pinfo->unaligned_mask & (1u << loc)) && vtx_info->chan_byte_size <= 4)
12980                   load_unaligned_vs_attrib(bld, dest.advance(j * 4u), Operand(cur_desc, s4),
12981                                            fetch_index, offset, vtx_info, &unaligned_state);
12982                else if (vtx_info->chan_byte_size == 8)
12983                   bld.mtbuf(aco_opcode::tbuffer_load_format_xy,
12984                             Definition(dest.advance(j * 8u), v2), Operand(cur_desc, s4),
12985                             fetch_index, Operand::c32(soffset), dfmt, nfmt, const_offset, false,
12986                             true);
12987                else
12988                   bld.mtbuf(aco_opcode::tbuffer_load_format_x, Definition(dest.advance(j * 4u), v1),
12989                             Operand(cur_desc, s4), fetch_index, Operand::c32(soffset), dfmt, nfmt,
12990                             const_offset, false, true);
12991             }
12992 
12993             unsigned slots = vtx_info->chan_byte_size == 8 && vtx_info->num_channels > 2 ? 2 : 1;
12994             loc += slots;
12995             i += slots;
12996          } else {
12997             bld.mubuf(aco_opcode::buffer_load_format_xyzw, Definition(dest, v4),
12998                       Operand(cur_desc, s4), fetch_index, Operand::c32(0u), 0u, false, true);
12999             loc++;
13000             i++;
13001          }
13002       }
13003    }
13004 
13005    uint32_t constant_mask = pinfo->misaligned_mask;
13006    while (constant_mask) {
13007       unsigned loc = u_bit_scan(&constant_mask);
13008       const struct ac_vtx_format_info* vtx_info = &vtx_info_table[pinfo->formats[loc]];
13009 
13010       /* 22.1.1. Attribute Location and Component Assignment of Vulkan 1.3 specification:
13011        * For 64-bit data types, no default attribute values are provided. Input variables must
13012        * not use more components than provided by the attribute.
13013        */
13014       if (vtx_info->chan_byte_size == 8) {
13015          if (vtx_info->num_channels > 2)
13016             u_bit_scan(&constant_mask);
13017          continue;
13018       }
13019 
13020       assert(vtx_info->has_hw_format & 0x1);
13021       unsigned nfmt = vtx_info->hw_format[0] >> 4;
13022 
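      /* Missing components default to 0, except .w which defaults to 1
       * (0x3f800000 is 1.0f for float formats).
       */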
13023       uint32_t one = nfmt == V_008F0C_BUF_NUM_FORMAT_UINT || nfmt == V_008F0C_BUF_NUM_FORMAT_SINT
13024                         ? 1u
13025                         : 0x3f800000u;
13026       PhysReg dest(attributes_start.reg() + loc * 4u);
13027       for (unsigned j = vtx_info->num_channels; j < 4; j++) {
13028          bld.vop1(aco_opcode::v_mov_b32, Definition(dest.advance(j * 4u), v1),
13029                   Operand::c32(j == 3 ? one : 0u));
13030       }
13031    }
13032 
13033    convert_current_unaligned_vs_attribs(bld, &unaligned_state);
13034 
13035    if (pinfo->alpha_adjust_lo | pinfo->alpha_adjust_hi)
13036       wait_for_vmem_loads(bld);
13037 
13038    /* For 2_10_10_10 formats the alpha is handled as unsigned by pre-Vega HW,
13039     * so we may need to fix it up. */
13040    u_foreach_bit (loc, (pinfo->alpha_adjust_lo | pinfo->alpha_adjust_hi)) {
13041       PhysReg alpha(attributes_start.reg() + loc * 4u + 3);
13042 
13043       unsigned alpha_adjust = (pinfo->alpha_adjust_lo >> loc) & 0x1;
13044       alpha_adjust |= ((pinfo->alpha_adjust_hi >> loc) & 0x1) << 1;
13045 
13046       if (alpha_adjust == AC_ALPHA_ADJUST_SSCALED)
13047          bld.vop1(aco_opcode::v_cvt_u32_f32, Definition(alpha, v1), Operand(alpha, v1));
13048 
13049       /* For the integer-like cases, do a natural sign extension.
13050        *
13051        * For the SNORM case, the values are 0.0, 0.333, 0.666, 1.0
13052        * and happen to contain 0, 1, 2, 3 as the two LSBs of the
13053        * exponent.
13054        */
13055       unsigned offset = alpha_adjust == AC_ALPHA_ADJUST_SNORM ? 23u : 0u;
13056       bld.vop3(aco_opcode::v_bfe_i32, Definition(alpha, v1), Operand(alpha, v1),
13057                Operand::c32(offset), Operand::c32(2u));
13058 
13059       /* Convert back to the right type. */
13060       if (alpha_adjust == AC_ALPHA_ADJUST_SNORM) {
13061          bld.vop1(aco_opcode::v_cvt_f32_i32, Definition(alpha, v1), Operand(alpha, v1));
13062          bld.vop2(aco_opcode::v_max_f32, Definition(alpha, v1), Operand::c32(0xbf800000u),
13063                   Operand(alpha, v1));
13064       } else if (alpha_adjust == AC_ALPHA_ADJUST_SSCALED) {
13065          bld.vop1(aco_opcode::v_cvt_f32_i32, Definition(alpha, v1), Operand(alpha, v1));
13066       }
13067    }
13068 
13069    block->kind |= block_kind_uniform;
13070 
13071    /* continue on to the main shader */
13072    Operand continue_pc = get_arg_fixed(args, pinfo->inputs);
13073    if (has_nontrivial_divisors) {
13074       bld.smem(aco_opcode::s_load_dwordx2, Definition(prolog_input, s2),
13075                get_arg_fixed(args, pinfo->inputs), Operand::c32(0u));
13076       wait_for_smem_loads(bld);
13077       continue_pc = Operand(prolog_input, s2);
13078    }
13079 
13080    bld.sop1(aco_opcode::s_setpc_b64, continue_pc);
13081 
13082    program->config->float_mode = program->blocks[0].fp_mode.val;
13083    program->config->num_vgprs = std::min<uint16_t>(get_vgpr_alloc(program, num_vgprs), 256);
13084    program->config->num_sgprs = get_sgpr_alloc(program, num_sgprs);
13085 }
13086 
13087 void
13088 select_ps_epilog(Program* program, void* pinfo, ac_shader_config* config,
13089                  const struct aco_compiler_options* options, const struct aco_shader_info* info,
13090                  const struct ac_shader_args* args)
13091 {
13092    const struct aco_ps_epilog_info* einfo = (const struct aco_ps_epilog_info*)pinfo;
13093    isel_context ctx =
13094       setup_isel_context(program, 0, NULL, config, options, info, args, SWStage::FS);
13095 
13096    ctx.block->fp_mode = program->next_fp_mode;
13097 
13098    add_startpgm(&ctx);
13099    append_logical_start(ctx.block);
13100 
13101    Builder bld(ctx.program, ctx.block);
13102 
13103    bool has_mrtz_alpha = einfo->alpha_to_coverage_via_mrtz && einfo->colors[0].used;
13104    Temp mrtz_alpha;
13105 
13106    Temp colors[MAX_DRAW_BUFFERS][4];
13107    for (unsigned i = 0; i < MAX_DRAW_BUFFERS; i++) {
13108       if (!einfo->colors[i].used)
13109          continue;
13110 
13111       Temp color = get_arg(&ctx, einfo->colors[i]);
13112       unsigned col_types = (einfo->color_types >> (i * 2)) & 0x3;
13113 
13114       emit_split_vector(&ctx, color, col_types == ACO_TYPE_ANY32 ? 4 : 8);
13115       for (unsigned c = 0; c < 4; ++c) {
13116          colors[i][c] = emit_extract_vector(&ctx, color, c, col_types == ACO_TYPE_ANY32 ? v1 : v2b);
13117       }
13118 
13119       /* Store MRTZ.a before applying alpha-to-one if enabled. */
13120       if (has_mrtz_alpha && i == 0)
13121          mrtz_alpha = colors[0][3];
13122 
13123       emit_clamp_alpha_test(&ctx, einfo, colors[i], i);
13124    }
13125 
13126    bool has_mrtz_depth = einfo->depth.used && !einfo->kill_depth;
13127    bool has_mrtz_stencil = einfo->stencil.used && !einfo->kill_stencil;
13128    bool has_mrtz_samplemask = einfo->samplemask.used && !einfo->kill_samplemask;
13129    bool has_mrtz_export =
13130       has_mrtz_depth || has_mrtz_stencil || has_mrtz_samplemask || has_mrtz_alpha;
13131    if (has_mrtz_export) {
13132       Temp depth = has_mrtz_depth ? get_arg(&ctx, einfo->depth) : Temp();
13133       Temp stencil = has_mrtz_stencil ? get_arg(&ctx, einfo->stencil) : Temp();
13134       Temp samplemask = has_mrtz_samplemask ? get_arg(&ctx, einfo->samplemask) : Temp();
13135 
13136       export_fs_mrtz(&ctx, einfo, depth, stencil, samplemask, mrtz_alpha);
13137    }
13138 
13139    /* Export all color render targets */
13140    struct aco_export_mrt mrts[MAX_DRAW_BUFFERS];
13141    unsigned mrt_num = 0;
13142 
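   /* Color exports are compacted: each emitted export's target is advanced by the
    * number of color exports recorded so far. */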
13143    if (einfo->broadcast_last_cbuf) {
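      /* Broadcast the first color to color buffers 0..broadcast_last_cbuf. */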
13144       for (unsigned i = 0; i <= einfo->broadcast_last_cbuf; i++) {
13145          struct aco_export_mrt* mrt = &mrts[mrt_num];
13146          if (export_fs_mrt_color(&ctx, einfo, colors[0], i, mrt))
13147             mrt->target += mrt_num++;
13148       }
13149    } else {
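      /* color_map[i] selects which color argument feeds export slot i; 0xff marks an
       * unused slot. */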
13150       for (unsigned i = 0; i < MAX_DRAW_BUFFERS; i++) {
13151          struct aco_export_mrt* mrt = &mrts[mrt_num];
13152          const uint8_t cb_idx = einfo->color_map[i];
13153 
13154          if (cb_idx == 0xff || !einfo->colors[cb_idx].used)
13155             continue;
13156 
13157          if (export_fs_mrt_color(&ctx, einfo, colors[cb_idx], i, mrt)) {
13158             mrt->target += mrt_num++;
13159          }
13160       }
13161    }
13162 
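   /* On GFX11+, dual-source blending uses a dedicated two-MRT export sequence. If no
    * color or MRTZ export was emitted, add a null export unless the caller opted to
    * skip it. */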
13163    if (mrt_num) {
13164       if (ctx.options->gfx_level >= GFX11 && einfo->mrt0_is_dual_src) {
13165          assert(mrt_num == 2);
13166          create_fs_dual_src_export_gfx11(&ctx, &mrts[0], &mrts[1]);
13167       } else {
13168          for (unsigned i = 0; i < mrt_num; i++)
13169             export_mrt(&ctx, &mrts[i]);
13170       }
13171    } else if (!has_mrtz_export && !einfo->skip_null_export) {
13172       create_fs_null_export(&ctx);
13173    }
13174 
13175    program->config->float_mode = program->blocks[0].fp_mode.val;
13176 
13177    append_logical_end(ctx.block);
13178    ctx.block->kind |= block_kind_export_end;
13179    bld.reset(ctx.block);
13180    bld.sopp(aco_opcode::s_endpgm);
13181 
13182    finish_program(&ctx);
13183 }
13184 
13185 void
13186 select_ps_prolog(Program* program, void* pinfo, ac_shader_config* config,
13187                  const struct aco_compiler_options* options, const struct aco_shader_info* info,
13188                  const struct ac_shader_args* args)
13189 {
13190    const struct aco_ps_prolog_info* finfo = (const struct aco_ps_prolog_info*)pinfo;
13191    isel_context ctx =
13192       setup_isel_context(program, 0, NULL, config, options, info, args, SWStage::FS);
13193 
13194    ctx.block->fp_mode = program->next_fp_mode;
13195 
13196    add_startpgm(&ctx);
13197    append_logical_start(ctx.block);
13198 
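   /* The PS prolog can emit polygon stippling, override the interpolation and sample
    * mask arguments, and pre-interpolate the color inputs before handing control to
    * the main fragment shader. */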
13199    if (finfo->poly_stipple)
13200       emit_polygon_stipple(&ctx, finfo);
13201 
13202    overwrite_interp_args(&ctx, finfo);
13203 
13204    overwrite_samplemask_arg(&ctx, finfo);
13205 
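   /* Forward every argument (plus the interpolated colors) in registers to the main
    * shader part. */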
13206    std::vector<Operand> regs;
13207    passthrough_all_args(&ctx, regs);
13208 
13209    interpolate_color_args(&ctx, finfo, regs);
13210 
13211    program->config->float_mode = program->blocks[0].fp_mode.val;
13212 
13213    append_logical_end(ctx.block);
13214 
13215    build_end_with_regs(&ctx, regs);
13216 
13217    /* Compute all end args in WQM mode if the main shader part requires it. */
13218    if (finfo->needs_wqm)
13219       set_wqm(&ctx, true);
13220 
13221    /* Make sure WQM mode is exited at the end of the prolog. */
13222    program->needs_exact = true;
13223 
13224    finish_program(&ctx);
13225 }
13226 
13227 } // namespace aco
13228